mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-11 07:57:21 +00:00
rfctr(pptx): minify HTML and table.text is cct (#3734)
**Summary** Eliminate historical "idiosyncrasies" of `table.metadata.text_as_html` HTML introduced by `partition_pptx()`. Produce minified `.text_as_html` consistent with that formed by chunking. **Additional Context** - PPTX `.metadata.text_as_html` is minified (no extra whitespace or thead, tbody, tfoot elements). - `table.text` is clean-concatenated-text (CCT) of table. - Last use of `tabulate` library is removed and that dependency is removed from `base.in`.
This commit is contained in:
parent
3dea723656
commit
3240e3d17a
@ -1,4 +1,4 @@
|
|||||||
## 0.16.1-dev3
|
## 0.16.1-dev4
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
@ -8,10 +8,11 @@
|
|||||||
|
|
||||||
* **Remove unsupported chipper model**
|
* **Remove unsupported chipper model**
|
||||||
* **Rewrite of `partition.email` module and tests.** Use modern Python stdlib `email` module interface to parse email messages and attachments. This change shortens and simplifies the code, and makes it more robust and maintainable. Several historical problems were remedied in the process.
|
* **Rewrite of `partition.email` module and tests.** Use modern Python stdlib `email` module interface to parse email messages and attachments. This change shortens and simplifies the code, and makes it more robust and maintainable. Several historical problems were remedied in the process.
|
||||||
* **Minify text_as_html from DOCX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count without preserving all text.
|
* **Minify text_as_html from DOCX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.
|
||||||
* **Fall back to filename extension-based file-type detection for unidentified OLE files.** Resolves a problem where a DOC file that could not be detected as such by `filetype` was incorrectly identified as a MSG file.
|
* **Fall back to filename extension-based file-type detection for unidentified OLE files.** Resolves a problem where a DOC file that could not be detected as such by `filetype` was incorrectly identified as a MSG file.
|
||||||
* **Minify text_as_html from XLSX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count without preserving all text.
|
* **Minify text_as_html from XLSX.** Previously `.metadata.text_as_html` for XLSX tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.
|
||||||
* **Minify text_as_html from CSV.** Previously `.metadata.text_as_html` for CSV tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count without preserving all text.
|
* **Minify text_as_html from CSV.** Previously `.metadata.text_as_html` for CSV tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.
|
||||||
|
* **Minify text_as_html from PPTX.** Previously `.metadata.text_as_html` for PPTX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text and structure.
|
||||||
|
|
||||||
## 0.16.0
|
## 0.16.0
|
||||||
|
|
||||||
|
|||||||
@ -4,7 +4,6 @@ filetype
|
|||||||
python-magic
|
python-magic
|
||||||
lxml
|
lxml
|
||||||
nltk
|
nltk
|
||||||
tabulate
|
|
||||||
requests
|
requests
|
||||||
beautifulsoup4
|
beautifulsoup4
|
||||||
emoji
|
emoji
|
||||||
|
|||||||
@ -388,17 +388,6 @@ def test_convert_office_docs_respects_wait_timeout():
|
|||||||
assert np.sum([(path / "simple.docx").is_file() for path in paths_to_save]) < 3
|
assert np.sum([(path / "simple.docx").is_file() for path in paths_to_save]) < 3
|
||||||
|
|
||||||
|
|
||||||
class MockDocxEmptyTable:
|
|
||||||
def __init__(self):
|
|
||||||
self.rows = []
|
|
||||||
|
|
||||||
|
|
||||||
def test_convert_ms_office_table_to_text_works_with_empty_tables():
|
|
||||||
table = MockDocxEmptyTable()
|
|
||||||
assert common.convert_ms_office_table_to_text(table, as_html=True) == ""
|
|
||||||
assert common.convert_ms_office_table_to_text(table, as_html=False) == ""
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("text", "expected"),
|
("text", "expected"),
|
||||||
[
|
[
|
||||||
|
|||||||
@ -247,15 +247,11 @@ def test_partition_pptx_grabs_tables():
|
|||||||
assert elements[1].text.startswith("Column 1")
|
assert elements[1].text.startswith("Column 1")
|
||||||
assert elements[1].text.strip().endswith("Aqua")
|
assert elements[1].text.strip().endswith("Aqua")
|
||||||
assert elements[1].metadata.text_as_html == (
|
assert elements[1].metadata.text_as_html == (
|
||||||
"<table>\n"
|
"<table>"
|
||||||
"<thead>\n"
|
"<tr><td>Column 1</td><td>Column 2</td><td>Column 3</td></tr>"
|
||||||
"<tr><th>Column 1 </th><th>Column 2 </th><th>Column 3 </th></tr>\n"
|
"<tr><td>Red</td><td>Green</td><td>Blue</td></tr>"
|
||||||
"</thead>\n"
|
"<tr><td>Purple</td><td>Orange</td><td>Yellow</td></tr>"
|
||||||
"<tbody>\n"
|
"<tr><td>Tangerine</td><td>Pink</td><td>Aqua</td></tr>"
|
||||||
"<tr><td>Red </td><td>Green </td><td>Blue </td></tr>\n"
|
|
||||||
"<tr><td>Purple </td><td>Orange </td><td>Yellow </td></tr>\n"
|
|
||||||
"<tr><td>Tangerine </td><td>Pink </td><td>Aqua </td></tr>\n"
|
|
||||||
"</tbody>\n"
|
|
||||||
"</table>"
|
"</table>"
|
||||||
)
|
)
|
||||||
assert elements[1].metadata.filename == "fake-power-point-table.pptx"
|
assert elements[1].metadata.filename == "fake-power-point-table.pptx"
|
||||||
@ -516,7 +512,7 @@ def test_partition_pptx_hierarchy_sample_document():
|
|||||||
(2, "6ec455f5f19782facf184886876c9a66", "5614b00c3f6bff23ebba1360e10f6428"),
|
(2, "6ec455f5f19782facf184886876c9a66", "5614b00c3f6bff23ebba1360e10f6428"),
|
||||||
(0, "8319096532fe2e55f66c491ea8313150", "2f57a8d4182e6fd5bd5842b0a2d9841b"),
|
(0, "8319096532fe2e55f66c491ea8313150", "2f57a8d4182e6fd5bd5842b0a2d9841b"),
|
||||||
(None, None, "4120066d251ba675ade42e8a167ca61f"),
|
(None, None, "4120066d251ba675ade42e8a167ca61f"),
|
||||||
(None, None, "2ed3bd10daace79ac129cbf8faf22bfc"),
|
(None, None, "efb9d74b4f8be6308c9a9006da994e12"),
|
||||||
(0, None, "fd08cacbaddafee5cbacc02528536ee5"),
|
(0, None, "fd08cacbaddafee5cbacc02528536ee5"),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.16.1-dev3" # pragma: no cover
|
__version__ = "0.16.1-dev4" # pragma: no cover
|
||||||
|
|||||||
@ -9,7 +9,6 @@ from typing import IO, TYPE_CHECKING, Any, Optional, TypeVar, cast
|
|||||||
|
|
||||||
import emoji
|
import emoji
|
||||||
import psutil
|
import psutil
|
||||||
from tabulate import tabulate
|
|
||||||
|
|
||||||
from unstructured.documents.coordinates import CoordinateSystem, PixelSpace
|
from unstructured.documents.coordinates import CoordinateSystem, PixelSpace
|
||||||
from unstructured.documents.elements import (
|
from unstructured.documents.elements import (
|
||||||
@ -29,9 +28,6 @@ from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
|
|||||||
from unstructured.partition.utils.constants import SORT_MODE_DONT, SORT_MODE_XY_CUT
|
from unstructured.partition.utils.constants import SORT_MODE_DONT, SORT_MODE_XY_CUT
|
||||||
from unstructured.utils import dependency_exists, first
|
from unstructured.utils import dependency_exists, first
|
||||||
|
|
||||||
if dependency_exists("pptx") and dependency_exists("pptx.table"):
|
|
||||||
from pptx.table import Table as PptxTable
|
|
||||||
|
|
||||||
if dependency_exists("numpy") and dependency_exists("cv2"):
|
if dependency_exists("numpy") and dependency_exists("cv2"):
|
||||||
from unstructured.partition.utils.sorting import sort_page_elements
|
from unstructured.partition.utils.sorting import sort_page_elements
|
||||||
|
|
||||||
@ -396,27 +392,6 @@ def convert_to_bytes(file: bytes | IO[bytes]) -> bytes:
|
|||||||
raise ValueError("Invalid file-like object type")
|
raise ValueError("Invalid file-like object type")
|
||||||
|
|
||||||
|
|
||||||
def convert_ms_office_table_to_text(table: PptxTable, as_html: bool = True) -> str:
|
|
||||||
"""Convert a PPTX table object to an HTML table string using the tabulate library.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
table (Table): A pptx.table.Table object.
|
|
||||||
as_html (bool): Whether to return the table as an HTML string (True) or a
|
|
||||||
plain text string (False)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
str: An table string representation of the input table.
|
|
||||||
"""
|
|
||||||
rows = list(table.rows)
|
|
||||||
|
|
||||||
if not rows:
|
|
||||||
return ""
|
|
||||||
|
|
||||||
headers = [cell.text for cell in rows[0].cells]
|
|
||||||
data = [[cell.text for cell in row.cells] for row in rows[1:]]
|
|
||||||
return tabulate(data, headers=headers, tablefmt="html" if as_html else "plain")
|
|
||||||
|
|
||||||
|
|
||||||
def contains_emoji(s: str) -> bool:
|
def contains_emoji(s: str) -> bool:
|
||||||
"""
|
"""
|
||||||
Check if the input string contains any emoji characters.
|
Check if the input string contains any emoji characters.
|
||||||
|
|||||||
@ -22,6 +22,7 @@ from pptx.slide import Slide
|
|||||||
from pptx.text.text import _Paragraph # pyright: ignore [reportPrivateUsage]
|
from pptx.text.text import _Paragraph # pyright: ignore [reportPrivateUsage]
|
||||||
|
|
||||||
from unstructured.chunking import add_chunking_strategy
|
from unstructured.chunking import add_chunking_strategy
|
||||||
|
from unstructured.common.html_table import HtmlTable, htmlify_matrix_of_cell_texts
|
||||||
from unstructured.documents.elements import (
|
from unstructured.documents.elements import (
|
||||||
Element,
|
Element,
|
||||||
ElementMetadata,
|
ElementMetadata,
|
||||||
@ -34,7 +35,6 @@ from unstructured.documents.elements import (
|
|||||||
Title,
|
Title,
|
||||||
)
|
)
|
||||||
from unstructured.file_utils.model import FileType
|
from unstructured.file_utils.model import FileType
|
||||||
from unstructured.partition.common.common import convert_ms_office_table_to_text
|
|
||||||
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
|
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
|
||||||
from unstructured.partition.text_type import (
|
from unstructured.partition.text_type import (
|
||||||
is_email_address,
|
is_email_address,
|
||||||
@ -213,38 +213,6 @@ class _PptxPartitioner:
|
|||||||
PicturePartitionerCls = self._opts.picture_partitioner
|
PicturePartitionerCls = self._opts.picture_partitioner
|
||||||
yield from PicturePartitionerCls.iter_elements(picture, self._opts)
|
yield from PicturePartitionerCls.iter_elements(picture, self._opts)
|
||||||
|
|
||||||
def _iter_title_shape_element(self, shape: Shape) -> Iterator[Element]:
|
|
||||||
"""Generate Title element for each paragraph in title `shape`.
|
|
||||||
|
|
||||||
Text is most likely a title, but in the rare case that the title shape was used
|
|
||||||
for the slide body text, also check for bulleted paragraphs."""
|
|
||||||
if self._shape_is_off_slide(shape):
|
|
||||||
return
|
|
||||||
|
|
||||||
depth = 0
|
|
||||||
for paragraph in shape.text_frame.paragraphs:
|
|
||||||
text = paragraph.text
|
|
||||||
if text.strip() == "":
|
|
||||||
continue
|
|
||||||
|
|
||||||
if self._is_bulleted_paragraph(paragraph):
|
|
||||||
bullet_depth = paragraph.level or 0
|
|
||||||
yield ListItem(
|
|
||||||
text=text,
|
|
||||||
metadata=self._opts.text_metadata(category_depth=bullet_depth),
|
|
||||||
detection_origin=DETECTION_ORIGIN,
|
|
||||||
)
|
|
||||||
elif is_email_address(text):
|
|
||||||
yield EmailAddress(text=text, detection_origin=DETECTION_ORIGIN)
|
|
||||||
else:
|
|
||||||
# increment the category depth by the paragraph increment in the shape
|
|
||||||
yield Title(
|
|
||||||
text=text,
|
|
||||||
metadata=self._opts.text_metadata(category_depth=depth),
|
|
||||||
detection_origin=DETECTION_ORIGIN,
|
|
||||||
)
|
|
||||||
depth += 1 # Cannot enumerate because we want to skip empty paragraphs
|
|
||||||
|
|
||||||
def _iter_shape_elements(self, shape: Shape) -> Iterator[Element]:
|
def _iter_shape_elements(self, shape: Shape) -> Iterator[Element]:
|
||||||
"""Generate Text or subtype element for each paragraph in `shape`."""
|
"""Generate Text or subtype element for each paragraph in `shape`."""
|
||||||
if self._shape_is_off_slide(shape):
|
if self._shape_is_off_slide(shape):
|
||||||
@ -280,17 +248,54 @@ class _PptxPartitioner:
|
|||||||
|
|
||||||
An empty table does not produce an element.
|
An empty table does not produce an element.
|
||||||
"""
|
"""
|
||||||
text_table = convert_ms_office_table_to_text(graphfrm.table, as_html=False).strip()
|
if not (rows := list(graphfrm.table.rows)):
|
||||||
if not text_table:
|
|
||||||
return
|
return
|
||||||
html_table = None
|
|
||||||
if self._opts.infer_table_structure:
|
html_text = htmlify_matrix_of_cell_texts(
|
||||||
html_table = convert_ms_office_table_to_text(graphfrm.table, as_html=True)
|
[[cell.text for cell in row.cells] for row in rows]
|
||||||
yield Table(
|
|
||||||
text=text_table,
|
|
||||||
metadata=self._opts.table_metadata(html_table),
|
|
||||||
detection_origin=DETECTION_ORIGIN,
|
|
||||||
)
|
)
|
||||||
|
html_table = HtmlTable.from_html_text(html_text)
|
||||||
|
|
||||||
|
if not html_table.text:
|
||||||
|
return
|
||||||
|
|
||||||
|
metadata = self._opts.table_metadata(
|
||||||
|
html_table.html if self._opts.infer_table_structure else None
|
||||||
|
)
|
||||||
|
|
||||||
|
yield Table(text=html_table.text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
|
||||||
|
|
||||||
|
def _iter_title_shape_element(self, shape: Shape) -> Iterator[Element]:
|
||||||
|
"""Generate Title element for each paragraph in title `shape`.
|
||||||
|
|
||||||
|
Text is most likely a title, but in the rare case that the title shape was used
|
||||||
|
for the slide body text, also check for bulleted paragraphs."""
|
||||||
|
if self._shape_is_off_slide(shape):
|
||||||
|
return
|
||||||
|
|
||||||
|
depth = 0
|
||||||
|
for paragraph in shape.text_frame.paragraphs:
|
||||||
|
text = paragraph.text
|
||||||
|
if text.strip() == "":
|
||||||
|
continue
|
||||||
|
|
||||||
|
if self._is_bulleted_paragraph(paragraph):
|
||||||
|
bullet_depth = paragraph.level or 0
|
||||||
|
yield ListItem(
|
||||||
|
text=text,
|
||||||
|
metadata=self._opts.text_metadata(category_depth=bullet_depth),
|
||||||
|
detection_origin=DETECTION_ORIGIN,
|
||||||
|
)
|
||||||
|
elif is_email_address(text):
|
||||||
|
yield EmailAddress(text=text, detection_origin=DETECTION_ORIGIN)
|
||||||
|
else:
|
||||||
|
# increment the category depth by the paragraph increment in the shape
|
||||||
|
yield Title(
|
||||||
|
text=text,
|
||||||
|
metadata=self._opts.text_metadata(category_depth=depth),
|
||||||
|
detection_origin=DETECTION_ORIGIN,
|
||||||
|
)
|
||||||
|
depth += 1 # Cannot enumerate because we want to skip empty paragraphs
|
||||||
|
|
||||||
def _order_shapes(self, slide: Slide) -> tuple[Shape | None, Sequence[BaseShape]]:
|
def _order_shapes(self, slide: Slide) -> tuple[Shape | None, Sequence[BaseShape]]:
|
||||||
"""Orders the shapes on `slide` from top to bottom and left to right.
|
"""Orders the shapes on `slide` from top to bottom and left to right.
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user