From 0e2c21e5a26cf9f75b43f16df41f18febb2a52ff Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Wed, 8 Nov 2023 11:05:19 -0800 Subject: [PATCH] fix: handle sectionless-docx in the general case (#1829) A DOCX document that has no sections can still contain one or more tables. Such files are never created by Word but Word can open them just fine. These can be and are generated by other applications. Use the newly-added `Document.iter_inner_content()` method added upstream in `python-docx` to capture both paragraphs and tables from a section-less DOCX document. This generalizes the fix for MS Teams chat-transcripts (an example of sectionless-docx) implemented in #1825. --- CHANGELOG.md | 3 +- example-docs/teams_chat.docx | Bin 1270 -> 1359 bytes test_unstructured/partition/docx/test_docx.py | 6 +- typings/docx/blkcntnr.pyi | 3 +- typings/docx/document.pyi | 12 +++- typings/docx/table.pyi | 6 +- unstructured/__version__.py | 2 +- unstructured/partition/docx.py | 57 +++++++++++++----- 8 files changed, 62 insertions(+), 27 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 434c33f94..4923711c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.30-dev3 +## 0.10.30-dev4 ### Enhancements @@ -12,6 +12,7 @@ ### Fixes * **Fix ingest partition parameters not being passed to the api.** When using the --partition-by-api flag via unstructured-ingest, none of the partition arguments are forwarded, meaning that these options are disregarded. With this change, we now pass through all of the relevant partition arguments to the api. This allows a user to specify all of the same partition arguments they would locally and have them respected when specifying --partition-by-api. +* **Support tables in section-less DOCX.** Generalize solution for MS Chat Transcripts exported as DOCX by including tables in the partitioned output when present. 
## 0.10.29 diff --git a/example-docs/teams_chat.docx b/example-docs/teams_chat.docx index b4424bd0c153786966517d53477e0b0592532f23..fb3394f91ea6648eb01d53daf4af83aa34d7aabb 100644 GIT binary patch delta 433 zcmeyyd7eumz?+#xgn@y9gP}Y&J$y;cRd#zu28R926D4Hoiza)ef^caCHv=QfSD-jp z>Du7){>=swdq0Pp*j6lNy(8n8&?)8n<-x&<4;)*Vg>FsSCjD;z_P2AGA2qsMK3#pk z|N8a!-k0C1zdq3Db(L%0K>=2QfH=-epRAgz_kK>^advNsN5_h9Jw?1e%;6^|9CUka zcQ7K9H?_6iY0>gKPs*G&IZ10}+i-YRX=<&Saxq!9ujv8X=0#5pzB3rCTB_jAE&I~{ zL}KN!zLz#0XB-(n{9PS&pUK;K3tzLy?KeBNO{uu{TTOzs{Mj4U3yG8XoRf9{Vx>x_-s8}#(JC_aD z%D!C->&qVB$OyNW-Zbw{M(&p{p*mLeNALV*+HA=9iir<3yeDh2XfXo=dvX|y6=T=r Zi7XaOcbO(HU=p4DfJJ~Ug9R8x3;>*q!)gEk delta 339 zcmX@l^^H>^z?+#xgn@y9gJEVsNO)Q%W9c&>kBw=fgiO8RW}{RPF0J5ZU}X8q$iM(r z+B?zE|FD5T>-Rrgrpoqbt#32Qzo5=!*Fsj|DGl#>q)P8!2)20QUvfF^Yl!CMxdJ5$g$}>7vR()@t+n`m zmqEkUSggL}jckvzp3B+b@7v4Y|CoPsuA_qf$EB*z-byRWE3lU(`#n+*a@uY2 z*;ZRVKWKf#&bTSRLY00jAcR*Z&|U$Iy)woF!LRb%640R|fb0FV2MUH||9 diff --git a/test_unstructured/partition/docx/test_docx.py b/test_unstructured/partition/docx/test_docx.py index 6b084a4fd..25ceb556c 100644 --- a/test_unstructured/partition/docx/test_docx.py +++ b/test_unstructured/partition/docx/test_docx.py @@ -113,12 +113,14 @@ class Describe_DocxPartitioner: def test_parition_docx_from_team_chat(): + """Docx with no sections partitions recognizing both paragraphs and tables.""" elements = cast(List[Text], partition_docx(example_doc_path("teams_chat.docx"))) - assert [element.text for element in elements] == [ + assert [e.text for e in elements] == [ "0:0:0.0 --> 0:0:1.510\nSome Body\nOK. 
Yeah.", "0:0:3.270 --> 0:0:4.250\nJames Bond\nUmm.", + "saved-by Dennis Forsythe", ] - assert all(element.category == "UncategorizedText" for element in elements) + assert [e.category for e in elements] == ["UncategorizedText", "UncategorizedText", "Table"] def test_partition_docx_from_filename( diff --git a/typings/docx/blkcntnr.pyi b/typings/docx/blkcntnr.pyi index 9e09ea8c2..76c3e69d6 100644 --- a/typings/docx/blkcntnr.pyi +++ b/typings/docx/blkcntnr.pyi @@ -1,4 +1,4 @@ -from typing import Sequence +from typing import Iterator, Sequence from docx.oxml.xmlchemy import BaseOxmlElement from docx.table import Table @@ -6,6 +6,7 @@ from docx.text.paragraph import Paragraph class BlockItemContainer: _element: BaseOxmlElement + def iter_inner_content(self) -> Iterator[Paragraph | Table]: ... @property def paragraphs(self) -> Sequence[Paragraph]: ... @property diff --git a/typings/docx/document.pyi b/typings/docx/document.pyi index 086e9a72c..27c790dde 100644 --- a/typings/docx/document.pyi +++ b/typings/docx/document.pyi @@ -1,13 +1,14 @@ -from typing import IO +from typing import IO, Iterator, List -from docx.blkcntnr import BlockItemContainer from docx.oxml.document import CT_Document from docx.section import Sections from docx.settings import Settings +from docx.shared import ElementProxy from docx.styles.style import ParagraphStyle +from docx.table import Table from docx.text.paragraph import Paragraph -class Document(BlockItemContainer): +class Document(ElementProxy): def add_paragraph( self, text: str = "", @@ -15,6 +16,11 @@ class Document(BlockItemContainer): ) -> Paragraph: ... @property def element(self) -> CT_Document: ... + def iter_inner_content(self) -> Iterator[Paragraph | Table]: ... + @property + def paragraphs(self) -> List[Paragraph]: ... + @property + def tables(self) -> List[Table]: ... def save(self, path_or_stream: str | IO[bytes]) -> None: ... @property def sections(self) -> Sections: ... 
diff --git a/typings/docx/table.pyi b/typings/docx/table.pyi index 22296ae91..de8036ba6 100644 --- a/typings/docx/table.pyi +++ b/typings/docx/table.pyi @@ -1,15 +1,11 @@ -from typing import Iterator, Sequence +from typing import Sequence from docx.blkcntnr import BlockItemContainer from docx.oxml.table import CT_Tbl, CT_Tc from docx.shared import Parented -from docx.text.paragraph import Paragraph class _Cell(BlockItemContainer): _tc: CT_Tc - def iter_inner_content(self) -> Iterator[Paragraph | Table]: ... - @property - def paragraphs(self) -> Sequence[Paragraph]: ... @property def text(self) -> str: ... diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 8f3ca3937..ce3efec6e 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.30-dev3" # pragma: no cover +__version__ = "0.10.30-dev4" # pragma: no cover diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py index f903df22f..ffaab3556 100644 --- a/unstructured/partition/docx.py +++ b/unstructured/partition/docx.py @@ -263,14 +263,23 @@ class _DocxPartitioner: metadata_last_modified: Optional[str] = None, ) -> Iterator[Element]: """Partition MS Word documents (.docx format) into its document elements.""" - return cls( - filename, - file, - metadata_filename, - include_page_breaks, - infer_table_structure, - metadata_last_modified, - )._iter_document_elements() + self = cls( + filename=filename, + file=file, + metadata_filename=metadata_filename, + include_page_breaks=include_page_breaks, + infer_table_structure=infer_table_structure, + metadata_last_modified=metadata_last_modified, + ) + # NOTE(scanny): It's possible for a Word document to have no sections. In particular, a + # Microsoft Teams chat transcript exported to DOCX contains no sections. Such a + # "section-less" document has to be iterated differently and has no headers or footers and + # therefore no page-size or margins. 
+ return ( + self._iter_document_elements() + if self._document_contains_sections + else self._iter_sectionless_document_elements() + ) def _iter_document_elements(self) -> Iterator[Element]: """Generate each document-element in (docx) `document` in document order.""" @@ -285,11 +294,6 @@ class _DocxPartitioner: # -- concept of what it's doing. You can see the same pattern repeating in the "sub" # -- functions like `._iter_paragraph_elements()` where the "just return when done" # -- characteristic of a generator avoids repeated code to form interim results into lists. - - if not self._document.sections: - for paragraph in self._document.paragraphs: - yield from self._iter_paragraph_elements(paragraph) - for section_idx, section in enumerate(self._document.sections): yield from self._iter_section_page_breaks(section_idx, section) yield from self._iter_section_headers(section) @@ -308,6 +312,21 @@ class _DocxPartitioner: yield from self._iter_section_footers(section) + def _iter_sectionless_document_elements(self) -> Iterator[Element]: + """Generate each document-element in a docx `document` that has no sections. + + A "section-less" DOCX must be iterated differently. Also it will have no headers or footers + (because those live in a section). + """ + for block_item in self._document.iter_inner_content(): + if isinstance(block_item, Paragraph): + yield from self._iter_paragraph_elements(block_item) + # -- a paragraph can contain a page-break -- + yield from self._iter_maybe_paragraph_page_breaks(block_item) + # -- can only be a Paragraph or Table so far but more types may come later -- + elif isinstance(block_item, DocxTable): # pyright: ignore[reportUnnecessaryIsInstance] + yield from self._iter_table_element(block_item) + def _convert_table_to_html(self, table: DocxTable, is_nested: bool = False) -> str: """HTML string version of `table`. 
@@ -393,7 +412,17 @@ class _DocxPartitioner: @lazyproperty def _document_contains_pagebreaks(self) -> bool: """True when there is at least one page-break detected in the document.""" - return self._element_contains_pagebreak(self._document._element) + return self._element_contains_pagebreak(self._document.element) + + @lazyproperty + def _document_contains_sections(self) -> bool: + """True when there is at least one section in the document. + + This is always true for a document produced by Word, but may not always be the case when the + document results from conversion or export. In particular, a Microsoft Teams chat-transcript + export will have no sections. + """ + return bool(self._document.sections) def _element_contains_pagebreak(self, element: BaseOxmlElement) -> bool: """True when `element` contains a page break.