From 0e2c21e5a26cf9f75b43f16df41f18febb2a52ff Mon Sep 17 00:00:00 2001
From: Steve Canny <stcanny@gmail.com>
Date: Wed, 8 Nov 2023 11:05:19 -0800
Subject: [PATCH] fix: handle sectionless-docx in the general case (#1829)

A DOCX document that has no sections can still contain one or more
tables. Such files are never created by Word but Word can open them just
fine. These can be and are generated by other applications.

Use the `Document.iter_inner_content()` method newly added upstream in
`python-docx` to capture both paragraphs and tables from a section-less
DOCX document.

This generalizes the fix for MS Teams chat-transcripts (an example of
sectionless-docx) implemented in #1825.
---
 CHANGELOG.md                                  |   3 +-
 example-docs/teams_chat.docx                  | Bin 1270 -> 1359 bytes
 test_unstructured/partition/docx/test_docx.py |   6 +-
 typings/docx/blkcntnr.pyi                     |   3 +-
 typings/docx/document.pyi                     |  12 +++-
 typings/docx/table.pyi                        |   6 +-
 unstructured/__version__.py                   |   2 +-
 unstructured/partition/docx.py                |  57 +++++++++++++-----
 8 files changed, 62 insertions(+), 27 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 434c33f94..4923711c3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.10.30-dev3
+## 0.10.30-dev4
 
 ### Enhancements
 
@@ -12,6 +12,7 @@
 ### Fixes
 
 * **Fix ingest partition parameters not being passed to the api.** When using the --partition-by-api flag via unstructured-ingest, none of the partition arguments are forwarded, meaning that these options are disregarded. With this change, we now pass through all of the relevant partition arguments to the api. This allows a user to specify all of the same partition arguments they would locally and have them respected when specifying --partition-by-api.
+* **Support tables in section-less DOCX.** Generalize the solution for MS Teams chat transcripts exported as DOCX by including tables in the partitioned output when present.
 
 ## 0.10.29
 
diff --git a/example-docs/teams_chat.docx b/example-docs/teams_chat.docx
index b4424bd0c153786966517d53477e0b0592532f23..fb3394f91ea6648eb01d53daf4af83aa34d7aabb 100644
GIT binary patch
delta 433
zcmeyyd7eumz?+#xgn@y9gP}Y&J$y;cRd#zu28R926D4Hoiza)ef^caCHv=QfSD-jp
z>Du7){>=swdq0Pp*j6lNy(8n8&?)8n<-x&<4;)*Vg>FsSCjD;z_P2AGA2qsMK3#pk
z|N8a!-k0C1zdq3Db(L%0K>=2QfH=-epRAgz_kK>^advNsN5_h9Jw?1e%;6^|9CUka
zcQ7K9H?_6iY0>gKPs*G&IZ10}+i-YRX=<&Saxq!9ujv8X=0#5pzB3rCTB_jAE&I~{
zL}KN!zLz#0XB-(n{9PS&pUK;K3tzLy?KeBNO{uu{TTOzs{Mj<qqwDoKYA!5toSfvn
zPdD<0{<oUmMQ4wE<#K)Px^6zV+tiI`PfmJOeoV~jTB`Y{oEs67b56RfHEXQC*my9n
za%*RD$Mk1&TURdGn_@QemD6(PKe<yDyRUlW&>4U3yG8XoRf9{Vx>x_-s8}#(JC_aD
z%D!C->&qVB$OyNW-Zbw{M(&p{p*mLeNALV*+HA=9iir<3yeDh2XfXo=dvX|y6=T=r
Zi7XaOcbO(HU=p4DfJJ~Ug9R8x3;>*q!)gEk

delta 339
zcmX@l^^H>^z?+#xgn@y9gJEVsNO)Q%W9c&>kBw=fgiO8RW}{RPF0J5ZU}X8q$iM(r
z+B?zE|FD5T>-Rrgr<tR3)VDM^P4en7C|MyPareh8msdBZy%qZLeQU5*pxC15eaoxM
z>poqbt#32Qzo5=!*Fsj|DGl#>q)P8!2)20QUvfF^Yl!CMxdJ5$g$}>7vR()@t+n`m
zmqEkUSggL}jckvz<f0PW`8~ncX7cUn|NY^Fx1yZIsldx-$|`wxrN7QC5c}q2yj%Oc
z)|N$GM_EpO-fd?yS%rI>p3B+b@7v4Y|CoPsuA_qf$EB*z-byRWE3lU(`#n+*a@uY2
z*;ZRVKWKf#&bTSRLY00<M6XtTSAOKbu=<Wa6^mZG-hRUqu(_M@B@-WNuuN`Z(P9RM
c$>jAcR*Z&|U$Iy)woF!LRb%640R|fb0FV2MUH||9

diff --git a/test_unstructured/partition/docx/test_docx.py b/test_unstructured/partition/docx/test_docx.py
index 6b084a4fd..25ceb556c 100644
--- a/test_unstructured/partition/docx/test_docx.py
+++ b/test_unstructured/partition/docx/test_docx.py
@@ -113,12 +113,14 @@ class Describe_DocxPartitioner:
 
 
 def test_parition_docx_from_team_chat():
+    """Docx with no sections partitions recognizing both paragraphs and tables."""
     elements = cast(List[Text], partition_docx(example_doc_path("teams_chat.docx")))
-    assert [element.text for element in elements] == [
+    assert [e.text for e in elements] == [
         "0:0:0.0 --> 0:0:1.510\nSome Body\nOK. Yeah.",
         "0:0:3.270 --> 0:0:4.250\nJames Bond\nUmm.",
+        "saved-by  Dennis Forsythe",
     ]
-    assert all(element.category == "UncategorizedText" for element in elements)
+    assert [e.category for e in elements] == ["UncategorizedText", "UncategorizedText", "Table"]
 
 
 def test_partition_docx_from_filename(
diff --git a/typings/docx/blkcntnr.pyi b/typings/docx/blkcntnr.pyi
index 9e09ea8c2..76c3e69d6 100644
--- a/typings/docx/blkcntnr.pyi
+++ b/typings/docx/blkcntnr.pyi
@@ -1,4 +1,4 @@
-from typing import Sequence
+from typing import Iterator, Sequence
 
 from docx.oxml.xmlchemy import BaseOxmlElement
 from docx.table import Table
@@ -6,6 +6,7 @@ from docx.text.paragraph import Paragraph
 
 class BlockItemContainer:
     _element: BaseOxmlElement
+    def iter_inner_content(self) -> Iterator[Paragraph | Table]: ...
     @property
     def paragraphs(self) -> Sequence[Paragraph]: ...
     @property
diff --git a/typings/docx/document.pyi b/typings/docx/document.pyi
index 086e9a72c..27c790dde 100644
--- a/typings/docx/document.pyi
+++ b/typings/docx/document.pyi
@@ -1,13 +1,14 @@
-from typing import IO
+from typing import IO, Iterator, List
 
-from docx.blkcntnr import BlockItemContainer
 from docx.oxml.document import CT_Document
 from docx.section import Sections
 from docx.settings import Settings
+from docx.shared import ElementProxy
 from docx.styles.style import ParagraphStyle
+from docx.table import Table
 from docx.text.paragraph import Paragraph
 
-class Document(BlockItemContainer):
+class Document(ElementProxy):
     def add_paragraph(
         self,
         text: str = "",
@@ -15,6 +16,11 @@ class Document(BlockItemContainer):
     ) -> Paragraph: ...
     @property
     def element(self) -> CT_Document: ...
+    def iter_inner_content(self) -> Iterator[Paragraph | Table]: ...
+    @property
+    def paragraphs(self) -> List[Paragraph]: ...
+    @property
+    def tables(self) -> List[Table]: ...
     def save(self, path_or_stream: str | IO[bytes]) -> None: ...
     @property
     def sections(self) -> Sections: ...
diff --git a/typings/docx/table.pyi b/typings/docx/table.pyi
index 22296ae91..de8036ba6 100644
--- a/typings/docx/table.pyi
+++ b/typings/docx/table.pyi
@@ -1,15 +1,11 @@
-from typing import Iterator, Sequence
+from typing import Sequence
 
 from docx.blkcntnr import BlockItemContainer
 from docx.oxml.table import CT_Tbl, CT_Tc
 from docx.shared import Parented
-from docx.text.paragraph import Paragraph
 
 class _Cell(BlockItemContainer):
     _tc: CT_Tc
-    def iter_inner_content(self) -> Iterator[Paragraph | Table]: ...
-    @property
-    def paragraphs(self) -> Sequence[Paragraph]: ...
     @property
     def text(self) -> str: ...
 
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 8f3ca3937..ce3efec6e 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.10.30-dev3"  # pragma: no cover
+__version__ = "0.10.30-dev4"  # pragma: no cover
diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py
index f903df22f..ffaab3556 100644
--- a/unstructured/partition/docx.py
+++ b/unstructured/partition/docx.py
@@ -263,14 +263,23 @@ class _DocxPartitioner:
         metadata_last_modified: Optional[str] = None,
     ) -> Iterator[Element]:
         """Partition MS Word documents (.docx format) into its document elements."""
-        return cls(
-            filename,
-            file,
-            metadata_filename,
-            include_page_breaks,
-            infer_table_structure,
-            metadata_last_modified,
-        )._iter_document_elements()
+        self = cls(
+            filename=filename,
+            file=file,
+            metadata_filename=metadata_filename,
+            include_page_breaks=include_page_breaks,
+            infer_table_structure=infer_table_structure,
+            metadata_last_modified=metadata_last_modified,
+        )
+        # NOTE(scanny): It's possible for a Word document to have no sections. In particular, a
+        # Microsoft Teams chat transcript exported to DOCX contains no sections. Such a
+        # "section-less" document has to be iterated differently and has no headers or footers and
+        # therefore no page-size or margins.
+        return (
+            self._iter_document_elements()
+            if self._document_contains_sections
+            else self._iter_sectionless_document_elements()
+        )
 
     def _iter_document_elements(self) -> Iterator[Element]:
         """Generate each document-element in (docx) `document` in document order."""
@@ -285,11 +294,6 @@ class _DocxPartitioner:
         # -- concept of what it's doing. You can see the same pattern repeating in the "sub"
         # -- functions like `._iter_paragraph_elements()` where the "just return when done"
         # -- characteristic of a generator avoids repeated code to form interim results into lists.
-
-        if not self._document.sections:
-            for paragraph in self._document.paragraphs:
-                yield from self._iter_paragraph_elements(paragraph)
-
         for section_idx, section in enumerate(self._document.sections):
             yield from self._iter_section_page_breaks(section_idx, section)
             yield from self._iter_section_headers(section)
@@ -308,6 +312,21 @@ class _DocxPartitioner:
 
             yield from self._iter_section_footers(section)
 
+    def _iter_sectionless_document_elements(self) -> Iterator[Element]:
+        """Generate each document-element in a docx `document` that has no sections.
+
+        A "section-less" DOCX must be iterated differently. Also it will have no headers or footers
+        (because those live in a section).
+        """
+        for block_item in self._document.iter_inner_content():
+            if isinstance(block_item, Paragraph):
+                yield from self._iter_paragraph_elements(block_item)
+                # -- a paragraph can contain a page-break --
+                yield from self._iter_maybe_paragraph_page_breaks(block_item)
+            # -- can only be a Paragraph or Table so far but more types may come later --
+            elif isinstance(block_item, DocxTable):  # pyright: ignore[reportUnnecessaryIsInstance]
+                yield from self._iter_table_element(block_item)
+
     def _convert_table_to_html(self, table: DocxTable, is_nested: bool = False) -> str:
         """HTML string version of `table`.
 
@@ -393,7 +412,17 @@ class _DocxPartitioner:
     @lazyproperty
     def _document_contains_pagebreaks(self) -> bool:
         """True when there is at least one page-break detected in the document."""
-        return self._element_contains_pagebreak(self._document._element)
+        return self._element_contains_pagebreak(self._document.element)
+
+    @lazyproperty
+    def _document_contains_sections(self) -> bool:
+        """True when there is at least one section in the document.
+
+        This is always true for a document produced by Word, but may not always be the case when the
+        document results from conversion or export. In particular, a Microsoft Teams chat-transcript
+        export will have no sections.
+        """
+        return bool(self._document.sections)
 
     def _element_contains_pagebreak(self, element: BaseOxmlElement) -> bool:
         """True when `element` contains a page break.