fix: handle sectionless-docx in the general case (#1829)

A DOCX document that has no sections can still contain one or more tables. Such files are never created by Word, but Word can open them just fine, and they can be and are generated by other applications. Use the `Document.iter_inner_content()` method, newly added upstream in `python-docx`, to capture both paragraphs and tables from a section-less DOCX document. This generalizes the fix for MS Teams chat transcripts (an example of a section-less DOCX) implemented in #1825.
2025-09-30 10:53:59 +00:00 · 2023-11-08 11:05:19 -08:00 · 2023-11-08 11:05:19 -08:00 · 0e2c21e5a2
commit 0e2c21e5a2
parent 67fa7ad867
8 changed files with 62 additions and 27 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.10.30-dev3
+## 0.10.30-dev4
 ### Enhancements
@ -12,6 +12,7 @@
 ### Fixes
 * **Fix ingest partition parameters not being passed to the api.** When using the --partition-by-api flag via unstructured-ingest, none of the partition arguments are forwarded, meaning that these options are disregarded. With this change, we now pass through all of the relevant partition arguments to the api. This allows a user to specify all of the same partition arguments they would locally and have them respected when specifying --partition-by-api.
 * **Support tables in section-less DOCX.** Generalize solution for MS Chat Transcripts exported as DOCX by including tables in the partitioned output when present.
 ## 0.10.29
--- a/example-docs/teams_chat.docx
+++ b/example-docs/teams_chat.docx
--- a/test_unstructured/partition/docx/test_docx.py
+++ b/test_unstructured/partition/docx/test_docx.py
@ -113,12 +113,14 @@ class Describe_DocxPartitioner:
 def test_parition_docx_from_team_chat():
    """Docx with no sections partitions recognizing both paragraphs and tables."""
    elements = cast(List[Text], partition_docx(example_doc_path("teams_chat.docx")))
-    assert [element.text for element in elements] == [
+    assert [e.text for e in elements] == [
        "0:0:0.0 --> 0:0:1.510\nSome Body\nOK. Yeah.",
        "0:0:3.270 --> 0:0:4.250\nJames Bond\nUmm.",
        "saved-by  Dennis Forsythe",
    ]
-    assert all(element.category == "UncategorizedText" for element in elements)
+    assert [e.category for e in elements] == ["UncategorizedText", "UncategorizedText", "Table"]
 def test_partition_docx_from_filename(
--- a/typings/docx/blkcntnr.pyi
+++ b/typings/docx/blkcntnr.pyi
@ -1,4 +1,4 @@
-from typing import Sequence
+from typing import Iterator, Sequence
 from docx.oxml.xmlchemy import BaseOxmlElement
 from docx.table import Table
@ -6,6 +6,7 @@ from docx.text.paragraph import Paragraph
 class BlockItemContainer:
    _element: BaseOxmlElement
    def iter_inner_content(self) -> Iterator[Paragraph | Table]: ...
    @property
    def paragraphs(self) -> Sequence[Paragraph]: ...
    @property
--- a/typings/docx/document.pyi
+++ b/typings/docx/document.pyi
@ -1,13 +1,14 @@
-from typing import IO
+from typing import IO, Iterator, List
 from docx.blkcntnr import BlockItemContainer
 from docx.oxml.document import CT_Document
 from docx.section import Sections
 from docx.settings import Settings
 from docx.shared import ElementProxy
 from docx.styles.style import ParagraphStyle
 from docx.table import Table
 from docx.text.paragraph import Paragraph
-class Document(BlockItemContainer):
+class Document(ElementProxy):
    def add_paragraph(
        self,
        text: str = "",
@ -15,6 +16,11 @@ class Document(BlockItemContainer):
    ) -> Paragraph: ...
    @property
    def element(self) -> CT_Document: ...
    def iter_inner_content(self) -> Iterator[Paragraph | Table]: ...
    @property
    def paragraphs(self) -> List[Paragraph]: ...
    @property
    def tables(self) -> List[Table]: ...
    def save(self, path_or_stream: str | IO[bytes]) -> None: ...
    @property
    def sections(self) -> Sections: ...
--- a/typings/docx/table.pyi
+++ b/typings/docx/table.pyi
@ -1,15 +1,11 @@
-from typing import Iterator, Sequence
+from typing import Sequence
 from docx.blkcntnr import BlockItemContainer
 from docx.oxml.table import CT_Tbl, CT_Tc
 from docx.shared import Parented
 from docx.text.paragraph import Paragraph
 class _Cell(BlockItemContainer):
    _tc: CT_Tc
    def iter_inner_content(self) -> Iterator[Paragraph | Table]: ...
    @property
    def paragraphs(self) -> Sequence[Paragraph]: ...
    @property
    def text(self) -> str: ...
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.10.30-dev3"  # pragma: no cover
+__version__ = "0.10.30-dev4"  # pragma: no cover
--- a/unstructured/partition/docx.py
+++ b/unstructured/partition/docx.py
@ -263,14 +263,23 @@ class _DocxPartitioner:
        metadata_last_modified: Optional[str] = None,
    ) -> Iterator[Element]:
        """Partition MS Word documents (.docx format) into its document elements."""
-        return cls(
+        self = cls(
-            filename,
+            filename=filename,
-            file,
+            file=file,
-            metadata_filename,
+            metadata_filename=metadata_filename,
-            include_page_breaks,
+            include_page_breaks=include_page_breaks,
-            infer_table_structure,
+            infer_table_structure=infer_table_structure,
-            metadata_last_modified,
+            metadata_last_modified=metadata_last_modified,
-        )._iter_document_elements()
+        )
        # NOTE(scanny): It's possible for a Word document to have no sections. In particular, a
        # Microsoft Teams chat transcript exported to DOCX contains no sections. Such a
        # "section-less" document has to be iterated differently. It has no headers or footers
        # and therefore no page-size or margins.
        return (
            self._iter_document_elements()
            if self._document_contains_sections
            else self._iter_sectionless_document_elements()
        )
    def _iter_document_elements(self) -> Iterator[Element]:
        """Generate each document-element in (docx) `document` in document order."""
@ -285,11 +294,6 @@ class _DocxPartitioner:
        # -- concept of what it's doing. You can see the same pattern repeating in the "sub"
        # -- functions like `._iter_paragraph_elements()` where the "just return when done"
        # -- characteristic of a generator avoids repeated code to form interim results into lists.
        if not self._document.sections:
            for paragraph in self._document.paragraphs:
                yield from self._iter_paragraph_elements(paragraph)
        for section_idx, section in enumerate(self._document.sections):
            yield from self._iter_section_page_breaks(section_idx, section)
            yield from self._iter_section_headers(section)
@ -308,6 +312,21 @@ class _DocxPartitioner:
            yield from self._iter_section_footers(section)
    def _iter_sectionless_document_elements(self) -> Iterator[Element]:
        """Generate each document-element in a docx `document` that has no sections.

        A "section-less" DOCX must be iterated differently. Also it will have no headers or footers
        (because those live in a section).

        Block items are visited in the order `self._document.iter_inner_content()` produces them.
        Each paragraph contributes its paragraph elements followed by any page-break elements
        detected for it; each table contributes its table element. A block item of any other type
        contributes nothing because there is no fall-through branch.
        """
        for block_item in self._document.iter_inner_content():
            if isinstance(block_item, Paragraph):
                yield from self._iter_paragraph_elements(block_item)
                # -- a paragraph can contain a page-break --
                yield from self._iter_maybe_paragraph_page_breaks(block_item)
            # -- can only be a Paragraph or Table so far but more types may come later --
            # -- the explicit `elif` (rather than `else`) means an unrecognized block-item type is
            # -- skipped silently instead of being mistaken for a table
            elif isinstance(block_item, DocxTable):  # pyright: ignore[reportUnnecessaryIsInstance]
                yield from self._iter_table_element(block_item)
    def _convert_table_to_html(self, table: DocxTable, is_nested: bool = False) -> str:
        """HTML string version of `table`.
@ -393,7 +412,17 @@ class _DocxPartitioner:
    @lazyproperty
    def _document_contains_pagebreaks(self) -> bool:
        """True when there is at least one page-break detected in the document."""
-        return self._element_contains_pagebreak(self._document._element)
+        return self._element_contains_pagebreak(self._document.element)
    @lazyproperty
    def _document_contains_sections(self) -> bool:
        """True when there is at least one section in the document.

        This is always true for a document produced by Word, but may not always be the case when the
        document results from conversion or export. In particular, a Microsoft Teams chat-transcript
        export will have no sections.
        """
        # -- relies on the truthiness of the `sections` collection; presumably it is falsy when
        # -- the document defines no sections -- TODO confirm against python-docx `Sections`
        return bool(self._document.sections)
    def _element_contains_pagebreak(self, element: BaseOxmlElement) -> bool:
        """True when `element` contains a page break.
`@ -1 +1 @@`
	`__version__ = "0.10.30-dev3" # pragma: no cover`	`__version__ = "0.10.30-dev4" # pragma: no cover`