mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
fix: handle sectionless-docx in the general case (#1829)
A DOCX document that has no sections can still contain one or more tables. Such files are never created by Word, but Word can open them just fine; they can be, and are, generated by other applications. Use the `Document.iter_inner_content()` method newly added upstream in `python-docx` to capture both paragraphs and tables from a section-less DOCX document. This generalizes the fix for MS Teams chat-transcripts (an example of sectionless-docx) implemented in #1825.
This commit is contained in:
parent
67fa7ad867
commit
0e2c21e5a2
@ -1,4 +1,4 @@
|
||||
## 0.10.30-dev3
|
||||
## 0.10.30-dev4
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -12,6 +12,7 @@
|
||||
### Fixes
|
||||
|
||||
* **Fix ingest partition parameters not being passed to the api.** When using the --partition-by-api flag via unstructured-ingest, none of the partition arguments are forwarded, meaning that these options are disregarded. With this change, we now pass through all of the relevant partition arguments to the api. This allows a user to specify all of the same partition arguments they would locally and have them respected when specifying --partition-by-api.
|
||||
* **Support tables in section-less DOCX.** Generalize the solution for MS Teams chat transcripts exported as DOCX by including tables in the partitioned output when present.
|
||||
|
||||
## 0.10.29
|
||||
|
||||
|
Binary file not shown.
@ -113,12 +113,14 @@ class Describe_DocxPartitioner:
|
||||
|
||||
|
||||
def test_parition_docx_from_team_chat():
|
||||
"""Docx with no sections partitions recognizing both paragraphs and tables."""
|
||||
elements = cast(List[Text], partition_docx(example_doc_path("teams_chat.docx")))
|
||||
assert [element.text for element in elements] == [
|
||||
assert [e.text for e in elements] == [
|
||||
"0:0:0.0 --> 0:0:1.510\nSome Body\nOK. Yeah.",
|
||||
"0:0:3.270 --> 0:0:4.250\nJames Bond\nUmm.",
|
||||
"saved-by Dennis Forsythe",
|
||||
]
|
||||
assert all(element.category == "UncategorizedText" for element in elements)
|
||||
assert [e.category for e in elements] == ["UncategorizedText", "UncategorizedText", "Table"]
|
||||
|
||||
|
||||
def test_partition_docx_from_filename(
|
||||
|
@ -1,4 +1,4 @@
|
||||
from typing import Sequence
|
||||
from typing import Iterator, Sequence
|
||||
|
||||
from docx.oxml.xmlchemy import BaseOxmlElement
|
||||
from docx.table import Table
|
||||
@ -6,6 +6,7 @@ from docx.text.paragraph import Paragraph
|
||||
|
||||
class BlockItemContainer:
|
||||
_element: BaseOxmlElement
|
||||
def iter_inner_content(self) -> Iterator[Paragraph | Table]: ...
|
||||
@property
|
||||
def paragraphs(self) -> Sequence[Paragraph]: ...
|
||||
@property
|
||||
|
@ -1,13 +1,14 @@
|
||||
from typing import IO
|
||||
from typing import IO, Iterator, List
|
||||
|
||||
from docx.blkcntnr import BlockItemContainer
|
||||
from docx.oxml.document import CT_Document
|
||||
from docx.section import Sections
|
||||
from docx.settings import Settings
|
||||
from docx.shared import ElementProxy
|
||||
from docx.styles.style import ParagraphStyle
|
||||
from docx.table import Table
|
||||
from docx.text.paragraph import Paragraph
|
||||
|
||||
class Document(BlockItemContainer):
|
||||
class Document(ElementProxy):
|
||||
def add_paragraph(
|
||||
self,
|
||||
text: str = "",
|
||||
@ -15,6 +16,11 @@ class Document(BlockItemContainer):
|
||||
) -> Paragraph: ...
|
||||
@property
|
||||
def element(self) -> CT_Document: ...
|
||||
def iter_inner_content(self) -> Iterator[Paragraph | Table]: ...
|
||||
@property
|
||||
def paragraphs(self) -> List[Paragraph]: ...
|
||||
@property
|
||||
def tables(self) -> List[Table]: ...
|
||||
def save(self, path_or_stream: str | IO[bytes]) -> None: ...
|
||||
@property
|
||||
def sections(self) -> Sections: ...
|
||||
|
@ -1,15 +1,11 @@
|
||||
from typing import Iterator, Sequence
|
||||
from typing import Sequence
|
||||
|
||||
from docx.blkcntnr import BlockItemContainer
|
||||
from docx.oxml.table import CT_Tbl, CT_Tc
|
||||
from docx.shared import Parented
|
||||
from docx.text.paragraph import Paragraph
|
||||
|
||||
class _Cell(BlockItemContainer):
|
||||
_tc: CT_Tc
|
||||
def iter_inner_content(self) -> Iterator[Paragraph | Table]: ...
|
||||
@property
|
||||
def paragraphs(self) -> Sequence[Paragraph]: ...
|
||||
@property
|
||||
def text(self) -> str: ...
|
||||
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.10.30-dev3" # pragma: no cover
|
||||
__version__ = "0.10.30-dev4" # pragma: no cover
|
||||
|
@ -263,14 +263,23 @@ class _DocxPartitioner:
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
) -> Iterator[Element]:
|
||||
"""Partition MS Word documents (.docx format) into its document elements."""
|
||||
return cls(
|
||||
filename,
|
||||
file,
|
||||
metadata_filename,
|
||||
include_page_breaks,
|
||||
infer_table_structure,
|
||||
metadata_last_modified,
|
||||
)._iter_document_elements()
|
||||
self = cls(
|
||||
filename=filename,
|
||||
file=file,
|
||||
metadata_filename=metadata_filename,
|
||||
include_page_breaks=include_page_breaks,
|
||||
infer_table_structure=infer_table_structure,
|
||||
metadata_last_modified=metadata_last_modified,
|
||||
)
|
||||
# NOTE(scanny): It's possible for a Word document to have no sections. In particular, a
|
||||
# Microsoft Teams chat transcript exported to DOCX contains no sections. Such a
|
||||
# "section-less" document has to be iterated differently and has no headers or footers and
|
||||
# therefore no page-size or margins.
|
||||
return (
|
||||
self._iter_document_elements()
|
||||
if self._document_contains_sections
|
||||
else self._iter_sectionless_document_elements()
|
||||
)
|
||||
|
||||
def _iter_document_elements(self) -> Iterator[Element]:
|
||||
"""Generate each document-element in (docx) `document` in document order."""
|
||||
@ -285,11 +294,6 @@ class _DocxPartitioner:
|
||||
# -- concept of what it's doing. You can see the same pattern repeating in the "sub"
|
||||
# -- functions like `._iter_paragraph_elements()` where the "just return when done"
|
||||
# -- characteristic of a generator avoids repeated code to form interim results into lists.
|
||||
|
||||
if not self._document.sections:
|
||||
for paragraph in self._document.paragraphs:
|
||||
yield from self._iter_paragraph_elements(paragraph)
|
||||
|
||||
for section_idx, section in enumerate(self._document.sections):
|
||||
yield from self._iter_section_page_breaks(section_idx, section)
|
||||
yield from self._iter_section_headers(section)
|
||||
@ -308,6 +312,21 @@ class _DocxPartitioner:
|
||||
|
||||
yield from self._iter_section_footers(section)
|
||||
|
||||
def _iter_sectionless_document_elements(self) -> Iterator[Element]:
|
||||
"""Generate each document-element in a docx `document` that has no sections.
|
||||
|
||||
A "section-less" DOCX must be iterated differently. Also it will have no headers or footers
|
||||
(because those live in a section).
|
||||
"""
|
||||
for block_item in self._document.iter_inner_content():
|
||||
if isinstance(block_item, Paragraph):
|
||||
yield from self._iter_paragraph_elements(block_item)
|
||||
# -- a paragraph can contain a page-break --
|
||||
yield from self._iter_maybe_paragraph_page_breaks(block_item)
|
||||
# -- can only be a Paragraph or Table so far but more types may come later --
|
||||
elif isinstance(block_item, DocxTable): # pyright: ignore[reportUnnecessaryIsInstance]
|
||||
yield from self._iter_table_element(block_item)
|
||||
|
||||
def _convert_table_to_html(self, table: DocxTable, is_nested: bool = False) -> str:
|
||||
"""HTML string version of `table`.
|
||||
|
||||
@ -393,7 +412,17 @@ class _DocxPartitioner:
|
||||
@lazyproperty
|
||||
def _document_contains_pagebreaks(self) -> bool:
|
||||
"""True when there is at least one page-break detected in the document."""
|
||||
return self._element_contains_pagebreak(self._document._element)
|
||||
return self._element_contains_pagebreak(self._document.element)
|
||||
|
||||
@lazyproperty
|
||||
def _document_contains_sections(self) -> bool:
|
||||
"""True when there is at least one section in the document.
|
||||
|
||||
This is always true for a document produced by Word, but may not always be the case when the
|
||||
document results from conversion or export. In particular, a Microsoft Teams chat-transcript
|
||||
export will have no sections.
|
||||
"""
|
||||
return bool(self._document.sections)
|
||||
|
||||
def _element_contains_pagebreak(self, element: BaseOxmlElement) -> bool:
|
||||
"""True when `element` contains a page break.
|
||||
|
Loading…
x
Reference in New Issue
Block a user