mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
fix: handle sectionless-docx in the general case (#1829)
A DOCX document that has no sections can still contain one or more tables. Such files are never created by Word but Word can open them just fine. These can be and are generated by other applications. Use the `Document.iter_inner_content()` method recently added upstream in `python-docx` to capture both paragraphs and tables from a section-less DOCX document. This generalizes the fix for MS Teams chat-transcripts (an example of sectionless-docx) implemented in #1825.
This commit is contained in:
parent
67fa7ad867
commit
0e2c21e5a2
@ -1,4 +1,4 @@
|
|||||||
## 0.10.30-dev3
|
## 0.10.30-dev4
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
@ -12,6 +12,7 @@
|
|||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
* **Fix ingest partition parameters not being passed to the api.** When using the --partition-by-api flag via unstructured-ingest, none of the partition arguments are forwarded, meaning that these options are disregarded. With this change, we now pass through all of the relevant partition arguments to the api. This allows a user to specify all of the same partition arguments they would locally and have them respected when specifying --partition-by-api.
|
* **Fix ingest partition parameters not being passed to the api.** When using the --partition-by-api flag via unstructured-ingest, none of the partition arguments are forwarded, meaning that these options are disregarded. With this change, we now pass through all of the relevant partition arguments to the api. This allows a user to specify all of the same partition arguments they would locally and have them respected when specifying --partition-by-api.
|
||||||
|
* **Support tables in section-less DOCX.** Generalize solution for MS Chat Transcripts exported as DOCX by including tables in the partitioned output when present.
|
||||||
|
|
||||||
## 0.10.29
|
## 0.10.29
|
||||||
|
|
||||||
|
Binary file not shown.
@ -113,12 +113,14 @@ class Describe_DocxPartitioner:
|
|||||||
|
|
||||||
|
|
||||||
def test_parition_docx_from_team_chat():
|
def test_parition_docx_from_team_chat():
|
||||||
|
"""Docx with no sections partitions recognizing both paragraphs and tables."""
|
||||||
elements = cast(List[Text], partition_docx(example_doc_path("teams_chat.docx")))
|
elements = cast(List[Text], partition_docx(example_doc_path("teams_chat.docx")))
|
||||||
assert [element.text for element in elements] == [
|
assert [e.text for e in elements] == [
|
||||||
"0:0:0.0 --> 0:0:1.510\nSome Body\nOK. Yeah.",
|
"0:0:0.0 --> 0:0:1.510\nSome Body\nOK. Yeah.",
|
||||||
"0:0:3.270 --> 0:0:4.250\nJames Bond\nUmm.",
|
"0:0:3.270 --> 0:0:4.250\nJames Bond\nUmm.",
|
||||||
|
"saved-by Dennis Forsythe",
|
||||||
]
|
]
|
||||||
assert all(element.category == "UncategorizedText" for element in elements)
|
assert [e.category for e in elements] == ["UncategorizedText", "UncategorizedText", "Table"]
|
||||||
|
|
||||||
|
|
||||||
def test_partition_docx_from_filename(
|
def test_partition_docx_from_filename(
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
from typing import Sequence
|
from typing import Iterator, Sequence
|
||||||
|
|
||||||
from docx.oxml.xmlchemy import BaseOxmlElement
|
from docx.oxml.xmlchemy import BaseOxmlElement
|
||||||
from docx.table import Table
|
from docx.table import Table
|
||||||
@ -6,6 +6,7 @@ from docx.text.paragraph import Paragraph
|
|||||||
|
|
||||||
class BlockItemContainer:
|
class BlockItemContainer:
|
||||||
_element: BaseOxmlElement
|
_element: BaseOxmlElement
|
||||||
|
def iter_inner_content(self) -> Iterator[Paragraph | Table]: ...
|
||||||
@property
|
@property
|
||||||
def paragraphs(self) -> Sequence[Paragraph]: ...
|
def paragraphs(self) -> Sequence[Paragraph]: ...
|
||||||
@property
|
@property
|
||||||
|
@ -1,13 +1,14 @@
|
|||||||
from typing import IO
|
from typing import IO, Iterator, List
|
||||||
|
|
||||||
from docx.blkcntnr import BlockItemContainer
|
|
||||||
from docx.oxml.document import CT_Document
|
from docx.oxml.document import CT_Document
|
||||||
from docx.section import Sections
|
from docx.section import Sections
|
||||||
from docx.settings import Settings
|
from docx.settings import Settings
|
||||||
|
from docx.shared import ElementProxy
|
||||||
from docx.styles.style import ParagraphStyle
|
from docx.styles.style import ParagraphStyle
|
||||||
|
from docx.table import Table
|
||||||
from docx.text.paragraph import Paragraph
|
from docx.text.paragraph import Paragraph
|
||||||
|
|
||||||
class Document(BlockItemContainer):
|
class Document(ElementProxy):
|
||||||
def add_paragraph(
|
def add_paragraph(
|
||||||
self,
|
self,
|
||||||
text: str = "",
|
text: str = "",
|
||||||
@ -15,6 +16,11 @@ class Document(BlockItemContainer):
|
|||||||
) -> Paragraph: ...
|
) -> Paragraph: ...
|
||||||
@property
|
@property
|
||||||
def element(self) -> CT_Document: ...
|
def element(self) -> CT_Document: ...
|
||||||
|
def iter_inner_content(self) -> Iterator[Paragraph | Table]: ...
|
||||||
|
@property
|
||||||
|
def paragraphs(self) -> List[Paragraph]: ...
|
||||||
|
@property
|
||||||
|
def tables(self) -> List[Table]: ...
|
||||||
def save(self, path_or_stream: str | IO[bytes]) -> None: ...
|
def save(self, path_or_stream: str | IO[bytes]) -> None: ...
|
||||||
@property
|
@property
|
||||||
def sections(self) -> Sections: ...
|
def sections(self) -> Sections: ...
|
||||||
|
@ -1,15 +1,11 @@
|
|||||||
from typing import Iterator, Sequence
|
from typing import Sequence
|
||||||
|
|
||||||
from docx.blkcntnr import BlockItemContainer
|
from docx.blkcntnr import BlockItemContainer
|
||||||
from docx.oxml.table import CT_Tbl, CT_Tc
|
from docx.oxml.table import CT_Tbl, CT_Tc
|
||||||
from docx.shared import Parented
|
from docx.shared import Parented
|
||||||
from docx.text.paragraph import Paragraph
|
|
||||||
|
|
||||||
class _Cell(BlockItemContainer):
|
class _Cell(BlockItemContainer):
|
||||||
_tc: CT_Tc
|
_tc: CT_Tc
|
||||||
def iter_inner_content(self) -> Iterator[Paragraph | Table]: ...
|
|
||||||
@property
|
|
||||||
def paragraphs(self) -> Sequence[Paragraph]: ...
|
|
||||||
@property
|
@property
|
||||||
def text(self) -> str: ...
|
def text(self) -> str: ...
|
||||||
|
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.10.30-dev3" # pragma: no cover
|
__version__ = "0.10.30-dev4" # pragma: no cover
|
||||||
|
@ -263,14 +263,23 @@ class _DocxPartitioner:
|
|||||||
metadata_last_modified: Optional[str] = None,
|
metadata_last_modified: Optional[str] = None,
|
||||||
) -> Iterator[Element]:
|
) -> Iterator[Element]:
|
||||||
"""Partition MS Word documents (.docx format) into its document elements."""
|
"""Partition MS Word documents (.docx format) into its document elements."""
|
||||||
return cls(
|
self = cls(
|
||||||
filename,
|
filename=filename,
|
||||||
file,
|
file=file,
|
||||||
metadata_filename,
|
metadata_filename=metadata_filename,
|
||||||
include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
infer_table_structure,
|
infer_table_structure=infer_table_structure,
|
||||||
metadata_last_modified,
|
metadata_last_modified=metadata_last_modified,
|
||||||
)._iter_document_elements()
|
)
|
||||||
|
# NOTE(scanny): It's possible for a Word document to have no sections. In particular, a
|
||||||
|
# Microsoft Teams chat transcript exported to DOCX contains no sections. Such a
|
||||||
|
# "section-less" document has to be iterated differently; it has no headers or footers and
|
||||||
|
# therefore no page-size or margins.
|
||||||
|
return (
|
||||||
|
self._iter_document_elements()
|
||||||
|
if self._document_contains_sections
|
||||||
|
else self._iter_sectionless_document_elements()
|
||||||
|
)
|
||||||
|
|
||||||
def _iter_document_elements(self) -> Iterator[Element]:
|
def _iter_document_elements(self) -> Iterator[Element]:
|
||||||
"""Generate each document-element in (docx) `document` in document order."""
|
"""Generate each document-element in (docx) `document` in document order."""
|
||||||
@ -285,11 +294,6 @@ class _DocxPartitioner:
|
|||||||
# -- concept of what it's doing. You can see the same pattern repeating in the "sub"
|
# -- concept of what it's doing. You can see the same pattern repeating in the "sub"
|
||||||
# -- functions like `._iter_paragraph_elements()` where the "just return when done"
|
# -- functions like `._iter_paragraph_elements()` where the "just return when done"
|
||||||
# -- characteristic of a generator avoids repeated code to form interim results into lists.
|
# -- characteristic of a generator avoids repeated code to form interim results into lists.
|
||||||
|
|
||||||
if not self._document.sections:
|
|
||||||
for paragraph in self._document.paragraphs:
|
|
||||||
yield from self._iter_paragraph_elements(paragraph)
|
|
||||||
|
|
||||||
for section_idx, section in enumerate(self._document.sections):
|
for section_idx, section in enumerate(self._document.sections):
|
||||||
yield from self._iter_section_page_breaks(section_idx, section)
|
yield from self._iter_section_page_breaks(section_idx, section)
|
||||||
yield from self._iter_section_headers(section)
|
yield from self._iter_section_headers(section)
|
||||||
@ -308,6 +312,21 @@ class _DocxPartitioner:
|
|||||||
|
|
||||||
yield from self._iter_section_footers(section)
|
yield from self._iter_section_footers(section)
|
||||||
|
|
||||||
|
def _iter_sectionless_document_elements(self) -> Iterator[Element]:
|
||||||
|
"""Generate each document-element in a docx `document` that has no sections.
|
||||||
|
|
||||||
|
A "section-less" DOCX must be iterated differently. Also it will have no headers or footers
|
||||||
|
(because those live in a section).
|
||||||
|
"""
|
||||||
|
for block_item in self._document.iter_inner_content():
|
||||||
|
if isinstance(block_item, Paragraph):
|
||||||
|
yield from self._iter_paragraph_elements(block_item)
|
||||||
|
# -- a paragraph can contain a page-break --
|
||||||
|
yield from self._iter_maybe_paragraph_page_breaks(block_item)
|
||||||
|
# -- can only be a Paragraph or Table so far but more types may come later --
|
||||||
|
elif isinstance(block_item, DocxTable): # pyright: ignore[reportUnnecessaryIsInstance]
|
||||||
|
yield from self._iter_table_element(block_item)
|
||||||
|
|
||||||
def _convert_table_to_html(self, table: DocxTable, is_nested: bool = False) -> str:
|
def _convert_table_to_html(self, table: DocxTable, is_nested: bool = False) -> str:
|
||||||
"""HTML string version of `table`.
|
"""HTML string version of `table`.
|
||||||
|
|
||||||
@ -393,7 +412,17 @@ class _DocxPartitioner:
|
|||||||
@lazyproperty
|
@lazyproperty
|
||||||
def _document_contains_pagebreaks(self) -> bool:
|
def _document_contains_pagebreaks(self) -> bool:
|
||||||
"""True when there is at least one page-break detected in the document."""
|
"""True when there is at least one page-break detected in the document."""
|
||||||
return self._element_contains_pagebreak(self._document._element)
|
return self._element_contains_pagebreak(self._document.element)
|
||||||
|
|
||||||
|
@lazyproperty
|
||||||
|
def _document_contains_sections(self) -> bool:
|
||||||
|
"""True when there is at least one section in the document.
|
||||||
|
|
||||||
|
This is always true for a document produced by Word, but may not always be the case when the
|
||||||
|
document results from conversion or export. In particular, a Microsoft Teams chat-transcript
|
||||||
|
export will have no sections.
|
||||||
|
"""
|
||||||
|
return bool(self._document.sections)
|
||||||
|
|
||||||
def _element_contains_pagebreak(self, element: BaseOxmlElement) -> bool:
|
def _element_contains_pagebreak(self, element: BaseOxmlElement) -> bool:
|
||||||
"""True when `element` contains a page break.
|
"""True when `element` contains a page break.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user