mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00

A DOCX document that has no sections can still contain one or more tables. Word never creates such files, but it opens them just fine, and other applications can and do generate them. Use the `Document.iter_inner_content()` method, newly added upstream in `python-docx`, to capture both paragraphs and tables from a section-less DOCX document. This generalizes the fix for MS Teams chat transcripts (an example of a section-less DOCX) implemented in #1825.
29 lines
887 B
Python
29 lines
887 B
Python
from typing import IO, Iterator, List
|
|
|
|
from docx.oxml.document import CT_Document
|
|
from docx.section import Sections
|
|
from docx.settings import Settings
|
|
from docx.shared import ElementProxy
|
|
from docx.styles.style import ParagraphStyle
|
|
from docx.table import Table
|
|
from docx.text.paragraph import Paragraph
|
|
|
|
class Document(ElementProxy):
    """Interface-only declaration of the `docx` document object.

    Every member body is elided (`...`); only names, parameters and
    annotated types are declared here. Read-only properties are listed
    first, followed by the callable members.
    """

    # -- read-only properties ------------------------------------------

    @property
    def element(self) -> CT_Document:
        """The `CT_Document` element associated with this proxy."""
        ...

    @property
    def paragraphs(self) -> List[Paragraph]:
        """A list of `Paragraph` objects."""
        ...

    @property
    def tables(self) -> List[Table]:
        """A list of `Table` objects."""
        ...

    @property
    def sections(self) -> Sections:
        """The `Sections` object for this document."""
        ...

    @property
    def settings(self) -> Settings:
        """The `Settings` object for this document."""
        ...

    # -- methods -------------------------------------------------------

    def add_paragraph(
        self, text: str = "", style: ParagraphStyle | str | None = None
    ) -> Paragraph:
        """Return a `Paragraph`.

        `text` defaults to the empty string. `style` may be a
        `ParagraphStyle`, a `str`, or `None` (the default).
        """
        ...

    def iter_inner_content(self) -> Iterator[Paragraph | Table]:
        """Iterate over items that are each either a `Paragraph` or a `Table`."""
        ...

    def save(self, path_or_stream: str | IO[bytes]) -> None:
        """Accept a `str` or a binary stream (`IO[bytes]`); returns `None`."""
        ...