mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-27 19:10:33 +00:00

Improves hierarchy detection for docx files by leveraging the natural hierarchies built into docx documents. Hierarchy can now be detected from the indentation level of list bullets/numbers and from the style name (e.g. Heading 1, List Bullet 2, List Number). Hierarchy detection is improved by determining category depth via the following checks: 1. Check whether the paragraph item has an indentation level (ilvl) xpath - these are typically present on list bullets/numbers. Return the indentation level if it exists. 2. Check whether the name of the paragraph style contains any category depth information (e.g. Heading 1 vs Heading 2, or List Bullet vs List Bullet 2). Return the category depth if found, else default to a depth of 0. 3. Check the paragraph ilvl via the paragraph's style name. Outside of the paragraph's metadata, docx stores default ilvls for various style names, which requires a complex lookup. This check is yet to be implemented, as the above methods cover most use cases, but the implementation is stubbed out. --- Co-authored-by: Steve Canny <stcanny@gmail.com>
23 lines
710 B
Python
23 lines
710 B
Python
# pyright: reportPrivateUsage=false
|
|
|
|
from typing import IO, Union
|
|
|
|
from docx.blkcntnr import BlockItemContainer
|
|
from docx.oxml.document import CT_Document
|
|
from docx.section import Sections
|
|
from docx.settings import Settings
|
|
from docx.styles.style import _ParagraphStyle
|
|
from docx.text.paragraph import Paragraph
|
|
|
|
class Document(BlockItemContainer):
    """Typing stub for a document object.

    Only signatures are declared here; every body is elided with ``...``.
    """

    def add_paragraph(
        self,
        text: str = "",
        style: Union[_ParagraphStyle, str, None] = None,
    ) -> Paragraph:
        """Declare `add_paragraph`: optional text and style in, `Paragraph` out."""
        ...

    @property
    def element(self) -> CT_Document:
        """Declare the `element` property, typed as `CT_Document`."""
        ...

    def save(self, path_or_stream: Union[str, IO[bytes]]) -> None:
        """Declare `save`, accepting either a `str` or a binary `IO` object."""
        ...

    @property
    def sections(self) -> Sections:
        """Declare the `sections` property, typed as `Sections`."""
        ...

    @property
    def settings(self) -> Settings:
        """Declare the `settings` property, typed as `Settings`."""
        ...
|