mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-05 13:24:44 +00:00

Reviewers: I recommend reviewing commit-by-commit, or just looking at the final version of `partition/docx.py` using View File. This refactor solves a few problems, but mostly it lays the groundwork for refining further aspects such as page-break detection and list-item detection, and for moving python-docx internals upstream to that library so our work doesn't depend on that domain knowledge.
18 lines
637 B
Python
18 lines
637 B
Python
from typing import Any, Iterator
|
|
|
|
from lxml import etree
|
|
|
|
class BaseOxmlElement(etree.ElementBase):
    """Type declarations for an `lxml` element subclass.

    Only signatures are declared here; every body is a `...` placeholder.
    """

    # Iterating an element is declared to yield `BaseOxmlElement` items.
    def __iter__(self) -> Iterator[BaseOxmlElement]: ...

    # NOTE(review): presumably the element serialized as XML text -- confirm
    # against the implementation this stub describes.
    @property
    def xml(self) -> str: ...

    def xpath(self, xpath_str: str) -> Any:
        """Evaluate `xpath_str`; the caller must cast the result.

        The result is usually a Sequence[ElementBase], but that is not guaranteed.
        lxml.etree.XPath can produce a bool, a float, or a ("smart") str, and it can
        also produce a list whose items are ElementBase, comments, processing
        instructions, str, or tuple. The appropriate cast therefore depends on the
        XPath expression being evaluated, which is why the declared type is `Any`.
        """
        ...
|