mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-13 12:05:54 +00:00

**Summary** Extract as much mechanical refactoring from the HTML parser change-over into the PR as possible. This leaves the next PR focused on installing the new parser and the ingest-test impact. **Reviewers:** Commits are well groomed and reviewing commit-by-commit is probably easier. **Additional Context** This PR introduces the rewritten HTML parser. Its general design is recursive, consistent with the recursive structure of HTML (tree of elements). It also adds the unit tests for that parser but it does not _install_ the parser. So the behavior of `partition_html()` is unchanged by this PR. The next PR in this series will do that and handle the ingest and other unit test changes required to reflect the dozen or so bug-fixes the new parser provides.
51 lines
1.4 KiB
Python
51 lines
1.4 KiB
Python
# pyright: reportPrivateUsage=false
|
|
|
|
from __future__ import annotations
|
|
|
|
from typing import Collection, Generic, Iterator, TypeVar, overload
|
|
|
|
from typing_extensions import Self
|
|
|
|
from .. import _types as _t
|
|
|
|
# Placeholder type for the caller-supplied `default` argument of `_Element.get()`:
# the two-argument overload is declared to return `str | _T`.
_T = TypeVar("_T")
|
|
|
|
class _Element:
|
|
@overload
|
|
def __getitem__(self, __x: int) -> Self: ...
|
|
@overload
|
|
def __getitem__(self, __x: slice) -> list[Self]: ...
|
|
def __contains__(self, __o: object) -> bool: ...
|
|
def __len__(self) -> int: ...
|
|
def __iter__(self) -> Iterator[Self]: ...
|
|
def find(self, path: _t._ElemPathArg) -> Self | None: ...
|
|
@overload
|
|
def get(self, key: _t._AttrName) -> str | None: ...
|
|
@overload
|
|
def get(self, key: _t._AttrName, default: _T) -> str | _T: ...
|
|
def iterancestors(
|
|
self, *, tag: _t._TagSelector | Collection[_t._TagSelector] | None = None
|
|
) -> Iterator[Self]: ...
|
|
@overload
|
|
def itertext(self, *tags: _t._TagSelector, with_tail: bool = True) -> Iterator[str]: ...
|
|
@overload
|
|
def itertext(
|
|
self,
|
|
*,
|
|
tag: _t._TagSelector | Collection[_t._TagSelector] | None = None,
|
|
with_tail: bool = True,
|
|
) -> Iterator[str]: ...
|
|
@property
|
|
def tag(self) -> str: ...
|
|
@property
|
|
def tail(self) -> str | None: ...
|
|
@property
|
|
def text(self) -> str | None: ...
|
|
def xpath(
|
|
self,
|
|
_path: str,
|
|
/,
|
|
) -> _t._XPathObject: ...
|
|
|
|
# Signature-only declaration of a tree type, generic over the type variable
# `_ET_co` from the `_types` module (imported as `_t`); no members are declared.
# NOTE(review): the `_co` suffix suggests a covariant type variable, presumably
# the element type held by the tree -- confirm in `_types`.
class _ElementTree(Generic[_t._ET_co]): ...
|