diff --git a/CHANGELOG.md b/CHANGELOG.md index 18162421a..ffbd87343 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.14.5-dev5 +## 0.14.5-dev6 ### Enhancements diff --git a/test_unstructured/documents/test_html.py b/test_unstructured/documents/test_html.py index 377672272..1baae57ad 100644 --- a/test_unstructured/documents/test_html.py +++ b/test_unstructured/documents/test_html.py @@ -1,4 +1,5 @@ # pyright: reportPrivateUsage=false +# pyright: reportUnusedFunction=false """Test suite for `unstructured.documents.html` module.""" @@ -27,15 +28,14 @@ from unstructured.documents.elements import ( Text, Title, ) -from unstructured.documents.html import ( +from unstructured.documents.html import HTMLDocument, _parse_HTMLTable_from_table_elem +from unstructured.documents.html_elements import ( HTMLAddress, - HTMLDocument, HTMLNarrativeText, HTMLTable, HTMLText, HTMLTitle, TagsMixin, - _parse_HTMLTable_from_table_elem, ) TAGS = ( @@ -287,9 +287,6 @@ def test_read_html_doc(tmp_path: pathlib.Path): f.write( "\n" "
\n" - "Here is a header. We want to ignore anything that is in this section.
\n" - "Dear Leader is the best. He is such a wonderful engineer!
\n" " \n" @@ -298,42 +295,35 @@ def test_read_html_doc(tmp_path: pathlib.Path): "Skip me because I'm in a table | \n"
+ " I'm in a table | \n"
"
Let's ignore anything after the footer too since it's probably garbage.
\n" - "Hi there! I am Matt! | \n"
- "
A table thing. |
A non-table thing
\n" - ) - assert len(doc.elements) == 2 - - doc.doc_after_cleaners(skip_table=True, inplace=True) - - assert len(doc.elements) == 1 - - # -- HTMLDocument.elements ----------------------------------------------------------------------- @@ -429,19 +361,16 @@ def test_parses_tags_correctly(): def test_nested_text_tags(): - tag1, tag2 = [tag for tag in html.TEXT_TAGS if tag not in html.TABLE_TAGS][:2] - html_str = ( - f"\n" - f" <{tag1}>\n" - f" <{tag2}>\n" - f" There is some text here.\n" - f" {tag2}>\n" - f" {tag1}>\n" - f"\n" + html_document = HTMLDocument.from_string( + "\n" + "\n" + " \n" + " There is some text here.\n" + " \n" + "
\n" + "\n" ) - html_document = HTMLDocument.from_string(html_str).doc_after_cleaners(skip_table=False) - assert len(html_document.pages[0].elements) == 1 @@ -575,19 +504,6 @@ def test_exclude_tag_types(tag: str): assert len(html_document.pages) == 0 -# -- has_table_ancestor() ------------------------------------------------------------------------ - - -def test_has_table_ancestor(): - title = HTMLTitle("I am a Title", tag="td", ancestortags=["html", "body", "table", "tr"]) - assert html.has_table_ancestor(title) - - -def test_has_no_table_ancestor(): - title = HTMLTitle("I am a Title", tag="p", ancestortags=["html", "body"]) - assert not html.has_table_ancestor(title) - - # -- _bulleted_text_from_table() ----------------------------------------------------------------- @@ -856,8 +772,8 @@ def test_parse_nothing(): assert parsed_el is None -def test_parse_not_anything(is_narrative_tag_: Mock, is_possible_title_: Mock): - is_narrative_tag_.return_value = False +def test_parse_not_anything(_is_narrative_tag_: Mock, is_possible_title_: Mock): # noqa: PT019 + _is_narrative_tag_.return_value = False is_possible_title_.return_value = False doc = """This is nothing
""" document_tree = etree.fromstring(doc, etree.HTMLParser()) @@ -942,7 +858,7 @@ def test_process_list_item_returns_none_if_next_has_no_text(): document_tree = etree.fromstring(doc, etree.HTMLParser()) el = document_tree.find(".//div") assert el is not None - assert html.is_list_item_tag(el) is True + assert html._is_list_item_tag(el) is True parsed_el, _ = html._process_list_item(el) assert parsed_el is None @@ -1071,8 +987,8 @@ class Describe_parse_HTMLTable_from_table_elem: @pytest.fixture -def is_narrative_tag_(request: FixtureRequest): - return function_mock(request, "unstructured.documents.html.is_narrative_tag") +def _is_narrative_tag_(request: FixtureRequest): + return function_mock(request, "unstructured.documents.html._is_narrative_tag") @pytest.fixture diff --git a/test_unstructured/partition/test_html.py b/test_unstructured/partition/test_html.py index e21e66414..2e353d6f6 100644 --- a/test_unstructured/partition/test_html.py +++ b/test_unstructured/partition/test_html.py @@ -28,7 +28,7 @@ from unstructured.documents.elements import ( TableChunk, Title, ) -from unstructured.documents.html import HTMLTable, TagsMixin +from unstructured.documents.html_elements import HTMLTable, TagsMixin from unstructured.partition.html import partition_html # -- document-source (filename, file, text, url) ------------------------------------------------- diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 195c73cca..40be1c9bb 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.5-dev5" # pragma: no cover +__version__ = "0.14.5-dev6" # pragma: no cover diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py index 284742981..1c868acae 100644 --- a/unstructured/documents/html.py +++ b/unstructured/documents/html.py @@ -2,26 +2,21 @@ from __future__ import annotations -from typing import Any, Callable, Final, Iterator, Optional, Sequence, cast +from typing import Final, Iterator, Optional, cast from lxml import etree -from unstructured.cleaners.core import ( - clean_bullets, - replace_unicode_quotes, -) +from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes from unstructured.documents.base import Page -from unstructured.documents.elements import ( - Address, - Element, - ElementMetadata, - EmailAddress, - Link, - ListItem, - NarrativeText, - Table, - Text, - Title, +from unstructured.documents.elements import Element, ElementMetadata, Link +from unstructured.documents.html_elements import ( + HTMLAddress, + HTMLEmailAddress, + HTMLListItem, + HTMLNarrativeText, + HTMLTable, + HTMLText, + HTMLTitle, ) from unstructured.documents.xml import VALID_PARSERS, XMLDocument from unstructured.logger import logger @@ -47,82 +42,6 @@ HEADER_OR_FOOTER_TAGS: Final[list[str]] = ["header", "footer"] SECTION_TAGS: Final[list[str]] = ["div", "pre"] -# -- HTML-specific document-elements and methods ------------------------------------------------- - - -class TagsMixin: - """Mixin that allows a class to retain tag information.""" - - def __init__( - self, - *args: Any, - tag: Optional[str] = None, - ancestortags: Sequence[str] = (), - links: Sequence[Link] = [], - emphasized_texts: Sequence[dict[str, str]] = [], - text_as_html: Optional[str] = None, - **kwargs: Any, - ): - if tag is None: - raise TypeError("tag argument must be passed and not None") - else: - self.tag = tag - self.ancestortags = ancestortags - self.links = links - self.emphasized_texts = emphasized_texts - self.text_as_html = text_as_html - super().__init__(*args, **kwargs) - - -class HTMLText(TagsMixin, Text): - """Text with tag information.""" - - -class HTMLAddress(TagsMixin, Address): - """Address with tag information.""" - - -class HTMLEmailAddress(TagsMixin, EmailAddress): - """EmailAddress with tag information""" - - -class HTMLTitle(TagsMixin, Title): - """Title with tag information.""" - - -class HTMLNarrativeText(TagsMixin, NarrativeText): - """NarrativeText with tag information.""" - - -class HTMLListItem(TagsMixin, ListItem): - """NarrativeText with tag information.""" - - -class HTMLTable(TagsMixin, Table): - """NarrativeText with tag information""" - - -def has_table_ancestor(element: TagsMixin) -> bool: - """Checks to see if an element has ancestors that are table elements. If so, we consider - it to be a table element rather than a section of narrative text.""" - return any(ancestor in TABLE_TAGS for ancestor in element.ancestortags) - - -def in_header_or_footer(element: TagsMixin) -> bool: - """Checks to see if an element is contained within a header or a footer tag.""" - if any(ancestor in HEADER_OR_FOOTER_TAGS for ancestor in element.ancestortags): - return True - return False - - -def is_table(element: TagsMixin) -> bool: - """Checks to see if an element is a table""" - return element.tag in TABLE_TAGS - - -# -- HTML element-tree processing ---------------------------------------------------------------- - - class HTMLDocument(XMLDocument): """Class for handling HTML documents. Uses rules based parsing to identify sections of interest within the document.""" @@ -200,7 +119,7 @@ class HTMLDocument(XMLDocument): page.elements.extend(bulleted_text) descendanttag_elems = tuple(tag_elem.iterdescendants()) - elif is_list_item_tag(tag_elem): + elif _is_list_item_tag(tag_elem): element, next_element = _process_list_item(tag_elem) if element is not None: page.elements.append(element) @@ -228,67 +147,170 @@ class HTMLDocument(XMLDocument): return pages - def doc_after_cleaners( - self, - skip_headers_and_footers: bool = False, - skip_table: bool = False, - inplace: bool = False, - ) -> HTMLDocument: - """Filters elements returning new instance based on the criteria specified. - Note that the number of pages can change in the case that all elements on a page are - filtered out. +# -- candidate HTMLDocument methods -------------------------------------------------------------- - Parameters - ---------- - skip_table: - If True, skips table element - skip_headers_and_footers: - If True, ignores any content that is within