fix: enable partition_html to grab content outside of <article> tags (#772)

* optionally dont assemble articles * add test for content outside of articles * pass kwargs in partition * changelog and version * update default to False * bump version for release * back to dev version to get another fix in the release
2025-12-11 23:21:32 +00:00 · 2023-06-20 13:07:30 -04:00 · 2023-06-20 13:07:30 -04:00 · c53ce117bc
commit c53ce117bc
parent feaf1cb4df
7 changed files with 92 additions and 18 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -12,6 +12,9 @@

 ### Fixes

+* Adds an `html_assemble_articles` kwarg to `partition_html` to enable users to
+  control whether content outside of `<article>` tags is captured when
+  `<article>` tags are present.
 * Check for the `xml` attribute on `element` before looking for pagebreaks in `partition_docx`.

 ## 0.7.6
--- a/docs/source/bricks.rst
+++ b/docs/source/bricks.rst
@ -329,6 +329,15 @@ to disable SSL verification in the request.
  elements = partition_html(url="https://python.org/", ssl_verify=False)


+
+If your website contains news articles, it can be helpful to only grab content that appears in
+between the ``<article>`` tags, if the site uses that convention.
+To activate this behavior, you can set ``html_assemble_articles=True``.
+If ``html_assemble_articles`` is ``True``, each ``<article>`` tag will be treated as a page.
+If ``html_assemble_articles`` is ``True`` and no ``<article>`` tags are present, the behavior
+is the same as ``html_assemble_articles=False``.
+
+
 ``partition_image``
 ---------------------

--- a/test_unstructured/partition/test_html_partition.py
+++ b/test_unstructured/partition/test_html_partition.py
@ -246,3 +246,20 @@ def test_emoji_appears_with_emoji_utf8_code():
    html_text = """\n<html charset="utf-8"><p>Hello &#128512;</p></html>"""
    elements = partition_html(text=html_text)
    assert elements[0] == Title("Hello 😀")
+
+
+def test_partition_html_can_turn_off_assemble_articles():
+    html_text = """<html>
+    <article>
+        <h1>Some important stuff is going on!</h1>
+        <p>Here is a description of that stuff</p>
+    </article>
+    <article>
+        <h1>Some other important stuff is going on!</h1>
+        <p>Here is a description of that stuff</p>
+    </article>
+    <h4>This is outside of the article.</h4>
+</html>
+"""
+    elements = partition_html(text=html_text, html_assemble_articles=False)
+    assert elements[-1] == Title("This is outside of the article.")
--- a/unstructured/documents/html.py
+++ b/unstructured/documents/html.py
@ -20,7 +20,7 @@ from unstructured.documents.elements import (
    Text,
    Title,
 )
-from unstructured.documents.xml import XMLDocument
+from unstructured.documents.xml import VALID_PARSERS, XMLDocument
 from unstructured.logger import logger
 from unstructured.partition.text_type import (
    is_bulleted_text,
@ -90,6 +90,15 @@ class HTMLDocument(XMLDocument):
    """Class for handling HTML documents. Uses rules based parsing to identify sections
    of interest within the document."""

+    def __init__(
+        self,
+        stylesheet: Optional[str] = None,
+        parser: VALID_PARSERS = None,
+        assemble_articles: bool = True,
+    ):
+        self.assembled_articles = assemble_articles
+        super().__init__(stylesheet=stylesheet, parser=parser)
+
    def _read(self) -> List[Page]:
        """Reads and structures and HTML document. If present, looks for article tags.
        if there are multiple article sections present, a page break is inserted between them.
@ -101,7 +110,7 @@ class HTMLDocument(XMLDocument):
        etree.strip_elements(self.document_tree, ["script"])
        root = _find_main(self.document_tree)

-        articles = _find_articles(root)
+        articles = _find_articles(root, assemble_articles=self.assembled_articles)
        page_number = 0
        page = Page(number=page_number)
        for article in articles:
@ -407,9 +416,12 @@ def _find_main(root: etree.Element) -> etree.Element:
    return main_tag_elem if main_tag_elem is not None else root


-def _find_articles(root: etree.Element) -> List[etree.Element]:
+def _find_articles(root: etree.Element, assemble_articles: bool = True) -> List[etree.Element]:
    """Tries to break the HTML document into distinct articles. If there are no article
    tags, the entire document is returned as a single item list."""
+    if assemble_articles is False:
+        return root
+
    articles = root.findall(".//article")
    if len(articles) == 0:
        # NOTE(robinson) - ref: https://schema.org/Article
--- a/unstructured/documents/xml.py
+++ b/unstructured/documents/xml.py
@ -92,10 +92,16 @@ class XMLDocument(Document):
        return self.document_tree

    @classmethod
-    def from_string(cls, text: str, parser: VALID_PARSERS = None, stylesheet: Optional[str] = None):
+    def from_string(
+        cls,
+        text: str,
+        parser: VALID_PARSERS = None,
+        stylesheet: Optional[str] = None,
+        **kwargs,
+    ):
        """Supports reading in an XML file as a raw string rather than as a file."""
        logger.info("Reading document from string ...")
-        doc = cls(parser=parser, stylesheet=stylesheet)
+        doc = cls(parser=parser, stylesheet=stylesheet, **kwargs)
        doc._read_xml(text)
        return doc

@ -106,6 +112,7 @@ class XMLDocument(Document):
        parser: VALID_PARSERS = None,
        stylesheet: Optional[str] = None,
        encoding: Optional[str] = None,
+        **kwargs,
    ):
        _, content = read_txt_file(filename=filename, encoding=encoding)
-        return cls.from_string(content, parser=parser, stylesheet=stylesheet)
+        return cls.from_string(content, parser=parser, stylesheet=stylesheet, **kwargs)
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@ -50,6 +50,7 @@ def partition(
    pdf_infer_table_structure: bool = False,
    xml_keep_tags: bool = False,
    data_source_metadata: Optional[DataSourceMetadata] = None,
+    **kwargs,
 ):
    """Partitions a document into its constituent elements. Will use libmagic to determine
    the file's type and route it to the appropriate partitioning function. Applies the default
@ -121,21 +122,22 @@ def partition(
        file.seek(0)

    if filetype == FileType.DOC:
-        elements = partition_doc(filename=filename, file=file)
+        elements = partition_doc(filename=filename, file=file, **kwargs)
    elif filetype == FileType.DOCX:
-        elements = partition_docx(filename=filename, file=file)
+        elements = partition_docx(filename=filename, file=file, **kwargs)
    elif filetype == FileType.ODT:
-        elements = partition_odt(filename=filename, file=file)
+        elements = partition_odt(filename=filename, file=file, **kwargs)
    elif filetype == FileType.EML:
-        elements = partition_email(filename=filename, file=file, encoding=encoding)
+        elements = partition_email(filename=filename, file=file, encoding=encoding, **kwargs)
    elif filetype == FileType.MSG:
-        elements = partition_msg(filename=filename, file=file)
+        elements = partition_msg(filename=filename, file=file, **kwargs)
    elif filetype == FileType.HTML:
        elements = partition_html(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
            encoding=encoding,
+            **kwargs,
        )
    elif filetype == FileType.XML:
        elements = partition_xml(
@ -143,24 +145,28 @@ def partition(
            file=file,
            encoding=encoding,
            xml_keep_tags=xml_keep_tags,
+            **kwargs,
        )
    elif filetype == FileType.EPUB:
        elements = partition_epub(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
+            **kwargs,
        )
    elif filetype == FileType.RST:
        elements = partition_rst(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
+            **kwargs,
        )
    elif filetype == FileType.MD:
        elements = partition_md(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
+            **kwargs,
        )
    elif filetype == FileType.PDF:
        elements = partition_pdf(
@ -171,6 +177,7 @@ def partition(
            infer_table_structure=pdf_infer_table_structure,
            strategy=strategy,
            ocr_languages=ocr_languages,
+            **kwargs,
        )
    elif (filetype == FileType.PNG) or (filetype == FileType.JPG):
        elements = partition_image(
@ -180,6 +187,7 @@ def partition(
            include_page_breaks=include_page_breaks,
            strategy=strategy,
            ocr_languages=ocr_languages,
+            **kwargs,
        )
    elif filetype == FileType.TXT:
        elements = partition_text(
@ -187,33 +195,37 @@ def partition(
            file=file,
            encoding=encoding,
            paragraph_grouper=paragraph_grouper,
+            **kwargs,
        )
    elif filetype == FileType.RTF:
        elements = partition_rtf(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
+            **kwargs,
        )
    elif filetype == FileType.PPT:
        elements = partition_ppt(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
+            **kwargs,
        )
    elif filetype == FileType.PPTX:
        elements = partition_pptx(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
+            **kwargs,
        )
    elif filetype == FileType.JSON:
-        elements = partition_json(filename=filename, file=file)
+        elements = partition_json(filename=filename, file=file, **kwargs)
    elif (filetype == FileType.XLSX) or (filetype == FileType.XLS):
-        elements = partition_xlsx(filename=filename, file=file)
+        elements = partition_xlsx(filename=filename, file=file, **kwargs)
    elif filetype == FileType.CSV:
-        elements = partition_csv(filename=filename, file=file)
+        elements = partition_csv(filename=filename, file=file, **kwargs)
    elif filetype == FileType.TSV:
-        elements = partition_tsv(filename=filename, file=file)
+        elements = partition_tsv(filename=filename, file=file, **kwargs)
    elif filetype == FileType.EMPTY:
        elements = []
    else:
--- a/unstructured/partition/html.py
+++ b/unstructured/partition/html.py
@ -30,6 +30,7 @@ def partition_html(
    headers: Dict[str, str] = {},
    ssl_verify: bool = True,
    parser: VALID_PARSERS = None,
+    html_assemble_articles: bool = False,
    **kwargs,
 ) -> List[Element]:
    """Partitions an HTML document into its constituent elements.
@ -66,15 +67,28 @@ def partition_html(
    exactly_one(filename=filename, file=file, text=text, url=url)

    if filename is not None:
-        document = HTMLDocument.from_file(filename, parser=parser, encoding=encoding)
+        document = HTMLDocument.from_file(
+            filename,
+            parser=parser,
+            encoding=encoding,
+            assemble_articles=html_assemble_articles,
+        )

    elif file is not None:
        _, file_text = read_txt_file(file=file, encoding=encoding)
-        document = HTMLDocument.from_string(file_text, parser=parser)
+        document = HTMLDocument.from_string(
+            file_text,
+            parser=parser,
+            assemble_articles=html_assemble_articles,
+        )

    elif text is not None:
        _text: str = str(text)
-        document = HTMLDocument.from_string(_text, parser=parser)
+        document = HTMLDocument.from_string(
+            _text,
+            parser=parser,
+            assemble_articles=html_assemble_articles,
+        )

    elif url is not None:
        response = requests.get(url, headers=headers, verify=ssl_verify)