diff --git a/CHANGELOG.md b/CHANGELOG.md index a82f2b6df..af8a57a90 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,9 @@ ### Fixes +* Adds an `html_assemble_articles` kwarg to `partition_html` to enable users to + control whether content outside of `<article>` tags is captured when + `<article>
` tags are present. * Check for the `xml` attribute on `element` before looking for pagebreaks in `partition_docx`. ## 0.7.6 diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst index f5fe78875..20cc73b6c 100644 --- a/docs/source/bricks.rst +++ b/docs/source/bricks.rst @@ -329,6 +329,15 @@ to disable SSL verification in the request. elements = partition_html(url="https://python.org/", ssl_verify=False) + +If your website contains news articles, it can be helpful to only grab content that appears in +between the ``<article>`` tags, if the site uses that convention. +To activate this behavior, you can set ``html_assemble_articles=True``. +If ``html_assemble_articles`` is ``True``, each ``<article>`` tag will be treated as a page. +If ``html_assemble_articles`` is ``True`` and no ``<article>`` tags are present, the behavior +is the same as ``html_assemble_articles=False``. + + ``partition_image`` --------------------- diff --git a/test_unstructured/partition/test_html_partition.py b/test_unstructured/partition/test_html_partition.py index 6faac1784..d7b24fa36 100644 --- a/test_unstructured/partition/test_html_partition.py +++ b/test_unstructured/partition/test_html_partition.py @@ -246,3 +246,20 @@ def test_emoji_appears_with_emoji_utf8_code(): html_text = """\n

Hello 😀

""" elements = partition_html(text=html_text) assert elements[0] == Title("Hello 😀") + + +def test_partition_html_can_turn_off_assemble_articles(): + html_text = """ +
+

Some important stuff is going on!

+

Here is a description of that stuff

+
+
+

Some other important stuff is going on!

+

Here is a description of that stuff

+
+

This is outside of the article.

+ +""" + elements = partition_html(text=html_text, html_assemble_articles=False) + assert elements[-1] == Title("This is outside of the article.") diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py index 98b505dbc..0cc71a63c 100644 --- a/unstructured/documents/html.py +++ b/unstructured/documents/html.py @@ -20,7 +20,7 @@ from unstructured.documents.elements import ( Text, Title, ) -from unstructured.documents.xml import XMLDocument +from unstructured.documents.xml import VALID_PARSERS, XMLDocument from unstructured.logger import logger from unstructured.partition.text_type import ( is_bulleted_text, @@ -90,6 +90,15 @@ class HTMLDocument(XMLDocument): """Class for handling HTML documents. Uses rules based parsing to identify sections of interest within the document.""" + def __init__( + self, + stylesheet: Optional[str] = None, + parser: VALID_PARSERS = None, + assemble_articles: bool = True, + ): + self.assembled_articles = assemble_articles + super().__init__(stylesheet=stylesheet, parser=parser) + def _read(self) -> List[Page]: """Reads and structures and HTML document. If present, looks for article tags. if there are multiple article sections present, a page break is inserted between them. @@ -101,7 +110,7 @@ class HTMLDocument(XMLDocument): etree.strip_elements(self.document_tree, ["script"]) root = _find_main(self.document_tree) - articles = _find_articles(root) + articles = _find_articles(root, assemble_articles=self.assembled_articles) page_number = 0 page = Page(number=page_number) for article in articles: @@ -407,9 +416,12 @@ def _find_main(root: etree.Element) -> etree.Element: return main_tag_elem if main_tag_elem is not None else root -def _find_articles(root: etree.Element) -> List[etree.Element]: +def _find_articles(root: etree.Element, assemble_articles: bool = True) -> List[etree.Element]: """Tries to break the HTML document into distinct articles. 
If there are no article tags, the entire document is returned as a single item list.""" + if assemble_articles is False: + return [root] # single-item list: iterating a bare element would yield its children instead + + articles = root.findall(".//article") if len(articles) == 0: # NOTE(robinson) - ref: https://schema.org/Article diff --git a/unstructured/documents/xml.py b/unstructured/documents/xml.py index a1b252a62..56d87aa7c 100644 --- a/unstructured/documents/xml.py +++ b/unstructured/documents/xml.py @@ -92,10 +92,16 @@ class XMLDocument(Document): return self.document_tree @classmethod - def from_string(cls, text: str, parser: VALID_PARSERS = None, stylesheet: Optional[str] = None): + def from_string( + cls, + text: str, + parser: VALID_PARSERS = None, + stylesheet: Optional[str] = None, + **kwargs, + ): """Supports reading in an XML file as a raw string rather than as a file.""" logger.info("Reading document from string ...") - doc = cls(parser=parser, stylesheet=stylesheet) + doc = cls(parser=parser, stylesheet=stylesheet, **kwargs) doc._read_xml(text) return doc @@ -106,6 +112,7 @@ class XMLDocument(Document): parser: VALID_PARSERS = None, stylesheet: Optional[str] = None, encoding: Optional[str] = None, + **kwargs, ): _, content = read_txt_file(filename=filename, encoding=encoding) - return cls.from_string(content, parser=parser, stylesheet=stylesheet) + return cls.from_string(content, parser=parser, stylesheet=stylesheet, **kwargs) diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index 7807b225b..16efb36f2 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -50,6 +50,7 @@ def partition( pdf_infer_table_structure: bool = False, xml_keep_tags: bool = False, data_source_metadata: Optional[DataSourceMetadata] = None, + **kwargs, ): """Partitions a document into its constituent elements. Will use libmagic to determine the file's type and route it to the appropriate partitioning function. 
Applies the default @@ -121,21 +122,22 @@ def partition( file.seek(0) if filetype == FileType.DOC: - elements = partition_doc(filename=filename, file=file) + elements = partition_doc(filename=filename, file=file, **kwargs) elif filetype == FileType.DOCX: - elements = partition_docx(filename=filename, file=file) + elements = partition_docx(filename=filename, file=file, **kwargs) elif filetype == FileType.ODT: - elements = partition_odt(filename=filename, file=file) + elements = partition_odt(filename=filename, file=file, **kwargs) elif filetype == FileType.EML: - elements = partition_email(filename=filename, file=file, encoding=encoding) + elements = partition_email(filename=filename, file=file, encoding=encoding, **kwargs) elif filetype == FileType.MSG: - elements = partition_msg(filename=filename, file=file) + elements = partition_msg(filename=filename, file=file, **kwargs) elif filetype == FileType.HTML: elements = partition_html( filename=filename, file=file, include_page_breaks=include_page_breaks, encoding=encoding, + **kwargs, ) elif filetype == FileType.XML: elements = partition_xml( @@ -143,24 +145,28 @@ def partition( file=file, encoding=encoding, xml_keep_tags=xml_keep_tags, + **kwargs, ) elif filetype == FileType.EPUB: elements = partition_epub( filename=filename, file=file, include_page_breaks=include_page_breaks, + **kwargs, ) elif filetype == FileType.RST: elements = partition_rst( filename=filename, file=file, include_page_breaks=include_page_breaks, + **kwargs, ) elif filetype == FileType.MD: elements = partition_md( filename=filename, file=file, include_page_breaks=include_page_breaks, + **kwargs, ) elif filetype == FileType.PDF: elements = partition_pdf( @@ -171,6 +177,7 @@ def partition( infer_table_structure=pdf_infer_table_structure, strategy=strategy, ocr_languages=ocr_languages, + **kwargs, ) elif (filetype == FileType.PNG) or (filetype == FileType.JPG): elements = partition_image( @@ -180,6 +187,7 @@ def partition( 
include_page_breaks=include_page_breaks, strategy=strategy, ocr_languages=ocr_languages, + **kwargs, ) elif filetype == FileType.TXT: elements = partition_text( @@ -187,33 +195,37 @@ def partition( file=file, encoding=encoding, paragraph_grouper=paragraph_grouper, + **kwargs, ) elif filetype == FileType.RTF: elements = partition_rtf( filename=filename, file=file, include_page_breaks=include_page_breaks, + **kwargs, ) elif filetype == FileType.PPT: elements = partition_ppt( filename=filename, file=file, include_page_breaks=include_page_breaks, + **kwargs, ) elif filetype == FileType.PPTX: elements = partition_pptx( filename=filename, file=file, include_page_breaks=include_page_breaks, + **kwargs, ) elif filetype == FileType.JSON: - elements = partition_json(filename=filename, file=file) + elements = partition_json(filename=filename, file=file, **kwargs) elif (filetype == FileType.XLSX) or (filetype == FileType.XLS): - elements = partition_xlsx(filename=filename, file=file) + elements = partition_xlsx(filename=filename, file=file, **kwargs) elif filetype == FileType.CSV: - elements = partition_csv(filename=filename, file=file) + elements = partition_csv(filename=filename, file=file, **kwargs) elif filetype == FileType.TSV: - elements = partition_tsv(filename=filename, file=file) + elements = partition_tsv(filename=filename, file=file, **kwargs) elif filetype == FileType.EMPTY: elements = [] else: diff --git a/unstructured/partition/html.py b/unstructured/partition/html.py index ee80349d3..fcd111391 100644 --- a/unstructured/partition/html.py +++ b/unstructured/partition/html.py @@ -30,6 +30,7 @@ def partition_html( headers: Dict[str, str] = {}, ssl_verify: bool = True, parser: VALID_PARSERS = None, + html_assemble_articles: bool = False, **kwargs, ) -> List[Element]: """Partitions an HTML document into its constituent elements. 
@@ -66,15 +67,28 @@ def partition_html( exactly_one(filename=filename, file=file, text=text, url=url) if filename is not None: - document = HTMLDocument.from_file(filename, parser=parser, encoding=encoding) + document = HTMLDocument.from_file( + filename, + parser=parser, + encoding=encoding, + assemble_articles=html_assemble_articles, + ) elif file is not None: _, file_text = read_txt_file(file=file, encoding=encoding) - document = HTMLDocument.from_string(file_text, parser=parser) + document = HTMLDocument.from_string( + file_text, + parser=parser, + assemble_articles=html_assemble_articles, + ) elif text is not None: _text: str = str(text) - document = HTMLDocument.from_string(_text, parser=parser) + document = HTMLDocument.from_string( + _text, + parser=parser, + assemble_articles=html_assemble_articles, + ) elif url is not None: response = requests.get(url, headers=headers, verify=ssl_verify)