fix: enable partition_html to grab content outside of <article> tags (#772)

* optionally dont assemble articles

* add test for content outside of articles

* pass kwargs in partition

* changelog and version

* update default to False

* bump version for release

* back to dev version to get another fix in the release
This commit is contained in:
Matt Robinson 2023-06-20 13:07:30 -04:00 committed by GitHub
parent feaf1cb4df
commit c53ce117bc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 92 additions and 18 deletions

View File

@ -12,6 +12,9 @@
### Fixes ### Fixes
* Adds an `html_assemble_articles` kwarg to `partition_html` to enable users to
control whether content outside of `<article>` tags is captured when
`<article>` tags are present.
* Check for the `xml` attribute on `element` before looking for pagebreaks in `partition_docx`. * Check for the `xml` attribute on `element` before looking for pagebreaks in `partition_docx`.
## 0.7.6 ## 0.7.6

View File

@ -329,6 +329,15 @@ to disable SSL verification in the request.
elements = partition_html(url="https://python.org/", ssl_verify=False) elements = partition_html(url="https://python.org/", ssl_verify=False)
If your website contains news articles, it can be helpful to only grab content that appears in
between the ``<article>`` tags, if the site uses that convention.
To activate this behavior, you can set ``html_assemble_articles=True``.
If ``html_assemble_articles`` is ``True``, each ``<article>`` tag will be treated as a page.
If ``html_assemble_articles`` is ``True`` and no ``<article>`` tags are present, the behavior
is the same as ``html_assemble_articles=False``.
``partition_image`` ``partition_image``
--------------------- ---------------------

View File

@ -246,3 +246,20 @@ def test_emoji_appears_with_emoji_utf8_code():
html_text = """\n<html charset="utf-8"><p>Hello &#128512;</p></html>""" html_text = """\n<html charset="utf-8"><p>Hello &#128512;</p></html>"""
elements = partition_html(text=html_text) elements = partition_html(text=html_text)
assert elements[0] == Title("Hello 😀") assert elements[0] == Title("Hello 😀")
def test_partition_html_can_turn_off_assemble_articles():
    """With ``html_assemble_articles=False``, content that sits outside of
    every ``<article>`` tag should still appear in the partitioned output."""
    markup = """<html>
<article>
<h1>Some important stuff is going on!</h1>
<p>Here is a description of that stuff</p>
</article>
<article>
<h1>Some other important stuff is going on!</h1>
<p>Here is a description of that stuff</p>
</article>
<h4>This is outside of the article.</h4>
</html>
"""
    partitioned = partition_html(text=markup, html_assemble_articles=False)
    # The <h4> follows both articles, so it is expected to be the last element.
    expected_last = Title("This is outside of the article.")
    assert partitioned[-1] == expected_last

View File

@ -20,7 +20,7 @@ from unstructured.documents.elements import (
Text, Text,
Title, Title,
) )
from unstructured.documents.xml import XMLDocument from unstructured.documents.xml import VALID_PARSERS, XMLDocument
from unstructured.logger import logger from unstructured.logger import logger
from unstructured.partition.text_type import ( from unstructured.partition.text_type import (
is_bulleted_text, is_bulleted_text,
@ -90,6 +90,15 @@ class HTMLDocument(XMLDocument):
"""Class for handling HTML documents. Uses rules based parsing to identify sections """Class for handling HTML documents. Uses rules based parsing to identify sections
of interest within the document.""" of interest within the document."""
def __init__(
    self,
    stylesheet: Optional[str] = None,
    parser: VALID_PARSERS = None,
    assemble_articles: bool = True,
):
    """Initializes the HTML document.

    Parameters
    ----------
    stylesheet
        Forwarded unchanged to the parent class initializer.
    parser
        Forwarded unchanged to the parent class initializer.
    assemble_articles
        Stored on the instance and later handed to ``_find_articles`` when
        the document is read. Defaults to ``True``.
    """
    # NOTE: the attribute is spelled ``assembled_articles`` (with a "d"), which is
    # the name ``_read`` looks up; it is assigned before the parent initializer runs.
    self.assembled_articles = assemble_articles
    super().__init__(parser=parser, stylesheet=stylesheet)
def _read(self) -> List[Page]: def _read(self) -> List[Page]:
"""Reads and structures and HTML document. If present, looks for article tags. """Reads and structures and HTML document. If present, looks for article tags.
if there are multiple article sections present, a page break is inserted between them. if there are multiple article sections present, a page break is inserted between them.
@ -101,7 +110,7 @@ class HTMLDocument(XMLDocument):
etree.strip_elements(self.document_tree, ["script"]) etree.strip_elements(self.document_tree, ["script"])
root = _find_main(self.document_tree) root = _find_main(self.document_tree)
articles = _find_articles(root) articles = _find_articles(root, assemble_articles=self.assembled_articles)
page_number = 0 page_number = 0
page = Page(number=page_number) page = Page(number=page_number)
for article in articles: for article in articles:
@ -407,9 +416,12 @@ def _find_main(root: etree.Element) -> etree.Element:
return main_tag_elem if main_tag_elem is not None else root return main_tag_elem if main_tag_elem is not None else root
def _find_articles(root: etree.Element) -> List[etree.Element]: def _find_articles(root: etree.Element, assemble_articles: bool = True) -> List[etree.Element]:
"""Tries to break the HTML document into distinct articles. If there are no article """Tries to break the HTML document into distinct articles. If there are no article
tags, the entire document is returned as a single item list.""" tags, the entire document is returned as a single item list."""
if assemble_articles is False:
return root
articles = root.findall(".//article") articles = root.findall(".//article")
if len(articles) == 0: if len(articles) == 0:
# NOTE(robinson) - ref: https://schema.org/Article # NOTE(robinson) - ref: https://schema.org/Article

View File

@ -92,10 +92,16 @@ class XMLDocument(Document):
return self.document_tree return self.document_tree
@classmethod @classmethod
def from_string(cls, text: str, parser: VALID_PARSERS = None, stylesheet: Optional[str] = None): def from_string(
cls,
text: str,
parser: VALID_PARSERS = None,
stylesheet: Optional[str] = None,
**kwargs,
):
"""Supports reading in an XML file as a raw string rather than as a file.""" """Supports reading in an XML file as a raw string rather than as a file."""
logger.info("Reading document from string ...") logger.info("Reading document from string ...")
doc = cls(parser=parser, stylesheet=stylesheet) doc = cls(parser=parser, stylesheet=stylesheet, **kwargs)
doc._read_xml(text) doc._read_xml(text)
return doc return doc
@ -106,6 +112,7 @@ class XMLDocument(Document):
parser: VALID_PARSERS = None, parser: VALID_PARSERS = None,
stylesheet: Optional[str] = None, stylesheet: Optional[str] = None,
encoding: Optional[str] = None, encoding: Optional[str] = None,
**kwargs,
): ):
_, content = read_txt_file(filename=filename, encoding=encoding) _, content = read_txt_file(filename=filename, encoding=encoding)
return cls.from_string(content, parser=parser, stylesheet=stylesheet) return cls.from_string(content, parser=parser, stylesheet=stylesheet, **kwargs)

View File

@ -50,6 +50,7 @@ def partition(
pdf_infer_table_structure: bool = False, pdf_infer_table_structure: bool = False,
xml_keep_tags: bool = False, xml_keep_tags: bool = False,
data_source_metadata: Optional[DataSourceMetadata] = None, data_source_metadata: Optional[DataSourceMetadata] = None,
**kwargs,
): ):
"""Partitions a document into its constituent elements. Will use libmagic to determine """Partitions a document into its constituent elements. Will use libmagic to determine
the file's type and route it to the appropriate partitioning function. Applies the default the file's type and route it to the appropriate partitioning function. Applies the default
@ -121,21 +122,22 @@ def partition(
file.seek(0) file.seek(0)
if filetype == FileType.DOC: if filetype == FileType.DOC:
elements = partition_doc(filename=filename, file=file) elements = partition_doc(filename=filename, file=file, **kwargs)
elif filetype == FileType.DOCX: elif filetype == FileType.DOCX:
elements = partition_docx(filename=filename, file=file) elements = partition_docx(filename=filename, file=file, **kwargs)
elif filetype == FileType.ODT: elif filetype == FileType.ODT:
elements = partition_odt(filename=filename, file=file) elements = partition_odt(filename=filename, file=file, **kwargs)
elif filetype == FileType.EML: elif filetype == FileType.EML:
elements = partition_email(filename=filename, file=file, encoding=encoding) elements = partition_email(filename=filename, file=file, encoding=encoding, **kwargs)
elif filetype == FileType.MSG: elif filetype == FileType.MSG:
elements = partition_msg(filename=filename, file=file) elements = partition_msg(filename=filename, file=file, **kwargs)
elif filetype == FileType.HTML: elif filetype == FileType.HTML:
elements = partition_html( elements = partition_html(
filename=filename, filename=filename,
file=file, file=file,
include_page_breaks=include_page_breaks, include_page_breaks=include_page_breaks,
encoding=encoding, encoding=encoding,
**kwargs,
) )
elif filetype == FileType.XML: elif filetype == FileType.XML:
elements = partition_xml( elements = partition_xml(
@ -143,24 +145,28 @@ def partition(
file=file, file=file,
encoding=encoding, encoding=encoding,
xml_keep_tags=xml_keep_tags, xml_keep_tags=xml_keep_tags,
**kwargs,
) )
elif filetype == FileType.EPUB: elif filetype == FileType.EPUB:
elements = partition_epub( elements = partition_epub(
filename=filename, filename=filename,
file=file, file=file,
include_page_breaks=include_page_breaks, include_page_breaks=include_page_breaks,
**kwargs,
) )
elif filetype == FileType.RST: elif filetype == FileType.RST:
elements = partition_rst( elements = partition_rst(
filename=filename, filename=filename,
file=file, file=file,
include_page_breaks=include_page_breaks, include_page_breaks=include_page_breaks,
**kwargs,
) )
elif filetype == FileType.MD: elif filetype == FileType.MD:
elements = partition_md( elements = partition_md(
filename=filename, filename=filename,
file=file, file=file,
include_page_breaks=include_page_breaks, include_page_breaks=include_page_breaks,
**kwargs,
) )
elif filetype == FileType.PDF: elif filetype == FileType.PDF:
elements = partition_pdf( elements = partition_pdf(
@ -171,6 +177,7 @@ def partition(
infer_table_structure=pdf_infer_table_structure, infer_table_structure=pdf_infer_table_structure,
strategy=strategy, strategy=strategy,
ocr_languages=ocr_languages, ocr_languages=ocr_languages,
**kwargs,
) )
elif (filetype == FileType.PNG) or (filetype == FileType.JPG): elif (filetype == FileType.PNG) or (filetype == FileType.JPG):
elements = partition_image( elements = partition_image(
@ -180,6 +187,7 @@ def partition(
include_page_breaks=include_page_breaks, include_page_breaks=include_page_breaks,
strategy=strategy, strategy=strategy,
ocr_languages=ocr_languages, ocr_languages=ocr_languages,
**kwargs,
) )
elif filetype == FileType.TXT: elif filetype == FileType.TXT:
elements = partition_text( elements = partition_text(
@ -187,33 +195,37 @@ def partition(
file=file, file=file,
encoding=encoding, encoding=encoding,
paragraph_grouper=paragraph_grouper, paragraph_grouper=paragraph_grouper,
**kwargs,
) )
elif filetype == FileType.RTF: elif filetype == FileType.RTF:
elements = partition_rtf( elements = partition_rtf(
filename=filename, filename=filename,
file=file, file=file,
include_page_breaks=include_page_breaks, include_page_breaks=include_page_breaks,
**kwargs,
) )
elif filetype == FileType.PPT: elif filetype == FileType.PPT:
elements = partition_ppt( elements = partition_ppt(
filename=filename, filename=filename,
file=file, file=file,
include_page_breaks=include_page_breaks, include_page_breaks=include_page_breaks,
**kwargs,
) )
elif filetype == FileType.PPTX: elif filetype == FileType.PPTX:
elements = partition_pptx( elements = partition_pptx(
filename=filename, filename=filename,
file=file, file=file,
include_page_breaks=include_page_breaks, include_page_breaks=include_page_breaks,
**kwargs,
) )
elif filetype == FileType.JSON: elif filetype == FileType.JSON:
elements = partition_json(filename=filename, file=file) elements = partition_json(filename=filename, file=file, **kwargs)
elif (filetype == FileType.XLSX) or (filetype == FileType.XLS): elif (filetype == FileType.XLSX) or (filetype == FileType.XLS):
elements = partition_xlsx(filename=filename, file=file) elements = partition_xlsx(filename=filename, file=file, **kwargs)
elif filetype == FileType.CSV: elif filetype == FileType.CSV:
elements = partition_csv(filename=filename, file=file) elements = partition_csv(filename=filename, file=file, **kwargs)
elif filetype == FileType.TSV: elif filetype == FileType.TSV:
elements = partition_tsv(filename=filename, file=file) elements = partition_tsv(filename=filename, file=file, **kwargs)
elif filetype == FileType.EMPTY: elif filetype == FileType.EMPTY:
elements = [] elements = []
else: else:

View File

@ -30,6 +30,7 @@ def partition_html(
headers: Dict[str, str] = {}, headers: Dict[str, str] = {},
ssl_verify: bool = True, ssl_verify: bool = True,
parser: VALID_PARSERS = None, parser: VALID_PARSERS = None,
html_assemble_articles: bool = False,
**kwargs, **kwargs,
) -> List[Element]: ) -> List[Element]:
"""Partitions an HTML document into its constituent elements. """Partitions an HTML document into its constituent elements.
@ -66,15 +67,28 @@ def partition_html(
exactly_one(filename=filename, file=file, text=text, url=url) exactly_one(filename=filename, file=file, text=text, url=url)
if filename is not None: if filename is not None:
document = HTMLDocument.from_file(filename, parser=parser, encoding=encoding) document = HTMLDocument.from_file(
filename,
parser=parser,
encoding=encoding,
assemble_articles=html_assemble_articles,
)
elif file is not None: elif file is not None:
_, file_text = read_txt_file(file=file, encoding=encoding) _, file_text = read_txt_file(file=file, encoding=encoding)
document = HTMLDocument.from_string(file_text, parser=parser) document = HTMLDocument.from_string(
file_text,
parser=parser,
assemble_articles=html_assemble_articles,
)
elif text is not None: elif text is not None:
_text: str = str(text) _text: str = str(text)
document = HTMLDocument.from_string(_text, parser=parser) document = HTMLDocument.from_string(
_text,
parser=parser,
assemble_articles=html_assemble_articles,
)
elif url is not None: elif url is not None:
response = requests.get(url, headers=headers, verify=ssl_verify) response = requests.get(url, headers=headers, verify=ssl_verify)