fix: enable partition_html to grab content outside of <article> tags (#772)

* optionally dont assemble articles

* add test for content outside of articles

* pass kwargs in partition

* changelog and version

* update default to False

* bump version for release

* back to dev version to get another fix in the release
This commit is contained in:
Matt Robinson 2023-06-20 13:07:30 -04:00 committed by GitHub
parent feaf1cb4df
commit c53ce117bc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 92 additions and 18 deletions

View File

@ -12,6 +12,9 @@
### Fixes
* Adds an `html_assemble_articles` kwarg to `partition_html` to enable users to
control whether content outside of `<article>` tags is captured when
`<article>` tags are present.
* Check for the `xml` attribute on `element` before looking for pagebreaks in `partition_docx`.
## 0.7.6

View File

@ -329,6 +329,15 @@ to disable SSL verification in the request.
elements = partition_html(url="https://python.org/", ssl_verify=False)
If your website contains news articles, it can be helpful to only grab content that appears in
between the ``<article>`` tags, if the site uses that convention.
To activate this behavior, you can set ``html_assemble_articles=True``.
If ``html_assemble_articles`` is ``True``, each ``<article>`` tag will be treated as a page.
If ``html_assemble_articles`` is ``True`` and no ``<article>`` tags are present, the behavior
is the same as ``html_assemble_articles=False``.
``partition_image``
---------------------

View File

@ -246,3 +246,20 @@ def test_emoji_appears_with_emoji_utf8_code():
html_text = """\n<html charset="utf-8"><p>Hello &#128512;</p></html>"""
elements = partition_html(text=html_text)
assert elements[0] == Title("Hello 😀")
def test_partition_html_can_turn_off_assemble_articles():
    """With ``html_assemble_articles=False``, the heading that follows the two
    ``<article>`` sections must be the last element returned."""
    markup = """<html>
<article>
<h1>Some important stuff is going on!</h1>
<p>Here is a description of that stuff</p>
</article>
<article>
<h1>Some other important stuff is going on!</h1>
<p>Here is a description of that stuff</p>
</article>
<h4>This is outside of the article.</h4>
</html>
"""
    parsed = partition_html(text=markup, html_assemble_articles=False)
    # The <h4> sits after both articles, so it should be the final element.
    trailing = parsed[-1]
    assert trailing == Title("This is outside of the article.")

View File

@ -20,7 +20,7 @@ from unstructured.documents.elements import (
Text,
Title,
)
from unstructured.documents.xml import XMLDocument
from unstructured.documents.xml import VALID_PARSERS, XMLDocument
from unstructured.logger import logger
from unstructured.partition.text_type import (
is_bulleted_text,
@ -90,6 +90,15 @@ class HTMLDocument(XMLDocument):
"""Class for handling HTML documents. Uses rules based parsing to identify sections
of interest within the document."""
def __init__(
    self,
    stylesheet: Optional[str] = None,
    parser: VALID_PARSERS = None,
    assemble_articles: bool = True,
):
    """Initialize the HTML document.

    ``stylesheet`` and ``parser`` are forwarded unchanged to the parent
    initializer. ``assemble_articles`` is stored on the instance as
    ``assembled_articles``.
    """
    # Store the flag first, then delegate to the parent initializer
    # (same order as before, in case the parent has side effects).
    self.assembled_articles = assemble_articles
    super().__init__(parser=parser, stylesheet=stylesheet)
def _read(self) -> List[Page]:
"""Reads and structures an HTML document. If present, looks for article tags.
if there are multiple article sections present, a page break is inserted between them.
@ -101,7 +110,7 @@ class HTMLDocument(XMLDocument):
etree.strip_elements(self.document_tree, ["script"])
root = _find_main(self.document_tree)
articles = _find_articles(root)
articles = _find_articles(root, assemble_articles=self.assembled_articles)
page_number = 0
page = Page(number=page_number)
for article in articles:
@ -407,9 +416,12 @@ def _find_main(root: etree.Element) -> etree.Element:
return main_tag_elem if main_tag_elem is not None else root
def _find_articles(root: etree.Element) -> List[etree.Element]:
def _find_articles(root: etree.Element, assemble_articles: bool = True) -> List[etree.Element]:
"""Tries to break the HTML document into distinct articles. If there are no article
tags, the entire document is returned as a single item list."""
if assemble_articles is False:
return root
articles = root.findall(".//article")
if len(articles) == 0:
# NOTE(robinson) - ref: https://schema.org/Article

View File

@ -92,10 +92,16 @@ class XMLDocument(Document):
return self.document_tree
@classmethod
def from_string(cls, text: str, parser: VALID_PARSERS = None, stylesheet: Optional[str] = None):
def from_string(
cls,
text: str,
parser: VALID_PARSERS = None,
stylesheet: Optional[str] = None,
**kwargs,
):
"""Supports reading in an XML file as a raw string rather than as a file."""
logger.info("Reading document from string ...")
doc = cls(parser=parser, stylesheet=stylesheet)
doc = cls(parser=parser, stylesheet=stylesheet, **kwargs)
doc._read_xml(text)
return doc
@ -106,6 +112,7 @@ class XMLDocument(Document):
parser: VALID_PARSERS = None,
stylesheet: Optional[str] = None,
encoding: Optional[str] = None,
**kwargs,
):
_, content = read_txt_file(filename=filename, encoding=encoding)
return cls.from_string(content, parser=parser, stylesheet=stylesheet)
return cls.from_string(content, parser=parser, stylesheet=stylesheet, **kwargs)

View File

@ -50,6 +50,7 @@ def partition(
pdf_infer_table_structure: bool = False,
xml_keep_tags: bool = False,
data_source_metadata: Optional[DataSourceMetadata] = None,
**kwargs,
):
"""Partitions a document into its constituent elements. Will use libmagic to determine
the file's type and route it to the appropriate partitioning function. Applies the default
@ -121,21 +122,22 @@ def partition(
file.seek(0)
if filetype == FileType.DOC:
elements = partition_doc(filename=filename, file=file)
elements = partition_doc(filename=filename, file=file, **kwargs)
elif filetype == FileType.DOCX:
elements = partition_docx(filename=filename, file=file)
elements = partition_docx(filename=filename, file=file, **kwargs)
elif filetype == FileType.ODT:
elements = partition_odt(filename=filename, file=file)
elements = partition_odt(filename=filename, file=file, **kwargs)
elif filetype == FileType.EML:
elements = partition_email(filename=filename, file=file, encoding=encoding)
elements = partition_email(filename=filename, file=file, encoding=encoding, **kwargs)
elif filetype == FileType.MSG:
elements = partition_msg(filename=filename, file=file)
elements = partition_msg(filename=filename, file=file, **kwargs)
elif filetype == FileType.HTML:
elements = partition_html(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
encoding=encoding,
**kwargs,
)
elif filetype == FileType.XML:
elements = partition_xml(
@ -143,24 +145,28 @@ def partition(
file=file,
encoding=encoding,
xml_keep_tags=xml_keep_tags,
**kwargs,
)
elif filetype == FileType.EPUB:
elements = partition_epub(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
**kwargs,
)
elif filetype == FileType.RST:
elements = partition_rst(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
**kwargs,
)
elif filetype == FileType.MD:
elements = partition_md(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
**kwargs,
)
elif filetype == FileType.PDF:
elements = partition_pdf(
@ -171,6 +177,7 @@ def partition(
infer_table_structure=pdf_infer_table_structure,
strategy=strategy,
ocr_languages=ocr_languages,
**kwargs,
)
elif (filetype == FileType.PNG) or (filetype == FileType.JPG):
elements = partition_image(
@ -180,6 +187,7 @@ def partition(
include_page_breaks=include_page_breaks,
strategy=strategy,
ocr_languages=ocr_languages,
**kwargs,
)
elif filetype == FileType.TXT:
elements = partition_text(
@ -187,33 +195,37 @@ def partition(
file=file,
encoding=encoding,
paragraph_grouper=paragraph_grouper,
**kwargs,
)
elif filetype == FileType.RTF:
elements = partition_rtf(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
**kwargs,
)
elif filetype == FileType.PPT:
elements = partition_ppt(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
**kwargs,
)
elif filetype == FileType.PPTX:
elements = partition_pptx(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
**kwargs,
)
elif filetype == FileType.JSON:
elements = partition_json(filename=filename, file=file)
elements = partition_json(filename=filename, file=file, **kwargs)
elif (filetype == FileType.XLSX) or (filetype == FileType.XLS):
elements = partition_xlsx(filename=filename, file=file)
elements = partition_xlsx(filename=filename, file=file, **kwargs)
elif filetype == FileType.CSV:
elements = partition_csv(filename=filename, file=file)
elements = partition_csv(filename=filename, file=file, **kwargs)
elif filetype == FileType.TSV:
elements = partition_tsv(filename=filename, file=file)
elements = partition_tsv(filename=filename, file=file, **kwargs)
elif filetype == FileType.EMPTY:
elements = []
else:

View File

@ -30,6 +30,7 @@ def partition_html(
headers: Dict[str, str] = {},
ssl_verify: bool = True,
parser: VALID_PARSERS = None,
html_assemble_articles: bool = False,
**kwargs,
) -> List[Element]:
"""Partitions an HTML document into its constituent elements.
@ -66,15 +67,28 @@ def partition_html(
exactly_one(filename=filename, file=file, text=text, url=url)
if filename is not None:
document = HTMLDocument.from_file(filename, parser=parser, encoding=encoding)
document = HTMLDocument.from_file(
filename,
parser=parser,
encoding=encoding,
assemble_articles=html_assemble_articles,
)
elif file is not None:
_, file_text = read_txt_file(file=file, encoding=encoding)
document = HTMLDocument.from_string(file_text, parser=parser)
document = HTMLDocument.from_string(
file_text,
parser=parser,
assemble_articles=html_assemble_articles,
)
elif text is not None:
_text: str = str(text)
document = HTMLDocument.from_string(_text, parser=parser)
document = HTMLDocument.from_string(
_text,
parser=parser,
assemble_articles=html_assemble_articles,
)
elif url is not None:
response = requests.get(url, headers=headers, verify=ssl_verify)