mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-31 20:33:52 +00:00
fix: enable partition_html
to grab content outside of <article>
tags (#772)
* optionally dont assemble articles * add test for content outside of articles * pass kwargs in partition * changelog and version * update default to False * bump version for release * back to dev version to get another fix in the release
This commit is contained in:
parent
feaf1cb4df
commit
c53ce117bc
@ -12,6 +12,9 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* Adds an `html_assemble_articles` kwarg to `partition_html` to enable users to
|
||||
control whether content outside of `<article>` tags is captured when
|
||||
`<article>` tags are present.
|
||||
* Check for the `xml` attribute on `element` before looking for pagebreaks in `partition_docx`.
|
||||
|
||||
## 0.7.6
|
||||
|
@ -329,6 +329,15 @@ to disable SSL verification in the request.
|
||||
elements = partition_html(url="https://python.org/", ssl_verify=False)
|
||||
|
||||
|
||||
|
||||
If your website contains news articles, it can be helpful to only grab content that appears in
|
||||
between the ``<article>`` tags, if the site uses that convention.
|
||||
To activate this behavior, you can set ``html_assemble_articles=True``.
|
||||
If ``html_assemble_articles`` is ``True``, each ``<article>`` tag will be treated as a page.
|
||||
If ``html_assemble_articles`` is ``True`` and no ``<article>`` tags are present, the behavior
|
||||
is the same as ``html_assemble_articles=False``.
|
||||
|
||||
|
||||
``partition_image``
|
||||
---------------------
|
||||
|
||||
|
@ -246,3 +246,20 @@ def test_emoji_appears_with_emoji_utf8_code():
|
||||
html_text = """\n<html charset="utf-8"><p>Hello 😀</p></html>"""
|
||||
elements = partition_html(text=html_text)
|
||||
assert elements[0] == Title("Hello 😀")
|
||||
|
||||
|
||||
def test_partition_html_can_turn_off_assemble_articles():
|
||||
html_text = """<html>
|
||||
<article>
|
||||
<h1>Some important stuff is going on!</h1>
|
||||
<p>Here is a description of that stuff</p>
|
||||
</article>
|
||||
<article>
|
||||
<h1>Some other important stuff is going on!</h1>
|
||||
<p>Here is a description of that stuff</p>
|
||||
</article>
|
||||
<h4>This is outside of the article.</h4>
|
||||
</html>
|
||||
"""
|
||||
elements = partition_html(text=html_text, html_assemble_articles=False)
|
||||
assert elements[-1] == Title("This is outside of the article.")
|
||||
|
@ -20,7 +20,7 @@ from unstructured.documents.elements import (
|
||||
Text,
|
||||
Title,
|
||||
)
|
||||
from unstructured.documents.xml import XMLDocument
|
||||
from unstructured.documents.xml import VALID_PARSERS, XMLDocument
|
||||
from unstructured.logger import logger
|
||||
from unstructured.partition.text_type import (
|
||||
is_bulleted_text,
|
||||
@ -90,6 +90,15 @@ class HTMLDocument(XMLDocument):
|
||||
"""Class for handling HTML documents. Uses rules based parsing to identify sections
|
||||
of interest within the document."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
stylesheet: Optional[str] = None,
|
||||
parser: VALID_PARSERS = None,
|
||||
assemble_articles: bool = True,
|
||||
):
|
||||
self.assembled_articles = assemble_articles
|
||||
super().__init__(stylesheet=stylesheet, parser=parser)
|
||||
|
||||
def _read(self) -> List[Page]:
|
||||
"""Reads and structures and HTML document. If present, looks for article tags.
|
||||
if there are multiple article sections present, a page break is inserted between them.
|
||||
@ -101,7 +110,7 @@ class HTMLDocument(XMLDocument):
|
||||
etree.strip_elements(self.document_tree, ["script"])
|
||||
root = _find_main(self.document_tree)
|
||||
|
||||
articles = _find_articles(root)
|
||||
articles = _find_articles(root, assemble_articles=self.assembled_articles)
|
||||
page_number = 0
|
||||
page = Page(number=page_number)
|
||||
for article in articles:
|
||||
@ -407,9 +416,12 @@ def _find_main(root: etree.Element) -> etree.Element:
|
||||
return main_tag_elem if main_tag_elem is not None else root
|
||||
|
||||
|
||||
def _find_articles(root: etree.Element) -> List[etree.Element]:
|
||||
def _find_articles(root: etree.Element, assemble_articles: bool = True) -> List[etree.Element]:
|
||||
"""Tries to break the HTML document into distinct articles. If there are no article
|
||||
tags, the entire document is returned as a single item list."""
|
||||
if assemble_articles is False:
|
||||
return root
|
||||
|
||||
articles = root.findall(".//article")
|
||||
if len(articles) == 0:
|
||||
# NOTE(robinson) - ref: https://schema.org/Article
|
||||
|
@ -92,10 +92,16 @@ class XMLDocument(Document):
|
||||
return self.document_tree
|
||||
|
||||
@classmethod
|
||||
def from_string(cls, text: str, parser: VALID_PARSERS = None, stylesheet: Optional[str] = None):
|
||||
def from_string(
|
||||
cls,
|
||||
text: str,
|
||||
parser: VALID_PARSERS = None,
|
||||
stylesheet: Optional[str] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Supports reading in an XML file as a raw string rather than as a file."""
|
||||
logger.info("Reading document from string ...")
|
||||
doc = cls(parser=parser, stylesheet=stylesheet)
|
||||
doc = cls(parser=parser, stylesheet=stylesheet, **kwargs)
|
||||
doc._read_xml(text)
|
||||
return doc
|
||||
|
||||
@ -106,6 +112,7 @@ class XMLDocument(Document):
|
||||
parser: VALID_PARSERS = None,
|
||||
stylesheet: Optional[str] = None,
|
||||
encoding: Optional[str] = None,
|
||||
**kwargs,
|
||||
):
|
||||
_, content = read_txt_file(filename=filename, encoding=encoding)
|
||||
return cls.from_string(content, parser=parser, stylesheet=stylesheet)
|
||||
return cls.from_string(content, parser=parser, stylesheet=stylesheet, **kwargs)
|
||||
|
@ -50,6 +50,7 @@ def partition(
|
||||
pdf_infer_table_structure: bool = False,
|
||||
xml_keep_tags: bool = False,
|
||||
data_source_metadata: Optional[DataSourceMetadata] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Partitions a document into its constituent elements. Will use libmagic to determine
|
||||
the file's type and route it to the appropriate partitioning function. Applies the default
|
||||
@ -121,21 +122,22 @@ def partition(
|
||||
file.seek(0)
|
||||
|
||||
if filetype == FileType.DOC:
|
||||
elements = partition_doc(filename=filename, file=file)
|
||||
elements = partition_doc(filename=filename, file=file, **kwargs)
|
||||
elif filetype == FileType.DOCX:
|
||||
elements = partition_docx(filename=filename, file=file)
|
||||
elements = partition_docx(filename=filename, file=file, **kwargs)
|
||||
elif filetype == FileType.ODT:
|
||||
elements = partition_odt(filename=filename, file=file)
|
||||
elements = partition_odt(filename=filename, file=file, **kwargs)
|
||||
elif filetype == FileType.EML:
|
||||
elements = partition_email(filename=filename, file=file, encoding=encoding)
|
||||
elements = partition_email(filename=filename, file=file, encoding=encoding, **kwargs)
|
||||
elif filetype == FileType.MSG:
|
||||
elements = partition_msg(filename=filename, file=file)
|
||||
elements = partition_msg(filename=filename, file=file, **kwargs)
|
||||
elif filetype == FileType.HTML:
|
||||
elements = partition_html(
|
||||
filename=filename,
|
||||
file=file,
|
||||
include_page_breaks=include_page_breaks,
|
||||
encoding=encoding,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.XML:
|
||||
elements = partition_xml(
|
||||
@ -143,24 +145,28 @@ def partition(
|
||||
file=file,
|
||||
encoding=encoding,
|
||||
xml_keep_tags=xml_keep_tags,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.EPUB:
|
||||
elements = partition_epub(
|
||||
filename=filename,
|
||||
file=file,
|
||||
include_page_breaks=include_page_breaks,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.RST:
|
||||
elements = partition_rst(
|
||||
filename=filename,
|
||||
file=file,
|
||||
include_page_breaks=include_page_breaks,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.MD:
|
||||
elements = partition_md(
|
||||
filename=filename,
|
||||
file=file,
|
||||
include_page_breaks=include_page_breaks,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.PDF:
|
||||
elements = partition_pdf(
|
||||
@ -171,6 +177,7 @@ def partition(
|
||||
infer_table_structure=pdf_infer_table_structure,
|
||||
strategy=strategy,
|
||||
ocr_languages=ocr_languages,
|
||||
**kwargs,
|
||||
)
|
||||
elif (filetype == FileType.PNG) or (filetype == FileType.JPG):
|
||||
elements = partition_image(
|
||||
@ -180,6 +187,7 @@ def partition(
|
||||
include_page_breaks=include_page_breaks,
|
||||
strategy=strategy,
|
||||
ocr_languages=ocr_languages,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.TXT:
|
||||
elements = partition_text(
|
||||
@ -187,33 +195,37 @@ def partition(
|
||||
file=file,
|
||||
encoding=encoding,
|
||||
paragraph_grouper=paragraph_grouper,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.RTF:
|
||||
elements = partition_rtf(
|
||||
filename=filename,
|
||||
file=file,
|
||||
include_page_breaks=include_page_breaks,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.PPT:
|
||||
elements = partition_ppt(
|
||||
filename=filename,
|
||||
file=file,
|
||||
include_page_breaks=include_page_breaks,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.PPTX:
|
||||
elements = partition_pptx(
|
||||
filename=filename,
|
||||
file=file,
|
||||
include_page_breaks=include_page_breaks,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.JSON:
|
||||
elements = partition_json(filename=filename, file=file)
|
||||
elements = partition_json(filename=filename, file=file, **kwargs)
|
||||
elif (filetype == FileType.XLSX) or (filetype == FileType.XLS):
|
||||
elements = partition_xlsx(filename=filename, file=file)
|
||||
elements = partition_xlsx(filename=filename, file=file, **kwargs)
|
||||
elif filetype == FileType.CSV:
|
||||
elements = partition_csv(filename=filename, file=file)
|
||||
elements = partition_csv(filename=filename, file=file, **kwargs)
|
||||
elif filetype == FileType.TSV:
|
||||
elements = partition_tsv(filename=filename, file=file)
|
||||
elements = partition_tsv(filename=filename, file=file, **kwargs)
|
||||
elif filetype == FileType.EMPTY:
|
||||
elements = []
|
||||
else:
|
||||
|
@ -30,6 +30,7 @@ def partition_html(
|
||||
headers: Dict[str, str] = {},
|
||||
ssl_verify: bool = True,
|
||||
parser: VALID_PARSERS = None,
|
||||
html_assemble_articles: bool = False,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Partitions an HTML document into its constituent elements.
|
||||
@ -66,15 +67,28 @@ def partition_html(
|
||||
exactly_one(filename=filename, file=file, text=text, url=url)
|
||||
|
||||
if filename is not None:
|
||||
document = HTMLDocument.from_file(filename, parser=parser, encoding=encoding)
|
||||
document = HTMLDocument.from_file(
|
||||
filename,
|
||||
parser=parser,
|
||||
encoding=encoding,
|
||||
assemble_articles=html_assemble_articles,
|
||||
)
|
||||
|
||||
elif file is not None:
|
||||
_, file_text = read_txt_file(file=file, encoding=encoding)
|
||||
document = HTMLDocument.from_string(file_text, parser=parser)
|
||||
document = HTMLDocument.from_string(
|
||||
file_text,
|
||||
parser=parser,
|
||||
assemble_articles=html_assemble_articles,
|
||||
)
|
||||
|
||||
elif text is not None:
|
||||
_text: str = str(text)
|
||||
document = HTMLDocument.from_string(_text, parser=parser)
|
||||
document = HTMLDocument.from_string(
|
||||
_text,
|
||||
parser=parser,
|
||||
assemble_articles=html_assemble_articles,
|
||||
)
|
||||
|
||||
elif url is not None:
|
||||
response = requests.get(url, headers=headers, verify=ssl_verify)
|
||||
|
Loading…
x
Reference in New Issue
Block a user