mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-31 20:33:52 +00:00
fix: enable partition_html
to grab content outside of <article>
tags (#772)
* optionally dont assemble articles * add test for content outside of articles * pass kwargs in partition * changelog and version * update default to False * bump version for release * back to dev version to get another fix in the release
This commit is contained in:
parent
feaf1cb4df
commit
c53ce117bc
@ -12,6 +12,9 @@
|
|||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
|
* Adds an `html_assemble_articles` kwarg to `partition_html` to enable users to
|
||||||
|
control whether content outside of `<article>` tags is captured when
|
||||||
|
`<article>` tags are present.
|
||||||
* Check for the `xml` attribute on `element` before looking for pagebreaks in `partition_docx`.
|
* Check for the `xml` attribute on `element` before looking for pagebreaks in `partition_docx`.
|
||||||
|
|
||||||
## 0.7.6
|
## 0.7.6
|
||||||
|
@ -329,6 +329,15 @@ to disable SSL verification in the request.
|
|||||||
elements = partition_html(url="https://python.org/", ssl_verify=False)
|
elements = partition_html(url="https://python.org/", ssl_verify=False)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
If your website contains news articles, it can be helpful to only grab content that appears in
|
||||||
|
between the ``<article>`` tags, if the site uses that convention.
|
||||||
|
To activate this behavior, you can set ``html_assemble_articles=True``.
|
||||||
|
If ``html_assemble_articles`` is ``True``, each ``<article>`` tag will be treated as a page.
|
||||||
|
If ``html_assemble_articles`` is ``True`` and no ``<article>`` tags are present, the behavior
|
||||||
|
is the same as ``html_assemble_articles=False``.
|
||||||
|
|
||||||
|
|
||||||
``partition_image``
|
``partition_image``
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
|
@ -246,3 +246,20 @@ def test_emoji_appears_with_emoji_utf8_code():
|
|||||||
html_text = """\n<html charset="utf-8"><p>Hello 😀</p></html>"""
|
html_text = """\n<html charset="utf-8"><p>Hello 😀</p></html>"""
|
||||||
elements = partition_html(text=html_text)
|
elements = partition_html(text=html_text)
|
||||||
assert elements[0] == Title("Hello 😀")
|
assert elements[0] == Title("Hello 😀")
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_html_can_turn_off_assemble_articles():
|
||||||
|
html_text = """<html>
|
||||||
|
<article>
|
||||||
|
<h1>Some important stuff is going on!</h1>
|
||||||
|
<p>Here is a description of that stuff</p>
|
||||||
|
</article>
|
||||||
|
<article>
|
||||||
|
<h1>Some other important stuff is going on!</h1>
|
||||||
|
<p>Here is a description of that stuff</p>
|
||||||
|
</article>
|
||||||
|
<h4>This is outside of the article.</h4>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
elements = partition_html(text=html_text, html_assemble_articles=False)
|
||||||
|
assert elements[-1] == Title("This is outside of the article.")
|
||||||
|
@ -20,7 +20,7 @@ from unstructured.documents.elements import (
|
|||||||
Text,
|
Text,
|
||||||
Title,
|
Title,
|
||||||
)
|
)
|
||||||
from unstructured.documents.xml import XMLDocument
|
from unstructured.documents.xml import VALID_PARSERS, XMLDocument
|
||||||
from unstructured.logger import logger
|
from unstructured.logger import logger
|
||||||
from unstructured.partition.text_type import (
|
from unstructured.partition.text_type import (
|
||||||
is_bulleted_text,
|
is_bulleted_text,
|
||||||
@ -90,6 +90,15 @@ class HTMLDocument(XMLDocument):
|
|||||||
"""Class for handling HTML documents. Uses rules based parsing to identify sections
|
"""Class for handling HTML documents. Uses rules based parsing to identify sections
|
||||||
of interest within the document."""
|
of interest within the document."""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
stylesheet: Optional[str] = None,
|
||||||
|
parser: VALID_PARSERS = None,
|
||||||
|
assemble_articles: bool = True,
|
||||||
|
):
|
||||||
|
self.assembled_articles = assemble_articles
|
||||||
|
super().__init__(stylesheet=stylesheet, parser=parser)
|
||||||
|
|
||||||
def _read(self) -> List[Page]:
|
def _read(self) -> List[Page]:
|
||||||
"""Reads and structures and HTML document. If present, looks for article tags.
|
"""Reads and structures and HTML document. If present, looks for article tags.
|
||||||
if there are multiple article sections present, a page break is inserted between them.
|
if there are multiple article sections present, a page break is inserted between them.
|
||||||
@ -101,7 +110,7 @@ class HTMLDocument(XMLDocument):
|
|||||||
etree.strip_elements(self.document_tree, ["script"])
|
etree.strip_elements(self.document_tree, ["script"])
|
||||||
root = _find_main(self.document_tree)
|
root = _find_main(self.document_tree)
|
||||||
|
|
||||||
articles = _find_articles(root)
|
articles = _find_articles(root, assemble_articles=self.assembled_articles)
|
||||||
page_number = 0
|
page_number = 0
|
||||||
page = Page(number=page_number)
|
page = Page(number=page_number)
|
||||||
for article in articles:
|
for article in articles:
|
||||||
@ -407,9 +416,12 @@ def _find_main(root: etree.Element) -> etree.Element:
|
|||||||
return main_tag_elem if main_tag_elem is not None else root
|
return main_tag_elem if main_tag_elem is not None else root
|
||||||
|
|
||||||
|
|
||||||
def _find_articles(root: etree.Element) -> List[etree.Element]:
|
def _find_articles(root: etree.Element, assemble_articles: bool = True) -> List[etree.Element]:
|
||||||
"""Tries to break the HTML document into distinct articles. If there are no article
|
"""Tries to break the HTML document into distinct articles. If there are no article
|
||||||
tags, the entire document is returned as a single item list."""
|
tags, the entire document is returned as a single item list."""
|
||||||
|
if assemble_articles is False:
|
||||||
|
return root
|
||||||
|
|
||||||
articles = root.findall(".//article")
|
articles = root.findall(".//article")
|
||||||
if len(articles) == 0:
|
if len(articles) == 0:
|
||||||
# NOTE(robinson) - ref: https://schema.org/Article
|
# NOTE(robinson) - ref: https://schema.org/Article
|
||||||
|
@ -92,10 +92,16 @@ class XMLDocument(Document):
|
|||||||
return self.document_tree
|
return self.document_tree
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_string(cls, text: str, parser: VALID_PARSERS = None, stylesheet: Optional[str] = None):
|
def from_string(
|
||||||
|
cls,
|
||||||
|
text: str,
|
||||||
|
parser: VALID_PARSERS = None,
|
||||||
|
stylesheet: Optional[str] = None,
|
||||||
|
**kwargs,
|
||||||
|
):
|
||||||
"""Supports reading in an XML file as a raw string rather than as a file."""
|
"""Supports reading in an XML file as a raw string rather than as a file."""
|
||||||
logger.info("Reading document from string ...")
|
logger.info("Reading document from string ...")
|
||||||
doc = cls(parser=parser, stylesheet=stylesheet)
|
doc = cls(parser=parser, stylesheet=stylesheet, **kwargs)
|
||||||
doc._read_xml(text)
|
doc._read_xml(text)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
@ -106,6 +112,7 @@ class XMLDocument(Document):
|
|||||||
parser: VALID_PARSERS = None,
|
parser: VALID_PARSERS = None,
|
||||||
stylesheet: Optional[str] = None,
|
stylesheet: Optional[str] = None,
|
||||||
encoding: Optional[str] = None,
|
encoding: Optional[str] = None,
|
||||||
|
**kwargs,
|
||||||
):
|
):
|
||||||
_, content = read_txt_file(filename=filename, encoding=encoding)
|
_, content = read_txt_file(filename=filename, encoding=encoding)
|
||||||
return cls.from_string(content, parser=parser, stylesheet=stylesheet)
|
return cls.from_string(content, parser=parser, stylesheet=stylesheet, **kwargs)
|
||||||
|
@ -50,6 +50,7 @@ def partition(
|
|||||||
pdf_infer_table_structure: bool = False,
|
pdf_infer_table_structure: bool = False,
|
||||||
xml_keep_tags: bool = False,
|
xml_keep_tags: bool = False,
|
||||||
data_source_metadata: Optional[DataSourceMetadata] = None,
|
data_source_metadata: Optional[DataSourceMetadata] = None,
|
||||||
|
**kwargs,
|
||||||
):
|
):
|
||||||
"""Partitions a document into its constituent elements. Will use libmagic to determine
|
"""Partitions a document into its constituent elements. Will use libmagic to determine
|
||||||
the file's type and route it to the appropriate partitioning function. Applies the default
|
the file's type and route it to the appropriate partitioning function. Applies the default
|
||||||
@ -121,21 +122,22 @@ def partition(
|
|||||||
file.seek(0)
|
file.seek(0)
|
||||||
|
|
||||||
if filetype == FileType.DOC:
|
if filetype == FileType.DOC:
|
||||||
elements = partition_doc(filename=filename, file=file)
|
elements = partition_doc(filename=filename, file=file, **kwargs)
|
||||||
elif filetype == FileType.DOCX:
|
elif filetype == FileType.DOCX:
|
||||||
elements = partition_docx(filename=filename, file=file)
|
elements = partition_docx(filename=filename, file=file, **kwargs)
|
||||||
elif filetype == FileType.ODT:
|
elif filetype == FileType.ODT:
|
||||||
elements = partition_odt(filename=filename, file=file)
|
elements = partition_odt(filename=filename, file=file, **kwargs)
|
||||||
elif filetype == FileType.EML:
|
elif filetype == FileType.EML:
|
||||||
elements = partition_email(filename=filename, file=file, encoding=encoding)
|
elements = partition_email(filename=filename, file=file, encoding=encoding, **kwargs)
|
||||||
elif filetype == FileType.MSG:
|
elif filetype == FileType.MSG:
|
||||||
elements = partition_msg(filename=filename, file=file)
|
elements = partition_msg(filename=filename, file=file, **kwargs)
|
||||||
elif filetype == FileType.HTML:
|
elif filetype == FileType.HTML:
|
||||||
elements = partition_html(
|
elements = partition_html(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
encoding=encoding,
|
encoding=encoding,
|
||||||
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.XML:
|
elif filetype == FileType.XML:
|
||||||
elements = partition_xml(
|
elements = partition_xml(
|
||||||
@ -143,24 +145,28 @@ def partition(
|
|||||||
file=file,
|
file=file,
|
||||||
encoding=encoding,
|
encoding=encoding,
|
||||||
xml_keep_tags=xml_keep_tags,
|
xml_keep_tags=xml_keep_tags,
|
||||||
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.EPUB:
|
elif filetype == FileType.EPUB:
|
||||||
elements = partition_epub(
|
elements = partition_epub(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.RST:
|
elif filetype == FileType.RST:
|
||||||
elements = partition_rst(
|
elements = partition_rst(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.MD:
|
elif filetype == FileType.MD:
|
||||||
elements = partition_md(
|
elements = partition_md(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.PDF:
|
elif filetype == FileType.PDF:
|
||||||
elements = partition_pdf(
|
elements = partition_pdf(
|
||||||
@ -171,6 +177,7 @@ def partition(
|
|||||||
infer_table_structure=pdf_infer_table_structure,
|
infer_table_structure=pdf_infer_table_structure,
|
||||||
strategy=strategy,
|
strategy=strategy,
|
||||||
ocr_languages=ocr_languages,
|
ocr_languages=ocr_languages,
|
||||||
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif (filetype == FileType.PNG) or (filetype == FileType.JPG):
|
elif (filetype == FileType.PNG) or (filetype == FileType.JPG):
|
||||||
elements = partition_image(
|
elements = partition_image(
|
||||||
@ -180,6 +187,7 @@ def partition(
|
|||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
strategy=strategy,
|
strategy=strategy,
|
||||||
ocr_languages=ocr_languages,
|
ocr_languages=ocr_languages,
|
||||||
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.TXT:
|
elif filetype == FileType.TXT:
|
||||||
elements = partition_text(
|
elements = partition_text(
|
||||||
@ -187,33 +195,37 @@ def partition(
|
|||||||
file=file,
|
file=file,
|
||||||
encoding=encoding,
|
encoding=encoding,
|
||||||
paragraph_grouper=paragraph_grouper,
|
paragraph_grouper=paragraph_grouper,
|
||||||
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.RTF:
|
elif filetype == FileType.RTF:
|
||||||
elements = partition_rtf(
|
elements = partition_rtf(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.PPT:
|
elif filetype == FileType.PPT:
|
||||||
elements = partition_ppt(
|
elements = partition_ppt(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.PPTX:
|
elif filetype == FileType.PPTX:
|
||||||
elements = partition_pptx(
|
elements = partition_pptx(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif filetype == FileType.JSON:
|
elif filetype == FileType.JSON:
|
||||||
elements = partition_json(filename=filename, file=file)
|
elements = partition_json(filename=filename, file=file, **kwargs)
|
||||||
elif (filetype == FileType.XLSX) or (filetype == FileType.XLS):
|
elif (filetype == FileType.XLSX) or (filetype == FileType.XLS):
|
||||||
elements = partition_xlsx(filename=filename, file=file)
|
elements = partition_xlsx(filename=filename, file=file, **kwargs)
|
||||||
elif filetype == FileType.CSV:
|
elif filetype == FileType.CSV:
|
||||||
elements = partition_csv(filename=filename, file=file)
|
elements = partition_csv(filename=filename, file=file, **kwargs)
|
||||||
elif filetype == FileType.TSV:
|
elif filetype == FileType.TSV:
|
||||||
elements = partition_tsv(filename=filename, file=file)
|
elements = partition_tsv(filename=filename, file=file, **kwargs)
|
||||||
elif filetype == FileType.EMPTY:
|
elif filetype == FileType.EMPTY:
|
||||||
elements = []
|
elements = []
|
||||||
else:
|
else:
|
||||||
|
@ -30,6 +30,7 @@ def partition_html(
|
|||||||
headers: Dict[str, str] = {},
|
headers: Dict[str, str] = {},
|
||||||
ssl_verify: bool = True,
|
ssl_verify: bool = True,
|
||||||
parser: VALID_PARSERS = None,
|
parser: VALID_PARSERS = None,
|
||||||
|
html_assemble_articles: bool = False,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> List[Element]:
|
) -> List[Element]:
|
||||||
"""Partitions an HTML document into its constituent elements.
|
"""Partitions an HTML document into its constituent elements.
|
||||||
@ -66,15 +67,28 @@ def partition_html(
|
|||||||
exactly_one(filename=filename, file=file, text=text, url=url)
|
exactly_one(filename=filename, file=file, text=text, url=url)
|
||||||
|
|
||||||
if filename is not None:
|
if filename is not None:
|
||||||
document = HTMLDocument.from_file(filename, parser=parser, encoding=encoding)
|
document = HTMLDocument.from_file(
|
||||||
|
filename,
|
||||||
|
parser=parser,
|
||||||
|
encoding=encoding,
|
||||||
|
assemble_articles=html_assemble_articles,
|
||||||
|
)
|
||||||
|
|
||||||
elif file is not None:
|
elif file is not None:
|
||||||
_, file_text = read_txt_file(file=file, encoding=encoding)
|
_, file_text = read_txt_file(file=file, encoding=encoding)
|
||||||
document = HTMLDocument.from_string(file_text, parser=parser)
|
document = HTMLDocument.from_string(
|
||||||
|
file_text,
|
||||||
|
parser=parser,
|
||||||
|
assemble_articles=html_assemble_articles,
|
||||||
|
)
|
||||||
|
|
||||||
elif text is not None:
|
elif text is not None:
|
||||||
_text: str = str(text)
|
_text: str = str(text)
|
||||||
document = HTMLDocument.from_string(_text, parser=parser)
|
document = HTMLDocument.from_string(
|
||||||
|
_text,
|
||||||
|
parser=parser,
|
||||||
|
assemble_articles=html_assemble_articles,
|
||||||
|
)
|
||||||
|
|
||||||
elif url is not None:
|
elif url is not None:
|
||||||
response = requests.get(url, headers=headers, verify=ssl_verify)
|
response = requests.get(url, headers=headers, verify=ssl_verify)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user