mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00

* move partition_pdf to partition folder * move partition.py * refactor partitioning bricks into partition directory * import to nlp for backward compatibility * update docs * update version and bump changelog * fix typo in changelog * update readme reference
361 lines
13 KiB
Python
361 lines
13 KiB
Python
from __future__ import annotations
|
|
from typing import List, Optional, Sequence, Tuple
|
|
import sys
|
|
|
|
if sys.version_info < (3, 8):
|
|
from typing_extensions import Final
|
|
else:
|
|
from typing import Final
|
|
|
|
from lxml import etree
|
|
|
|
from unstructured.logger import get_logger
|
|
|
|
from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes
|
|
from unstructured.documents.base import Page
|
|
from unstructured.documents.elements import ListItem, Element, NarrativeText, Title
|
|
from unstructured.documents.xml import XMLDocument
|
|
from unstructured.partition.text_type import (
|
|
is_bulleted_text,
|
|
is_possible_narrative_text,
|
|
is_possible_title,
|
|
)
|
|
|
|
# Module-level logger used by HTMLDocument._read.
logger = get_logger()

# Tags that _is_text_tag always treats as candidates for narrative text or titles.
TEXT_TAGS: Final[List[str]] = [
    "p",
    "a",
    "td",
    "span",
    "font",
]
# Tags that is_list_item_tag always treats as list items.
LIST_ITEM_TAGS: Final[List[str]] = [
    "li",
    "dd",
]
# Heading tags: also text tags, but is_narrative_tag never classifies them as narrative text.
HEADING_TAGS: Final[List[str]] = [
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
]
# Ancestor tags that mark an element as table content (see has_table_ancestor).
TABLE_TAGS: Final[List[str]] = [
    "table",
    "tbody",
    "td",
    "tr",
]
# Tags that start a new page in HTMLDocument._read.
PAGEBREAK_TAGS: Final[List[str]] = [
    "hr",
]
# Ancestor tags that mark an element as header or footer content (see in_header_or_footer).
HEADER_OR_FOOTER_TAGS: Final[List[str]] = [
    "header",
    "footer",
]
|
|
|
|
|
|
class TagsMixin:
    """Mixin that stores the tag an element was built from, together with its ancestor tags.

    ``tag`` is keyword-only and must be supplied; every other positional or keyword argument
    is handed on to the next ``__init__`` in the method resolution order.
    """

    def __init__(
        self,
        *args,
        tag: Optional[str] = None,
        ancestortags: Sequence[str] = tuple(),
        **kwargs,
    ):
        # The None default only exists so that a missing tag is reported with this message.
        if tag is None:
            raise TypeError("tag argument must be passed and not None")
        self.tag = tag
        self.ancestortags = ancestortags
        super().__init__(*args, **kwargs)
|
|
|
|
|
|
class HTMLTitle(TagsMixin, Title):
    """Title that also carries the tag and ancestor tags stored by TagsMixin."""
|
|
|
|
|
|
class HTMLNarrativeText(TagsMixin, NarrativeText):
    """NarrativeText that also carries the tag and ancestor tags stored by TagsMixin."""
|
|
|
|
|
|
class HTMLListItem(TagsMixin, ListItem):
    """ListItem that also carries the tag and ancestor tags stored by TagsMixin."""
|
|
|
|
|
|
class HTMLDocument(XMLDocument):
    """Document class for HTML input. Sections of interest are located with rules-based
    parsing of the element tree."""

    def _read(self) -> List[Page]:
        """Reads and structures an HTML document. The walk starts at the <main> tag when one
        exists and is split by article sections when those are present. A new page is started
        at every page break tag and at the end of every article section, provided the current
        page already holds at least one element.
        """
        if self._pages:
            return self._pages
        logger.info("Reading document ...")

        root = _find_main(self.document_tree)
        collected: List[Page] = []
        current_number = 0
        current_page = Page(number=current_number)

        for article in _find_articles(root):
            # Descendants of the most recent node that produced output; they are skipped so
            # the same text is not emitted again while walking further down the tree.
            already_covered: Tuple[etree.Element, ...] = ()
            for node in article.iter():
                if node in already_covered:
                    continue

                if _is_text_tag(node):
                    parsed = _parse_tag(node)
                    if parsed is not None:
                        current_page.elements.append(parsed)
                        already_covered = tuple(node.iterdescendants())

                elif _is_bulleted_table(node):
                    current_page.elements.extend(_bulleted_text_from_table(node))
                    already_covered = tuple(node.iterdescendants())

                elif is_list_item_tag(node):
                    list_item, following = _process_list_item(node)
                    if list_item is not None:
                        current_page.elements.append(list_item)
                        already_covered = _get_bullet_descendants(node, following)

                elif node.tag in PAGEBREAK_TAGS and current_page.elements:
                    collected.append(current_page)
                    current_number += 1
                    current_page = Page(number=current_number)

            # End of an article section: close the page if anything was collected on it.
            if current_page.elements:
                collected.append(current_page)
                current_number += 1
                current_page = Page(number=current_number)

        return collected

    def doc_after_cleaners(
        self, skip_headers_and_footers=False, skip_table_text=False, inplace=False
    ) -> HTMLDocument:
        """Filters the elements of the document according to the given criteria. Pages are
        renumbered from zero, and a page whose elements are all filtered out is dropped, so
        the number of pages can change.

        Parameters
        ----------
        skip_table_text:
            If True, skips text that is contained within a table element
        skip_headers_and_footers:
            If True, ignores any content that is within <header> or <footer> tags
        inplace:
            If True, document is modified in place and returned. Otherwise a new instance
            built from the filtered pages is returned.
        """
        # Predicates that mark an element for removal, in the same order as the flags.
        excluders = [
            excluder
            for enabled, excluder in (
                (skip_headers_and_footers, in_header_or_footer),
                (skip_table_text, has_table_ancestor),
            )
            if enabled
        ]

        filtered_pages = []
        for page in self.pages:
            kept: List[Element] = []
            for el in page.elements:
                if not isinstance(el, TagsMixin):
                    raise ValueError(
                        f"elements of class {self.__class__} should be of type HTMLTitle "
                        f"HTMLNarrativeText, or HTMLListItem but "
                        f"object has an element of type {type(el)}"
                    )
                excluded = any(excluder(el) for excluder in excluders)
                if not excluded:
                    kept.append(el)
                # Once a footer is reached, the remaining elements of this page are ignored.
                if skip_headers_and_footers and "footer" in tuple(el.ancestortags) + (el.tag,):
                    break
            if kept:
                # Page numbers follow the count of pages that survived filtering.
                surviving_page = Page(number=len(filtered_pages))
                surviving_page.elements = kept
                filtered_pages.append(surviving_page)

        if inplace:
            self._pages = filtered_pages
            self._elements = None
            return self

        out = self.__class__.from_pages(filtered_pages)
        if not isinstance(out, HTMLDocument):
            # NOTE(robinson) - Skipping for test coverage because this condition is impossible.
            # Added type check because from_pages is a method on Document. Without the type
            # check, mypy complains about returning Document instead of HTMLDocument
            raise ValueError(f"Unexpected class: {self.__class__.__name__}")  # pragma: no cover
        return out
|
|
|
|
|
|
def _parse_tag(
    tag_elem: etree.Element,
) -> Optional[Element]:
    """Builds a text element from an etree element, or returns None when the element holds no
    usable text. Bulleted text becomes an HTMLListItem, narrative text an HTMLNarrativeText and
    a possible title an HTMLTitle. The tag and the ancestor tags are stored on the result so
    that later filtering or classification does not need to walk the document tree again."""
    text = _construct_text(tag_elem)
    if not text:
        return None

    # Ancestor tags, in the reverse of the order produced by iterancestors().
    lineage: Tuple[str, ...] = tuple(
        reversed([ancestor.tag for ancestor in tag_elem.iterancestors()])
    )

    if is_bulleted_text(text):
        cleaned = clean_bullets(text)
        # Nothing left once the bullet is removed, so there is no list item to emit.
        if not cleaned:
            return None
        return HTMLListItem(text=cleaned, tag=tag_elem.tag, ancestortags=lineage)

    if len(text) < 2:
        return None
    if is_narrative_tag(text, tag_elem.tag):
        return HTMLNarrativeText(text, tag=tag_elem.tag, ancestortags=lineage)
    if is_possible_title(text):
        return HTMLTitle(text, tag=tag_elem.tag, ancestortags=lineage)

    # Something that might end up here is text that's just a number.
    return None
|
|
|
|
|
|
def is_narrative_tag(text: str, tag: str) -> bool:
    """Decides whether text is narrative, taking the tag into account: text found in a heading
    tag is never narrative, anything else is judged by is_possible_narrative_text."""
    if tag in HEADING_TAGS:
        return False
    return is_possible_narrative_text(text)
|
|
|
|
|
|
def _construct_text(tag_elem: etree.Element) -> str:
    """Builds the text for an element: every non-empty piece yielded by itertext() is
    concatenated, followed by the element's tail when it has one. The result is passed through
    replace_unicode_quotes and stripped of surrounding whitespace."""
    pieces = [fragment for fragment in tag_elem.itertext() if fragment]
    if tag_elem.tail:
        pieces.append(tag_elem.tail)
    return replace_unicode_quotes("".join(pieces)).strip()
|
|
|
|
|
|
def _is_text_tag(tag_elem: etree.Element) -> bool:
    """Determines whether a tag potentially contains narrative text. This holds for every text
    and heading tag, for a div without children, and for a div made up of bulleted spans."""
    if tag_elem.tag in TEXT_TAGS or tag_elem.tag in HEADING_TAGS:
        return True

    children = tag_elem.getchildren()
    # NOTE(robinson) - A div tag with no children is potentially a text tag if it has text.
    childless_div = tag_elem.tag == "div" and len(children) == 0
    if childless_div:
        return True

    return bool(_has_adjacent_bulleted_spans(tag_elem, children))
|
|
|
|
|
|
def _process_list_item(
    tag_elem: etree.Element, max_predecessor_len: int = 5
) -> Tuple[Optional[Element], Optional[etree.Element]]:
    """Converts an etree element that holds bulleted text into a list item.

    Returns a ``(list_item, next_element)`` pair:

    * For a tag in LIST_ITEM_TAGS, the list item is built from the tag's own text and
      ``next_element`` is None.
    * For a div, the list item is built from the text of the div's next sibling, and that
      sibling is returned as ``next_element`` so the caller can skip its descendants. The pair
      is ``(None, None)`` when the div has no next sibling, when the div has more than
      ``max_predecessor_len`` direct children, or when the sibling has no text.
    * For any other tag the pair is ``(None, None)``.
    """
    if tag_elem.tag in LIST_ITEM_TAGS:
        return HTMLListItem(text=_construct_text(tag_elem), tag=tag_elem.tag), None

    if tag_elem.tag == "div":
        next_element = tag_elem.getnext()
        if next_element is None:
            return None, None
        # NOTE(robinson) - Only consider elements with limited depth. Otherwise,
        # it could be the text representation of a giant div.
        # len(tag_elem) is the number of direct children of the div. The check runs before
        # the sibling text is built so that no work is done for a div that is rejected.
        if len(tag_elem) > max_predecessor_len:
            return None, None
        next_text = _construct_text(next_element)
        if next_text:
            return HTMLListItem(text=next_text, tag=next_element.tag), next_element

    return None, None
|
|
|
|
|
|
def _get_bullet_descendants(element, next_element) -> Tuple[etree.Element, ...]:
|
|
descendants = list()
|
|
if element is not None:
|
|
if next_element is not None:
|
|
descendants += list(next_element.iterdescendants())
|
|
descendanttag_elems = tuple(descendants)
|
|
return descendanttag_elems
|
|
|
|
|
|
def is_list_item_tag(tag_elem: etree.Element) -> bool:
    """Checks whether a tag holds a list item: either it is one of the list item tags, or it
    is a div whose text is bulleted."""
    if tag_elem.tag in LIST_ITEM_TAGS:
        return True
    if tag_elem.tag != "div":
        return False
    return bool(is_bulleted_text(_construct_text(tag_elem)))
|
|
|
|
|
|
def _bulleted_text_from_table(table) -> List[Element]:
|
|
"""Extracts bulletized narrative text from a table.
|
|
|
|
NOTE: if a table has mixed bullets and non-bullets, only bullets are extracted.
|
|
I.e., _read() will drop non-bullet narrative text in the table.
|
|
"""
|
|
bulleted_text: List[Element] = list()
|
|
rows = table.findall(".//tr")
|
|
for row in rows:
|
|
text = _construct_text(row)
|
|
if is_bulleted_text(text):
|
|
bulleted_text.append(HTMLListItem(text=clean_bullets(text), tag=row.tag))
|
|
return bulleted_text
|
|
|
|
|
|
def _is_bulleted_table(tag_elem) -> bool:
|
|
"""Checks to see if a table element contains bulleted text."""
|
|
if tag_elem.tag != "table":
|
|
return False
|
|
|
|
rows = tag_elem.findall(".//tr")
|
|
for row in rows:
|
|
text = _construct_text(row)
|
|
if text and not is_bulleted_text(text):
|
|
return False
|
|
|
|
return True
|
|
|
|
|
|
def _has_adjacent_bulleted_spans(tag_elem: etree.Element, children: List[etree.Element]) -> bool:
|
|
"""Checks to see if a div contains two or more adjacent spans beginning with a bullet. If
|
|
this is the case, it is treated as a single bulleted text element."""
|
|
if tag_elem.tag == "div":
|
|
all_spans = all([child.tag == "span" for child in children])
|
|
_is_bulleted = children[0].text is not None and is_bulleted_text(children[0].text)
|
|
if all_spans and _is_bulleted:
|
|
return True
|
|
return False
|
|
|
|
|
|
def has_table_ancestor(element: TagsMixin) -> bool:
    """Checks whether any ancestor tag of the element is a table tag. Such an element is
    considered table content rather than a section of narrative text."""
    return any(ancestor in TABLE_TAGS for ancestor in element.ancestortags)
|
|
|
|
|
|
def in_header_or_footer(element: TagsMixin) -> bool:
    """Checks whether the element sits inside a header or footer tag, i.e. whether one of its
    ancestor tags is a header or footer tag."""
    for ancestor in element.ancestortags:
        if ancestor in HEADER_OR_FOOTER_TAGS:
            return True
    return False
|
|
|
|
|
|
def _find_main(root: etree.Element) -> etree.Element:
|
|
"""Finds the main tag of the HTML document if it exists. Otherwise, returns the
|
|
whole document."""
|
|
main_tag_elem = root.find(".//main")
|
|
return main_tag_elem if main_tag_elem is not None else root
|
|
|
|
|
|
def _find_articles(root: etree.Element) -> List[etree.Element]:
|
|
"""Tries to break the HTML document into distinct articles. If there are no article
|
|
tags, the entire document is returned as a single item list."""
|
|
articles = root.findall(".//article")
|
|
if len(articles) == 0:
|
|
# NOTE(robinson) - ref: https://schema.org/Article
|
|
articles = root.findall(".//div[@itemprop='articleBody']")
|
|
return [root] if len(articles) == 0 else articles
|