unstructured/unstructured/documents/html.py

from __future__ import annotations
from typing import List, Optional, Sequence, Tuple
import sys

if sys.version_info < (3, 8):
    from typing_extensions import Final
else:
    from typing import Final

from lxml import etree

from unstructured.logger import get_logger

from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes
from unstructured.documents.base import Page
from unstructured.documents.elements import ListItem, Element, NarrativeText, Title
from unstructured.documents.xml import XMLDocument
from unstructured.partition.text_type import (
    is_bulleted_text,
    is_possible_narrative_text,
    is_possible_title,
)

logger = get_logger()

TEXT_TAGS: Final[List[str]] = ["p", "a", "td", "span", "font"]
LIST_ITEM_TAGS: Final[List[str]] = ["li", "dd"]
HEADING_TAGS: Final[List[str]] = ["h1", "h2", "h3", "h4", "h5", "h6"]
TABLE_TAGS: Final[List[str]] = ["table", "tbody", "td", "tr"]
PAGEBREAK_TAGS: Final[List[str]] = ["hr"]
HEADER_OR_FOOTER_TAGS: Final[List[str]] = ["header", "footer"]


class TagsMixin:
    """Mixin that allows a class to retain tag information."""

    def __init__(
        self,
        *args,
        tag: Optional[str] = None,
        ancestortags: Sequence[str] = tuple(),
        **kwargs,
    ):
        if tag is None:
            raise TypeError("tag argument must be passed and not None")
        else:
            self.tag = tag
        self.ancestortags = ancestortags
        super().__init__(*args, **kwargs)


class HTMLTitle(TagsMixin, Title):
    """Title with tag information."""

    pass


class HTMLNarrativeText(TagsMixin, NarrativeText):
    """NarrativeText with tag information."""

    pass


class HTMLListItem(TagsMixin, ListItem):
    """NarrativeText with tag information."""

    pass


class HTMLDocument(XMLDocument):
    """Class for handling HTML documents. Uses rules based parsing to identify sections
    of interest within the document."""

    def _read(self) -> List[Page]:
        """Reads and structures and HTML document. If present, looks for article tags.
        if there are multiple article sections present, a page break is inserted between them.
        """
        if self._pages:
            return self._pages
        logger.info("Reading document ...")
        pages: List[Page] = list()
        root = _find_main(self.document_tree)

        articles = _find_articles(root)
        page_number = 0
        page = Page(number=page_number)
        for article in articles:
            descendanttag_elems: Tuple[etree.Element, ...] = tuple()
            for tag_elem in article.iter():
                if tag_elem in descendanttag_elems:
                    # Prevent repeating something that's been flagged as text as we chase it
                    # down a chain
                    continue

                if _is_text_tag(tag_elem):
                    element = _parse_tag(tag_elem)
                    if element is not None:
                        page.elements.append(element)
                        descendanttag_elems = tuple(tag_elem.iterdescendants())

                elif _is_bulleted_table(tag_elem):
                    bulleted_text = _bulleted_text_from_table(tag_elem)
                    page.elements.extend(bulleted_text)
                    descendanttag_elems = tuple(tag_elem.iterdescendants())

                elif is_list_item_tag(tag_elem):
                    element, next_element = _process_list_item(tag_elem)
                    if element is not None:
                        page.elements.append(element)
                        descendanttag_elems = _get_bullet_descendants(tag_elem, next_element)

                elif tag_elem.tag in PAGEBREAK_TAGS and len(page.elements) > 0:
                    pages.append(page)
                    page_number += 1
                    page = Page(number=page_number)

            if len(page.elements) > 0:
                pages.append(page)
                page_number += 1
                page = Page(number=page_number)

        return pages

    def doc_after_cleaners(
        self, skip_headers_and_footers=False, skip_table_text=False, inplace=False
    ) -> HTMLDocument:
        """Filters the elements and returns a new instance of the class based on the criteria
        specified. Note that the number of pages can change in the case that all elements on a
        page are filtered out.

        Parameters
        ----------
        skip_table_text:
            If True, skips text that is contained within a table element
        skip_headers_and_footers:
            If True, ignores any content that is within <header> or <footer> tags
        inplace:
            If True, document is modified in place and returned.
        """

        excluders = []
        if skip_headers_and_footers:
            excluders.append(in_header_or_footer)
        if skip_table_text:
            excluders.append(has_table_ancestor)

        pages = []
        page_number = 0
        new_page = Page(number=page_number)
        for page in self.pages:
            elements: List[Element] = []
            for el in page.elements:
                if not isinstance(el, TagsMixin):
                    raise ValueError(
                        f"elements of class {self.__class__} should be of type HTMLTitle "
                        f"HTMLNarrativeText, or HTMLListItem but "
                        f"object has an element of type {type(el)}"
                    )
                if not any(excluder(el) for excluder in excluders):
                    elements.append(el)
                if skip_headers_and_footers and "footer" in tuple(el.ancestortags) + (el.tag,):
                    break
            if elements:
                new_page.elements = elements
                pages.append(new_page)
                page_number += 1
                new_page = Page(number=page_number)
        if inplace:
            self._pages = pages
            self._elements = None
            return self
        else:
            out = self.__class__.from_pages(pages)
            if not isinstance(out, HTMLDocument):
                # NOTE(robinson) - Skipping for test coverage because this condition is impossible.
                # Added type check because from_pages is a method on Document. Without the type
                # check, mypy complains about returning Document instead of HTMLDocument
                raise ValueError(f"Unexpected class: {self.__class__.__name__}")  # pragma: no cover
            return out


def _parse_tag(
    tag_elem: etree.Element,
) -> Optional[Element]:
    """Converts an etree element to a Text element if there is applicable text in the element.
    Ancestor tags are kept so they can be used for filtering or classification without
    processing the document tree again. In the future we might want to keep descendants too,
    but we don't have a use for them at the moment."""
    ancestortags: Tuple[str, ...] = tuple(el.tag for el in tag_elem.iterancestors())[::-1]
    text = _construct_text(tag_elem)
    if not text:
        return None
    if is_bulleted_text(text):
        if not clean_bullets(text):
            return None
        return HTMLListItem(text=clean_bullets(text), tag=tag_elem.tag, ancestortags=ancestortags)

    if len(text) < 2:
        return None
    elif is_narrative_tag(text, tag_elem.tag):
        return HTMLNarrativeText(text, tag=tag_elem.tag, ancestortags=ancestortags)
    elif is_possible_title(text):
        return HTMLTitle(text, tag=tag_elem.tag, ancestortags=ancestortags)
    else:
        # Something that might end up here is text that's just a number.
        return None


def is_narrative_tag(text: str, tag: str) -> bool:
    """Uses tag information to infer whether text is narrative."""
    return tag not in HEADING_TAGS and is_possible_narrative_text(text)


def _construct_text(tag_elem: etree.Element) -> str:
    """Extracts text from a text tag element."""
    text = ""
    for item in tag_elem.itertext():
        if item:
            text += item

    if tag_elem.tail:
        text = text + tag_elem.tail

    text = replace_unicode_quotes(text)
    return text.strip()


def _is_text_tag(tag_elem: etree.Element) -> bool:
    """Deteremines if a tag potentially contains narrative text."""
    if tag_elem.tag in TEXT_TAGS + HEADING_TAGS:
        return True

    # NOTE(robinson) - This indicates that a div tag has no children. If that's the
    # case and the tag has text, its potential a text tag
    children = tag_elem.getchildren()
    if tag_elem.tag == "div" and len(children) == 0:
        return True

    if _has_adjacent_bulleted_spans(tag_elem, children):
        return True

    return False


def _process_list_item(
    tag_elem: etree.Element, max_predecessor_len: int = 5
) -> Tuple[Optional[Element], etree.Element]:
    """If an etree element contains bulleted text, extracts the relevant bulleted text
    and converts it to ListItem objects. Also returns the next html elements so that
    we can skip processing if bullets are found in a div element."""
    if tag_elem.tag in LIST_ITEM_TAGS:
        text = _construct_text(tag_elem)
        return HTMLListItem(text=text, tag=tag_elem.tag), None

    elif tag_elem.tag == "div":
        text = _construct_text(tag_elem)
        next_element = tag_elem.getnext()
        if next_element is None:
            return None, None
        next_text = _construct_text(next_element)
        # NOTE(robinson) - Only consider elements with limited depth. Otherwise,
        # it could be the text representation of a giant div
        if len(tag_elem) > max_predecessor_len:
            return None, None
        if next_text:
            return HTMLListItem(text=next_text, tag=next_element.tag), next_element

    return None, None


def _get_bullet_descendants(element, next_element) -> Tuple[etree.Element, ...]:
    descendants = list()
    if element is not None:
        if next_element is not None:
            descendants += list(next_element.iterdescendants())
    descendanttag_elems = tuple(descendants)
    return descendanttag_elems


def is_list_item_tag(tag_elem: etree.Element) -> bool:
    """Checks to see if a tag contains bulleted text."""
    if tag_elem.tag in LIST_ITEM_TAGS:
        return True
    elif tag_elem.tag == "div":
        if is_bulleted_text(_construct_text(tag_elem)):
            return True
    return False


def _bulleted_text_from_table(table) -> List[Element]:
    """Extracts bulletized narrative text from a table.

    NOTE: if a table has mixed bullets and non-bullets, only bullets are extracted.
    I.e., _read() will drop non-bullet narrative text in the table.
    """
    bulleted_text: List[Element] = list()
    rows = table.findall(".//tr")
    for row in rows:
        text = _construct_text(row)
        if is_bulleted_text(text):
            bulleted_text.append(HTMLListItem(text=clean_bullets(text), tag=row.tag))
    return bulleted_text


def _is_bulleted_table(tag_elem) -> bool:
    """Checks to see if a table element contains bulleted text."""
    if tag_elem.tag != "table":
        return False

    rows = tag_elem.findall(".//tr")
    for row in rows:
        text = _construct_text(row)
        if text and not is_bulleted_text(text):
            return False

    return True


def _has_adjacent_bulleted_spans(tag_elem: etree.Element, children: List[etree.Element]) -> bool:
    """Checks to see if a div contains two or more adjacent spans beginning with a bullet. If
    this is the case, it is treated as a single bulleted text element."""
    if tag_elem.tag == "div":
        all_spans = all([child.tag == "span" for child in children])
        _is_bulleted = children[0].text is not None and is_bulleted_text(children[0].text)
        if all_spans and _is_bulleted:
            return True
    return False


def has_table_ancestor(element: TagsMixin) -> bool:
    """Checks to see if an element has ancestors that are table elements. If so, we consider
    it to be a table element rather than a section of narrative text."""
    for ancestor in element.ancestortags:
        if ancestor in TABLE_TAGS:
            return True
    return False


def in_header_or_footer(element: TagsMixin) -> bool:
    """Checks to see if an element is contained within a header or a footer tag."""
    if any(ancestor in HEADER_OR_FOOTER_TAGS for ancestor in element.ancestortags):
        return True
    return False


def _find_main(root: etree.Element) -> etree.Element:
    """Finds the main tag of the HTML document if it exists. Otherwise, returns the
    whole document."""
    main_tag_elem = root.find(".//main")
    return main_tag_elem if main_tag_elem is not None else root


def _find_articles(root: etree.Element) -> List[etree.Element]:
    """Tries to break the HTML document into distinct articles. If there are no article
    tags, the entire document is returned as a single item list."""
    articles = root.findall(".//article")
    if len(articles) == 0:
        # NOTE(robinson) - ref: https://schema.org/Article
        articles = root.findall(".//div[@itemprop='articleBody']")
    return [root] if len(articles) == 0 else articles