# unstructured/unstructured/partition/html/transformations.py

from __future__ import annotations

import html
from collections import OrderedDict
from itertools import chain
from typing import Sequence, Type

from bs4 import BeautifulSoup, Tag

from unstructured.documents import elements, ontology
from unstructured.documents.mappings import (
    CSS_CLASS_TO_ELEMENT_TYPE_MAP,
    HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP,
    HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP,
    ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE,
)

# Maximum nesting depth followed by the recursive converters in this module:
# ontology_to_unstructured_elements stops descending into layout elements once
# `depth` exceeds it, and parse_html_to_ontology_element stops unwrapping nested
# HTML tags once `recursion_depth` exceeds it.
RECURSION_LIMIT = 50


def ontology_to_unstructured_elements(
    ontology_element: ontology.OntologyElement,
    parent_id: str | None = None,
    page_number: int | None = None,
    depth: int = 0,
    filename: str | None = None,
) -> list[elements.Element]:
    """
    Converts an OntologyElement object to a list of unstructured Element objects.

    To preserve the structure of the ontology, the function is recursive
    and the tree structure is represented in a flat list by the parent_id
    attribute in the metadata of each Element object.
    To preserve all the attributes of the ontology element, the HTML code
    is injected to the unstructured Element in ElementMetadata.text_as_html.

    For layout elements, the function creates an empty Text Element (with the
    HTML code of the element itself, without its children, injected the same way)
    followed by the converted children. A Document element is the exception: no
    Text Element is emitted for it and its children restart at depth 0.
    Consecutive converted children are passed through combine_inline_elements.

    Non-layout elements, and layout elements nested deeper than RECURSION_LIMIT,
    are emitted as a single Element whose class is looked up in
    ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE and whose text_as_html holds the
    full HTML of the element.

    TODO (Pluto): Better way would be to have special Element type in Unstructured

    Args:
        ontology_element (OntologyElement): The ontology element to be converted.
        parent_id (str, optional): The ID of the parent element. Defaults to None.
        page_number (int, optional): The page number of the element. When None and
            the element is a Page, the page's own page_number is used for the
            element and its descendants. Defaults to None.
        depth (int, optional): The depth of the element in the hierarchy. Defaults to 0.
        filename (str, optional): Stored in the metadata of every produced
            Element. Defaults to None.

    Returns:
        list[Element]: A list of unstructured Element objects.
    """
    is_layout = ontology_element.elementType == ontology.ElementTypeEnum.layout

    # Terminal case: a single element carrying its complete HTML.
    if not is_layout or depth > RECURSION_LIMIT:
        element_class = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE[ontology_element.__class__]
        html_code_of_ontology_element = ontology_element.to_html()
        element_text = ontology_element.to_text()
        return [
            element_class(
                text=element_text,
                element_id=ontology_element.id,
                detection_origin="vlm_partitioner",
                metadata=elements.ElementMetadata(
                    parent_id=parent_id,
                    text_as_html=html_code_of_ontology_element,
                    page_number=page_number,
                    category_depth=depth,
                    filename=filename,
                ),
            )
        ]

    if page_number is None and isinstance(ontology_element, ontology.Page):
        page_number = ontology_element.page_number

    is_document = isinstance(ontology_element, ontology.Document)

    elements_to_return: list[elements.Element] = []
    if not is_document:
        # Placeholder element that keeps the layout tag (without children) in the flat list.
        elements_to_return.append(
            elements.Text(
                text="",
                element_id=ontology_element.id,
                detection_origin="vlm_partitioner",
                metadata=elements.ElementMetadata(
                    parent_id=parent_id,
                    text_as_html=ontology_element.to_html(add_children=False),
                    page_number=page_number,
                    category_depth=depth,
                    filename=filename,
                ),
            )
        )

    # The Document itself is not emitted, so its children start at depth 0.
    child_depth = 0 if is_document else depth + 1

    converted_children: list[elements.Element] = []
    for child in ontology_element.children:
        converted_children.extend(
            ontology_to_unstructured_elements(
                child,
                parent_id=ontology_element.id,
                page_number=page_number,
                depth=child_depth,
                filename=filename,
            )
        )

    elements_to_return.extend(combine_inline_elements(converted_children))
    return elements_to_return


def combine_inline_elements(elements: list[elements.Element]) -> list[elements.Element]:
    """
    Folds runs of consecutive mergeable elements into a single element.

    Two neighbours are merged when can_unstructured_elements_be_merged accepts
    them. The first element of a run absorbs the following ones: its text and
    its metadata.text_as_html are extended in place, each joined with a single
    space, eg.
    {
        'text': "Text from element 1 Text from element 2",
        'metadata': {
            'text_as_html': "<p>Text from element 1</p> <a>Text from element 2</a>"
        }
    }

    Args:
        elements (list[Element]): A list of elements to be combined.

    Returns:
        list[Element]: The elements in their original order, with every merged
            run represented by its (mutated) first element.
    """
    merged_elements = []

    for candidate in elements:
        # The last kept element is the one a following element may be folded into.
        if merged_elements and can_unstructured_elements_be_merged(
            merged_elements[-1], candidate
        ):
            target = merged_elements[-1]
            target.text += " " + candidate.text
            target.metadata.text_as_html += " " + candidate.metadata.text_as_html
            continue

        merged_elements.append(candidate)

    return merged_elements


def can_unstructured_elements_be_merged(
    current_element: elements.Element, next_element: elements.Element
) -> bool:
    """
    Decides whether two neighbouring elements may be merged into one.

    Merging is allowed only when all of the following hold:
    - both elements have the same metadata.category_depth,
    - none of the top-level HTML tags found in either element's text_as_html
      parses to an ontology element with children,
    - each of those tags parses to an inline element or a text element.

    Args:
        current_element (Element): The element that would absorb the other one.
        next_element (Element): The element that follows it.

    Returns:
        bool: True when the two elements can be merged, False otherwise.
    """
    same_level = (
        current_element.metadata.category_depth == next_element.metadata.category_depth
    )
    if not same_level:
        return False

    # Only the top-level tags of each HTML snippet are inspected.
    top_level_tags = []
    for candidate in (current_element, next_element):
        parsed = BeautifulSoup(candidate.metadata.text_as_html, "html.parser")
        top_level_tags.append(parsed.find_all(recursive=False))

    parsed_elements = [
        parse_html_to_ontology_element(tag) for tag in chain.from_iterable(top_level_tags)
    ]

    for parsed_element in parsed_elements:
        if parsed_element.children:
            return False
        if is_inline_element(parsed_element):
            continue
        if is_text_element(parsed_element):
            continue
        return False

    return True


def is_text_element(ontology_element: ontology.OntologyElement) -> bool:
    """
    Tells whether the element counts as a text element for merging purposes.

    An element qualifies when it is an instance of one of the text classes
    listed below, or when its elementType is ElementTypeEnum.metadata.
    """
    text_classes = (
        ontology.NarrativeText,
        ontology.Quote,
        ontology.Paragraph,
        ontology.Footnote,
        ontology.FootnoteReference,
        ontology.Citation,
        ontology.Bibliography,
        ontology.Glossary,
    )
    text_categories = (ontology.ElementTypeEnum.metadata,)

    if isinstance(ontology_element, text_classes):
        return True

    return ontology_element.elementType in text_categories


def is_inline_element(ontology_element: ontology.OntologyElement) -> bool:
    """
    Tells whether the element counts as an inline element for merging purposes.

    An element qualifies when it is a Hyperlink, or when its elementType is
    ElementTypeEnum.specialized_text or ElementTypeEnum.annotation.
    """
    inline_classes = (ontology.Hyperlink,)
    inline_categories = (
        ontology.ElementTypeEnum.specialized_text,
        ontology.ElementTypeEnum.annotation,
    )

    if isinstance(ontology_element, inline_classes):
        return True

    return ontology_element.elementType in inline_categories


def unstructured_elements_to_ontology(
    unstructured_elements: Sequence[elements.Element],
) -> ontology.OntologyElement:
    """
    Converts a sequence of unstructured Element objects to an OntologyElement object.

    A Document element is created as the root, keyed by the parent_id of the first
    element. Every element's text_as_html is then parsed, each top-level HTML tag
    becomes an OntologyElement registered under its id, and it is appended to the
    children of the element registered under the unstructured element's parent_id.
    At the end the first registered entry (the root) is popped and returned.

    Such approach comes with limitations:
        - The parent element has to be in the list before the child element;
          an element whose parent_id is not registered yet is not attached anywhere.

    Side effect: when the first element has no parent_id, a new id is generated
    for the Document and written back to that element's metadata.parent_id.

    Args:
        unstructured_elements (Sequence[Element]): The sequence of unstructured Element objects.

    Returns:
        OntologyElement: The converted OntologyElement object. For an empty
            sequence an empty Document with a freshly generated id is returned.
    """
    if not unstructured_elements:
        # Nothing to convert: return an empty root instead of failing on [0] below.
        return ontology.Document(
            additional_attributes={"id": ontology.OntologyElement.generate_unique_id()}
        )

    id_to_element_mapping = OrderedDict()

    first_element = unstructured_elements[0]
    document_element_id = first_element.metadata.parent_id

    if document_element_id is None:
        document_element_id = ontology.OntologyElement.generate_unique_id()
        first_element.metadata.parent_id = document_element_id

    # Registered first, so it is the entry popped as the root at the end.
    id_to_element_mapping[document_element_id] = ontology.Document(
        additional_attributes={"id": document_element_id}
    )

    for element in unstructured_elements:
        parent_id = element.metadata.parent_id
        html_as_tags = BeautifulSoup(element.metadata.text_as_html, "html.parser").find_all(
            recursive=False
        )
        for html_as_tag in html_as_tags:
            ontology_element = parse_html_to_ontology_element(html_as_tag)
            # Note: Each HTML of non-terminal Element doesn't have children in HTML
            # So we just add Ontology Element with tag and class, later children are appended by
            # parent_id.
            # For terminal Elements entire HTML is added to text_as_html, thus it allows us to
            # recreate the entire HTML structure

            id_to_element_mapping[ontology_element.id] = ontology_element

            if parent_id and parent_id in id_to_element_mapping:
                id_to_element_mapping[parent_id].children.append(ontology_element)

    _, root_element = id_to_element_mapping.popitem(last=False)
    return root_element


def parse_html_to_ontology(html_code: str) -> ontology.OntologyElement:
    """
    Parses the given HTML code and converts it into an OntologyElement tree.

    The HTML is first cleaned with remove_empty_divs_from_html_content and
    remove_empty_tags_from_html_content. Conversion then starts from the first
    <body class="Document"> element or, when there is none, from the first
    <div class="Page"> element.

    Args:
        html_code (str): The HTML code to be parsed.

    Returns:
        OntologyElement: The element parsed from the chosen root tag.

    Raises:
        ValueError: If neither a <body class="Document"> nor a
            <div class="Page"> element is found in the HTML.
    """
    cleaned_html = remove_empty_tags_from_html_content(
        remove_empty_divs_from_html_content(html_code)
    )
    soup = BeautifulSoup(cleaned_html, "html.parser")

    # Root candidates in order of preference.
    for tag_name, css_class in (("body", "Document"), ("div", "Page")):
        root_tag = soup.find(tag_name, class_=css_class)
        if root_tag:
            return parse_html_to_ontology_element(root_tag)

    raise ValueError(
        "No <body class='Document'> or <div class='Page'> element found in the HTML."
    )


def remove_empty_divs_from_html_content(html_content: str) -> str:
    """
    Unwraps every <div> that carries no attributes, keeping its contents.

    Args:
        html_content (str): The HTML code to clean.

    Returns:
        str: The HTML with attribute-less <div> tags replaced by their contents.
    """
    parsed = BeautifulSoup(html_content, "html.parser")

    # Walk the divs from last to first, as returned by find_all.
    for candidate in parsed.find_all("div")[::-1]:
        if candidate.attrs:
            continue
        candidate.unwrap()

    return str(parsed)


def remove_empty_tags_from_html_content(html_content: str) -> str:
    """
    Removes empty <p>, <span>, <div> and <h1>-<h6> tags from the HTML.

    A tag is removed when it contains no nested tag, has no attributes and has
    no text once whitespace is stripped. All other tag names are left
    untouched. The tags are visited in a single pass.

    Args:
        html_content (str): The HTML code to clean.

    Returns:
        str: The HTML without the empty tags.
    """
    # Only these tags are candidates for removal; self-closing ones are omitted.
    removable_tag_names = ("p", "span", "div", "h1", "h2", "h3", "h4", "h5", "h6")

    parsed = BeautifulSoup(html_content, "html.parser")

    for candidate in parsed.find_all():
        if candidate.name not in removable_tag_names:
            continue
        if candidate.find() or candidate.attrs:
            continue
        if candidate.get_text(strip=True):
            continue
        candidate.decompose()

    return str(parsed)


def parse_html_to_ontology_element(
    soup: Tag, recursion_depth: int = 1
) -> ontology.OntologyElement | None:
    """
    Converts a BeautifulSoup Tag object into an OntologyElement object. This function is recursive.

    The ontology class and HTML tag are resolved with
    extract_tag_and_ontology_class_from_tag. A <br> tag is always returned as
    an empty Paragraph with html_tag_name "br".

    The tag is unwrapped into children when recursion_depth has not exceeded
    RECURSION_LIMIT and either
    - its class is not UncategorizedText and it directly contains another Tag, or
    - its class is a layout element type.
    When unwrapped, the element text is empty; nested tags are parsed
    recursively and non-blank text nodes become Paragraph children. Otherwise
    the element has no children and its text is the stripped string form of
    every content node joined with newlines.

    Args:
        soup (Tag): The BeautifulSoup Tag object to be converted.
        recursion_depth (int): Current nesting level, compared to RECURSION_LIMIT.

    Returns:
        OntologyElement: The converted OntologyElement object.
    """
    ontology_html_tag, ontology_class = extract_tag_and_ontology_class_from_tag(soup)
    escaped_attrs = get_escaped_attributes(soup)

    if soup.name == "br":  # Note(Pluto) should it be <br class="UncategorizedText">?
        return ontology.Paragraph(
            text="",
            css_class_name=None,
            html_tag_name="br",
            additional_attributes=escaped_attrs,
        )

    wraps_nested_tags = ontology_class != ontology.UncategorizedText and any(
        isinstance(node, Tag) for node in soup.contents
    )
    # The class is instantiated for the layout check only when the first test fails.
    has_children = (
        wraps_nested_tags
        or ontology_class().elementType == ontology.ElementTypeEnum.layout
    )

    children = []
    if has_children and recursion_depth <= RECURSION_LIMIT:
        text = ""
        for node in soup.children:
            if not str(node).strip():
                # Blank nodes between tags are skipped.
                continue
            if isinstance(node, Tag):
                parsed_child = parse_html_to_ontology_element(
                    node, recursion_depth=recursion_depth + 1
                )
            else:
                parsed_child = ontology.Paragraph(text=str(node).strip())
            children.append(parsed_child)
    else:
        text = "\n".join(str(node).strip() for node in soup.contents).strip()

    # TODO (Pluto): <input class="FormFieldValue"/> requires being wrapped in <label> tags
    return ontology_class(
        text=text,
        children=children,
        html_tag_name=ontology_html_tag,
        additional_attributes=escaped_attrs,
    )


def extract_tag_and_ontology_class_from_tag(
    soup: Tag,
) -> tuple[str, Type[ontology.OntologyElement]]:
    """
    Resolves the HTML tag name and the ontology class for a BeautifulSoup Tag.

    Only the first CSS class of the tag is considered. The lookups are tried
    in this order and the first match wins:
    1. (tag name, CSS class) pair is known: the tag name is kept.
    2. Only the CSS class is known: the tag name is replaced by the first
       allowed tag of the matched ontology class.
    3. Only the tag name is known: its default ontology class is used.
    4. Nothing is recognized: "span" and UncategorizedText are returned.

    Args:
        soup (Tag): The BeautifulSoup Tag object to extract information from.

    Returns:
        tuple: A tuple containing the HTML tag (str) and the ontology class (Type[OntologyElement]).
    """
    css_classes = soup.attrs.get("class")

    if css_classes:
        primary_css_class = css_classes[0]

        # Scenario 1: Valid Ontology Element
        exact_match = HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP.get(
            (soup.name, primary_css_class)
        )
        if exact_match:
            return soup.name, exact_match

        # Scenario 2: HTML tag incorrect, CSS class correct
        # Fallback to css name selector and overwrite html tag
        if primary_css_class in CSS_CLASS_TO_ELEMENT_TYPE_MAP:
            css_match = CSS_CLASS_TO_ELEMENT_TYPE_MAP.get(primary_css_class)
            return css_match().allowed_tags[0], css_match

    # Scenario 3: CSS class incorrect, but HTML tag correct and exclusive in ontology
    if soup.name in HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP:
        tag_default = HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP[soup.name]
        if tag_default:
            return soup.name, tag_default

    # Scenario 4: CSS class incorrect, HTML tag incorrect
    # Fallback to default UncategorizedText
    # TODO (Pluto): Sometimes we could infer that from parent type and soup.name
    #  e.g. parent=FormField soup.name=input -> element=FormFieldInput
    return "span", ontology.UncategorizedText


def get_escaped_attributes(soup: Tag):
    """
    Builds an HTML-escaped copy of the attributes of a BeautifulSoup Tag object.

    Attribute names are always escaped. A list value is escaped item by item,
    any other non-empty value is escaped as a whole, and an empty value
    (e.g. "" or an empty list) is stored as None.

    Args:
        soup (Tag): The BeautifulSoup Tag object whose attributes need to be escaped.

    Returns:
        dict: A dictionary with escaped attribute names and values.
    """

    def _escape_value(raw_value):
        # Empty values carry nothing to escape and are normalised to None.
        if not raw_value:
            return None
        if isinstance(raw_value, list):
            return [html.escape(item) for item in raw_value]
        return html.escape(raw_value)

    return {
        html.escape(name): _escape_value(raw_value) for name, raw_value in soup.attrs.items()
    }