unstructured/unstructured/partition/docx.py

import os
import tempfile
from tempfile import SpooledTemporaryFile
from typing import IO, BinaryIO, List, Optional, Union, cast

import docx
import pypandoc
from docx.oxml.shared import qn
from docx.text.paragraph import Paragraph
from docx.text.run import Run
from tabulate import tabulate

from unstructured.cleaners.core import clean_bullets
from unstructured.documents.elements import (
    Address,
    Element,
    ElementMetadata,
    ListItem,
    NarrativeText,
    Table,
    Text,
    Title,
)
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
from unstructured.partition.text_type import (
    is_bulleted_text,
    is_possible_narrative_text,
    is_possible_title,
    is_us_city_state_zip,
)

# NOTE(robinson) - documentation on built in styles can be found at the link below
# ref: https://python-docx.readthedocs.io/en/latest/user/
#   styles-understanding.html#paragraph-styles-in-default-template
STYLE_TO_ELEMENT_MAPPING = {
    "Caption": Text,  # TODO(robinson) - add caption element type
    "Heading 1": Title,
    "Heading 2": Title,
    "Heading 3": Title,
    "Heading 4": Title,
    "Heading 5": Title,
    "Heading 6": Title,
    "Heading 7": Title,
    "Heading 8": Title,
    "Heading 9": Title,
    "Intense Quote": Text,  # TODO(robinson) - add quote element type
    "List": ListItem,
    "List 2": ListItem,
    "List 3": ListItem,
    "List Bullet": ListItem,
    "List Bullet 2": ListItem,
    "List Bullet 3": ListItem,
    "List Continue": ListItem,
    "List Continue 2": ListItem,
    "List Continue 3": ListItem,
    "List Number": ListItem,
    "List Number 2": ListItem,
    "List Number 3": ListItem,
    "List Paragraph": ListItem,
    "Macro Text": Text,
    "No Spacing": Text,
    "Quote": Text,  # TODO(robinson) - add quote element type
    "Subtitle": Title,
    "TOCHeading": Title,
    "Title": Title,
}


def _get_paragraph_runs(paragraph):
    """
    Get hyperlink text from a paragraph object.
    Without this, the default runs function skips over hyperlinks.

    Args:
        paragraph (Paragraph): A Paragraph object.

    Returns:
        list: A list of Run objects.
    """

    # Recursively get runs.
    def _get_runs(node, parent):
        for child in node:
            # If the child is a run, yield a Run object
            if child.tag == qn("w:r"):
                yield Run(child, parent)
            # If the child is a hyperlink, search for runs within it recursively
            if child.tag == qn("w:hyperlink"):
                yield from _get_runs(child, parent)

    return list(_get_runs(paragraph._element, paragraph))


# Add the runs property to the Paragraph class
Paragraph.runs = property(lambda self: _get_paragraph_runs(self))


@add_metadata_with_filetype(FileType.DOCX)
def partition_docx(
    filename: Optional[str] = None,
    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
    metadata_filename: Optional[str] = None,
) -> List[Element]:
    """Partitions Microsoft Word Documents in .docx format into its document elements.

    Parameters
    ----------
     filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    metadata_filename
        The filename to use for the metadata. Relevant because partition_doc converts the
        document to .docx before partition. We want the original source filename in the
        metadata.
    """

    # Verify that only one of the arguments was provided
    exactly_one(filename=filename, file=file)

    if filename is not None:
        document = docx.Document(filename)
    elif file is not None:
        document = docx.Document(
            spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file)),
        )

    metadata_filename = metadata_filename or filename
    elements: List[Element] = []
    table_index = 0

    for element_item in document.element.body:
        if element_item.tag.endswith("tbl"):
            table = document.tables[table_index]
            html_table = _convert_table_to_text(table, as_html=True)
            text_table = _convert_table_to_text(table, as_html=False)
            element = Table(text_table)
            if element is not None:
                element.metadata = ElementMetadata(
                    text_as_html=html_table,
                    filename=metadata_filename,
                )
                elements.append(element)
            table_index += 1
        elif element_item.tag.endswith("p"):
            paragraph = docx.text.paragraph.Paragraph(element_item, document)
            para_element: Optional[Text] = _paragraph_to_element(paragraph)
            if para_element is not None:
                para_element.metadata = ElementMetadata(filename=metadata_filename)
                elements.append(para_element)

    return elements


def _convert_table_to_text(table, as_html):
    """
    Convert a table object from a Word document to an HTML table string using the tabulate library.

    Args:
        table (Table): A Table object.
        as_html (bool): Whether to return the table as an HTML string (True) or a
            plain text string (False)

    Returns:
        str: An table string representation of the input table.
    """
    fmt = "html" if as_html else "plain"
    headers = [cell.text for cell in table.rows[0].cells]
    data = [[cell.text for cell in row.cells] for row in table.rows[1:]]
    return tabulate(data, headers=headers, tablefmt=fmt)


def _paragraph_to_element(paragraph: docx.text.paragraph.Paragraph) -> Optional[Text]:
    """Converts a docx Paragraph object into the appropriate unstructured document element.
    If the paragraph style is "Normal" or unknown, we try to predict the element type from the
    raw text."""
    text = paragraph.text
    style_name = paragraph.style and paragraph.style.name  # .style can be None

    if len(text.strip()) == 0:
        return None

    element_class = STYLE_TO_ELEMENT_MAPPING.get(style_name)

    # NOTE(robinson) - The "Normal" style name will return None since it's in the mapping.
    # Unknown style names will also return None
    if element_class is None:
        return _text_to_element(text)
    else:
        return element_class(text)


def _text_to_element(text: str) -> Optional[Text]:
    """Converts raw text into an unstructured Text element."""
    if is_bulleted_text(text):
        clean_text = clean_bullets(text).strip()
        return ListItem(text=clean_bullets(text)) if clean_text else None

    elif is_us_city_state_zip(text):
        return Address(text=text)

    if len(text) < 2:
        return None
    elif is_possible_narrative_text(text):
        return NarrativeText(text)
    elif is_possible_title(text):
        return Title(text)
    else:
        return Text(text)


def convert_and_partition_docx(
    source_format: str,
    filename: Optional[str] = None,
    file: Optional[IO] = None,
) -> List[Element]:
    """Converts a document to DOCX and then partitions it using partition_html. Works with
    any file format support by pandoc.

    Parameters
    ----------
    source_format
        The format of the source document, .e.g. odt
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    """
    if filename is None:
        filename = ""
    exactly_one(filename=filename, file=file)

    if len(filename) > 0:
        _, filename_no_path = os.path.split(os.path.abspath(filename))
        base_filename, _ = os.path.splitext(filename_no_path)
        if not os.path.exists(filename):
            raise ValueError(f"The file {filename} does not exist.")
    elif file is not None:
        tmp = tempfile.NamedTemporaryFile(delete=False)
        tmp.write(file.read())
        tmp.close()
        filename = tmp.name
        _, filename_no_path = os.path.split(os.path.abspath(tmp.name))

    base_filename, _ = os.path.splitext(filename_no_path)

    with tempfile.TemporaryDirectory() as tmpdir:
        docx_filename = os.path.join(tmpdir, f"{base_filename}.docx")
        pypandoc.convert_file(filename, "docx", format=source_format, outputfile=docx_filename)
        elements = partition_docx(filename=docx_filename, metadata_filename=filename)

    return elements