Matt Robinson b6bfbf9108
fix: track filename in metadata for docx tables (#597)
* fix: track filename in metadata for docx tables

* bump version

* remove accidental commit
2023-05-18 10:20:38 -04:00

254 lines
8.3 KiB
Python

import os
import tempfile
from tempfile import SpooledTemporaryFile
from typing import IO, BinaryIO, List, Optional, Union, cast
import docx
import pypandoc
from docx.oxml.shared import qn
from docx.text.paragraph import Paragraph
from docx.text.run import Run
from tabulate import tabulate
from unstructured.cleaners.core import clean_bullets
from unstructured.documents.elements import (
Address,
Element,
ElementMetadata,
ListItem,
NarrativeText,
Table,
Text,
Title,
)
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
from unstructured.partition.text_type import (
is_bulleted_text,
is_possible_narrative_text,
is_possible_title,
is_us_city_state_zip,
)
# NOTE(robinson) - documentation on built in styles can be found at the link below
# ref: https://python-docx.readthedocs.io/en/latest/user/
# styles-understanding.html#paragraph-styles-in-default-template
STYLE_TO_ELEMENT_MAPPING = {
    # Maps a built-in Word paragraph style name to the element class used for paragraphs
    # carrying that style. Style names that are absent here yield None on lookup.
    "Caption": Text,  # TODO(robinson) - add caption element type
    # "Heading 1" through "Heading 9" all map to Title.
    **{f"Heading {level}": Title for level in range(1, 10)},
    "Intense Quote": Text,  # TODO(robinson) - add quote element type
    # The list styles below all map to ListItem.
    **{
        list_style: ListItem
        for list_style in (
            "List",
            "List 2",
            "List 3",
            "List Bullet",
            "List Bullet 2",
            "List Bullet 3",
            "List Continue",
            "List Continue 2",
            "List Continue 3",
            "List Number",
            "List Number 2",
            "List Number 3",
            "List Paragraph",
        )
    },
    "Macro Text": Text,
    "No Spacing": Text,
    "Quote": Text,  # TODO(robinson) - add quote element type
    "Subtitle": Title,
    "TOCHeading": Title,
    "Title": Title,
}
def _get_paragraph_runs(paragraph):
    """
    Collect every run of a paragraph, including runs nested inside hyperlinks.
    Without this, the default runs function skips over hyperlinks.

    Args:
        paragraph (Paragraph): A Paragraph object.

    Returns:
        list: A list of Run objects, in document order.
    """
    run_tag = qn("w:r")
    hyperlink_tag = qn("w:hyperlink")

    def _collect(container):
        # Walk the direct children of `container`; descend only into hyperlinks.
        found = []
        for node in container:
            if node.tag == run_tag:
                # Every Run is parented to the paragraph, even when it sits in a hyperlink.
                found.append(Run(node, paragraph))
            elif node.tag == hyperlink_tag:
                found.extend(_collect(node))
        return found

    return _collect(paragraph._element)
# Replace the `runs` attribute on python-docx's Paragraph class with a property backed by
# _get_paragraph_runs, so that runs nested inside hyperlinks are included.
# NOTE: this assignment happens at import time and patches the Paragraph class itself, so it
# applies to every Paragraph instance, not only those created by this module.
Paragraph.runs = property(lambda self: _get_paragraph_runs(self))
@add_metadata_with_filetype(FileType.DOCX)
def partition_docx(
    filename: Optional[str] = None,
    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
    metadata_filename: Optional[str] = None,
) -> List[Element]:
    """Partitions Microsoft Word Documents in .docx format into its document elements.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    metadata_filename
        The filename to use for the metadata. Relevant because partition_doc converts the
        document to .docx before partition. We want the original source filename in the
        metadata. Falls back to ``filename`` when not provided.

    Returns
    -------
    List[Element]
        The elements found in the document body, in body order. Tables become Table
        elements (with an HTML rendering in the metadata); paragraphs with no text are
        skipped.
    """
    # Verify that only one of the arguments was provided
    exactly_one(filename=filename, file=file)

    if filename is not None:
        document = docx.Document(filename)
    elif file is not None:
        document = docx.Document(
            spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file)),
        )

    metadata_filename = metadata_filename or filename

    elements: List[Element] = []

    # Fetch the tables once instead of re-evaluating the property for every table in the
    # loop. Tables are matched to "tbl" body children by position.
    tables = document.tables
    table_index = 0

    for element_item in document.element.body:
        if element_item.tag.endswith("tbl"):
            table = tables[table_index]
            table_index += 1

            html_table = _convert_table_to_text(table, as_html=True)
            text_table = _convert_table_to_text(table, as_html=False)

            element = Table(text_table)
            element.metadata = ElementMetadata(
                text_as_html=html_table,
                filename=metadata_filename,
            )
            elements.append(element)
        elif element_item.tag.endswith("p"):
            paragraph = docx.text.paragraph.Paragraph(element_item, document)
            para_element: Optional[Text] = _paragraph_to_element(paragraph)
            # None means the paragraph produced no element (e.g. it had no text).
            if para_element is not None:
                para_element.metadata = ElementMetadata(filename=metadata_filename)
                elements.append(para_element)

    return elements
def _convert_table_to_text(table, as_html):
    """
    Convert a table object from a Word document to an HTML or plain text table string
    using the tabulate library. The first row of the table is used as the header row.

    Args:
        table (Table): A Table object.
        as_html (bool): Whether to return the table as an HTML string (True) or a
            plain text string (False)

    Returns:
        str: A table string representation of the input table, or an empty string when
            the table has no rows.
    """
    # Materialize the rows once so the header row and the data rows come from one sequence.
    rows = list(table.rows)
    if not rows:
        # There is no header row to read from; indexing rows[0] would raise IndexError.
        return ""

    fmt = "html" if as_html else "plain"
    headers = [cell.text for cell in rows[0].cells]
    data = [[cell.text for cell in row.cells] for row in rows[1:]]
    return tabulate(data, headers=headers, tablefmt=fmt)
def _paragraph_to_element(paragraph: docx.text.paragraph.Paragraph) -> Optional[Text]:
    """Turns a docx Paragraph into the matching unstructured document element.

    A paragraph whose style name appears in STYLE_TO_ELEMENT_MAPPING is wrapped in the mapped
    element class. For any other style (including "Normal" and a missing style) the element
    type is predicted from the raw text. Paragraphs containing only whitespace yield None."""
    raw_text = paragraph.text
    style = paragraph.style  # .style can be None
    style_name = style.name if style else None

    if not raw_text.strip():
        return None

    # NOTE(robinson) - The "Normal" style name is not in the mapping, so the lookup returns
    # None for it. Unknown style names also return None.
    element_class = STYLE_TO_ELEMENT_MAPPING.get(style_name)
    if element_class is not None:
        return element_class(raw_text)
    return _text_to_element(raw_text)
def _text_to_element(text: str) -> Optional[Text]:
    """Converts raw text into an unstructured Text element.

    The checks run in order and the first match wins:
    bulleted text -> ListItem (None if nothing remains once the bullet is cleaned off),
    US city/state/zip -> Address, text shorter than two characters -> None,
    narrative text -> NarrativeText, title -> Title, anything else -> Text."""
    if is_bulleted_text(text):
        # Clean once and reuse the result (assumes clean_bullets is deterministic).
        bullet_text = clean_bullets(text)
        return ListItem(text=bullet_text) if bullet_text.strip() else None

    if is_us_city_state_zip(text):
        return Address(text=text)

    # The length check deliberately comes after the bullet and address checks.
    if len(text) < 2:
        return None

    if is_possible_narrative_text(text):
        return NarrativeText(text)
    if is_possible_title(text):
        return Title(text)
    return Text(text)
def convert_and_partition_docx(
    source_format: str,
    filename: Optional[str] = None,
    file: Optional[IO] = None,
) -> List[Element]:
    """Converts a document to DOCX and then partitions it using partition_docx. Works with
    any file format supported by pandoc.

    Parameters
    ----------
    source_format
        The format of the source document, e.g. odt
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").

    Returns
    -------
    List[Element]
        The elements produced by partition_docx for the converted document. The metadata
        filename passed along is ``filename``; when ``file`` is used this is the path of
        the temporary copy written to disk (unchanged from the previous behavior).

    Raises
    ------
    ValueError
        If ``filename`` is provided but does not exist.
    """
    if filename is None:
        filename = ""
    exactly_one(filename=filename, file=file)

    # Path of the on-disk copy made for file-like input; removed again in `finally`.
    tmp_path: Optional[str] = None
    try:
        if len(filename) > 0:
            if not os.path.exists(filename):
                raise ValueError(f"The file {filename} does not exist.")
        elif file is not None:
            # pandoc is given a path, so file-like input is first written to disk.
            # delete=False keeps the file around after it is closed.
            with tempfile.NamedTemporaryFile(delete=False) as tmp:
                tmp_path = tmp.name
                tmp.write(file.read())
            filename = tmp_path

        base_filename, _ = os.path.splitext(os.path.basename(os.path.abspath(filename)))

        # The converted .docx only needs to exist while it is being partitioned.
        with tempfile.TemporaryDirectory() as tmpdir:
            docx_filename = os.path.join(tmpdir, f"{base_filename}.docx")
            pypandoc.convert_file(
                filename,
                "docx",
                format=source_format,
                outputfile=docx_filename,
            )
            elements = partition_docx(filename=docx_filename, metadata_filename=filename)
    finally:
        # Previously the temporary copy was never deleted and leaked one file per call.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.unlink(tmp_path)

    return elements