63 lines
2.0 KiB
Python
Raw Normal View History

from tempfile import SpooledTemporaryFile
from typing import IO, BinaryIO, List, Optional, Union, cast
import lxml.html
import pandas as pd
from unstructured.documents.elements import Element, ElementMetadata, Table
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
@add_metadata_with_filetype(FileType.XLSX)
def partition_xlsx(
    filename: Optional[str] = None,
    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
    metadata_filename: Optional[str] = None,
    include_metadata: bool = True,
) -> List[Element]:
    """Partitions Microsoft Excel Documents in .xlsx format into its document elements.

    Each sheet of the workbook is rendered to an HTML table and emitted as a single
    ``Table`` element whose text is the text content of that HTML. Sheets are numbered
    from 1 in the order pandas returns them.

    Exactly one of ``filename`` and ``file`` must be supplied; this is checked by
    ``exactly_one`` before any reading takes place.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    metadata_filename
        The filename to use for the metadata. When provided, it takes precedence
        over ``filename``; otherwise ``filename`` is used.
    include_metadata
        Determines whether or not metadata is included in the output.

    Returns
    -------
    List[Element]
        One ``Table`` element per sheet.
    """
    exactly_one(filename=filename, file=file)

    # header=None keeps the first row of every sheet as data. With the pandas default
    # (header=0) that row is promoted to column labels, and because the HTML below is
    # rendered with header=False it would silently disappear from the output.
    if filename:
        sheets = pd.read_excel(filename, sheet_name=None, header=None)
    else:
        f = spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file))
        sheets = pd.read_excel(f, sheet_name=None, header=None)

    # An explicit metadata_filename is an override, so it wins over the source path.
    metadata_filename = metadata_filename or filename

    elements: List[Element] = []
    # sheet_name=None makes read_excel return a mapping of sheet name -> DataFrame.
    for page_number, (sheet_name, sheet) in enumerate(sheets.items(), start=1):
        # header=False: the integer column labels produced by header=None are not content.
        html_text = sheet.to_html(index=False, header=False, na_rep="")
        text = lxml.html.document_fromstring(html_text).text_content()

        if include_metadata:
            metadata = ElementMetadata(
                text_as_html=html_text,
                page_name=sheet_name,
                page_number=page_number,
                filename=metadata_filename,
            )
        else:
            metadata = ElementMetadata()

        elements.append(Table(text=text, metadata=metadata))

    return elements