2023-05-16 15:40:40 -04:00
|
|
|
from tempfile import SpooledTemporaryFile
|
|
|
|
|
from typing import IO, BinaryIO, List, Optional, Union, cast
|
|
|
|
|
|
|
|
|
|
import lxml.html
|
|
|
|
|
import pandas as pd
|
|
|
|
|
|
|
|
|
|
from unstructured.documents.elements import Element, ElementMetadata, Table
|
|
|
|
|
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
|
|
|
|
from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@add_metadata_with_filetype(FileType.XLSX)
def partition_xlsx(
    filename: Optional[str] = None,
    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
    metadata_filename: Optional[str] = None,
    include_metadata: bool = True,
) -> List[Element]:
    """Partitions Microsoft Excel Documents in .xlsx format into its document elements.

    Each sheet of the workbook is read into a DataFrame and emitted as a single
    ``Table`` element. The element text is the text content of the sheet's HTML
    rendering; when metadata is enabled the HTML itself is attached as
    ``text_as_html`` together with the sheet name and a 1-based page number.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    metadata_filename
        The filename to use for the metadata. When provided it takes precedence
        over ``filename``; otherwise ``filename`` is used.
    include_metadata
        Determines whether or not metadata is included in the output.

    Returns
    -------
    List[Element]
        One ``Table`` element per sheet, in the order pandas returns the sheets.
    """
    # Validates the filename/file pair (presumably requires exactly one of them
    # to be supplied -- see unstructured.partition.common.exactly_one).
    exactly_one(filename=filename, file=file)

    # sheet_name=None asks pandas for every sheet, keyed by sheet name.
    if filename:
        sheets = pd.read_excel(filename, sheet_name=None)
    else:
        f = spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file))
        sheets = pd.read_excel(f, sheet_name=None)

    # An explicitly supplied metadata_filename must win over the source path;
    # fall back to the path only when no override was given.
    metadata_filename = metadata_filename or filename

    elements: List[Element] = []
    # Sheets are numbered from 1, mirroring page numbers in other partitioners.
    for page_number, (sheet_name, sheet) in enumerate(sheets.items(), start=1):
        # NOTE(review): read_excel runs with its default header handling while
        # to_html omits column labels -- confirm dropping them is intended.
        html_text = sheet.to_html(index=False, header=False, na_rep="")
        text = lxml.html.document_fromstring(html_text).text_content()

        if include_metadata:
            metadata = ElementMetadata(
                text_as_html=html_text,
                page_name=sheet_name,
                page_number=page_number,
                filename=metadata_filename,
            )
        else:
            metadata = ElementMetadata()

        elements.append(Table(text=text, metadata=metadata))

    return elements
|