feat: add partition_xlsx for MSFT Excel files (#594)

* first pass on partition_xlsx

* add support for files

* add test for xlsx from filename

* added filetype metadata

* add xlsx to auto

* remove fake excel from unsupported

* version and changelog

* update docs

* update readme

* fix removed file reference

* fix some more tests

* pass in metadata filename

* add include_metadata flag
This commit is contained in:
Matt Robinson 2023-05-16 15:40:40 -04:00 committed by GitHub
parent 830d67f653
commit b8037118c4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 223 additions and 10 deletions

View File

@ -10,6 +10,8 @@
### Features
* Add `partition_xlsx` for Microsoft Excel documents.
### Fixes
* Supports `hml` filetype for partition as a variation of html filetype.

View File

@ -183,7 +183,7 @@ You can run this [Colab notebook](https://colab.research.google.com/drive/1U8VCj
The following examples show how to get started with the `unstructured` library.
You can parse **TXT**, **HTML**, **PDF**, **EML**, **MSG**, **RTF**, **EPUB**, **DOC**, **DOCX**,
**ODT**, **PPT**, **PPTX**, **JPG**,
**XLSX**, **ODT**, **PPT**, **PPTX**, **JPG**,
and **PNG** documents with one line of code!
<br></br>
See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description
@ -198,7 +198,7 @@ If you are using the `partition` brick, you may need to install additional param
instructions outlined [here](https://unstructured-io.github.io/unstructured/installing.html#filetype-detection)
`partition` will always apply the default arguments. If you need
advanced features, use a document-specific brick. The `partition` brick currently works for
`.txt`, `.doc`, `.docx`, `.ppt`, `.pptx`, `.jpg`, `.png`, `.eml`, `.msg`, `.html`, and `.pdf` documents.
`.txt`, `.doc`, `.docx`, `.ppt`, `.pptx`, `.xlsx`, `.jpg`, `.png`, `.eml`, `.msg`, `.html`, and `.pdf` documents.
```python
from unstructured.partition.auto import partition

View File

@ -83,7 +83,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
file type and route it to the appropriate partitioning brick. All partitioning bricks
called within ``partition`` are called using the default kwargs. Use the document-type
specific bricks if you need to apply non-default settings.
``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.pdf``,
``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.pdf``,
``.png``, ``.jpg``, and ``.txt`` files.
If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
``.png``, and ``.jpg``.
@ -251,6 +251,24 @@ Examples:
elements = partition_doc(filename="example-docs/fake.doc")
``partition_xlsx``
------------------
The ``partition_xlsx`` function pre-processes Microsoft Excel documents. Each
sheet in the Excel file will be stored as a ``Table`` object. The plain text
of the sheet will be the ``text`` attribute of the ``Table``. The ``text_as_html``
attribute in the element metadata will contain an HTML representation of the table.
Examples:
.. code:: python
from unstructured.partition.xlsx import partition_xlsx
elements = partition_xlsx(filename="example-docs/stanley-cups.xlsx")
print(elements[0].metadata.text_as_html)
``partition_odt``
------------------

Binary file not shown.

View File

@ -29,7 +29,7 @@ EXAMPLE_DOCS_DIRECTORY = os.path.join(FILE_DIRECTORY, "..", "..", "example-docs"
("unsupported/factbook.xml", FileType.XML),
("example-10k.html", FileType.HTML),
("fake-html.html", FileType.HTML),
("unsupported/fake-excel.xlsx", FileType.XLSX),
("stanley-cups.xlsx", FileType.XLSX),
("fake-power-point.pptx", FileType.PPTX),
("winter-sports.epub", FileType.EPUB),
("spring-weather.html.json", FileType.JSON),
@ -52,7 +52,7 @@ def test_detect_filetype_from_filename(file, expected):
("unsupported/factbook.xml", FileType.XML),
("example-10k.html", FileType.HTML),
("fake-html.html", FileType.HTML),
("unsupported/fake-excel.xlsx", FileType.XLSX),
("stanley-cups.xlsx", FileType.XLSX),
("fake-power-point.pptx", FileType.PPTX),
("winter-sports.epub", FileType.EPUB),
("fake-doc.rtf", FileType.RTF),
@ -87,7 +87,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte
# */xml and some return */html. Either could be acceptable depending on the OS
("example-10k.html", [FileType.HTML, FileType.XML]),
("fake-html.html", FileType.HTML),
("unsupported/fake-excel.xlsx", FileType.XLSX),
("stanley-cups.xlsx", FileType.XLSX),
("fake-power-point.pptx", FileType.PPTX),
("winter-sports.epub", FileType.EPUB),
],
@ -192,7 +192,7 @@ def test_detect_xls_file_from_mime_type(monkeypatch):
def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):
monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "unsupported", "fake-excel.xlsx")
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx")
with open(filename, "rb") as f:
filetype = detect_filetype(file=f)
assert filetype == FileType.XLSX
@ -200,7 +200,7 @@ def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):
def test_detect_xlsx_filetype_application_octet_stream_with_filename(monkeypatch):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/octet-stream")
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "unsupported", "fake-excel.xlsx")
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx")
filetype = detect_filetype(filename=filename)
assert filetype == FileType.XLSX
@ -246,7 +246,7 @@ def test_detect_docx_filetype_word_mime_type(monkeypatch):
def test_detect_xlsx_filetype_word_mime_type(monkeypatch):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: XLSX_MIME_TYPES[0])
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "unsupported", "fake-excel.xlsx")
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx")
with open(filename, "rb") as f:
filetype = detect_filetype(file=f)
assert filetype == FileType.XLSX

View File

@ -9,12 +9,14 @@ import docx
import pypandoc
import pytest
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import (
Address,
ElementMetadata,
ListItem,
NarrativeText,
PageBreak,
Table,
Text,
Title,
)
@ -609,3 +611,59 @@ def test_file_specific_produces_correct_filetype(filetype: FileType):
elements = fun(str(file))
assert all(el.metadata.filetype == FILETYPE_TO_MIMETYPE[filetype] for el in elements)
break
# Expected HTML for the first element produced from example-docs/stanley-cups.xlsx;
# the tests below compare it against ``elements[0].metadata.text_as_html``.
EXPECTED_XLSX_TABLE = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Team</td>
<td>Location</td>
<td>Stanley Cups</td>
</tr>
<tr>
<td>Blues</td>
<td>STL</td>
<td>1</td>
</tr>
<tr>
<td>Flyers</td>
<td>PHI</td>
<td>2</td>
</tr>
<tr>
<td>Maple Leafs</td>
<td>TOR</td>
<td>13</td>
</tr>
</tbody>
</table>"""
# Expected plain text of that same element; the tests pass the element text through
# ``clean_extra_whitespace`` before comparing against this value.
EXPECTED_XLSX_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
# MIME type the tests expect in ``metadata.filetype`` for .xlsx input.
EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
    """``partition`` given an .xlsx path returns two ``Table`` elements with metadata."""
    elements = partition(filename=filename)

    # Every element must be a Table, and the example workbook yields exactly two.
    for element in elements:
        assert isinstance(element, Table)
    assert len(elements) == 2

    # Spot-check text and metadata on the first element.
    first_table = elements[0]
    assert clean_extra_whitespace(first_table.text) == EXPECTED_XLSX_TEXT
    first_metadata = first_table.metadata
    assert first_metadata.text_as_html == EXPECTED_XLSX_TABLE
    assert first_metadata.page_number == 1
    assert first_metadata.filetype == EXPECTED_XLSX_FILETYPE
def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
    """``partition`` given an open binary .xlsx file returns two ``Table`` elements."""
    with open(filename, "rb") as xlsx_file:
        elements = partition(file=xlsx_file)

    # Every element must be a Table, and the example workbook yields exactly two.
    for element in elements:
        assert isinstance(element, Table)
    assert len(elements) == 2

    # Spot-check text and metadata on the first element.
    first_table = elements[0]
    assert clean_extra_whitespace(first_table.text) == EXPECTED_XLSX_TEXT
    first_metadata = first_table.metadata
    assert first_metadata.text_as_html == EXPECTED_XLSX_TABLE
    assert first_metadata.page_number == 1
    assert first_metadata.filetype == EXPECTED_XLSX_FILETYPE

View File

@ -0,0 +1,70 @@
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import Table
from unstructured.partition.xlsx import partition_xlsx
# Expected HTML for the first element produced from example-docs/stanley-cups.xlsx;
# the tests below compare it against ``elements[0].metadata.text_as_html``.
EXPECTED_TABLE = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Team</td>
<td>Location</td>
<td>Stanley Cups</td>
</tr>
<tr>
<td>Blues</td>
<td>STL</td>
<td>1</td>
</tr>
<tr>
<td>Flyers</td>
<td>PHI</td>
<td>2</td>
</tr>
<tr>
<td>Maple Leafs</td>
<td>TOR</td>
<td>13</td>
</tr>
</tbody>
</table>"""
# Expected plain text of that same element; the tests pass the element text through
# ``clean_extra_whitespace`` before comparing against this value.
EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
# MIME type the tests expect in ``metadata.filetype`` for .xlsx input.
EXPECTED_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
def test_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
    """``partition_xlsx`` given a path returns two ``Table`` elements with metadata."""
    elements = partition_xlsx(filename=filename)

    # Every element must be a Table, and the example workbook yields exactly two.
    for element in elements:
        assert isinstance(element, Table)
    assert len(elements) == 2

    # Spot-check text and metadata on the first element.
    first_table = elements[0]
    assert clean_extra_whitespace(first_table.text) == EXPECTED_TEXT
    first_metadata = first_table.metadata
    assert first_metadata.text_as_html == EXPECTED_TABLE
    assert first_metadata.page_number == 1
    assert first_metadata.filetype == EXPECTED_FILETYPE
def test_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
    """``partition_xlsx`` given an open binary file returns two ``Table`` elements."""
    with open(filename, "rb") as xlsx_file:
        elements = partition_xlsx(file=xlsx_file)

    # Every element must be a Table, and the example workbook yields exactly two.
    for element in elements:
        assert isinstance(element, Table)
    assert len(elements) == 2

    # Spot-check text and metadata on the first element.
    first_table = elements[0]
    assert clean_extra_whitespace(first_table.text) == EXPECTED_TEXT
    first_metadata = first_table.metadata
    assert first_metadata.text_as_html == EXPECTED_TABLE
    assert first_metadata.page_number == 1
    assert first_metadata.filetype == EXPECTED_FILETYPE
def test_partition_xlsx_can_exclude_metadata(filename="example-docs/stanley-cups.xlsx"):
    """With ``include_metadata=False`` the text is unchanged but metadata fields are unset."""
    elements = partition_xlsx(filename=filename, include_metadata=False)

    # Element types and count do not depend on the metadata flag.
    for element in elements:
        assert isinstance(element, Table)
    assert len(elements) == 2

    # Text is still extracted; the metadata fields checked here must all be None.
    first_table = elements[0]
    assert clean_extra_whitespace(first_table.text) == EXPECTED_TEXT
    first_metadata = first_table.metadata
    assert first_metadata.text_as_html is None
    assert first_metadata.page_number is None
    assert first_metadata.filetype is None

View File

@ -77,7 +77,6 @@ EXPECTED_DOCX_FILES = [
]
EXPECTED_XLSX_FILES = [
"docProps/core.xml",
"xl/workbook.xml",
]

View File

@ -26,6 +26,7 @@ from unstructured.partition.ppt import partition_ppt
from unstructured.partition.pptx import partition_pptx
from unstructured.partition.rtf import partition_rtf
from unstructured.partition.text import partition_text
from unstructured.partition.xlsx import partition_xlsx
def partition(
@ -183,6 +184,8 @@ def partition(
)
elif filetype == FileType.JSON:
elements = partition_json(filename=filename, file=file)
elif filetype == FileType.XLSX:
elements = partition_xlsx(filename=filename, file=file)
else:
msg = "Invalid file" if not filename else f"Invalid file {filename}"
raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")

View File

@ -0,0 +1,63 @@
from tempfile import SpooledTemporaryFile
from typing import IO, BinaryIO, List, Optional, Union, cast
import lxml.html
import pandas as pd
from unstructured.documents.elements import Element, ElementMetadata, Table
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
@add_metadata_with_filetype(FileType.XLSX)
def partition_xlsx(
    filename: Optional[str] = None,
    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
    metadata_filename: Optional[str] = None,
    include_metadata: bool = True,
) -> List[Element]:
    """Partitions Microsoft Excel Documents in .xlsx format into its document elements.

    Every sheet in the workbook becomes one ``Table`` element. The element text is the
    text content of the sheet's HTML rendering, and (when metadata is included) that
    HTML is stored in ``metadata.text_as_html`` together with a 1-based ``page_number``
    giving the sheet's position in the workbook.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    metadata_filename
        The filename to record in the element metadata. When provided it takes
        precedence over ``filename``, so that the original source filename can be
        reported even if the data is read from a different path or from a file object.
    include_metadata
        Determines whether or not metadata is included in the output.

    Returns
    -------
    List[Element]
        One ``Table`` element per sheet, in the order the sheets are returned by pandas.
    """
    # exactly_one is called first so that the source arguments are validated before
    # any file is read.
    exactly_one(filename=filename, file=file)

    # sheet_name=None makes pandas load every sheet, keyed by sheet name.
    # NOTE(review): read_excel uses its default header handling here while to_html is
    # called with header=False below, so column labels are not rendered -- confirm that
    # leaving each sheet's header row out of the output is intended.
    if filename:
        sheets = pd.read_excel(filename, sheet_name=None)
    else:
        f = spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file))
        sheets = pd.read_excel(f, sheet_name=None)

    # An explicit metadata_filename wins; fall back to the path that was actually read.
    metadata_filename = metadata_filename or filename

    elements: List[Element] = []
    for page_number, sheet in enumerate(sheets.values(), start=1):
        html_text = sheet.to_html(index=False, header=False, na_rep="")
        # Parsing the generated HTML and taking its text content strips the markup.
        text = lxml.html.document_fromstring(html_text).text_content()

        if include_metadata:
            metadata = ElementMetadata(
                text_as_html=html_text,
                page_number=page_number,
                filename=metadata_filename,
            )
        else:
            metadata = ElementMetadata()

        elements.append(Table(text=text, metadata=metadata))

    return elements