feat: add partition_xlsx for MSFT Excel files (#594)

* first pass on partition_xlsx * add support for files * add test for xlsx from filename * added filetype metadata * add xlsx to auto * remove fake excel from unsupported * version and changelog * update docs * update readme * fix removed file reference * fix some more tests * pass in metadata filename * add include_metadata flag
2025-06-27 02:30:08 +00:00 · 2023-05-16 15:40:40 -04:00 · 2023-05-16 15:40:40 -04:00 · b8037118c4
commit b8037118c4
parent 830d67f653
11 changed files with 223 additions and 10 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -10,6 +10,8 @@

 ### Features

+* Add `partition_xlsx` for Microsoft Excel documents.
+
 ### Fixes

 * Supports `hml` filetype for partition as a variation of html filetype.
--- a/README.md
+++ b/README.md
@ -183,7 +183,7 @@ You can run this [Colab notebook](https://colab.research.google.com/drive/1U8VCj

 The following examples show how to get started with the `unstructured` library.
 You can parse **TXT**, **HTML**, **PDF**, **EML**, **MSG**, **RTF**, **EPUB**, **DOC**, **DOCX**,
-**ODT**, **PPT**, **PPTX**, **JPG**,
+**XLSX**, **ODT**, **PPT**, **PPTX**, **JPG**,
 and **PNG** documents with one line of code!
 <br></br>
 See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description
@ -198,7 +198,7 @@ If you are using the `partition` brick, you may need to install additional param
 instructions outlined [here](https://unstructured-io.github.io/unstructured/installing.html#filetype-detection)
 `partition` will always apply the default arguments. If you need
 advanced features, use a document-specific brick. The `partition` brick currently works for
-`.txt`, `.doc`, `.docx`, `.ppt`, `.pptx`, `.jpg`, `.png`, `.eml`, `.msg`, `.html`, and `.pdf` documents.
+`.txt`, `.doc`, `.docx`, `.ppt`, `.pptx`, `.xlsx`, `.jpg`, `.png`, `.eml`, `.msg`, `.html`, and `.pdf` documents.

 ```python
 from unstructured.partition.auto import partition
--- a/docs/source/bricks.rst
+++ b/docs/source/bricks.rst
@ -83,7 +83,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
 file type and route it to the appropriate partitioning brick. All partitioning bricks
 called within ``partition`` are called using the default kwargs. Use the document-type
 specific bricks if you need to apply non-default settings.
-``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.pdf``,
+``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.pdf``,
 ``.png``, ``.jpg``, and ``.txt`` files.
 If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
 ``.png``, and ``.jpg``.
@ -251,6 +251,24 @@ Examples:
  elements = partition_doc(filename="example-docs/fake.doc")


+``partition_xlsx``
+------------------
+
+The ``partition_xlsx`` function pre-processes Microsoft Excel documents. Each
+sheet in the Excel file will be stored as a ``Table`` object. The plain text
+of the sheet will be the ``text`` attribute of the ``Table``. The ``text_as_html``
+attribute in the element metadata will contain an HTML representation of the table.
+
+Examples:
+
+.. code:: python
+
+  from unstructured.partition.xlsx import partition_xlsx
+
+  elements = partition_xlsx(filename="example-docs/stanley-cups.xlsx")
+  print(elements[0].metadata.text_as_html)
+
+
 ``partition_odt``
 ------------------

--- a/example-docs/stanley-cups.xlsx
+++ b/example-docs/stanley-cups.xlsx
--- a/example-docs/unsupported/fake-excel.xlsx
+++ b/example-docs/unsupported/fake-excel.xlsx
--- a/test_unstructured/file_utils/test_filetype.py
+++ b/test_unstructured/file_utils/test_filetype.py
@ -29,7 +29,7 @@ EXAMPLE_DOCS_DIRECTORY = os.path.join(FILE_DIRECTORY, "..", "..", "example-docs"
        ("unsupported/factbook.xml", FileType.XML),
        ("example-10k.html", FileType.HTML),
        ("fake-html.html", FileType.HTML),
-        ("unsupported/fake-excel.xlsx", FileType.XLSX),
+        ("stanley-cups.xlsx", FileType.XLSX),
        ("fake-power-point.pptx", FileType.PPTX),
        ("winter-sports.epub", FileType.EPUB),
        ("spring-weather.html.json", FileType.JSON),
@ -52,7 +52,7 @@ def test_detect_filetype_from_filename(file, expected):
        ("unsupported/factbook.xml", FileType.XML),
        ("example-10k.html", FileType.HTML),
        ("fake-html.html", FileType.HTML),
-        ("unsupported/fake-excel.xlsx", FileType.XLSX),
+        ("stanley-cups.xlsx", FileType.XLSX),
        ("fake-power-point.pptx", FileType.PPTX),
        ("winter-sports.epub", FileType.EPUB),
        ("fake-doc.rtf", FileType.RTF),
@ -87,7 +87,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte
        # */xml and some return */html. Either could be acceptable depending on the OS
        ("example-10k.html", [FileType.HTML, FileType.XML]),
        ("fake-html.html", FileType.HTML),
-        ("unsupported/fake-excel.xlsx", FileType.XLSX),
+        ("stanley-cups.xlsx", FileType.XLSX),
        ("fake-power-point.pptx", FileType.PPTX),
        ("winter-sports.epub", FileType.EPUB),
    ],
@ -192,7 +192,7 @@ def test_detect_xls_file_from_mime_type(monkeypatch):

 def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):
    monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "unsupported", "fake-excel.xlsx")
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx")
    with open(filename, "rb") as f:
        filetype = detect_filetype(file=f)
    assert filetype == FileType.XLSX
@ -200,7 +200,7 @@ def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):

 def test_detect_xlsx_filetype_application_octet_stream_with_filename(monkeypatch):
    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/octet-stream")
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "unsupported", "fake-excel.xlsx")
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx")
    filetype = detect_filetype(filename=filename)
    assert filetype == FileType.XLSX

@ -246,7 +246,7 @@ def test_detect_docx_filetype_word_mime_type(monkeypatch):

 def test_detect_xlsx_filetype_word_mime_type(monkeypatch):
    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: XLSX_MIME_TYPES[0])
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "unsupported", "fake-excel.xlsx")
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx")
    with open(filename, "rb") as f:
        filetype = detect_filetype(file=f)
    assert filetype == FileType.XLSX
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@ -9,12 +9,14 @@ import docx
 import pypandoc
 import pytest

+from unstructured.cleaners.core import clean_extra_whitespace
 from unstructured.documents.elements import (
    Address,
    ElementMetadata,
    ListItem,
    NarrativeText,
    PageBreak,
+    Table,
    Text,
    Title,
 )
@ -609,3 +611,59 @@ def test_file_specific_produces_correct_filetype(filetype: FileType):
            elements = fun(str(file))
            assert all(el.metadata.filetype == FILETYPE_TO_MIMETYPE[filetype] for el in elements)
            break
+
+
+# Expected fixtures for example-docs/stanley-cups.xlsx, used by the auto-partition
+# xlsx tests below. They describe the first element returned for that workbook.
+
+# Compared against elements[0].metadata.text_as_html.
+EXPECTED_XLSX_TABLE = """<table border="1" class="dataframe">
+  <tbody>
+    <tr>
+      <td>Team</td>
+      <td>Location</td>
+      <td>Stanley Cups</td>
+    </tr>
+    <tr>
+      <td>Blues</td>
+      <td>STL</td>
+      <td>1</td>
+    </tr>
+    <tr>
+      <td>Flyers</td>
+      <td>PHI</td>
+      <td>2</td>
+    </tr>
+    <tr>
+      <td>Maple Leafs</td>
+      <td>TOR</td>
+      <td>13</td>
+    </tr>
+  </tbody>
+</table>"""
+
+
+# Compared against elements[0].text after clean_extra_whitespace has been applied.
+EXPECTED_XLSX_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
+
+# Compared against elements[0].metadata.filetype.
+EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+
+
+def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
+    """Auto-partition the sample workbook from a path and check the first Table element."""
+    elements = partition(filename=filename)
+
+    # The sample workbook is expected to yield exactly two elements, all of them Tables.
+    assert len(elements) == 2
+    for element in elements:
+        assert isinstance(element, Table)
+
+    first = elements[0]
+    assert clean_extra_whitespace(first.text) == EXPECTED_XLSX_TEXT
+    assert first.metadata.text_as_html == EXPECTED_XLSX_TABLE
+    assert first.metadata.page_number == 1
+    assert first.metadata.filetype == EXPECTED_XLSX_FILETYPE
+
+
+def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
+    """Auto-partition the sample workbook from an open binary file object."""
+    with open(filename, "rb") as workbook:
+        elements = partition(file=workbook)
+
+    # The sample workbook is expected to yield exactly two elements, all of them Tables.
+    assert len(elements) == 2
+    for element in elements:
+        assert isinstance(element, Table)
+
+    first = elements[0]
+    assert clean_extra_whitespace(first.text) == EXPECTED_XLSX_TEXT
+    assert first.metadata.text_as_html == EXPECTED_XLSX_TABLE
+    assert first.metadata.page_number == 1
+    assert first.metadata.filetype == EXPECTED_XLSX_FILETYPE
--- a/test_unstructured/partition/test_xlsx.py
+++ b/test_unstructured/partition/test_xlsx.py
@ -0,0 +1,70 @@
+from unstructured.cleaners.core import clean_extra_whitespace
+from unstructured.documents.elements import Table
+from unstructured.partition.xlsx import partition_xlsx
+
+# Expected fixtures for example-docs/stanley-cups.xlsx, used by the partition_xlsx
+# tests below. They describe the first element returned for that workbook.
+
+# Compared against elements[0].metadata.text_as_html.
+EXPECTED_TABLE = """<table border="1" class="dataframe">
+  <tbody>
+    <tr>
+      <td>Team</td>
+      <td>Location</td>
+      <td>Stanley Cups</td>
+    </tr>
+    <tr>
+      <td>Blues</td>
+      <td>STL</td>
+      <td>1</td>
+    </tr>
+    <tr>
+      <td>Flyers</td>
+      <td>PHI</td>
+      <td>2</td>
+    </tr>
+    <tr>
+      <td>Maple Leafs</td>
+      <td>TOR</td>
+      <td>13</td>
+    </tr>
+  </tbody>
+</table>"""
+
+
+# Compared against elements[0].text after clean_extra_whitespace has been applied.
+EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
+
+# Compared against elements[0].metadata.filetype.
+EXPECTED_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+
+
+def test_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
+    """Partition the sample workbook from a path and check the first Table element."""
+    elements = partition_xlsx(filename=filename)
+
+    # The sample workbook is expected to yield exactly two elements, all of them Tables.
+    assert len(elements) == 2
+    for element in elements:
+        assert isinstance(element, Table)
+
+    first = elements[0]
+    assert clean_extra_whitespace(first.text) == EXPECTED_TEXT
+    assert first.metadata.text_as_html == EXPECTED_TABLE
+    assert first.metadata.page_number == 1
+    assert first.metadata.filetype == EXPECTED_FILETYPE
+
+
+def test_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
+    """Partition the sample workbook from an open binary file object."""
+    with open(filename, "rb") as workbook:
+        elements = partition_xlsx(file=workbook)
+
+    # The sample workbook is expected to yield exactly two elements, all of them Tables.
+    assert len(elements) == 2
+    for element in elements:
+        assert isinstance(element, Table)
+
+    first = elements[0]
+    assert clean_extra_whitespace(first.text) == EXPECTED_TEXT
+    assert first.metadata.text_as_html == EXPECTED_TABLE
+    assert first.metadata.page_number == 1
+    assert first.metadata.filetype == EXPECTED_FILETYPE
+
+
+def test_partition_xlsx_can_exclude_metadata(filename="example-docs/stanley-cups.xlsx"):
+    """With include_metadata=False the text is still extracted but metadata stays unset."""
+    elements = partition_xlsx(filename=filename, include_metadata=False)
+
+    # The sample workbook is expected to yield exactly two elements, all of them Tables.
+    assert len(elements) == 2
+    for element in elements:
+        assert isinstance(element, Table)
+
+    first = elements[0]
+    assert clean_extra_whitespace(first.text) == EXPECTED_TEXT
+    assert first.metadata.text_as_html is None
+    assert first.metadata.page_number is None
+    assert first.metadata.filetype is None
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@ -77,7 +77,6 @@ EXPECTED_DOCX_FILES = [
 ]

 EXPECTED_XLSX_FILES = [
-    "docProps/core.xml",
    "xl/workbook.xml",
 ]

--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@ -26,6 +26,7 @@ from unstructured.partition.ppt import partition_ppt
 from unstructured.partition.pptx import partition_pptx
 from unstructured.partition.rtf import partition_rtf
 from unstructured.partition.text import partition_text
+from unstructured.partition.xlsx import partition_xlsx


 def partition(
@ -183,6 +184,8 @@ def partition(
        )
    elif filetype == FileType.JSON:
        elements = partition_json(filename=filename, file=file)
+    elif filetype == FileType.XLSX:
+        elements = partition_xlsx(filename=filename, file=file)
    else:
        msg = "Invalid file" if not filename else f"Invalid file {filename}"
        raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")
--- a/unstructured/partition/xlsx.py
+++ b/unstructured/partition/xlsx.py
@ -0,0 +1,63 @@
+from tempfile import SpooledTemporaryFile
+from typing import IO, BinaryIO, List, Optional, Union, cast
+
+import lxml.html
+import pandas as pd
+
+from unstructured.documents.elements import Element, ElementMetadata, Table
+from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
+from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
+
+
+@add_metadata_with_filetype(FileType.XLSX)
+def partition_xlsx(
+    filename: Optional[str] = None,
+    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
+    metadata_filename: Optional[str] = None,
+    include_metadata: bool = True,
+) -> List[Element]:
+    """Partitions Microsoft Excel Documents in .xlsx format into its document elements.
+
+    Every sheet in the workbook becomes one Table element whose text is the plain text
+    of the sheet. When metadata is included, the HTML rendering of the sheet is stored in
+    metadata.text_as_html and the 1-based position of the sheet in metadata.page_number.
+
+    Parameters
+    ----------
+    filename
+        A string defining the target filename path.
+    file
+        A file-like object using "rb" mode --> open(filename, "rb").
+    metadata_filename
+        The filename to use for the metadata. When provided, it takes precedence over
+        filename, so a caller that partitions a converted or temporary copy of a document
+        can still record the original source filename in the metadata.
+    include_metadata
+        Determines whether or not metadata is included in the output.
+    """
+    exactly_one(filename=filename, file=file)
+
+    # NOTE(review): read_excel keeps its default header handling here while to_html below
+    # is called with header=False, so the row pandas takes as column labels is not part
+    # of the output. Confirm this is intended before relying on the first row of a sheet.
+    if filename:
+        sheets = pd.read_excel(filename, sheet_name=None)
+    else:
+        f = spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file))
+        sheets = pd.read_excel(f, sheet_name=None)
+
+    # An explicit metadata_filename must win over the path that was actually read;
+    # otherwise the override would be silently ignored whenever filename is given.
+    metadata_filename = metadata_filename or filename
+
+    elements: List[Element] = []
+    # sheet_name=None yields a mapping of sheet name -> DataFrame; only the frames and
+    # their order are needed, so the sheets are numbered starting from 1.
+    for page_number, sheet in enumerate(sheets.values(), start=1):
+        html_text = sheet.to_html(index=False, header=False, na_rep="")
+        # Strip the markup again to obtain the plain text carried by the element.
+        text = lxml.html.document_fromstring(html_text).text_content()
+
+        if include_metadata:
+            metadata = ElementMetadata(
+                text_as_html=html_text,
+                page_number=page_number,
+                filename=metadata_filename,
+            )
+        else:
+            metadata = ElementMetadata()
+
+        elements.append(Table(text=text, metadata=metadata))
+
+    return elements