mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
feat: add partition_xlsx
for MSFT Excel files (#594)
* first pass on partition_xlsx * add support for files * add test for xlsx from filename * added filetype metadata * add xlsx to auto * remove fake excel from unsupported * version and changelog * update docs * update readme * fix removed file reference * fix some more tests * pass in metadata filename * add include_metadata flag
This commit is contained in:
parent
830d67f653
commit
b8037118c4
@ -10,6 +10,8 @@
|
||||
|
||||
### Features
|
||||
|
||||
* Add `partition_xlsx` for Microsoft Excel documents.
|
||||
|
||||
### Fixes
|
||||
|
||||
* Supports `hml` filetype for partition as a variation of html filetype.
|
||||
|
@ -183,7 +183,7 @@ You can run this [Colab notebook](https://colab.research.google.com/drive/1U8VCj
|
||||
|
||||
The following examples show how to get started with the `unstructured` library.
|
||||
You can parse **TXT**, **HTML**, **PDF**, **EML**, **MSG**, **RTF**, **EPUB**, **DOC**, **DOCX**,
|
||||
**ODT**, **PPT**, **PPTX**, **JPG**,
|
||||
**XLSX**, **ODT**, **PPT**, **PPTX**, **JPG**,
|
||||
and **PNG** documents with one line of code!
|
||||
<br></br>
|
||||
See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description
|
||||
@ -198,7 +198,7 @@ If you are using the `partition` brick, you may need to install additional param
|
||||
instructions outlined [here](https://unstructured-io.github.io/unstructured/installing.html#filetype-detection)
|
||||
`partition` will always apply the default arguments. If you need
|
||||
advanced features, use a document-specific brick. The `partition` brick currently works for
|
||||
`.txt`, `.doc`, `.docx`, `.ppt`, `.pptx`, `.jpg`, `.png`, `.eml`, `.msg`, `.html`, and `.pdf` documents.
|
||||
`.txt`, `.doc`, `.docx`, `.ppt`, `.pptx`, `.xlsx`, `.jpg`, `.png`, `.eml`, `.msg`, `.html`, and `.pdf` documents.
|
||||
|
||||
```python
|
||||
from unstructured.partition.auto import partition
|
||||
|
@ -83,7 +83,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
|
||||
file type and route it to the appropriate partitioning brick. All partitioning bricks
|
||||
called within ``partition`` are called using the default kwargs. Use the document-type
|
||||
specific bricks if you need to apply non-default settings.
|
||||
``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.pdf``,
|
||||
``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.pdf``,
|
||||
``.png``, ``.jpg``, and ``.txt`` files.
|
||||
If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
|
||||
``.png``, and ``.jpg``.
|
||||
@ -251,6 +251,24 @@ Examples:
|
||||
elements = partition_doc(filename="example-docs/fake.doc")
|
||||
|
||||
|
||||
``partition_xlsx``
|
||||
------------------
|
||||
|
||||
The ``partition_xlsx`` function pre-processes Microsoft Excel documents. Each
|
||||
sheet in the Excel file will be stored as a ``Table`` object. The plain text
|
||||
of the sheet will be the ``text`` attribute of the ``Table``. The ``text_as_html``
|
||||
attribute in the element metadata will contain an HTML representation of the table.
|
||||
|
||||
Examples:
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.partition.xlsx import partition_xlsx
|
||||
|
||||
elements = partition_xlsx(filename="example-docs/stanley-cups.xlsx")
|
||||
print(elements[0].metadata.text_as_html)
|
||||
|
||||
|
||||
``partition_odt``
|
||||
------------------
|
||||
|
||||
|
BIN
example-docs/stanley-cups.xlsx
Normal file
BIN
example-docs/stanley-cups.xlsx
Normal file
Binary file not shown.
Binary file not shown.
@ -29,7 +29,7 @@ EXAMPLE_DOCS_DIRECTORY = os.path.join(FILE_DIRECTORY, "..", "..", "example-docs"
|
||||
("unsupported/factbook.xml", FileType.XML),
|
||||
("example-10k.html", FileType.HTML),
|
||||
("fake-html.html", FileType.HTML),
|
||||
("unsupported/fake-excel.xlsx", FileType.XLSX),
|
||||
("stanley-cups.xlsx", FileType.XLSX),
|
||||
("fake-power-point.pptx", FileType.PPTX),
|
||||
("winter-sports.epub", FileType.EPUB),
|
||||
("spring-weather.html.json", FileType.JSON),
|
||||
@ -52,7 +52,7 @@ def test_detect_filetype_from_filename(file, expected):
|
||||
("unsupported/factbook.xml", FileType.XML),
|
||||
("example-10k.html", FileType.HTML),
|
||||
("fake-html.html", FileType.HTML),
|
||||
("unsupported/fake-excel.xlsx", FileType.XLSX),
|
||||
("stanley-cups.xlsx", FileType.XLSX),
|
||||
("fake-power-point.pptx", FileType.PPTX),
|
||||
("winter-sports.epub", FileType.EPUB),
|
||||
("fake-doc.rtf", FileType.RTF),
|
||||
@ -87,7 +87,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte
|
||||
# */xml and some return */html. Either could be acceptable depending on the OS
|
||||
("example-10k.html", [FileType.HTML, FileType.XML]),
|
||||
("fake-html.html", FileType.HTML),
|
||||
("unsupported/fake-excel.xlsx", FileType.XLSX),
|
||||
("stanley-cups.xlsx", FileType.XLSX),
|
||||
("fake-power-point.pptx", FileType.PPTX),
|
||||
("winter-sports.epub", FileType.EPUB),
|
||||
],
|
||||
@ -192,7 +192,7 @@ def test_detect_xls_file_from_mime_type(monkeypatch):
|
||||
|
||||
def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):
|
||||
monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "unsupported", "fake-excel.xlsx")
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx")
|
||||
with open(filename, "rb") as f:
|
||||
filetype = detect_filetype(file=f)
|
||||
assert filetype == FileType.XLSX
|
||||
@ -200,7 +200,7 @@ def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):
|
||||
|
||||
def test_detect_xlsx_filetype_application_octet_stream_with_filename(monkeypatch):
|
||||
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/octet-stream")
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "unsupported", "fake-excel.xlsx")
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx")
|
||||
filetype = detect_filetype(filename=filename)
|
||||
assert filetype == FileType.XLSX
|
||||
|
||||
@ -246,7 +246,7 @@ def test_detect_docx_filetype_word_mime_type(monkeypatch):
|
||||
|
||||
def test_detect_xlsx_filetype_word_mime_type(monkeypatch):
|
||||
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: XLSX_MIME_TYPES[0])
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "unsupported", "fake-excel.xlsx")
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx")
|
||||
with open(filename, "rb") as f:
|
||||
filetype = detect_filetype(file=f)
|
||||
assert filetype == FileType.XLSX
|
||||
|
@ -9,12 +9,14 @@ import docx
|
||||
import pypandoc
|
||||
import pytest
|
||||
|
||||
from unstructured.cleaners.core import clean_extra_whitespace
|
||||
from unstructured.documents.elements import (
|
||||
Address,
|
||||
ElementMetadata,
|
||||
ListItem,
|
||||
NarrativeText,
|
||||
PageBreak,
|
||||
Table,
|
||||
Text,
|
||||
Title,
|
||||
)
|
||||
@ -609,3 +611,59 @@ def test_file_specific_produces_correct_filetype(filetype: FileType):
|
||||
elements = fun(str(file))
|
||||
assert all(el.metadata.filetype == FILETYPE_TO_MIMETYPE[filetype] for el in elements)
|
||||
break
|
||||
|
||||
|
||||
EXPECTED_XLSX_TABLE = """<table border="1" class="dataframe">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>Team</td>
|
||||
<td>Location</td>
|
||||
<td>Stanley Cups</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Blues</td>
|
||||
<td>STL</td>
|
||||
<td>1</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Flyers</td>
|
||||
<td>PHI</td>
|
||||
<td>2</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Maple Leafs</td>
|
||||
<td>TOR</td>
|
||||
<td>13</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>"""
|
||||
|
||||
|
||||
EXPECTED_XLSX_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
|
||||
|
||||
EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
|
||||
|
||||
def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
    """Check that ``partition`` handles an .xlsx workbook given by path.

    The example workbook is expected to yield two ``Table`` elements; the
    first one is compared against the expected text, HTML, page number and
    MIME type.
    """
    sheets = partition(filename=filename)

    for sheet in sheets:
        assert isinstance(sheet, Table)
    assert len(sheets) == 2

    first_sheet = sheets[0]
    first_meta = first_sheet.metadata
    # Whitespace is normalised before comparing the flattened sheet text.
    assert clean_extra_whitespace(first_sheet.text) == EXPECTED_XLSX_TEXT
    assert first_meta.text_as_html == EXPECTED_XLSX_TABLE
    assert first_meta.page_number == 1
    assert first_meta.filetype == EXPECTED_XLSX_FILETYPE
|
||||
|
||||
|
||||
def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
    """Check that ``partition`` handles an .xlsx workbook given as a binary file object.

    The example workbook is expected to yield two ``Table`` elements; the
    first one is compared against the expected text, HTML, page number and
    MIME type.
    """
    with open(filename, "rb") as workbook:
        sheets = partition(file=workbook)

    for sheet in sheets:
        assert isinstance(sheet, Table)
    assert len(sheets) == 2

    first_sheet = sheets[0]
    first_meta = first_sheet.metadata
    # Whitespace is normalised before comparing the flattened sheet text.
    assert clean_extra_whitespace(first_sheet.text) == EXPECTED_XLSX_TEXT
    assert first_meta.text_as_html == EXPECTED_XLSX_TABLE
    assert first_meta.page_number == 1
    assert first_meta.filetype == EXPECTED_XLSX_FILETYPE
|
||||
|
70
test_unstructured/partition/test_xlsx.py
Normal file
70
test_unstructured/partition/test_xlsx.py
Normal file
@ -0,0 +1,70 @@
|
||||
from unstructured.cleaners.core import clean_extra_whitespace
|
||||
from unstructured.documents.elements import Table
|
||||
from unstructured.partition.xlsx import partition_xlsx
|
||||
|
||||
EXPECTED_TABLE = """<table border="1" class="dataframe">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>Team</td>
|
||||
<td>Location</td>
|
||||
<td>Stanley Cups</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Blues</td>
|
||||
<td>STL</td>
|
||||
<td>1</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Flyers</td>
|
||||
<td>PHI</td>
|
||||
<td>2</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Maple Leafs</td>
|
||||
<td>TOR</td>
|
||||
<td>13</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>"""
|
||||
|
||||
|
||||
EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
|
||||
|
||||
EXPECTED_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
|
||||
|
||||
def test_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
    """Check ``partition_xlsx`` on a workbook given by path.

    Two ``Table`` elements are expected; the first is compared against the
    expected text, HTML, page number and MIME type.
    """
    sheets = partition_xlsx(filename=filename)

    for sheet in sheets:
        assert isinstance(sheet, Table)
    assert len(sheets) == 2

    first_sheet = sheets[0]
    first_meta = first_sheet.metadata
    # Whitespace is normalised before comparing the flattened sheet text.
    assert clean_extra_whitespace(first_sheet.text) == EXPECTED_TEXT
    assert first_meta.text_as_html == EXPECTED_TABLE
    assert first_meta.page_number == 1
    assert first_meta.filetype == EXPECTED_FILETYPE
|
||||
|
||||
|
||||
def test_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
    """Check ``partition_xlsx`` on a workbook given as a binary file object.

    Two ``Table`` elements are expected; the first is compared against the
    expected text, HTML, page number and MIME type.
    """
    with open(filename, "rb") as workbook:
        sheets = partition_xlsx(file=workbook)

    for sheet in sheets:
        assert isinstance(sheet, Table)
    assert len(sheets) == 2

    first_sheet = sheets[0]
    first_meta = first_sheet.metadata
    # Whitespace is normalised before comparing the flattened sheet text.
    assert clean_extra_whitespace(first_sheet.text) == EXPECTED_TEXT
    assert first_meta.text_as_html == EXPECTED_TABLE
    assert first_meta.page_number == 1
    assert first_meta.filetype == EXPECTED_FILETYPE
|
||||
|
||||
|
||||
def test_partition_xlsx_can_exclude_metadata(filename="example-docs/stanley-cups.xlsx"):
    """Check that ``include_metadata=False`` leaves element metadata unset.

    The text of the first ``Table`` must still match, while its HTML, page
    number and filetype metadata fields are all expected to be ``None``.
    """
    sheets = partition_xlsx(filename=filename, include_metadata=False)

    for sheet in sheets:
        assert isinstance(sheet, Table)
    assert len(sheets) == 2

    first_sheet = sheets[0]
    first_meta = first_sheet.metadata
    assert clean_extra_whitespace(first_sheet.text) == EXPECTED_TEXT
    assert first_meta.text_as_html is None
    assert first_meta.page_number is None
    assert first_meta.filetype is None
|
@ -77,7 +77,6 @@ EXPECTED_DOCX_FILES = [
|
||||
]
|
||||
|
||||
EXPECTED_XLSX_FILES = [
|
||||
"docProps/core.xml",
|
||||
"xl/workbook.xml",
|
||||
]
|
||||
|
||||
|
@ -26,6 +26,7 @@ from unstructured.partition.ppt import partition_ppt
|
||||
from unstructured.partition.pptx import partition_pptx
|
||||
from unstructured.partition.rtf import partition_rtf
|
||||
from unstructured.partition.text import partition_text
|
||||
from unstructured.partition.xlsx import partition_xlsx
|
||||
|
||||
|
||||
def partition(
|
||||
@ -183,6 +184,8 @@ def partition(
|
||||
)
|
||||
elif filetype == FileType.JSON:
|
||||
elements = partition_json(filename=filename, file=file)
|
||||
elif filetype == FileType.XLSX:
|
||||
elements = partition_xlsx(filename=filename, file=file)
|
||||
else:
|
||||
msg = "Invalid file" if not filename else f"Invalid file {filename}"
|
||||
raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")
|
||||
|
63
unstructured/partition/xlsx.py
Normal file
63
unstructured/partition/xlsx.py
Normal file
@ -0,0 +1,63 @@
|
||||
from tempfile import SpooledTemporaryFile
|
||||
from typing import IO, BinaryIO, List, Optional, Union, cast
|
||||
|
||||
import lxml.html
|
||||
import pandas as pd
|
||||
|
||||
from unstructured.documents.elements import Element, ElementMetadata, Table
|
||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||
from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
|
||||
|
||||
|
||||
@add_metadata_with_filetype(FileType.XLSX)
def partition_xlsx(
    filename: Optional[str] = None,
    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
    metadata_filename: Optional[str] = None,
    include_metadata: bool = True,
) -> List[Element]:
    """Partitions Microsoft Excel Documents in .xlsx format into its document elements.

    Every sheet of the workbook becomes one ``Table`` element. The element text
    is the text content of the sheet's HTML rendering; when metadata is
    included, that HTML is stored in ``metadata.text_as_html`` and the 1-based
    position of the sheet in the workbook is stored in ``metadata.page_number``.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    metadata_filename
        The filename to record in the element metadata. When provided it takes
        precedence over ``filename``, which lets a caller that partitions a
        converted or temporary copy report the original source filename.
    include_metadata
        Determines whether or not metadata is included in the output.

    Returns
    -------
    A list with one ``Table`` element per sheet, in workbook order.
    """
    # Exactly one of ``filename`` / ``file`` must be supplied
    # (presumably raises otherwise -- see ``exactly_one``).
    exactly_one(filename=filename, file=file)

    # ``sheet_name=None`` makes pandas load every sheet and return a mapping
    # of sheet name -> DataFrame.
    if filename:
        sheets = pd.read_excel(filename, sheet_name=None)
    else:
        f = spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file))
        sheets = pd.read_excel(f, sheet_name=None)

    # An explicitly supplied metadata filename must win over the path that was
    # actually read; otherwise the override is silently ignored whenever
    # ``filename`` is given.
    metadata_filename = metadata_filename or filename

    elements: List[Element] = []
    for page_number, sheet in enumerate(sheets.values(), start=1):
        # NOTE(review): ``read_excel`` is called with its default header
        # handling, so the first row of each sheet becomes the column labels,
        # which ``header=False`` then leaves out of the HTML -- confirm that
        # dropping that row is intended.
        html_text = sheet.to_html(index=False, header=False, na_rep="")
        text = lxml.html.document_fromstring(html_text).text_content()

        if include_metadata:
            metadata = ElementMetadata(
                text_as_html=html_text,
                page_number=page_number,
                filename=metadata_filename,
            )
        else:
            metadata = ElementMetadata()

        elements.append(Table(text=text, metadata=metadata))

    return elements
|
Loading…
x
Reference in New Issue
Block a user