feat: add partition_csv function (#619)

* add csv into filetype detection

* first pass on csv

* add tests for csv

* add csv to auto

* version bump

* update readme and docs

* fix doc strings
This commit is contained in:
Matt Robinson 2023-05-19 15:57:42 -04:00 committed by GitHub
parent 046af734d7
commit 21c821d651
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 176 additions and 6 deletions

View File

@ -1,3 +1,13 @@
## 0.6.8
### Enhancements
### Features
* Add `partition_csv` for CSV files.
### Fixes
## 0.6.7 ## 0.6.7
### Enhancements ### Enhancements

View File

@ -184,7 +184,7 @@ You can run this [Colab notebook](https://colab.research.google.com/drive/1U8VCj
The following examples show how to get started with the `unstructured` library. The following examples show how to get started with the `unstructured` library.
You can parse **TXT**, **HTML**, **XML**, **PDF**, **EML**, **MSG**, **RTF**, **EPUB**, **DOC**, **DOCX**, You can parse **TXT**, **HTML**, **XML**, **PDF**, **EML**, **MSG**, **RTF**, **EPUB**, **DOC**, **DOCX**,
**XLSX**, **ODT**, **PPT**, **PPTX**, **JPG**, **XLSX**, **CSV**, **ODT**, **PPT**, **PPTX**, **JPG**,
and **PNG** documents with one line of code! and **PNG** documents with one line of code!
<br></br> <br></br>
See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description

View File

@ -83,7 +83,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
file type and route it to the appropriate partitioning brick. All partitioning bricks file type and route it to the appropriate partitioning brick. All partitioning bricks
called within ``partition`` are called using the default kwargs. Use the document-type called within ``partition`` are called using the default kwargs. Use the document-type
specific bricks if you need to apply non-default settings. specific bricks if you need to apply non-default settings.
``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.xml``, ``.pdf``, ``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.csv``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.xml``, ``.pdf``,
``.png``, ``.jpg``, and ``.txt`` files. ``.png``, ``.jpg``, and ``.txt`` files.
If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``, If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
``.png``, and ``.jpg``. ``.png``, and ``.jpg``.
@ -269,6 +269,23 @@ Examples:
print(elements[0].metadata.text_as_html) print(elements[0].metadata.text_as_html)
``partition_csv``
------------------
The ``partition_csv`` function pre-processes CSV files. The output is a single
``Table`` element. The ``text_as_html`` attribute in the element metadata will
contain an HTML representation of the table.
Examples:
.. code:: python
from unstructured.partition.csv import partition_csv
elements = partition_csv(filename="example-docs/stanley-cups.csv")
print(elements[0].metadata.text_as_html)
``partition_odt`` ``partition_odt``
------------------ ------------------

View File

@ -0,0 +1,5 @@
Stanley Cups,,
Team,Location,Stanley Cups
Blues,STL,1
Flyers,PHI,2
Maple Leafs,TOR,13
1 Stanley Cups
2 Team Location Stanley Cups
3 Blues STL 1
4 Flyers PHI 2
5 Maple Leafs TOR 13

View File

@ -36,6 +36,7 @@ XLSX_MIME_TYPES = [
("example-10k.html", FileType.HTML), ("example-10k.html", FileType.HTML),
("fake-html.html", FileType.HTML), ("fake-html.html", FileType.HTML),
("stanley-cups.xlsx", FileType.XLSX), ("stanley-cups.xlsx", FileType.XLSX),
("stanley-cups.csv", FileType.CSV),
("fake-power-point.pptx", FileType.PPTX), ("fake-power-point.pptx", FileType.PPTX),
("winter-sports.epub", FileType.EPUB), ("winter-sports.epub", FileType.EPUB),
("spring-weather.html.json", FileType.JSON), ("spring-weather.html.json", FileType.JSON),
@ -59,6 +60,7 @@ def test_detect_filetype_from_filename(file, expected):
("example-10k.html", FileType.HTML), ("example-10k.html", FileType.HTML),
("fake-html.html", FileType.HTML), ("fake-html.html", FileType.HTML),
("stanley-cups.xlsx", FileType.XLSX), ("stanley-cups.xlsx", FileType.XLSX),
("stanley-cups.csv", FileType.CSV),
("fake-power-point.pptx", FileType.PPTX), ("fake-power-point.pptx", FileType.PPTX),
("winter-sports.epub", FileType.EPUB), ("winter-sports.epub", FileType.EPUB),
("fake-doc.rtf", FileType.RTF), ("fake-doc.rtf", FileType.RTF),
@ -94,6 +96,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte
("example-10k.html", [FileType.HTML, FileType.XML]), ("example-10k.html", [FileType.HTML, FileType.XML]),
("fake-html.html", FileType.HTML), ("fake-html.html", FileType.HTML),
("stanley-cups.xlsx", FileType.XLSX), ("stanley-cups.xlsx", FileType.XLSX),
("stanley-cups.csv", FileType.CSV),
("fake-power-point.pptx", FileType.PPTX), ("fake-power-point.pptx", FileType.PPTX),
("winter-sports.epub", FileType.EPUB), ("winter-sports.epub", FileType.EPUB),
], ],

View File

@ -693,3 +693,21 @@ def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"
assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
assert elements[0].metadata.page_number == 1 assert elements[0].metadata.page_number == 1
assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE
def test_auto_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"):
    """Auto-partitioning a CSV by path should yield a ``Table`` tagged as ``text/csv``.

    The XLSX expectations are reused on purpose: this test asserts that the CSV
    fixture produces the same text and HTML as the XLSX fixture.
    """
    elements = partition(filename=filename)

    assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT
    # Mirror the file-object variant of this test, which also checks the element type.
    assert isinstance(elements[0], Table)
    assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
    assert elements[0].metadata.filetype == "text/csv"
def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
    """Auto-partitioning a CSV from a binary file handle should yield a ``Table``.

    The XLSX expectations are reused on purpose: this test asserts that the CSV
    fixture produces the same text and HTML as the XLSX fixture.
    """
    with open(filename, "rb") as csv_file:
        partitioned = partition(file=csv_file)

    first = partitioned[0]
    assert clean_extra_whitespace(first.text) == EXPECTED_XLSX_TEXT
    assert isinstance(first, Table)

    first_metadata = first.metadata
    assert first_metadata.text_as_html == EXPECTED_XLSX_TABLE
    assert first_metadata.filetype == "text/csv"

View File

@ -0,0 +1,60 @@
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import Table
from unstructured.partition.csv import partition_csv
# HTML the tests below expect in ``metadata.text_as_html`` after partitioning
# example-docs/stanley-cups.csv. Note that the fixture's first line
# (``Stanley Cups,,``) does not appear in the expected table.
EXPECTED_TABLE = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Team</td>
<td>Location</td>
<td>Stanley Cups</td>
</tr>
<tr>
<td>Blues</td>
<td>STL</td>
<td>1</td>
</tr>
<tr>
<td>Flyers</td>
<td>PHI</td>
<td>2</td>
</tr>
<tr>
<td>Maple Leafs</td>
<td>TOR</td>
<td>13</td>
</tr>
</tbody>
</table>"""
# Element text the tests expect once it has been passed through ``clean_extra_whitespace``.
EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
# Value the tests expect in ``metadata.filetype`` for CSV input.
EXPECTED_FILETYPE = "text/csv"
def test_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"):
    """``partition_csv`` given a path should return a ``Table`` with CSV metadata."""
    elements = partition_csv(filename=filename)

    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
    # Mirror the file-object variant of this test, which also checks the element type.
    assert isinstance(elements[0], Table)
    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
    assert elements[0].metadata.filetype == EXPECTED_FILETYPE
def test_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
    """``partition_csv`` given a binary file handle should return a ``Table`` with CSV metadata."""
    with open(filename, "rb") as csv_file:
        partitioned = partition_csv(file=csv_file)

    first = partitioned[0]
    assert clean_extra_whitespace(first.text) == EXPECTED_TEXT
    assert isinstance(first, Table)

    first_metadata = first.metadata
    assert first_metadata.text_as_html == EXPECTED_TABLE
    assert first_metadata.filetype == EXPECTED_FILETYPE
def test_partition_csv_can_exclude_metadata(filename="example-docs/stanley-cups.csv"):
    """With ``include_metadata=False`` the table text is kept but metadata fields stay unset."""
    partitioned = partition_csv(filename=filename, include_metadata=False)

    first = partitioned[0]
    assert clean_extra_whitespace(first.text) == EXPECTED_TEXT
    assert isinstance(first, Table)

    # Neither the HTML rendering nor the detected file type should be populated.
    first_metadata = first.metadata
    assert first_metadata.text_as_html is None
    assert first_metadata.filetype is None

View File

@ -1 +1 @@
__version__ = "0.6.7" # pragma: no cover __version__ = "0.6.8" # pragma: no cover

View File

@ -67,6 +67,7 @@ class FileType(Enum):
RTF = 41 RTF = 41
TXT = 42 TXT = 42
JSON = 43 JSON = 43
CSV = 44
# Markup Types # Markup Types
HTML = 50 HTML = 50
@ -92,6 +93,7 @@ STR_TO_FILETYPE = {
"image/jpeg": FileType.JPG, "image/jpeg": FileType.JPG,
"image/png": FileType.PNG, "image/png": FileType.PNG,
"text/plain": FileType.TXT, "text/plain": FileType.TXT,
"text/csv": FileType.CSV,
"text/markdown": FileType.MD, "text/markdown": FileType.MD,
"text/x-markdown": FileType.MD, "text/x-markdown": FileType.MD,
"application/epub": FileType.EPUB, "application/epub": FileType.EPUB,
@ -139,6 +141,7 @@ EXT_TO_FILETYPE = {
".epub": FileType.EPUB, ".epub": FileType.EPUB,
".msg": FileType.MSG, ".msg": FileType.MSG,
".odt": FileType.ODT, ".odt": FileType.ODT,
".csv": FileType.CSV,
None: FileType.UNK, None: FileType.UNK,
} }

View File

@ -11,6 +11,7 @@ from unstructured.file_utils.filetype import (
) )
from unstructured.logger import logger from unstructured.logger import logger
from unstructured.partition.common import exactly_one from unstructured.partition.common import exactly_one
from unstructured.partition.csv import partition_csv
from unstructured.partition.doc import partition_doc from unstructured.partition.doc import partition_doc
from unstructured.partition.docx import partition_docx from unstructured.partition.docx import partition_docx
from unstructured.partition.email import partition_email from unstructured.partition.email import partition_email
@ -198,6 +199,8 @@ def partition(
elements = partition_json(filename=filename, file=file) elements = partition_json(filename=filename, file=file)
elif filetype == FileType.XLSX: elif filetype == FileType.XLSX:
elements = partition_xlsx(filename=filename, file=file) elements = partition_xlsx(filename=filename, file=file)
elif filetype == FileType.CSV:
elements = partition_csv(filename=filename, file=file)
else: else:
msg = "Invalid file" if not filename else f"Invalid file {filename}" msg = "Invalid file" if not filename else f"Invalid file {filename}"
raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.") raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")

View File

@ -0,0 +1,53 @@
from tempfile import SpooledTemporaryFile
from typing import IO, BinaryIO, List, Optional, Union, cast
import lxml.html
import pandas as pd
from unstructured.documents.elements import Element, ElementMetadata, Table
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
@add_metadata_with_filetype(FileType.CSV)
def partition_csv(
    filename: Optional[str] = None,
    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
    metadata_filename: Optional[str] = None,
    include_metadata: bool = True,
) -> List[Element]:
    """Partitions a CSV (comma-separated values) file into its document elements.

    The whole file is returned as a single ``Table`` element. The element text is the
    text content of the table's HTML rendering, and, when metadata is included, that
    HTML is stored in ``metadata.text_as_html``.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    metadata_filename
        The filename to use for the metadata. When provided, it takes precedence
        over ``filename``.
    include_metadata
        Determines whether or not metadata is included in the output.

    Returns
    -------
    A one-item list containing the ``Table`` element for the file.
    """
    # Exactly one of ``filename`` / ``file`` is expected; validation is delegated to
    # the shared helper.
    exactly_one(filename=filename, file=file)

    if filename:
        table = pd.read_csv(filename)
    else:
        f = spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file))
        table = pd.read_csv(f)

    # NOTE: ``read_csv`` consumes the first row as column headers and ``header=False``
    # omits those headers from the HTML, so the first CSV row does not appear in the
    # output. ``na_rep=""`` renders missing cells as empty instead of "NaN".
    html_text = table.to_html(index=False, header=False, na_rep="")
    text = lxml.html.document_fromstring(html_text).text_content()

    if include_metadata:
        metadata = ElementMetadata(
            text_as_html=html_text,
            # An explicit metadata_filename must win over the path that was read;
            # otherwise the parameter is silently ignored whenever filename is given.
            filename=metadata_filename or filename,
        )
    else:
        metadata = ElementMetadata()

    return [Table(text=text, metadata=metadata)]

View File

@ -25,9 +25,7 @@ def partition_xlsx(
file file
A file-like object using "rb" mode --> open(filename, "rb"). A file-like object using "rb" mode --> open(filename, "rb").
metadata_filename metadata_filename
The filename to use for the metadata. Relevant because partition_doc converts the The filename to use for the metadata.
document to .xlsx before partition. We want the original source filename in the
metadata.
include_metadata include_metadata
Determines whether or not metadata is included in the output. Determines whether or not metadata is included in the output.
""" """