mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-15 04:08:49 +00:00
feat: add partition_csv
function (#619)
* add csv into filetype detection * first pass on csv * add tests for csv * add csv to auto * version bump * update readme and docs * fix doc strings
This commit is contained in:
parent
046af734d7
commit
21c821d651
10
CHANGELOG.md
10
CHANGELOG.md
@ -1,3 +1,13 @@
|
|||||||
|
## 0.6.8
|
||||||
|
|
||||||
|
### Enhancements
|
||||||
|
|
||||||
|
### Features
|
||||||
|
|
||||||
|
* Add `partition_csv` for CSV files.
|
||||||
|
|
||||||
|
### Fixes
|
||||||
|
|
||||||
## 0.6.7
|
## 0.6.7
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
@ -184,7 +184,7 @@ You can run this [Colab notebook](https://colab.research.google.com/drive/1U8VCj
|
|||||||
The following examples show how to get started with the `unstructured` library.
|
The following examples show how to get started with the `unstructured` library.
|
||||||
|
|
||||||
You can parse **TXT**, **HTML**, **XML**, **PDF**, **EML**, **MSG**, **RTF**, **EPUB**, **DOC**, **DOCX**,
|
You can parse **TXT**, **HTML**, **XML**, **PDF**, **EML**, **MSG**, **RTF**, **EPUB**, **DOC**, **DOCX**,
|
||||||
**XLSX**, **ODT**, **PPT**, **PPTX**, **JPG**,
|
**XLSX**, **CSV**, **ODT**, **PPT**, **PPTX**, **JPG**,
|
||||||
and **PNG** documents with one line of code!
|
and **PNG** documents with one line of code!
|
||||||
<br></br>
|
<br></br>
|
||||||
See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description
|
See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description
|
||||||
|
@ -83,7 +83,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
|
|||||||
file type and route it to the appropriate partitioning brick. All partitioning bricks
|
file type and route it to the appropriate partitioning brick. All partitioning bricks
|
||||||
called within ``partition`` are called using the default kwargs. Use the document-type
|
called within ``partition`` are called using the default kwargs. Use the document-type
|
||||||
specific bricks if you need to apply non-default settings.
|
specific bricks if you need to apply non-default settings.
|
||||||
``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.xml``, ``.pdf``,
|
``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.csv``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.xml``, ``.pdf``,
|
||||||
``.png``, ``.jpg``, and ``.txt`` files.
|
``.png``, ``.jpg``, and ``.txt`` files.
|
||||||
If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
|
If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
|
||||||
``.png``, and ``.jpg``.
|
``.png``, and ``.jpg``.
|
||||||
@ -269,6 +269,23 @@ Examples:
|
|||||||
print(elements[0].metadata.text_as_html)
|
print(elements[0].metadata.text_as_html)
|
||||||
|
|
||||||
|
|
||||||
|
``partition_csv``
|
||||||
|
------------------
|
||||||
|
|
||||||
|
The ``partition_csv`` function pre-processes CSV files. The output is a single
|
||||||
|
``Table`` element. The ``text_as_html`` attribute in the element metadata will
|
||||||
|
contain an HTML representation of the table.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
from unstructured.partition.csv import partition_csv
|
||||||
|
|
||||||
|
elements = partition_csv(filename="example-docs/stanley-cups.csv")
|
||||||
|
print(elements[0].metadata.text_as_html)
|
||||||
|
|
||||||
|
|
||||||
``partition_odt``
|
``partition_odt``
|
||||||
------------------
|
------------------
|
||||||
|
|
||||||
|
5
example-docs/stanley-cups.csv
Normal file
5
example-docs/stanley-cups.csv
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
Stanley Cups,,
|
||||||
|
Team,Location,Stanley Cups
|
||||||
|
Blues,STL,1
|
||||||
|
Flyers,PHI,2
|
||||||
|
Maple Leafs,TOR,13
|
|
@ -36,6 +36,7 @@ XLSX_MIME_TYPES = [
|
|||||||
("example-10k.html", FileType.HTML),
|
("example-10k.html", FileType.HTML),
|
||||||
("fake-html.html", FileType.HTML),
|
("fake-html.html", FileType.HTML),
|
||||||
("stanley-cups.xlsx", FileType.XLSX),
|
("stanley-cups.xlsx", FileType.XLSX),
|
||||||
|
("stanley-cups.csv", FileType.CSV),
|
||||||
("fake-power-point.pptx", FileType.PPTX),
|
("fake-power-point.pptx", FileType.PPTX),
|
||||||
("winter-sports.epub", FileType.EPUB),
|
("winter-sports.epub", FileType.EPUB),
|
||||||
("spring-weather.html.json", FileType.JSON),
|
("spring-weather.html.json", FileType.JSON),
|
||||||
@ -59,6 +60,7 @@ def test_detect_filetype_from_filename(file, expected):
|
|||||||
("example-10k.html", FileType.HTML),
|
("example-10k.html", FileType.HTML),
|
||||||
("fake-html.html", FileType.HTML),
|
("fake-html.html", FileType.HTML),
|
||||||
("stanley-cups.xlsx", FileType.XLSX),
|
("stanley-cups.xlsx", FileType.XLSX),
|
||||||
|
("stanley-cups.csv", FileType.CSV),
|
||||||
("fake-power-point.pptx", FileType.PPTX),
|
("fake-power-point.pptx", FileType.PPTX),
|
||||||
("winter-sports.epub", FileType.EPUB),
|
("winter-sports.epub", FileType.EPUB),
|
||||||
("fake-doc.rtf", FileType.RTF),
|
("fake-doc.rtf", FileType.RTF),
|
||||||
@ -94,6 +96,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte
|
|||||||
("example-10k.html", [FileType.HTML, FileType.XML]),
|
("example-10k.html", [FileType.HTML, FileType.XML]),
|
||||||
("fake-html.html", FileType.HTML),
|
("fake-html.html", FileType.HTML),
|
||||||
("stanley-cups.xlsx", FileType.XLSX),
|
("stanley-cups.xlsx", FileType.XLSX),
|
||||||
|
("stanley-cups.csv", FileType.CSV),
|
||||||
("fake-power-point.pptx", FileType.PPTX),
|
("fake-power-point.pptx", FileType.PPTX),
|
||||||
("winter-sports.epub", FileType.EPUB),
|
("winter-sports.epub", FileType.EPUB),
|
||||||
],
|
],
|
||||||
|
@ -693,3 +693,21 @@ def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"
|
|||||||
assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
|
assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
|
||||||
assert elements[0].metadata.page_number == 1
|
assert elements[0].metadata.page_number == 1
|
||||||
assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE
|
assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE
|
||||||
|
|
||||||
|
|
||||||
|
def test_auto_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"):
    """Check that ``partition`` handles a CSV given by path and returns a ``Table`` first."""
    elements = partition(filename=filename)

    # The XLSX expectations are reused here; presumably both fixtures hold the
    # same table -- confirm against the XLSX fixture if either one changes.
    assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT
    # Same element-type check the from-file variant of this test performs.
    assert isinstance(elements[0], Table)
    assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
    assert elements[0].metadata.filetype == "text/csv"
|
||||||
|
|
||||||
|
|
||||||
|
def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
    """Check that ``partition`` handles a CSV passed as an open binary file object."""
    with open(filename, "rb") as csv_file:
        elements = partition(file=csv_file)

    first = elements[0]
    # The XLSX expectations are reused for the CSV fixture.
    assert clean_extra_whitespace(first.text) == EXPECTED_XLSX_TEXT
    assert isinstance(first, Table)

    first_metadata = first.metadata
    assert first_metadata.text_as_html == EXPECTED_XLSX_TABLE
    assert first_metadata.filetype == "text/csv"
|
||||||
|
60
test_unstructured/partition/test_csv.py
Normal file
60
test_unstructured/partition/test_csv.py
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
from unstructured.cleaners.core import clean_extra_whitespace
|
||||||
|
from unstructured.documents.elements import Table
|
||||||
|
from unstructured.partition.csv import partition_csv
|
||||||
|
|
||||||
|
EXPECTED_TABLE = """<table border="1" class="dataframe">
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td>Team</td>
|
||||||
|
<td>Location</td>
|
||||||
|
<td>Stanley Cups</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>Blues</td>
|
||||||
|
<td>STL</td>
|
||||||
|
<td>1</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>Flyers</td>
|
||||||
|
<td>PHI</td>
|
||||||
|
<td>2</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>Maple Leafs</td>
|
||||||
|
<td>TOR</td>
|
||||||
|
<td>13</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>"""
|
||||||
|
|
||||||
|
|
||||||
|
EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
|
||||||
|
|
||||||
|
EXPECTED_FILETYPE = "text/csv"
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"):
    """Check ``partition_csv`` on a file path: text, element type, HTML and filetype."""
    elements = partition_csv(filename=filename)

    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
    # Same element-type check the sibling tests in this module perform.
    assert isinstance(elements[0], Table)
    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
    assert elements[0].metadata.filetype == EXPECTED_FILETYPE
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
    """Check ``partition_csv`` on an open binary file object instead of a path."""
    with open(filename, "rb") as csv_file:
        elements = partition_csv(file=csv_file)

    first = elements[0]
    assert clean_extra_whitespace(first.text) == EXPECTED_TEXT
    assert isinstance(first, Table)

    first_metadata = first.metadata
    assert first_metadata.text_as_html == EXPECTED_TABLE
    assert first_metadata.filetype == EXPECTED_FILETYPE
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_csv_can_exclude_metadata(filename="example-docs/stanley-cups.csv"):
    """Check that ``include_metadata=False`` leaves the metadata fields unset."""
    elements = partition_csv(filename=filename, include_metadata=False)

    first = elements[0]
    # The element text and type are unaffected by the flag ...
    assert clean_extra_whitespace(first.text) == EXPECTED_TEXT
    assert isinstance(first, Table)

    # ... but neither the HTML rendering nor the filetype is recorded.
    first_metadata = first.metadata
    assert first_metadata.text_as_html is None
    assert first_metadata.filetype is None
|
@ -1 +1 @@
|
|||||||
__version__ = "0.6.7" # pragma: no cover
|
__version__ = "0.6.8" # pragma: no cover
|
||||||
|
@ -67,6 +67,7 @@ class FileType(Enum):
|
|||||||
RTF = 41
|
RTF = 41
|
||||||
TXT = 42
|
TXT = 42
|
||||||
JSON = 43
|
JSON = 43
|
||||||
|
CSV = 44
|
||||||
|
|
||||||
# Markup Types
|
# Markup Types
|
||||||
HTML = 50
|
HTML = 50
|
||||||
@ -92,6 +93,7 @@ STR_TO_FILETYPE = {
|
|||||||
"image/jpeg": FileType.JPG,
|
"image/jpeg": FileType.JPG,
|
||||||
"image/png": FileType.PNG,
|
"image/png": FileType.PNG,
|
||||||
"text/plain": FileType.TXT,
|
"text/plain": FileType.TXT,
|
||||||
|
"text/csv": FileType.CSV,
|
||||||
"text/markdown": FileType.MD,
|
"text/markdown": FileType.MD,
|
||||||
"text/x-markdown": FileType.MD,
|
"text/x-markdown": FileType.MD,
|
||||||
"application/epub": FileType.EPUB,
|
"application/epub": FileType.EPUB,
|
||||||
@ -139,6 +141,7 @@ EXT_TO_FILETYPE = {
|
|||||||
".epub": FileType.EPUB,
|
".epub": FileType.EPUB,
|
||||||
".msg": FileType.MSG,
|
".msg": FileType.MSG,
|
||||||
".odt": FileType.ODT,
|
".odt": FileType.ODT,
|
||||||
|
".csv": FileType.CSV,
|
||||||
None: FileType.UNK,
|
None: FileType.UNK,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -11,6 +11,7 @@ from unstructured.file_utils.filetype import (
|
|||||||
)
|
)
|
||||||
from unstructured.logger import logger
|
from unstructured.logger import logger
|
||||||
from unstructured.partition.common import exactly_one
|
from unstructured.partition.common import exactly_one
|
||||||
|
from unstructured.partition.csv import partition_csv
|
||||||
from unstructured.partition.doc import partition_doc
|
from unstructured.partition.doc import partition_doc
|
||||||
from unstructured.partition.docx import partition_docx
|
from unstructured.partition.docx import partition_docx
|
||||||
from unstructured.partition.email import partition_email
|
from unstructured.partition.email import partition_email
|
||||||
@ -198,6 +199,8 @@ def partition(
|
|||||||
elements = partition_json(filename=filename, file=file)
|
elements = partition_json(filename=filename, file=file)
|
||||||
elif filetype == FileType.XLSX:
|
elif filetype == FileType.XLSX:
|
||||||
elements = partition_xlsx(filename=filename, file=file)
|
elements = partition_xlsx(filename=filename, file=file)
|
||||||
|
elif filetype == FileType.CSV:
|
||||||
|
elements = partition_csv(filename=filename, file=file)
|
||||||
else:
|
else:
|
||||||
msg = "Invalid file" if not filename else f"Invalid file {filename}"
|
msg = "Invalid file" if not filename else f"Invalid file {filename}"
|
||||||
raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")
|
raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")
|
||||||
|
53
unstructured/partition/csv.py
Normal file
53
unstructured/partition/csv.py
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
from tempfile import SpooledTemporaryFile
|
||||||
|
from typing import IO, BinaryIO, List, Optional, Union, cast
|
||||||
|
|
||||||
|
import lxml.html
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from unstructured.documents.elements import Element, ElementMetadata, Table
|
||||||
|
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||||
|
from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
|
||||||
|
|
||||||
|
|
||||||
|
@add_metadata_with_filetype(FileType.CSV)
def partition_csv(
    filename: Optional[str] = None,
    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
    metadata_filename: Optional[str] = None,
    include_metadata: bool = True,
) -> List[Element]:
    """Partitions a CSV file into a single ``Table`` element.

    The file is loaded with ``pandas.read_csv`` using its default settings, so the
    first row is consumed as the column header. The table is then rendered with
    ``header=False``, which means that first row appears in neither the element
    text nor the HTML representation.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    metadata_filename
        The filename to use for the metadata. Takes precedence over ``filename``
        when both are provided.
    include_metadata
        Determines whether or not metadata is included in the output.

    Returns
    -------
    A one-item list holding a ``Table`` whose text is the text content of the
    rendered table. When ``include_metadata`` is True, its metadata carries the
    HTML rendering in ``text_as_html`` and the filename.
    """
    exactly_one(filename=filename, file=file)

    if filename:
        table = pd.read_csv(filename)
    else:
        f = spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file))
        table = pd.read_csv(f)

    # Missing cells are rendered as empty strings rather than "NaN".
    html_text = table.to_html(index=False, header=False, na_rep="")
    # Strip the markup to obtain the plain-text form of the table.
    text = lxml.html.document_fromstring(html_text).text_content()

    if include_metadata:
        metadata = ElementMetadata(
            text_as_html=html_text,
            # An explicit metadata_filename must win; previously it was silently
            # discarded whenever ``filename`` was also supplied.
            filename=metadata_filename or filename,
        )
    else:
        metadata = ElementMetadata()

    return [Table(text=text, metadata=metadata)]
|
@ -25,9 +25,7 @@ def partition_xlsx(
|
|||||||
file
|
file
|
||||||
A file-like object using "rb" mode --> open(filename, "rb").
|
A file-like object using "rb" mode --> open(filename, "rb").
|
||||||
metadata_filename
|
metadata_filename
|
||||||
The filename to use for the metadata. Relevant because partition_doc converts the
|
The filename to use for the metadata.
|
||||||
document to .xlsx before partition. We want the original source filename in the
|
|
||||||
metadata.
|
|
||||||
include_metadata
|
include_metadata
|
||||||
Determines whether or not metadata is included in the output.
|
Determines whether or not metadata is included in the output.
|
||||||
"""
|
"""
|
||||||
|
Loading…
x
Reference in New Issue
Block a user