diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d9d38904..9386d9711 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## 0.6.8 + +### Enhancements + +### Features + +* Add `partition_csv` for CSV files. + +### Fixes + ## 0.6.7 ### Enhancements diff --git a/README.md b/README.md index 8fa558c94..756d80c79 100644 --- a/README.md +++ b/README.md @@ -184,7 +184,7 @@ You can run this [Colab notebook](https://colab.research.google.com/drive/1U8VCj The following examples show how to get started with the `unstructured` library. You can parse **TXT**, **HTML**, **XML**, **PDF**, **EML**, **MSG**, **RTF**, **EPUB**, **DOC**, **DOCX**, -**XLSX**, **ODT**, **PPT**, **PPTX**, **JPG**, +**XLSX**, **CSV**, **ODT**, **PPT**, **PPTX**, **JPG**, and **PNG** documents with one line of code!

See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst index 496c66020..56cd6bd7e 100644 --- a/docs/source/bricks.rst +++ b/docs/source/bricks.rst @@ -83,7 +83,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect file type and route it to the appropriate partitioning brick. All partitioning bricks called within ``partition`` are called using the default kwargs. Use the document-type specific bricks if you need to apply non-default settings. -``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.xml``, ``.pdf``, +``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.csv``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.xml``, ``.pdf``, ``.png``, ``.jpg``, and ``.txt`` files. If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``, ``.png``, and ``.jpg``. @@ -269,6 +269,23 @@ Examples: print(elements[0].metadata.text_as_html) +``partition_csv`` +------------------ + +The ``partition_csv`` function pre-processes CSV files. The output is a single +``Table`` element. The ``text_as_html`` attribute in the element metadata will +contain an HTML representation of the table. + +Examples: + +.. 
code:: python + + from unstructured.partition.csv import partition_csv + + elements = partition_csv(filename="example-docs/stanley-cups.csv") + print(elements[0].metadata.text_as_html) + + ``partition_odt`` ------------------ diff --git a/example-docs/stanley-cups.csv b/example-docs/stanley-cups.csv new file mode 100644 index 000000000..4414023f0 --- /dev/null +++ b/example-docs/stanley-cups.csv @@ -0,0 +1,5 @@ +Stanley Cups,, +Team,Location,Stanley Cups +Blues,STL,1 +Flyers,PHI,2 +Maple Leafs,TOR,13 \ No newline at end of file diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index 76c6ced3d..16169abfa 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -36,6 +36,7 @@ XLSX_MIME_TYPES = [ ("example-10k.html", FileType.HTML), ("fake-html.html", FileType.HTML), ("stanley-cups.xlsx", FileType.XLSX), + ("stanley-cups.csv", FileType.CSV), ("fake-power-point.pptx", FileType.PPTX), ("winter-sports.epub", FileType.EPUB), ("spring-weather.html.json", FileType.JSON), @@ -59,6 +60,7 @@ def test_detect_filetype_from_filename(file, expected): ("example-10k.html", FileType.HTML), ("fake-html.html", FileType.HTML), ("stanley-cups.xlsx", FileType.XLSX), + ("stanley-cups.csv", FileType.CSV), ("fake-power-point.pptx", FileType.PPTX), ("winter-sports.epub", FileType.EPUB), ("fake-doc.rtf", FileType.RTF), @@ -94,6 +96,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte ("example-10k.html", [FileType.HTML, FileType.XML]), ("fake-html.html", FileType.HTML), ("stanley-cups.xlsx", FileType.XLSX), + ("stanley-cups.csv", FileType.CSV), ("fake-power-point.pptx", FileType.PPTX), ("winter-sports.epub", FileType.EPUB), ], diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index 7174f42a9..398351911 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py 
@@ -693,3 +693,21 @@ def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx" assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE assert elements[0].metadata.page_number == 1 assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE + + +def test_auto_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"): + elements = partition(filename=filename) + + assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT + assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE + assert elements[0].metadata.filetype == "text/csv" + + +def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"): + with open(filename, "rb") as f: + elements = partition(file=f) + + assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT + assert isinstance(elements[0], Table) + assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE + assert elements[0].metadata.filetype == "text/csv" diff --git a/test_unstructured/partition/test_csv.py b/test_unstructured/partition/test_csv.py new file mode 100644 index 000000000..db5e1a418 --- /dev/null +++ b/test_unstructured/partition/test_csv.py @@ -0,0 +1,60 @@ +from unstructured.cleaners.core import clean_extra_whitespace +from unstructured.documents.elements import Table +from unstructured.partition.csv import partition_csv +
+EXPECTED_TABLE = """<table border="1" class="dataframe">
+  <tbody>
+    <tr>
+      <td>Team</td>
+      <td>Location</td>
+      <td>Stanley Cups</td>
+    </tr>
+    <tr>
+      <td>Blues</td>
+      <td>STL</td>
+      <td>1</td>
+    </tr>
+    <tr>
+      <td>Flyers</td>
+      <td>PHI</td>
+      <td>2</td>
+    </tr>
+    <tr>
+      <td>Maple Leafs</td>
+      <td>TOR</td>
+      <td>13</td>
+    </tr>
+  </tbody>
+</table>""" + + +EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13" + +EXPECTED_FILETYPE = "text/csv" + + +def test_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"): + elements = partition_csv(filename=filename) + + assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT + assert elements[0].metadata.text_as_html == EXPECTED_TABLE + assert elements[0].metadata.filetype == EXPECTED_FILETYPE + + +def test_partition_csv_from_file(filename="example-docs/stanley-cups.csv"): + with open(filename, "rb") as f: + elements = partition_csv(file=f) + + assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT + assert isinstance(elements[0], Table) + assert elements[0].metadata.text_as_html == EXPECTED_TABLE + assert elements[0].metadata.filetype == EXPECTED_FILETYPE + + +def test_partition_csv_can_exclude_metadata(filename="example-docs/stanley-cups.csv"): + elements = partition_csv(filename=filename, include_metadata=False) + + assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT + assert isinstance(elements[0], Table) + assert elements[0].metadata.text_as_html is None + assert elements[0].metadata.filetype is None diff --git a/unstructured/__version__.py b/unstructured/__version__.py index ba99456df..de61145b6 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.6.7" # pragma: no cover +__version__ = "0.6.8" # pragma: no cover diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index 5e770c4bf..3e723c5d7 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -67,6 +67,7 @@ class FileType(Enum): RTF = 41 TXT = 42 JSON = 43 + CSV = 44 # Markup Types HTML = 50 @@ -92,6 +93,7 @@ STR_TO_FILETYPE = { "image/jpeg": FileType.JPG, "image/png": FileType.PNG, "text/plain": FileType.TXT, + "text/csv": FileType.CSV, "text/markdown": FileType.MD, "text/x-markdown": FileType.MD, 
"application/epub": FileType.EPUB, @@ -139,6 +141,7 @@ EXT_TO_FILETYPE = { ".epub": FileType.EPUB, ".msg": FileType.MSG, ".odt": FileType.ODT, + ".csv": FileType.CSV, None: FileType.UNK, } diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index 3b234182e..c4e8a7b6c 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -11,6 +11,7 @@ from unstructured.file_utils.filetype import ( ) from unstructured.logger import logger from unstructured.partition.common import exactly_one +from unstructured.partition.csv import partition_csv from unstructured.partition.doc import partition_doc from unstructured.partition.docx import partition_docx from unstructured.partition.email import partition_email @@ -198,6 +199,8 @@ def partition( elements = partition_json(filename=filename, file=file) elif filetype == FileType.XLSX: elements = partition_xlsx(filename=filename, file=file) + elif filetype == FileType.CSV: + elements = partition_csv(filename=filename, file=file) else: msg = "Invalid file" if not filename else f"Invalid file {filename}" raise ValueError(f"{msg}. 
The {filetype} file type is not supported in partition.") diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py new file mode 100644 index 000000000..17a08e8ce --- /dev/null +++ b/unstructured/partition/csv.py @@ -0,0 +1,53 @@ +from tempfile import SpooledTemporaryFile +from typing import IO, BinaryIO, List, Optional, Union, cast + +import lxml.html +import pandas as pd + +from unstructured.documents.elements import Element, ElementMetadata, Table +from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed + + +@add_metadata_with_filetype(FileType.CSV) +def partition_csv( + filename: Optional[str] = None, + file: Optional[Union[IO, SpooledTemporaryFile]] = None, + metadata_filename: Optional[str] = None, + include_metadata: bool = True, +) -> List[Element]: + """Partitions a CSV (comma-separated values) file into its document elements. + + Parameters + ---------- + filename + A string defining the target filename path. + file + A file-like object using "rb" mode --> open(filename, "rb"). + metadata_filename + The filename to use for the metadata. + include_metadata + Determines whether or not metadata is included in the output. 
+ """ + exactly_one(filename=filename, file=file) + + if filename: + table = pd.read_csv(filename) + else: + f = spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file)) + table = pd.read_csv(f) + + metadata_filename = filename or metadata_filename + + html_text = table.to_html(index=False, header=False, na_rep="") + text = lxml.html.document_fromstring(html_text).text_content() + + if include_metadata: + metadata = ElementMetadata( + text_as_html=html_text, + filename=metadata_filename, + ) + else: + metadata = ElementMetadata() + + return [Table(text=text, metadata=metadata)] diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py index 4038759ee..d4db0fa73 100644 --- a/unstructured/partition/xlsx.py +++ b/unstructured/partition/xlsx.py @@ -25,9 +25,7 @@ def partition_xlsx( file A file-like object using "rb" mode --> open(filename, "rb"). metadata_filename - The filename to use for the metadata. Relevant because partition_doc converts the - document to .xlsx before partition. We want the original source filename in the - metadata. + The filename to use for the metadata. include_metadata Determines whether or not metadata is included in the output. """