diff --git a/CHANGELOG.md b/CHANGELOG.md index 59ad13223..de8f16bd9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ ### Features * Updates `partition_docx` to include headers and footers in the output. +* Create `partition_tsv` and associated tests. Make additional changes to `detect_filetype`. ### Fixes diff --git a/README.md b/README.md index 34b2654bf..496c3df5d 100644 --- a/README.md +++ b/README.md @@ -102,6 +102,7 @@ about the library. | Power Points (`.pptx`) | `partition_pptx` | N/A | Yes | Include Page Breaks | | ReStructured Text (`.rst`) | `partition_rst` | N/A | Yes | Include Page Breaks | | Rich Text Files (`.rtf`) | `partition_rtf` | N/A | Yes | Include Page Breaks | +| TSV Files (`.tsv`) | `partition_tsv` | N/A | Yes | None | | Word Documents (`.doc`) | `partition_doc` | N/A | Yes | Include Page Breaks | | Word Documents (`.docx`) | `partition_docx` | N/A | Yes | Include Page Breaks | | XML Documents (`.xml`) | `partition_xml` | N/A | No | Encoding; XML Keep Tags | diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst index 45dce63ee..f5fe78875 100644 --- a/docs/source/bricks.rst +++ b/docs/source/bricks.rst @@ -82,7 +82,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect file type and route it to the appropriate partitioning brick. All partitioning bricks called within ``partition`` are called using the default kwargs. Use the document-type specific bricks if you need to apply non-default settings. 
-``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.csv``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.xml``, ``.pdf``, +``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.csv``, ``.tsv``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.xml``, ``.pdf``, ``.png``, ``.jpg``, and ``.txt`` files. If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``, ``.png``, and ``.jpg``. @@ -149,6 +149,23 @@ Examples: print(elements[0].metadata.text_as_html) +``partition_tsv`` +------------------ + +The ``partition_tsv`` function pre-processes TSV files. The output is a single +``Table`` element. The ``text_as_html`` attribute in the element metadata will +contain an HTML representation of the table. + +Examples: + +.. code:: python + + from unstructured.partition.tsv import partition_tsv + + elements = partition_tsv(filename="example-docs/stanley-cups.tsv") + print(elements[0].metadata.text_as_html) + + ``partition_doc`` ------------------ diff --git a/example-docs/stanley-cups.tsv b/example-docs/stanley-cups.tsv new file mode 100644 index 000000000..36402ac9b --- /dev/null +++ b/example-docs/stanley-cups.tsv @@ -0,0 +1,5 @@ +Stanley Cups +Team Location Stanley Cups +Blues STL 1 +Flyers PHI 2 +Maple Leafs TOR 13 diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index 06f1d5a1c..cd3c60b20 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -41,6 +41,7 @@ XLSX_MIME_TYPES = [ # NOTE(robinson) - currently failing in the docker tests because the detected # MIME type is text/csv # ("stanley-cups.csv", FileType.CSV), + ("stanley-cups.tsv", FileType.TSV), ("fake-power-point.pptx", FileType.PPTX), ("winter-sports.epub", FileType.EPUB), ("spring-weather.html.json", 
FileType.JSON), @@ -67,6 +68,7 @@ def test_detect_filetype_from_filename(file, expected): ("fake-html.html", FileType.HTML), ("stanley-cups.xlsx", FileType.XLSX), ("stanley-cups.csv", FileType.CSV), + ("stanley-cups.tsv", FileType.TSV), ("fake-power-point.pptx", FileType.PPTX), ("winter-sports.epub", FileType.EPUB), ("fake-doc.rtf", FileType.RTF), @@ -105,6 +107,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte # NOTE(robinson) - currently failing in the docker tests because the detected # MIME type is text/csv # ("stanley-cups.csv", FileType.CSV), + ("stanley-cups.tsv", FileType.TSV), ("fake-power-point.pptx", FileType.PPTX), ("winter-sports.epub", FileType.EPUB), ], diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index 4503d07fb..e04f9eb4c 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -8,6 +8,7 @@ from unittest.mock import patch import docx import pytest +from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT from unstructured.cleaners.core import clean_extra_whitespace from unstructured.documents.elements import ( Address, @@ -630,34 +631,6 @@ def test_auto_partition_xml_from_file_with_tags(filename="example-docs/factbook. assert elements[5].text == "United States" -EXPECTED_XLSX_TABLE = """ - - - - - - - - - - - - - - - - - - - - - - -
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
""" - - -EXPECTED_XLSX_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13" - EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" @@ -667,8 +640,8 @@ def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.x assert all(isinstance(element, Table) for element in elements) assert len(elements) == 2 - assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT - assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE + assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT + assert elements[0].metadata.text_as_html == EXPECTED_TABLE assert elements[0].metadata.page_number == 1 assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE @@ -680,8 +653,8 @@ def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx" assert all(isinstance(element, Table) for element in elements) assert len(elements) == 2 - assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT - assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE + assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT + assert elements[0].metadata.text_as_html == EXPECTED_TABLE assert elements[0].metadata.page_number == 1 assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE @@ -780,8 +753,8 @@ def test_auto_partition_xls_from_filename(filename="example-docs/tests-example.x def test_auto_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"): elements = partition(filename=filename) - assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT - assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE + assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT + assert elements[0].metadata.text_as_html == EXPECTED_TABLE assert elements[0].metadata.filetype == "text/csv" @@ -790,9 +763,9 @@ def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"): with open(filename, "rb") as 
f: elements = partition(file=f) - assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT + assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT assert isinstance(elements[0], Table) - assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE + assert elements[0].metadata.text_as_html == EXPECTED_TABLE assert elements[0].metadata.filetype == "text/csv" diff --git a/test_unstructured/partition/test_constants.py b/test_unstructured/partition/test_constants.py new file mode 100644 index 000000000..492559dde --- /dev/null +++ b/test_unstructured/partition/test_constants.py @@ -0,0 +1,27 @@ +EXPECTED_TABLE = """ + + + + + + + + + + + + + + + + + + + + + + +
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
""" + + +EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13" diff --git a/test_unstructured/partition/test_csv.py b/test_unstructured/partition/test_csv.py index db5e1a418..8fa0a0e86 100644 --- a/test_unstructured/partition/test_csv.py +++ b/test_unstructured/partition/test_csv.py @@ -1,35 +1,8 @@ +from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT from unstructured.cleaners.core import clean_extra_whitespace from unstructured.documents.elements import Table from unstructured.partition.csv import partition_csv -EXPECTED_TABLE = """ - - - - - - - - - - - - - - - - - - - - - - -
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
""" - - -EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13" - EXPECTED_FILETYPE = "text/csv" diff --git a/test_unstructured/partition/test_tsv.py b/test_unstructured/partition/test_tsv.py new file mode 100644 index 000000000..131bc00ac --- /dev/null +++ b/test_unstructured/partition/test_tsv.py @@ -0,0 +1,33 @@ +from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT +from unstructured.cleaners.core import clean_extra_whitespace +from unstructured.documents.elements import Table +from unstructured.partition.tsv import partition_tsv + +EXPECTED_FILETYPE = "text/tsv" + + +def test_partition_tsv_from_filename(filename="example-docs/stanley-cups.tsv"): + elements = partition_tsv(filename=filename) + + assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT + assert elements[0].metadata.text_as_html == EXPECTED_TABLE + assert elements[0].metadata.filetype == EXPECTED_FILETYPE + + +def test_partition_tsv_from_file(filename="example-docs/stanley-cups.tsv"): + with open(filename, "rb") as f: + elements = partition_tsv(file=f) + + assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT + assert isinstance(elements[0], Table) + assert elements[0].metadata.text_as_html == EXPECTED_TABLE + assert elements[0].metadata.filetype == EXPECTED_FILETYPE + + +def test_partition_tsv_can_exclude_metadata(filename="example-docs/stanley-cups.tsv"): + elements = partition_tsv(filename=filename, include_metadata=False) + + assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT + assert isinstance(elements[0], Table) + assert elements[0].metadata.text_as_html is None + assert elements[0].metadata.filetype is None diff --git a/test_unstructured/partition/test_xlsx.py b/test_unstructured/partition/test_xlsx.py index 63e477337..efc379053 100644 --- a/test_unstructured/partition/test_xlsx.py +++ b/test_unstructured/partition/test_xlsx.py @@ -1,35 +1,8 @@ +from test_unstructured.partition.test_constants 
import EXPECTED_TABLE, EXPECTED_TEXT from unstructured.cleaners.core import clean_extra_whitespace from unstructured.documents.elements import Table from unstructured.partition.xlsx import partition_xlsx -EXPECTED_TABLE = """ - - - - - - - - - - - - - - - - - - - - - - -
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
""" - - -EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13" - EXPECTED_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" EXCEPTED_PAGE_NAME = "Stanley Cups" diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index d3d16dae8..1dc2f3e3f 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -74,6 +74,7 @@ class FileType(Enum): TXT = 42 JSON = 43 CSV = 44 + TSV = 45 # Markup Types HTML = 50 @@ -106,6 +107,7 @@ STR_TO_FILETYPE = { "text/comma-separated-values": FileType.CSV, "text/x-comma-separated-values": FileType.CSV, "text/csv": FileType.CSV, + "text/tsv": FileType.TSV, "text/markdown": FileType.MD, "text/x-markdown": FileType.MD, "text/x-rst": FileType.RST, @@ -166,6 +168,7 @@ EXT_TO_FILETYPE = { ".msg": FileType.MSG, ".odt": FileType.ODT, ".csv": FileType.CSV, + ".tsv": FileType.TSV, # NOTE(robinson) - for now we are treating code files as plain text ".js": FileType.TXT, ".py": FileType.TXT, @@ -229,7 +232,11 @@ def detect_filetype( return EXT_TO_FILETYPE.get(extension, FileType.UNK) elif file is not None: - extension = None + if hasattr(file, "name"): + _, extension = os.path.splitext(file.name) + else: + extension = "" + extension = extension.lower() # NOTE(robinson) - the python-magic docs recommend reading at least the first 2048 bytes # Increased to 4096 because otherwise .xlsx files get detected as a zip file # ref: https://github.com/ahupp/python-magic#usage @@ -251,43 +258,33 @@ def detect_filetype( """Mime type special cases.""" # third check (mime_type) - # NOTE(crag): for older versions of the OS libmagic package, such as is currently - # installed on the Unstructured docker image, .json files resolve to "text/plain" - # rather than "application/json". this corrects for that case. 
- if mime_type == "text/plain" and extension == ".json": - return FileType.JSON # NOTE(Crag): older magic lib does not differentiate between xls and doc if mime_type == "application/msword" and extension == ".xls": return FileType.XLS elif mime_type.endswith("xml"): - if extension and (extension == ".html" or extension == ".htm"): + if extension == ".html" or extension == ".htm": return FileType.HTML else: return FileType.XML elif mime_type in TXT_MIME_TYPES or mime_type.startswith("text"): - if extension and extension == ".eml": - return FileType.EML - elif extension and extension == ".md": - return FileType.MD - elif extension and extension == ".rst": - return FileType.RST - elif extension and extension == ".rtf": - return FileType.RTF - elif extension and extension == ".html": - return FileType.HTML - + # NOTE(crag): for older versions of the OS libmagic package, such as is currently + # installed on the Unstructured docker image, .json files resolve to "text/plain" + # rather than "application/json". this corrects for that case. 
if _is_text_file_a_json(file=file, filename=filename, encoding=encoding): return FileType.JSON if _is_text_file_a_csv(file=file, filename=filename, encoding=encoding): return FileType.CSV - if file and not extension and _check_eml_from_buffer(file=file) is True: + if file and _check_eml_from_buffer(file=file) is True: return FileType.EML + if extension in [".eml", ".md", ".rtf", ".html", ".rst", ".tsv", ".json"]: + return EXT_TO_FILETYPE.get(extension) + # Safety catch if mime_type in STR_TO_FILETYPE: return STR_TO_FILETYPE[mime_type] @@ -295,14 +292,16 @@ def detect_filetype( return FileType.TXT elif mime_type == "application/octet-stream": - if file and not extension: + if extension == ".docx": + return FileType.DOCX + elif file: return _detect_filetype_from_octet_stream(file=file) else: return EXT_TO_FILETYPE.get(extension, FileType.UNK) elif mime_type == "application/zip": filetype = FileType.UNK - if file and not extension: + if file: filetype = _detect_filetype_from_octet_stream(file=file) elif filename is not None: with open(filename, "rb") as f: @@ -310,9 +309,9 @@ def detect_filetype( extension = extension if extension else "" if filetype == FileType.UNK: - return EXT_TO_FILETYPE.get(extension.lower(), FileType.ZIP) + return FileType.ZIP else: - return EXT_TO_FILETYPE.get(extension.lower(), filetype) + return EXT_TO_FILETYPE.get(extension, filetype) elif _is_code_mime_type(mime_type): # NOTE(robinson) - we'll treat all code files as plain text for now. 
@add_metadata_with_filetype(FileType.TSV)
def partition_tsv(
    filename: Optional[str] = None,
    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
    metadata_filename: Optional[str] = None,
    include_metadata: bool = True,
) -> List[Element]:
    """Partitions a TSV (tab-separated values) file into a single ``Table`` element.

    The file is parsed with ``pandas.read_csv(..., sep="\\t")`` using pandas'
    default header handling, so the first row is consumed as column labels.
    Because the HTML is rendered with ``header=False``, that first row does not
    appear in the element text or in ``text_as_html``.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    metadata_filename
        The filename to use for the metadata. When given, it takes precedence
        over ``filename``.
    include_metadata
        Determines whether or not metadata is included in the output.

    Returns
    -------
    A one-item list holding a ``Table`` whose ``text`` is the text content of
    the rendered HTML table. When ``include_metadata`` is true, its metadata
    carries ``text_as_html`` and ``filename``; otherwise it is an empty
    ``ElementMetadata``.
    """
    # NOTE(review): by its name this rejects calls that pass both or neither of
    # `filename` / `file`; the helper lives in unstructured.partition.common.
    exactly_one(filename=filename, file=file)

    if filename:
        table = pd.read_csv(filename, sep="\t")
    else:
        # Spooled temporary files are converted by the helper before pandas
        # reads them; other file objects are handed to the same helper as-is.
        f = spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file))
        table = pd.read_csv(f, sep="\t")

    # Empty cells are rendered as "" rather than "NaN".
    html_text = table.to_html(index=False, header=False, na_rep="")
    text = lxml.html.document_fromstring(html_text).text_content()

    if include_metadata:
        metadata = ElementMetadata(
            text_as_html=html_text,
            # An explicit `metadata_filename` must win; the previous
            # `filename or metadata_filename` silently ignored it whenever a
            # `filename` was also supplied.
            filename=metadata_filename or filename,
        )
    else:
        metadata = ElementMetadata()

    return [Table(text=text, metadata=metadata)]