feat: partition_tsv for tab separated value files (#758)

* first pass at partition_tsv * working tests * create constants for tests and debug `make test` failure * make check and tidy * undo changes for testing locally * update changelog and version * fix bricks.rst * refactor if statements * make tidy * fix README and change try/except to if/else * update changelog and version * fix docstring
2025-06-27 02:30:08 +00:00 · 2023-06-15 13:50:53 -05:00 · 2023-06-15 13:50:53 -05:00 · a9b9b873b1
commit a9b9b873b1
parent 075bf0bdba
13 changed files with 179 additions and 116 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -9,6 +9,7 @@
 ### Features
 * Updates `partition_docx` to include headers and footers in the output.
 * Create `partition_tsv` and associated tests. Make additional changes to `detect_filetype`.
 ### Fixes
--- a/README.md
+++ b/README.md
@ -102,6 +102,9 @@ about the library.
 | Power Points (`.pptx`) | `partition_pptx` | N/A | Yes | Include Page Breaks |
 | ReStructured Text (`.rst`) | `partition_rst` | N/A | Yes | Include Page Breaks |
 | Rich Text Files (`.rtf`) | `partition_rtf` | N/A | Yes | Include Page Breaks |
 | TSV Files (`.tsv`) | `partition_tsv` | N/A | Yes | None |
 | Word Documents (`.doc`) | `partition_doc` | N/A | Yes | None |
 | Word Documents (`.docx`) | `partition_docx` | N/A | Yes | None |
 | Word Documents (`.doc`) | `partition_doc` | N/A | Yes | Include Page Breaks |
 | Word Documents (`.docx`) | `partition_docx` | N/A | Yes | Include Page Breaks |
 | XML Documents (`.xml`) | `partition_xml` | N/A | No | Encoding; XML Keep Tags |
--- a/docs/source/bricks.rst
+++ b/docs/source/bricks.rst
@ -82,7 +82,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
 file type and route it to the appropriate partitioning brick. All partitioning bricks
 called within ``partition`` are called using the default kwargs. Use the document-type
 specific bricks if you need to apply non-default settings.
-``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.csv``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.xml``, ``.pdf``,
+``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.csv``, ``.tsv``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.xml``, ``.pdf``,
 ``.png``, ``.jpg``, and ``.txt`` files.
 If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
 ``.png``, and ``.jpg``.
@ -149,6 +149,23 @@ Examples:
  print(elements[0].metadata.text_as_html)
 ``partition_tsv``
 ------------------
 The ``partition_tsv`` function pre-processes TSV files. The output is a single
 ``Table`` element. The ``text_as_html`` attribute in the element metadata will
 contain an HTML representation of the table.
 Examples:
 .. code:: python
  from unstructured.partition.tsv import partition_tsv
  elements = partition_tsv(filename="example-docs/stanley-cups.tsv")
  print(elements[0].metadata.text_as_html)  
 ``partition_doc``
 ------------------
--- a/example-docs/stanley-cups.tsv
+++ b/example-docs/stanley-cups.tsv
@ -0,0 +1,5 @@
 Stanley Cups		
 Team	Location	Stanley Cups
 Blues	STL	1
 Flyers	PHI	2
 Maple Leafs	TOR	13
--- a/test_unstructured/file_utils/test_filetype.py
+++ b/test_unstructured/file_utils/test_filetype.py
@ -41,6 +41,7 @@ XLSX_MIME_TYPES = [
        # NOTE(robinson) - currently failing in the docker tests because the detected
        # MIME type is text/csv
        # ("stanley-cups.csv", FileType.CSV),
        ("stanley-cups.tsv", FileType.TSV),
        ("fake-power-point.pptx", FileType.PPTX),
        ("winter-sports.epub", FileType.EPUB),
        ("spring-weather.html.json", FileType.JSON),
@ -67,6 +68,7 @@ def test_detect_filetype_from_filename(file, expected):
        ("fake-html.html", FileType.HTML),
        ("stanley-cups.xlsx", FileType.XLSX),
        ("stanley-cups.csv", FileType.CSV),
        ("stanley-cups.tsv", FileType.TSV),
        ("fake-power-point.pptx", FileType.PPTX),
        ("winter-sports.epub", FileType.EPUB),
        ("fake-doc.rtf", FileType.RTF),
@ -105,6 +107,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte
        # NOTE(robinson) - currently failing in the docker tests because the detected
        # MIME type is text/csv
        # ("stanley-cups.csv", FileType.CSV),
        ("stanley-cups.tsv", FileType.TSV),
        ("fake-power-point.pptx", FileType.PPTX),
        ("winter-sports.epub", FileType.EPUB),
    ],
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@ -8,6 +8,7 @@ from unittest.mock import patch
 import docx
 import pytest
 from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT
 from unstructured.cleaners.core import clean_extra_whitespace
 from unstructured.documents.elements import (
    Address,
@ -630,34 +631,6 @@ def test_auto_partition_xml_from_file_with_tags(filename="example-docs/factbook.
    assert elements[5].text == "<name>United States</name>"
 EXPECTED_XLSX_TABLE = """<table border="1" class="dataframe">
  <tbody>
    <tr>
      <td>Team</td>
      <td>Location</td>
      <td>Stanley Cups</td>
    </tr>
    <tr>
      <td>Blues</td>
      <td>STL</td>
      <td>1</td>
    </tr>
    <tr>
      <td>Flyers</td>
      <td>PHI</td>
      <td>2</td>
    </tr>
    <tr>
      <td>Maple Leafs</td>
      <td>TOR</td>
      <td>13</td>
    </tr>
  </tbody>
 </table>"""
 EXPECTED_XLSX_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
 EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
@ -667,8 +640,8 @@ def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.x
    assert all(isinstance(element, Table) for element in elements)
    assert len(elements) == 2
-    assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT
+    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
-    assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
+    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
    assert elements[0].metadata.page_number == 1
    assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE
@ -680,8 +653,8 @@ def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"
    assert all(isinstance(element, Table) for element in elements)
    assert len(elements) == 2
-    assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT
+    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
-    assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
+    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
    assert elements[0].metadata.page_number == 1
    assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE
@ -780,8 +753,8 @@ def test_auto_partition_xls_from_filename(filename="example-docs/tests-example.x
 def test_auto_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"):
    elements = partition(filename=filename)
-    assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT
+    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
-    assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
+    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
    assert elements[0].metadata.filetype == "text/csv"
@ -790,9 +763,9 @@ def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
    with open(filename, "rb") as f:
        elements = partition(file=f)
-    assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT
+    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
    assert isinstance(elements[0], Table)
-    assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
+    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
    assert elements[0].metadata.filetype == "text/csv"
--- a/test_unstructured/partition/test_constants.py
+++ b/test_unstructured/partition/test_constants.py
@ -0,0 +1,27 @@
 # Expected HTML rendering of the Stanley Cups example table. Imported by the
 # auto, csv, tsv and xlsx partition tests and compared against
 # ``metadata.text_as_html``.
 EXPECTED_TABLE = """<table border="1" class="dataframe">
  <tbody>
    <tr>
      <td>Team</td>
      <td>Location</td>
      <td>Stanley Cups</td>
    </tr>
    <tr>
      <td>Blues</td>
      <td>STL</td>
      <td>1</td>
    </tr>
    <tr>
      <td>Flyers</td>
      <td>PHI</td>
      <td>2</td>
    </tr>
    <tr>
      <td>Maple Leafs</td>
      <td>TOR</td>
      <td>13</td>
    </tr>
  </tbody>
 </table>"""
 # Expected element text for the same table; the tests compare it against the
 # element text after passing it through ``clean_extra_whitespace``.
 EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
--- a/test_unstructured/partition/test_csv.py
+++ b/test_unstructured/partition/test_csv.py
@ -1,35 +1,8 @@
 from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT
 from unstructured.cleaners.core import clean_extra_whitespace
 from unstructured.documents.elements import Table
 from unstructured.partition.csv import partition_csv
 EXPECTED_TABLE = """<table border="1" class="dataframe">
  <tbody>
    <tr>
      <td>Team</td>
      <td>Location</td>
      <td>Stanley Cups</td>
    </tr>
    <tr>
      <td>Blues</td>
      <td>STL</td>
      <td>1</td>
    </tr>
    <tr>
      <td>Flyers</td>
      <td>PHI</td>
      <td>2</td>
    </tr>
    <tr>
      <td>Maple Leafs</td>
      <td>TOR</td>
      <td>13</td>
    </tr>
  </tbody>
 </table>"""
 EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
 EXPECTED_FILETYPE = "text/csv"
--- a/test_unstructured/partition/test_tsv.py
+++ b/test_unstructured/partition/test_tsv.py
@ -0,0 +1,33 @@
 from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT
 from unstructured.cleaners.core import clean_extra_whitespace
 from unstructured.documents.elements import Table
 from unstructured.partition.tsv import partition_tsv
 EXPECTED_FILETYPE = "text/tsv"
 def test_partition_tsv_from_filename(filename="example-docs/stanley-cups.tsv"):
    """Partitioning a TSV by filename yields a ``Table`` with the expected content.

    Checks the element type (as the sibling file-based tests do), the
    whitespace-normalized text, the HTML rendering and the reported filetype.
    """
    elements = partition_tsv(filename=filename)

    assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
    # Consistency with the other TSV tests: the result must be a Table element.
    assert isinstance(elements[0], Table)
    assert elements[0].metadata.text_as_html == EXPECTED_TABLE
    assert elements[0].metadata.filetype == EXPECTED_FILETYPE
 def test_partition_tsv_from_file(filename="example-docs/stanley-cups.tsv"):
    """Partitioning a TSV from a binary file handle yields the expected ``Table``."""
    with open(filename, "rb") as tsv_file:
        parsed = partition_tsv(file=tsv_file)

    first = parsed[0]
    assert clean_extra_whitespace(first.text) == EXPECTED_TEXT
    assert isinstance(first, Table)
    assert first.metadata.text_as_html == EXPECTED_TABLE
    assert first.metadata.filetype == EXPECTED_FILETYPE
 def test_partition_tsv_can_exclude_metadata(filename="example-docs/stanley-cups.tsv"):
    """With ``include_metadata=False`` the text is kept but metadata fields stay unset."""
    parsed = partition_tsv(filename=filename, include_metadata=False)

    first = parsed[0]
    assert clean_extra_whitespace(first.text) == EXPECTED_TEXT
    assert isinstance(first, Table)

    bare_metadata = first.metadata
    assert bare_metadata.text_as_html is None
    assert bare_metadata.filetype is None
--- a/test_unstructured/partition/test_xlsx.py
+++ b/test_unstructured/partition/test_xlsx.py
@ -1,35 +1,8 @@
 from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT
 from unstructured.cleaners.core import clean_extra_whitespace
 from unstructured.documents.elements import Table
 from unstructured.partition.xlsx import partition_xlsx
 EXPECTED_TABLE = """<table border="1" class="dataframe">
  <tbody>
    <tr>
      <td>Team</td>
      <td>Location</td>
      <td>Stanley Cups</td>
    </tr>
    <tr>
      <td>Blues</td>
      <td>STL</td>
      <td>1</td>
    </tr>
    <tr>
      <td>Flyers</td>
      <td>PHI</td>
      <td>2</td>
    </tr>
    <tr>
      <td>Maple Leafs</td>
      <td>TOR</td>
      <td>13</td>
    </tr>
  </tbody>
 </table>"""
 EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
 EXPECTED_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
 EXCEPTED_PAGE_NAME = "Stanley Cups"
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@ -74,6 +74,7 @@ class FileType(Enum):
    TXT = 42
    JSON = 43
    CSV = 44
    TSV = 45
    # Markup Types
    HTML = 50
@ -106,6 +107,7 @@ STR_TO_FILETYPE = {
    "text/comma-separated-values": FileType.CSV,
    "text/x-comma-separated-values": FileType.CSV,
    "text/csv": FileType.CSV,
    "text/tsv": FileType.TSV,
    "text/markdown": FileType.MD,
    "text/x-markdown": FileType.MD,
    "text/x-rst": FileType.RST,
@ -166,6 +168,7 @@ EXT_TO_FILETYPE = {
    ".msg": FileType.MSG,
    ".odt": FileType.ODT,
    ".csv": FileType.CSV,
    ".tsv": FileType.TSV,
    # NOTE(robinson) - for now we are treating code files as plain text
    ".js": FileType.TXT,
    ".py": FileType.TXT,
@ -229,7 +232,11 @@ def detect_filetype(
            return EXT_TO_FILETYPE.get(extension, FileType.UNK)
    elif file is not None:
-        extension = None
+        if hasattr(file, "name"):
            _, extension = os.path.splitext(file.name)
        else:
            extension = ""
        extension = extension.lower()
        # NOTE(robinson) - the python-magic docs recommend reading at least the first 2048 bytes
        # Increased to 4096 because otherwise .xlsx files get detected as a zip file
        # ref: https://github.com/ahupp/python-magic#usage
@ -251,43 +258,33 @@ def detect_filetype(
    """Mime type special cases."""
    # third check (mime_type)
    # NOTE(crag): for older versions of the OS libmagic package, such as is currently
    # installed on the Unstructured docker image, .json files resolve to "text/plain"
    # rather than "application/json". this corrects for that case.
    if mime_type == "text/plain" and extension == ".json":
        return FileType.JSON
    # NOTE(Crag): older magic lib does not differentiate between xls and doc
    if mime_type == "application/msword" and extension == ".xls":
        return FileType.XLS
    elif mime_type.endswith("xml"):
-        if extension and (extension == ".html" or extension == ".htm"):
+        if extension == ".html" or extension == ".htm":
            return FileType.HTML
        else:
            return FileType.XML
    elif mime_type in TXT_MIME_TYPES or mime_type.startswith("text"):
-        if extension and extension == ".eml":
+        # NOTE(crag): for older versions of the OS libmagic package, such as is currently
-            return FileType.EML
+        # installed on the Unstructured docker image, .json files resolve to "text/plain"
-        elif extension and extension == ".md":
+        # rather than "application/json". this corrects for that case.
            return FileType.MD
        elif extension and extension == ".rst":
            return FileType.RST
        elif extension and extension == ".rtf":
            return FileType.RTF
        elif extension and extension == ".html":
            return FileType.HTML
        if _is_text_file_a_json(file=file, filename=filename, encoding=encoding):
            return FileType.JSON
        if _is_text_file_a_csv(file=file, filename=filename, encoding=encoding):
            return FileType.CSV
-        if file and not extension and _check_eml_from_buffer(file=file) is True:
+        if file and _check_eml_from_buffer(file=file) is True:
            return FileType.EML
        if extension in [".eml", ".md", ".rtf", ".html", ".rst", ".tsv", ".json"]:
            return EXT_TO_FILETYPE.get(extension)
        # Safety catch
        if mime_type in STR_TO_FILETYPE:
            return STR_TO_FILETYPE[mime_type]
@ -295,14 +292,16 @@ def detect_filetype(
        return FileType.TXT
    elif mime_type == "application/octet-stream":
-        if file and not extension:
+        if extension == ".docx":
            return FileType.DOCX
        elif file:
            return _detect_filetype_from_octet_stream(file=file)
        else:
            return EXT_TO_FILETYPE.get(extension, FileType.UNK)
    elif mime_type == "application/zip":
        filetype = FileType.UNK
-        if file and not extension:
+        if file:
            filetype = _detect_filetype_from_octet_stream(file=file)
        elif filename is not None:
            with open(filename, "rb") as f:
@ -310,9 +309,9 @@ def detect_filetype(
        extension = extension if extension else ""
        if filetype == FileType.UNK:
-            return EXT_TO_FILETYPE.get(extension.lower(), FileType.ZIP)
+            return FileType.ZIP
        else:
-            return EXT_TO_FILETYPE.get(extension.lower(), filetype)
+            return EXT_TO_FILETYPE.get(extension, filetype)
    elif _is_code_mime_type(mime_type):
        # NOTE(robinson) - we'll treat all code files as plain text for now.
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@ -29,6 +29,7 @@ from unstructured.partition.pptx import partition_pptx
 from unstructured.partition.rst import partition_rst
 from unstructured.partition.rtf import partition_rtf
 from unstructured.partition.text import partition_text
 from unstructured.partition.tsv import partition_tsv
 from unstructured.partition.xlsx import partition_xlsx
 from unstructured.partition.xml import partition_xml
@ -211,6 +212,8 @@ def partition(
        elements = partition_xlsx(filename=filename, file=file)
    elif filetype == FileType.CSV:
        elements = partition_csv(filename=filename, file=file)
    elif filetype == FileType.TSV:
        elements = partition_tsv(filename=filename, file=file)
    elif filetype == FileType.EMPTY:
        elements = []
    else:
--- a/unstructured/partition/tsv.py
+++ b/unstructured/partition/tsv.py
@ -0,0 +1,53 @@
 from tempfile import SpooledTemporaryFile
 from typing import IO, BinaryIO, List, Optional, Union, cast
 import lxml.html
 import pandas as pd
 from unstructured.documents.elements import Element, ElementMetadata, Table
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
@add_metadata_with_filetype(FileType.TSV)
def partition_tsv(
    filename: Optional[str] = None,
    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
    metadata_filename: Optional[str] = None,
    include_metadata: bool = True,
) -> List[Element]:
    """Partitions TSV files into document elements.

    The file is loaded with pandas using a tab separator and returned as a
    single ``Table`` element. The element text is the text content of the
    table's HTML rendering.

    Note that the first row of the file is read as the column header and the
    HTML is rendered without the header, so that first row does not appear in
    the output.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    metadata_filename
        The filename to use for the metadata. When given, it takes precedence
        over ``filename``; otherwise ``filename`` is used.
    include_metadata
        Determines whether or not metadata is included in the output.

    Returns
    -------
    List[Element]
        A list holding one ``Table`` element. When ``include_metadata`` is
        true its metadata carries ``text_as_html`` and ``filename``; otherwise
        the metadata is constructed empty.
    """
    # Callers are expected to supply exactly one of ``filename`` / ``file``;
    # the check itself is delegated to ``exactly_one``.
    exactly_one(filename=filename, file=file)

    if filename:
        table = pd.read_csv(filename, sep="\t")
    else:
        f = spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file))
        table = pd.read_csv(f, sep="\t")

    # header=False drops the column labels (the file's first row) and
    # na_rep="" renders missing cells as empty strings rather than "NaN".
    html_text = table.to_html(index=False, header=False, na_rep="")
    text = lxml.html.document_fromstring(html_text).text_content()

    if include_metadata:
        metadata = ElementMetadata(
            text_as_html=html_text,
            # An explicit metadata_filename must win over the path that was
            # actually read; previously ``filename`` silently overrode it.
            filename=metadata_filename or filename,
        )
    else:
        metadata = ElementMetadata()

    return [Table(text=text, metadata=metadata)]