diff --git a/CHANGELOG.md b/CHANGELOG.md
index 59ad13223..de8f16bd9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@
### Features
* Updates `partition_docx` to include headers and footers in the output.
+* Adds `partition_tsv` and associated tests, and updates `detect_filetype` to recognize `.tsv` files.
### Fixes
diff --git a/README.md b/README.md
index 34b2654bf..496c3df5d 100644
--- a/README.md
+++ b/README.md
@@ -102,6 +102,9 @@ about the library.
| Power Points (`.pptx`) | `partition_pptx` | N/A | Yes | Include Page Breaks |
| ReStructured Text (`.rst`) | `partition_rst` | N/A | Yes | Include Page Breaks |
| Rich Text Files (`.rtf`) | `partition_rtf` | N/A | Yes | Include Page Breaks |
+| TSV Files (`.tsv`) | `partition_tsv` | N/A | Yes | None |
+| Word Documents (`.doc`) | `partition_doc` | N/A | Yes | None |
+| Word Documents (`.docx`) | `partition_docx` | N/A | Yes | None |
| Word Documents (`.doc`) | `partition_doc` | N/A | Yes | Include Page Breaks |
| Word Documents (`.docx`) | `partition_docx` | N/A | Yes | Include Page Breaks |
| XML Documents (`.xml`) | `partition_xml` | N/A | No | Encoding; XML Keep Tags |
diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst
index 45dce63ee..f5fe78875 100644
--- a/docs/source/bricks.rst
+++ b/docs/source/bricks.rst
@@ -82,7 +82,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
file type and route it to the appropriate partitioning brick. All partitioning bricks
called within ``partition`` are called using the default kwargs. Use the document-type
specific bricks if you need to apply non-default settings.
-``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.csv``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.xml``, ``.pdf``,
+``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.csv``, ``.tsv``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.xml``, ``.pdf``,
``.png``, ``.jpg``, and ``.txt`` files.
If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
``.png``, and ``.jpg``.
@@ -149,6 +149,23 @@ Examples:
print(elements[0].metadata.text_as_html)
+``partition_tsv``
+------------------
+
+The ``partition_tsv`` function pre-processes TSV (tab-separated values) files. The output is a
+single ``Table`` element whose ``text_as_html`` metadata attribute contains an HTML representation
+of the table. The first row of the file is read as the header row and is omitted from the output.
+
+Examples:
+
+.. code:: python
+
+ from unstructured.partition.tsv import partition_tsv
+
+ elements = partition_tsv(filename="example-docs/stanley-cups.tsv")
+ print(elements[0].metadata.text_as_html)
+
+
``partition_doc``
------------------
diff --git a/example-docs/stanley-cups.tsv b/example-docs/stanley-cups.tsv
new file mode 100644
index 000000000..36402ac9b
--- /dev/null
+++ b/example-docs/stanley-cups.tsv
@@ -0,0 +1,5 @@
+Stanley Cups
+Team Location Stanley Cups
+Blues STL 1
+Flyers PHI 2
+Maple Leafs TOR 13
diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py
index 06f1d5a1c..cd3c60b20 100644
--- a/test_unstructured/file_utils/test_filetype.py
+++ b/test_unstructured/file_utils/test_filetype.py
@@ -41,6 +41,7 @@ XLSX_MIME_TYPES = [
# NOTE(robinson) - currently failing in the docker tests because the detected
# MIME type is text/csv
# ("stanley-cups.csv", FileType.CSV),
+ ("stanley-cups.tsv", FileType.TSV),
("fake-power-point.pptx", FileType.PPTX),
("winter-sports.epub", FileType.EPUB),
("spring-weather.html.json", FileType.JSON),
@@ -67,6 +68,7 @@ def test_detect_filetype_from_filename(file, expected):
("fake-html.html", FileType.HTML),
("stanley-cups.xlsx", FileType.XLSX),
("stanley-cups.csv", FileType.CSV),
+ ("stanley-cups.tsv", FileType.TSV),
("fake-power-point.pptx", FileType.PPTX),
("winter-sports.epub", FileType.EPUB),
("fake-doc.rtf", FileType.RTF),
@@ -105,6 +107,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte
# NOTE(robinson) - currently failing in the docker tests because the detected
# MIME type is text/csv
# ("stanley-cups.csv", FileType.CSV),
+ ("stanley-cups.tsv", FileType.TSV),
("fake-power-point.pptx", FileType.PPTX),
("winter-sports.epub", FileType.EPUB),
],
diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
index 4503d07fb..e04f9eb4c 100644
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@@ -8,6 +8,7 @@ from unittest.mock import patch
import docx
import pytest
+from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import (
Address,
@@ -630,34 +631,6 @@ def test_auto_partition_xml_from_file_with_tags(filename="example-docs/factbook.
assert elements[5].text == "United States"
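-EXPECTED_XLSX_TABLE = """<table border="1" class="dataframe">
-  <tbody>
-    <tr>
-      <td>Team</td>
-      <td>Location</td>
-      <td>Stanley Cups</td>
-    </tr>
-    <tr>
-      <td>Blues</td>
-      <td>STL</td>
-      <td>1</td>
-    </tr>
-    <tr>
-      <td>Flyers</td>
-      <td>PHI</td>
-      <td>2</td>
-    </tr>
-    <tr>
-      <td>Maple Leafs</td>
-      <td>TOR</td>
-      <td>13</td>
-    </tr>
-  </tbody>
-</table>"""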
-
-
-EXPECTED_XLSX_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
-
EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
@@ -667,8 +640,8 @@ def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.x
assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 2
- assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT
- assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
+ assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
+ assert elements[0].metadata.text_as_html == EXPECTED_TABLE
assert elements[0].metadata.page_number == 1
assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE
@@ -680,8 +653,8 @@ def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"
assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 2
- assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT
- assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
+ assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
+ assert elements[0].metadata.text_as_html == EXPECTED_TABLE
assert elements[0].metadata.page_number == 1
assert elements[0].metadata.filetype == EXPECTED_XLSX_FILETYPE
@@ -780,8 +753,8 @@ def test_auto_partition_xls_from_filename(filename="example-docs/tests-example.x
def test_auto_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"):
elements = partition(filename=filename)
- assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT
- assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
+ assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
+ assert elements[0].metadata.text_as_html == EXPECTED_TABLE
assert elements[0].metadata.filetype == "text/csv"
@@ -790,9 +763,9 @@ def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
with open(filename, "rb") as f:
elements = partition(file=f)
- assert clean_extra_whitespace(elements[0].text) == EXPECTED_XLSX_TEXT
+ assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
assert isinstance(elements[0], Table)
- assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
+ assert elements[0].metadata.text_as_html == EXPECTED_TABLE
assert elements[0].metadata.filetype == "text/csv"
diff --git a/test_unstructured/partition/test_constants.py b/test_unstructured/partition/test_constants.py
new file mode 100644
index 000000000..492559dde
--- /dev/null
+++ b/test_unstructured/partition/test_constants.py
@@ -0,0 +1,31 @@
+# Expected values shared by the partition tests that read the "Stanley Cups" example table.
+# EXPECTED_TABLE is the table's HTML form, in the layout produced by the
+# ``to_html(index=False, header=False, na_rep="")`` call used in ``partition_tsv``.
+EXPECTED_TABLE = """<table border="1" class="dataframe">
+  <tbody>
+    <tr>
+      <td>Team</td>
+      <td>Location</td>
+      <td>Stanley Cups</td>
+    </tr>
+    <tr>
+      <td>Blues</td>
+      <td>STL</td>
+      <td>1</td>
+    </tr>
+    <tr>
+      <td>Flyers</td>
+      <td>PHI</td>
+      <td>2</td>
+    </tr>
+    <tr>
+      <td>Maple Leafs</td>
+      <td>TOR</td>
+      <td>13</td>
+    </tr>
+  </tbody>
+</table>"""
+
+
+# Text content of the table above after whitespace has been normalized.
+EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
diff --git a/test_unstructured/partition/test_csv.py b/test_unstructured/partition/test_csv.py
index db5e1a418..8fa0a0e86 100644
--- a/test_unstructured/partition/test_csv.py
+++ b/test_unstructured/partition/test_csv.py
@@ -1,35 +1,8 @@
+from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import Table
from unstructured.partition.csv import partition_csv
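-EXPECTED_TABLE = """<table border="1" class="dataframe">
-  <tbody>
-    <tr>
-      <td>Team</td>
-      <td>Location</td>
-      <td>Stanley Cups</td>
-    </tr>
-    <tr>
-      <td>Blues</td>
-      <td>STL</td>
-      <td>1</td>
-    </tr>
-    <tr>
-      <td>Flyers</td>
-      <td>PHI</td>
-      <td>2</td>
-    </tr>
-    <tr>
-      <td>Maple Leafs</td>
-      <td>TOR</td>
-      <td>13</td>
-    </tr>
-  </tbody>
-</table>"""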
-
-
-EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
-
EXPECTED_FILETYPE = "text/csv"
diff --git a/test_unstructured/partition/test_tsv.py b/test_unstructured/partition/test_tsv.py
new file mode 100644
index 000000000..131bc00ac
--- /dev/null
+++ b/test_unstructured/partition/test_tsv.py
@@ -0,0 +1,33 @@
+from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT
+from unstructured.cleaners.core import clean_extra_whitespace
+from unstructured.documents.elements import Table
+from unstructured.partition.tsv import partition_tsv
+
+EXPECTED_FILETYPE = "text/tsv"
+
+
+def test_partition_tsv_from_filename(filename="example-docs/stanley-cups.tsv"):
+    """Partitioning a TSV by path yields the expected text, HTML, and filetype."""
+    table = partition_tsv(filename=filename)[0]
+    assert clean_extra_whitespace(table.text) == EXPECTED_TEXT
+    assert table.metadata.text_as_html == EXPECTED_TABLE
+    assert table.metadata.filetype == EXPECTED_FILETYPE
+
+
+def test_partition_tsv_from_file(filename="example-docs/stanley-cups.tsv"):
+    """Partitioning a TSV from a binary file object yields the expected ``Table``."""
+    with open(filename, "rb") as tsv_file:
+        table = partition_tsv(file=tsv_file)[0]
+    assert clean_extra_whitespace(table.text) == EXPECTED_TEXT
+    assert isinstance(table, Table)
+    assert table.metadata.text_as_html == EXPECTED_TABLE
+    assert table.metadata.filetype == EXPECTED_FILETYPE
+
+
+def test_partition_tsv_can_exclude_metadata(filename="example-docs/stanley-cups.tsv"):
+    """With ``include_metadata=False`` the element has no HTML or filetype metadata."""
+    table = partition_tsv(filename=filename, include_metadata=False)[0]
+    assert clean_extra_whitespace(table.text) == EXPECTED_TEXT
+    assert isinstance(table, Table)
+    assert table.metadata.text_as_html is None
+    assert table.metadata.filetype is None
diff --git a/test_unstructured/partition/test_xlsx.py b/test_unstructured/partition/test_xlsx.py
index 63e477337..efc379053 100644
--- a/test_unstructured/partition/test_xlsx.py
+++ b/test_unstructured/partition/test_xlsx.py
@@ -1,35 +1,8 @@
+from test_unstructured.partition.test_constants import EXPECTED_TABLE, EXPECTED_TEXT
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import Table
from unstructured.partition.xlsx import partition_xlsx
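-EXPECTED_TABLE = """<table border="1" class="dataframe">
-  <tbody>
-    <tr>
-      <td>Team</td>
-      <td>Location</td>
-      <td>Stanley Cups</td>
-    </tr>
-    <tr>
-      <td>Blues</td>
-      <td>STL</td>
-      <td>1</td>
-    </tr>
-    <tr>
-      <td>Flyers</td>
-      <td>PHI</td>
-      <td>2</td>
-    </tr>
-    <tr>
-      <td>Maple Leafs</td>
-      <td>TOR</td>
-      <td>13</td>
-    </tr>
-  </tbody>
-</table>"""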
-
-
-EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
-
EXPECTED_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
EXCEPTED_PAGE_NAME = "Stanley Cups"
diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py
index d3d16dae8..1dc2f3e3f 100644
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@@ -74,6 +74,7 @@ class FileType(Enum):
TXT = 42
JSON = 43
CSV = 44
+ TSV = 45
# Markup Types
HTML = 50
@@ -106,6 +107,7 @@ STR_TO_FILETYPE = {
"text/comma-separated-values": FileType.CSV,
"text/x-comma-separated-values": FileType.CSV,
"text/csv": FileType.CSV,
+ "text/tsv": FileType.TSV,
"text/markdown": FileType.MD,
"text/x-markdown": FileType.MD,
"text/x-rst": FileType.RST,
@@ -166,6 +168,7 @@ EXT_TO_FILETYPE = {
".msg": FileType.MSG,
".odt": FileType.ODT,
".csv": FileType.CSV,
+ ".tsv": FileType.TSV,
# NOTE(robinson) - for now we are treating code files as plain text
".js": FileType.TXT,
".py": FileType.TXT,
@@ -229,7 +232,11 @@ def detect_filetype(
return EXT_TO_FILETYPE.get(extension, FileType.UNK)
elif file is not None:
- extension = None
+ if hasattr(file, "name"):
+ _, extension = os.path.splitext(file.name)
+ else:
+ extension = ""
+ extension = extension.lower()
# NOTE(robinson) - the python-magic docs recommend reading at least the first 2048 bytes
# Increased to 4096 because otherwise .xlsx files get detected as a zip file
# ref: https://github.com/ahupp/python-magic#usage
@@ -251,43 +258,33 @@ def detect_filetype(
"""Mime type special cases."""
# third check (mime_type)
- # NOTE(crag): for older versions of the OS libmagic package, such as is currently
- # installed on the Unstructured docker image, .json files resolve to "text/plain"
- # rather than "application/json". this corrects for that case.
- if mime_type == "text/plain" and extension == ".json":
- return FileType.JSON
# NOTE(Crag): older magic lib does not differentiate between xls and doc
if mime_type == "application/msword" and extension == ".xls":
return FileType.XLS
elif mime_type.endswith("xml"):
- if extension and (extension == ".html" or extension == ".htm"):
+ if extension == ".html" or extension == ".htm":
return FileType.HTML
else:
return FileType.XML
elif mime_type in TXT_MIME_TYPES or mime_type.startswith("text"):
- if extension and extension == ".eml":
- return FileType.EML
- elif extension and extension == ".md":
- return FileType.MD
- elif extension and extension == ".rst":
- return FileType.RST
- elif extension and extension == ".rtf":
- return FileType.RTF
- elif extension and extension == ".html":
- return FileType.HTML
-
+ # NOTE(crag): for older versions of the OS libmagic package, such as is currently
+ # installed on the Unstructured docker image, .json files resolve to "text/plain"
+ # rather than "application/json". this corrects for that case.
if _is_text_file_a_json(file=file, filename=filename, encoding=encoding):
return FileType.JSON
if _is_text_file_a_csv(file=file, filename=filename, encoding=encoding):
return FileType.CSV
- if file and not extension and _check_eml_from_buffer(file=file) is True:
+ if file and _check_eml_from_buffer(file=file) is True:
return FileType.EML
+ if extension in [".eml", ".md", ".rtf", ".html", ".rst", ".tsv", ".json"]:
+ return EXT_TO_FILETYPE.get(extension)
+
# Safety catch
if mime_type in STR_TO_FILETYPE:
return STR_TO_FILETYPE[mime_type]
@@ -295,14 +292,16 @@ def detect_filetype(
return FileType.TXT
elif mime_type == "application/octet-stream":
- if file and not extension:
+ if extension == ".docx":
+ return FileType.DOCX
+ elif file:
return _detect_filetype_from_octet_stream(file=file)
else:
return EXT_TO_FILETYPE.get(extension, FileType.UNK)
elif mime_type == "application/zip":
filetype = FileType.UNK
- if file and not extension:
+ if file:
filetype = _detect_filetype_from_octet_stream(file=file)
elif filename is not None:
with open(filename, "rb") as f:
@@ -310,9 +309,9 @@ def detect_filetype(
extension = extension if extension else ""
if filetype == FileType.UNK:
- return EXT_TO_FILETYPE.get(extension.lower(), FileType.ZIP)
+ return FileType.ZIP
else:
- return EXT_TO_FILETYPE.get(extension.lower(), filetype)
+ return EXT_TO_FILETYPE.get(extension, filetype)
elif _is_code_mime_type(mime_type):
# NOTE(robinson) - we'll treat all code files as plain text for now.
diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
index 4af63ac35..7807b225b 100644
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@@ -29,6 +29,7 @@ from unstructured.partition.pptx import partition_pptx
from unstructured.partition.rst import partition_rst
from unstructured.partition.rtf import partition_rtf
from unstructured.partition.text import partition_text
+from unstructured.partition.tsv import partition_tsv
from unstructured.partition.xlsx import partition_xlsx
from unstructured.partition.xml import partition_xml
@@ -211,6 +212,8 @@ def partition(
elements = partition_xlsx(filename=filename, file=file)
elif filetype == FileType.CSV:
elements = partition_csv(filename=filename, file=file)
+ elif filetype == FileType.TSV:
+ elements = partition_tsv(filename=filename, file=file)
elif filetype == FileType.EMPTY:
elements = []
else:
diff --git a/unstructured/partition/tsv.py b/unstructured/partition/tsv.py
new file mode 100644
index 000000000..5c4441222
--- /dev/null
+++ b/unstructured/partition/tsv.py
@@ -0,0 +1,63 @@
+from tempfile import SpooledTemporaryFile
+from typing import IO, BinaryIO, List, Optional, Union, cast
+
+import lxml.html
+import pandas as pd
+
+from unstructured.documents.elements import Element, ElementMetadata, Table
+from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
+from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
+
+
+@add_metadata_with_filetype(FileType.TSV)
+def partition_tsv(
+    filename: Optional[str] = None,
+    file: Optional[Union[IO, SpooledTemporaryFile]] = None,
+    metadata_filename: Optional[str] = None,
+    include_metadata: bool = True,
+) -> List[Element]:
+    """Partitions TSV files into document elements.
+
+    The whole file is returned as a single ``Table`` element. The first row is read as
+    the header row and is left out of both the element text and its HTML representation.
+
+    Parameters
+    ----------
+    filename
+        A string defining the target filename path.
+    file
+        A file-like object using "rb" mode --> open(filename, "rb").
+    metadata_filename
+        The filename to use for the metadata. Takes precedence over ``filename`` when both
+        are provided.
+    include_metadata
+        Determines whether or not metadata is included in the output.
+
+    Returns
+    -------
+    A list containing one ``Table`` element.
+    """
+    exactly_one(filename=filename, file=file)
+
+    if filename:
+        table = pd.read_csv(filename, sep="\t")
+    else:
+        f = spooled_to_bytes_io_if_needed(cast(Union[BinaryIO, SpooledTemporaryFile], file))
+        table = pd.read_csv(f, sep="\t")
+
+    # An explicitly supplied metadata filename must win over the path that was read from.
+    metadata_filename = metadata_filename or filename
+
+    # header=False drops the column labels, so the header row never reaches the output.
+    html_text = table.to_html(index=False, header=False, na_rep="")
+    text = lxml.html.document_fromstring(html_text).text_content()
+
+    if include_metadata:
+        metadata = ElementMetadata(
+            text_as_html=html_text,
+            filename=metadata_filename,
+        )
+    else:
+        metadata = ElementMetadata()
+
+    return [Table(text=text, metadata=metadata)]