feat: add support for .txt files in partition (#150)

* added partition_text for auto
* rename partition_text tests
* bump version and update docs
2025-11-16 10:27:23 +00:00 · 2023-01-13 16:39:53 -05:00 · 2023-01-13 16:39:53 -05:00 · f12240c5e7
commit f12240c5e7
parent eba4c80b1e
7 changed files with 55 additions and 26 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,7 @@
 ## 0.4.1-dev0
 * Added support for text files in the `partition` function
 ## 0.4.0
 * Added generic `partition` brick that detects the file type and routes a file to the appropriate
--- a/README.md
+++ b/README.md
@ -62,7 +62,7 @@ To install the library, run `pip install unstructured`.
 You can run this [Colab notebook](https://colab.research.google.com/drive/1RnXEiSTUaru8vZSGbh1U2T2P9aUa5tQD#scrollTo=E_WN7p3JGcLJ) to run the examples below.
 The following examples show how to get started with the `unstructured` library.
-You can parse **HTML**, **PDF**, **EML** and **DOCX** documents with one line of code!
+You can parse **TXT**, **HTML**, **PDF**, **EML** and **DOCX** documents with one line of code!
 <br></br>
 See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description
 of the features in the library.
@ -76,7 +76,7 @@ If you are using the `partition` brick, ensure you first install `libmagic` usin
 instructions outlined [here](https://unstructured-io.github.io/unstructured/installing.html#filetype-detection)
 `partition` will always apply the default arguments. If you need
 advanced features, use a document-specific brick. The `partition` brick currently works for
-`.docx`, `eml`, `.html`, and `.pdf` documents.
+`.txt`, `.docx`, `.eml`, `.html`, and `.pdf` documents.
 ```python
 from unstructured.partition.auto import partition
--- a/docs/source/bricks.rst
+++ b/docs/source/bricks.rst
@ -22,6 +22,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
 file type and route it to the appropriate partitioning brick. All partitioning bricks
 called within ``partition`` are called using the default kwargs. Use the document-type
 specific bricks if you need to apply non-default settings.
 ``partition`` currently supports ``.docx``, ``.eml``, ``.html``, ``.pdf``, and ``.txt`` files.
 .. code:: python
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@ -113,7 +113,28 @@ def test_auto_partition_html_from_file_rb():
    assert len(elements) > 0
-def test_auto_partition_pdf():
+EXPECTED_TEXT_OUTPUT = [
    NarrativeText(text="This is a test document to use for unit tests."),
    Title(text="Important points:"),
    ListItem(text="Hamburgers are delicious"),
    ListItem(text="Dogs are the best"),
    ListItem(text="I love fuzzy blankets"),
 ]
 def test_auto_partition_text_from_filename():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
    elements = partition(filename=filename)
    assert len(elements) > 0
    assert elements == EXPECTED_TEXT_OUTPUT
 def test_auto_partition_text_from_file():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
    with open(filename, "r") as f:
        elements = partition(file=f)
    assert len(elements) > 0
    assert elements == EXPECTED_TEXT_OUTPUT
    filename = os.path.join(
        EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf"
    )
--- a/test_unstructured/partition/test_text.py
+++ b/test_unstructured/partition/test_text.py
@ -16,14 +16,14 @@ EXPECTED_OUTPUT = [
 ]
-def test_partition_email_from_filename():
+def test_partition_text_from_filename():
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
    elements = partition_text(filename=filename)
    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
-def test_partition_email_from_file():
+def test_partition_text_from_file():
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
    with open(filename, "r") as f:
        elements = partition_text(file=f)
@ -31,7 +31,7 @@ def test_partition_email_from_file():
    assert elements == EXPECTED_OUTPUT
-def test_partition_email_from_text():
+def test_partition_text_from_text():
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
    with open(filename, "r") as f:
        text = f.read()
@ -40,12 +40,12 @@ def test_partition_email_from_text():
    assert elements == EXPECTED_OUTPUT
-def test_partition_email_raises_with_none_specified():
+def test_partition_text_raises_with_none_specified():
    with pytest.raises(ValueError):
        partition_text()
-def test_partition_email_raises_with_too_many_specified():
+def test_partition_text_raises_with_too_many_specified():
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
    with open(filename, "r") as f:
        text = f.read()
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.4.0"  # pragma: no cover
+__version__ = "0.4.1-dev0"  # pragma: no cover
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@ -5,6 +5,7 @@ from unstructured.partition.docx import partition_docx
 from unstructured.partition.email import partition_email
 from unstructured.partition.html import partition_html
 from unstructured.partition.pdf import partition_pdf
 from unstructured.partition.text import partition_text
 def partition(filename: Optional[str] = None, file: Optional[IO] = None):
@ -33,6 +34,8 @@ def partition(filename: Optional[str] = None, file: Optional[IO] = None):
        return partition_html(filename=filename, file=file)
    elif filetype == FileType.PDF:
        return partition_pdf(filename=filename, file=file, url=None)  # type: ignore
    elif filetype == FileType.TXT:
        return partition_text(filename=filename, file=file)
    else:
        msg = "Invalid file" if not filename else f"Invalid file {filename}"
        raise ValueError(f"{msg}. File type not support in partition.")
`@ -1 +1 @@`
	`__version__ = "0.4.0" # pragma: no cover`	`__version__ = "0.4.1-dev0" # pragma: no cover`