feat: add support for .txt files in partition (#150)

* added partition_text for auto

* rename partition_text tests

* bump version and update docs
This commit is contained in:
Matt Robinson 2023-01-13 16:39:53 -05:00 committed by GitHub
parent eba4c80b1e
commit f12240c5e7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 55 additions and 26 deletions

View File

@ -1,3 +1,7 @@
## 0.4.1-dev0
* Added support for text files in the `partition` function
## 0.4.0 ## 0.4.0
* Added generic `partition` brick that detects the file type and routes a file to the appropriate * Added generic `partition` brick that detects the file type and routes a file to the appropriate

View File

@ -62,7 +62,7 @@ To install the library, run `pip install unstructured`.
You can run this [Colab notebook](https://colab.research.google.com/drive/1RnXEiSTUaru8vZSGbh1U2T2P9aUa5tQD#scrollTo=E_WN7p3JGcLJ) to run the examples below. You can run this [Colab notebook](https://colab.research.google.com/drive/1RnXEiSTUaru8vZSGbh1U2T2P9aUa5tQD#scrollTo=E_WN7p3JGcLJ) to run the examples below.
The following examples show how to get started with the `unstructured` library. The following examples show how to get started with the `unstructured` library.
You can parse **HTML**, **PDF**, **EML** and **DOCX** documents with one line of code! You can parse **TXT**, **HTML**, **PDF**, **EML** and **DOCX** documents with one line of code!
<br></br> <br></br>
See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description
of the features in the library. of the features in the library.
@ -76,7 +76,7 @@ If you are using the `partition` brick, ensure you first install `libmagic` usin
instructions outlined [here](https://unstructured-io.github.io/unstructured/installing.html#filetype-detection) instructions outlined [here](https://unstructured-io.github.io/unstructured/installing.html#filetype-detection)
`partition` will always apply the default arguments. If you need `partition` will always apply the default arguments. If you need
advanced features, use a document-specific brick. The `partition` brick currently works for advanced features, use a document-specific brick. The `partition` brick currently works for
`.docx`, `.eml`, `.html`, and `.pdf` documents. `.txt`, `.docx`, `.eml`, `.html`, and `.pdf` documents.
```python ```python
from unstructured.partition.auto import partition from unstructured.partition.auto import partition

View File

@ -22,6 +22,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
file type and route it to the appropriate partitioning brick. All partitioning bricks file type and route it to the appropriate partitioning brick. All partitioning bricks
called within ``partition`` are called using the default kwargs. Use the document-type called within ``partition`` are called using the default kwargs. Use the document-type
specific bricks if you need to apply non-default settings. specific bricks if you need to apply non-default settings.
``partition`` currently supports ``.docx``, ``.eml``, ``.html``, ``.pdf``, and ``.txt`` files.
.. code:: python .. code:: python

View File

@ -113,7 +113,28 @@ def test_auto_partition_html_from_file_rb():
assert len(elements) > 0 assert len(elements) > 0
def test_auto_partition_pdf(): EXPECTED_TEXT_OUTPUT = [
NarrativeText(text="This is a test document to use for unit tests."),
Title(text="Important points:"),
ListItem(text="Hamburgers are delicious"),
ListItem(text="Dogs are the best"),
ListItem(text="I love fuzzy blankets"),
]
def test_auto_partition_text_from_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
elements = partition(filename=filename)
assert len(elements) > 0
assert elements == EXPECTED_TEXT_OUTPUT
def test_auto_partition_text_from_file():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
with open(filename, "r") as f:
elements = partition(file=f)
assert len(elements) > 0
assert elements == EXPECTED_TEXT_OUTPUT
filename = os.path.join( filename = os.path.join(
EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf" EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf"
) )

View File

@ -16,14 +16,14 @@ EXPECTED_OUTPUT = [
] ]
def test_partition_email_from_filename(): def test_partition_text_from_filename():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt") filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
elements = partition_text(filename=filename) elements = partition_text(filename=filename)
assert len(elements) > 0 assert len(elements) > 0
assert elements == EXPECTED_OUTPUT assert elements == EXPECTED_OUTPUT
def test_partition_email_from_file(): def test_partition_text_from_file():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt") filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
with open(filename, "r") as f: with open(filename, "r") as f:
elements = partition_text(file=f) elements = partition_text(file=f)
@ -31,7 +31,7 @@ def test_partition_email_from_file():
assert elements == EXPECTED_OUTPUT assert elements == EXPECTED_OUTPUT
def test_partition_email_from_text(): def test_partition_text_from_text():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt") filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
with open(filename, "r") as f: with open(filename, "r") as f:
text = f.read() text = f.read()
@ -40,12 +40,12 @@ def test_partition_email_from_text():
assert elements == EXPECTED_OUTPUT assert elements == EXPECTED_OUTPUT
def test_partition_email_raises_with_none_specified(): def test_partition_text_raises_with_none_specified():
with pytest.raises(ValueError): with pytest.raises(ValueError):
partition_text() partition_text()
def test_partition_email_raises_with_too_many_specified(): def test_partition_text_raises_with_too_many_specified():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt") filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
with open(filename, "r") as f: with open(filename, "r") as f:
text = f.read() text = f.read()

View File

@ -1 +1 @@
__version__ = "0.4.0" # pragma: no cover __version__ = "0.4.1-dev0" # pragma: no cover

View File

@ -5,6 +5,7 @@ from unstructured.partition.docx import partition_docx
from unstructured.partition.email import partition_email from unstructured.partition.email import partition_email
from unstructured.partition.html import partition_html from unstructured.partition.html import partition_html
from unstructured.partition.pdf import partition_pdf from unstructured.partition.pdf import partition_pdf
from unstructured.partition.text import partition_text
def partition(filename: Optional[str] = None, file: Optional[IO] = None): def partition(filename: Optional[str] = None, file: Optional[IO] = None):
@ -33,6 +34,8 @@ def partition(filename: Optional[str] = None, file: Optional[IO] = None):
return partition_html(filename=filename, file=file) return partition_html(filename=filename, file=file)
elif filetype == FileType.PDF: elif filetype == FileType.PDF:
return partition_pdf(filename=filename, file=file, url=None) # type: ignore return partition_pdf(filename=filename, file=file, url=None) # type: ignore
elif filetype == FileType.TXT:
return partition_text(filename=filename, file=file)
else: else:
msg = "Invalid file" if not filename else f"Invalid file {filename}" msg = "Invalid file" if not filename else f"Invalid file {filename}"
raise ValueError(f"{msg}. File type not support in partition.") raise ValueError(f"{msg}. File type not support in partition.")