mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-16 02:17:21 +00:00
feat: add support for .txt files in partition (#150)
* added partition_text for auto * rename partition_text tests * bump version and update docs
This commit is contained in:
parent
eba4c80b1e
commit
f12240c5e7
@ -1,3 +1,7 @@
|
||||
## 0.4.1-dev0
|
||||
|
||||
* Added support for text files in the `partition` function
|
||||
|
||||
## 0.4.0
|
||||
|
||||
* Added generic `partition` brick that detects the file type and routes a file to the appropriate
|
||||
|
||||
@ -62,7 +62,7 @@ To install the library, run `pip install unstructured`.
|
||||
You can run this [Colab notebook](https://colab.research.google.com/drive/1RnXEiSTUaru8vZSGbh1U2T2P9aUa5tQD#scrollTo=E_WN7p3JGcLJ) to run the examples below.
|
||||
|
||||
The following examples show how to get started with the `unstructured` library.
|
||||
You can parse **HTML**, **PDF**, **EML** and **DOCX** documents with one line of code!
|
||||
You can parse **TXT**, **HTML**, **PDF**, **EML** and **DOCX** documents with one line of code!
|
||||
<br></br>
|
||||
See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description
|
||||
of the features in the library.
|
||||
@ -76,7 +76,7 @@ If you are using the `partition` brick, ensure you first install `libmagic` usin
|
||||
instructions outlined [here](https://unstructured-io.github.io/unstructured/installing.html#filetype-detection)
|
||||
`partition` will always apply the default arguments. If you need
|
||||
advanced features, use a document-specific brick. The `partition` brick currently works for
|
||||
`.docx`, `eml`, `.html`, and `.pdf` documents.
|
||||
`.txt`, `.docx`, `.eml`, `.html`, and `.pdf` documents.
|
||||
|
||||
```python
|
||||
from unstructured.partition.auto import partition
|
||||
|
||||
@ -22,6 +22,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
|
||||
file type and route it to the appropriate partitioning brick. All partitioning bricks
|
||||
called within ``partition`` are called using the default kwargs. Use the document-type
|
||||
specific bricks if you need to apply non-default settings.
|
||||
``partition`` currently supports ``.docx``, ``.eml``, ``.html``, ``.pdf``, and ``.txt`` files.
|
||||
|
||||
|
||||
.. code:: python
|
||||
|
||||
@ -113,7 +113,28 @@ def test_auto_partition_html_from_file_rb():
|
||||
assert len(elements) > 0
|
||||
|
||||
|
||||
def test_auto_partition_pdf():
|
||||
EXPECTED_TEXT_OUTPUT = [
|
||||
NarrativeText(text="This is a test document to use for unit tests."),
|
||||
Title(text="Important points:"),
|
||||
ListItem(text="Hamburgers are delicious"),
|
||||
ListItem(text="Dogs are the best"),
|
||||
ListItem(text="I love fuzzy blankets"),
|
||||
]
|
||||
|
||||
|
||||
def test_auto_partition_text_from_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
||||
elements = partition(filename=filename)
|
||||
assert len(elements) > 0
|
||||
assert elements == EXPECTED_TEXT_OUTPUT
|
||||
|
||||
|
||||
def test_auto_partition_text_from_file():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
||||
with open(filename, "r") as f:
|
||||
elements = partition(file=f)
|
||||
assert len(elements) > 0
|
||||
assert elements == EXPECTED_TEXT_OUTPUT
|
||||
filename = os.path.join(
|
||||
EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf"
|
||||
)
|
||||
|
||||
@ -16,14 +16,14 @@ EXPECTED_OUTPUT = [
|
||||
]
|
||||
|
||||
|
||||
def test_partition_email_from_filename():
|
||||
def test_partition_text_from_filename():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
||||
elements = partition_text(filename=filename)
|
||||
assert len(elements) > 0
|
||||
assert elements == EXPECTED_OUTPUT
|
||||
|
||||
|
||||
def test_partition_email_from_file():
|
||||
def test_partition_text_from_file():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
||||
with open(filename, "r") as f:
|
||||
elements = partition_text(file=f)
|
||||
@ -31,7 +31,7 @@ def test_partition_email_from_file():
|
||||
assert elements == EXPECTED_OUTPUT
|
||||
|
||||
|
||||
def test_partition_email_from_text():
|
||||
def test_partition_text_from_text():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
||||
with open(filename, "r") as f:
|
||||
text = f.read()
|
||||
@ -40,12 +40,12 @@ def test_partition_email_from_text():
|
||||
assert elements == EXPECTED_OUTPUT
|
||||
|
||||
|
||||
def test_partition_email_raises_with_none_specified():
|
||||
def test_partition_text_raises_with_none_specified():
|
||||
with pytest.raises(ValueError):
|
||||
partition_text()
|
||||
|
||||
|
||||
def test_partition_email_raises_with_too_many_specified():
|
||||
def test_partition_text_raises_with_too_many_specified():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
||||
with open(filename, "r") as f:
|
||||
text = f.read()
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.4.0" # pragma: no cover
|
||||
__version__ = "0.4.1-dev0" # pragma: no cover
|
||||
|
||||
@ -5,6 +5,7 @@ from unstructured.partition.docx import partition_docx
|
||||
from unstructured.partition.email import partition_email
|
||||
from unstructured.partition.html import partition_html
|
||||
from unstructured.partition.pdf import partition_pdf
|
||||
from unstructured.partition.text import partition_text
|
||||
|
||||
|
||||
def partition(filename: Optional[str] = None, file: Optional[IO] = None):
|
||||
@ -33,6 +34,8 @@ def partition(filename: Optional[str] = None, file: Optional[IO] = None):
|
||||
return partition_html(filename=filename, file=file)
|
||||
elif filetype == FileType.PDF:
|
||||
return partition_pdf(filename=filename, file=file, url=None) # type: ignore
|
||||
elif filetype == FileType.TXT:
|
||||
return partition_text(filename=filename, file=file)
|
||||
else:
|
||||
msg = "Invalid file" if not filename else f"Invalid file {filename}"
|
||||
raise ValueError(f"{msg}. File type not support in partition.")
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user