feat: add support for .txt files in partition (#150)

* added partition_text for auto

* rename partition_text tests

* bump version and update docs
This commit is contained in:
Matt Robinson 2023-01-13 16:39:53 -05:00 committed by GitHub
parent eba4c80b1e
commit f12240c5e7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 55 additions and 26 deletions

View File

@ -1,3 +1,7 @@
## 0.4.1-dev0
* Added support for text files in the `partition` function
## 0.4.0 ## 0.4.0
* Added generic `partition` brick that detects the file type and routes a file to the appropriate * Added generic `partition` brick that detects the file type and routes a file to the appropriate

View File

@ -62,7 +62,7 @@ To install the library, run `pip install unstructured`.
You can run this [Colab notebook](https://colab.research.google.com/drive/1RnXEiSTUaru8vZSGbh1U2T2P9aUa5tQD#scrollTo=E_WN7p3JGcLJ) to run the examples below. You can run this [Colab notebook](https://colab.research.google.com/drive/1RnXEiSTUaru8vZSGbh1U2T2P9aUa5tQD#scrollTo=E_WN7p3JGcLJ) to run the examples below.
The following examples show how to get started with the `unstructured` library. The following examples show how to get started with the `unstructured` library.
You can parse **HTML**, **PDF**, **EML** and **DOCX** documents with one line of code! You can parse **TXT**, **HTML**, **PDF**, **EML** and **DOCX** documents with one line of code!
<br></br> <br></br>
See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description
of the features in the library. of the features in the library.
@ -76,7 +76,7 @@ If you are using the `partition` brick, ensure you first install `libmagic` usin
instructions outlined [here](https://unstructured-io.github.io/unstructured/installing.html#filetype-detection) instructions outlined [here](https://unstructured-io.github.io/unstructured/installing.html#filetype-detection)
`partition` will always apply the default arguments. If you need `partition` will always apply the default arguments. If you need
advanced features, use a document-specific brick. The `partition` brick currently works for advanced features, use a document-specific brick. The `partition` brick currently works for
`.docx`, `.eml`, `.html`, and `.pdf` documents. `.txt`, `.docx`, `.eml`, `.html`, and `.pdf` documents.
```python ```python
from unstructured.partition.auto import partition from unstructured.partition.auto import partition

View File

@ -22,6 +22,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
file type and route it to the appropriate partitioning brick. All partitioning bricks file type and route it to the appropriate partitioning brick. All partitioning bricks
called within ``partition`` are called using the default kwargs. Use the document-type called within ``partition`` are called using the default kwargs. Use the document-type
specific bricks if you need to apply non-default settings. specific bricks if you need to apply non-default settings.
``partition`` currently supports ``.docx``, ``.eml``, ``.html``, ``.pdf``, and ``.txt`` files.
.. code:: python .. code:: python
@ -104,7 +105,7 @@ Examples:
``partition_pdf`` ``partition_pdf``
--------------------- ---------------------
The ``partition_pdf`` function segments a PDF document by calling the document image analysis API. The ``partition_pdf`` function segments a PDF document by calling the document image analysis API.
The intent of the parameters ``url`` and ``token`` is to allow users to self host an inference API, The intent of the parameters ``url`` and ``token`` is to allow users to self host an inference API,
if desired. if desired.
@ -122,7 +123,7 @@ Examples:
--------------------- ---------------------
The ``partition_email`` function partitions ``.eml`` documents and works with exports The ``partition_email`` function partitions ``.eml`` documents and works with exports
from email clients such as Microsoft Outlook and Gmail. The ``partition_email`` from email clients such as Microsoft Outlook and Gmail. The ``partition_email``
takes a filename, file-like object, or raw text as input and produces a list of takes a filename, file-like object, or raw text as input and produces a list of
document ``Element`` objects as output. Also ``content_source`` can be set to ``text/html`` document ``Element`` objects as output. Also ``content_source`` can be set to ``text/html``
(default) or ``text/plain`` to process the html or plain text version of the email, respectively. (default) or ``text/plain`` to process the html or plain text version of the email, respectively.
@ -157,7 +158,7 @@ Examples:
``partition_text`` ``partition_text``
--------------------- ---------------------
The ``partition_text`` function partitions text files. The ``partition_text`` The ``partition_text`` function partitions text files. The ``partition_text``
takes a filename, file-like object, or raw text as input and produces ``Element`` objects as output. takes a filename, file-like object, or raw text as input and produces ``Element`` objects as output.
Examples: Examples:
@ -629,7 +630,7 @@ addresses in the input string.
from unstructured.cleaners.extract import extract_email_address from unstructured.cleaners.extract import extract_email_address
text = """Me me@email.com and You <You@email.com> text = """Me me@email.com and You <You@email.com>
([ba23::58b5:2236:45g2:88h2]) (10.0.2.01)""" ([ba23::58b5:2236:45g2:88h2]) (10.0.2.01)"""
# Returns "['me@email.com', 'you@email.com']" # Returns "['me@email.com', 'you@email.com']"
@ -646,7 +647,7 @@ returns a list of all IP address in input string.
from unstructured.cleaners.extract import extract_ip_address from unstructured.cleaners.extract import extract_ip_address
text = """Me me@email.com and You <You@email.com> text = """Me me@email.com and You <You@email.com>
([ba23::58b5:2236:45g2:88h2]) (10.0.2.01)""" ([ba23::58b5:2236:45g2:88h2]) (10.0.2.01)"""
# Returns "['ba23::58b5:2236:45g2:88h2', '10.0.2.01']" # Returns "['ba23::58b5:2236:45g2:88h2', '10.0.2.01']"
@ -656,7 +657,7 @@ returns a list of all IP address in input string.
``extract_ip_address_name`` ``extract_ip_address_name``
---------------------------- ----------------------------
Extracts the names of each IP address in the ``Received`` field(s) from an ``.eml`` Extracts the names of each IP address in the ``Received`` field(s) from an ``.eml``
file. ``extract_ip_address_name`` takes in a string and returns a list of all file. ``extract_ip_address_name`` takes in a string and returns a list of all
IP addresses in the input string. IP addresses in the input string.
@ -675,7 +676,7 @@ IP addresses in the input string.
``extract_mapi_id`` ``extract_mapi_id``
---------------------- ----------------------
Extracts the ``mapi id`` in the ``Received`` field(s) from an ``.eml`` Extracts the ``mapi id`` in the ``Received`` field(s) from an ``.eml``
file. ``extract_mapi_id`` takes in a string and returns a list of a string file. ``extract_mapi_id`` takes in a string and returns a list of a string
containing the ``mapi id`` in the input string. containing the ``mapi id`` in the input string.
@ -694,7 +695,7 @@ containing the ``mapi id`` in the input string.
``extract_datetimetz`` ``extract_datetimetz``
---------------------- ----------------------
Extracts the date, time, and timezone in the ``Received`` field(s) from an ``.eml`` Extracts the date, time, and timezone in the ``Received`` field(s) from an ``.eml``
file. ``extract_datetimetz`` takes in a string and returns a datetime.datetime file. ``extract_datetimetz`` takes in a string and returns a datetime.datetime
object from the input string. object from the input string.
@ -754,7 +755,7 @@ other languages.
Parameters: Parameters:
* ``text``: the input string to translate. * ``text``: the input string to translate.
* ``source_lang``: the two letter language code for the source language of the text. * ``source_lang``: the two letter language code for the source language of the text.
If ``source_lang`` is not specified, If ``source_lang`` is not specified,
the language will be detected using ``langdetect``. the language will be detected using ``langdetect``.
* ``target_lang``: the two letter language code for the target language for translation. * ``target_lang``: the two letter language code for the target language for translation.
@ -857,7 +858,7 @@ Examples:
-------------------------- --------------------------
Prepares ``Text`` elements for processing in ``transformers`` pipelines Prepares ``Text`` elements for processing in ``transformers`` pipelines
by splitting the elements into chunks that fit into the model's attention window. by splitting the elements into chunks that fit into the model's attention window.
Examples: Examples:
@ -960,7 +961,7 @@ Examples:
json.dump(label_studio_data, f, indent=4) json.dump(label_studio_data, f, indent=4)
You can also include pre-annotations and predictions as part of your LabelStudio upload. You can also include pre-annotations and predictions as part of your LabelStudio upload.
The ``annotations`` kwarg is a list of lists. If ``annotations`` is specified, there must be a list of The ``annotations`` kwarg is a list of lists. If ``annotations`` is specified, there must be a list of
annotations for each element in the ``elements`` list. If an element does not have any annotations, annotations for each element in the ``elements`` list. If an element does not have any annotations,
@ -1009,7 +1010,7 @@ task in LabelStudio:
Similar to annotations, the ``predictions`` kwarg is also a list of lists. A ``prediction`` is an annotation with Similar to annotations, the ``predictions`` kwarg is also a list of lists. A ``prediction`` is an annotation with
the addition of a ``score`` value. If ``predictions`` is specified, there must be a list of the addition of a ``score`` value. If ``predictions`` is specified, there must be a list of
predictions for each element in the ``elements`` list. If an element does not have any predictions, use an empty list. predictions for each element in the ``elements`` list. If an element does not have any predictions, use an empty list.
The following shows an example of how to upload predictions for the "Text Classification" The following shows an example of how to upload predictions for the "Text Classification"
task in LabelStudio: task in LabelStudio:
@ -1167,13 +1168,13 @@ Examples:
``stage_for_label_box`` ``stage_for_label_box``
-------------------------- --------------------------
Formats outputs for use with `LabelBox <https://docs.labelbox.com/docs/overview>`_. LabelBox accepts cloud-hosted data Formats outputs for use with `LabelBox <https://docs.labelbox.com/docs/overview>`_. LabelBox accepts cloud-hosted data
and does not support importing text directly. The ``stage_for_label_box`` does the following: and does not support importing text directly. The ``stage_for_label_box`` does the following:
* Stages the data files in the ``output_directory`` specified in function arguments to be uploaded to a cloud storage service. * Stages the data files in the ``output_directory`` specified in function arguments to be uploaded to a cloud storage service.
* Returns a config of type ``List[Dict[str, Any]]`` that can be written to a ``json`` file and imported into LabelBox. * Returns a config of type ``List[Dict[str, Any]]`` that can be written to a ``json`` file and imported into LabelBox.
**Note:** ``stage_for_label_box`` does not upload the data to remote storage such as S3. Users can upload the data to S3 **Note:** ``stage_for_label_box`` does not upload the data to remote storage such as S3. Users can upload the data to S3
using ``aws s3 sync ${output_directory} ${url_prefix}`` after running the ``stage_for_label_box`` staging brick. using ``aws s3 sync ${output_directory} ${url_prefix}`` after running the ``stage_for_label_box`` staging brick.
Examples: Examples:
@ -1197,7 +1198,7 @@ files to an S3 bucket.
# The URL prefix where the data files will be accessed. # The URL prefix where the data files will be accessed.
S3_URL_PREFIX = f"https://{S3_BUCKET_NAME}.s3.amazonaws.com/{S3_BUCKET_KEY_PREFIX}" S3_URL_PREFIX = f"https://{S3_BUCKET_NAME}.s3.amazonaws.com/{S3_BUCKET_KEY_PREFIX}"
# The local output directory where the data files will be staged for uploading to a Cloud Storage service. # The local output directory where the data files will be staged for uploading to a Cloud Storage service.
LOCAL_OUTPUT_DIRECTORY = "/tmp/labelbox-staging" LOCAL_OUTPUT_DIRECTORY = "/tmp/labelbox-staging"
@ -1232,7 +1233,7 @@ files to an S3 bucket.
-------------------------- --------------------------
Formats a list of ``Text`` elements as input to token based tasks in Datasaur. Formats a list of ``Text`` elements as input to token based tasks in Datasaur.
Example: Example:
.. code:: python .. code:: python
@ -1243,7 +1244,7 @@ Example:
datasaur_data = stage_for_datasaur(elements) datasaur_data = stage_for_datasaur(elements)
The output is a list of dictionaries, each one with two keys: The output is a list of dictionaries, each one with two keys:
"text" with the content of the element and "text" with the content of the element and
"entities" with an empty list. "entities" with an empty list.
You can also specify entities in the ``stage_for_datasaur`` brick. Entities You can also specify entities in the ``stage_for_datasaur`` brick. Entities

View File

@ -113,7 +113,28 @@ def test_auto_partition_html_from_file_rb():
assert len(elements) > 0 assert len(elements) > 0
def test_auto_partition_pdf(): EXPECTED_TEXT_OUTPUT = [
NarrativeText(text="This is a test document to use for unit tests."),
Title(text="Important points:"),
ListItem(text="Hamburgers are delicious"),
ListItem(text="Dogs are the best"),
ListItem(text="I love fuzzy blankets"),
]
def test_auto_partition_text_from_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
elements = partition(filename=filename)
assert len(elements) > 0
assert elements == EXPECTED_TEXT_OUTPUT
def test_auto_partition_text_from_file():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
with open(filename, "r") as f:
elements = partition(file=f)
assert len(elements) > 0
assert elements == EXPECTED_TEXT_OUTPUT
filename = os.path.join( filename = os.path.join(
EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf" EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf"
) )

View File

@ -16,14 +16,14 @@ EXPECTED_OUTPUT = [
] ]
def test_partition_email_from_filename(): def test_partition_text_from_filename():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt") filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
elements = partition_text(filename=filename) elements = partition_text(filename=filename)
assert len(elements) > 0 assert len(elements) > 0
assert elements == EXPECTED_OUTPUT assert elements == EXPECTED_OUTPUT
def test_partition_email_from_file(): def test_partition_text_from_file():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt") filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
with open(filename, "r") as f: with open(filename, "r") as f:
elements = partition_text(file=f) elements = partition_text(file=f)
@ -31,7 +31,7 @@ def test_partition_email_from_file():
assert elements == EXPECTED_OUTPUT assert elements == EXPECTED_OUTPUT
def test_partition_email_from_text(): def test_partition_text_from_text():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt") filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
with open(filename, "r") as f: with open(filename, "r") as f:
text = f.read() text = f.read()
@ -40,12 +40,12 @@ def test_partition_email_from_text():
assert elements == EXPECTED_OUTPUT assert elements == EXPECTED_OUTPUT
def test_partition_email_raises_with_none_specified(): def test_partition_text_raises_with_none_specified():
with pytest.raises(ValueError): with pytest.raises(ValueError):
partition_text() partition_text()
def test_partition_email_raises_with_too_many_specified(): def test_partition_text_raises_with_too_many_specified():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt") filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
with open(filename, "r") as f: with open(filename, "r") as f:
text = f.read() text = f.read()

View File

@ -1 +1 @@
__version__ = "0.4.0" # pragma: no cover __version__ = "0.4.1-dev0" # pragma: no cover

View File

@ -5,6 +5,7 @@ from unstructured.partition.docx import partition_docx
from unstructured.partition.email import partition_email from unstructured.partition.email import partition_email
from unstructured.partition.html import partition_html from unstructured.partition.html import partition_html
from unstructured.partition.pdf import partition_pdf from unstructured.partition.pdf import partition_pdf
from unstructured.partition.text import partition_text
def partition(filename: Optional[str] = None, file: Optional[IO] = None): def partition(filename: Optional[str] = None, file: Optional[IO] = None):
@ -33,6 +34,8 @@ def partition(filename: Optional[str] = None, file: Optional[IO] = None):
return partition_html(filename=filename, file=file) return partition_html(filename=filename, file=file)
elif filetype == FileType.PDF: elif filetype == FileType.PDF:
return partition_pdf(filename=filename, file=file, url=None) # type: ignore return partition_pdf(filename=filename, file=file, url=None) # type: ignore
elif filetype == FileType.TXT:
return partition_text(filename=filename, file=file)
else: else:
msg = "Invalid file" if not filename else f"Invalid file {filename}" msg = "Invalid file" if not filename else f"Invalid file {filename}"
raise ValueError(f"{msg}. File type not support in partition.") raise ValueError(f"{msg}. File type not support in partition.")