mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-15 18:07:26 +00:00
feat: add support for .txt files in partition (#150)
* added partition_text for auto * rename partition_text tests * bump version and update docs
This commit is contained in:
parent
eba4c80b1e
commit
f12240c5e7
@ -1,3 +1,7 @@
|
|||||||
|
## 0.4.1-dev0
|
||||||
|
|
||||||
|
* Added support for text files in the `partition` function
|
||||||
|
|
||||||
## 0.4.0
|
## 0.4.0
|
||||||
|
|
||||||
* Added generic `partition` brick that detects the file type and routes a file to the appropriate
|
* Added generic `partition` brick that detects the file type and routes a file to the appropriate
|
||||||
|
|||||||
@ -62,7 +62,7 @@ To install the library, run `pip install unstructured`.
|
|||||||
You can run this [Colab notebook](https://colab.research.google.com/drive/1RnXEiSTUaru8vZSGbh1U2T2P9aUa5tQD#scrollTo=E_WN7p3JGcLJ) to run the examples below.
|
You can run this [Colab notebook](https://colab.research.google.com/drive/1RnXEiSTUaru8vZSGbh1U2T2P9aUa5tQD#scrollTo=E_WN7p3JGcLJ) to run the examples below.
|
||||||
|
|
||||||
The following examples show how to get started with the `unstructured` library.
|
The following examples show how to get started with the `unstructured` library.
|
||||||
You can parse **HTML**, **PDF**, **EML** and **DOCX** documents with one line of code!
|
You can parse **TXT**, **HTML**, **PDF**, **EML** and **DOCX** documents with one line of code!
|
||||||
<br></br>
|
<br></br>
|
||||||
See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description
|
See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description
|
||||||
of the features in the library.
|
of the features in the library.
|
||||||
@ -76,7 +76,7 @@ If you are using the `partition` brick, ensure you first install `libmagic` usin
|
|||||||
instructions outlined [here](https://unstructured-io.github.io/unstructured/installing.html#filetype-detection)
|
instructions outlined [here](https://unstructured-io.github.io/unstructured/installing.html#filetype-detection)
|
||||||
`partition` will always apply the default arguments. If you need
|
`partition` will always apply the default arguments. If you need
|
||||||
advanced features, use a document-specific brick. The `partition` brick currently works for
|
advanced features, use a document-specific brick. The `partition` brick currently works for
|
||||||
`.docx`, `eml`, `.html`, and `.pdf` documents.
|
`.txt`, `.docx`, `.eml`, `.html`, and `.pdf` documents.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from unstructured.partition.auto import partition
|
from unstructured.partition.auto import partition
|
||||||
|
|||||||
@ -22,6 +22,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
|
|||||||
file type and route it to the appropriate partitioning brick. All partitioning bricks
|
file type and route it to the appropriate partitioning brick. All partitioning bricks
|
||||||
called within ``partition`` are called using the default kwargs. Use the document-type
|
called within ``partition`` are called using the default kwargs. Use the document-type
|
||||||
specific bricks if you need to apply non-default settings.
|
specific bricks if you need to apply non-default settings.
|
||||||
|
``partition`` currently supports ``.docx``, ``.eml``, ``.html``, ``.pdf``, and ``.txt`` files.
|
||||||
|
|
||||||
|
|
||||||
.. code:: python
|
.. code:: python
|
||||||
@ -104,7 +105,7 @@ Examples:
|
|||||||
``partition_pdf``
|
``partition_pdf``
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
The ``partition_pdf`` function segments a PDF document by calling the document image analysis API.
|
The ``partition_pdf`` function segments a PDF document by calling the document image analysis API.
|
||||||
The intent of the parameters ``url`` and ``token`` is to allow users to self host an inference API,
|
The intent of the parameters ``url`` and ``token`` is to allow users to self host an inference API,
|
||||||
if desired.
|
if desired.
|
||||||
|
|
||||||
@ -122,7 +123,7 @@ Examples:
|
|||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
The ``partition_email`` function partitions ``.eml`` documents and works with exports
|
The ``partition_email`` function partitions ``.eml`` documents and works with exports
|
||||||
from email clients such as Microsoft Outlook and Gmail. The ``partition_email``
|
from email clients such as Microsoft Outlook and Gmail. The ``partition_email``
|
||||||
takes a filename, file-like object, or raw text as input and produces a list of
|
takes a filename, file-like object, or raw text as input and produces a list of
|
||||||
document ``Element`` objects as output. Also ``content_source`` can be set to ``text/html``
|
document ``Element`` objects as output. Also ``content_source`` can be set to ``text/html``
|
||||||
(default) or ``text/plain`` to process the html or plain text version of the email, respectively.
|
(default) or ``text/plain`` to process the html or plain text version of the email, respectively.
|
||||||
@ -157,7 +158,7 @@ Examples:
|
|||||||
``partition_text``
|
``partition_text``
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
The ``partition_text`` function partitions text files. The ``partition_text``
|
The ``partition_text`` function partitions text files. The ``partition_text``
|
||||||
takes a filename, file-like object, or raw text as input and produces ``Element`` objects as output.
|
takes a filename, file-like object, or raw text as input and produces ``Element`` objects as output.
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
@ -629,7 +630,7 @@ addresses in the input string.
|
|||||||
|
|
||||||
from unstructured.cleaners.extract import extract_email_address
|
from unstructured.cleaners.extract import extract_email_address
|
||||||
|
|
||||||
text = """Me me@email.com and You <You@email.com>
|
text = """Me me@email.com and You <You@email.com>
|
||||||
([ba23::58b5:2236:45g2:88h2]) (10.0.2.01)"""
|
([ba23::58b5:2236:45g2:88h2]) (10.0.2.01)"""
|
||||||
|
|
||||||
# Returns "['me@email.com', 'you@email.com']"
|
# Returns "['me@email.com', 'you@email.com']"
|
||||||
@ -646,7 +647,7 @@ returns a list of all IP address in input string.
|
|||||||
|
|
||||||
from unstructured.cleaners.extract import extract_ip_address
|
from unstructured.cleaners.extract import extract_ip_address
|
||||||
|
|
||||||
text = """Me me@email.com and You <You@email.com>
|
text = """Me me@email.com and You <You@email.com>
|
||||||
([ba23::58b5:2236:45g2:88h2]) (10.0.2.01)"""
|
([ba23::58b5:2236:45g2:88h2]) (10.0.2.01)"""
|
||||||
|
|
||||||
# Returns "['ba23::58b5:2236:45g2:88h2', '10.0.2.01']"
|
# Returns "['ba23::58b5:2236:45g2:88h2', '10.0.2.01']"
|
||||||
@ -656,7 +657,7 @@ returns a list of all IP address in input string.
|
|||||||
``extract_ip_address_name``
|
``extract_ip_address_name``
|
||||||
----------------------------
|
----------------------------
|
||||||
|
|
||||||
Extracts the names of each IP address in the ``Received`` field(s) from an ``.eml``
|
Extracts the names of each IP address in the ``Received`` field(s) from an ``.eml``
|
||||||
file. ``extract_ip_address_name`` takes in a string and returns a list of all
|
file. ``extract_ip_address_name`` takes in a string and returns a list of all
|
||||||
IP addresses in the input string.
|
IP addresses in the input string.
|
||||||
|
|
||||||
@ -675,7 +676,7 @@ IP addresses in the input string.
|
|||||||
``extract_mapi_id``
|
``extract_mapi_id``
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
Extracts the ``mapi id`` in the ``Received`` field(s) from an ``.eml``
|
Extracts the ``mapi id`` in the ``Received`` field(s) from an ``.eml``
|
||||||
file. ``extract_mapi_id`` takes in a string and returns a list of a string
|
file. ``extract_mapi_id`` takes in a string and returns a list of a string
|
||||||
containing the ``mapi id`` in the input string.
|
containing the ``mapi id`` in the input string.
|
||||||
|
|
||||||
@ -694,7 +695,7 @@ containing the ``mapi id`` in the input string.
|
|||||||
``extract_datetimetz``
|
``extract_datetimetz``
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
Extracts the date, time, and timezone in the ``Received`` field(s) from an ``.eml``
|
Extracts the date, time, and timezone in the ``Received`` field(s) from an ``.eml``
|
||||||
file. ``extract_datetimetz`` takes in a string and returns a datetime.datetime
|
file. ``extract_datetimetz`` takes in a string and returns a datetime.datetime
|
||||||
object from the input string.
|
object from the input string.
|
||||||
|
|
||||||
@ -754,7 +755,7 @@ other languages.
|
|||||||
Parameters:
|
Parameters:
|
||||||
|
|
||||||
* ``text``: the input string to translate.
|
* ``text``: the input string to translate.
|
||||||
* ``source_lang``: the two letter language code for the source language of the text.
|
* ``source_lang``: the two letter language code for the source language of the text.
|
||||||
If ``source_lang`` is not specified,
|
If ``source_lang`` is not specified,
|
||||||
the language will be detected using ``langdetect``.
|
the language will be detected using ``langdetect``.
|
||||||
* ``target_lang``: the two letter language code for the target language for translation.
|
* ``target_lang``: the two letter language code for the target language for translation.
|
||||||
@ -857,7 +858,7 @@ Examples:
|
|||||||
--------------------------
|
--------------------------
|
||||||
|
|
||||||
Prepares ``Text`` elements for processing in ``transformers`` pipelines
|
Prepares ``Text`` elements for processing in ``transformers`` pipelines
|
||||||
by splitting the elements into chunks that fit into the model's attention window.
|
by splitting the elements into chunks that fit into the model's attention window.
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
|
|
||||||
@ -960,7 +961,7 @@ Examples:
|
|||||||
json.dump(label_studio_data, f, indent=4)
|
json.dump(label_studio_data, f, indent=4)
|
||||||
|
|
||||||
|
|
||||||
You can also include pre-annotations and predictions as part of your LabelStudio upload.
|
You can also include pre-annotations and predictions as part of your LabelStudio upload.
|
||||||
|
|
||||||
The ``annotations`` kwarg is a list of lists. If ``annotations`` is specified, there must be a list of
|
The ``annotations`` kwarg is a list of lists. If ``annotations`` is specified, there must be a list of
|
||||||
annotations for each element in the ``elements`` list. If an element does not have any annotations,
|
annotations for each element in the ``elements`` list. If an element does not have any annotations,
|
||||||
@ -1009,7 +1010,7 @@ task in LabelStudio:
|
|||||||
|
|
||||||
Similar to annotations, the ``predictions`` kwarg is also a list of lists. A ``prediction`` is an annotation with
|
Similar to annotations, the ``predictions`` kwarg is also a list of lists. A ``prediction`` is an annotation with
|
||||||
the addition of a ``score`` value. If ``predictions`` is specified, there must be a list of
|
the addition of a ``score`` value. If ``predictions`` is specified, there must be a list of
|
||||||
predictions for each element in the ``elements`` list. If an element does not have any predictions, use an empty list.
|
predictions for each element in the ``elements`` list. If an element does not have any predictions, use an empty list.
|
||||||
The following shows an example of how to upload predictions for the "Text Classification"
|
The following shows an example of how to upload predictions for the "Text Classification"
|
||||||
task in LabelStudio:
|
task in LabelStudio:
|
||||||
|
|
||||||
@ -1167,13 +1168,13 @@ Examples:
|
|||||||
``stage_for_label_box``
|
``stage_for_label_box``
|
||||||
--------------------------
|
--------------------------
|
||||||
|
|
||||||
Formats outputs for use with `LabelBox <https://docs.labelbox.com/docs/overview>`_. LabelBox accepts cloud-hosted data
|
Formats outputs for use with `LabelBox <https://docs.labelbox.com/docs/overview>`_. LabelBox accepts cloud-hosted data
|
||||||
and does not support importing text directly. The ``stage_for_label_box`` does the following:
|
and does not support importing text directly. The ``stage_for_label_box`` does the following:
|
||||||
|
|
||||||
* Stages the data files in the ``output_directory`` specified in function arguments to be uploaded to a cloud storage service.
|
* Stages the data files in the ``output_directory`` specified in function arguments to be uploaded to a cloud storage service.
|
||||||
* Returns a config of type ``List[Dict[str, Any]]`` that can be written to a ``json`` file and imported into LabelBox.
|
* Returns a config of type ``List[Dict[str, Any]]`` that can be written to a ``json`` file and imported into LabelBox.
|
||||||
|
|
||||||
**Note:** ``stage_for_label_box`` does not upload the data to remote storage such as S3. Users can upload the data to S3
|
**Note:** ``stage_for_label_box`` does not upload the data to remote storage such as S3. Users can upload the data to S3
|
||||||
using ``aws s3 sync ${output_directory} ${url_prefix}`` after running the ``stage_for_label_box`` staging brick.
|
using ``aws s3 sync ${output_directory} ${url_prefix}`` after running the ``stage_for_label_box`` staging brick.
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
@ -1197,7 +1198,7 @@ files to an S3 bucket.
|
|||||||
|
|
||||||
# The URL prefix where the data files will be accessed.
|
# The URL prefix where the data files will be accessed.
|
||||||
S3_URL_PREFIX = f"https://{S3_BUCKET_NAME}.s3.amazonaws.com/{S3_BUCKET_KEY_PREFIX}"
|
S3_URL_PREFIX = f"https://{S3_BUCKET_NAME}.s3.amazonaws.com/{S3_BUCKET_KEY_PREFIX}"
|
||||||
|
|
||||||
# The local output directory where the data files will be staged for uploading to a Cloud Storage service.
|
# The local output directory where the data files will be staged for uploading to a Cloud Storage service.
|
||||||
LOCAL_OUTPUT_DIRECTORY = "/tmp/labelbox-staging"
|
LOCAL_OUTPUT_DIRECTORY = "/tmp/labelbox-staging"
|
||||||
|
|
||||||
@ -1232,7 +1233,7 @@ files to an S3 bucket.
|
|||||||
--------------------------
|
--------------------------
|
||||||
Formats a list of ``Text`` elements as input to token based tasks in Datasaur.
|
Formats a list of ``Text`` elements as input to token based tasks in Datasaur.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
.. code:: python
|
.. code:: python
|
||||||
|
|
||||||
@ -1243,7 +1244,7 @@ Example:
|
|||||||
datasaur_data = stage_for_datasaur(elements)
|
datasaur_data = stage_for_datasaur(elements)
|
||||||
|
|
||||||
The output is a list of dictionaries, each one with two keys:
|
The output is a list of dictionaries, each one with two keys:
|
||||||
"text" with the content of the element and
|
"text" with the content of the element and
|
||||||
"entities" with an empty list.
|
"entities" with an empty list.
|
||||||
|
|
||||||
You can also specify entities in the ``stage_for_datasaur`` brick. Entities
|
You can also specify entities in the ``stage_for_datasaur`` brick. Entities
|
||||||
|
|||||||
@ -113,7 +113,28 @@ def test_auto_partition_html_from_file_rb():
|
|||||||
assert len(elements) > 0
|
assert len(elements) > 0
|
||||||
|
|
||||||
|
|
||||||
def test_auto_partition_pdf():
|
EXPECTED_TEXT_OUTPUT = [
|
||||||
|
NarrativeText(text="This is a test document to use for unit tests."),
|
||||||
|
Title(text="Important points:"),
|
||||||
|
ListItem(text="Hamburgers are delicious"),
|
||||||
|
ListItem(text="Dogs are the best"),
|
||||||
|
ListItem(text="I love fuzzy blankets"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_auto_partition_text_from_filename():
|
||||||
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
||||||
|
elements = partition(filename=filename)
|
||||||
|
assert len(elements) > 0
|
||||||
|
assert elements == EXPECTED_TEXT_OUTPUT
|
||||||
|
|
||||||
|
|
||||||
|
def test_auto_partition_text_from_file():
|
||||||
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
||||||
|
with open(filename, "r") as f:
|
||||||
|
elements = partition(file=f)
|
||||||
|
assert len(elements) > 0
|
||||||
|
assert elements == EXPECTED_TEXT_OUTPUT
|
||||||
filename = os.path.join(
|
filename = os.path.join(
|
||||||
EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf"
|
EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf"
|
||||||
)
|
)
|
||||||
|
|||||||
@ -16,14 +16,14 @@ EXPECTED_OUTPUT = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def test_partition_email_from_filename():
|
def test_partition_text_from_filename():
|
||||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
||||||
elements = partition_text(filename=filename)
|
elements = partition_text(filename=filename)
|
||||||
assert len(elements) > 0
|
assert len(elements) > 0
|
||||||
assert elements == EXPECTED_OUTPUT
|
assert elements == EXPECTED_OUTPUT
|
||||||
|
|
||||||
|
|
||||||
def test_partition_email_from_file():
|
def test_partition_text_from_file():
|
||||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
||||||
with open(filename, "r") as f:
|
with open(filename, "r") as f:
|
||||||
elements = partition_text(file=f)
|
elements = partition_text(file=f)
|
||||||
@ -31,7 +31,7 @@ def test_partition_email_from_file():
|
|||||||
assert elements == EXPECTED_OUTPUT
|
assert elements == EXPECTED_OUTPUT
|
||||||
|
|
||||||
|
|
||||||
def test_partition_email_from_text():
|
def test_partition_text_from_text():
|
||||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
||||||
with open(filename, "r") as f:
|
with open(filename, "r") as f:
|
||||||
text = f.read()
|
text = f.read()
|
||||||
@ -40,12 +40,12 @@ def test_partition_email_from_text():
|
|||||||
assert elements == EXPECTED_OUTPUT
|
assert elements == EXPECTED_OUTPUT
|
||||||
|
|
||||||
|
|
||||||
def test_partition_email_raises_with_none_specified():
|
def test_partition_text_raises_with_none_specified():
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
partition_text()
|
partition_text()
|
||||||
|
|
||||||
|
|
||||||
def test_partition_email_raises_with_too_many_specified():
|
def test_partition_text_raises_with_too_many_specified():
|
||||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
||||||
with open(filename, "r") as f:
|
with open(filename, "r") as f:
|
||||||
text = f.read()
|
text = f.read()
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.4.0" # pragma: no cover
|
__version__ = "0.4.1-dev0" # pragma: no cover
|
||||||
|
|||||||
@ -5,6 +5,7 @@ from unstructured.partition.docx import partition_docx
|
|||||||
from unstructured.partition.email import partition_email
|
from unstructured.partition.email import partition_email
|
||||||
from unstructured.partition.html import partition_html
|
from unstructured.partition.html import partition_html
|
||||||
from unstructured.partition.pdf import partition_pdf
|
from unstructured.partition.pdf import partition_pdf
|
||||||
|
from unstructured.partition.text import partition_text
|
||||||
|
|
||||||
|
|
||||||
def partition(filename: Optional[str] = None, file: Optional[IO] = None):
|
def partition(filename: Optional[str] = None, file: Optional[IO] = None):
|
||||||
@ -33,6 +34,8 @@ def partition(filename: Optional[str] = None, file: Optional[IO] = None):
|
|||||||
return partition_html(filename=filename, file=file)
|
return partition_html(filename=filename, file=file)
|
||||||
elif filetype == FileType.PDF:
|
elif filetype == FileType.PDF:
|
||||||
return partition_pdf(filename=filename, file=file, url=None) # type: ignore
|
return partition_pdf(filename=filename, file=file, url=None) # type: ignore
|
||||||
|
elif filetype == FileType.TXT:
|
||||||
|
return partition_text(filename=filename, file=file)
|
||||||
else:
|
else:
|
||||||
msg = "Invalid file" if not filename else f"Invalid file {filename}"
|
msg = "Invalid file" if not filename else f"Invalid file {filename}"
|
||||||
raise ValueError(f"{msg}. File type not support in partition.")
|
raise ValueError(f"{msg}. File type not support in partition.")
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user