2023-01-17 16:36:44 -06:00
|
|
|
import datetime
|
2023-01-03 11:41:54 -06:00
|
|
|
import email
|
2022-12-19 13:02:44 -05:00
|
|
|
import os
|
|
|
|
import pathlib
|
|
|
|
|
2023-02-27 17:30:54 +01:00
|
|
|
import pytest
|
2023-01-17 16:36:44 -06:00
|
|
|
|
Dynamic ElementMetadata implementation (#2043)
### Executive Summary
The structure of element metadata is currently static, meaning only
predefined fields can appear in the metadata. We would like the
flexibility for end-users, at their own discretion, to define and use
additional metadata fields that make sense for their particular
use-case.
### Concepts
A key concept for dynamic metadata is _known field_. A known-field is
one of those explicitly defined on `ElementMetadata`. Each of these has
a type and can be specified when _constructing_ a new `ElementMetadata`
instance. This is in contrast to an _end-user defined_ (or _ad-hoc_)
metadata field, one not known at "compile" time and added at the
discretion of an end-user to suit the purposes of their application.
An ad-hoc field can only be added by _assignment_ on an already
constructed instance.
### End-user ad-hoc metadata field behaviors
An ad-hoc field can be added to an `ElementMetadata` instance by
assignment:
```python
>>> metadata = ElementMetadata()
>>> metadata.coefficient = 0.536
```
A field added in this way can be accessed by name:
```python
>>> metadata.coefficient
0.536
```
and that field will appear in the JSON/dict for that instance:
```python
>>> metadata = ElementMetadata()
>>> metadata.coefficient = 0.536
>>> metadata.to_dict()
{"coefficient": 0.536}
```
However, accessing a "user-defined" value that has _not_ been assigned
on that instance raises `AttributeError`:
```python
>>> metadata.coeffcient # -- misspelled "coefficient" --
AttributeError: 'ElementMetadata' object has no attribute 'coeffcient'
```
This makes "tagging" a metadata item with a value very convenient, but
entails the proviso that if an end-user wants to add a metadata field to
_some_ elements and not others (sparse population), AND they want to
access that field by name on ANY element and receive `None` where it has
not been assigned, they will need to use an expression like this:
```python
coefficient = metadata.coefficient if hasattr(metadata, "coefficient") else None
```
### Implementation Notes
- **ad-hoc metadata fields** are discarded during consolidation (for
chunking) because we don't have a consolidation strategy defined for
those. We could consider using a default consolidation strategy like
`FIRST` or possibly allow a user to register a strategy (although that
gets hairy in non-private and multiple-memory-space situations.)
- ad-hoc metadata fields **cannot start with an underscore**.
- We have no way to distinguish an ad-hoc field from any "noise" fields
that might appear in a JSON/dict loaded using `.from_dict()`, so unlike
the original (which only loaded known-fields), we'll rehydrate anything
that we find there.
- No real type-safety is possible on ad-hoc fields but the type-checker
does not complain because the type of all ad-hoc fields is `Any` (which
is the best available behavior in my view).
- We may want to consider whether end-users should be able to add ad-hoc
fields to "sub" metadata objects too, like `DataSourceMetadata` and
conceivably `CoordinatesMetadata` (although I'm not immediately seeing a
use-case for the second one).
2023-11-15 13:22:15 -08:00
|
|
|
from test_unstructured.unit_utils import (
|
|
|
|
assert_round_trips_through_JSON,
|
|
|
|
example_doc_path,
|
|
|
|
parse_optional_datetime,
|
|
|
|
)
|
2023-09-11 16:00:14 -05:00
|
|
|
from unstructured.chunking.title import chunk_by_title
|
2023-04-04 14:23:41 -04:00
|
|
|
from unstructured.documents.elements import (
|
|
|
|
ElementMetadata,
|
|
|
|
Image,
|
|
|
|
ListItem,
|
|
|
|
NarrativeText,
|
2023-08-14 11:38:53 -07:00
|
|
|
Text,
|
2023-04-04 14:23:41 -04:00
|
|
|
Title,
|
|
|
|
)
|
2023-01-09 11:08:08 -06:00
|
|
|
from unstructured.documents.email_elements import (
|
|
|
|
MetaData,
|
2023-02-27 17:30:54 +01:00
|
|
|
ReceivedInfo,
|
2023-01-09 11:08:08 -06:00
|
|
|
Recipient,
|
|
|
|
Sender,
|
|
|
|
Subject,
|
|
|
|
)
|
|
|
|
from unstructured.partition.email import (
|
2023-05-11 10:36:25 -04:00
|
|
|
convert_to_iso_8601,
|
2023-01-09 11:08:08 -06:00
|
|
|
extract_attachment_info,
|
|
|
|
partition_email,
|
|
|
|
partition_email_header,
|
|
|
|
)
|
2023-06-29 18:01:12 -04:00
|
|
|
from unstructured.partition.text import partition_text
|
2022-12-19 13:02:44 -05:00
|
|
|
|
2023-06-16 17:52:13 -07:00
|
|
|
# -- Location of this test module and of the email fixtures it reads. --
FILE_DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_DIRECTORY = os.path.join(FILE_DIRECTORY, "..", "..", "example-docs", "eml")


def _header_elements():
    """Return a fresh list of the header elements common to the header expectations."""
    return [
        MetaData(name="MIME-Version", text="1.0"),
        MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"),
        MetaData(
            name="Message-ID",
            text="<CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>",
        ),
        Subject(text="Test Email"),
        Sender(name="Matthew Robinson", text="mrobinson@unstructured.io"),
        Recipient(name="Matthew Robinson", text="mrobinson@unstructured.io"),
        MetaData(
            name="Content-Type",
            text='multipart/alternative; boundary="00000000000095c9b205eff92630"',
        ),
    ]


# -- Body elements the tests below expect from the plain "fake-email" fixtures. --
EXPECTED_OUTPUT = [
    NarrativeText(text="This is a test email to use for unit tests."),
    Title(text="Important points:"),
    ListItem(text="Roses are red"),
    ListItem(text="Violets are blue"),
]

# -- Body elements expected when the email carries an embedded image. --
IMAGE_EXPECTED_OUTPUT = [
    NarrativeText(text="This is a test email to use for unit tests."),
    Title(text="Important points:"),
    NarrativeText(text="hello this is our logo."),
    Image(text="unstructured_logo.png"),
    ListItem(text="Roses are red"),
    ListItem(text="Violets are blue"),
]

# -- Header elements expected for a message with a `Received` header: the received-info
# -- entries come first, followed by the common header elements.
RECEIVED_HEADER_OUTPUT = [
    ReceivedInfo(name="ABCDEFG-000.ABC.guide", text="00.0.0.00"),
    ReceivedInfo(name="ABCDEFG-000.ABC.guide", text="ba23::58b5:2236:45g2:88h2"),
    ReceivedInfo(
        name="received_datetimetz",
        text="2023-02-20 10:03:18+12:00",
        datestamp=datetime.datetime(
            year=2023,
            month=2,
            day=20,
            hour=10,
            minute=3,
            second=18,
            # -- +12:00, i.e. 43200 seconds east of UTC --
            tzinfo=datetime.timezone(datetime.timedelta(hours=12)),
        ),
    ),
    *_header_elements(),
]

# -- Header elements expected for a message without a `Received` header. --
HEADER_EXPECTED_OUTPUT = _header_elements()

# -- Headers followed by body, as produced when `include_headers=True`. --
ALL_EXPECTED_OUTPUT = HEADER_EXPECTED_OUTPUT + EXPECTED_OUTPUT

# -- Attachment descriptors compared against `extract_attachment_info()` output. --
ATTACH_EXPECTED_OUTPUT = [
    {"filename": "fake-attachment.txt", "payload": b"Hey this is a fake attachment!"},
]
|
|
|
|
|
2022-12-19 13:02:44 -05:00
|
|
|
|
|
|
|
def test_partition_email_from_filename():
    """Partitioning `fake-email.eml` by path yields the expected body elements.

    Every element must record the bare file name (not the joined path) in its metadata.
    """
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")

    elements = partition_email(filename=eml_path)

    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    assert all(e.metadata.filename == "fake-email.eml" for e in elements)
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_email_from_filename_with_metadata_filename():
    """A `metadata_filename` argument replaces the file name recorded on every element."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")

    elements = partition_email(filename=eml_path, metadata_filename="test")

    assert len(elements) > 0
    for e in elements:
        assert e.metadata.filename == "test"
|
2022-12-19 13:02:44 -05:00
|
|
|
|
|
|
|
|
2023-07-06 09:49:27 -04:00
|
|
|
def test_partition_email_from_filename_malformed_encoding():
    """The malformed-encoding fixture still partitions to the expected body elements."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-malformed-encoding.eml")

    elements = partition_email(filename=eml_path)

    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
|
|
|
|
|
|
|
|
|
2023-05-30 10:24:02 -07:00
|
|
|
@pytest.mark.parametrize(
    ("filename", "expected_output"),
    [
        ("fake-email-utf-16.eml", EXPECTED_OUTPUT),
        ("fake-email-utf-16-be.eml", EXPECTED_OUTPUT),
        ("fake-email-utf-16-le.eml", EXPECTED_OUTPUT),
        ("fake-email-b64.eml", EXPECTED_OUTPUT),
        ("email-no-utf8-2008-07-16.062410.eml", None),
        ("email-no-utf8-2014-03-17.111517.eml", None),
        ("email-replace-mime-encodings-error-1.eml", None),
        ("email-replace-mime-encodings-error-2.eml", None),
        ("email-replace-mime-encodings-error-3.eml", None),
        ("email-replace-mime-encodings-error-4.eml", None),
        ("email-replace-mime-encodings-error-5.eml", None),
    ],
)
def test_partition_email_from_filename_default_encoding(filename, expected_output):
    """Each fixture partitions from a path without an explicit encoding.

    Cases whose `expected_output` is falsy only need to produce at least one element; the
    others must match `expected_output` exactly. All elements must carry the fixture's
    file name in their metadata.
    """
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)

    elements = partition_email(filename=eml_path)

    assert len(elements) > 0
    if expected_output:
        assert elements == expected_output
    assert all(e.metadata.filename == filename for e in elements)
|
2023-05-30 10:24:02 -07:00
|
|
|
|
|
|
|
|
2022-12-19 13:02:44 -05:00
|
|
|
def test_partition_email_from_file():
    """Partitioning a text-mode file object yields the expected body elements.

    No file name is available from a file object, so `metadata.filename` must be None.
    """
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")

    with open(eml_path) as eml_file:
        elements = partition_email(file=eml_file)

    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    assert all(e.metadata.filename is None for e in elements)
|
2022-12-19 13:02:44 -05:00
|
|
|
|
|
|
|
|
2023-05-30 10:24:02 -07:00
|
|
|
@pytest.mark.parametrize(
    ("filename", "expected_output"),
    [
        ("fake-email-utf-16.eml", EXPECTED_OUTPUT),
        ("fake-email-utf-16-be.eml", EXPECTED_OUTPUT),
        ("fake-email-utf-16-le.eml", EXPECTED_OUTPUT),
        ("fake-email-b64.eml", EXPECTED_OUTPUT),
        ("email-no-utf8-2008-07-16.062410.eml", None),
        ("email-no-utf8-2014-03-17.111517.eml", None),
        ("email-replace-mime-encodings-error-1.eml", None),
        ("email-replace-mime-encodings-error-2.eml", None),
        ("email-replace-mime-encodings-error-3.eml", None),
        ("email-replace-mime-encodings-error-4.eml", None),
        ("email-replace-mime-encodings-error-5.eml", None),
    ],
)
def test_partition_email_from_file_default_encoding(filename, expected_output):
    """Each fixture partitions from a text-mode file object without an explicit encoding.

    Cases whose `expected_output` is falsy only need to produce at least one element; the
    others must match `expected_output` exactly. `metadata.filename` must be None on every
    element.
    """
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)

    with open(eml_path) as eml_file:
        elements = partition_email(file=eml_file)

    assert len(elements) > 0
    if expected_output:
        assert elements == expected_output
    assert all(e.metadata.filename is None for e in elements)
|
2023-05-30 10:24:02 -07:00
|
|
|
|
|
|
|
|
2023-01-09 16:15:14 -05:00
|
|
|
def test_partition_email_from_file_rb():
    """Partitioning a binary-mode file object yields the expected body elements."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")

    with open(eml_path, "rb") as eml_file:
        elements = partition_email(file=eml_file)

    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    assert all(e.metadata.filename is None for e in elements)
|
2023-01-09 16:15:14 -05:00
|
|
|
|
|
|
|
|
2023-05-30 10:24:02 -07:00
|
|
|
# NOTE(review): unlike the path-based and text-mode variants of this test, this parameter
# list has no "fake-email-b64.eml" case -- confirm whether that omission is intentional.
@pytest.mark.parametrize(
    ("filename", "expected_output"),
    [
        ("fake-email-utf-16.eml", EXPECTED_OUTPUT),
        ("fake-email-utf-16-be.eml", EXPECTED_OUTPUT),
        ("fake-email-utf-16-le.eml", EXPECTED_OUTPUT),
        ("email-no-utf8-2008-07-16.062410.eml", None),
        ("email-no-utf8-2014-03-17.111517.eml", None),
        ("email-replace-mime-encodings-error-1.eml", None),
        ("email-replace-mime-encodings-error-2.eml", None),
        ("email-replace-mime-encodings-error-3.eml", None),
        ("email-replace-mime-encodings-error-4.eml", None),
        ("email-replace-mime-encodings-error-5.eml", None),
    ],
)
def test_partition_email_from_file_rb_default_encoding(filename, expected_output):
    """Each fixture partitions from a binary-mode file object without an explicit encoding.

    Cases whose `expected_output` is falsy only need to produce at least one element; the
    others must match `expected_output` exactly. `metadata.filename` must be None on every
    element.
    """
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)

    with open(eml_path, "rb") as eml_file:
        elements = partition_email(file=eml_file)

    assert len(elements) > 0
    if expected_output:
        assert elements == expected_output
    assert all(e.metadata.filename is None for e in elements)
|
2023-05-30 10:24:02 -07:00
|
|
|
|
|
|
|
|
2023-01-09 11:08:08 -06:00
|
|
|
def test_partition_email_from_text_file():
    """The `.txt` copy of the email partitions via its `text/plain` content."""
    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")

    with open(txt_path) as txt_file:
        elements = partition_email(file=txt_file, content_source="text/plain")

    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    assert all(e.metadata.filename is None for e in elements)
|
2023-01-09 11:08:08 -06:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_email_from_text_file_with_headers():
    """With `include_headers=True` the header elements precede the body elements."""
    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
    partition_kwargs = {"content_source": "text/plain", "include_headers": True}

    with open(txt_path) as txt_file:
        elements = partition_email(file=txt_file, **partition_kwargs)

    assert len(elements) > 0
    assert elements == ALL_EXPECTED_OUTPUT
    assert all(e.metadata.filename is None for e in elements)
|
2023-01-09 11:08:08 -06:00
|
|
|
|
|
|
|
|
2023-07-24 10:57:24 -05:00
|
|
|
def test_partition_email_from_text_file_max():
    """Passing `max_partition=20` for the text fixture produces exactly six elements."""
    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
    partition_kwargs = {"content_source": "text/plain", "max_partition": 20}

    with open(txt_path) as txt_file:
        elements = partition_email(file=txt_file, **partition_kwargs)

    assert len(elements) == 6
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_email_from_text_file_raises_value_error():
    """Passing `min_partition=1000` for the text fixture raises `ValueError`."""
    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")

    with pytest.raises(ValueError):
        with open(txt_path) as txt_file:
            partition_email(file=txt_file, content_source="text/plain", min_partition=1000)
|
|
|
|
|
|
|
|
|
2022-12-19 13:02:44 -05:00
|
|
|
def test_partition_email_from_text():
    """Partitioning the raw message text yields the expected body elements."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    with open(eml_path) as eml_file:
        raw_message = eml_file.read()

    elements = partition_email(text=raw_message)

    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    assert all(e.metadata.filename is None for e in elements)
|
2022-12-19 13:02:44 -05:00
|
|
|
|
|
|
|
|
2023-03-28 17:03:51 -04:00
|
|
|
def test_partition_email_from_text_work_with_empty_string():
    """An empty `text` argument partitions to an empty list."""
    elements = partition_email(text="")

    assert elements == []
|
|
|
|
|
|
|
|
|
2023-01-09 19:49:19 -06:00
|
|
|
def test_partition_email_from_filename_with_embedded_image():
    """The embedded-image fixture's `text/plain` part includes an `Image` element."""
    fixture_name = "fake-email-image-embedded.eml"
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, fixture_name)

    elements = partition_email(filename=eml_path, content_source="text/plain")

    assert len(elements) > 0
    assert elements == IMAGE_EXPECTED_OUTPUT
    assert all(e.metadata.filename == fixture_name for e in elements)
|
2023-01-09 19:49:19 -06:00
|
|
|
|
|
|
|
|
2023-07-05 15:02:22 -05:00
|
|
|
def test_partition_email_from_file_with_header():
    """`partition_email_header()` returns received-info plus header elements for a message."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-header.eml")
    with open(eml_path) as eml_file:
        message = email.message_from_file(eml_file)

    elements = partition_email_header(message)

    assert len(elements) > 0
    assert elements == RECEIVED_HEADER_OUTPUT
    assert all(e.metadata.filename is None for e in elements)
|
2023-01-09 11:08:08 -06:00
|
|
|
|
|
|
|
|
2023-07-05 15:02:22 -05:00
|
|
|
def test_partition_email_from_filename_has_metadata():
    """The first element of `fake-email.eml` carries the expected email metadata.

    The metadata is compared in dict form against an `ElementMetadata` built from the
    expected field values, then `last_modified` is checked as a parsed datetime, and
    finally every element's recorded file name is verified.
    """
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")

    elements = partition_email(filename=filename)

    # -- Guard before indexing so an empty result fails this assertion rather than raising
    # -- IndexError on `elements[0]` below.
    assert len(elements) > 0

    # -- `parent_id` is read back from the actual element rather than hard-coded, so the
    # -- dict comparison below effectively checks all the other fields.
    parent_id = elements[0].metadata.parent_id
    expected_metadata = ElementMetadata(
        coordinates=None,
        filename=filename,
        last_modified="2022-12-16T17:04:16-05:00",
        page_number=None,
        url=None,
        sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
        sent_to=["NotMatthew <NotMatthew@notunstructured.com>"],
        subject="Test Email",
        filetype="message/rfc822",
        parent_id=parent_id,
        languages=["eng"],
    )
    assert elements[0].metadata.to_dict() == expected_metadata.to_dict()

    expected_dt = datetime.datetime.fromisoformat("2022-12-16T17:04:16-05:00")
    assert parse_optional_datetime(elements[0].metadata.last_modified) == expected_dt

    for element in elements:
        assert element.metadata.filename == "fake-email.eml"
|
2023-05-12 11:33:01 -04:00
|
|
|
|
2023-04-04 14:23:41 -04:00
|
|
|
|
2023-01-17 11:33:45 -05:00
|
|
|
def test_extract_email_text_matches_html():
    """Partitioning the text/plain and the text/html part yields pairwise-equal elements."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-attachment.eml")
    elements_from_text = partition_email(filename=filename, content_source="text/plain")
    elements_from_html = partition_email(filename=filename, content_source="text/html")
    assert len(elements_from_text) == len(elements_from_html)
    # NOTE(robinson) - checking each individually is necessary because the text/html returns
    # HTMLTitle, HTMLNarrativeText, etc
    for text_element, html_element in zip(elements_from_text, elements_from_html):
        # -- compare with the HTML counterpart; comparing the text element with itself (as
        # -- this test previously did) can never fail
        assert text_element == html_element
        assert text_element.metadata.filename == "fake-email-attachment.eml"
|
2023-01-17 11:33:45 -05:00
|
|
|
|
|
|
|
|
2023-12-20 07:37:17 +00:00
|
|
|
def test_extract_base64_email_text_matches_html():
    """Same text-vs-HTML comparison as above, using the "fake-email-b64.eml" example."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-b64.eml")
    elements_from_text = partition_email(filename=filename, content_source="text/plain")
    elements_from_html = partition_email(filename=filename, content_source="text/html")
    assert len(elements_from_text) == len(elements_from_html)
    for text_element, html_element in zip(elements_from_text, elements_from_html):
        # -- compare with the HTML counterpart; comparing the text element with itself (as
        # -- this test previously did) can never fail
        assert text_element == html_element
        assert text_element.metadata.filename == "fake-email-b64.eml"
|
|
|
|
|
|
|
|
|
2023-01-03 11:41:54 -06:00
|
|
|
def test_extract_attachment_info():
    """Attachment info extracted from the parsed message equals `ATTACH_EXPECTED_OUTPUT`."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-attachment.eml")
    with open(eml_path) as eml_file:
        message = email.message_from_file(eml_file)

    extracted = extract_attachment_info(message)

    assert len(extracted) > 0
    assert extracted == ATTACH_EXPECTED_OUTPUT
|
|
|
|
|
|
|
|
|
2022-12-19 13:02:44 -05:00
|
|
|
def test_partition_email_raises_with_none_specified():
    """Calling `partition_email()` without any argument raises `ValueError`."""
    expecting_value_error = pytest.raises(ValueError)
    with expecting_value_error:
        partition_email()
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_email_raises_with_too_many_specified():
    """Passing both `filename` and `text` raises `ValueError`."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    with open(eml_path) as eml_file:
        raw_email = eml_file.read()

    with pytest.raises(ValueError):
        partition_email(filename=eml_path, text=raw_email)
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_email_raises_with_invalid_content_type():
    """A `content_source` of "application/json" raises `ValueError`."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    expecting_value_error = pytest.raises(ValueError)
    with expecting_value_error:
        partition_email(filename=eml_path, content_source="application/json")
|
2023-03-10 18:10:39 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_email_processes_fake_email_with_header():
    """The header example produces elements, each carrying the source filename."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-header.eml")

    partitioned = partition_email(filename=eml_path)

    assert len(partitioned) > 0
    filenames = [element.metadata.filename for element in partitioned]
    assert filenames == ["fake-email-header.eml"] * len(filenames)
|
2023-05-11 10:36:25 -04:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("time", "expected"),
    [
        ("Thu, 4 May 2023 02:32:49 +0000", "2023-05-04T02:32:49+00:00"),
        # -- same date with the day padded by an extra space; this replaces a verbatim
        # -- duplicate of the case above, which added no coverage
        ("Thu,  4 May 2023 02:32:49 +0000", "2023-05-04T02:32:49+00:00"),
        # -- trailing parenthesized zone comment --
        ("Thu, 4 May 2023 02:32:49 +0000 (UTC)", "2023-05-04T02:32:49+00:00"),
        # -- a date string in an unsupported layout is expected to produce None --
        ("Thursday 5/3/2023 02:32:49", None),
    ],
)
def test_convert_to_iso_8601(time, expected):
    """`convert_to_iso_8601()` returns the expected ISO-8601 string, or None for a bad date."""
    iso_time = convert_to_iso_8601(time)
    assert iso_time == expected
|
2023-06-16 17:52:13 -07:00
|
|
|
|
|
|
|
|
2024-02-07 17:31:49 -05:00
|
|
|
def test_partition_email_still_works_with_no_content(caplog):
    """An email without the default HTML part still yields its text and logs the fallback."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "email-no-html-content-1.eml")

    partitioned = partition_email(filename=eml_path)

    assert len(partitioned) == 1
    only_element = partitioned[0]
    assert only_element.text.startswith("Hey there")
    log_output = caplog.text
    assert "text/html was not found. Falling back to text/plain" in log_output
|
2023-06-29 18:01:12 -04:00
|
|
|
|
|
|
|
|
2023-06-30 09:44:46 -05:00
|
|
|
def test_partition_email_from_filename_exclude_metadata():
    """With `include_metadata=False` the first element's file metadata fields are all None."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-header.eml")

    partitioned = partition_email(filename=eml_path, include_metadata=False)

    first = partitioned[0]
    assert parse_optional_datetime(first.metadata.last_modified) is None
    assert first.metadata.filetype is None
    assert first.metadata.page_name is None
    assert first.metadata.filename is None
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_email_from_text_file_exclude_metadata():
    """Same metadata-exclusion check, partitioning an open ".txt" file as text/plain."""
    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
    partition_kwargs = {"content_source": "text/plain", "include_metadata": False}

    with open(txt_path) as txt_file:
        partitioned = partition_email(file=txt_file, **partition_kwargs)

    first = partitioned[0]
    assert parse_optional_datetime(first.metadata.last_modified) is None
    assert first.metadata.filetype is None
    assert first.metadata.page_name is None
    assert first.metadata.filename is None
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_email_from_file_exclude_metadata():
    """Same metadata-exclusion check, partitioning an open ".eml" file object."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")

    with open(eml_path) as eml_file:
        partitioned = partition_email(file=eml_file, include_metadata=False)

    first = partitioned[0]
    assert parse_optional_datetime(first.metadata.last_modified) is None
    assert first.metadata.filetype is None
    assert first.metadata.page_name is None
    assert first.metadata.filename is None
|
|
|
|
|
|
|
|
|
2023-06-29 18:01:12 -04:00
|
|
|
def test_partition_email_can_process_attachments(
    tmpdir,
    filename="example-docs/eml/fake-email-attachment.eml",
):
    """The email's attachment is partitioned and appears as the last element.

    The attachment is first extracted and partitioned on its own with `partition_text` to
    build the expected metadata. That metadata is then compared with the metadata of the last
    element returned by `partition_email(..., process_attachments=True)`.
    """
    with open(filename) as f:
        msg = email.message_from_file(f)
    extract_attachment_info(msg, output_dir=tmpdir.dirname)
    attachment_filename = os.path.join(
        tmpdir.dirname,
        ATTACH_EXPECTED_OUTPUT[0]["filename"],
    )

    # -- a well-formed ISO-8601 timestamp (the former "0000-00-05..." value was not a valid
    # -- date); it is passed to both partitioners so the two metadata objects agree on it
    mocked_last_modification_date = "2020-07-05T09:24:28"

    attachment_elements = partition_text(
        filename=attachment_filename,
        metadata_filename=attachment_filename,
        metadata_last_modified=mocked_last_modification_date,
    )
    expected_metadata = attachment_elements[0].metadata
    expected_metadata.file_directory = None
    expected_metadata.attached_to_filename = filename

    elements = partition_email(
        filename=filename,
        attachment_partitioner=partition_text,
        process_attachments=True,
        metadata_last_modified=mocked_last_modification_date,
    )

    # This test does not need to validate if hierarchy is working
    # Patch to nullify parent_id
    expected_metadata.parent_id = None
    elements[-1].metadata.parent_id = None

    assert elements[0].text.startswith("Hello!")

    # -- every element except the trailing attachment element comes from the message itself --
    for element in elements[:-1]:
        assert element.metadata.filename == "fake-email-attachment.eml"
        assert element.metadata.subject == "Fake email with attachment"

    assert elements[-1].text == "Hey this is a fake attachment!"
    assert elements[-1].metadata == expected_metadata
|
|
|
|
|
2023-08-13 10:58:46 -07:00
|
|
|
|
2023-08-18 18:21:11 -05:00
|
|
|
def test_partition_email_can_process_min_max_with_attachments(
    tmpdir,
    filename="example-docs/eml/fake-email-attachment.eml",
):
    """`min_partition` / `max_partition` are applied to the attachment's elements."""
    with open(filename) as eml_file:
        message = email.message_from_file(eml_file)
    extract_attachment_info(message, output_dir=tmpdir.dirname)
    attachment_path = os.path.join(tmpdir.dirname, ATTACH_EXPECTED_OUTPUT[0]["filename"])

    # -- the same bounds are given to the stand-alone and the in-email partitioning --
    partition_bounds = {"min_partition": 6, "max_partition": 12}
    attachment_elements = partition_text(
        filename=attachment_path,
        metadata_filename=attachment_path,
        **partition_bounds,
    )
    elements = partition_email(
        filename=filename,
        attachment_partitioner=partition_text,
        process_attachments=True,
        **partition_bounds,
    )

    assert elements[0].text.startswith("Hello!")
    for offset in (-1, -2):
        assert elements[offset].text == attachment_elements[offset].text

    attached = [e for e in elements if e.metadata.attached_to_filename is not None]
    for attached_element in attached:
        assert len(attached_element.text) <= 12
        assert len(attached_element.text) >= 6
|
2023-06-29 18:01:12 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_msg_raises_with_no_partitioner(
    filename="example-docs/eml/fake-email-attachment.eml",
):
    """`process_attachments=True` without an `attachment_partitioner` raises `ValueError`."""
    expecting_value_error = pytest.raises(ValueError)
    with expecting_value_error:
        partition_email(filename=filename, process_attachments=True)
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
2024-03-18 02:09:44 +01:00
|
|
|
def test_partition_email_metadata_date_from_header(
    mocker,
    filename="example-docs/eml/fake-email-attachment.eml",
):
    """With both last-modified lookups patched to None, `last_modified` is the expected date."""
    expected_last_modification_date = "2022-12-23T12:08:48-06:00"

    patch_targets = (
        "unstructured.partition.email.get_last_modified_date",
        "unstructured.partition.email.get_last_modified_date_from_file",
    )
    for target in patch_targets:
        mocker.patch(target, return_value=None)

    elements = partition_email(filename=filename)

    first_metadata = elements[0].metadata
    assert first_metadata.last_modified == expected_last_modification_date
|
|
|
|
|
|
|
|
|
2023-07-26 15:10:14 -04:00
|
|
|
def test_partition_email_from_file_custom_metadata_date(
    filename="example-docs/eml/fake-email-attachment.eml",
):
    """An explicit `metadata_last_modified` is used when partitioning from a file object."""
    expected_last_modification_date = "2020-07-05T09:24:28"

    with open(filename) as eml_file:
        elements = partition_email(
            file=eml_file, metadata_last_modified=expected_last_modification_date
        )

    first_metadata = elements[0].metadata
    assert first_metadata.last_modified == expected_last_modification_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_email_custom_metadata_date(
    filename="example-docs/eml/fake-email-attachment.eml",
):
    """An explicit `metadata_last_modified` is used when partitioning from a filename."""
    expected_last_modification_date = "2020-07-05T09:24:28"

    elements = partition_email(
        filename=filename, metadata_last_modified=expected_last_modification_date
    )

    first_metadata = elements[0].metadata
    assert first_metadata.last_modified == expected_last_modification_date
|
2023-08-13 20:35:18 -07:00
|
|
|
|
|
|
|
|
2023-08-14 11:38:53 -07:00
|
|
|
def test_partition_email_inline_content_disposition(
    filename="example-docs/eml/email-inline-content-disposition.eml",
):
    """The first two elements of the inline-content-disposition example are `Text`."""
    partition_kwargs = {
        "process_attachments": True,
        "attachment_partitioner": partition_text,
    }

    elements = partition_email(filename=filename, **partition_kwargs)

    for index in (0, 1):
        assert isinstance(elements[index], Text)
|
|
|
|
|
|
|
|
|
2023-08-13 20:35:18 -07:00
|
|
|
def test_partition_email_odd_attachment_filename(
    filename="example-docs/eml/email-equals-attachment-filename.eml",
):
    """An attachment filename containing "=" characters is preserved in the metadata."""
    partition_kwargs = {
        "process_attachments": True,
        "attachment_partitioner": partition_text,
    }

    elements = partition_email(filename=filename, **partition_kwargs)

    second_metadata = elements[1].metadata
    assert second_metadata.filename == "odd=file=name.txt"
|
2023-08-25 20:09:25 -04:00
|
|
|
|
|
|
|
|
2023-10-12 12:47:55 -07:00
|
|
|
def test_partition_email_with_json():
    """Elements partitioned from "eml/fake-email.eml" round-trip through JSON."""
    eml_path = example_doc_path("eml/fake-email.eml")
    partitioned = partition_email(eml_path)
    assert_round_trips_through_JSON(partitioned)
|
2023-08-29 16:59:26 -04:00
|
|
|
|
|
|
|
|
2023-08-25 20:09:25 -04:00
|
|
|
def test_partition_email_with_pgp_encrypted_message(
    caplog,
    filename="example-docs/eml/fake-encrypted.eml",
):
    """The encrypted example yields no elements and a warning is logged."""
    partitioned = partition_email(filename=filename)

    assert partitioned == []
    for expected_fragment in ("WARNING", "Encrypted email detected"):
        assert expected_fragment in caplog.text
|
2023-09-11 16:00:14 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_add_chunking_strategy_on_partition_email(
    filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt"),
):
    """`chunking_strategy="by_title"` gives the same result as `chunk_by_title()` afterwards."""
    unchunked = partition_email(filename=filename)
    chunked_by_partitioner = partition_email(filename, chunking_strategy="by_title")
    chunked_manually = chunk_by_title(unchunked)

    assert chunked_by_partitioner != unchunked
    assert chunked_by_partitioner == chunked_manually
|
2023-10-10 20:47:56 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_email_element_metadata_has_languages():
    """The first element of "fake-email.eml" reports `["eng"]` as its languages."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")

    partitioned = partition_email(filename=eml_path)

    first_metadata = partitioned[0].metadata
    assert first_metadata.languages == ["eng"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_email_respects_languages_arg():
    """Every element carries the languages passed via the `languages` argument."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    requested_languages = ["deu"]

    partitioned = partition_email(filename=eml_path, languages=requested_languages)

    assert all(element.metadata.languages == ["deu"] for element in partitioned)
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_eml_respects_detect_language_per_element():
    """Per-element language detection finds both "eng" and "spa" in the mixed example."""
    eml_path = "example-docs/language-docs/eng_spa_mult.eml"

    partitioned = partition_email(filename=eml_path, detect_language_per_element=True)

    # languages other than English and Spanish are detected by this partitioner,
    # so this test is slightly different from the other partition tests
    detected = set()
    for element in partitioned:
        detected.add(element.metadata.languages[0])
    for expected_language in ("eng", "spa"):
        assert expected_language in detected
|
2024-02-07 17:31:49 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_eml_add_signature_to_metadata():
    """The signed example yields one element whose metadata carries the signature."""
    partitioned = partition_email(filename="example-docs/eml/signed-doc.p7s")

    assert len(partitioned) == 1
    only_element = partitioned[0]
    assert only_element.text == "This is a test"
    assert only_element.metadata.signature == "<SIGNATURE>\n"
|