2024-08-01 15:24:17 -04:00
|
|
|
"""Test suite for `unstructured.partition.email` module."""
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
import io
|
2024-07-19 11:18:02 -07:00
|
|
|
import tempfile
|
2024-08-01 15:24:17 -04:00
|
|
|
from email.message import EmailMessage
|
2024-10-15 19:02:33 -07:00
|
|
|
from typing import Any
|
2022-12-19 13:02:44 -05:00
|
|
|
|
2023-02-27 17:30:54 +01:00
|
|
|
import pytest
|
2023-01-17 16:36:44 -06:00
|
|
|
|
Dynamic ElementMetadata implementation (#2043)
### Executive Summary
The structure of element metadata is currently static, meaning only
predefined fields can appear in the metadata. We would like the
flexibility for end-users, at their own discretion, to define and use
additional metadata fields that make sense for their particular
use-case.
### Concepts
A key concept for dynamic metadata is _known field_. A known-field is
one of those explicitly defined on `ElementMetadata`. Each of these has
a type and can be specified when _constructing_ a new `ElementMetadata`
instance. This is in contrast to an _end-user defined_ (or _ad-hoc_)
metadata field, one not known at "compile" time and added at the
discretion of an end-user to suit the purposes of their application.
An ad-hoc field can only be added by _assignment_ on an already
constructed instance.
### End-user ad-hoc metadata field behaviors
An ad-hoc field can be added to an `ElementMetadata` instance by
assignment:
```python
>>> metadata = ElementMetadata()
>>> metadata.coefficient = 0.536
```
A field added in this way can be accessed by name:
```python
>>> metadata.coefficient
0.536
```
and that field will appear in the JSON/dict for that instance:
```python
>>> metadata = ElementMetadata()
>>> metadata.coefficient = 0.536
>>> metadata.to_dict()
{"coefficient": 0.536}
```
However, accessing a "user-defined" value that has _not_ been assigned
on that instance raises `AttributeError`:
```python
>>> metadata.coeffcient # -- misspelled "coefficient" --
AttributeError: 'ElementMetadata' object has no attribute 'coeffcient'
```
This makes "tagging" a metadata item with a value very convenient, but
entails the proviso that if an end-user wants to add a metadata field to
_some_ elements and not others (sparse population), AND they want to
access that field by name on ANY element and receive `None` where it has
not been assigned, they will need to use an expression like this:
```python
coefficient = metadata.coefficient if hasattr(metadata, "coefficient") else None
```
### Implementation Notes
- **ad-hoc metadata fields** are discarded during consolidation (for
chunking) because we don't have a consolidation strategy defined for
those. We could consider using a default consolidation strategy like
`FIRST` or possibly allow a user to register a strategy (although that
gets hairy in non-private and multiple-memory-space situations.)
- ad-hoc metadata fields **cannot start with an underscore**.
- We have no way to distinguish an ad-hoc field from any "noise" fields
that might appear in a JSON/dict loaded using `.from_dict()`, so unlike
the original (which only loaded known-fields), we'll rehydrate anything
that we find there.
- No real type-safety is possible on ad-hoc fields but the type-checker
does not complain because the type of all ad-hoc fields is `Any` (which
is the best available behavior in my view).
- We may want to consider whether end-users should be able to add ad-hoc
fields to "sub" metadata objects too, like `DataSourceMetadata` and
conceivably `CoordinatesMetadata` (although I'm not immediately seeing a
use-case for the second one).
2023-11-15 13:22:15 -08:00
|
|
|
from test_unstructured.unit_utils import (
|
2024-10-15 19:02:33 -07:00
|
|
|
FixtureRequest,
|
|
|
|
Mock,
|
Dynamic ElementMetadata implementation (#2043)
### Executive Summary
The structure of element metadata is currently static, meaning only
predefined fields can appear in the metadata. We would like the
flexibility for end-users, at their own discretion, to define and use
additional metadata fields that make sense for their particular
use-case.
### Concepts
A key concept for dynamic metadata is _known field_. A known-field is
one of those explicitly defined on `ElementMetadata`. Each of these has
a type and can be specified when _constructing_ a new `ElementMetadata`
instance. This is in contrast to an _end-user defined_ (or _ad-hoc_)
metadata field, one not known at "compile" time and added at the
discretion of an end-user to suit the purposes of their application.
An ad-hoc field can only be added by _assignment_ on an already
constructed instance.
### End-user ad-hoc metadata field behaviors
An ad-hoc field can be added to an `ElementMetadata` instance by
assignment:
```python
>>> metadata = ElementMetadata()
>>> metadata.coefficient = 0.536
```
A field added in this way can be accessed by name:
```python
>>> metadata.coefficient
0.536
```
and that field will appear in the JSON/dict for that instance:
```python
>>> metadata = ElementMetadata()
>>> metadata.coefficient = 0.536
>>> metadata.to_dict()
{"coefficient": 0.536}
```
However, accessing a "user-defined" value that has _not_ been assigned
on that instance raises `AttributeError`:
```python
>>> metadata.coeffcient # -- misspelled "coefficient" --
AttributeError: 'ElementMetadata' object has no attribute 'coeffcient'
```
This makes "tagging" a metadata item with a value very convenient, but
entails the proviso that if an end-user wants to add a metadata field to
_some_ elements and not others (sparse population), AND they want to
access that field by name on ANY element and receive `None` where it has
not been assigned, they will need to use an expression like this:
```python
coefficient = metadata.coefficient if hasattr(metadata, "coefficient") else None
```
### Implementation Notes
- **ad-hoc metadata fields** are discarded during consolidation (for
chunking) because we don't have a consolidation strategy defined for
those. We could consider using a default consolidation strategy like
`FIRST` or possibly allow a user to register a strategy (although that
gets hairy in non-private and multiple-memory-space situations.)
- ad-hoc metadata fields **cannot start with an underscore**.
- We have no way to distinguish an ad-hoc field from any "noise" fields
that might appear in a JSON/dict loaded using `.from_dict()`, so unlike
the original (which only loaded known-fields), we'll rehydrate anything
that we find there.
- No real type-safety is possible on ad-hoc fields but the type-checker
does not complain because the type of all ad-hoc fields is `Any` (which
is the best available behavior in my view).
- We may want to consider whether end-users should be able to add ad-hoc
fields to "sub" metadata objects too, like `DataSourceMetadata` and
conceivably `CoordinatesMetadata` (although I'm not immediately seeing a
use-case for the second one).
2023-11-15 13:22:15 -08:00
|
|
|
assert_round_trips_through_JSON,
|
|
|
|
example_doc_path,
|
2024-10-15 19:02:33 -07:00
|
|
|
function_mock,
|
Dynamic ElementMetadata implementation (#2043)
### Executive Summary
The structure of element metadata is currently static, meaning only
predefined fields can appear in the metadata. We would like the
flexibility for end-users, at their own discretion, to define and use
additional metadata fields that make sense for their particular
use-case.
### Concepts
A key concept for dynamic metadata is _known field_. A known-field is
one of those explicitly defined on `ElementMetadata`. Each of these has
a type and can be specified when _constructing_ a new `ElementMetadata`
instance. This is in contrast to an _end-user defined_ (or _ad-hoc_)
metadata field, one not known at "compile" time and added at the
discretion of an end-user to suit the purposes of their application.
An ad-hoc field can only be added by _assignment_ on an already
constructed instance.
### End-user ad-hoc metadata field behaviors
An ad-hoc field can be added to an `ElementMetadata` instance by
assignment:
```python
>>> metadata = ElementMetadata()
>>> metadata.coefficient = 0.536
```
A field added in this way can be accessed by name:
```python
>>> metadata.coefficient
0.536
```
and that field will appear in the JSON/dict for that instance:
```python
>>> metadata = ElementMetadata()
>>> metadata.coefficient = 0.536
>>> metadata.to_dict()
{"coefficient": 0.536}
```
However, accessing a "user-defined" value that has _not_ been assigned
on that instance raises `AttributeError`:
```python
>>> metadata.coeffcient # -- misspelled "coefficient" --
AttributeError: 'ElementMetadata' object has no attribute 'coeffcient'
```
This makes "tagging" a metadata item with a value very convenient, but
entails the proviso that if an end-user wants to add a metadata field to
_some_ elements and not others (sparse population), AND they want to
access that field by name on ANY element and receive `None` where it has
not been assigned, they will need to use an expression like this:
```python
coefficient = metadata.coefficient if hasattr(metadata, "coefficient") else None
```
### Implementation Notes
- **ad-hoc metadata fields** are discarded during consolidation (for
chunking) because we don't have a consolidation strategy defined for
those. We could consider using a default consolidation strategy like
`FIRST` or possibly allow a user to register a strategy (although that
gets hairy in non-private and multiple-memory-space situations.)
- ad-hoc metadata fields **cannot start with an underscore**.
- We have no way to distinguish an ad-hoc field from any "noise" fields
that might appear in a JSON/dict loaded using `.from_dict()`, so unlike
the original (which only loaded known-fields), we'll rehydrate anything
that we find there.
- No real type-safety is possible on ad-hoc fields but the type-checker
does not complain because the type of all ad-hoc fields is `Any` (which
is the best available behavior in my view).
- We may want to consider whether end-users should be able to add ad-hoc
fields to "sub" metadata objects too, like `DataSourceMetadata` and
conceivably `CoordinatesMetadata` (although I'm not immediately seeing a
use-case for the second one).
2023-11-15 13:22:15 -08:00
|
|
|
)
|
2023-09-11 16:00:14 -05:00
|
|
|
from unstructured.chunking.title import chunk_by_title
|
2023-04-04 14:23:41 -04:00
|
|
|
from unstructured.documents.elements import (
|
2024-10-15 19:02:33 -07:00
|
|
|
CompositeElement,
|
2023-04-04 14:23:41 -04:00
|
|
|
ListItem,
|
|
|
|
NarrativeText,
|
2024-10-15 19:02:33 -07:00
|
|
|
Table,
|
|
|
|
TableChunk,
|
2023-08-14 11:38:53 -07:00
|
|
|
Text,
|
2023-04-04 14:23:41 -04:00
|
|
|
Title,
|
|
|
|
)
|
2024-10-15 19:02:33 -07:00
|
|
|
from unstructured.partition.email import EmailPartitioningContext, partition_email
|
2022-12-19 13:02:44 -05:00
|
|
|
|
|
|
|
# -- elements several tests below expect `partition_email()` to produce for the "fake-email"
# -- example documents --
EXPECTED_OUTPUT = [
    NarrativeText("This is a test email to use for unit tests."),
    Text("Important points:"),
    ListItem("Roses are red"),
    ListItem("Violets are blue"),
]
|
|
|
|
|
2023-01-09 19:49:19 -06:00
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_email_from_filename_can_partition_an_RFC_822_email():
    """A simple RFC 822 message given by file-path yields the three expected elements."""
    eml_path = example_doc_path("eml/simple-rfc-822.eml")
    expected_elements = [
        NarrativeText("This is an RFC 822 email message."),
        NarrativeText(
            "An RFC 822 message is characterized by its simple, text-based format, which includes"
            ' a header and a body. The header contains structured fields such as "From", "To",'
            ' "Date", and "Subject", each followed by a colon and the corresponding information.'
            " The body follows the header, separated by a blank line, and contains the main"
            " content of the email."
        ),
        NarrativeText(
            "The structure ensures compatibility and readability across different email systems"
            " and clients, adhering to the standards set by the Internet Engineering Task Force"
            " (IETF)."
        ),
    ]

    elements = partition_email(eml_path)

    assert elements == expected_elements
|
2023-01-09 11:08:08 -06:00
|
|
|
|
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_email_from_file_can_partition_an_email():
    """An email supplied as a binary file-like object partitions to `EXPECTED_OUTPUT`."""
    eml_path = example_doc_path("eml/fake-email.eml")

    with open(eml_path, "rb") as eml_file:
        elements = partition_email(file=eml_file)
        assert elements == EXPECTED_OUTPUT
|
2023-01-03 11:41:54 -06:00
|
|
|
|
2022-12-19 13:02:44 -05:00
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_email_from_spooled_temp_file_can_partition_an_email():
    """A `SpooledTemporaryFile` holding the email bytes is accepted as the `file` argument."""
    with open(example_doc_path("eml/fake-email.eml"), "rb") as eml_file:
        eml_bytes = eml_file.read()

    with tempfile.SpooledTemporaryFile() as spooled_file:
        spooled_file.write(eml_bytes)
        # -- rewind so the message is read from its first byte --
        spooled_file.seek(0)

        elements = partition_email(file=spooled_file)
        assert elements == EXPECTED_OUTPUT
|
2023-07-05 15:02:22 -05:00
|
|
|
|
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_email_can_partition_an_HTML_only_email_with_Base64_ISO_8859_1_charset():
    """The `mime-html-only.eml` example partitions into three `NarrativeText` elements."""
    eml_path = example_doc_path("eml/mime-html-only.eml")
    expected_elements = [
        NarrativeText("This is a text/html part."),
        NarrativeText(
            "The first emoticon, :) , was proposed by Scott Fahlman in 1982 to indicate just or"
            " sarcasm in text emails."
        ),
        NarrativeText(
            "Gmail was launched by Google in 2004 with 1 GB of free storage, significantly more"
            " than what other services offered at the time."
        ),
    ]

    elements = partition_email(eml_path)

    assert elements == expected_elements
|
2024-08-01 15:24:17 -04:00
|
|
|
|
2023-07-06 09:49:27 -04:00
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_extract_email_from_text_plain_matches_elements_extracted_from_text_html():
    """The text/plain and text/html body parts of the same email produce the same text.

    Elements from the text/plain part are compared on `.text` only. Elements from the
    text/html part must additionally equal `EXPECTED_OUTPUT` exactly.
    """
    file_path = example_doc_path("eml/fake-email.eml")
    expected_texts = [e.text for e in EXPECTED_OUTPUT]

    elements_from_text = partition_email(file_path, content_source="text/plain")
    elements_from_html = partition_email(file_path, content_source="text/html")

    texts_from_text = [e.text for e in elements_from_text]
    texts_from_html = [e.text for e in elements_from_html]

    # -- Compare complete lists of text rather than `zip()`-ing pairs: `zip()` stops at the
    # -- shorter input, so a missing or extra element (including an empty result) would
    # -- otherwise pass unnoticed.
    assert texts_from_text == expected_texts
    assert elements_from_html == EXPECTED_OUTPUT
    assert texts_from_html == texts_from_text
|
2023-05-30 10:24:02 -07:00
|
|
|
|
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_email_round_trips_via_json():
    """Elements partitioned from an email pass the JSON round-trip assertion helper."""
    eml_path = example_doc_path("eml/fake-email.eml")
    assert_round_trips_through_JSON(partition_email(eml_path))
|
2024-08-01 15:24:17 -04:00
|
|
|
|
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
# -- transfer-encodings --------------------------------------------------------------------------
|
2023-05-30 10:24:02 -07:00
|
|
|
|
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_email_partitions_an_HTML_part_with_Base64_encoded_UTF_8_charset():
    """The `fake-email-b64.eml` example partitions to `EXPECTED_OUTPUT`."""
    eml_path = example_doc_path("eml/fake-email-b64.eml")

    elements = partition_email(eml_path)

    assert elements == EXPECTED_OUTPUT
|
2024-08-01 15:24:17 -04:00
|
|
|
|
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_email_partitions_a_text_plain_part_with_Base64_encoded_windows_1255_charset():
    """The text/plain part of the example email yields 30 elements with readable Hebrew text."""
    eml_path = example_doc_path("eml/email-no-utf8-2008-07-16.062410.eml")
    expected_prefix = "אני חושב שזה לא יהיה מקצועי והוגן שאני אראה לך היכן"

    elements = partition_email(eml_path, content_source="text/plain")

    assert len(elements) == 30
    second_element = elements[1]
    assert second_element.text.startswith(expected_prefix)
|
2024-07-19 11:18:02 -07:00
|
|
|
|
2024-08-01 15:24:17 -04:00
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_email_partitions_an_html_part_with_quoted_printable_encoded_ISO_8859_1_charset():
    """The text/html part of the example email yields a single `Table` element."""
    eml_path = example_doc_path("eml/email-no-utf8-2014-03-17.111517.eml")
    expected_prefix = "Slava Gxyzxyz Hi Slava, The password for your Google"

    elements = partition_email(eml_path, content_source="text/html", process_attachments=False)

    assert len(elements) == 1
    table = elements[0]
    assert isinstance(table, Table)
    assert table.text.startswith(expected_prefix)
|
2023-01-09 11:08:08 -06:00
|
|
|
|
2024-08-01 15:24:17 -04:00
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
# -- edge-cases ----------------------------------------------------------------------------------
|
2023-01-09 11:08:08 -06:00
|
|
|
|
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_email_accepts_a_whitespace_only_file():
    """Partitioning `empty.eml` completes without raising and returns no elements."""
    elements = partition_email(example_doc_path("eml/empty.eml"))

    assert elements == []
|
2024-08-01 15:24:17 -04:00
|
|
|
|
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_email_can_partition_an_empty_email():
    """The `mime-no-body.eml` example produces an empty element list."""
    eml_path = example_doc_path("eml/mime-no-body.eml")

    elements = partition_email(eml_path, process_attachments=False)

    assert elements == []
|
2022-12-19 13:02:44 -05:00
|
|
|
|
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_email_does_not_break_on_an_encrypted_message():
    """The `fake-encrypted.eml` example produces an empty element list rather than an error."""
    eml_path = example_doc_path("eml/fake-encrypted.eml")

    elements = partition_email(eml_path, process_attachments=False)

    assert elements == []
|
2023-03-28 17:03:51 -04:00
|
|
|
|
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_email_finds_content_when_it_is_marked_with_content_disposition_inline():
    """The inline-content-disposition example yields exactly one `Text` element."""
    eml_path = example_doc_path("eml/email-inline-content-disposition.eml")

    elements = partition_email(eml_path, process_attachments=False)

    assert len(elements) == 1
    (only_element,) = elements
    assert isinstance(only_element, Text)
    assert only_element.text == "This is a test of inline"
|
2023-01-09 19:49:19 -06:00
|
|
|
|
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_email_from_filename_malformed_encoding():
    """The malformed-encoding example email still partitions to `EXPECTED_OUTPUT`."""
    eml_path = example_doc_path("eml/fake-email-malformed-encoding.eml")

    assert partition_email(filename=eml_path) == EXPECTED_OUTPUT
|
2023-01-09 11:08:08 -06:00
|
|
|
|
Feat: Create a naive hierarchy for elements (#1268)
## **Summary**
By adding hierarchy to unstructured elements, users will have more
information for implementing vector db/LLM chunking strategies. For
example, text elements could be queried by their preceding title
element. The hierarchy is implemented by a parent_id tag in the
element's metadata.
### Features
- Introduces a parent_id to ElementMetadata (The id of the parent
element, not a pointer)
- Creates a rule set for assigning hierarchies. Sensible default is
assigned, with an optional override parameter
- Sets element parent ids if there isn't an existing parent id or
matches the ruleset
### How it works
Hierarchies are assigned via a parent id field in element metadata.
Elements are read sequentially and evaluated against a ruleset. For
example take the following elements:
1. Title, "This is the Title"
2. Text, "this is the text"
And the ruleset: `{"title": ["text"]}`. When evaluated, the parent_id of
2 will be the id of 1. The algorithm for determining this is more
complex and resolves several edge cases, so please read the code for
further details.
### Schema Changes
```
@dataclass
class ElementMetadata:
coordinates: Optional[CoordinatesMetadata] = None
data_source: Optional[DataSourceMetadata] = None
filename: Optional[str] = None
file_directory: Optional[str] = None
last_modified: Optional[str] = None
filetype: Optional[str] = None
attached_to_filename: Optional[str] = None
+ parent_id: Optional[Union[str, uuid.UUID, NoID, UUID]] = None
+ category_depth: Optional[int] = None
...
```
### Testing
```
from unstructured.partition.auto import partition
from typing import List
elements = partition(filename="./unstructured/example-docs/fake-html.html", strategy="auto")
for element in elements:
print(
f"Category: {getattr(element, 'category', '')}\n"\
f"Text: {getattr(element, 'text', '')}\n"
f"ID: {element.id}\n" \
f"Parent ID: {element.metadata.parent_id}\n"\
f"Depth: {element.metadata.category_depth}\n" \
)
```
### Additional Notes
Implementing this feature revealed a possibly undesired side-effect in
how element metadata are processed. In
`unstructured/partition/common.py` the `_add_element_metadata` is
invoked as part of the `add_metadata_with_filetype` decorator for
filetype partitioning. This method is intended to add additional
information to the metadata generated with the element including
filename and filetype, however the existing metadata is merged into a
newly created metadata object rather than the other way around. Because
of the way it's structured, new metadata fields can easily be forgotten
and pose debugging challenges to developers. This likely warrants a new
issue.
I'm guessing that the implementation is done this way to avoid issues
with deserializing elements, but could be wrong.
---------
Co-authored-by: Benjamin Torres <benjats07@users.noreply.github.com>
2023-09-14 11:23:16 -04:00
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
# -- error behaviors -----------------------------------------------------------------------------
|
2023-05-12 11:33:01 -04:00
|
|
|
|
2023-04-04 14:23:41 -04:00
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_email_raises_when_no_message_source_is_specified():
    """Calling `partition_email()` with no arguments raises `ValueError`."""
    expected_message = "no document specified; either a `filename` or `file`"

    with pytest.raises(ValueError, match=expected_message):
        partition_email()
|
2024-08-01 15:24:17 -04:00
|
|
|
|
2023-01-17 11:33:45 -05:00
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_email_raises_with_invalid_content_type():
    """An unsupported `content_source` value such as "application/json" raises `ValueError`."""
    eml_path = example_doc_path("eml/fake-email.eml")
    expected_message = "'application/json' is not a valid value for content_s"

    with pytest.raises(ValueError, match=expected_message):
        partition_email(eml_path, content_source="application/json")
|
2023-01-17 11:33:45 -05:00
|
|
|
|
2024-08-01 15:24:17 -04:00
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
# -- .metadata -----------------------------------------------------------------------------------
|
2023-12-20 07:37:17 +00:00
|
|
|
|
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_email_augments_message_body_elements_with_email_metadata():
    """Every element carries the recipient, sender, message-id and subject metadata fields."""
    elements = partition_email(example_doc_path("eml/mime-multi-to-cc-bcc.eml"))

    for element in elements:
        metadata = element.metadata
        assert metadata.bcc_recipient == ["John <john@example.com>", "Mary <mary@example.com>"]
        assert metadata.cc_recipient == ["Tom <tom@example.com>", "Alice <alice@example.com>"]
        assert metadata.email_message_id == "2143658709@example.com"
        assert metadata.sent_from == ["sender@example.com"]
        assert metadata.sent_to == ["Bob <bob@example.com>", "Sue <sue@example.com>"]
        assert metadata.subject == "Example Plain-Text MIME Message"
|
2023-05-11 10:36:25 -04:00
|
|
|
|
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
# -- .metadata.filename --------------------------------------------------------------------------
|
2024-08-01 15:24:17 -04:00
|
|
|
|
2023-06-16 17:52:13 -07:00
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_email_from_filename_gets_filename_metadata_from_file_path():
    """`filename` and `file_directory` metadata are taken from the path that was partitioned."""
    elements = partition_email(example_doc_path("eml/fake-email.eml"))

    for element in elements:
        assert element.metadata.filename == "fake-email.eml"
    for element in elements:
        assert element.metadata.file_directory == example_doc_path("eml")
|
2024-08-01 15:24:17 -04:00
|
|
|
|
2023-06-29 18:01:12 -04:00
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_email_from_file_gets_filename_metadata_None():
    """With only a file-like object, `filename` and `file_directory` metadata are `None`."""
    eml_path = example_doc_path("eml/fake-email.eml")

    with open(eml_path, "rb") as eml_file:
        elements = partition_email(file=eml_file)

    for element in elements:
        assert element.metadata.filename is None
    for element in elements:
        assert element.metadata.file_directory is None
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_email_from_filename_prefers_metadata_filename():
    """A `metadata_filename` argument overrides the metadata derived from the actual path."""
    eml_path = example_doc_path("eml/fake-email.eml")

    elements = partition_email(eml_path, metadata_filename="a/b/c.eml")

    for element in elements:
        assert element.metadata.filename == "c.eml"
    for element in elements:
        assert element.metadata.file_directory == "a/b"
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_email_from_file_prefers_metadata_filename():
    """A `metadata_filename` argument supplies file metadata when partitioning a file object."""
    eml_path = example_doc_path("eml/fake-email.eml")

    with open(eml_path, "rb") as eml_file:
        elements = partition_email(file=eml_file, metadata_filename="d/e/f.eml")

    for element in elements:
        assert element.metadata.filename == "f.eml"
    for element in elements:
        assert element.metadata.file_directory == "d/e"
|
2023-06-29 18:01:12 -04:00
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
|
|
|
|
# -- .metadata.filetype --------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_email_gets_the_EML_MIME_type_in_metadata_filetype_for_message_body_elements():
    """Every element partitioned from the email body has "message/rfc822" as its filetype."""
    EML_MIME_TYPE = "message/rfc822"

    elements = partition_email(example_doc_path("eml/fake-email.eml"))

    # -- Collect the offending values so the failure message reports what was actually wrong;
    # -- reporting only the first element's filetype is misleading when that element is
    # -- correct and a later one is not.
    unexpected_filetypes = [
        e.metadata.filetype for e in elements if e.metadata.filetype != EML_MIME_TYPE
    ]
    assert not unexpected_filetypes, (
        f"Expected all elements to have '{EML_MIME_TYPE}' as their filetype, but got:"
        f" {unexpected_filetypes!r}"
    )
|
|
|
|
|
|
|
|
|
|
|
|
# -- .metadata.languages -------------------------------------------------------------------------
|
|
|
|
|
2024-08-01 15:24:17 -04:00
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_email_element_metadata_has_languages():
    """Each element from the example email has `["eng"]` as its `languages` metadata."""
    elements = partition_email(example_doc_path("eml/fake-email.eml"))

    for element in elements:
        assert element.metadata.languages == ["eng"]
|
2024-08-01 15:24:17 -04:00
|
|
|
|
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_email_respects_languages_arg():
    """A `languages` argument is reflected in the `languages` metadata of every element."""
    eml_path = example_doc_path("eml/fake-email.eml")

    elements = partition_email(eml_path, languages=["deu"])

    for element in elements:
        assert element.metadata.languages == ["deu"]
|
2024-08-01 15:24:17 -04:00
|
|
|
|
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_eml_respects_detect_language_per_element():
    """Per-element language detection finds both English and Spanish in a mixed email."""
    eml_path = example_doc_path("language-docs/eng_spa_mult.eml")

    elements = partition_email(eml_path, detect_language_per_element=True)

    # -- languages other than English and Spanish are detected by this partitioner, so this
    # -- test only checks that both are present among the first-listed languages --
    langs = set()
    for element in elements:
        languages = element.metadata.languages
        if languages is not None:
            langs.add(languages[0])

    assert "eng" in langs
    assert "spa" in langs
|
2024-08-01 15:24:17 -04:00
|
|
|
|
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
# -- .metadata.last_modified ---------------------------------------------------------------------
|
2024-08-01 15:24:17 -04:00
|
|
|
|
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_email_from_file_path_gets_last_modified_from_Date_header():
    """Partitioning by path gives every element the expected `last_modified` timestamp."""
    expected_last_modified = "2022-12-16T22:04:16+00:00"

    elements = partition_email(example_doc_path("eml/fake-email.eml"))

    for element in elements:
        assert element.metadata.last_modified == expected_last_modified
|
2024-08-01 15:24:17 -04:00
|
|
|
|
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_email_from_file_gets_last_modified_from_Date_header():
    """Partitioning a file object gives every element the expected `last_modified` timestamp."""
    expected_last_modified = "2022-12-16T22:04:16+00:00"
    eml_path = example_doc_path("eml/fake-email.eml")

    with open(eml_path, "rb") as eml_file:
        elements = partition_email(file=eml_file)

    for element in elements:
        assert element.metadata.last_modified == expected_last_modified
|
2024-08-01 15:24:17 -04:00
|
|
|
|
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_email_from_file_path_prefers_metadata_last_modified():
    """A `metadata_last_modified` argument becomes `last_modified` when partitioning by path."""
    metadata_last_modified = "2020-07-05T09:24:28"
    eml_path = example_doc_path("eml/fake-email.eml")

    elements = partition_email(eml_path, metadata_last_modified=metadata_last_modified)

    for element in elements:
        assert element.metadata.last_modified == metadata_last_modified
|
2024-08-01 15:24:17 -04:00
|
|
|
|
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_email_from_file_prefers_metadata_last_modified():
    """A `metadata_last_modified` argument becomes `last_modified` for a file object too."""
    metadata_last_modified = "2020-07-05T09:24:28"
    eml_path = example_doc_path("eml/fake-email.eml")

    with open(eml_path, "rb") as eml_file:
        elements = partition_email(file=eml_file, metadata_last_modified=metadata_last_modified)

    for element in elements:
        assert element.metadata.last_modified == metadata_last_modified
|
2024-08-01 15:24:17 -04:00
|
|
|
|
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
# -- chunking ------------------------------------------------------------------------------------
|
2024-08-01 15:24:17 -04:00
|
|
|
|
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_email_chunks_when_so_instructed():
    """Passing `chunking_strategy` gives the same result as chunking the elements afterward.

    Note it's actually the delegate partitioners that do the chunking.
    """
    eml_path = example_doc_path("eml/fake-email.txt")

    elements = partition_email(eml_path)
    chunks = partition_email(eml_path, chunking_strategy="by_title")
    separately_chunked_chunks = chunk_by_title(elements)

    chunk_types = (CompositeElement, Table, TableChunk)
    for chunk in chunks:
        assert isinstance(chunk, chunk_types)
    assert chunks != elements
    assert chunks == separately_chunked_chunks
|
2024-08-01 15:24:17 -04:00
|
|
|
|
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_email_chunks_attachments_too():
    """With chunking and attachment processing enabled, the attachment gets its own chunk."""
    eml_path = example_doc_path("eml/fake-email-attachment.eml")

    chunks = partition_email(eml_path, chunking_strategy="by_title", process_attachments=True)

    assert len(chunks) == 2
    for chunk in chunks:
        assert isinstance(chunk, CompositeElement)

    # -- the attachment's chunk is the last one --
    attachment_chunk = chunks[-1]
    attachment_metadata = attachment_chunk.metadata
    assert attachment_chunk.text == "Hey this is a fake attachment!"
    assert attachment_metadata.filename == "fake-attachment.txt"
    assert attachment_metadata.attached_to_filename == "fake-email-attachment.eml"

    for chunk in chunks:
        assert chunk.metadata.last_modified == "2022-12-23T18:08:48+00:00"
|
|
|
|
|
|
|
|
|
|
|
|
# -- attachments ---------------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_email_also_partitions_attachments_when_so_instructed():
    """With `process_attachments=True` the example email yields the two expected elements."""
    eml_path = example_doc_path("eml/email-equals-attachment-filename.eml")
    expected_elements = [
        NarrativeText("Below is an example of an odd filename"),
        Title("Odd filename"),
    ]

    elements = partition_email(eml_path, process_attachments=True)

    assert elements == expected_elements
|
2024-08-01 15:24:17 -04:00
|
|
|
|
2024-10-04 14:01:32 -07:00
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_email_can_process_attachments():
    """Partitioning with attachments enabled produces the expected element sequence.

    Also checks that every element carries the same `last_modified` timestamp and that the
    final element carries the attachment's filename metadata.
    """
    eml_path = example_doc_path("eml/fake-email-attachment.eml")

    elements = partition_email(eml_path, process_attachments=True)

    expected_elements = [
        Text("Hello!"),
        NarrativeText("Here's the attachments!"),
        NarrativeText("It includes:"),
        ListItem("Lots of whitespace"),
        ListItem("Little to no content"),
        ListItem("and is a quick read"),
        Text("Best,"),
        Text("Mallori"),
        NarrativeText("Hey this is a fake attachment!"),
    ]
    assert elements == expected_elements
    for element in elements:
        assert element.metadata.last_modified == "2022-12-23T18:08:48+00:00"
    # -- the final element is the one expected to come from the attachment --
    from_attachment = elements[-1]
    assert from_attachment.text == "Hey this is a fake attachment!"
    assert from_attachment.metadata.filename == "fake-attachment.txt"
    assert from_attachment.metadata.attached_to_filename == "fake-email-attachment.eml"
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_email_silently_skips_attachments_it_cannot_partition():
    """An email with an MP3 attachment partitions to its body element only."""
    eml_path = example_doc_path("eml/mime-attach-mp3.eml")

    # -- no exception is raised --
    elements = partition_email(eml_path, process_attachments=True)

    # -- the email body is partitioned; no elements appear for the attachment --
    assert elements == [NarrativeText("This is an email with an MP3 attachment.")]
|
2024-10-04 14:01:32 -07:00
|
|
|
|
2024-09-23 15:23:10 -07:00
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
# ================================================================================================
|
|
|
|
# ISOLATED UNIT TESTS
|
|
|
|
# ================================================================================================
|
2024-09-23 15:23:10 -07:00
|
|
|
|
2024-08-01 15:24:17 -04:00
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
class DescribeEmailPartitionerOptions:
    """Unit-test suite for `unstructured.partition.email.EmailPartitioningContext` objects.

    Each test constructs a context (directly or through `.load()`) and checks a single property
    or a single validation behavior.
    """

    # -- .load() ---------------------------------

    def it_provides_a_validating_constructor(self, ctx_args: dict[str, Any]):
        """`.load()` returns a context instance when a file-path is supplied."""
        ctx_args.update(file_path=example_doc_path("eml/fake-email.eml"))

        loaded = EmailPartitioningContext.load(**ctx_args)

        assert isinstance(loaded, EmailPartitioningContext)

    def but_it_raises_when_no_source_document_was_specified(self, ctx_args: dict[str, Any]):
        """`.load()` raises `ValueError` when both `file_path` and `file` are left as None."""
        expected_message = "no document specified; either a `filename` or `fi"

        with pytest.raises(ValueError, match=expected_message):
            EmailPartitioningContext.load(**ctx_args)

    def and_it_raises_when_a_file_open_for_reading_str_is_used(self, ctx_args: dict[str, Any]):
        """`.load()` raises `ValueError` when `file` is a text-mode (str) stream."""
        text_mode_file = io.StringIO("abcdefg")
        ctx_args.update(file=text_mode_file)

        with pytest.raises(ValueError, match="file object must be opened in binary mode"):
            EmailPartitioningContext.load(**ctx_args)

    def and_it_raises_when_an_invalid_content_source_is_specified(self, ctx_args: dict[str, Any]):
        """`.load()` raises `ValueError` for a `content_source` of "application/json"."""
        ctx_args.update(
            file_path=example_doc_path("eml/fake-email.eml"),
            content_source="application/json",
        )
        expected_message = "'application/json' is not a valid value for conte"

        with pytest.raises(ValueError, match=expected_message):
            EmailPartitioningContext.load(**ctx_args)

    # -- .bcc_addresses --------------------------

    def it_provides_access_to_the_Bcc_addresses_when_present(self):
        """`.bcc_addresses` is a list of address strings for the multi-recipient example."""
        eml_path = example_doc_path("eml/mime-multi-to-cc-bcc.eml")
        context = EmailPartitioningContext(eml_path)

        assert context.bcc_addresses == ["John <john@example.com>", "Mary <mary@example.com>"]

    def but_it_returns_None_when_there_are_no_Bcc_addresses(self):
        """`.bcc_addresses` is None for the simple RFC-822 example."""
        eml_path = example_doc_path("eml/simple-rfc-822.eml")
        context = EmailPartitioningContext(eml_path)

        assert context.bcc_addresses is None

    # -- .body_part ------------------------------

    def it_returns_the_html_body_part_when_there_is_one_by_default(self):
        """With no `content_source` given, `.body_part` holds the HTML content."""
        eml_path = example_doc_path("eml/mime-different-plain-html.eml")
        context = EmailPartitioningContext(eml_path)

        html_part = context.body_part

        assert isinstance(html_part, EmailMessage)
        html = html_part.get_content()
        assert isinstance(html, str)
        assert html.startswith("<!DOCTYPE html>")

    def but_it_returns_the_plain_text_body_part_when_there_is_one_when_so_requested(self):
        """With `content_source="text/plain"`, `.body_part` holds the plain-text content."""
        eml_path = example_doc_path("eml/mime-different-plain-html.eml")
        context = EmailPartitioningContext(eml_path, content_source="text/plain")

        plain_part = context.body_part

        assert isinstance(plain_part, EmailMessage)
        plain_text = plain_part.get_content()
        assert isinstance(plain_text, str)
        assert plain_text.startswith("This is the text/plain part.")

    def and_it_returns_None_when_the_email_has_no_body(self):
        """`.body_part` is None for the no-body example."""
        eml_path = example_doc_path("eml/mime-no-body.eml")
        context = EmailPartitioningContext(eml_path)

        assert context.body_part is None

    # -- .cc_addresses ---------------------------

    def it_provides_access_to_the_Cc_addresses_when_present(self):
        """`.cc_addresses` is a list of address strings for the multi-recipient example."""
        eml_path = example_doc_path("eml/mime-multi-to-cc-bcc.eml")
        context = EmailPartitioningContext(eml_path)

        assert context.cc_addresses == ["Tom <tom@example.com>", "Alice <alice@example.com>"]

    def but_it_returns_None_when_there_are_no_Cc_addresses(self):
        """`.cc_addresses` is None for the simple RFC-822 example."""
        eml_path = example_doc_path("eml/simple-rfc-822.eml")
        context = EmailPartitioningContext(eml_path)

        assert context.cc_addresses is None

    # -- .content_type_preference ----------------

    @pytest.mark.parametrize(
        ("content_source", "expected_value"),
        [
            ("text/html", ("html", "plain")),
            ("text/plain", ("plain", "html")),
        ],
    )
    def it_knows_whether_the_caller_prefers_the_HTML_or_plain_text_body(
        self, content_source: str, expected_value: tuple[str, ...]
    ):
        """`.content_type_preference` lists the requested subtype first, the other second."""
        context = EmailPartitioningContext(content_source=content_source)

        preference = context.content_type_preference

        assert preference == expected_value

    def and_it_defaults_to_preferring_the_HTML_body(self):
        """A context built with no arguments prefers "html" over "plain"."""
        context = EmailPartitioningContext()

        assert context.content_type_preference == ("html", "plain")

    # -- .from_address ---------------------------

    def it_knows_the_From_address_of_the_email(self):
        """`.from_address` is the sender address string for the simple MIME example."""
        eml_path = example_doc_path("eml/mime-simple.eml")
        context = EmailPartitioningContext(eml_path)

        assert context.from_address == "sender@example.com"

    # -- .message_id -----------------------------

    def it_provides_access_to_the_Message_ID_when_present(self):
        """`.message_id` is the expected id string for the simple MIME example."""
        eml_path = example_doc_path("eml/mime-simple.eml")
        context = EmailPartitioningContext(eml_path)

        assert context.message_id == "1234567890@example.com"

    def but_it_returns_None_when_there_is_no_Message_ID_header(self):
        """`.message_id` is None for the simple RFC-822 example."""
        eml_path = example_doc_path("eml/simple-rfc-822.eml")
        context = EmailPartitioningContext(eml_path)

        assert context.message_id is None

    # -- .metadata_file_path ---------------------

    def it_uses_the_metadata_file_path_arg_value_when_one_was_provided(self):
        """An explicit `metadata_file_path` argument is returned unchanged."""
        context = EmailPartitioningContext(metadata_file_path="a/b/c.eml")

        assert context.metadata_file_path == "a/b/c.eml"

    def and_it_uses_the_file_path_arg_value_when_metadata_file_path_was_not_provided(self):
        """`.metadata_file_path` takes the `file_path` value when no override was given."""
        context = EmailPartitioningContext(file_path="x/y/z.eml")

        assert context.metadata_file_path == "x/y/z.eml"

    def and_it_returns_None_when_neither_file_path_was_provided(self):
        """`.metadata_file_path` is None for a context built with no arguments."""
        context = EmailPartitioningContext()

        assert context.metadata_file_path is None

    # -- .metadata_last_modified -----------------

    def it_uses_the_metadata_last_modified_arg_value_when_one_was_provided(self):
        """An explicit `metadata_last_modified` argument is returned unchanged."""
        caller_provided = "2023-04-08T12:18:07"

        context = EmailPartitioningContext(metadata_last_modified=caller_provided)

        assert context.metadata_last_modified == caller_provided

    def and_it_uses_the_msg_Date_header_date_when_metadata_last_modified_was_not_provided(self):
        """`.metadata_last_modified` is the expected timestamp for the simple RFC-822 example."""
        eml_path = example_doc_path("eml/simple-rfc-822.eml")
        context = EmailPartitioningContext(eml_path)

        assert context.metadata_last_modified == "2024-10-01T17:34:56+00:00"

    def and_it_falls_back_to_filesystem_last_modified_when_no_Date_header_is_present(
        self, get_last_modified_date_: Mock
    ):
        """Not an expected case as according to RFC 5322, the Date header is required."""
        from_filesystem = "2024-07-09T14:08:17"
        # -- the mocked filesystem lookup supplies the value the context should report --
        get_last_modified_date_.return_value = from_filesystem
        eml_path = example_doc_path("eml/rfc822-no-date.eml")

        context = EmailPartitioningContext(eml_path)

        assert context.metadata_last_modified == from_filesystem

    def and_it_returns_None_when_no_last_modified_is_available(self):
        """`.metadata_last_modified` is None for the no-date example read from a file object."""
        eml_path = example_doc_path("eml/rfc822-no-date.eml")

        with open(eml_path, "rb") as eml_file:
            context = EmailPartitioningContext(file=eml_file)
            assert context.metadata_last_modified is None

    # -- .msg ------------------------------------

    def it_loads_the_email_message_from_the_filesystem_when_a_path_is_provided(self):
        """`.msg` is an `EmailMessage` when the context is given `file_path`."""
        eml_path = example_doc_path("eml/simple-rfc-822.eml")
        context = EmailPartitioningContext(file_path=eml_path)

        assert isinstance(context.msg, EmailMessage)

    def and_it_loads_the_email_message_from_a_file_like_object_when_one_is_provided(self):
        """`.msg` is an `EmailMessage` when the context is given a binary file object."""
        eml_path = example_doc_path("eml/simple-rfc-822.eml")

        with open(eml_path, "rb") as eml_file:
            context = EmailPartitioningContext(file=eml_file)
            assert isinstance(context.msg, EmailMessage)

    # -- .partitioning_kwargs --------------------

    def it_passes_along_the_kwargs_it_received_on_construction(self):
        """`.partitioning_kwargs` equals the `kwargs` dict given at construction."""
        extra_kwargs = {"foo": "bar", "baz": "qux"}

        context = EmailPartitioningContext(kwargs=extra_kwargs)

        assert context.partitioning_kwargs == extra_kwargs

    # -- .process_attachments --------------------

    @pytest.mark.parametrize("process_attachments", [True, False])
    def it_knows_whether_the_caller_wants_to_also_partition_attachments(
        self, process_attachments: bool
    ):
        """`.process_attachments` reflects the constructor argument."""
        context = EmailPartitioningContext(process_attachments=process_attachments)

        assert context.process_attachments == process_attachments

    def but_by_default_it_ignores_attachments(self):
        """`.process_attachments` is False for a context built with no arguments."""
        context = EmailPartitioningContext()

        assert context.process_attachments is False

    # -- .subject --------------------------------

    def it_provides_access_to_the_email_Subject_as_a_string(self):
        """`.subject` is the expected Unicode string for the word-encoded-subject example."""
        eml_path = example_doc_path("eml/mime-word-encoded-subject.eml")
        context = EmailPartitioningContext(eml_path)

        assert context.subject == "Simple email with ☸☿ Unicode subject"

    def but_it_returns_None_when_there_is_no_Subject_header(self):
        """`.subject` is None for the no-subject example."""
        eml_path = example_doc_path("eml/mime-no-subject.eml")
        context = EmailPartitioningContext(eml_path)

        assert context.subject is None

    # -- .to_addresses ---------------------------

    def it_provides_access_to_the_To_addresses_when_present(self):
        """`.to_addresses` is a list of address strings for the multi-recipient example."""
        eml_path = example_doc_path("eml/mime-multi-to-cc-bcc.eml")
        context = EmailPartitioningContext(eml_path)

        assert context.to_addresses == ["Bob <bob@example.com>", "Sue <sue@example.com>"]

    def but_it_returns_None_when_there_are_no_To_addresses(self):
        """`.to_addresses` is None for the no-To example."""
        eml_path = example_doc_path("eml/mime-no-to.eml")
        context = EmailPartitioningContext(eml_path)

        assert context.to_addresses is None

    # -- fixtures --------------------------------------------------------------------------------

    @pytest.fixture()
    def ctx_args(self) -> dict[str, Any]:
        """Baseline keyword arguments for `.load()`; tests override individual entries."""
        baseline: dict[str, Any] = {
            "file_path": None,
            "file": None,
            "content_source": "text/html",
            "metadata_file_path": None,
            "metadata_last_modified": None,
            "process_attachments": False,
            "kwargs": {},
        }
        return baseline

    @pytest.fixture()
    def get_last_modified_date_(self, request: FixtureRequest) -> Mock:
        """Mock standing in for `get_last_modified_date` in the email partitioner module."""
        patch_target = "unstructured.partition.email.get_last_modified_date"
        return function_mock(request, patch_target)
|