mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-04 12:03:15 +00:00 
			
		
		
		
	### Executive Summary
The structure of element metadata is currently static, meaning only
predefined fields can appear in the metadata. We would like the
flexibility for end-users, at their own discretion, to define and use
additional metadata fields that make sense for their particular
use-case.
### Concepts
A key concept for dynamic metadata is the _known field_. A known field is
one of the fields explicitly defined on `ElementMetadata`. Each of these has
a type and can be specified when _constructing_ a new `ElementMetadata`
instance. This is in contrast to an _end-user defined_ (or _ad-hoc_)
metadata field, one not known at "compile" time and added at the
discretion of an end-user to suit the purposes of their application.
An ad-hoc field can only be added by _assignment_ on an already
constructed instance.
### End-user ad-hoc metadata field behaviors
An ad-hoc field can be added to an `ElementMetadata` instance by
assignment:
```python
>>> metadata = ElementMetadata()
>>> metadata.coefficient = 0.536
```
A field added in this way can be accessed by name:
```python
>>> metadata.coefficient
0.536
```
and that field will appear in the JSON/dict for that instance:
```python
>>> metadata = ElementMetadata()
>>> metadata.coefficient = 0.536
>>> metadata.to_dict()
{"coefficient": 0.536}
```
However, accessing a "user-defined" value that has _not_ been assigned
on that instance raises `AttributeError`:
```python
>>> metadata.coeffcient  # -- misspelled "coefficient" --
AttributeError: 'ElementMetadata' object has no attribute 'coeffcient'
```
This makes "tagging" a metadata item with a value very convenient, but
entails the proviso that if an end-user wants to add a metadata field to
_some_ elements and not others (sparse population), AND they want to
access that field by name on ANY element and receive `None` where it has
not been assigned, they will need to use an expression like this:
```python
coefficient = metadata.coefficient if hasattr(metadata, "coefficient") else None
``` 
### Implementation Notes
- **ad-hoc metadata fields** are discarded during consolidation (for
chunking) because we don't have a consolidation strategy defined for
those. We could consider using a default consolidation strategy like
`FIRST` or possibly allow a user to register a strategy (although that
gets hairy in non-private and multiple-memory-space situations.)
- ad-hoc metadata fields **cannot start with an underscore**.
- We have no way to distinguish an ad-hoc field from any "noise" fields
that might appear in a JSON/dict loaded using `.from_dict()`, so unlike
the original (which only loaded known-fields), we'll rehydrate anything
that we find there.
- No real type-safety is possible on ad-hoc fields but the type-checker
does not complain because the type of all ad-hoc fields is `Any` (which
is the best available behavior in my view).
- We may want to consider whether end-users should be able to add ad-hoc
fields to "sub" metadata objects too, like `DataSourceMetadata` and
conceivably `CoordinatesMetadata` (although I'm not immediately seeing a
use-case for the second one).
		
	
			
		
			
				
	
	
		
			622 lines
		
	
	
		
			21 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			622 lines
		
	
	
		
			21 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
import datetime
 | 
						|
import email
 | 
						|
import os
 | 
						|
import pathlib
 | 
						|
 | 
						|
import pytest
 | 
						|
 | 
						|
from test_unstructured.unit_utils import (
 | 
						|
    assert_round_trips_through_JSON,
 | 
						|
    example_doc_path,
 | 
						|
    parse_optional_datetime,
 | 
						|
)
 | 
						|
from unstructured.chunking.title import chunk_by_title
 | 
						|
from unstructured.documents.elements import (
 | 
						|
    ElementMetadata,
 | 
						|
    Image,
 | 
						|
    ListItem,
 | 
						|
    NarrativeText,
 | 
						|
    Text,
 | 
						|
    Title,
 | 
						|
)
 | 
						|
from unstructured.documents.email_elements import (
 | 
						|
    MetaData,
 | 
						|
    ReceivedInfo,
 | 
						|
    Recipient,
 | 
						|
    Sender,
 | 
						|
    Subject,
 | 
						|
)
 | 
						|
from unstructured.partition.email import (
 | 
						|
    convert_to_iso_8601,
 | 
						|
    extract_attachment_info,
 | 
						|
    partition_email,
 | 
						|
    partition_email_header,
 | 
						|
)
 | 
						|
from unstructured.partition.text import partition_text
 | 
						|
 | 
						|
# -- location of the `.eml` example documents, resolved relative to this test module --
FILE_DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_DIRECTORY = os.path.join(FILE_DIRECTORY, "..", "..", "example-docs", "eml")


# -- body elements the tests below expect for the "fake-email" example documents --
EXPECTED_OUTPUT = [
    NarrativeText(text="This is a test email to use for unit tests."),
    Title(text="Important points:"),
    ListItem(text="Roses are red"),
    ListItem(text="Violets are blue"),
]

# -- body elements the tests below expect for the example email with an embedded image --
IMAGE_EXPECTED_OUTPUT = [
    NarrativeText(text="This is a test email to use for unit tests."),
    Title(text="Important points:"),
    NarrativeText(text="hello this is our logo."),
    Image(text="unstructured_logo.png"),
    ListItem(text="Roses are red"),
    ListItem(text="Violets are blue"),
]

# -- header elements expected when the message also carries "Received" information --
RECEIVED_HEADER_OUTPUT = [
    ReceivedInfo(name="ABCDEFG-000.ABC.guide", text="00.0.0.00"),
    ReceivedInfo(name="ABCDEFG-000.ABC.guide", text="ba23::58b5:2236:45g2:88h2"),
    ReceivedInfo(
        name="received_datetimetz",
        text="2023-02-20 10:03:18+12:00",
        datestamp=datetime.datetime(
            2023, 2, 20, 10, 3, 18, tzinfo=datetime.timezone(datetime.timedelta(hours=12))
        ),
    ),
    MetaData(name="MIME-Version", text="1.0"),
    MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"),
    MetaData(
        name="Message-ID",
        text="<CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>",
    ),
    Subject(text="Test Email"),
    Sender(name="Matthew Robinson", text="mrobinson@unstructured.io"),
    Recipient(name="Matthew Robinson", text="mrobinson@unstructured.io"),
    MetaData(
        name="Content-Type",
        text='multipart/alternative; boundary="00000000000095c9b205eff92630"',
    ),
]

# -- header elements expected when `include_headers=True` is used on the plain example --
HEADER_EXPECTED_OUTPUT = [
    MetaData(name="MIME-Version", text="1.0"),
    MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"),
    MetaData(
        name="Message-ID",
        text="<CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>",
    ),
    Subject(text="Test Email"),
    Sender(name="Matthew Robinson", text="mrobinson@unstructured.io"),
    Recipient(name="Matthew Robinson", text="mrobinson@unstructured.io"),
    MetaData(
        name="Content-Type",
        text='multipart/alternative; boundary="00000000000095c9b205eff92630"',
    ),
]

# -- headers first, then the body elements --
ALL_EXPECTED_OUTPUT = [*HEADER_EXPECTED_OUTPUT, *EXPECTED_OUTPUT]

# -- attachment descriptions expected from `extract_attachment_info()` --
ATTACH_EXPECTED_OUTPUT = [
    {"filename": "fake-attachment.txt", "payload": b"Hey this is a fake attachment!"},
]
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_from_filename():
    """Partitioning by filename yields the expected elements tagged with the base filename."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")

    elements = partition_email(filename=eml_path)

    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    assert all(e.metadata.filename == "fake-email.eml" for e in elements)
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_from_filename_with_metadata_filename():
    """`metadata_filename` replaces the filename recorded in every element's metadata."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")

    elements = partition_email(filename=eml_path, metadata_filename="test")

    assert len(elements) > 0
    for element in elements:
        assert element.metadata.filename == "test"
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_from_filename_malformed_encoding():
    """The malformed-encoding example still partitions into the expected elements."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-malformed-encoding.eml")

    elements = partition_email(filename=eml_path)

    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize(
    ("filename", "expected_output"),
    [
        ("fake-email-utf-16.eml", EXPECTED_OUTPUT),
        ("fake-email-utf-16-be.eml", EXPECTED_OUTPUT),
        ("fake-email-utf-16-le.eml", EXPECTED_OUTPUT),
        ("email-no-utf8-2008-07-16.062410.eml", None),
        ("email-no-utf8-2014-03-17.111517.eml", None),
        ("email-replace-mime-encodings-error-1.eml", None),
        ("email-replace-mime-encodings-error-2.eml", None),
        ("email-replace-mime-encodings-error-3.eml", None),
        ("email-replace-mime-encodings-error-4.eml", None),
        ("email-replace-mime-encodings-error-5.eml", None),
    ],
)
def test_partition_email_from_filename_default_encoding(filename, expected_output):
    """Each example partitions by filename without an explicit encoding.

    When `expected_output` is provided the elements must match it exactly; otherwise the case
    only requires a non-empty result. Every element must carry the example's filename.
    """
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)

    elements = partition_email(filename=eml_path)

    assert len(elements) > 0
    if expected_output:
        assert elements == expected_output
    assert all(e.metadata.filename == filename for e in elements)
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_from_file():
    """Partitioning a text-mode file object yields the expected elements with no filename."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")

    with open(eml_path) as eml_file:
        elements = partition_email(file=eml_file)

    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    assert all(e.metadata.filename is None for e in elements)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize(
    ("filename", "expected_output"),
    [
        ("fake-email-utf-16.eml", EXPECTED_OUTPUT),
        ("fake-email-utf-16-be.eml", EXPECTED_OUTPUT),
        ("fake-email-utf-16-le.eml", EXPECTED_OUTPUT),
        ("email-no-utf8-2008-07-16.062410.eml", None),
        ("email-no-utf8-2014-03-17.111517.eml", None),
        ("email-replace-mime-encodings-error-1.eml", None),
        ("email-replace-mime-encodings-error-2.eml", None),
        ("email-replace-mime-encodings-error-3.eml", None),
        ("email-replace-mime-encodings-error-4.eml", None),
        ("email-replace-mime-encodings-error-5.eml", None),
    ],
)
def test_partition_email_from_file_default_encoding(filename, expected_output):
    """Each example partitions from a text-mode file object without an explicit encoding.

    When `expected_output` is provided the elements must match it exactly; otherwise the case
    only requires a non-empty result. No element may carry a filename.
    """
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)

    with open(eml_path) as eml_file:
        elements = partition_email(file=eml_file)

    assert len(elements) > 0
    if expected_output:
        assert elements == expected_output
    assert all(e.metadata.filename is None for e in elements)
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_from_file_rb():
    """Partitioning a binary-mode file object yields the expected elements with no filename."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")

    with open(eml_path, "rb") as eml_file:
        elements = partition_email(file=eml_file)

    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    assert all(e.metadata.filename is None for e in elements)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize(
    ("filename", "expected_output"),
    [
        ("fake-email-utf-16.eml", EXPECTED_OUTPUT),
        ("fake-email-utf-16-be.eml", EXPECTED_OUTPUT),
        ("fake-email-utf-16-le.eml", EXPECTED_OUTPUT),
        ("email-no-utf8-2008-07-16.062410.eml", None),
        ("email-no-utf8-2014-03-17.111517.eml", None),
        ("email-replace-mime-encodings-error-1.eml", None),
        ("email-replace-mime-encodings-error-2.eml", None),
        ("email-replace-mime-encodings-error-3.eml", None),
        ("email-replace-mime-encodings-error-4.eml", None),
        ("email-replace-mime-encodings-error-5.eml", None),
    ],
)
def test_partition_email_from_file_rb_default_encoding(filename, expected_output):
    """Each example partitions from a binary-mode file object without an explicit encoding.

    When `expected_output` is provided the elements must match it exactly; otherwise the case
    only requires a non-empty result. No element may carry a filename.
    """
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)

    with open(eml_path, "rb") as eml_file:
        elements = partition_email(file=eml_file)

    assert len(elements) > 0
    if expected_output:
        assert elements == expected_output
    assert all(e.metadata.filename is None for e in elements)
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_from_text_file():
    """A `.txt` email partitioned with the "text/plain" content source gives the expected output."""
    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")

    with open(txt_path) as txt_file:
        elements = partition_email(file=txt_file, content_source="text/plain")

    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    assert all(e.metadata.filename is None for e in elements)
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_from_text_file_with_headers():
    """With `include_headers=True` the header elements precede the body elements."""
    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
    partition_kwargs = {"content_source": "text/plain", "include_headers": True}

    with open(txt_path) as txt_file:
        elements = partition_email(file=txt_file, **partition_kwargs)

    assert len(elements) > 0
    assert elements == ALL_EXPECTED_OUTPUT
    assert all(e.metadata.filename is None for e in elements)
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_from_text_file_max():
    """With `max_partition=20` the plain-text example produces six elements."""
    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")

    with open(txt_path) as txt_file:
        elements = partition_email(file=txt_file, content_source="text/plain", max_partition=20)

    element_count = len(elements)
    assert element_count == 6
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_from_text_file_raises_value_error():
    """Partitioning the plain-text example with `min_partition=1000` raises `ValueError`."""
    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")

    with pytest.raises(ValueError):
        with open(txt_path) as txt_file:
            partition_email(file=txt_file, content_source="text/plain", min_partition=1000)
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_from_text():
    """Partitioning the raw message text yields the expected elements with no filename."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    with open(eml_path) as eml_file:
        raw_message = eml_file.read()

    elements = partition_email(text=raw_message)

    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    assert all(e.metadata.filename is None for e in elements)
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_from_text_work_with_empty_string():
    """An empty text argument produces an empty element list."""
    elements = partition_email(text="")

    assert elements == []
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_from_filename_with_embedded_image():
    """The embedded-image example includes an `Image` element among the expected output."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-image-embedded.eml")

    elements = partition_email(filename=eml_path, content_source="text/plain")

    assert len(elements) > 0
    assert elements == IMAGE_EXPECTED_OUTPUT
    assert all(e.metadata.filename == "fake-email-image-embedded.eml" for e in elements)
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_from_file_with_header():
    """`partition_email_header()` returns the expected header elements for a parsed message."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-header.eml")
    with open(eml_path) as eml_file:
        message = email.message_from_file(eml_file)

    elements = partition_email_header(message)

    assert len(elements) > 0
    assert elements == RECEIVED_HEADER_OUTPUT
    assert all(e.metadata.filename is None for e in elements)
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_from_filename_has_metadata():
    """The first element's metadata matches the values expected for `fake-email.eml`.

    The `parent_id` is copied from the actual element, so it is not part of what is verified.
    """
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    elements = partition_email(filename=filename)

    # -- check for a non-empty result BEFORE indexing, so an empty result fails this assertion
    # -- rather than raising IndexError on `elements[0]` --
    assert len(elements) > 0
    parent_id = elements[0].metadata.parent_id

    expected_metadata = ElementMetadata(
        coordinates=None,
        filename=filename,
        last_modified="2022-12-16T17:04:16-05:00",
        page_number=None,
        url=None,
        sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
        sent_to=["NotMatthew <NotMatthew@notunstructured.com>"],
        subject="Test Email",
        filetype="message/rfc822",
        parent_id=parent_id,
        languages=["eng"],
    )
    assert elements[0].metadata.to_dict() == expected_metadata.to_dict()

    expected_dt = datetime.datetime.fromisoformat("2022-12-16T17:04:16-05:00")
    assert parse_optional_datetime(elements[0].metadata.last_modified) == expected_dt
    for element in elements:
        assert element.metadata.filename == "fake-email.eml"
 | 
						|
 | 
						|
 | 
						|
def test_extract_email_text_matches_html():
    """The "text/plain" and "text/html" content sources produce element-wise equal results."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-attachment.eml")
    elements_from_text = partition_email(filename=filename, content_source="text/plain")
    elements_from_html = partition_email(filename=filename, content_source="text/html")
    assert len(elements_from_text) == len(elements_from_html)
    # NOTE(robinson) - checking each individually is necessary because the text/html returns
    # HTMLTitle, HTMLNarrativeText, etc
    # -- compare each text element with its HTML counterpart; the previous version compared
    # -- each text element with itself, which could never fail --
    for text_element, html_element in zip(elements_from_text, elements_from_html):
        assert text_element == html_element
        assert text_element.metadata.filename == "fake-email-attachment.eml"
 | 
						|
 | 
						|
 | 
						|
def test_extract_attachment_info():
    """`extract_attachment_info()` reports the attachment's filename and payload."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-attachment.eml")
    with open(eml_path) as eml_file:
        message = email.message_from_file(eml_file)

    attachments = extract_attachment_info(message)

    assert len(attachments) > 0
    assert attachments == ATTACH_EXPECTED_OUTPUT
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_raises_with_none_specified():
    """Calling `partition_email()` with no arguments raises `ValueError`."""
    no_source_kwargs = {}

    with pytest.raises(ValueError):
        partition_email(**no_source_kwargs)
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_raises_with_too_many_specified():
    """Supplying both `filename` and `text` raises `ValueError`."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    with open(eml_path) as eml_file:
        raw_message = eml_file.read()

    with pytest.raises(ValueError):
        partition_email(filename=eml_path, text=raw_message)
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_raises_with_invalid_content_type():
    """The "application/json" content source is rejected with `ValueError`."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")

    with pytest.raises(ValueError):
        partition_email(filename=eml_path, content_source="application/json")
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_processes_fake_email_with_header():
    """The header example partitions into elements tagged with its base filename."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-header.eml")

    elements = partition_email(filename=eml_path)

    assert len(elements) > 0
    assert all(e.metadata.filename == "fake-email-header.eml" for e in elements)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize(
    ("time", "expected"),
    [
        ("Thu,  4 May 2023 02:32:49 +0000", "2023-05-04T02:32:49+00:00"),
        ("Thu, 4 May 2023 02:32:49 +0000", "2023-05-04T02:32:49+00:00"),
        ("Thu, 4 May 2023 02:32:49 +0000 (UTC)", "2023-05-04T02:32:49+00:00"),
        ("Thursday 5/3/2023 02:32:49", None),
    ],
)
def test_convert_to_iso_8601(time, expected):
    """`convert_to_iso_8601()` returns the expected value for each parametrized input."""
    assert convert_to_iso_8601(time) == expected
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_still_works_with_no_content():
    """The `email-no-html-content-1.eml` example partitions to an empty list."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "email-no-html-content-1.eml")

    assert partition_email(filename=eml_path) == []
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_from_filename_exclude_metadata():
    """With `include_metadata=False` the first element's checked metadata fields are unset."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-header.eml")

    elements = partition_email(filename=eml_path, include_metadata=False)

    metadata = elements[0].metadata
    assert parse_optional_datetime(metadata.last_modified) is None
    assert metadata.filetype is None
    assert metadata.page_name is None
    assert metadata.filename is None
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_from_text_file_exclude_metadata():
    """With `include_metadata=False` a plain-text file leaves the checked metadata fields unset."""
    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
    partition_kwargs = {"content_source": "text/plain", "include_metadata": False}

    with open(txt_path) as txt_file:
        elements = partition_email(file=txt_file, **partition_kwargs)

    metadata = elements[0].metadata
    assert parse_optional_datetime(metadata.last_modified) is None
    assert metadata.filetype is None
    assert metadata.page_name is None
    assert metadata.filename is None
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_from_file_exclude_metadata():
    """With `include_metadata=False` a file object leaves the checked metadata fields unset."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")

    with open(eml_path) as eml_file:
        elements = partition_email(file=eml_file, include_metadata=False)

    metadata = elements[0].metadata
    assert parse_optional_datetime(metadata.last_modified) is None
    assert metadata.filetype is None
    assert metadata.page_name is None
    assert metadata.filename is None
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_can_process_attachments(
    tmpdir,
    filename="example-docs/eml/fake-email-attachment.eml",
):
    """Attachment elements from `partition_email()` match partitioning the attachment directly.

    The attachment is first extracted to disk and partitioned with `partition_text()` to build
    the expected metadata; the email is then partitioned with `process_attachments=True` and
    its last element is compared against that expectation.
    """
    # -- extract into this test's own temporary directory; `tmpdir.dirname` is the parent
    # -- directory, which is not private to this test --
    output_dir = str(tmpdir)
    with open(filename) as f:
        msg = email.message_from_file(f)
    extract_attachment_info(msg, output_dir=output_dir)
    attachment_filename = os.path.join(output_dir, ATTACH_EXPECTED_OUTPUT[0]["filename"])

    mocked_last_modification_date = "0000-00-05T09:24:28"

    attachment_elements = partition_text(
        filename=attachment_filename,
        metadata_filename=attachment_filename,
        metadata_last_modified=mocked_last_modification_date,
    )
    expected_metadata = attachment_elements[0].metadata
    expected_metadata.file_directory = None
    expected_metadata.attached_to_filename = filename

    elements = partition_email(
        filename=filename,
        attachment_partitioner=partition_text,
        process_attachments=True,
        metadata_last_modified=mocked_last_modification_date,
    )

    # This test does not need to validate if hierarchy is working
    # Patch to nullify parent_id
    expected_metadata.parent_id = None
    elements[-1].metadata.parent_id = None

    assert elements[0].text.startswith("Hello!")

    # -- every element except the last (the attachment) belongs to the email itself --
    for element in elements[:-1]:
        assert element.metadata.filename == "fake-email-attachment.eml"
        assert element.metadata.subject == "Fake email with attachment"

    assert elements[-1].text == "Hey this is a fake attachment!"
    assert elements[-1].metadata == expected_metadata
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_can_process_min_max_with_attachments(
    tmpdir,
    filename="example-docs/eml/fake-email-attachment.eml",
):
    """`min_partition` and `max_partition` are applied to elements produced from attachments.

    The attachment is extracted and partitioned directly with the same limits; the last two
    elements of the email must match it, and every attachment element's text length must fall
    within the limits.
    """
    # -- extract into this test's own temporary directory; `tmpdir.dirname` is the parent
    # -- directory, which is not private to this test --
    output_dir = str(tmpdir)
    with open(filename) as f:
        msg = email.message_from_file(f)
    extract_attachment_info(msg, output_dir=output_dir)
    attachment_filename = os.path.join(output_dir, ATTACH_EXPECTED_OUTPUT[0]["filename"])

    attachment_elements = partition_text(
        filename=attachment_filename,
        metadata_filename=attachment_filename,
        min_partition=6,
        max_partition=12,
    )

    elements = partition_email(
        filename=filename,
        attachment_partitioner=partition_text,
        process_attachments=True,
        min_partition=6,
        max_partition=12,
    )

    assert elements[0].text.startswith("Hello!")
    assert elements[-1].text == attachment_elements[-1].text
    assert elements[-2].text == attachment_elements[-2].text
    for element in elements:
        # -- only elements that came from an attachment are checked against the limits --
        if element.metadata.attached_to_filename is not None:
            assert len(element.text) <= 12
            assert len(element.text) >= 6
 | 
						|
 | 
						|
 | 
						|
def test_partition_msg_raises_with_no_partitioner(
    filename="example-docs/eml/fake-email-attachment.eml",
):
    """`process_attachments=True` without an `attachment_partitioner` raises `ValueError`."""
    partition_kwargs = {"filename": filename, "process_attachments": True}

    with pytest.raises(ValueError):
        partition_email(**partition_kwargs)
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_from_file_custom_metadata_date(
    filename="example-docs/eml/fake-email-attachment.eml",
):
    """`metadata_last_modified` is recorded as `last_modified` when partitioning a file object."""
    custom_date = "2020-07-05T09:24:28"

    with open(filename) as eml_file:
        elements = partition_email(file=eml_file, metadata_last_modified=custom_date)

    first_metadata = elements[0].metadata
    assert first_metadata.last_modified == custom_date
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_custom_metadata_date(
    filename="example-docs/eml/fake-email-attachment.eml",
):
    """`metadata_last_modified` is recorded as `last_modified` when partitioning by filename."""
    custom_date = "2020-07-05T09:24:28"

    elements = partition_email(filename=filename, metadata_last_modified=custom_date)

    first_metadata = elements[0].metadata
    assert first_metadata.last_modified == custom_date
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_inline_content_disposition(
    filename="example-docs/eml/email-inline-content-disposition.eml",
):
    """The first two elements of the inline-content-disposition example are `Text` elements."""
    partition_kwargs = {
        "filename": filename,
        "process_attachments": True,
        "attachment_partitioner": partition_text,
    }

    elements = partition_email(**partition_kwargs)

    first, second = elements[0], elements[1]
    assert isinstance(first, Text)
    assert isinstance(second, Text)
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_odd_attachment_filename(
    filename="example-docs/eml/email-equals-attachment-filename.eml",
):
    """An attachment filename containing "=" characters is preserved in element metadata."""
    partition_kwargs = {
        "filename": filename,
        "process_attachments": True,
        "attachment_partitioner": partition_text,
    }

    elements = partition_email(**partition_kwargs)

    second_metadata = elements[1].metadata
    assert second_metadata.filename == "odd=file=name.txt"
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_with_json():
    """Elements partitioned from `fake-email.eml` are passed to the JSON round-trip check."""
    eml_path = example_doc_path("eml/fake-email.eml")

    elements = partition_email(eml_path)

    assert_round_trips_through_JSON(elements)
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_with_pgp_encrypted_message(
    caplog,
    filename="example-docs/eml/fake-encrypted.eml",
):
    """The encrypted example yields no elements and logs an "Encrypted email detected" warning."""
    elements = partition_email(filename=filename)

    assert elements == []
    captured_log = caplog.text
    assert "WARNING" in captured_log
    assert "Encrypted email detected" in captured_log
 | 
						|
 | 
						|
 | 
						|
def test_add_chunking_strategy_on_partition_email(
    filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt"),
):
    """`chunking_strategy="by_title"` gives the same result as `chunk_by_title()` applied after."""
    unchunked = partition_email(filename=filename)
    chunked_by_partitioner = partition_email(filename, chunking_strategy="by_title")

    chunked_afterwards = chunk_by_title(unchunked)

    assert chunked_by_partitioner != unchunked
    assert chunked_by_partitioner == chunked_afterwards
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_element_metadata_has_languages():
    """The first element of `fake-email.eml` reports `["eng"]` as its languages."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")

    elements = partition_email(filename=eml_path)

    first_metadata = elements[0].metadata
    assert first_metadata.languages == ["eng"]
 | 
						|
 | 
						|
 | 
						|
def test_partition_email_respects_languages_arg():
    """An explicit `languages` argument is recorded on every element."""
    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")

    elements = partition_email(filename=eml_path, languages=["deu"])

    for element in elements:
        assert element.metadata.languages == ["deu"]
 | 
						|
 | 
						|
 | 
						|
def test_partition_eml_respects_detect_language_per_element():
    """With `detect_language_per_element=True` both "eng" and "spa" appear among the elements."""
    eml_path = "example-docs/language-docs/eng_spa_mult.eml"

    elements = partition_email(filename=eml_path, detect_language_per_element=True)

    # languages other than English and Spanish are detected by this partitioner,
    # so this test is slightly different from the other partition tests
    detected_languages = set()
    for element in elements:
        detected_languages.add(element.metadata.languages[0])
    assert "eng" in detected_languages
    assert "spa" in detected_languages
 |