468 lines
17 KiB
Python
Raw Permalink Normal View History

"""Test suite for `unstructured.partition.msg` module."""
from __future__ import annotations
import io
from typing import Any
import pytest
from oxmsg import Message
from test_unstructured.unit_utils import (
FixtureRequest,
LogCaptureFixture,
Mock,
assert_round_trips_through_JSON,
example_doc_path,
rfctr(email): eml partitioner rewrite (#3694) **Summary** Initial attempts to incrementally refactor `partition_email()` into shape to allow pluggable partitioning quickly became too complex for ready code-review. Prepare separate rewritten module and tests and swap them out whole. **Additional Context** - Uses the modern stdlib `email` module to reliably accomplish several manual decoding steps in the legacy code. - Remove obsolete email-specific element-types which were replaced 18 months or so ago with email-specific metadata fields for things like Cc: addresses, subject, etc. - Remove accepting an email as `text: str` because MIME-email is inherently a binary format which can and often does contain multiple and contradictory character-encodings. - Remove `encoding` parameters as it is now unused. An email file is not a text file and as such does not have a single overall encoding. Character encoding is specified individually for each MIME-part within the message and often varies from one part to another in the same message. - Remove the need for a caller to specify `attachment_partitioner`. There is only one reasonable choice for this which is `auto.partition()`, consistent with the same interface and operation in `partition_msg()`. - Fixes #3671 along the way by silently skipping attachments with a file-type for which there is no partitioner. - Substantially extend the test-suite to cover multiple transport-encoding/charset combinations. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: scanny <scanny@users.noreply.github.com>
2024-10-15 19:02:33 -07:00
function_mock,
property_mock,
)
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import (
ElementMetadata,
ListItem,
NarrativeText,
rfctr(email): eml partitioner rewrite (#3694) **Summary** Initial attempts to incrementally refactor `partition_email()` into shape to allow pluggable partitioning quickly became too complex for ready code-review. Prepare separate rewritten module and tests and swap them out whole. **Additional Context** - Uses the modern stdlib `email` module to reliably accomplish several manual decoding steps in the legacy code. - Remove obsolete email-specific element-types which were replaced 18 months or so ago with email-specific metadata fields for things like Cc: addresses, subject, etc. - Remove accepting an email as `text: str` because MIME-email is inherently a binary format which can and often does contain multiple and contradictory character-encodings. - Remove `encoding` parameters as it is now unused. An email file is not a text file and as such does not have a single overall encoding. Character encoding is specified individually for each MIME-part within the message and often varies from one part to another in the same message. - Remove the need for a caller to specify `attachment_partitioner`. There is only one reasonable choice for this which is `auto.partition()`, consistent with the same interface and operation in `partition_msg()`. - Fixes #3671 along the way by silently skipping attachments with a file-type for which there is no partitioner. - Substantially extend the test-suite to cover multiple transport-encoding/charset combinations. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: scanny <scanny@users.noreply.github.com>
2024-10-15 19:02:33 -07:00
Text,
)
rfctr(email): eml partitioner rewrite (#3694) **Summary** Initial attempts to incrementally refactor `partition_email()` into shape to allow pluggable partitioning quickly became too complex for ready code-review. Prepare separate rewritten module and tests and swap them out whole. **Additional Context** - Uses the modern stdlib `email` module to reliably accomplish several manual decoding steps in the legacy code. - Remove obsolete email-specific element-types which were replaced 18 months or so ago with email-specific metadata fields for things like Cc: addresses, subject, etc. - Remove accepting an email as `text: str` because MIME-email is inherently a binary format which can and often does contain multiple and contradictory character-encodings. - Remove `encoding` parameters as it is now unused. An email file is not a text file and as such does not have a single overall encoding. Character encoding is specified individually for each MIME-part within the message and often varies from one part to another in the same message. - Remove the need for a caller to specify `attachment_partitioner`. There is only one reasonable choice for this which is `auto.partition()`, consistent with the same interface and operation in `partition_msg()`. - Fixes #3671 along the way by silently skipping attachments with a file-type for which there is no partitioner. - Substantially extend the test-suite to cover multiple transport-encoding/charset combinations. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: scanny <scanny@users.noreply.github.com>
2024-10-15 19:02:33 -07:00
from unstructured.partition.common import UnsupportedFileFormatError
from unstructured.partition.msg import MsgPartitionerOptions, partition_msg
# -- elements the tests below expect from partitioning the `fake-email.msg` example document --
EXPECTED_MSG_OUTPUT = [
    NarrativeText(text="This is a test email to use for unit tests."),
    Text(text="Important points:"),
    ListItem(text="Roses are red"),
    ListItem(text="Violets are blue"),
]
def test_partition_msg_from_filename():
    """Partitioning by file-path yields the expected elements and first-element metadata."""
    msg_path = example_doc_path("fake-email.msg")

    elements = partition_msg(filename=msg_path)

    first_metadata = elements[0].metadata
    assert elements == EXPECTED_MSG_OUTPUT
    # -- `parent_id` is copied from the actual element, so only the other fields are pinned --
    expected_metadata = ElementMetadata(
        coordinates=None,
        filename=msg_path,
        last_modified="2023-03-28T17:00:31+00:00",
        page_number=None,
        url=None,
        sent_from=['"Matthew Robinson" <mrobinson@unstructured.io>'],
        sent_to=["mrobinson@unstructured.io"],
        subject="Test Email",
        filetype="application/vnd.ms-outlook",
        parent_id=first_metadata.parent_id,
        languages=["eng"],
    )
    assert first_metadata.to_dict() == expected_metadata.to_dict()
def test_partition_msg_from_filename_returns_uns_elements():
    """The first element partitioned from a file-path is a `NarrativeText` instance."""
    elements = partition_msg(filename=example_doc_path("fake-email.msg"))

    assert isinstance(elements[0], NarrativeText)
def test_partition_msg_from_filename_with_metadata_filename():
    """A `metadata_filename` argument is what every element reports as its filename."""
    elements = partition_msg(
        filename=example_doc_path("fake-email.msg"), metadata_filename="test"
    )

    reported_filenames = [e.metadata.filename for e in elements]
    assert all(name == "test" for name in reported_filenames)
def test_partition_msg_from_filename_with_text_content():
    """The first element carries the body text plus file-name and directory metadata."""
    elements = partition_msg(filename=example_doc_path("fake-email.msg"))

    first = elements[0]
    assert str(first) == "This is a test email to use for unit tests."
    assert first.metadata.filename == "fake-email.msg"
    assert first.metadata.file_directory == example_doc_path("")
def test_partition_msg_raises_with_missing_file():
    """A path to a file that does not exist raises `FileNotFoundError`."""
    missing_path = example_doc_path("doesnt-exist.msg")

    with pytest.raises(FileNotFoundError):
        partition_msg(filename=missing_path)
def test_partition_msg_from_file():
    """Partitioning an open binary file yields the expected elements with no filename."""
    with open(example_doc_path("fake-email.msg"), "rb") as msg_file:
        elements = partition_msg(file=msg_file)

    assert elements == EXPECTED_MSG_OUTPUT
    assert all(e.metadata.filename is None for e in elements)
def test_partition_msg_from_file_with_metadata_filename():
    """With a file object, `metadata_filename` supplies the filename on every element."""
    with open(example_doc_path("fake-email.msg"), "rb") as msg_file:
        elements = partition_msg(file=msg_file, metadata_filename="test")

    assert elements == EXPECTED_MSG_OUTPUT
    assert all(e.metadata.filename == "test" for e in elements)
def test_partition_msg_uses_file_path_when_both_are_specified():
    """When both a path and a file object are given, the expected elements still result."""
    msg_path = example_doc_path("fake-email.msg")
    # -- these bytes are not a valid MSG payload, so the result must come from the path --
    bogus_file = io.BytesIO(b"abcde")

    elements = partition_msg(msg_path, file=bogus_file)

    assert elements == EXPECTED_MSG_OUTPUT
def test_partition_msg_raises_with_neither():
    """Calling `partition_msg()` with no arguments raises `ValueError`."""
    with pytest.raises(ValueError):
        partition_msg()
rfctr(email): eml partitioner rewrite (#3694) **Summary** Initial attempts to incrementally refactor `partition_email()` into shape to allow pluggable partitioning quickly became too complex for ready code-review. Prepare separate rewritten module and tests and swap them out whole. **Additional Context** - Uses the modern stdlib `email` module to reliably accomplish several manual decoding steps in the legacy code. - Remove obsolete email-specific element-types which were replaced 18 months or so ago with email-specific metadata fields for things like Cc: addresses, subject, etc. - Remove accepting an email as `text: str` because MIME-email is inherently a binary format which can and often does contain multiple and contradictory character-encodings. - Remove `encoding` parameters as it is now unused. An email file is not a text file and as such does not have a single overall encoding. Character encoding is specified individually for each MIME-part within the message and often varies from one part to another in the same message. - Remove the need for a caller to specify `attachment_partitioner`. There is only one reasonable choice for this which is `auto.partition()`, consistent with the same interface and operation in `partition_msg()`. - Fixes #3671 along the way by silently skipping attachments with a file-type for which there is no partitioner. - Substantially extend the test-suite to cover multiple transport-encoding/charset combinations. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: scanny <scanny@users.noreply.github.com>
2024-10-15 19:02:33 -07:00
# -- attachments ---------------------------------------------------------------------------------
def test_partition_msg_can_process_attachments():
    """With `process_attachments=True`, body and attachment elements appear in the result.

    The filename on each element identifies its origin: the message itself for the first five
    elements, then each attachment over the index ranges asserted below.
    """
    elements = partition_msg(
        example_doc_path("fake-email-multiple-attachments.msg"), process_attachments=True
    )

    # -- element provenance, by filename --
    filenames = [e.metadata.filename for e in elements]
    assert all(name == "fake-email-multiple-attachments.msg" for name in filenames[:5])
    assert all(name == "unstructured_logo.png" for name in filenames[5:7])
    assert all(name == "dense_doc.pdf" for name in filenames[7:343])
    assert all(name == "Engineering Onboarding.pptx" for name in filenames[343:])

    # -- text of the message-body elements --
    body_texts = [e.text for e in elements[:5]]
    assert body_texts == [
        "Here are those documents.",
        "--",
        "Mallori Harrell",
        "Unstructured Technologies",
        "Data Scientist",
    ]

    # -- element types at the head and tail of the sequence --
    type_names = [type(e).__name__ for e in elements]
    assert type_names[:10] == [
        "NarrativeText",
        "Text",
        "Text",
        "Text",
        "Text",
        "Image",
        "Text",
        "Text",
        "Title",
        "Title",
    ]
    assert type_names[-10:] == [
        "Title",
        "ListItem",
        "ListItem",
        "ListItem",
        "ListItem",
        "ListItem",
        "ListItem",
        "ListItem",
        "ListItem",
        "ListItem",
    ]
rfctr(email): eml partitioner rewrite (#3694) **Summary** Initial attempts to incrementally refactor `partition_email()` into shape to allow pluggable partitioning quickly became too complex for ready code-review. Prepare separate rewritten module and tests and swap them out whole. **Additional Context** - Uses the modern stdlib `email` module to reliably accomplish several manual decoding steps in the legacy code. - Remove obsolete email-specific element-types which were replaced 18 months or so ago with email-specific metadata fields for things like Cc: addresses, subject, etc. - Remove accepting an email as `text: str` because MIME-email is inherently a binary format which can and often does contain multiple and contradictory character-encodings. - Remove `encoding` parameters as it is now unused. An email file is not a text file and as such does not have a single overall encoding. Character encoding is specified individually for each MIME-part within the message and often varies from one part to another in the same message. - Remove the need for a caller to specify `attachment_partitioner`. There is only one reasonable choice for this which is `auto.partition()`, consistent with the same interface and operation in `partition_msg()`. - Fixes #3671 along the way by silently skipping attachments with a file-type for which there is no partitioner. - Substantially extend the test-suite to cover multiple transport-encoding/charset combinations. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: scanny <scanny@users.noreply.github.com>
2024-10-15 19:02:33 -07:00
def test_partition_msg_silently_skips_attachments_it_cannot_partition(request: FixtureRequest):
    """Attachments whose partitioning raises `UnsupportedFileFormatError` contribute nothing."""
    # -- make every call to `auto.partition()` fail as an unsupported file-type --
    function_mock(
        request, "unstructured.partition.auto.partition", side_effect=UnsupportedFileFormatError()
    )

    elements = partition_msg(
        example_doc_path("fake-email-multiple-attachments.msg"), process_attachments=True
    )

    # -- no exception is raised and only the email body is partitioned --
    expected_body_elements = [
        NarrativeText("Here are those documents."),
        Text("--"),
        Text("Mallori Harrell"),
        Text("Unstructured Technologies"),
        Text("Data Scientist"),
        # -- no elements appear for the attachment(s) --
    ]
    assert elements == expected_body_elements
# -- .metadata.filename --------------------------------------------------------------------------
def test_partition_msg_from_filename_gets_filename_metadata_from_file_path():
    """File-name and directory metadata come from the path the message was loaded from."""
    elements = partition_msg(example_doc_path("fake-email.msg"))

    expected_directory = example_doc_path("")
    assert all(e.metadata.filename == "fake-email.msg" for e in elements)
    assert all(e.metadata.file_directory == expected_directory for e in elements)
def test_partition_msg_from_file_gets_filename_metadata_None():
    """Elements partitioned from a file object have neither filename nor file-directory."""
    with open(example_doc_path("fake-email.msg"), "rb") as msg_file:
        elements = partition_msg(file=msg_file)

    metadatas = [e.metadata for e in elements]
    assert all(m.filename is None for m in metadatas)
    assert all(m.file_directory is None for m in metadatas)
def test_partition_msg_from_filename_prefers_metadata_filename():
    """`metadata_filename` overrides the file-path for filename and directory metadata."""
    elements = partition_msg(example_doc_path("fake-email.msg"), metadata_filename="a/b/c.msg")

    metadatas = [e.metadata for e in elements]
    assert all(m.filename == "c.msg" for m in metadatas)
    assert all(m.file_directory == "a/b" for m in metadatas)
def test_partition_msg_from_file_prefers_metadata_filename():
    """With a file object, `metadata_filename` provides both filename and directory."""
    with open(example_doc_path("fake-email.msg"), "rb") as msg_file:
        elements = partition_msg(file=msg_file, metadata_filename="d/e/f.msg")

    metadatas = [e.metadata for e in elements]
    assert all(m.filename == "f.msg" for m in metadatas)
    assert all(m.file_directory == "d/e" for m in metadatas)
# -- .metadata.filetype --------------------------------------------------------------------------
def test_partition_msg_gets_the_MSG_mime_type_in_metadata_filetype():
    """Every element reports the Outlook MSG MIME-type as its `filetype`."""
    MSG_MIME_TYPE = "application/vnd.ms-outlook"

    elements = partition_msg(example_doc_path("fake-email.msg"))

    filetypes = [e.metadata.filetype for e in elements]
    assert all(filetype == MSG_MIME_TYPE for filetype in filetypes), (
        f"Expected all elements to have '{MSG_MIME_TYPE}' as their filetype, but got:"
        f" {repr(elements[0].metadata.filetype)}"
    )
rfctr(part): prepare for pluggable auto-partitioners 1 (#3655) **Summary** In preparation for pluggable auto-partitioners simplify metadata as discussed. **Additional Context** - Pluggable auto-partitioners requires partitioners to have a consistent call signature. An arbitrary partitioner provided at runtime needs to have a call signature that is known and consistent. Basically `partition_x(filename, *, file, **kwargs)`. - The current `auto.partition()` is highly coupled to each distinct file-type partitioner, deciding which arguments to forward to each. - This is driven by the existence of "delegating" partitioners, those that convert their file-type and then call a second partitioner to do the actual partitioning. Both the delegating and proxy partitioners are decorated with metadata-post-processing decorators and those decorators are not idempotent. We call the situation where those decorators would run twice "double-decorating". For example, EPUB converts to HTML and calls `partition_html()` and both `partition_epub()` and `partition_html()` are decorated. - The way double-decorating has been avoided in the past is to avoid sending the arguments the metadata decorators are sensitive to to the proxy partitioner. This is very obscure, complex to reason about, error-prone, and just overall not a viable strategy. The better solution is to not decorate delegating partitioners and let the proxy partitioner handle all the metadata. - This first step in preparation for that is part of simplifying the metadata processing by removing unused or unwanted legacy parameters. - `date_from_file_object` is a misnomer because a file-object never contains last-modified data. - It can never produce useful results in the API where last-modified information must be provided by `metadata_last_modified`. - It is an undocumented parameter so not in use. - Using it can produce incorrect metadata.
2024-09-23 15:23:10 -07:00
# -- .metadata.last_modified ---------------------------------------------------------------------
def test_partition_msg_pulls_last_modified_from_message_sent_date():
    """Without an override, every element reports the same expected `last_modified` value."""
    elements = partition_msg(example_doc_path("fake-email.msg"))

    last_modified_values = [e.metadata.last_modified for e in elements]
    assert all(value == "2023-03-28T17:00:31+00:00" for value in last_modified_values)
rfctr(part): prepare for pluggable auto-partitioners 1 (#3655) **Summary** In preparation for pluggable auto-partitioners simplify metadata as discussed. **Additional Context** - Pluggable auto-partitioners requires partitioners to have a consistent call signature. An arbitrary partitioner provided at runtime needs to have a call signature that is known and consistent. Basically `partition_x(filename, *, file, **kwargs)`. - The current `auto.partition()` is highly coupled to each distinct file-type partitioner, deciding which arguments to forward to each. - This is driven by the existence of "delegating" partitioners, those that convert their file-type and then call a second partitioner to do the actual partitioning. Both the delegating and proxy partitioners are decorated with metadata-post-processing decorators and those decorators are not idempotent. We call the situation where those decorators would run twice "double-decorating". For example, EPUB converts to HTML and calls `partition_html()` and both `partition_epub()` and `partition_html()` are decorated. - The way double-decorating has been avoided in the past is to avoid sending the arguments the metadata decorators are sensitive to to the proxy partitioner. This is very obscure, complex to reason about, error-prone, and just overall not a viable strategy. The better solution is to not decorate delegating partitioners and let the proxy partitioner handle all the metadata. - This first step in preparation for that is part of simplifying the metadata processing by removing unused or unwanted legacy parameters. - `date_from_file_object` is a misnomer because a file-object never contains last-modified data. - It can never produce useful results in the API where last-modified information must be provided by `metadata_last_modified`. - It is an undocumented parameter so not in use. - Using it can produce incorrect metadata.
2024-09-23 15:23:10 -07:00
def test_partition_msg_from_file_path_prefers_metadata_last_modified():
    """A `metadata_last_modified` argument is reported as-is on the first element."""
    expected_last_modified = "2020-07-05T09:24:28"

    elements = partition_msg(
        example_doc_path("fake-email.msg"), metadata_last_modified=expected_last_modified
    )

    first_metadata = elements[0].metadata
    assert first_metadata.last_modified == expected_last_modified
def test_partition_msg_from_file_prefers_metadata_last_modified():
    """A caller-supplied `metadata_last_modified` is applied when partitioning a file object."""
    expected_last_modified = "2020-07-05T09:24:28"

    with open(example_doc_path("fake-email.msg"), "rb") as msg_file:
        elements = partition_msg(file=msg_file, metadata_last_modified=expected_last_modified)

    # -- every element must carry the caller's value --
    for element in elements:
        assert element.metadata.last_modified == expected_last_modified
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv partition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name only if possible * feat: add csv tests * fix: renaming * feat: add field metadata_date as date of last mod * feat: add tests for partition_docx * feat: add field metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coordinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst partition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean 
up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
rfctr(part): prepare for pluggable auto-partitioners 1 (#3655) **Summary** In preparation for pluggable auto-partitioners simplify metadata as discussed. **Additional Context** - Pluggable auto-partitioners requires partitioners to have a consistent call signature. An arbitrary partitioner provided at runtime needs to have a call signature that is known and consistent. Basically `partition_x(filename, *, file, **kwargs)`. - The current `auto.partition()` is highly coupled to each distinct file-type partitioner, deciding which arguments to forward to each. - This is driven by the existence of "delegating" partitioners, those that convert their file-type and then call a second partitioner to do the actual partitioning. Both the delegating and proxy partitioners are decorated with metadata-post-processing decorators and those decorators are not idempotent. We call the situation where those decorators would run twice "double-decorating". For example, EPUB converts to HTML and calls `partition_html()` and both `partition_epub()` and `partition_html()` are decorated. - The way double-decorating has been avoided in the past is to avoid sending the arguments the metadata decorators are sensitive to to the proxy partitioner. This is very obscure, complex to reason about, error-prone, and just overall not a viable strategy. The better solution is to not decorate delegating partitioners and let the proxy partitioner handle all the metadata. - This first step in preparation for that is part of simplifying the metadata processing by removing unused or unwanted legacy parameters. - `date_from_file_object` is a misnomer because a file-object never contains last-modified data. - It can never produce useful results in the API where last-modified information must be provided by `metadata_last_modified`. - It is an undocumented parameter so not in use. - Using it can produce incorrect metadata.
2024-09-23 15:23:10 -07:00
# ------------------------------------------------------------------------------------------------
def test_partition_msg_with_json():
    """Elements partitioned from the example MSG file round-trip through JSON."""
    msg_path = example_doc_path("fake-email.msg")
    assert_round_trips_through_JSON(partition_msg(msg_path))
def test_partition_msg_with_pgp_encrypted_message(caplog: LogCaptureFixture):
    """An encrypted message produces no elements and logs a warning saying so."""
    encrypted_msg_path = example_doc_path("fake-encrypted.msg")

    assert partition_msg(encrypted_msg_path) == []

    # -- both the log level and the explanation must appear in the captured log text --
    for expected_fragment in ("WARNING", "Encrypted email detected"):
        assert expected_fragment in caplog.text
def test_add_chunking_strategy_by_title_on_partition_msg():
    """`chunking_strategy="by_title"` gives the same result as chunking the elements afterward."""
    msg_path = example_doc_path("fake-email.msg")

    unchunked = partition_msg(filename=msg_path)
    chunked_by_partitioner = partition_msg(msg_path, chunking_strategy="by_title")
    chunked_separately = chunk_by_title(unchunked)

    # -- chunking must actually change the output ... --
    assert chunked_by_partitioner != unchunked
    # -- ... and must match what `chunk_by_title()` produces on its own --
    assert chunked_by_partitioner == chunked_separately
# -- language behaviors --------------------------------------------------------------------------
def test_partition_msg_element_metadata_has_languages():
    """The first element of the example MSG file reports English as its language."""
    # -- use the shared `example_doc_path()` helper, as the other tests in this module do, rather
    # -- than a hard-coded relative path; presumably this also removes any dependence on the
    # -- directory pytest is invoked from (confirm against the helper's implementation).
    filename = example_doc_path("fake-email.msg")

    elements = partition_msg(filename=filename)

    assert elements[0].metadata.languages == ["eng"]
def test_partition_msg_respects_languages_arg():
    """A caller-supplied `languages` value is recorded on every element."""
    # -- use the shared `example_doc_path()` helper, as the other tests in this module do, rather
    # -- than a hard-coded relative path; presumably this also removes any dependence on the
    # -- directory pytest is invoked from (confirm against the helper's implementation).
    filename = example_doc_path("fake-email.msg")

    elements = partition_msg(filename=filename, languages=["deu"])

    assert all(element.metadata.languages == ["deu"] for element in elements)
def test_partition_msg_raises_TypeError_for_invalid_languages():
    """Passing `languages` as a bare `str` instead of a list raises `TypeError`."""
    # -- setup stays outside the `raises` block so only the call under test can satisfy it; the
    # -- path comes from `example_doc_path()`, as in the other tests in this module, rather than
    # -- a hard-coded relative path.
    filename = example_doc_path("fake-email.msg")

    with pytest.raises(TypeError):
        partition_msg(filename=filename, languages="eng")
# ================================================================================================
# ISOLATED UNIT TESTS
# ================================================================================================
# These test components used by `partition_msg()` in isolation such that all edge cases can be
# exercised.
# ================================================================================================
class DescribeMsgPartitionerOptions:
    """Isolated unit-tests for `unstructured.partition.msg.MsgPartitionerOptions`.

    Each test starts from the all-defaults `opts_args` fixture and overrides only the arguments
    that matter for the behavior being exercised.
    """

    # -- .extra_msg_metadata ---------------------

    def it_provides_email_specific_metadata_to_add_to_each_element(self, opts_args: dict[str, Any]):
        opts_args.update(file_path=example_doc_path("fake-email-with-cc-and-bcc.msg"))

        metadata = MsgPartitionerOptions(**opts_args).extra_msg_metadata

        assert metadata.bcc_recipient == ["hello@unstructured.io"]
        assert metadata.cc_recipient == ["steve@unstructured.io"]
        assert metadata.email_message_id == "14DDEF33-2BA7-4CDD-A4D8-E7C5873B37F2@gmail.com"
        assert metadata.sent_from == ['"John" <johnjennings702@gmail.com>']
        assert metadata.sent_to == [
            "john-ctr@unstructured.io",
            "steve@unstructured.io",
            "hello@unstructured.io",
        ]
        assert metadata.subject == "Fake email with cc and bcc recipients"

    # -- .is_encrypted ---------------------------

    @pytest.mark.parametrize(
        ("file_name", "expected_value"),
        [
            ("fake-encrypted.msg", True),
            ("fake-email.msg", False),
        ],
    )
    def it_knows_when_the_msg_is_encrypted(
        self, file_name: str, expected_value: bool, opts_args: dict[str, Any]
    ):
        opts_args.update(file_path=example_doc_path(file_name))

        is_encrypted = MsgPartitionerOptions(**opts_args).is_encrypted

        assert is_encrypted is expected_value

    # -- .metadata_file_path ---------------------

    def it_uses_the_metadata_file_path_arg_when_provided(self, opts_args: dict[str, Any]):
        # -- both paths are given; the explicit metadata path is the one expected back --
        opts_args.update(file_path="x/y/z.msg", metadata_file_path="a/b/c.msg")

        metadata_file_path = MsgPartitionerOptions(**opts_args).metadata_file_path

        assert metadata_file_path == "a/b/c.msg"

    def and_it_falls_back_to_the_MSG_file_path_arg_when_provided(self, opts_args: dict[str, Any]):
        msg_path = example_doc_path("fake-email.msg")
        opts_args.update(file_path=msg_path)

        metadata_file_path = MsgPartitionerOptions(**opts_args).metadata_file_path

        assert metadata_file_path == msg_path

    def but_it_returns_None_when_neither_path_is_available(self, opts_args: dict[str, Any]):
        # -- `opts_args` is used unmodified, so both path arguments are None --
        assert MsgPartitionerOptions(**opts_args).metadata_file_path is None

    # -- .metadata_last_modified -----------------

    def it_uses_metadata_last_modified_when_provided_by_the_caller(self, opts_args: dict[str, Any]):
        caller_last_modified = "2024-03-05T17:02:53"
        opts_args.update(
            metadata_last_modified=caller_last_modified,
            file_path=example_doc_path("fake-email.msg"),
        )

        last_modified = MsgPartitionerOptions(**opts_args).metadata_last_modified

        assert last_modified == caller_last_modified

    def and_it_uses_the_message_Date_header_when_metadata_last_modified_is_not_provided(
        self, opts_args: dict[str, Any]
    ):
        opts_args.update(file_path=example_doc_path("fake-email.msg"))

        last_modified = MsgPartitionerOptions(**opts_args).metadata_last_modified

        assert last_modified == "2023-03-28T17:00:31+00:00"

    @pytest.mark.parametrize("filesystem_last_modified", ["2024-06-03T20:12:53", None])
    def and_it_uses_the_last_modified_date_from_the_source_file_when_the_message_has_no_sent_date(
        self,
        opts_args: dict[str, Any],
        filesystem_last_modified: str | None,
        Message_sent_date_: Mock,
        _last_modified_prop_: Mock,
    ):
        # -- `Message.sent_date` is mocked to report no date, and the mocked `_last_modified`
        # -- property supplies the parametrized value (which may itself be None).
        Message_sent_date_.return_value = None
        _last_modified_prop_.return_value = filesystem_last_modified
        opts_args.update(file_path=example_doc_path("fake-email.msg"))

        last_modified = MsgPartitionerOptions(**opts_args).metadata_last_modified

        assert last_modified == filesystem_last_modified

    # -- .msg ------------------------------------

    def it_loads_the_msg_document_from_a_file_path_when_provided(self, opts_args: dict[str, Any]):
        opts_args.update(file_path=example_doc_path("fake-email.msg"))

        msg = MsgPartitionerOptions(**opts_args).msg

        assert isinstance(msg, Message)

    def and_it_loads_the_msg_document_from_a_file_like_object_when_provided(
        self, opts_args: dict[str, Any]
    ):
        with open(example_doc_path("fake-email.msg"), "rb") as msg_file:
            msg_bytes = msg_file.read()
        # -- an in-memory stream is passed as `file`; `file_path` remains None --
        opts_args.update(file=io.BytesIO(msg_bytes))

        msg = MsgPartitionerOptions(**opts_args).msg

        assert isinstance(msg, Message)

    def but_it_raises_when_neither_is_provided(self, opts_args: dict[str, Any]):
        # -- construction with all-default args is exercised without error by the
        # -- `.metadata_file_path` None-case test, so it is the `.msg` access that must raise.
        opts = MsgPartitionerOptions(**opts_args)

        with pytest.raises(ValueError, match="one of `file` or `filename` arguments must be prov"):
            opts.msg

    # -- .partition_attachments ------------------

    @pytest.mark.parametrize("partition_attachments", [True, False])
    def it_knows_whether_attachments_should_also_be_partitioned(
        self, partition_attachments: bool, opts_args: dict[str, Any]
    ):
        opts_args.update(
            file_path=example_doc_path("fake-email.msg"),
            partition_attachments=partition_attachments,
        )

        actual = MsgPartitionerOptions(**opts_args).partition_attachments

        assert actual is partition_attachments

    # -- .partitioning_kwargs --------------------

    def it_provides_access_to_pass_through_kwargs_collected_by_the_partitioner_function(
        self, opts_args: dict[str, Any]
    ):
        opts_args.update(kwargs={"foo": 42, "bar": "baz"})

        partitioning_kwargs = MsgPartitionerOptions(**opts_args).partitioning_kwargs

        assert partitioning_kwargs == {"foo": 42, "bar": "baz"}

    # -- fixtures --------------------------------------------------------------------------------

    @pytest.fixture
    def _last_modified_prop_(self, request: FixtureRequest):
        """Mock standing in for the `MsgPartitionerOptions._last_modified` property."""
        return property_mock(request, MsgPartitionerOptions, "_last_modified")

    @pytest.fixture
    def Message_sent_date_(self, request: FixtureRequest):
        """Mock standing in for the `Message.sent_date` property."""
        return property_mock(request, Message, "sent_date")

    @pytest.fixture
    def opts_args(self) -> dict[str, Any]:
        """Keyword arguments for `MsgPartitionerOptions`, each at a neutral default.

        A test changes only the entries it cares about before unpacking the dict into the
        constructor, which keeps the construction of options objects short.
        """
        defaults: dict[str, Any] = {
            "file": None,
            "file_path": None,
            "metadata_file_path": None,
            "metadata_last_modified": None,
            "partition_attachments": False,
            "kwargs": {},
        }
        return defaults