318 lines
10 KiB
Python
Raw Normal View History

import os
import pathlib
import msg_parser
import pytest
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import (
ElementMetadata,
ListItem,
NarrativeText,
Title,
)
from unstructured.partition.json import partition_json
from unstructured.partition.msg import extract_msg_attachment_info, partition_msg
from unstructured.partition.text import partition_text
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
from unstructured.staging.base import elements_to_json
DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "..", "example-docs")
EXPECTED_MSG_OUTPUT = [
NarrativeText(text="This is a test email to use for unit tests."),
Title(text="Important points:"),
ListItem(text="Roses are red"),
ListItem(text="Violets are blue"),
]
ATTACH_EXPECTED_OUTPUT = [
{
"filename": "fake-attachment.txt",
"extension": ".txt",
"file_size": "unknown",
"payload": b"Hey this is a fake attachment!",
},
]
def test_partition_msg_from_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
elements = partition_msg(filename=filename)
Feat: Create a naive hierarchy for elements (#1268) ## **Summary** By adding hierarchy to unstructured elements, users will have more information for implementing vector db/LLM chunking strategies. For example, text elements could be queried by their preceding title element. The hierarchy is implemented by a parent_id tag in the element's metadata. ### Features - Introduces a parent_id to ElementMetadata (The id of the parent element, not a pointer) - Creates a rule set for assigning hierarchies. Sensible default is assigned, with an optional override parameter - Sets element parent ids if there isn't an existing parent id or matches the ruleset ### How it works Hierarchies are assigned via a parent id field in element metadata. Elements are read sequentially and evaluated against a ruleset. For example take the following elements: 1. Title, "This is the Title" 2. Text, "this is the text" And the ruleset: `{"title": ["text"]}`. When evaluated, the parent_id of 2 will be the id of 1. The algorithm for determining this is more complex and resolves several edge cases, so please read the code for further details. ### Schema Changes ``` @dataclass class ElementMetadata: coordinates: Optional[CoordinatesMetadata] = None data_source: Optional[DataSourceMetadata] = None filename: Optional[str] = None file_directory: Optional[str] = None last_modified: Optional[str] = None filetype: Optional[str] = None attached_to_filename: Optional[str] = None + parent_id: Optional[Union[str, uuid.UUID, NoID, UUID]] = None + category_depth: Optional[int] = None ... ``` ### Testing ``` from unstructured.partition.auto import partition from typing import List elements = partition(filename="./unstructured/example-docs/fake-html.html", strategy="auto") for element in elements: print( f"Category: {getattr(element, 'category', '')}\n"\ f"Text: {getattr(element, 'text', '')}\n" f"ID: {element.id}\n" \ f"Parent ID: {element.metadata.parent_id}\n"\ f"Depth: {element.metadata.category_depth}\n" \ ) ``` ### Additional Notes Implementing this feature revealed a possibly undesired side-effect in how element metadata are processed. In `unstructured/partition/common.py` the `_add_element_metadata` is invoked as part of the `add_metadata_with_filetype` decorator for filetype partitioning. This method is intended to add additional information to the metadata generated with the element including filename and filetype, however the existing metadata is merged into a newly created metadata object rather than the other way around. Because of the way it's structured, new metadata fields can easily be forgotten and pose debugging challenges to developers. This likely warrants a new issue. I'm guessing that the implementation is done this way to avoid issues with deserializing elements, but could be wrong. --------- Co-authored-by: Benjamin Torres <benjats07@users.noreply.github.com>
2023-09-14 11:23:16 -04:00
parent_id = elements[0].metadata.parent_id
assert elements == EXPECTED_MSG_OUTPUT
assert (
elements[0].metadata.to_dict()
== ElementMetadata(
coordinates=None,
filename=filename,
last_modified="2022-12-16T17:04:16-05:00",
page_number=None,
url=None,
sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
sent_to=["Matthew Robinson (None)"],
subject="Test Email",
filetype="application/vnd.ms-outlook",
Feat: Create a naive hierarchy for elements (#1268) ## **Summary** By adding hierarchy to unstructured elements, users will have more information for implementing vector db/LLM chunking strategies. For example, text elements could be queried by their preceding title element. The hierarchy is implemented by a parent_id tag in the element's metadata. ### Features - Introduces a parent_id to ElementMetadata (The id of the parent element, not a pointer) - Creates a rule set for assigning hierarchies. Sensible default is assigned, with an optional override parameter - Sets element parent ids if there isn't an existing parent id or matches the ruleset ### How it works Hierarchies are assigned via a parent id field in element metadata. Elements are read sequentially and evaluated against a ruleset. For example take the following elements: 1. Title, "This is the Title" 2. Text, "this is the text" And the ruleset: `{"title": ["text"]}`. When evaluated, the parent_id of 2 will be the id of 1. The algorithm for determining this is more complex and resolves several edge cases, so please read the code for further details. ### Schema Changes ``` @dataclass class ElementMetadata: coordinates: Optional[CoordinatesMetadata] = None data_source: Optional[DataSourceMetadata] = None filename: Optional[str] = None file_directory: Optional[str] = None last_modified: Optional[str] = None filetype: Optional[str] = None attached_to_filename: Optional[str] = None + parent_id: Optional[Union[str, uuid.UUID, NoID, UUID]] = None + category_depth: Optional[int] = None ... ``` ### Testing ``` from unstructured.partition.auto import partition from typing import List elements = partition(filename="./unstructured/example-docs/fake-html.html", strategy="auto") for element in elements: print( f"Category: {getattr(element, 'category', '')}\n"\ f"Text: {getattr(element, 'text', '')}\n" f"ID: {element.id}\n" \ f"Parent ID: {element.metadata.parent_id}\n"\ f"Depth: {element.metadata.category_depth}\n" \ ) ``` ### Additional Notes Implementing this feature revealed a possibly undesired side-effect in how element metadata are processed. In `unstructured/partition/common.py` the `_add_element_metadata` is invoked as part of the `add_metadata_with_filetype` decorator for filetype partitioning. This method is intended to add additional information to the metadata generated with the element including filename and filetype, however the existing metadata is merged into a newly created metadata object rather than the other way around. Because of the way it's structured, new metadata fields can easily be forgotten and pose debugging challenges to developers. This likely warrants a new issue. I'm guessing that the implementation is done this way to avoid issues with deserializing elements, but could be wrong. --------- Co-authored-by: Benjamin Torres <benjats07@users.noreply.github.com>
2023-09-14 11:23:16 -04:00
parent_id=parent_id,
languages=["eng"],
).to_dict()
)
for element in elements:
assert element.metadata.filename == "fake-email.msg"
if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
assert {element.metadata.detection_origin for element in elements} == {"msg"}
def test_partition_msg_from_filename_returns_uns_elements():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
elements = partition_msg(filename=filename)
assert isinstance(elements[0], NarrativeText)
def test_partition_msg_from_filename_with_metadata_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
elements = partition_msg(filename=filename, metadata_filename="test")
assert all(element.metadata.filename == "test" for element in elements)
class MockMsOxMessage:
def __init__(self, filename):
self.body = "Here is an email with plain text."
self.header_dict = {"Content-Type": "text/plain"}
def test_partition_msg_from_filename_with_text_content(monkeypatch):
monkeypatch.setattr(msg_parser, "MsOxMessage", MockMsOxMessage)
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
elements = partition_msg(filename=filename)
assert str(elements[0]) == "Here is an email with plain text."
assert elements[0].metadata.filename == "fake-email.msg"
assert elements[0].metadata.file_directory == EXAMPLE_DOCS_DIRECTORY
def test_partition_msg_raises_with_missing_file():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "doesnt-exist.msg")
with pytest.raises(FileNotFoundError):
partition_msg(filename=filename)
def test_partition_msg_from_file():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
with open(filename, "rb") as f:
elements = partition_msg(file=f)
assert elements == EXPECTED_MSG_OUTPUT
for element in elements:
assert element.metadata.filename is None
def test_partition_msg_from_file_with_metadata_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
with open(filename, "rb") as f:
elements = partition_msg(file=f, metadata_filename="test")
assert elements == EXPECTED_MSG_OUTPUT
for element in elements:
assert element.metadata.filename == "test"
def test_extract_attachment_info():
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
filename = os.path.join(
DIRECTORY,
"..",
"..",
"..",
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
"example-docs",
"fake-email-attachment.msg",
)
attachment_info = extract_msg_attachment_info(filename)
assert len(attachment_info) > 0
assert attachment_info == ATTACH_EXPECTED_OUTPUT
def test_partition_msg_raises_with_both_specified():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
with open(filename, "rb") as f, pytest.raises(ValueError):
partition_msg(filename=filename, file=f)
def test_partition_msg_raises_with_neither():
with pytest.raises(ValueError):
partition_msg()
def test_partition_msg_from_filename_exclude_metadata():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
elements = partition_msg(filename=filename, include_metadata=False)
for i in range(len(elements)):
assert elements[i].metadata.to_dict() == {}
def test_partition_msg_from_file_exclude_metadata():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
with open(filename, "rb") as f:
elements = partition_msg(file=f, include_metadata=False)
for i in range(len(elements)):
assert elements[i].metadata.to_dict() == {}
def test_partition_msg_can_process_attachments(
tmpdir,
filename="example-docs/fake-email-attachment.msg",
):
extract_msg_attachment_info(filename=filename, output_dir=tmpdir.dirname)
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
attachment_filename = os.path.join(
tmpdir.dirname,
ATTACH_EXPECTED_OUTPUT[0]["filename"],
)
mocked_last_modification_date = "2029-07-05T09:24:28"
attachment_elements = partition_text(
filename=attachment_filename,
metadata_filename=attachment_filename,
metadata_last_modified=mocked_last_modification_date,
)
expected_metadata = attachment_elements[0].metadata
expected_metadata.file_directory = None
expected_metadata.attached_to_filename = filename
elements = partition_msg(
filename=filename,
attachment_partitioner=partition_text,
process_attachments=True,
metadata_last_modified=mocked_last_modification_date,
)
Feat: Create a naive hierarchy for elements (#1268) ## **Summary** By adding hierarchy to unstructured elements, users will have more information for implementing vector db/LLM chunking strategies. For example, text elements could be queried by their preceding title element. The hierarchy is implemented by a parent_id tag in the element's metadata. ### Features - Introduces a parent_id to ElementMetadata (The id of the parent element, not a pointer) - Creates a rule set for assigning hierarchies. Sensible default is assigned, with an optional override parameter - Sets element parent ids if there isn't an existing parent id or matches the ruleset ### How it works Hierarchies are assigned via a parent id field in element metadata. Elements are read sequentially and evaluated against a ruleset. For example take the following elements: 1. Title, "This is the Title" 2. Text, "this is the text" And the ruleset: `{"title": ["text"]}`. When evaluated, the parent_id of 2 will be the id of 1. The algorithm for determining this is more complex and resolves several edge cases, so please read the code for further details. ### Schema Changes ``` @dataclass class ElementMetadata: coordinates: Optional[CoordinatesMetadata] = None data_source: Optional[DataSourceMetadata] = None filename: Optional[str] = None file_directory: Optional[str] = None last_modified: Optional[str] = None filetype: Optional[str] = None attached_to_filename: Optional[str] = None + parent_id: Optional[Union[str, uuid.UUID, NoID, UUID]] = None + category_depth: Optional[int] = None ... ``` ### Testing ``` from unstructured.partition.auto import partition from typing import List elements = partition(filename="./unstructured/example-docs/fake-html.html", strategy="auto") for element in elements: print( f"Category: {getattr(element, 'category', '')}\n"\ f"Text: {getattr(element, 'text', '')}\n" f"ID: {element.id}\n" \ f"Parent ID: {element.metadata.parent_id}\n"\ f"Depth: {element.metadata.category_depth}\n" \ ) ``` ### Additional Notes Implementing this feature revealed a possibly undesired side-effect in how element metadata are processed. In `unstructured/partition/common.py` the `_add_element_metadata` is invoked as part of the `add_metadata_with_filetype` decorator for filetype partitioning. This method is intended to add additional information to the metadata generated with the element including filename and filetype, however the existing metadata is merged into a newly created metadata object rather than the other way around. Because of the way it's structured, new metadata fields can easily be forgotten and pose debugging challenges to developers. This likely warrants a new issue. I'm guessing that the implementation is done this way to avoid issues with deserializing elements, but could be wrong. --------- Co-authored-by: Benjamin Torres <benjats07@users.noreply.github.com>
2023-09-14 11:23:16 -04:00
# This test does not need to validate if hierarchy is working
# Patch to nullify parent_id
expected_metadata.parent_id = None
elements[-1].metadata.parent_id = None
assert elements[0].text.startswith("Hello!")
for element in elements[:-1]:
assert element.metadata.filename == "fake-email-attachment.msg"
assert element.metadata.subject == "Fake email with attachment"
assert elements[-1].text == "Hey this is a fake attachment!"
assert elements[-1].metadata == expected_metadata
def test_partition_msg_can_process_min_max_wtih_attachments(
tmpdir,
filename="example-docs/fake-email-attachment.msg",
):
extract_msg_attachment_info(filename=filename, output_dir=tmpdir.dirname)
attachment_filename = os.path.join(
tmpdir.dirname,
ATTACH_EXPECTED_OUTPUT[0]["filename"],
)
attachment_elements = partition_text(
filename=attachment_filename,
metadata_filename=attachment_filename,
min_partition=6,
max_partition=12,
)
elements = partition_msg(
filename=filename,
attachment_partitioner=partition_text,
process_attachments=True,
min_partition=6,
max_partition=12,
)
assert elements[0].text.startswith("Hello!")
assert elements[-1].text == attachment_elements[-1].text
assert elements[-2].text == attachment_elements[-2].text
for element in elements:
if element.metadata.attached_to_filename is not None:
assert len(element.text) <= 12
assert len(element.text) >= 6
def test_partition_msg_raises_with_no_partitioner(
filename="example-docs/fake-email-attachment.msg",
):
with pytest.raises(ValueError):
partition_msg(filename=filename, process_attachments=True)
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
def test_partition_msg_from_file_custom_metadata_date(
filename="example-docs/fake-email.msg",
):
expected_last_modification_date = "2020-07-05T09:24:28"
with open(filename, "rb") as f:
Feat: Create a naive hierarchy for elements (#1268) ## **Summary** By adding hierarchy to unstructured elements, users will have more information for implementing vector db/LLM chunking strategies. For example, text elements could be queried by their preceding title element. The hierarchy is implemented by a parent_id tag in the element's metadata. ### Features - Introduces a parent_id to ElementMetadata (The id of the parent element, not a pointer) - Creates a rule set for assigning hierarchies. Sensible default is assigned, with an optional override parameter - Sets element parent ids if there isn't an existing parent id or matches the ruleset ### How it works Hierarchies are assigned via a parent id field in element metadata. Elements are read sequentially and evaluated against a ruleset. For example take the following elements: 1. Title, "This is the Title" 2. Text, "this is the text" And the ruleset: `{"title": ["text"]}`. When evaluated, the parent_id of 2 will be the id of 1. The algorithm for determining this is more complex and resolves several edge cases, so please read the code for further details. ### Schema Changes ``` @dataclass class ElementMetadata: coordinates: Optional[CoordinatesMetadata] = None data_source: Optional[DataSourceMetadata] = None filename: Optional[str] = None file_directory: Optional[str] = None last_modified: Optional[str] = None filetype: Optional[str] = None attached_to_filename: Optional[str] = None + parent_id: Optional[Union[str, uuid.UUID, NoID, UUID]] = None + category_depth: Optional[int] = None ... ``` ### Testing ``` from unstructured.partition.auto import partition from typing import List elements = partition(filename="./unstructured/example-docs/fake-html.html", strategy="auto") for element in elements: print( f"Category: {getattr(element, 'category', '')}\n"\ f"Text: {getattr(element, 'text', '')}\n" f"ID: {element.id}\n" \ f"Parent ID: {element.metadata.parent_id}\n"\ f"Depth: {element.metadata.category_depth}\n" \ ) ``` ### Additional Notes Implementing this feature revealed a possibly undesired side-effect in how element metadata are processed. In `unstructured/partition/common.py` the `_add_element_metadata` is invoked as part of the `add_metadata_with_filetype` decorator for filetype partitioning. This method is intended to add additional information to the metadata generated with the element including filename and filetype, however the existing metadata is merged into a newly created metadata object rather than the other way around. Because of the way it's structured, new metadata fields can easily be forgotten and pose debugging challenges to developers. This likely warrants a new issue. I'm guessing that the implementation is done this way to avoid issues with deserializing elements, but could be wrong. --------- Co-authored-by: Benjamin Torres <benjats07@users.noreply.github.com>
2023-09-14 11:23:16 -04:00
elements = partition_msg(
file=f,
metadata_last_modified=expected_last_modification_date,
)
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
assert elements[0].metadata.last_modified == expected_last_modification_date
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
def test_partition_msg_custom_metadata_date(
filename="example-docs/fake-email.msg",
):
expected_last_modification_date = "2020-07-05T09:24:28"
elements = partition_msg(
filename=filename,
metadata_last_modified=expected_last_modification_date,
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
)
assert elements[0].metadata.last_modified == expected_last_modification_date
def test_partition_msg_with_json():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
elements = partition_msg(filename=filename)
test_elements = partition_json(text=elements_to_json(elements))
assert elements == test_elements
assert elements[0].metadata.sent_from == test_elements[0].metadata.sent_from
assert elements[0].metadata.sent_to[0] == test_elements[0].metadata.sent_to[0]
assert elements[0].metadata.subject == test_elements[0].metadata.subject
def test_partition_msg_with_pgp_encrypted_message(
caplog,
filename="example-docs/fake-encrypted.msg",
):
elements = partition_msg(filename=filename)
assert elements == []
assert "WARNING" in caplog.text
assert "Encrypted email detected" in caplog.text
chore: Table chunking (#1540) This change is adding to our `add_chunking_strategy` logic so that we are able to chunk Table elements' `text` and `text_as_html` params. In order to keep the functionality under the same `by_title` chunking strategy we have renamed the `combine_under_n_chars` to `max_characters`. It functions the same way for the combining elements under Title's, as well as specifying a chunk size (in chars) for TableChunk elements. *renaming the variable to `max_characters` will also reflect the 'hard max' we will implement for large elements in followup PRs Additionally -> some lint changes snuck in when I ran `make tidy` hence the minor changes in unrelated files :) TODO: ✅ add unit tests --> note: added where I could to unit tests! Some unit tests I just clarified that the chunking strategy was now 'by_title' because we don't have a file example that has Table elements to test the 'by_num_characters' chunking strategy ✅ update changelog To manually test: ``` In [1]: filename="example-docs/example-10k.html" In [2]: from unstructured.chunking.title import chunk_table_element In [3]: from unstructured.partition.auto import partition In [4]: elements = partition(filename) # element at -2 happens to be a Table, and we'll get chunks of char size 4 here In [5]: chunks = chunk_table_element(elements[-2], 4) # examine text and text_as_html params ln [6]: for c in chunks: print(c.text) print(c.metadata.text_as_html) ``` --------- Co-authored-by: Yao You <theyaoyou@gmail.com>
2023-10-03 09:40:34 -07:00
def test_add_chunking_strategy_by_title_on_partition_msg(
filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg"),
):
elements = partition_msg(filename=filename)
chunk_elements = partition_msg(filename, chunking_strategy="by_title")
chunks = chunk_by_title(elements)
assert chunk_elements != elements
assert chunk_elements == chunks
def test_partition_msg_element_metadata_has_languages():
filename = "example-docs/fake-email.msg"
elements = partition_msg(filename=filename)
assert elements[0].metadata.languages == ["eng"]
def test_partition_msg_respects_languages_arg():
filename = "example-docs/fake-email.msg"
elements = partition_msg(filename=filename, languages=["deu"])
assert all(element.metadata.languages == ["deu"] for element in elements)
def test_partition_msg_raises_TypeError_for_invalid_languages():
with pytest.raises(TypeError):
filename = "example-docs/fake-email.msg"
partition_msg(filename=filename, languages="eng")