diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c3486335..624e659e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,24 +1,19 @@ -## 0.6.6-dev1 - -### Enhancements - -### Features - -### Fixes - -* fix: fileutils/file_type check json and eml decode ignore error +## 0.6.6-dev2 ### Enhancements * Added an additional trace logger for NLP debugging. +* Add `get_date` method to `ElementMetadata` for converting the datestring to a `datetime` object. +* Cleanup the `filename` attribute on `ElementMetadata` to remove the full filepath. ### Features * Added table reading as html with URL parsing to `partition_docx` in docx -* Added metadata field for text_as_html for docx files +* Added metadata field for text_as_html for docx files ### Fixes +* `fileutils/file_type` check json and eml decode ignore error * `partition_email` was updated to more flexibly handle deviations from the RFC-2822 standard. The time in the metadata returns `None` if the time does not match RFC-2822 at all. * Include all metadata fields when converting to dataframe or CSV diff --git a/slack-ingest-output/C052BGT7718.json b/slack-ingest-output/C052BGT7718.json new file mode 100644 index 000000000..56cb7386f --- /dev/null +++ b/slack-ingest-output/C052BGT7718.json @@ -0,0 +1,10 @@ +[ + { + "element_id": "e50da2f0aada6f89af788627ebf261b7", + "text": "testing <@U051UBRR946> has joined the channel <@U04ST78RXU3> has joined the channel", + "type": "NarrativeText", + "metadata": { + "filename": "C052BGT7718.txt" + } + } +] \ No newline at end of file diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index dd88deb90..645a594d1 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -41,7 +41,7 @@ def test_auto_partition_email_from_filename(): elements = partition(filename=filename) assert len(elements) > 0 assert elements == EXPECTED_EMAIL_OUTPUT - assert elements[0].metadata.filename == filename + assert elements[0].metadata.filename == os.path.basename(filename) def test_auto_partition_email_from_file(): @@ -100,7 +100,7 @@ def test_auto_partition_docx_with_filename(mock_docx_document, expected_docx_ele elements = partition(filename=filename) assert elements == expected_docx_elements - assert elements[0].metadata.filename == filename + assert elements[0].metadata.filename == os.path.basename(filename) def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_elements, tmpdir): @@ -134,7 +134,7 @@ def test_auto_partition_doc_with_filename( content_type=content_type, ) assert elements == expected_docx_elements - assert elements[0].metadata.filename == doc_filename + assert elements[0].metadata.filename == "mock_document.doc" # NOTE(robinson) - the application/x-ole-storage mime type is not specific enough to @@ -160,7 +160,7 @@ def test_auto_partition_html_from_filename(pass_file_filename, content_type): file_filename = filename if pass_file_filename else None elements = partition(filename=filename, file_filename=file_filename, content_type=content_type) assert len(elements) > 0 - assert elements[0].metadata.filename == filename + assert elements[0].metadata.filename == os.path.basename(filename) @pytest.mark.parametrize( @@ -239,7 +239,7 @@ def test_auto_partition_text_from_filename(): elements = partition(filename=filename) assert len(elements) > 0 assert elements == EXPECTED_TEXT_OUTPUT - assert elements[0].metadata.filename == filename + assert elements[0].metadata.filename == os.path.basename(filename) def test_auto_partition_text_from_file(): @@ -266,7 +266,7 @@ def test_auto_partition_pdf_from_filename(pass_file_filename, content_type): assert isinstance(elements[1], NarrativeText) assert elements[1].text.startswith("Zejiang Shen") - assert elements[0].metadata.filename == filename + assert elements[0].metadata.filename == os.path.basename(filename) def test_auto_partition_pdf_uses_table_extraction(): @@ -369,7 +369,7 @@ def test_auto_partition_pptx_from_filename(): filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx") elements = partition(filename=filename) assert elements == EXPECTED_PPTX_OUTPUT - assert elements[0].metadata.filename == filename + assert elements[0].metadata.filename == os.path.basename(filename) @pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container") @@ -377,7 +377,7 @@ def test_auto_partition_ppt_from_filename(): filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt") elements = partition(filename=filename) assert elements == EXPECTED_PPTX_OUTPUT - assert elements[0].metadata.filename == filename + assert elements[0].metadata.filename == os.path.basename(filename) def test_auto_with_page_breaks(): diff --git a/test_unstructured/partition/test_email.py b/test_unstructured/partition/test_email.py index 7e6f6cac4..b1661a442 100644 --- a/test_unstructured/partition/test_email.py +++ b/test_unstructured/partition/test_email.py @@ -181,6 +181,9 @@ def test_partition_email_has_metadata(): subject="Test Email", ) + expected_dt = datetime.datetime.fromisoformat("2022-12-16T17:04:16-05:00") + assert elements[0].metadata.get_date() == expected_dt + def test_extract_email_text_matches_html(): filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-attachment.eml") diff --git a/test_unstructured/partition/test_msg.py b/test_unstructured/partition/test_msg.py index 170de8535..4daa9a5d5 100644 --- a/test_unstructured/partition/test_msg.py +++ b/test_unstructured/partition/test_msg.py @@ -37,7 +37,7 @@ def test_partition_msg_from_filename(): elements = partition_msg(filename=filename) assert elements == EXPECTED_MSG_OUTPUT assert elements[0].metadata == ElementMetadata( - filename=filename, + filename="fake-email.msg", date="2022-12-16T17:04:16-05:00", page_number=None, url=None, diff --git a/test_unstructured_ingest/expected-structured-output/slack-ingest-channel/C052BGT7718.json b/test_unstructured_ingest/expected-structured-output/slack-ingest-channel/C052BGT7718.json index add5556ca..56cb7386f 100644 --- a/test_unstructured_ingest/expected-structured-output/slack-ingest-channel/C052BGT7718.json +++ b/test_unstructured_ingest/expected-structured-output/slack-ingest-channel/C052BGT7718.json @@ -4,7 +4,7 @@ "text": "testing <@U051UBRR946> has joined the channel <@U04ST78RXU3> has joined the channel", "type": "NarrativeText", "metadata": { - "filename": "slack-ingest-download/C052BGT7718.txt" + "filename": "C052BGT7718.txt" } } ] \ No newline at end of file diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 490af064a..c99aa34a4 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.6.6-dev1" # pragma: no cover +__version__ = "0.6.6-dev2" # pragma: no cover diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index e5d151e6f..b0df7bd07 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -1,4 +1,6 @@ +import datetime import hashlib +import os import pathlib from abc import ABC from dataclasses import dataclass @@ -34,6 +36,9 @@ class ElementMetadata: if isinstance(self.filename, pathlib.Path): self.filename = str(self.filename) + if self.filename is not None: + self.filename = os.path.basename(self.filename) + def to_dict(self): return {key: value for key, value in self.__dict__.items() if value is not None} @@ -41,6 +46,13 @@ class ElementMetadata: def from_dict(cls, input_dict): return cls(**input_dict) + def get_date(self) -> Optional[datetime.datetime]: + """Converts the date field to a datetime object.""" + dt = None + if self.date is not None: + dt = datetime.datetime.fromisoformat(self.date) + return dt + class Element(ABC): """An element is a section of a page in the document.""" diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py index 35ecec491..721ef8951 100644 --- a/unstructured/partition/email.py +++ b/unstructured/partition/email.py @@ -91,7 +91,7 @@ def partition_email_header(msg: Message) -> List[Element]: return elements -def build_email_metadata(msg: Message) -> ElementMetadata: +def build_email_metadata(msg: Message, filename: Optional[str]) -> ElementMetadata: """Creates an ElementMetadata object from the header information in the email.""" header_dict = dict(msg.raw_items()) email_date = header_dict.get("Date") @@ -111,6 +111,7 @@ def build_email_metadata(msg: Message) -> ElementMetadata: sent_from=sent_from, subject=header_dict.get("Subject"), date=email_date, + filename=filename, ) @@ -278,8 +279,7 @@ def partition_email( header = partition_email_header(msg) all_elements = header + elements - metadata = build_email_metadata(msg) - metadata.filename = filename + metadata = build_email_metadata(msg, filename=filename) for element in all_elements: element.metadata = metadata return all_elements diff --git a/unstructured/partition/msg.py b/unstructured/partition/msg.py index 7a22c1c9c..dd648c342 100644 --- a/unstructured/partition/msg.py +++ b/unstructured/partition/msg.py @@ -39,15 +39,14 @@ def partition_msg( else: elements = partition_text(text=text) - metadata = build_msg_metadata(msg_obj) - metadata.filename = filename + metadata = build_msg_metadata(msg_obj, filename) for element in elements: element.metadata = metadata return elements -def build_msg_metadata(msg_obj: msg_parser.MsOxMessage) -> ElementMetadata: +def build_msg_metadata(msg_obj: msg_parser.MsOxMessage, filename: Optional[str]) -> ElementMetadata: """Creates an ElementMetadata object from the header information in the emai.""" email_date = getattr(msg_obj, "sent_date", None) if email_date is not None: @@ -66,6 +65,7 @@ def build_msg_metadata(msg_obj: msg_parser.MsOxMessage) -> ElementMetadata: sent_from=sent_from, subject=getattr(msg_obj, "subject", None), date=email_date, + filename=filename, )