enhancement: add method for getting datetime; cleanup filename attribute (#575)

* added method for extracting datetime

* change filename metadata to the base filename

* fix filename metadata for msg

* changelog and bump version

* fix expected structured output

* newline back in file

* reset output file

* update filename output

* update test fixtures

* update fixture
This commit is contained in:
Matt Robinson 2023-05-12 11:33:01 -04:00 committed by GitHub
parent 7c07b3f690
commit 8da1ddc6ec
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 47 additions and 27 deletions

View File

@ -1,24 +1,19 @@
## 0.6.6-dev1
### Enhancements
### Features
### Fixes
* fix: fileutils/file_type check json and eml decode ignore error
## 0.6.6-dev2
### Enhancements
* Added an additional trace logger for NLP debugging.
* Add `get_date` method to `ElementMetadata` for converting the datestring to a `datetime` object.
* Cleanup the `filename` attribute on `ElementMetadata` to remove the full filepath.
### Features
* Added table reading as html with URL parsing to `partition_docx` in docx
* Added metadata field for text_as_html for docx files
* Added metadata field for text_as_html for docx files
### Fixes
* `fileutils/file_type` check json and eml decode ignore error
* `partition_email` was updated to more flexibly handle deviations from the RFC-2822 standard.
The time in the metadata returns `None` if the time does not match RFC-2822 at all.
* Include all metadata fields when converting to dataframe or CSV

View File

@ -0,0 +1,10 @@
[
{
"element_id": "e50da2f0aada6f89af788627ebf261b7",
"text": "testing <@U051UBRR946> has joined the channel <@U04ST78RXU3> has joined the channel",
"type": "NarrativeText",
"metadata": {
"filename": "C052BGT7718.txt"
}
}
]

View File

@ -41,7 +41,7 @@ def test_auto_partition_email_from_filename():
elements = partition(filename=filename)
assert len(elements) > 0
assert elements == EXPECTED_EMAIL_OUTPUT
assert elements[0].metadata.filename == filename
assert elements[0].metadata.filename == os.path.basename(filename)
def test_auto_partition_email_from_file():
@ -100,7 +100,7 @@ def test_auto_partition_docx_with_filename(mock_docx_document, expected_docx_ele
elements = partition(filename=filename)
assert elements == expected_docx_elements
assert elements[0].metadata.filename == filename
assert elements[0].metadata.filename == os.path.basename(filename)
def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_elements, tmpdir):
@ -134,7 +134,7 @@ def test_auto_partition_doc_with_filename(
content_type=content_type,
)
assert elements == expected_docx_elements
assert elements[0].metadata.filename == doc_filename
assert elements[0].metadata.filename == "mock_document.doc"
# NOTE(robinson) - the application/x-ole-storage mime type is not specific enough to
@ -160,7 +160,7 @@ def test_auto_partition_html_from_filename(pass_file_filename, content_type):
file_filename = filename if pass_file_filename else None
elements = partition(filename=filename, file_filename=file_filename, content_type=content_type)
assert len(elements) > 0
assert elements[0].metadata.filename == filename
assert elements[0].metadata.filename == os.path.basename(filename)
@pytest.mark.parametrize(
@ -239,7 +239,7 @@ def test_auto_partition_text_from_filename():
elements = partition(filename=filename)
assert len(elements) > 0
assert elements == EXPECTED_TEXT_OUTPUT
assert elements[0].metadata.filename == filename
assert elements[0].metadata.filename == os.path.basename(filename)
def test_auto_partition_text_from_file():
@ -266,7 +266,7 @@ def test_auto_partition_pdf_from_filename(pass_file_filename, content_type):
assert isinstance(elements[1], NarrativeText)
assert elements[1].text.startswith("Zejiang Shen")
assert elements[0].metadata.filename == filename
assert elements[0].metadata.filename == os.path.basename(filename)
def test_auto_partition_pdf_uses_table_extraction():
@ -369,7 +369,7 @@ def test_auto_partition_pptx_from_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
elements = partition(filename=filename)
assert elements == EXPECTED_PPTX_OUTPUT
assert elements[0].metadata.filename == filename
assert elements[0].metadata.filename == os.path.basename(filename)
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@ -377,7 +377,7 @@ def test_auto_partition_ppt_from_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt")
elements = partition(filename=filename)
assert elements == EXPECTED_PPTX_OUTPUT
assert elements[0].metadata.filename == filename
assert elements[0].metadata.filename == os.path.basename(filename)
def test_auto_with_page_breaks():

View File

@ -181,6 +181,9 @@ def test_partition_email_has_metadata():
subject="Test Email",
)
expected_dt = datetime.datetime.fromisoformat("2022-12-16T17:04:16-05:00")
assert elements[0].metadata.get_date() == expected_dt
def test_extract_email_text_matches_html():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-attachment.eml")

View File

@ -37,7 +37,7 @@ def test_partition_msg_from_filename():
elements = partition_msg(filename=filename)
assert elements == EXPECTED_MSG_OUTPUT
assert elements[0].metadata == ElementMetadata(
filename=filename,
filename="fake-email.msg",
date="2022-12-16T17:04:16-05:00",
page_number=None,
url=None,

View File

@ -4,7 +4,7 @@
"text": "testing <@U051UBRR946> has joined the channel <@U04ST78RXU3> has joined the channel",
"type": "NarrativeText",
"metadata": {
"filename": "slack-ingest-download/C052BGT7718.txt"
"filename": "C052BGT7718.txt"
}
}
]

View File

@ -1 +1 @@
__version__ = "0.6.6-dev1" # pragma: no cover
__version__ = "0.6.6-dev2" # pragma: no cover

View File

@ -1,4 +1,6 @@
import datetime
import hashlib
import os
import pathlib
from abc import ABC
from dataclasses import dataclass
@ -34,6 +36,9 @@ class ElementMetadata:
if isinstance(self.filename, pathlib.Path):
self.filename = str(self.filename)
if self.filename is not None:
self.filename = os.path.basename(self.filename)
def to_dict(self):
    """Return this object's attributes as a plain dict.

    Attributes whose value is ``None`` are left out of the result; every
    other attribute (including falsy values such as ``0`` or ``""``) is kept.
    """
    populated = {}
    for name, value in self.__dict__.items():
        # Only drop unset fields; falsy-but-set values must survive.
        if value is None:
            continue
        populated[name] = value
    return populated
@ -41,6 +46,13 @@ class ElementMetadata:
def from_dict(cls, input_dict):
return cls(**input_dict)
def get_date(self) -> Optional[datetime.datetime]:
    """Return the ``date`` field parsed into a ``datetime`` object.

    The stored string is handed to ``datetime.datetime.fromisoformat``.
    When no date is set (``self.date`` is ``None``) the result is ``None``.
    """
    if self.date is None:
        return None
    return datetime.datetime.fromisoformat(self.date)
class Element(ABC):
"""An element is a section of a page in the document."""

View File

@ -91,7 +91,7 @@ def partition_email_header(msg: Message) -> List[Element]:
return elements
def build_email_metadata(msg: Message) -> ElementMetadata:
def build_email_metadata(msg: Message, filename: Optional[str]) -> ElementMetadata:
"""Creates an ElementMetadata object from the header information in the email."""
header_dict = dict(msg.raw_items())
email_date = header_dict.get("Date")
@ -111,6 +111,7 @@ def build_email_metadata(msg: Message) -> ElementMetadata:
sent_from=sent_from,
subject=header_dict.get("Subject"),
date=email_date,
filename=filename,
)
@ -278,8 +279,7 @@ def partition_email(
header = partition_email_header(msg)
all_elements = header + elements
metadata = build_email_metadata(msg)
metadata.filename = filename
metadata = build_email_metadata(msg, filename=filename)
for element in all_elements:
element.metadata = metadata
return all_elements

View File

@ -39,15 +39,14 @@ def partition_msg(
else:
elements = partition_text(text=text)
metadata = build_msg_metadata(msg_obj)
metadata.filename = filename
metadata = build_msg_metadata(msg_obj, filename)
for element in elements:
element.metadata = metadata
return elements
def build_msg_metadata(msg_obj: msg_parser.MsOxMessage) -> ElementMetadata:
def build_msg_metadata(msg_obj: msg_parser.MsOxMessage, filename: Optional[str]) -> ElementMetadata:
"""Creates an ElementMetadata object from the header information in the emai."""
email_date = getattr(msg_obj, "sent_date", None)
if email_date is not None:
@ -66,6 +65,7 @@ def build_msg_metadata(msg_obj: msg_parser.MsOxMessage) -> ElementMetadata:
sent_from=sent_from,
subject=getattr(msg_obj, "subject", None),
date=email_date,
filename=filename,
)