mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-01 18:43:04 +00:00
enhancement: add method for getting datetime; cleanup filename attribute (#575)
* added method for extracting datetime * change filename metadata to the base filename * fix filename metadata for msg * changelog and bump version * fix expected structured output * newline back in file * reset output file * update filename output * update test fixtures * update fixture
This commit is contained in:
parent
7c07b3f690
commit
8da1ddc6ec
15
CHANGELOG.md
15
CHANGELOG.md
@ -1,24 +1,19 @@
|
||||
## 0.6.6-dev1
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
* fix: fileutils/file_type check json and eml decode ignore error
|
||||
## 0.6.6-dev2
|
||||
|
||||
### Enhancements
|
||||
|
||||
* Added an additional trace logger for NLP debugging.
|
||||
* Add `get_date` method to `ElementMetadata` for converting the datestring to a `datetime` object.
|
||||
* Cleanup the `filename` attribute on `ElementMetadata` to remove the full filepath.
|
||||
|
||||
### Features
|
||||
|
||||
* Added table reading as html with URL parsing to `partition_docx` in docx
|
||||
* Added metadata field for text_as_html for docx files
|
||||
* Added metadata field for text_as_html for docx files
|
||||
|
||||
### Fixes
|
||||
|
||||
* `fileutils/file_type` check json and eml decode ignore error
|
||||
* `partition_email` was updated to more flexibly handle deviations from the RFC-2822 standard.
|
||||
The time in the metadata returns `None` if the time does not match RFC-2822 at all.
|
||||
* Include all metadata fields when converting to dataframe or CSV
|
||||
|
||||
10
slack-ingest-output/C052BGT7718.json
Normal file
10
slack-ingest-output/C052BGT7718.json
Normal file
@ -0,0 +1,10 @@
|
||||
[
|
||||
{
|
||||
"element_id": "e50da2f0aada6f89af788627ebf261b7",
|
||||
"text": "testing <@U051UBRR946> has joined the channel <@U04ST78RXU3> has joined the channel",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"filename": "C052BGT7718.txt"
|
||||
}
|
||||
}
|
||||
]
|
||||
@ -41,7 +41,7 @@ def test_auto_partition_email_from_filename():
|
||||
elements = partition(filename=filename)
|
||||
assert len(elements) > 0
|
||||
assert elements == EXPECTED_EMAIL_OUTPUT
|
||||
assert elements[0].metadata.filename == filename
|
||||
assert elements[0].metadata.filename == os.path.basename(filename)
|
||||
|
||||
|
||||
def test_auto_partition_email_from_file():
|
||||
@ -100,7 +100,7 @@ def test_auto_partition_docx_with_filename(mock_docx_document, expected_docx_ele
|
||||
|
||||
elements = partition(filename=filename)
|
||||
assert elements == expected_docx_elements
|
||||
assert elements[0].metadata.filename == filename
|
||||
assert elements[0].metadata.filename == os.path.basename(filename)
|
||||
|
||||
|
||||
def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_elements, tmpdir):
|
||||
@ -134,7 +134,7 @@ def test_auto_partition_doc_with_filename(
|
||||
content_type=content_type,
|
||||
)
|
||||
assert elements == expected_docx_elements
|
||||
assert elements[0].metadata.filename == doc_filename
|
||||
assert elements[0].metadata.filename == "mock_document.doc"
|
||||
|
||||
|
||||
# NOTE(robinson) - the application/x-ole-storage mime type is not specific enough to
|
||||
@ -160,7 +160,7 @@ def test_auto_partition_html_from_filename(pass_file_filename, content_type):
|
||||
file_filename = filename if pass_file_filename else None
|
||||
elements = partition(filename=filename, file_filename=file_filename, content_type=content_type)
|
||||
assert len(elements) > 0
|
||||
assert elements[0].metadata.filename == filename
|
||||
assert elements[0].metadata.filename == os.path.basename(filename)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -239,7 +239,7 @@ def test_auto_partition_text_from_filename():
|
||||
elements = partition(filename=filename)
|
||||
assert len(elements) > 0
|
||||
assert elements == EXPECTED_TEXT_OUTPUT
|
||||
assert elements[0].metadata.filename == filename
|
||||
assert elements[0].metadata.filename == os.path.basename(filename)
|
||||
|
||||
|
||||
def test_auto_partition_text_from_file():
|
||||
@ -266,7 +266,7 @@ def test_auto_partition_pdf_from_filename(pass_file_filename, content_type):
|
||||
assert isinstance(elements[1], NarrativeText)
|
||||
assert elements[1].text.startswith("Zejiang Shen")
|
||||
|
||||
assert elements[0].metadata.filename == filename
|
||||
assert elements[0].metadata.filename == os.path.basename(filename)
|
||||
|
||||
|
||||
def test_auto_partition_pdf_uses_table_extraction():
|
||||
@ -369,7 +369,7 @@ def test_auto_partition_pptx_from_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
|
||||
elements = partition(filename=filename)
|
||||
assert elements == EXPECTED_PPTX_OUTPUT
|
||||
assert elements[0].metadata.filename == filename
|
||||
assert elements[0].metadata.filename == os.path.basename(filename)
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
||||
@ -377,7 +377,7 @@ def test_auto_partition_ppt_from_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt")
|
||||
elements = partition(filename=filename)
|
||||
assert elements == EXPECTED_PPTX_OUTPUT
|
||||
assert elements[0].metadata.filename == filename
|
||||
assert elements[0].metadata.filename == os.path.basename(filename)
|
||||
|
||||
|
||||
def test_auto_with_page_breaks():
|
||||
|
||||
@ -181,6 +181,9 @@ def test_partition_email_has_metadata():
|
||||
subject="Test Email",
|
||||
)
|
||||
|
||||
expected_dt = datetime.datetime.fromisoformat("2022-12-16T17:04:16-05:00")
|
||||
assert elements[0].metadata.get_date() == expected_dt
|
||||
|
||||
|
||||
def test_extract_email_text_matches_html():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-attachment.eml")
|
||||
|
||||
@ -37,7 +37,7 @@ def test_partition_msg_from_filename():
|
||||
elements = partition_msg(filename=filename)
|
||||
assert elements == EXPECTED_MSG_OUTPUT
|
||||
assert elements[0].metadata == ElementMetadata(
|
||||
filename=filename,
|
||||
filename="fake-email.msg",
|
||||
date="2022-12-16T17:04:16-05:00",
|
||||
page_number=None,
|
||||
url=None,
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
"text": "testing <@U051UBRR946> has joined the channel <@U04ST78RXU3> has joined the channel",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"filename": "slack-ingest-download/C052BGT7718.txt"
|
||||
"filename": "C052BGT7718.txt"
|
||||
}
|
||||
}
|
||||
]
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.6.6-dev1" # pragma: no cover
|
||||
__version__ = "0.6.6-dev2" # pragma: no cover
|
||||
|
||||
@ -1,4 +1,6 @@
|
||||
import datetime
|
||||
import hashlib
|
||||
import os
|
||||
import pathlib
|
||||
from abc import ABC
|
||||
from dataclasses import dataclass
|
||||
@ -34,6 +36,9 @@ class ElementMetadata:
|
||||
if isinstance(self.filename, pathlib.Path):
|
||||
self.filename = str(self.filename)
|
||||
|
||||
if self.filename is not None:
|
||||
self.filename = os.path.basename(self.filename)
|
||||
|
||||
def to_dict(self):
|
||||
return {key: value for key, value in self.__dict__.items() if value is not None}
|
||||
|
||||
@ -41,6 +46,13 @@ class ElementMetadata:
|
||||
def from_dict(cls, input_dict):
|
||||
return cls(**input_dict)
|
||||
|
||||
def get_date(self) -> Optional[datetime.datetime]:
|
||||
"""Converts the date field to a datetime object."""
|
||||
dt = None
|
||||
if self.date is not None:
|
||||
dt = datetime.datetime.fromisoformat(self.date)
|
||||
return dt
|
||||
|
||||
|
||||
class Element(ABC):
|
||||
"""An element is a section of a page in the document."""
|
||||
|
||||
@ -91,7 +91,7 @@ def partition_email_header(msg: Message) -> List[Element]:
|
||||
return elements
|
||||
|
||||
|
||||
def build_email_metadata(msg: Message) -> ElementMetadata:
|
||||
def build_email_metadata(msg: Message, filename: Optional[str]) -> ElementMetadata:
|
||||
"""Creates an ElementMetadata object from the header information in the email."""
|
||||
header_dict = dict(msg.raw_items())
|
||||
email_date = header_dict.get("Date")
|
||||
@ -111,6 +111,7 @@ def build_email_metadata(msg: Message) -> ElementMetadata:
|
||||
sent_from=sent_from,
|
||||
subject=header_dict.get("Subject"),
|
||||
date=email_date,
|
||||
filename=filename,
|
||||
)
|
||||
|
||||
|
||||
@ -278,8 +279,7 @@ def partition_email(
|
||||
header = partition_email_header(msg)
|
||||
all_elements = header + elements
|
||||
|
||||
metadata = build_email_metadata(msg)
|
||||
metadata.filename = filename
|
||||
metadata = build_email_metadata(msg, filename=filename)
|
||||
for element in all_elements:
|
||||
element.metadata = metadata
|
||||
return all_elements
|
||||
|
||||
@ -39,15 +39,14 @@ def partition_msg(
|
||||
else:
|
||||
elements = partition_text(text=text)
|
||||
|
||||
metadata = build_msg_metadata(msg_obj)
|
||||
metadata.filename = filename
|
||||
metadata = build_msg_metadata(msg_obj, filename)
|
||||
for element in elements:
|
||||
element.metadata = metadata
|
||||
|
||||
return elements
|
||||
|
||||
|
||||
def build_msg_metadata(msg_obj: msg_parser.MsOxMessage) -> ElementMetadata:
|
||||
def build_msg_metadata(msg_obj: msg_parser.MsOxMessage, filename: Optional[str]) -> ElementMetadata:
|
||||
"""Creates an ElementMetadata object from the header information in the email."""
|
||||
email_date = getattr(msg_obj, "sent_date", None)
|
||||
if email_date is not None:
|
||||
@ -66,6 +65,7 @@ def build_msg_metadata(msg_obj: msg_parser.MsOxMessage) -> ElementMetadata:
|
||||
sent_from=sent_from,
|
||||
subject=getattr(msg_obj, "subject", None),
|
||||
date=email_date,
|
||||
filename=filename,
|
||||
)
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user