mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-11 07:57:21 +00:00
enhancement: add method for getting datetime; cleanup filename attribute (#575)
* added method for extracting datetime * change filename metadata to the base filename * fix filename metadata for msg * changelog and bump version * fix expected structured output * newline back in file * reset output file * update filename output * update test fixtures * update fixture
This commit is contained in:
parent
7c07b3f690
commit
8da1ddc6ec
13
CHANGELOG.md
13
CHANGELOG.md
@ -1,16 +1,10 @@
|
|||||||
## 0.6.6-dev1
|
## 0.6.6-dev2
|
||||||
|
|
||||||
### Enhancements
|
|
||||||
|
|
||||||
### Features
|
|
||||||
|
|
||||||
### Fixes
|
|
||||||
|
|
||||||
* fix: fileutils/file_type check json and eml decode ignore error
|
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
* Added an additional trace logger for NLP debugging.
|
* Added an additional trace logger for NLP debugging.
|
||||||
|
* Add `get_date` method to `ElementMetadata` for converting the datestring to a `datetime` object.
|
||||||
|
* Cleanup the `filename` attribute on `ElementMetadata` to remove the full filepath.
|
||||||
|
|
||||||
### Features
|
### Features
|
||||||
|
|
||||||
@ -19,6 +13,7 @@
|
|||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
|
* `fileutils/file_type` check json and eml decode ignore error
|
||||||
* `partition_email` was updated to more flexibly handle deviations from the RFC-2822 standard.
|
* `partition_email` was updated to more flexibly handle deviations from the RFC-2822 standard.
|
||||||
The time in the metadata returns `None` if the time does not match RFC-2822 at all.
|
The time in the metadata returns `None` if the time does not match RFC-2822 at all.
|
||||||
* Include all metadata fields when converting to dataframe or CSV
|
* Include all metadata fields when converting to dataframe or CSV
|
||||||
|
|||||||
10
slack-ingest-output/C052BGT7718.json
Normal file
10
slack-ingest-output/C052BGT7718.json
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"element_id": "e50da2f0aada6f89af788627ebf261b7",
|
||||||
|
"text": "testing <@U051UBRR946> has joined the channel <@U04ST78RXU3> has joined the channel",
|
||||||
|
"type": "NarrativeText",
|
||||||
|
"metadata": {
|
||||||
|
"filename": "C052BGT7718.txt"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
@ -41,7 +41,7 @@ def test_auto_partition_email_from_filename():
|
|||||||
elements = partition(filename=filename)
|
elements = partition(filename=filename)
|
||||||
assert len(elements) > 0
|
assert len(elements) > 0
|
||||||
assert elements == EXPECTED_EMAIL_OUTPUT
|
assert elements == EXPECTED_EMAIL_OUTPUT
|
||||||
assert elements[0].metadata.filename == filename
|
assert elements[0].metadata.filename == os.path.basename(filename)
|
||||||
|
|
||||||
|
|
||||||
def test_auto_partition_email_from_file():
|
def test_auto_partition_email_from_file():
|
||||||
@ -100,7 +100,7 @@ def test_auto_partition_docx_with_filename(mock_docx_document, expected_docx_ele
|
|||||||
|
|
||||||
elements = partition(filename=filename)
|
elements = partition(filename=filename)
|
||||||
assert elements == expected_docx_elements
|
assert elements == expected_docx_elements
|
||||||
assert elements[0].metadata.filename == filename
|
assert elements[0].metadata.filename == os.path.basename(filename)
|
||||||
|
|
||||||
|
|
||||||
def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_elements, tmpdir):
|
def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_elements, tmpdir):
|
||||||
@ -134,7 +134,7 @@ def test_auto_partition_doc_with_filename(
|
|||||||
content_type=content_type,
|
content_type=content_type,
|
||||||
)
|
)
|
||||||
assert elements == expected_docx_elements
|
assert elements == expected_docx_elements
|
||||||
assert elements[0].metadata.filename == doc_filename
|
assert elements[0].metadata.filename == "mock_document.doc"
|
||||||
|
|
||||||
|
|
||||||
# NOTE(robinson) - the application/x-ole-storage mime type is not specific enough to
|
# NOTE(robinson) - the application/x-ole-storage mime type is not specific enough to
|
||||||
@ -160,7 +160,7 @@ def test_auto_partition_html_from_filename(pass_file_filename, content_type):
|
|||||||
file_filename = filename if pass_file_filename else None
|
file_filename = filename if pass_file_filename else None
|
||||||
elements = partition(filename=filename, file_filename=file_filename, content_type=content_type)
|
elements = partition(filename=filename, file_filename=file_filename, content_type=content_type)
|
||||||
assert len(elements) > 0
|
assert len(elements) > 0
|
||||||
assert elements[0].metadata.filename == filename
|
assert elements[0].metadata.filename == os.path.basename(filename)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@ -239,7 +239,7 @@ def test_auto_partition_text_from_filename():
|
|||||||
elements = partition(filename=filename)
|
elements = partition(filename=filename)
|
||||||
assert len(elements) > 0
|
assert len(elements) > 0
|
||||||
assert elements == EXPECTED_TEXT_OUTPUT
|
assert elements == EXPECTED_TEXT_OUTPUT
|
||||||
assert elements[0].metadata.filename == filename
|
assert elements[0].metadata.filename == os.path.basename(filename)
|
||||||
|
|
||||||
|
|
||||||
def test_auto_partition_text_from_file():
|
def test_auto_partition_text_from_file():
|
||||||
@ -266,7 +266,7 @@ def test_auto_partition_pdf_from_filename(pass_file_filename, content_type):
|
|||||||
assert isinstance(elements[1], NarrativeText)
|
assert isinstance(elements[1], NarrativeText)
|
||||||
assert elements[1].text.startswith("Zejiang Shen")
|
assert elements[1].text.startswith("Zejiang Shen")
|
||||||
|
|
||||||
assert elements[0].metadata.filename == filename
|
assert elements[0].metadata.filename == os.path.basename(filename)
|
||||||
|
|
||||||
|
|
||||||
def test_auto_partition_pdf_uses_table_extraction():
|
def test_auto_partition_pdf_uses_table_extraction():
|
||||||
@ -369,7 +369,7 @@ def test_auto_partition_pptx_from_filename():
|
|||||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
|
||||||
elements = partition(filename=filename)
|
elements = partition(filename=filename)
|
||||||
assert elements == EXPECTED_PPTX_OUTPUT
|
assert elements == EXPECTED_PPTX_OUTPUT
|
||||||
assert elements[0].metadata.filename == filename
|
assert elements[0].metadata.filename == os.path.basename(filename)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
||||||
@ -377,7 +377,7 @@ def test_auto_partition_ppt_from_filename():
|
|||||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt")
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt")
|
||||||
elements = partition(filename=filename)
|
elements = partition(filename=filename)
|
||||||
assert elements == EXPECTED_PPTX_OUTPUT
|
assert elements == EXPECTED_PPTX_OUTPUT
|
||||||
assert elements[0].metadata.filename == filename
|
assert elements[0].metadata.filename == os.path.basename(filename)
|
||||||
|
|
||||||
|
|
||||||
def test_auto_with_page_breaks():
|
def test_auto_with_page_breaks():
|
||||||
|
|||||||
@ -181,6 +181,9 @@ def test_partition_email_has_metadata():
|
|||||||
subject="Test Email",
|
subject="Test Email",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
expected_dt = datetime.datetime.fromisoformat("2022-12-16T17:04:16-05:00")
|
||||||
|
assert elements[0].metadata.get_date() == expected_dt
|
||||||
|
|
||||||
|
|
||||||
def test_extract_email_text_matches_html():
|
def test_extract_email_text_matches_html():
|
||||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-attachment.eml")
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-attachment.eml")
|
||||||
|
|||||||
@ -37,7 +37,7 @@ def test_partition_msg_from_filename():
|
|||||||
elements = partition_msg(filename=filename)
|
elements = partition_msg(filename=filename)
|
||||||
assert elements == EXPECTED_MSG_OUTPUT
|
assert elements == EXPECTED_MSG_OUTPUT
|
||||||
assert elements[0].metadata == ElementMetadata(
|
assert elements[0].metadata == ElementMetadata(
|
||||||
filename=filename,
|
filename="fake-email.msg",
|
||||||
date="2022-12-16T17:04:16-05:00",
|
date="2022-12-16T17:04:16-05:00",
|
||||||
page_number=None,
|
page_number=None,
|
||||||
url=None,
|
url=None,
|
||||||
|
|||||||
@ -4,7 +4,7 @@
|
|||||||
"text": "testing <@U051UBRR946> has joined the channel <@U04ST78RXU3> has joined the channel",
|
"text": "testing <@U051UBRR946> has joined the channel <@U04ST78RXU3> has joined the channel",
|
||||||
"type": "NarrativeText",
|
"type": "NarrativeText",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"filename": "slack-ingest-download/C052BGT7718.txt"
|
"filename": "C052BGT7718.txt"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.6.6-dev1" # pragma: no cover
|
__version__ = "0.6.6-dev2" # pragma: no cover
|
||||||
|
|||||||
@ -1,4 +1,6 @@
|
|||||||
|
import datetime
|
||||||
import hashlib
|
import hashlib
|
||||||
|
import os
|
||||||
import pathlib
|
import pathlib
|
||||||
from abc import ABC
|
from abc import ABC
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
@ -34,6 +36,9 @@ class ElementMetadata:
|
|||||||
if isinstance(self.filename, pathlib.Path):
|
if isinstance(self.filename, pathlib.Path):
|
||||||
self.filename = str(self.filename)
|
self.filename = str(self.filename)
|
||||||
|
|
||||||
|
if self.filename is not None:
|
||||||
|
self.filename = os.path.basename(self.filename)
|
||||||
|
|
||||||
def to_dict(self):
|
def to_dict(self):
|
||||||
return {key: value for key, value in self.__dict__.items() if value is not None}
|
return {key: value for key, value in self.__dict__.items() if value is not None}
|
||||||
|
|
||||||
@ -41,6 +46,13 @@ class ElementMetadata:
|
|||||||
def from_dict(cls, input_dict):
|
def from_dict(cls, input_dict):
|
||||||
return cls(**input_dict)
|
return cls(**input_dict)
|
||||||
|
|
||||||
|
def get_date(self) -> Optional[datetime.datetime]:
|
||||||
|
"""Converts the date field to a datetime object."""
|
||||||
|
dt = None
|
||||||
|
if self.date is not None:
|
||||||
|
dt = datetime.datetime.fromisoformat(self.date)
|
||||||
|
return dt
|
||||||
|
|
||||||
|
|
||||||
class Element(ABC):
|
class Element(ABC):
|
||||||
"""An element is a section of a page in the document."""
|
"""An element is a section of a page in the document."""
|
||||||
|
|||||||
@ -91,7 +91,7 @@ def partition_email_header(msg: Message) -> List[Element]:
|
|||||||
return elements
|
return elements
|
||||||
|
|
||||||
|
|
||||||
def build_email_metadata(msg: Message) -> ElementMetadata:
|
def build_email_metadata(msg: Message, filename: Optional[str]) -> ElementMetadata:
|
||||||
"""Creates an ElementMetadata object from the header information in the email."""
|
"""Creates an ElementMetadata object from the header information in the email."""
|
||||||
header_dict = dict(msg.raw_items())
|
header_dict = dict(msg.raw_items())
|
||||||
email_date = header_dict.get("Date")
|
email_date = header_dict.get("Date")
|
||||||
@ -111,6 +111,7 @@ def build_email_metadata(msg: Message) -> ElementMetadata:
|
|||||||
sent_from=sent_from,
|
sent_from=sent_from,
|
||||||
subject=header_dict.get("Subject"),
|
subject=header_dict.get("Subject"),
|
||||||
date=email_date,
|
date=email_date,
|
||||||
|
filename=filename,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -278,8 +279,7 @@ def partition_email(
|
|||||||
header = partition_email_header(msg)
|
header = partition_email_header(msg)
|
||||||
all_elements = header + elements
|
all_elements = header + elements
|
||||||
|
|
||||||
metadata = build_email_metadata(msg)
|
metadata = build_email_metadata(msg, filename=filename)
|
||||||
metadata.filename = filename
|
|
||||||
for element in all_elements:
|
for element in all_elements:
|
||||||
element.metadata = metadata
|
element.metadata = metadata
|
||||||
return all_elements
|
return all_elements
|
||||||
|
|||||||
@ -39,15 +39,14 @@ def partition_msg(
|
|||||||
else:
|
else:
|
||||||
elements = partition_text(text=text)
|
elements = partition_text(text=text)
|
||||||
|
|
||||||
metadata = build_msg_metadata(msg_obj)
|
metadata = build_msg_metadata(msg_obj, filename)
|
||||||
metadata.filename = filename
|
|
||||||
for element in elements:
|
for element in elements:
|
||||||
element.metadata = metadata
|
element.metadata = metadata
|
||||||
|
|
||||||
return elements
|
return elements
|
||||||
|
|
||||||
|
|
||||||
def build_msg_metadata(msg_obj: msg_parser.MsOxMessage) -> ElementMetadata:
|
def build_msg_metadata(msg_obj: msg_parser.MsOxMessage, filename: Optional[str]) -> ElementMetadata:
|
||||||
"""Creates an ElementMetadata object from the header information in the email."""
|
"""Creates an ElementMetadata object from the header information in the email."""
|
||||||
email_date = getattr(msg_obj, "sent_date", None)
|
email_date = getattr(msg_obj, "sent_date", None)
|
||||||
if email_date is not None:
|
if email_date is not None:
|
||||||
@ -66,6 +65,7 @@ def build_msg_metadata(msg_obj: msg_parser.MsOxMessage) -> ElementMetadata:
|
|||||||
sent_from=sent_from,
|
sent_from=sent_from,
|
||||||
subject=getattr(msg_obj, "subject", None),
|
subject=getattr(msg_obj, "subject", None),
|
||||||
date=email_date,
|
date=email_date,
|
||||||
|
filename=filename,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user