mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-02 05:13:30 +00:00
fix: email addresses shouldn't be flagged as titles (#957)
* feat: add func for checking on EmailAddress type * feat: add EmailAddress type * feat: add check for email type * feat: add test for checking EmailAddress type * feat: update existing example files with email * feat: add new example files with email in the text * fix: apply linter * feat: update changelog file * feat: add test for is_email_address function * don't push * fix: clean up code * apply linter * fix: clean up * fix: remove file changes * fix: remove not used files for email address test * fix: remove not necessary tests * clean up * fix: apply linter * fix: update CHANGELOG * fix: change version * fix: fix msg test * fix: apply linter for tests * fix: remove spaces * fix: apply linter with longer line * feat: update documentation * fix: remove duplicates * Update getting_started.rst --------- Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
This commit is contained in:
parent
47b20119c3
commit
bef93aef6e
@ -17,6 +17,7 @@
|
||||
* Pass file_filename metadata when partitioning file object
|
||||
* Skip ingest test on missing Slack token
|
||||
* Add Dropbox variables to CI environments
|
||||
* Adds new element type `EmailAddress` for recognising email addresses in text
|
||||
|
||||
* Simplifies `min_partition` logic; makes partitions falling below the `min_partition`
|
||||
less likely.
|
||||
|
@ -73,6 +73,7 @@ These element objects represent different components of the source document. Cur
|
||||
* ``PageBreak``
|
||||
* ``Header``
|
||||
* ``Footer``
|
||||
* ``EmailAddress``
|
||||
* ``CheckBox``
|
||||
* ``Image``
|
||||
|
||||
|
@ -33,10 +33,7 @@ def mock_document():
|
||||
# NOTE(robinson) - this should get dropped because it is empty
|
||||
document.add_paragraph("", style="Normal")
|
||||
# NOTE(robinson) - this should get picked up as a narrative text
|
||||
document.add_paragraph(
|
||||
"This is my first thought. This is my second thought.",
|
||||
style="Normal",
|
||||
)
|
||||
document.add_paragraph("This is my first thought. This is my second thought.", style="Normal")
|
||||
document.add_paragraph("This is my third thought.", style="Body Text")
|
||||
# NOTE(robinson) - this should just be regular text
|
||||
document.add_paragraph("2023")
|
||||
|
@ -141,6 +141,7 @@ def test_partition_msg_from_file_exclude_metadata():
|
||||
|
||||
|
||||
def test_partition_msg_can_process_attachments(
|
||||
mocker,
|
||||
tmpdir,
|
||||
filename="example-docs/fake-email-attachment.msg",
|
||||
):
|
||||
@ -149,6 +150,13 @@ def test_partition_msg_can_process_attachments(
|
||||
tmpdir.dirname,
|
||||
ATTACH_EXPECTED_OUTPUT[0]["filename"],
|
||||
)
|
||||
|
||||
mocked_last_modification_date = "2029-07-05T09:24:28"
|
||||
|
||||
mocker.patch(
|
||||
"unstructured.partition.text.get_last_modified_date",
|
||||
return_value=mocked_last_modification_date,
|
||||
)
|
||||
attachment_elements = partition_text(
|
||||
filename=attachment_filename,
|
||||
metadata_filename=attachment_filename,
|
||||
@ -161,7 +169,7 @@ def test_partition_msg_can_process_attachments(
|
||||
filename=filename,
|
||||
attachment_partitioner=partition_text,
|
||||
process_attachments=True,
|
||||
metadata_last_modified="2029-07-05T09:24:28",
|
||||
metadata_last_modified=mocked_last_modification_date,
|
||||
)
|
||||
|
||||
assert elements[0].text.startswith("Hello!")
|
||||
|
@ -294,3 +294,19 @@ def test_item_titles():
|
||||
)
|
||||
def test_is_us_city_state_zip(text, expected):
|
||||
assert text_type.is_us_city_state_zip(text) is expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # Plain lowercase address.
        ("fake@gmail.com", True),
        # The pattern's character classes are lowercase-only.
        ("Fake@gmail.com", False),
        # Dots are permitted in the local part, including just before the "@".
        ("fake.gmail.@gmail.com", True),
        # A second "@" cannot be part of the domain.
        ("fake.gmail@.@gmail.com", False),
        # Surrounding whitespace is stripped before matching.
        (" fake@gmail.com", True),
        # "!", "/" and "$" are outside the allowed character set.
        ("fak!/$e@gmail.com", False),
        # Empty input is never an email address.
        ("", False),
    ],
)
def test_is_email_address(text, expected):
    """Verify that is_email_address returns exactly the expected boolean for each sample."""
    outcome = text_type.is_email_address(text)
    assert outcome is expected
|
||||
|
@ -457,6 +457,13 @@ class Address(Text):
|
||||
pass
|
||||
|
||||
|
||||
class EmailAddress(Text):
    """A text element for capturing email addresses."""

    # Category label for this element type; the same string is the key that maps
    # to this class in TYPE_TO_TEXT_ELEMENT_MAP.
    category = "EmailAddress"
|
||||
|
||||
|
||||
class Image(Text):
|
||||
"""A text element for capturing image metadata."""
|
||||
|
||||
@ -505,6 +512,7 @@ TYPE_TO_TEXT_ELEMENT_MAP: Dict[str, Any] = {
|
||||
"BulletedText": ListItem,
|
||||
"Title": Title,
|
||||
"Address": Address,
|
||||
"EmailAddress": EmailAddress,
|
||||
"Image": Image,
|
||||
"PageBreak": PageBreak,
|
||||
"Table": Table,
|
||||
|
@ -15,6 +15,7 @@ from unstructured.documents.base import Page
|
||||
from unstructured.documents.elements import (
|
||||
Address,
|
||||
Element,
|
||||
EmailAddress,
|
||||
Link,
|
||||
ListItem,
|
||||
NarrativeText,
|
||||
@ -25,6 +26,7 @@ from unstructured.documents.xml import VALID_PARSERS, XMLDocument
|
||||
from unstructured.logger import logger
|
||||
from unstructured.partition.text_type import (
|
||||
is_bulleted_text,
|
||||
is_email_address,
|
||||
is_possible_narrative_text,
|
||||
is_possible_title,
|
||||
is_us_city_state_zip,
|
||||
@ -73,6 +75,12 @@ class HTMLAddress(TagsMixin, Address):
|
||||
pass
|
||||
|
||||
|
||||
class HTMLEmailAddress(TagsMixin, EmailAddress):
    """An EmailAddress element that additionally carries tag information."""
|
||||
|
||||
|
||||
class HTMLTitle(TagsMixin, Title):
|
||||
"""Title with tag information."""
|
||||
|
||||
@ -306,6 +314,8 @@ def _text_to_element(
|
||||
links=links,
|
||||
emphasized_texts=emphasized_texts,
|
||||
)
|
||||
elif is_email_address(text):
|
||||
return HTMLEmailAddress(text=text, tag=tag, links=links, emphasized_texts=emphasized_texts)
|
||||
|
||||
if len(text) < 2:
|
||||
return None
|
||||
|
@ -134,4 +134,5 @@ class XMLDocument(Document):
|
||||
**kwargs,
|
||||
):
|
||||
_, content = read_txt_file(filename=filename, encoding=encoding)
|
||||
|
||||
return cls.from_string(content, parser=parser, stylesheet=stylesheet, **kwargs)
|
||||
|
@ -100,7 +100,7 @@ EMAIL_DATETIMETZ_PATTERN_RE = re.compile(EMAIL_DATETIMETZ_PATTERN)
|
||||
|
||||
# Raw string: the backslashes are regex escapes and must reach ``re`` untouched.
# Written this way the literal is free of invalid string escapes, so no W605
# suppression is needed.
EMAIL_ADDRESS_PATTERN = r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+"
EMAIL_ADDRESS_PATTERN_RE = re.compile(EMAIL_ADDRESS_PATTERN)
|
||||
|
||||
ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z"
|
||||
ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)
|
||||
|
@ -13,6 +13,7 @@ from unstructured.documents.elements import (
|
||||
Address,
|
||||
Element,
|
||||
ElementMetadata,
|
||||
EmailAddress,
|
||||
Footer,
|
||||
Header,
|
||||
ListItem,
|
||||
@ -33,6 +34,7 @@ from unstructured.partition.common import (
|
||||
)
|
||||
from unstructured.partition.text_type import (
|
||||
is_bulleted_text,
|
||||
is_email_address,
|
||||
is_possible_narrative_text,
|
||||
is_possible_title,
|
||||
is_us_city_state_zip,
|
||||
@ -259,7 +261,8 @@ def _text_to_element(text: str, is_list=False) -> Optional[Text]:
|
||||
|
||||
elif is_us_city_state_zip(text):
|
||||
return Address(text=text)
|
||||
|
||||
elif is_email_address(text):
|
||||
return EmailAddress(text=text)
|
||||
if len(text) < 2:
|
||||
return None
|
||||
elif is_possible_narrative_text(text):
|
||||
|
@ -6,6 +6,7 @@ import pptx
|
||||
from unstructured.documents.elements import (
|
||||
Element,
|
||||
ElementMetadata,
|
||||
EmailAddress,
|
||||
ListItem,
|
||||
NarrativeText,
|
||||
PageBreak,
|
||||
@ -23,6 +24,7 @@ from unstructured.partition.common import (
|
||||
spooled_to_bytes_io_if_needed,
|
||||
)
|
||||
from unstructured.partition.text_type import (
|
||||
is_email_address,
|
||||
is_possible_narrative_text,
|
||||
is_possible_title,
|
||||
)
|
||||
@ -121,6 +123,8 @@ def partition_pptx(
|
||||
continue
|
||||
if _is_bulleted_paragraph(paragraph):
|
||||
elements.append(ListItem(text=text, metadata=metadata))
|
||||
elif is_email_address(text):
|
||||
elements.append(EmailAddress(text=text))
|
||||
elif is_possible_narrative_text(text):
|
||||
elements.append(NarrativeText(text=text, metadata=metadata))
|
||||
elif is_possible_title(text):
|
||||
|
@ -8,6 +8,7 @@ from unstructured.documents.elements import (
|
||||
Address,
|
||||
Element,
|
||||
ElementMetadata,
|
||||
EmailAddress,
|
||||
ListItem,
|
||||
NarrativeText,
|
||||
Text,
|
||||
@ -25,6 +26,7 @@ from unstructured.partition.common import (
|
||||
)
|
||||
from unstructured.partition.text_type import (
|
||||
is_bulleted_text,
|
||||
is_email_address,
|
||||
is_possible_narrative_text,
|
||||
is_possible_title,
|
||||
is_us_city_state_zip,
|
||||
@ -261,6 +263,8 @@ def element_from_text(
|
||||
coordinates=coordinates,
|
||||
coordinate_system=coordinate_system,
|
||||
)
|
||||
elif is_email_address(text):
|
||||
return EmailAddress(text=text)
|
||||
elif is_us_city_state_zip(text):
|
||||
return Address(
|
||||
text=text,
|
||||
|
@ -13,6 +13,7 @@ from unstructured.cleaners.core import remove_punctuation
|
||||
from unstructured.logger import trace_logger
|
||||
from unstructured.nlp.english_words import ENGLISH_WORDS
|
||||
from unstructured.nlp.patterns import (
|
||||
EMAIL_ADDRESS_PATTERN_RE,
|
||||
ENDS_IN_PUNCT_RE,
|
||||
UNICODE_BULLETS_RE,
|
||||
US_CITY_STATE_ZIP_RE,
|
||||
@ -304,3 +305,8 @@ def is_us_city_state_zip(text) -> bool:
|
||||
DOYLESTOWN, PENNSYLVANIA 18901
|
||||
"""
|
||||
return US_CITY_STATE_ZIP_RE.match(text.strip()) is not None
|
||||
|
||||
|
||||
def is_email_address(text) -> bool:
    """Check whether the given text is, in its entirety, an email address.

    Surrounding whitespace is ignored. The rest of the string must match
    ``EMAIL_ADDRESS_PATTERN_RE`` from start to end, so text that merely begins
    with an address (e.g. ``"fake@gmail.com is my email"``) is not classified
    as an email address.
    """
    # ``fullmatch`` rather than ``match``: ``match`` anchors only at the start and
    # would accept any sentence that happens to open with an email address.
    return EMAIL_ADDRESS_PATTERN_RE.fullmatch(text.strip()) is not None
|
||||
|
Loading…
x
Reference in New Issue
Block a user