fix: email addresses shouldn't be flagged as titles (#957)

* feat: add func for checking on EmailAddress type

* feat: add EmailAddress type

* feat: add check for email type

* feat: add test for checking EmailAddress type

* feat: update existing example files with email

* feat: add new example files with email in the text

* fix: apply linter

* feat: update changelog file

* feat: add test for is_email_address function

* don't push

* fix: clean up code

* apply linter

* fix: clean up

* fix: remove file changes

* fix: remove unused files for email address test

* fix: remove not necessary tests

* clean up

* fix: apply linter

* fix: update CHANGELOG

* fix: change version

* fix: fix  msg test

* fix: apply linter for tests

* fix: remove spaces

* fix: apply linter with longer line

* feat: update documentation

* fix: remove duplicates

* Update getting_started.rst

---------

Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
This commit is contained in:
kravetsmic 2023-08-04 18:28:36 +03:00 committed by GitHub
parent 47b20119c3
commit bef93aef6e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 66 additions and 7 deletions

View File

@ -17,6 +17,7 @@
* Pass file_filename metadata when partitioning file object
* Skip ingest test on missing Slack token
* Add Dropbox variables to CI environments
* Adds new element type `EmailAddress` for recognising email addresses in the text
* Simplifies `min_partition` logic; makes partitions falling below the `min_partition`
less likely.

View File

@ -73,6 +73,7 @@ These element objects represent different components of the source document. Cur
* ``PageBreak``
* ``Header``
* ``Footer``
* ``EmailAddress``
* ``CheckBox``
* ``Image``

View File

@ -33,10 +33,7 @@ def mock_document():
# NOTE(robinson) - this should get dropped because it is empty
document.add_paragraph("", style="Normal")
# NOTE(robinson) - this should get picked up as a narrative text
document.add_paragraph(
"This is my first thought. This is my second thought.",
style="Normal",
)
document.add_paragraph("This is my first thought. This is my second thought.", style="Normal")
document.add_paragraph("This is my third thought.", style="Body Text")
# NOTE(robinson) - this should just be regular text
document.add_paragraph("2023")

View File

@ -141,6 +141,7 @@ def test_partition_msg_from_file_exclude_metadata():
def test_partition_msg_can_process_attachments(
mocker,
tmpdir,
filename="example-docs/fake-email-attachment.msg",
):
@ -149,6 +150,13 @@ def test_partition_msg_can_process_attachments(
tmpdir.dirname,
ATTACH_EXPECTED_OUTPUT[0]["filename"],
)
mocked_last_modification_date = "2029-07-05T09:24:28"
mocker.patch(
"unstructured.partition.text.get_last_modified_date",
return_value=mocked_last_modification_date,
)
attachment_elements = partition_text(
filename=attachment_filename,
metadata_filename=attachment_filename,
@ -161,7 +169,7 @@ def test_partition_msg_can_process_attachments(
filename=filename,
attachment_partitioner=partition_text,
process_attachments=True,
metadata_last_modified="2029-07-05T09:24:28",
metadata_last_modified=mocked_last_modification_date,
)
assert elements[0].text.startswith("Hello!")

View File

@ -294,3 +294,19 @@ def test_item_titles():
)
def test_is_us_city_state_zip(text, expected):
assert text_type.is_us_city_state_zip(text) is expected
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # Plain lowercase address.
        ("fake@gmail.com", True),
        # Leading uppercase letter: expected to be rejected.
        ("Fake@gmail.com", False),
        # Dots inside (and at the end of) the local part are accepted.
        ("fake.gmail.@gmail.com", True),
        # A second "@" in the address is rejected.
        ("fake.gmail@.@gmail.com", False),
        # Leading whitespace does not prevent detection.
        (" fake@gmail.com", True),
        # Characters such as "!", "/" and "$" in the local part are rejected.
        ("fak!/$e@gmail.com", False),
        # Empty input is never an email address.
        ("", False),
    ],
)
def test_is_email_address(text, expected):
    """Check ``text_type.is_email_address`` against valid and malformed inputs."""
    detected = text_type.is_email_address(text)
    assert detected is expected

View File

@ -457,6 +457,13 @@ class Address(Text):
pass
class EmailAddress(Text):
    """A text element for capturing email addresses."""

    # Category label for this element type; the same string is used as the key
    # for this class in TYPE_TO_TEXT_ELEMENT_MAP.
    category = "EmailAddress"
class Image(Text):
"""A text element for capturing image metadata."""
@ -505,6 +512,7 @@ TYPE_TO_TEXT_ELEMENT_MAP: Dict[str, Any] = {
"BulletedText": ListItem,
"Title": Title,
"Address": Address,
"EmailAddress": EmailAddress,
"Image": Image,
"PageBreak": PageBreak,
"Table": Table,

View File

@ -15,6 +15,7 @@ from unstructured.documents.base import Page
from unstructured.documents.elements import (
Address,
Element,
EmailAddress,
Link,
ListItem,
NarrativeText,
@ -25,6 +26,7 @@ from unstructured.documents.xml import VALID_PARSERS, XMLDocument
from unstructured.logger import logger
from unstructured.partition.text_type import (
is_bulleted_text,
is_email_address,
is_possible_narrative_text,
is_possible_title,
is_us_city_state_zip,
@ -73,6 +75,12 @@ class HTMLAddress(TagsMixin, Address):
pass
class HTMLEmailAddress(TagsMixin, EmailAddress):
    """An ``EmailAddress`` element that additionally carries tag information."""
class HTMLTitle(TagsMixin, Title):
"""Title with tag information."""
@ -306,6 +314,8 @@ def _text_to_element(
links=links,
emphasized_texts=emphasized_texts,
)
elif is_email_address(text):
return HTMLEmailAddress(text=text, tag=tag, links=links, emphasized_texts=emphasized_texts)
if len(text) < 2:
return None

View File

@ -134,4 +134,5 @@ class XMLDocument(Document):
**kwargs,
):
_, content = read_txt_file(filename=filename, encoding=encoding)
return cls.from_string(content, parser=parser, stylesheet=stylesheet, **kwargs)

View File

@ -100,7 +100,7 @@ EMAIL_DATETIMETZ_PATTERN_RE = re.compile(EMAIL_DATETIMETZ_PATTERN)
# NOTE: written as a raw string so the regex escapes (``\.`` and ``\-``) reach
# ``re`` verbatim. A plain string literal would contain invalid escape sequences,
# which needed a ``noqa: W605`` suppression and emit a warning on newer Pythons.
# The character classes are lowercase-only, so uppercase letters do not match.
EMAIL_ADDRESS_PATTERN = r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+"
EMAIL_ADDRESS_PATTERN_RE = re.compile(EMAIL_ADDRESS_PATTERN)

# Matches a single non-word, non-whitespace character at the very end of the text.
ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z"
ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)

View File

@ -13,6 +13,7 @@ from unstructured.documents.elements import (
Address,
Element,
ElementMetadata,
EmailAddress,
Footer,
Header,
ListItem,
@ -33,6 +34,7 @@ from unstructured.partition.common import (
)
from unstructured.partition.text_type import (
is_bulleted_text,
is_email_address,
is_possible_narrative_text,
is_possible_title,
is_us_city_state_zip,
@ -259,7 +261,8 @@ def _text_to_element(text: str, is_list=False) -> Optional[Text]:
elif is_us_city_state_zip(text):
return Address(text=text)
elif is_email_address(text):
return EmailAddress(text=text)
if len(text) < 2:
return None
elif is_possible_narrative_text(text):

View File

@ -6,6 +6,7 @@ import pptx
from unstructured.documents.elements import (
Element,
ElementMetadata,
EmailAddress,
ListItem,
NarrativeText,
PageBreak,
@ -23,6 +24,7 @@ from unstructured.partition.common import (
spooled_to_bytes_io_if_needed,
)
from unstructured.partition.text_type import (
is_email_address,
is_possible_narrative_text,
is_possible_title,
)
@ -121,6 +123,8 @@ def partition_pptx(
continue
if _is_bulleted_paragraph(paragraph):
elements.append(ListItem(text=text, metadata=metadata))
elif is_email_address(text):
elements.append(EmailAddress(text=text))
elif is_possible_narrative_text(text):
elements.append(NarrativeText(text=text, metadata=metadata))
elif is_possible_title(text):

View File

@ -8,6 +8,7 @@ from unstructured.documents.elements import (
Address,
Element,
ElementMetadata,
EmailAddress,
ListItem,
NarrativeText,
Text,
@ -25,6 +26,7 @@ from unstructured.partition.common import (
)
from unstructured.partition.text_type import (
is_bulleted_text,
is_email_address,
is_possible_narrative_text,
is_possible_title,
is_us_city_state_zip,
@ -261,6 +263,8 @@ def element_from_text(
coordinates=coordinates,
coordinate_system=coordinate_system,
)
elif is_email_address(text):
return EmailAddress(text=text)
elif is_us_city_state_zip(text):
return Address(
text=text,

View File

@ -13,6 +13,7 @@ from unstructured.cleaners.core import remove_punctuation
from unstructured.logger import trace_logger
from unstructured.nlp.english_words import ENGLISH_WORDS
from unstructured.nlp.patterns import (
EMAIL_ADDRESS_PATTERN_RE,
ENDS_IN_PUNCT_RE,
UNICODE_BULLETS_RE,
US_CITY_STATE_ZIP_RE,
@ -304,3 +305,8 @@ def is_us_city_state_zip(text) -> bool:
DOYLESTOWN, PENNSYLVANIA 18901
"""
return US_CITY_STATE_ZIP_RE.match(text.strip()) is not None
def is_email_address(text: str) -> bool:
    """Check whether the given text is an email address.

    Leading and trailing whitespace is ignored. The entire remaining text must match
    ``EMAIL_ADDRESS_PATTERN_RE``: text that merely *starts* with an email address
    (for example a sentence beginning with one) is not reported as an email address.

    Returns True when the stripped text is an email address, False otherwise
    (including for empty input).
    """
    # fullmatch (rather than match) anchors the pattern at both ends, so trailing
    # content after the address prevents a positive result.
    return EMAIL_ADDRESS_PATTERN_RE.fullmatch(text.strip()) is not None