mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-02 13:24:03 +00:00
fix: email addresses shouldn't be flagged as titles (#957)
* feat: add func for checking on EmailAddress type * feat: add EmailAddress type * feat: add check for email type * feat: add test for checking EmailAddress type * feat: update existing example files with email * feat: add new example files with email in the text * fix: apply linter * feat: update changelog file * feat: add test for is_email_address function * don't push * fix: clean up code * apply linter * fix: clean up * fix: remove file changes * fix: remove not used files for email address test * fix: remove not necessary tests * clean up * fix: apply linter * fix: update CHANGELOG * fix: change version * fix: fix msg test * fix: apply linter for tests * fix: remove spaces * fix: apply linter with longer line * feat: update documentation * fix: remove duplicates * Update getting_started.rst --------- Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
This commit is contained in:
parent
47b20119c3
commit
bef93aef6e
@ -17,6 +17,7 @@
|
|||||||
* Pass file_filename metadata when partitioning file object
|
* Pass file_filename metadata when partitioning file object
|
||||||
* Skip ingest test on missing Slack token
|
* Skip ingest test on missing Slack token
|
||||||
* Add Dropbox variables to CI environments
|
* Add Dropbox variables to CI environments
|
||||||
|
* Adds new element type `EmailAddress` for recognising email addresses in the text
|
||||||
|
|
||||||
* Simplifies `min_partition` logic; makes partitions falling below the `min_partition`
|
* Simplifies `min_partition` logic; makes partitions falling below the `min_partition`
|
||||||
less likely.
|
less likely.
|
||||||
|
@ -73,6 +73,7 @@ These element objects represent different components of the source document. Cur
|
|||||||
* ``PageBreak``
|
* ``PageBreak``
|
||||||
* ``Header``
|
* ``Header``
|
||||||
* ``Footer``
|
* ``Footer``
|
||||||
|
* ``EmailAddress``
|
||||||
* ``CheckBox``
|
* ``CheckBox``
|
||||||
* ``Image``
|
* ``Image``
|
||||||
|
|
||||||
|
@ -33,10 +33,7 @@ def mock_document():
|
|||||||
# NOTE(robinson) - this should get dropped because it is empty
|
# NOTE(robinson) - this should get dropped because it is empty
|
||||||
document.add_paragraph("", style="Normal")
|
document.add_paragraph("", style="Normal")
|
||||||
# NOTE(robinson) - this should get picked up as a narrative text
|
# NOTE(robinson) - this should get picked up as a narrative text
|
||||||
document.add_paragraph(
|
document.add_paragraph("This is my first thought. This is my second thought.", style="Normal")
|
||||||
"This is my first thought. This is my second thought.",
|
|
||||||
style="Normal",
|
|
||||||
)
|
|
||||||
document.add_paragraph("This is my third thought.", style="Body Text")
|
document.add_paragraph("This is my third thought.", style="Body Text")
|
||||||
# NOTE(robinson) - this should just be regular text
|
# NOTE(robinson) - this should just be regular text
|
||||||
document.add_paragraph("2023")
|
document.add_paragraph("2023")
|
||||||
|
@ -141,6 +141,7 @@ def test_partition_msg_from_file_exclude_metadata():
|
|||||||
|
|
||||||
|
|
||||||
def test_partition_msg_can_process_attachments(
|
def test_partition_msg_can_process_attachments(
|
||||||
|
mocker,
|
||||||
tmpdir,
|
tmpdir,
|
||||||
filename="example-docs/fake-email-attachment.msg",
|
filename="example-docs/fake-email-attachment.msg",
|
||||||
):
|
):
|
||||||
@ -149,6 +150,13 @@ def test_partition_msg_can_process_attachments(
|
|||||||
tmpdir.dirname,
|
tmpdir.dirname,
|
||||||
ATTACH_EXPECTED_OUTPUT[0]["filename"],
|
ATTACH_EXPECTED_OUTPUT[0]["filename"],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
mocked_last_modification_date = "2029-07-05T09:24:28"
|
||||||
|
|
||||||
|
mocker.patch(
|
||||||
|
"unstructured.partition.text.get_last_modified_date",
|
||||||
|
return_value=mocked_last_modification_date,
|
||||||
|
)
|
||||||
attachment_elements = partition_text(
|
attachment_elements = partition_text(
|
||||||
filename=attachment_filename,
|
filename=attachment_filename,
|
||||||
metadata_filename=attachment_filename,
|
metadata_filename=attachment_filename,
|
||||||
@ -161,7 +169,7 @@ def test_partition_msg_can_process_attachments(
|
|||||||
filename=filename,
|
filename=filename,
|
||||||
attachment_partitioner=partition_text,
|
attachment_partitioner=partition_text,
|
||||||
process_attachments=True,
|
process_attachments=True,
|
||||||
metadata_last_modified="2029-07-05T09:24:28",
|
metadata_last_modified=mocked_last_modification_date,
|
||||||
)
|
)
|
||||||
|
|
||||||
assert elements[0].text.startswith("Hello!")
|
assert elements[0].text.startswith("Hello!")
|
||||||
|
@ -294,3 +294,19 @@ def test_item_titles():
|
|||||||
)
|
)
|
||||||
def test_is_us_city_state_zip(text, expected):
|
def test_is_us_city_state_zip(text, expected):
|
||||||
assert text_type.is_us_city_state_zip(text) is expected
|
assert text_type.is_us_city_state_zip(text) is expected
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("text", "expected"),
|
||||||
|
[
|
||||||
|
("fake@gmail.com", True),
|
||||||
|
("Fake@gmail.com", False),
|
||||||
|
("fake.gmail.@gmail.com", True),
|
||||||
|
("fake.gmail@.@gmail.com", False),
|
||||||
|
(" fake@gmail.com", True),
|
||||||
|
("fak!/$e@gmail.com", False),
|
||||||
|
("", False),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_is_email_address(text, expected):
|
||||||
|
assert text_type.is_email_address(text) is expected
|
||||||
|
@ -457,6 +457,13 @@ class Address(Text):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class EmailAddress(Text):
|
||||||
|
"""A text element for capturing addresses"""
|
||||||
|
|
||||||
|
category = "EmailAddress"
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class Image(Text):
|
class Image(Text):
|
||||||
"""A text element for capturing image metadata."""
|
"""A text element for capturing image metadata."""
|
||||||
|
|
||||||
@ -505,6 +512,7 @@ TYPE_TO_TEXT_ELEMENT_MAP: Dict[str, Any] = {
|
|||||||
"BulletedText": ListItem,
|
"BulletedText": ListItem,
|
||||||
"Title": Title,
|
"Title": Title,
|
||||||
"Address": Address,
|
"Address": Address,
|
||||||
|
"EmailAddress": EmailAddress,
|
||||||
"Image": Image,
|
"Image": Image,
|
||||||
"PageBreak": PageBreak,
|
"PageBreak": PageBreak,
|
||||||
"Table": Table,
|
"Table": Table,
|
||||||
|
@ -15,6 +15,7 @@ from unstructured.documents.base import Page
|
|||||||
from unstructured.documents.elements import (
|
from unstructured.documents.elements import (
|
||||||
Address,
|
Address,
|
||||||
Element,
|
Element,
|
||||||
|
EmailAddress,
|
||||||
Link,
|
Link,
|
||||||
ListItem,
|
ListItem,
|
||||||
NarrativeText,
|
NarrativeText,
|
||||||
@ -25,6 +26,7 @@ from unstructured.documents.xml import VALID_PARSERS, XMLDocument
|
|||||||
from unstructured.logger import logger
|
from unstructured.logger import logger
|
||||||
from unstructured.partition.text_type import (
|
from unstructured.partition.text_type import (
|
||||||
is_bulleted_text,
|
is_bulleted_text,
|
||||||
|
is_email_address,
|
||||||
is_possible_narrative_text,
|
is_possible_narrative_text,
|
||||||
is_possible_title,
|
is_possible_title,
|
||||||
is_us_city_state_zip,
|
is_us_city_state_zip,
|
||||||
@ -73,6 +75,12 @@ class HTMLAddress(TagsMixin, Address):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLEmailAddress(TagsMixin, EmailAddress):
|
||||||
|
"""EmailAddress with tag information"""
|
||||||
|
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class HTMLTitle(TagsMixin, Title):
|
class HTMLTitle(TagsMixin, Title):
|
||||||
"""Title with tag information."""
|
"""Title with tag information."""
|
||||||
|
|
||||||
@ -306,6 +314,8 @@ def _text_to_element(
|
|||||||
links=links,
|
links=links,
|
||||||
emphasized_texts=emphasized_texts,
|
emphasized_texts=emphasized_texts,
|
||||||
)
|
)
|
||||||
|
elif is_email_address(text):
|
||||||
|
return HTMLEmailAddress(text=text, tag=tag, links=links, emphasized_texts=emphasized_texts)
|
||||||
|
|
||||||
if len(text) < 2:
|
if len(text) < 2:
|
||||||
return None
|
return None
|
||||||
|
@ -134,4 +134,5 @@ class XMLDocument(Document):
|
|||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
_, content = read_txt_file(filename=filename, encoding=encoding)
|
_, content = read_txt_file(filename=filename, encoding=encoding)
|
||||||
|
|
||||||
return cls.from_string(content, parser=parser, stylesheet=stylesheet, **kwargs)
|
return cls.from_string(content, parser=parser, stylesheet=stylesheet, **kwargs)
|
||||||
|
@ -100,7 +100,7 @@ EMAIL_DATETIMETZ_PATTERN_RE = re.compile(EMAIL_DATETIMETZ_PATTERN)
|
|||||||
|
|
||||||
EMAIL_ADDRESS_PATTERN = "[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+" # noqa: W605 NOTE(harrell)
|
EMAIL_ADDRESS_PATTERN = "[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+" # noqa: W605 NOTE(harrell)
|
||||||
# - skipping qa because we need the escape for the regex
|
# - skipping qa because we need the escape for the regex
|
||||||
|
EMAIL_ADDRESS_PATTERN_RE = re.compile(EMAIL_ADDRESS_PATTERN)
|
||||||
|
|
||||||
ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z"
|
ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z"
|
||||||
ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)
|
ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)
|
||||||
|
@ -13,6 +13,7 @@ from unstructured.documents.elements import (
|
|||||||
Address,
|
Address,
|
||||||
Element,
|
Element,
|
||||||
ElementMetadata,
|
ElementMetadata,
|
||||||
|
EmailAddress,
|
||||||
Footer,
|
Footer,
|
||||||
Header,
|
Header,
|
||||||
ListItem,
|
ListItem,
|
||||||
@ -33,6 +34,7 @@ from unstructured.partition.common import (
|
|||||||
)
|
)
|
||||||
from unstructured.partition.text_type import (
|
from unstructured.partition.text_type import (
|
||||||
is_bulleted_text,
|
is_bulleted_text,
|
||||||
|
is_email_address,
|
||||||
is_possible_narrative_text,
|
is_possible_narrative_text,
|
||||||
is_possible_title,
|
is_possible_title,
|
||||||
is_us_city_state_zip,
|
is_us_city_state_zip,
|
||||||
@ -259,7 +261,8 @@ def _text_to_element(text: str, is_list=False) -> Optional[Text]:
|
|||||||
|
|
||||||
elif is_us_city_state_zip(text):
|
elif is_us_city_state_zip(text):
|
||||||
return Address(text=text)
|
return Address(text=text)
|
||||||
|
elif is_email_address(text):
|
||||||
|
return EmailAddress(text=text)
|
||||||
if len(text) < 2:
|
if len(text) < 2:
|
||||||
return None
|
return None
|
||||||
elif is_possible_narrative_text(text):
|
elif is_possible_narrative_text(text):
|
||||||
|
@ -6,6 +6,7 @@ import pptx
|
|||||||
from unstructured.documents.elements import (
|
from unstructured.documents.elements import (
|
||||||
Element,
|
Element,
|
||||||
ElementMetadata,
|
ElementMetadata,
|
||||||
|
EmailAddress,
|
||||||
ListItem,
|
ListItem,
|
||||||
NarrativeText,
|
NarrativeText,
|
||||||
PageBreak,
|
PageBreak,
|
||||||
@ -23,6 +24,7 @@ from unstructured.partition.common import (
|
|||||||
spooled_to_bytes_io_if_needed,
|
spooled_to_bytes_io_if_needed,
|
||||||
)
|
)
|
||||||
from unstructured.partition.text_type import (
|
from unstructured.partition.text_type import (
|
||||||
|
is_email_address,
|
||||||
is_possible_narrative_text,
|
is_possible_narrative_text,
|
||||||
is_possible_title,
|
is_possible_title,
|
||||||
)
|
)
|
||||||
@ -121,6 +123,8 @@ def partition_pptx(
|
|||||||
continue
|
continue
|
||||||
if _is_bulleted_paragraph(paragraph):
|
if _is_bulleted_paragraph(paragraph):
|
||||||
elements.append(ListItem(text=text, metadata=metadata))
|
elements.append(ListItem(text=text, metadata=metadata))
|
||||||
|
elif is_email_address(text):
|
||||||
|
elements.append(EmailAddress(text=text))
|
||||||
elif is_possible_narrative_text(text):
|
elif is_possible_narrative_text(text):
|
||||||
elements.append(NarrativeText(text=text, metadata=metadata))
|
elements.append(NarrativeText(text=text, metadata=metadata))
|
||||||
elif is_possible_title(text):
|
elif is_possible_title(text):
|
||||||
|
@ -8,6 +8,7 @@ from unstructured.documents.elements import (
|
|||||||
Address,
|
Address,
|
||||||
Element,
|
Element,
|
||||||
ElementMetadata,
|
ElementMetadata,
|
||||||
|
EmailAddress,
|
||||||
ListItem,
|
ListItem,
|
||||||
NarrativeText,
|
NarrativeText,
|
||||||
Text,
|
Text,
|
||||||
@ -25,6 +26,7 @@ from unstructured.partition.common import (
|
|||||||
)
|
)
|
||||||
from unstructured.partition.text_type import (
|
from unstructured.partition.text_type import (
|
||||||
is_bulleted_text,
|
is_bulleted_text,
|
||||||
|
is_email_address,
|
||||||
is_possible_narrative_text,
|
is_possible_narrative_text,
|
||||||
is_possible_title,
|
is_possible_title,
|
||||||
is_us_city_state_zip,
|
is_us_city_state_zip,
|
||||||
@ -261,6 +263,8 @@ def element_from_text(
|
|||||||
coordinates=coordinates,
|
coordinates=coordinates,
|
||||||
coordinate_system=coordinate_system,
|
coordinate_system=coordinate_system,
|
||||||
)
|
)
|
||||||
|
elif is_email_address(text):
|
||||||
|
return EmailAddress(text=text)
|
||||||
elif is_us_city_state_zip(text):
|
elif is_us_city_state_zip(text):
|
||||||
return Address(
|
return Address(
|
||||||
text=text,
|
text=text,
|
||||||
|
@ -13,6 +13,7 @@ from unstructured.cleaners.core import remove_punctuation
|
|||||||
from unstructured.logger import trace_logger
|
from unstructured.logger import trace_logger
|
||||||
from unstructured.nlp.english_words import ENGLISH_WORDS
|
from unstructured.nlp.english_words import ENGLISH_WORDS
|
||||||
from unstructured.nlp.patterns import (
|
from unstructured.nlp.patterns import (
|
||||||
|
EMAIL_ADDRESS_PATTERN_RE,
|
||||||
ENDS_IN_PUNCT_RE,
|
ENDS_IN_PUNCT_RE,
|
||||||
UNICODE_BULLETS_RE,
|
UNICODE_BULLETS_RE,
|
||||||
US_CITY_STATE_ZIP_RE,
|
US_CITY_STATE_ZIP_RE,
|
||||||
@ -304,3 +305,8 @@ def is_us_city_state_zip(text) -> bool:
|
|||||||
DOYLESTOWN, PENNSYLVANIA 18901
|
DOYLESTOWN, PENNSYLVANIA 18901
|
||||||
"""
|
"""
|
||||||
return US_CITY_STATE_ZIP_RE.match(text.strip()) is not None
|
return US_CITY_STATE_ZIP_RE.match(text.strip()) is not None
|
||||||
|
|
||||||
|
|
||||||
|
def is_email_address(text) -> bool:
|
||||||
|
"""Check if the given text is the email address"""
|
||||||
|
return EMAIL_ADDRESS_PATTERN_RE.match(text.strip()) is not None
|
||||||
|
Loading…
x
Reference in New Issue
Block a user