fix: email addresses shouldn't be flagged as titles (#957)

* feat: add func for checking on EmailAddress type * feat: add EmailAddress type * feat: add check for email type * feat: add test for checking EmailAddress type * feat: update existing example files with email * feat: add new example files with email in the text * fix: apply linter * feat: update changelog file * feat: add test for is_email_address function * don't push * fix: clean up code * apply linter * fix: clean up * fix: remove file changes * fix: remove not used files for email address test * fix: remove not necessary tests * clean up * fix: apply linter * fix: update CHANGELOG * fix: change version * fix: fix msg test * fix: apply linter for tests * fix: remove spaces * fix: apply linter with longer line * feat: update documentation * fix: remove duplicates * Update getting_started.rst --------- Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
2025-12-13 16:11:05 +00:00 · 2023-08-04 18:28:36 +03:00 · 2023-08-04 18:28:36 +03:00 · bef93aef6e
commit bef93aef6e
parent 47b20119c3
13 changed files with 66 additions and 7 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -17,6 +17,7 @@
 * Pass file_filename metadata when partitioning file object
 * Skip ingest test on missing Slack token
 * Add Dropbox variables to CI environments
+* Adds new element type `EmailAddress` for recognising email addresses in the text

 * Simplifies `min_partition` logic; makes partitions falling below the `min_partition`
  less likely.
--- a/docs/source/getting_started.rst
+++ b/docs/source/getting_started.rst
@ -73,6 +73,7 @@ These element objects represent different components of the source document. Cur
 		* ``PageBreak``
 		* ``Header``
 		* ``Footer``
+        	* ``EmailAddress``
 	* ``CheckBox``
 	* ``Image``

--- a/test_unstructured/partition/test_doc.py
+++ b/test_unstructured/partition/test_doc.py
@ -33,10 +33,7 @@ def mock_document():
    # NOTE(robinson) - this should get dropped because it is empty
    document.add_paragraph("", style="Normal")
    # NOTE(robinson) - this should get picked up as a narrative text
-    document.add_paragraph(
-        "This is my first thought. This is my second thought.",
-        style="Normal",
-    )
+    document.add_paragraph("This is my first thought. This is my second thought.", style="Normal")
    document.add_paragraph("This is my third thought.", style="Body Text")
    # NOTE(robinson) - this should just be regular text
    document.add_paragraph("2023")
--- a/test_unstructured/partition/test_msg.py
+++ b/test_unstructured/partition/test_msg.py
@ -141,6 +141,7 @@ def test_partition_msg_from_file_exclude_metadata():


 def test_partition_msg_can_process_attachments(
+    mocker,
    tmpdir,
    filename="example-docs/fake-email-attachment.msg",
 ):
@ -149,6 +150,13 @@ def test_partition_msg_can_process_attachments(
        tmpdir.dirname,
        ATTACH_EXPECTED_OUTPUT[0]["filename"],
    )
+
+    mocked_last_modification_date = "2029-07-05T09:24:28"
+
+    mocker.patch(
+        "unstructured.partition.text.get_last_modified_date",
+        return_value=mocked_last_modification_date,
+    )
    attachment_elements = partition_text(
        filename=attachment_filename,
        metadata_filename=attachment_filename,
@ -161,7 +169,7 @@ def test_partition_msg_can_process_attachments(
        filename=filename,
        attachment_partitioner=partition_text,
        process_attachments=True,
-        metadata_last_modified="2029-07-05T09:24:28",
+        metadata_last_modified=mocked_last_modification_date,
    )

    assert elements[0].text.startswith("Hello!")
--- a/test_unstructured/partition/test_text_type.py
+++ b/test_unstructured/partition/test_text_type.py
@ -294,3 +294,19 @@ def test_item_titles():
 )
 def test_is_us_city_state_zip(text, expected):
    assert text_type.is_us_city_state_zip(text) is expected
+
+
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [
+        ("fake@gmail.com", True),
+        ("Fake@gmail.com", False),  # pattern only accepts lowercase letters
+        ("fake.gmail.@gmail.com", True),  # dots are allowed anywhere in the local part
+        ("fake.gmail@.@gmail.com", False),  # a second "@" cannot match the domain part
+        ("     fake@gmail.com", True),  # surrounding whitespace is stripped first
+        ("fak!/$e@gmail.com", False),  # "!", "/" and "$" are outside the allowed characters
+        ("", False),
+    ],
+)
+def test_is_email_address(text, expected):
+    assert text_type.is_email_address(text) is expected
--- a/unstructured/documents/elements.py
+++ b/unstructured/documents/elements.py
@ -457,6 +457,13 @@ class Address(Text):
    pass


+class EmailAddress(Text):
+    """A text element for capturing email addresses."""
+
+    category = "EmailAddress"
+    pass
+
+
 class Image(Text):
    """A text element for capturing image metadata."""

@ -505,6 +512,7 @@ TYPE_TO_TEXT_ELEMENT_MAP: Dict[str, Any] = {
    "BulletedText": ListItem,
    "Title": Title,
    "Address": Address,
+    "EmailAddress": EmailAddress,
    "Image": Image,
    "PageBreak": PageBreak,
    "Table": Table,
--- a/unstructured/documents/html.py
+++ b/unstructured/documents/html.py
@ -15,6 +15,7 @@ from unstructured.documents.base import Page
 from unstructured.documents.elements import (
    Address,
    Element,
+    EmailAddress,
    Link,
    ListItem,
    NarrativeText,
@ -25,6 +26,7 @@ from unstructured.documents.xml import VALID_PARSERS, XMLDocument
 from unstructured.logger import logger
 from unstructured.partition.text_type import (
    is_bulleted_text,
+    is_email_address,
    is_possible_narrative_text,
    is_possible_title,
    is_us_city_state_zip,
@ -73,6 +75,12 @@ class HTMLAddress(TagsMixin, Address):
    pass


+class HTMLEmailAddress(TagsMixin, EmailAddress):
+    """EmailAddress element with HTML tag information."""
+
+    pass
+
+
 class HTMLTitle(TagsMixin, Title):
    """Title with tag information."""

@ -306,6 +314,8 @@ def _text_to_element(
            links=links,
            emphasized_texts=emphasized_texts,
        )
+    elif is_email_address(text):
+        return HTMLEmailAddress(text=text, tag=tag, links=links, emphasized_texts=emphasized_texts)

    if len(text) < 2:
        return None
--- a/unstructured/documents/xml.py
+++ b/unstructured/documents/xml.py
@ -134,4 +134,5 @@ class XMLDocument(Document):
        **kwargs,
    ):
        _, content = read_txt_file(filename=filename, encoding=encoding)
+
        return cls.from_string(content, parser=parser, stylesheet=stylesheet, **kwargs)
--- a/unstructured/nlp/patterns.py
+++ b/unstructured/nlp/patterns.py
@ -100,7 +100,7 @@ EMAIL_DATETIMETZ_PATTERN_RE = re.compile(EMAIL_DATETIMETZ_PATTERN)

 EMAIL_ADDRESS_PATTERN = "[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+"  # noqa: W605 NOTE(harrell)
 # - skipping qa because we need the escape for the regex
-
+EMAIL_ADDRESS_PATTERN_RE = re.compile(EMAIL_ADDRESS_PATTERN)

 ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z"
 ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)
--- a/unstructured/partition/docx.py
+++ b/unstructured/partition/docx.py
@ -13,6 +13,7 @@ from unstructured.documents.elements import (
    Address,
    Element,
    ElementMetadata,
+    EmailAddress,
    Footer,
    Header,
    ListItem,
@ -33,6 +34,7 @@ from unstructured.partition.common import (
 )
 from unstructured.partition.text_type import (
    is_bulleted_text,
+    is_email_address,
    is_possible_narrative_text,
    is_possible_title,
    is_us_city_state_zip,
@ -259,7 +261,8 @@ def _text_to_element(text: str, is_list=False) -> Optional[Text]:

    elif is_us_city_state_zip(text):
        return Address(text=text)
-
+    elif is_email_address(text):
+        return EmailAddress(text=text)
    if len(text) < 2:
        return None
    elif is_possible_narrative_text(text):
--- a/unstructured/partition/pptx.py
+++ b/unstructured/partition/pptx.py
@ -6,6 +6,7 @@ import pptx
 from unstructured.documents.elements import (
    Element,
    ElementMetadata,
+    EmailAddress,
    ListItem,
    NarrativeText,
    PageBreak,
@ -23,6 +24,7 @@ from unstructured.partition.common import (
    spooled_to_bytes_io_if_needed,
 )
 from unstructured.partition.text_type import (
+    is_email_address,
    is_possible_narrative_text,
    is_possible_title,
 )
@ -121,6 +123,8 @@ def partition_pptx(
                    continue
                if _is_bulleted_paragraph(paragraph):
                    elements.append(ListItem(text=text, metadata=metadata))
+                elif is_email_address(text):
+                    elements.append(EmailAddress(text=text))
                elif is_possible_narrative_text(text):
                    elements.append(NarrativeText(text=text, metadata=metadata))
                elif is_possible_title(text):
--- a/unstructured/partition/text.py
+++ b/unstructured/partition/text.py
@ -8,6 +8,7 @@ from unstructured.documents.elements import (
    Address,
    Element,
    ElementMetadata,
+    EmailAddress,
    ListItem,
    NarrativeText,
    Text,
@ -25,6 +26,7 @@ from unstructured.partition.common import (
 )
 from unstructured.partition.text_type import (
    is_bulleted_text,
+    is_email_address,
    is_possible_narrative_text,
    is_possible_title,
    is_us_city_state_zip,
@ -261,6 +263,8 @@ def element_from_text(
            coordinates=coordinates,
            coordinate_system=coordinate_system,
        )
+    elif is_email_address(text):
+        return EmailAddress(text=text)
    elif is_us_city_state_zip(text):
        return Address(
            text=text,
--- a/unstructured/partition/text_type.py
+++ b/unstructured/partition/text_type.py
@ -13,6 +13,7 @@ from unstructured.cleaners.core import remove_punctuation
 from unstructured.logger import trace_logger
 from unstructured.nlp.english_words import ENGLISH_WORDS
 from unstructured.nlp.patterns import (
+    EMAIL_ADDRESS_PATTERN_RE,
    ENDS_IN_PUNCT_RE,
    UNICODE_BULLETS_RE,
    US_CITY_STATE_ZIP_RE,
@ -304,3 +305,8 @@ def is_us_city_state_zip(text) -> bool:
    DOYLESTOWN, PENNSYLVANIA 18901
    """
    return US_CITY_STATE_ZIP_RE.match(text.strip()) is not None
+
+
+def is_email_address(text) -> bool:
+    """Return True only if the whole stripped text matches EMAIL_ADDRESS_PATTERN_RE."""
+    return EMAIL_ADDRESS_PATTERN_RE.fullmatch(text.strip()) is not None