diff --git a/CHANGELOG.md b/CHANGELOG.md index 9e5dc0091..0064e1d89 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ * Pass file_filename metadata when partitioning file object * Skip ingest test on missing Slack token * Add Dropbox variables to CI environments +* Adds new element type `EmailAddress` for recognising email addresses in the text * Simplifies `min_partition` logic; makes partitions falling below the `min_partition` less likely. diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index 07d0eaf4e..ee2353397 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -73,6 +73,7 @@ These element objects represent different components of the source document. Cur * ``PageBreak`` * ``Header`` * ``Footer`` + * ``EmailAddress`` * ``CheckBox`` * ``Image`` diff --git a/test_unstructured/partition/test_doc.py b/test_unstructured/partition/test_doc.py index bd47749e0..f8ea909ec 100644 --- a/test_unstructured/partition/test_doc.py +++ b/test_unstructured/partition/test_doc.py @@ -33,10 +33,7 @@ def mock_document(): # NOTE(robinson) - this should get dropped because it is empty document.add_paragraph("", style="Normal") # NOTE(robinson) - this should get picked up as a narrative text - document.add_paragraph( - "This is my first thought. This is my second thought.", - style="Normal", - ) + document.add_paragraph("This is my first thought. 
This is my second thought.", style="Normal") document.add_paragraph("This is my third thought.", style="Body Text") # NOTE(robinson) - this should just be regular text document.add_paragraph("2023") diff --git a/test_unstructured/partition/test_msg.py b/test_unstructured/partition/test_msg.py index c6ba2759d..51504cc0b 100644 --- a/test_unstructured/partition/test_msg.py +++ b/test_unstructured/partition/test_msg.py @@ -141,6 +141,7 @@ def test_partition_msg_from_file_exclude_metadata(): def test_partition_msg_can_process_attachments( + mocker, tmpdir, filename="example-docs/fake-email-attachment.msg", ): @@ -149,6 +150,13 @@ def test_partition_msg_can_process_attachments( tmpdir.dirname, ATTACH_EXPECTED_OUTPUT[0]["filename"], ) + + mocked_last_modification_date = "2029-07-05T09:24:28" + + mocker.patch( + "unstructured.partition.text.get_last_modified_date", + return_value=mocked_last_modification_date, + ) attachment_elements = partition_text( filename=attachment_filename, metadata_filename=attachment_filename, @@ -161,7 +169,7 @@ def test_partition_msg_can_process_attachments( filename=filename, attachment_partitioner=partition_text, process_attachments=True, - metadata_last_modified="2029-07-05T09:24:28", + metadata_last_modified=mocked_last_modification_date, ) assert elements[0].text.startswith("Hello!") diff --git a/test_unstructured/partition/test_text_type.py b/test_unstructured/partition/test_text_type.py index 2402df5de..d85b90fec 100644 --- a/test_unstructured/partition/test_text_type.py +++ b/test_unstructured/partition/test_text_type.py @@ -294,3 +294,19 @@ def test_item_titles(): ) def test_is_us_city_state_zip(text, expected): assert text_type.is_us_city_state_zip(text) is expected + + +@pytest.mark.parametrize( + ("text", "expected"), + [ + ("fake@gmail.com", True), + ("Fake@gmail.com", False), + ("fake.gmail.@gmail.com", True), + ("fake.gmail@.@gmail.com", False), + (" fake@gmail.com", True), + ("fak!/$e@gmail.com", False), + ("", False), + ], +) 
+def test_is_email_address(text, expected): + assert text_type.is_email_address(text) is expected diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index e62cda34d..46487ff8e 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -457,6 +457,13 @@ class Address(Text): pass +class EmailAddress(Text): + """A text element for capturing email addresses""" + + category = "EmailAddress" + pass + + class Image(Text): """A text element for capturing image metadata.""" @@ -505,6 +512,7 @@ TYPE_TO_TEXT_ELEMENT_MAP: Dict[str, Any] = { "BulletedText": ListItem, "Title": Title, "Address": Address, + "EmailAddress": EmailAddress, "Image": Image, "PageBreak": PageBreak, "Table": Table, diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py index e690666d6..98acd44f3 100644 --- a/unstructured/documents/html.py +++ b/unstructured/documents/html.py @@ -15,6 +15,7 @@ from unstructured.documents.base import Page from unstructured.documents.elements import ( Address, Element, + EmailAddress, Link, ListItem, NarrativeText, @@ -25,6 +26,7 @@ from unstructured.documents.xml import VALID_PARSERS, XMLDocument from unstructured.logger import logger from unstructured.partition.text_type import ( is_bulleted_text, + is_email_address, is_possible_narrative_text, is_possible_title, is_us_city_state_zip, @@ -73,6 +75,12 @@ class HTMLAddress(TagsMixin, Address): pass +class HTMLEmailAddress(TagsMixin, EmailAddress): + """EmailAddress with tag information""" + + pass + + class HTMLTitle(TagsMixin, Title): """Title with tag information.""" @@ -306,6 +314,8 @@ def _text_to_element( links=links, emphasized_texts=emphasized_texts, ) + elif is_email_address(text): + return HTMLEmailAddress(text=text, tag=tag, links=links, emphasized_texts=emphasized_texts) if len(text) < 2: return None diff --git a/unstructured/documents/xml.py b/unstructured/documents/xml.py index 0ca5537ac..c85ea5ed0 100644 --- 
a/unstructured/documents/xml.py +++ b/unstructured/documents/xml.py @@ -134,4 +134,5 @@ class XMLDocument(Document): **kwargs, ): _, content = read_txt_file(filename=filename, encoding=encoding) + return cls.from_string(content, parser=parser, stylesheet=stylesheet, **kwargs) diff --git a/unstructured/nlp/patterns.py b/unstructured/nlp/patterns.py index d582312c3..6eebbfc94 100644 --- a/unstructured/nlp/patterns.py +++ b/unstructured/nlp/patterns.py @@ -100,7 +100,7 @@ EMAIL_DATETIMETZ_PATTERN_RE = re.compile(EMAIL_DATETIMETZ_PATTERN) EMAIL_ADDRESS_PATTERN = "[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+" # noqa: W605 NOTE(harrell) # - skipping qa because we need the escape for the regex - +EMAIL_ADDRESS_PATTERN_RE = re.compile(EMAIL_ADDRESS_PATTERN) ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z" ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN) diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py index 26ae804d7..f16d7bf40 100644 --- a/unstructured/partition/docx.py +++ b/unstructured/partition/docx.py @@ -13,6 +13,7 @@ from unstructured.documents.elements import ( Address, Element, ElementMetadata, + EmailAddress, Footer, Header, ListItem, @@ -33,6 +34,7 @@ from unstructured.partition.common import ( ) from unstructured.partition.text_type import ( is_bulleted_text, + is_email_address, is_possible_narrative_text, is_possible_title, is_us_city_state_zip, @@ -259,7 +261,8 @@ def _text_to_element(text: str, is_list=False) -> Optional[Text]: elif is_us_city_state_zip(text): return Address(text=text) - + elif is_email_address(text): + return EmailAddress(text=text) if len(text) < 2: return None elif is_possible_narrative_text(text): diff --git a/unstructured/partition/pptx.py b/unstructured/partition/pptx.py index 067fbfe9c..0c2bd6914 100644 --- a/unstructured/partition/pptx.py +++ b/unstructured/partition/pptx.py @@ -6,6 +6,7 @@ import pptx from unstructured.documents.elements import ( Element, ElementMetadata, + EmailAddress, ListItem, NarrativeText, 
PageBreak, @@ -23,6 +24,7 @@ from unstructured.partition.common import ( spooled_to_bytes_io_if_needed, ) from unstructured.partition.text_type import ( + is_email_address, is_possible_narrative_text, is_possible_title, ) @@ -121,6 +123,8 @@ def partition_pptx( continue if _is_bulleted_paragraph(paragraph): elements.append(ListItem(text=text, metadata=metadata)) + elif is_email_address(text): + elements.append(EmailAddress(text=text)) elif is_possible_narrative_text(text): elements.append(NarrativeText(text=text, metadata=metadata)) elif is_possible_title(text): diff --git a/unstructured/partition/text.py b/unstructured/partition/text.py index 4277998cb..3a6f2274b 100644 --- a/unstructured/partition/text.py +++ b/unstructured/partition/text.py @@ -8,6 +8,7 @@ from unstructured.documents.elements import ( Address, Element, ElementMetadata, + EmailAddress, ListItem, NarrativeText, Text, @@ -25,6 +26,7 @@ from unstructured.partition.common import ( ) from unstructured.partition.text_type import ( is_bulleted_text, + is_email_address, is_possible_narrative_text, is_possible_title, is_us_city_state_zip, @@ -261,6 +263,8 @@ def element_from_text( coordinates=coordinates, coordinate_system=coordinate_system, ) + elif is_email_address(text): + return EmailAddress(text=text) elif is_us_city_state_zip(text): return Address( text=text, diff --git a/unstructured/partition/text_type.py b/unstructured/partition/text_type.py index fa606f87b..819b0898f 100644 --- a/unstructured/partition/text_type.py +++ b/unstructured/partition/text_type.py @@ -13,6 +13,7 @@ from unstructured.cleaners.core import remove_punctuation from unstructured.logger import trace_logger from unstructured.nlp.english_words import ENGLISH_WORDS from unstructured.nlp.patterns import ( + EMAIL_ADDRESS_PATTERN_RE, ENDS_IN_PUNCT_RE, UNICODE_BULLETS_RE, US_CITY_STATE_ZIP_RE, @@ -304,3 +305,8 @@ def is_us_city_state_zip(text) -> bool: DOYLESTOWN, PENNSYLVANIA 18901 """ return 
US_CITY_STATE_ZIP_RE.match(text.strip()) is not None + + +def is_email_address(text) -> bool: + """Check if the given text is an email address""" + return EMAIL_ADDRESS_PATTERN_RE.match(text.strip()) is not None