From de855bb4ed13c2de60ffef35bc0c47bcd5700a70 Mon Sep 17 00:00:00 2001
From: Charles
Date: Wed, 30 Aug 2023 19:29:15 +0100
Subject: [PATCH] enhancement: new extract function for detecting image URLs (#1212)

- Adds new feature discussed in GitHub Issue #1117 and in slack
---
 CHANGELOG.md                               |  3 +-
 test_unstructured/cleaners/test_extract.py | 60 ++++++++++++++++++++++
 unstructured/cleaners/extract.py           | 13 +++++
 unstructured/nlp/patterns.py               | 10 ++++
 4 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 85a62d10b..4ca4e34a2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,7 +17,8 @@
 
 * Adds `chunk_by_title` to break a document into sections based on the presence of `Title` elements.
 
-
+* Adds new extraction function `extract_image_urls_from_html` to extract all image-related URLs from HTML text.
+
 ### Fixes
 
 * Make cv2 dependency optional
diff --git a/test_unstructured/cleaners/test_extract.py b/test_unstructured/cleaners/test_extract.py
index e54b2f74e..6ca059883 100644
--- a/test_unstructured/cleaners/test_extract.py
+++ b/test_unstructured/cleaners/test_extract.py
@@ -104,3 +104,63 @@ def test_extract_us_phone_number(text, expected):
 )
 def test_extract_ordered_bullets(text, expected):
     assert extract.extract_ordered_bullets(text=text) == expected
+
+
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [
+        (
+            "https://my-image.jpg",
+            (["https://my-image.jpg"]),
+        ),
+        (
+            "https://my-image.png with some text",
+            (["https://my-image.png"]),
+        ),
+        (
+            "https://my-image/with/some/path.png",
+            (["https://my-image/with/some/path.png"]),
+        ),
+        (
+            "some text https://my-image.jpg with another http://my-image.bmp",
+            (["https://my-image.jpg", "http://my-image.bmp"]),
+        ),
+        (
+            "http://not-an-image.com",
+            ([]),
+        ),
+        (
+            "some text",
+            ([]),
+        ),
+        (
+            "some text https://my-image.JPG with another http://my-image.BMP",
+            (["https://my-image.JPG", "http://my-image.BMP"]),
+        ),
+        (
+            "http://my-path-with-CAPS/my-image.JPG",
+            (["http://my-path-with-CAPS/my-image.JPG"]),
+        ),
+        (
+            "http://my-path/my%20image.JPG",
+            (["http://my-path/my%20image.JPG"]),
+        ),
+        # url with reference #
+        (
+            "https://my-image.jpg#ref",
+            (["https://my-image.jpg"]),
+        ),
+        # the extension must not run on into a longer word
+        (
+            "https://my-site.pngs/page",
+            ([]),
+        ),
+        # backslashes are not valid URL characters
+        (
+            r"https://my-path\my-image.jpg",
+            ([]),
+        ),
+    ],
+)
+def test_extract_image_urls_from_html(text, expected):
+    assert extract.extract_image_urls_from_html(text=text) == expected
diff --git a/unstructured/cleaners/extract.py b/unstructured/cleaners/extract.py
index 69c399984..b576f9ccc 100644
--- a/unstructured/cleaners/extract.py
+++ b/unstructured/cleaners/extract.py
@@ -5,6 +5,7 @@ from typing import List, Optional
 from unstructured.nlp.patterns import (
     EMAIL_ADDRESS_PATTERN,
     EMAIL_DATETIMETZ_PATTERN,
+    IMAGE_URL_PATTERN,
     IP_ADDRESS_NAME_PATTERN,
     IP_ADDRESS_PATTERN_RE,
     MAPI_ID_PATTERN,
@@ -136,3 +137,15 @@ def extract_ordered_bullets(text) -> tuple:
     b = "".join(b)
     c = "".join(c) if c else None
     return a, b, c
+
+
+def extract_image_urls_from_html(text: str) -> List[str]:
+    """Returns the image URLs matched by IMAGE_URL_PATTERN in text, in order of appearance.
+
+    Any "#fragment" or "?query" suffix is not part of the returned URL.
+
+    Example
+    -------
+    extract_image_urls_from_html("see https://my-image.jpg#ref") -> ["https://my-image.jpg"]
+    """
+    return re.findall(IMAGE_URL_PATTERN, text)
diff --git a/unstructured/nlp/patterns.py b/unstructured/nlp/patterns.py
index 92bcecba3..15ccb66a1 100644
--- a/unstructured/nlp/patterns.py
+++ b/unstructured/nlp/patterns.py
@@ -138,3 +138,13 @@ JSON_PATTERN = r"(?s)\{(?=.*:).*?(?:\}|$)|\[(?s:.*?)\](?:$|,|\])"
 
 # taken from https://stackoverflow.com/a/3845829/12406158
 VALID_JSON_CHARACTERS = r"[,:{}\[\]0-9.\-+Eaeflnr-u \n\r\t]"
+
+# Matches http(s) URLs ending in a common raster-image extension (case-insensitive).
+# The trailing lookahead rejects an extension that runs on into more letters or digits (".pngs").
+IMAGE_URL_PATTERN = (
+    r"(?i)https?://"
+    r"(?:[a-z0-9$_@.&+!*(),%-])+"
+    r"(?:/[a-z0-9$_@.&+!*(),%-]*)*"
+    r"\.(?:jpg|jpeg|png|gif|bmp)"
+    r"(?![a-z0-9])"
+)