From de855bb4ed13c2de60ffef35bc0c47bcd5700a70 Mon Sep 17 00:00:00 2001
From: Charles <charlespierse@gmail.com>
Date: Wed, 30 Aug 2023 19:29:15 +0100
Subject: [PATCH] enhancement: new extract function for detecting image URLs
 (#1212)

- Adds the new feature discussed in GitHub issue #1117 and on Slack
---
 CHANGELOG.md                               |  3 +-
 test_unstructured/cleaners/test_extract.py | 50 ++++++++++++++++++++++
 unstructured/cleaners/extract.py           |  5 +++
 unstructured/nlp/patterns.py               |  7 +++
 4 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 85a62d10b..4ca4e34a2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,7 +17,8 @@
 
 * Adds `chunk_by_title` to break a document into sections based on the presence of `Title`
   elements.
-  
+* Adds new extraction function `extract_image_urls_from_html` to extract all image URLs from HTML text.
+
 ### Fixes
 
 * Make cv2 dependency optional
diff --git a/test_unstructured/cleaners/test_extract.py b/test_unstructured/cleaners/test_extract.py
index e54b2f74e..6ca059883 100644
--- a/test_unstructured/cleaners/test_extract.py
+++ b/test_unstructured/cleaners/test_extract.py
@@ -104,3 +104,53 @@ def test_extract_us_phone_number(text, expected):
 )
 def test_extract_ordered_bullets(text, expected):
     assert extract.extract_ordered_bullets(text=text) == expected
+
+
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [
+        (
+            "https://my-image.jpg",
+            ["https://my-image.jpg"],
+        ),
+        (
+            "https://my-image.png with some text",
+            ["https://my-image.png"],
+        ),
+        (
+            "https://my-image/with/some/path.png",
+            ["https://my-image/with/some/path.png"],
+        ),
+        (
+            "some text https://my-image.jpg with another http://my-image.bmp",
+            ["https://my-image.jpg", "http://my-image.bmp"],
+        ),
+        (
+            "http://not-an-image.com",
+            [],
+        ),
+        (
+            "some text",
+            [],
+        ),
+        (
+            "some text https://my-image.JPG with another http://my-image.BMP",
+            ["https://my-image.JPG", "http://my-image.BMP"],
+        ),
+        (
+            "http://my-path-with-CAPS/my-image.JPG",
+            ["http://my-path-with-CAPS/my-image.JPG"],
+        ),
+        (
+            "http://my-path/my%20image.JPG",
+            ["http://my-path/my%20image.JPG"],
+        ),
+        # a trailing "#fragment" is not part of the extracted URL
+        (
+            "https://my-image.jpg#ref",
+            ["https://my-image.jpg"],
+        ),
+    ],
+)
+def test_extract_image_urls_from_html(text, expected):
+    assert extract.extract_image_urls_from_html(text=text) == expected
diff --git a/unstructured/cleaners/extract.py b/unstructured/cleaners/extract.py
index 69c399984..b576f9ccc 100644
--- a/unstructured/cleaners/extract.py
+++ b/unstructured/cleaners/extract.py
@@ -5,6 +5,7 @@ from typing import List, Optional
 from unstructured.nlp.patterns import (
     EMAIL_ADDRESS_PATTERN,
     EMAIL_DATETIMETZ_PATTERN,
+    IMAGE_URL_PATTERN,
     IP_ADDRESS_NAME_PATTERN,
     IP_ADDRESS_PATTERN_RE,
     MAPI_ID_PATTERN,
@@ -136,3 +137,7 @@ def extract_ordered_bullets(text) -> tuple:
         b = "".join(b)
         c = "".join(c) if c else None
     return a, b, c
+
+
+def extract_image_urls_from_html(text: str) -> List[str]:
+    return [m[0] for m in re.finditer(IMAGE_URL_PATTERN, text)]  # full matches, in text order
diff --git a/unstructured/nlp/patterns.py b/unstructured/nlp/patterns.py
index 92bcecba3..15ccb66a1 100644
--- a/unstructured/nlp/patterns.py
+++ b/unstructured/nlp/patterns.py
@@ -138,3 +138,10 @@ JSON_PATTERN = r"(?s)\{(?=.*:).*?(?:\}|$)|\[(?s:.*?)\](?:$|,|\])"
 
 # taken from https://stackoverflow.com/a/3845829/12406158
 VALID_JSON_CHARACTERS = r"[,:{}\[\]0-9.\-+Eaeflnr-u \n\r\t]"
+
+IMAGE_URL_PATTERN = (  # http(s) URL ending in an image file extension; no capture groups
+    r"(?i)https?://"  # scheme; (?i) makes the whole pattern case-insensitive
+    r"(?:[a-z0-9$_@.&+!*(),%-])+"  # part before any "/"; "(" and ")" need no escape in [...]
+    r"(?:/[a-z0-9$_@.&+!*(),%-]*)*"  # zero or more "/segment" parts; backslash is not allowed
+    r"\.(?:jpg|jpeg|png|gif|bmp)"  # match ends at the extension, so "#ref" etc. is dropped
+)