mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-22 07:39:40 +00:00
enhancement: new extract function for detecting image URLs (#1212)
- Adds the new feature discussed in GitHub Issue #1117 and in Slack
This commit is contained in:
parent
d33d8b5d0b
commit
de855bb4ed
@ -17,7 +17,8 @@
|
|||||||
|
|
||||||
* Adds `chunk_by_title` to break a document into sections based on the presence of `Title`
|
* Adds `chunk_by_title` to break a document into sections based on the presence of `Title`
|
||||||
elements.
|
elements.
|
||||||
|
* Adds new extraction function `extract_image_urls_from_html` to extract all image-related URLs from HTML text.
|
||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
* Make cv2 dependency optional
|
* Make cv2 dependency optional
|
||||||
|
@ -104,3 +104,53 @@ def test_extract_us_phone_number(text, expected):
|
|||||||
)
|
)
|
||||||
def test_extract_ordered_bullets(text, expected):
|
def test_extract_ordered_bullets(text, expected):
|
||||||
assert extract.extract_ordered_bullets(text=text) == expected
|
assert extract.extract_ordered_bullets(text=text) == expected
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # A bare image URL is returned unchanged.
        ("https://my-image.jpg", ["https://my-image.jpg"]),
        # Trailing prose after the URL is not part of the match.
        ("https://my-image.png with some text", ["https://my-image.png"]),
        # Path segments before the file name are kept.
        (
            "https://my-image/with/some/path.png",
            ["https://my-image/with/some/path.png"],
        ),
        # Several URLs in one string are returned in order of appearance.
        (
            "some text https://my-image.jpg with another http://my-image.bmp",
            ["https://my-image.jpg", "http://my-image.bmp"],
        ),
        # URLs without an image extension, and plain text, yield nothing.
        ("http://not-an-image.com", []),
        ("some text", []),
        # Matching is case-insensitive and preserves the original casing.
        (
            "some text https://my-image.JPG with another http://my-image.BMP",
            ["https://my-image.JPG", "http://my-image.BMP"],
        ),
        (
            "http://my-path-with-CAPS/my-image.JPG",
            ["http://my-path-with-CAPS/my-image.JPG"],
        ),
        # Percent-encoded characters are accepted inside the path.
        ("http://my-path/my%20image.JPG", ["http://my-path/my%20image.JPG"]),
        # A URL fragment (#ref) is dropped from the extracted URL.
        ("https://my-image.jpg#ref", ["https://my-image.jpg"]),
    ],
)
def test_extract_image_urls_from_html(text, expected):
    """Check that image URLs are extracted from free text as listed above."""
    assert extract.extract_image_urls_from_html(text=text) == expected
|
||||||
|
@ -5,6 +5,7 @@ from typing import List, Optional
|
|||||||
from unstructured.nlp.patterns import (
|
from unstructured.nlp.patterns import (
|
||||||
EMAIL_ADDRESS_PATTERN,
|
EMAIL_ADDRESS_PATTERN,
|
||||||
EMAIL_DATETIMETZ_PATTERN,
|
EMAIL_DATETIMETZ_PATTERN,
|
||||||
|
IMAGE_URL_PATTERN,
|
||||||
IP_ADDRESS_NAME_PATTERN,
|
IP_ADDRESS_NAME_PATTERN,
|
||||||
IP_ADDRESS_PATTERN_RE,
|
IP_ADDRESS_PATTERN_RE,
|
||||||
MAPI_ID_PATTERN,
|
MAPI_ID_PATTERN,
|
||||||
@ -136,3 +137,7 @@ def extract_ordered_bullets(text) -> tuple:
|
|||||||
b = "".join(b)
|
b = "".join(b)
|
||||||
c = "".join(c) if c else None
|
c = "".join(c) if c else None
|
||||||
return a, b, c
|
return a, b, c
|
||||||
|
|
||||||
|
|
||||||
|
def extract_image_urls_from_html(text: str) -> List[str]:
    """Return all image URLs found in ``text``.

    Despite the name, no HTML parsing is performed: ``text`` is scanned with
    ``IMAGE_URL_PATTERN`` using ``re.findall``, so any string may be passed.

    Parameters
    ----------
    text
        The string to search.

    Returns
    -------
    List[str]
        The matched URLs in order of appearance; an empty list when none match.
    """
    return re.findall(IMAGE_URL_PATTERN, text)
|
||||||
|
@ -138,3 +138,10 @@ JSON_PATTERN = r"(?s)\{(?=.*:).*?(?:\}|$)|\[(?s:.*?)\](?:$|,|\])"
|
|||||||
|
|
||||||
# taken from https://stackoverflow.com/a/3845829/12406158
|
# taken from https://stackoverflow.com/a/3845829/12406158
|
||||||
VALID_JSON_CHARACTERS = r"[,:{}\[\]0-9.\-+Eaeflnr-u \n\r\t]"
|
VALID_JSON_CHARACTERS = r"[,:{}\[\]0-9.\-+Eaeflnr-u \n\r\t]"
|
||||||
|
|
||||||
|
# Matches http(s) URLs whose last matched component ends in a common image
# extension. Anything after the extension (query string, "#fragment", trailing
# punctuation) is left out of the match.
IMAGE_URL_PATTERN = (
    # Scheme; the leading (?i) makes the whole pattern case-insensitive.
    r"(?i)https?://"
    # Host (or first component). Parentheses are literal inside a character
    # class, so they must not be backslash-escaped in a raw string: doing so
    # would add the backslash itself to the set of accepted characters.
    r"(?:[a-z0-9$_@.&+!*(),%-])+"
    # Zero or more "/segment" path components.
    r"(?:/[a-z0-9$_@.&+!*(),%-]*)*"
    # Supported image file extensions.
    r"\.(?:jpg|jpeg|png|gif|bmp)"
    # The extension must end here; otherwise "a.jpgfoo" would be reported as
    # the truncated URL "a.jpg".
    r"(?![a-z0-9])"
)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user