feat: support extracting image url in html (#3955)

Also removes `image_mime_type` from image metadata when `image_base64` is not included.

---------

Co-authored-by: ryannikolaidis <ryannikolaidis@users.noreply.github.com>
This commit is contained in:
ryannikolaidis 2025-03-13 15:41:10 -07:00 committed by GitHub
parent 2dceac34b5
commit 66bf4b0198
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
18 changed files with 63 additions and 8 deletions

View File

@ -1,3 +1,13 @@
## 0.17.1-dev0
### Enhancements
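- **Add `image_url` metadata for images in the HTML partitioner.** `<img>` tags whose `src` is not embedded data (i.e., it references a URL rather than inline `data:` content) now include a new `image_url` metadata field containing the value of the `src` attribute.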
### Features
### Fixes
## 0.17.0
### Enhancements

View File

@ -0,0 +1,4 @@
<div>
<p>Test page</p>
<img src="https://avatars.githubusercontent.com/u/108372208?s=200&v=4" alt="Unstructured Logo" />
</div>

View File

@ -335,20 +335,21 @@ def test_partition_html_base64_for_images(
assert element.category == ElementType.IMAGE
assert element.text == alt_text
assert element.metadata.image_mime_type == "image/png"
if expect_base64:
assert element.metadata.image_base64 == base64
assert element.metadata.image_mime_type == "image/png"
else:
assert element.metadata.image_base64 is None
assert element.metadata.image_mime_type is None
def test_partition_html_includes_url_for_images():
url = "https://example.com/image.png"
image_url = "https://example.com/image.png"
alt_text = "URL Image"
# language=HTML
html = f"""
<div class="Page">
<img src="{url}" alt="{alt_text}">
<img src="{image_url}" alt="{alt_text}">
</div>
"""
(image,) = partition_html(
@ -356,7 +357,7 @@ def test_partition_html_includes_url_for_images():
)
assert image.category == ElementType.IMAGE
assert image.text == alt_text
assert image.metadata.url == url
assert image.metadata.image_url == image_url
# -- table parsing behaviors ---------------------------------------------------------------------

View File

@ -632,7 +632,7 @@ def test_auto_partition_html_element_extraction():
with tempfile.TemporaryDirectory() as tmpdir:
elements = partition(
example_doc_path("html-with-base64-image.html"),
example_doc_path("fake-html-with-base64-image.html"),
extract_image_block_types=extract_image_block_types,
extract_image_block_to_payload=True,
)
@ -640,6 +640,13 @@ def test_auto_partition_html_element_extraction():
assert_element_extraction(elements, extract_image_block_types, True, tmpdir)
def test_auto_partition_html_image_with_url():
    """Partitioning an HTML doc whose <img> has a remote `src` populates `image_url`.

    The second element produced for the fixture is checked for a non-None
    `image_url` metadata value.
    """
    html_path = example_doc_path("fake-html-with-image-from-url.html")
    elements = partition(html_path)

    assert elements[1].metadata.image_url is not None
def test_partition_pdf_does_not_raise_warning():
# NOTE(robinson): This is the recommended way to check that no warning is emitted,
# per the pytest docs.

View File

@ -55,6 +55,7 @@
"version": "1"
},
"filetype": "text/html",
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605956/overview.svg?version=1&modificationDate=1688907285640&cacheVersion=1&api=v2",
"languages": [
"eng"
]
@ -236,6 +237,7 @@
"version": "1"
},
"filetype": "text/html",
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605956/shortcuts.svg?version=1&modificationDate=1688907288893&cacheVersion=1&api=v2",
"languages": [
"eng"
]
@ -326,6 +328,7 @@
"version": "1"
},
"filetype": "text/html",
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605956/inline_comment.svg?version=1&modificationDate=1688907286335&cacheVersion=1&api=v2&width=442&height=99",
"languages": [
"eng"
]
@ -416,6 +419,7 @@
"version": "1"
},
"filetype": "text/html",
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605956/teamspace_mention.svg?version=1&modificationDate=1688907289571&cacheVersion=1&api=v2&width=442&height=417",
"languages": [
"eng"
]
@ -500,6 +504,7 @@
"version": "1"
},
"filetype": "text/html",
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605956/reactions.svg?version=1&modificationDate=1688907286993&cacheVersion=1&api=v2",
"languages": [
"eng"
]
@ -659,6 +664,7 @@
"version": "1"
},
"filetype": "text/html",
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605956/slash_menu.svg?version=1&modificationDate=1688907287621&cacheVersion=1&api=v2&width=544&height=586",
"languages": [
"eng"
]
@ -755,6 +761,7 @@
"version": "1"
},
"filetype": "text/html",
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605956/teamspace_introduce.svg?version=1&modificationDate=1688907290201&cacheVersion=1&api=v2",
"languages": [
"eng"
]
@ -776,6 +783,7 @@
"version": "1"
},
"filetype": "text/html",
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605956/teamspace_announcements.svg?version=1&modificationDate=1688907290847&cacheVersion=1&api=v2",
"languages": [
"eng"
]
@ -905,6 +913,7 @@
"version": "1"
},
"filetype": "text/html",
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605956/content_report.svg?version=1&modificationDate=1688907288249&cacheVersion=1&api=v2",
"languages": [
"eng"
]

View File

@ -328,6 +328,7 @@
"version": "1"
},
"filetype": "text/html",
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/229477/angie.svg?version=1&modificationDate=1688145926387&cacheVersion=1&api=v2&width=256&height=257",
"languages": [
"eng"
]
@ -412,6 +413,7 @@
"version": "1"
},
"filetype": "text/html",
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/229477/gael.svg?version=1&modificationDate=1688145927077&cacheVersion=1&api=v2&width=256&height=257",
"languages": [
"eng"
]
@ -496,6 +498,7 @@
"version": "1"
},
"filetype": "text/html",
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/229477/claudia.svg?version=1&modificationDate=1688145927764&cacheVersion=1&api=v2&width=256&height=257",
"languages": [
"eng"
]
@ -766,6 +769,7 @@
"version": "1"
},
"filetype": "text/html",
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/229477/raised_hand.svg?version=1&modificationDate=1688145928452&cacheVersion=1&api=v2",
"languages": [
"eng"
]
@ -835,6 +839,7 @@
"version": "1"
},
"filetype": "text/html",
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/229477/ledger.svg?version=1&modificationDate=1688145929151&cacheVersion=1&api=v2",
"languages": [
"eng"
]
@ -904,6 +909,7 @@
"version": "1"
},
"filetype": "text/html",
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/229477/astronaut.svg?version=1&modificationDate=1688145929790&cacheVersion=1&api=v2",
"languages": [
"eng"
]

View File

@ -328,6 +328,7 @@
"version": "2"
},
"filetype": "text/html",
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605859/angie.svg?version=1&modificationDate=1688907281095&cacheVersion=1&api=v2&width=256&height=257",
"languages": [
"eng"
]
@ -412,6 +413,7 @@
"version": "2"
},
"filetype": "text/html",
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605859/gael.svg?version=1&modificationDate=1688907281775&cacheVersion=1&api=v2&width=256&height=257",
"languages": [
"eng"
]
@ -496,6 +498,7 @@
"version": "2"
},
"filetype": "text/html",
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605859/claudia.svg?version=1&modificationDate=1688907282424&cacheVersion=1&api=v2&width=256&height=257",
"languages": [
"eng"
]
@ -766,6 +769,7 @@
"version": "2"
},
"filetype": "text/html",
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605859/raised_hand.svg?version=1&modificationDate=1688907283067&cacheVersion=1&api=v2",
"languages": [
"eng"
]
@ -835,6 +839,7 @@
"version": "2"
},
"filetype": "text/html",
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605859/ledger.svg?version=1&modificationDate=1688907283728&cacheVersion=1&api=v2",
"languages": [
"eng"
]
@ -904,6 +909,7 @@
"version": "2"
},
"filetype": "text/html",
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605859/astronaut.svg?version=1&modificationDate=1688907284407&cacheVersion=1&api=v2",
"languages": [
"eng"
]

View File

@ -321,6 +321,7 @@
"version": "1"
},
"filetype": "text/html",
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605989/image-20230709-015203.png?version=1&modificationDate=1688907429067&cacheVersion=1&api=v2&width=680&height=259",
"languages": [
"eng",
"fra"
@ -802,6 +803,7 @@
"version": "1"
},
"filetype": "text/html",
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605989/image-20230709-020021.png?version=1&modificationDate=1688907429074&cacheVersion=1&api=v2&width=475&height=236",
"languages": [
"eng",
"fra"

View File

@ -321,6 +321,7 @@
"version": "1"
},
"filetype": "text/html",
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1802252/image-20230709-015203.png?version=1&modificationDate=1689094907437&cacheVersion=1&api=v2&width=680&height=259",
"languages": [
"eng",
"fra"
@ -802,6 +803,7 @@
"version": "1"
},
"filetype": "text/html",
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1802252/image-20230709-020021.png?version=1&modificationDate=1689094907442&cacheVersion=1&api=v2&width=475&height=236",
"languages": [
"eng",
"fra"

View File

@ -545,6 +545,7 @@
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"image_url": "https://media4.giphy.com/media/26FPsOhR3tyQRTc2Y/giphy.gif?cid=7941fdc68sl3vdqajgosqug9hfhg3zq3t5yoflyy9p7y66q0&ep=v1_gifs_trending&rid=giphy.gif&ct=g",
"languages": [
"eng"
]

View File

@ -32,6 +32,7 @@
"element_id": "f714fa214dac2f441515c4f28370d279",
"text": "",
"metadata": {
"image_url": "https://unstructuredio-dev-ed.develop.my.salesforce.com/servlet/servlet.ImageServer?oid=00DHu0000018RDe&esid=018Hu00001JMmTZ&from=int",
"languages": [
"eng"
],

View File

@ -32,6 +32,7 @@
"element_id": "68870d055535f48c7439ce67092768f6",
"text": "",
"metadata": {
"image_url": "https://unstructuredio-dev-ed.develop.my.salesforce.com/servlet/servlet.ImageServer?oid=00DHu0000018RDe&esid=018Hu00001JMmTa&from=int",
"languages": [
"eng"
],

View File

@ -32,6 +32,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
--input-path example-docs \
--work-dir "$WORK_DIR"
"$SCRIPT_DIR"/check-num-files-output.sh 14 $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-num-files-output.sh 15 $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -1 +1 @@
__version__ = "0.17.0" # pragma: no cover
__version__ = "0.17.1-dev0" # pragma: no cover

View File

@ -172,6 +172,7 @@ class ElementMetadata:
file_directory: Optional[str]
filename: Optional[str]
filetype: Optional[str]
image_url: Optional[str]
image_path: Optional[str]
image_base64: Optional[str]
image_mime_type: Optional[str]
@ -230,6 +231,7 @@ class ElementMetadata:
header_footer_type: Optional[str] = None,
image_base64: Optional[str] = None,
image_mime_type: Optional[str] = None,
image_url: Optional[str] = None,
image_path: Optional[str] = None,
is_continuation: Optional[bool] = None,
languages: Optional[list[str]] = None,
@ -274,6 +276,7 @@ class ElementMetadata:
self.header_footer_type = header_footer_type
self.image_base64 = image_base64
self.image_mime_type = image_mime_type
self.image_url = image_url
self.image_path = image_path
self.is_continuation = is_continuation
self.languages = languages
@ -490,6 +493,7 @@ class ConsolidationStrategy(enum.Enum):
"filename": cls.FIRST,
"filetype": cls.FIRST,
"header_footer_type": cls.DROP,
"image_url": cls.DROP,
"image_path": cls.DROP,
"image_base64": cls.DROP,
"image_mime_type": cls.DROP,

View File

@ -502,7 +502,7 @@ class ImageBlock(Flow):
metadata=ElementMetadata(
image_mime_type=img_mime_type,
image_base64=img_base64,
url=img_url,
image_url=img_url,
),
)

View File

@ -223,6 +223,7 @@ class _HtmlPartitioner:
# -- remove <image_base64> if not requested --
if not self._should_include_image_base64(e):
e.metadata.image_base64 = None
e.metadata.image_mime_type = None
yield e
@lazyproperty