mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
feat: support extracting image url in html (#3955)
Also removes `image_mime_type` from image metadata when `image_base64` is not included.
Co-authored-by: ryannikolaidis <ryannikolaidis@users.noreply.github.com>
This commit is contained in:
parent
2dceac34b5
commit
66bf4b0198
10
CHANGELOG.md
10
CHANGELOG.md
@ -1,3 +1,13 @@
|
||||
## 0.17.1-dev0
|
||||
|
||||
### Enhancements
|
||||
|
||||
- **Add `image_url` metadata for images in the HTML partitioner** `<img>` tags whose `src` is not inline `data:` content now include a new `image_url` metadata field containing the value of the `src` attribute.
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
## 0.17.0
|
||||
|
||||
### Enhancements
|
||||
|
4
example-docs/fake-html-with-image-from-url.html
Normal file
4
example-docs/fake-html-with-image-from-url.html
Normal file
@ -0,0 +1,4 @@
|
||||
<div>
|
||||
<p>Test page</p>
|
||||
<img src="https://avatars.githubusercontent.com/u/108372208?s=200&v=4" alt="Unstructured Logo" />
|
||||
</div>
|
@ -335,20 +335,21 @@ def test_partition_html_base64_for_images(
|
||||
|
||||
assert element.category == ElementType.IMAGE
|
||||
assert element.text == alt_text
|
||||
assert element.metadata.image_mime_type == "image/png"
|
||||
if expect_base64:
|
||||
assert element.metadata.image_base64 == base64
|
||||
assert element.metadata.image_mime_type == "image/png"
|
||||
else:
|
||||
assert element.metadata.image_base64 is None
|
||||
assert element.metadata.image_mime_type is None
|
||||
|
||||
|
||||
def test_partition_html_includes_url_for_images():
|
||||
url = "https://example.com/image.png"
|
||||
image_url = "https://example.com/image.png"
|
||||
alt_text = "URL Image"
|
||||
# language=HTML
|
||||
html = f"""
|
||||
<div class="Page">
|
||||
<img src="{url}" alt="{alt_text}">
|
||||
<img src="{image_url}" alt="{alt_text}">
|
||||
</div>
|
||||
"""
|
||||
(image,) = partition_html(
|
||||
@ -356,7 +357,7 @@ def test_partition_html_includes_url_for_images():
|
||||
)
|
||||
assert image.category == ElementType.IMAGE
|
||||
assert image.text == alt_text
|
||||
assert image.metadata.url == url
|
||||
assert image.metadata.image_url == image_url
|
||||
|
||||
|
||||
# -- table parsing behaviors ---------------------------------------------------------------------
|
||||
|
@ -632,7 +632,7 @@ def test_auto_partition_html_element_extraction():
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
elements = partition(
|
||||
example_doc_path("html-with-base64-image.html"),
|
||||
example_doc_path("fake-html-with-base64-image.html"),
|
||||
extract_image_block_types=extract_image_block_types,
|
||||
extract_image_block_to_payload=True,
|
||||
)
|
||||
@ -640,6 +640,13 @@ def test_auto_partition_html_element_extraction():
|
||||
assert_element_extraction(elements, extract_image_block_types, True, tmpdir)
|
||||
|
||||
|
||||
def test_auto_partition_html_image_with_url():
|
||||
elements = partition(
|
||||
example_doc_path("fake-html-with-image-from-url.html"),
|
||||
)
|
||||
assert elements[1].metadata.image_url is not None
|
||||
|
||||
|
||||
def test_partition_pdf_does_not_raise_warning():
|
||||
# NOTE(robinson): This is the recommended way to check that no warning is emitted,
|
||||
# per the pytest docs.
|
||||
|
@ -55,6 +55,7 @@
|
||||
"version": "1"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605956/overview.svg?version=1&modificationDate=1688907285640&cacheVersion=1&api=v2",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
@ -236,6 +237,7 @@
|
||||
"version": "1"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605956/shortcuts.svg?version=1&modificationDate=1688907288893&cacheVersion=1&api=v2",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
@ -326,6 +328,7 @@
|
||||
"version": "1"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605956/inline_comment.svg?version=1&modificationDate=1688907286335&cacheVersion=1&api=v2&width=442&height=99",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
@ -416,6 +419,7 @@
|
||||
"version": "1"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605956/teamspace_mention.svg?version=1&modificationDate=1688907289571&cacheVersion=1&api=v2&width=442&height=417",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
@ -500,6 +504,7 @@
|
||||
"version": "1"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605956/reactions.svg?version=1&modificationDate=1688907286993&cacheVersion=1&api=v2",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
@ -659,6 +664,7 @@
|
||||
"version": "1"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605956/slash_menu.svg?version=1&modificationDate=1688907287621&cacheVersion=1&api=v2&width=544&height=586",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
@ -755,6 +761,7 @@
|
||||
"version": "1"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605956/teamspace_introduce.svg?version=1&modificationDate=1688907290201&cacheVersion=1&api=v2",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
@ -776,6 +783,7 @@
|
||||
"version": "1"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605956/teamspace_announcements.svg?version=1&modificationDate=1688907290847&cacheVersion=1&api=v2",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
@ -905,6 +913,7 @@
|
||||
"version": "1"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605956/content_report.svg?version=1&modificationDate=1688907288249&cacheVersion=1&api=v2",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
|
@ -328,6 +328,7 @@
|
||||
"version": "1"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/229477/angie.svg?version=1&modificationDate=1688145926387&cacheVersion=1&api=v2&width=256&height=257",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
@ -412,6 +413,7 @@
|
||||
"version": "1"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/229477/gael.svg?version=1&modificationDate=1688145927077&cacheVersion=1&api=v2&width=256&height=257",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
@ -496,6 +498,7 @@
|
||||
"version": "1"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/229477/claudia.svg?version=1&modificationDate=1688145927764&cacheVersion=1&api=v2&width=256&height=257",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
@ -766,6 +769,7 @@
|
||||
"version": "1"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/229477/raised_hand.svg?version=1&modificationDate=1688145928452&cacheVersion=1&api=v2",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
@ -835,6 +839,7 @@
|
||||
"version": "1"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/229477/ledger.svg?version=1&modificationDate=1688145929151&cacheVersion=1&api=v2",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
@ -904,6 +909,7 @@
|
||||
"version": "1"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/229477/astronaut.svg?version=1&modificationDate=1688145929790&cacheVersion=1&api=v2",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
|
@ -328,6 +328,7 @@
|
||||
"version": "2"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605859/angie.svg?version=1&modificationDate=1688907281095&cacheVersion=1&api=v2&width=256&height=257",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
@ -412,6 +413,7 @@
|
||||
"version": "2"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605859/gael.svg?version=1&modificationDate=1688907281775&cacheVersion=1&api=v2&width=256&height=257",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
@ -496,6 +498,7 @@
|
||||
"version": "2"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605859/claudia.svg?version=1&modificationDate=1688907282424&cacheVersion=1&api=v2&width=256&height=257",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
@ -766,6 +769,7 @@
|
||||
"version": "2"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605859/raised_hand.svg?version=1&modificationDate=1688907283067&cacheVersion=1&api=v2",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
@ -835,6 +839,7 @@
|
||||
"version": "2"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605859/ledger.svg?version=1&modificationDate=1688907283728&cacheVersion=1&api=v2",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
@ -904,6 +909,7 @@
|
||||
"version": "2"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605859/astronaut.svg?version=1&modificationDate=1688907284407&cacheVersion=1&api=v2",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
|
@ -321,6 +321,7 @@
|
||||
"version": "1"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605989/image-20230709-015203.png?version=1&modificationDate=1688907429067&cacheVersion=1&api=v2&width=680&height=259",
|
||||
"languages": [
|
||||
"eng",
|
||||
"fra"
|
||||
@ -802,6 +803,7 @@
|
||||
"version": "1"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605989/image-20230709-020021.png?version=1&modificationDate=1688907429074&cacheVersion=1&api=v2&width=475&height=236",
|
||||
"languages": [
|
||||
"eng",
|
||||
"fra"
|
||||
|
@ -321,6 +321,7 @@
|
||||
"version": "1"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1802252/image-20230709-015203.png?version=1&modificationDate=1689094907437&cacheVersion=1&api=v2&width=680&height=259",
|
||||
"languages": [
|
||||
"eng",
|
||||
"fra"
|
||||
@ -802,6 +803,7 @@
|
||||
"version": "1"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1802252/image-20230709-020021.png?version=1&modificationDate=1689094907442&cacheVersion=1&api=v2&width=475&height=236",
|
||||
"languages": [
|
||||
"eng",
|
||||
"fra"
|
||||
|
@ -545,6 +545,7 @@
|
||||
"date_modified": "2023-08-17T18:48:00.000Z"
|
||||
},
|
||||
"filetype": "text/html",
|
||||
"image_url": "https://media4.giphy.com/media/26FPsOhR3tyQRTc2Y/giphy.gif?cid=7941fdc68sl3vdqajgosqug9hfhg3zq3t5yoflyy9p7y66q0&ep=v1_gifs_trending&rid=giphy.gif&ct=g",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
|
@ -32,6 +32,7 @@
|
||||
"element_id": "f714fa214dac2f441515c4f28370d279",
|
||||
"text": "",
|
||||
"metadata": {
|
||||
"image_url": "https://unstructuredio-dev-ed.develop.my.salesforce.com/servlet/servlet.ImageServer?oid=00DHu0000018RDe&esid=018Hu00001JMmTZ&from=int",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
|
@ -32,6 +32,7 @@
|
||||
"element_id": "68870d055535f48c7439ce67092768f6",
|
||||
"text": "",
|
||||
"metadata": {
|
||||
"image_url": "https://unstructuredio-dev-ed.develop.my.salesforce.com/servlet/servlet.ImageServer?oid=00DHu0000018RDe&esid=018Hu00001JMmTa&from=int",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
|
@ -32,6 +32,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
|
||||
--input-path example-docs \
|
||||
--work-dir "$WORK_DIR"
|
||||
|
||||
"$SCRIPT_DIR"/check-num-files-output.sh 14 $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-num-files-output.sh 15 $OUTPUT_FOLDER_NAME
|
||||
|
||||
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.17.0" # pragma: no cover
|
||||
__version__ = "0.17.1-dev0" # pragma: no cover
|
||||
|
@ -172,6 +172,7 @@ class ElementMetadata:
|
||||
file_directory: Optional[str]
|
||||
filename: Optional[str]
|
||||
filetype: Optional[str]
|
||||
image_url: Optional[str]
|
||||
image_path: Optional[str]
|
||||
image_base64: Optional[str]
|
||||
image_mime_type: Optional[str]
|
||||
@ -230,6 +231,7 @@ class ElementMetadata:
|
||||
header_footer_type: Optional[str] = None,
|
||||
image_base64: Optional[str] = None,
|
||||
image_mime_type: Optional[str] = None,
|
||||
image_url: Optional[str] = None,
|
||||
image_path: Optional[str] = None,
|
||||
is_continuation: Optional[bool] = None,
|
||||
languages: Optional[list[str]] = None,
|
||||
@ -274,6 +276,7 @@ class ElementMetadata:
|
||||
self.header_footer_type = header_footer_type
|
||||
self.image_base64 = image_base64
|
||||
self.image_mime_type = image_mime_type
|
||||
self.image_url = image_url
|
||||
self.image_path = image_path
|
||||
self.is_continuation = is_continuation
|
||||
self.languages = languages
|
||||
@ -490,6 +493,7 @@ class ConsolidationStrategy(enum.Enum):
|
||||
"filename": cls.FIRST,
|
||||
"filetype": cls.FIRST,
|
||||
"header_footer_type": cls.DROP,
|
||||
"image_url": cls.DROP,
|
||||
"image_path": cls.DROP,
|
||||
"image_base64": cls.DROP,
|
||||
"image_mime_type": cls.DROP,
|
||||
|
@ -502,7 +502,7 @@ class ImageBlock(Flow):
|
||||
metadata=ElementMetadata(
|
||||
image_mime_type=img_mime_type,
|
||||
image_base64=img_base64,
|
||||
url=img_url,
|
||||
image_url=img_url,
|
||||
),
|
||||
)
|
||||
|
||||
|
@ -223,6 +223,7 @@ class _HtmlPartitioner:
|
||||
# -- remove <image_base64> if not requested --
|
||||
if not self._should_include_image_base64(e):
|
||||
e.metadata.image_base64 = None
|
||||
e.metadata.image_mime_type = None
|
||||
yield e
|
||||
|
||||
@lazyproperty
|
||||
|
Loading…
x
Reference in New Issue
Block a user