From 66bf4b01984e75898ea256a6c00a541956fbbf5e Mon Sep 17 00:00:00 2001 From: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Date: Thu, 13 Mar 2025 15:41:10 -0700 Subject: [PATCH] feat: support extracting image url in html (#3955) also removes mimetype when base64 is not included in image metadata --------- Co-authored-by: ryannikolaidis --- CHANGELOG.md | 10 ++++++++++ ...e64-image.html => fake-html-with-base64-image.html} | 0 example-docs/fake-html-with-image-from-url.html | 4 ++++ test_unstructured/partition/html/test_partition.py | 9 +++++---- test_unstructured/partition/test_auto.py | 9 ++++++++- .../confluence-diff/MFS/1605956.json | 9 +++++++++ .../confluence-diff/MFS/229477.json | 6 ++++++ .../confluence-diff/testteamsp/1605859.json | 6 ++++++ .../confluence-diff/testteamsp/1605989.json | 2 ++ .../confluence-diff/testteamsp/1802252.json | 2 ++ .../notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.json | 1 + .../EmailMessage/02sHu00001efErPIAU.eml.json | 1 + .../EmailMessage/02sHu00001efErQIAU.eml.json | 1 + test_unstructured_ingest/src/local.sh | 2 +- unstructured/__version__.py | 2 +- unstructured/documents/elements.py | 4 ++++ unstructured/partition/html/parser.py | 2 +- unstructured/partition/html/partition.py | 1 + 18 files changed, 63 insertions(+), 8 deletions(-) rename example-docs/{html-with-base64-image.html => fake-html-with-base64-image.html} (100%) create mode 100644 example-docs/fake-html-with-image-from-url.html diff --git a/CHANGELOG.md b/CHANGELOG.md index 0b03b9557..4874219f4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## 0.17.1-dev0 + +### Enhancements + +- **Add image_url of images in html partitioner** `<img>` tags with non-data content include a new image_url metadata field with the content of the src attribute. 
+ +### Features + +### Fixes + ## 0.17.0 ### Enhancements diff --git a/example-docs/html-with-base64-image.html b/example-docs/fake-html-with-base64-image.html similarity index 100% rename from example-docs/html-with-base64-image.html rename to example-docs/fake-html-with-base64-image.html diff --git a/example-docs/fake-html-with-image-from-url.html b/example-docs/fake-html-with-image-from-url.html new file mode 100644 index 000000000..20d8b0f18 --- /dev/null +++ b/example-docs/fake-html-with-image-from-url.html @@ -0,0 +1,4 @@ +
+

Test page

+ Unstructured Logo +
\ No newline at end of file diff --git a/test_unstructured/partition/html/test_partition.py b/test_unstructured/partition/html/test_partition.py index 4df535319..7d8465019 100644 --- a/test_unstructured/partition/html/test_partition.py +++ b/test_unstructured/partition/html/test_partition.py @@ -335,20 +335,21 @@ def test_partition_html_base64_for_images( assert element.category == ElementType.IMAGE assert element.text == alt_text - assert element.metadata.image_mime_type == "image/png" if expect_base64: assert element.metadata.image_base64 == base64 + assert element.metadata.image_mime_type == "image/png" else: assert element.metadata.image_base64 is None + assert element.metadata.image_mime_type is None def test_partition_html_includes_url_for_images(): - url = "https://example.com/image.png" + image_url = "https://example.com/image.png" alt_text = "URL Image" # language=HTML html = f"""
- {alt_text} + {alt_text}
""" (image,) = partition_html( @@ -356,7 +357,7 @@ def test_partition_html_includes_url_for_images(): ) assert image.category == ElementType.IMAGE assert image.text == alt_text - assert image.metadata.url == url + assert image.metadata.image_url == image_url # -- table parsing behaviors --------------------------------------------------------------------- diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index 856b157f9..f29f600b4 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -632,7 +632,7 @@ def test_auto_partition_html_element_extraction(): with tempfile.TemporaryDirectory() as tmpdir: elements = partition( - example_doc_path("html-with-base64-image.html"), + example_doc_path("fake-html-with-base64-image.html"), extract_image_block_types=extract_image_block_types, extract_image_block_to_payload=True, ) @@ -640,6 +640,13 @@ def test_auto_partition_html_element_extraction(): assert_element_extraction(elements, extract_image_block_types, True, tmpdir) +def test_auto_partition_html_image_with_url(): + elements = partition( + example_doc_path("fake-html-with-image-from-url.html"), + ) + assert elements[1].metadata.image_url is not None + + def test_partition_pdf_does_not_raise_warning(): # NOTE(robinson): This is the recommended way to check that no warning is emitted, # per the pytest docs. 
diff --git a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605956.json b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605956.json index 042101261..099f1fb34 100644 --- a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605956.json +++ b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605956.json @@ -55,6 +55,7 @@ "version": "1" }, "filetype": "text/html", + "image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605956/overview.svg?version=1&modificationDate=1688907285640&cacheVersion=1&api=v2", "languages": [ "eng" ] @@ -236,6 +237,7 @@ "version": "1" }, "filetype": "text/html", + "image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605956/shortcuts.svg?version=1&modificationDate=1688907288893&cacheVersion=1&api=v2", "languages": [ "eng" ] @@ -326,6 +328,7 @@ "version": "1" }, "filetype": "text/html", + "image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605956/inline_comment.svg?version=1&modificationDate=1688907286335&cacheVersion=1&api=v2&width=442&height=99", "languages": [ "eng" ] @@ -416,6 +419,7 @@ "version": "1" }, "filetype": "text/html", + "image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605956/teamspace_mention.svg?version=1&modificationDate=1688907289571&cacheVersion=1&api=v2&width=442&height=417", "languages": [ "eng" ] @@ -500,6 +504,7 @@ "version": "1" }, "filetype": "text/html", + "image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605956/reactions.svg?version=1&modificationDate=1688907286993&cacheVersion=1&api=v2", "languages": [ "eng" ] @@ -659,6 +664,7 @@ "version": "1" }, "filetype": "text/html", + "image_url": 
"https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605956/slash_menu.svg?version=1&modificationDate=1688907287621&cacheVersion=1&api=v2&width=544&height=586", "languages": [ "eng" ] @@ -755,6 +761,7 @@ "version": "1" }, "filetype": "text/html", + "image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605956/teamspace_introduce.svg?version=1&modificationDate=1688907290201&cacheVersion=1&api=v2", "languages": [ "eng" ] @@ -776,6 +783,7 @@ "version": "1" }, "filetype": "text/html", + "image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605956/teamspace_announcements.svg?version=1&modificationDate=1688907290847&cacheVersion=1&api=v2", "languages": [ "eng" ] @@ -905,6 +913,7 @@ "version": "1" }, "filetype": "text/html", + "image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605956/content_report.svg?version=1&modificationDate=1688907288249&cacheVersion=1&api=v2", "languages": [ "eng" ] diff --git a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/229477.json b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/229477.json index a592fa71c..2ebe6d788 100644 --- a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/229477.json +++ b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/229477.json @@ -328,6 +328,7 @@ "version": "1" }, "filetype": "text/html", + "image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/229477/angie.svg?version=1&modificationDate=1688145926387&cacheVersion=1&api=v2&width=256&height=257", "languages": [ "eng" ] @@ -412,6 +413,7 @@ "version": "1" }, "filetype": "text/html", + "image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/229477/gael.svg?version=1&modificationDate=1688145927077&cacheVersion=1&api=v2&width=256&height=257", "languages": [ "eng" ] @@ -496,6 +498,7 @@ 
"version": "1" }, "filetype": "text/html", + "image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/229477/claudia.svg?version=1&modificationDate=1688145927764&cacheVersion=1&api=v2&width=256&height=257", "languages": [ "eng" ] @@ -766,6 +769,7 @@ "version": "1" }, "filetype": "text/html", + "image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/229477/raised_hand.svg?version=1&modificationDate=1688145928452&cacheVersion=1&api=v2", "languages": [ "eng" ] @@ -835,6 +839,7 @@ "version": "1" }, "filetype": "text/html", + "image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/229477/ledger.svg?version=1&modificationDate=1688145929151&cacheVersion=1&api=v2", "languages": [ "eng" ] @@ -904,6 +909,7 @@ "version": "1" }, "filetype": "text/html", + "image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/229477/astronaut.svg?version=1&modificationDate=1688145929790&cacheVersion=1&api=v2", "languages": [ "eng" ] diff --git a/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1605859.json b/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1605859.json index 8e74f96ed..b1745e899 100644 --- a/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1605859.json +++ b/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1605859.json @@ -328,6 +328,7 @@ "version": "2" }, "filetype": "text/html", + "image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605859/angie.svg?version=1&modificationDate=1688907281095&cacheVersion=1&api=v2&width=256&height=257", "languages": [ "eng" ] @@ -412,6 +413,7 @@ "version": "2" }, "filetype": "text/html", + "image_url": 
"https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605859/gael.svg?version=1&modificationDate=1688907281775&cacheVersion=1&api=v2&width=256&height=257", "languages": [ "eng" ] @@ -496,6 +498,7 @@ "version": "2" }, "filetype": "text/html", + "image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605859/claudia.svg?version=1&modificationDate=1688907282424&cacheVersion=1&api=v2&width=256&height=257", "languages": [ "eng" ] @@ -766,6 +769,7 @@ "version": "2" }, "filetype": "text/html", + "image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605859/raised_hand.svg?version=1&modificationDate=1688907283067&cacheVersion=1&api=v2", "languages": [ "eng" ] @@ -835,6 +839,7 @@ "version": "2" }, "filetype": "text/html", + "image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605859/ledger.svg?version=1&modificationDate=1688907283728&cacheVersion=1&api=v2", "languages": [ "eng" ] @@ -904,6 +909,7 @@ "version": "2" }, "filetype": "text/html", + "image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605859/astronaut.svg?version=1&modificationDate=1688907284407&cacheVersion=1&api=v2", "languages": [ "eng" ] diff --git a/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1605989.json b/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1605989.json index caceda2dd..c8f69173f 100644 --- a/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1605989.json +++ b/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1605989.json @@ -321,6 +321,7 @@ "version": "1" }, "filetype": "text/html", + "image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605989/image-20230709-015203.png?version=1&modificationDate=1688907429067&cacheVersion=1&api=v2&width=680&height=259", "languages": [ "eng", "fra" @@ 
-802,6 +803,7 @@ "version": "1" }, "filetype": "text/html", + "image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605989/image-20230709-020021.png?version=1&modificationDate=1688907429074&cacheVersion=1&api=v2&width=475&height=236", "languages": [ "eng", "fra" diff --git a/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1802252.json b/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1802252.json index 5476ea28a..b097bc3b0 100644 --- a/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1802252.json +++ b/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1802252.json @@ -321,6 +321,7 @@ "version": "1" }, "filetype": "text/html", + "image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1802252/image-20230709-015203.png?version=1&modificationDate=1689094907437&cacheVersion=1&api=v2&width=680&height=259", "languages": [ "eng", "fra" @@ -802,6 +803,7 @@ "version": "1" }, "filetype": "text/html", + "image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1802252/image-20230709-020021.png?version=1&modificationDate=1689094907442&cacheVersion=1&api=v2&width=475&height=236", "languages": [ "eng", "fra" diff --git a/test_unstructured_ingest/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.json b/test_unstructured_ingest/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.json index 90ef86b66..4619cb388 100644 --- a/test_unstructured_ingest/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.json +++ b/test_unstructured_ingest/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.json @@ -545,6 +545,7 @@ "date_modified": "2023-08-17T18:48:00.000Z" }, "filetype": "text/html", + "image_url": 
"https://media4.giphy.com/media/26FPsOhR3tyQRTc2Y/giphy.gif?cid=7941fdc68sl3vdqajgosqug9hfhg3zq3t5yoflyy9p7y66q0&ep=v1_gifs_trending&rid=giphy.gif&ct=g", "languages": [ "eng" ] diff --git a/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErPIAU.eml.json b/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErPIAU.eml.json index ce4d1e2fe..1df736221 100644 --- a/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErPIAU.eml.json +++ b/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErPIAU.eml.json @@ -32,6 +32,7 @@ "element_id": "f714fa214dac2f441515c4f28370d279", "text": "", "metadata": { + "image_url": "https://unstructuredio-dev-ed.develop.my.salesforce.com/servlet/servlet.ImageServer?oid=00DHu0000018RDe&esid=018Hu00001JMmTZ&from=int", "languages": [ "eng" ], diff --git a/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErQIAU.eml.json b/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErQIAU.eml.json index 302b1469b..796e7b71e 100644 --- a/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErQIAU.eml.json +++ b/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErQIAU.eml.json @@ -32,6 +32,7 @@ "element_id": "68870d055535f48c7439ce67092768f6", "text": "", "metadata": { + "image_url": "https://unstructuredio-dev-ed.develop.my.salesforce.com/servlet/servlet.ImageServer?oid=00DHu0000018RDe&esid=018Hu00001JMmTa&from=int", "languages": [ "eng" ], diff --git a/test_unstructured_ingest/src/local.sh b/test_unstructured_ingest/src/local.sh index 3d188c6a4..3c7139ceb 100755 --- a/test_unstructured_ingest/src/local.sh +++ b/test_unstructured_ingest/src/local.sh @@ -32,6 +32,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --input-path example-docs \ --work-dir "$WORK_DIR" 
-"$SCRIPT_DIR"/check-num-files-output.sh 14 $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-num-files-output.sh 15 $OUTPUT_FOLDER_NAME "$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 138620c64..06ff5fe5f 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.17.0" # pragma: no cover +__version__ = "0.17.1-dev0" # pragma: no cover diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index a9636b5d6..93cc903b3 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -172,6 +172,7 @@ class ElementMetadata: file_directory: Optional[str] filename: Optional[str] filetype: Optional[str] + image_url: Optional[str] image_path: Optional[str] image_base64: Optional[str] image_mime_type: Optional[str] @@ -230,6 +231,7 @@ class ElementMetadata: header_footer_type: Optional[str] = None, image_base64: Optional[str] = None, image_mime_type: Optional[str] = None, + image_url: Optional[str] = None, image_path: Optional[str] = None, is_continuation: Optional[bool] = None, languages: Optional[list[str]] = None, @@ -274,6 +276,7 @@ class ElementMetadata: self.header_footer_type = header_footer_type self.image_base64 = image_base64 self.image_mime_type = image_mime_type + self.image_url = image_url self.image_path = image_path self.is_continuation = is_continuation self.languages = languages @@ -490,6 +493,7 @@ class ConsolidationStrategy(enum.Enum): "filename": cls.FIRST, "filetype": cls.FIRST, "header_footer_type": cls.DROP, + "image_url": cls.DROP, "image_path": cls.DROP, "image_base64": cls.DROP, "image_mime_type": cls.DROP, diff --git a/unstructured/partition/html/parser.py b/unstructured/partition/html/parser.py index 1a9b03c50..858dea0ae 100644 --- a/unstructured/partition/html/parser.py +++ b/unstructured/partition/html/parser.py @@ -502,7 +502,7 @@ class 
ImageBlock(Flow): metadata=ElementMetadata( image_mime_type=img_mime_type, image_base64=img_base64, - url=img_url, + image_url=img_url, ), ) diff --git a/unstructured/partition/html/partition.py b/unstructured/partition/html/partition.py index 3cdcc6260..e5de65ed9 100644 --- a/unstructured/partition/html/partition.py +++ b/unstructured/partition/html/partition.py @@ -223,6 +223,7 @@ class _HtmlPartitioner: # -- remove if not requested -- if not self._should_include_image_base64(e): e.metadata.image_base64 = None + e.metadata.image_mime_type = None yield e @lazyproperty