feat: support image/webp file type (#1415)

* support image/webp file type

Signed-off-by: Elwin <61868295+hzhaoy@users.noreply.github.com>
Signed-off-by: Elwin <hzywong@gmail.com>

* docs: add webp image format in supported_formats.md

Signed-off-by: Elwin <61868295+hzhaoy@users.noreply.github.com>
Signed-off-by: Elwin <hzywong@gmail.com>

* test: add a test case for `image/webp` file

Signed-off-by: Elwin <hzywong@gmail.com>

* style: apply styling

Signed-off-by: Elwin <hzywong@gmail.com>

* test: update test case of converting `image/webp` file with more ocr engines

Signed-off-by: Elwin <hzywong@gmail.com>

* style: apply styling

Signed-off-by: Elwin <hzywong@gmail.com>

* rename test file

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Elwin <61868295+hzhaoy@users.noreply.github.com>
Signed-off-by: Elwin <hzywong@gmail.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
2025-06-27 05:20:05 +00:00 · 2025-05-14 15:47:28 +08:00 · 2025-05-14 15:47:28 +08:00 · 12dab0a1e8
commit 12dab0a1e8
parent 23238c241f
9 changed files with 90 additions and 2 deletions
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -90,6 +90,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
        "image/tiff",
        "image/gif",
        "image/bmp",
+        "image/webp",
    ],
    InputFormat.PDF: ["application/pdf"],
    InputFormat.ASCIIDOC: ["text/asciidoc"],
--- a/docs/usage/supported_formats.md
+++ b/docs/usage/supported_formats.md
@@ -14,7 +14,7 @@ Below you can find a listing of all supported input and output formats.
 | AsciiDoc | |
 | HTML, XHTML | |
 | CSV | |
-| PNG, JPEG, TIFF, BMP | Image formats |
+| PNG, JPEG, TIFF, BMP, WEBP | Image formats |

 Schema-specific support:

--- a/tests/data/webp/groundtruth/docling_v2/webp-test.doctags.txt
+++ b/tests/data/webp/groundtruth/docling_v2/webp-test.doctags.txt
@@ -0,0 +1,2 @@
+<doctag><text><loc_58><loc_44><loc_426><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
+</doctag>
--- a/tests/data/webp/groundtruth/docling_v2/webp-test.json
+++ b/tests/data/webp/groundtruth/docling_v2/webp-test.json
@@ -0,0 +1 @@
+{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 69.0, "t": 767.2550252278646, "r": 506.6666666666667, "b": 688.5883585611979, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
--- a/tests/data/webp/groundtruth/docling_v2/webp-test.md
+++ b/tests/data/webp/groundtruth/docling_v2/webp-test.md
@@ -0,0 +1 @@
+Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package
--- a/tests/data/webp/groundtruth/docling_v2/webp-test.pages.json
+++ b/tests/data/webp/groundtruth/docling_v2/webp-test.pages.json
--- a/tests/data/webp/webp-test.webp
+++ b/tests/data/webp/webp-test.webp
--- a/tests/test_backend_webp.py
+++ b/tests/test_backend_webp.py
@@ -0,0 +1,82 @@
+import sys
+from pathlib import Path
+from typing import List
+
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import ConversionResult, DoclingDocument
+from docling.datamodel.pipeline_options import (
+    EasyOcrOptions,
+    OcrMacOptions,
+    OcrOptions,
+    RapidOcrOptions,
+    TesseractCliOcrOptions,
+    TesseractOcrOptions,
+)
+from docling.document_converter import DocumentConverter, ImageFormatOption
+from tests.verify_utils import verify_conversion_result_v2
+
+from .test_data_gen_flag import GEN_TEST_DATA
+
+GENERATE = GEN_TEST_DATA
+
+
+def get_webp_paths():
+    # Define the directory you want to search
+    directory = Path("./tests/data/webp/")
+
+    # List all WEBP files in the directory and its subdirectories
+    webp_files = sorted(directory.rglob("*.webp"))
+    return webp_files
+
+
+def get_converter(ocr_options: OcrOptions):
+    image_format_option = ImageFormatOption()
+    image_format_option.pipeline_options.ocr_options = ocr_options
+
+    converter = DocumentConverter(
+        format_options={InputFormat.IMAGE: image_format_option},
+        allowed_formats=[InputFormat.IMAGE],
+    )
+
+    return converter
+
+
+def test_e2e_webp_conversions():
+    webp_paths = get_webp_paths()
+
+    engines: List[OcrOptions] = [
+        EasyOcrOptions(),
+        TesseractOcrOptions(),
+        TesseractCliOcrOptions(),
+        EasyOcrOptions(force_full_page_ocr=True),
+        TesseractOcrOptions(force_full_page_ocr=True),
+        TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]),
+        TesseractCliOcrOptions(force_full_page_ocr=True),
+        TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]),
+    ]
+
+    # rapidocr is only available for Python >=3.6,<3.13
+    if sys.version_info < (3, 13):
+        engines.append(RapidOcrOptions())
+        engines.append(RapidOcrOptions(force_full_page_ocr=True))
+
+    # only works on mac
+    if "darwin" == sys.platform:
+        engines.append(OcrMacOptions())
+        engines.append(OcrMacOptions(force_full_page_ocr=True))
+    for ocr_options in engines:
+        print(
+            f"Converting with ocr_engine: {ocr_options.kind}, language: {ocr_options.lang}"
+        )
+        converter = get_converter(ocr_options=ocr_options)
+        for webp_path in webp_paths:
+            print(f"converting {webp_path}")
+
+            doc_result: ConversionResult = converter.convert(webp_path)
+
+            verify_conversion_result_v2(
+                input_path=webp_path,
+                doc_result=doc_result,
+                generate=GENERATE,
+                fuzzy=True,
+            )
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@@ -462,7 +462,7 @@ def verify_conversion_result_v2(
 def verify_document(pred_doc: DoclingDocument, gtfile: str, generate: bool = False):
    if not os.path.exists(gtfile) or generate:
        with open(gtfile, "w") as fw:
-            json.dump(pred_doc.export_to_dict(), fw, indent=2)
+            json.dump(pred_doc.export_to_dict(), fw, ensure_ascii=False, indent=2)

        return True
    else:
				`@ -0,0 +1 @@`
				{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 69.0, "t": 767.2550252278646, "r": 506.6666666666667, "b": 688.5883585611979, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
				`@ -0,0 +1 @@`
				`Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package`