feat: support image/webp file type (#1415)

* support image/webp file type Signed-off-by: Elwin <61868295+hzhaoy@users.noreply.github.com> Signed-off-by: Elwin <hzywong@gmail.com> * docs: add webp image format in supported_formats.md Signed-off-by: Elwin <61868295+hzhaoy@users.noreply.github.com> Signed-off-by: Elwin <hzywong@gmail.com> * test: add a test case for `image/webp` file Signed-off-by: Elwin <hzywong@gmail.com> * style: apply styling Signed-off-by: Elwin <hzywong@gmail.com> * test: update test case of converting `image/webp` file with more ocr engines Signed-off-by: Elwin <hzywong@gmail.com> * style: apply styling Signed-off-by: Elwin <hzywong@gmail.com> * rename test file Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> --------- Signed-off-by: Elwin <61868295+hzhaoy@users.noreply.github.com> Signed-off-by: Elwin <hzywong@gmail.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
2025-06-27 05:20:05 +00:00 · 2025-05-14 15:47:28 +08:00 · 2025-05-14 15:47:28 +08:00 · 12dab0a1e8
commit 12dab0a1e8
parent 23238c241f
9 changed files with 90 additions and 2 deletions
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@ -90,6 +90,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
        "image/tiff",
        "image/gif",
        "image/bmp",
        "image/webp",
    ],
    InputFormat.PDF: ["application/pdf"],
    InputFormat.ASCIIDOC: ["text/asciidoc"],
--- a/docs/usage/supported_formats.md
+++ b/docs/usage/supported_formats.md
@ -14,7 +14,7 @@ Below you can find a listing of all supported input and output formats.
 | AsciiDoc | |
 | HTML, XHTML | |
 | CSV | |
-| PNG, JPEG, TIFF, BMP | Image formats |
+| PNG, JPEG, TIFF, BMP, WEBP | Image formats |
 Schema-specific support:
--- a/tests/data/webp/groundtruth/docling_v2/webp-test.doctags.txt
+++ b/tests/data/webp/groundtruth/docling_v2/webp-test.doctags.txt
@ -0,0 +1,2 @@
 <doctag><text><loc_58><loc_44><loc_426><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
 </doctag>
--- a/tests/data/webp/groundtruth/docling_v2/webp-test.json
+++ b/tests/data/webp/groundtruth/docling_v2/webp-test.json
@ -0,0 +1 @@
 {"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 69.0, "t": 767.2550252278646, "r": 506.6666666666667, "b": 688.5883585611979, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
--- a/tests/data/webp/groundtruth/docling_v2/webp-test.md
+++ b/tests/data/webp/groundtruth/docling_v2/webp-test.md
@ -0,0 +1 @@
 Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package
--- a/tests/data/webp/groundtruth/docling_v2/webp-test.pages.json
+++ b/tests/data/webp/groundtruth/docling_v2/webp-test.pages.json
--- a/tests/data/webp/webp-test.webp
+++ b/tests/data/webp/webp-test.webp
--- a/tests/test_backend_webp.py
+++ b/tests/test_backend_webp.py
@ -0,0 +1,82 @@
 import sys
 from pathlib import Path
 from typing import List
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult, DoclingDocument
 from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    OcrMacOptions,
    OcrOptions,
    RapidOcrOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
 )
 from docling.document_converter import DocumentConverter, ImageFormatOption
 from tests.verify_utils import verify_conversion_result_v2
 from .test_data_gen_flag import GEN_TEST_DATA
 # Local alias of the GEN_TEST_DATA flag imported from test_data_gen_flag;
 # it is forwarded as `generate=` to verify_conversion_result_v2 in the test below.
 GENERATE = GEN_TEST_DATA
 def get_webp_paths():
    """Return all ``*.webp`` files found under ``./tests/data/webp/``, sorted.

    The lookup is recursive (sub-directories are included) and the directory
    is resolved relative to the current working directory.
    """
    return sorted(Path("./tests/data/webp/").rglob("*.webp"))
 def get_converter(ocr_options: OcrOptions):
    """Create a DocumentConverter that accepts only image inputs.

    A fresh ImageFormatOption is created, its pipeline's ``ocr_options`` is
    replaced with the supplied *ocr_options*, and the option is registered
    for ``InputFormat.IMAGE`` (the only allowed format).
    """
    image_option = ImageFormatOption()
    image_option.pipeline_options.ocr_options = ocr_options
    return DocumentConverter(
        format_options={InputFormat.IMAGE: image_option},
        allowed_formats=[InputFormat.IMAGE],
    )
 def test_e2e_webp_conversions():
    """Convert every WEBP test file with each configured OCR engine and verify it.

    Each engine configuration gets its own converter; every converted file is
    checked with verify_conversion_result_v2 using fuzzy comparison.
    """
    webp_paths = get_webp_paths()

    # Engine configurations that are exercised unconditionally.
    engines: List[OcrOptions] = [
        EasyOcrOptions(),
        TesseractOcrOptions(),
        TesseractCliOcrOptions(),
        EasyOcrOptions(force_full_page_ocr=True),
        TesseractOcrOptions(force_full_page_ocr=True),
        TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]),
        TesseractCliOcrOptions(force_full_page_ocr=True),
        TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]),
    ]

    # rapidocr is only available for Python >=3.6,<3.13
    if sys.version_info < (3, 13):
        engines.extend(
            [RapidOcrOptions(), RapidOcrOptions(force_full_page_ocr=True)]
        )

    # only works on mac
    if sys.platform == "darwin":
        engines += [OcrMacOptions(), OcrMacOptions(force_full_page_ocr=True)]

    for engine_options in engines:
        print(
            f"Converting with ocr_engine: {engine_options.kind}, language: {engine_options.lang}"
        )
        converter = get_converter(ocr_options=engine_options)
        for path in webp_paths:
            print(f"converting {path}")
            result: ConversionResult = converter.convert(path)
            verify_conversion_result_v2(
                input_path=path,
                doc_result=result,
                generate=GENERATE,
                fuzzy=True,
            )
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@ -462,7 +462,7 @@ def verify_conversion_result_v2(
 def verify_document(pred_doc: DoclingDocument, gtfile: str, generate: bool = False):
    if not os.path.exists(gtfile) or generate:
        with open(gtfile, "w") as fw:
-            json.dump(pred_doc.export_to_dict(), fw, indent=2)
+            json.dump(pred_doc.export_to_dict(), fw, ensure_ascii=False, indent=2)
        return True
    else:
		`@ -0,0 +1,2 @@`
							`<doctag><text><loc_58><loc_44><loc_426><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>`
							`</doctag>`
		`@ -0,0 +1 @@`
							{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 69.0, "t": 767.2550252278646, "r": 506.6666666666667, "b": 688.5883585611979, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
		`@ -0,0 +1 @@`
							`Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package`