feat: support image/webp file type (#1415)

* support image/webp file type

Signed-off-by: Elwin <61868295+hzhaoy@users.noreply.github.com>
Signed-off-by: Elwin <hzywong@gmail.com>

* docs: add webp image format in supported_formats.md

Signed-off-by: Elwin <61868295+hzhaoy@users.noreply.github.com>
Signed-off-by: Elwin <hzywong@gmail.com>

* test: add a test case for `image/webp` file

Signed-off-by: Elwin <hzywong@gmail.com>

* style: apply styling

Signed-off-by: Elwin <hzywong@gmail.com>

* test: update test case of converting `image/webp` file with more ocr engines

Signed-off-by: Elwin <hzywong@gmail.com>

* style: apply styling

Signed-off-by: Elwin <hzywong@gmail.com>

* rename test file

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Elwin <61868295+hzhaoy@users.noreply.github.com>
Signed-off-by: Elwin <hzywong@gmail.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Elwin 2025-05-14 15:47:28 +08:00 committed by GitHub
parent 23238c241f
commit 12dab0a1e8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 90 additions and 2 deletions

View File

@ -90,6 +90,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
"image/tiff", "image/tiff",
"image/gif", "image/gif",
"image/bmp", "image/bmp",
"image/webp",
], ],
InputFormat.PDF: ["application/pdf"], InputFormat.PDF: ["application/pdf"],
InputFormat.ASCIIDOC: ["text/asciidoc"], InputFormat.ASCIIDOC: ["text/asciidoc"],

View File

@ -14,7 +14,7 @@ Below you can find a listing of all supported input and output formats.
| AsciiDoc | | | AsciiDoc | |
| HTML, XHTML | | | HTML, XHTML | |
| CSV | | | CSV | |
| PNG, JPEG, TIFF, BMP | Image formats | | PNG, JPEG, TIFF, BMP, WEBP | Image formats |
Schema-specific support: Schema-specific support:

View File

@ -0,0 +1,2 @@
<doctag><text><loc_58><loc_44><loc_426><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
</doctag>

View File

@ -0,0 +1 @@
{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 69.0, "t": 767.2550252278646, "r": 506.6666666666667, "b": 688.5883585611979, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}

View File

@ -0,0 +1 @@
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package

File diff suppressed because one or more lines are too long

Binary file not shown.

After

Width:  |  Height:  |  Size: 29 KiB

View File

@ -0,0 +1,82 @@
import sys
from pathlib import Path
from typing import List
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult, DoclingDocument
from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrMacOptions,
OcrOptions,
RapidOcrOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, ImageFormatOption
from tests.verify_utils import verify_conversion_result_v2
from .test_data_gen_flag import GEN_TEST_DATA
# Forwarded as the `generate=` argument of verify_conversion_result_v2 below.
# NOTE(review): presumably makes the verifier (re)write ground-truth files
# instead of comparing against them -- confirm in tests/verify_utils.py.
GENERATE = GEN_TEST_DATA
def get_webp_paths():
    """Return all ``*.webp`` files found under ``./tests/data/webp/``.

    The directory is given relative to the current working directory and is
    searched recursively. The matching paths are returned as a sorted list.
    """
    return sorted(Path("./tests/data/webp/").rglob("*.webp"))
def get_converter(ocr_options: OcrOptions):
    """Build a ``DocumentConverter`` for images that uses ``ocr_options``.

    Args:
        ocr_options: OCR engine configuration assigned to the pipeline
            options of the image format option.

    Returns:
        A ``DocumentConverter`` whose allowed formats are limited to
        ``InputFormat.IMAGE``.
    """
    image_option = ImageFormatOption()
    # Swap the OCR settings on the format option's pipeline options before
    # handing the option to the converter.
    image_option.pipeline_options.ocr_options = ocr_options

    return DocumentConverter(
        format_options={InputFormat.IMAGE: image_option},
        allowed_formats=[InputFormat.IMAGE],
    )
def test_e2e_webp_conversions():
    """Convert every WEBP test image with each OCR configuration and verify it.

    For each OCR option set, a converter is built once and every file returned
    by ``get_webp_paths()`` is converted with it. Each result is passed to
    ``verify_conversion_result_v2`` with ``fuzzy=True`` and the module-level
    ``GENERATE`` flag.
    """
    images = get_webp_paths()

    ocr_variants: List[OcrOptions] = [
        EasyOcrOptions(),
        TesseractOcrOptions(),
        TesseractCliOcrOptions(),
        EasyOcrOptions(force_full_page_ocr=True),
        TesseractOcrOptions(force_full_page_ocr=True),
        TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]),
        TesseractCliOcrOptions(force_full_page_ocr=True),
        TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]),
    ]

    # rapidocr is only available for Python >=3.6,<3.13
    if sys.version_info < (3, 13):
        ocr_variants.extend(
            [RapidOcrOptions(), RapidOcrOptions(force_full_page_ocr=True)]
        )

    # The ocrmac engine is only added on macOS.
    if sys.platform == "darwin":
        ocr_variants.extend(
            [OcrMacOptions(), OcrMacOptions(force_full_page_ocr=True)]
        )

    for ocr_options in ocr_variants:
        print(
            f"Converting with ocr_engine: {ocr_options.kind}, language: {ocr_options.lang}"
        )
        # One converter per OCR configuration, reused for all images.
        converter = get_converter(ocr_options=ocr_options)

        for webp_path in images:
            print(f"converting {webp_path}")

            result: ConversionResult = converter.convert(webp_path)

            verify_conversion_result_v2(
                input_path=webp_path,
                doc_result=result,
                generate=GENERATE,
                fuzzy=True,
            )

View File

@ -462,7 +462,7 @@ def verify_conversion_result_v2(
def verify_document(pred_doc: DoclingDocument, gtfile: str, generate: bool = False): def verify_document(pred_doc: DoclingDocument, gtfile: str, generate: bool = False):
if not os.path.exists(gtfile) or generate: if not os.path.exists(gtfile) or generate:
with open(gtfile, "w") as fw: with open(gtfile, "w") as fw:
json.dump(pred_doc.export_to_dict(), fw, indent=2) json.dump(pred_doc.export_to_dict(), fw, ensure_ascii=False, indent=2)
return True return True
else: else: