From b09033cb73e3d99bb8b299675a539b4f10e41cb1 Mon Sep 17 00:00:00 2001
From: "Peter W. J. Staar" <91719829+PeterStaar-IBM@users.noreply.github.com>
Date: Tue, 12 Aug 2025 11:02:38 +0200
Subject: [PATCH] feat: add convert_string to document-converter (#2069)

* feat: add convert_string to document-converter

Signed-off-by: Peter Staar

* fix unsupported operand type(s) for |: type and NoneType

Signed-off-by: Peter Staar

* added tests for convert_string

Signed-off-by: Peter Staar

---------

Signed-off-by: Peter Staar
---
 docling/document_converter.py  | 41 ++++++++++++++++++++++++++
 tests/test_backend_markdown.py | 53 +++++++++++++++++++++++++++++++++-
 2 files changed, 93 insertions(+), 1 deletion(-)

diff --git a/docling/document_converter.py b/docling/document_converter.py
index 5ad19c6d..591c75c1 100644
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -5,7 +5,9 @@ import threading
 import time
 from collections.abc import Iterable, Iterator
 from concurrent.futures import ThreadPoolExecutor
+from datetime import datetime
 from functools import partial
+from io import BytesIO
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple, Type, Union
 
@@ -275,6 +277,45 @@ class DocumentConverter:
                 "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
             )
 
+    @validate_call(config=ConfigDict(strict=True))
+    def convert_string(
+        self,
+        content: str,
+        format: InputFormat,
+        name: Optional[str] = None,
+    ) -> ConversionResult:
+        """Convert an in-memory Markdown or HTML string.
+
+        Args:
+            content: Document text; it is encoded as UTF-8 before conversion.
+            format: Format of `content`. Only `InputFormat.MD` and
+                `InputFormat.HTML` are accepted.
+            name: Name for the in-memory document. Defaults to a local-time
+                timestamp when omitted or empty; the extension matching
+                `format` is appended when missing.
+
+        Returns:
+            The result of `convert` for the in-memory document.
+
+        Raises:
+            ValueError: If `format` is neither Markdown nor HTML.
+        """
+        # Pick the extension first so unsupported formats fail before any work.
+        if format == InputFormat.MD:
+            suffix = ".md"
+        elif format == InputFormat.HTML:
+            suffix = ".html"
+        else:
+            raise ValueError(f"format {format} is not supported in `convert_string`")
+
+        name = name or datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+        if not name.endswith(suffix):
+            name += suffix
+
+        buff = BytesIO(content.encode("utf-8"))
+        doc_stream = DocumentStream(name=name, stream=buff)
+
+        return self.convert(doc_stream)
+
     def _convert(
         self, conv_input: _DocumentConversionInput, raises_on_error: bool
     ) -> Iterator[ConversionResult]:
diff --git a/tests/test_backend_markdown.py b/tests/test_backend_markdown.py
index 8a0d7b4f..12db89a9 100644
--- a/tests/test_backend_markdown.py
+++ b/tests/test_backend_markdown.py
@@ -2,10 +2,19 @@ from pathlib import Path
 
 from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import DoclingDocument, InputDocument
+from docling.datamodel.document import (
+    ConversionResult,
+    DoclingDocument,
+    InputDocument,
+    SectionHeaderItem,
+)
+from docling.document_converter import DocumentConverter
 from tests.verify_utils import CONFID_PREC, COORD_PREC
 
 from .test_data_gen_flag import GEN_TEST_DATA
+from .verify_utils import verify_document, verify_export
+
+GENERATE = GEN_TEST_DATA
 
 
 def test_convert_valid():
@@ -54,3 +63,45 @@ def test_convert_valid():
             if in_path.stem in yaml_filter:
                 exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path)
                 assert act_doc == exp_doc, f"export to yaml failed on {in_path}"
+
+
+def get_md_paths():
+    # Define the directory you want to search
+    directory = Path("./tests/groundtruth/docling_v2")
+
+    # List all MD files in the directory and its subdirectories
+    md_files = sorted(directory.rglob("*.md"))
+    return md_files
+
+
+def get_converter():
+    converter = DocumentConverter(allowed_formats=[InputFormat.MD])
+
+    return converter
+
+
+def test_e2e_md_conversions():
+    md_paths = get_md_paths()
+    converter = get_converter()
+
+    for md_path in md_paths:
+        # print(f"converting {md_path}")
+
+        with open(md_path, encoding="utf-8") as fr:
+            true_md = fr.read()
+
+        conv_result: ConversionResult = converter.convert(md_path)
+
+        doc: DoclingDocument = conv_result.document
+
+        pred_md: str = doc.export_to_markdown()
+        assert true_md == pred_md
+
+        conv_result_: ConversionResult = converter.convert_string(
+            true_md, format=InputFormat.MD
+        )
+
+        doc_: DoclingDocument = conv_result_.document
+
+        pred_md_: str = doc_.export_to_markdown()
+        assert true_md == pred_md_