feat: add convert_string to document-converter (#2069)

* feat: add convert_string to document-converter

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* fix unsupported operand type(s) for |: type and NoneType

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* added tests for convert_string

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter W. J. Staar 2025-08-12 11:02:38 +02:00 committed by GitHub
parent e2cca931be
commit b09033cb73
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 82 additions and 1 deletions

View File

@ -5,7 +5,9 @@ import threading
import time import time
from collections.abc import Iterable, Iterator from collections.abc import Iterable, Iterator
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from functools import partial from functools import partial
from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Dict, List, Optional, Tuple, Type, Union from typing import Dict, List, Optional, Tuple, Type, Union
@ -275,6 +277,34 @@ class DocumentConverter:
"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats." "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
) )
@validate_call(config=ConfigDict(strict=True))
def convert_string(
    self,
    content: str,
    format: InputFormat,
    name: Optional[str] = None,
) -> ConversionResult:
    """Convert in-memory Markdown or HTML text.

    The text is UTF-8 encoded, wrapped in a ``DocumentStream`` and handed to
    ``self.convert``.

    Args:
        content: The document text to convert.
        format: Format of ``content``; only ``InputFormat.MD`` and
            ``InputFormat.HTML`` are accepted.
        name: Name given to the document stream. When omitted or empty, a
            ``%Y-%m-%d_%H-%M-%S`` timestamp of the current local time is
            used. The extension matching ``format`` (``.md`` / ``.html``)
            is appended if the name does not already end with it.

    Returns:
        Whatever ``self.convert`` returns for the constructed stream.

    Raises:
        ValueError: If ``format`` is neither Markdown nor HTML.
    """
    # Pick the file extension first so unsupported formats fail before any
    # other work is done.
    if format == InputFormat.MD:
        suffix = ".md"
    elif format == InputFormat.HTML:
        suffix = ".html"
    else:
        raise ValueError(f"format {format} is not supported in `convert_string`")

    name = name or datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    if not name.endswith(suffix):
        name += suffix

    buff = BytesIO(content.encode("utf-8"))
    doc_stream = DocumentStream(name=name, stream=buff)
    return self.convert(doc_stream)
def _convert( def _convert(
self, conv_input: _DocumentConversionInput, raises_on_error: bool self, conv_input: _DocumentConversionInput, raises_on_error: bool
) -> Iterator[ConversionResult]: ) -> Iterator[ConversionResult]:

View File

@ -2,10 +2,19 @@ from pathlib import Path
from docling.backend.md_backend import MarkdownDocumentBackend from docling.backend.md_backend import MarkdownDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import DoclingDocument, InputDocument from docling.datamodel.document import (
ConversionResult,
DoclingDocument,
InputDocument,
SectionHeaderItem,
)
from docling.document_converter import DocumentConverter
from tests.verify_utils import CONFID_PREC, COORD_PREC from tests.verify_utils import CONFID_PREC, COORD_PREC
from .test_data_gen_flag import GEN_TEST_DATA from .test_data_gen_flag import GEN_TEST_DATA
from .verify_utils import verify_document, verify_export
# Module-level alias for the GEN_TEST_DATA flag imported from .test_data_gen_flag.
GENERATE = GEN_TEST_DATA
def test_convert_valid(): def test_convert_valid():
@ -54,3 +63,45 @@ def test_convert_valid():
if in_path.stem in yaml_filter: if in_path.stem in yaml_filter:
exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path) exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path)
assert act_doc == exp_doc, f"export to yaml failed on {in_path}" assert act_doc == exp_doc, f"export to yaml failed on {in_path}"
def get_md_paths():
    """Return the sorted ``*.md`` paths found recursively under the v2 ground-truth directory."""
    return sorted(Path("./tests/groundtruth/docling_v2").rglob("*.md"))
def get_converter():
    """Build a ``DocumentConverter`` whose allowed formats are limited to Markdown."""
    return DocumentConverter(allowed_formats=[InputFormat.MD])
def test_e2e_md_conversions():
    """Round-trip every ground-truth Markdown file through the converter.

    Each file is converted twice, once from its path via ``convert`` and once
    from its text via ``convert_string``; in both cases the Markdown export
    must equal the original file content.
    """
    md_paths = get_md_paths()
    converter = get_converter()

    for md_path in md_paths:
        # Read explicitly as UTF-8 so the comparison does not depend on the
        # platform's default text encoding.
        true_md = md_path.read_text(encoding="utf-8")

        conv_result: ConversionResult = converter.convert(md_path)
        doc: DoclingDocument = conv_result.document
        pred_md: str = doc.export_to_markdown()
        assert true_md == pred_md, f"markdown round-trip via convert failed on {md_path}"

        conv_result_: ConversionResult = converter.convert_string(
            true_md, format=InputFormat.MD
        )
        doc_: DoclingDocument = conv_result_.document
        pred_md_: str = doc_.export_to_markdown()
        assert true_md == pred_md_, (
            f"markdown round-trip via convert_string failed on {md_path}"
        )