mirror of
https://github.com/docling-project/docling.git
synced 2025-11-02 20:02:55 +00:00
feat: add convert_string to document-converter (#2069)
* feat: add convert_string to document-converter Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fix unsupported operand type(s) for |: type and NoneType Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added tests for convert_string Signed-off-by: Peter Staar <taa@zurich.ibm.com> --------- Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
e2cca931be
commit
b09033cb73
@ -5,7 +5,9 @@ import threading
|
||||
import time
|
||||
from collections.abc import Iterable, Iterator
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from datetime import datetime
|
||||
from functools import partial
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple, Type, Union
|
||||
|
||||
@ -275,6 +277,34 @@ class DocumentConverter:
|
||||
"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
|
||||
)
|
||||
|
||||
@validate_call(config=ConfigDict(strict=True))
def convert_string(
    self,
    content: str,
    format: InputFormat,
    name: Optional[str] = None,
) -> ConversionResult:
    """Convert a document that is held in memory as a string.

    Only ``InputFormat.MD`` and ``InputFormat.HTML`` are handled. The text
    is encoded as UTF-8, wrapped in a ``DocumentStream`` and handed to
    ``self.convert``.

    Args:
        content: The raw Markdown or HTML text.
        format: The format of ``content``.
        name: Name given to the in-memory stream. When omitted (or empty),
            a timestamp of the form ``YYYY-MM-DD_HH-MM-SS`` is used. The
            suffix matching ``format`` is appended if not already present.

    Returns:
        The ``ConversionResult`` returned by ``self.convert``.

    Raises:
        ValueError: If ``format`` is neither ``InputFormat.MD`` nor
            ``InputFormat.HTML``.
    """
    # File suffix expected on the stream name for each supported format.
    suffix_by_format = {
        InputFormat.MD: ".md",
        InputFormat.HTML: ".html",
    }

    suffix = suffix_by_format.get(format)
    if suffix is None:
        raise ValueError(f"format {format} is not supported in `convert_string`")

    # `name` is optional: fall back to a timestamp so callers that only have
    # raw text do not need to invent a file name.
    name = name or datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    # NOTE(review): presumably the suffix lets `convert` recognise the input
    # format from the stream name -- confirm against the format detection.
    if not name.endswith(suffix):
        name += suffix

    doc_stream = DocumentStream(name=name, stream=BytesIO(content.encode("utf-8")))
    return self.convert(doc_stream)
|
||||
|
||||
def _convert(
|
||||
self, conv_input: _DocumentConversionInput, raises_on_error: bool
|
||||
) -> Iterator[ConversionResult]:
|
||||
|
||||
@ -2,10 +2,19 @@ from pathlib import Path
|
||||
|
||||
from docling.backend.md_backend import MarkdownDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import DoclingDocument, InputDocument
|
||||
from docling.datamodel.document import (
|
||||
ConversionResult,
|
||||
DoclingDocument,
|
||||
InputDocument,
|
||||
SectionHeaderItem,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter
|
||||
from tests.verify_utils import CONFID_PREC, COORD_PREC
|
||||
|
||||
from .test_data_gen_flag import GEN_TEST_DATA
|
||||
from .verify_utils import verify_document, verify_export
|
||||
|
||||
# Module-level alias for the GEN_TEST_DATA flag imported from
# .test_data_gen_flag.
# NOTE(review): presumably toggles regeneration of ground-truth files --
# confirm against where GENERATE is read in the tests.
GENERATE = GEN_TEST_DATA
|
||||
|
||||
|
||||
def test_convert_valid():
|
||||
@ -54,3 +63,45 @@ def test_convert_valid():
|
||||
if in_path.stem in yaml_filter:
|
||||
exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path)
|
||||
assert act_doc == exp_doc, f"export to yaml failed on {in_path}"
|
||||
|
||||
|
||||
def get_md_paths():
    """Return the sorted list of ``*.md`` files under the ground-truth directory.

    The search is recursive and rooted at ``./tests/groundtruth/docling_v2``,
    resolved relative to the current working directory.
    """
    groundtruth_root = Path("./tests/groundtruth/docling_v2")

    # Collect every Markdown file in the tree, then order the paths so the
    # result does not depend on directory traversal order.
    found = list(groundtruth_root.rglob("*.md"))
    found.sort()
    return found
|
||||
|
||||
|
||||
def get_converter():
    """Build a ``DocumentConverter`` whose allowed formats are limited to Markdown."""
    return DocumentConverter(allowed_formats=[InputFormat.MD])
|
||||
|
||||
|
||||
def test_e2e_md_conversions():
    """Round-trip every ground-truth Markdown file through the converter.

    Each file is converted twice -- once from its path via ``convert`` and
    once from its text via ``convert_string`` -- and in both cases the
    Markdown exported from the resulting document must equal the file's
    original text.
    """
    converter = get_converter()

    for md_path in get_md_paths():
        # Decode explicitly as UTF-8: `convert_string` re-encodes the text as
        # UTF-8, so relying on the platform default encoding here would make
        # the comparison locale-dependent.
        true_md = md_path.read_text(encoding="utf-8")

        # Path-based conversion.
        conv_result: ConversionResult = converter.convert(md_path)
        doc: DoclingDocument = conv_result.document
        pred_md: str = doc.export_to_markdown()
        assert true_md == pred_md, f"markdown round-trip via convert failed on {md_path}"

        # String-based conversion. The name is passed explicitly so the call
        # is valid whether or not `convert_string` defaults its `name`.
        conv_result_: ConversionResult = converter.convert_string(
            true_md, format=InputFormat.MD, name=md_path.name
        )
        doc_: DoclingDocument = conv_result_.document
        pred_md_: str = doc_.export_to_markdown()
        assert true_md == pred_md_, (
            f"markdown round-trip via convert_string failed on {md_path}"
        )
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user