mirror of
https://github.com/docling-project/docling.git
synced 2025-08-27 10:36:09 +00:00

* feat: add convert_string to document-converter
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* fix unsupported operand type(s) for |: type and NoneType
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* added tests for convert_string
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---------
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
108 lines
3.1 KiB
Python
108 lines
3.1 KiB
Python
from pathlib import Path
|
|
|
|
from docling.backend.md_backend import MarkdownDocumentBackend
|
|
from docling.datamodel.base_models import InputFormat
|
|
from docling.datamodel.document import (
|
|
ConversionResult,
|
|
DoclingDocument,
|
|
InputDocument,
|
|
SectionHeaderItem,
|
|
)
|
|
from docling.document_converter import DocumentConverter
|
|
from tests.verify_utils import CONFID_PREC, COORD_PREC
|
|
|
|
from .test_data_gen_flag import GEN_TEST_DATA
|
|
from .verify_utils import verify_document, verify_export
|
|
|
|
# Module-level alias for the ground-truth generation flag imported above.
# NOTE(review): the tests visible in this file read GEN_TEST_DATA directly,
# so this alias looks unused here — confirm before removing.
GENERATE = GEN_TEST_DATA
|
|
|
|
|
|
def test_convert_valid():
    """Run the Markdown backend on every fixture and check it against ground truth.

    Every ``*.md`` file found recursively under ``tests/data/md`` is wrapped in
    an ``InputDocument``, handed to ``MarkdownDocumentBackend``, validated and
    converted. The Markdown export of the result is then either

    * written out as new ground truth when ``GEN_TEST_DATA`` is truthy, or
    * compared with the stored ground truth (trailing whitespace stripped).

    Fixtures whose stem appears in the YAML filter list are additionally saved
    to / loaded from a YAML ground-truth file and compared as documents.
    Ground-truth file names are the fixture's full file name (extension
    included) plus ``.md`` or ``.yaml``.
    """
    input_format = InputFormat.MD
    backend_cls = MarkdownDocumentBackend

    data_dir = Path("tests") / "data"
    gt_dir = data_dir / "groundtruth" / "docling_v2"

    md_inputs = sorted((data_dir / "md").rglob("*.md"))
    # Guard against a silently empty fixture directory.
    assert md_inputs

    yaml_checked_stems = ["inline_and_formatting", "mixed_without_h1"]

    for src_path in md_inputs:
        expected_md_path = gt_dir / f"{src_path.name}.md"
        expected_yaml_path = gt_dir / f"{src_path.name}.yaml"
        with_yaml = src_path.stem in yaml_checked_stems

        input_doc = InputDocument(
            path_or_stream=src_path,
            format=input_format,
            backend=backend_cls,
        )
        md_backend = backend_cls(in_doc=input_doc, path_or_stream=src_path)
        assert md_backend.is_valid()

        converted = md_backend.convert()
        actual_md = converted.export_to_markdown()

        if not GEN_TEST_DATA:
            # Verification mode: compare against what is stored on disk.
            expected_md = expected_md_path.read_text(encoding="utf-8").rstrip()
            assert actual_md == expected_md

            if with_yaml:
                reference_doc = DoclingDocument.load_from_yaml(expected_yaml_path)
                assert converted == reference_doc, (
                    f"export to yaml failed on {src_path}"
                )
            continue

        # Generation mode: (re)write the ground truth instead of checking it.
        expected_md_path.write_text(f"{actual_md}\n", encoding="utf-8")
        if with_yaml:
            converted.save_as_yaml(
                expected_yaml_path,
                coord_precision=COORD_PREC,
                confid_precision=CONFID_PREC,
            )
|
|
|
|
|
|
def get_md_paths():
    """Return the sorted list of ``*.md`` files under the ground-truth directory.

    The search is recursive and relative to the current working directory.

    NOTE(review): this looks in ``./tests/groundtruth/docling_v2`` while
    ``test_convert_valid`` uses ``tests/data/groundtruth/docling_v2``; if the
    former does not exist the result is an empty list — confirm which path is
    intended.
    """
    groundtruth_dir = Path("./tests/groundtruth/docling_v2")
    return sorted(groundtruth_dir.rglob("*.md"))
|
|
|
|
|
|
def get_converter():
    """Build a ``DocumentConverter`` whose allowed formats are Markdown only."""
    return DocumentConverter(allowed_formats=[InputFormat.MD])
|
|
|
|
|
|
def test_e2e_md_conversions():
    """Round-trip each Markdown file through the converter, by path and by string.

    For every path returned by ``get_md_paths()`` the file content is read,
    converted once via ``converter.convert(path)`` and once via
    ``converter.convert_string(text, format=InputFormat.MD)``, and in both
    cases the Markdown export must equal the original text exactly.

    If ``get_md_paths()`` returns no files, the loop body never runs and the
    test passes without checking anything.
    """
    md_paths = get_md_paths()
    converter = get_converter()

    for md_path in md_paths:
        # Explicit encoding keeps the comparison independent of the platform's
        # locale default, matching how test_convert_valid reads its files.
        with open(md_path, encoding="utf-8") as fr:
            true_md = fr.read()

        # Path-based conversion.
        conv_result: ConversionResult = converter.convert(md_path)
        doc: DoclingDocument = conv_result.document
        pred_md: str = doc.export_to_markdown()
        assert true_md == pred_md, f"convert() round-trip mismatch on {md_path}"

        # String-based conversion of the very same content.
        conv_result_: ConversionResult = converter.convert_string(
            true_md, format=InputFormat.MD
        )
        doc_: DoclingDocument = conv_result_.document
        pred_md_: str = doc_.export_to_markdown()
        assert true_md == pred_md_, (
            f"convert_string() round-trip mismatch on {md_path}"
        )
|