mirror of
https://github.com/docling-project/docling.git
synced 2025-06-27 05:20:05 +00:00
42 lines
1.2 KiB
Python
42 lines
1.2 KiB
Python
from pathlib import Path
|
|
|
|
from docling.backend.md_backend import MarkdownDocumentBackend
|
|
from docling.datamodel.base_models import InputFormat
|
|
from docling.datamodel.document import InputDocument
|
|
|
|
from .test_data_gen_flag import GEN_TEST_DATA
|
|
|
|
|
|
def test_convert_valid():
    """Convert each Markdown fixture and compare it with its stored ground truth.

    Every ``*.md`` file found recursively under ``tests/data/md`` is loaded
    through ``MarkdownDocumentBackend``, converted, and exported back to
    Markdown. When ``GEN_TEST_DATA`` is truthy the export is written to
    ``tests/data/groundtruth/docling_v2/<fixture name>.md``; otherwise that
    file is read and its right-stripped content must equal the export.
    """
    input_format = InputFormat.MD
    backend_cls = MarkdownDocumentBackend

    data_root = Path("tests") / "data"
    md_fixtures = sorted((data_root / "md").rglob("*.md"))
    # Without this guard the loop below would pass vacuously on an empty set.
    assert len(md_fixtures) > 0

    groundtruth_dir = data_root / "groundtruth" / "docling_v2"

    for md_fixture in md_fixtures:
        # Ground-truth files are keyed by the fixture's file name only.
        expected_file = groundtruth_dir / f"{md_fixture.name}.md"

        document = InputDocument(
            path_or_stream=md_fixture,
            format=input_format,
            backend=backend_cls,
        )
        md_backend = backend_cls(in_doc=document, path_or_stream=md_fixture)
        assert md_backend.is_valid()

        actual_md = md_backend.convert().export_to_markdown()

        if GEN_TEST_DATA:
            # Regeneration mode: refresh the ground truth and skip comparison.
            expected_file.write_text(f"{actual_md}\n", encoding="utf-8")
            continue

        # NOTE(review): only the stored text is right-stripped; the fresh
        # export is compared as-is.
        expected_md = expected_file.read_text(encoding="utf-8").rstrip()
        assert expected_md == actual_md
|