Mirror of https://github.com/docling-project/docling.git, synced 2025-06-27 05:20:05 +00:00

* added the content layer to the HTML backend
* updated the handle_image function
* reformatted the code of the HTML backend
* test(html): add more info if a test case fails
* refactor(html): put parsed items in the body if the document has no header. If an HTML document does not have any header tag, all parsed items are placed in the DoclingDocument's body content layer. HTML paragraphs ('p' tags) are parsed as text items with the paragraph label. Test ground truth is updated according to these changes.
* chore: set the TextItem label to 'text' instead of 'paragraph'

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
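The refactor described in the commit message changes how header-less HTML is handled: every parsed item goes to the DoclingDocument body content layer, and 'p' tags become text items whose label ends up as 'text'. Below is a minimal sketch of how that behavior could be exercised with the same backend API used in the test file that follows; the snippet, the check_headerless_paragraphs name, and the TextItem import path are illustrative assumptions, not part of this file.

from io import BytesIO

from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
from docling_core.types.doc import TextItem  # assumed import path for TextItem


def check_headerless_paragraphs():
    # HTML without any header tag: per the commit message, parsed items are
    # placed in the body content layer and each <p> becomes a text item
    # labeled "text" rather than "paragraph".
    html = b"<html><body><p>First paragraph.</p><p>Second paragraph.</p></body></html>"
    in_doc = InputDocument(
        path_or_stream=BytesIO(html),
        format=InputFormat.HTML,
        backend=HTMLDocumentBackend,
        filename="test",
    )
    backend = HTMLDocumentBackend(in_doc=in_doc, path_or_stream=BytesIO(html))
    doc = backend.convert()

    texts = [item for item, _ in doc.iterate_items() if isinstance(item, TextItem)]
    assert texts, "expected the paragraphs to be parsed as text items"
    # The label enum is string-based, so comparing against the plain string works.
    assert all(item.label == "text" for item in texts)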
144 lines
3.9 KiB
Python
from io import BytesIO
from pathlib import Path

from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import (
    ConversionResult,
    DoclingDocument,
    InputDocument,
    SectionHeaderItem,
)
from docling.document_converter import DocumentConverter

from .verify_utils import verify_document, verify_export

GENERATE = False


def test_heading_levels():
    in_path = Path("tests/data/html/wiki_duck.html")
    in_doc = InputDocument(
        path_or_stream=in_path,
        format=InputFormat.HTML,
        backend=HTMLDocumentBackend,
    )
    backend = HTMLDocumentBackend(
        in_doc=in_doc,
        path_or_stream=in_path,
    )
    doc = backend.convert()

    found_lvl_2 = found_lvl_3 = False
    for item, _ in doc.iterate_items():
        if isinstance(item, SectionHeaderItem):
            if item.text == "Etymology":
                found_lvl_2 = True
                assert item.level == 2
            elif item.text == "Feeding":
                found_lvl_3 = True
                assert item.level == 3
    assert found_lvl_2 and found_lvl_3


def test_ordered_lists():
    test_set: list[tuple[bytes, str]] = []

    test_set.append(
        (
            b"<html><body><ol><li>1st item</li><li>2nd item</li></ol></body></html>",
            "1. 1st item\n2. 2nd item",
        )
    )
    test_set.append(
        (
            b'<html><body><ol start="1"><li>1st item</li><li>2nd item</li></ol></body></html>',
            "1. 1st item\n2. 2nd item",
        )
    )
    test_set.append(
        (
            b'<html><body><ol start="2"><li>1st item</li><li>2nd item</li></ol></body></html>',
            "2. 1st item\n3. 2nd item",
        )
    )
    test_set.append(
        (
            b'<html><body><ol start="0"><li>1st item</li><li>2nd item</li></ol></body></html>',
            "0. 1st item\n1. 2nd item",
        )
    )
    test_set.append(
        (
            b'<html><body><ol start="-5"><li>1st item</li><li>2nd item</li></ol></body></html>',
            "1. 1st item\n2. 2nd item",
        )
    )
    test_set.append(
        (
            b'<html><body><ol start="foo"><li>1st item</li><li>2nd item</li></ol></body></html>',
            "1. 1st item\n2. 2nd item",
        )
    )

    for idx, pair in enumerate(test_set):
        in_doc = InputDocument(
            path_or_stream=BytesIO(pair[0]),
            format=InputFormat.HTML,
            backend=HTMLDocumentBackend,
            filename="test",
        )
        backend = HTMLDocumentBackend(
            in_doc=in_doc,
            path_or_stream=BytesIO(pair[0]),
        )
        doc: DoclingDocument = backend.convert()
        assert doc
        assert doc.export_to_markdown() == pair[1], f"Error in case {idx}"


def get_html_paths():

    # Define the directory you want to search
    directory = Path("./tests/data/html/")

    # List all HTML files in the directory and its subdirectories
    html_files = sorted(directory.rglob("*.html"))
    return html_files


def get_converter():

    converter = DocumentConverter(allowed_formats=[InputFormat.HTML])

    return converter


def test_e2e_html_conversions():

    html_paths = get_html_paths()
    converter = get_converter()

    for html_path in html_paths:
        # print(f"converting {html_path}")

        gt_path = (
            html_path.parent.parent / "groundtruth" / "docling_v2" / html_path.name
        )

        conv_result: ConversionResult = converter.convert(html_path)

        doc: DoclingDocument = conv_result.document

        pred_md: str = doc.export_to_markdown()
        assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"

        pred_itxt: str = doc._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        assert verify_export(
            pred_itxt, str(gt_path) + ".itxt"
        ), "export to indented-text"

        assert verify_document(doc, str(gt_path) + ".json", GENERATE)