docling/tests/test_backend_msword.py

31 lines
970 B
Python
Raw Normal View History

from pathlib import Path
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument, SectionHeaderItem
def test_heading_levels():
in_path = Path("tests/data/word_sample.docx")
in_doc = InputDocument(
path_or_stream=in_path,
format=InputFormat.DOCX,
backend=MsWordDocumentBackend,
)
backend = MsWordDocumentBackend(
in_doc=in_doc,
path_or_stream=in_path,
)
doc = backend.convert()
found_lvl_1 = found_lvl_2 = False
for item, _ in doc.iterate_items():
if isinstance(item, SectionHeaderItem):
if item.text == "Let\u2019s swim!":
found_lvl_1 = True
assert item.level == 1
elif item.text == "Let\u2019s eat":
found_lvl_2 = True
assert item.level == 2
assert found_lvl_1 and found_lvl_2