feat(markdown): add formatting & improve inline support (#1804)

feat(markdown): support formatting & hyperlinks

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
Panos Vagenas 2025-06-18 15:57:57 +02:00 committed by GitHub
parent 215b540f6c
commit 861abcdcb0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 722 additions and 88 deletions

View File

@ -1,17 +1,15 @@
import logging
import re
import warnings
from copy import deepcopy
from io import BytesIO
from pathlib import Path
from typing import List, Optional, Set, Union
import marko
import marko.element
import marko.ext
import marko.ext.gfm
import marko.inline
from docling_core.types.doc import (
DocItem,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
@ -21,7 +19,9 @@ from docling_core.types.doc import (
TableData,
TextItem,
)
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
from marko import Markdown
from pydantic import AnyUrl, TypeAdapter
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
@ -71,7 +71,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.in_table = False
self.md_table_buffer: list[str] = []
self.inline_texts: list[str] = []
self._html_blocks: int = 0
try:
@ -156,25 +155,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
doc.add_table(data=table_data)
return
def _process_inline_text(
    self, parent_item: Optional[NodeItem], doc: DoclingDocument
):
    """Flush the buffered inline text fragments into ``doc``.

    The strings collected in ``self.inline_texts`` are joined with single
    spaces. When the joined result is non-empty it is added to ``doc`` as one
    text item labelled ``DocItemLabel.PARAGRAPH`` under ``parent_item``.
    The buffer is replaced by a fresh empty list in every case.
    """
    paragraph = " ".join(self.inline_texts)
    if paragraph:
        doc.add_text(
            text=paragraph,
            parent=parent_item,
            label=DocItemLabel.PARAGRAPH,
        )
    # Rebind (rather than clear in place) so the buffer starts over whether or
    # not anything was emitted.
    self.inline_texts = []
def _iterate_elements( # noqa: C901
self,
*,
element: marko.element.Element,
depth: int,
doc: DoclingDocument,
visited: Set[marko.element.Element],
parent_item: Optional[NodeItem] = None,
formatting: Optional[Formatting] = None,
hyperlink: Optional[Union[AnyUrl, Path]] = None,
):
if element in visited:
return
@ -183,44 +173,32 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# Check for different element types and process relevant details
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
)
if element.level == 1:
doc_label = DocItemLabel.TITLE
if len(element.children) == 1:
child = element.children[0]
snippet_text = str(child.children) # type: ignore
visited.add(child)
else:
doc_label = DocItemLabel.SECTION_HEADER
snippet_text = "" # inline group will be created
# Header could have arbitrary inclusion of bold, italic or emphasis,
# hence we need to traverse the tree to get full text of a header
strings: List[str] = []
# Define a recursive function to traverse the tree
def traverse(node: marko.block.BlockElement):
# Check if the node has a "children" attribute
if hasattr(node, "children"):
# If "children" is a list, continue traversal
if isinstance(node.children, list):
for child in node.children:
traverse(child)
# If "children" is text, add it to header text
elif isinstance(node.children, str):
strings.append(node.children)
traverse(element)
snippet_text = "".join(strings)
if len(snippet_text) > 0:
if doc_label == DocItemLabel.SECTION_HEADER:
parent_item = doc.add_heading(
text=snippet_text,
level=element.level - 1,
parent=parent_item,
)
else:
parent_item = doc.add_text(
label=doc_label, parent=parent_item, text=snippet_text
)
if element.level == 1:
parent_item = doc.add_title(
text=snippet_text,
parent=parent_item,
formatting=formatting,
hyperlink=hyperlink,
)
else:
parent_item = doc.add_heading(
text=snippet_text,
level=element.level - 1,
parent=parent_item,
formatting=formatting,
hyperlink=hyperlink,
)
elif isinstance(element, marko.block.List):
has_non_empty_list_items = False
@ -230,7 +208,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
break
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
if has_non_empty_list_items:
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
@ -240,41 +217,60 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
elif (
isinstance(element, marko.block.ListItem)
and len(element.children) > 0
and isinstance((first_child := element.children[0]), marko.block.Paragraph)
and len(element.children) == 1
and isinstance((child := element.children[0]), marko.block.Paragraph)
and len(child.children) > 0
):
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(" - List item")
snippet_text = str(first_child.children[0].children) # type: ignore
is_numbered = False
if (
parent_item is not None
and isinstance(parent_item, DocItem)
and parent_item.label == GroupLabel.ORDERED_LIST
):
is_numbered = True
doc.add_list_item(
enumerated=is_numbered, parent=parent_item, text=snippet_text
if len(child.children) == 1:
snippet_text = str(child.children[0].children) # type: ignore
visited.add(child)
else:
snippet_text = "" # inline group will be created
is_numbered = isinstance(parent_item, OrderedList)
if not isinstance(parent_item, (OrderedList, UnorderedList)):
_log.warning("ListItem would have not had a list parent, adding one.")
parent_item = doc.add_unordered_list(parent=parent_item)
parent_item = doc.add_list_item(
enumerated=is_numbered,
parent=parent_item,
text=snippet_text,
formatting=formatting,
hyperlink=hyperlink,
)
visited.add(first_child)
elif isinstance(element, marko.inline.Image):
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
fig_caption: Optional[TextItem] = None
if element.title is not None and element.title != "":
fig_caption = doc.add_text(
label=DocItemLabel.CAPTION, text=element.title
label=DocItemLabel.CAPTION,
text=element.title,
formatting=formatting,
hyperlink=hyperlink,
)
doc.add_picture(parent=parent_item, caption=fig_caption)
elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
self._process_inline_text(parent_item, doc)
elif isinstance(element, marko.inline.Emphasis):
_log.debug(f" - Emphasis: {element.children}")
formatting = deepcopy(formatting) if formatting else Formatting()
formatting.italic = True
elif isinstance(element, marko.inline.StrongEmphasis):
_log.debug(f" - StrongEmphasis: {element.children}")
formatting = deepcopy(formatting) if formatting else Formatting()
formatting.bold = True
elif isinstance(element, marko.inline.Link):
_log.debug(f" - Link: {element.children}")
hyperlink = TypeAdapter(Optional[Union[AnyUrl, Path]]).validate_python(
element.dest
)
elif isinstance(element, marko.inline.RawText):
_log.debug(f" - Paragraph (raw text): {element.children}")
@ -287,28 +283,41 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
else:
self.md_table_buffer.append(snippet_text)
else:
elif snippet_text:
self._close_table(doc)
# most likely just inline text
self.inline_texts.append(str(element.children))
doc.add_text(
label=DocItemLabel.TEXT,
parent=parent_item,
text=snippet_text,
formatting=formatting,
hyperlink=hyperlink,
)
elif isinstance(element, marko.inline.CodeSpan):
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - Code Span: {element.children}")
snippet_text = str(element.children).strip()
doc.add_code(parent=parent_item, text=snippet_text)
doc.add_code(
parent=parent_item,
text=snippet_text,
formatting=formatting,
hyperlink=hyperlink,
)
elif (
isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
and len(element.children) > 0
and isinstance((first_child := element.children[0]), marko.inline.RawText)
and len(snippet_text := (first_child.children.strip())) > 0
and isinstance((child := element.children[0]), marko.inline.RawText)
and len(snippet_text := (child.children.strip())) > 0
):
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - Code Block: {element.children}")
doc.add_code(parent=parent_item, text=snippet_text)
doc.add_code(
parent=parent_item,
text=snippet_text,
formatting=formatting,
hyperlink=hyperlink,
)
elif isinstance(element, marko.inline.LineBreak):
if self.in_table:
@ -317,7 +326,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
elif isinstance(element, marko.block.HTMLBlock):
self._html_blocks += 1
self._process_inline_text(parent_item, doc)
self._close_table(doc)
_log.debug(f"HTML Block: {element}")
if (
@ -327,14 +335,25 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# wrap in markers to enable post-processing in convert()
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
doc.add_code(parent=parent_item, text=text_to_add)
doc.add_code(
parent=parent_item,
text=text_to_add,
formatting=formatting,
hyperlink=hyperlink,
)
else:
if not isinstance(element, str):
self._close_table(doc)
_log.debug(f"Some other element: {element}")
if (
isinstance(element, (marko.block.Paragraph, marko.block.Heading))
and len(element.children) > 1
):
parent_item = doc.add_inline_group(parent=parent_item)
processed_block_types = (
marko.block.Heading,
# marko.block.Heading,
marko.block.CodeBlock,
marko.block.FencedCode,
marko.inline.RawText,
@ -351,6 +370,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
doc=doc,
visited=visited,
parent_item=parent_item,
formatting=formatting,
hyperlink=hyperlink,
)
def is_valid(self) -> bool:
@ -392,7 +413,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
parent_item=None,
visited=set(),
)
self._process_inline_text(None, doc) # handle last hanging inline text
self._close_table(doc=doc) # handle any last hanging table
# if HTML blocks were detected, export to HTML and delegate to HTML backend

View File

@ -0,0 +1,20 @@
# Contribution guideline example
This is simple.
Foo *emphasis* **strong emphasis** ***both*** .
Create your feature branch: `git checkout -b feature/AmazingFeature` .
1. Pull the [**repository**](https://github.com/docling-project/docling) .
2. Create your feature branch ( `git checkout -b feature/AmazingFeature` )
3. Commit your changes ( `git commit -m 'Add some AmazingFeature'` )
4. Push to the branch ( `git push origin feature/AmazingFeature` )
5. Open a Pull Request
##
*Second* section
- **First** : Lorem ipsum.
- **Second** : Dolor `sit` amet.

View File

@ -0,0 +1,565 @@
body:
children:
- $ref: '#/texts/0'
- $ref: '#/texts/1'
- $ref: '#/groups/0'
- $ref: '#/groups/1'
- $ref: '#/groups/2'
- $ref: '#/texts/27'
- $ref: '#/groups/8'
content_layer: body
label: unspecified
name: _root_
self_ref: '#/body'
form_items: []
furniture:
children: []
content_layer: furniture
label: unspecified
name: _root_
self_ref: '#/furniture'
groups:
- children:
- $ref: '#/texts/2'
- $ref: '#/texts/3'
- $ref: '#/texts/4'
- $ref: '#/texts/5'
- $ref: '#/texts/6'
content_layer: body
label: inline
name: group
parent:
$ref: '#/body'
self_ref: '#/groups/0'
- children:
- $ref: '#/texts/7'
- $ref: '#/texts/8'
- $ref: '#/texts/9'
content_layer: body
label: inline
name: group
parent:
$ref: '#/body'
self_ref: '#/groups/1'
- children:
- $ref: '#/texts/10'
- $ref: '#/texts/14'
- $ref: '#/texts/18'
- $ref: '#/texts/22'
- $ref: '#/texts/26'
content_layer: body
label: ordered_list
name: list
parent:
$ref: '#/body'
self_ref: '#/groups/2'
- children:
- $ref: '#/texts/11'
- $ref: '#/texts/12'
- $ref: '#/texts/13'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/10'
self_ref: '#/groups/3'
- children:
- $ref: '#/texts/15'
- $ref: '#/texts/16'
- $ref: '#/texts/17'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/14'
self_ref: '#/groups/4'
- children:
- $ref: '#/texts/19'
- $ref: '#/texts/20'
- $ref: '#/texts/21'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/18'
self_ref: '#/groups/5'
- children:
- $ref: '#/texts/23'
- $ref: '#/texts/24'
- $ref: '#/texts/25'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/22'
self_ref: '#/groups/6'
- children:
- $ref: '#/texts/28'
- $ref: '#/texts/29'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/27'
self_ref: '#/groups/7'
- children:
- $ref: '#/texts/30'
- $ref: '#/texts/33'
content_layer: body
label: list
name: list
parent:
$ref: '#/body'
self_ref: '#/groups/8'
- children:
- $ref: '#/texts/31'
- $ref: '#/texts/32'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/30'
self_ref: '#/groups/9'
- children:
- $ref: '#/texts/34'
- $ref: '#/texts/35'
- $ref: '#/texts/36'
- $ref: '#/texts/37'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/33'
self_ref: '#/groups/10'
key_value_items: []
name: inline_and_formatting
origin:
binary_hash: 9342273634728023910
filename: inline_and_formatting.md
mimetype: text/markdown
pages: {}
pictures: []
schema_name: DoclingDocument
tables: []
texts:
- children: []
content_layer: body
label: title
orig: Contribution guideline example
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/0'
text: Contribution guideline example
- children: []
content_layer: body
label: text
orig: This is simple.
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/1'
text: This is simple.
- children: []
content_layer: body
label: text
orig: Foo
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/2'
text: Foo
- children: []
content_layer: body
formatting:
bold: false
italic: true
strikethrough: false
underline: false
label: text
orig: emphasis
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/3'
text: emphasis
- children: []
content_layer: body
formatting:
bold: true
italic: false
strikethrough: false
underline: false
label: text
orig: strong emphasis
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/4'
text: strong emphasis
- children: []
content_layer: body
formatting:
bold: true
italic: true
strikethrough: false
underline: false
label: text
orig: both
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/5'
text: both
- children: []
content_layer: body
label: text
orig: .
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/6'
text: .
- children: []
content_layer: body
label: text
orig: 'Create your feature branch:'
parent:
$ref: '#/groups/1'
prov: []
self_ref: '#/texts/7'
text: 'Create your feature branch:'
- captions: []
children: []
code_language: unknown
content_layer: body
footnotes: []
label: code
orig: git checkout -b feature/AmazingFeature
parent:
$ref: '#/groups/1'
prov: []
references: []
self_ref: '#/texts/8'
text: git checkout -b feature/AmazingFeature
- children: []
content_layer: body
label: text
orig: .
parent:
$ref: '#/groups/1'
prov: []
self_ref: '#/texts/9'
text: .
- children:
- $ref: '#/groups/3'
content_layer: body
enumerated: true
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/10'
text: ''
- children: []
content_layer: body
label: text
orig: Pull the
parent:
$ref: '#/groups/3'
prov: []
self_ref: '#/texts/11'
text: Pull the
- children: []
content_layer: body
formatting:
bold: true
italic: false
strikethrough: false
underline: false
hyperlink: https://github.com/docling-project/docling
label: text
orig: repository
parent:
$ref: '#/groups/3'
prov: []
self_ref: '#/texts/12'
text: repository
- children: []
content_layer: body
label: text
orig: .
parent:
$ref: '#/groups/3'
prov: []
self_ref: '#/texts/13'
text: .
- children:
- $ref: '#/groups/4'
content_layer: body
enumerated: true
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/14'
text: ''
- children: []
content_layer: body
label: text
orig: Create your feature branch (
parent:
$ref: '#/groups/4'
prov: []
self_ref: '#/texts/15'
text: Create your feature branch (
- captions: []
children: []
code_language: unknown
content_layer: body
footnotes: []
label: code
orig: git checkout -b feature/AmazingFeature
parent:
$ref: '#/groups/4'
prov: []
references: []
self_ref: '#/texts/16'
text: git checkout -b feature/AmazingFeature
- children: []
content_layer: body
label: text
orig: )
parent:
$ref: '#/groups/4'
prov: []
self_ref: '#/texts/17'
text: )
- children:
- $ref: '#/groups/5'
content_layer: body
enumerated: true
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/18'
text: ''
- children: []
content_layer: body
label: text
orig: Commit your changes (
parent:
$ref: '#/groups/5'
prov: []
self_ref: '#/texts/19'
text: Commit your changes (
- captions: []
children: []
code_language: unknown
content_layer: body
footnotes: []
label: code
orig: git commit -m 'Add some AmazingFeature'
parent:
$ref: '#/groups/5'
prov: []
references: []
self_ref: '#/texts/20'
text: git commit -m 'Add some AmazingFeature'
- children: []
content_layer: body
label: text
orig: )
parent:
$ref: '#/groups/5'
prov: []
self_ref: '#/texts/21'
text: )
- children:
- $ref: '#/groups/6'
content_layer: body
enumerated: true
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/22'
text: ''
- children: []
content_layer: body
label: text
orig: Push to the branch (
parent:
$ref: '#/groups/6'
prov: []
self_ref: '#/texts/23'
text: Push to the branch (
- captions: []
children: []
code_language: unknown
content_layer: body
footnotes: []
label: code
orig: git push origin feature/AmazingFeature
parent:
$ref: '#/groups/6'
prov: []
references: []
self_ref: '#/texts/24'
text: git push origin feature/AmazingFeature
- children: []
content_layer: body
label: text
orig: )
parent:
$ref: '#/groups/6'
prov: []
self_ref: '#/texts/25'
text: )
- children: []
content_layer: body
enumerated: true
label: list_item
marker: '-'
orig: Open a Pull Request
parent:
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/26'
text: Open a Pull Request
- children:
- $ref: '#/groups/7'
content_layer: body
label: section_header
level: 1
orig: ''
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/27'
text: ''
- children: []
content_layer: body
formatting:
bold: false
italic: true
strikethrough: false
underline: false
label: text
orig: Second
parent:
$ref: '#/groups/7'
prov: []
self_ref: '#/texts/28'
text: Second
- children: []
content_layer: body
label: text
orig: section
parent:
$ref: '#/groups/7'
prov: []
self_ref: '#/texts/29'
text: section
- children:
- $ref: '#/groups/9'
content_layer: body
enumerated: false
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/8'
prov: []
self_ref: '#/texts/30'
text: ''
- children: []
content_layer: body
formatting:
bold: true
italic: false
strikethrough: false
underline: false
label: text
orig: First
parent:
$ref: '#/groups/9'
prov: []
self_ref: '#/texts/31'
text: First
- children: []
content_layer: body
label: text
orig: ': Lorem ipsum.'
parent:
$ref: '#/groups/9'
prov: []
self_ref: '#/texts/32'
text: ': Lorem ipsum.'
- children:
- $ref: '#/groups/10'
content_layer: body
enumerated: false
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/8'
prov: []
self_ref: '#/texts/33'
text: ''
- children: []
content_layer: body
formatting:
bold: true
italic: false
strikethrough: false
underline: false
label: text
orig: Second
parent:
$ref: '#/groups/10'
prov: []
self_ref: '#/texts/34'
text: Second
- children: []
content_layer: body
label: text
orig: ': Dolor'
parent:
$ref: '#/groups/10'
prov: []
self_ref: '#/texts/35'
text: ': Dolor'
- captions: []
children: []
code_language: unknown
content_layer: body
footnotes: []
label: code
orig: sit
parent:
$ref: '#/groups/10'
prov: []
references: []
self_ref: '#/texts/36'
text: sit
- children: []
content_layer: body
label: text
orig: amet.
parent:
$ref: '#/groups/10'
prov: []
self_ref: '#/texts/37'
text: amet.
version: 1.3.0

18
tests/data/md/inline_and_formatting.md vendored Normal file
View File

@ -0,0 +1,18 @@
# Contribution guideline example
This is simple.
Foo *emphasis* **strong emphasis** ***both***.
Create your feature branch: `git checkout -b feature/AmazingFeature`.
1. Pull the [**repository**](https://github.com/docling-project/docling).
2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
4. Push to the branch (`git push origin feature/AmazingFeature`)
5. Open a Pull Request
## *Second* section <!-- inline groups in headings not yet supported by serializers -->
- **First**: Lorem ipsum.
- **Second**: Dolor `sit` amet.

View File

@ -2,7 +2,7 @@ from pathlib import Path
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
from docling.datamodel.document import DoclingDocument, InputDocument
from .test_data_gen_flag import GEN_TEST_DATA
@ -11,12 +11,15 @@ def test_convert_valid():
fmt = InputFormat.MD
cls = MarkdownDocumentBackend
test_data_path = Path("tests") / "data"
relevant_paths = sorted((test_data_path / "md").rglob("*.md"))
root_path = Path("tests") / "data"
relevant_paths = sorted((root_path / "md").rglob("*.md"))
assert len(relevant_paths) > 0
yaml_filter = ["inline_and_formatting"]
for in_path in relevant_paths:
gt_path = test_data_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
yaml_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.yaml"
in_doc = InputDocument(
path_or_stream=in_path,
@ -33,9 +36,17 @@ def test_convert_valid():
act_data = act_doc.export_to_markdown()
if GEN_TEST_DATA:
with open(gt_path, mode="w", encoding="utf-8") as f:
with open(md_gt_path, mode="w", encoding="utf-8") as f:
f.write(f"{act_data}\n")
if in_path.stem in yaml_filter:
with open(yaml_gt_path, mode="w", encoding="utf-8") as f:
act_doc.save_as_yaml(yaml_gt_path)
else:
with open(gt_path, encoding="utf-8") as f:
with open(md_gt_path, encoding="utf-8") as f:
exp_data = f.read().rstrip()
assert exp_data == act_data
assert act_data == exp_data
if in_path.stem in yaml_filter:
exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path)
assert act_doc == exp_doc