feat(markdown): add formatting & improve inline support (#1804)

feat(markdown): support formatting & hyperlinks

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
Panos Vagenas 2025-06-18 15:57:57 +02:00 committed by GitHub
parent 215b540f6c
commit 861abcdcb0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 722 additions and 88 deletions

View File

@ -1,17 +1,15 @@
import logging import logging
import re import re
import warnings import warnings
from copy import deepcopy
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import List, Optional, Set, Union from typing import List, Optional, Set, Union
import marko import marko
import marko.element import marko.element
import marko.ext
import marko.ext.gfm
import marko.inline import marko.inline
from docling_core.types.doc import ( from docling_core.types.doc import (
DocItem,
DocItemLabel, DocItemLabel,
DoclingDocument, DoclingDocument,
DocumentOrigin, DocumentOrigin,
@ -21,7 +19,9 @@ from docling_core.types.doc import (
TableData, TableData,
TextItem, TextItem,
) )
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
from marko import Markdown from marko import Markdown
from pydantic import AnyUrl, TypeAdapter
from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend from docling.backend.html_backend import HTMLDocumentBackend
@ -71,7 +71,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.in_table = False self.in_table = False
self.md_table_buffer: list[str] = [] self.md_table_buffer: list[str] = []
self.inline_texts: list[str] = []
self._html_blocks: int = 0 self._html_blocks: int = 0
try: try:
@ -156,25 +155,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
doc.add_table(data=table_data) doc.add_table(data=table_data)
return return
def _process_inline_text(
self, parent_item: Optional[NodeItem], doc: DoclingDocument
):
txt = " ".join(self.inline_texts)
if len(txt) > 0:
doc.add_text(
label=DocItemLabel.PARAGRAPH,
parent=parent_item,
text=txt,
)
self.inline_texts = []
def _iterate_elements( # noqa: C901 def _iterate_elements( # noqa: C901
self, self,
*,
element: marko.element.Element, element: marko.element.Element,
depth: int, depth: int,
doc: DoclingDocument, doc: DoclingDocument,
visited: Set[marko.element.Element], visited: Set[marko.element.Element],
parent_item: Optional[NodeItem] = None, parent_item: Optional[NodeItem] = None,
formatting: Optional[Formatting] = None,
hyperlink: Optional[Union[AnyUrl, Path]] = None,
): ):
if element in visited: if element in visited:
return return
@ -183,43 +173,31 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# Check for different element types and process relevant details # Check for different element types and process relevant details
if isinstance(element, marko.block.Heading) and len(element.children) > 0: if isinstance(element, marko.block.Heading) and len(element.children) > 0:
self._close_table(doc) self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug( _log.debug(
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
) )
if element.level == 1:
doc_label = DocItemLabel.TITLE if len(element.children) == 1:
child = element.children[0]
snippet_text = str(child.children) # type: ignore
visited.add(child)
else: else:
doc_label = DocItemLabel.SECTION_HEADER snippet_text = "" # inline group will be created
# Header could have arbitrary inclusion of bold, italic or emphasis, if element.level == 1:
# hence we need to traverse the tree to get full text of a header parent_item = doc.add_title(
strings: List[str] = [] text=snippet_text,
parent=parent_item,
# Define a recursive function to traverse the tree formatting=formatting,
def traverse(node: marko.block.BlockElement): hyperlink=hyperlink,
# Check if the node has a "children" attribute )
if hasattr(node, "children"): else:
# If "children" is a list, continue traversal
if isinstance(node.children, list):
for child in node.children:
traverse(child)
# If "children" is text, add it to header text
elif isinstance(node.children, str):
strings.append(node.children)
traverse(element)
snippet_text = "".join(strings)
if len(snippet_text) > 0:
if doc_label == DocItemLabel.SECTION_HEADER:
parent_item = doc.add_heading( parent_item = doc.add_heading(
text=snippet_text, text=snippet_text,
level=element.level - 1, level=element.level - 1,
parent=parent_item, parent=parent_item,
) formatting=formatting,
else: hyperlink=hyperlink,
parent_item = doc.add_text(
label=doc_label, parent=parent_item, text=snippet_text
) )
elif isinstance(element, marko.block.List): elif isinstance(element, marko.block.List):
@ -230,7 +208,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
break break
self._close_table(doc) self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}") _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
if has_non_empty_list_items: if has_non_empty_list_items:
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
@ -240,41 +217,60 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
elif ( elif (
isinstance(element, marko.block.ListItem) isinstance(element, marko.block.ListItem)
and len(element.children) > 0 and len(element.children) == 1
and isinstance((first_child := element.children[0]), marko.block.Paragraph) and isinstance((child := element.children[0]), marko.block.Paragraph)
and len(child.children) > 0
): ):
self._close_table(doc) self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(" - List item") _log.debug(" - List item")
snippet_text = str(first_child.children[0].children) # type: ignore if len(child.children) == 1:
is_numbered = False snippet_text = str(child.children[0].children) # type: ignore
if ( visited.add(child)
parent_item is not None else:
and isinstance(parent_item, DocItem) snippet_text = "" # inline group will be created
and parent_item.label == GroupLabel.ORDERED_LIST is_numbered = isinstance(parent_item, OrderedList)
): if not isinstance(parent_item, (OrderedList, UnorderedList)):
is_numbered = True _log.warning("ListItem would have not had a list parent, adding one.")
doc.add_list_item( parent_item = doc.add_unordered_list(parent=parent_item)
enumerated=is_numbered, parent=parent_item, text=snippet_text parent_item = doc.add_list_item(
enumerated=is_numbered,
parent=parent_item,
text=snippet_text,
formatting=formatting,
hyperlink=hyperlink,
) )
visited.add(first_child)
elif isinstance(element, marko.inline.Image): elif isinstance(element, marko.inline.Image):
self._close_table(doc) self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}") _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
fig_caption: Optional[TextItem] = None fig_caption: Optional[TextItem] = None
if element.title is not None and element.title != "": if element.title is not None and element.title != "":
fig_caption = doc.add_text( fig_caption = doc.add_text(
label=DocItemLabel.CAPTION, text=element.title label=DocItemLabel.CAPTION,
text=element.title,
formatting=formatting,
hyperlink=hyperlink,
) )
doc.add_picture(parent=parent_item, caption=fig_caption) doc.add_picture(parent=parent_item, caption=fig_caption)
elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0: elif isinstance(element, marko.inline.Emphasis):
self._process_inline_text(parent_item, doc) _log.debug(f" - Emphasis: {element.children}")
formatting = deepcopy(formatting) if formatting else Formatting()
formatting.italic = True
elif isinstance(element, marko.inline.StrongEmphasis):
_log.debug(f" - StrongEmphasis: {element.children}")
formatting = deepcopy(formatting) if formatting else Formatting()
formatting.bold = True
elif isinstance(element, marko.inline.Link):
_log.debug(f" - Link: {element.children}")
hyperlink = TypeAdapter(Optional[Union[AnyUrl, Path]]).validate_python(
element.dest
)
elif isinstance(element, marko.inline.RawText): elif isinstance(element, marko.inline.RawText):
_log.debug(f" - Paragraph (raw text): {element.children}") _log.debug(f" - Paragraph (raw text): {element.children}")
@ -287,28 +283,41 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
else: else:
self.md_table_buffer.append(snippet_text) self.md_table_buffer.append(snippet_text)
else: elif snippet_text:
self._close_table(doc) self._close_table(doc)
# most likely just inline text doc.add_text(
self.inline_texts.append(str(element.children)) label=DocItemLabel.TEXT,
parent=parent_item,
text=snippet_text,
formatting=formatting,
hyperlink=hyperlink,
)
elif isinstance(element, marko.inline.CodeSpan): elif isinstance(element, marko.inline.CodeSpan):
self._close_table(doc) self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - Code Span: {element.children}") _log.debug(f" - Code Span: {element.children}")
snippet_text = str(element.children).strip() snippet_text = str(element.children).strip()
doc.add_code(parent=parent_item, text=snippet_text) doc.add_code(
parent=parent_item,
text=snippet_text,
formatting=formatting,
hyperlink=hyperlink,
)
elif ( elif (
isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode)) isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
and len(element.children) > 0 and len(element.children) > 0
and isinstance((first_child := element.children[0]), marko.inline.RawText) and isinstance((child := element.children[0]), marko.inline.RawText)
and len(snippet_text := (first_child.children.strip())) > 0 and len(snippet_text := (child.children.strip())) > 0
): ):
self._close_table(doc) self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - Code Block: {element.children}") _log.debug(f" - Code Block: {element.children}")
doc.add_code(parent=parent_item, text=snippet_text) doc.add_code(
parent=parent_item,
text=snippet_text,
formatting=formatting,
hyperlink=hyperlink,
)
elif isinstance(element, marko.inline.LineBreak): elif isinstance(element, marko.inline.LineBreak):
if self.in_table: if self.in_table:
@ -317,7 +326,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
elif isinstance(element, marko.block.HTMLBlock): elif isinstance(element, marko.block.HTMLBlock):
self._html_blocks += 1 self._html_blocks += 1
self._process_inline_text(parent_item, doc)
self._close_table(doc) self._close_table(doc)
_log.debug(f"HTML Block: {element}") _log.debug(f"HTML Block: {element}")
if ( if (
@ -327,14 +335,25 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# wrap in markers to enable post-processing in convert() # wrap in markers to enable post-processing in convert()
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}" text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
doc.add_code(parent=parent_item, text=text_to_add) doc.add_code(
parent=parent_item,
text=text_to_add,
formatting=formatting,
hyperlink=hyperlink,
)
else: else:
if not isinstance(element, str): if not isinstance(element, str):
self._close_table(doc) self._close_table(doc)
_log.debug(f"Some other element: {element}") _log.debug(f"Some other element: {element}")
if (
isinstance(element, (marko.block.Paragraph, marko.block.Heading))
and len(element.children) > 1
):
parent_item = doc.add_inline_group(parent=parent_item)
processed_block_types = ( processed_block_types = (
marko.block.Heading, # marko.block.Heading,
marko.block.CodeBlock, marko.block.CodeBlock,
marko.block.FencedCode, marko.block.FencedCode,
marko.inline.RawText, marko.inline.RawText,
@ -351,6 +370,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
doc=doc, doc=doc,
visited=visited, visited=visited,
parent_item=parent_item, parent_item=parent_item,
formatting=formatting,
hyperlink=hyperlink,
) )
def is_valid(self) -> bool: def is_valid(self) -> bool:
@ -392,7 +413,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
parent_item=None, parent_item=None,
visited=set(), visited=set(),
) )
self._process_inline_text(None, doc) # handle last hanging inline text
self._close_table(doc=doc) # handle any last hanging table self._close_table(doc=doc) # handle any last hanging table
# if HTML blocks were detected, export to HTML and delegate to HTML backend # if HTML blocks were detected, export to HTML and delegate to HTML backend

View File

@ -0,0 +1,20 @@
# Contribution guideline example
This is simple.
Foo *emphasis* **strong emphasis** ***both*** .
Create your feature branch: `git checkout -b feature/AmazingFeature` .
1. Pull the [**repository**](https://github.com/docling-project/docling) .
2. Create your feature branch ( `git checkout -b feature/AmazingFeature` )
3. Commit your changes ( `git commit -m 'Add some AmazingFeature'` )
4. Push to the branch ( `git push origin feature/AmazingFeature` )
5. Open a Pull Request
##
*Second* section
- **First** : Lorem ipsum.
- **Second** : Dolor `sit` amet.

View File

@ -0,0 +1,565 @@
body:
children:
- $ref: '#/texts/0'
- $ref: '#/texts/1'
- $ref: '#/groups/0'
- $ref: '#/groups/1'
- $ref: '#/groups/2'
- $ref: '#/texts/27'
- $ref: '#/groups/8'
content_layer: body
label: unspecified
name: _root_
self_ref: '#/body'
form_items: []
furniture:
children: []
content_layer: furniture
label: unspecified
name: _root_
self_ref: '#/furniture'
groups:
- children:
- $ref: '#/texts/2'
- $ref: '#/texts/3'
- $ref: '#/texts/4'
- $ref: '#/texts/5'
- $ref: '#/texts/6'
content_layer: body
label: inline
name: group
parent:
$ref: '#/body'
self_ref: '#/groups/0'
- children:
- $ref: '#/texts/7'
- $ref: '#/texts/8'
- $ref: '#/texts/9'
content_layer: body
label: inline
name: group
parent:
$ref: '#/body'
self_ref: '#/groups/1'
- children:
- $ref: '#/texts/10'
- $ref: '#/texts/14'
- $ref: '#/texts/18'
- $ref: '#/texts/22'
- $ref: '#/texts/26'
content_layer: body
label: ordered_list
name: list
parent:
$ref: '#/body'
self_ref: '#/groups/2'
- children:
- $ref: '#/texts/11'
- $ref: '#/texts/12'
- $ref: '#/texts/13'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/10'
self_ref: '#/groups/3'
- children:
- $ref: '#/texts/15'
- $ref: '#/texts/16'
- $ref: '#/texts/17'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/14'
self_ref: '#/groups/4'
- children:
- $ref: '#/texts/19'
- $ref: '#/texts/20'
- $ref: '#/texts/21'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/18'
self_ref: '#/groups/5'
- children:
- $ref: '#/texts/23'
- $ref: '#/texts/24'
- $ref: '#/texts/25'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/22'
self_ref: '#/groups/6'
- children:
- $ref: '#/texts/28'
- $ref: '#/texts/29'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/27'
self_ref: '#/groups/7'
- children:
- $ref: '#/texts/30'
- $ref: '#/texts/33'
content_layer: body
label: list
name: list
parent:
$ref: '#/body'
self_ref: '#/groups/8'
- children:
- $ref: '#/texts/31'
- $ref: '#/texts/32'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/30'
self_ref: '#/groups/9'
- children:
- $ref: '#/texts/34'
- $ref: '#/texts/35'
- $ref: '#/texts/36'
- $ref: '#/texts/37'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/33'
self_ref: '#/groups/10'
key_value_items: []
name: inline_and_formatting
origin:
binary_hash: 9342273634728023910
filename: inline_and_formatting.md
mimetype: text/markdown
pages: {}
pictures: []
schema_name: DoclingDocument
tables: []
texts:
- children: []
content_layer: body
label: title
orig: Contribution guideline example
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/0'
text: Contribution guideline example
- children: []
content_layer: body
label: text
orig: This is simple.
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/1'
text: This is simple.
- children: []
content_layer: body
label: text
orig: Foo
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/2'
text: Foo
- children: []
content_layer: body
formatting:
bold: false
italic: true
strikethrough: false
underline: false
label: text
orig: emphasis
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/3'
text: emphasis
- children: []
content_layer: body
formatting:
bold: true
italic: false
strikethrough: false
underline: false
label: text
orig: strong emphasis
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/4'
text: strong emphasis
- children: []
content_layer: body
formatting:
bold: true
italic: true
strikethrough: false
underline: false
label: text
orig: both
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/5'
text: both
- children: []
content_layer: body
label: text
orig: .
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/6'
text: .
- children: []
content_layer: body
label: text
orig: 'Create your feature branch:'
parent:
$ref: '#/groups/1'
prov: []
self_ref: '#/texts/7'
text: 'Create your feature branch:'
- captions: []
children: []
code_language: unknown
content_layer: body
footnotes: []
label: code
orig: git checkout -b feature/AmazingFeature
parent:
$ref: '#/groups/1'
prov: []
references: []
self_ref: '#/texts/8'
text: git checkout -b feature/AmazingFeature
- children: []
content_layer: body
label: text
orig: .
parent:
$ref: '#/groups/1'
prov: []
self_ref: '#/texts/9'
text: .
- children:
- $ref: '#/groups/3'
content_layer: body
enumerated: true
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/10'
text: ''
- children: []
content_layer: body
label: text
orig: Pull the
parent:
$ref: '#/groups/3'
prov: []
self_ref: '#/texts/11'
text: Pull the
- children: []
content_layer: body
formatting:
bold: true
italic: false
strikethrough: false
underline: false
hyperlink: https://github.com/docling-project/docling
label: text
orig: repository
parent:
$ref: '#/groups/3'
prov: []
self_ref: '#/texts/12'
text: repository
- children: []
content_layer: body
label: text
orig: .
parent:
$ref: '#/groups/3'
prov: []
self_ref: '#/texts/13'
text: .
- children:
- $ref: '#/groups/4'
content_layer: body
enumerated: true
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/14'
text: ''
- children: []
content_layer: body
label: text
orig: Create your feature branch (
parent:
$ref: '#/groups/4'
prov: []
self_ref: '#/texts/15'
text: Create your feature branch (
- captions: []
children: []
code_language: unknown
content_layer: body
footnotes: []
label: code
orig: git checkout -b feature/AmazingFeature
parent:
$ref: '#/groups/4'
prov: []
references: []
self_ref: '#/texts/16'
text: git checkout -b feature/AmazingFeature
- children: []
content_layer: body
label: text
orig: )
parent:
$ref: '#/groups/4'
prov: []
self_ref: '#/texts/17'
text: )
- children:
- $ref: '#/groups/5'
content_layer: body
enumerated: true
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/18'
text: ''
- children: []
content_layer: body
label: text
orig: Commit your changes (
parent:
$ref: '#/groups/5'
prov: []
self_ref: '#/texts/19'
text: Commit your changes (
- captions: []
children: []
code_language: unknown
content_layer: body
footnotes: []
label: code
orig: git commit -m 'Add some AmazingFeature'
parent:
$ref: '#/groups/5'
prov: []
references: []
self_ref: '#/texts/20'
text: git commit -m 'Add some AmazingFeature'
- children: []
content_layer: body
label: text
orig: )
parent:
$ref: '#/groups/5'
prov: []
self_ref: '#/texts/21'
text: )
- children:
- $ref: '#/groups/6'
content_layer: body
enumerated: true
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/22'
text: ''
- children: []
content_layer: body
label: text
orig: Push to the branch (
parent:
$ref: '#/groups/6'
prov: []
self_ref: '#/texts/23'
text: Push to the branch (
- captions: []
children: []
code_language: unknown
content_layer: body
footnotes: []
label: code
orig: git push origin feature/AmazingFeature
parent:
$ref: '#/groups/6'
prov: []
references: []
self_ref: '#/texts/24'
text: git push origin feature/AmazingFeature
- children: []
content_layer: body
label: text
orig: )
parent:
$ref: '#/groups/6'
prov: []
self_ref: '#/texts/25'
text: )
- children: []
content_layer: body
enumerated: true
label: list_item
marker: '-'
orig: Open a Pull Request
parent:
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/26'
text: Open a Pull Request
- children:
- $ref: '#/groups/7'
content_layer: body
label: section_header
level: 1
orig: ''
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/27'
text: ''
- children: []
content_layer: body
formatting:
bold: false
italic: true
strikethrough: false
underline: false
label: text
orig: Second
parent:
$ref: '#/groups/7'
prov: []
self_ref: '#/texts/28'
text: Second
- children: []
content_layer: body
label: text
orig: section
parent:
$ref: '#/groups/7'
prov: []
self_ref: '#/texts/29'
text: section
- children:
- $ref: '#/groups/9'
content_layer: body
enumerated: false
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/8'
prov: []
self_ref: '#/texts/30'
text: ''
- children: []
content_layer: body
formatting:
bold: true
italic: false
strikethrough: false
underline: false
label: text
orig: First
parent:
$ref: '#/groups/9'
prov: []
self_ref: '#/texts/31'
text: First
- children: []
content_layer: body
label: text
orig: ': Lorem ipsum.'
parent:
$ref: '#/groups/9'
prov: []
self_ref: '#/texts/32'
text: ': Lorem ipsum.'
- children:
- $ref: '#/groups/10'
content_layer: body
enumerated: false
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/8'
prov: []
self_ref: '#/texts/33'
text: ''
- children: []
content_layer: body
formatting:
bold: true
italic: false
strikethrough: false
underline: false
label: text
orig: Second
parent:
$ref: '#/groups/10'
prov: []
self_ref: '#/texts/34'
text: Second
- children: []
content_layer: body
label: text
orig: ': Dolor'
parent:
$ref: '#/groups/10'
prov: []
self_ref: '#/texts/35'
text: ': Dolor'
- captions: []
children: []
code_language: unknown
content_layer: body
footnotes: []
label: code
orig: sit
parent:
$ref: '#/groups/10'
prov: []
references: []
self_ref: '#/texts/36'
text: sit
- children: []
content_layer: body
label: text
orig: amet.
parent:
$ref: '#/groups/10'
prov: []
self_ref: '#/texts/37'
text: amet.
version: 1.3.0

18
tests/data/md/inline_and_formatting.md vendored Normal file
View File

@ -0,0 +1,18 @@
# Contribution guideline example
This is simple.
Foo *emphasis* **strong emphasis** ***both***.
Create your feature branch: `git checkout -b feature/AmazingFeature`.
1. Pull the [**repository**](https://github.com/docling-project/docling).
2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
4. Push to the branch (`git push origin feature/AmazingFeature`)
5. Open a Pull Request
## *Second* section <!-- inline groups in headings not yet supported by serializers -->
- **First**: Lorem ipsum.
- **Second**: Dolor `sit` amet.

View File

@ -2,7 +2,7 @@ from pathlib import Path
from docling.backend.md_backend import MarkdownDocumentBackend from docling.backend.md_backend import MarkdownDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument from docling.datamodel.document import DoclingDocument, InputDocument
from .test_data_gen_flag import GEN_TEST_DATA from .test_data_gen_flag import GEN_TEST_DATA
@ -11,12 +11,15 @@ def test_convert_valid():
fmt = InputFormat.MD fmt = InputFormat.MD
cls = MarkdownDocumentBackend cls = MarkdownDocumentBackend
test_data_path = Path("tests") / "data" root_path = Path("tests") / "data"
relevant_paths = sorted((test_data_path / "md").rglob("*.md")) relevant_paths = sorted((root_path / "md").rglob("*.md"))
assert len(relevant_paths) > 0 assert len(relevant_paths) > 0
yaml_filter = ["inline_and_formatting"]
for in_path in relevant_paths: for in_path in relevant_paths:
gt_path = test_data_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md" md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
yaml_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.yaml"
in_doc = InputDocument( in_doc = InputDocument(
path_or_stream=in_path, path_or_stream=in_path,
@ -33,9 +36,17 @@ def test_convert_valid():
act_data = act_doc.export_to_markdown() act_data = act_doc.export_to_markdown()
if GEN_TEST_DATA: if GEN_TEST_DATA:
with open(gt_path, mode="w", encoding="utf-8") as f: with open(md_gt_path, mode="w", encoding="utf-8") as f:
f.write(f"{act_data}\n") f.write(f"{act_data}\n")
if in_path.stem in yaml_filter:
with open(yaml_gt_path, mode="w", encoding="utf-8") as f:
act_doc.save_as_yaml(yaml_gt_path)
else: else:
with open(gt_path, encoding="utf-8") as f: with open(md_gt_path, encoding="utf-8") as f:
exp_data = f.read().rstrip() exp_data = f.read().rstrip()
assert exp_data == act_data assert act_data == exp_data
if in_path.stem in yaml_filter:
exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path)
assert act_doc == exp_doc