mirror of
https://github.com/docling-project/docling.git
synced 2025-06-27 05:20:05 +00:00
feat(markdown): add formatting & improve inline support (#1804)
feat(markdown): support formatting & hyperlinks Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
parent
215b540f6c
commit
861abcdcb0
@ -1,17 +1,15 @@
|
|||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import warnings
|
import warnings
|
||||||
|
from copy import deepcopy
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional, Set, Union
|
from typing import List, Optional, Set, Union
|
||||||
|
|
||||||
import marko
|
import marko
|
||||||
import marko.element
|
import marko.element
|
||||||
import marko.ext
|
|
||||||
import marko.ext.gfm
|
|
||||||
import marko.inline
|
import marko.inline
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
DocItem,
|
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
DocumentOrigin,
|
DocumentOrigin,
|
||||||
@ -21,7 +19,9 @@ from docling_core.types.doc import (
|
|||||||
TableData,
|
TableData,
|
||||||
TextItem,
|
TextItem,
|
||||||
)
|
)
|
||||||
|
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
|
||||||
from marko import Markdown
|
from marko import Markdown
|
||||||
|
from pydantic import AnyUrl, TypeAdapter
|
||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
from docling.backend.html_backend import HTMLDocumentBackend
|
from docling.backend.html_backend import HTMLDocumentBackend
|
||||||
@ -71,7 +71,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
self.in_table = False
|
self.in_table = False
|
||||||
self.md_table_buffer: list[str] = []
|
self.md_table_buffer: list[str] = []
|
||||||
self.inline_texts: list[str] = []
|
|
||||||
self._html_blocks: int = 0
|
self._html_blocks: int = 0
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -156,25 +155,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
doc.add_table(data=table_data)
|
doc.add_table(data=table_data)
|
||||||
return
|
return
|
||||||
|
|
||||||
def _process_inline_text(
|
|
||||||
self, parent_item: Optional[NodeItem], doc: DoclingDocument
|
|
||||||
):
|
|
||||||
txt = " ".join(self.inline_texts)
|
|
||||||
if len(txt) > 0:
|
|
||||||
doc.add_text(
|
|
||||||
label=DocItemLabel.PARAGRAPH,
|
|
||||||
parent=parent_item,
|
|
||||||
text=txt,
|
|
||||||
)
|
|
||||||
self.inline_texts = []
|
|
||||||
|
|
||||||
def _iterate_elements( # noqa: C901
|
def _iterate_elements( # noqa: C901
|
||||||
self,
|
self,
|
||||||
|
*,
|
||||||
element: marko.element.Element,
|
element: marko.element.Element,
|
||||||
depth: int,
|
depth: int,
|
||||||
doc: DoclingDocument,
|
doc: DoclingDocument,
|
||||||
visited: Set[marko.element.Element],
|
visited: Set[marko.element.Element],
|
||||||
parent_item: Optional[NodeItem] = None,
|
parent_item: Optional[NodeItem] = None,
|
||||||
|
formatting: Optional[Formatting] = None,
|
||||||
|
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
||||||
):
|
):
|
||||||
if element in visited:
|
if element in visited:
|
||||||
return
|
return
|
||||||
@ -183,43 +173,31 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
# Check for different element types and process relevant details
|
# Check for different element types and process relevant details
|
||||||
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
|
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
|
||||||
self._close_table(doc)
|
self._close_table(doc)
|
||||||
self._process_inline_text(parent_item, doc)
|
|
||||||
_log.debug(
|
_log.debug(
|
||||||
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
|
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
|
||||||
)
|
)
|
||||||
if element.level == 1:
|
|
||||||
doc_label = DocItemLabel.TITLE
|
if len(element.children) == 1:
|
||||||
|
child = element.children[0]
|
||||||
|
snippet_text = str(child.children) # type: ignore
|
||||||
|
visited.add(child)
|
||||||
else:
|
else:
|
||||||
doc_label = DocItemLabel.SECTION_HEADER
|
snippet_text = "" # inline group will be created
|
||||||
|
|
||||||
# Header could have arbitrary inclusion of bold, italic or emphasis,
|
if element.level == 1:
|
||||||
# hence we need to traverse the tree to get full text of a header
|
parent_item = doc.add_title(
|
||||||
strings: List[str] = []
|
text=snippet_text,
|
||||||
|
parent=parent_item,
|
||||||
# Define a recursive function to traverse the tree
|
formatting=formatting,
|
||||||
def traverse(node: marko.block.BlockElement):
|
hyperlink=hyperlink,
|
||||||
# Check if the node has a "children" attribute
|
)
|
||||||
if hasattr(node, "children"):
|
else:
|
||||||
# If "children" is a list, continue traversal
|
|
||||||
if isinstance(node.children, list):
|
|
||||||
for child in node.children:
|
|
||||||
traverse(child)
|
|
||||||
# If "children" is text, add it to header text
|
|
||||||
elif isinstance(node.children, str):
|
|
||||||
strings.append(node.children)
|
|
||||||
|
|
||||||
traverse(element)
|
|
||||||
snippet_text = "".join(strings)
|
|
||||||
if len(snippet_text) > 0:
|
|
||||||
if doc_label == DocItemLabel.SECTION_HEADER:
|
|
||||||
parent_item = doc.add_heading(
|
parent_item = doc.add_heading(
|
||||||
text=snippet_text,
|
text=snippet_text,
|
||||||
level=element.level - 1,
|
level=element.level - 1,
|
||||||
parent=parent_item,
|
parent=parent_item,
|
||||||
)
|
formatting=formatting,
|
||||||
else:
|
hyperlink=hyperlink,
|
||||||
parent_item = doc.add_text(
|
|
||||||
label=doc_label, parent=parent_item, text=snippet_text
|
|
||||||
)
|
)
|
||||||
|
|
||||||
elif isinstance(element, marko.block.List):
|
elif isinstance(element, marko.block.List):
|
||||||
@ -230,7 +208,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
break
|
break
|
||||||
|
|
||||||
self._close_table(doc)
|
self._close_table(doc)
|
||||||
self._process_inline_text(parent_item, doc)
|
|
||||||
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
|
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
|
||||||
if has_non_empty_list_items:
|
if has_non_empty_list_items:
|
||||||
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
|
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
|
||||||
@ -240,41 +217,60 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
elif (
|
elif (
|
||||||
isinstance(element, marko.block.ListItem)
|
isinstance(element, marko.block.ListItem)
|
||||||
and len(element.children) > 0
|
and len(element.children) == 1
|
||||||
and isinstance((first_child := element.children[0]), marko.block.Paragraph)
|
and isinstance((child := element.children[0]), marko.block.Paragraph)
|
||||||
|
and len(child.children) > 0
|
||||||
):
|
):
|
||||||
self._close_table(doc)
|
self._close_table(doc)
|
||||||
self._process_inline_text(parent_item, doc)
|
|
||||||
_log.debug(" - List item")
|
_log.debug(" - List item")
|
||||||
|
|
||||||
snippet_text = str(first_child.children[0].children) # type: ignore
|
if len(child.children) == 1:
|
||||||
is_numbered = False
|
snippet_text = str(child.children[0].children) # type: ignore
|
||||||
if (
|
visited.add(child)
|
||||||
parent_item is not None
|
else:
|
||||||
and isinstance(parent_item, DocItem)
|
snippet_text = "" # inline group will be created
|
||||||
and parent_item.label == GroupLabel.ORDERED_LIST
|
is_numbered = isinstance(parent_item, OrderedList)
|
||||||
):
|
if not isinstance(parent_item, (OrderedList, UnorderedList)):
|
||||||
is_numbered = True
|
_log.warning("ListItem would have not had a list parent, adding one.")
|
||||||
doc.add_list_item(
|
parent_item = doc.add_unordered_list(parent=parent_item)
|
||||||
enumerated=is_numbered, parent=parent_item, text=snippet_text
|
parent_item = doc.add_list_item(
|
||||||
|
enumerated=is_numbered,
|
||||||
|
parent=parent_item,
|
||||||
|
text=snippet_text,
|
||||||
|
formatting=formatting,
|
||||||
|
hyperlink=hyperlink,
|
||||||
)
|
)
|
||||||
visited.add(first_child)
|
|
||||||
|
|
||||||
elif isinstance(element, marko.inline.Image):
|
elif isinstance(element, marko.inline.Image):
|
||||||
self._close_table(doc)
|
self._close_table(doc)
|
||||||
self._process_inline_text(parent_item, doc)
|
|
||||||
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
|
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
|
||||||
|
|
||||||
fig_caption: Optional[TextItem] = None
|
fig_caption: Optional[TextItem] = None
|
||||||
if element.title is not None and element.title != "":
|
if element.title is not None and element.title != "":
|
||||||
fig_caption = doc.add_text(
|
fig_caption = doc.add_text(
|
||||||
label=DocItemLabel.CAPTION, text=element.title
|
label=DocItemLabel.CAPTION,
|
||||||
|
text=element.title,
|
||||||
|
formatting=formatting,
|
||||||
|
hyperlink=hyperlink,
|
||||||
)
|
)
|
||||||
|
|
||||||
doc.add_picture(parent=parent_item, caption=fig_caption)
|
doc.add_picture(parent=parent_item, caption=fig_caption)
|
||||||
|
|
||||||
elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
|
elif isinstance(element, marko.inline.Emphasis):
|
||||||
self._process_inline_text(parent_item, doc)
|
_log.debug(f" - Emphasis: {element.children}")
|
||||||
|
formatting = deepcopy(formatting) if formatting else Formatting()
|
||||||
|
formatting.italic = True
|
||||||
|
|
||||||
|
elif isinstance(element, marko.inline.StrongEmphasis):
|
||||||
|
_log.debug(f" - StrongEmphasis: {element.children}")
|
||||||
|
formatting = deepcopy(formatting) if formatting else Formatting()
|
||||||
|
formatting.bold = True
|
||||||
|
|
||||||
|
elif isinstance(element, marko.inline.Link):
|
||||||
|
_log.debug(f" - Link: {element.children}")
|
||||||
|
hyperlink = TypeAdapter(Optional[Union[AnyUrl, Path]]).validate_python(
|
||||||
|
element.dest
|
||||||
|
)
|
||||||
|
|
||||||
elif isinstance(element, marko.inline.RawText):
|
elif isinstance(element, marko.inline.RawText):
|
||||||
_log.debug(f" - Paragraph (raw text): {element.children}")
|
_log.debug(f" - Paragraph (raw text): {element.children}")
|
||||||
@ -287,28 +283,41 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
|
self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
|
||||||
else:
|
else:
|
||||||
self.md_table_buffer.append(snippet_text)
|
self.md_table_buffer.append(snippet_text)
|
||||||
else:
|
elif snippet_text:
|
||||||
self._close_table(doc)
|
self._close_table(doc)
|
||||||
# most likely just inline text
|
doc.add_text(
|
||||||
self.inline_texts.append(str(element.children))
|
label=DocItemLabel.TEXT,
|
||||||
|
parent=parent_item,
|
||||||
|
text=snippet_text,
|
||||||
|
formatting=formatting,
|
||||||
|
hyperlink=hyperlink,
|
||||||
|
)
|
||||||
|
|
||||||
elif isinstance(element, marko.inline.CodeSpan):
|
elif isinstance(element, marko.inline.CodeSpan):
|
||||||
self._close_table(doc)
|
self._close_table(doc)
|
||||||
self._process_inline_text(parent_item, doc)
|
|
||||||
_log.debug(f" - Code Span: {element.children}")
|
_log.debug(f" - Code Span: {element.children}")
|
||||||
snippet_text = str(element.children).strip()
|
snippet_text = str(element.children).strip()
|
||||||
doc.add_code(parent=parent_item, text=snippet_text)
|
doc.add_code(
|
||||||
|
parent=parent_item,
|
||||||
|
text=snippet_text,
|
||||||
|
formatting=formatting,
|
||||||
|
hyperlink=hyperlink,
|
||||||
|
)
|
||||||
|
|
||||||
elif (
|
elif (
|
||||||
isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
|
isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
|
||||||
and len(element.children) > 0
|
and len(element.children) > 0
|
||||||
and isinstance((first_child := element.children[0]), marko.inline.RawText)
|
and isinstance((child := element.children[0]), marko.inline.RawText)
|
||||||
and len(snippet_text := (first_child.children.strip())) > 0
|
and len(snippet_text := (child.children.strip())) > 0
|
||||||
):
|
):
|
||||||
self._close_table(doc)
|
self._close_table(doc)
|
||||||
self._process_inline_text(parent_item, doc)
|
|
||||||
_log.debug(f" - Code Block: {element.children}")
|
_log.debug(f" - Code Block: {element.children}")
|
||||||
doc.add_code(parent=parent_item, text=snippet_text)
|
doc.add_code(
|
||||||
|
parent=parent_item,
|
||||||
|
text=snippet_text,
|
||||||
|
formatting=formatting,
|
||||||
|
hyperlink=hyperlink,
|
||||||
|
)
|
||||||
|
|
||||||
elif isinstance(element, marko.inline.LineBreak):
|
elif isinstance(element, marko.inline.LineBreak):
|
||||||
if self.in_table:
|
if self.in_table:
|
||||||
@ -317,7 +326,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
elif isinstance(element, marko.block.HTMLBlock):
|
elif isinstance(element, marko.block.HTMLBlock):
|
||||||
self._html_blocks += 1
|
self._html_blocks += 1
|
||||||
self._process_inline_text(parent_item, doc)
|
|
||||||
self._close_table(doc)
|
self._close_table(doc)
|
||||||
_log.debug(f"HTML Block: {element}")
|
_log.debug(f"HTML Block: {element}")
|
||||||
if (
|
if (
|
||||||
@ -327,14 +335,25 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
# wrap in markers to enable post-processing in convert()
|
# wrap in markers to enable post-processing in convert()
|
||||||
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
|
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
|
||||||
doc.add_code(parent=parent_item, text=text_to_add)
|
doc.add_code(
|
||||||
|
parent=parent_item,
|
||||||
|
text=text_to_add,
|
||||||
|
formatting=formatting,
|
||||||
|
hyperlink=hyperlink,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
if not isinstance(element, str):
|
if not isinstance(element, str):
|
||||||
self._close_table(doc)
|
self._close_table(doc)
|
||||||
_log.debug(f"Some other element: {element}")
|
_log.debug(f"Some other element: {element}")
|
||||||
|
|
||||||
|
if (
|
||||||
|
isinstance(element, (marko.block.Paragraph, marko.block.Heading))
|
||||||
|
and len(element.children) > 1
|
||||||
|
):
|
||||||
|
parent_item = doc.add_inline_group(parent=parent_item)
|
||||||
|
|
||||||
processed_block_types = (
|
processed_block_types = (
|
||||||
marko.block.Heading,
|
# marko.block.Heading,
|
||||||
marko.block.CodeBlock,
|
marko.block.CodeBlock,
|
||||||
marko.block.FencedCode,
|
marko.block.FencedCode,
|
||||||
marko.inline.RawText,
|
marko.inline.RawText,
|
||||||
@ -351,6 +370,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
doc=doc,
|
doc=doc,
|
||||||
visited=visited,
|
visited=visited,
|
||||||
parent_item=parent_item,
|
parent_item=parent_item,
|
||||||
|
formatting=formatting,
|
||||||
|
hyperlink=hyperlink,
|
||||||
)
|
)
|
||||||
|
|
||||||
def is_valid(self) -> bool:
|
def is_valid(self) -> bool:
|
||||||
@ -392,7 +413,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
parent_item=None,
|
parent_item=None,
|
||||||
visited=set(),
|
visited=set(),
|
||||||
)
|
)
|
||||||
self._process_inline_text(None, doc) # handle last hanging inline text
|
|
||||||
self._close_table(doc=doc) # handle any last hanging table
|
self._close_table(doc=doc) # handle any last hanging table
|
||||||
|
|
||||||
# if HTML blocks were detected, export to HTML and delegate to HTML backend
|
# if HTML blocks were detected, export to HTML and delegate to HTML backend
|
||||||
|
20
tests/data/groundtruth/docling_v2/inline_and_formatting.md.md
vendored
Normal file
20
tests/data/groundtruth/docling_v2/inline_and_formatting.md.md
vendored
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
# Contribution guideline example
|
||||||
|
|
||||||
|
This is simple.
|
||||||
|
|
||||||
|
Foo *emphasis* **strong emphasis** ***both*** .
|
||||||
|
|
||||||
|
Create your feature branch: `git checkout -b feature/AmazingFeature` .
|
||||||
|
|
||||||
|
1. Pull the [**repository**](https://github.com/docling-project/docling) .
|
||||||
|
2. Create your feature branch ( `git checkout -b feature/AmazingFeature` )
|
||||||
|
3. Commit your changes ( `git commit -m 'Add some AmazingFeature'` )
|
||||||
|
4. Push to the branch ( `git push origin feature/AmazingFeature` )
|
||||||
|
5. Open a Pull Request
|
||||||
|
|
||||||
|
##
|
||||||
|
|
||||||
|
*Second* section
|
||||||
|
|
||||||
|
- **First** : Lorem ipsum.
|
||||||
|
- **Second** : Dolor `sit` amet.
|
565
tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml
vendored
Normal file
565
tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml
vendored
Normal file
@ -0,0 +1,565 @@
|
|||||||
|
body:
|
||||||
|
children:
|
||||||
|
- $ref: '#/texts/0'
|
||||||
|
- $ref: '#/texts/1'
|
||||||
|
- $ref: '#/groups/0'
|
||||||
|
- $ref: '#/groups/1'
|
||||||
|
- $ref: '#/groups/2'
|
||||||
|
- $ref: '#/texts/27'
|
||||||
|
- $ref: '#/groups/8'
|
||||||
|
content_layer: body
|
||||||
|
label: unspecified
|
||||||
|
name: _root_
|
||||||
|
self_ref: '#/body'
|
||||||
|
form_items: []
|
||||||
|
furniture:
|
||||||
|
children: []
|
||||||
|
content_layer: furniture
|
||||||
|
label: unspecified
|
||||||
|
name: _root_
|
||||||
|
self_ref: '#/furniture'
|
||||||
|
groups:
|
||||||
|
- children:
|
||||||
|
- $ref: '#/texts/2'
|
||||||
|
- $ref: '#/texts/3'
|
||||||
|
- $ref: '#/texts/4'
|
||||||
|
- $ref: '#/texts/5'
|
||||||
|
- $ref: '#/texts/6'
|
||||||
|
content_layer: body
|
||||||
|
label: inline
|
||||||
|
name: group
|
||||||
|
parent:
|
||||||
|
$ref: '#/body'
|
||||||
|
self_ref: '#/groups/0'
|
||||||
|
- children:
|
||||||
|
- $ref: '#/texts/7'
|
||||||
|
- $ref: '#/texts/8'
|
||||||
|
- $ref: '#/texts/9'
|
||||||
|
content_layer: body
|
||||||
|
label: inline
|
||||||
|
name: group
|
||||||
|
parent:
|
||||||
|
$ref: '#/body'
|
||||||
|
self_ref: '#/groups/1'
|
||||||
|
- children:
|
||||||
|
- $ref: '#/texts/10'
|
||||||
|
- $ref: '#/texts/14'
|
||||||
|
- $ref: '#/texts/18'
|
||||||
|
- $ref: '#/texts/22'
|
||||||
|
- $ref: '#/texts/26'
|
||||||
|
content_layer: body
|
||||||
|
label: ordered_list
|
||||||
|
name: list
|
||||||
|
parent:
|
||||||
|
$ref: '#/body'
|
||||||
|
self_ref: '#/groups/2'
|
||||||
|
- children:
|
||||||
|
- $ref: '#/texts/11'
|
||||||
|
- $ref: '#/texts/12'
|
||||||
|
- $ref: '#/texts/13'
|
||||||
|
content_layer: body
|
||||||
|
label: inline
|
||||||
|
name: group
|
||||||
|
parent:
|
||||||
|
$ref: '#/texts/10'
|
||||||
|
self_ref: '#/groups/3'
|
||||||
|
- children:
|
||||||
|
- $ref: '#/texts/15'
|
||||||
|
- $ref: '#/texts/16'
|
||||||
|
- $ref: '#/texts/17'
|
||||||
|
content_layer: body
|
||||||
|
label: inline
|
||||||
|
name: group
|
||||||
|
parent:
|
||||||
|
$ref: '#/texts/14'
|
||||||
|
self_ref: '#/groups/4'
|
||||||
|
- children:
|
||||||
|
- $ref: '#/texts/19'
|
||||||
|
- $ref: '#/texts/20'
|
||||||
|
- $ref: '#/texts/21'
|
||||||
|
content_layer: body
|
||||||
|
label: inline
|
||||||
|
name: group
|
||||||
|
parent:
|
||||||
|
$ref: '#/texts/18'
|
||||||
|
self_ref: '#/groups/5'
|
||||||
|
- children:
|
||||||
|
- $ref: '#/texts/23'
|
||||||
|
- $ref: '#/texts/24'
|
||||||
|
- $ref: '#/texts/25'
|
||||||
|
content_layer: body
|
||||||
|
label: inline
|
||||||
|
name: group
|
||||||
|
parent:
|
||||||
|
$ref: '#/texts/22'
|
||||||
|
self_ref: '#/groups/6'
|
||||||
|
- children:
|
||||||
|
- $ref: '#/texts/28'
|
||||||
|
- $ref: '#/texts/29'
|
||||||
|
content_layer: body
|
||||||
|
label: inline
|
||||||
|
name: group
|
||||||
|
parent:
|
||||||
|
$ref: '#/texts/27'
|
||||||
|
self_ref: '#/groups/7'
|
||||||
|
- children:
|
||||||
|
- $ref: '#/texts/30'
|
||||||
|
- $ref: '#/texts/33'
|
||||||
|
content_layer: body
|
||||||
|
label: list
|
||||||
|
name: list
|
||||||
|
parent:
|
||||||
|
$ref: '#/body'
|
||||||
|
self_ref: '#/groups/8'
|
||||||
|
- children:
|
||||||
|
- $ref: '#/texts/31'
|
||||||
|
- $ref: '#/texts/32'
|
||||||
|
content_layer: body
|
||||||
|
label: inline
|
||||||
|
name: group
|
||||||
|
parent:
|
||||||
|
$ref: '#/texts/30'
|
||||||
|
self_ref: '#/groups/9'
|
||||||
|
- children:
|
||||||
|
- $ref: '#/texts/34'
|
||||||
|
- $ref: '#/texts/35'
|
||||||
|
- $ref: '#/texts/36'
|
||||||
|
- $ref: '#/texts/37'
|
||||||
|
content_layer: body
|
||||||
|
label: inline
|
||||||
|
name: group
|
||||||
|
parent:
|
||||||
|
$ref: '#/texts/33'
|
||||||
|
self_ref: '#/groups/10'
|
||||||
|
key_value_items: []
|
||||||
|
name: inline_and_formatting
|
||||||
|
origin:
|
||||||
|
binary_hash: 9342273634728023910
|
||||||
|
filename: inline_and_formatting.md
|
||||||
|
mimetype: text/markdown
|
||||||
|
pages: {}
|
||||||
|
pictures: []
|
||||||
|
schema_name: DoclingDocument
|
||||||
|
tables: []
|
||||||
|
texts:
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
label: title
|
||||||
|
orig: Contribution guideline example
|
||||||
|
parent:
|
||||||
|
$ref: '#/body'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/0'
|
||||||
|
text: Contribution guideline example
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
label: text
|
||||||
|
orig: This is simple.
|
||||||
|
parent:
|
||||||
|
$ref: '#/body'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/1'
|
||||||
|
text: This is simple.
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
label: text
|
||||||
|
orig: Foo
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/0'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/2'
|
||||||
|
text: Foo
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
formatting:
|
||||||
|
bold: false
|
||||||
|
italic: true
|
||||||
|
strikethrough: false
|
||||||
|
underline: false
|
||||||
|
label: text
|
||||||
|
orig: emphasis
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/0'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/3'
|
||||||
|
text: emphasis
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
formatting:
|
||||||
|
bold: true
|
||||||
|
italic: false
|
||||||
|
strikethrough: false
|
||||||
|
underline: false
|
||||||
|
label: text
|
||||||
|
orig: strong emphasis
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/0'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/4'
|
||||||
|
text: strong emphasis
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
formatting:
|
||||||
|
bold: true
|
||||||
|
italic: true
|
||||||
|
strikethrough: false
|
||||||
|
underline: false
|
||||||
|
label: text
|
||||||
|
orig: both
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/0'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/5'
|
||||||
|
text: both
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
label: text
|
||||||
|
orig: .
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/0'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/6'
|
||||||
|
text: .
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
label: text
|
||||||
|
orig: 'Create your feature branch:'
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/1'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/7'
|
||||||
|
text: 'Create your feature branch:'
|
||||||
|
- captions: []
|
||||||
|
children: []
|
||||||
|
code_language: unknown
|
||||||
|
content_layer: body
|
||||||
|
footnotes: []
|
||||||
|
label: code
|
||||||
|
orig: git checkout -b feature/AmazingFeature
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/1'
|
||||||
|
prov: []
|
||||||
|
references: []
|
||||||
|
self_ref: '#/texts/8'
|
||||||
|
text: git checkout -b feature/AmazingFeature
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
label: text
|
||||||
|
orig: .
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/1'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/9'
|
||||||
|
text: .
|
||||||
|
- children:
|
||||||
|
- $ref: '#/groups/3'
|
||||||
|
content_layer: body
|
||||||
|
enumerated: true
|
||||||
|
label: list_item
|
||||||
|
marker: '-'
|
||||||
|
orig: ''
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/2'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/10'
|
||||||
|
text: ''
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
label: text
|
||||||
|
orig: Pull the
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/3'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/11'
|
||||||
|
text: Pull the
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
formatting:
|
||||||
|
bold: true
|
||||||
|
italic: false
|
||||||
|
strikethrough: false
|
||||||
|
underline: false
|
||||||
|
hyperlink: https://github.com/docling-project/docling
|
||||||
|
label: text
|
||||||
|
orig: repository
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/3'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/12'
|
||||||
|
text: repository
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
label: text
|
||||||
|
orig: .
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/3'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/13'
|
||||||
|
text: .
|
||||||
|
- children:
|
||||||
|
- $ref: '#/groups/4'
|
||||||
|
content_layer: body
|
||||||
|
enumerated: true
|
||||||
|
label: list_item
|
||||||
|
marker: '-'
|
||||||
|
orig: ''
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/2'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/14'
|
||||||
|
text: ''
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
label: text
|
||||||
|
orig: Create your feature branch (
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/4'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/15'
|
||||||
|
text: Create your feature branch (
|
||||||
|
- captions: []
|
||||||
|
children: []
|
||||||
|
code_language: unknown
|
||||||
|
content_layer: body
|
||||||
|
footnotes: []
|
||||||
|
label: code
|
||||||
|
orig: git checkout -b feature/AmazingFeature
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/4'
|
||||||
|
prov: []
|
||||||
|
references: []
|
||||||
|
self_ref: '#/texts/16'
|
||||||
|
text: git checkout -b feature/AmazingFeature
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
label: text
|
||||||
|
orig: )
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/4'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/17'
|
||||||
|
text: )
|
||||||
|
- children:
|
||||||
|
- $ref: '#/groups/5'
|
||||||
|
content_layer: body
|
||||||
|
enumerated: true
|
||||||
|
label: list_item
|
||||||
|
marker: '-'
|
||||||
|
orig: ''
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/2'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/18'
|
||||||
|
text: ''
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
label: text
|
||||||
|
orig: Commit your changes (
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/5'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/19'
|
||||||
|
text: Commit your changes (
|
||||||
|
- captions: []
|
||||||
|
children: []
|
||||||
|
code_language: unknown
|
||||||
|
content_layer: body
|
||||||
|
footnotes: []
|
||||||
|
label: code
|
||||||
|
orig: git commit -m 'Add some AmazingFeature'
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/5'
|
||||||
|
prov: []
|
||||||
|
references: []
|
||||||
|
self_ref: '#/texts/20'
|
||||||
|
text: git commit -m 'Add some AmazingFeature'
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
label: text
|
||||||
|
orig: )
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/5'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/21'
|
||||||
|
text: )
|
||||||
|
- children:
|
||||||
|
- $ref: '#/groups/6'
|
||||||
|
content_layer: body
|
||||||
|
enumerated: true
|
||||||
|
label: list_item
|
||||||
|
marker: '-'
|
||||||
|
orig: ''
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/2'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/22'
|
||||||
|
text: ''
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
label: text
|
||||||
|
orig: Push to the branch (
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/6'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/23'
|
||||||
|
text: Push to the branch (
|
||||||
|
- captions: []
|
||||||
|
children: []
|
||||||
|
code_language: unknown
|
||||||
|
content_layer: body
|
||||||
|
footnotes: []
|
||||||
|
label: code
|
||||||
|
orig: git push origin feature/AmazingFeature
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/6'
|
||||||
|
prov: []
|
||||||
|
references: []
|
||||||
|
self_ref: '#/texts/24'
|
||||||
|
text: git push origin feature/AmazingFeature
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
label: text
|
||||||
|
orig: )
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/6'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/25'
|
||||||
|
text: )
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
enumerated: true
|
||||||
|
label: list_item
|
||||||
|
marker: '-'
|
||||||
|
orig: Open a Pull Request
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/2'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/26'
|
||||||
|
text: Open a Pull Request
|
||||||
|
- children:
|
||||||
|
- $ref: '#/groups/7'
|
||||||
|
content_layer: body
|
||||||
|
label: section_header
|
||||||
|
level: 1
|
||||||
|
orig: ''
|
||||||
|
parent:
|
||||||
|
$ref: '#/body'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/27'
|
||||||
|
text: ''
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
formatting:
|
||||||
|
bold: false
|
||||||
|
italic: true
|
||||||
|
strikethrough: false
|
||||||
|
underline: false
|
||||||
|
label: text
|
||||||
|
orig: Second
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/7'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/28'
|
||||||
|
text: Second
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
label: text
|
||||||
|
orig: section
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/7'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/29'
|
||||||
|
text: section
|
||||||
|
- children:
|
||||||
|
- $ref: '#/groups/9'
|
||||||
|
content_layer: body
|
||||||
|
enumerated: false
|
||||||
|
label: list_item
|
||||||
|
marker: '-'
|
||||||
|
orig: ''
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/8'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/30'
|
||||||
|
text: ''
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
formatting:
|
||||||
|
bold: true
|
||||||
|
italic: false
|
||||||
|
strikethrough: false
|
||||||
|
underline: false
|
||||||
|
label: text
|
||||||
|
orig: First
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/9'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/31'
|
||||||
|
text: First
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
label: text
|
||||||
|
orig: ': Lorem ipsum.'
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/9'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/32'
|
||||||
|
text: ': Lorem ipsum.'
|
||||||
|
- children:
|
||||||
|
- $ref: '#/groups/10'
|
||||||
|
content_layer: body
|
||||||
|
enumerated: false
|
||||||
|
label: list_item
|
||||||
|
marker: '-'
|
||||||
|
orig: ''
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/8'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/33'
|
||||||
|
text: ''
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
formatting:
|
||||||
|
bold: true
|
||||||
|
italic: false
|
||||||
|
strikethrough: false
|
||||||
|
underline: false
|
||||||
|
label: text
|
||||||
|
orig: Second
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/10'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/34'
|
||||||
|
text: Second
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
label: text
|
||||||
|
orig: ': Dolor'
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/10'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/35'
|
||||||
|
text: ': Dolor'
|
||||||
|
- captions: []
|
||||||
|
children: []
|
||||||
|
code_language: unknown
|
||||||
|
content_layer: body
|
||||||
|
footnotes: []
|
||||||
|
label: code
|
||||||
|
orig: sit
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/10'
|
||||||
|
prov: []
|
||||||
|
references: []
|
||||||
|
self_ref: '#/texts/36'
|
||||||
|
text: sit
|
||||||
|
- children: []
|
||||||
|
content_layer: body
|
||||||
|
label: text
|
||||||
|
orig: amet.
|
||||||
|
parent:
|
||||||
|
$ref: '#/groups/10'
|
||||||
|
prov: []
|
||||||
|
self_ref: '#/texts/37'
|
||||||
|
text: amet.
|
||||||
|
version: 1.3.0
|
18
tests/data/md/inline_and_formatting.md
vendored
Normal file
18
tests/data/md/inline_and_formatting.md
vendored
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
# Contribution guideline example
|
||||||
|
|
||||||
|
This is simple.
|
||||||
|
|
||||||
|
Foo *emphasis* **strong emphasis** ***both***.
|
||||||
|
|
||||||
|
Create your feature branch: `git checkout -b feature/AmazingFeature`.
|
||||||
|
|
||||||
|
1. Pull the [**repository**](https://github.com/docling-project/docling).
|
||||||
|
2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
|
||||||
|
3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
|
||||||
|
4. Push to the branch (`git push origin feature/AmazingFeature`)
|
||||||
|
5. Open a Pull Request
|
||||||
|
|
||||||
|
## *Second* section <!-- inline groups in headings not yet supported by serializers -->
|
||||||
|
|
||||||
|
- **First**: Lorem ipsum.
|
||||||
|
- **Second**: Dolor `sit` amet.
|
@ -2,7 +2,7 @@ from pathlib import Path
|
|||||||
|
|
||||||
from docling.backend.md_backend import MarkdownDocumentBackend
|
from docling.backend.md_backend import MarkdownDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import InputDocument
|
from docling.datamodel.document import DoclingDocument, InputDocument
|
||||||
|
|
||||||
from .test_data_gen_flag import GEN_TEST_DATA
|
from .test_data_gen_flag import GEN_TEST_DATA
|
||||||
|
|
||||||
@ -11,12 +11,15 @@ def test_convert_valid():
|
|||||||
fmt = InputFormat.MD
|
fmt = InputFormat.MD
|
||||||
cls = MarkdownDocumentBackend
|
cls = MarkdownDocumentBackend
|
||||||
|
|
||||||
test_data_path = Path("tests") / "data"
|
root_path = Path("tests") / "data"
|
||||||
relevant_paths = sorted((test_data_path / "md").rglob("*.md"))
|
relevant_paths = sorted((root_path / "md").rglob("*.md"))
|
||||||
assert len(relevant_paths) > 0
|
assert len(relevant_paths) > 0
|
||||||
|
|
||||||
|
yaml_filter = ["inline_and_formatting"]
|
||||||
|
|
||||||
for in_path in relevant_paths:
|
for in_path in relevant_paths:
|
||||||
gt_path = test_data_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
|
md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
|
||||||
|
yaml_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.yaml"
|
||||||
|
|
||||||
in_doc = InputDocument(
|
in_doc = InputDocument(
|
||||||
path_or_stream=in_path,
|
path_or_stream=in_path,
|
||||||
@ -33,9 +36,17 @@ def test_convert_valid():
|
|||||||
act_data = act_doc.export_to_markdown()
|
act_data = act_doc.export_to_markdown()
|
||||||
|
|
||||||
if GEN_TEST_DATA:
|
if GEN_TEST_DATA:
|
||||||
with open(gt_path, mode="w", encoding="utf-8") as f:
|
with open(md_gt_path, mode="w", encoding="utf-8") as f:
|
||||||
f.write(f"{act_data}\n")
|
f.write(f"{act_data}\n")
|
||||||
|
|
||||||
|
if in_path.stem in yaml_filter:
|
||||||
|
with open(yaml_gt_path, mode="w", encoding="utf-8") as f:
|
||||||
|
act_doc.save_as_yaml(yaml_gt_path)
|
||||||
else:
|
else:
|
||||||
with open(gt_path, encoding="utf-8") as f:
|
with open(md_gt_path, encoding="utf-8") as f:
|
||||||
exp_data = f.read().rstrip()
|
exp_data = f.read().rstrip()
|
||||||
assert exp_data == act_data
|
assert act_data == exp_data
|
||||||
|
|
||||||
|
if in_path.stem in yaml_filter:
|
||||||
|
exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path)
|
||||||
|
assert act_doc == exp_doc
|
||||||
|
Loading…
x
Reference in New Issue
Block a user