{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Advanced chunking & serialization" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Overview" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In this notebook we show how to customize the serialization strategies that come into\n", "play during chunking." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Setup" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We will work with a document that contains some [picture annotations](../pictures_description):" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from docling_core.types.doc.document import DoclingDocument\n", "\n", "SOURCE = \"./data/2408.09869v3_enriched.json\"\n", "\n", "doc = DoclingDocument.load_from_json(SOURCE)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Below we define the chunker (for more details check out [Hybrid Chunking](../hybrid_chunking)):" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from docling_core.transforms.chunker.hybrid_chunker import HybridChunker\n", "from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer\n", "from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer\n", "from transformers import AutoTokenizer\n", "\n", "EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n", "\n", "tokenizer: BaseTokenizer = HuggingFaceTokenizer(\n", " tokenizer=AutoTokenizer.from_pretrained(EMBED_MODEL_ID),\n", ")\n", "chunker = HybridChunker(tokenizer=tokenizer)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tokenizer.get_max_tokens()=512\n" ] } ], "source": [ "print(f\"{tokenizer.get_max_tokens()=}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Defining some helper methods:" ] }, { "cell_type": "code", 
"execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from typing import Iterable, Optional\n", "\n", "from docling_core.transforms.chunker.base import BaseChunk\n", "from docling_core.transforms.chunker.hierarchical_chunker import DocChunk\n", "from docling_core.types.doc.labels import DocItemLabel\n", "from rich.console import Console\n", "from rich.panel import Panel\n", "\n", "console = Console(\n", " width=200, # for getting Markdown tables rendered nicely\n", ")\n", "\n", "\n", "def find_n_th_chunk_with_label(\n", " iter: Iterable[BaseChunk], n: int, label: DocItemLabel\n", ") -> Optional[DocChunk]:\n", " num_found = -1\n", " for i, chunk in enumerate(iter):\n", " doc_chunk = DocChunk.model_validate(chunk)\n", " for it in doc_chunk.meta.doc_items:\n", " if it.label == label:\n", " num_found += 1\n", " if num_found == n:\n", " return i, chunk\n", " return None, None\n", "\n", "\n", "def print_chunk(chunks, chunk_pos):\n", " chunk = chunks[chunk_pos]\n", " ctx_text = chunker.contextualize(chunk=chunk)\n", " num_tokens = tokenizer.count_tokens(text=ctx_text)\n", " doc_items_refs = [it.self_ref for it in chunk.meta.doc_items]\n", " title = f\"{chunk_pos=} {num_tokens=} {doc_items_refs=}\"\n", " console.print(Panel(ctx_text, title=title))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Table serialization" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using the default strategy" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Below we inspect the first chunk containing a table — using the default serialization strategy:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Token indices sequence length is longer than the specified maximum sequence length for this model (652 > 512). Running this sequence through the model will result in indexing errors\n" ] }, { "data": { "text/html": [ "
╭────────────────────────────────────────────────────────────── chunk_pos=13 num_tokens=426 doc_items_refs=['#/texts/72', '#/tables/0'] ───────────────────────────────────────────────────────────────╮\n", "│ Docling Technical Report │\n", "│ 4 Performance │\n", "│ Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution │\n", "│ (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads. │\n", "│ │\n", "│ Apple M3 Max, Thread budget. = 4. Apple M3 Max, native backend.TTS = 177 s 167 s. Apple M3 Max, native backend.Pages/s = 1.27 1.34. Apple M3 Max, native backend.Mem = 6.20 GB. Apple M3 Max, │\n", "│ pypdfium backend.TTS = 103 s 92 s. Apple M3 Max, pypdfium backend.Pages/s = 2.18 2.45. Apple M3 Max, pypdfium backend.Mem = 2.56 GB. (16 cores) Intel(R) Xeon E5-2690, Thread budget. = 16 4 16. (16 │\n", "│ cores) Intel(R) Xeon E5-2690, native backend.TTS = 375 s 244 s. (16 cores) Intel(R) Xeon E5-2690, native backend.Pages/s = 0.60 0.92. (16 cores) Intel(R) Xeon E5-2690, native backend.Mem = 6.16 │\n", "│ GB. (16 cores) Intel(R) Xeon E5-2690, pypdfium backend.TTS = 239 s 143 s. (16 cores) Intel(R) Xeon E5-2690, pypdfium backend.Pages/s = 0.94 1.57. 
(16 cores) Intel(R) Xeon E5-2690, pypdfium │\n", "│ backend.Mem = 2.42 GB │\n", "╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n", "\n" ], "text/plain": [ "╭────────────────────────────────────────────────────────────── chunk_pos=13 num_tokens=426 doc_items_refs=['#/texts/72', '#/tables/0'] ───────────────────────────────────────────────────────────────╮\n", "│ Docling Technical Report │\n", "│ 4 Performance │\n", "│ Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution │\n", "│ (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads. │\n", "│ │\n", "│ Apple M3 Max, Thread budget. = 4. Apple M3 Max, native backend.TTS = 177 s 167 s. Apple M3 Max, native backend.Pages/s = 1.27 1.34. Apple M3 Max, native backend.Mem = 6.20 GB. Apple M3 Max, │\n", "│ pypdfium backend.TTS = 103 s 92 s. Apple M3 Max, pypdfium backend.Pages/s = 2.18 2.45. Apple M3 Max, pypdfium backend.Mem = 2.56 GB. (16 cores) Intel(R) Xeon E5-2690, Thread budget. = 16 4 16. (16 │\n", "│ cores) Intel(R) Xeon E5-2690, native backend.TTS = 375 s 244 s. (16 cores) Intel(R) Xeon E5-2690, native backend.Pages/s = 0.60 0.92. (16 cores) Intel(R) Xeon E5-2690, native backend.Mem = 6.16 │\n", "│ GB. (16 cores) Intel(R) Xeon E5-2690, pypdfium backend.TTS = 239 s 143 s. (16 cores) Intel(R) Xeon E5-2690, pypdfium backend.Pages/s = 0.94 1.57. 
(16 cores) Intel(R) Xeon E5-2690, pypdfium │\n", "│ backend.Mem = 2.42 GB │\n", "╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "chunker = HybridChunker(tokenizer=tokenizer)\n", "\n", "chunk_iter = chunker.chunk(dl_doc=doc)\n", "\n", "chunks = list(chunk_iter)\n", "i, chunk = find_n_th_chunk_with_label(chunks, n=0, label=DocItemLabel.TABLE)\n", "print_chunk(\n", " chunks=chunks,\n", " chunk_pos=i,\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
`HybridChunker` can sometimes lead to a warning from the transformers library, however this is a \"false alarm\" — for details check [here](https://docling-project.github.io/docling/faq/#hybridchunker-triggers-warning-token-indices-sequence-length-is-longer-than-the-specified-maximum-sequence-length-for-this-model).\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Configuring a different strategy" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can configure a different serialization strategy. In the example below, we specify a different table serializer that serializes tables to Markdown instead of the triplet notation used by default:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [
"╭────────────────────────────────────────────────────────────── chunk_pos=13 num_tokens=431 doc_items_refs=['#/texts/72', '#/tables/0'] ───────────────────────────────────────────────────────────────╮\n", "│ Docling Technical Report │\n", "│ 4 Performance │\n", "│ Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution │\n", "│ (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads. │\n", "│ │\n", "│ | CPU | Thread budget | native backend | native backend | native backend | pypdfium backend | pypdfium backend | pypdfium backend | │\n", "│ |----------------------------------|-----------------|------------------|------------------|------------------|--------------------|--------------------|--------------------| │\n", "│ | | | TTS | Pages/s | Mem | TTS | Pages/s | Mem | │\n", "│ | Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB | │\n", "│ | (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB | │\n", "╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n", "\n" ], "text/plain": [ "╭────────────────────────────────────────────────────────────── chunk_pos=13 num_tokens=431 doc_items_refs=['#/texts/72', '#/tables/0'] ───────────────────────────────────────────────────────────────╮\n", "│ Docling Technical Report │\n", "│ 4 Performance │\n", "│ Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. 
We show the time-to-solution │\n", "│ (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads. │\n", "│ │\n", "│ | CPU | Thread budget | native backend | native backend | native backend | pypdfium backend | pypdfium backend | pypdfium backend | │\n", "│ |----------------------------------|-----------------|------------------|------------------|------------------|--------------------|--------------------|--------------------| │\n", "│ | | | TTS | Pages/s | Mem | TTS | Pages/s | Mem | │\n", "│ | Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB | │\n", "│ | (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB | │\n", "╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from docling_core.transforms.chunker.hierarchical_chunker import (\n", " ChunkingDocSerializer,\n", " ChunkingSerializerProvider,\n", ")\n", "from docling_core.transforms.serializer.markdown import MarkdownTableSerializer\n", "\n", "\n", "class MDTableSerializerProvider(ChunkingSerializerProvider):\n", " def get_serializer(self, doc):\n", " return ChunkingDocSerializer(\n", " doc=doc,\n", " table_serializer=MarkdownTableSerializer(), # configuring a different table serializer\n", " )\n", "\n", "\n", "chunker = HybridChunker(\n", " tokenizer=tokenizer,\n", " serializer_provider=MDTableSerializerProvider(),\n", ")\n", "\n", "chunk_iter = chunker.chunk(dl_doc=doc)\n", "\n", "chunks = list(chunk_iter)\n", "i, chunk = find_n_th_chunk_with_label(chunks, n=0, label=DocItemLabel.TABLE)\n", "print_chunk(\n", " chunks=chunks,\n", " chunk_pos=i,\n", ")" ] }, { 
"cell_type": "markdown", "metadata": {}, "source": [ "## Picture serialization" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using the default strategy" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Below we inspect the first chunk containing a picture.\n", "\n", "Even when using the default strategy, we can modify the relevant parameters, e.g. which placeholder is used for pictures:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
╭───────────────────────────────────────────────── chunk_pos=0 num_tokens=117 doc_items_refs=['#/pictures/0', '#/texts/2', '#/texts/3', '#/texts/4'] ──────────────────────────────────────────────────╮\n", "│ Docling Technical Report │\n", "│ <!-- image --> │\n", "│ Version 1.0 │\n", "│ Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta │\n", "│ Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar │\n", "│ AI4K Group, IBM Research R¨ uschlikon, Switzerland │\n", "╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n", "\n" ], "text/plain": [ "╭───────────────────────────────────────────────── chunk_pos=0 num_tokens=117 doc_items_refs=['#/pictures/0', '#/texts/2', '#/texts/3', '#/texts/4'] ──────────────────────────────────────────────────╮\n", "│ Docling Technical Report │\n", "│ │\n", "│ Version 1.0 │\n", "│ Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta │\n", "│ Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. 
Staar │\n", "│ AI4K Group, IBM Research R¨ uschlikon, Switzerland │\n", "╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from docling_core.transforms.serializer.markdown import MarkdownParams\n", "\n", "\n", "class ImgPlaceholderSerializerProvider(ChunkingSerializerProvider):\n", " def get_serializer(self, doc):\n", " return ChunkingDocSerializer(\n", " doc=doc,\n", " params=MarkdownParams(\n", " image_placeholder=\"\",\n", " ),\n", " )\n", "\n", "\n", "chunker = HybridChunker(\n", " tokenizer=tokenizer,\n", " serializer_provider=ImgPlaceholderSerializerProvider(),\n", ")\n", "\n", "chunk_iter = chunker.chunk(dl_doc=doc)\n", "\n", "chunks = list(chunk_iter)\n", "i, chunk = find_n_th_chunk_with_label(chunks, n=0, label=DocItemLabel.PICTURE)\n", "print_chunk(\n", " chunks=chunks,\n", " chunk_pos=i,\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using a custom strategy" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Below we define and use our custom picture serialization strategy which leverages picture annotations:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "from typing import Any\n", "\n", "from docling_core.transforms.serializer.base import (\n", " BaseDocSerializer,\n", " SerializationResult,\n", ")\n", "from docling_core.transforms.serializer.common import create_ser_result\n", "from docling_core.transforms.serializer.markdown import MarkdownPictureSerializer\n", "from docling_core.types.doc.document import (\n", " PictureClassificationData,\n", " PictureDescriptionData,\n", " PictureItem,\n", " PictureMoleculeData,\n", ")\n", "from typing_extensions import override\n", "\n", "\n", "class AnnotationPictureSerializer(MarkdownPictureSerializer):\n", " 
@override\n", " def serialize(\n", " self,\n", " *,\n", " item: PictureItem,\n", " doc_serializer: BaseDocSerializer,\n", " doc: DoclingDocument,\n", " **kwargs: Any,\n", " ) -> SerializationResult:\n", " text_parts: list[str] = []\n", " for annotation in item.annotations:\n", " if isinstance(annotation, PictureClassificationData):\n", " predicted_class = (\n", " annotation.predicted_classes[0].class_name\n", " if annotation.predicted_classes\n", " else None\n", " )\n", " if predicted_class is not None:\n", " text_parts.append(f\"Picture type: {predicted_class}\")\n", " elif isinstance(annotation, PictureMoleculeData):\n", " text_parts.append(f\"SMILES: {annotation.smi}\")\n", " elif isinstance(annotation, PictureDescriptionData):\n", " text_parts.append(f\"Picture description: {annotation.text}\")\n", "\n", " text_res = \"\\n\".join(text_parts)\n", " text_res = doc_serializer.post_process(text=text_res)\n", " return create_ser_result(text=text_res, span_source=item)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
╭───────────────────────────────────────────────── chunk_pos=0 num_tokens=128 doc_items_refs=['#/pictures/0', '#/texts/2', '#/texts/3', '#/texts/4'] ──────────────────────────────────────────────────╮\n", "│ Docling Technical Report │\n", "│ Picture description: In this image we can see a cartoon image of a duck holding a paper. │\n", "│ Version 1.0 │\n", "│ Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta │\n", "│ Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar │\n", "│ AI4K Group, IBM Research R¨ uschlikon, Switzerland │\n", "╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n", "\n" ], "text/plain": [ "╭───────────────────────────────────────────────── chunk_pos=0 num_tokens=128 doc_items_refs=['#/pictures/0', '#/texts/2', '#/texts/3', '#/texts/4'] ──────────────────────────────────────────────────╮\n", "│ Docling Technical Report │\n", "│ Picture description: In this image we can see a cartoon image of a duck holding a paper. │\n", "│ Version 1.0 │\n", "│ Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta │\n", "│ Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. 
Staar │\n", "│ AI4K Group, IBM Research R¨ uschlikon, Switzerland │\n", "╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "class ImgAnnotationSerializerProvider(ChunkingSerializerProvider):\n", " def get_serializer(self, doc: DoclingDocument):\n", " return ChunkingDocSerializer(\n", " doc=doc,\n", " picture_serializer=AnnotationPictureSerializer(), # configuring a different picture serializer\n", " )\n", "\n", "\n", "chunker = HybridChunker(\n", " tokenizer=tokenizer,\n", " serializer_provider=ImgAnnotationSerializerProvider(),\n", ")\n", "\n", "chunk_iter = chunker.chunk(dl_doc=doc)\n", "\n", "chunks = list(chunk_iter)\n", "i, chunk = find_n_th_chunk_with_label(chunks, n=0, label=DocItemLabel.PICTURE)\n", "print_chunk(\n", " chunks=chunks,\n", " chunk_pos=i,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.2" } }, "nbformat": 4, "nbformat_minor": 2 }