docling/docs/examples/advanced_chunking_and_serialization.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Advanced chunking & serialization"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Overview"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In this notebook we show how to customize the serialization strategies that come into\n",
    "play during chunking."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We will work with a document that contains some [picture annotations](../pictures_description):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from docling_core.types.doc.document import DoclingDocument\n",
    "\n",
    "# path of the JSON-serialized document to load\n",
    "SOURCE = \"./data/2408.09869v3_enriched.json\"\n",
    "\n",
    "doc = DoclingDocument.load_from_json(SOURCE)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Below we define the chunker (for more details check out [Hybrid Chunking](../hybrid_chunking)):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from docling_core.transforms.chunker.hybrid_chunker import HybridChunker\n",
    "from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer\n",
    "from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer\n",
    "from transformers import AutoTokenizer\n",
    "\n",
    "EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
    "\n",
    "# load the embedding model's Hugging Face tokenizer and wrap it for the chunker\n",
    "hf_tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)\n",
    "tokenizer: BaseTokenizer = HuggingFaceTokenizer(tokenizer=hf_tokenizer)\n",
    "chunker = HybridChunker(tokenizer=tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tokenizer.get_max_tokens()=512\n"
     ]
    }
   ],
   "source": [
    "print(f\"{tokenizer.get_max_tokens()=}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We define some helper functions:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import Iterable, Optional\n",
    "\n",
    "from docling_core.transforms.chunker.base import BaseChunk\n",
    "from docling_core.transforms.chunker.hierarchical_chunker import DocChunk\n",
    "from docling_core.types.doc.labels import DocItemLabel\n",
    "from rich.console import Console\n",
    "from rich.panel import Panel\n",
    "\n",
    "console = Console(width=200)  # for getting Markdown tables rendered nicely\n",
    "\n",
    "\n",
    "def find_n_th_chunk_with_label(\n",
    "    iter: Iterable[BaseChunk], n: int, label: DocItemLabel\n",
    ") -> tuple[Optional[int], Optional[BaseChunk]]:\n",
    "    \"\"\"Find the n-th chunk that has at least one doc item with the given label.\n",
    "\n",
    "    Args:\n",
    "        iter: Chunks to search, in order. (The name shadows the built-in `iter`;\n",
    "            it is kept so that existing keyword callers keep working.)\n",
    "        n: Zero-based rank of the matching chunk to return (0 = first match).\n",
    "        label: Label to look for among each chunk's `meta.doc_items`.\n",
    "\n",
    "    Returns:\n",
    "        A `(position, chunk)` tuple, where `position` is the index of the chunk\n",
    "        within `iter`; `(None, None)` if fewer than `n + 1` chunks match.\n",
    "    \"\"\"\n",
    "    num_found = -1\n",
    "    for i, chunk in enumerate(iter):\n",
    "        doc_chunk = DocChunk.model_validate(chunk)\n",
    "        # count each chunk at most once, even if several of its items match\n",
    "        if any(it.label == label for it in doc_chunk.meta.doc_items):\n",
    "            num_found += 1\n",
    "            if num_found == n:\n",
    "                return i, chunk\n",
    "    return None, None\n",
    "\n",
    "\n",
    "def print_chunk(chunks, chunk_pos):\n",
    "    \"\"\"Render one chunk in a titled panel on the notebook's rich console.\n",
    "\n",
    "    Uses the `chunker`, `tokenizer` and `console` globals as they are bound\n",
    "    at the time of the call.\n",
    "\n",
    "    Args:\n",
    "        chunks: Indexable collection of chunks.\n",
    "        chunk_pos: Position in `chunks` of the chunk to display.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If `chunk_pos` is None, e.g. because a preceding search\n",
    "            for a chunk found no match.\n",
    "    \"\"\"\n",
    "    if chunk_pos is None:\n",
    "        raise ValueError(\"chunk_pos is None: no chunk to print\")\n",
    "    chunk = chunks[chunk_pos]\n",
    "    ctx_text = chunker.contextualize(chunk=chunk)\n",
    "    num_tokens = tokenizer.count_tokens(text=ctx_text)\n",
    "    doc_items_refs = [it.self_ref for it in chunk.meta.doc_items]\n",
    "    # the `=` f-string specifier renders both the variable names and their values\n",
    "    title = f\"{chunk_pos=} {num_tokens=} {doc_items_refs=}\"\n",
    "    console.print(Panel(ctx_text, title=title))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Table serialization"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Using the default strategy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Below we inspect the first chunk containing a table — using the default serialization strategy:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Token indices sequence length is longer than the specified maximum sequence length for this model (652 > 512). Running this sequence through the model will result in indexing errors\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╭────────────────────────────────────────────────────────────── chunk_pos=13 num_tokens=426 doc_items_refs=['#/texts/72', '#/tables/0'] ───────────────────────────────────────────────────────────────╮\n",
       "│ Docling Technical Report                                                                                                                                                                             │\n",
       "│ 4 Performance                                                                                                                                                                                        │\n",
       "│ Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution │\n",
       "│ (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.           │\n",
       "│                                                                                                                                                                                                      │\n",
       "│ Apple M3 Max, Thread budget. = 4. Apple M3 Max, native backend.TTS = 177 s 167 s. Apple M3 Max, native backend.Pages/s = 1.27 1.34. Apple M3 Max, native backend.Mem = 6.20 GB. Apple M3 Max,        │\n",
       "│ pypdfium backend.TTS = 103 s 92 s. Apple M3 Max, pypdfium backend.Pages/s = 2.18 2.45. Apple M3 Max, pypdfium backend.Mem = 2.56 GB. (16 cores) Intel(R) Xeon E5-2690, Thread budget. = 16 4 16. (16 │\n",
       "│ cores) Intel(R) Xeon E5-2690, native backend.TTS = 375 s 244 s. (16 cores) Intel(R) Xeon E5-2690, native backend.Pages/s = 0.60 0.92. (16 cores) Intel(R) Xeon E5-2690, native backend.Mem = 6.16    │\n",
       "│ GB. (16 cores) Intel(R) Xeon E5-2690, pypdfium backend.TTS = 239 s 143 s. (16 cores) Intel(R) Xeon E5-2690, pypdfium backend.Pages/s = 0.94 1.57. (16 cores) Intel(R) Xeon E5-2690, pypdfium         │\n",
       "│ backend.Mem = 2.42 GB                                                                                                                                                                                │\n",
       "╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
       "</pre>\n"
      ],
      "text/plain": [
       "╭────────────────────────────────────────────────────────────── chunk_pos=13 num_tokens=426 doc_items_refs=['#/texts/72', '#/tables/0'] ───────────────────────────────────────────────────────────────╮\n",
       "│ Docling Technical Report                                                                                                                                                                             │\n",
       "│ 4 Performance                                                                                                                                                                                        │\n",
       "│ Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution │\n",
       "│ (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.           │\n",
       "│                                                                                                                                                                                                      │\n",
       "│ Apple M3 Max, Thread budget. = 4. Apple M3 Max, native backend.TTS = 177 s 167 s. Apple M3 Max, native backend.Pages/s = 1.27 1.34. Apple M3 Max, native backend.Mem = 6.20 GB. Apple M3 Max,        │\n",
       "│ pypdfium backend.TTS = 103 s 92 s. Apple M3 Max, pypdfium backend.Pages/s = 2.18 2.45. Apple M3 Max, pypdfium backend.Mem = 2.56 GB. (16 cores) Intel(R) Xeon E5-2690, Thread budget. = 16 4 16. (16 │\n",
       "│ cores) Intel(R) Xeon E5-2690, native backend.TTS = 375 s 244 s. (16 cores) Intel(R) Xeon E5-2690, native backend.Pages/s = 0.60 0.92. (16 cores) Intel(R) Xeon E5-2690, native backend.Mem = 6.16    │\n",
       "│ GB. (16 cores) Intel(R) Xeon E5-2690, pypdfium backend.TTS = 239 s 143 s. (16 cores) Intel(R) Xeon E5-2690, pypdfium backend.Pages/s = 0.94 1.57. (16 cores) Intel(R) Xeon E5-2690, pypdfium         │\n",
       "│ backend.Mem = 2.42 GB                                                                                                                                                                                │\n",
       "╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# chunker using the default serialization strategy\n",
    "chunker = HybridChunker(tokenizer=tokenizer)\n",
    "\n",
    "# materialize the chunks so that they can be indexed by position\n",
    "chunk_iter = chunker.chunk(dl_doc=doc)\n",
    "chunks = list(chunk_iter)\n",
    "\n",
    "# locate the first chunk having a table among its doc items and display it\n",
    "i, chunk = find_n_th_chunk_with_label(chunks, n=0, label=DocItemLabel.TABLE)\n",
    "print_chunk(chunks=chunks, chunk_pos=i)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-info\">\n",
    "    <strong>INFO</strong>: As you can see above, using the <code>HybridChunker</code> can sometimes lead to a warning from the transformers library; however, this is a \"false alarm\" — for details check <a href=\"https://docling-project.github.io/docling/faq/#hybridchunker-triggers-warning-token-indices-sequence-length-is-longer-than-the-specified-maximum-sequence-length-for-this-model\">here</a>.\n",
    "</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Configuring a different strategy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can configure a different serialization strategy. In the example below, we specify a different table serializer that serializes tables to Markdown instead of the triplet notation used by default:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╭────────────────────────────────────────────────────────────── chunk_pos=13 num_tokens=431 doc_items_refs=['#/texts/72', '#/tables/0'] ───────────────────────────────────────────────────────────────╮\n",
       "│ Docling Technical Report                                                                                                                                                                             │\n",
       "│ 4 Performance                                                                                                                                                                                        │\n",
       "│ Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution │\n",
       "│ (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.           │\n",
       "│                                                                                                                                                                                                      │\n",
       "│ | CPU                              | Thread budget   | native backend   | native backend   | native backend   | pypdfium backend   | pypdfium backend   | pypdfium backend   |                       │\n",
       "│ |----------------------------------|-----------------|------------------|------------------|------------------|--------------------|--------------------|--------------------|                       │\n",
       "│ |                                  |                 | TTS              | Pages/s          | Mem              | TTS                | Pages/s            | Mem                |                       │\n",
       "│ | Apple M3 Max                     | 4               | 177 s 167 s      | 1.27 1.34        | 6.20 GB          | 103 s 92 s         | 2.18 2.45          | 2.56 GB            |                       │\n",
       "│ | (16 cores) Intel(R) Xeon E5-2690 | 16 4 16         | 375 s 244 s      | 0.60 0.92        | 6.16 GB          | 239 s 143 s        | 0.94 1.57          | 2.42 GB            |                       │\n",
       "╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
       "</pre>\n"
      ],
      "text/plain": [
       "╭────────────────────────────────────────────────────────────── chunk_pos=13 num_tokens=431 doc_items_refs=['#/texts/72', '#/tables/0'] ───────────────────────────────────────────────────────────────╮\n",
       "│ Docling Technical Report                                                                                                                                                                             │\n",
       "│ 4 Performance                                                                                                                                                                                        │\n",
       "│ Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution │\n",
       "│ (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.           │\n",
       "│                                                                                                                                                                                                      │\n",
       "│ | CPU                              | Thread budget   | native backend   | native backend   | native backend   | pypdfium backend   | pypdfium backend   | pypdfium backend   |                       │\n",
       "│ |----------------------------------|-----------------|------------------|------------------|------------------|--------------------|--------------------|--------------------|                       │\n",
       "│ |                                  |                 | TTS              | Pages/s          | Mem              | TTS                | Pages/s            | Mem                |                       │\n",
       "│ | Apple M3 Max                     | 4               | 177 s 167 s      | 1.27 1.34        | 6.20 GB          | 103 s 92 s         | 2.18 2.45          | 2.56 GB            |                       │\n",
       "│ | (16 cores) Intel(R) Xeon E5-2690 | 16 4 16         | 375 s 244 s      | 0.60 0.92        | 6.16 GB          | 239 s 143 s        | 0.94 1.57          | 2.42 GB            |                       │\n",
       "╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from docling_core.transforms.chunker.hierarchical_chunker import (\n",
    "    ChunkingDocSerializer,\n",
    "    ChunkingSerializerProvider,\n",
    ")\n",
    "from docling_core.transforms.serializer.markdown import MarkdownTableSerializer\n",
    "\n",
    "\n",
    "class MDTableSerializerProvider(ChunkingSerializerProvider):\n",
    "    \"\"\"Serializer provider whose serializers use a Markdown table serializer.\"\"\"\n",
    "\n",
    "    def get_serializer(self, doc):\n",
    "        \"\"\"Return a chunking serializer for `doc` with the table serializer swapped.\"\"\"\n",
    "        md_table_serializer = MarkdownTableSerializer()  # configuring a different table serializer\n",
    "        return ChunkingDocSerializer(doc=doc, table_serializer=md_table_serializer)\n",
    "\n",
    "\n",
    "# plug the Markdown table serializer provider into the chunker\n",
    "serializer_provider = MDTableSerializerProvider()\n",
    "chunker = HybridChunker(tokenizer=tokenizer, serializer_provider=serializer_provider)\n",
    "\n",
    "# materialize the chunks so that they can be indexed by position\n",
    "chunk_iter = chunker.chunk(dl_doc=doc)\n",
    "chunks = list(chunk_iter)\n",
    "\n",
    "# locate the first chunk having a table among its doc items and display it\n",
    "i, chunk = find_n_th_chunk_with_label(chunks, n=0, label=DocItemLabel.TABLE)\n",
    "print_chunk(chunks=chunks, chunk_pos=i)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Picture serialization"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Using the default strategy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Below we inspect the first chunk containing a picture.\n",
    "\n",
    "Even when using the default strategy, we can modify the relevant parameters, e.g. which placeholder is used for pictures:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╭───────────────────────────────────────────────── chunk_pos=0 num_tokens=117 doc_items_refs=['#/pictures/0', '#/texts/2', '#/texts/3', '#/texts/4'] ──────────────────────────────────────────────────╮\n",
       "│ Docling Technical Report                                                                                                                                                                             │\n",
       "│ &lt;!-- image --&gt;                                                                                                                                                                                       │\n",
       "│ Version 1.0                                                                                                                                                                                          │\n",
       "│ Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta  │\n",
       "│ Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar                                                                                                  │\n",
       "│ AI4K Group, IBM Research R¨ uschlikon, Switzerland                                                                                                                                                   │\n",
       "╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
       "</pre>\n"
      ],
      "text/plain": [
       "╭───────────────────────────────────────────────── chunk_pos=0 num_tokens=117 doc_items_refs=['#/pictures/0', '#/texts/2', '#/texts/3', '#/texts/4'] ──────────────────────────────────────────────────╮\n",
       "│ Docling Technical Report                                                                                                                                                                             │\n",
       "│ <!-- image -->                                                                                                                                                                                       │\n",
       "│ Version 1.0                                                                                                                                                                                          │\n",
       "│ Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta  │\n",
       "│ Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar                                                                                                  │\n",
       "│ AI4K Group, IBM Research R¨ uschlikon, Switzerland                                                                                                                                                   │\n",
       "╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from docling_core.transforms.serializer.markdown import MarkdownParams\n",
    "\n",
    "\n",
    "class ImgPlaceholderSerializerProvider(ChunkingSerializerProvider):\n",
    "    \"\"\"Serializer provider that sets a custom placeholder for pictures.\"\"\"\n",
    "\n",
    "    def get_serializer(self, doc):\n",
    "        \"\"\"Return a chunking serializer for `doc` with the image placeholder overridden.\"\"\"\n",
    "        md_params = MarkdownParams(image_placeholder=\"<!-- image -->\")\n",
    "        return ChunkingDocSerializer(doc=doc, params=md_params)\n",
    "\n",
    "\n",
    "# plug the placeholder-configuring serializer provider into the chunker\n",
    "serializer_provider = ImgPlaceholderSerializerProvider()\n",
    "chunker = HybridChunker(tokenizer=tokenizer, serializer_provider=serializer_provider)\n",
    "\n",
    "# materialize the chunks so that they can be indexed by position\n",
    "chunk_iter = chunker.chunk(dl_doc=doc)\n",
    "chunks = list(chunk_iter)\n",
    "\n",
    "# locate the first chunk having a picture among its doc items and display it\n",
    "i, chunk = find_n_th_chunk_with_label(chunks, n=0, label=DocItemLabel.PICTURE)\n",
    "print_chunk(chunks=chunks, chunk_pos=i)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Using a custom strategy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Below we define and use our custom picture serialization strategy which leverages picture annotations:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import Any\n",
    "\n",
    "from docling_core.transforms.serializer.base import (\n",
    "    BaseDocSerializer,\n",
    "    SerializationResult,\n",
    ")\n",
    "from docling_core.transforms.serializer.common import create_ser_result\n",
    "from docling_core.transforms.serializer.markdown import MarkdownPictureSerializer\n",
    "from docling_core.types.doc.document import (\n",
    "    PictureClassificationData,\n",
    "    PictureDescriptionData,\n",
    "    PictureItem,\n",
    "    PictureMoleculeData,\n",
    ")\n",
    "from typing_extensions import override\n",
    "\n",
    "\n",
    "class AnnotationPictureSerializer(MarkdownPictureSerializer):\n",
    "    \"\"\"Picture serializer that renders a picture's annotations as text lines.\"\"\"\n",
    "\n",
    "    @override\n",
    "    def serialize(\n",
    "        self,\n",
    "        *,\n",
    "        item: PictureItem,\n",
    "        doc_serializer: BaseDocSerializer,\n",
    "        doc: DoclingDocument,\n",
    "        **kwargs: Any,\n",
    "    ) -> SerializationResult:\n",
    "        \"\"\"Serialize `item` as one text line per supported annotation.\n",
    "\n",
    "        Classification, molecule and description annotations are rendered;\n",
    "        annotations of any other type are skipped. The joined text is passed\n",
    "        through `doc_serializer.post_process` before being wrapped in the result.\n",
    "        \"\"\"\n",
    "        lines: list[str] = []\n",
    "        for ann in item.annotations:\n",
    "            if isinstance(ann, PictureClassificationData):\n",
    "                # report the first predicted class, if there is one\n",
    "                if ann.predicted_classes:\n",
    "                    class_name = ann.predicted_classes[0].class_name\n",
    "                    if class_name is not None:\n",
    "                        lines.append(f\"Picture type: {class_name}\")\n",
    "            elif isinstance(ann, PictureMoleculeData):\n",
    "                lines.append(f\"SMILES: {ann.smi}\")\n",
    "            elif isinstance(ann, PictureDescriptionData):\n",
    "                lines.append(f\"Picture description: {ann.text}\")\n",
    "\n",
    "        joined = doc_serializer.post_process(text=\"\\n\".join(lines))\n",
    "        return create_ser_result(text=joined, span_source=item)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╭───────────────────────────────────────────────── chunk_pos=0 num_tokens=128 doc_items_refs=['#/pictures/0', '#/texts/2', '#/texts/3', '#/texts/4'] ──────────────────────────────────────────────────╮\n",
       "│ Docling Technical Report                                                                                                                                                                             │\n",
       "│ Picture description: In this image we can see a cartoon image of a duck holding a paper.                                                                                                             │\n",
       "│ Version 1.0                                                                                                                                                                                          │\n",
       "│ Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta  │\n",
       "│ Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar                                                                                                  │\n",
       "│ AI4K Group, IBM Research R¨ uschlikon, Switzerland                                                                                                                                                   │\n",
       "╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
       "</pre>\n"
      ],
      "text/plain": [
       "╭───────────────────────────────────────────────── chunk_pos=0 num_tokens=128 doc_items_refs=['#/pictures/0', '#/texts/2', '#/texts/3', '#/texts/4'] ──────────────────────────────────────────────────╮\n",
       "│ Docling Technical Report                                                                                                                                                                             │\n",
       "│ Picture description: In this image we can see a cartoon image of a duck holding a paper.                                                                                                             │\n",
       "│ Version 1.0                                                                                                                                                                                          │\n",
       "│ Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta  │\n",
       "│ Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar                                                                                                  │\n",
       "│ AI4K Group, IBM Research R¨ uschlikon, Switzerland                                                                                                                                                   │\n",
       "╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "class ImgAnnotationSerializerProvider(ChunkingSerializerProvider):\n",
    "    \"\"\"Serializer provider whose serializers use `AnnotationPictureSerializer`.\"\"\"\n",
    "\n",
    "    def get_serializer(self, doc: DoclingDocument):\n",
    "        \"\"\"Return a chunking serializer for `doc` with the picture serializer swapped.\"\"\"\n",
    "        annotation_serializer = AnnotationPictureSerializer()  # configuring a different picture serializer\n",
    "        return ChunkingDocSerializer(doc=doc, picture_serializer=annotation_serializer)\n",
    "\n",
    "\n",
    "# plug the annotation-based picture serializer provider into the chunker\n",
    "serializer_provider = ImgAnnotationSerializerProvider()\n",
    "chunker = HybridChunker(tokenizer=tokenizer, serializer_provider=serializer_provider)\n",
    "\n",
    "# materialize the chunks so that they can be indexed by position\n",
    "chunk_iter = chunker.chunk(dl_doc=doc)\n",
    "chunks = list(chunk_iter)\n",
    "\n",
    "# locate the first chunk having a picture among its doc items and display it\n",
    "i, chunk = find_n_th_chunk_with_label(chunks, n=0, label=DocItemLabel.PICTURE)\n",
    "print_chunk(chunks=chunks, chunk_pos=i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}