{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install -q docling[vlm] ipython"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from docling.datamodel.base_models import InputFormat\n",
"from docling.datamodel.pipeline_options import PdfPipelineOptions\n",
"from docling.document_converter import DocumentConverter, PdfFormatOption"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# The source document\n",
"DOC_SOURCE = \"https://arxiv.org/pdf/2501.17887\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Describe pictures with Granite Vision\n",
"\n",
"This section runs the [ibm-granite/granite-vision-3.1-2b-preview](https://huggingface.co/ibm-granite/granite-vision-3.1-2b-preview) model locally to describe the pictures in the document."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "93a634699bf1434c9bc8e384d6db1a28",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading checkpoint shards: 0%| | 0/2 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from docling.datamodel.pipeline_options import granite_picture_description\n",
"\n",
"# Turn on picture description and select the Granite Vision preset.\n",
"pipeline_options = PdfPipelineOptions()\n",
"pipeline_options.do_picture_description = True\n",
"pipeline_options.picture_description_options = (\n",
"    granite_picture_description  # <-- the model choice\n",
")\n",
"# NOTE(review): this sets `prompt` on the imported preset object, so the change\n",
"# presumably persists for later uses of that preset in the same kernel -- confirm.\n",
"pipeline_options.picture_description_options.prompt = (\n",
"    \"Describe the image in three sentences. Be concise and accurate.\"\n",
")\n",
"pipeline_options.images_scale = 2.0\n",
"pipeline_options.generate_picture_images = True\n",
"\n",
"# Build a converter that applies these options to PDF inputs, then convert DOC_SOURCE.\n",
"converter = DocumentConverter(\n",
"    format_options={\n",
"        InputFormat.PDF: PdfFormatOption(\n",
"            pipeline_options=pipeline_options,\n",
"        )\n",
"    }\n",
")\n",
"doc = converter.convert(DOC_SOURCE).document"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
#/pictures/0
#/pictures/1
#/pictures/2
#/pictures/3
#/pictures/4
{pic.self_ref}
#/pictures/0
#/pictures/1
#/pictures/2
#/pictures/3
#/pictures/4
{pic.self_ref}