"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"file_path = (\n",
" \"https://upload.wikimedia.org/wikipedia/commons/9/9f/Swiss_QR-Bill_example.jpg\"\n",
")\n",
"display.HTML(f\"
\")"
]
},
{
"cell_type": "markdown",
"id": "dbbc173c",
"metadata": {},
"source": [
"## Defining the extractor"
]
},
{
"cell_type": "markdown",
"id": "e9871c8d",
"metadata": {},
"source": [
"Let's first define our extractor:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7a8c6ff0",
"metadata": {},
"outputs": [],
"source": [
"from docling.datamodel.base_models import InputFormat\n",
"from docling.document_extractor import DocumentExtractor\n",
"\n",
"extractor = DocumentExtractor(allowed_formats=[InputFormat.IMAGE, InputFormat.PDF])"
]
},
{
"cell_type": "markdown",
"id": "e2b1933e",
"metadata": {},
"source": [
"Following, we look at different ways to define the data template."
]
},
{
"cell_type": "markdown",
"id": "20e62dfd",
"metadata": {},
"source": [
"## Using a string template"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "4c5119b0",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/pva/work/github.com/DS4SD/docling/docling/document_extractor.py:143: UserWarning: The extract API is currently experimental and may change without prior notice.\n",
"Only PDF and image formats are supported.\n",
" return next(all_res)\n",
"You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.\n",
"The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.\n"
]
},
{
"data": {
"text/html": [
"[\n",
" ExtractedPageData(\n",
" page_no=1,\n",
" extracted_data={'bill_no': '3139', 'total': 3949.75},\n",
" raw_text='{\"bill_no\": \"3139\", \"total\": 3949.75}',\n",
" errors=[]\n",
" )\n",
"]\n",
"
\n"
],
"text/plain": [
"\u001b[1m[\u001b[0m\n",
" \u001b[1;35mExtractedPageData\u001b[0m\u001b[1m(\u001b[0m\n",
" \u001b[33mpage_no\u001b[0m=\u001b[1;36m1\u001b[0m,\n",
" \u001b[33mextracted_data\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'bill_no'\u001b[0m: \u001b[32m'3139'\u001b[0m, \u001b[32m'total'\u001b[0m: \u001b[1;36m3949.75\u001b[0m\u001b[1m}\u001b[0m,\n",
" \u001b[33mraw_text\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"bill_no\": \"3139\", \"total\": 3949.75\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n",
" \u001b[33merrors\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
" \u001b[1m)\u001b[0m\n",
"\u001b[1m]\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"result = extractor.extract(\n",
" source=file_path,\n",
" template='{\"bill_no\": \"string\", \"total\": \"float\"}',\n",
")\n",
"print(result.pages)"
]
},
{
"cell_type": "markdown",
"id": "0da85c9c",
"metadata": {},
"source": [
"## Using a dict template"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e0df82f6",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.\n"
]
},
{
"data": {
"text/html": [
"[\n",
" ExtractedPageData(\n",
" page_no=1,\n",
" extracted_data={'bill_no': '3139', 'total': 3949.75},\n",
" raw_text='{\"bill_no\": \"3139\", \"total\": 3949.75}',\n",
" errors=[]\n",
" )\n",
"]\n",
"
\n"
],
"text/plain": [
"\u001b[1m[\u001b[0m\n",
" \u001b[1;35mExtractedPageData\u001b[0m\u001b[1m(\u001b[0m\n",
" \u001b[33mpage_no\u001b[0m=\u001b[1;36m1\u001b[0m,\n",
" \u001b[33mextracted_data\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'bill_no'\u001b[0m: \u001b[32m'3139'\u001b[0m, \u001b[32m'total'\u001b[0m: \u001b[1;36m3949.75\u001b[0m\u001b[1m}\u001b[0m,\n",
" \u001b[33mraw_text\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"bill_no\": \"3139\", \"total\": 3949.75\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n",
" \u001b[33merrors\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
" \u001b[1m)\u001b[0m\n",
"\u001b[1m]\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"result = extractor.extract(\n",
" source=file_path,\n",
" template={\n",
" \"bill_no\": \"string\",\n",
" \"total\": \"float\",\n",
" },\n",
")\n",
"print(result.pages)"
]
},
{
"cell_type": "markdown",
"id": "925c1804",
"metadata": {},
"source": [
"## Using a Pydantic model template"
]
},
{
"cell_type": "markdown",
"id": "01aee19d",
"metadata": {},
"source": [
"First we define the Pydantic model we want to use"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "69facb7b",
"metadata": {},
"outputs": [],
"source": [
"from typing import Optional\n",
"\n",
"\n",
"class Invoice(BaseModel):\n",
" bill_no: str = Field(\n",
" examples=[\"A123\", \"5414\"]\n",
" ) # provide some examples, but no default value\n",
" total: float = Field(\n",
" default=10, examples=[20]\n",
" ) # provide some examples and a default value\n",
" tax_id: Optional[str] = Field(default=None, examples=[\"1234567890\"])"
]
},
{
"cell_type": "markdown",
"id": "fbcbce95",
"metadata": {},
"source": [
"The class itself can then be used directly as the template: "
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "81db63b1",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.\n"
]
},
{
"data": {
"text/html": [
"[\n",
" ExtractedPageData(\n",
" page_no=1,\n",
" extracted_data={'bill_no': '3139', 'total': 3949.75, 'tax_id': None},\n",
" raw_text='{\"bill_no\": \"3139\", \"total\": 3949.75, \"tax_id\": null}',\n",
" errors=[]\n",
" )\n",
"]\n",
"
\n"
],
"text/plain": [
"\u001b[1m[\u001b[0m\n",
" \u001b[1;35mExtractedPageData\u001b[0m\u001b[1m(\u001b[0m\n",
" \u001b[33mpage_no\u001b[0m=\u001b[1;36m1\u001b[0m,\n",
" \u001b[33mextracted_data\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'bill_no'\u001b[0m: \u001b[32m'3139'\u001b[0m, \u001b[32m'total'\u001b[0m: \u001b[1;36m3949.75\u001b[0m, \u001b[32m'tax_id'\u001b[0m: \u001b[3;35mNone\u001b[0m\u001b[1m}\u001b[0m,\n",
" \u001b[33mraw_text\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"bill_no\": \"3139\", \"total\": 3949.75, \"tax_id\": null\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n",
" \u001b[33merrors\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
" \u001b[1m)\u001b[0m\n",
"\u001b[1m]\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"result = extractor.extract(\n",
" source=file_path,\n",
" template=Invoice,\n",
")\n",
"print(result.pages)"
]
},
{
"cell_type": "markdown",
"id": "2bd8736b",
"metadata": {},
"source": [
"Alternatively, a Pydantic model instance can be passed as a template instead, allowing to override the default values.\n",
"\n",
"This can be very useful in scenarios where we happen to have available context that is more relevant than the\n",
"default values predefined in the model definition.\n",
"\n",
"E.g. in the example below:\n",
"- `bill_no` and `total` are actually set from the value extracted from the data,\n",
"- there was no `tax_id` to be extracted, so the updated default we provided was applied"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "b531a20d",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.\n"
]
},
{
"data": {
"text/html": [
"[\n",
" ExtractedPageData(\n",
" page_no=1,\n",
" extracted_data={'bill_no': '3139', 'total': 3949.75, 'tax_id': '42'},\n",
" raw_text='{\"bill_no\": \"3139\", \"total\": 3949.75, \"tax_id\": \"42\"}',\n",
" errors=[]\n",
" )\n",
"]\n",
"
\n"
],
"text/plain": [
"\u001b[1m[\u001b[0m\n",
" \u001b[1;35mExtractedPageData\u001b[0m\u001b[1m(\u001b[0m\n",
" \u001b[33mpage_no\u001b[0m=\u001b[1;36m1\u001b[0m,\n",
" \u001b[33mextracted_data\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'bill_no'\u001b[0m: \u001b[32m'3139'\u001b[0m, \u001b[32m'total'\u001b[0m: \u001b[1;36m3949.75\u001b[0m, \u001b[32m'tax_id'\u001b[0m: \u001b[32m'42'\u001b[0m\u001b[1m}\u001b[0m,\n",
" \u001b[33mraw_text\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"bill_no\": \"3139\", \"total\": 3949.75, \"tax_id\": \"42\"\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n",
" \u001b[33merrors\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
" \u001b[1m)\u001b[0m\n",
"\u001b[1m]\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"result = extractor.extract(\n",
" source=file_path,\n",
" template=Invoice(\n",
" bill_no=\"41\",\n",
" total=100,\n",
" tax_id=\"42\",\n",
" ),\n",
")\n",
"print(result.pages)"
]
},
{
"cell_type": "markdown",
"id": "dc38e143",
"metadata": {},
"source": [
"### Advanced Pydantic model"
]
},
{
"cell_type": "markdown",
"id": "5a1ee898",
"metadata": {},
"source": [
"Besides a flat template, we can in principle use any Pydantic model, thus leveraging reuse and being able to capture\n",
"hierarchies:"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "dca8289a",
"metadata": {},
"outputs": [],
"source": [
"class Contact(BaseModel):\n",
" name: Optional[str] = Field(default=None, examples=[\"Smith\"])\n",
" address: str = Field(default=\"123 Main St\", examples=[\"456 Elm St\"])\n",
" postal_code: str = Field(default=\"12345\", examples=[\"67890\"])\n",
" city: str = Field(default=\"Anytown\", examples=[\"Othertown\"])\n",
" country: Optional[str] = Field(default=None, examples=[\"Canada\"])\n",
"\n",
"\n",
"class ExtendedInvoice(BaseModel):\n",
" bill_no: str = Field(\n",
" examples=[\"A123\", \"5414\"]\n",
" ) # provide some examples, but not the actual value of the test sample\n",
" total: float = Field(\n",
" default=10, examples=[20]\n",
" ) # provide a default value and some examples\n",
" garden_work_hours: int = Field(default=1, examples=[2])\n",
" sender: Contact = Field(default=Contact(), examples=[Contact()])\n",
" receiver: Contact = Field(default=Contact(), examples=[Contact()])"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "5896662d",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.\n"
]
},
{
"data": {
"text/html": [
"[\n",
" ExtractedPageData(\n",
" page_no=1,\n",
" extracted_data={\n",
" 'bill_no': '3139',\n",
" 'total': 3949.75,\n",
" 'garden_work_hours': 28,\n",
" 'sender': {\n",
" 'name': 'Robert Schneider',\n",
" 'address': 'Rue du Lac 1268',\n",
" 'postal_code': '2501',\n",
" 'city': 'Biel',\n",
" 'country': 'Switzerland'\n",
" },\n",
" 'receiver': {\n",
" 'name': 'Pia Rutschmann',\n",
" 'address': 'Marktgasse 28',\n",
" 'postal_code': '9400',\n",
" 'city': 'Rorschach',\n",
" 'country': 'Switzerland'\n",
" }\n",
" },\n",
" raw_text='{\"bill_no\": \"3139\", \"total\": 3949.75, \"garden_work_hours\": 28, \"sender\": {\"name\": \"Robert \n",
"Schneider\", \"address\": \"Rue du Lac 1268\", \"postal_code\": \"2501\", \"city\": \"Biel\", \"country\": \"Switzerland\"}, \n",
"\"receiver\": {\"name\": \"Pia Rutschmann\", \"address\": \"Marktgasse 28\", \"postal_code\": \"9400\", \"city\": \"Rorschach\", \n",
"\"country\": \"Switzerland\"}}',\n",
" errors=[]\n",
" )\n",
"]\n",
"
\n"
],
"text/plain": [
"\u001b[1m[\u001b[0m\n",
" \u001b[1;35mExtractedPageData\u001b[0m\u001b[1m(\u001b[0m\n",
" \u001b[33mpage_no\u001b[0m=\u001b[1;36m1\u001b[0m,\n",
" \u001b[33mextracted_data\u001b[0m=\u001b[1m{\u001b[0m\n",
" \u001b[32m'bill_no'\u001b[0m: \u001b[32m'3139'\u001b[0m,\n",
" \u001b[32m'total'\u001b[0m: \u001b[1;36m3949.75\u001b[0m,\n",
" \u001b[32m'garden_work_hours'\u001b[0m: \u001b[1;36m28\u001b[0m,\n",
" \u001b[32m'sender'\u001b[0m: \u001b[1m{\u001b[0m\n",
" \u001b[32m'name'\u001b[0m: \u001b[32m'Robert Schneider'\u001b[0m,\n",
" \u001b[32m'address'\u001b[0m: \u001b[32m'Rue du Lac 1268'\u001b[0m,\n",
" \u001b[32m'postal_code'\u001b[0m: \u001b[32m'2501'\u001b[0m,\n",
" \u001b[32m'city'\u001b[0m: \u001b[32m'Biel'\u001b[0m,\n",
" \u001b[32m'country'\u001b[0m: \u001b[32m'Switzerland'\u001b[0m\n",
" \u001b[1m}\u001b[0m,\n",
" \u001b[32m'receiver'\u001b[0m: \u001b[1m{\u001b[0m\n",
" \u001b[32m'name'\u001b[0m: \u001b[32m'Pia Rutschmann'\u001b[0m,\n",
" \u001b[32m'address'\u001b[0m: \u001b[32m'Marktgasse 28'\u001b[0m,\n",
" \u001b[32m'postal_code'\u001b[0m: \u001b[32m'9400'\u001b[0m,\n",
" \u001b[32m'city'\u001b[0m: \u001b[32m'Rorschach'\u001b[0m,\n",
" \u001b[32m'country'\u001b[0m: \u001b[32m'Switzerland'\u001b[0m\n",
" \u001b[1m}\u001b[0m\n",
" \u001b[1m}\u001b[0m,\n",
" \u001b[33mraw_text\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"bill_no\": \"3139\", \"total\": 3949.75, \"garden_work_hours\": 28, \"sender\": \u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"name\": \"Robert \u001b[0m\n",
"\u001b[32mSchneider\", \"address\": \"Rue du Lac 1268\", \"postal_code\": \"2501\", \"city\": \"Biel\", \"country\": \"Switzerland\"\u001b[0m\u001b[32m}\u001b[0m\u001b[32m, \u001b[0m\n",
"\u001b[32m\"receiver\": \u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"name\": \"Pia Rutschmann\", \"address\": \"Marktgasse 28\", \"postal_code\": \"9400\", \"city\": \"Rorschach\", \u001b[0m\n",
"\u001b[32m\"country\": \"Switzerland\"\u001b[0m\u001b[32m}\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n",
" \u001b[33merrors\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
" \u001b[1m)\u001b[0m\n",
"\u001b[1m]\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"result = extractor.extract(\n",
" source=file_path,\n",
" template=ExtendedInvoice,\n",
")\n",
"print(result.pages)"
]
},
{
"cell_type": "markdown",
"id": "e873f65d",
"metadata": {},
"source": [
"### Validating and loading the extracted data"
]
},
{
"cell_type": "markdown",
"id": "080991f6",
"metadata": {},
"source": [
"The generated response data can be easily validated and loaded via Pydantic:"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "a015bf60",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"ExtendedInvoice(\n",
" bill_no='3139',\n",
" total=3949.75,\n",
" garden_work_hours=28,\n",
" sender=Contact(\n",
" name='Robert Schneider',\n",
" address='Rue du Lac 1268',\n",
" postal_code='2501',\n",
" city='Biel',\n",
" country='Switzerland'\n",
" ),\n",
" receiver=Contact(\n",
" name='Pia Rutschmann',\n",
" address='Marktgasse 28',\n",
" postal_code='9400',\n",
" city='Rorschach',\n",
" country='Switzerland'\n",
" )\n",
")\n",
"
\n"
],
"text/plain": [
"\u001b[1;35mExtendedInvoice\u001b[0m\u001b[1m(\u001b[0m\n",
" \u001b[33mbill_no\u001b[0m=\u001b[32m'3139'\u001b[0m,\n",
" \u001b[33mtotal\u001b[0m=\u001b[1;36m3949\u001b[0m\u001b[1;36m.75\u001b[0m,\n",
" \u001b[33mgarden_work_hours\u001b[0m=\u001b[1;36m28\u001b[0m,\n",
" \u001b[33msender\u001b[0m=\u001b[1;35mContact\u001b[0m\u001b[1m(\u001b[0m\n",
" \u001b[33mname\u001b[0m=\u001b[32m'Robert Schneider'\u001b[0m,\n",
" \u001b[33maddress\u001b[0m=\u001b[32m'Rue du Lac 1268'\u001b[0m,\n",
" \u001b[33mpostal_code\u001b[0m=\u001b[32m'2501'\u001b[0m,\n",
" \u001b[33mcity\u001b[0m=\u001b[32m'Biel'\u001b[0m,\n",
" \u001b[33mcountry\u001b[0m=\u001b[32m'Switzerland'\u001b[0m\n",
" \u001b[1m)\u001b[0m,\n",
" \u001b[33mreceiver\u001b[0m=\u001b[1;35mContact\u001b[0m\u001b[1m(\u001b[0m\n",
" \u001b[33mname\u001b[0m=\u001b[32m'Pia Rutschmann'\u001b[0m,\n",
" \u001b[33maddress\u001b[0m=\u001b[32m'Marktgasse 28'\u001b[0m,\n",
" \u001b[33mpostal_code\u001b[0m=\u001b[32m'9400'\u001b[0m,\n",
" \u001b[33mcity\u001b[0m=\u001b[32m'Rorschach'\u001b[0m,\n",
" \u001b[33mcountry\u001b[0m=\u001b[32m'Switzerland'\u001b[0m\n",
" \u001b[1m)\u001b[0m\n",
"\u001b[1m)\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"invoice = ExtendedInvoice.model_validate(result.pages[0].extracted_data)\n",
"print(invoice)"
]
},
{
"cell_type": "markdown",
"id": "ae593926",
"metadata": {},
"source": [
"This way, we can get from completely unstructured data to a very structured and developer-friendly representation:"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "32844e40",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"Invoice #3139 was sent by Robert Schneider to Pia Rutschmann at Rue du Lac 1268.\n",
"
\n"
],
"text/plain": [
"Invoice #\u001b[1;36m3139\u001b[0m was sent by Robert Schneider to Pia Rutschmann at Rue du Lac \u001b[1;36m1268\u001b[0m.\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"print(\n",
" f\"Invoice #{invoice.bill_no} was sent by {invoice.sender.name} \"\n",
" f\"to {invoice.receiver.name} at {invoice.sender.address}.\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6c1dbe41",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}