docling/examples/rag_llamaindex.ipynb

480 lines
29 KiB
Plaintext
Raw Normal View History

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# RAG with Docling and 🦙 LlamaIndex"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"# requirements for this example:\n",
"%pip install -qq docling docling-core python-dotenv llama-index-embeddings-huggingface llama-index-llms-huggingface-api llama-index-vector-stores-milvus"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os\n",
"from tempfile import TemporaryDirectory\n",
"\n",
"from dotenv import load_dotenv\n",
"from pydantic import TypeAdapter\n",
"from rich.pretty import pprint\n",
"\n",
"load_dotenv()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"\n",
"warnings.filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic|torch\")\n",
"warnings.filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Reader and node parser"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below we set up:\n",
"- a `Reader` which will be used to create LlamaIndex documents, and\n",
"- a `NodeParser`, which will be used to create LlamaIndex nodes out of the documents"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from enum import Enum\n",
"from typing import Iterable\n",
"\n",
"from llama_index.core.readers.base import BasePydanticReader\n",
"from llama_index.core.schema import Document as LIDocument\n",
"from pydantic import BaseModel\n",
"\n",
"from docling.document_converter import DocumentConverter\n",
"\n",
"\n",
"class DocumentMetadata(BaseModel):\n",
" dl_doc_hash: str\n",
"\n",
"\n",
"class DoclingPDFReader(BasePydanticReader):\n",
" class ParseType(str, Enum):\n",
" MARKDOWN = \"markdown\"\n",
" # JSON = \"json\"\n",
"\n",
" parse_type: ParseType = ParseType.MARKDOWN\n",
"\n",
" def lazy_load_data(self, file_path: str | list[str]) -> Iterable[LIDocument]:\n",
" file_paths = file_path if isinstance(file_path, list) else [file_path]\n",
" converter = DocumentConverter()\n",
" for source in file_paths:\n",
" dl_doc = converter.convert_single(source).output\n",
" match self.parse_type:\n",
" case self.ParseType.MARKDOWN:\n",
" text = dl_doc.export_to_markdown()\n",
" # case self.ParseType.JSON:\n",
" # text = dl_doc.model_dump_json()\n",
" case _:\n",
" raise RuntimeError(\n",
" f\"Unexpected parse type encountered: {self.parse_type}\"\n",
" )\n",
" excl_metadata_keys = [\"dl_doc_hash\"]\n",
" li_doc = LIDocument(\n",
" doc_id=dl_doc.file_info.document_hash,\n",
" text=text,\n",
" excluded_embed_metadata_keys=excl_metadata_keys,\n",
" excluded_llm_metadata_keys=excl_metadata_keys,\n",
" )\n",
" li_doc.metadata = DocumentMetadata(\n",
" dl_doc_hash=dl_doc.file_info.document_hash,\n",
" ).model_dump()\n",
" yield li_doc"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core.node_parser import MarkdownNodeParser\n",
"\n",
"reader = DoclingPDFReader(parse_type=DoclingPDFReader.ParseType.MARKDOWN)\n",
"node_parser = MarkdownNodeParser()\n",
"transformations = [node_parser]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"One can include add more transformations, e.g. further chunking based on text size / overlap, as shown below:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# from llama_index.core.node_parser import TokenTextSplitter\n",
"\n",
"# splitter = TokenTextSplitter(\n",
"# chunk_size=1024,\n",
"# chunk_overlap=20,\n",
"# )\n",
"# transformations.append(splitter)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Embed model"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
"\n",
"embed_model = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Vector store"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"INGEST = True # whether to ingest from scratch or reuse an existing vector store"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
]
}
],
"source": [
"from llama_index.vector_stores.milvus import MilvusVectorStore\n",
"\n",
"MILVUS_URL = os.environ.get(\n",
" \"MILVUS_URL\", f\"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db\"\n",
")\n",
"MILVUS_COLL_NAME = os.environ.get(\"MILVUS_COLL_NAME\", \"basic_llamaindex_pipeline\")\n",
"MILVUS_KWARGS = TypeAdapter(dict).validate_json(os.environ.get(\"MILVUS_KWARGS\", \"{}\"))\n",
"vector_store = MilvusVectorStore(\n",
" uri=MILVUS_URL,\n",
" collection_name=MILVUS_COLL_NAME,\n",
" dim=len(embed_model.get_text_embedding(\"hi\")),\n",
" overwrite=INGEST,\n",
" **MILVUS_KWARGS,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "536daee038de4d52a793445c6d853c72",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Fetching 7 files: 0%| | 0/7 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">[</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">Document</span><span style=\"font-weight: bold\">(</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">id_</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'</span>+<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">14</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">embedding</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span>=<span style=\"font-weight: bold\">{</span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'</span>+<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">14</span><span style=\"font-weight: bold\">}</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_embed_metadata_keys</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span><span style=\"font-weight: bold\">]</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_llm_metadata_keys</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span><span style=\"font-weight: bold\">]</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">relationships</span>=<span style=\"font-weight: bold\">{}</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'## DocLayNet: A Large Human-Annotated Dataset for '</span>+<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">50593</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">mimetype</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'text/plain'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">start_char_idx</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">end_char_idx</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text_template</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'{metadata_str}\\n\\n{content}'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata_template</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'{key}: {value}'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata_seperator</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'\\n'</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"font-weight: bold\">)</span>\n",
"<span style=\"font-weight: bold\">]</span>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1m[\u001b[0m\n",
"\u001b[2;32m│ \u001b[0m\u001b[1;35mDocument\u001b[0m\u001b[1m(\u001b[0m\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[33mid_\u001b[0m=\u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'\u001b[0m+\u001b[1;36m14\u001b[0m,\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[33membedding\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c84663'\u001b[0m+\u001b[1;36m14\u001b[0m\u001b[1m}\u001b[0m,\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m\u001b[1m]\u001b[0m,\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m\u001b[1m]\u001b[0m,\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[33mrelationships\u001b[0m=\u001b[1m{\u001b[0m\u001b[1m}\u001b[0m,\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'## DocLayNet: A Large Human-Annotated Dataset for '\u001b[0m+\u001b[1;36m50593\u001b[0m,\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[33mmimetype\u001b[0m=\u001b[32m'text/plain'\u001b[0m,\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[33mstart_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[33mend_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[33mtext_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mmetadata_str\u001b[0m\u001b[32m}\u001b[0m\u001b[32m\\n\\n\u001b[0m\u001b[32m{\u001b[0m\u001b[32mcontent\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[33mmetadata_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mkey\u001b[0m\u001b[32m}\u001b[0m\u001b[32m: \u001b[0m\u001b[32m{\u001b[0m\u001b[32mvalue\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[33mmetadata_seperator\u001b[0m=\u001b[32m'\\n'\u001b[0m\n",
"\u001b[2;32m│ \u001b[0m\u001b[1m)\u001b[0m\n",
"\u001b[1m]\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from llama_index.core import StorageContext, VectorStoreIndex\n",
"\n",
"if INGEST:\n",
" # in this case we ingest the data into the vector store\n",
" docs = reader.load_data(\n",
" file_path=\"https://arxiv.org/pdf/2206.01062\", # DocLayNet paper\n",
" )\n",
" pprint(docs, max_length=1, max_string=50, max_depth=4)\n",
" storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
" index = VectorStoreIndex.from_documents(\n",
" documents=docs,\n",
" embed_model=embed_model,\n",
" storage_context=storage_context,\n",
" transformations=transformations,\n",
" )\n",
"else:\n",
" # in this case we just load the vector store index\n",
" index = VectorStoreIndex.from_vector_store(\n",
" vector_store=vector_store,\n",
" embed_model=embed_model,\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### LLM"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n",
"\n",
"HF_API_KEY = os.environ.get(\"HF_API_KEY\")\n",
"\n",
"llm = HuggingFaceInferenceAPI(\n",
" token=HF_API_KEY,\n",
" model_name=\"mistralai/Mistral-7B-Instruct-v0.3\",\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## RAG"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">Response</span><span style=\"font-weight: bold\">(</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">response</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'80863 pages were human annotated.'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">source_nodes</span>=<span style=\"font-weight: bold\">[</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">NodeWithScore</span><span style=\"font-weight: bold\">(</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">node</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">TextNode</span><span style=\"font-weight: bold\">(</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">id_</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'8874a117-d181-4f4f-a30b-0b5604370d77'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">embedding</span>=<span style=\"color: #800080; text-decoration-color: #800080; font-style: italic\">None</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span>=<span style=\"font-weight: bold\">{</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"font-weight: bold\">}</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_embed_metadata_keys</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"font-weight: bold\">]</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">excluded_llm_metadata_keys</span>=<span style=\"font-weight: bold\">[</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"font-weight: bold\">]</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">relationships</span>=<span style=\"font-weight: bold\">{</span><span style=\"color: #808000; text-decoration-color: #808000\">...</span><span style=\"font-weight: bold\">}</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'3 THE DOCLAYNET DATASET\\n\\nDocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. The annotations provide layout information in the shape o'</span>+<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">5775</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">mimetype</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'text/plain'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">start_char_idx</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">9089</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">end_char_idx</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">15114</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">text_template</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'{metadata_str}\\n\\n{content}'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata_template</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'{key}: {value}'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata_seperator</span>=<span style=\"color: #008000; text-decoration-color: #008000\">'\\n'</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"font-weight: bold\">)</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">score</span>=<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.7367570400238037</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"font-weight: bold\">)</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">...</span> +<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"font-weight: bold\">]</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">metadata</span>=<span style=\"font-weight: bold\">{</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'8874a117-d181-4f4f-a30b-0b5604370d77'</span>: <span style=\"font-weight: bold\">{</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'dl_doc_hash'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">...</span> +<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"font-weight: bold\">}</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">...</span> +<span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"font-weight: bold\">}</span>\n",
"<span style=\"font-weight: bold\">)</span>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1;35mResponse\u001b[0m\u001b[1m(\u001b[0m\n",
"\u001b[2;32m│ \u001b[0m\u001b[33mresponse\u001b[0m=\u001b[32m'80863 pages were human annotated.'\u001b[0m,\n",
"\u001b[2;32m│ \u001b[0m\u001b[33msource_nodes\u001b[0m=\u001b[1m[\u001b[0m\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[1;35mNodeWithScore\u001b[0m\u001b[1m(\u001b[0m\n",
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mnode\u001b[0m=\u001b[1;35mTextNode\u001b[0m\u001b[1m(\u001b[0m\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mid_\u001b[0m=\u001b[32m'8874a117-d181-4f4f-a30b-0b5604370d77'\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33membedding\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1m}\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[33m...\u001b[0m\u001b[1m]\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[33m...\u001b[0m\u001b[1m]\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mrelationships\u001b[0m=\u001b[1m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1m}\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'3 THE DOCLAYNET DATASET\\n\\nDocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. The annotations provide layout information in the shape o'\u001b[0m+\u001b[1;36m5775\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmimetype\u001b[0m=\u001b[32m'text/plain'\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mstart_char_idx\u001b[0m=\u001b[1;36m9089\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mend_char_idx\u001b[0m=\u001b[1;36m15114\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mmetadata_str\u001b[0m\u001b[32m}\u001b[0m\u001b[32m\\n\\n\u001b[0m\u001b[32m{\u001b[0m\u001b[32mcontent\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mkey\u001b[0m\u001b[32m}\u001b[0m\u001b[32m: \u001b[0m\u001b[32m{\u001b[0m\u001b[32mvalue\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata_seperator\u001b[0m=\u001b[32m'\\n'\u001b[0m\n",
"\u001b[2;32m│ │ │ \u001b[0m\u001b[1m)\u001b[0m,\n",
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mscore\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.7367570400238037\u001b[0m\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m,\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\n",
"\u001b[2;32m│ \u001b[0m\u001b[1m]\u001b[0m,\n",
"\u001b[2;32m│ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[32m'8874a117-d181-4f4f-a30b-0b5604370d77'\u001b[0m: \u001b[1m{\u001b[0m\n",
"\u001b[2;32m│ │ │ \u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m: \u001b[32m'5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc'\u001b[0m,\n",
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[1m}\u001b[0m,\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\n",
"\u001b[2;32m│ \u001b[0m\u001b[1m}\u001b[0m\n",
"\u001b[1m)\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"query_engine = index.as_query_engine(llm=llm)\n",
"query_res = query_engine.query(\"How many pages were human annotated?\")\n",
"pprint(query_res, max_length=1, max_string=250, max_depth=4)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}