docling/examples/rag_langchain.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "# requirements for this example:\n",
    "%pip install -qq docling docling-core python-dotenv langchain-text-splitters langchain-huggingface langchain-milvus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "load_dotenv()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "\n",
    "warnings.filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic|torch\")\n",
    "warnings.filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Loader and splitter"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Below we set up:\n",
    "- a `Loader` which will be used to create LangChain documents, and\n",
    "- a splitter, which will be used to split these documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from enum import Enum\n",
    "from typing import Iterator\n",
    "\n",
    "from langchain_core.document_loaders import BaseLoader\n",
    "from langchain_core.documents import Document as LCDocument\n",
    "from pydantic import BaseModel\n",
    "\n",
    "from docling.document_converter import DocumentConverter\n",
    "\n",
    "\n",
    "class DocumentMetadata(BaseModel):\n",
    "    \"\"\"Metadata attached to every LangChain document the loader emits.\"\"\"\n",
    "\n",
    "    # Document hash taken from the converted Docling document's file info.\n",
    "    dl_doc_hash: str\n",
    "    # A `source: str` field is intentionally not part of the model yet.\n",
    "\n",
    "\n",
    "class DoclingPDFLoader(BaseLoader):\n",
    "    \"\"\"LangChain document loader backed by Docling's `DocumentConverter`.\n",
    "\n",
    "    Every configured source is converted on demand and emitted as a single\n",
    "    LangChain document whose metadata carries the Docling document hash.\n",
    "    \"\"\"\n",
    "\n",
    "    class ParseType(str, Enum):\n",
    "        \"\"\"Export format used for the page content of the emitted documents.\"\"\"\n",
    "\n",
    "        MARKDOWN = \"markdown\"\n",
    "        # JSON = \"json\"\n",
    "\n",
    "    def __init__(\n",
    "        self,\n",
    "        file_path: str | list[str],\n",
    "        parse_type: ParseType = ParseType.MARKDOWN,\n",
    "    ) -> None:\n",
    "        \"\"\"Remember the sources to load and create the converter.\n",
    "\n",
    "        Args:\n",
    "            file_path: A single source or a list of sources; each one is handed\n",
    "                to the converter separately.\n",
    "            parse_type: Export format for the page content. Defaults to\n",
    "                Markdown, the only format currently supported.\n",
    "        \"\"\"\n",
    "        self._file_paths = file_path if isinstance(file_path, list) else [file_path]\n",
    "        self._parse_type = parse_type\n",
    "        self._converter = DocumentConverter()\n",
    "\n",
    "    def lazy_load(self) -> Iterator[LCDocument]:\n",
    "        \"\"\"Convert each source and yield it as one LangChain document.\n",
    "\n",
    "        Yields:\n",
    "            A document holding the Markdown export as page content and the\n",
    "            Docling document hash as metadata.\n",
    "\n",
    "        Raises:\n",
    "            RuntimeError: If the configured parse type is not supported.\n",
    "        \"\"\"\n",
    "        for source in self._file_paths:\n",
    "            # Check the parse type first so that an unsupported request fails\n",
    "            # before the source is converted rather than after.\n",
    "            if self._parse_type != self.ParseType.MARKDOWN:\n",
    "                raise RuntimeError(\n",
    "                    f\"Unexpected parse type encountered: {self._parse_type}\"\n",
    "                )\n",
    "            dl_doc = self._converter.convert_single(source).output\n",
    "            text = dl_doc.export_to_markdown()\n",
    "            yield LCDocument(\n",
    "                page_content=text,\n",
    "                metadata=DocumentMetadata(\n",
    "                    dl_doc_hash=dl_doc.file_info.document_hash,\n",
    "                ).model_dump(),\n",
    "            )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "FILE_PATH = \"https://arxiv.org/pdf/2206.01062\"  # DocLayNet paper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1b38d07d5fed4618a44ecf261e1e5c44",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
    "\n",
    "loader = DoclingPDFLoader(\n",
    "    file_path=FILE_PATH,\n",
    "    parse_type=DoclingPDFLoader.ParseType.MARKDOWN,\n",
    ")\n",
    "text_splitter = RecursiveCharacterTextSplitter(\n",
    "    chunk_size=1000,\n",
    "    chunk_overlap=200,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We now use the above-defined objects to get the document splits:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "docs = loader.load()\n",
    "splits = text_splitter.split_documents(docs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_huggingface.embeddings import HuggingFaceEmbeddings\n",
    "\n",
    "HF_EMBED_MODEL_ID = \"BAAI/bge-small-en-v1.5\"\n",
    "embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Vector store"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tempfile import TemporaryDirectory\n",
    "\n",
    "from langchain_milvus import Milvus\n",
    "\n",
    "# Use the configured Milvus endpoint when there is one. `MILVUS_URI` matches the\n",
    "# constant and the `uri` connection argument; the previously read `MILVUS_URL`\n",
    "# is still honored so existing environments keep working.\n",
    "MILVUS_URI = os.environ.get(\"MILVUS_URI\") or os.environ.get(\"MILVUS_URL\")\n",
    "if not MILVUS_URI:\n",
    "    # No endpoint configured: fall back to a local database file. `tmp_dir` stays\n",
    "    # bound at module level because the directory is removed once the\n",
    "    # TemporaryDirectory object is destroyed.\n",
    "    tmp_dir = TemporaryDirectory()\n",
    "    MILVUS_URI = f\"{tmp_dir.name}/milvus_demo.db\"\n",
    "\n",
    "vectorstore = Milvus.from_documents(\n",
    "    splits,\n",
    "    embeddings,\n",
    "    connection_args={\"uri\": MILVUS_URI},\n",
    "    drop_old=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### LLM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n",
      "Token is valid (permission: write).\n",
      "Your token has been saved to /Users/pva/.cache/huggingface/token\n",
      "Login successful\n"
     ]
    }
   ],
   "source": [
    "from langchain_huggingface import HuggingFaceEndpoint\n",
    "\n",
    "HF_API_KEY = os.environ.get(\"HF_API_KEY\")\n",
    "HF_LLM_MODEL_ID = \"mistralai/Mistral-7B-Instruct-v0.3\"\n",
    "\n",
    "llm = HuggingFaceEndpoint(\n",
    "    repo_id=HF_LLM_MODEL_ID,\n",
    "    huggingfacehub_api_token=HF_API_KEY,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## RAG"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import Iterable\n",
    "\n",
    "from langchain_core.documents import Document as LCDocument\n",
    "from langchain_core.output_parsers import StrOutputParser\n",
    "from langchain_core.prompts import PromptTemplate\n",
    "from langchain_core.runnables import RunnablePassthrough\n",
    "\n",
    "\n",
    "def format_docs(docs: Iterable[LCDocument]):\n",
    "    \"\"\"Join the page content of `docs` into one string, separated by blank lines.\"\"\"\n",
    "    page_texts = [lc_doc.page_content for lc_doc in docs]\n",
    "    return \"\\n\\n\".join(page_texts)\n",
    "\n",
    "\n",
    "retriever = vectorstore.as_retriever()\n",
    "\n",
    "prompt = PromptTemplate.from_template(\n",
    "    \"Context information is below.\\n---------------------\\n{context}\\n---------------------\\nGiven the context information and not prior knowledge, answer the query.\\nQuery: {question}\\nAnswer:\\n\"\n",
    ")\n",
    "\n",
    "rag_chain = (\n",
    "    {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n",
    "    | prompt\n",
    "    | llm\n",
    "    | StrOutputParser()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'The human annotation of DocLayNet was performed on 80863 pages.\\n\\nExplanation:\\nThe information is found in the paragraph \"DocLayNet contains 80863 PDF pages\" in the context.'"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rag_chain.invoke(\"How many pages were human annotated for DocLayNet?\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
docs: showcase RAG with LlamaIndex and LangChain (#71) Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> 2024-09-11 15:07:08 +02:00			`{`
			`"cells": [`
			`{`
			`"cell_type": "code",`
			`"execution_count": 1,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"Note: you may need to restart the kernel to use updated packages.\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"# requirements for this example:\n",`
			`"%pip install -qq docling docling-core python-dotenv langchain-text-splitters langchain-huggingface langchain-milvus"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 2,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"data": {`
			`"text/plain": [`
			`"True"`
			`]`
			`},`
			`"execution_count": 2,`
			`"metadata": {},`
			`"output_type": "execute_result"`
			`}`
			`],`
			`"source": [`
			`"import os\n",`
			`"\n",`
			`"from dotenv import load_dotenv\n",`
			`"\n",`
			`"load_dotenv()"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 3,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"import warnings\n",`
			`"\n",`
			`"warnings.filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic\|torch\")\n",`
			`"warnings.filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"## Setup"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"### Loader and splitter"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"Below we set up:\n",`
			"- a `Loader` which will be used to create LangChain documents, and\n",
			`"- a splitter, which will be used to split these documents"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 4,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"from enum import Enum\n",`
			`"from typing import Iterator\n",`
			`"\n",`
			`"from langchain_core.document_loaders import BaseLoader\n",`
			`"from langchain_core.documents import Document as LCDocument\n",`
			`"from pydantic import BaseModel\n",`
			`"\n",`
			`"from docling.document_converter import DocumentConverter\n",`
			`"\n",`
			`"\n",`
			`"class DocumentMetadata(BaseModel):\n",`
			`" dl_doc_hash: str\n",`
			`" # source: str\n",`
			`"\n",`
			`"\n",`
			`"class DoclingPDFLoader(BaseLoader):\n",`
			`" class ParseType(str, Enum):\n",`
			`" MARKDOWN = \"markdown\"\n",`
			`" # JSON = \"json\"\n",`
			`"\n",`
			`" def __init__(self, file_path: str \| list[str], parse_type: ParseType) -> None:\n",`
			`" self._file_paths = file_path if isinstance(file_path, list) else [file_path]\n",`
			`" self._parse_type = parse_type\n",`
			`" self._converter = DocumentConverter()\n",`
			`"\n",`
			`" def lazy_load(self) -> Iterator[LCDocument]:\n",`
			`" for source in self._file_paths:\n",`
			`" dl_doc = self._converter.convert_single(source).output\n",`
			`" match self._parse_type:\n",`
			`" case self.ParseType.MARKDOWN:\n",`
			`" text = dl_doc.export_to_markdown()\n",`
			`" # case self.ParseType.JSON:\n",`
			`" # text = dl_doc.model_dump_json()\n",`
			`" case _:\n",`
			`" raise RuntimeError(\n",`
			`" f\"Unexpected parse type encountered: {self._parse_type}\"\n",`
			`" )\n",`
			`" lc_doc = LCDocument(\n",`
			`" page_content=text,\n",`
			`" metadata=DocumentMetadata(\n",`
			`" dl_doc_hash=dl_doc.file_info.document_hash,\n",`
			`" ).model_dump(),\n",`
			`" )\n",`
			`" yield lc_doc"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 5,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"FILE_PATH = \"https://arxiv.org/pdf/2206.01062\" # DocLayNet paper"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 6,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"data": {`
			`"application/vnd.jupyter.widget-view+json": {`
			`"model_id": "1b38d07d5fed4618a44ecf261e1e5c44",`
			`"version_major": 2,`
			`"version_minor": 0`
			`},`
			`"text/plain": [`
			`"Fetching 7 files: 0%\| \| 0/7 [00:00<?, ?it/s]"`
			`]`
			`},`
			`"metadata": {},`
			`"output_type": "display_data"`
			`}`
			`],`
			`"source": [`
			`"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",`
			`"\n",`
			`"loader = DoclingPDFLoader(\n",`
			`" file_path=FILE_PATH,\n",`
			`" parse_type=DoclingPDFLoader.ParseType.MARKDOWN,\n",`
			`")\n",`
			`"text_splitter = RecursiveCharacterTextSplitter(\n",`
			`" chunk_size=1000,\n",`
			`" chunk_overlap=200,\n",`
			`")"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"We now use the above-defined objects to get the document splits:"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 7,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"docs = loader.load()\n",`
			`"splits = text_splitter.split_documents(docs)"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"### Embeddings"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 8,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"from langchain_huggingface.embeddings import HuggingFaceEmbeddings\n",`
			`"\n",`
			`"HF_EMBED_MODEL_ID = \"BAAI/bge-small-en-v1.5\"\n",`
			`"embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"### Vector store"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 9,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"from tempfile import TemporaryDirectory\n",`
			`"\n",`
			`"from langchain_milvus import Milvus\n",`
			`"\n",`
			`"MILVUS_URI = os.environ.get(\n",`
			`" \"MILVUS_URL\", f\"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db\"\n",`
			`")\n",`
			`"\n",`
			`"vectorstore = Milvus.from_documents(\n",`
			`" splits,\n",`
			`" embeddings,\n",`
			`" connection_args={\"uri\": MILVUS_URI},\n",`
			`" drop_old=True,\n",`
			`")"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"### LLM"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 10,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			"The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n",
			`"Token is valid (permission: write).\n",`
			`"Your token has been saved to /Users/pva/.cache/huggingface/token\n",`
			`"Login successful\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"from langchain_huggingface import HuggingFaceEndpoint\n",`
			`"\n",`
			`"HF_API_KEY = os.environ.get(\"HF_API_KEY\")\n",`
			`"HF_LLM_MODEL_ID = \"mistralai/Mistral-7B-Instruct-v0.3\"\n",`
			`"\n",`
			`"llm = HuggingFaceEndpoint(\n",`
			`" repo_id=HF_LLM_MODEL_ID,\n",`
			`" huggingfacehub_api_token=HF_API_KEY,\n",`
			`")"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"## RAG"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 11,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"from typing import Iterable\n",`
			`"\n",`
			`"from langchain_core.documents import Document as LCDocument\n",`
			`"from langchain_core.output_parsers import StrOutputParser\n",`
			`"from langchain_core.prompts import PromptTemplate\n",`
			`"from langchain_core.runnables import RunnablePassthrough\n",`
			`"\n",`
			`"\n",`
			`"def format_docs(docs: Iterable[LCDocument]):\n",`
			`" return \"\\n\\n\".join(doc.page_content for doc in docs)\n",`
			`"\n",`
			`"\n",`
			`"retriever = vectorstore.as_retriever()\n",`
			`"\n",`
			`"prompt = PromptTemplate.from_template(\n",`
			`" \"Context information is below.\\n---------------------\\n{context}\\n---------------------\\nGiven the context information and not prior knowledge, answer the query.\\nQuery: {question}\\nAnswer:\\n\"\n",`
			`")\n",`
			`"\n",`
			`"rag_chain = (\n",`
			`" {\"context\": retriever \| format_docs, \"question\": RunnablePassthrough()}\n",`
			`" \| prompt\n",`
			`" \| llm\n",`
			`" \| StrOutputParser()\n",`
			`")"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 12,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"data": {`
			`"text/plain": [`
			`"'The human annotation of DocLayNet was performed on 80863 pages.\\n\\nExplanation:\\nThe information is found in the paragraph \"DocLayNet contains 80863 PDF pages\" in the context.'"`
			`]`
			`},`
			`"execution_count": 12,`
			`"metadata": {},`
			`"output_type": "execute_result"`
			`}`
			`],`
			`"source": [`
			`"rag_chain.invoke(\"How many pages were human annotated for DocLayNet?\")"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": []`
			`}`
			`],`
			`"metadata": {`
			`"kernelspec": {`
			`"display_name": ".venv",`
			`"language": "python",`
			`"name": "python3"`
			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
			`"version": "3.12.4"`
			`}`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 2`
			`}`