docling/docs/examples/rag_langchain.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RAG with LangChain 🦜🔗"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "# requirements for this example:\n",
    "%pip install -qq docling docling-core python-dotenv langchain-text-splitters langchain-huggingface langchain-milvus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "load_dotenv()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Loader and splitter"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Below we set up:\n",
    "- a `Loader` which will be used to create LangChain documents, and\n",
    "- a splitter, which will be used to split these documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import Iterator\n",
    "\n",
    "from langchain_core.document_loaders import BaseLoader\n",
    "from langchain_core.documents import Document as LCDocument\n",
    "\n",
    "from docling.document_converter import DocumentConverter\n",
    "\n",
    "class DoclingPDFLoader(BaseLoader):\n",
    "\n",
    "    def __init__(self, file_path: str | list[str]) -> None:\n",
    "        self._file_paths = file_path if isinstance(file_path, list) else [file_path]\n",
    "        self._converter = DocumentConverter()\n",
    "\n",
    "    def lazy_load(self) -> Iterator[LCDocument]:\n",
    "        for source in self._file_paths:\n",
    "            dl_doc = self._converter.convert(source).document\n",
    "            text = dl_doc.export_to_markdown()\n",
    "            yield LCDocument(page_content=text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "FILE_PATH = \"https://raw.githubusercontent.com/DS4SD/docling/main/tests/data/2206.01062.pdf\"  # DocLayNet paper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
    "\n",
    "loader = DoclingPDFLoader(file_path=FILE_PATH)\n",
    "text_splitter = RecursiveCharacterTextSplitter(\n",
    "    chunk_size=1000,\n",
    "    chunk_overlap=200,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We now used the above-defined objects to get the document splits:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "docs = loader.load()\n",
    "splits = text_splitter.split_documents(docs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_huggingface.embeddings import HuggingFaceEmbeddings\n",
    "\n",
    "HF_EMBED_MODEL_ID = \"BAAI/bge-small-en-v1.5\"\n",
    "embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Vector store"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tempfile import TemporaryDirectory\n",
    "\n",
    "from langchain_milvus import Milvus\n",
    "\n",
    "MILVUS_URI = os.environ.get(\n",
    "    \"MILVUS_URI\", f\"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db\"\n",
    ")\n",
    "\n",
    "vectorstore = Milvus.from_documents(\n",
    "    splits,\n",
    "    embeddings,\n",
    "    connection_args={\"uri\": MILVUS_URI},\n",
    "    drop_old=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### LLM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n",
      "Token is valid (permission: write).\n",
      "Your token has been saved to /Users/pva/.cache/huggingface/token\n",
      "Login successful\n"
     ]
    }
   ],
   "source": [
    "from langchain_huggingface import HuggingFaceEndpoint\n",
    "\n",
    "HF_API_KEY = os.environ.get(\"HF_API_KEY\")\n",
    "HF_LLM_MODEL_ID = \"mistralai/Mistral-7B-Instruct-v0.3\"\n",
    "\n",
    "llm = HuggingFaceEndpoint(\n",
    "    repo_id=HF_LLM_MODEL_ID,\n",
    "    huggingfacehub_api_token=HF_API_KEY,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## RAG"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import Iterable\n",
    "\n",
    "from langchain_core.documents import Document as LCDocument\n",
    "from langchain_core.output_parsers import StrOutputParser\n",
    "from langchain_core.prompts import PromptTemplate\n",
    "from langchain_core.runnables import RunnablePassthrough\n",
    "\n",
    "\n",
    "def format_docs(docs: Iterable[LCDocument]):\n",
    "    return \"\\n\\n\".join(doc.page_content for doc in docs)\n",
    "\n",
    "\n",
    "retriever = vectorstore.as_retriever()\n",
    "\n",
    "prompt = PromptTemplate.from_template(\n",
    "    \"Context information is below.\\n---------------------\\n{context}\\n---------------------\\nGiven the context information and not prior knowledge, answer the query.\\nQuery: {question}\\nAnswer:\\n\"\n",
    ")\n",
    "\n",
    "rag_chain = (\n",
    "    {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n",
    "    | prompt\n",
    "    | llm\n",
    "    | StrOutputParser()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'- 80,863 pages were human annotated for DocLayNet.'"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rag_chain.invoke(\"How many pages were human annotated for DocLayNet?\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}