{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# RAG with LangChain 🦜🔗" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "# requirements for this example:\n", "%pip install -qq docling docling-core python-dotenv langchain-text-splitters langchain-huggingface langchain-milvus" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "\n", "from dotenv import load_dotenv\n", "\n", "load_dotenv()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Setup" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Loader and splitter" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Below we set up:\n", "- a `Loader` which will be used to create LangChain documents, and\n", "- a splitter, which will be used to split these documents" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from typing import Iterator\n", "\n", "from langchain_core.document_loaders import BaseLoader\n", "from langchain_core.documents import Document as LCDocument\n", "\n", "from docling.document_converter import DocumentConverter\n", "\n", "class DoclingPDFLoader(BaseLoader):\n", "\n", " def __init__(self, file_path: str | list[str]) -> None:\n", " self._file_paths = file_path if isinstance(file_path, list) else [file_path]\n", " self._converter = DocumentConverter()\n", "\n", " def lazy_load(self) -> Iterator[LCDocument]:\n", " for source in self._file_paths:\n", " dl_doc = self._converter.convert(source).document\n", " text = dl_doc.export_to_markdown()\n", " yield LCDocument(page_content=text)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, 
"outputs": [], "source": [ "FILE_PATH = \"https://raw.githubusercontent.com/DS4SD/docling/main/tests/data/2206.01062.pdf\" # DocLayNet paper" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", "\n", "loader = DoclingPDFLoader(file_path=FILE_PATH)\n", "text_splitter = RecursiveCharacterTextSplitter(\n", " chunk_size=1000,\n", " chunk_overlap=200,\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We now use the above-defined objects to get the document splits:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "docs = loader.load()\n", "splits = text_splitter.split_documents(docs)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Embeddings" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "from langchain_huggingface.embeddings import HuggingFaceEmbeddings\n", "\n", "HF_EMBED_MODEL_ID = \"BAAI/bge-small-en-v1.5\"\n", "embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Vector store" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "from tempfile import TemporaryDirectory\n", "\n", "from langchain_milvus import Milvus\n", "\n", "MILVUS_URI = os.environ.get(\n", " \"MILVUS_URI\", f\"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db\"\n", ")\n", "\n", "vectorstore = Milvus.from_documents(\n", " splits,\n", " embeddings,\n", " connection_args={\"uri\": MILVUS_URI},\n", " drop_old=True,\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### LLM" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The token has not been saved to the git credentials helper. 
Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n", "Token is valid (permission: write).\n", "Your token has been saved to /Users/pva/.cache/huggingface/token\n", "Login successful\n" ] } ], "source": [ "from langchain_huggingface import HuggingFaceEndpoint\n", "\n", "HF_API_KEY = os.environ.get(\"HF_API_KEY\")\n", "HF_LLM_MODEL_ID = \"mistralai/Mistral-7B-Instruct-v0.3\"\n", "\n", "llm = HuggingFaceEndpoint(\n", " repo_id=HF_LLM_MODEL_ID,\n", " huggingfacehub_api_token=HF_API_KEY,\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## RAG" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "from typing import Iterable\n", "\n", "from langchain_core.documents import Document as LCDocument\n", "from langchain_core.output_parsers import StrOutputParser\n", "from langchain_core.prompts import PromptTemplate\n", "from langchain_core.runnables import RunnablePassthrough\n", "\n", "\n", "def format_docs(docs: Iterable[LCDocument]):\n", " return \"\\n\\n\".join(doc.page_content for doc in docs)\n", "\n", "\n", "retriever = vectorstore.as_retriever()\n", "\n", "prompt = PromptTemplate.from_template(\n", " \"Context information is below.\\n---------------------\\n{context}\\n---------------------\\nGiven the context information and not prior knowledge, answer the query.\\nQuery: {question}\\nAnswer:\\n\"\n", ")\n", "\n", "rag_chain = (\n", " {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n", " | prompt\n", " | llm\n", " | StrOutputParser()\n", ")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'- 80,863 pages were human annotated for DocLayNet.'" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rag_chain.invoke(\"How many pages were human annotated for 
DocLayNet?\")" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 2 }