haystack/tutorials/Tutorial7_RAG_Generator.ipynb

{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "name": "Tutorial7_RAG_Generator.ipynb",
   "provenance": [],
   "collapsed_sections": []
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  }
 },
 "cells": [
  {
   "cell_type": "code",
   "metadata": {
    "id": "iDyfhfyp7Sjh"
   },
   "source": [
    "!pip install git+https://github.com/deepset-ai/haystack.git\n",
    "!pip install urllib3==1.25.4"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "ICZanGLa7khF"
   },
   "source": [
    "from typing import List\n",
    "import requests\n",
    "import pandas as pd\n",
    "from haystack import Document\n",
    "from haystack.document_store.faiss import FAISSDocumentStore\n",
    "from haystack.generator.transformers import RAGenerator\n",
    "from haystack.retriever.dense import DensePassageRetriever"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "D3f-CQ4c7lEN"
   },
   "source": [
    "# Add documents from which you want generate answers\n",
    "# Download a csv containing some sample documents data\n",
    "# Here some sample documents data\n",
    "temp = requests.get(\"https://raw.githubusercontent.com/deepset-ai/haystack/master/tutorials/small_generator_dataset.csv\")\n",
    "open('small_generator_dataset.csv', 'wb').write(temp.content)\n",
    "\n",
    "# Get dataframe with columns \"title\", and \"text\"\n",
    "df = pd.read_csv(\"small_generator_dataset.csv\", sep=',')\n",
    "# Minimal cleaning\n",
    "df.fillna(value=\"\", inplace=True)\n",
    "\n",
    "print(df.head())\n",
    "\n",
    "# Create to haystack document format\n",
    "titles = list(df[\"title\"].values)\n",
    "texts = list(df[\"text\"].values)\n",
    "\n",
    "documents: List[Document] = []\n",
    "for title, text in zip(titles, texts):\n",
    "    documents.append(\n",
    "        Document(\n",
    "            text=text,\n",
    "            meta={\n",
    "                \"name\": title or \"\"\n",
    "            }\n",
    "        )\n",
    "    )"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "upRu3ebX7nr_"
   },
   "source": [
    "# Initialize FAISS document store to documents and corresponding index for embeddings\n",
    "# Set `return_embedding` to `True`, so generator doesn't have to perform re-embedding\n",
    "document_store = FAISSDocumentStore(\n",
    "    faiss_index_factory_str=\"Flat\",\n",
    "    return_embedding=True\n",
    ")\n",
    "\n",
    "# Initialize DPR Retriever to encode documents, encode question and query documents\n",
    "retriever = DensePassageRetriever(\n",
    "    document_store=document_store,\n",
    "    query_embedding_model=\"facebook/dpr-question_encoder-single-nq-base\",\n",
    "    passage_embedding_model=\"facebook/dpr-ctx_encoder-single-nq-base\",\n",
    "    use_gpu=False,\n",
    "    embed_title=True,\n",
    ")\n",
    "\n",
    "# Initialize RAG Generator\n",
    "generator = RAGenerator(\n",
    "    model_name_or_path=\"facebook/rag-token-nq\",\n",
    "    use_gpu=False,\n",
    "    top_k_answers=1,\n",
    "    max_length=200,\n",
    "    min_length=2,\n",
    "    embed_title=True,\n",
    "    num_beams=2,\n",
    ")"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "as8j7hkW7rOW"
   },
   "source": [
    "# Delete existing documents in documents store\n",
    "document_store.delete_all_documents()\n",
    "# Write documents to document store\n",
    "document_store.write_documents(documents)\n",
    "# Add documents embeddings to index\n",
    "document_store.update_embeddings(\n",
    "    retriever=retriever\n",
    ")"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "j8It45R872vb",
    "cellView": "form"
   },
   "source": [
    "#@title\n",
    "# Now ask your questions\n",
    "# We have some sample questions\n",
    "QUESTIONS = [\n",
    "    \"who got the first nobel prize in physics\",\n",
    "    \"when is the next deadpool movie being released\",\n",
    "    \"which mode is used for short wave broadcast service\",\n",
    "    \"who is the owner of reading football club\",\n",
    "    \"when is the next scandal episode coming out\",\n",
    "    \"when is the last time the philadelphia won the superbowl\",\n",
    "    \"what is the most current adobe flash player version\",\n",
    "    \"how many episodes are there in dragon ball z\",\n",
    "    \"what is the first step in the evolution of the eye\",\n",
    "    \"where is gall bladder situated in human body\",\n",
    "    \"what is the main mineral in lithium batteries\",\n",
    "    \"who is the president of usa right now\",\n",
    "    \"where do the greasers live in the outsiders\",\n",
    "    \"panda is a national animal of which country\",\n",
    "    \"what is the name of manchester united stadium\",\n",
    "]"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "xPUHRuTP742h"
   },
   "source": [
    "# Now generate answer for question\n",
    "for question in QUESTIONS:\n",
    "    # Retrieve related documents from retriever\n",
    "    retriever_results = retriever.retrieve(\n",
    "        query=question\n",
    "    )\n",
    "\n",
    "    # Now generate answer from question and retrieved documents\n",
    "    predicted_result = generator.predict(\n",
    "        question=question,\n",
    "        documents=retriever_results,\n",
    "        top_k=1\n",
    "    )\n",
    "\n",
    "    # Print you answer\n",
    "    answers = predicted_result[\"answers\"]\n",
    "    print(f'Generated answer is \\'{answers[0][\"answer\"]}\\' for the question = \\'{question}\\'')"
   ],
   "execution_count": null,
   "outputs": []
  }
 ]
}
[RAG] Integrate "Retrieval-Augmented Generation" with Haystack (#484) * Adding dummy generator implementation * Adding tutorial to try the model * Committing current non working code * Committing current update where we need to call generate function directly and need to convert embedding to tensor way * Addressing review comments. * Refactoring finder, and implementing rag_generator class. * Refined the implementation of RAGGenerator and now it is in clean shape * Renaming RAGGenerator to RAGenerator * Reverting change from finder.py and addressing review comments * Remove support for RagSequenceForGeneration * Utilizing embed_passage function from DensePassageRetriever * Adding sample test data to verify generator output * Updating testing script * Updating testing script * Fixing bug related to top_k * Updating latest farm dependency * Comment out farm dependency * Reverting changes from TransformersReader * Adding transformers dataset to compare transformers and haystack generator implementation * Using generator_encoder instead of question_encoder to generate context_input_ids * Adding workaround to install FARM dependency from master branch * Removing unnecessary changes * Fixing generator test * Removing transformers datasets * Fixing generator test * Some cleanup and updating TODO comments * Adding tutorial notebook * Updating tutorials with comments * Explicitly passing token model in RAG test * Addressing review comments * Fixing notebook * Refactoring tests to reduce memory footprint * Split generator tests in separate ci step and before running it reclaim memory by terminating containers * Moving tika dependent test to separate dir * Remove unwanted code * Brining reader under session scope * Farm is now session object hence restoring changes from default value * Updating assert for pdf converter * Dummy commit to trigger CI flow * REducing memory footprint required for generator tests * Fixing mypy issues * Marking test with tika and elasticsearch markers. Reverting changes in CI and pytest splits * reducing changes * Fixing CI * changing elastic search ci * Fixing test error * Disabling return of embedding * Marking generator test as well * Refactoring tutorials * Increasing ES memory to 750M * Trying another fix for ES CI * Reverting CI changes * Splitting tests in CI * Generator and non-generator markers split * Adding pytest.ini to add markers and enable strict-markers option * Reducing elastic search container memory * Simplifying generator test by using documents with embedding directly * Bump up farm to 0.5.0 2020-10-30 18:06:02 +01:00			`{`
DensePassageRetriever: Add Training, Refactor Inference to FARM modules (#527) * dpr training and inference code refactored with FARM modules * dpr test cases modified * docstring and default arguments updated * dpr training docstring updated * bugfix in dense retriever inference, DPR tutorials modified * Bump FARM to 0.5.0 * update README for DPR * dpr training and inference code refactored with FARM modules * dpr test cases modified * docstring and default arguments updated * dpr training docstring updated * bugfix in dense retriever inference, DPR tutorials modified * Bump FARM to 0.5.0 * update README for DPR * mypy errors fix * DPR instantiation bugfix * Fix DPR init in RAG Tutorial Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai> 2020-10-30 23:52:06 +05:30			`"nbformat": 4,`
			`"nbformat_minor": 0,`
			`"metadata": {`
			`"colab": {`
			`"name": "Tutorial7_RAG_Generator.ipynb",`
			`"provenance": [],`
			`"collapsed_sections": []`
[RAG] Integrate "Retrieval-Augmented Generation" with Haystack (#484) * Adding dummy generator implementation * Adding tutorial to try the model * Committing current non working code * Committing current update where we need to call generate function directly and need to convert embedding to tensor way * Addressing review comments. * Refactoring finder, and implementing rag_generator class. * Refined the implementation of RAGGenerator and now it is in clean shape * Renaming RAGGenerator to RAGenerator * Reverting change from finder.py and addressing review comments * Remove support for RagSequenceForGeneration * Utilizing embed_passage function from DensePassageRetriever * Adding sample test data to verify generator output * Updating testing script * Updating testing script * Fixing bug related to top_k * Updating latest farm dependency * Comment out farm dependency * Reverting changes from TransformersReader * Adding transformers dataset to compare transformers and haystack generator implementation * Using generator_encoder instead of question_encoder to generate context_input_ids * Adding workaround to install FARM dependency from master branch * Removing unnecessary changes * Fixing generator test * Removing transformers datasets * Fixing generator test * Some cleanup and updating TODO comments * Adding tutorial notebook * Updating tutorials with comments * Explicitly passing token model in RAG test * Addressing review comments * Fixing notebook * Refactoring tests to reduce memory footprint * Split generator tests in separate ci step and before running it reclaim memory by terminating containers * Moving tika dependent test to separate dir * Remove unwanted code * Brining reader under session scope * Farm is now session object hence restoring changes from default value * Updating assert for pdf converter * Dummy commit to trigger CI flow * REducing memory footprint required for generator tests * Fixing mypy issues * Marking test with tika and elasticsearch markers. Reverting changes in CI and pytest splits * reducing changes * Fixing CI * changing elastic search ci * Fixing test error * Disabling return of embedding * Marking generator test as well * Refactoring tutorials * Increasing ES memory to 750M * Trying another fix for ES CI * Reverting CI changes * Splitting tests in CI * Generator and non-generator markers split * Adding pytest.ini to add markers and enable strict-markers option * Reducing elastic search container memory * Simplifying generator test by using documents with embedding directly * Bump up farm to 0.5.0 2020-10-30 18:06:02 +01:00			`},`
DensePassageRetriever: Add Training, Refactor Inference to FARM modules (#527) * dpr training and inference code refactored with FARM modules * dpr test cases modified * docstring and default arguments updated * dpr training docstring updated * bugfix in dense retriever inference, DPR tutorials modified * Bump FARM to 0.5.0 * update README for DPR * dpr training and inference code refactored with FARM modules * dpr test cases modified * docstring and default arguments updated * dpr training docstring updated * bugfix in dense retriever inference, DPR tutorials modified * Bump FARM to 0.5.0 * update README for DPR * mypy errors fix * DPR instantiation bugfix * Fix DPR init in RAG Tutorial Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai> 2020-10-30 23:52:06 +05:30			`"kernelspec": {`
			`"name": "python3",`
			`"display_name": "Python 3"`
			`}`
			`},`
			`"cells": [`
			`{`
			`"cell_type": "code",`
			`"metadata": {`
			`"id": "iDyfhfyp7Sjh"`
			`},`
			`"source": [`
			`"!pip install git+https://github.com/deepset-ai/haystack.git\n",`
			`"!pip install urllib3==1.25.4"`
			`],`
			`"execution_count": null,`
			`"outputs": []`
			`},`
			`{`
			`"cell_type": "code",`
			`"metadata": {`
			`"id": "ICZanGLa7khF"`
			`},`
			`"source": [`
			`"from typing import List\n",`
			`"import requests\n",`
			`"import pandas as pd\n",`
			`"from haystack import Document\n",`
			`"from haystack.document_store.faiss import FAISSDocumentStore\n",`
			`"from haystack.generator.transformers import RAGenerator\n",`
			`"from haystack.retriever.dense import DensePassageRetriever"`
			`],`
			`"execution_count": null,`
			`"outputs": []`
			`},`
			`{`
			`"cell_type": "code",`
			`"metadata": {`
			`"id": "D3f-CQ4c7lEN"`
			`},`
			`"source": [`
			`"# Add documents from which you want generate answers\n",`
			`"# Download a csv containing some sample documents data\n",`
			`"# Here some sample documents data\n",`
			`"temp = requests.get(\"https://raw.githubusercontent.com/deepset-ai/haystack/master/tutorials/small_generator_dataset.csv\")\n",`
			`"open('small_generator_dataset.csv', 'wb').write(temp.content)\n",`
			`"\n",`
			`"# Get dataframe with columns \"title\", and \"text\"\n",`
			`"df = pd.read_csv(\"small_generator_dataset.csv\", sep=',')\n",`
			`"# Minimal cleaning\n",`
			`"df.fillna(value=\"\", inplace=True)\n",`
			`"\n",`
			`"print(df.head())\n",`
			`"\n",`
			`"# Create to haystack document format\n",`
			`"titles = list(df[\"title\"].values)\n",`
			`"texts = list(df[\"text\"].values)\n",`
			`"\n",`
			`"documents: List[Document] = []\n",`
			`"for title, text in zip(titles, texts):\n",`
			`" documents.append(\n",`
			`" Document(\n",`
			`" text=text,\n",`
			`" meta={\n",`
			`" \"name\": title or \"\"\n",`
			`" }\n",`
			`" )\n",`
			`" )"`
			`],`
			`"execution_count": null,`
			`"outputs": []`
			`},`
			`{`
			`"cell_type": "code",`
			`"metadata": {`
			`"id": "upRu3ebX7nr_"`
			`},`
			`"source": [`
			`"# Initialize FAISS document store to documents and corresponding index for embeddings\n",`
			"# Set `return_embedding` to `True`, so generator doesn't have to perform re-embedding\n",
			`"document_store = FAISSDocumentStore(\n",`
			`" faiss_index_factory_str=\"Flat\",\n",`
			`" return_embedding=True\n",`
			`")\n",`
			`"\n",`
			`"# Initialize DPR Retriever to encode documents, encode question and query documents\n",`
			`"retriever = DensePassageRetriever(\n",`
			`" document_store=document_store,\n",`
			`" query_embedding_model=\"facebook/dpr-question_encoder-single-nq-base\",\n",`
			`" passage_embedding_model=\"facebook/dpr-ctx_encoder-single-nq-base\",\n",`
			`" use_gpu=False,\n",`
			`" embed_title=True,\n",`
			`")\n",`
			`"\n",`
			`"# Initialize RAG Generator\n",`
			`"generator = RAGenerator(\n",`
			`" model_name_or_path=\"facebook/rag-token-nq\",\n",`
			`" use_gpu=False,\n",`
			`" top_k_answers=1,\n",`
			`" max_length=200,\n",`
			`" min_length=2,\n",`
			`" embed_title=True,\n",`
			`" num_beams=2,\n",`
			`")"`
			`],`
			`"execution_count": null,`
			`"outputs": []`
			`},`
			`{`
			`"cell_type": "code",`
			`"metadata": {`
			`"id": "as8j7hkW7rOW"`
			`},`
			`"source": [`
			`"# Delete existing documents in documents store\n",`
			`"document_store.delete_all_documents()\n",`
			`"# Write documents to document store\n",`
			`"document_store.write_documents(documents)\n",`
			`"# Add documents embeddings to index\n",`
			`"document_store.update_embeddings(\n",`
			`" retriever=retriever\n",`
			`")"`
			`],`
			`"execution_count": null,`
			`"outputs": []`
			`},`
			`{`
			`"cell_type": "code",`
			`"metadata": {`
			`"id": "j8It45R872vb",`
			`"cellView": "form"`
			`},`
			`"source": [`
			`"#@title\n",`
			`"# Now ask your questions\n",`
			`"# We have some sample questions\n",`
			`"QUESTIONS = [\n",`
			`" \"who got the first nobel prize in physics\",\n",`
			`" \"when is the next deadpool movie being released\",\n",`
			`" \"which mode is used for short wave broadcast service\",\n",`
			`" \"who is the owner of reading football club\",\n",`
			`" \"when is the next scandal episode coming out\",\n",`
			`" \"when is the last time the philadelphia won the superbowl\",\n",`
			`" \"what is the most current adobe flash player version\",\n",`
			`" \"how many episodes are there in dragon ball z\",\n",`
			`" \"what is the first step in the evolution of the eye\",\n",`
			`" \"where is gall bladder situated in human body\",\n",`
			`" \"what is the main mineral in lithium batteries\",\n",`
			`" \"who is the president of usa right now\",\n",`
			`" \"where do the greasers live in the outsiders\",\n",`
			`" \"panda is a national animal of which country\",\n",`
			`" \"what is the name of manchester united stadium\",\n",`
			`"]"`
			`],`
			`"execution_count": null,`
			`"outputs": []`
			`},`
			`{`
			`"cell_type": "code",`
			`"metadata": {`
			`"id": "xPUHRuTP742h"`
			`},`
			`"source": [`
			`"# Now generate answer for question\n",`
			`"for question in QUESTIONS:\n",`
			`" # Retrieve related documents from retriever\n",`
			`" retriever_results = retriever.retrieve(\n",`
			`" query=question\n",`
			`" )\n",`
			`"\n",`
			`" # Now generate answer from question and retrieved documents\n",`
			`" predicted_result = generator.predict(\n",`
			`" question=question,\n",`
			`" documents=retriever_results,\n",`
			`" top_k=1\n",`
			`" )\n",`
			`"\n",`
			`" # Print you answer\n",`
			`" answers = predicted_result[\"answers\"]\n",`
			`" print(f'Generated answer is \\'{answers[0][\"answer\"]}\\' for the question = \\'{question}\\'')"`
			`],`
			`"execution_count": null,`
			`"outputs": []`
			`}`
			`]`
			`}`