mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-07-23 00:42:28 +00:00

* Adding dummy generator implementation * Adding tutorial to try the model * Committing current non working code * Committing current update where we need to call generate function directly and need to convert embedding to tensor way * Addressing review comments. * Refactoring finder, and implementing rag_generator class. * Refined the implementation of RAGGenerator and now it is in clean shape * Renaming RAGGenerator to RAGenerator * Reverting change from finder.py and addressing review comments * Remove support for RagSequenceForGeneration * Utilizing embed_passage function from DensePassageRetriever * Adding sample test data to verify generator output * Updating testing script * Updating testing script * Fixing bug related to top_k * Updating latest farm dependency * Comment out farm dependency * Reverting changes from TransformersReader * Adding transformers dataset to compare transformers and haystack generator implementation * Using generator_encoder instead of question_encoder to generate context_input_ids * Adding workaround to install FARM dependency from master branch * Removing unnecessary changes * Fixing generator test * Removing transformers datasets * Fixing generator test * Some cleanup and updating TODO comments * Adding tutorial notebook * Updating tutorials with comments * Explicitly passing token model in RAG test * Addressing review comments * Fixing notebook * Refactoring tests to reduce memory footprint * Split generator tests in separate ci step and before running it reclaim memory by terminating containers * Moving tika dependent test to separate dir * Remove unwanted code * Brining reader under session scope * Farm is now session object hence restoring changes from default value * Updating assert for pdf converter * Dummy commit to trigger CI flow * REducing memory footprint required for generator tests * Fixing mypy issues * Marking test with tika and elasticsearch markers. 
Reverting changes in CI and pytest splits * reducing changes * Fixing CI * changing elastic search ci * Fixing test error * Disabling return of embedding * Marking generator test as well * Refactoring tutorials * Increasing ES memory to 750M * Trying another fix for ES CI * Reverting CI changes * Splitting tests in CI * Generator and non-generator markers split * Adding pytest.ini to add markers and enable strict-markers option * Reducing elastic search container memory * Simplifying generator test by using documents with embedding directly * Bump up farm to 0.5.0
196 lines
6.6 KiB
Plaintext
196 lines
6.6 KiB
Plaintext
{
|
|
"nbformat": 4,
|
|
"nbformat_minor": 0,
|
|
"metadata": {
|
|
"colab": {
|
|
"name": "Tutorial7_RAG_Generator.ipynb",
|
|
"provenance": [],
|
|
"collapsed_sections": []
|
|
},
|
|
"kernelspec": {
|
|
"name": "python3",
|
|
"display_name": "Python 3"
|
|
}
|
|
},
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "iDyfhfyp7Sjh"
|
|
},
|
|
"source": [
|
|
"!pip install git+https://github.com/deepset-ai/haystack.git\n",
|
|
"!pip install urllib3==1.25.4"
|
|
],
|
|
"execution_count": null,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "ICZanGLa7khF"
|
|
},
|
|
"source": [
|
|
"from typing import List\n",
|
|
"import requests\n",
|
|
"import pandas as pd\n",
|
|
"from haystack import Document\n",
|
|
"from haystack.document_store.faiss import FAISSDocumentStore\n",
|
|
"from haystack.generator.transformers import RAGenerator\n",
|
|
"from haystack.retriever.dense import DensePassageRetriever"
|
|
],
|
|
"execution_count": null,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "D3f-CQ4c7lEN"
|
|
},
|
|
"source": [
|
|
"# Add the documents from which you want to generate answers\n",
|
|
"# Download a CSV file containing some sample document data\n",
|
|
"# Here is some sample document data\n",
|
|
"temp = requests.get(\"https://raw.githubusercontent.com/deepset-ai/haystack/master/tutorials/small_generator_dataset.csv\")\n",
|
|
"open('small_generator_dataset.csv', 'wb').write(temp.content)\n",
|
|
"\n",
|
|
"# Get dataframe with columns \"title\", and \"text\"\n",
|
|
"df = pd.read_csv(\"small_generator_dataset.csv\", sep=',')\n",
|
|
"# Minimal cleaning\n",
|
|
"df.fillna(value=\"\", inplace=True)\n",
|
|
"\n",
|
|
"print(df.head())\n",
|
|
"\n",
|
|
"# Convert to Haystack document format\n",
|
|
"titles = list(df[\"title\"].values)\n",
|
|
"texts = list(df[\"text\"].values)\n",
|
|
"\n",
|
|
"documents: List[Document] = []\n",
|
|
"for title, text in zip(titles, texts):\n",
|
|
" documents.append(\n",
|
|
" Document(\n",
|
|
" text=text,\n",
|
|
" meta={\n",
|
|
" \"name\": title or \"\"\n",
|
|
" }\n",
|
|
" )\n",
|
|
" )"
|
|
],
|
|
"execution_count": null,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "upRu3ebX7nr_"
|
|
},
|
|
"source": [
|
|
"# Initialize FAISS document store to hold the documents and the corresponding index for embeddings\n",
|
|
"# Set `return_embedding` to `True`, so generator doesn't have to perform re-embedding\n",
|
|
"document_store = FAISSDocumentStore(\n",
|
|
" faiss_index_factory_str=\"Flat\",\n",
|
|
" return_embedding=True\n",
|
|
")\n",
|
|
"\n",
|
|
"# Initialize DPR Retriever to encode documents, encode question and query documents\n",
|
|
"retriever = DensePassageRetriever(\n",
|
|
" document_store=document_store,\n",
|
|
" query_embedding_model=\"facebook/dpr-question_encoder-single-nq-base\",\n",
|
|
" passage_embedding_model=\"facebook/dpr-ctx_encoder-single-nq-base\",\n",
|
|
" use_gpu=False,\n",
|
|
" embed_title=True,\n",
|
|
" remove_sep_tok_from_untitled_passages=True,\n",
|
|
")\n",
|
|
"\n",
|
|
"# Initialize RAG Generator\n",
|
|
"generator = RAGenerator(\n",
|
|
" model_name_or_path=\"facebook/rag-token-nq\",\n",
|
|
" use_gpu=False,\n",
|
|
" top_k_answers=1,\n",
|
|
" max_length=200,\n",
|
|
" min_length=2,\n",
|
|
" embed_title=True,\n",
|
|
" num_beams=2,\n",
|
|
")"
|
|
],
|
|
"execution_count": null,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "as8j7hkW7rOW"
|
|
},
|
|
"source": [
|
|
"# Delete existing documents in the document store\n",
|
|
"document_store.delete_all_documents()\n",
|
|
"# Write documents to document store\n",
|
|
"document_store.write_documents(documents)\n",
|
|
"# Add document embeddings to the index\n",
|
|
"document_store.update_embeddings(\n",
|
|
" retriever=retriever\n",
|
|
")"
|
|
],
|
|
"execution_count": null,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "j8It45R872vb",
|
|
"cellView": "form"
|
|
},
|
|
"source": [
|
|
"#@title\n",
|
|
"# Now ask your questions\n",
|
|
"# We have some sample questions\n",
|
|
"QUESTIONS = [\n",
|
|
" \"who got the first nobel prize in physics\",\n",
|
|
" \"when is the next deadpool movie being released\",\n",
|
|
" \"which mode is used for short wave broadcast service\",\n",
|
|
" \"who is the owner of reading football club\",\n",
|
|
" \"when is the next scandal episode coming out\",\n",
|
|
" \"when is the last time the philadelphia won the superbowl\",\n",
|
|
" \"what is the most current adobe flash player version\",\n",
|
|
" \"how many episodes are there in dragon ball z\",\n",
|
|
" \"what is the first step in the evolution of the eye\",\n",
|
|
" \"where is gall bladder situated in human body\",\n",
|
|
" \"what is the main mineral in lithium batteries\",\n",
|
|
" \"who is the president of usa right now\",\n",
|
|
" \"where do the greasers live in the outsiders\",\n",
|
|
" \"panda is a national animal of which country\",\n",
|
|
" \"what is the name of manchester united stadium\",\n",
|
|
"]"
|
|
],
|
|
"execution_count": null,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "xPUHRuTP742h"
|
|
},
|
|
"source": [
|
|
"# Now generate an answer for each question\n",
|
|
"for question in QUESTIONS:\n",
|
|
" # Retrieve related documents from retriever\n",
|
|
" retriever_results = retriever.retrieve(\n",
|
|
" query=question\n",
|
|
" )\n",
|
|
"\n",
|
|
" # Now generate answer from question and retrieved documents\n",
|
|
" predicted_result = generator.predict(\n",
|
|
" question=question,\n",
|
|
" documents=retriever_results,\n",
|
|
" top_k=1\n",
|
|
" )\n",
|
|
"\n",
|
|
" # Print your answer\n",
|
|
" answers = predicted_result[\"answers\"]\n",
|
|
" print(f'Generated answer is \\'{answers[0][\"answer\"]}\\' for the question = \\'{question}\\'')"
|
|
],
|
|
"execution_count": null,
|
|
"outputs": []
|
|
}
|
|
]
|
|
} |