2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								{
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								 "cells": [
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 16:14:37 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2020-09-18 12:57:32 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "# Build Your First QA System\n",
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 16:14:37 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "<img style=\"float: right;\" src=\"https://upload.wikimedia.org/wikipedia/en/d/d8/Game_of_Thrones_title_card.jpg\">\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-09-18 12:57:32 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "EXECUTABLE VERSION: [*colab*](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 16:14:37 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "Question Answering can be used in a variety of use cases. A very common one: using it to navigate through complex knowledge bases or long documents (\"search setting\").\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "A \"knowledge base\" could for example be your website, an internal wiki or a collection of financial reports. \n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "In this tutorial we will work on a slightly different domain: \"Game of Thrones\". \n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "Let's see how we can use a bunch of Wikipedia articles to answer a variety of questions about the \n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "marvellous seven kingdoms...  \n",
							 
						 
					
						
							
								
									
										
										
										
											2020-09-18 12:57:32 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "\n"
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 16:14:37 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2019-11-28 12:07:04 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": null,
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 17:41:03 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
									
										
										
										
											2019-11-28 12:07:04 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "outputs": [],
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 17:41:03 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2020-08-19 14:52:50 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "# Install the latest release of Haystack in your own environment \n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "#! pip install farm-haystack\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-09-21 10:26:12 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "# Install the latest master of Haystack\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "!pip install git+https://github.com/deepset-ai/haystack.git"
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 17:41:03 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2020-07-07 14:59:01 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 2,
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
									
										
										
										
											2019-11-28 12:07:04 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "outputs": [],
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2019-11-27 13:56:03 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "from haystack import Finder\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-09-16 18:33:23 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "from haystack.preprocessor.cleaning import clean_wiki_text\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-01-23 15:18:41 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "from haystack.reader.farm import FARMReader\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "from haystack.reader.transformers import TransformersReader\n",
							 
						 
					
						
							
								
									
										
										
										
											2019-11-27 13:56:03 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "from haystack.utils import print_answers"
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 16:01:32 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "## Document Store\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-09-18 12:57:32 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "Haystack finds answers to queries within the documents stored in a `DocumentStore`. The current implementations of `DocumentStore` include `ElasticsearchDocumentStore`, `FAISSDocumentStore`,  `SQLDocumentStore`, and `InMemoryDocumentStore`.\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "**Here:** We recommend Elasticsearch as it comes preloaded with features like [full-text queries](https://www.elastic.co/guide/en/elasticsearch/reference/current/full-text-queries.html), [BM25 retrieval](https://www.elastic.co/elasticon/conf/2016/sf/improved-text-scoring-with-bm25), and [vector storage for text embeddings](https://www.elastic.co/guide/en/elasticsearch/reference/7.6/dense-vector.html).\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-30 19:00:41 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "**Alternatives:** If you are unable to set up an Elasticsearch instance, follow [Tutorial 3](https://github.com/deepset-ai/haystack/blob/master/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.ipynb) to use the SQL/InMemory document stores instead.\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-30 19:00:41 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "**Hint**: This tutorial creates a new document store instance with Wikipedia articles on Game of Thrones. However, you can configure Haystack to work with your existing document stores.\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "### Start an Elasticsearch server\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "You can start Elasticsearch on your local machine using Docker. If Docker is not readily available in your environment (e.g., in Colab notebooks), you can manually download and execute Elasticsearch from source."
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 16:01:32 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2020-07-07 14:59:01 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 5,
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
									
										
										
										
											2020-07-07 14:59:01 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "0ae423cd9c30d6f02ca2073e430d4e1f4403d88b8ec316411ec4c198bad3d416\r\n"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2020-04-30 19:00:41 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "# Recommended: Start Elasticsearch using Docker\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-07-07 14:59:01 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "#! docker run -d -p 9200:9200 -e \"discovery.type=single-node\" elasticsearch:7.6.2"
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "execution_count": null,
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2020-04-30 19:00:41 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "# In Colab / No Docker environments: Start Elasticsearch from source\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "! chown -R daemon:daemon elasticsearch-7.6.2\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "import os\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "from subprocess import Popen, PIPE, STDOUT\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "                   stdout=PIPE, stderr=STDOUT,\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "                   preexec_fn=lambda: os.setuid(1)  # as daemon\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "                  )\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "# wait until ES has started\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "! sleep 30"
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2020-07-07 14:59:01 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 7,
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "metadata": {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "pycharm": {
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								     "name": "#%%\n"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stderr",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
									
										
										
										
											2020-07-07 14:59:01 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "07/07/2020 10:41:47 - INFO - elasticsearch -   PUT http://localhost:9200/document [status:200 request:0.364s]\n"
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "# Connect to Elasticsearch\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-09-16 18:33:23 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "from haystack.document_store.elasticsearch import ElasticsearchDocumentStore\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "document_store = ElasticsearchDocumentStore(host=\"localhost\", username=\"\", password=\"\", index=\"document\")"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "pycharm": {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "#%% md\n"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2020-09-16 18:33:23 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "## Preprocessing of documents\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-09-16 18:33:23 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "Haystack provides a customizable pipeline for:\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    " - converting files into texts\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    " - cleaning texts\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    " - splitting texts\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    " - writing them to a Document Store\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-09-18 12:57:32 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "In this tutorial, we download Wikipedia articles about Game of Thrones, apply a basic cleaning function, and index them in Elasticsearch."
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2020-07-07 14:59:01 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 8,
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "metadata": {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "pycharm": {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "#%%\n"
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   },
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 17:41:03 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stderr",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
									
										
										
										
											2020-07-07 14:59:01 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "07/07/2020 10:41:48 - INFO - haystack.indexing.utils -   Found data stored in `data/article_txt_got`. Delete this first if you really want to fetch new data.\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "07/07/2020 10:41:48 - INFO - elasticsearch -   POST http://localhost:9200/_bulk [status:200 request:0.461s]\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "07/07/2020 10:41:49 - INFO - elasticsearch -   POST http://localhost:9200/_bulk [status:200 request:0.259s]\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "07/07/2020 10:41:49 - INFO - elasticsearch -   POST http://localhost:9200/_bulk [status:200 request:0.205s]\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "07/07/2020 10:41:49 - INFO - elasticsearch -   POST http://localhost:9200/_bulk [status:200 request:0.158s]\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "07/07/2020 10:41:49 - INFO - elasticsearch -   POST http://localhost:9200/_bulk [status:200 request:0.126s]\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "07/07/2020 10:41:49 - INFO - elasticsearch -   POST http://localhost:9200/_bulk [status:200 request:0.095s]\n"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "[{'name': '384_Maelor_Targaryen.txt', 'text': '#REDIRECT The Princess and the Queen'}, {'name': '314_Pypar.txt', 'text': \"#REDIRECT List of Game of Thrones characters#Night's Watch\"}, {'name': '73_A_Man_Without_Honor.txt', 'text': '\"\\'\\'\\'A Man Without Honor\\'\\'\\'\" is the seventh episode of the second season of HBO\\'s medieval fantasy television series \\'\\'Game of Thrones\\'\\'.\\nThe episode is written by series co-creators David Benioff and D. B. Weiss and directed, for the second time in this season, by David Nutter. It premiered on May 13, 2012.\\nThe name of the episode comes from Catelyn Stark\\'s assessment of Ser Jaime Lannister: \"You are a man without honor,\" after he kills a member of his own family to attempt escape.'}]\n"
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 17:41:03 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2020-09-18 12:57:32 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "# Let's first fetch some documents that we want to query\n",
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 16:01:32 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "# Here: 517 Wikipedia articles for Game of Thrones\n",
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "doc_dir = \"data/article_txt_got\"\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "s3_url = \"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip\"\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "fetch_archive_from_http(url=s3_url, output_dir=doc_dir)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-06-08 11:07:19 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "# Convert files to dicts\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-01-23 15:18:41 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)\n",
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 16:01:32 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "# It must take a str as input, and return a str.\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-06-08 11:07:19 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-07-07 14:59:01 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "# We now have a list of dictionaries that we can write to our document store.\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "# If your texts come from a different source (e.g. a DB), you can of course skip convert_files_to_dicts() and create the dictionaries yourself.\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-09-18 12:57:32 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "# The default format here is:\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "# {\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "#    'text': \"<DOCUMENT_TEXT_HERE>\",\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "#    'meta': {'name': \"<DOCUMENT_NAME_HERE>\", ...}\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "#}\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-07-07 14:59:01 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "# (Optionally: you can also add more key-value-pairs here, that will be indexed as fields in Elasticsearch and\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "# can be accessed later for filtering or shown in the responses of the Finder)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "# Let's have a look at the first 3 entries:\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "print(dicts[:3])\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-06-08 11:07:19 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "# Now, let's write the dicts containing documents to our DB.\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "document_store.write_documents(dicts)"
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 16:01:32 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "## Initialize Retriever, Reader, & Finder\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "### Retriever\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "Retrievers help narrow down the scope for the Reader to smaller units of text where a given question could be answered.\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "They use simple but fast algorithms.\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-30 19:00:41 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "**Here:** We use Elasticsearch's default BM25 algorithm\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-30 19:00:41 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "**Alternatives:**\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-30 19:00:41 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "- Customize the `ElasticsearchRetriever` with custom queries (e.g. boosting) and filters\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-07-07 14:59:01 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "- Use `TfidfRetriever` in combination with a SQL or InMemory Document store for simple prototyping and debugging\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "- Use `EmbeddingRetriever` to find candidate documents based on the similarity of embeddings (e.g. created via Sentence-BERT)\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-07-07 14:59:01 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "- Use `DensePassageRetriever` to use different embedding models for passage and query (see Tutorial 6)"
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "execution_count": 3,
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2020-07-07 14:59:01 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "from haystack.retriever.sparse import ElasticsearchRetriever\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "retriever = ElasticsearchRetriever(document_store=document_store)"
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 16:01:32 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 17:41:03 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 4,
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "metadata": {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "pycharm": {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "is_executing": false,
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "#%%\n"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   },
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "outputs": [],
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "# Alternative: An in-memory TfidfRetriever based on Pandas dataframes for building quick prototypes with a SQLite document store.\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-07-07 14:59:01 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "# from haystack.retriever.sparse import TfidfRetriever\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "# retriever = TfidfRetriever(document_store=document_store)"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "### Reader\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "A Reader scans the texts returned by retrievers in detail and extracts the k best answers. Readers are based\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "on powerful, but slower deep learning models.\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "Haystack currently supports Readers based on the frameworks FARM and Transformers.\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "With both you can either load a local model or one from Hugging Face's model hub (https://huggingface.co/models).\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-30 19:00:41 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "**Here:** a medium-sized RoBERTa QA model using a Reader based on FARM (https://huggingface.co/deepset/roberta-base-squad2)\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-30 19:00:41 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "**Alternatives (Reader):** TransformersReader (leveraging the `pipeline` of the Transformers package)\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-30 19:00:41 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "**Alternatives (Models):** e.g. \"distilbert-base-uncased-distilled-squad\" (fast) or \"deepset/bert-large-uncased-whole-word-masking-squad2\" (good accuracy)\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-30 19:00:41 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "**Hint:** You can adjust the model to return \"no answer possible\" with the `no_ans_boost` parameter. Higher values mean the model prefers \"no answer possible\".\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "#### FARMReader"
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 8,
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "metadata": {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "pycharm": {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "is_executing": false
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   },
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 17:41:03 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stderr",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "04/28/2020 12:29:45 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "04/28/2020 12:29:45 - INFO - farm.infer -   Could not find `deepset/roberta-base-squad2` locally. Try to download from model hub ...\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "04/28/2020 12:29:49 - WARNING - farm.modeling.language_model -   Could not automatically detect from language model name what language it is. \n",
							 
						 
					
						
							
								
									
										
										
										
											2020-03-17 19:58:53 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "\t We guess it's an *ENGLISH* model ... \n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "\t If not: Init the language model by supplying the 'language' param.\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "04/28/2020 12:29:54 - WARNING - farm.modeling.prediction_head -   Some unused parameters are passed to the QuestionAnsweringHead. Might not be a problem. Params: {\"loss_ignore_index\": -1}\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "04/28/2020 12:29:58 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None\n"
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 17:41:03 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "# Load a  local model or any of the QA models on\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "# Hugging Face's model hub (https://huggingface.co/models)\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-01-13 18:56:22 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "reader = FARMReader(model_name_or_path=\"deepset/roberta-base-squad2\", use_gpu=False)"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "#### TransformersReader"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "execution_count": null,
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "# Alternative:\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-01-23 15:18:41 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "# reader = TransformersReader(model=\"distilbert-base-uncased-distilled-squad\", tokenizer=\"distilbert-base-uncased\", use_gpu=-1)"
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "### Finder\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "The Finder sticks together reader and retriever in a pipeline to answer our actual questions. "
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 5,
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "metadata": {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "pycharm": {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "is_executing": false
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "finder = Finder(reader, retriever)"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 16:01:32 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2020-03-17 19:58:53 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "## Voilà! Ask a question!"
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 16:01:32 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 6,
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "metadata": {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "pycharm": {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "is_executing": false
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   },
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 17:41:03 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stderr",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "04/28/2020 12:27:53 - INFO - elasticsearch -   GET http://localhost:9200/document/_search [status:200 request:0.113s]\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "04/28/2020 12:27:53 - INFO - haystack.retriever.elasticsearch -   Got 10 candidates from retriever\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "04/28/2020 12:27:53 - INFO - haystack.finder -   Reader is looking for detailed answer in 362347 chars ...\n"
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 17:41:03 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 16:14:37 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "# You can configure how many candidates the reader and retriever shall return\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "# The higher top_k_retriever, the better (but also the slower) your answers. \n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "prediction = finder.get_answers(question=\"Who is the father of Arya Stark?\", top_k_retriever=10, top_k_reader=5)"
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": null,
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 16:01:32 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "outputs": [],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "# prediction = finder.get_answers(question=\"Who created the Dothraki vocabulary?\", top_k_reader=5)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "# prediction = finder.get_answers(question=\"Who is the sister of Sansa?\", top_k_reader=5)"
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 7,
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "metadata": {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "pycharm": {
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 16:01:32 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								     "is_executing": false,
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								     "name": "#%%\n"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   },
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 17:41:03 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "[   {   'answer': 'Eddard',\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-03-17 19:58:53 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "        'context': 's Nymeria after a legendary warrior queen. She travels '\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "                   \"with her father, Eddard, to King's Landing when he is made \"\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "                   'Hand of the King. Before she leaves,'},\n",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "    {   'answer': 'Ned',\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "        'context': 'girl disguised as a boy all along and is surprised to '\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "                   \"learn she is Arya, Ned Stark's daughter. After the \"\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "                   'Goldcloaks get help from Ser Amory Lorch and '},\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "    {   'answer': 'Ned',\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "        'context': 'in the television series.\\n'\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "                   '\\n'\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "                   '\\n'\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "                   '====Season 1====\\n'\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "                   'Arya accompanies her father Ned and her sister Sansa to '\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "                   \"King's Landing. Before their departure, Arya's ha\"},\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "    {   'answer': 'Balon Greyjoy',\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "        'context': 'He sends Theon to the Iron Islands hoping to broker an '\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "                   \"alliance with Balon Greyjoy, Theon's father. In exchange \"\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "                   'for Greyjoy support, Robb as the King '},\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "    {   'answer': 'Brynden Tully',\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "        'context': 'o the weather. Sandor decides to instead take her to her '\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "                   'great-uncle Brynden Tully. On their way to Riverrun, they '\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "                   \"encounter two men on Arya's death l\"}]\n"
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 17:41:03 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2019-11-25 16:01:32 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "print_answers(prediction, details=\"minimal\")"
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								 ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								 "metadata": {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  "kernelspec": {
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "display_name": "Python 3",
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "language": "python",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "name": "python3"
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  "language_info": {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "codemirror_mode": {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "name": "ipython",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "version": 3
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "file_extension": ".py",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "mimetype": "text/x-python",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "name": "python",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "nbconvert_exporter": "python",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "pygments_lexer": "ipython3",
							 
						 
					
						
							
								
									
										
										
										
											2020-04-29 14:01:05 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "version": "3.7.6"
							 
						 
					
						
							
								
									
										
										
										
											2019-11-14 11:42:51 +01:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								  }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								 },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								 "nbformat": 4,
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								 "nbformat_minor": 2
							 
						 
					
						
							
								
									
										
										
										
											2020-06-30 19:05:45 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								}