| 
									
										
										
										
											2020-04-29 14:01:05 +02:00
										 |  |  | # ## Task: Question Answering for Game of Thrones | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Question Answering can be used in a variety of use cases. A very common one:  Using it to navigate through complex | 
					
						
							|  |  |  | # knowledge bases or long documents ("search setting"). | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # A "knowledge base" could for example be your website, an internal wiki or a collection of financial reports. | 
					
						
							|  |  |  | # In this tutorial we will work on a slightly different domain: "Game of Thrones". | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Let's see how we can use a bunch of Wikipedia articles to answer a variety of questions about the | 
					
						
							|  |  |  | # marvellous seven kingdoms... | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import logging | 
					
						
							|  |  |  | import subprocess | 
					
						
							|  |  |  | import time | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-01-23 15:18:41 +01:00
										 |  |  | from haystack import Finder | 
					
						
							| 
									
										
										
										
											2020-09-16 18:33:23 +02:00
										 |  |  | from haystack.document_store.elasticsearch import ElasticsearchDocumentStore | 
					
						
							|  |  |  | from haystack.preprocessor.cleaning import clean_wiki_text | 
					
						
							|  |  |  | from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http | 
					
						
							| 
									
										
										
										
											2020-01-13 18:56:22 +01:00
										 |  |  | from haystack.reader.farm import FARMReader | 
					
						
							|  |  |  | from haystack.reader.transformers import TransformersReader | 
					
						
							|  |  |  | from haystack.utils import print_answers | 
					
						
							| 
									
										
										
										
											2020-06-30 19:05:45 +02:00
										 |  |  | from haystack.retriever.sparse import ElasticsearchRetriever | 
					
						
							| 
									
										
										
										
											2020-04-29 14:01:05 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-06-24 15:14:09 +02:00
# Module-level logger named after this module, so its output can be
# filtered/configured independently of other haystack loggers.
logger = logging.getLogger(__name__)

# Set to False to connect to an Elasticsearch instance you manage yourself
# (the Docker launch and the document indexing below are then skipped).
LAUNCH_ELASTICSEARCH = True
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # ## Document Store | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Haystack finds answers to queries within the documents stored in a `DocumentStore`. The current implementations of | 
					
						
							| 
									
										
										
										
											2020-09-18 12:57:32 +02:00
										 |  |  | # `DocumentStore` include `ElasticsearchDocumentStore`, `FAISSDocumentStore`, `SQLDocumentStore`, and `InMemoryDocumentStore`. | 
					
						
							| 
									
										
										
										
											2020-04-29 14:01:05 +02:00
										 |  |  | # | 
					
						
# **Here:** We recommend Elasticsearch as it comes preloaded with features like full-text queries, BM25 retrieval,
					
						
							|  |  |  | # and vector storage for text embeddings. | 
					
						
							|  |  |  | # **Alternatives:** If you are unable to setup an Elasticsearch instance, then follow the Tutorial 3 | 
					
						
							|  |  |  | # for using SQL/InMemory document stores. | 
					
						
							|  |  |  | # **Hint**: | 
					
						
							|  |  |  | # This tutorial creates a new document store instance with Wikipedia articles on Game of Thrones. However, you can | 
					
						
							|  |  |  | # configure Haystack to work with your existing document stores. | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Start an Elasticsearch server | 
					
						
							|  |  |  | # You can start Elasticsearch on your local machine instance using Docker. If Docker is not readily available in | 
					
						
							|  |  |  | # your environment (eg., in Colab notebooks), then you can manually download and execute Elasticsearch from source. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | if LAUNCH_ELASTICSEARCH: | 
					
						
							|  |  |  |     logging.info("Starting Elasticsearch ...") | 
					
						
							|  |  |  |     status = subprocess.run( | 
					
						
							|  |  |  |         ['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2'], shell=True | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     if status.returncode: | 
					
						
							|  |  |  |         raise Exception("Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance" | 
					
						
							|  |  |  |                         "then set LAUNCH_ELASTICSEARCH in the script to False.") | 
					
						
							|  |  |  |     time.sleep(15) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # Connect to Elasticsearch | 
					
						
							|  |  |  | document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document") | 
					
						
							| 
									
										
										
										
											2020-01-13 18:56:22 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-09-16 18:33:23 +02:00
										 |  |  | # ## Preprocessing of documents | 
					
						
							| 
									
										
										
										
											2020-04-29 14:01:05 +02:00
										 |  |  | # | 
					
						
							| 
									
										
										
										
											2020-09-16 18:33:23 +02:00
										 |  |  | # Haystack provides a customizable pipeline for: | 
					
						
							|  |  |  | # - converting files into texts | 
					
						
							|  |  |  | # - cleaning texts | 
					
						
							|  |  |  | # - splitting texts | 
					
						
							|  |  |  | # - writing them to a Document Store | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-09-18 12:57:32 +02:00
										 |  |  | # In this tutorial, we download Wikipedia articles about Game of Thrones, apply a basic cleaning function, and add | 
					
						
							| 
									
										
										
										
											2020-04-29 14:01:05 +02:00
										 |  |  | # them in Elasticsearch. | 
					
						
							| 
									
										
										
										
											2020-01-13 18:56:22 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-09-18 12:57:32 +02:00
										 |  |  | # Let's first fetch some documents that we want to query | 
					
						
							| 
									
										
										
										
											2020-01-13 18:56:22 +01:00
										 |  |  | # Here: 517 Wikipedia articles for Game of Thrones | 
					
						
# Local directory the Game of Thrones articles are unpacked into.
doc_dir = "data/article_txt_got"
# Public S3 archive containing the 517 Wikipedia articles as text files.
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
# Download and extract the archive into doc_dir.
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-06-08 11:07:19 +02:00
# convert files to dicts containing documents that can be indexed to our datastore
# clean_wiki_text presumably strips wiki-specific markup (see
# haystack.preprocessor.cleaning); split_paragraphs=True makes each paragraph
# its own document, i.e. a smaller retrieval unit — TODO confirm against the
# haystack docs.
dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
					
						
							| 
									
										
										
										
											2020-01-23 15:18:41 +01:00
										 |  |  | # You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers) | 
					
						
							| 
									
										
										
										
											2020-01-13 18:56:22 +01:00
										 |  |  | # It must take a str as input, and return a str. | 
					
						
							| 
									
										
										
										
											2020-06-08 11:07:19 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | # Now, let's write the docs to our DB. | 
					
						
							| 
									
										
										
										
											2020-06-24 15:14:09 +02:00
										 |  |  | if LAUNCH_ELASTICSEARCH: | 
					
						
							|  |  |  |     document_store.write_documents(dicts) | 
					
						
							|  |  |  | else: | 
					
						
							|  |  |  |     logger.warning("Since we already have a running ES instance we should not index the same documents again. \n" | 
					
						
							|  |  |  |                    "If you still want to do this call: document_store.write_documents(dicts) manually ") | 
					
						
							| 
									
										
										
										
											2020-06-08 11:07:19 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-04-29 14:01:05 +02:00
# ## Initialize Retriever, Reader & Finder
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # ### Retriever | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Retrievers help narrowing down the scope for the Reader to smaller units of text where a given question | 
					
						
							|  |  |  | # could be answered. | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # They use some simple but fast algorithm. | 
					
						
							|  |  |  | # **Here:** We use Elasticsearch's default BM25 algorithm | 
					
						
							|  |  |  | # **Alternatives:** | 
					
						
# - Customize the `ElasticsearchRetriever` with custom queries (e.g. boosting) and filters
					
						
							|  |  |  | # - Use `EmbeddingRetriever` to find candidate documents based on the similarity of | 
					
						
							|  |  |  | #   embeddings (e.g. created via Sentence-BERT) | 
					
						
							|  |  |  | # - Use `TfidfRetriever` in combination with a SQL or InMemory Document store for simple prototyping and debugging | 
					
						
							|  |  |  | 
 | 
					
						
# Sparse retriever using Elasticsearch's default BM25 ranking over the
# "document" index created above.
retriever = ElasticsearchRetriever(document_store=document_store)
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # Alternative: An in-memory TfidfRetriever based on Pandas dataframes for building quick-prototypes | 
					
						
							|  |  |  | # with SQLite document store. | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # from haystack.retriever.tfidf import TfidfRetriever | 
					
						
							|  |  |  | # retriever = TfidfRetriever(document_store=document_store) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # ### Reader | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # A Reader scans the texts returned by retrievers in detail and extracts the k best answers. They are based | 
					
						
							|  |  |  | # on powerful, but slower deep learning models. | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # Haystack currently supports Readers based on the frameworks FARM and Transformers. | 
					
						
							|  |  |  | # With both you can either load a local model or one from Hugging Face's model hub (https://huggingface.co/models). | 
					
						
							|  |  |  | # **Here:** a medium sized RoBERTa QA model using a Reader based on | 
					
						
							|  |  |  | #           FARM (https://huggingface.co/deepset/roberta-base-squad2) | 
					
						
							|  |  |  | # **Alternatives (Reader):** TransformersReader (leveraging the `pipeline` of the Transformers package) | 
					
						
							|  |  |  | # **Alternatives (Models):** e.g. "distilbert-base-uncased-distilled-squad" (fast) or | 
					
						
							|  |  |  | #                            "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy) | 
					
						
							|  |  |  | # **Hint:** You can adjust the model to return "no answer possible" with the no_ans_boost. Higher values mean | 
					
						
							|  |  |  | #           the model prefers "no answer possible" | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | # #### FARMReader | 
					
						
							| 
									
										
										
										
											2020-01-13 18:56:22 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-04-29 14:01:05 +02:00
# Load a local model or any of the QA models on
# Hugging Face's model hub (https://huggingface.co/models)
# use_gpu=False keeps the tutorial runnable on CPU-only machines.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
					
						
							| 
									
										
										
										
											2020-01-13 18:56:22 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-04-29 14:01:05 +02:00
										 |  |  | # #### TransformersReader | 
					
						
							| 
									
										
										
										
											2020-01-13 18:56:22 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-04-29 14:01:05 +02:00
										 |  |  | # Alternative: | 
					
						
							|  |  |  | # reader = TransformersReader( | 
					
						
							| 
									
										
										
										
											2020-10-21 17:15:35 +02:00
										 |  |  | #    model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1) | 
					
						
							| 
									
										
										
										
											2020-01-13 18:56:22 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-04-29 14:01:05 +02:00
# ### Finder
#
# The Finder sticks together reader and retriever in a pipeline to answer our actual questions.

# Retriever narrows the candidate documents; Reader extracts the answers.
finder = Finder(reader, retriever)
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-04-29 14:01:05 +02:00
# ## Voilà! Ask a question!
# You can configure how many candidates the reader and retriever shall return
# The higher top_k_retriever, the better (but also the slower) your answers.
prediction = finder.get_answers(question="Who is the father of Sansa Stark?", top_k_retriever=10, top_k_reader=5)


# Other questions to try:
# prediction = finder.get_answers(question="Who created the Dothraki vocabulary?", top_k_reader=5)
# prediction = finder.get_answers(question="Who is the sister of Sansa?", top_k_reader=5)

# details="minimal" presumably trims the printed output to the essential
# answer fields — see haystack.utils.print_answers for the exact format.
print_answers(prediction, details="minimal")