| 
									
										
										
										
											2022-07-25 17:57:30 +02:00
										 |  |  | import logging | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # We configure how logging messages should be displayed and which log level should be used before importing Haystack. | 
					
						
							|  |  |  | # Example log message: | 
					
						
							|  |  |  | # INFO - haystack.utils.preprocessing -  Converting data/tutorial1/218_Olenna_Tyrell.txt | 
					
						
							|  |  |  | # Default log level in basicConfig is WARNING so the explicit parameter is not necessary but can be changed easily: | 
					
						
							|  |  |  | logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING) | 
					
						
							|  |  |  | logging.getLogger("haystack").setLevel(logging.INFO) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-03-29 13:53:35 +02:00
										 |  |  | from haystack.utils import convert_files_to_docs, fetch_archive_from_http, clean_wiki_text | 
					
						
							| 
									
										
										
										
											2021-10-25 15:50:23 +02:00
										 |  |  | from haystack.nodes import Seq2SeqGenerator | 
					
						
							| 
									
										
										
										
											2021-06-14 17:53:43 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def tutorial12_lfqa(): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Document Store: | 
					
						
							|  |  |  |     FAISS is a library for efficient similarity search on a cluster of dense vectors. | 
					
						
							|  |  |  |     The `FAISSDocumentStore` uses a SQL(SQLite in-memory be default) database under-the-hood | 
					
						
							|  |  |  |     to store the document text and other meta data. The vector embeddings of the text are | 
					
						
							|  |  |  |     indexed on a FAISS Index that later is queried for searching answers. | 
					
						
							|  |  |  |     The default flavour of FAISSDocumentStore is "Flat" but can also be set to "HNSW" for | 
					
						
							|  |  |  |     faster search at the expense of some accuracy. Just set the faiss_index_factor_str argument in the constructor. | 
					
						
							|  |  |  |     For more info on which suits your use case: https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-10-25 15:50:23 +02:00
										 |  |  |     from haystack.document_stores.faiss import FAISSDocumentStore | 
					
						
							| 
									
										
										
										
											2021-06-14 17:53:43 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-10 17:10:32 +00:00
										 |  |  |     document_store = FAISSDocumentStore(embedding_dim=128, faiss_index_factory_str="Flat") | 
					
						
							| 
									
										
										
										
											2021-06-14 17:53:43 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Cleaning & indexing documents: | 
					
						
							|  |  |  |     Similarly to the previous tutorials, we download, convert and index some Game of Thrones articles to our DocumentStore | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Let's first get some files that we want to use | 
					
						
							| 
									
										
										
										
											2022-03-21 11:58:51 +01:00
										 |  |  |     doc_dir = "data/tutorial12" | 
					
						
							|  |  |  |     s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt12.zip" | 
					
						
							| 
									
										
										
										
											2021-06-14 17:53:43 +02:00
										 |  |  |     fetch_archive_from_http(url=s3_url, output_dir=doc_dir) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Convert files to dicts | 
					
						
							| 
									
										
										
										
											2022-03-29 13:53:35 +02:00
										 |  |  |     docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True) | 
					
						
							| 
									
										
										
										
											2021-06-14 17:53:43 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # Now, let's write the dicts containing documents to our DB. | 
					
						
							| 
									
										
										
										
											2022-03-29 13:53:35 +02:00
										 |  |  |     document_store.write_documents(docs) | 
					
						
							| 
									
										
										
										
											2021-06-14 17:53:43 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2022-06-03 02:57:40 -05:00
										 |  |  |     Initialize Retriever and Reader/Generator: | 
					
						
							| 
									
										
										
										
											2022-03-08 15:11:41 +01:00
										 |  |  |     We use a `DensePassageRetriever` and we invoke `update_embeddings` to index the embeddings of documents in the `FAISSDocumentStore` | 
					
						
							| 
									
										
										
										
											2021-06-14 17:53:43 +02:00
										 |  |  |     """
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-03-08 15:11:41 +01:00
										 |  |  |     from haystack.nodes import DensePassageRetriever | 
					
						
							| 
									
										
										
										
											2021-06-14 17:53:43 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-03-08 15:11:41 +01:00
										 |  |  |     retriever = DensePassageRetriever( | 
					
						
							|  |  |  |         document_store=document_store, | 
					
						
							|  |  |  |         query_embedding_model="vblagoje/dpr-question_encoder-single-lfqa-wiki", | 
					
						
							|  |  |  |         passage_embedding_model="vblagoje/dpr-ctx_encoder-single-lfqa-wiki", | 
					
						
							| 
									
										
										
										
											2022-02-03 13:43:18 +01:00
										 |  |  |     ) | 
					
						
							| 
									
										
										
										
											2021-06-14 17:53:43 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     document_store.update_embeddings(retriever) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-03-08 15:11:41 +01:00
										 |  |  |     """Before we blindly use the `DensePassageRetriever` let's empirically test it to make sure a simple search indeed finds the relevant documents.""" | 
					
						
							| 
									
										
										
										
											2021-06-14 17:53:43 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-10-22 17:22:51 +02:00
										 |  |  |     from haystack.utils import print_documents | 
					
						
							| 
									
										
										
										
											2021-10-25 15:50:23 +02:00
										 |  |  |     from haystack.pipelines import DocumentSearchPipeline | 
					
						
							| 
									
										
										
										
											2021-06-14 17:53:43 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     p_retrieval = DocumentSearchPipeline(retriever) | 
					
						
							| 
									
										
										
										
											2022-02-03 13:43:18 +01:00
										 |  |  |     res = p_retrieval.run(query="Tell me something about Arya Stark?", params={"Retriever": {"top_k": 1}}) | 
					
						
							| 
									
										
										
										
											2021-06-14 17:53:43 +02:00
										 |  |  |     print_documents(res, max_text_len=512) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Similar to previous Tutorials we now initalize our reader/generator. | 
					
						
							| 
									
										
										
										
											2022-03-08 15:11:41 +01:00
										 |  |  |     Here we use a `Seq2SeqGenerator` with the *vblagoje/bart_lfqa* model (see: https://huggingface.co/vblagoje/bart_lfqa) | 
					
						
							| 
									
										
										
										
											2021-06-14 17:53:43 +02:00
										 |  |  |     """
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-03-08 15:11:41 +01:00
										 |  |  |     generator = Seq2SeqGenerator(model_name_or_path="vblagoje/bart_lfqa") | 
					
						
							| 
									
										
										
										
											2021-06-14 17:53:43 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Pipeline: | 
					
						
							|  |  |  |     With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline. | 
					
						
							|  |  |  |     Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases. | 
					
						
							|  |  |  |     To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `GenerativeQAPipeline` that combines a retriever and a reader/generator to answer our questions. | 
					
						
							|  |  |  |     You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd). | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-10-25 15:50:23 +02:00
										 |  |  |     from haystack.pipelines import GenerativeQAPipeline | 
					
						
							| 
									
										
										
										
											2022-02-03 13:43:18 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-14 17:53:43 +02:00
										 |  |  |     pipe = GenerativeQAPipeline(generator, retriever) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """Voilà! Ask a question!""" | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-03-08 15:11:41 +01:00
										 |  |  |     query_1 = "How did Arya Stark's character get portrayed in a television adaptation?" | 
					
						
							|  |  |  |     result_1 = pipe.run(query=query_1, params={"Retriever": {"top_k": 3}}) | 
					
						
							| 
									
										
										
										
											2021-06-14 17:53:43 +02:00
										 |  |  |     print(f"Query: {query_1}") | 
					
						
							|  |  |  |     print(f"Answer: {result_1['answers'][0]}") | 
					
						
							|  |  |  |     print() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-03-08 15:11:41 +01:00
										 |  |  |     query_2 = "Why is Arya Stark an unusual character?" | 
					
						
							|  |  |  |     result_2 = pipe.run(query=query_2, params={"Retriever": {"top_k": 3}}) | 
					
						
							| 
									
										
										
										
											2021-06-14 17:53:43 +02:00
										 |  |  |     print(f"Query: {query_2}") | 
					
						
							|  |  |  |     print(f"Answer: {result_2['answers'][0]}") | 
					
						
							|  |  |  |     print() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
# Run the tutorial only when this file is executed as a script, not on import.
if __name__ == "__main__":
    tutorial12_lfqa()
					
						
							| 
									
										
										
										
											2021-06-14 18:37:00 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # This Haystack script was made with love by deepset in Berlin, Germany | 
					
						
							|  |  |  | # Haystack: https://github.com/deepset-ai/haystack | 
					
						
							| 
									
										
										
										
											2022-02-03 13:43:18 +01:00
										 |  |  | # deepset: https://deepset.ai/ |