| 
									
										
										
										
											2021-10-25 15:50:23 +02:00
										 |  |  | from haystack.document_stores import FAISSDocumentStore, MilvusDocumentStore | 
					
						
							| 
									
										
										
										
											2022-03-29 13:53:35 +02:00
										 |  |  | from haystack.utils import clean_wiki_text, print_answers, launch_milvus, convert_files_to_docs, fetch_archive_from_http | 
					
						
							| 
									
										
										
										
											2021-10-25 15:50:23 +02:00
										 |  |  | from haystack.nodes import FARMReader, DensePassageRetriever | 
					
						
							| 
									
										
										
										
											2020-06-30 19:05:45 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-02-03 13:43:18 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-01-13 18:17:54 +01:00
def tutorial6_better_retrieval_via_dpr():
    """Tutorial 6: Better retrieval via Dense Passage Retrieval (DPR).

    End-to-end extractive QA demo: fetches a Game of Thrones wiki corpus over
    HTTP, indexes it in a FAISSDocumentStore, embeds all documents with a
    DensePassageRetriever, loads a FARMReader QA model, and runs a sample
    query through an ExtractiveQAPipeline, printing the answers.

    Note: downloads data and (large) models, and runs inference — this is a
    long-running, side-effecting tutorial script, not a pure function.
    """
    # OPTION 1: FAISS is a library for efficient similarity search on a cluster of dense vectors.
    # The FAISSDocumentStore uses a SQL (SQLite in-memory by default) document store under the hood
    # to store the document text and other metadata. The vector embeddings of the text are
    # indexed on a FAISS index that later is queried for searching answers.
    # The default flavour of FAISSDocumentStore is "Flat" but can also be set to "HNSW" for
    # faster search at the expense of some accuracy. Just set the faiss_index_factory_str argument in the constructor.
    # For more info on which suits your use case: https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index

    # Do not forget to install its dependencies with `pip install farm-haystack[faiss]`
    document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")

    # OPTION 2: Milvus is an open source database library that is also optimized for vector similarity searches like FAISS.
    # Like FAISS it has both a "Flat" and "HNSW" mode but it outperforms FAISS when it comes to dynamic data management.
    # It does require a little more setup, however, as it is run through Docker and requires the setup of some config files.
    # See https://milvus.io/docs/v1.0.0/milvus_docker-cpu.md

    # Do not forget to install its dependencies with `pip install farm-haystack[milvus1]`
    # launch_milvus()
    # document_store = MilvusDocumentStore()

    # ## Preprocessing of documents
    # Let's first get some documents that we want to query
    doc_dir = "data/tutorial6"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt6.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # Convert files to docs containing documents that can be indexed to our datastore
    docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)

    # Now, let's write the docs to our DB.
    document_store.write_documents(docs)

    # ## Retriever
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=64,
        max_seq_len_passage=256,
        batch_size=2,
        use_gpu=True,
        embed_title=True,
        use_fast_tokenizers=True,
    )

    # Important:
    # Now that we have the DPR initialized, we need to call update_embeddings() to iterate over all
    # previously indexed documents and update their embedding representation.
    # While this can be a time consuming operation (depending on corpus size), it only needs to be done once.
    # At query time, we only need to embed the query and compare it to the existing doc embeddings, which is very fast.
    document_store.update_embeddings(retriever)

    # ## Reader
    # Load a local model or any of the QA models on
    # Hugging Face's model hub (https://huggingface.co/models)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

    # ## Pipeline
    from haystack.pipelines import ExtractiveQAPipeline

    pipe = ExtractiveQAPipeline(reader, retriever)

    # Voilà! Ask a question!
    prediction = pipe.run(
        query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
    )

    # prediction = pipe.run(query="Who created the Dothraki vocabulary?", params={"Reader": {"top_k": 5}})
    # prediction = pipe.run(query="Who is the sister of Sansa?", params={"Reader": {"top_k": 5}})

    print_answers(prediction, details="minimum")
					
						
							| 
									
										
										
										
											2020-06-30 19:05:45 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-01-13 18:17:54 +01:00
										 |  |  | 
 | 
					
						
if __name__ == "__main__":
    # Script entry point: runs the full tutorial (downloads corpus and models,
    # builds the index, and answers a sample question).
    tutorial6_better_retrieval_via_dpr()
					
						
							| 
									
										
										
										
											2021-06-11 11:09:15 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | # This Haystack script was made with love by deepset in Berlin, Germany | 
					
						
							|  |  |  | # Haystack: https://github.com/deepset-ai/haystack | 
					
						
							| 
									
										
										
										
											2022-02-03 13:43:18 +01:00
										 |  |  | # deepset: https://deepset.ai/ |