| 
									
										
										
										
											2021-10-25 15:50:23 +02:00
										 |  |  | from haystack.utils import clean_wiki_text, print_answers, print_documents, fetch_archive_from_http, convert_files_to_dicts, launch_es | 
					
						
							| 
									
										
										
										
											2021-04-29 17:31:28 +02:00
										 |  |  | from pprint import pprint | 
					
						
							|  |  |  | from haystack import Pipeline | 
					
						
							| 
									
										
										
										
											2021-10-25 15:50:23 +02:00
										 |  |  | from haystack.document_stores import ElasticsearchDocumentStore | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  | from haystack.nodes import ElasticsearchRetriever, DensePassageRetriever, FARMReader, RAGenerator, BaseComponent, JoinDocuments | 
					
						
							| 
									
										
										
										
											2021-10-27 10:11:22 +02:00
										 |  |  | from haystack.pipelines import ExtractiveQAPipeline, DocumentSearchPipeline, GenerativeQAPipeline | 
					
						
							| 
									
										
										
										
											2021-04-29 17:31:28 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  | def tutorial11_pipelines(): | 
					
						
							|  |  |  |     #Download and prepare data - 517 Wikipedia articles for Game of Thrones | 
					
						
							|  |  |  |     doc_dir = "data/article_txt_got" | 
					
						
							|  |  |  |     s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip" | 
					
						
							|  |  |  |     fetch_archive_from_http(url=s3_url, output_dir=doc_dir) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # convert files to dicts containing documents that can be indexed to our datastore | 
					
						
							|  |  |  |     got_dicts = convert_files_to_dicts( | 
					
						
							|  |  |  |         dir_path=doc_dir, | 
					
						
							|  |  |  |         clean_func=clean_wiki_text, | 
					
						
							|  |  |  |         split_paragraphs=True | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Initialize DocumentStore and index documents | 
					
						
							|  |  |  |     launch_es() | 
					
						
							|  |  |  |     document_store = ElasticsearchDocumentStore() | 
					
						
							| 
									
										
										
										
											2021-08-30 18:48:28 +05:30
										 |  |  |     document_store.delete_documents() | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  |     document_store.write_documents(got_dicts) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Initialize Sparse retriever | 
					
						
							|  |  |  |     es_retriever = ElasticsearchRetriever(document_store=document_store) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Initialize dense retriever | 
					
						
							|  |  |  |     dpr_retriever = DensePassageRetriever(document_store) | 
					
						
							|  |  |  |     document_store.update_embeddings(dpr_retriever, update_existing_embeddings=False) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |     print() | 
					
						
							|  |  |  |     print("######################") | 
					
						
							|  |  |  |     print("# Prebuilt Pipelines #") | 
					
						
							|  |  |  |     print("######################") | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |     print() | 
					
						
							|  |  |  |     print("# Extractive QA Pipeline") | 
					
						
							|  |  |  |     print("########################") | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |     query="Who is the father of Arya Stark?" | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  |     p_extractive_premade = ExtractiveQAPipeline(reader=reader, retriever=es_retriever) | 
					
						
							|  |  |  |     res = p_extractive_premade.run( | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |         query=query, | 
					
						
							| 
									
										
										
										
											2021-09-10 11:41:16 +02:00
										 |  |  |         params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}, | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  |     ) | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |     print("\nQuery: ", query) | 
					
						
							|  |  |  |     print("Answers:") | 
					
						
							|  |  |  |     print_answers(res, details="minimum") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |     print() | 
					
						
							|  |  |  |     print("# Document Search Pipeline") | 
					
						
							|  |  |  |     print("##########################") | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |     query="Who is the father of Arya Stark?" | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  |     p_retrieval = DocumentSearchPipeline(es_retriever) | 
					
						
							|  |  |  |     res = p_retrieval.run( | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |         query=query, | 
					
						
							| 
									
										
										
										
											2021-09-10 11:41:16 +02:00
										 |  |  |         params={"Retriever": {"top_k": 10}}, | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  |     ) | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |     print() | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  |     print_documents(res, max_text_len=200) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  |     print() | 
					
						
							|  |  |  |     print("# Generator Pipeline") | 
					
						
							|  |  |  |     print("####################") | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # We set this to True so that the document store returns document embeddings | 
					
						
							|  |  |  |     # with each document, this is needed by the Generator | 
					
						
							|  |  |  |     document_store.return_embedding = True | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Initialize generator | 
					
						
							|  |  |  |     rag_generator = RAGenerator() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Generative QA | 
					
						
							|  |  |  |     p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=dpr_retriever) | 
					
						
							|  |  |  |     res = p_generator.run( | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |         query=query, | 
					
						
							| 
									
										
										
										
											2021-09-10 11:41:16 +02:00
										 |  |  |         params={"Retriever": {"top_k": 10}}, | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  |     ) | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |     print() | 
					
						
							|  |  |  |     print_answers(res, details="minimum") | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # We are setting this to False so that in later pipelines, | 
					
						
							|  |  |  |     # we get a cleaner printout | 
					
						
							|  |  |  |     document_store.return_embedding = False | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     ############################## | 
					
						
							|  |  |  |     # Creating Pipeline Diagrams # | 
					
						
							|  |  |  |     ############################## | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     p_extractive_premade.draw("pipeline_extractive_premade.png") | 
					
						
							|  |  |  |     p_retrieval.draw("pipeline_retrieval.png") | 
					
						
							|  |  |  |     p_generator.draw("pipeline_generator.png") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |     print() | 
					
						
							|  |  |  |     print("####################") | 
					
						
							|  |  |  |     print("# Custom Pipelines #") | 
					
						
							|  |  |  |     print("####################") | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |     print() | 
					
						
							|  |  |  |     print("# Extractive QA Pipeline") | 
					
						
							|  |  |  |     print("########################") | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # Custom built extractive QA pipeline | 
					
						
							|  |  |  |     p_extractive = Pipeline() | 
					
						
							|  |  |  |     p_extractive.add_node(component=es_retriever, name="Retriever", inputs=["Query"]) | 
					
						
							|  |  |  |     p_extractive.add_node(component=reader, name="Reader", inputs=["Retriever"]) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Now we can run it | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |     query="Who is the father of Arya Stark?" | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  |     res = p_extractive.run( | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |         query=query, | 
					
						
							| 
									
										
										
										
											2021-09-10 11:41:16 +02:00
										 |  |  |         params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}, | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  |     ) | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |     print("\nQuery: ", query) | 
					
						
							|  |  |  |     print("Answers:") | 
					
						
							|  |  |  |     print_answers(res, details="minimum") | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  |     p_extractive.draw("pipeline_extractive.png") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |     print() | 
					
						
							|  |  |  |     print("# Ensembled Retriever Pipeline") | 
					
						
							|  |  |  |     print("##############################") | 
					
						
							|  |  |  |      | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  |     # Create ensembled pipeline | 
					
						
							|  |  |  |     p_ensemble = Pipeline() | 
					
						
							|  |  |  |     p_ensemble.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"]) | 
					
						
							|  |  |  |     p_ensemble.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["Query"]) | 
					
						
							|  |  |  |     p_ensemble.add_node(component=JoinDocuments(join_mode="concatenate"), name="JoinResults", inputs=["ESRetriever", "DPRRetriever"]) | 
					
						
							|  |  |  |     p_ensemble.add_node(component=reader, name="Reader", inputs=["JoinResults"]) | 
					
						
							|  |  |  |     p_ensemble.draw("pipeline_ensemble.png") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Run pipeline | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |     query="Who is the father of Arya Stark?" | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  |     res = p_ensemble.run( | 
					
						
							|  |  |  |         query="Who is the father of Arya Stark?", | 
					
						
							| 
									
										
										
										
											2021-09-10 11:41:16 +02:00
										 |  |  |         params={"ESRetriever": {"top_k": 5}, "DPRRetriever": {"top_k": 5}}, | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  |     ) | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |     print("\nQuery: ", query) | 
					
						
							|  |  |  |     print("Answers:") | 
					
						
							|  |  |  |     print_answers(res, details="minimum") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |     print() | 
					
						
							|  |  |  |     print("# Query Classification Pipeline") | 
					
						
							|  |  |  |     print("###############################") | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # Decision Nodes help you route your data so that only certain branches of your `Pipeline` are run. | 
					
						
							|  |  |  |     # Though this looks very similar to the ensembled pipeline shown above, | 
					
						
							|  |  |  |     # the key difference is that only one of the retrievers is run for each request. | 
					
						
							|  |  |  |     # By contrast both retrievers are always run in the ensembled approach. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |     class CustomQueryClassifier(BaseComponent): | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  |         outgoing_edges = 2 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-09-10 11:41:16 +02:00
										 |  |  |         def run(self, query): | 
					
						
							|  |  |  |             if "?" in query: | 
					
						
							|  |  |  |                 return {}, "output_2" | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  |             else: | 
					
						
							| 
									
										
										
										
											2021-09-10 11:41:16 +02:00
										 |  |  |                 return {}, "output_1" | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # Here we build the pipeline | 
					
						
							|  |  |  |     p_classifier = Pipeline() | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |     p_classifier.add_node(component=CustomQueryClassifier(), name="QueryClassifier", inputs=["Query"]) | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  |     p_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_1"]) | 
					
						
							|  |  |  |     p_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_2"]) | 
					
						
							|  |  |  |     p_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"]) | 
					
						
							|  |  |  |     p_classifier.draw("pipeline_classifier.png") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Run only the dense retriever on the full sentence query | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |     query="Who is the father of Arya Stark?" | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  |     res_1 = p_classifier.run( | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |         query=query, | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  |     ) | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |     print() | 
					
						
							|  |  |  |     print("\nQuery: ", query) | 
					
						
							|  |  |  |     print(" * DPR Answers:") | 
					
						
							|  |  |  |     print_answers(res_1, details="minimum") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # Run only the sparse retriever on a keyword based query | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |     query="Arya Stark father" | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  |     res_2 = p_classifier.run( | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |         query=query, | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  |     ) | 
					
						
							| 
									
										
										
										
											2021-11-09 15:09:26 +01:00
										 |  |  |     print() | 
					
						
							|  |  |  |     print("\nQuery: ", query) | 
					
						
							|  |  |  |     print(" * ES Answers:") | 
					
						
							|  |  |  |     print_answers(res_2, details="minimum") | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
# Script entry point: run the full pipelines tutorial when executed directly.
if __name__ == "__main__":
    tutorial11_pipelines()
					
						
							| 
									
										
										
										
											2021-06-11 11:09:15 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | # This Haystack script was made with love by deepset in Berlin, Germany | 
					
						
							|  |  |  | # Haystack: https://github.com/deepset-ai/haystack | 
					
						
							| 
									
										
										
										
											2021-07-09 17:08:19 +02:00
										 |  |  | # deepset: https://deepset.ai/ |