2021-10-25 15:50:23 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								from  haystack . document_stores  import  ElasticsearchDocumentStore  
						 
					
						
							
								
									
										
										
										
											2020-05-07 10:19:26 +02:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2021-10-25 15:50:23 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								from  haystack . nodes  import  EmbeddingRetriever  
						 
					
						
							
								
									
										
										
										
											2021-10-14 11:49:35 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								from  haystack . utils  import  launch_es  
						 
					
						
							
								
									
										
										
										
											2020-05-07 10:19:26 +02:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								import  pandas  as  pd  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								import  requests  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								import  logging  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								import  subprocess  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								import  time  
						 
					
						
							
								
									
										
										
										
											2021-01-13 18:17:54 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								def  tutorial4_faq_style_qa ( ) :  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    ## "FAQ-Style QA": Utilizing existing FAQs for Question Answering 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # While *extractive Question Answering* works on pure texts and is therefore more generalizable, there's also a common alternative that utilizes existing FAQ data. 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Pros: 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # - Very fast at inference time 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # - Utilize existing FAQ data 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # - Quite good control over answers 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Cons: 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # - Generalizability: We can only answer questions that are similar to existing ones in FAQ 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # In some use cases, a combination of extractive QA and FAQ-style can also be an interesting option. 
							 
						 
					
						
							
								
									
										
										
										
											2021-06-11 11:09:15 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    launch_es ( ) 
							 
						 
					
						
							
								
									
										
										
										
											2021-01-13 18:17:54 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    ### Init the DocumentStore 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # In contrast to Tutorial 1 (extractive QA), we: 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # * specify the name of our `text_field` in Elasticsearch that we want to return as an answer 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # * specify the name of our `embedding_field` in Elasticsearch where we'll store the embedding of our question and that is used later for calculating our similarity to the incoming user question 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # * set `excluded_meta_data=["question_emb"]` so that we don't return the huge embedding vectors in our search results 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    document_store  =  ElasticsearchDocumentStore ( host = " localhost " ,  username = " " ,  password = " " , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                                                index = " document " , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                                                embedding_field = " question_emb " , 
							 
						 
					
						
							
								
									
										
										
										
											2021-09-01 18:39:06 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								                                                embedding_dim = 384 , 
							 
						 
					
						
							
								
									
										
										
										
											2021-01-13 18:17:54 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								                                                excluded_meta_data = [ " question_emb " ] , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                                                similarity = " cosine " ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    ### Create a Retriever using embeddings 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Instead of retrieving via Elasticsearch's plain BM25, we want to use vector similarity of the questions (user question vs. FAQ ones). 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # We can use the `EmbeddingRetriever` for this purpose and specify a model that we use for the embeddings. 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # 
							 
						 
					
						
							
								
									
										
										
										
											2021-09-01 18:39:06 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    retriever  =  EmbeddingRetriever ( document_store = document_store ,  embedding_model = " sentence-transformers/all-MiniLM-L6-v2 " ,  use_gpu = True ) 
							 
						 
					
						
							
								
									
										
										
										
											2021-01-13 18:17:54 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Download a csv containing some FAQ data 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Here: Some question-answer pairs related to COVID-19 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    temp  =  requests . get ( " https://raw.githubusercontent.com/deepset-ai/COVID-QA/master/data/faqs/faq_covidbert.csv " ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    open ( ' small_faq_covid.csv ' ,  ' wb ' ) . write ( temp . content ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Get dataframe with columns "question", "answer" and some custom metadata 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    df  =  pd . read_csv ( " small_faq_covid.csv " ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Minimal cleaning 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    df . fillna ( value = " " ,  inplace = True ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    df [ " question " ]  =  df [ " question " ] . apply ( lambda  x :  x . strip ( ) ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    print ( df . head ( ) ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Get embeddings for our questions from the FAQs 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    questions  =  list ( df [ " question " ] . values ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    df [ " question_emb " ]  =  retriever . embed_queries ( texts = questions ) 
							 
						 
					
						
							
								
									
										
										
										
											2021-10-13 14:23:23 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    df  =  df . rename ( columns = { " question " :  " content " } ) 
							 
						 
					
						
							
								
									
										
										
										
											2021-01-13 18:17:54 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Convert Dataframe to list of dicts and index them in our DocumentStore 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    docs_to_index  =  df . to_dict ( orient = " records " ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    document_store . write_documents ( docs_to_index ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2021-02-09 14:56:54 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    #    Initialize a Pipeline (this time without a reader) and ask questions 
							 
						 
					
						
							
								
									
										
										
										
											2021-01-13 18:17:54 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2021-10-25 15:50:23 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    from  haystack . pipelines  import  FAQPipeline 
							 
						 
					
						
							
								
									
										
										
										
											2021-02-09 14:56:54 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    pipe  =  FAQPipeline ( retriever = retriever ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2021-09-10 11:41:16 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    prediction  =  pipe . run ( query = " How is the virus spreading? " ,  params = { " Retriever " :  { " top_k " :  10 } } ) 
							 
						 
					
						
							
								
									
										
										
										
											2021-10-14 11:49:35 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    for  a  in  prediction [ " answers " ] : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        print ( f " Answer:  { a . answer } " ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        print ( f " Question:  { a . meta [ ' query ' ] } " ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        print ( f " Score:  { a . score } " ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        print ( " --------------------- " ) 
							 
						 
					
						
							
								
									
										
										
										
											2021-01-13 18:17:54 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								if  __name__  ==  " __main__ " :  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    tutorial4_faq_style_qa ( ) 
							 
						 
					
						
							
								
									
										
										
										
											2021-06-11 11:09:15 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								# This Haystack script was made with love by deepset in Berlin, Germany  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								# Haystack: https://github.com/deepset-ai/haystack  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								# deepset: https://deepset.ai/