| 
									
										
										
										
											2020-10-12 13:34:42 +02:00
										 |  |  | from pathlib import Path | 
					
						
							|  |  |  | from time import perf_counter | 
					
						
							|  |  |  | import logging | 
					
						
							|  |  |  | import datetime | 
					
						
							|  |  |  | import traceback | 
					
						
							| 
									
										
										
										
											2023-05-25 15:39:02 +02:00
										 |  |  | from typing import Dict | 
					
						
							| 
									
										
										
										
											2021-07-26 10:52:52 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-25 15:39:02 +02:00
										 |  |  | from haystack.nodes import BaseRetriever | 
					
						
							|  |  |  | from haystack import Pipeline | 
					
						
							|  |  |  | from haystack.utils import aggregate_labels | 
					
						
							| 
									
										
										
										
											2020-10-12 13:34:42 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-26 13:54:52 +02:00
										 |  |  | from utils import load_eval_data, get_retriever_config | 
					
						
							| 
									
										
										
										
											2020-10-12 13:34:42 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-25 15:39:02 +02:00
										 |  |  | def benchmark_retriever( | 
					
						
							|  |  |  |     indexing_pipeline: Pipeline, querying_pipeline: Pipeline, documents_directory: Path, eval_set: Path | 
					
						
							|  |  |  | ) -> Dict: | 
					
						
							| 
									
										
										
										
											2020-10-15 18:12:17 +02:00
										 |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2023-05-25 15:39:02 +02:00
										 |  |  |     Benchmark indexing and querying on retriever pipelines on a given dataset. | 
					
						
							|  |  |  |     :param indexing_pipeline: Pipeline for indexing documents. | 
					
						
							|  |  |  |     :param querying_pipeline: Pipeline for querying documents. | 
					
						
							|  |  |  |     :param documents_directory: Directory containing files to index. | 
					
						
							|  |  |  |     :param eval_set: Path to evaluation set. | 
					
						
							| 
									
										
										
										
											2020-10-15 18:12:17 +02:00
										 |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2023-05-25 15:39:02 +02:00
										 |  |  |     # Indexing | 
					
						
							|  |  |  |     indexing_results = benchmark_indexing(indexing_pipeline, documents_directory) | 
					
						
							| 
									
										
										
										
											2020-10-15 18:12:17 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-25 15:39:02 +02:00
										 |  |  |     # Querying | 
					
						
							|  |  |  |     querying_results = benchmark_querying(querying_pipeline, eval_set) | 
					
						
							| 
									
										
										
										
											2020-10-15 18:12:17 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-25 15:39:02 +02:00
										 |  |  |     results = {"indexing": indexing_results, "querying": querying_results} | 
					
						
							| 
									
										
										
										
											2023-08-17 12:56:45 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     doc_store = indexing_pipeline.get_document_store() | 
					
						
							|  |  |  |     doc_store.delete_index(index="document") | 
					
						
							| 
									
										
										
										
											2023-05-25 15:39:02 +02:00
										 |  |  |     return results | 
					
						
							| 
									
										
										
										
											2020-10-15 18:12:17 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-25 15:39:02 +02:00
										 |  |  | def benchmark_indexing(pipeline: Pipeline, documents_directory: Path) -> Dict: | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Benchmark indexing. | 
					
						
							|  |  |  |     :param pipeline: Pipeline for indexing documents. | 
					
						
							|  |  |  |     :param documents_directory: Directory containing files to index. | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         # Indexing Pipelines take a list of file paths as input | 
					
						
							|  |  |  |         file_paths = [str(fp) for fp in documents_directory.iterdir() if fp.is_file() and not fp.name.startswith(".")] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # Indexing | 
					
						
							|  |  |  |         start_time = perf_counter() | 
					
						
							|  |  |  |         pipeline.run_batch(file_paths=file_paths) | 
					
						
							|  |  |  |         end_time = perf_counter() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         indexing_time = end_time - start_time | 
					
						
							|  |  |  |         n_docs = len(file_paths) | 
					
						
							|  |  |  |         retrievers = pipeline.get_nodes_by_class(BaseRetriever) | 
					
						
							|  |  |  |         retriever_type = retrievers[0].__class__.__name__ if retrievers else "No component of type BaseRetriever found" | 
					
						
							|  |  |  |         doc_store = pipeline.get_document_store() | 
					
						
							|  |  |  |         doc_store_type = doc_store.__class__.__name__ if doc_store else "No DocumentStore found" | 
					
						
							|  |  |  |         results = { | 
					
						
							|  |  |  |             "retriever": retriever_type, | 
					
						
							|  |  |  |             "doc_store": doc_store_type, | 
					
						
							|  |  |  |             "n_docs": n_docs, | 
					
						
							|  |  |  |             "indexing_time": indexing_time, | 
					
						
							|  |  |  |             "docs_per_second": n_docs / indexing_time, | 
					
						
							|  |  |  |             "date_time": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), | 
					
						
							|  |  |  |             "error": None, | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     except Exception: | 
					
						
							|  |  |  |         tb = traceback.format_exc() | 
					
						
							|  |  |  |         logging.error("##### The following Error was raised while running indexing run:") | 
					
						
							|  |  |  |         logging.error(tb) | 
					
						
							|  |  |  |         retrievers = pipeline.get_nodes_by_class(BaseRetriever) | 
					
						
							|  |  |  |         retriever_type = retrievers[0].__class__.__name__ if retrievers else "No component of type BaseRetriever found" | 
					
						
							|  |  |  |         doc_store = pipeline.get_document_store() | 
					
						
							|  |  |  |         doc_store_type = doc_store.__class__.__name__ if doc_store else "No DocumentStore found" | 
					
						
							|  |  |  |         results = { | 
					
						
							|  |  |  |             "retriever": retriever_type, | 
					
						
							|  |  |  |             "doc_store": doc_store_type, | 
					
						
							|  |  |  |             "n_docs": 0, | 
					
						
							|  |  |  |             "indexing_time": 0, | 
					
						
							|  |  |  |             "docs_per_second": 0, | 
					
						
							|  |  |  |             "date_time": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), | 
					
						
							|  |  |  |             "error": str(tb), | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return results | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def benchmark_querying(pipeline: Pipeline, eval_set: Path) -> Dict: | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Benchmark querying. This method should only be called if indexing has already been done. | 
					
						
							|  |  |  |     :param pipeline: Pipeline for querying documents. | 
					
						
							|  |  |  |     :param eval_set: Path to evaluation set. | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         # Load eval data | 
					
						
							| 
									
										
										
										
											2023-06-01 10:49:54 +02:00
										 |  |  |         labels, _ = load_eval_data(eval_set) | 
					
						
							| 
									
										
										
										
											2023-05-25 15:39:02 +02:00
										 |  |  |         multi_labels = aggregate_labels(labels) | 
					
						
							| 
									
										
										
										
											2023-06-01 10:49:54 +02:00
										 |  |  |         queries = [label.query for label in multi_labels] | 
					
						
							| 
									
										
										
										
											2023-05-25 15:39:02 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |         # Run querying | 
					
						
							|  |  |  |         start_time = perf_counter() | 
					
						
							|  |  |  |         predictions = pipeline.run_batch(queries=queries, labels=multi_labels, debug=True) | 
					
						
							|  |  |  |         end_time = perf_counter() | 
					
						
							|  |  |  |         querying_time = end_time - start_time | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # Evaluate predictions | 
					
						
							|  |  |  |         eval_result = pipeline._generate_eval_result_from_batch_preds(predictions_batches=predictions) | 
					
						
							|  |  |  |         metrics = eval_result.calculate_metrics()["Retriever"] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-26 13:54:52 +02:00
										 |  |  |         retriever_type, retriever_top_k = get_retriever_config(pipeline) | 
					
						
							| 
									
										
										
										
											2023-05-25 15:39:02 +02:00
										 |  |  |         doc_store = pipeline.get_document_store() | 
					
						
							|  |  |  |         doc_store_type = doc_store.__class__.__name__ if doc_store else "No DocumentStore found" | 
					
						
							|  |  |  |         results = { | 
					
						
							|  |  |  |             "retriever": retriever_type, | 
					
						
							|  |  |  |             "doc_store": doc_store_type, | 
					
						
							|  |  |  |             "n_docs": doc_store.get_document_count(), | 
					
						
							|  |  |  |             "n_queries": len(labels), | 
					
						
							|  |  |  |             "querying_time": querying_time, | 
					
						
							|  |  |  |             "queries_per_second": len(labels) / querying_time, | 
					
						
							|  |  |  |             "seconds_per_query": querying_time / len(labels), | 
					
						
							|  |  |  |             "recall": metrics["recall_single_hit"], | 
					
						
							|  |  |  |             "map": metrics["map"], | 
					
						
							|  |  |  |             "top_k": retriever_top_k, | 
					
						
							|  |  |  |             "date_time": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), | 
					
						
							|  |  |  |             "error": None, | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     except Exception: | 
					
						
							|  |  |  |         tb = traceback.format_exc() | 
					
						
							|  |  |  |         logging.error("##### The following Error was raised while running querying run:") | 
					
						
							|  |  |  |         logging.error(tb) | 
					
						
							| 
									
										
										
										
											2023-05-26 13:54:52 +02:00
										 |  |  |         retriever_type, retriever_top_k = get_retriever_config(pipeline) | 
					
						
							| 
									
										
										
										
											2023-05-25 15:39:02 +02:00
										 |  |  |         doc_store = pipeline.get_document_store() | 
					
						
							|  |  |  |         doc_store_type = doc_store.__class__.__name__ if doc_store else "No DocumentStore found" | 
					
						
							|  |  |  |         results = { | 
					
						
							|  |  |  |             "retriever": retriever_type, | 
					
						
							|  |  |  |             "doc_store": doc_store_type, | 
					
						
							|  |  |  |             "n_docs": 0, | 
					
						
							|  |  |  |             "n_queries": 0, | 
					
						
							|  |  |  |             "retrieve_time": 0, | 
					
						
							|  |  |  |             "queries_per_second": 0, | 
					
						
							|  |  |  |             "seconds_per_query": 0, | 
					
						
							|  |  |  |             "recall": 0, | 
					
						
							|  |  |  |             "map": 0, | 
					
						
							| 
									
										
										
										
											2023-05-26 13:54:52 +02:00
										 |  |  |             "top_k": retriever_top_k, | 
					
						
							| 
									
										
										
										
											2023-05-25 15:39:02 +02:00
										 |  |  |             "date_time": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), | 
					
						
							|  |  |  |             "error": str(tb), | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return results |