Mirror of https://github.com/deepset-ai/haystack.git (synced 2025-10-31 09:49:48 +00:00)
	Docs: Update Rankers docstrings and messages (#6296)
* Update docstrings and messages
* Fix tests
* Fix formatting
* Update haystack/preview/components/rankers/meta_field.py
  Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
* Fix tests

---------

Co-authored-by: Silvano Cerza <silvanocerza@gmail.com>
Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
parent 0ef06e72ff
commit 497299c27a
@@ -11,7 +11,7 @@ logger = logging.getLogger(__name__)
 @component
 class MetaFieldRanker:
     """
-    Ranks documents based on the value of a metadata field.
+    Ranks Documents based on the value of their specific metadata field. The ranking is done in a descending order.

     Usage example:
     ```
@@ -42,13 +42,13 @@ class MetaFieldRanker:

         :param metadata_field: The name of the metadata field to rank by.
         :param weight: In range [0,1].
-                0 disables sorting by metadata field.
-                0.5 content and metadata fields have the same impact.
-                1 means sorting only by metadata field, highest value comes first.
-        :param top_k: The maximum number of documents to return.
-        :param ranking_mode: The mode used to combine retriever and recentness.
+                0 disables ranking by a metadata field.
+                0.5 content and metadata fields have the same impact for the ranking.
+                1 means ranking by a metadata field only. The highest value comes first.
+        :param top_k: The maximum number of Documents you want the Ranker to return per query.
+        :param ranking_mode: The mode used to combine the Retriever's and Ranker's scores.
                 Possible values are 'reciprocal_rank_fusion' (default) and 'linear_score'.
-                Make sure to use 'score' mode only with retrievers/rankers that give back OK score in range [0,1].
+                Use the 'score' mode only with Retrievers or Rankers that return a score in range [0,1].
         """

         self.metadata_field = metadata_field
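For orientation, here is a minimal sketch of how the parameters documented in this hunk fit together. The import path follows the file this commit touches (haystack/preview/components/rankers/meta_field.py), and the parameter values are purely illustrative:

```
from haystack.preview.components.rankers.meta_field import MetaFieldRanker

# Rank by the "rating" metadata key (the key used in the tests further down).
# weight=1.0 ranks by the metadata field only, top_k caps how many Documents come back,
# and ranking_mode chooses how Retriever and Ranker scores are combined.
ranker = MetaFieldRanker(
    metadata_field="rating",
    weight=1.0,
    top_k=5,
    ranking_mode="reciprocal_rank_fusion",
)
```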
@@ -59,9 +59,9 @@ class MetaFieldRanker:
         if self.weight < 0 or self.weight > 1:
             raise ValueError(
                 """
-                Param <weight> needs to be in range [0,1] but was set to '{}'.\n
-                '0' disables sorting by metadata field, '0.5' gives equal weight to previous relevance scores and metadata field, and '1' ranks by metadata field only.\n
-                Please change param <weight> when initializing the MetaFieldRanker.
+                Parameter <weight> must be in range [0,1] but is currently set to '{}'.\n
+                '0' disables sorting by a metadata field, '0.5' assigns equal weight to the previous relevance scores and the metadata field, and '1' ranks by the metadata field only.\n
+                Change the <weight> parameter to a value in range 0 to 1 when initializing the MetaFieldRanker.
                 """.format(
                     self.weight
                 )
@@ -70,8 +70,8 @@ class MetaFieldRanker:
         if self.ranking_mode not in ["reciprocal_rank_fusion", "linear_score"]:
             raise ValueError(
                 """
-                Param <ranking_mode> needs to be 'reciprocal_rank_fusion' or 'linear_score' but was set to '{}'. \n
-                Please change the <ranking_mode> when initializing the MetaFieldRanker.
+                The value of parameter <ranking_mode> must be 'reciprocal_rank_fusion' or 'linear_score', but is currently set to '{}'. \n
+                Change the <ranking_mode> value to 'reciprocal_rank_fusion' or 'linear_score' when initializing the MetaFieldRanker.
                 """.format(
                     self.ranking_mode
                 )
@@ -92,13 +92,13 @@ class MetaFieldRanker:
     @component.output_types(documents=List[Document])
     def run(self, documents: List[Document], top_k: Optional[int] = None):
         """
-        This method is used to rank a list of documents based on the selected metadata field by:
-        1. Sorting the documents by the metadata field in descending order.
+        Use this method to rank a list of Documents based on the selected metadata field by:
+        1. Sorting the Documents by the metadata field in descending order.
         2. Merging the scores from the metadata field with the scores from the previous component according to the strategy and weight provided.
         3. Returning the top-k documents.

-        :param documents: Documents provided for ranking.
-        :param top_k: (optional) How many documents to return at the end. If not provided, all documents will be returned.
+        :param documents: Documents to be ranked.
+        :param top_k: (optional) The number of Documents you want the Ranker to return. If not provided, the Ranker returns all Documents it received.
         """
         if not documents:
             return {"documents": []}
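To illustrate the run() contract described in this docstring, a small sketch; the Document construction mirrors the tests later in this diff, while the `from haystack.preview import Document` import path is an assumption:

```
from haystack.preview import Document  # assumed preview import path
from haystack.preview.components.rankers.meta_field import MetaFieldRanker

ranker = MetaFieldRanker(metadata_field="rating", weight=1.0)
docs = [
    Document(content="abc", meta={"rating": 0.7}),
    Document(content="abc", meta={"rating": 2.1}),
    Document(content="abc", meta={"rating": 1.3}),
]

# run() sorts by the metadata field, merges scores according to ranking_mode and weight,
# and returns at most top_k Documents under the "documents" key.
result = ranker.run(documents=docs, top_k=2)
print([doc.meta["rating"] for doc in result["documents"]])  # highest ratings first
```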
@@ -113,9 +113,9 @@ class MetaFieldRanker:
         except KeyError:
             raise ComponentError(
                 """
-                Param <metadata_field> was set to '{}' but document(s) {} do not contain this metadata key.\n
-                Please double-check the names of existing metadata fields of your documents \n
-                and set <metadata_field> to the name of the field that contains the metadata you want to rank by.
+                The parameter <metadata_field> is currently set to '{}' but the Documents {} don't have this metadata key.\n
+                Double-check the names of the metadata fields in your documents \n
+                and set <metadata_field> to the name of the field that contains the metadata you want to use for ranking.
                 """.format(
                     self.metadata_field, ",".join([doc.id for doc in documents if self.metadata_field not in doc.meta])
                 )
@@ -129,7 +129,7 @@ class MetaFieldRanker:

     def _merge_scores(self, documents: List[Document], sorted_documents: List[Document]) -> List[Document]:
         """
-        Merge scores for documents sorted both by content and by metadata field.
+        Merge scores for Documents sorted both by their content and by their metadata field.
         """
         scores_map: Dict = defaultdict(int)

@@ -141,10 +141,10 @@ class MetaFieldRanker:
             for i, (doc, sorted_doc) in enumerate(zip(documents, sorted_documents)):
                 score = float(0)
                 if doc.score is None:
-                    warnings.warn("The score was not provided; defaulting to 0")
+                    warnings.warn("The score wasn't provided; defaulting to 0.")
                 elif doc.score < 0 or doc.score > 1:
                     warnings.warn(
-                        "The score {} for document {} is outside the [0,1] range; defaulting to 0".format(
+                        "The score {} for Document {} is outside the [0,1] range; defaulting to 0".format(
                             doc.score, doc.id
                         )
                     )
@@ -164,7 +164,7 @@ class MetaFieldRanker:
     def _calculate_rrf(rank: int, k: int = 61) -> float:
         """
         Calculates the reciprocal rank fusion. The constant K is set to 61 (60 was suggested by the original paper,
-        plus 1 as python lists are 0-based and the paper [https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf] used 1-based ranking).
+        plus 1 as python lists are 0-based and the [paper](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) used 1-based ranking).
         """
         return 1 / (k + rank)

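To make the constant concrete, a quick standalone check of the values the reciprocal rank fusion helper above produces for the first few 0-based ranks (the function is copied from the diff, only renamed for the sketch):

```
# Reciprocal rank fusion as shown above: score = 1 / (k + rank), with k = 61.
def calculate_rrf(rank: int, k: int = 61) -> float:
    return 1 / (k + rank)

# Ranks are 0-based, so the top Document gets 1/61.
print(calculate_rrf(0))  # 0.01639... (1/61)
print(calculate_rrf(1))  # 0.01612... (1/62)
print(calculate_rrf(9))  # 0.01428... (1/70)
```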
@@ -172,9 +172,9 @@ class MetaFieldRanker:
     def _calc_linear_score(rank: int, amount: int) -> float:
         """
         Calculate the metadata field score as a linear score between the greatest and the lowest score in the list.
-        This linear scaling is useful to
-          a) reduce the effect of outliers and
-          b) create scores that are meaningfully distributed in [0,1],
-             similar to scores coming from a retriever/ranker.
+        This linear scaling is useful for:
+          - Reducing the effect of outliers
+          - Creating scores that are meaningfully distributed in the range [0,1],
+             similar to scores coming from a Retriever or Ranker.
         """
         return (amount - rank) / amount
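Similarly, a worked example of the linear metadata score above: with four Documents, 0-based ranks 0 through 3 map to evenly spaced scores in (0, 1], which is what makes them comparable to Retriever or Ranker scores:

```
# Linear metadata score as shown above: (amount - rank) / amount.
def calc_linear_score(rank: int, amount: int) -> float:
    return (amount - rank) / amount

print([calc_linear_score(rank, amount=4) for rank in range(4)])  # [1.0, 0.75, 0.5, 0.25]
```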

@@ -16,8 +16,8 @@ with LazyImport(message="Run 'pip install transformers[torch,sentencepiece]'") a
 @component
 class TransformersSimilarityRanker:
     """
-    Ranks documents based on query similarity.
-    It uses a pre-trained cross-encoder model (from Hugging Face Hub) to embed the query and documents.
+    Ranks Documents based on their similarity to the query.
+    It uses a pre-trained cross-encoder model (from the Hugging Face Hub) to embed the query and the Documents.

     Usage example:
     ```
@@ -45,12 +45,12 @@ class TransformersSimilarityRanker:
         Creates an instance of TransformersSimilarityRanker.

         :param model_name_or_path: The name or path of a pre-trained cross-encoder model
-            from Hugging Face Hub.
-        :param device: torch device (for example, cuda:0, cpu, mps) to limit model inference to a specific device.
+            from the Hugging Face Hub.
+        :param device: The torch device (for example, cuda:0, cpu, mps) to which you want to limit model inference.
         :param token: The API token used to download private models from Hugging Face.
-            If this parameter is set to `True`, then the token generated when running
-            `transformers-cli login` (stored in ~/.huggingface) will be used.
-        :param top_k: The maximum number of documents to return per query.
+            If this parameter is set to `True`, the token generated when running
+            `transformers-cli login` (stored in ~/.huggingface) is used.
+        :param top_k: The maximum number of Documents to return per query.
         """
         torch_and_transformers_import.check()

@@ -71,7 +71,7 @@ class TransformersSimilarityRanker:

     def warm_up(self):
         """
-        Warm up the model and tokenizer used in scoring the documents.
+        Warm up the model and tokenizer used for scoring the Documents.
         """
         if self.model_name_or_path and not self.model:
             self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name_or_path, token=self.token)
@@ -94,12 +94,12 @@ class TransformersSimilarityRanker:
     @component.output_types(documents=List[Document])
     def run(self, query: str, documents: List[Document], top_k: Optional[int] = None):
         """
-        Returns a list of documents ranked by their similarity to the given query
+        Returns a list of Documents ranked by their similarity to the given query.

         :param query: Query string.
         :param documents: List of Documents.
-        :param top_k: The maximum number of documents to return.
-        :return: List of Documents sorted by (desc.) similarity with the query.
+        :param top_k: The maximum number of Documents you want the Ranker to return.
+        :return: List of Documents sorted by their similarity to the query with the most similar Documents appearing first.
         """
         if not documents:
             return {"documents": []}
@@ -113,7 +113,7 @@ class TransformersSimilarityRanker:
         # If a model path is provided but the model isn't loaded
         if self.model_name_or_path and not self.model:
             raise ComponentError(
-                f"The component {self.__class__.__name__} not warmed up. Run 'warm_up()' before calling 'run()'."
+                f"The component {self.__class__.__name__} wasn't warmed up. Run 'warm_up()' before calling 'run()'."
             )

         query_doc_pairs = [[query, doc.content] for doc in documents]
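A minimal usage sketch of the warm-up-then-run flow these hunks describe. The module path and the model checkpoint are assumptions for illustration (the diff shows the class but not its module, and no default model):

```
from haystack.preview import Document  # assumed preview import path
from haystack.preview.components.rankers import TransformersSimilarityRanker  # assumed module path

# The checkpoint is illustrative; any Hugging Face cross-encoder model should work here.
ranker = TransformersSimilarityRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-6-v2", top_k=2)
ranker.warm_up()  # loads the model and tokenizer; run() raises ComponentError without this

docs = [
    Document(content="Paris is the capital of France."),
    Document(content="Berlin is the capital of Germany."),
]
result = ranker.run(query="What is the capital of France?", documents=docs)
print([doc.content for doc in result["documents"]])  # most similar Documents first
```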

@@ -105,7 +105,7 @@ class TestMetaFieldRanker:
             Document(id=3, content="abc", meta={"rating": 2.1}, score=0.6),
         ]
         with pytest.warns(
-            UserWarning, match=rf"The score {score} for document 1 is outside the \[0,1\] range; defaulting to 0"
+            UserWarning, match=rf"The score {score} for Document 1 is outside the \[0,1\] range; defaulting to 0"
         ):
             ranker.run(documents=docs_before)

@@ -117,5 +117,6 @@ class TestMetaFieldRanker:
             Document(content="abc", meta={"rating": 0.7}),
             Document(content="abc", meta={"rating": 2.1}),
         ]
-        with pytest.warns(UserWarning, match="The score was not provided; defaulting to 0"):
+
+        with pytest.warns(UserWarning, match="The score wasn't provided; defaulting to 0."):
             ranker.run(documents=docs_before)
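One note on the assertions above: pytest treats the match argument as a regular expression applied with re.search, which is why the first test escapes the literal [0,1] brackets. A standalone sketch of the same check, with an illustrative score value:

```
import warnings

import pytest

def emit_warning():
    # Same wording as the Ranker's warning above, with an illustrative score.
    warnings.warn("The score 5.5 for Document 1 is outside the [0,1] range; defaulting to 0")

# Literal brackets must be escaped because match is interpreted as a regex.
with pytest.warns(UserWarning, match=r"The score 5\.5 for Document 1 is outside the \[0,1\] range"):
    emit_warning()
```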
Agnieszka Marzec