Docs: Update Rankers docstrings and messages (#6296)
* Update docstrings and messages

* Fix tests

* Fix formatting

* Update haystack/preview/components/rankers/meta_field.py

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>

* Fix tests

---------

Co-authored-by: Silvano Cerza <silvanocerza@gmail.com>
Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
parent 0ef06e72ff
commit 497299c27a
@@ -11,7 +11,7 @@ logger = logging.getLogger(__name__)
 @component
 class MetaFieldRanker:
     """
-    Ranks documents based on the value of a metadata field.
+    Ranks Documents based on the value of their specific metadata field. The ranking is done in a descending order.

     Usage example:
     ```
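The `Usage example` block itself falls outside this hunk's context lines. For orientation, a minimal sketch of how the updated component is used; the `haystack.preview` import paths and the assumption that the default `weight` ranks purely by the metadata field are mine, not from the diff:

```python
from haystack.preview import Document
from haystack.preview.components.rankers import MetaFieldRanker

# Rank by the "rating" metadata field, in descending order.
ranker = MetaFieldRanker(metadata_field="rating")
docs = [
    Document(content="Berlin", meta={"rating": 0.7}),
    Document(content="Barcelona", meta={"rating": 2.1}),
    Document(content="Paris", meta={"rating": 1.3}),
]
output = ranker.run(documents=docs)
print([doc.meta["rating"] for doc in output["documents"]])  # -> [2.1, 1.3, 0.7]
```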
@@ -42,13 +42,13 @@ class MetaFieldRanker:

         :param metadata_field: The name of the metadata field to rank by.
         :param weight: In range [0,1].
-            0 disables sorting by metadata field.
-            0.5 content and metadata fields have the same impact.
-            1 means sorting only by metadata field, highest value comes first.
-        :param top_k: The maximum number of documents to return.
-        :param ranking_mode: The mode used to combine retriever and recentness.
+            0 disables ranking by a metadata field.
+            0.5 content and metadata fields have the same impact for the ranking.
+            1 means ranking by a metadata field only. The highest value comes first.
+        :param top_k: The maximum number of Documents you want the Ranker to return per query.
+        :param ranking_mode: The mode used to combine the Retriever's and Ranker's scores.
             Possible values are 'reciprocal_rank_fusion' (default) and 'linear_score'.
-            Make sure to use 'score' mode only with retrievers/rankers that give back OK score in range [0,1].
+            Use the 'score' mode only with Retrievers or Rankers that return a score in range [0,1].
         """

         self.metadata_field = metadata_field
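A hedged sketch of how the parameters documented above fit together; the field name and the `0.5`/`3` values are illustrative, not from the diff:

```python
from haystack.preview.components.rankers import MetaFieldRanker

# Blend the previous component's relevance scores and the metadata field 50/50;
# 'linear_score' expects the upstream scores to already be in [0,1].
ranker = MetaFieldRanker(
    metadata_field="rating",
    weight=0.5,
    ranking_mode="linear_score",
    top_k=3,
)
```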
@@ -59,9 +59,9 @@ class MetaFieldRanker:
         if self.weight < 0 or self.weight > 1:
             raise ValueError(
                 """
-                Param <weight> needs to be in range [0,1] but was set to '{}'.\n
-                '0' disables sorting by metadata field, '0.5' gives equal weight to previous relevance scores and metadata field, and '1' ranks by metadata field only.\n
-                Please change param <weight> when initializing the MetaFieldRanker.
+                Parameter <weight> must be in range [0,1] but is currently set to '{}'.\n
+                '0' disables sorting by a metadata field, '0.5' assigns equal weight to the previous relevance scores and the metadata field, and '1' ranks by the metadata field only.\n
+                Change the <weight> parameter to a value in range 0 to 1 when initializing the MetaFieldRanker.
                 """.format(
                     self.weight
                 )
@@ -70,8 +70,8 @@ class MetaFieldRanker:
         if self.ranking_mode not in ["reciprocal_rank_fusion", "linear_score"]:
             raise ValueError(
                 """
-                Param <ranking_mode> needs to be 'reciprocal_rank_fusion' or 'linear_score' but was set to '{}'. \n
-                Please change the <ranking_mode> when initializing the MetaFieldRanker.
+                The value of parameter <ranking_mode> must be 'reciprocal_rank_fusion' or 'linear_score', but is currently set to '{}'. \n
+                Change the <ranking_mode> value to 'reciprocal_rank_fusion' or 'linear_score' when initializing the MetaFieldRanker.
                 """.format(
                     self.ranking_mode
                 )
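Both validation branches above raise `ValueError` at construction time. A small sketch of how that surfaces, with illustrative values, in the `pytest` style this repo's tests already use:

```python
import pytest
from haystack.preview.components.rankers import MetaFieldRanker

# A weight outside [0,1] is rejected when the component is initialized.
with pytest.raises(ValueError):
    MetaFieldRanker(metadata_field="rating", weight=1.4)

# So is a ranking_mode other than the two supported ones.
with pytest.raises(ValueError):
    MetaFieldRanker(metadata_field="rating", ranking_mode="recency")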
@@ -92,13 +92,13 @@ class MetaFieldRanker:
     @component.output_types(documents=List[Document])
     def run(self, documents: List[Document], top_k: Optional[int] = None):
         """
-        This method is used to rank a list of documents based on the selected metadata field by:
-        1. Sorting the documents by the metadata field in descending order.
+        Use this method to rank a list of Documents based on the selected metadata field by:
+        1. Sorting the Documents by the metadata field in descending order.
         2. Merging the scores from the metadata field with the scores from the previous component according to the strategy and weight provided.
         3. Returning the top-k documents.

-        :param documents: Documents provided for ranking.
-        :param top_k: (optional) How many documents to return at the end. If not provided, all documents will be returned.
+        :param documents: Documents to be ranked.
+        :param top_k: (optional) The number of Documents you want the Ranker to return. If not provided, the Ranker returns all Documents it received.
         """
         if not documents:
             return {"documents": []}
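A short standalone sketch of the `top_k` behavior documented above; the exact interaction with a `top_k` set at initialization isn't shown in this diff, so only the documented guarantees are asserted:

```python
from haystack.preview import Document
from haystack.preview.components.rankers import MetaFieldRanker

ranker = MetaFieldRanker(metadata_field="rating")
docs = [Document(content=str(i), meta={"rating": i}) for i in range(5)]

# With top_k, at most that many Documents come back.
assert len(ranker.run(documents=docs, top_k=2)["documents"]) <= 2

# Without top_k, the Ranker returns all Documents it received.
assert len(ranker.run(documents=docs)["documents"]) == 5
```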
@@ -113,9 +113,9 @@ class MetaFieldRanker:
         except KeyError:
             raise ComponentError(
                 """
-                Param <metadata_field> was set to '{}' but document(s) {} do not contain this metadata key.\n
-                Please double-check the names of existing metadata fields of your documents \n
-                and set <metadata_field> to the name of the field that contains the metadata you want to rank by.
+                The parameter <metadata_field> is currently set to '{}' but the Documents {} don't have this metadata key.\n
+                Double-check the names of the metadata fields in your documents \n
+                and set <metadata_field> to the name of the field that contains the metadata you want to use for ranking.
                 """.format(
                     self.metadata_field, ",".join([doc.id for doc in documents if self.metadata_field not in doc.meta])
                 )
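A sketch of how the `ComponentError` above surfaces when a Document lacks the configured key; the `ComponentError` import location is an assumption, check your haystack version:

```python
import pytest
from haystack.preview import Document, ComponentError  # ComponentError export location assumed
from haystack.preview.components.rankers import MetaFieldRanker

# The error message lists the ids of the Documents missing the metadata key.
with pytest.raises(ComponentError):
    MetaFieldRanker(metadata_field="rating").run(
        documents=[Document(content="no 'rating' key in meta")]
    )
```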
@@ -129,7 +129,7 @@ class MetaFieldRanker:

     def _merge_scores(self, documents: List[Document], sorted_documents: List[Document]) -> List[Document]:
         """
-        Merge scores for documents sorted both by content and by metadata field.
+        Merge scores for Documents sorted both by their content and by their metadata field.
         """
         scores_map: Dict = defaultdict(int)

@@ -141,10 +141,10 @@ class MetaFieldRanker:
         for i, (doc, sorted_doc) in enumerate(zip(documents, sorted_documents)):
             score = float(0)
             if doc.score is None:
-                warnings.warn("The score was not provided; defaulting to 0")
+                warnings.warn("The score wasn't provided; defaulting to 0.")
             elif doc.score < 0 or doc.score > 1:
                 warnings.warn(
-                    "The score {} for document {} is outside the [0,1] range; defaulting to 0".format(
+                    "The score {} for Document {} is outside the [0,1] range; defaulting to 0".format(
                         doc.score, doc.id
                     )
                 )
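A sketch of triggering the first warning above; `weight=0.5` is an assumption to force score merging, since a weight of 1 sorts by the metadata field alone:

```python
import pytest
from haystack.preview import Document
from haystack.preview.components.rankers import MetaFieldRanker

ranker = MetaFieldRanker(metadata_field="rating", weight=0.5)
doc = Document(content="abc", meta={"rating": 0.7})  # doc.score is None

# Merging inspects each Document's score and warns before defaulting it to 0.
with pytest.warns(UserWarning, match="The score wasn't provided; defaulting to 0."):
    ranker.run(documents=[doc])
```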
@@ -164,7 +164,7 @@ class MetaFieldRanker:
     def _calculate_rrf(rank: int, k: int = 61) -> float:
         """
         Calculates the reciprocal rank fusion. The constant K is set to 61 (60 was suggested by the original paper,
-        plus 1 as python lists are 0-based and the paper [https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf] used 1-based ranking).
+        plus 1 as python lists are 0-based and the [paper](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) used 1-based ranking).
         """
         return 1 / (k + rank)

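A standalone re-statement of the formula above to sanity-check the k=61 constant:

```python
def calculate_rrf(rank: int, k: int = 61) -> float:
    # Same formula as the static method above, outside the class for easy checking.
    return 1 / (k + rank)

# 0-based rank 0 here corresponds to 1-based rank 1 in the paper: 1/(60 + 1).
assert calculate_rrf(0) == 1 / 61
assert calculate_rrf(0) > calculate_rrf(1) > calculate_rrf(2)  # lower rank, higher score
```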
@@ -172,9 +172,9 @@ class MetaFieldRanker:
     def _calc_linear_score(rank: int, amount: int) -> float:
         """
         Calculate the metadata field score as a linear score between the greatest and the lowest score in the list.
-        This linear scaling is useful to
-        a) reduce the effect of outliers and
-        b) create scores that are meaningfully distributed in [0,1],
-        similar to scores coming from a retriever/ranker.
+        This linear scaling is useful for:
+        - Reducing the effect of outliers
+        - Creating scores that are meaningfully distributed in the range [0,1],
+          similar to scores coming from a Retriever or Ranker.
         """
         return (amount - rank) / amount

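Likewise for the linear score, re-stated standalone so the distribution over ranks is visible:

```python
def calc_linear_score(rank: int, amount: int) -> float:
    # Same formula as the static method above, outside the class for easy checking.
    return (amount - rank) / amount

# For 4 Documents, 0-based ranks 0..3 map to evenly spaced scores in (0, 1].
assert [calc_linear_score(rank, 4) for rank in range(4)] == [1.0, 0.75, 0.5, 0.25]
```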
@@ -16,8 +16,8 @@ with LazyImport(message="Run 'pip install transformers[torch,sentencepiece]'") a
 @component
 class TransformersSimilarityRanker:
     """
-    Ranks documents based on query similarity.
-    It uses a pre-trained cross-encoder model (from Hugging Face Hub) to embed the query and documents.
+    Ranks Documents based on their similarity to the query.
+    It uses a pre-trained cross-encoder model (from the Hugging Face Hub) to embed the query and the Documents.

     Usage example:
     ```
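As above, the usage example block sits outside the diff context. A minimal sketch, assuming the `haystack.preview` import path and a commonly used cross-encoder model name (both are assumptions, not from the diff):

```python
from haystack.preview import Document
from haystack.preview.components.rankers import TransformersSimilarityRanker

ranker = TransformersSimilarityRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-6-v2")
ranker.warm_up()  # downloads/loads the model; required before run()

docs = [
    Document(content="Paris is in France."),
    Document(content="Berlin is in Germany."),
]
output = ranker.run(query="Which city is in France?", documents=docs, top_k=1)
print(output["documents"][0].content)  # expected: "Paris is in France."
```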
@@ -45,12 +45,12 @@ class TransformersSimilarityRanker:
         Creates an instance of TransformersSimilarityRanker.

         :param model_name_or_path: The name or path of a pre-trained cross-encoder model
-            from Hugging Face Hub.
-        :param device: torch device (for example, cuda:0, cpu, mps) to limit model inference to a specific device.
+            from the Hugging Face Hub.
+        :param device: The torch device (for example, cuda:0, cpu, mps) to which you want to limit model inference.
         :param token: The API token used to download private models from Hugging Face.
-            If this parameter is set to `True`, then the token generated when running
-            `transformers-cli login` (stored in ~/.huggingface) will be used.
-        :param top_k: The maximum number of documents to return per query.
+            If this parameter is set to `True`, the token generated when running
+            `transformers-cli login` (stored in ~/.huggingface) is used.
+        :param top_k: The maximum number of Documents to return per query.
         """
         torch_and_transformers_import.check()

@@ -71,7 +71,7 @@ class TransformersSimilarityRanker:

     def warm_up(self):
         """
-        Warm up the model and tokenizer used in scoring the documents.
+        Warm up the model and tokenizer used for scoring the Documents.
         """
         if self.model_name_or_path and not self.model:
             self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name_or_path, token=self.token)
@@ -94,12 +94,12 @@ class TransformersSimilarityRanker:
     @component.output_types(documents=List[Document])
     def run(self, query: str, documents: List[Document], top_k: Optional[int] = None):
         """
-        Returns a list of documents ranked by their similarity to the given query
+        Returns a list of Documents ranked by their similarity to the given query.

         :param query: Query string.
         :param documents: List of Documents.
-        :param top_k: The maximum number of documents to return.
-        :return: List of Documents sorted by (desc.) similarity with the query.
+        :param top_k: The maximum number of Documents you want the Ranker to return.
+        :return: List of Documents sorted by their similarity to the query with the most similar Documents appearing first.
         """
         if not documents:
             return {"documents": []}
@@ -113,7 +113,7 @@ class TransformersSimilarityRanker:
         # If a model path is provided but the model isn't loaded
         if self.model_name_or_path and not self.model:
             raise ComponentError(
-                f"The component {self.__class__.__name__} not warmed up. Run 'warm_up()' before calling 'run()'."
+                f"The component {self.__class__.__name__} wasn't warmed up. Run 'warm_up()' before calling 'run()'."
             )

         query_doc_pairs = [[query, doc.content] for doc in documents]
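A sketch of the guard above; it assumes the component has a default `model_name_or_path` and assumes `ComponentError`'s import location (both are assumptions):

```python
import pytest
from haystack.preview import Document, ComponentError  # ComponentError export location assumed
from haystack.preview.components.rankers import TransformersSimilarityRanker

ranker = TransformersSimilarityRanker()  # model path is set, but the model isn't loaded yet
with pytest.raises(ComponentError, match="wasn't warmed up"):
    ranker.run(query="anything", documents=[Document(content="abc")])  # warm_up() not called
```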
@@ -105,7 +105,7 @@ class TestMetaFieldRanker:
             Document(id=3, content="abc", meta={"rating": 2.1}, score=0.6),
         ]
         with pytest.warns(
-            UserWarning, match=rf"The score {score} for document 1 is outside the \[0,1\] range; defaulting to 0"
+            UserWarning, match=rf"The score {score} for Document 1 is outside the \[0,1\] range; defaulting to 0"
         ):
             ranker.run(documents=docs_before)

@@ -117,5 +117,6 @@ class TestMetaFieldRanker:
             Document(content="abc", meta={"rating": 0.7}),
             Document(content="abc", meta={"rating": 2.1}),
         ]
-        with pytest.warns(UserWarning, match="The score was not provided; defaulting to 0"):
+
+        with pytest.warns(UserWarning, match="The score wasn't provided; defaulting to 0."):
             ranker.run(documents=docs_before)