diff --git a/haystack/components/rankers/meta_field.py b/haystack/components/rankers/meta_field.py index 964d4db97..b14b10514 100644 --- a/haystack/components/rankers/meta_field.py +++ b/haystack/components/rankers/meta_field.py @@ -43,6 +43,7 @@ class MetaFieldRanker: top_k: Optional[int] = None, ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"] = "reciprocal_rank_fusion", sort_order: Literal["ascending", "descending"] = "descending", + missing_meta: Literal["drop", "top", "bottom"] = "bottom", meta_value_type: Optional[Literal["float", "int", "date"]] = None, ): """ @@ -65,6 +66,14 @@ class MetaFieldRanker: :param sort_order: Whether to sort the meta field by ascending or descending order. Possible values are `descending` (default) and `ascending`. + :param missing_meta: + What to do with documents that are missing the sorting metadata field. + Possible values are: + - 'drop' will drop the documents entirely. + - 'top' will place the documents at the top of the metadata-sorted list + (regardless of 'ascending' or 'descending'). + - 'bottom' will place the documents at the bottom of metadata-sorted list + (regardless of 'ascending' or 'descending'). :param meta_value_type: Parse the meta value into the data type specified before sorting. This will only work if all meta values stored under `meta_field` in the provided documents are strings. @@ -82,11 +91,13 @@ class MetaFieldRanker: self.top_k = top_k self.ranking_mode = ranking_mode self.sort_order = sort_order + self.missing_meta = missing_meta self._validate_params( weight=self.weight, top_k=self.top_k, ranking_mode=self.ranking_mode, sort_order=self.sort_order, + missing_meta=self.missing_meta, meta_value_type=meta_value_type, ) self.meta_value_type = meta_value_type @@ -97,6 +108,7 @@ class MetaFieldRanker: top_k: Optional[int], ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"], sort_order: Literal["ascending", "descending"], + missing_meta: Literal["drop", "top", "bottom"], meta_value_type: Optional[Literal["float", "int", "date"]], ): if top_k is not None and top_k <= 0: @@ -125,6 +137,14 @@ class MetaFieldRanker: "MetaFieldRanker." % sort_order ) + if missing_meta not in ["drop", "top", "bottom"]: + raise ValueError( + "The value of parameter must be 'drop', 'top', or 'bottom', " + "but is currently set to '%s'.\n" + "Change the value to 'drop', 'top', or 'bottom' when initializing the " + "MetaFieldRanker." % missing_meta + ) + if meta_value_type not in ["float", "int", "date", None]: raise ValueError( "The value of parameter must be 'float', 'int', 'date' or None but is " @@ -141,6 +161,7 @@ class MetaFieldRanker: weight: Optional[float] = None, ranking_mode: Optional[Literal["reciprocal_rank_fusion", "linear_score"]] = None, sort_order: Optional[Literal["ascending", "descending"]] = None, + missing_meta: Optional[Literal["drop", "top", "bottom"]] = None, meta_value_type: Optional[Literal["float", "int", "date"]] = None, ): """ @@ -171,6 +192,15 @@ class MetaFieldRanker: Whether to sort the meta field by ascending or descending order. Possible values are `descending` (default) and `ascending`. If not provided, the sort_order provided at initialization time is used. + :param missing_meta: + What to do with documents that are missing the sorting metadata field. + Possible values are: + - 'drop' will drop the documents entirely. + - 'top' will place the documents at the top of the metadata-sorted list + (regardless of 'ascending' or 'descending'). + - 'bottom' will place the documents at the bottom of metadata-sorted list + (regardless of 'ascending' or 'descending'). + If not provided, the missing_meta provided at initialization time is used. :param meta_value_type: Parse the meta value into the data type specified before sorting. This will only work if all meta values stored under `meta_field` in the provided documents are strings. @@ -199,12 +229,14 @@ class MetaFieldRanker: weight = weight if weight is not None else self.weight ranking_mode = ranking_mode or self.ranking_mode sort_order = sort_order or self.sort_order + missing_meta = missing_meta or self.missing_meta meta_value_type = meta_value_type or self.meta_value_type self._validate_params( weight=weight, top_k=top_k, ranking_mode=ranking_mode, sort_order=sort_order, + missing_meta=missing_meta, meta_value_type=meta_value_type, ) @@ -227,13 +259,27 @@ class MetaFieldRanker: return {"documents": documents[:top_k]} if len(docs_missing_meta_field) > 0: - logger.warning( - "The parameter is currently set to '{meta_field}' but the Documents with IDs {document_ids} don't have this meta key.\n" - "These Documents will be placed at the end of the sorting order.", - meta_field=self.meta_field, - document_ids=",".join([doc.id for doc in docs_missing_meta_field]), + warning_start = ( + f"The parameter is currently set to '{self.meta_field}' but the Documents " + f"with IDs {','.join([doc.id for doc in docs_missing_meta_field])} don't have this meta key.\n" ) + if missing_meta == "bottom": + logger.warning( + "{warning_start}Because the parameter is set to 'bottom', these Documents will be placed at the end of the sorting order.", + warning_start=warning_start, + ) + elif missing_meta == "top": + logger.warning( + "{warning_start}Because the parameter is set to 'top', these Documents will be placed at the top of the sorting order.", + warning_start=warning_start, + ) + else: + logger.warning( + "{warning_start}Because the parameter is set to 'drop', these Documents will be removed from the list of retrieved Documents.", + warning_start=warning_start, + ) + # If meta_value_type is provided try to parse the meta values parsed_meta = self._parse_meta(docs_with_meta_field=docs_with_meta_field, meta_value_type=meta_value_type) tuple_parsed_meta_and_docs = list(zip(parsed_meta, docs_with_meta_field)) @@ -252,10 +298,18 @@ class MetaFieldRanker: ) return {"documents": documents[:top_k]} - # Add the docs missing the meta_field back on the end + # Merge rankings and handle missing meta fields as specified in the missing_meta parameter sorted_by_meta = [doc for meta, doc in tuple_sorted_by_meta] - sorted_documents = sorted_by_meta + docs_missing_meta_field - sorted_documents = self._merge_rankings(documents, sorted_documents, weight, ranking_mode) + if missing_meta == "bottom": + sorted_documents = sorted_by_meta + docs_missing_meta_field + sorted_documents = self._merge_rankings(documents, sorted_documents, weight, ranking_mode) + elif missing_meta == "top": + sorted_documents = docs_missing_meta_field + sorted_by_meta + sorted_documents = self._merge_rankings(documents, sorted_documents, weight, ranking_mode) + else: + sorted_documents = sorted_by_meta + sorted_documents = self._merge_rankings(docs_with_meta_field, sorted_documents, weight, ranking_mode) + return {"documents": sorted_documents[:top_k]} def _parse_meta( diff --git a/releasenotes/notes/metaranker-missing-meta-options-1a969008d632a523.yaml b/releasenotes/notes/metaranker-missing-meta-options-1a969008d632a523.yaml new file mode 100644 index 000000000..46cb0c82a --- /dev/null +++ b/releasenotes/notes/metaranker-missing-meta-options-1a969008d632a523.yaml @@ -0,0 +1,7 @@ +--- +features: + - | + Add a new `missing_meta` param to `MetaFieldRanker`, which determines what to do with + documents that lack the ranked meta field. Supported values are `"bottom"` (which + puts documents with missing meta at the bottom of the sorted list), `"top"` (which puts them + at the top), and `"drop"` (which removes them from the results entirely). diff --git a/test/components/rankers/test_metafield.py b/test/components/rankers/test_metafield.py index beb105bcf..87366e27c 100644 --- a/test/components/rankers/test_metafield.py +++ b/test/components/rankers/test_metafield.py @@ -175,6 +175,10 @@ class TestMetaFieldRanker: with pytest.raises(ValueError): MetaFieldRanker(meta_field="rating", sort_order="wrong_order") + def test_raises_value_error_if_wrong_missing_meta(self): + with pytest.raises(ValueError): + MetaFieldRanker(meta_field="rating", missing_meta="wrong_missing_meta") + def test_raises_value_error_if_wrong_meta_value_type(self): with pytest.raises(ValueError): MetaFieldRanker(meta_field="rating", meta_value_type="wrong_type") @@ -239,3 +243,39 @@ class TestMetaFieldRanker: output = ranker.run(documents=docs_before, ranking_mode="reciprocal_rank_fusion") docs_after = output["documents"] assert docs_after[0].score == pytest.approx(0.016261, abs=1e-5) + + def test_missing_meta_bottom(self): + ranker = MetaFieldRanker(meta_field="rating", ranking_mode="linear_score", weight=0.5, missing_meta="bottom") + docs_before = [ + Document(id="1", content="abc", meta={"rating": 1.3}, score=0.6), + Document(id="2", content="abc", meta={}, score=0.4), + Document(id="3", content="abc", meta={"rating": 2.1}, score=0.39), + ] + output = ranker.run(documents=docs_before) + docs_after = output["documents"] + assert len(docs_after) == 3 + assert docs_after[2].id == "2" + + def test_missing_meta_top(self): + ranker = MetaFieldRanker(meta_field="rating", ranking_mode="linear_score", weight=0.5, missing_meta="top") + docs_before = [ + Document(id="1", content="abc", meta={"rating": 1.3}, score=0.6), + Document(id="2", content="abc", meta={}, score=0.59), + Document(id="3", content="abc", meta={"rating": 2.1}, score=0.4), + ] + output = ranker.run(documents=docs_before) + docs_after = output["documents"] + assert len(docs_after) == 3 + assert docs_after[0].id == "2" + + def test_missing_meta_drop(self): + ranker = MetaFieldRanker(meta_field="rating", ranking_mode="linear_score", weight=0.5, missing_meta="drop") + docs_before = [ + Document(id="1", content="abc", meta={"rating": 1.3}, score=0.6), + Document(id="2", content="abc", meta={}, score=0.59), + Document(id="3", content="abc", meta={"rating": 2.1}, score=0.4), + ] + output = ranker.run(documents=docs_before) + docs_after = output["documents"] + assert len(docs_after) == 2 + assert "2" not in [doc.id for doc in docs_after]