mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-09-25 08:04:49 +00:00
feat: Add options for what to do with missing metadata fields in MetaFieldRanker
(#7700)
* Add `missing_meta` param to `MetaFieldRanker`, plus checks for validation. * Implement `missing_meta` functionality in `run()`. * Finish first draft of revised `MetaFieldRanker` functionality. * Add tests for `MetaFieldRanker` `missing_meta` functionality. * Add `missing_meta` param to `MetaFieldRanker`, plus checks for validation. * Implement `missing_meta` functionality in `run()`. * Finish first draft of revised `MetaFieldRanker` functionality. * Add tests for `MetaFieldRanker` `missing_meta` functionality. * Add release notes for new `missing_meta` param of `MetaFieldRanker` * Move part of docs_missing_meta_field warning string outside of `if...elif...else`.
This commit is contained in:
parent
14c7b02a4c
commit
28dd0f5596
@ -43,6 +43,7 @@ class MetaFieldRanker:
|
||||
top_k: Optional[int] = None,
|
||||
ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"] = "reciprocal_rank_fusion",
|
||||
sort_order: Literal["ascending", "descending"] = "descending",
|
||||
missing_meta: Literal["drop", "top", "bottom"] = "bottom",
|
||||
meta_value_type: Optional[Literal["float", "int", "date"]] = None,
|
||||
):
|
||||
"""
|
||||
@ -65,6 +66,14 @@ class MetaFieldRanker:
|
||||
:param sort_order:
|
||||
Whether to sort the meta field by ascending or descending order.
|
||||
Possible values are `descending` (default) and `ascending`.
|
||||
:param missing_meta:
|
||||
What to do with documents that are missing the sorting metadata field.
|
||||
Possible values are:
|
||||
- 'drop' will drop the documents entirely.
|
||||
- 'top' will place the documents at the top of the metadata-sorted list
|
||||
(regardless of 'ascending' or 'descending').
|
||||
- 'bottom' will place the documents at the bottom of metadata-sorted list
|
||||
(regardless of 'ascending' or 'descending').
|
||||
:param meta_value_type:
|
||||
Parse the meta value into the data type specified before sorting.
|
||||
This will only work if all meta values stored under `meta_field` in the provided documents are strings.
|
||||
@ -82,11 +91,13 @@ class MetaFieldRanker:
|
||||
self.top_k = top_k
|
||||
self.ranking_mode = ranking_mode
|
||||
self.sort_order = sort_order
|
||||
self.missing_meta = missing_meta
|
||||
self._validate_params(
|
||||
weight=self.weight,
|
||||
top_k=self.top_k,
|
||||
ranking_mode=self.ranking_mode,
|
||||
sort_order=self.sort_order,
|
||||
missing_meta=self.missing_meta,
|
||||
meta_value_type=meta_value_type,
|
||||
)
|
||||
self.meta_value_type = meta_value_type
|
||||
@ -97,6 +108,7 @@ class MetaFieldRanker:
|
||||
top_k: Optional[int],
|
||||
ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"],
|
||||
sort_order: Literal["ascending", "descending"],
|
||||
missing_meta: Literal["drop", "top", "bottom"],
|
||||
meta_value_type: Optional[Literal["float", "int", "date"]],
|
||||
):
|
||||
if top_k is not None and top_k <= 0:
|
||||
@ -125,6 +137,14 @@ class MetaFieldRanker:
|
||||
"MetaFieldRanker." % sort_order
|
||||
)
|
||||
|
||||
if missing_meta not in ["drop", "top", "bottom"]:
|
||||
raise ValueError(
|
||||
"The value of parameter <missing_meta> must be 'drop', 'top', or 'bottom', "
|
||||
"but is currently set to '%s'.\n"
|
||||
"Change the <missing_meta> value to 'drop', 'top', or 'bottom' when initializing the "
|
||||
"MetaFieldRanker." % missing_meta
|
||||
)
|
||||
|
||||
if meta_value_type not in ["float", "int", "date", None]:
|
||||
raise ValueError(
|
||||
"The value of parameter <meta_value_type> must be 'float', 'int', 'date' or None but is "
|
||||
@ -141,6 +161,7 @@ class MetaFieldRanker:
|
||||
weight: Optional[float] = None,
|
||||
ranking_mode: Optional[Literal["reciprocal_rank_fusion", "linear_score"]] = None,
|
||||
sort_order: Optional[Literal["ascending", "descending"]] = None,
|
||||
missing_meta: Optional[Literal["drop", "top", "bottom"]] = None,
|
||||
meta_value_type: Optional[Literal["float", "int", "date"]] = None,
|
||||
):
|
||||
"""
|
||||
@ -171,6 +192,15 @@ class MetaFieldRanker:
|
||||
Whether to sort the meta field by ascending or descending order.
|
||||
Possible values are `descending` (default) and `ascending`.
|
||||
If not provided, the sort_order provided at initialization time is used.
|
||||
:param missing_meta:
|
||||
What to do with documents that are missing the sorting metadata field.
|
||||
Possible values are:
|
||||
- 'drop' will drop the documents entirely.
|
||||
- 'top' will place the documents at the top of the metadata-sorted list
|
||||
(regardless of 'ascending' or 'descending').
|
||||
- 'bottom' will place the documents at the bottom of metadata-sorted list
|
||||
(regardless of 'ascending' or 'descending').
|
||||
If not provided, the missing_meta provided at initialization time is used.
|
||||
:param meta_value_type:
|
||||
Parse the meta value into the data type specified before sorting.
|
||||
This will only work if all meta values stored under `meta_field` in the provided documents are strings.
|
||||
@ -199,12 +229,14 @@ class MetaFieldRanker:
|
||||
weight = weight if weight is not None else self.weight
|
||||
ranking_mode = ranking_mode or self.ranking_mode
|
||||
sort_order = sort_order or self.sort_order
|
||||
missing_meta = missing_meta or self.missing_meta
|
||||
meta_value_type = meta_value_type or self.meta_value_type
|
||||
self._validate_params(
|
||||
weight=weight,
|
||||
top_k=top_k,
|
||||
ranking_mode=ranking_mode,
|
||||
sort_order=sort_order,
|
||||
missing_meta=missing_meta,
|
||||
meta_value_type=meta_value_type,
|
||||
)
|
||||
|
||||
@ -227,13 +259,27 @@ class MetaFieldRanker:
|
||||
return {"documents": documents[:top_k]}
|
||||
|
||||
if len(docs_missing_meta_field) > 0:
|
||||
logger.warning(
|
||||
"The parameter <meta_field> is currently set to '{meta_field}' but the Documents with IDs {document_ids} don't have this meta key.\n"
|
||||
"These Documents will be placed at the end of the sorting order.",
|
||||
meta_field=self.meta_field,
|
||||
document_ids=",".join([doc.id for doc in docs_missing_meta_field]),
|
||||
warning_start = (
|
||||
f"The parameter <meta_field> is currently set to '{self.meta_field}' but the Documents "
|
||||
f"with IDs {','.join([doc.id for doc in docs_missing_meta_field])} don't have this meta key.\n"
|
||||
)
|
||||
|
||||
if missing_meta == "bottom":
|
||||
logger.warning(
|
||||
"{warning_start}Because the parameter <missing_meta> is set to 'bottom', these Documents will be placed at the end of the sorting order.",
|
||||
warning_start=warning_start,
|
||||
)
|
||||
elif missing_meta == "top":
|
||||
logger.warning(
|
||||
"{warning_start}Because the parameter <missing_meta> is set to 'top', these Documents will be placed at the top of the sorting order.",
|
||||
warning_start=warning_start,
|
||||
)
|
||||
else:
|
||||
logger.warning(
|
||||
"{warning_start}Because the parameter <missing_meta> is set to 'drop', these Documents will be removed from the list of retrieved Documents.",
|
||||
warning_start=warning_start,
|
||||
)
|
||||
|
||||
# If meta_value_type is provided try to parse the meta values
|
||||
parsed_meta = self._parse_meta(docs_with_meta_field=docs_with_meta_field, meta_value_type=meta_value_type)
|
||||
tuple_parsed_meta_and_docs = list(zip(parsed_meta, docs_with_meta_field))
|
||||
@ -252,10 +298,18 @@ class MetaFieldRanker:
|
||||
)
|
||||
return {"documents": documents[:top_k]}
|
||||
|
||||
# Add the docs missing the meta_field back on the end
|
||||
# Merge rankings and handle missing meta fields as specified in the missing_meta parameter
|
||||
sorted_by_meta = [doc for meta, doc in tuple_sorted_by_meta]
|
||||
sorted_documents = sorted_by_meta + docs_missing_meta_field
|
||||
sorted_documents = self._merge_rankings(documents, sorted_documents, weight, ranking_mode)
|
||||
if missing_meta == "bottom":
|
||||
sorted_documents = sorted_by_meta + docs_missing_meta_field
|
||||
sorted_documents = self._merge_rankings(documents, sorted_documents, weight, ranking_mode)
|
||||
elif missing_meta == "top":
|
||||
sorted_documents = docs_missing_meta_field + sorted_by_meta
|
||||
sorted_documents = self._merge_rankings(documents, sorted_documents, weight, ranking_mode)
|
||||
else:
|
||||
sorted_documents = sorted_by_meta
|
||||
sorted_documents = self._merge_rankings(docs_with_meta_field, sorted_documents, weight, ranking_mode)
|
||||
|
||||
return {"documents": sorted_documents[:top_k]}
|
||||
|
||||
def _parse_meta(
|
||||
|
@ -0,0 +1,7 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
Add a new `missing_meta` param to `MetaFieldRanker`, which determines what to do with
|
||||
documents that lack the ranked meta field. Supported values are `"bottom"` (which
|
||||
puts documents with missing meta at the bottom of the sorted list), `"top"` (which puts them
|
||||
at the top), and `"drop"` (which removes them from the results entirely).
|
@ -175,6 +175,10 @@ class TestMetaFieldRanker:
|
||||
with pytest.raises(ValueError):
|
||||
MetaFieldRanker(meta_field="rating", sort_order="wrong_order")
|
||||
|
||||
def test_raises_value_error_if_wrong_missing_meta(self):
|
||||
with pytest.raises(ValueError):
|
||||
MetaFieldRanker(meta_field="rating", missing_meta="wrong_missing_meta")
|
||||
|
||||
def test_raises_value_error_if_wrong_meta_value_type(self):
|
||||
with pytest.raises(ValueError):
|
||||
MetaFieldRanker(meta_field="rating", meta_value_type="wrong_type")
|
||||
@ -239,3 +243,39 @@ class TestMetaFieldRanker:
|
||||
output = ranker.run(documents=docs_before, ranking_mode="reciprocal_rank_fusion")
|
||||
docs_after = output["documents"]
|
||||
assert docs_after[0].score == pytest.approx(0.016261, abs=1e-5)
|
||||
|
||||
def test_missing_meta_bottom(self):
|
||||
ranker = MetaFieldRanker(meta_field="rating", ranking_mode="linear_score", weight=0.5, missing_meta="bottom")
|
||||
docs_before = [
|
||||
Document(id="1", content="abc", meta={"rating": 1.3}, score=0.6),
|
||||
Document(id="2", content="abc", meta={}, score=0.4),
|
||||
Document(id="3", content="abc", meta={"rating": 2.1}, score=0.39),
|
||||
]
|
||||
output = ranker.run(documents=docs_before)
|
||||
docs_after = output["documents"]
|
||||
assert len(docs_after) == 3
|
||||
assert docs_after[2].id == "2"
|
||||
|
||||
def test_missing_meta_top(self):
|
||||
ranker = MetaFieldRanker(meta_field="rating", ranking_mode="linear_score", weight=0.5, missing_meta="top")
|
||||
docs_before = [
|
||||
Document(id="1", content="abc", meta={"rating": 1.3}, score=0.6),
|
||||
Document(id="2", content="abc", meta={}, score=0.59),
|
||||
Document(id="3", content="abc", meta={"rating": 2.1}, score=0.4),
|
||||
]
|
||||
output = ranker.run(documents=docs_before)
|
||||
docs_after = output["documents"]
|
||||
assert len(docs_after) == 3
|
||||
assert docs_after[0].id == "2"
|
||||
|
||||
def test_missing_meta_drop(self):
|
||||
ranker = MetaFieldRanker(meta_field="rating", ranking_mode="linear_score", weight=0.5, missing_meta="drop")
|
||||
docs_before = [
|
||||
Document(id="1", content="abc", meta={"rating": 1.3}, score=0.6),
|
||||
Document(id="2", content="abc", meta={}, score=0.59),
|
||||
Document(id="3", content="abc", meta={"rating": 2.1}, score=0.4),
|
||||
]
|
||||
output = ranker.run(documents=docs_before)
|
||||
docs_after = output["documents"]
|
||||
assert len(docs_after) == 2
|
||||
assert "2" not in [doc.id for doc in docs_after]
|
||||
|
Loading…
x
Reference in New Issue
Block a user