feat: Add options for what to do with missing metadata fields in MetaFieldRanker (#7700)

* Add `missing_meta` param to `MetaFieldRanker`, plus checks for validation.

* Implement `missing_meta` functionality in `run()`.

* Finish first draft of revised `MetaFieldRanker` functionality.

* Add tests for `MetaFieldRanker` `missing_meta` functionality.

* Add `missing_meta` param to `MetaFieldRanker`, plus checks for validation.

* Implement `missing_meta` functionality in `run()`.

* Finish first draft of revised `MetaFieldRanker` functionality.

* Add tests for `MetaFieldRanker` `missing_meta` functionality.

* Add release notes for new `missing_meta` param of `MetaFieldRanker`.

* Move part of docs_missing_meta_field warning string outside of `if...elif...else`.
This commit is contained in:
Rob Pasternak 2024-06-12 10:42:02 +02:00 committed by GitHub
parent 14c7b02a4c
commit 28dd0f5596
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 109 additions and 8 deletions

View File

@ -43,6 +43,7 @@ class MetaFieldRanker:
top_k: Optional[int] = None,
ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"] = "reciprocal_rank_fusion",
sort_order: Literal["ascending", "descending"] = "descending",
missing_meta: Literal["drop", "top", "bottom"] = "bottom",
meta_value_type: Optional[Literal["float", "int", "date"]] = None,
):
"""
@ -65,6 +66,14 @@ class MetaFieldRanker:
:param sort_order:
Whether to sort the meta field by ascending or descending order.
Possible values are `descending` (default) and `ascending`.
:param missing_meta:
What to do with documents that are missing the sorting metadata field.
Possible values are:
- 'drop' will drop the documents entirely.
- 'top' will place the documents at the top of the metadata-sorted list
(regardless of 'ascending' or 'descending').
- 'bottom' will place the documents at the bottom of the metadata-sorted list
(regardless of 'ascending' or 'descending').
:param meta_value_type:
Parse the meta value into the data type specified before sorting.
This will only work if all meta values stored under `meta_field` in the provided documents are strings.
@ -82,11 +91,13 @@ class MetaFieldRanker:
self.top_k = top_k
self.ranking_mode = ranking_mode
self.sort_order = sort_order
self.missing_meta = missing_meta
self._validate_params(
weight=self.weight,
top_k=self.top_k,
ranking_mode=self.ranking_mode,
sort_order=self.sort_order,
missing_meta=self.missing_meta,
meta_value_type=meta_value_type,
)
self.meta_value_type = meta_value_type
@ -97,6 +108,7 @@ class MetaFieldRanker:
top_k: Optional[int],
ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"],
sort_order: Literal["ascending", "descending"],
missing_meta: Literal["drop", "top", "bottom"],
meta_value_type: Optional[Literal["float", "int", "date"]],
):
if top_k is not None and top_k <= 0:
@ -125,6 +137,14 @@ class MetaFieldRanker:
"MetaFieldRanker." % sort_order
)
if missing_meta not in ["drop", "top", "bottom"]:
raise ValueError(
"The value of parameter <missing_meta> must be 'drop', 'top', or 'bottom', "
"but is currently set to '%s'.\n"
"Change the <missing_meta> value to 'drop', 'top', or 'bottom' when initializing the "
"MetaFieldRanker." % missing_meta
)
if meta_value_type not in ["float", "int", "date", None]:
raise ValueError(
"The value of parameter <meta_value_type> must be 'float', 'int', 'date' or None but is "
@ -141,6 +161,7 @@ class MetaFieldRanker:
weight: Optional[float] = None,
ranking_mode: Optional[Literal["reciprocal_rank_fusion", "linear_score"]] = None,
sort_order: Optional[Literal["ascending", "descending"]] = None,
missing_meta: Optional[Literal["drop", "top", "bottom"]] = None,
meta_value_type: Optional[Literal["float", "int", "date"]] = None,
):
"""
@ -171,6 +192,15 @@ class MetaFieldRanker:
Whether to sort the meta field by ascending or descending order.
Possible values are `descending` (default) and `ascending`.
If not provided, the sort_order provided at initialization time is used.
:param missing_meta:
What to do with documents that are missing the sorting metadata field.
Possible values are:
- 'drop' will drop the documents entirely.
- 'top' will place the documents at the top of the metadata-sorted list
(regardless of 'ascending' or 'descending').
- 'bottom' will place the documents at the bottom of the metadata-sorted list
(regardless of 'ascending' or 'descending').
If not provided, the missing_meta provided at initialization time is used.
:param meta_value_type:
Parse the meta value into the data type specified before sorting.
This will only work if all meta values stored under `meta_field` in the provided documents are strings.
@ -199,12 +229,14 @@ class MetaFieldRanker:
weight = weight if weight is not None else self.weight
ranking_mode = ranking_mode or self.ranking_mode
sort_order = sort_order or self.sort_order
missing_meta = missing_meta or self.missing_meta
meta_value_type = meta_value_type or self.meta_value_type
self._validate_params(
weight=weight,
top_k=top_k,
ranking_mode=ranking_mode,
sort_order=sort_order,
missing_meta=missing_meta,
meta_value_type=meta_value_type,
)
@ -227,13 +259,27 @@ class MetaFieldRanker:
return {"documents": documents[:top_k]}
if len(docs_missing_meta_field) > 0:
logger.warning(
"The parameter <meta_field> is currently set to '{meta_field}' but the Documents with IDs {document_ids} don't have this meta key.\n"
"These Documents will be placed at the end of the sorting order.",
meta_field=self.meta_field,
document_ids=",".join([doc.id for doc in docs_missing_meta_field]),
warning_start = (
f"The parameter <meta_field> is currently set to '{self.meta_field}' but the Documents "
f"with IDs {','.join([doc.id for doc in docs_missing_meta_field])} don't have this meta key.\n"
)
if missing_meta == "bottom":
logger.warning(
"{warning_start}Because the parameter <missing_meta> is set to 'bottom', these Documents will be placed at the end of the sorting order.",
warning_start=warning_start,
)
elif missing_meta == "top":
logger.warning(
"{warning_start}Because the parameter <missing_meta> is set to 'top', these Documents will be placed at the top of the sorting order.",
warning_start=warning_start,
)
else:
logger.warning(
"{warning_start}Because the parameter <missing_meta> is set to 'drop', these Documents will be removed from the list of retrieved Documents.",
warning_start=warning_start,
)
# If meta_value_type is provided try to parse the meta values
parsed_meta = self._parse_meta(docs_with_meta_field=docs_with_meta_field, meta_value_type=meta_value_type)
tuple_parsed_meta_and_docs = list(zip(parsed_meta, docs_with_meta_field))
@ -252,10 +298,18 @@ class MetaFieldRanker:
)
return {"documents": documents[:top_k]}
# Add the docs missing the meta_field back on the end
# Merge rankings and handle missing meta fields as specified in the missing_meta parameter
sorted_by_meta = [doc for meta, doc in tuple_sorted_by_meta]
sorted_documents = sorted_by_meta + docs_missing_meta_field
sorted_documents = self._merge_rankings(documents, sorted_documents, weight, ranking_mode)
if missing_meta == "bottom":
sorted_documents = sorted_by_meta + docs_missing_meta_field
sorted_documents = self._merge_rankings(documents, sorted_documents, weight, ranking_mode)
elif missing_meta == "top":
sorted_documents = docs_missing_meta_field + sorted_by_meta
sorted_documents = self._merge_rankings(documents, sorted_documents, weight, ranking_mode)
else:
sorted_documents = sorted_by_meta
sorted_documents = self._merge_rankings(docs_with_meta_field, sorted_documents, weight, ranking_mode)
return {"documents": sorted_documents[:top_k]}
def _parse_meta(

View File

@ -0,0 +1,7 @@
---
features:
- |
Add a new `missing_meta` param to `MetaFieldRanker`, which determines what to do with
documents that lack the ranked meta field. Supported values are `"bottom"` (which
puts documents with missing meta at the bottom of the sorted list), `"top"` (which puts them
at the top), and `"drop"` (which removes them from the results entirely).

View File

@ -175,6 +175,10 @@ class TestMetaFieldRanker:
with pytest.raises(ValueError):
MetaFieldRanker(meta_field="rating", sort_order="wrong_order")
def test_raises_value_error_if_wrong_missing_meta(self):
    """Constructing the ranker with an unsupported `missing_meta` value raises `ValueError`."""
    invalid_kwargs = {"meta_field": "rating", "missing_meta": "wrong_missing_meta"}
    with pytest.raises(ValueError):
        MetaFieldRanker(**invalid_kwargs)
def test_raises_value_error_if_wrong_meta_value_type(self):
with pytest.raises(ValueError):
MetaFieldRanker(meta_field="rating", meta_value_type="wrong_type")
@ -239,3 +243,39 @@ class TestMetaFieldRanker:
output = ranker.run(documents=docs_before, ranking_mode="reciprocal_rank_fusion")
docs_after = output["documents"]
assert docs_after[0].score == pytest.approx(0.016261, abs=1e-5)
def test_missing_meta_bottom(self):
    """With `missing_meta="bottom"`, the document lacking `rating` is kept and ranked last."""
    candidates = [
        Document(id="1", content="abc", meta={"rating": 1.3}, score=0.6),
        Document(id="2", content="abc", meta={}, score=0.4),
        Document(id="3", content="abc", meta={"rating": 2.1}, score=0.39),
    ]
    ranker = MetaFieldRanker(meta_field="rating", ranking_mode="linear_score", weight=0.5, missing_meta="bottom")

    ranked = ranker.run(documents=candidates)["documents"]

    # No document is dropped; the one without the meta field (id "2") sits in the final slot.
    assert len(ranked) == 3
    assert ranked[2].id == "2"
def test_missing_meta_top(self):
    """With `missing_meta="top"`, the document lacking `rating` is kept and ranked first."""
    candidates = [
        Document(id="1", content="abc", meta={"rating": 1.3}, score=0.6),
        Document(id="2", content="abc", meta={}, score=0.59),
        Document(id="3", content="abc", meta={"rating": 2.1}, score=0.4),
    ]
    ranker = MetaFieldRanker(meta_field="rating", ranking_mode="linear_score", weight=0.5, missing_meta="top")

    ranked = ranker.run(documents=candidates)["documents"]

    # No document is dropped; the one without the meta field (id "2") leads the list.
    assert len(ranked) == 3
    assert ranked[0].id == "2"
def test_missing_meta_drop(self):
    """With `missing_meta="drop"`, the document lacking `rating` is absent from the output."""
    candidates = [
        Document(id="1", content="abc", meta={"rating": 1.3}, score=0.6),
        Document(id="2", content="abc", meta={}, score=0.59),
        Document(id="3", content="abc", meta={"rating": 2.1}, score=0.4),
    ]
    ranker = MetaFieldRanker(meta_field="rating", ranking_mode="linear_score", weight=0.5, missing_meta="drop")

    ranked = ranker.run(documents=candidates)["documents"]

    # Only the two documents that carry the meta field survive.
    assert len(ranked) == 2
    assert all(doc.id != "2" for doc in ranked)