feat: Add options for what to do with missing metadata fields in MetaFieldRanker (#7700)

* Add `missing_meta` param to `MetaFieldRanker`, plus checks for validation.

* Implement `missing_meta` functionality in `run()`.

* Finish first draft of revised `MetaFieldRanker` functionality.

* Add tests for `MetaFieldRanker` `missing_meta` functionality.

* Add `missing_meta` param to `MetaFieldRanker`, plus checks for validation.

* Implement `missing_meta` functionality in `run()`.

* Finish first draft of revised `MetaFieldRanker` functionality.

* Add tests for `MetaFieldRanker` `missing_meta` functionality.

* Add release notes for the new `missing_meta` param of `MetaFieldRanker`.

* Move part of docs_missing_meta_field warning string outside of `if...elif...else`.
This commit is contained in:
Rob Pasternak 2024-06-12 10:42:02 +02:00 committed by GitHub
parent 14c7b02a4c
commit 28dd0f5596
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 109 additions and 8 deletions

View File

@ -43,6 +43,7 @@ class MetaFieldRanker:
top_k: Optional[int] = None, top_k: Optional[int] = None,
ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"] = "reciprocal_rank_fusion", ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"] = "reciprocal_rank_fusion",
sort_order: Literal["ascending", "descending"] = "descending", sort_order: Literal["ascending", "descending"] = "descending",
missing_meta: Literal["drop", "top", "bottom"] = "bottom",
meta_value_type: Optional[Literal["float", "int", "date"]] = None, meta_value_type: Optional[Literal["float", "int", "date"]] = None,
): ):
""" """
@ -65,6 +66,14 @@ class MetaFieldRanker:
:param sort_order: :param sort_order:
Whether to sort the meta field by ascending or descending order. Whether to sort the meta field by ascending or descending order.
Possible values are `descending` (default) and `ascending`. Possible values are `descending` (default) and `ascending`.
:param missing_meta:
What to do with documents that are missing the sorting metadata field.
Possible values are:
- 'drop' will drop the documents entirely.
- 'top' will place the documents at the top of the metadata-sorted list
(regardless of 'ascending' or 'descending').
- 'bottom' will place the documents at the bottom of the metadata-sorted list
(regardless of 'ascending' or 'descending').
:param meta_value_type: :param meta_value_type:
Parse the meta value into the data type specified before sorting. Parse the meta value into the data type specified before sorting.
This will only work if all meta values stored under `meta_field` in the provided documents are strings. This will only work if all meta values stored under `meta_field` in the provided documents are strings.
@ -82,11 +91,13 @@ class MetaFieldRanker:
self.top_k = top_k self.top_k = top_k
self.ranking_mode = ranking_mode self.ranking_mode = ranking_mode
self.sort_order = sort_order self.sort_order = sort_order
self.missing_meta = missing_meta
self._validate_params( self._validate_params(
weight=self.weight, weight=self.weight,
top_k=self.top_k, top_k=self.top_k,
ranking_mode=self.ranking_mode, ranking_mode=self.ranking_mode,
sort_order=self.sort_order, sort_order=self.sort_order,
missing_meta=self.missing_meta,
meta_value_type=meta_value_type, meta_value_type=meta_value_type,
) )
self.meta_value_type = meta_value_type self.meta_value_type = meta_value_type
@ -97,6 +108,7 @@ class MetaFieldRanker:
top_k: Optional[int], top_k: Optional[int],
ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"], ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"],
sort_order: Literal["ascending", "descending"], sort_order: Literal["ascending", "descending"],
missing_meta: Literal["drop", "top", "bottom"],
meta_value_type: Optional[Literal["float", "int", "date"]], meta_value_type: Optional[Literal["float", "int", "date"]],
): ):
if top_k is not None and top_k <= 0: if top_k is not None and top_k <= 0:
@ -125,6 +137,14 @@ class MetaFieldRanker:
"MetaFieldRanker." % sort_order "MetaFieldRanker." % sort_order
) )
if missing_meta not in ["drop", "top", "bottom"]:
raise ValueError(
"The value of parameter <missing_meta> must be 'drop', 'top', or 'bottom', "
"but is currently set to '%s'.\n"
"Change the <missing_meta> value to 'drop', 'top', or 'bottom' when initializing the "
"MetaFieldRanker." % missing_meta
)
if meta_value_type not in ["float", "int", "date", None]: if meta_value_type not in ["float", "int", "date", None]:
raise ValueError( raise ValueError(
"The value of parameter <meta_value_type> must be 'float', 'int', 'date' or None but is " "The value of parameter <meta_value_type> must be 'float', 'int', 'date' or None but is "
@ -141,6 +161,7 @@ class MetaFieldRanker:
weight: Optional[float] = None, weight: Optional[float] = None,
ranking_mode: Optional[Literal["reciprocal_rank_fusion", "linear_score"]] = None, ranking_mode: Optional[Literal["reciprocal_rank_fusion", "linear_score"]] = None,
sort_order: Optional[Literal["ascending", "descending"]] = None, sort_order: Optional[Literal["ascending", "descending"]] = None,
missing_meta: Optional[Literal["drop", "top", "bottom"]] = None,
meta_value_type: Optional[Literal["float", "int", "date"]] = None, meta_value_type: Optional[Literal["float", "int", "date"]] = None,
): ):
""" """
@ -171,6 +192,15 @@ class MetaFieldRanker:
Whether to sort the meta field by ascending or descending order. Whether to sort the meta field by ascending or descending order.
Possible values are `descending` (default) and `ascending`. Possible values are `descending` (default) and `ascending`.
If not provided, the sort_order provided at initialization time is used. If not provided, the sort_order provided at initialization time is used.
:param missing_meta:
What to do with documents that are missing the sorting metadata field.
Possible values are:
- 'drop' will drop the documents entirely.
- 'top' will place the documents at the top of the metadata-sorted list
(regardless of 'ascending' or 'descending').
- 'bottom' will place the documents at the bottom of the metadata-sorted list
(regardless of 'ascending' or 'descending').
If not provided, the missing_meta provided at initialization time is used.
:param meta_value_type: :param meta_value_type:
Parse the meta value into the data type specified before sorting. Parse the meta value into the data type specified before sorting.
This will only work if all meta values stored under `meta_field` in the provided documents are strings. This will only work if all meta values stored under `meta_field` in the provided documents are strings.
@ -199,12 +229,14 @@ class MetaFieldRanker:
weight = weight if weight is not None else self.weight weight = weight if weight is not None else self.weight
ranking_mode = ranking_mode or self.ranking_mode ranking_mode = ranking_mode or self.ranking_mode
sort_order = sort_order or self.sort_order sort_order = sort_order or self.sort_order
missing_meta = missing_meta or self.missing_meta
meta_value_type = meta_value_type or self.meta_value_type meta_value_type = meta_value_type or self.meta_value_type
self._validate_params( self._validate_params(
weight=weight, weight=weight,
top_k=top_k, top_k=top_k,
ranking_mode=ranking_mode, ranking_mode=ranking_mode,
sort_order=sort_order, sort_order=sort_order,
missing_meta=missing_meta,
meta_value_type=meta_value_type, meta_value_type=meta_value_type,
) )
@ -227,13 +259,27 @@ class MetaFieldRanker:
return {"documents": documents[:top_k]} return {"documents": documents[:top_k]}
if len(docs_missing_meta_field) > 0: if len(docs_missing_meta_field) > 0:
logger.warning( warning_start = (
"The parameter <meta_field> is currently set to '{meta_field}' but the Documents with IDs {document_ids} don't have this meta key.\n" f"The parameter <meta_field> is currently set to '{self.meta_field}' but the Documents "
"These Documents will be placed at the end of the sorting order.", f"with IDs {','.join([doc.id for doc in docs_missing_meta_field])} don't have this meta key.\n"
meta_field=self.meta_field,
document_ids=",".join([doc.id for doc in docs_missing_meta_field]),
) )
if missing_meta == "bottom":
logger.warning(
"{warning_start}Because the parameter <missing_meta> is set to 'bottom', these Documents will be placed at the end of the sorting order.",
warning_start=warning_start,
)
elif missing_meta == "top":
logger.warning(
"{warning_start}Because the parameter <missing_meta> is set to 'top', these Documents will be placed at the top of the sorting order.",
warning_start=warning_start,
)
else:
logger.warning(
"{warning_start}Because the parameter <missing_meta> is set to 'drop', these Documents will be removed from the list of retrieved Documents.",
warning_start=warning_start,
)
# If meta_value_type is provided try to parse the meta values # If meta_value_type is provided try to parse the meta values
parsed_meta = self._parse_meta(docs_with_meta_field=docs_with_meta_field, meta_value_type=meta_value_type) parsed_meta = self._parse_meta(docs_with_meta_field=docs_with_meta_field, meta_value_type=meta_value_type)
tuple_parsed_meta_and_docs = list(zip(parsed_meta, docs_with_meta_field)) tuple_parsed_meta_and_docs = list(zip(parsed_meta, docs_with_meta_field))
@ -252,10 +298,18 @@ class MetaFieldRanker:
) )
return {"documents": documents[:top_k]} return {"documents": documents[:top_k]}
# Add the docs missing the meta_field back on the end # Merge rankings and handle missing meta fields as specified in the missing_meta parameter
sorted_by_meta = [doc for meta, doc in tuple_sorted_by_meta] sorted_by_meta = [doc for meta, doc in tuple_sorted_by_meta]
sorted_documents = sorted_by_meta + docs_missing_meta_field if missing_meta == "bottom":
sorted_documents = self._merge_rankings(documents, sorted_documents, weight, ranking_mode) sorted_documents = sorted_by_meta + docs_missing_meta_field
sorted_documents = self._merge_rankings(documents, sorted_documents, weight, ranking_mode)
elif missing_meta == "top":
sorted_documents = docs_missing_meta_field + sorted_by_meta
sorted_documents = self._merge_rankings(documents, sorted_documents, weight, ranking_mode)
else:
sorted_documents = sorted_by_meta
sorted_documents = self._merge_rankings(docs_with_meta_field, sorted_documents, weight, ranking_mode)
return {"documents": sorted_documents[:top_k]} return {"documents": sorted_documents[:top_k]}
def _parse_meta( def _parse_meta(

View File

@ -0,0 +1,7 @@
---
features:
- |
Add a new `missing_meta` param to `MetaFieldRanker`, which determines what to do with
documents that lack the ranked meta field. Supported values are `"bottom"` (which
puts documents with missing meta at the bottom of the sorted list), `"top"` (which puts them
at the top), and `"drop"` (which removes them from the results entirely).

View File

@ -175,6 +175,10 @@ class TestMetaFieldRanker:
with pytest.raises(ValueError): with pytest.raises(ValueError):
MetaFieldRanker(meta_field="rating", sort_order="wrong_order") MetaFieldRanker(meta_field="rating", sort_order="wrong_order")
def test_raises_value_error_if_wrong_missing_meta(self):
    """Constructing the ranker with an unsupported `missing_meta` value must raise `ValueError`."""
    unsupported_value = "wrong_missing_meta"
    with pytest.raises(ValueError):
        MetaFieldRanker(meta_field="rating", missing_meta=unsupported_value)
def test_raises_value_error_if_wrong_meta_value_type(self): def test_raises_value_error_if_wrong_meta_value_type(self):
with pytest.raises(ValueError): with pytest.raises(ValueError):
MetaFieldRanker(meta_field="rating", meta_value_type="wrong_type") MetaFieldRanker(meta_field="rating", meta_value_type="wrong_type")
@ -239,3 +243,39 @@ class TestMetaFieldRanker:
output = ranker.run(documents=docs_before, ranking_mode="reciprocal_rank_fusion") output = ranker.run(documents=docs_before, ranking_mode="reciprocal_rank_fusion")
docs_after = output["documents"] docs_after = output["documents"]
assert docs_after[0].score == pytest.approx(0.016261, abs=1e-5) assert docs_after[0].score == pytest.approx(0.016261, abs=1e-5)
def test_missing_meta_bottom(self):
    """With `missing_meta="bottom"`, the document lacking `rating` is kept and ranked last."""
    # Document "2" is the only one without the `rating` meta key.
    docs_before = [
        Document(id="1", content="abc", meta={"rating": 1.3}, score=0.6),
        Document(id="2", content="abc", meta={}, score=0.4),
        Document(id="3", content="abc", meta={"rating": 2.1}, score=0.39),
    ]
    ranker = MetaFieldRanker(
        meta_field="rating",
        ranking_mode="linear_score",
        weight=0.5,
        missing_meta="bottom",
    )

    ranked = ranker.run(documents=docs_before)["documents"]

    # Nothing is dropped, and the document without the meta field comes last.
    assert len(ranked) == 3
    assert ranked[-1].id == "2"
def test_missing_meta_top(self):
    """With `missing_meta="top"`, the document lacking `rating` is kept and ranked first."""
    # Document "2" is the only one without the `rating` meta key.
    docs_before = [
        Document(id="1", content="abc", meta={"rating": 1.3}, score=0.6),
        Document(id="2", content="abc", meta={}, score=0.59),
        Document(id="3", content="abc", meta={"rating": 2.1}, score=0.4),
    ]
    ranker = MetaFieldRanker(
        meta_field="rating",
        ranking_mode="linear_score",
        weight=0.5,
        missing_meta="top",
    )

    ranked = ranker.run(documents=docs_before)["documents"]

    # Nothing is dropped, and the document without the meta field leads the list.
    assert len(ranked) == 3
    first_doc = ranked[0]
    assert first_doc.id == "2"
def test_missing_meta_drop(self):
    """With `missing_meta="drop"`, the document lacking `rating` is removed from the output."""
    # Document "2" is the only one without the `rating` meta key.
    docs_before = [
        Document(id="1", content="abc", meta={"rating": 1.3}, score=0.6),
        Document(id="2", content="abc", meta={}, score=0.59),
        Document(id="3", content="abc", meta={"rating": 2.1}, score=0.4),
    ]
    ranker = MetaFieldRanker(
        meta_field="rating",
        ranking_mode="linear_score",
        weight=0.5,
        missing_meta="drop",
    )

    ranked = ranker.run(documents=docs_before)["documents"]

    # Only the two documents carrying the meta field survive.
    assert len(ranked) == 2
    assert all(doc.id != "2" for doc in ranked)