fix: document joiner division by zero with distribution based rank fusion (#8520)

* Parametrize document joiner tests with empty lists

* Skip loop in _distribution_based_rank_fusion if document list is empty

* Parametrize test_empty_list with join_mode

* Prevent division by zero in _merge and _reciprocal_rank_fusion

* Add release notes

---------

Co-authored-by: Silvano Cerza <silvanocerza@gmail.com>
This commit is contained in:
Anes Benmerzoug 2024-11-14 12:41:28 +01:00 committed by GitHub
parent e5a80722c2
commit f5683bc8fa
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 48 additions and 6 deletions

View File

@ -166,6 +166,10 @@ class DocumentJoiner:
"""
Merge multiple lists of Documents and calculate a weighted sum of the scores of duplicate Documents.
"""
# This check prevents a division by zero when no document lists are passed
if not document_lists:
return []
scores_map: dict = defaultdict(int)
documents_map = {}
weights = self.weights if self.weights else [1 / len(document_lists)] * len(document_lists)
@ -187,6 +191,10 @@ class DocumentJoiner:
The constant k is set to 61 (60 was suggested by the original paper,
plus 1 as python lists are 0-based and the paper used 1-based ranking).
"""
# This check prevents a division by zero when no document lists are passed
if not document_lists:
return []
k = 61
scores_map: dict = defaultdict(int)
@ -217,6 +225,9 @@ class DocumentJoiner:
If a Document is in more than one retriever, the one with the highest score is used.
"""
for documents in document_lists:
if len(documents) == 0:
continue
scores_list = []
for doc in documents:

View File

@ -0,0 +1,4 @@
---
fixes:
- |
Fix `DocumentJoiner` failing when run with an empty list of `Document`s

View File

@ -60,18 +60,45 @@ class TestDocumentJoiner:
assert document_joiner.top_k == 6
assert not document_joiner.sort_by_score
def test_empty_list(self):
joiner = DocumentJoiner()
@pytest.mark.parametrize(
"join_mode",
[
JoinMode.CONCATENATE,
JoinMode.MERGE,
JoinMode.RECIPROCAL_RANK_FUSION,
JoinMode.DISTRIBUTION_BASED_RANK_FUSION,
],
)
def test_empty_list(self, join_mode: JoinMode):
joiner = DocumentJoiner(join_mode=join_mode)
result = joiner.run([])
assert result == {"documents": []}
def test_list_of_empty_lists(self):
joiner = DocumentJoiner()
@pytest.mark.parametrize(
"join_mode",
[
JoinMode.CONCATENATE,
JoinMode.MERGE,
JoinMode.RECIPROCAL_RANK_FUSION,
JoinMode.DISTRIBUTION_BASED_RANK_FUSION,
],
)
def test_list_of_empty_lists(self, join_mode: JoinMode):
joiner = DocumentJoiner(join_mode=join_mode)
result = joiner.run([[], []])
assert result == {"documents": []}
def test_list_with_one_empty_list(self):
joiner = DocumentJoiner()
@pytest.mark.parametrize(
"join_mode",
[
JoinMode.CONCATENATE,
JoinMode.MERGE,
JoinMode.RECIPROCAL_RANK_FUSION,
JoinMode.DISTRIBUTION_BASED_RANK_FUSION,
],
)
def test_list_with_one_empty_list(self, join_mode: JoinMode):
joiner = DocumentJoiner(join_mode=join_mode)
documents = [Document(content="a"), Document(content="b"), Document(content="c")]
result = joiner.run([[], documents])
assert result == {"documents": documents}