mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-06 03:57:19 +00:00
fix: document joiner division by zero with distribution based rank fusion (#8520)
* Parametrize document joiner tests with empty lists
* Skip loop in _distribution_based_rank_fusion if document list is empty
* Parametrize test_empty_list with join_mode
* Prevent division by zero in _merge and _reciprocal_rank_fusion
* Add release notes

---------

Co-authored-by: Silvano Cerza <silvanocerza@gmail.com>
This commit is contained in:
parent
e5a80722c2
commit
f5683bc8fa
@ -166,6 +166,10 @@ class DocumentJoiner:
|
||||
"""
|
||||
Merge multiple lists of Documents and calculate a weighted sum of the scores of duplicate Documents.
|
||||
"""
|
||||
# This check prevents a division by zero when no documents are passed
|
||||
if not document_lists:
|
||||
return []
|
||||
|
||||
scores_map: dict = defaultdict(int)
|
||||
documents_map = {}
|
||||
weights = self.weights if self.weights else [1 / len(document_lists)] * len(document_lists)
|
||||
@ -187,6 +191,10 @@ class DocumentJoiner:
|
||||
The constant k is set to 61 (60 was suggested by the original paper,
|
||||
plus 1 as python lists are 0-based and the paper used 1-based ranking).
|
||||
"""
|
||||
# This check prevents a division by zero when no documents are passed
|
||||
if not document_lists:
|
||||
return []
|
||||
|
||||
k = 61
|
||||
|
||||
scores_map: dict = defaultdict(int)
|
||||
@ -217,6 +225,9 @@ class DocumentJoiner:
|
||||
If a Document is in more than one retriever, the one with the highest score is used.
|
||||
"""
|
||||
for documents in document_lists:
|
||||
if len(documents) == 0:
|
||||
continue
|
||||
|
||||
scores_list = []
|
||||
|
||||
for doc in documents:
|
||||
|
||||
@ -0,0 +1,4 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
Fix `DocumentJoiner` failing when run with an empty list of `Document`s
|
||||
@ -60,18 +60,45 @@ class TestDocumentJoiner:
|
||||
assert document_joiner.top_k == 6
|
||||
assert not document_joiner.sort_by_score
|
||||
|
||||
def test_empty_list(self):
|
||||
joiner = DocumentJoiner()
|
||||
@pytest.mark.parametrize(
|
||||
"join_mode",
|
||||
[
|
||||
JoinMode.CONCATENATE,
|
||||
JoinMode.MERGE,
|
||||
JoinMode.RECIPROCAL_RANK_FUSION,
|
||||
JoinMode.DISTRIBUTION_BASED_RANK_FUSION,
|
||||
],
|
||||
)
|
||||
def test_empty_list(self, join_mode: JoinMode):
|
||||
joiner = DocumentJoiner(join_mode=join_mode)
|
||||
result = joiner.run([])
|
||||
assert result == {"documents": []}
|
||||
|
||||
def test_list_of_empty_lists(self):
|
||||
joiner = DocumentJoiner()
|
||||
@pytest.mark.parametrize(
|
||||
"join_mode",
|
||||
[
|
||||
JoinMode.CONCATENATE,
|
||||
JoinMode.MERGE,
|
||||
JoinMode.RECIPROCAL_RANK_FUSION,
|
||||
JoinMode.DISTRIBUTION_BASED_RANK_FUSION,
|
||||
],
|
||||
)
|
||||
def test_list_of_empty_lists(self, join_mode: JoinMode):
|
||||
joiner = DocumentJoiner(join_mode=join_mode)
|
||||
result = joiner.run([[], []])
|
||||
assert result == {"documents": []}
|
||||
|
||||
def test_list_with_one_empty_list(self):
|
||||
joiner = DocumentJoiner()
|
||||
@pytest.mark.parametrize(
|
||||
"join_mode",
|
||||
[
|
||||
JoinMode.CONCATENATE,
|
||||
JoinMode.MERGE,
|
||||
JoinMode.RECIPROCAL_RANK_FUSION,
|
||||
JoinMode.DISTRIBUTION_BASED_RANK_FUSION,
|
||||
],
|
||||
)
|
||||
def test_list_with_one_empty_list(self, join_mode: JoinMode):
|
||||
joiner = DocumentJoiner(join_mode=join_mode)
|
||||
documents = [Document(content="a"), Document(content="b"), Document(content="c")]
|
||||
result = joiner.run([[], documents])
|
||||
assert result == {"documents": documents}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user