fix: make JoinDocuments correctly handle duplicate documents with null scores (#6261)

* fix error with null values

* release note

* simplify
This commit is contained in:
Stefano Fiorucci 2023-11-09 14:28:56 +01:00 committed by GitHub
parent 676da681d0
commit 2b3c77e41d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 45 additions and 3 deletions

View File

@ -14,8 +14,8 @@ class JoinDocuments(JoinNode):
A node to join documents outputted by multiple retriever nodes.
The node allows multiple join modes:
* concatenate: combine the documents from multiple nodes. Any duplicate documents are discarded.
The score is only determined by the last node that outputs the document.
* concatenate: combine the documents from multiple nodes.
In case of duplicate documents, the one with the highest score is kept.
* merge: merge scores of documents from multiple nodes. Optionally, each input score can be given a different
`weight` & a `top_k` limit can be set. This mode can also be used for "reranking" retrieved documents.
* reciprocal_rank_fusion: combines the documents based on their rank in multiple nodes.
@ -130,7 +130,7 @@ class JoinDocuments(JoinNode):
for doc in result:
if doc.id == idx:
tmp.append(doc)
item_best_score = max(tmp, key=lambda x: x.score)
item_best_score = max(tmp, key=lambda x: x.score if x.score is not None else -inf)
scores_map.update({idx: item_best_score.score})
return scores_map

View File

@ -0,0 +1,7 @@
---
fixes:
- |
`JoinDocuments` with `join_mode=concatenate` (default) raised an exception
when it received duplicate documents and some of them had a null score.
This case is now handled correctly and the documents are joined as expected.

View File

@ -78,3 +78,38 @@ def test_joindocuments_concatenate_keep_only_highest_ranking_duplicate():
result, _ = join_docs.run(inputs)
assert len(result["documents"]) == 2
assert result["documents"] == expected_outputs["documents"]
@pytest.mark.unit
def test_joindocuments_concatenate_duplicate_docs_null_score():
    """
    Check concatenate mode when duplicate documents arrive and one copy has a null score.

    Two inputs are joined. "text document 1" and "text document 2" appear in
    both inputs (document 1 once with ``score=None``), while "text document 3"
    appears only once and has no score at all. The joined result is expected
    to hold three documents: for each duplicate the copy carrying the higher
    numeric score, plus the unscored "text document 3".
    """
    first_input_docs = [
        Document(content="text document 1", content_type="text", score=0.2),
        Document(content="text document 2", content_type="text", score=0.3),
        Document(content="text document 3", content_type="text", score=None),
    ]
    second_input_docs = [
        Document(content="text document 2", content_type="text", score=0.7),
        Document(content="text document 1", content_type="text", score=None),
    ]
    inputs = [{"documents": first_input_docs}, {"documents": second_input_docs}]

    # Expected result, in the exact order the assertion below compares against.
    expected_documents = [
        Document(content="text document 2", content_type="text", score=0.7),
        Document(content="text document 1", content_type="text", score=0.2),
        Document(content="text document 3", content_type="text", score=None),
    ]

    joiner = JoinDocuments(join_mode="concatenate")
    output, _ = joiner.run(inputs)
    joined_documents = output["documents"]

    assert len(joined_documents) == 3
    assert joined_documents == expected_documents