mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-28 07:29:06 +00:00
feat: Weights and score normalization for DocumentJoiner with reciprocal rank fusion (#6735)
* Add weighting and score normalization for DocumentJoiner w/ reciprocal rank fusion (fix trailing whitespace)
* Add release notes
* Add unit test
* Update release note

---------

Co-authored-by: Vladimir Blagojevic <dovlex@gmail.com>
This commit is contained in:
parent
6e86f4e26a
commit
7358b910d7
@ -142,11 +142,19 @@ class DocumentJoiner:
|
||||
|
||||
scores_map = defaultdict(int)
|
||||
documents_map = {}
|
||||
for documents in document_lists:
|
||||
weights = self.weights if self.weights else [1 / len(document_lists)] * len(document_lists)
|
||||
|
||||
# Calculate weighted reciprocal rank fusion score
|
||||
for documents, weight in zip(document_lists, weights):
|
||||
for rank, doc in enumerate(documents):
|
||||
scores_map[doc.id] += 1 / (k + rank)
|
||||
scores_map[doc.id] += (weight * len(document_lists)) / (k + rank)
|
||||
documents_map[doc.id] = doc
|
||||
|
||||
# Normalize scores. Note: len(document_lists) / k is the maximum possible score,
|
||||
# achieved by being ranked first in all doc lists with non-zero weight.
|
||||
for id in scores_map:
|
||||
scores_map[id] /= len(document_lists) / k
|
||||
|
||||
for doc in documents_map.values():
|
||||
doc.score = scores_map[doc.id]
|
||||
|
||||
|
||||
@ -0,0 +1,4 @@
|
||||
---
|
||||
enhancements:
|
||||
- |
|
||||
Introduces weighting and score normalization for the DocumentJoiner's reciprocal rank fusion, enhancing the relevance of document sorting by allowing each input document list to have a customizable influence on the final scores.
|
||||
@ -126,3 +126,43 @@ class TestDocumentJoiner:
|
||||
documents_2 = [Document(content="d", score=0.2)]
|
||||
output = joiner.run([documents_1, documents_2])
|
||||
assert output["documents"] == documents_1 + documents_2
|
||||
|
||||
def test_test_score_norm_with_rrf(self):
    """
    Verify that reciprocal rank fusion (RRF) output is sorted by score for several weight configurations.

    Six documents are arranged into two differently ordered lists (the second list omits one
    document). For each weight configuration, a DocumentJoiner in ``reciprocal_rank_fusion`` mode
    joins the two lists, and the test asserts that the returned documents are in non-increasing
    order of ``score``.
    """
    num_docs = 6
    docs = [Document(content=f"doc{i}") for i in range(num_docs)]

    # Second ranking: a different order, with docs[3] left out on purpose.
    docs_2 = [docs[0], docs[4], docs[2], docs[5], docs[1]]
    document_lists = [docs, docs_2]

    # Equal weights, equal weights not summing to 1, skewed weights, and a zero weight.
    weight_configs = [[0.5, 0.5], [7, 7], [0.7, 0.3], [0.6, 0.4], [1, 0]]
    joiners = [
        DocumentJoiner(join_mode="reciprocal_rank_fusion", weights=weights) for weights in weight_configs
    ]

    for weights, joiner in zip(weight_configs, joiners):
        joined = joiner.run(documents=document_lists)["documents"]
        is_sorted = all(current.score >= following.score for current, following in zip(joined, joined[1:]))

        # Include the weights so a failure identifies the offending configuration.
        assert is_sorted, (
            "Documents are not sorted in descending order by score, "
            f"there is an issue with rrf ranking (weights={weights})"
        )
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user