feat: Weights and score normalization for DocumentJoiner with reciprocal rank fusion (#6735)

* Add weighting and score normalization for DocumentJoiner w/ reciprocal rank fusion (fix trailing whitespace)

* Add release notes

* Add unit test

* Update release note

---------

Co-authored-by: Vladimir Blagojevic <dovlex@gmail.com>
This commit is contained in:
Rob Pasternak 2024-01-24 15:45:53 +01:00 committed by GitHub
parent 6e86f4e26a
commit 7358b910d7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 54 additions and 2 deletions

View File

@ -142,11 +142,19 @@ class DocumentJoiner:
scores_map = defaultdict(int)
documents_map = {}
for documents in document_lists:
weights = self.weights if self.weights else [1 / len(document_lists)] * len(document_lists)
# Calculate weighted reciprocal rank fusion score
for documents, weight in zip(document_lists, weights):
for rank, doc in enumerate(documents):
scores_map[doc.id] += 1 / (k + rank)
scores_map[doc.id] += (weight * len(document_lists)) / (k + rank)
documents_map[doc.id] = doc
# Normalize scores. Note: len(document_lists) / k is the maximum possible score,
# achieved by being ranked first in all doc lists with non-zero weight.
for id in scores_map:
scores_map[id] /= len(document_lists) / k
for doc in documents_map.values():
doc.score = scores_map[doc.id]

View File

@ -0,0 +1,4 @@
---
enhancements:
- |
Introduces weighting and score normalization for the DocumentJoiner's reciprocal rank fusion mode, improving the relevance of document sorting by letting users control how much each input list influences the final scores.

View File

@ -126,3 +126,43 @@ class TestDocumentJoiner:
documents_2 = [Document(content="d", score=0.2)]
output = joiner.run([documents_1, documents_2])
assert output["documents"] == documents_1 + documents_2
def test_test_score_norm_with_rrf(self):
    """
    Check reciprocal rank fusion (RRF) output ordering under several weight configurations.

    Six documents are arranged into two differently ordered lists (the second
    list omits one document). For each weight configuration a DocumentJoiner in
    "reciprocal_rank_fusion" mode joins the two lists, and the test asserts that
    the returned documents are ordered by non-increasing score.
    """
    num_docs = 6
    docs = [Document(content=f"doc{i}") for i in range(num_docs)]
    docs_2 = [docs[0], docs[4], docs[2], docs[5], docs[1]]
    document_lists = [docs, docs_2]

    # Equal weights (normalized and un-normalized), skewed weights, and a
    # configuration in which the second list carries zero weight.
    weight_configurations = [[0.5, 0.5], [7, 7], [0.7, 0.3], [0.6, 0.4], [1, 0]]
    joiners = [
        DocumentJoiner(join_mode="reciprocal_rank_fusion", weights=weights) for weights in weight_configurations
    ]

    for joiner in joiners:
        join_results = joiner.run(documents=document_lists)
        scores = [doc.score for doc in join_results["documents"]]
        # Compare each score with its successor; an empty or single-element
        # result is trivially sorted.
        is_sorted = all(current >= following for current, following in zip(scores, scores[1:]))
        assert (
            is_sorted
        ), "Documents are not sorted in descending order by score, there is an issue with rrf ranking"