feat: Weights and score normalization for DocumentJoiner with reciprocal rank fusion (#6735)

* Add weighting and score normalization for DocumentJoiner w/ reciprocal rank fusion (fix trailing whitespace)
* Add release notes
* Add unit test
* Update release note

---------

Co-authored-by: Vladimir Blagojevic <dovlex@gmail.com>
2025-12-28 07:29:06 +00:00 · 2024-01-24 15:45:53 +01:00 · 2024-01-24 15:45:53 +01:00 · 7358b910d7
commit 7358b910d7
parent 6e86f4e26a
3 changed files with 54 additions and 2 deletions
--- a/haystack/components/joiners/document_joiner.py
+++ b/haystack/components/joiners/document_joiner.py
@ -142,11 +142,19 @@ class DocumentJoiner:

        scores_map = defaultdict(int)
        documents_map = {}
-        for documents in document_lists:
+        weights = self.weights if self.weights else [1 / len(document_lists)] * len(document_lists)
+
+        # Calculate weighted reciprocal rank fusion score
+        for documents, weight in zip(document_lists, weights):
            for rank, doc in enumerate(documents):
-                scores_map[doc.id] += 1 / (k + rank)
+                scores_map[doc.id] += (weight * len(document_lists)) / (k + rank)
                documents_map[doc.id] = doc

+        # Normalize scores. Note: len(document_lists) / k is the maximum possible score,
+        # achieved by being ranked first in all doc lists with non-zero weight.
+        for id in scores_map:
+            scores_map[id] /= len(document_lists) / k
+
        for doc in documents_map.values():
            doc.score = scores_map[doc.id]

--- a/releasenotes/notes/weights-normalize-docjoin-rrf-v2-9cad33012fe90a55.yaml
+++ b/releasenotes/notes/weights-normalize-docjoin-rrf-v2-9cad33012fe90a55.yaml
@ -0,0 +1,4 @@
+---
+enhancements:
+  - |
+    Introduces weights and score normalization for the DocumentJoiner's reciprocal rank fusion. Each input document list can now be given a customizable influence on the final scores, improving the relevance of the resulting document ordering.
--- a/test/components/joiners/test_document_joiner.py
+++ b/test/components/joiners/test_document_joiner.py
@ -126,3 +126,43 @@ class TestDocumentJoiner:
        documents_2 = [Document(content="d", score=0.2)]
        output = joiner.run([documents_1, documents_2])
        assert output["documents"] == documents_1 + documents_2
+
+    def test_test_score_norm_with_rrf(self):
+        """
+        Verifies reciprocal rank fusion (RRF) of the DocumentJoiner component with various weight configurations.
+        It creates a set of documents, forms them into two lists, and then applies multiple DocumentJoiner
+        instances with distinct weights to these lists. The test checks if the resulting
+        joined documents are correctly sorted in descending order by score, ensuring the RRF ranking works as
+        expected under different weighting scenarios.
+        """
+        num_docs = 6
+        docs = []
+
+        for i in range(num_docs):
+            docs.append(Document(content=f"doc{i}"))
+
+        docs_2 = [docs[0], docs[4], docs[2], docs[5], docs[1]]
+        document_lists = [docs, docs_2]
+
+        joiner_1 = DocumentJoiner(join_mode="reciprocal_rank_fusion", weights=[0.5, 0.5])
+
+        joiner_2 = DocumentJoiner(join_mode="reciprocal_rank_fusion", weights=[7, 7])
+
+        joiner_3 = DocumentJoiner(join_mode="reciprocal_rank_fusion", weights=[0.7, 0.3])
+
+        joiner_4 = DocumentJoiner(join_mode="reciprocal_rank_fusion", weights=[0.6, 0.4])
+
+        joiner_5 = DocumentJoiner(join_mode="reciprocal_rank_fusion", weights=[1, 0])
+
+        joiners = [joiner_1, joiner_2, joiner_3, joiner_4, joiner_5]
+
+        for index, joiner in enumerate(joiners):
+            join_results = joiner.run(documents=document_lists)
+            is_sorted = all(
+                join_results["documents"][i].score >= join_results["documents"][i + 1].score
+                for i in range(len(join_results["documents"]) - 1)
+            )
+
+            assert (
+                is_sorted
+            ), "Documents are not sorted in descending order by score, there is an issue with rff ranking"