From cafcf51cb0eba832adbabb32a98c5e0e2b0b8ca9 Mon Sep 17 00:00:00 2001 From: Nicola Procopio Date: Thu, 4 Jul 2024 10:07:26 +0200 Subject: [PATCH] Fixed ZeroDivisionError in JoinDocuments (#7972) * added new strategy DBRF * fix hook * fix typos * added test for DBRF * fix format * new release note * reformatted with black * Update haystack/components/joiners/document_joiner.py Co-authored-by: Madeesh Kannan * updated comments * added type-hint and return type * fix * revert for lint problems * fix * fix * fix * fix * another tentative * dict out file * only output * fix output * revert * removed unused imports * fix typing * fixed ZeroDivisionError * added test * add release note * removed try - except * renamed test * Update test/components/joiners/test_document_joiner.py Co-authored-by: Madeesh Kannan * Update haystack/components/joiners/document_joiner.py Co-authored-by: Madeesh Kannan * fix format error * removed releasenotes/notes/release-note-9b2bc03a8a398078.yaml * added comment --------- Co-authored-by: Madeesh Kannan Co-authored-by: anakin87 --- .../components/joiners/document_joiner.py | 4 ++- .../joiners/test_document_joiner.py | 30 +++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/haystack/components/joiners/document_joiner.py b/haystack/components/joiners/document_joiner.py index 6e1e5a803..bd51613df 100644 --- a/haystack/components/joiners/document_joiner.py +++ b/haystack/components/joiners/document_joiner.py @@ -195,9 +195,11 @@ class DocumentJoiner: std_dev = (sum((x - mean_score) ** 2 for x in scores_list) / len(scores_list)) ** 0.5 min_score = mean_score - 3 * std_dev max_score = mean_score + 3 * std_dev + delta_score = max_score - min_score for doc in documents: - doc.score = (doc.score - min_score) / (max_score - min_score) + doc.score = (doc.score - min_score) / delta_score if delta_score != 0.0 else 0.0 + # if all docs have the same score, delta_score is 0 and the docs are uninformative for the query, so each score is set to 0.0 output = 
self._concatenate(document_lists=document_lists) diff --git a/test/components/joiners/test_document_joiner.py b/test/components/joiners/test_document_joiner.py index 41fe27554..0da082477 100644 --- a/test/components/joiners/test_document_joiner.py +++ b/test/components/joiners/test_document_joiner.py @@ -145,6 +145,36 @@ class TestDocumentJoiner: ] assert all(doc.id in expected_document_ids for doc in output["documents"]) + def test_run_with_distribution_based_rank_fusion_join_mode_same_scores(self): + joiner = DocumentJoiner(join_mode="distribution_based_rank_fusion") + documents_1 = [ + Document(content="a", score=0.2), + Document(content="b", score=0.2), + Document(content="c", score=0.2), + ] + documents_2 = [ + Document(content="d", score=0.5), + Document(content="e", score=0.8), + Document(content="f", score=1.1, meta={"key": "value"}), + Document(content="g", score=0.3), + Document(content="a", score=0.3), + ] + output = joiner.run([documents_1, documents_2]) + assert len(output["documents"]) == 7 + expected_document_ids = [ + doc.id + for doc in [ + Document(content="a", score=0), + Document(content="b", score=0), + Document(content="c", score=0), + Document(content="d", score=0.44), + Document(content="e", score=0.60), + Document(content="f", score=0.76, meta={"key": "value"}), + Document(content="g", score=0.33), + ] + ] + assert all(doc.id in expected_document_ids for doc in output["documents"]) + def test_run_with_top_k_in_run_method(self): joiner = DocumentJoiner() documents_1 = [Document(content="a"), Document(content="b"), Document(content="c")]