Fixed ZeroDivisionError in DocumentJoiner (#7972)

* added new strategy DBRF

* fix hook

* fix typos

* added test for DBRF

* fix format

* new release note

* reformatted with black

* Update haystack/components/joiners/document_joiner.py

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* updated comments

* added type-hint and return type

* fix

* revert for lint problems

* fix

* fix

* fix

* fix

* another attempt

* dict out file

* only output

* fix output

* revert

* removed unused imports

* fix typing

* fixed ZeroDivisionError

* added test

* add release note

* removed try - except

* renamed test

* Update test/components/joiners/test_document_joiner.py

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Update haystack/components/joiners/document_joiner.py

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* fix format error

* removed releasenotes/notes/release-note-9b2bc03a8a398078.yaml

* added comment

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
Co-authored-by: anakin87 <stefanofiorucci@gmail.com>
This commit is contained in:
Nicola Procopio 2024-07-04 10:07:26 +02:00 committed by GitHub
parent 03d9057e64
commit cafcf51cb0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 33 additions and 1 deletions

View File

@ -195,9 +195,11 @@ class DocumentJoiner:
std_dev = (sum((x - mean_score) ** 2 for x in scores_list) / len(scores_list)) ** 0.5
min_score = mean_score - 3 * std_dev
max_score = mean_score + 3 * std_dev
delta_score = max_score - min_score
for doc in documents:
doc.score = (doc.score - min_score) / (max_score - min_score)
doc.score = (doc.score - min_score) / delta_score if delta_score != 0.0 else 0.0
# If all docs have the same score, delta_score is 0: the docs are uninformative for the query, so each score is set to 0.0.
output = self._concatenate(document_lists=document_lists)

View File

@ -145,6 +145,36 @@ class TestDocumentJoiner:
]
assert all(doc.id in expected_document_ids for doc in output["documents"])
def test_run_with_distribution_based_rank_fusion_join_mode_same_scores(self):
    """
    Run the distribution-based rank fusion join when one input list has identical scores.

    The first list contains three documents that all share the score 0.2, so its score
    spread (max - min after the mean +/- 3 std-dev window) is zero. The second list has
    varied scores and repeats the content "a" from the first list. The test expects the
    join to return 7 documents and every returned id to be one of the expected ids.
    """
    # Zero-spread list: every document carries exactly the same score.
    constant_score_docs = [Document(content=text, score=0.2) for text in ("a", "b", "c")]
    # Varied-score list; "a" also appears in the list above.
    varied_score_docs = [
        Document(content="d", score=0.5),
        Document(content="e", score=0.8),
        Document(content="f", score=1.1, meta={"key": "value"}),
        Document(content="g", score=0.3),
        Document(content="a", score=0.3),
    ]

    joiner = DocumentJoiner(join_mode="distribution_based_rank_fusion")
    joined_docs = joiner.run([constant_score_docs, varied_score_docs])["documents"]

    # Eight documents go in, seven are expected to come out.
    assert len(joined_docs) == 7

    # Reference documents used only as a source of ids; the assertions below compare
    # ids and never look at the score values given here.
    reference_docs = [Document(content=text, score=0) for text in ("a", "b", "c")]
    reference_docs += [
        Document(content="d", score=0.44),
        Document(content="e", score=0.60),
        Document(content="f", score=0.76, meta={"key": "value"}),
        Document(content="g", score=0.33),
    ]
    expected_document_ids = [reference.id for reference in reference_docs]

    for joined in joined_docs:
        assert joined.id in expected_document_ids
def test_run_with_top_k_in_run_method(self):
joiner = DocumentJoiner()
documents_1 = [Document(content="a"), Document(content="b"), Document(content="c")]