fixed join_docs.py concatenate (#5970)

* added hybrid search example

Added an example about hybrid search for faq pipeline on covid dataset

* formatted with black formatter

* renamed document

* fixed

* fixed typos

* added test

added test for hybrid search

* fixed whitespaces

* removed test for hybrid search

* fixed pylint

* commented logging

* fixed bug in join_docs.py _concatenate_results

* Update join_docs.py

updated comment

* format with black

* added release note on PR

* updated release notes

* updated test_join_documents

* updated test

* updated test

* Update test_join_documents.py

* formatted with black

* fixed test

* fixed

---------

Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com>
This commit is contained in:
Nicola Procopio 2023-10-16 09:31:52 +02:00 committed by GitHub
parent 92ae169bdf
commit 32e87d37c1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 45 additions and 7 deletions

View File

@ -1,11 +1,10 @@
from collections import defaultdict
import logging
from collections import defaultdict
from math import inf
from typing import List, Optional
from typing import Optional, List
from haystack.schema import Document
from haystack.nodes.other.join import JoinNode
from haystack.schema import Document
logger = logging.getLogger(__name__)
@ -64,7 +63,7 @@ class JoinDocuments(JoinNode):
document_map = {doc.id: doc for result in results for doc in result}
if self.join_mode == "concatenate":
scores_map = self._concatenate_results(results)
scores_map = self._concatenate_results(results, document_map)
elif self.join_mode == "merge":
scores_map = self._calculate_comb_sum(results)
elif self.join_mode == "reciprocal_rank_fusion":
@ -118,11 +117,22 @@ class JoinDocuments(JoinNode):
return output, "output_1"
def _concatenate_results(self, results, document_map):
    """
    Concatenates multiple document result lists.

    When the same document id occurs more than once across the result lists, only its
    highest score is kept. A score of None ranks below every numeric score.

    :param results: Iterable of document lists; every document exposes `id` and `score`.
    :param document_map: Mapping keyed by document id. Its key order determines the order
                         of the returned dictionary.
    :return: Dictionary mapping each id of `document_map` that occurs in `results` to the
             highest score found for it (None if none of its occurrences has a score).
    """
    best_scores = {}
    # One pass over all documents instead of rescanning every result list once per id.
    for result in results:
        for doc in result:
            if doc.id not in best_scores:
                best_scores[doc.id] = doc.score
                continue
            current = best_scores[doc.id]
            # None cannot be compared with numbers, so treat it as the lowest possible score.
            if doc.score is not None and (current is None or doc.score > current):
                best_scores[doc.id] = doc.score
    # Follow the key order of document_map; ids without any matching document are skipped
    # instead of raising on an empty candidate list.
    return {doc_id: best_scores[doc_id] for doc_id in document_map if doc_id in best_scores}
def _calculate_comb_sum(self, results):
"""

View File

@ -0,0 +1,4 @@
---
enhancements:
- |
Make JoinDocuments return only the document with the highest score if there are duplicate documents in the list.

View File

@ -54,3 +54,27 @@ def test_joindocuments_preserves_root_node():
join_docs = JoinDocuments()
result, _ = join_docs.run(inputs)
assert result["root_node"] == "File"
@pytest.mark.unit
def test_joindocuments_concatenate_keep_only_highest_ranking_duplicate():
    """
    In "concatenate" mode a document present in several inputs must appear once in the
    output, carrying its highest score.
    """
    # "text document 2" is present in both branches, with scores 0.3 and 0.7.
    first_branch = {
        "documents": [
            Document(content="text document 1", content_type="text", score=0.2),
            Document(content="text document 2", content_type="text", score=0.3),
        ]
    }
    second_branch = {"documents": [Document(content="text document 2", content_type="text", score=0.7)]}

    # Expected: the duplicate survives with score 0.7 and precedes the 0.2 document.
    expected_documents = [
        Document(content="text document 2", content_type="text", score=0.7),
        Document(content="text document 1", content_type="text", score=0.2),
    ]

    joiner = JoinDocuments(join_mode="concatenate")
    output, _edge = joiner.run([first_branch, second_branch])
    joined_documents = output["documents"]

    assert len(joined_documents) == 2
    assert joined_documents == expected_documents