Mirror of https://github.com/deepset-ai/haystack.git
fixed join_docs.py concatenate (#5970)
* added hybrid search example: an example of hybrid search for an FAQ pipeline on a COVID dataset
* formatted with the black formatter
* renamed document
* fixed typos
* added test for hybrid search
* fixed whitespaces
* removed test for hybrid search
* fixed pylint
* commented logging
* fixed bug in join_docs.py _concatenate_results
* updated comment in join_docs.py
* formatted with black
* added release note on PR
* updated release notes
* updated test_join_documents
* updated test
* updated test_join_documents.py
* formatted with black
* fixed test

---------

Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com>
commit 32e87d37c1
parent 92ae169bdf
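Context for the change (not part of this commit): JoinDocuments usually sits behind two retrievers in a hybrid pipeline, which is exactly the setup in which the same document can arrive twice with different scores. Below is a minimal sketch of such a pipeline, assuming Haystack 1.x; the document store, embedding model, and top_k values are illustrative assumptions, not taken from this PR.

# Hedged sketch: hybrid retrieval feeding JoinDocuments(join_mode="concatenate").
# Everything below is illustrative; only JoinDocuments itself is touched by this commit.
from haystack import Pipeline
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import BM25Retriever, EmbeddingRetriever, JoinDocuments
from haystack.schema import Document

document_store = InMemoryDocumentStore(use_bm25=True)
document_store.write_documents([Document(content="COVID-19 FAQ entry about symptoms")])

sparse = BM25Retriever(document_store=document_store)
dense = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",  # assumed model choice
)
document_store.update_embeddings(dense)

pipeline = Pipeline()
pipeline.add_node(component=sparse, name="BM25Retriever", inputs=["Query"])
pipeline.add_node(component=dense, name="EmbeddingRetriever", inputs=["Query"])
pipeline.add_node(
    component=JoinDocuments(join_mode="concatenate"),
    name="JoinDocuments",
    inputs=["BM25Retriever", "EmbeddingRetriever"],
)

# With the fix, a document returned by both retrievers appears once,
# carrying the higher of its two scores.
result = pipeline.run(
    query="What are the symptoms?",
    params={"BM25Retriever": {"top_k": 5}, "EmbeddingRetriever": {"top_k": 5}},
)
print([(d.content, d.score) for d in result["documents"]])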
join_docs.py

@@ -1,11 +1,10 @@
-from collections import defaultdict
 import logging
+from collections import defaultdict
 from math import inf
-
-from typing import Optional, List
+from typing import List, Optional
 
-from haystack.schema import Document
 from haystack.nodes.other.join import JoinNode
+from haystack.schema import Document
 
 logger = logging.getLogger(__name__)
 
@@ -64,7 +63,7 @@ class JoinDocuments(JoinNode):
         document_map = {doc.id: doc for result in results for doc in result}
 
         if self.join_mode == "concatenate":
-            scores_map = self._concatenate_results(results)
+            scores_map = self._concatenate_results(results, document_map)
         elif self.join_mode == "merge":
             scores_map = self._calculate_comb_sum(results)
         elif self.join_mode == "reciprocal_rank_fusion":
@@ -118,11 +117,22 @@ class JoinDocuments(JoinNode):
 
         return output, "output_1"
 
-    def _concatenate_results(self, results):
+    def _concatenate_results(self, results, document_map):
         """
         Concatenates multiple document result lists.
+        Return the documents with the higher score.
         """
-        return {doc.id: doc.score for result in results for doc in result}
+        list_id = list(document_map.keys())
+        scores_map = {}
+        for idx in list_id:
+            tmp = []
+            for result in results:
+                for doc in result:
+                    if doc.id == idx:
+                        tmp.append(doc)
+            item_best_score = max(tmp, key=lambda x: x.score)
+            scores_map.update({idx: item_best_score.score})
+        return scores_map
 
     def _calculate_comb_sum(self, results):
         """
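The merged _concatenate_results above walks the result lists once per unique document id. A single pass over the results produces the same score map; the following is only an illustrative rewrite under the assumption that every Document carries a numeric score, not the code that ships in Haystack.

# Illustrative single-pass equivalent of the merged logic (an assumption,
# not the library's implementation): keep the best score per document id.
from typing import Dict, List

from haystack.schema import Document


def concatenate_scores(results: List[List[Document]]) -> Dict[str, float]:
    scores_map: Dict[str, float] = {}
    for result in results:
        for doc in result:
            # Overwrite only if this copy of the document scored higher.
            if doc.id not in scores_map or doc.score > scores_map[doc.id]:
                scores_map[doc.id] = doc.score
    return scores_map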
release note (new file)

@@ -0,0 +1,4 @@
+---
+enhancements:
+  - |
+    Make JoinDocuments return only the document with the highest score if there are duplicate documents in the list.
test_join_documents.py

@@ -54,3 +54,27 @@ def test_joindocuments_preserves_root_node():
     join_docs = JoinDocuments()
     result, _ = join_docs.run(inputs)
     assert result["root_node"] == "File"
+
+
+@pytest.mark.unit
+def test_joindocuments_concatenate_keep_only_highest_ranking_duplicate():
+    inputs = [
+        {
+            "documents": [
+                Document(content="text document 1", content_type="text", score=0.2),
+                Document(content="text document 2", content_type="text", score=0.3),
+            ]
+        },
+        {"documents": [Document(content="text document 2", content_type="text", score=0.7)]},
+    ]
+    expected_outputs = {
+        "documents": [
+            Document(content="text document 2", content_type="text", score=0.7),
+            Document(content="text document 1", content_type="text", score=0.2),
+        ]
+    }
+
+    join_docs = JoinDocuments(join_mode="concatenate")
+    result, _ = join_docs.run(inputs)
+    assert len(result["documents"]) == 2
+    assert result["documents"] == expected_outputs["documents"]
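The expected order in the new test (score 0.7 before 0.2) is consistent with the joined documents being returned sorted by score, highest first, after duplicates are collapsed. A self-contained sketch of that dedup-then-sort behavior, using the same inputs as the test; the helper name is an illustration, not JoinDocuments internals.

# Sketch only: reproduces the ordering the test asserts, outside the pipeline.
from haystack.schema import Document


def dedup_and_sort(results):
    best = {}
    for result in results:
        for doc in result:
            # Identical content yields the same Document id, so duplicates collide here.
            if doc.id not in best or doc.score > best[doc.id].score:
                best[doc.id] = doc
    return sorted(best.values(), key=lambda d: d.score, reverse=True)


docs_a = [
    Document(content="text document 1", content_type="text", score=0.2),
    Document(content="text document 2", content_type="text", score=0.3),
]
docs_b = [Document(content="text document 2", content_type="text", score=0.7)]

# Expected: ("text document 2", 0.7) first, then ("text document 1", 0.2).
print([(d.content, d.score) for d in dedup_and_sort([docs_a, docs_b])])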