diff --git a/haystack/schema.py b/haystack/schema.py index 1b21daa94..7078afe08 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -1,5 +1,6 @@ from __future__ import annotations import csv +import hashlib import typing from typing import Any, Optional, Dict, List, Union @@ -635,7 +636,7 @@ class MultiLabel: self.query = self._aggregate_labels(key="query", must_be_single_value=True)[0] self.filters = self._aggregate_labels(key="filters", must_be_single_value=True)[0] - self.id = hash((self.query, json.dumps(self.filters, sort_keys=True).encode())) + self.id = hashlib.md5((self.query + json.dumps(self.filters, sort_keys=True)).encode()).hexdigest() # Currently no_answer is only true if all labels are "no_answers", we could later introduce a param here to let # users decided which aggregation logic they want diff --git a/test/others/test_schema.py b/test/others/test_schema.py index d1af0d02d..f83f91477 100644 --- a/test/others/test_schema.py +++ b/test/others/test_schema.py @@ -361,6 +361,46 @@ def test_multilabel_preserve_order_w_duplicates(): assert multilabel.labels[i].id == str(i) +def test_multilabel_id(): + query1 = "question 1" + query2 = "question 2" + document1 = Document(content="something", id="1") + answer1 = Answer(answer="answer 1") + filter1 = {"name": ["name 1"]} + filter2 = {"name": ["name 1"], "author": ["author 1"]} + label1 = Label( + query=query1, + document=document1, + is_correct_answer=True, + is_correct_document=True, + origin="gold-label", + answer=answer1, + filters=filter1, + ) + label2 = Label( + query=query2, + document=document1, + is_correct_answer=True, + is_correct_document=True, + origin="gold-label", + answer=answer1, + filters=filter2, + ) + label3 = Label( + query=query1, + document=document1, + is_correct_answer=True, + is_correct_document=True, + origin="gold-label", + answer=answer1, + filters=filter2, + ) + + assert MultiLabel(labels=[label1]).id == "33a3e58e13b16e9d6ec682ffe59ccc89" + assert MultiLabel(labels=[label2]).id == "1b3ad38b629db7b0e869373b01bc32b1" + assert MultiLabel(labels=[label3]).id == "531445fa3bdf98b8598a3bea032bd605" + + def test_serialize_speech_document(): speech_doc = SpeechDocument( id=12345,