mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-17 18:43:58 +00:00
bug: make MultiLabel ids consistent across python interpreters (#2998)
* use hashlib.md5() instead of the (interpreter-dependent) hash() function to generate the MultiLabel id * add tests to assess the constancy of MultiLabel.id * make the test_multilabel_id test ensure that MultiLabel ids are always the same
This commit is contained in:
parent
b685409c78
commit
f363b152ff
@ -1,5 +1,6 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import csv
|
import csv
|
||||||
|
import hashlib
|
||||||
|
|
||||||
import typing
|
import typing
|
||||||
from typing import Any, Optional, Dict, List, Union
|
from typing import Any, Optional, Dict, List, Union
|
||||||
@ -635,7 +636,7 @@ class MultiLabel:
|
|||||||
|
|
||||||
self.query = self._aggregate_labels(key="query", must_be_single_value=True)[0]
|
self.query = self._aggregate_labels(key="query", must_be_single_value=True)[0]
|
||||||
self.filters = self._aggregate_labels(key="filters", must_be_single_value=True)[0]
|
self.filters = self._aggregate_labels(key="filters", must_be_single_value=True)[0]
|
||||||
self.id = hash((self.query, json.dumps(self.filters, sort_keys=True).encode()))
|
self.id = hashlib.md5((self.query + json.dumps(self.filters, sort_keys=True)).encode()).hexdigest()
|
||||||
|
|
||||||
# Currently no_answer is only true if all labels are "no_answers", we could later introduce a param here to let
|
# Currently no_answer is only true if all labels are "no_answers", we could later introduce a param here to let
|
||||||
# users decide which aggregation logic they want
|
# users decide which aggregation logic they want
|
||||||
|
|||||||
@ -361,6 +361,46 @@ def test_multilabel_preserve_order_w_duplicates():
|
|||||||
assert multilabel.labels[i].id == str(i)
|
assert multilabel.labels[i].id == str(i)
|
||||||
|
|
||||||
|
|
||||||
|
def test_multilabel_id():
|
||||||
|
query1 = "question 1"
|
||||||
|
query2 = "question 2"
|
||||||
|
document1 = Document(content="something", id="1")
|
||||||
|
answer1 = Answer(answer="answer 1")
|
||||||
|
filter1 = {"name": ["name 1"]}
|
||||||
|
filter2 = {"name": ["name 1"], "author": ["author 1"]}
|
||||||
|
label1 = Label(
|
||||||
|
query=query1,
|
||||||
|
document=document1,
|
||||||
|
is_correct_answer=True,
|
||||||
|
is_correct_document=True,
|
||||||
|
origin="gold-label",
|
||||||
|
answer=answer1,
|
||||||
|
filters=filter1,
|
||||||
|
)
|
||||||
|
label2 = Label(
|
||||||
|
query=query2,
|
||||||
|
document=document1,
|
||||||
|
is_correct_answer=True,
|
||||||
|
is_correct_document=True,
|
||||||
|
origin="gold-label",
|
||||||
|
answer=answer1,
|
||||||
|
filters=filter2,
|
||||||
|
)
|
||||||
|
label3 = Label(
|
||||||
|
query=query1,
|
||||||
|
document=document1,
|
||||||
|
is_correct_answer=True,
|
||||||
|
is_correct_document=True,
|
||||||
|
origin="gold-label",
|
||||||
|
answer=answer1,
|
||||||
|
filters=filter2,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert MultiLabel(labels=[label1]).id == "33a3e58e13b16e9d6ec682ffe59ccc89"
|
||||||
|
assert MultiLabel(labels=[label2]).id == "1b3ad38b629db7b0e869373b01bc32b1"
|
||||||
|
assert MultiLabel(labels=[label3]).id == "531445fa3bdf98b8598a3bea032bd605"
|
||||||
|
|
||||||
|
|
||||||
def test_serialize_speech_document():
|
def test_serialize_speech_document():
|
||||||
speech_doc = SpeechDocument(
|
speech_doc = SpeechDocument(
|
||||||
id=12345,
|
id=12345,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user