mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-02 02:39:51 +00:00
bug: make MultiLabel ids consistent across python interpreters (#2998)
* use hashlib.md5() instead of the (interpreter-dependent) hash() function to generate MultiLabel id * add tests to assess constancy of MultiLabel.id * make test_multilabel_id test ensure that MultiLabel ids are always the same
This commit is contained in:
parent
b685409c78
commit
f363b152ff
@ -1,5 +1,6 @@
|
||||
from __future__ import annotations
|
||||
import csv
|
||||
import hashlib
|
||||
|
||||
import typing
|
||||
from typing import Any, Optional, Dict, List, Union
|
||||
@ -635,7 +636,7 @@ class MultiLabel:
|
||||
|
||||
self.query = self._aggregate_labels(key="query", must_be_single_value=True)[0]
|
||||
self.filters = self._aggregate_labels(key="filters", must_be_single_value=True)[0]
|
||||
self.id = hash((self.query, json.dumps(self.filters, sort_keys=True).encode()))
|
||||
self.id = hashlib.md5((self.query + json.dumps(self.filters, sort_keys=True)).encode()).hexdigest()
|
||||
|
||||
# Currently no_answer is only true if all labels are "no_answers", we could later introduce a param here to let
|
||||
# users decide which aggregation logic they want
|
||||
|
||||
@ -361,6 +361,46 @@ def test_multilabel_preserve_order_w_duplicates():
|
||||
assert multilabel.labels[i].id == str(i)
|
||||
|
||||
|
||||
def test_multilabel_id():
    """Assert that ``MultiLabel.id`` equals fixed hex digests.

    Three single-label ``MultiLabel`` objects are built from different
    query/filter combinations, and each id is compared against a
    hard-coded expected string.
    """
    shared_document = Document(content="something", id="1")
    shared_answer = Answer(answer="answer 1")
    name_filter = {"name": ["name 1"]}
    name_author_filter = {"name": ["name 1"], "author": ["author 1"]}

    def build_label(query, filters):
        # Every label shares the same document/answer; only query and filters vary.
        return Label(
            query=query,
            document=shared_document,
            is_correct_answer=True,
            is_correct_document=True,
            origin="gold-label",
            answer=shared_answer,
            filters=filters,
        )

    # (query, filters, expected MultiLabel.id)
    cases = [
        ("question 1", name_filter, "33a3e58e13b16e9d6ec682ffe59ccc89"),
        ("question 2", name_author_filter, "1b3ad38b629db7b0e869373b01bc32b1"),
        ("question 1", name_author_filter, "531445fa3bdf98b8598a3bea032bd605"),
    ]

    # Build all labels up front, then check each id in turn.
    labels = [build_label(query, filters) for query, filters, _ in cases]
    for label, (_, _, expected_id) in zip(labels, cases):
        assert MultiLabel(labels=[label]).id == expected_id
|
||||
|
||||
|
||||
def test_serialize_speech_document():
|
||||
speech_doc = SpeechDocument(
|
||||
id=12345,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user