bug: make MultiLabel ids consistent across python interpreters (#2998)

* use hashlib.md5() instead of the (interpreter-dependent) hash() function to generate the MultiLabel id

* add tests to assess constancy of MultiLabel.id

* make test_multilabel_id test ensure that MultiLabel ids are always the same
This commit is contained in:
camille 2022-08-10 09:43:21 +02:00 committed by GitHub
parent b685409c78
commit f363b152ff
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 42 additions and 1 deletions

View File

@ -1,5 +1,6 @@
from __future__ import annotations
import csv
import hashlib
import typing
from typing import Any, Optional, Dict, List, Union
@ -635,7 +636,7 @@ class MultiLabel:
self.query = self._aggregate_labels(key="query", must_be_single_value=True)[0]
self.filters = self._aggregate_labels(key="filters", must_be_single_value=True)[0]
self.id = hash((self.query, json.dumps(self.filters, sort_keys=True).encode()))
self.id = hashlib.md5((self.query + json.dumps(self.filters, sort_keys=True)).encode()).hexdigest()
# Currently no_answer is only true if all labels are "no_answers", we could later introduce a param here to let
# users decided which aggregation logic they want

View File

@ -361,6 +361,46 @@ def test_multilabel_preserve_order_w_duplicates():
assert multilabel.labels[i].id == str(i)
def test_multilabel_id():
    """Check that MultiLabel ids match fixed, hard-coded hex digests.

    Three labels are built that differ only in their query and filters;
    each one, wrapped alone in a MultiLabel, must yield the expected id.
    """
    name_filter = {"name": ["name 1"]}
    name_author_filter = {"name": ["name 1"], "author": ["author 1"]}

    # Fields that are identical for every label in this test; the same
    # Document and Answer instances are shared by all three labels.
    shared_fields = dict(
        document=Document(content="something", id="1"),
        answer=Answer(answer="answer 1"),
        is_correct_answer=True,
        is_correct_document=True,
        origin="gold-label",
    )

    # (query, filters, expected MultiLabel id)
    cases = [
        ("question 1", name_filter, "33a3e58e13b16e9d6ec682ffe59ccc89"),
        ("question 2", name_author_filter, "1b3ad38b629db7b0e869373b01bc32b1"),
        ("question 1", name_author_filter, "531445fa3bdf98b8598a3bea032bd605"),
    ]

    # Build every label up front, then check the ids in the same order.
    labels = [Label(query=query, filters=filters, **shared_fields) for query, filters, _ in cases]

    for label, (_, _, expected_id) in zip(labels, cases):
        assert MultiLabel(labels=[label]).id == expected_id
def test_serialize_speech_document():
speech_doc = SpeechDocument(
id=12345,