refactor: Update schema objects to handle Dataframes in to_{dict,json} and from_{dict,json} (#4747)

* Adding support for table Documents when serializing Labels in Haystack

* Fix table label equality test

* Add serialization support and __eq__ support for table answers

* Made convenience functions for converting dataframes. Added some TODOs. Expanded schema tests for table labels. Updated Multilabel to not convert Dataframes into strings.

* get Answer and Label to_json working with DataFrame

* Fix from_dict method of Label

* Use Dict and remove unnecessary if check

* Using pydantic instead of builtins for type detection

* Update haystack/schema.py

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>

* Update haystack/schema.py

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>

* Update haystack/schema.py

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>

* Separated table label equivalency tests and added pytest.mark.unit


* Added unit test for _dict_factory

* Using more descriptive variable names

* Adding json files to test to_json and from_json functions

* Added sample files for tests

---------

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
This commit is contained in:
Sebastian 2023-05-03 09:42:07 +02:00 committed by GitHub
parent a9ec954c45
commit a67ca289db
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 593 additions and 106 deletions

View File

@ -185,7 +185,7 @@ class Document:
if k == "content":
# Convert pd.DataFrame to list of rows for serialization
if self.content_type == "table" and isinstance(self.content, pd.DataFrame):
v = [self.content.columns.tolist()] + self.content.values.tolist()
v = dataframe_to_list(self.content)
k = k if k not in inv_field_map else inv_field_map[k]
_doc[k] = v
return _doc
@ -232,7 +232,7 @@ class Document:
# Convert list of rows to pd.DataFrame
if _new_doc.get("content_type", None) == "table" and isinstance(_new_doc["content"], list):
_new_doc["content"] = pd.DataFrame(columns=_new_doc["content"][0], data=_new_doc["content"][1:])
_new_doc["content"] = dataframe_from_list(_new_doc["content"])
return cls(**_new_doc)
@ -243,11 +243,14 @@ class Document:
return json.dumps(dictionary, cls=NumpyEncoder)
@classmethod
def from_json(cls, data: str, field_map: Optional[Dict[str, Any]] = None) -> Document:
def from_json(cls, data: Union[str, Dict[str, Any]], field_map: Optional[Dict[str, Any]] = None) -> Document:
if not field_map:
field_map = {}
dictionary = json.loads(data)
return cls.from_dict(dictionary, field_map=field_map)
if isinstance(data, str):
dict_data = json.loads(data)
else:
dict_data = data
return cls.from_dict(dict_data, field_map=field_map)
def __eq__(self, other):
content = getattr(other, "content", None)
@ -401,6 +404,10 @@ class Answer:
if self.meta is None:
self.meta = {}
# In case the context is a list of lists for a table document that is instantiated by from_json() or from_dict()
if isinstance(self.context, list):
self.context = dataframe_from_list(self.context)
def __lt__(self, other):
"""Enable sorting of Answers by score"""
return self.score < other.score
@ -412,29 +419,30 @@ class Answer:
return f"<Answer: answer='{self.answer}', score={self.score}, context='{self.context[:50]}{'...' if len(self.context) > 50 else ''}'>"
def __repr__(self):
return f"<Answer {asdict(self)}>"
return f"<Answer {self.to_dict()}>"
def to_dict(self):
return asdict(self)
def to_dict(self) -> Dict:
return asdict(self, dict_factory=_dict_factory)
@classmethod
def from_dict(cls, dict: dict):
def from_dict(cls, dict: Dict) -> Answer:
# backwards compatibility: `document_id: Optional[str]` was changed to `document_ids: Optional[List[str]]`
if "document_id" in dict:
dict = dict.copy()
document_id = dict.pop("document_id")
dict["document_ids"] = [document_id] if document_id is not None else None
return _pydantic_dataclass_from_dict(dict=dict, pydantic_dataclass_type=cls)
return cls(**dict)
def to_json(self):
return json.dumps(self, default=pydantic_encoder)
return json.dumps(self.to_dict(), cls=NumpyEncoder)
@classmethod
def from_json(cls, data):
if type(data) == str:
data = json.loads(data)
return cls.from_dict(data)
def from_json(cls, data: Union[str, Dict[str, Any]]):
if isinstance(data, str):
dict_data = json.loads(data)
else:
dict_data = data
return cls.from_dict(dict_data)
@staticmethod
def _from_dict_offsets(offsets):
@ -449,6 +457,23 @@ class Answer:
converted_offsets.append(e)
return converted_offsets
def __eq__(self, other):
    """Field-wise equality for Answers.

    A DataFrame context (table answers) cannot be compared with ``==`` (that yields an
    element-wise frame, not a bool), so ``DataFrame.equals`` is used in that case.
    """
    context = getattr(other, "context", None)
    if isinstance(context, pd.DataFrame):
        is_content_equal = context.equals(self.context)
    else:
        is_content_equal = context == self.context
    return (
        isinstance(other, self.__class__)
        and is_content_equal
        # Fix: the answer text itself must also match — without this comparison,
        # two Answers differing only in their `answer` string compared equal.
        and getattr(other, "answer", None) == self.answer
        and getattr(other, "type", None) == self.type
        and getattr(other, "score", None) == self.score
        and getattr(other, "offsets_in_document", None) == self.offsets_in_document
        and getattr(other, "offsets_in_context", None) == self.offsets_in_context
        and getattr(other, "document_ids", None) == self.document_ids
        and getattr(other, "meta", None) == self.meta
    )
@dataclass
class Label:
@ -521,11 +546,7 @@ class Label:
self.updated_at = updated_at
self.query = query
if isinstance(answer, dict):
answer = Answer.from_dict(answer)
self.answer = answer
if isinstance(document, dict):
document = Document.from_dict(document)
self.document = document
self.is_correct_answer = is_correct_answer
@ -549,25 +570,28 @@ class Label:
return no_answer
def to_dict(self):
return asdict(self)
return asdict(self, dict_factory=_dict_factory)
@classmethod
def from_dict(cls, dict: dict):
# backward compatibility for old labels using answers with document_id instead of document_ids
def from_dict(cls, dict: Dict):
answer = dict.get("answer")
if answer and "document_id" in answer:
dict = dict.copy()
if answer and isinstance(answer, Dict):
dict["answer"] = Answer.from_dict(dict["answer"])
return _pydantic_dataclass_from_dict(dict=dict, pydantic_dataclass_type=cls)
doc = dict.get("document")
if isinstance(doc, Dict):
dict["document"] = Document.from_dict(dict["document"])
return cls(**dict)
def to_json(self):
return json.dumps(self, default=pydantic_encoder)
return json.dumps(self.to_dict(), cls=NumpyEncoder)
@classmethod
def from_json(cls, data):
if type(data) == str:
data = json.loads(data)
return cls.from_dict(data)
def from_json(cls, data: Union[str, Dict[str, Any]]):
if isinstance(data, str):
dict_data = json.loads(data)
else:
dict_data = data
return cls.from_dict(dict_data)
# define __eq__ and __hash__ functions to deduplicate Label Objects
def __eq__(self, other):
@ -732,7 +756,7 @@ class MultiLabel:
return {k[1:] if k[0] == "_" else k: v for k, v in vars(self).items()}
@classmethod
def from_dict(cls, dict: dict):
def from_dict(cls, dict: Dict):
# exclude extra arguments
return cls(**{k: v for k, v in dict.items() if k in inspect.signature(cls).parameters})
@ -741,7 +765,7 @@ class MultiLabel:
@classmethod
def from_json(cls, data: Union[str, Dict[str, Any]]):
if type(data) == str:
if isinstance(data, str):
dict_data = json.loads(data)
else:
dict_data = data
@ -758,7 +782,7 @@ class MultiLabel:
return f"<MultiLabel: {self.to_dict()}>"
def _pydantic_dataclass_from_dict(dict: dict, pydantic_dataclass_type) -> Any:
def _pydantic_dataclass_from_dict(dict: Dict, pydantic_dataclass_type) -> Any:
"""
Constructs a pydantic dataclass from a dict incl. other nested dataclasses.
This allows simple de-serialization of pydantic dataclasses from json.
@ -777,6 +801,21 @@ def _pydantic_dataclass_from_dict(dict: dict, pydantic_dataclass_type) -> Any:
return dataclass_object
def _dict_factory(data):
"""Meant to be as the dict_factory for `asdict`. This function is called within `asdict` to convert a list of tuples
into a dictionary object. This handles the conversion of pandas Dataframes into a list of lists.
:param data: list of (key, value) pairs
"""
def convert_value(v):
if isinstance(v, pd.DataFrame):
return dataframe_to_list(v)
return v
return {k: convert_value(v) for k, v in data}
class NumpyEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, np.ndarray):
@ -784,6 +823,14 @@ class NumpyEncoder(json.JSONEncoder):
return json.JSONEncoder.default(self, obj)
def dataframe_to_list(df: pd.DataFrame) -> List[List]:
    """Serialize a DataFrame into a JSON-friendly list of lists: the first inner list
    holds the column names, each following inner list holds one row of values."""
    header = df.columns.tolist()
    rows = df.values.tolist()
    return [header, *rows]
def dataframe_from_list(list_df: List[List]) -> pd.DataFrame:
    """Deserialize a list of lists (as produced by `dataframe_to_list`) back into a
    DataFrame. The first inner list is taken as the column names, the rest as rows.

    :param list_df: list of rows, header row first; may be empty.
    """
    # Robustness fix: an empty serialization would raise IndexError on list_df[0].
    if not list_df:
        return pd.DataFrame()
    return pd.DataFrame(columns=list_df[0], data=list_df[1:])
class EvaluationResult:
def __init__(self, node_results: Optional[Dict[str, pd.DataFrame]] = None) -> None:
"""

View File

@ -1,4 +1,6 @@
from haystack.schema import Document, Label, Answer, Span, MultiLabel, TableCell
import json
from haystack.schema import Document, Label, Answer, Span, MultiLabel, TableCell, _dict_factory
import pytest
import numpy as np
import pandas as pd
@ -46,6 +48,74 @@ def text_labels():
]
@pytest.fixture
def table_label():
    """A user-feedback Label wrapping a table Document and a table Answer
    (both carry the same two-row DataFrame)."""
    records = [{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]
    table_answer = Answer(
        answer="text_2",
        type="extractive",
        score=0.1,
        document_ids=["123"],
        context=pd.DataFrame.from_records(records),
        offsets_in_document=[TableCell(row=1, col=0)],
    )
    table_document = Document(
        content=pd.DataFrame.from_records(records),
        content_type="table",
        id="fe5cb68f8226776914781f6bd40ad718",
    )
    return Label(
        query="some",
        answer=table_answer,
        document=table_document,
        is_correct_answer=True,
        is_correct_document=True,
        origin="user-feedback",
        created_at="2023-05-02 11:43:56",
        updated_at=None,
        id="fbd79f71-d690-4b21-bd0a-1094292b9809",
    )
@pytest.fixture
def table_label_dict():
    """Dict form of the `table_label` fixture, with both DataFrames serialized
    as lists of rows (header row first)."""
    table_rows = (["col1", "col2"], ["text_1", 1], ["text_2", 2])
    return {
        "id": "fbd79f71-d690-4b21-bd0a-1094292b9809",
        "query": "some",
        "document": {
            "id": "fe5cb68f8226776914781f6bd40ad718",
            # Fresh copies so the document and answer tables don't alias each other.
            "content": [list(row) for row in table_rows],
            "content_type": "table",
            "meta": {},
            "id_hash_keys": ["content"],
            "score": None,
            "embedding": None,
        },
        "is_correct_answer": True,
        "is_correct_document": True,
        "origin": "user-feedback",
        "answer": {
            "answer": "text_2",
            "type": "extractive",
            "score": 0.1,
            "context": [list(row) for row in table_rows],
            "offsets_in_document": [{"row": 1, "col": 0}],
            "offsets_in_context": None,
            "document_ids": ["123"],
            "meta": {},
        },
        "pipeline_id": None,
        "created_at": "2023-05-02 11:43:56",
        "updated_at": None,
        "meta": {},
        "filters": None,
    }
@pytest.fixture
def table_label_json(samples_path):
    """Parsed contents of the stored table Label JSON sample file."""
    sample_file = samples_path / "schema" / "table_label.json"
    return json.loads(sample_file.read_text())
@pytest.fixture
def text_answer():
return Answer(
@ -59,6 +129,40 @@ def text_answer():
)
@pytest.fixture
def table_answer():
    """An extractive Answer whose context is a pandas DataFrame (table QA)."""
    table_context = pd.DataFrame.from_records(
        [{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]
    )
    return Answer(
        answer="text_2",
        type="extractive",
        score=0.1,
        context=table_context,
        offsets_in_document=[TableCell(row=1, col=0)],
        offsets_in_context=[TableCell(row=1, col=0)],
        document_ids=["123"],
    )
@pytest.fixture
def table_answer_dict():
    """Dict form of the `table_answer` fixture, with the DataFrame context
    flattened to a list of rows (header row first)."""
    serialized_context = [["col1", "col2"], ["text_1", 1], ["text_2", 2]]
    return {
        "answer": "text_2",
        "type": "extractive",
        "score": 0.1,
        "context": serialized_context,
        "offsets_in_document": [{"row": 1, "col": 0}],
        "offsets_in_context": [{"row": 1, "col": 0}],
        "document_ids": ["123"],
        "meta": {},
    }
@pytest.fixture
def table_answer_json(samples_path):
    """Parsed contents of the stored table Answer JSON sample file."""
    sample_file = samples_path / "schema" / "table_answer.json"
    return json.loads(sample_file.read_text())
@pytest.fixture
def table_doc():
data = {
@ -70,6 +174,31 @@ def table_doc():
return Document(content=pd.DataFrame(data), content_type="table", id="doc1")
@pytest.fixture
def table_doc_dict():
    """Dict form of the `table_doc` fixture, with the table content serialized
    as a list of rows (header row first)."""
    header = ["actors", "age", "number of movies", "date of birth"]
    rows = [
        ["brad pitt", 58, 87, "18 december 1963"],
        ["leonardo di caprio", 47, 53, "11 november 1974"],
        ["george clooney", 60, 69, "6 may 1961"],
    ]
    return {
        "content": [header] + rows,
        "content_type": "table",
        "score": None,
        "meta": {},
        "id_hash_keys": ["content"],
        "embedding": None,
        "id": "doc1",
    }
@pytest.fixture
def table_doc_json(samples_path):
    """Raw JSON string of the stored table Document sample file."""
    return (samples_path / "schema" / "table_doc.json").read_text()
@pytest.fixture
def table_doc_with_embedding():
data = {
@ -79,71 +208,132 @@ def table_doc_with_embedding():
"date of birth": ["18 december 1963", "11 november 1974", "6 may 1961"],
}
return Document(
content=pd.DataFrame(data), content_type="table", id="doc2", embedding=np.random.rand(768).astype(np.float32)
content=pd.DataFrame(data), content_type="table", id="doc2", embedding=np.array([1.1, 2.2, 3.3, 4.4])
)
@pytest.fixture
def table_doc_with_embedding_json(samples_path):
    """Raw JSON string of the stored table Document sample that carries an embedding."""
    return (samples_path / "schema" / "table_doc_emb.json").read_text()
@pytest.mark.unit
def test_no_answer_label():
labels = [
Label(
query="question",
answer=Answer(answer=""),
is_correct_answer=True,
is_correct_document=True,
document=Document(content="some", id="777"),
origin="gold-label",
),
Label(
query="question",
answer=Answer(answer=""),
is_correct_answer=True,
is_correct_document=True,
document=Document(content="some", id="777"),
origin="gold-label",
),
Label(
query="question",
answer=Answer(answer="some"),
is_correct_answer=True,
is_correct_document=True,
document=Document(content="some", id="777"),
origin="gold-label",
),
Label(
query="question",
answer=Answer(answer="some"),
is_correct_answer=True,
is_correct_document=True,
document=Document(content="some", id="777"),
origin="gold-label",
),
]
assert labels[0].no_answer == True
assert labels[1].no_answer == True
assert labels[2].no_answer == False
assert labels[3].no_answer == False
label_no_answer = Label(
query="question",
answer=Answer(answer=""),
is_correct_answer=True,
is_correct_document=True,
document=Document(content="some", id="777"),
origin="gold-label",
)
label_with_answer = Label(
query="question",
answer=Answer(answer="some"),
is_correct_answer=True,
is_correct_document=True,
document=Document(content="some", id="777"),
origin="gold-label",
)
assert label_no_answer.no_answer
assert not label_with_answer.no_answer
@pytest.mark.unit
def test_equal_label(text_labels):
    # Labels built from identical fields compare equal; a differing field breaks equality.
    assert text_labels[2] == text_labels[0]
    assert text_labels[1] != text_labels[0]
@pytest.mark.unit
def test_label_to_json(text_labels):
j0 = text_labels[0].to_json()
l_new = Label.from_json(j0)
assert l_new == text_labels[0]
assert l_new.answer.offsets_in_document[0].start == 1
text_label_json = text_labels[0].to_json()
text_label_from_json = Label.from_json(text_label_json)
assert text_label_from_json == text_labels[0]
assert text_label_from_json.answer.offsets_in_document[0].start == 1
@pytest.mark.unit
def test_label_to_dict(text_labels):
j0 = text_labels[0].to_dict()
l_new = Label.from_dict(j0)
assert l_new == text_labels[0]
assert l_new.answer.offsets_in_document[0].start == 1
text_label_dict = text_labels[0].to_dict()
text_label_from_dict = Label.from_dict(text_label_dict)
assert text_label_from_dict == text_labels[0]
assert text_label_from_dict.answer.offsets_in_document[0].start == 1
@pytest.mark.unit
def test_labels_with_identical_fields_are_equal(table_label):
    # Rebuild a Label with the same answer/document/feedback fields as the fixture.
    # NOTE(review): the copy omits the Label id and created_at, and lets the Document
    # re-derive its id from content — equality apparently ignores these fields;
    # confirm against Label.__eq__/__hash__.
    table_label_copy = Label(
        query="some",
        answer=Answer(
            answer="text_2",
            type="extractive",
            score=0.1,
            document_ids=["123"],
            context=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]),
            offsets_in_document=[TableCell(row=1, col=0)],
        ),
        document=Document(
            content=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]),
            content_type="table",
        ),
        is_correct_answer=True,
        is_correct_document=True,
        origin="user-feedback",
    )
    assert table_label == table_label_copy
@pytest.mark.unit
def test_labels_with_different_fields_are_not_equal(table_label):
    # Same table, but a different answer string ("text_1") and a different answer
    # cell (row 0 instead of row 1) — the Labels must not compare equal.
    table_label_different = Label(
        query="some",
        answer=Answer(
            answer="text_1",
            type="extractive",
            score=0.1,
            document_ids=["123"],
            context=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]),
            offsets_in_document=[TableCell(row=0, col=0)],
        ),
        document=Document(
            content=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]),
            content_type="table",
        ),
        is_correct_answer=True,
        is_correct_document=True,
        origin="user-feedback",
    )
    assert table_label != table_label_different
@pytest.mark.unit
def test_table_label_from_json(table_label, table_label_json):
    # Deserializing the stored JSON sample must reproduce the fixture Label.
    table_label_from_json = Label.from_json(table_label_json)
    assert table_label_from_json == table_label
@pytest.mark.unit
def test_table_label_to_json(table_label, table_label_json):
    # Compare parsed JSON structures so key order and whitespace don't matter.
    table_label_to_json = json.loads(table_label.to_json())
    assert table_label_to_json == table_label_json
@pytest.mark.unit
def test_table_label_from_dict(table_label, table_label_dict):
    # from_dict must rebuild the nested Answer/Document and their DataFrames.
    table_label_from_dict = Label.from_dict(table_label_dict)
    assert table_label_from_dict == table_label
@pytest.mark.unit
def test_table_label_to_dict(table_label, table_label_dict):
    # to_dict must serialize the nested DataFrames as lists of rows.
    table_label_to_dict = table_label.to_dict()
    assert table_label_to_dict == table_label_dict
@pytest.mark.unit
def test_answer_to_json(text_answer):
a = text_answer
j = a.to_json()
@ -154,6 +344,7 @@ def test_answer_to_json(text_answer):
assert a_new == a
@pytest.mark.unit
def test_answer_to_dict(text_answer):
a = text_answer
j = a.to_dict()
@ -163,6 +354,29 @@ def test_answer_to_dict(text_answer):
assert a_new == a
@pytest.mark.unit
def test_table_answer_to_json(table_answer, table_answer_json):
    # to_json must serialize the DataFrame context as a list of rows;
    # compare parsed JSON so formatting differences don't matter.
    table_answer_to_json = json.loads(table_answer.to_json())
    assert table_answer_to_json == table_answer_json
@pytest.mark.unit
def test_table_answer_from_json(table_answer, table_answer_json):
    # Deserializing the stored JSON sample must reproduce the fixture Answer.
    table_answer_from_json = Answer.from_json(table_answer_json)
    assert table_answer_from_json == table_answer
@pytest.mark.unit
def test_table_answer_to_dict(table_answer, table_answer_dict):
    # to_dict must flatten the DataFrame context to a list of rows.
    assert table_answer.to_dict() == table_answer_dict
@pytest.mark.unit
def test_table_answer_from_dict(table_answer, table_answer_dict):
    # from_dict must rebuild the DataFrame context from the list-of-rows form.
    assert table_answer == Answer.from_dict(table_answer_dict)
@pytest.mark.unit
def test_document_from_dict():
doc = Document(
content="this is the content of the document", meta={"some": "meta"}, id_hash_keys=["content", "meta"]
@ -170,13 +384,20 @@ def test_document_from_dict():
assert doc == Document.from_dict(doc.to_dict())
def test_table_document_from_dict(table_doc):
assert table_doc == Document.from_dict(table_doc.to_dict())
@pytest.mark.unit
def test_table_document_from_dict(table_doc, table_doc_dict):
    # from_dict must rebuild the DataFrame content from the list-of-rows form.
    assert table_doc == Document.from_dict(table_doc_dict)
@pytest.mark.unit
def test_table_document_to_dict(table_doc, table_doc_dict):
    # to_dict must flatten the DataFrame content to a list of rows.
    assert table_doc.to_dict() == table_doc_dict
@pytest.mark.unit
def test_doc_to_json():
# With embedding
d = Document(
doc_with_embedding = Document(
content="some text",
content_type="text",
id_hash_keys=["meta"],
@ -184,12 +405,12 @@ def test_doc_to_json():
meta={"name": "doc1"},
embedding=np.random.rand(768).astype(np.float32),
)
j0 = d.to_json()
d_new = Document.from_json(j0)
assert d == d_new
doc_emb_json = doc_with_embedding.to_json()
doc_emb_from_json = Document.from_json(doc_emb_json)
assert doc_with_embedding == doc_emb_from_json
# No embedding
d = Document(
doc_with_no_embedding = Document(
content="some text",
content_type="text",
score=0.99988,
@ -197,35 +418,48 @@ def test_doc_to_json():
id_hash_keys=["meta"],
embedding=None,
)
j0 = d.to_json()
d_new = Document.from_json(j0)
assert d == d_new
doc_no_emb_json = doc_with_no_embedding.to_json()
doc_no_emb_from_json = Document.from_json(doc_no_emb_json)
assert doc_with_no_embedding == doc_no_emb_from_json
def test_table_doc_to_json(table_doc, table_doc_with_embedding):
@pytest.mark.unit
def test_table_doc_from_json(table_doc, table_doc_with_embedding, table_doc_json, table_doc_with_embedding_json):
# With embedding
j0 = table_doc_with_embedding.to_json()
d_new = Document.from_json(j0)
assert table_doc_with_embedding == d_new
table_doc_emb_from_json = Document.from_json(table_doc_with_embedding_json)
assert table_doc_with_embedding == table_doc_emb_from_json
# No embedding
j0 = table_doc.to_json()
d_new = Document.from_json(j0)
assert table_doc == d_new
table_doc_no_emb_from_json = Document.from_json(table_doc_json)
assert table_doc == table_doc_no_emb_from_json
@pytest.mark.unit
def test_table_doc_to_json(table_doc, table_doc_with_embedding, table_doc_json, table_doc_with_embedding_json):
    # Compare parsed JSON structures so formatting differences don't matter.
    # With embedding
    table_doc_emb_to_json = json.loads(table_doc_with_embedding.to_json())
    assert json.loads(table_doc_with_embedding_json) == table_doc_emb_to_json
    # No embedding
    table_doc_no_emb_to_json = json.loads(table_doc.to_json())
    assert json.loads(table_doc_json) == table_doc_no_emb_to_json
@pytest.mark.unit
def test_answer_postinit():
    # Construction should default meta to {} and coerce dict offsets into Span objects.
    a = Answer(answer="test", offsets_in_document=[{"start": 10, "end": 20}])
    assert a.meta == {}
    assert isinstance(a.offsets_in_document[0], Span)
@pytest.mark.unit
def test_table_answer_postinit():
a = Answer(answer="test", offsets_in_document=[{"row": 1, "col": 2}])
assert a.meta == {}
assert isinstance(a.offsets_in_document[0], TableCell)
table_answer = Answer(answer="test", offsets_in_document=[{"row": 1, "col": 2}])
assert table_answer.meta == {}
assert isinstance(table_answer.offsets_in_document[0], TableCell)
@pytest.mark.unit
def test_generate_doc_id_using_text():
text1 = "text1"
text2 = "text2"
@ -237,6 +471,7 @@ def test_generate_doc_id_using_text():
assert doc1_text1.id != doc3_text2.id
@pytest.mark.unit
def test_generate_doc_id_using_custom_list():
text1 = "text1"
text2 = "text2"
@ -257,6 +492,7 @@ def test_generate_doc_id_using_custom_list():
_ = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["content", "non_existing_field"])
@pytest.mark.unit
def test_generate_doc_id_custom_list_meta():
text1 = "text1"
text2 = "text2"
@ -280,6 +516,7 @@ def test_generate_doc_id_custom_list_meta():
assert doc1_text1.id != doc2_text2.id
@pytest.mark.unit
def test_aggregate_labels_with_labels():
label1_with_filter1 = Label(
query="question",
@ -314,6 +551,7 @@ def test_aggregate_labels_with_labels():
label = MultiLabel(labels=[label1_with_filter1, label3_with_filter2])
@pytest.mark.unit
def test_multilabel_preserve_order():
labels = [
Label(
@ -369,6 +607,7 @@ def test_multilabel_preserve_order():
assert multilabel.labels[i].id == str(i)
@pytest.mark.unit
def test_multilabel_preserve_order_w_duplicates():
labels = [
Label(
@ -455,6 +694,7 @@ def test_multilabel_preserve_order_w_duplicates():
assert multilabel.labels[i].id == str(i)
@pytest.mark.unit
def test_multilabel_id():
query1 = "question 1"
query2 = "question 2"
@ -495,6 +735,7 @@ def test_multilabel_id():
assert MultiLabel(labels=[label3]).id == "531445fa3bdf98b8598a3bea032bd605"
@pytest.mark.unit
def test_multilabel_with_doc_containing_dataframes():
table = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
table_doc = Document(content=table, content_type="table", id="table1")
@ -521,6 +762,7 @@ def test_multilabel_with_doc_containing_dataframes():
assert multilabel.offsets_in_contexts == [{"row": 0, "col": 0}]
@pytest.mark.unit
def test_multilabel_serialization():
label_dict = {
"id": "011079cf-c93f-49e6-83bb-42cd850dce12",
@ -566,23 +808,27 @@ def test_multilabel_serialization():
assert json_deserialized_multilabel.labels[0] == label
@pytest.mark.unit
def test_span_in():
assert 10 in Span(5, 15)
assert not 20 in Span(1, 15)
assert 20 not in Span(1, 15)
@pytest.mark.unit
def test_span_in_edges():
assert 5 in Span(5, 15)
assert not 15 in Span(5, 15)
assert 15 not in Span(5, 15)
@pytest.mark.unit
def test_span_in_other_values():
assert 10.0 in Span(5, 15)
assert "10" in Span(5, 15)
with pytest.raises(ValueError):
"hello" in Span(5, 15)
assert "hello" in Span(5, 15)
@pytest.mark.unit
def test_assert_span_vs_span():
assert Span(10, 11) in Span(5, 15)
assert Span(5, 10) in Span(5, 15)
@ -595,6 +841,7 @@ def test_assert_span_vs_span():
assert not Span(10, 20) in Span(5, 15)
@pytest.mark.unit
def test_id_hash_keys_not_ignored():
# Test that two documents with the same content but different metadata get assigned different ids if and only if
# id_hash_keys is set to 'meta'
@ -606,6 +853,7 @@ def test_id_hash_keys_not_ignored():
assert doc3.id == doc4.id
@pytest.mark.unit
def test_legacy_answer_document_id():
legacy_label = {
"id": "123",
@ -642,6 +890,7 @@ def test_legacy_answer_document_id():
assert label.answer.document_ids == ["fc18c987a8312e72a47fb1524f230bb0"]
@pytest.mark.unit
def test_legacy_answer_document_id_is_none():
legacy_label = {
"id": "123",
@ -676,3 +925,16 @@ def test_legacy_answer_document_id_is_none():
label = Label.from_dict(legacy_label)
assert label.answer.document_ids is None
@pytest.mark.unit
def test_dict_factory():
    """_dict_factory turns (key, value) pairs into a dict, converting DataFrame
    values into a list of rows (header row first) and leaving other values as-is."""
    table = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
    pairs = [("key1", "some_value"), ("key2", ["val1", "val2"]), ("key3", table)]
    converted = _dict_factory(pairs)
    assert converted == {
        "key1": "some_value",
        "key2": ["val1", "val2"],
        "key3": [["col1", "col2"], [1, 3], [2, 4]],
    }

View File

@ -0,0 +1,35 @@
{
"answer": "text_2",
"type": "extractive",
"score": 0.1,
"context": [
[
"col1",
"col2"
],
[
"text_1",
1
],
[
"text_2",
2
]
],
"offsets_in_document": [
{
"row": 1,
"col": 0
}
],
"offsets_in_context": [
{
"row": 1,
"col": 0
}
],
"document_ids": [
"123"
],
"meta": {}
}

View File

@ -0,0 +1,36 @@
{
"content": [
[
"actors",
"age",
"number of movies",
"date of birth"
],
[
"brad pitt",
58,
87,
"18 december 1963"
],
[
"leonardo di caprio",
47,
53,
"11 november 1974"
],
[
"george clooney",
60,
69,
"6 may 1961"
]
],
"content_type": "table",
"score": null,
"meta": {},
"id_hash_keys": [
"content"
],
"embedding": null,
"id": "doc1"
}

View File

@ -0,0 +1,41 @@
{
"content": [
[
"actors",
"age",
"number of movies",
"date of birth"
],
[
"brad pitt",
58,
87,
"18 december 1963"
],
[
"leonardo di caprio",
47,
53,
"11 november 1974"
],
[
"george clooney",
60,
69,
"6 may 1961"
]
],
"content_type": "table",
"score": null,
"meta": {},
"id_hash_keys": [
"content"
],
"embedding": [
1.1,
2.2,
3.3,
4.4
],
"id": "doc2"
}

View File

@ -0,0 +1,66 @@
{
"id": "fbd79f71-d690-4b21-bd0a-1094292b9809",
"query": "some",
"document": {
"id": "fe5cb68f8226776914781f6bd40ad718",
"content": [
[
"col1",
"col2"
],
[
"text_1",
1
],
[
"text_2",
2
]
],
"content_type": "table",
"meta": {},
"id_hash_keys": [
"content"
],
"score": null,
"embedding": null
},
"is_correct_answer": true,
"is_correct_document": true,
"origin": "user-feedback",
"answer": {
"answer": "text_2",
"type": "extractive",
"score": 0.1,
"context": [
[
"col1",
"col2"
],
[
"text_1",
1
],
[
"text_2",
2
]
],
"offsets_in_document": [
{
"row": 1,
"col": 0
}
],
"offsets_in_context": null,
"document_ids": [
"123"
],
"meta": {}
},
"pipeline_id": null,
"created_at": "2023-05-02 11:43:56",
"updated_at": null,
"meta": {},
"filters": null
}