mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-30 08:37:20 +00:00
refactor: Update schema objects to handle Dataframes in to_{dict,json} and from_{dict,json} (#4747)
* Adding support for table Documents when serializing Labels in Haystack * Fix table label equality test * Add serialization support and __eq__ support for table answers * Made convenience functions for converting dataframes. Added some TODOs. Expanded schema tests for table labels. Updated MultiLabel to not convert DataFrames into strings. * get Answer and Label to_json working with DataFrame * Fix from_dict method of Label * Use Dict and remove unnecessary if check * Using pydantic instead of builtins for type detection * Update haystack/schema.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * Update haystack/schema.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * Update haystack/schema.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * Separated table label equivalency tests and added pytest.mark.unit * Added unit test for _dict_factory * Using more descriptive variable names * Adding json files to test to_json and from_json functions * Added sample files for tests --------- Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
This commit is contained in:
parent
a9ec954c45
commit
a67ca289db
@ -185,7 +185,7 @@ class Document:
|
||||
if k == "content":
|
||||
# Convert pd.DataFrame to list of rows for serialization
|
||||
if self.content_type == "table" and isinstance(self.content, pd.DataFrame):
|
||||
v = [self.content.columns.tolist()] + self.content.values.tolist()
|
||||
v = dataframe_to_list(self.content)
|
||||
k = k if k not in inv_field_map else inv_field_map[k]
|
||||
_doc[k] = v
|
||||
return _doc
|
||||
@ -232,7 +232,7 @@ class Document:
|
||||
|
||||
# Convert list of rows to pd.DataFrame
|
||||
if _new_doc.get("content_type", None) == "table" and isinstance(_new_doc["content"], list):
|
||||
_new_doc["content"] = pd.DataFrame(columns=_new_doc["content"][0], data=_new_doc["content"][1:])
|
||||
_new_doc["content"] = dataframe_from_list(_new_doc["content"])
|
||||
|
||||
return cls(**_new_doc)
|
||||
|
||||
@ -243,11 +243,14 @@ class Document:
|
||||
return json.dumps(dictionary, cls=NumpyEncoder)
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, data: str, field_map: Optional[Dict[str, Any]] = None) -> Document:
|
||||
def from_json(cls, data: Union[str, Dict[str, Any]], field_map: Optional[Dict[str, Any]] = None) -> Document:
|
||||
if not field_map:
|
||||
field_map = {}
|
||||
dictionary = json.loads(data)
|
||||
return cls.from_dict(dictionary, field_map=field_map)
|
||||
if isinstance(data, str):
|
||||
dict_data = json.loads(data)
|
||||
else:
|
||||
dict_data = data
|
||||
return cls.from_dict(dict_data, field_map=field_map)
|
||||
|
||||
def __eq__(self, other):
|
||||
content = getattr(other, "content", None)
|
||||
@ -401,6 +404,10 @@ class Answer:
|
||||
if self.meta is None:
|
||||
self.meta = {}
|
||||
|
||||
# In case the context is a list of lists for a table document that is instantiated by from_json() or from_dict()
|
||||
if isinstance(self.context, list):
|
||||
self.context = dataframe_from_list(self.context)
|
||||
|
||||
def __lt__(self, other):
|
||||
"""Enable sorting of Answers by score"""
|
||||
return self.score < other.score
|
||||
@ -412,29 +419,30 @@ class Answer:
|
||||
return f"<Answer: answer='{self.answer}', score={self.score}, context='{self.context[:50]}{'...' if len(self.context) > 50 else ''}'>"
|
||||
|
||||
def __repr__(self):
|
||||
return f"<Answer {asdict(self)}>"
|
||||
return f"<Answer {self.to_dict()}>"
|
||||
|
||||
def to_dict(self):
|
||||
return asdict(self)
|
||||
def to_dict(self) -> Dict:
|
||||
return asdict(self, dict_factory=_dict_factory)
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, dict: dict):
|
||||
def from_dict(cls, dict: Dict) -> Answer:
|
||||
# backwards compatibility: `document_id: Optional[str]` was changed to `document_ids: Optional[List[str]]`
|
||||
if "document_id" in dict:
|
||||
dict = dict.copy()
|
||||
document_id = dict.pop("document_id")
|
||||
dict["document_ids"] = [document_id] if document_id is not None else None
|
||||
|
||||
return _pydantic_dataclass_from_dict(dict=dict, pydantic_dataclass_type=cls)
|
||||
return cls(**dict)
|
||||
|
||||
def to_json(self):
|
||||
return json.dumps(self, default=pydantic_encoder)
|
||||
return json.dumps(self.to_dict(), cls=NumpyEncoder)
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, data):
|
||||
if type(data) == str:
|
||||
data = json.loads(data)
|
||||
return cls.from_dict(data)
|
||||
def from_json(cls, data: Union[str, Dict[str, Any]]):
|
||||
if isinstance(data, str):
|
||||
dict_data = json.loads(data)
|
||||
else:
|
||||
dict_data = data
|
||||
return cls.from_dict(dict_data)
|
||||
|
||||
@staticmethod
|
||||
def _from_dict_offsets(offsets):
|
||||
@ -449,6 +457,23 @@ class Answer:
|
||||
converted_offsets.append(e)
|
||||
return converted_offsets
|
||||
|
||||
def __eq__(self, other):
|
||||
context = getattr(other, "context", None)
|
||||
if isinstance(context, pd.DataFrame):
|
||||
is_content_equal = context.equals(self.context)
|
||||
else:
|
||||
is_content_equal = context == self.context
|
||||
return (
|
||||
isinstance(other, self.__class__)
|
||||
and is_content_equal
|
||||
and getattr(other, "type", None) == self.type
|
||||
and getattr(other, "score", None) == self.score
|
||||
and getattr(other, "offsets_in_document", None) == self.offsets_in_document
|
||||
and getattr(other, "offsets_in_context", None) == self.offsets_in_context
|
||||
and getattr(other, "document_ids", None) == self.document_ids
|
||||
and getattr(other, "meta", None) == self.meta
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Label:
|
||||
@ -521,11 +546,7 @@ class Label:
|
||||
self.updated_at = updated_at
|
||||
self.query = query
|
||||
|
||||
if isinstance(answer, dict):
|
||||
answer = Answer.from_dict(answer)
|
||||
self.answer = answer
|
||||
if isinstance(document, dict):
|
||||
document = Document.from_dict(document)
|
||||
self.document = document
|
||||
|
||||
self.is_correct_answer = is_correct_answer
|
||||
@ -549,25 +570,28 @@ class Label:
|
||||
return no_answer
|
||||
|
||||
def to_dict(self):
|
||||
return asdict(self)
|
||||
return asdict(self, dict_factory=_dict_factory)
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, dict: dict):
|
||||
# backward compatibility for old labels using answers with document_id instead of document_ids
|
||||
def from_dict(cls, dict: Dict):
|
||||
answer = dict.get("answer")
|
||||
if answer and "document_id" in answer:
|
||||
dict = dict.copy()
|
||||
if answer and isinstance(answer, Dict):
|
||||
dict["answer"] = Answer.from_dict(dict["answer"])
|
||||
return _pydantic_dataclass_from_dict(dict=dict, pydantic_dataclass_type=cls)
|
||||
doc = dict.get("document")
|
||||
if isinstance(doc, Dict):
|
||||
dict["document"] = Document.from_dict(dict["document"])
|
||||
return cls(**dict)
|
||||
|
||||
def to_json(self):
|
||||
return json.dumps(self, default=pydantic_encoder)
|
||||
return json.dumps(self.to_dict(), cls=NumpyEncoder)
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, data):
|
||||
if type(data) == str:
|
||||
data = json.loads(data)
|
||||
return cls.from_dict(data)
|
||||
def from_json(cls, data: Union[str, Dict[str, Any]]):
|
||||
if isinstance(data, str):
|
||||
dict_data = json.loads(data)
|
||||
else:
|
||||
dict_data = data
|
||||
return cls.from_dict(dict_data)
|
||||
|
||||
# define __eq__ and __hash__ functions to deduplicate Label Objects
|
||||
def __eq__(self, other):
|
||||
@ -732,7 +756,7 @@ class MultiLabel:
|
||||
return {k[1:] if k[0] == "_" else k: v for k, v in vars(self).items()}
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, dict: dict):
|
||||
def from_dict(cls, dict: Dict):
|
||||
# exclude extra arguments
|
||||
return cls(**{k: v for k, v in dict.items() if k in inspect.signature(cls).parameters})
|
||||
|
||||
@ -741,7 +765,7 @@ class MultiLabel:
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, data: Union[str, Dict[str, Any]]):
|
||||
if type(data) == str:
|
||||
if isinstance(data, str):
|
||||
dict_data = json.loads(data)
|
||||
else:
|
||||
dict_data = data
|
||||
@ -758,7 +782,7 @@ class MultiLabel:
|
||||
return f"<MultiLabel: {self.to_dict()}>"
|
||||
|
||||
|
||||
def _pydantic_dataclass_from_dict(dict: dict, pydantic_dataclass_type) -> Any:
|
||||
def _pydantic_dataclass_from_dict(dict: Dict, pydantic_dataclass_type) -> Any:
|
||||
"""
|
||||
Constructs a pydantic dataclass from a dict incl. other nested dataclasses.
|
||||
This allows simple de-serialization of pydantic dataclasses from json.
|
||||
@ -777,6 +801,21 @@ def _pydantic_dataclass_from_dict(dict: dict, pydantic_dataclass_type) -> Any:
|
||||
return dataclass_object
|
||||
|
||||
|
||||
def _dict_factory(data):
|
||||
"""Meant to be as the dict_factory for `asdict`. This function is called within `asdict` to convert a list of tuples
|
||||
into a dictionary object. This handles the conversion of pandas Dataframes into a list of lists.
|
||||
|
||||
:param data: list of (key, value) pairs
|
||||
"""
|
||||
|
||||
def convert_value(v):
|
||||
if isinstance(v, pd.DataFrame):
|
||||
return dataframe_to_list(v)
|
||||
return v
|
||||
|
||||
return {k: convert_value(v) for k, v in data}
|
||||
|
||||
|
||||
class NumpyEncoder(json.JSONEncoder):
|
||||
def default(self, obj):
|
||||
if isinstance(obj, np.ndarray):
|
||||
@ -784,6 +823,14 @@ class NumpyEncoder(json.JSONEncoder):
|
||||
return json.JSONEncoder.default(self, obj)
|
||||
|
||||
|
||||
def dataframe_to_list(df: pd.DataFrame) -> List[List]:
|
||||
return [df.columns.tolist()] + df.values.tolist()
|
||||
|
||||
|
||||
def dataframe_from_list(list_df: List[List]) -> pd.DataFrame:
|
||||
return pd.DataFrame(columns=list_df[0], data=list_df[1:])
|
||||
|
||||
|
||||
class EvaluationResult:
|
||||
def __init__(self, node_results: Optional[Dict[str, pd.DataFrame]] = None) -> None:
|
||||
"""
|
||||
|
||||
@ -1,4 +1,6 @@
|
||||
from haystack.schema import Document, Label, Answer, Span, MultiLabel, TableCell
|
||||
import json
|
||||
|
||||
from haystack.schema import Document, Label, Answer, Span, MultiLabel, TableCell, _dict_factory
|
||||
import pytest
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
@ -46,6 +48,74 @@ def text_labels():
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def table_label():
|
||||
return Label(
|
||||
query="some",
|
||||
answer=Answer(
|
||||
answer="text_2",
|
||||
type="extractive",
|
||||
score=0.1,
|
||||
document_ids=["123"],
|
||||
context=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]),
|
||||
offsets_in_document=[TableCell(row=1, col=0)],
|
||||
),
|
||||
document=Document(
|
||||
content=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]),
|
||||
content_type="table",
|
||||
id="fe5cb68f8226776914781f6bd40ad718",
|
||||
),
|
||||
is_correct_answer=True,
|
||||
is_correct_document=True,
|
||||
origin="user-feedback",
|
||||
created_at="2023-05-02 11:43:56",
|
||||
updated_at=None,
|
||||
id="fbd79f71-d690-4b21-bd0a-1094292b9809",
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def table_label_dict():
|
||||
return {
|
||||
"id": "fbd79f71-d690-4b21-bd0a-1094292b9809",
|
||||
"query": "some",
|
||||
"document": {
|
||||
"id": "fe5cb68f8226776914781f6bd40ad718",
|
||||
"content": [["col1", "col2"], ["text_1", 1], ["text_2", 2]],
|
||||
"content_type": "table",
|
||||
"meta": {},
|
||||
"id_hash_keys": ["content"],
|
||||
"score": None,
|
||||
"embedding": None,
|
||||
},
|
||||
"is_correct_answer": True,
|
||||
"is_correct_document": True,
|
||||
"origin": "user-feedback",
|
||||
"answer": {
|
||||
"answer": "text_2",
|
||||
"type": "extractive",
|
||||
"score": 0.1,
|
||||
"context": [["col1", "col2"], ["text_1", 1], ["text_2", 2]],
|
||||
"offsets_in_document": [{"row": 1, "col": 0}],
|
||||
"offsets_in_context": None,
|
||||
"document_ids": ["123"],
|
||||
"meta": {},
|
||||
},
|
||||
"pipeline_id": None,
|
||||
"created_at": "2023-05-02 11:43:56",
|
||||
"updated_at": None,
|
||||
"meta": {},
|
||||
"filters": None,
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def table_label_json(samples_path):
|
||||
with open(samples_path / "schema" / "table_label.json") as f1:
|
||||
data = json.load(f1)
|
||||
return data
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def text_answer():
|
||||
return Answer(
|
||||
@ -59,6 +129,40 @@ def text_answer():
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def table_answer():
|
||||
return Answer(
|
||||
answer="text_2",
|
||||
type="extractive",
|
||||
score=0.1,
|
||||
context=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]),
|
||||
offsets_in_document=[TableCell(row=1, col=0)],
|
||||
offsets_in_context=[TableCell(row=1, col=0)],
|
||||
document_ids=["123"],
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def table_answer_dict():
|
||||
return {
|
||||
"answer": "text_2",
|
||||
"type": "extractive",
|
||||
"score": 0.1,
|
||||
"context": [["col1", "col2"], ["text_1", 1], ["text_2", 2]],
|
||||
"offsets_in_document": [{"row": 1, "col": 0}],
|
||||
"offsets_in_context": [{"row": 1, "col": 0}],
|
||||
"document_ids": ["123"],
|
||||
"meta": {},
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def table_answer_json(samples_path):
|
||||
with open(samples_path / "schema" / "table_answer.json") as f1:
|
||||
data = json.load(f1)
|
||||
return data
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def table_doc():
|
||||
data = {
|
||||
@ -70,6 +174,31 @@ def table_doc():
|
||||
return Document(content=pd.DataFrame(data), content_type="table", id="doc1")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def table_doc_dict():
|
||||
return {
|
||||
"content": [
|
||||
["actors", "age", "number of movies", "date of birth"],
|
||||
["brad pitt", 58, 87, "18 december 1963"],
|
||||
["leonardo di caprio", 47, 53, "11 november 1974"],
|
||||
["george clooney", 60, 69, "6 may 1961"],
|
||||
],
|
||||
"content_type": "table",
|
||||
"score": None,
|
||||
"meta": {},
|
||||
"id_hash_keys": ["content"],
|
||||
"embedding": None,
|
||||
"id": "doc1",
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def table_doc_json(samples_path):
|
||||
with open(samples_path / "schema" / "table_doc.json") as f1:
|
||||
json_str = f1.read()
|
||||
return json_str
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def table_doc_with_embedding():
|
||||
data = {
|
||||
@ -79,71 +208,132 @@ def table_doc_with_embedding():
|
||||
"date of birth": ["18 december 1963", "11 november 1974", "6 may 1961"],
|
||||
}
|
||||
return Document(
|
||||
content=pd.DataFrame(data), content_type="table", id="doc2", embedding=np.random.rand(768).astype(np.float32)
|
||||
content=pd.DataFrame(data), content_type="table", id="doc2", embedding=np.array([1.1, 2.2, 3.3, 4.4])
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def table_doc_with_embedding_json(samples_path):
|
||||
with open(samples_path / "schema" / "table_doc_emb.json") as f1:
|
||||
json_str = f1.read()
|
||||
return json_str
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_no_answer_label():
|
||||
labels = [
|
||||
Label(
|
||||
query="question",
|
||||
answer=Answer(answer=""),
|
||||
is_correct_answer=True,
|
||||
is_correct_document=True,
|
||||
document=Document(content="some", id="777"),
|
||||
origin="gold-label",
|
||||
),
|
||||
Label(
|
||||
query="question",
|
||||
answer=Answer(answer=""),
|
||||
is_correct_answer=True,
|
||||
is_correct_document=True,
|
||||
document=Document(content="some", id="777"),
|
||||
origin="gold-label",
|
||||
),
|
||||
Label(
|
||||
query="question",
|
||||
answer=Answer(answer="some"),
|
||||
is_correct_answer=True,
|
||||
is_correct_document=True,
|
||||
document=Document(content="some", id="777"),
|
||||
origin="gold-label",
|
||||
),
|
||||
Label(
|
||||
query="question",
|
||||
answer=Answer(answer="some"),
|
||||
is_correct_answer=True,
|
||||
is_correct_document=True,
|
||||
document=Document(content="some", id="777"),
|
||||
origin="gold-label",
|
||||
),
|
||||
]
|
||||
|
||||
assert labels[0].no_answer == True
|
||||
assert labels[1].no_answer == True
|
||||
assert labels[2].no_answer == False
|
||||
assert labels[3].no_answer == False
|
||||
label_no_answer = Label(
|
||||
query="question",
|
||||
answer=Answer(answer=""),
|
||||
is_correct_answer=True,
|
||||
is_correct_document=True,
|
||||
document=Document(content="some", id="777"),
|
||||
origin="gold-label",
|
||||
)
|
||||
label_with_answer = Label(
|
||||
query="question",
|
||||
answer=Answer(answer="some"),
|
||||
is_correct_answer=True,
|
||||
is_correct_document=True,
|
||||
document=Document(content="some", id="777"),
|
||||
origin="gold-label",
|
||||
)
|
||||
assert label_no_answer.no_answer
|
||||
assert not label_with_answer.no_answer
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_equal_label(text_labels):
|
||||
assert text_labels[2] == text_labels[0]
|
||||
assert text_labels[1] != text_labels[0]
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_label_to_json(text_labels):
|
||||
j0 = text_labels[0].to_json()
|
||||
l_new = Label.from_json(j0)
|
||||
assert l_new == text_labels[0]
|
||||
assert l_new.answer.offsets_in_document[0].start == 1
|
||||
text_label_json = text_labels[0].to_json()
|
||||
text_label_from_json = Label.from_json(text_label_json)
|
||||
assert text_label_from_json == text_labels[0]
|
||||
assert text_label_from_json.answer.offsets_in_document[0].start == 1
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_label_to_dict(text_labels):
|
||||
j0 = text_labels[0].to_dict()
|
||||
l_new = Label.from_dict(j0)
|
||||
assert l_new == text_labels[0]
|
||||
assert l_new.answer.offsets_in_document[0].start == 1
|
||||
text_label_dict = text_labels[0].to_dict()
|
||||
text_label_from_dict = Label.from_dict(text_label_dict)
|
||||
assert text_label_from_dict == text_labels[0]
|
||||
assert text_label_from_dict.answer.offsets_in_document[0].start == 1
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_labels_with_identical_fields_are_equal(table_label):
|
||||
table_label_copy = Label(
|
||||
query="some",
|
||||
answer=Answer(
|
||||
answer="text_2",
|
||||
type="extractive",
|
||||
score=0.1,
|
||||
document_ids=["123"],
|
||||
context=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]),
|
||||
offsets_in_document=[TableCell(row=1, col=0)],
|
||||
),
|
||||
document=Document(
|
||||
content=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]),
|
||||
content_type="table",
|
||||
),
|
||||
is_correct_answer=True,
|
||||
is_correct_document=True,
|
||||
origin="user-feedback",
|
||||
)
|
||||
assert table_label == table_label_copy
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_labels_with_different_fields_are_not_equal(table_label):
|
||||
table_label_different = Label(
|
||||
query="some",
|
||||
answer=Answer(
|
||||
answer="text_1",
|
||||
type="extractive",
|
||||
score=0.1,
|
||||
document_ids=["123"],
|
||||
context=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]),
|
||||
offsets_in_document=[TableCell(row=0, col=0)],
|
||||
),
|
||||
document=Document(
|
||||
content=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]),
|
||||
content_type="table",
|
||||
),
|
||||
is_correct_answer=True,
|
||||
is_correct_document=True,
|
||||
origin="user-feedback",
|
||||
)
|
||||
assert table_label != table_label_different
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_table_label_from_json(table_label, table_label_json):
|
||||
table_label_from_json = Label.from_json(table_label_json)
|
||||
assert table_label_from_json == table_label
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_table_label_to_json(table_label, table_label_json):
|
||||
table_label_to_json = json.loads(table_label.to_json())
|
||||
assert table_label_to_json == table_label_json
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_table_label_from_dict(table_label, table_label_dict):
|
||||
table_label_from_dict = Label.from_dict(table_label_dict)
|
||||
assert table_label_from_dict == table_label
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_table_label_to_dict(table_label, table_label_dict):
|
||||
table_label_to_dict = table_label.to_dict()
|
||||
assert table_label_to_dict == table_label_dict
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_answer_to_json(text_answer):
|
||||
a = text_answer
|
||||
j = a.to_json()
|
||||
@ -154,6 +344,7 @@ def test_answer_to_json(text_answer):
|
||||
assert a_new == a
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_answer_to_dict(text_answer):
|
||||
a = text_answer
|
||||
j = a.to_dict()
|
||||
@ -163,6 +354,29 @@ def test_answer_to_dict(text_answer):
|
||||
assert a_new == a
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_table_answer_to_json(table_answer, table_answer_json):
|
||||
table_answer_to_json = json.loads(table_answer.to_json())
|
||||
assert table_answer_to_json == table_answer_json
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_table_answer_from_json(table_answer, table_answer_json):
|
||||
table_answer_from_json = Answer.from_json(table_answer_json)
|
||||
assert table_answer_from_json == table_answer
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_table_answer_to_dict(table_answer, table_answer_dict):
|
||||
assert table_answer.to_dict() == table_answer_dict
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_table_answer_from_dict(table_answer, table_answer_dict):
|
||||
assert table_answer == Answer.from_dict(table_answer_dict)
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_document_from_dict():
|
||||
doc = Document(
|
||||
content="this is the content of the document", meta={"some": "meta"}, id_hash_keys=["content", "meta"]
|
||||
@ -170,13 +384,20 @@ def test_document_from_dict():
|
||||
assert doc == Document.from_dict(doc.to_dict())
|
||||
|
||||
|
||||
def test_table_document_from_dict(table_doc):
|
||||
assert table_doc == Document.from_dict(table_doc.to_dict())
|
||||
@pytest.mark.unit
|
||||
def test_table_document_from_dict(table_doc, table_doc_dict):
|
||||
assert table_doc == Document.from_dict(table_doc_dict)
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_table_document_to_dict(table_doc, table_doc_dict):
|
||||
assert table_doc.to_dict() == table_doc_dict
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_doc_to_json():
|
||||
# With embedding
|
||||
d = Document(
|
||||
doc_with_embedding = Document(
|
||||
content="some text",
|
||||
content_type="text",
|
||||
id_hash_keys=["meta"],
|
||||
@ -184,12 +405,12 @@ def test_doc_to_json():
|
||||
meta={"name": "doc1"},
|
||||
embedding=np.random.rand(768).astype(np.float32),
|
||||
)
|
||||
j0 = d.to_json()
|
||||
d_new = Document.from_json(j0)
|
||||
assert d == d_new
|
||||
doc_emb_json = doc_with_embedding.to_json()
|
||||
doc_emb_from_json = Document.from_json(doc_emb_json)
|
||||
assert doc_with_embedding == doc_emb_from_json
|
||||
|
||||
# No embedding
|
||||
d = Document(
|
||||
doc_with_no_embedding = Document(
|
||||
content="some text",
|
||||
content_type="text",
|
||||
score=0.99988,
|
||||
@ -197,35 +418,48 @@ def test_doc_to_json():
|
||||
id_hash_keys=["meta"],
|
||||
embedding=None,
|
||||
)
|
||||
j0 = d.to_json()
|
||||
d_new = Document.from_json(j0)
|
||||
assert d == d_new
|
||||
doc_no_emb_json = doc_with_no_embedding.to_json()
|
||||
doc_no_emb_from_json = Document.from_json(doc_no_emb_json)
|
||||
assert doc_with_no_embedding == doc_no_emb_from_json
|
||||
|
||||
|
||||
def test_table_doc_to_json(table_doc, table_doc_with_embedding):
|
||||
@pytest.mark.unit
|
||||
def test_table_doc_from_json(table_doc, table_doc_with_embedding, table_doc_json, table_doc_with_embedding_json):
|
||||
# With embedding
|
||||
j0 = table_doc_with_embedding.to_json()
|
||||
d_new = Document.from_json(j0)
|
||||
assert table_doc_with_embedding == d_new
|
||||
table_doc_emb_from_json = Document.from_json(table_doc_with_embedding_json)
|
||||
assert table_doc_with_embedding == table_doc_emb_from_json
|
||||
|
||||
# No embedding
|
||||
j0 = table_doc.to_json()
|
||||
d_new = Document.from_json(j0)
|
||||
assert table_doc == d_new
|
||||
table_doc_no_emb_from_json = Document.from_json(table_doc_json)
|
||||
assert table_doc == table_doc_no_emb_from_json
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_table_doc_to_json(table_doc, table_doc_with_embedding, table_doc_json, table_doc_with_embedding_json):
|
||||
# With embedding
|
||||
table_doc_emb_to_json = json.loads(table_doc_with_embedding.to_json())
|
||||
assert json.loads(table_doc_with_embedding_json) == table_doc_emb_to_json
|
||||
|
||||
# No embedding
|
||||
table_doc_no_emb_to_json = json.loads(table_doc.to_json())
|
||||
assert json.loads(table_doc_json) == table_doc_no_emb_to_json
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_answer_postinit():
|
||||
a = Answer(answer="test", offsets_in_document=[{"start": 10, "end": 20}])
|
||||
assert a.meta == {}
|
||||
assert isinstance(a.offsets_in_document[0], Span)
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_table_answer_postinit():
|
||||
a = Answer(answer="test", offsets_in_document=[{"row": 1, "col": 2}])
|
||||
assert a.meta == {}
|
||||
assert isinstance(a.offsets_in_document[0], TableCell)
|
||||
table_answer = Answer(answer="test", offsets_in_document=[{"row": 1, "col": 2}])
|
||||
assert table_answer.meta == {}
|
||||
assert isinstance(table_answer.offsets_in_document[0], TableCell)
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_generate_doc_id_using_text():
|
||||
text1 = "text1"
|
||||
text2 = "text2"
|
||||
@ -237,6 +471,7 @@ def test_generate_doc_id_using_text():
|
||||
assert doc1_text1.id != doc3_text2.id
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_generate_doc_id_using_custom_list():
|
||||
text1 = "text1"
|
||||
text2 = "text2"
|
||||
@ -257,6 +492,7 @@ def test_generate_doc_id_using_custom_list():
|
||||
_ = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["content", "non_existing_field"])
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_generate_doc_id_custom_list_meta():
|
||||
text1 = "text1"
|
||||
text2 = "text2"
|
||||
@ -280,6 +516,7 @@ def test_generate_doc_id_custom_list_meta():
|
||||
assert doc1_text1.id != doc2_text2.id
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_aggregate_labels_with_labels():
|
||||
label1_with_filter1 = Label(
|
||||
query="question",
|
||||
@ -314,6 +551,7 @@ def test_aggregate_labels_with_labels():
|
||||
label = MultiLabel(labels=[label1_with_filter1, label3_with_filter2])
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_multilabel_preserve_order():
|
||||
labels = [
|
||||
Label(
|
||||
@ -369,6 +607,7 @@ def test_multilabel_preserve_order():
|
||||
assert multilabel.labels[i].id == str(i)
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_multilabel_preserve_order_w_duplicates():
|
||||
labels = [
|
||||
Label(
|
||||
@ -455,6 +694,7 @@ def test_multilabel_preserve_order_w_duplicates():
|
||||
assert multilabel.labels[i].id == str(i)
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_multilabel_id():
|
||||
query1 = "question 1"
|
||||
query2 = "question 2"
|
||||
@ -495,6 +735,7 @@ def test_multilabel_id():
|
||||
assert MultiLabel(labels=[label3]).id == "531445fa3bdf98b8598a3bea032bd605"
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_multilabel_with_doc_containing_dataframes():
|
||||
table = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
|
||||
table_doc = Document(content=table, content_type="table", id="table1")
|
||||
@ -521,6 +762,7 @@ def test_multilabel_with_doc_containing_dataframes():
|
||||
assert multilabel.offsets_in_contexts == [{"row": 0, "col": 0}]
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_multilabel_serialization():
|
||||
label_dict = {
|
||||
"id": "011079cf-c93f-49e6-83bb-42cd850dce12",
|
||||
@ -566,23 +808,27 @@ def test_multilabel_serialization():
|
||||
assert json_deserialized_multilabel.labels[0] == label
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_span_in():
|
||||
assert 10 in Span(5, 15)
|
||||
assert not 20 in Span(1, 15)
|
||||
assert 20 not in Span(1, 15)
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_span_in_edges():
|
||||
assert 5 in Span(5, 15)
|
||||
assert not 15 in Span(5, 15)
|
||||
assert 15 not in Span(5, 15)
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_span_in_other_values():
|
||||
assert 10.0 in Span(5, 15)
|
||||
assert "10" in Span(5, 15)
|
||||
with pytest.raises(ValueError):
|
||||
"hello" in Span(5, 15)
|
||||
assert "hello" in Span(5, 15)
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_assert_span_vs_span():
|
||||
assert Span(10, 11) in Span(5, 15)
|
||||
assert Span(5, 10) in Span(5, 15)
|
||||
@ -595,6 +841,7 @@ def test_assert_span_vs_span():
|
||||
assert not Span(10, 20) in Span(5, 15)
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_id_hash_keys_not_ignored():
|
||||
# Test that two documents with the same content but different metadata get assigned different ids if and only if
|
||||
# id_hash_keys is set to 'meta'
|
||||
@ -606,6 +853,7 @@ def test_id_hash_keys_not_ignored():
|
||||
assert doc3.id == doc4.id
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_legacy_answer_document_id():
|
||||
legacy_label = {
|
||||
"id": "123",
|
||||
@ -642,6 +890,7 @@ def test_legacy_answer_document_id():
|
||||
assert label.answer.document_ids == ["fc18c987a8312e72a47fb1524f230bb0"]
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_legacy_answer_document_id_is_none():
|
||||
legacy_label = {
|
||||
"id": "123",
|
||||
@ -676,3 +925,16 @@ def test_legacy_answer_document_id_is_none():
|
||||
|
||||
label = Label.from_dict(legacy_label)
|
||||
assert label.answer.document_ids is None
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_dict_factory():
|
||||
data = [
|
||||
("key1", "some_value"),
|
||||
("key2", ["val1", "val2"]),
|
||||
("key3", pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})),
|
||||
]
|
||||
result = _dict_factory(data)
|
||||
assert result["key1"] == "some_value"
|
||||
assert result["key2"] == ["val1", "val2"]
|
||||
assert result["key3"] == [["col1", "col2"], [1, 3], [2, 4]]
|
||||
|
||||
35
test/samples/schema/table_answer.json
Normal file
35
test/samples/schema/table_answer.json
Normal file
@ -0,0 +1,35 @@
|
||||
{
|
||||
"answer": "text_2",
|
||||
"type": "extractive",
|
||||
"score": 0.1,
|
||||
"context": [
|
||||
[
|
||||
"col1",
|
||||
"col2"
|
||||
],
|
||||
[
|
||||
"text_1",
|
||||
1
|
||||
],
|
||||
[
|
||||
"text_2",
|
||||
2
|
||||
]
|
||||
],
|
||||
"offsets_in_document": [
|
||||
{
|
||||
"row": 1,
|
||||
"col": 0
|
||||
}
|
||||
],
|
||||
"offsets_in_context": [
|
||||
{
|
||||
"row": 1,
|
||||
"col": 0
|
||||
}
|
||||
],
|
||||
"document_ids": [
|
||||
"123"
|
||||
],
|
||||
"meta": {}
|
||||
}
|
||||
36
test/samples/schema/table_doc.json
Normal file
36
test/samples/schema/table_doc.json
Normal file
@ -0,0 +1,36 @@
|
||||
{
|
||||
"content": [
|
||||
[
|
||||
"actors",
|
||||
"age",
|
||||
"number of movies",
|
||||
"date of birth"
|
||||
],
|
||||
[
|
||||
"brad pitt",
|
||||
58,
|
||||
87,
|
||||
"18 december 1963"
|
||||
],
|
||||
[
|
||||
"leonardo di caprio",
|
||||
47,
|
||||
53,
|
||||
"11 november 1974"
|
||||
],
|
||||
[
|
||||
"george clooney",
|
||||
60,
|
||||
69,
|
||||
"6 may 1961"
|
||||
]
|
||||
],
|
||||
"content_type": "table",
|
||||
"score": null,
|
||||
"meta": {},
|
||||
"id_hash_keys": [
|
||||
"content"
|
||||
],
|
||||
"embedding": null,
|
||||
"id": "doc1"
|
||||
}
|
||||
41
test/samples/schema/table_doc_emb.json
Normal file
41
test/samples/schema/table_doc_emb.json
Normal file
@ -0,0 +1,41 @@
|
||||
{
|
||||
"content": [
|
||||
[
|
||||
"actors",
|
||||
"age",
|
||||
"number of movies",
|
||||
"date of birth"
|
||||
],
|
||||
[
|
||||
"brad pitt",
|
||||
58,
|
||||
87,
|
||||
"18 december 1963"
|
||||
],
|
||||
[
|
||||
"leonardo di caprio",
|
||||
47,
|
||||
53,
|
||||
"11 november 1974"
|
||||
],
|
||||
[
|
||||
"george clooney",
|
||||
60,
|
||||
69,
|
||||
"6 may 1961"
|
||||
]
|
||||
],
|
||||
"content_type": "table",
|
||||
"score": null,
|
||||
"meta": {},
|
||||
"id_hash_keys": [
|
||||
"content"
|
||||
],
|
||||
"embedding": [
|
||||
1.1,
|
||||
2.2,
|
||||
3.3,
|
||||
4.4
|
||||
],
|
||||
"id": "doc2"
|
||||
}
|
||||
66
test/samples/schema/table_label.json
Normal file
66
test/samples/schema/table_label.json
Normal file
@ -0,0 +1,66 @@
|
||||
{
|
||||
"id": "fbd79f71-d690-4b21-bd0a-1094292b9809",
|
||||
"query": "some",
|
||||
"document": {
|
||||
"id": "fe5cb68f8226776914781f6bd40ad718",
|
||||
"content": [
|
||||
[
|
||||
"col1",
|
||||
"col2"
|
||||
],
|
||||
[
|
||||
"text_1",
|
||||
1
|
||||
],
|
||||
[
|
||||
"text_2",
|
||||
2
|
||||
]
|
||||
],
|
||||
"content_type": "table",
|
||||
"meta": {},
|
||||
"id_hash_keys": [
|
||||
"content"
|
||||
],
|
||||
"score": null,
|
||||
"embedding": null
|
||||
},
|
||||
"is_correct_answer": true,
|
||||
"is_correct_document": true,
|
||||
"origin": "user-feedback",
|
||||
"answer": {
|
||||
"answer": "text_2",
|
||||
"type": "extractive",
|
||||
"score": 0.1,
|
||||
"context": [
|
||||
[
|
||||
"col1",
|
||||
"col2"
|
||||
],
|
||||
[
|
||||
"text_1",
|
||||
1
|
||||
],
|
||||
[
|
||||
"text_2",
|
||||
2
|
||||
]
|
||||
],
|
||||
"offsets_in_document": [
|
||||
{
|
||||
"row": 1,
|
||||
"col": 0
|
||||
}
|
||||
],
|
||||
"offsets_in_context": null,
|
||||
"document_ids": [
|
||||
"123"
|
||||
],
|
||||
"meta": {}
|
||||
},
|
||||
"pipeline_id": null,
|
||||
"created_at": "2023-05-02 11:43:56",
|
||||
"updated_at": null,
|
||||
"meta": {},
|
||||
"filters": null
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user