From a67ca289dbfe9eca508b4e54c64f1c099189bb3f Mon Sep 17 00:00:00 2001 From: Sebastian Date: Wed, 3 May 2023 09:42:07 +0200 Subject: [PATCH] refactor: Update schema objects to handle Dataframes in to_{dict,json} and from_{dict,json} (#4747) * Adding support for table Documents when serializing Labels in Haystack * Fix table label equality test * Add serialization support and __eq__ support for table answers * Made convenience functions for converting dataframes. Added some TODOs. Epxanded schema tests for table labels. Updated Multilabel to not convert Dataframes into strings. * get Answer and Label to_json working with DataFrame * Fix from_dict method of Label * Use Dict and remove unneccessary if check * Using pydantic instead of builtins for type detection * Update haystack/schema.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * Update haystack/schema.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * Update haystack/schema.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * Separated table label equivalency tests and added pytest.mark.unit * Added unit test for _dict_factory * Using more descriptive variable names * Adding json files to test to_json and from_json functions * Added sample files for tests --------- Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> --- haystack/schema.py | 115 ++++--- test/others/test_schema.py | 406 ++++++++++++++++++++----- test/samples/schema/table_answer.json | 35 +++ test/samples/schema/table_doc.json | 36 +++ test/samples/schema/table_doc_emb.json | 41 +++ test/samples/schema/table_label.json | 66 ++++ 6 files changed, 593 insertions(+), 106 deletions(-) create mode 100644 test/samples/schema/table_answer.json create mode 100644 test/samples/schema/table_doc.json create mode 100644 test/samples/schema/table_doc_emb.json create mode 100644 test/samples/schema/table_label.json diff --git a/haystack/schema.py b/haystack/schema.py index 46b198021..ce192a849 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -185,7 +185,7 @@ class Document: if k == "content": # Convert pd.DataFrame to list of rows for serialization if self.content_type == "table" and isinstance(self.content, pd.DataFrame): - v = [self.content.columns.tolist()] + self.content.values.tolist() + v = dataframe_to_list(self.content) k = k if k not in inv_field_map else inv_field_map[k] _doc[k] = v return _doc @@ -232,7 +232,7 @@ class Document: # Convert list of rows to pd.DataFrame if _new_doc.get("content_type", None) == "table" and isinstance(_new_doc["content"], list): - _new_doc["content"] = pd.DataFrame(columns=_new_doc["content"][0], data=_new_doc["content"][1:]) + _new_doc["content"] = dataframe_from_list(_new_doc["content"]) return cls(**_new_doc) @@ -243,11 +243,14 @@ class Document: return json.dumps(dictionary, cls=NumpyEncoder) @classmethod - def from_json(cls, data: str, field_map: Optional[Dict[str, Any]] = None) -> Document: + def from_json(cls, data: Union[str, Dict[str, Any]], field_map: Optional[Dict[str, Any]] = None) -> Document: if not field_map: field_map = {} - dictionary = json.loads(data) - return cls.from_dict(dictionary, field_map=field_map) + if isinstance(data, str): + dict_data = json.loads(data) + else: + dict_data = data + return cls.from_dict(dict_data, field_map=field_map) def __eq__(self, other): content = getattr(other, "content", None) @@ -401,6 +404,10 @@ class Answer: if self.meta is None: self.meta = {} + # In case the 
context is a list of lists for a table document that is instantiated by from_json() or from_dict() + if isinstance(self.context, list): + self.context = dataframe_from_list(self.context) + def __lt__(self, other): """Enable sorting of Answers by score""" return self.score < other.score @@ -412,29 +419,30 @@ class Answer: return f" 50 else ''}'>" def __repr__(self): - return f"" + return f"" - def to_dict(self): - return asdict(self) + def to_dict(self) -> Dict: + return asdict(self, dict_factory=_dict_factory) @classmethod - def from_dict(cls, dict: dict): + def from_dict(cls, dict: Dict) -> Answer: # backwards compatibility: `document_id: Optional[str]` was changed to `document_ids: Optional[List[str]]` if "document_id" in dict: dict = dict.copy() document_id = dict.pop("document_id") dict["document_ids"] = [document_id] if document_id is not None else None - - return _pydantic_dataclass_from_dict(dict=dict, pydantic_dataclass_type=cls) + return cls(**dict) def to_json(self): - return json.dumps(self, default=pydantic_encoder) + return json.dumps(self.to_dict(), cls=NumpyEncoder) @classmethod - def from_json(cls, data): - if type(data) == str: - data = json.loads(data) - return cls.from_dict(data) + def from_json(cls, data: Union[str, Dict[str, Any]]): + if isinstance(data, str): + dict_data = json.loads(data) + else: + dict_data = data + return cls.from_dict(dict_data) @staticmethod def _from_dict_offsets(offsets): @@ -449,6 +457,23 @@ class Answer: converted_offsets.append(e) return converted_offsets + def __eq__(self, other): + context = getattr(other, "context", None) + if isinstance(context, pd.DataFrame): + is_content_equal = context.equals(self.context) + else: + is_content_equal = context == self.context + return ( + isinstance(other, self.__class__) + and is_content_equal + and getattr(other, "type", None) == self.type + and getattr(other, "score", None) == self.score + and getattr(other, "offsets_in_document", None) == self.offsets_in_document + and getattr(other, "offsets_in_context", None) == self.offsets_in_context + and getattr(other, "document_ids", None) == self.document_ids + and getattr(other, "meta", None) == self.meta + ) + @dataclass class Label: @@ -521,11 +546,7 @@ class Label: self.updated_at = updated_at self.query = query - if isinstance(answer, dict): - answer = Answer.from_dict(answer) self.answer = answer - if isinstance(document, dict): - document = Document.from_dict(document) self.document = document self.is_correct_answer = is_correct_answer @@ -549,25 +570,28 @@ class Label: return no_answer def to_dict(self): - return asdict(self) + return asdict(self, dict_factory=_dict_factory) @classmethod - def from_dict(cls, dict: dict): - # backward compatibility for old labels using answers with document_id instead of document_ids + def from_dict(cls, dict: Dict): answer = dict.get("answer") - if answer and "document_id" in answer: - dict = dict.copy() + if answer and isinstance(answer, Dict): dict["answer"] = Answer.from_dict(dict["answer"]) - return _pydantic_dataclass_from_dict(dict=dict, pydantic_dataclass_type=cls) + doc = dict.get("document") + if isinstance(doc, Dict): + dict["document"] = Document.from_dict(dict["document"]) + return cls(**dict) def to_json(self): - return json.dumps(self, default=pydantic_encoder) + return json.dumps(self.to_dict(), cls=NumpyEncoder) @classmethod - def from_json(cls, data): - if type(data) == str: - data = json.loads(data) - return cls.from_dict(data) + def from_json(cls, data: Union[str, Dict[str, Any]]): + if 
isinstance(data, str): + dict_data = json.loads(data) + else: + dict_data = data + return cls.from_dict(dict_data) # define __eq__ and __hash__ functions to deduplicate Label Objects def __eq__(self, other): @@ -732,7 +756,7 @@ class MultiLabel: return {k[1:] if k[0] == "_" else k: v for k, v in vars(self).items()} @classmethod - def from_dict(cls, dict: dict): + def from_dict(cls, dict: Dict): # exclude extra arguments return cls(**{k: v for k, v in dict.items() if k in inspect.signature(cls).parameters}) @@ -741,7 +765,7 @@ class MultiLabel: @classmethod def from_json(cls, data: Union[str, Dict[str, Any]]): - if type(data) == str: + if isinstance(data, str): dict_data = json.loads(data) else: dict_data = data @@ -758,7 +782,7 @@ class MultiLabel: return f"" -def _pydantic_dataclass_from_dict(dict: dict, pydantic_dataclass_type) -> Any: +def _pydantic_dataclass_from_dict(dict: Dict, pydantic_dataclass_type) -> Any: """ Constructs a pydantic dataclass from a dict incl. other nested dataclasses. This allows simple de-serialization of pydantic dataclasses from json. @@ -777,6 +801,21 @@ def _pydantic_dataclass_from_dict(dict: dict, pydantic_dataclass_type) -> Any: return dataclass_object +def _dict_factory(data): + """Meant to be as the dict_factory for `asdict`. This function is called within `asdict` to convert a list of tuples + into a dictionary object. This handles the conversion of pandas Dataframes into a list of lists. + + :param data: list of (key, value) pairs + """ + + def convert_value(v): + if isinstance(v, pd.DataFrame): + return dataframe_to_list(v) + return v + + return {k: convert_value(v) for k, v in data} + + class NumpyEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, np.ndarray): @@ -784,6 +823,14 @@ class NumpyEncoder(json.JSONEncoder): return json.JSONEncoder.default(self, obj) +def dataframe_to_list(df: pd.DataFrame) -> List[List]: + return [df.columns.tolist()] + df.values.tolist() + + +def dataframe_from_list(list_df: List[List]) -> pd.DataFrame: + return pd.DataFrame(columns=list_df[0], data=list_df[1:]) + + class EvaluationResult: def __init__(self, node_results: Optional[Dict[str, pd.DataFrame]] = None) -> None: """ diff --git a/test/others/test_schema.py b/test/others/test_schema.py index dd1553a2f..3d9ed484e 100644 --- a/test/others/test_schema.py +++ b/test/others/test_schema.py @@ -1,4 +1,6 @@ -from haystack.schema import Document, Label, Answer, Span, MultiLabel, TableCell +import json + +from haystack.schema import Document, Label, Answer, Span, MultiLabel, TableCell, _dict_factory import pytest import numpy as np import pandas as pd @@ -46,6 +48,74 @@ def text_labels(): ] +@pytest.fixture +def table_label(): + return Label( + query="some", + answer=Answer( + answer="text_2", + type="extractive", + score=0.1, + document_ids=["123"], + context=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]), + offsets_in_document=[TableCell(row=1, col=0)], + ), + document=Document( + content=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]), + content_type="table", + id="fe5cb68f8226776914781f6bd40ad718", + ), + is_correct_answer=True, + is_correct_document=True, + origin="user-feedback", + created_at="2023-05-02 11:43:56", + updated_at=None, + id="fbd79f71-d690-4b21-bd0a-1094292b9809", + ) + + +@pytest.fixture +def table_label_dict(): + return { + "id": "fbd79f71-d690-4b21-bd0a-1094292b9809", + "query": "some", + "document": { + "id": "fe5cb68f8226776914781f6bd40ad718", + 
"content": [["col1", "col2"], ["text_1", 1], ["text_2", 2]], + "content_type": "table", + "meta": {}, + "id_hash_keys": ["content"], + "score": None, + "embedding": None, + }, + "is_correct_answer": True, + "is_correct_document": True, + "origin": "user-feedback", + "answer": { + "answer": "text_2", + "type": "extractive", + "score": 0.1, + "context": [["col1", "col2"], ["text_1", 1], ["text_2", 2]], + "offsets_in_document": [{"row": 1, "col": 0}], + "offsets_in_context": None, + "document_ids": ["123"], + "meta": {}, + }, + "pipeline_id": None, + "created_at": "2023-05-02 11:43:56", + "updated_at": None, + "meta": {}, + "filters": None, + } + + +@pytest.fixture +def table_label_json(samples_path): + with open(samples_path / "schema" / "table_label.json") as f1: + data = json.load(f1) + return data + + @pytest.fixture def text_answer(): return Answer( @@ -59,6 +129,40 @@ def text_answer(): ) +@pytest.fixture +def table_answer(): + return Answer( + answer="text_2", + type="extractive", + score=0.1, + context=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]), + offsets_in_document=[TableCell(row=1, col=0)], + offsets_in_context=[TableCell(row=1, col=0)], + document_ids=["123"], + ) + + +@pytest.fixture +def table_answer_dict(): + return { + "answer": "text_2", + "type": "extractive", + "score": 0.1, + "context": [["col1", "col2"], ["text_1", 1], ["text_2", 2]], + "offsets_in_document": [{"row": 1, "col": 0}], + "offsets_in_context": [{"row": 1, "col": 0}], + "document_ids": ["123"], + "meta": {}, + } + + +@pytest.fixture +def table_answer_json(samples_path): + with open(samples_path / "schema" / "table_answer.json") as f1: + data = json.load(f1) + return data + + @pytest.fixture def table_doc(): data = { @@ -70,6 +174,31 @@ def table_doc(): return Document(content=pd.DataFrame(data), content_type="table", id="doc1") +@pytest.fixture +def table_doc_dict(): + return { + "content": [ + ["actors", "age", "number of movies", "date of birth"], + ["brad pitt", 58, 87, "18 december 1963"], + ["leonardo di caprio", 47, 53, "11 november 1974"], + ["george clooney", 60, 69, "6 may 1961"], + ], + "content_type": "table", + "score": None, + "meta": {}, + "id_hash_keys": ["content"], + "embedding": None, + "id": "doc1", + } + + +@pytest.fixture +def table_doc_json(samples_path): + with open(samples_path / "schema" / "table_doc.json") as f1: + json_str = f1.read() + return json_str + + @pytest.fixture def table_doc_with_embedding(): data = { @@ -79,71 +208,132 @@ def table_doc_with_embedding(): "date of birth": ["18 december 1963", "11 november 1974", "6 may 1961"], } return Document( - content=pd.DataFrame(data), content_type="table", id="doc2", embedding=np.random.rand(768).astype(np.float32) + content=pd.DataFrame(data), content_type="table", id="doc2", embedding=np.array([1.1, 2.2, 3.3, 4.4]) ) +@pytest.fixture +def table_doc_with_embedding_json(samples_path): + with open(samples_path / "schema" / "table_doc_emb.json") as f1: + json_str = f1.read() + return json_str + + +@pytest.mark.unit def test_no_answer_label(): - labels = [ - Label( - query="question", - answer=Answer(answer=""), - is_correct_answer=True, - is_correct_document=True, - document=Document(content="some", id="777"), - origin="gold-label", - ), - Label( - query="question", - answer=Answer(answer=""), - is_correct_answer=True, - is_correct_document=True, - document=Document(content="some", id="777"), - origin="gold-label", - ), - Label( - query="question", - answer=Answer(answer="some"), - 
is_correct_answer=True, - is_correct_document=True, - document=Document(content="some", id="777"), - origin="gold-label", - ), - Label( - query="question", - answer=Answer(answer="some"), - is_correct_answer=True, - is_correct_document=True, - document=Document(content="some", id="777"), - origin="gold-label", - ), - ] - - assert labels[0].no_answer == True - assert labels[1].no_answer == True - assert labels[2].no_answer == False - assert labels[3].no_answer == False + label_no_answer = Label( + query="question", + answer=Answer(answer=""), + is_correct_answer=True, + is_correct_document=True, + document=Document(content="some", id="777"), + origin="gold-label", + ) + label_with_answer = Label( + query="question", + answer=Answer(answer="some"), + is_correct_answer=True, + is_correct_document=True, + document=Document(content="some", id="777"), + origin="gold-label", + ) + assert label_no_answer.no_answer + assert not label_with_answer.no_answer +@pytest.mark.unit def test_equal_label(text_labels): assert text_labels[2] == text_labels[0] assert text_labels[1] != text_labels[0] +@pytest.mark.unit def test_label_to_json(text_labels): - j0 = text_labels[0].to_json() - l_new = Label.from_json(j0) - assert l_new == text_labels[0] - assert l_new.answer.offsets_in_document[0].start == 1 + text_label_json = text_labels[0].to_json() + text_label_from_json = Label.from_json(text_label_json) + assert text_label_from_json == text_labels[0] + assert text_label_from_json.answer.offsets_in_document[0].start == 1 +@pytest.mark.unit def test_label_to_dict(text_labels): - j0 = text_labels[0].to_dict() - l_new = Label.from_dict(j0) - assert l_new == text_labels[0] - assert l_new.answer.offsets_in_document[0].start == 1 + text_label_dict = text_labels[0].to_dict() + text_label_from_dict = Label.from_dict(text_label_dict) + assert text_label_from_dict == text_labels[0] + assert text_label_from_dict.answer.offsets_in_document[0].start == 1 +@pytest.mark.unit +def test_labels_with_identical_fields_are_equal(table_label): + table_label_copy = Label( + query="some", + answer=Answer( + answer="text_2", + type="extractive", + score=0.1, + document_ids=["123"], + context=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]), + offsets_in_document=[TableCell(row=1, col=0)], + ), + document=Document( + content=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]), + content_type="table", + ), + is_correct_answer=True, + is_correct_document=True, + origin="user-feedback", + ) + assert table_label == table_label_copy + + +@pytest.mark.unit +def test_labels_with_different_fields_are_not_equal(table_label): + table_label_different = Label( + query="some", + answer=Answer( + answer="text_1", + type="extractive", + score=0.1, + document_ids=["123"], + context=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]), + offsets_in_document=[TableCell(row=0, col=0)], + ), + document=Document( + content=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]), + content_type="table", + ), + is_correct_answer=True, + is_correct_document=True, + origin="user-feedback", + ) + assert table_label != table_label_different + + +@pytest.mark.unit +def test_table_label_from_json(table_label, table_label_json): + table_label_from_json = Label.from_json(table_label_json) + assert table_label_from_json == table_label + + +@pytest.mark.unit +def test_table_label_to_json(table_label, table_label_json): + 
table_label_to_json = json.loads(table_label.to_json()) + assert table_label_to_json == table_label_json + + +@pytest.mark.unit +def test_table_label_from_dict(table_label, table_label_dict): + table_label_from_dict = Label.from_dict(table_label_dict) + assert table_label_from_dict == table_label + + +@pytest.mark.unit +def test_table_label_to_dict(table_label, table_label_dict): + table_label_to_dict = table_label.to_dict() + assert table_label_to_dict == table_label_dict + + +@pytest.mark.unit def test_answer_to_json(text_answer): a = text_answer j = a.to_json() @@ -154,6 +344,7 @@ def test_answer_to_json(text_answer): assert a_new == a +@pytest.mark.unit def test_answer_to_dict(text_answer): a = text_answer j = a.to_dict() @@ -163,6 +354,29 @@ def test_answer_to_dict(text_answer): assert a_new == a +@pytest.mark.unit +def test_table_answer_to_json(table_answer, table_answer_json): + table_answer_to_json = json.loads(table_answer.to_json()) + assert table_answer_to_json == table_answer_json + + +@pytest.mark.unit +def test_table_answer_from_json(table_answer, table_answer_json): + table_answer_from_json = Answer.from_json(table_answer_json) + assert table_answer_from_json == table_answer + + +@pytest.mark.unit +def test_table_answer_to_dict(table_answer, table_answer_dict): + assert table_answer.to_dict() == table_answer_dict + + +@pytest.mark.unit +def test_table_answer_from_dict(table_answer, table_answer_dict): + assert table_answer == Answer.from_dict(table_answer_dict) + + +@pytest.mark.unit def test_document_from_dict(): doc = Document( content="this is the content of the document", meta={"some": "meta"}, id_hash_keys=["content", "meta"] @@ -170,13 +384,20 @@ def test_document_from_dict(): assert doc == Document.from_dict(doc.to_dict()) -def test_table_document_from_dict(table_doc): - assert table_doc == Document.from_dict(table_doc.to_dict()) +@pytest.mark.unit +def test_table_document_from_dict(table_doc, table_doc_dict): + assert table_doc == Document.from_dict(table_doc_dict) +@pytest.mark.unit +def test_table_document_to_dict(table_doc, table_doc_dict): + assert table_doc.to_dict() == table_doc_dict + + +@pytest.mark.unit def test_doc_to_json(): # With embedding - d = Document( + doc_with_embedding = Document( content="some text", content_type="text", id_hash_keys=["meta"], @@ -184,12 +405,12 @@ def test_doc_to_json(): meta={"name": "doc1"}, embedding=np.random.rand(768).astype(np.float32), ) - j0 = d.to_json() - d_new = Document.from_json(j0) - assert d == d_new + doc_emb_json = doc_with_embedding.to_json() + doc_emb_from_json = Document.from_json(doc_emb_json) + assert doc_with_embedding == doc_emb_from_json # No embedding - d = Document( + doc_with_no_embedding = Document( content="some text", content_type="text", score=0.99988, @@ -197,35 +418,48 @@ def test_doc_to_json(): id_hash_keys=["meta"], embedding=None, ) - j0 = d.to_json() - d_new = Document.from_json(j0) - assert d == d_new + doc_no_emb_json = doc_with_no_embedding.to_json() + doc_no_emb_from_json = Document.from_json(doc_no_emb_json) + assert doc_with_no_embedding == doc_no_emb_from_json -def test_table_doc_to_json(table_doc, table_doc_with_embedding): +@pytest.mark.unit +def test_table_doc_from_json(table_doc, table_doc_with_embedding, table_doc_json, table_doc_with_embedding_json): # With embedding - j0 = table_doc_with_embedding.to_json() - d_new = Document.from_json(j0) - assert table_doc_with_embedding == d_new + table_doc_emb_from_json = Document.from_json(table_doc_with_embedding_json) + assert 
table_doc_with_embedding == table_doc_emb_from_json # No embedding - j0 = table_doc.to_json() - d_new = Document.from_json(j0) - assert table_doc == d_new + table_doc_no_emb_from_json = Document.from_json(table_doc_json) + assert table_doc == table_doc_no_emb_from_json +@pytest.mark.unit +def test_table_doc_to_json(table_doc, table_doc_with_embedding, table_doc_json, table_doc_with_embedding_json): + # With embedding + table_doc_emb_to_json = json.loads(table_doc_with_embedding.to_json()) + assert json.loads(table_doc_with_embedding_json) == table_doc_emb_to_json + + # No embedding + table_doc_no_emb_to_json = json.loads(table_doc.to_json()) + assert json.loads(table_doc_json) == table_doc_no_emb_to_json + + +@pytest.mark.unit def test_answer_postinit(): a = Answer(answer="test", offsets_in_document=[{"start": 10, "end": 20}]) assert a.meta == {} assert isinstance(a.offsets_in_document[0], Span) +@pytest.mark.unit def test_table_answer_postinit(): - a = Answer(answer="test", offsets_in_document=[{"row": 1, "col": 2}]) - assert a.meta == {} - assert isinstance(a.offsets_in_document[0], TableCell) + table_answer = Answer(answer="test", offsets_in_document=[{"row": 1, "col": 2}]) + assert table_answer.meta == {} + assert isinstance(table_answer.offsets_in_document[0], TableCell) +@pytest.mark.unit def test_generate_doc_id_using_text(): text1 = "text1" text2 = "text2" @@ -237,6 +471,7 @@ def test_generate_doc_id_using_text(): assert doc1_text1.id != doc3_text2.id +@pytest.mark.unit def test_generate_doc_id_using_custom_list(): text1 = "text1" text2 = "text2" @@ -257,6 +492,7 @@ def test_generate_doc_id_using_custom_list(): _ = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["content", "non_existing_field"]) +@pytest.mark.unit def test_generate_doc_id_custom_list_meta(): text1 = "text1" text2 = "text2" @@ -280,6 +516,7 @@ def test_generate_doc_id_custom_list_meta(): assert doc1_text1.id != doc2_text2.id +@pytest.mark.unit def test_aggregate_labels_with_labels(): label1_with_filter1 = Label( query="question", @@ -314,6 +551,7 @@ def test_aggregate_labels_with_labels(): label = MultiLabel(labels=[label1_with_filter1, label3_with_filter2]) +@pytest.mark.unit def test_multilabel_preserve_order(): labels = [ Label( @@ -369,6 +607,7 @@ def test_multilabel_preserve_order(): assert multilabel.labels[i].id == str(i) +@pytest.mark.unit def test_multilabel_preserve_order_w_duplicates(): labels = [ Label( @@ -455,6 +694,7 @@ def test_multilabel_preserve_order_w_duplicates(): assert multilabel.labels[i].id == str(i) +@pytest.mark.unit def test_multilabel_id(): query1 = "question 1" query2 = "question 2" @@ -495,6 +735,7 @@ def test_multilabel_id(): assert MultiLabel(labels=[label3]).id == "531445fa3bdf98b8598a3bea032bd605" +@pytest.mark.unit def test_multilabel_with_doc_containing_dataframes(): table = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) table_doc = Document(content=table, content_type="table", id="table1") @@ -521,6 +762,7 @@ def test_multilabel_with_doc_containing_dataframes(): assert multilabel.offsets_in_contexts == [{"row": 0, "col": 0}] +@pytest.mark.unit def test_multilabel_serialization(): label_dict = { "id": "011079cf-c93f-49e6-83bb-42cd850dce12", @@ -566,23 +808,27 @@ def test_multilabel_serialization(): assert json_deserialized_multilabel.labels[0] == label +@pytest.mark.unit def test_span_in(): assert 10 in Span(5, 15) - assert not 20 in Span(1, 15) + assert 20 not in Span(1, 15) +@pytest.mark.unit def test_span_in_edges(): assert 5 in Span(5, 15) - assert not 15 in 
Span(5, 15) + assert 15 not in Span(5, 15) +@pytest.mark.unit def test_span_in_other_values(): assert 10.0 in Span(5, 15) assert "10" in Span(5, 15) with pytest.raises(ValueError): - "hello" in Span(5, 15) + assert "hello" in Span(5, 15) +@pytest.mark.unit def test_assert_span_vs_span(): assert Span(10, 11) in Span(5, 15) assert Span(5, 10) in Span(5, 15) @@ -595,6 +841,7 @@ def test_assert_span_vs_span(): assert not Span(10, 20) in Span(5, 15) +@pytest.mark.unit def test_id_hash_keys_not_ignored(): # Test that two documents with the same content but different metadata get assigned different ids if and only if # id_hash_keys is set to 'meta' @@ -606,6 +853,7 @@ def test_id_hash_keys_not_ignored(): assert doc3.id == doc4.id +@pytest.mark.unit def test_legacy_answer_document_id(): legacy_label = { "id": "123", @@ -642,6 +890,7 @@ def test_legacy_answer_document_id(): assert label.answer.document_ids == ["fc18c987a8312e72a47fb1524f230bb0"] +@pytest.mark.unit def test_legacy_answer_document_id_is_none(): legacy_label = { "id": "123", @@ -676,3 +925,16 @@ def test_legacy_answer_document_id_is_none(): label = Label.from_dict(legacy_label) assert label.answer.document_ids is None + + +@pytest.mark.unit +def test_dict_factory(): + data = [ + ("key1", "some_value"), + ("key2", ["val1", "val2"]), + ("key3", pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})), + ] + result = _dict_factory(data) + assert result["key1"] == "some_value" + assert result["key2"] == ["val1", "val2"] + assert result["key3"] == [["col1", "col2"], [1, 3], [2, 4]] diff --git a/test/samples/schema/table_answer.json b/test/samples/schema/table_answer.json new file mode 100644 index 000000000..2f3065300 --- /dev/null +++ b/test/samples/schema/table_answer.json @@ -0,0 +1,35 @@ +{ + "answer": "text_2", + "type": "extractive", + "score": 0.1, + "context": [ + [ + "col1", + "col2" + ], + [ + "text_1", + 1 + ], + [ + "text_2", + 2 + ] + ], + "offsets_in_document": [ + { + "row": 1, + "col": 0 + } + ], + "offsets_in_context": [ + { + "row": 1, + "col": 0 + } + ], + "document_ids": [ + "123" + ], + "meta": {} +} diff --git a/test/samples/schema/table_doc.json b/test/samples/schema/table_doc.json new file mode 100644 index 000000000..0662847dc --- /dev/null +++ b/test/samples/schema/table_doc.json @@ -0,0 +1,36 @@ +{ + "content": [ + [ + "actors", + "age", + "number of movies", + "date of birth" + ], + [ + "brad pitt", + 58, + 87, + "18 december 1963" + ], + [ + "leonardo di caprio", + 47, + 53, + "11 november 1974" + ], + [ + "george clooney", + 60, + 69, + "6 may 1961" + ] + ], + "content_type": "table", + "score": null, + "meta": {}, + "id_hash_keys": [ + "content" + ], + "embedding": null, + "id": "doc1" +} diff --git a/test/samples/schema/table_doc_emb.json b/test/samples/schema/table_doc_emb.json new file mode 100644 index 000000000..4ee0875e5 --- /dev/null +++ b/test/samples/schema/table_doc_emb.json @@ -0,0 +1,41 @@ +{ + "content": [ + [ + "actors", + "age", + "number of movies", + "date of birth" + ], + [ + "brad pitt", + 58, + 87, + "18 december 1963" + ], + [ + "leonardo di caprio", + 47, + 53, + "11 november 1974" + ], + [ + "george clooney", + 60, + 69, + "6 may 1961" + ] + ], + "content_type": "table", + "score": null, + "meta": {}, + "id_hash_keys": [ + "content" + ], + "embedding": [ + 1.1, + 2.2, + 3.3, + 4.4 + ], + "id": "doc2" +} diff --git a/test/samples/schema/table_label.json b/test/samples/schema/table_label.json new file mode 100644 index 000000000..38cb0eb99 --- /dev/null +++ b/test/samples/schema/table_label.json 
@@ -0,0 +1,66 @@ +{ + "id": "fbd79f71-d690-4b21-bd0a-1094292b9809", + "query": "some", + "document": { + "id": "fe5cb68f8226776914781f6bd40ad718", + "content": [ + [ + "col1", + "col2" + ], + [ + "text_1", + 1 + ], + [ + "text_2", + 2 + ] + ], + "content_type": "table", + "meta": {}, + "id_hash_keys": [ + "content" + ], + "score": null, + "embedding": null + }, + "is_correct_answer": true, + "is_correct_document": true, + "origin": "user-feedback", + "answer": { + "answer": "text_2", + "type": "extractive", + "score": 0.1, + "context": [ + [ + "col1", + "col2" + ], + [ + "text_1", + 1 + ], + [ + "text_2", + 2 + ] + ], + "offsets_in_document": [ + { + "row": 1, + "col": 0 + } + ], + "offsets_in_context": null, + "document_ids": [ + "123" + ], + "meta": {} + }, + "pipeline_id": null, + "created_at": "2023-05-02 11:43:56", + "updated_at": null, + "meta": {}, + "filters": null +}
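
Usage note (not part of the patch): the sketch below illustrates the round-trip serialization this PR enables for table Documents, Answers, and Labels, mirroring the table_doc/table_answer/table_label fixtures and tests added above. It assumes a Haystack build that already includes these schema changes and that pandas is installed; the variable names (`table`, `doc`, `answer`, `label`) are illustrative only.

```python
# Minimal sketch of the DataFrame-aware serialization added in this patch,
# based on the fixtures and tests in test/others/test_schema.py.
import json

import pandas as pd

from haystack.schema import Answer, Document, Label, TableCell

table = pd.DataFrame({"col1": ["text_1", "text_2"], "col2": [1, 2]})

# Documents: to_dict()/to_json() store the DataFrame as a list of rows
# (header row first); from_dict()/from_json() rebuild the DataFrame.
doc = Document(content=table, content_type="table", id="doc1")
assert json.loads(doc.to_json())["content"] == [["col1", "col2"], ["text_1", 1], ["text_2", 2]]
assert Document.from_json(doc.to_json()) == doc

# Answers: the table context is converted to a list of rows by _dict_factory,
# and __post_init__ turns it back into a DataFrame on deserialization.
answer = Answer(
    answer="text_2",
    type="extractive",
    score=0.1,
    context=table,
    offsets_in_document=[TableCell(row=1, col=0)],
    document_ids=["123"],
)
assert Answer.from_json(answer.to_json()) == answer

# Labels: nested table Answer and Document survive the round trip as well.
label = Label(
    query="some",
    answer=answer,
    document=doc,
    is_correct_answer=True,
    is_correct_document=True,
    origin="user-feedback",
)
assert Label.from_dict(label.to_dict()) == label
assert Label.from_json(label.to_json()) == label
```

The list-of-rows layout (header first, then data rows) is the same one stored in the sample files added under test/samples/schema/.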