refactor: Update schema objects to handle Dataframes in to_{dict,json} and from_{dict,json} (#4747)

* Adding support for table Documents when serializing Labels in Haystack

* Fix table label equality test

* Add serialization support and __eq__ support for table answers

* Made convenience functions for converting dataframes. Added some TODOs. Expanded schema tests for table labels. Updated Multilabel to not convert Dataframes into strings.

* get Answer and Label to_json working with DataFrame

* Fix from_dict method of Label

* Use Dict and remove unnecessary if check

* Using pydantic instead of builtins for type detection

* Update haystack/schema.py

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>

* Update haystack/schema.py

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>

* Update haystack/schema.py

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>

* Separated table label equivalency tests and added pytest.mark.unit


* Added unit test for _dict_factory

* Using more descriptive variable names

* Adding json files to test to_json and from_json functions

* Added sample files for tests

---------

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
This commit is contained in:
Sebastian 2023-05-03 09:42:07 +02:00 committed by GitHub
parent a9ec954c45
commit a67ca289db
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 593 additions and 106 deletions

View File

@ -185,7 +185,7 @@ class Document:
if k == "content":
# Convert pd.DataFrame to list of rows for serialization
if self.content_type == "table" and isinstance(self.content, pd.DataFrame):
v = [self.content.columns.tolist()] + self.content.values.tolist()
v = dataframe_to_list(self.content)
k = k if k not in inv_field_map else inv_field_map[k]
_doc[k] = v
return _doc
@ -232,7 +232,7 @@ class Document:
# Convert list of rows to pd.DataFrame
if _new_doc.get("content_type", None) == "table" and isinstance(_new_doc["content"], list):
_new_doc["content"] = pd.DataFrame(columns=_new_doc["content"][0], data=_new_doc["content"][1:])
_new_doc["content"] = dataframe_from_list(_new_doc["content"])
return cls(**_new_doc)
@ -243,11 +243,14 @@ class Document:
return json.dumps(dictionary, cls=NumpyEncoder)
@classmethod
def from_json(cls, data: str, field_map: Optional[Dict[str, Any]] = None) -> Document:
def from_json(cls, data: Union[str, Dict[str, Any]], field_map: Optional[Dict[str, Any]] = None) -> Document:
if not field_map:
field_map = {}
dictionary = json.loads(data)
return cls.from_dict(dictionary, field_map=field_map)
if isinstance(data, str):
dict_data = json.loads(data)
else:
dict_data = data
return cls.from_dict(dict_data, field_map=field_map)
def __eq__(self, other):
content = getattr(other, "content", None)
@ -401,6 +404,10 @@ class Answer:
if self.meta is None:
self.meta = {}
# In case the context is a list of lists for a table document that is instantiated by from_json() or from_dict()
if isinstance(self.context, list):
self.context = dataframe_from_list(self.context)
def __lt__(self, other):
"""Enable sorting of Answers by score"""
return self.score < other.score
@ -412,29 +419,30 @@ class Answer:
return f"<Answer: answer='{self.answer}', score={self.score}, context='{self.context[:50]}{'...' if len(self.context) > 50 else ''}'>"
def __repr__(self):
return f"<Answer {asdict(self)}>"
return f"<Answer {self.to_dict()}>"
def to_dict(self):
return asdict(self)
def to_dict(self) -> Dict:
return asdict(self, dict_factory=_dict_factory)
@classmethod
def from_dict(cls, dict: dict):
def from_dict(cls, dict: Dict) -> Answer:
# backwards compatibility: `document_id: Optional[str]` was changed to `document_ids: Optional[List[str]]`
if "document_id" in dict:
dict = dict.copy()
document_id = dict.pop("document_id")
dict["document_ids"] = [document_id] if document_id is not None else None
return _pydantic_dataclass_from_dict(dict=dict, pydantic_dataclass_type=cls)
return cls(**dict)
def to_json(self):
return json.dumps(self, default=pydantic_encoder)
return json.dumps(self.to_dict(), cls=NumpyEncoder)
@classmethod
def from_json(cls, data):
if type(data) == str:
data = json.loads(data)
return cls.from_dict(data)
def from_json(cls, data: Union[str, Dict[str, Any]]):
if isinstance(data, str):
dict_data = json.loads(data)
else:
dict_data = data
return cls.from_dict(dict_data)
@staticmethod
def _from_dict_offsets(offsets):
@ -449,6 +457,23 @@ class Answer:
converted_offsets.append(e)
return converted_offsets
def __eq__(self, other):
    """Field-wise equality for Answers.

    A DataFrame context (table answers) cannot be compared with ``==`` (that yields an
    element-wise frame, not a bool), so ``DataFrame.equals`` is used in that case.
    """
    context = getattr(other, "context", None)
    if isinstance(context, pd.DataFrame):
        is_content_equal = context.equals(self.context)
    else:
        is_content_equal = context == self.context
    return (
        isinstance(other, self.__class__)
        and is_content_equal
        # Fix: the answer text itself must also match — without this comparison,
        # two Answers differing only in their `answer` string compared equal.
        and getattr(other, "answer", None) == self.answer
        and getattr(other, "type", None) == self.type
        and getattr(other, "score", None) == self.score
        and getattr(other, "offsets_in_document", None) == self.offsets_in_document
        and getattr(other, "offsets_in_context", None) == self.offsets_in_context
        and getattr(other, "document_ids", None) == self.document_ids
        and getattr(other, "meta", None) == self.meta
    )
@dataclass
class Label:
@ -521,11 +546,7 @@ class Label:
self.updated_at = updated_at
self.query = query
if isinstance(answer, dict):
answer = Answer.from_dict(answer)
self.answer = answer
if isinstance(document, dict):
document = Document.from_dict(document)
self.document = document
self.is_correct_answer = is_correct_answer
@ -549,25 +570,28 @@ class Label:
return no_answer
def to_dict(self):
return asdict(self)
return asdict(self, dict_factory=_dict_factory)
@classmethod
def from_dict(cls, dict: dict):
# backward compatibility for old labels using answers with document_id instead of document_ids
def from_dict(cls, dict: Dict):
answer = dict.get("answer")
if answer and "document_id" in answer:
dict = dict.copy()
if answer and isinstance(answer, Dict):
dict["answer"] = Answer.from_dict(dict["answer"])
return _pydantic_dataclass_from_dict(dict=dict, pydantic_dataclass_type=cls)
doc = dict.get("document")
if isinstance(doc, Dict):
dict["document"] = Document.from_dict(dict["document"])
return cls(**dict)
def to_json(self):
return json.dumps(self, default=pydantic_encoder)
return json.dumps(self.to_dict(), cls=NumpyEncoder)
@classmethod
def from_json(cls, data):
if type(data) == str:
data = json.loads(data)
return cls.from_dict(data)
def from_json(cls, data: Union[str, Dict[str, Any]]):
if isinstance(data, str):
dict_data = json.loads(data)
else:
dict_data = data
return cls.from_dict(dict_data)
# define __eq__ and __hash__ functions to deduplicate Label Objects
def __eq__(self, other):
@ -732,7 +756,7 @@ class MultiLabel:
return {k[1:] if k[0] == "_" else k: v for k, v in vars(self).items()}
@classmethod
def from_dict(cls, dict: dict):
def from_dict(cls, dict: Dict):
# exclude extra arguments
return cls(**{k: v for k, v in dict.items() if k in inspect.signature(cls).parameters})
@ -741,7 +765,7 @@ class MultiLabel:
@classmethod
def from_json(cls, data: Union[str, Dict[str, Any]]):
if type(data) == str:
if isinstance(data, str):
dict_data = json.loads(data)
else:
dict_data = data
@ -758,7 +782,7 @@ class MultiLabel:
return f"<MultiLabel: {self.to_dict()}>"
def _pydantic_dataclass_from_dict(dict: dict, pydantic_dataclass_type) -> Any:
def _pydantic_dataclass_from_dict(dict: Dict, pydantic_dataclass_type) -> Any:
"""
Constructs a pydantic dataclass from a dict incl. other nested dataclasses.
This allows simple de-serialization of pydantic dataclasses from json.
@ -777,6 +801,21 @@ def _pydantic_dataclass_from_dict(dict: dict, pydantic_dataclass_type) -> Any:
return dataclass_object
def _dict_factory(data):
"""Meant to be as the dict_factory for `asdict`. This function is called within `asdict` to convert a list of tuples
into a dictionary object. This handles the conversion of pandas Dataframes into a list of lists.
:param data: list of (key, value) pairs
"""
def convert_value(v):
if isinstance(v, pd.DataFrame):
return dataframe_to_list(v)
return v
return {k: convert_value(v) for k, v in data}
class NumpyEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, np.ndarray):
@ -784,6 +823,14 @@ class NumpyEncoder(json.JSONEncoder):
return json.JSONEncoder.default(self, obj)
def dataframe_to_list(df: pd.DataFrame) -> List[List]:
    """Serialize a DataFrame into a JSON-friendly list of lists: the first inner list
    holds the column names, each following inner list holds one row of values."""
    header = df.columns.tolist()
    rows = df.values.tolist()
    return [header, *rows]
def dataframe_from_list(list_df: List[List]) -> pd.DataFrame:
    """Deserialize a list of lists (as produced by `dataframe_to_list`) back into a
    DataFrame. The first inner list is taken as the column names, the rest as rows.

    :param list_df: list of rows, header row first; may be empty.
    """
    # Robustness fix: an empty serialization would raise IndexError on list_df[0].
    if not list_df:
        return pd.DataFrame()
    return pd.DataFrame(columns=list_df[0], data=list_df[1:])
class EvaluationResult:
def __init__(self, node_results: Optional[Dict[str, pd.DataFrame]] = None) -> None:
"""

View File

@ -1,4 +1,6 @@
from haystack.schema import Document, Label, Answer, Span, MultiLabel, TableCell
import json
from haystack.schema import Document, Label, Answer, Span, MultiLabel, TableCell, _dict_factory
import pytest
import numpy as np
import pandas as pd
@ -46,6 +48,74 @@ def text_labels():
]
@pytest.fixture
def table_label():
    """A user-feedback Label wrapping a table Document and a table Answer
    (both carry the same two-row DataFrame)."""
    records = [{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]
    table_answer = Answer(
        answer="text_2",
        type="extractive",
        score=0.1,
        document_ids=["123"],
        context=pd.DataFrame.from_records(records),
        offsets_in_document=[TableCell(row=1, col=0)],
    )
    table_document = Document(
        content=pd.DataFrame.from_records(records),
        content_type="table",
        id="fe5cb68f8226776914781f6bd40ad718",
    )
    return Label(
        query="some",
        answer=table_answer,
        document=table_document,
        is_correct_answer=True,
        is_correct_document=True,
        origin="user-feedback",
        created_at="2023-05-02 11:43:56",
        updated_at=None,
        id="fbd79f71-d690-4b21-bd0a-1094292b9809",
    )
@pytest.fixture
def table_label_dict():
    """Dict form of the `table_label` fixture, with both DataFrames serialized
    as lists of rows (header row first)."""
    table_rows = (["col1", "col2"], ["text_1", 1], ["text_2", 2])
    return {
        "id": "fbd79f71-d690-4b21-bd0a-1094292b9809",
        "query": "some",
        "document": {
            "id": "fe5cb68f8226776914781f6bd40ad718",
            # Fresh copies so the document and answer tables don't alias each other.
            "content": [list(row) for row in table_rows],
            "content_type": "table",
            "meta": {},
            "id_hash_keys": ["content"],
            "score": None,
            "embedding": None,
        },
        "is_correct_answer": True,
        "is_correct_document": True,
        "origin": "user-feedback",
        "answer": {
            "answer": "text_2",
            "type": "extractive",
            "score": 0.1,
            "context": [list(row) for row in table_rows],
            "offsets_in_document": [{"row": 1, "col": 0}],
            "offsets_in_context": None,
            "document_ids": ["123"],
            "meta": {},
        },
        "pipeline_id": None,
        "created_at": "2023-05-02 11:43:56",
        "updated_at": None,
        "meta": {},
        "filters": None,
    }
@pytest.fixture
def table_label_json(samples_path):
    """Parsed contents of the stored table Label JSON sample file."""
    sample_file = samples_path / "schema" / "table_label.json"
    return json.loads(sample_file.read_text())
@pytest.fixture
def text_answer():
return Answer(
@ -59,6 +129,40 @@ def text_answer():
)
@pytest.fixture
def table_answer():
    """An extractive Answer whose context is a pandas DataFrame (table QA)."""
    table_context = pd.DataFrame.from_records(
        [{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]
    )
    return Answer(
        answer="text_2",
        type="extractive",
        score=0.1,
        context=table_context,
        offsets_in_document=[TableCell(row=1, col=0)],
        offsets_in_context=[TableCell(row=1, col=0)],
        document_ids=["123"],
    )
@pytest.fixture
def table_answer_dict():
    """Dict form of the `table_answer` fixture, with the DataFrame context
    flattened to a list of rows (header row first)."""
    serialized_context = [["col1", "col2"], ["text_1", 1], ["text_2", 2]]
    return {
        "answer": "text_2",
        "type": "extractive",
        "score": 0.1,
        "context": serialized_context,
        "offsets_in_document": [{"row": 1, "col": 0}],
        "offsets_in_context": [{"row": 1, "col": 0}],
        "document_ids": ["123"],
        "meta": {},
    }
@pytest.fixture
def table_answer_json(samples_path):
    """Parsed contents of the stored table Answer JSON sample file."""
    sample_file = samples_path / "schema" / "table_answer.json"
    return json.loads(sample_file.read_text())
@pytest.fixture
def table_doc():
data = {
@ -70,6 +174,31 @@ def table_doc():
return Document(content=pd.DataFrame(data), content_type="table", id="doc1")
@pytest.fixture
def table_doc_dict():
    """Dict form of the `table_doc` fixture, with the table content serialized
    as a list of rows (header row first)."""
    header = ["actors", "age", "number of movies", "date of birth"]
    rows = [
        ["brad pitt", 58, 87, "18 december 1963"],
        ["leonardo di caprio", 47, 53, "11 november 1974"],
        ["george clooney", 60, 69, "6 may 1961"],
    ]
    return {
        "content": [header] + rows,
        "content_type": "table",
        "score": None,
        "meta": {},
        "id_hash_keys": ["content"],
        "embedding": None,
        "id": "doc1",
    }
@pytest.fixture
def table_doc_json(samples_path):
    """Raw JSON string of the stored table Document sample file."""
    return (samples_path / "schema" / "table_doc.json").read_text()
@pytest.fixture
def table_doc_with_embedding():
data = {
@ -79,71 +208,132 @@ def table_doc_with_embedding():
"date of birth": ["18 december 1963", "11 november 1974", "6 may 1961"],
}
return Document(
content=pd.DataFrame(data), content_type="table", id="doc2", embedding=np.random.rand(768).astype(np.float32)
content=pd.DataFrame(data), content_type="table", id="doc2", embedding=np.array([1.1, 2.2, 3.3, 4.4])
)
@pytest.fixture
def table_doc_with_embedding_json(samples_path):
    """Raw JSON string of the stored table Document sample that carries an embedding."""
    return (samples_path / "schema" / "table_doc_emb.json").read_text()
@pytest.mark.unit
def test_no_answer_label():
labels = [
Label(
query="question",
answer=Answer(answer=""),
is_correct_answer=True,
is_correct_document=True,
document=Document(content="some", id="777"),
origin="gold-label",
),
Label(
query="question",
answer=Answer(answer=""),
is_correct_answer=True,
is_correct_document=True,
document=Document(content="some", id="777"),
origin="gold-label",
),
Label(
query="question",
answer=Answer(answer="some"),
is_correct_answer=True,
is_correct_document=True,
document=Document(content="some", id="777"),
origin="gold-label",
),
Label(
query="question",
answer=Answer(answer="some"),
is_correct_answer=True,
is_correct_document=True,
document=Document(content="some", id="777"),
origin="gold-label",
),
]
assert labels[0].no_answer == True
assert labels[1].no_answer == True
assert labels[2].no_answer == False
assert labels[3].no_answer == False
label_no_answer = Label(
query="question",
answer=Answer(answer=""),
is_correct_answer=True,
is_correct_document=True,
document=Document(content="some", id="777"),
origin="gold-label",
)
label_with_answer = Label(
query="question",
answer=Answer(answer="some"),
is_correct_answer=True,
is_correct_document=True,
document=Document(content="some", id="777"),
origin="gold-label",
)
assert label_no_answer.no_answer
assert not label_with_answer.no_answer
@pytest.mark.unit
def test_equal_label(text_labels):
    # Labels built from identical fields compare equal; a differing field breaks equality.
    assert text_labels[2] == text_labels[0]
    assert text_labels[1] != text_labels[0]
@pytest.mark.unit
def test_label_to_json(text_labels):
j0 = text_labels[0].to_json()
l_new = Label.from_json(j0)
assert l_new == text_labels[0]
assert l_new.answer.offsets_in_document[0].start == 1
text_label_json = text_labels[0].to_json()
text_label_from_json = Label.from_json(text_label_json)
assert text_label_from_json == text_labels[0]
assert text_label_from_json.answer.offsets_in_document[0].start == 1
@pytest.mark.unit
def test_label_to_dict(text_labels):
j0 = text_labels[0].to_dict()
l_new = Label.from_dict(j0)
assert l_new == text_labels[0]
assert l_new.answer.offsets_in_document[0].start == 1
text_label_dict = text_labels[0].to_dict()
text_label_from_dict = Label.from_dict(text_label_dict)
assert text_label_from_dict == text_labels[0]
assert text_label_from_dict.answer.offsets_in_document[0].start == 1
@pytest.mark.unit
def test_labels_with_identical_fields_are_equal(table_label):
    # Rebuild a Label with the same answer/document/feedback fields as the fixture.
    # NOTE(review): the copy omits the Label id and created_at, and lets the Document
    # re-derive its id from content — equality apparently ignores these fields;
    # confirm against Label.__eq__/__hash__.
    table_label_copy = Label(
        query="some",
        answer=Answer(
            answer="text_2",
            type="extractive",
            score=0.1,
            document_ids=["123"],
            context=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]),
            offsets_in_document=[TableCell(row=1, col=0)],
        ),
        document=Document(
            content=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]),
            content_type="table",
        ),
        is_correct_answer=True,
        is_correct_document=True,
        origin="user-feedback",
    )
    assert table_label == table_label_copy
@pytest.mark.unit
def test_labels_with_different_fields_are_not_equal(table_label):
    # Same table, but a different answer string ("text_1") and a different answer
    # cell (row 0 instead of row 1) — the Labels must not compare equal.
    table_label_different = Label(
        query="some",
        answer=Answer(
            answer="text_1",
            type="extractive",
            score=0.1,
            document_ids=["123"],
            context=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]),
            offsets_in_document=[TableCell(row=0, col=0)],
        ),
        document=Document(
            content=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]),
            content_type="table",
        ),
        is_correct_answer=True,
        is_correct_document=True,
        origin="user-feedback",
    )
    assert table_label != table_label_different
@pytest.mark.unit
def test_table_label_from_json(table_label, table_label_json):
    # Deserializing the stored JSON sample must reproduce the fixture Label.
    table_label_from_json = Label.from_json(table_label_json)
    assert table_label_from_json == table_label
@pytest.mark.unit
def test_table_label_to_json(table_label, table_label_json):
    # Compare parsed JSON structures so key order and whitespace don't matter.
    table_label_to_json = json.loads(table_label.to_json())
    assert table_label_to_json == table_label_json
@pytest.mark.unit
def test_table_label_from_dict(table_label, table_label_dict):
    # from_dict must rebuild the nested Answer/Document and their DataFrames.
    table_label_from_dict = Label.from_dict(table_label_dict)
    assert table_label_from_dict == table_label
@pytest.mark.unit
def test_table_label_to_dict(table_label, table_label_dict):
    # to_dict must serialize the nested DataFrames as lists of rows.
    table_label_to_dict = table_label.to_dict()
    assert table_label_to_dict == table_label_dict
@pytest.mark.unit
def test_answer_to_json(text_answer):
a = text_answer
j = a.to_json()
@ -154,6 +344,7 @@ def test_answer_to_json(text_answer):
assert a_new == a
@pytest.mark.unit
def test_answer_to_dict(text_answer):
a = text_answer
j = a.to_dict()
@ -163,6 +354,29 @@ def test_answer_to_dict(text_answer):
assert a_new == a
@pytest.mark.unit
def test_table_answer_to_json(table_answer, table_answer_json):
    # to_json must serialize the DataFrame context as a list of rows;
    # compare parsed JSON so formatting differences don't matter.
    table_answer_to_json = json.loads(table_answer.to_json())
    assert table_answer_to_json == table_answer_json
@pytest.mark.unit
def test_table_answer_from_json(table_answer, table_answer_json):
    # Deserializing the stored JSON sample must reproduce the fixture Answer.
    table_answer_from_json = Answer.from_json(table_answer_json)
    assert table_answer_from_json == table_answer
@pytest.mark.unit
def test_table_answer_to_dict(table_answer, table_answer_dict):
    # to_dict must flatten the DataFrame context to a list of rows.
    assert table_answer.to_dict() == table_answer_dict
@pytest.mark.unit
def test_table_answer_from_dict(table_answer, table_answer_dict):
    # from_dict must rebuild the DataFrame context from the list-of-rows form.
    assert table_answer == Answer.from_dict(table_answer_dict)
@pytest.mark.unit
def test_document_from_dict():
doc = Document(
content="this is the content of the document", meta={"some": "meta"}, id_hash_keys=["content", "meta"]
@ -170,13 +384,20 @@ def test_document_from_dict():
assert doc == Document.from_dict(doc.to_dict())
def test_table_document_from_dict(table_doc):
assert table_doc == Document.from_dict(table_doc.to_dict())
@pytest.mark.unit
def test_table_document_from_dict(table_doc, table_doc_dict):
    # from_dict must rebuild the DataFrame content from the list-of-rows form.
    assert table_doc == Document.from_dict(table_doc_dict)
@pytest.mark.unit
def test_table_document_to_dict(table_doc, table_doc_dict):
    # to_dict must flatten the DataFrame content to a list of rows.
    assert table_doc.to_dict() == table_doc_dict
@pytest.mark.unit
def test_doc_to_json():
# With embedding
d = Document(
doc_with_embedding = Document(
content="some text",
content_type="text",
id_hash_keys=["meta"],
@ -184,12 +405,12 @@ def test_doc_to_json():
meta={"name": "doc1"},
embedding=np.random.rand(768).astype(np.float32),
)
j0 = d.to_json()
d_new = Document.from_json(j0)
assert d == d_new
doc_emb_json = doc_with_embedding.to_json()
doc_emb_from_json = Document.from_json(doc_emb_json)
assert doc_with_embedding == doc_emb_from_json
# No embedding
d = Document(
doc_with_no_embedding = Document(
content="some text",
content_type="text",
score=0.99988,
@ -197,35 +418,48 @@ def test_doc_to_json():
id_hash_keys=["meta"],
embedding=None,
)
j0 = d.to_json()
d_new = Document.from_json(j0)
assert d == d_new
doc_no_emb_json = doc_with_no_embedding.to_json()
doc_no_emb_from_json = Document.from_json(doc_no_emb_json)
assert doc_with_no_embedding == doc_no_emb_from_json
def test_table_doc_to_json(table_doc, table_doc_with_embedding):
@pytest.mark.unit
def test_table_doc_from_json(table_doc, table_doc_with_embedding, table_doc_json, table_doc_with_embedding_json):
# With embedding
j0 = table_doc_with_embedding.to_json()
d_new = Document.from_json(j0)
assert table_doc_with_embedding == d_new
table_doc_emb_from_json = Document.from_json(table_doc_with_embedding_json)
assert table_doc_with_embedding == table_doc_emb_from_json
# No embedding
j0 = table_doc.to_json()
d_new = Document.from_json(j0)
assert table_doc == d_new
table_doc_no_emb_from_json = Document.from_json(table_doc_json)
assert table_doc == table_doc_no_emb_from_json
@pytest.mark.unit
def test_table_doc_to_json(table_doc, table_doc_with_embedding, table_doc_json, table_doc_with_embedding_json):
    # Compare parsed JSON structures so formatting differences don't matter.
    # With embedding
    table_doc_emb_to_json = json.loads(table_doc_with_embedding.to_json())
    assert json.loads(table_doc_with_embedding_json) == table_doc_emb_to_json
    # No embedding
    table_doc_no_emb_to_json = json.loads(table_doc.to_json())
    assert json.loads(table_doc_json) == table_doc_no_emb_to_json
@pytest.mark.unit
def test_answer_postinit():
    # Construction should default meta to {} and coerce dict offsets into Span objects.
    a = Answer(answer="test", offsets_in_document=[{"start": 10, "end": 20}])
    assert a.meta == {}
    assert isinstance(a.offsets_in_document[0], Span)
@pytest.mark.unit
def test_table_answer_postinit():
a = Answer(answer="test", offsets_in_document=[{"row": 1, "col": 2}])
assert a.meta == {}
assert isinstance(a.offsets_in_document[0], TableCell)
table_answer = Answer(answer="test", offsets_in_document=[{"row": 1, "col": 2}])
assert table_answer.meta == {}
assert isinstance(table_answer.offsets_in_document[0], TableCell)
@pytest.mark.unit
def test_generate_doc_id_using_text():
text1 = "text1"
text2 = "text2"
@ -237,6 +471,7 @@ def test_generate_doc_id_using_text():
assert doc1_text1.id != doc3_text2.id
@pytest.mark.unit
def test_generate_doc_id_using_custom_list():
text1 = "text1"
text2 = "text2"
@ -257,6 +492,7 @@ def test_generate_doc_id_using_custom_list():
_ = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["content", "non_existing_field"])
@pytest.mark.unit
def test_generate_doc_id_custom_list_meta():
text1 = "text1"
text2 = "text2"
@ -280,6 +516,7 @@ def test_generate_doc_id_custom_list_meta():
assert doc1_text1.id != doc2_text2.id
@pytest.mark.unit
def test_aggregate_labels_with_labels():
label1_with_filter1 = Label(
query="question",
@ -314,6 +551,7 @@ def test_aggregate_labels_with_labels():
label = MultiLabel(labels=[label1_with_filter1, label3_with_filter2])
@pytest.mark.unit
def test_multilabel_preserve_order():
labels = [
Label(
@ -369,6 +607,7 @@ def test_multilabel_preserve_order():
assert multilabel.labels[i].id == str(i)
@pytest.mark.unit
def test_multilabel_preserve_order_w_duplicates():
labels = [
Label(
@ -455,6 +694,7 @@ def test_multilabel_preserve_order_w_duplicates():
assert multilabel.labels[i].id == str(i)
@pytest.mark.unit
def test_multilabel_id():
query1 = "question 1"
query2 = "question 2"
@ -495,6 +735,7 @@ def test_multilabel_id():
assert MultiLabel(labels=[label3]).id == "531445fa3bdf98b8598a3bea032bd605"
@pytest.mark.unit
def test_multilabel_with_doc_containing_dataframes():
table = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
table_doc = Document(content=table, content_type="table", id="table1")
@ -521,6 +762,7 @@ def test_multilabel_with_doc_containing_dataframes():
assert multilabel.offsets_in_contexts == [{"row": 0, "col": 0}]
@pytest.mark.unit
def test_multilabel_serialization():
label_dict = {
"id": "011079cf-c93f-49e6-83bb-42cd850dce12",
@ -566,23 +808,27 @@ def test_multilabel_serialization():
assert json_deserialized_multilabel.labels[0] == label
@pytest.mark.unit
def test_span_in():
assert 10 in Span(5, 15)
assert not 20 in Span(1, 15)
assert 20 not in Span(1, 15)
@pytest.mark.unit
def test_span_in_edges():
assert 5 in Span(5, 15)
assert not 15 in Span(5, 15)
assert 15 not in Span(5, 15)
@pytest.mark.unit
def test_span_in_other_values():
assert 10.0 in Span(5, 15)
assert "10" in Span(5, 15)
with pytest.raises(ValueError):
"hello" in Span(5, 15)
assert "hello" in Span(5, 15)
@pytest.mark.unit
def test_assert_span_vs_span():
assert Span(10, 11) in Span(5, 15)
assert Span(5, 10) in Span(5, 15)
@ -595,6 +841,7 @@ def test_assert_span_vs_span():
assert not Span(10, 20) in Span(5, 15)
@pytest.mark.unit
def test_id_hash_keys_not_ignored():
# Test that two documents with the same content but different metadata get assigned different ids if and only if
# id_hash_keys is set to 'meta'
@ -606,6 +853,7 @@ def test_id_hash_keys_not_ignored():
assert doc3.id == doc4.id
@pytest.mark.unit
def test_legacy_answer_document_id():
legacy_label = {
"id": "123",
@ -642,6 +890,7 @@ def test_legacy_answer_document_id():
assert label.answer.document_ids == ["fc18c987a8312e72a47fb1524f230bb0"]
@pytest.mark.unit
def test_legacy_answer_document_id_is_none():
legacy_label = {
"id": "123",
@ -676,3 +925,16 @@ def test_legacy_answer_document_id_is_none():
label = Label.from_dict(legacy_label)
assert label.answer.document_ids is None
@pytest.mark.unit
def test_dict_factory():
    """_dict_factory turns (key, value) pairs into a dict, converting DataFrame
    values into a list of rows (header row first) and leaving other values as-is."""
    table = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
    pairs = [("key1", "some_value"), ("key2", ["val1", "val2"]), ("key3", table)]
    converted = _dict_factory(pairs)
    assert converted == {
        "key1": "some_value",
        "key2": ["val1", "val2"],
        "key3": [["col1", "col2"], [1, 3], [2, 4]],
    }

View File

@ -0,0 +1,35 @@
{
"answer": "text_2",
"type": "extractive",
"score": 0.1,
"context": [
[
"col1",
"col2"
],
[
"text_1",
1
],
[
"text_2",
2
]
],
"offsets_in_document": [
{
"row": 1,
"col": 0
}
],
"offsets_in_context": [
{
"row": 1,
"col": 0
}
],
"document_ids": [
"123"
],
"meta": {}
}

View File

@ -0,0 +1,36 @@
{
"content": [
[
"actors",
"age",
"number of movies",
"date of birth"
],
[
"brad pitt",
58,
87,
"18 december 1963"
],
[
"leonardo di caprio",
47,
53,
"11 november 1974"
],
[
"george clooney",
60,
69,
"6 may 1961"
]
],
"content_type": "table",
"score": null,
"meta": {},
"id_hash_keys": [
"content"
],
"embedding": null,
"id": "doc1"
}

View File

@ -0,0 +1,41 @@
{
"content": [
[
"actors",
"age",
"number of movies",
"date of birth"
],
[
"brad pitt",
58,
87,
"18 december 1963"
],
[
"leonardo di caprio",
47,
53,
"11 november 1974"
],
[
"george clooney",
60,
69,
"6 may 1961"
]
],
"content_type": "table",
"score": null,
"meta": {},
"id_hash_keys": [
"content"
],
"embedding": [
1.1,
2.2,
3.3,
4.4
],
"id": "doc2"
}

View File

@ -0,0 +1,66 @@
{
"id": "fbd79f71-d690-4b21-bd0a-1094292b9809",
"query": "some",
"document": {
"id": "fe5cb68f8226776914781f6bd40ad718",
"content": [
[
"col1",
"col2"
],
[
"text_1",
1
],
[
"text_2",
2
]
],
"content_type": "table",
"meta": {},
"id_hash_keys": [
"content"
],
"score": null,
"embedding": null
},
"is_correct_answer": true,
"is_correct_document": true,
"origin": "user-feedback",
"answer": {
"answer": "text_2",
"type": "extractive",
"score": 0.1,
"context": [
[
"col1",
"col2"
],
[
"text_1",
1
],
[
"text_2",
2
]
],
"offsets_in_document": [
{
"row": 1,
"col": 0
}
],
"offsets_in_context": null,
"document_ids": [
"123"
],
"meta": {}
},
"pipeline_id": null,
"created_at": "2023-05-02 11:43:56",
"updated_at": null,
"meta": {},
"filters": null
}