From a67ca289dbfe9eca508b4e54c64f1c099189bb3f Mon Sep 17 00:00:00 2001 From: Sebastian Date: Wed, 3 May 2023 09:42:07 +0200 Subject: [PATCH] refactor: Update schema objects to handle Dataframes in to_{dict,json} and from_{dict,json} (#4747) * Adding support for table Documents when serializing Labels in Haystack * Fix table label equality test * Add serialization support and __eq__ support for table answers * Made convenience functions for converting dataframes. Added some TODOs. Epxanded schema tests for table labels. Updated Multilabel to not convert Dataframes into strings. * get Answer and Label to_json working with DataFrame * Fix from_dict method of Label * Use Dict and remove unneccessary if check * Using pydantic instead of builtins for type detection * Update haystack/schema.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * Update haystack/schema.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * Update haystack/schema.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * Separated table label equivalency tests and added pytest.mark.unit * Added unit test for _dict_factory * Using more descriptive variable names * Adding json files to test to_json and from_json functions * Added sample files for tests --------- Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> --- haystack/schema.py | 115 ++++--- test/others/test_schema.py | 406 ++++++++++++++++++++----- test/samples/schema/table_answer.json | 35 +++ test/samples/schema/table_doc.json | 36 +++ test/samples/schema/table_doc_emb.json | 41 +++ test/samples/schema/table_label.json | 66 ++++ 6 files changed, 593 insertions(+), 106 deletions(-) create mode 100644 test/samples/schema/table_answer.json create mode 100644 test/samples/schema/table_doc.json create mode 100644 test/samples/schema/table_doc_emb.json create mode 100644 test/samples/schema/table_label.json diff --git a/haystack/schema.py b/haystack/schema.py index 46b198021..ce192a849 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -185,7 +185,7 @@ class Document: if k == "content": # Convert pd.DataFrame to list of rows for serialization if self.content_type == "table" and isinstance(self.content, pd.DataFrame): - v = [self.content.columns.tolist()] + self.content.values.tolist() + v = dataframe_to_list(self.content) k = k if k not in inv_field_map else inv_field_map[k] _doc[k] = v return _doc @@ -232,7 +232,7 @@ class Document: # Convert list of rows to pd.DataFrame if _new_doc.get("content_type", None) == "table" and isinstance(_new_doc["content"], list): - _new_doc["content"] = pd.DataFrame(columns=_new_doc["content"][0], data=_new_doc["content"][1:]) + _new_doc["content"] = dataframe_from_list(_new_doc["content"]) return cls(**_new_doc) @@ -243,11 +243,14 @@ class Document: return json.dumps(dictionary, cls=NumpyEncoder) @classmethod - def from_json(cls, data: str, field_map: Optional[Dict[str, Any]] = None) -> Document: + def from_json(cls, data: Union[str, Dict[str, Any]], field_map: Optional[Dict[str, Any]] = None) -> Document: if not field_map: field_map = {} - dictionary = json.loads(data) - return cls.from_dict(dictionary, field_map=field_map) + if isinstance(data, str): + dict_data = json.loads(data) + else: + dict_data = data + return cls.from_dict(dict_data, field_map=field_map) def __eq__(self, other): content = getattr(other, "content", None) @@ -401,6 +404,10 @@ class Answer: if self.meta is None: self.meta = {} + # In case the 
context is a list of lists for a table document that is instantiated by from_json() or from_dict() + if isinstance(self.context, list): + self.context = dataframe_from_list(self.context) + def __lt__(self, other): """Enable sorting of Answers by score""" return self.score < other.score @@ -412,29 +419,30 @@ class Answer: return f" 50 else ''}'>" def __repr__(self): - return f"" + return f"" - def to_dict(self): - return asdict(self) + def to_dict(self) -> Dict: + return asdict(self, dict_factory=_dict_factory) @classmethod - def from_dict(cls, dict: dict): + def from_dict(cls, dict: Dict) -> Answer: # backwards compatibility: `document_id: Optional[str]` was changed to `document_ids: Optional[List[str]]` if "document_id" in dict: dict = dict.copy() document_id = dict.pop("document_id") dict["document_ids"] = [document_id] if document_id is not None else None - - return _pydantic_dataclass_from_dict(dict=dict, pydantic_dataclass_type=cls) + return cls(**dict) def to_json(self): - return json.dumps(self, default=pydantic_encoder) + return json.dumps(self.to_dict(), cls=NumpyEncoder) @classmethod - def from_json(cls, data): - if type(data) == str: - data = json.loads(data) - return cls.from_dict(data) + def from_json(cls, data: Union[str, Dict[str, Any]]): + if isinstance(data, str): + dict_data = json.loads(data) + else: + dict_data = data + return cls.from_dict(dict_data) @staticmethod def _from_dict_offsets(offsets): @@ -449,6 +457,23 @@ class Answer: converted_offsets.append(e) return converted_offsets + def __eq__(self, other): + context = getattr(other, "context", None) + if isinstance(context, pd.DataFrame): + is_content_equal = context.equals(self.context) + else: + is_content_equal = context == self.context + return ( + isinstance(other, self.__class__) + and is_content_equal + and getattr(other, "type", None) == self.type + and getattr(other, "score", None) == self.score + and getattr(other, "offsets_in_document", None) == self.offsets_in_document + and getattr(other, "offsets_in_context", None) == self.offsets_in_context + and getattr(other, "document_ids", None) == self.document_ids + and getattr(other, "meta", None) == self.meta + ) + @dataclass class Label: @@ -521,11 +546,7 @@ class Label: self.updated_at = updated_at self.query = query - if isinstance(answer, dict): - answer = Answer.from_dict(answer) self.answer = answer - if isinstance(document, dict): - document = Document.from_dict(document) self.document = document self.is_correct_answer = is_correct_answer @@ -549,25 +570,28 @@ class Label: return no_answer def to_dict(self): - return asdict(self) + return asdict(self, dict_factory=_dict_factory) @classmethod - def from_dict(cls, dict: dict): - # backward compatibility for old labels using answers with document_id instead of document_ids + def from_dict(cls, dict: Dict): answer = dict.get("answer") - if answer and "document_id" in answer: - dict = dict.copy() + if answer and isinstance(answer, Dict): dict["answer"] = Answer.from_dict(dict["answer"]) - return _pydantic_dataclass_from_dict(dict=dict, pydantic_dataclass_type=cls) + doc = dict.get("document") + if isinstance(doc, Dict): + dict["document"] = Document.from_dict(dict["document"]) + return cls(**dict) def to_json(self): - return json.dumps(self, default=pydantic_encoder) + return json.dumps(self.to_dict(), cls=NumpyEncoder) @classmethod - def from_json(cls, data): - if type(data) == str: - data = json.loads(data) - return cls.from_dict(data) + def from_json(cls, data: Union[str, Dict[str, Any]]): + if 
isinstance(data, str): + dict_data = json.loads(data) + else: + dict_data = data + return cls.from_dict(dict_data) # define __eq__ and __hash__ functions to deduplicate Label Objects def __eq__(self, other): @@ -732,7 +756,7 @@ class MultiLabel: return {k[1:] if k[0] == "_" else k: v for k, v in vars(self).items()} @classmethod - def from_dict(cls, dict: dict): + def from_dict(cls, dict: Dict): # exclude extra arguments return cls(**{k: v for k, v in dict.items() if k in inspect.signature(cls).parameters}) @@ -741,7 +765,7 @@ class MultiLabel: @classmethod def from_json(cls, data: Union[str, Dict[str, Any]]): - if type(data) == str: + if isinstance(data, str): dict_data = json.loads(data) else: dict_data = data @@ -758,7 +782,7 @@ class MultiLabel: return f"" -def _pydantic_dataclass_from_dict(dict: dict, pydantic_dataclass_type) -> Any: +def _pydantic_dataclass_from_dict(dict: Dict, pydantic_dataclass_type) -> Any: """ Constructs a pydantic dataclass from a dict incl. other nested dataclasses. This allows simple de-serialization of pydantic dataclasses from json. @@ -777,6 +801,21 @@ def _pydantic_dataclass_from_dict(dict: dict, pydantic_dataclass_type) -> Any: return dataclass_object +def _dict_factory(data): + """Meant to be as the dict_factory for `asdict`. This function is called within `asdict` to convert a list of tuples + into a dictionary object. This handles the conversion of pandas Dataframes into a list of lists. + + :param data: list of (key, value) pairs + """ + + def convert_value(v): + if isinstance(v, pd.DataFrame): + return dataframe_to_list(v) + return v + + return {k: convert_value(v) for k, v in data} + + class NumpyEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, np.ndarray): @@ -784,6 +823,14 @@ class NumpyEncoder(json.JSONEncoder): return json.JSONEncoder.default(self, obj) +def dataframe_to_list(df: pd.DataFrame) -> List[List]: + return [df.columns.tolist()] + df.values.tolist() + + +def dataframe_from_list(list_df: List[List]) -> pd.DataFrame: + return pd.DataFrame(columns=list_df[0], data=list_df[1:]) + + class EvaluationResult: def __init__(self, node_results: Optional[Dict[str, pd.DataFrame]] = None) -> None: """ diff --git a/test/others/test_schema.py b/test/others/test_schema.py index dd1553a2f..3d9ed484e 100644 --- a/test/others/test_schema.py +++ b/test/others/test_schema.py @@ -1,4 +1,6 @@ -from haystack.schema import Document, Label, Answer, Span, MultiLabel, TableCell +import json + +from haystack.schema import Document, Label, Answer, Span, MultiLabel, TableCell, _dict_factory import pytest import numpy as np import pandas as pd @@ -46,6 +48,74 @@ def text_labels(): ] +@pytest.fixture +def table_label(): + return Label( + query="some", + answer=Answer( + answer="text_2", + type="extractive", + score=0.1, + document_ids=["123"], + context=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]), + offsets_in_document=[TableCell(row=1, col=0)], + ), + document=Document( + content=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]), + content_type="table", + id="fe5cb68f8226776914781f6bd40ad718", + ), + is_correct_answer=True, + is_correct_document=True, + origin="user-feedback", + created_at="2023-05-02 11:43:56", + updated_at=None, + id="fbd79f71-d690-4b21-bd0a-1094292b9809", + ) + + +@pytest.fixture +def table_label_dict(): + return { + "id": "fbd79f71-d690-4b21-bd0a-1094292b9809", + "query": "some", + "document": { + "id": "fe5cb68f8226776914781f6bd40ad718", + 
"content": [["col1", "col2"], ["text_1", 1], ["text_2", 2]], + "content_type": "table", + "meta": {}, + "id_hash_keys": ["content"], + "score": None, + "embedding": None, + }, + "is_correct_answer": True, + "is_correct_document": True, + "origin": "user-feedback", + "answer": { + "answer": "text_2", + "type": "extractive", + "score": 0.1, + "context": [["col1", "col2"], ["text_1", 1], ["text_2", 2]], + "offsets_in_document": [{"row": 1, "col": 0}], + "offsets_in_context": None, + "document_ids": ["123"], + "meta": {}, + }, + "pipeline_id": None, + "created_at": "2023-05-02 11:43:56", + "updated_at": None, + "meta": {}, + "filters": None, + } + + +@pytest.fixture +def table_label_json(samples_path): + with open(samples_path / "schema" / "table_label.json") as f1: + data = json.load(f1) + return data + + @pytest.fixture def text_answer(): return Answer( @@ -59,6 +129,40 @@ def text_answer(): ) +@pytest.fixture +def table_answer(): + return Answer( + answer="text_2", + type="extractive", + score=0.1, + context=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]), + offsets_in_document=[TableCell(row=1, col=0)], + offsets_in_context=[TableCell(row=1, col=0)], + document_ids=["123"], + ) + + +@pytest.fixture +def table_answer_dict(): + return { + "answer": "text_2", + "type": "extractive", + "score": 0.1, + "context": [["col1", "col2"], ["text_1", 1], ["text_2", 2]], + "offsets_in_document": [{"row": 1, "col": 0}], + "offsets_in_context": [{"row": 1, "col": 0}], + "document_ids": ["123"], + "meta": {}, + } + + +@pytest.fixture +def table_answer_json(samples_path): + with open(samples_path / "schema" / "table_answer.json") as f1: + data = json.load(f1) + return data + + @pytest.fixture def table_doc(): data = { @@ -70,6 +174,31 @@ def table_doc(): return Document(content=pd.DataFrame(data), content_type="table", id="doc1") +@pytest.fixture +def table_doc_dict(): + return { + "content": [ + ["actors", "age", "number of movies", "date of birth"], + ["brad pitt", 58, 87, "18 december 1963"], + ["leonardo di caprio", 47, 53, "11 november 1974"], + ["george clooney", 60, 69, "6 may 1961"], + ], + "content_type": "table", + "score": None, + "meta": {}, + "id_hash_keys": ["content"], + "embedding": None, + "id": "doc1", + } + + +@pytest.fixture +def table_doc_json(samples_path): + with open(samples_path / "schema" / "table_doc.json") as f1: + json_str = f1.read() + return json_str + + @pytest.fixture def table_doc_with_embedding(): data = { @@ -79,71 +208,132 @@ def table_doc_with_embedding(): "date of birth": ["18 december 1963", "11 november 1974", "6 may 1961"], } return Document( - content=pd.DataFrame(data), content_type="table", id="doc2", embedding=np.random.rand(768).astype(np.float32) + content=pd.DataFrame(data), content_type="table", id="doc2", embedding=np.array([1.1, 2.2, 3.3, 4.4]) ) +@pytest.fixture +def table_doc_with_embedding_json(samples_path): + with open(samples_path / "schema" / "table_doc_emb.json") as f1: + json_str = f1.read() + return json_str + + +@pytest.mark.unit def test_no_answer_label(): - labels = [ - Label( - query="question", - answer=Answer(answer=""), - is_correct_answer=True, - is_correct_document=True, - document=Document(content="some", id="777"), - origin="gold-label", - ), - Label( - query="question", - answer=Answer(answer=""), - is_correct_answer=True, - is_correct_document=True, - document=Document(content="some", id="777"), - origin="gold-label", - ), - Label( - query="question", - answer=Answer(answer="some"), - 
is_correct_answer=True, - is_correct_document=True, - document=Document(content="some", id="777"), - origin="gold-label", - ), - Label( - query="question", - answer=Answer(answer="some"), - is_correct_answer=True, - is_correct_document=True, - document=Document(content="some", id="777"), - origin="gold-label", - ), - ] - - assert labels[0].no_answer == True - assert labels[1].no_answer == True - assert labels[2].no_answer == False - assert labels[3].no_answer == False + label_no_answer = Label( + query="question", + answer=Answer(answer=""), + is_correct_answer=True, + is_correct_document=True, + document=Document(content="some", id="777"), + origin="gold-label", + ) + label_with_answer = Label( + query="question", + answer=Answer(answer="some"), + is_correct_answer=True, + is_correct_document=True, + document=Document(content="some", id="777"), + origin="gold-label", + ) + assert label_no_answer.no_answer + assert not label_with_answer.no_answer +@pytest.mark.unit def test_equal_label(text_labels): assert text_labels[2] == text_labels[0] assert text_labels[1] != text_labels[0] +@pytest.mark.unit def test_label_to_json(text_labels): - j0 = text_labels[0].to_json() - l_new = Label.from_json(j0) - assert l_new == text_labels[0] - assert l_new.answer.offsets_in_document[0].start == 1 + text_label_json = text_labels[0].to_json() + text_label_from_json = Label.from_json(text_label_json) + assert text_label_from_json == text_labels[0] + assert text_label_from_json.answer.offsets_in_document[0].start == 1 +@pytest.mark.unit def test_label_to_dict(text_labels): - j0 = text_labels[0].to_dict() - l_new = Label.from_dict(j0) - assert l_new == text_labels[0] - assert l_new.answer.offsets_in_document[0].start == 1 + text_label_dict = text_labels[0].to_dict() + text_label_from_dict = Label.from_dict(text_label_dict) + assert text_label_from_dict == text_labels[0] + assert text_label_from_dict.answer.offsets_in_document[0].start == 1 +@pytest.mark.unit +def test_labels_with_identical_fields_are_equal(table_label): + table_label_copy = Label( + query="some", + answer=Answer( + answer="text_2", + type="extractive", + score=0.1, + document_ids=["123"], + context=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]), + offsets_in_document=[TableCell(row=1, col=0)], + ), + document=Document( + content=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]), + content_type="table", + ), + is_correct_answer=True, + is_correct_document=True, + origin="user-feedback", + ) + assert table_label == table_label_copy + + +@pytest.mark.unit +def test_labels_with_different_fields_are_not_equal(table_label): + table_label_different = Label( + query="some", + answer=Answer( + answer="text_1", + type="extractive", + score=0.1, + document_ids=["123"], + context=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]), + offsets_in_document=[TableCell(row=0, col=0)], + ), + document=Document( + content=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]), + content_type="table", + ), + is_correct_answer=True, + is_correct_document=True, + origin="user-feedback", + ) + assert table_label != table_label_different + + +@pytest.mark.unit +def test_table_label_from_json(table_label, table_label_json): + table_label_from_json = Label.from_json(table_label_json) + assert table_label_from_json == table_label + + +@pytest.mark.unit +def test_table_label_to_json(table_label, table_label_json): + 
table_label_to_json = json.loads(table_label.to_json()) + assert table_label_to_json == table_label_json + + +@pytest.mark.unit +def test_table_label_from_dict(table_label, table_label_dict): + table_label_from_dict = Label.from_dict(table_label_dict) + assert table_label_from_dict == table_label + + +@pytest.mark.unit +def test_table_label_to_dict(table_label, table_label_dict): + table_label_to_dict = table_label.to_dict() + assert table_label_to_dict == table_label_dict + + +@pytest.mark.unit def test_answer_to_json(text_answer): a = text_answer j = a.to_json() @@ -154,6 +344,7 @@ def test_answer_to_json(text_answer): assert a_new == a +@pytest.mark.unit def test_answer_to_dict(text_answer): a = text_answer j = a.to_dict() @@ -163,6 +354,29 @@ def test_answer_to_dict(text_answer): assert a_new == a +@pytest.mark.unit +def test_table_answer_to_json(table_answer, table_answer_json): + table_answer_to_json = json.loads(table_answer.to_json()) + assert table_answer_to_json == table_answer_json + + +@pytest.mark.unit +def test_table_answer_from_json(table_answer, table_answer_json): + table_answer_from_json = Answer.from_json(table_answer_json) + assert table_answer_from_json == table_answer + + +@pytest.mark.unit +def test_table_answer_to_dict(table_answer, table_answer_dict): + assert table_answer.to_dict() == table_answer_dict + + +@pytest.mark.unit +def test_table_answer_from_dict(table_answer, table_answer_dict): + assert table_answer == Answer.from_dict(table_answer_dict) + + +@pytest.mark.unit def test_document_from_dict(): doc = Document( content="this is the content of the document", meta={"some": "meta"}, id_hash_keys=["content", "meta"] @@ -170,13 +384,20 @@ def test_document_from_dict(): assert doc == Document.from_dict(doc.to_dict()) -def test_table_document_from_dict(table_doc): - assert table_doc == Document.from_dict(table_doc.to_dict()) +@pytest.mark.unit +def test_table_document_from_dict(table_doc, table_doc_dict): + assert table_doc == Document.from_dict(table_doc_dict) +@pytest.mark.unit +def test_table_document_to_dict(table_doc, table_doc_dict): + assert table_doc.to_dict() == table_doc_dict + + +@pytest.mark.unit def test_doc_to_json(): # With embedding - d = Document( + doc_with_embedding = Document( content="some text", content_type="text", id_hash_keys=["meta"], @@ -184,12 +405,12 @@ def test_doc_to_json(): meta={"name": "doc1"}, embedding=np.random.rand(768).astype(np.float32), ) - j0 = d.to_json() - d_new = Document.from_json(j0) - assert d == d_new + doc_emb_json = doc_with_embedding.to_json() + doc_emb_from_json = Document.from_json(doc_emb_json) + assert doc_with_embedding == doc_emb_from_json # No embedding - d = Document( + doc_with_no_embedding = Document( content="some text", content_type="text", score=0.99988, @@ -197,35 +418,48 @@ def test_doc_to_json(): id_hash_keys=["meta"], embedding=None, ) - j0 = d.to_json() - d_new = Document.from_json(j0) - assert d == d_new + doc_no_emb_json = doc_with_no_embedding.to_json() + doc_no_emb_from_json = Document.from_json(doc_no_emb_json) + assert doc_with_no_embedding == doc_no_emb_from_json -def test_table_doc_to_json(table_doc, table_doc_with_embedding): +@pytest.mark.unit +def test_table_doc_from_json(table_doc, table_doc_with_embedding, table_doc_json, table_doc_with_embedding_json): # With embedding - j0 = table_doc_with_embedding.to_json() - d_new = Document.from_json(j0) - assert table_doc_with_embedding == d_new + table_doc_emb_from_json = Document.from_json(table_doc_with_embedding_json) + assert 
table_doc_with_embedding == table_doc_emb_from_json # No embedding - j0 = table_doc.to_json() - d_new = Document.from_json(j0) - assert table_doc == d_new + table_doc_no_emb_from_json = Document.from_json(table_doc_json) + assert table_doc == table_doc_no_emb_from_json +@pytest.mark.unit +def test_table_doc_to_json(table_doc, table_doc_with_embedding, table_doc_json, table_doc_with_embedding_json): + # With embedding + table_doc_emb_to_json = json.loads(table_doc_with_embedding.to_json()) + assert json.loads(table_doc_with_embedding_json) == table_doc_emb_to_json + + # No embedding + table_doc_no_emb_to_json = json.loads(table_doc.to_json()) + assert json.loads(table_doc_json) == table_doc_no_emb_to_json + + +@pytest.mark.unit def test_answer_postinit(): a = Answer(answer="test", offsets_in_document=[{"start": 10, "end": 20}]) assert a.meta == {} assert isinstance(a.offsets_in_document[0], Span) +@pytest.mark.unit def test_table_answer_postinit(): - a = Answer(answer="test", offsets_in_document=[{"row": 1, "col": 2}]) - assert a.meta == {} - assert isinstance(a.offsets_in_document[0], TableCell) + table_answer = Answer(answer="test", offsets_in_document=[{"row": 1, "col": 2}]) + assert table_answer.meta == {} + assert isinstance(table_answer.offsets_in_document[0], TableCell) +@pytest.mark.unit def test_generate_doc_id_using_text(): text1 = "text1" text2 = "text2" @@ -237,6 +471,7 @@ def test_generate_doc_id_using_text(): assert doc1_text1.id != doc3_text2.id +@pytest.mark.unit def test_generate_doc_id_using_custom_list(): text1 = "text1" text2 = "text2" @@ -257,6 +492,7 @@ def test_generate_doc_id_using_custom_list(): _ = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["content", "non_existing_field"]) +@pytest.mark.unit def test_generate_doc_id_custom_list_meta(): text1 = "text1" text2 = "text2" @@ -280,6 +516,7 @@ def test_generate_doc_id_custom_list_meta(): assert doc1_text1.id != doc2_text2.id +@pytest.mark.unit def test_aggregate_labels_with_labels(): label1_with_filter1 = Label( query="question", @@ -314,6 +551,7 @@ def test_aggregate_labels_with_labels(): label = MultiLabel(labels=[label1_with_filter1, label3_with_filter2]) +@pytest.mark.unit def test_multilabel_preserve_order(): labels = [ Label( @@ -369,6 +607,7 @@ def test_multilabel_preserve_order(): assert multilabel.labels[i].id == str(i) +@pytest.mark.unit def test_multilabel_preserve_order_w_duplicates(): labels = [ Label( @@ -455,6 +694,7 @@ def test_multilabel_preserve_order_w_duplicates(): assert multilabel.labels[i].id == str(i) +@pytest.mark.unit def test_multilabel_id(): query1 = "question 1" query2 = "question 2" @@ -495,6 +735,7 @@ def test_multilabel_id(): assert MultiLabel(labels=[label3]).id == "531445fa3bdf98b8598a3bea032bd605" +@pytest.mark.unit def test_multilabel_with_doc_containing_dataframes(): table = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) table_doc = Document(content=table, content_type="table", id="table1") @@ -521,6 +762,7 @@ def test_multilabel_with_doc_containing_dataframes(): assert multilabel.offsets_in_contexts == [{"row": 0, "col": 0}] +@pytest.mark.unit def test_multilabel_serialization(): label_dict = { "id": "011079cf-c93f-49e6-83bb-42cd850dce12", @@ -566,23 +808,27 @@ def test_multilabel_serialization(): assert json_deserialized_multilabel.labels[0] == label +@pytest.mark.unit def test_span_in(): assert 10 in Span(5, 15) - assert not 20 in Span(1, 15) + assert 20 not in Span(1, 15) +@pytest.mark.unit def test_span_in_edges(): assert 5 in Span(5, 15) - assert not 15 in 
Span(5, 15) + assert 15 not in Span(5, 15) +@pytest.mark.unit def test_span_in_other_values(): assert 10.0 in Span(5, 15) assert "10" in Span(5, 15) with pytest.raises(ValueError): - "hello" in Span(5, 15) + assert "hello" in Span(5, 15) +@pytest.mark.unit def test_assert_span_vs_span(): assert Span(10, 11) in Span(5, 15) assert Span(5, 10) in Span(5, 15) @@ -595,6 +841,7 @@ def test_assert_span_vs_span(): assert not Span(10, 20) in Span(5, 15) +@pytest.mark.unit def test_id_hash_keys_not_ignored(): # Test that two documents with the same content but different metadata get assigned different ids if and only if # id_hash_keys is set to 'meta' @@ -606,6 +853,7 @@ def test_id_hash_keys_not_ignored(): assert doc3.id == doc4.id +@pytest.mark.unit def test_legacy_answer_document_id(): legacy_label = { "id": "123", @@ -642,6 +890,7 @@ def test_legacy_answer_document_id(): assert label.answer.document_ids == ["fc18c987a8312e72a47fb1524f230bb0"] +@pytest.mark.unit def test_legacy_answer_document_id_is_none(): legacy_label = { "id": "123", @@ -676,3 +925,16 @@ def test_legacy_answer_document_id_is_none(): label = Label.from_dict(legacy_label) assert label.answer.document_ids is None + + +@pytest.mark.unit +def test_dict_factory(): + data = [ + ("key1", "some_value"), + ("key2", ["val1", "val2"]), + ("key3", pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})), + ] + result = _dict_factory(data) + assert result["key1"] == "some_value" + assert result["key2"] == ["val1", "val2"] + assert result["key3"] == [["col1", "col2"], [1, 3], [2, 4]] diff --git a/test/samples/schema/table_answer.json b/test/samples/schema/table_answer.json new file mode 100644 index 000000000..2f3065300 --- /dev/null +++ b/test/samples/schema/table_answer.json @@ -0,0 +1,35 @@ +{ + "answer": "text_2", + "type": "extractive", + "score": 0.1, + "context": [ + [ + "col1", + "col2" + ], + [ + "text_1", + 1 + ], + [ + "text_2", + 2 + ] + ], + "offsets_in_document": [ + { + "row": 1, + "col": 0 + } + ], + "offsets_in_context": [ + { + "row": 1, + "col": 0 + } + ], + "document_ids": [ + "123" + ], + "meta": {} +} diff --git a/test/samples/schema/table_doc.json b/test/samples/schema/table_doc.json new file mode 100644 index 000000000..0662847dc --- /dev/null +++ b/test/samples/schema/table_doc.json @@ -0,0 +1,36 @@ +{ + "content": [ + [ + "actors", + "age", + "number of movies", + "date of birth" + ], + [ + "brad pitt", + 58, + 87, + "18 december 1963" + ], + [ + "leonardo di caprio", + 47, + 53, + "11 november 1974" + ], + [ + "george clooney", + 60, + 69, + "6 may 1961" + ] + ], + "content_type": "table", + "score": null, + "meta": {}, + "id_hash_keys": [ + "content" + ], + "embedding": null, + "id": "doc1" +} diff --git a/test/samples/schema/table_doc_emb.json b/test/samples/schema/table_doc_emb.json new file mode 100644 index 000000000..4ee0875e5 --- /dev/null +++ b/test/samples/schema/table_doc_emb.json @@ -0,0 +1,41 @@ +{ + "content": [ + [ + "actors", + "age", + "number of movies", + "date of birth" + ], + [ + "brad pitt", + 58, + 87, + "18 december 1963" + ], + [ + "leonardo di caprio", + 47, + 53, + "11 november 1974" + ], + [ + "george clooney", + 60, + 69, + "6 may 1961" + ] + ], + "content_type": "table", + "score": null, + "meta": {}, + "id_hash_keys": [ + "content" + ], + "embedding": [ + 1.1, + 2.2, + 3.3, + 4.4 + ], + "id": "doc2" +} diff --git a/test/samples/schema/table_label.json b/test/samples/schema/table_label.json new file mode 100644 index 000000000..38cb0eb99 --- /dev/null +++ b/test/samples/schema/table_label.json 
@@ -0,0 +1,66 @@ +{ + "id": "fbd79f71-d690-4b21-bd0a-1094292b9809", + "query": "some", + "document": { + "id": "fe5cb68f8226776914781f6bd40ad718", + "content": [ + [ + "col1", + "col2" + ], + [ + "text_1", + 1 + ], + [ + "text_2", + 2 + ] + ], + "content_type": "table", + "meta": {}, + "id_hash_keys": [ + "content" + ], + "score": null, + "embedding": null + }, + "is_correct_answer": true, + "is_correct_document": true, + "origin": "user-feedback", + "answer": { + "answer": "text_2", + "type": "extractive", + "score": 0.1, + "context": [ + [ + "col1", + "col2" + ], + [ + "text_1", + 1 + ], + [ + "text_2", + 2 + ] + ], + "offsets_in_document": [ + { + "row": 1, + "col": 0 + } + ], + "offsets_in_context": null, + "document_ids": [ + "123" + ], + "meta": {} + }, + "pipeline_id": null, + "created_at": "2023-05-02 11:43:56", + "updated_at": null, + "meta": {}, + "filters": null +}
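
Usage note (not part of the patch): the sketch below illustrates the round-trip serialization this PR enables for table Documents, Answers, and Labels, mirroring the table_doc/table_answer/table_label fixtures and tests added above. It assumes a Haystack build that already includes these schema changes and that pandas is installed; the variable names (`table`, `doc`, `answer`, `label`) are illustrative only.

```python
# Minimal sketch of the DataFrame-aware serialization added in this patch,
# based on the fixtures and tests in test/others/test_schema.py.
import json

import pandas as pd

from haystack.schema import Answer, Document, Label, TableCell

table = pd.DataFrame({"col1": ["text_1", "text_2"], "col2": [1, 2]})

# Documents: to_dict()/to_json() store the DataFrame as a list of rows
# (header row first); from_dict()/from_json() rebuild the DataFrame.
doc = Document(content=table, content_type="table", id="doc1")
assert json.loads(doc.to_json())["content"] == [["col1", "col2"], ["text_1", 1], ["text_2", 2]]
assert Document.from_json(doc.to_json()) == doc

# Answers: the table context is converted to a list of rows by _dict_factory,
# and __post_init__ turns it back into a DataFrame on deserialization.
answer = Answer(
    answer="text_2",
    type="extractive",
    score=0.1,
    context=table,
    offsets_in_document=[TableCell(row=1, col=0)],
    document_ids=["123"],
)
assert Answer.from_json(answer.to_json()) == answer

# Labels: nested table Answer and Document survive the round trip as well.
label = Label(
    query="some",
    answer=answer,
    document=doc,
    is_correct_answer=True,
    is_correct_document=True,
    origin="user-feedback",
)
assert Label.from_dict(label.to_dict()) == label
assert Label.from_json(label.to_json()) == label
```

The list-of-rows layout (header first, then data rows) is the same one stored in the sample files added under test/samples/schema/.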