fix: Document v2 JSON serialization (#4863)

* fix json serialization
* add missing markers
* pylint
* fix decoder bug
* pylint
* add some more tests
* linting & windows
* windows
* windows
* windows paths again
2026-01-09 05:37:25 +00:00 · 2023-05-15 11:39:04 +02:00 · 2023-05-15 11:39:04 +02:00 · 8fbfca9ebb
commit 8fbfca9ebb
parent bffe2d8c19
2 changed files with 396 additions and 12 deletions
--- a/haystack/preview/dataclasses/document.py
+++ b/haystack/preview/dataclasses/document.py
@ -1,4 +1,4 @@
-from typing import List, Any, Dict, Literal, Optional
+from typing import List, Any, Dict, Literal, Optional, Type

+import io
 import json
 import hashlib
@ -34,10 +34,10 @@ def _create_id(
    """
    Creates a hash of the content given that acts as the document's ID.
    """
+    if not metadata:
+        metadata = {}
    content_to_hash = f"{classname}:{content}"
    if id_hash_keys:
-        if not metadata:
-            raise ValueError("If 'id_hash_keys' is provided, you must provide 'metadata' too.")
        content_to_hash = ":".join([content_to_hash, *[str(metadata.get(key, "")) for key in id_hash_keys]])
    return hashlib.sha256(str(content_to_hash).encode("utf-8")).hexdigest()

@ -61,6 +61,47 @@ def _safe_equals(obj_1, obj_2) -> bool:
    return obj_1 == obj_2


+class DocumentEncoder(json.JSONEncoder):
+    """
+    JSON encoder for the values that `json` cannot serialize on its own.
+
+    Numpy arrays become lists, pandas dataframes become their JSON string and paths become absolute path strings.
+    Any other unsupported object is replaced by its string form.
+    """
+
+    def default(self, obj):
+        """
+        Returns a JSON-serializable replacement for `obj`.
+        """
+        if isinstance(obj, Path):
+            return str(obj.absolute())
+        if isinstance(obj, pandas.DataFrame):
+            return obj.to_json()
+        if isinstance(obj, numpy.ndarray):
+            return obj.tolist()
+        try:
+            encoded = super().default(obj)
+        except TypeError:
+            # Unknown type: store its string form instead of failing the whole serialization
+            encoded = str(obj)
+        return encoded
+
+class DocumentDecoder(json.JSONDecoder):
+    """
+    Decodes more exotic datatypes like pandas dataframes or file paths.
+
+    The decoding hook is applied to every JSON object of the payload, nested ones included, and converts:
+
+    - `content` into a `pandas.DataFrame` when `content_type` is `"table"`,
+    - `content` into a `Path` when `content_type` is `"image"` or `"audio"`,
+    - a non-empty `embedding` into a `numpy.ndarray`.
+    """
+
+    def __init__(self, *_, object_hook=None, **kwargs):
+        """
+        :param object_hook: optional hook to use in place of `document_decoder`.
+        :param kwargs: any other `json.JSONDecoder` option (`parse_float`, `strict`, ...), forwarded unchanged.
+        """
+        # Positional arguments are ignored: json.JSONDecoder accepts keyword arguments only.
+        super().__init__(object_hook=object_hook or self.document_decoder, **kwargs)
+
+    def document_decoder(self, dictionary):
+        """
+        Restores the Python types of `content` and `embedding` in a decoded JSON object.
+
+        :param dictionary: the decoded JSON object. It is modified in place.
+        :return: the same dictionary, with the recognized fields converted.
+        """
+        # Decode content types. Nested objects (for example inside the metadata) can carry a "content_type" key
+        # without any content: they are left untouched instead of failing on a missing value.
+        content = dictionary.get("content")
+        if content is not None:
+            content_type = dictionary.get("content_type")
+            if content_type == "table":
+                # Recent pandas versions deprecate passing a literal JSON string to read_json
+                dictionary["content"] = pandas.read_json(io.StringIO(content))
+            elif content_type in ("image", "audio"):
+                # Both content types are serialized as path strings by the encoder
+                dictionary["content"] = Path(content)
+
+        # Decode embeddings
+        if dictionary.get("embedding"):
+            dictionary["embedding"] = numpy.array(dictionary["embedding"])
+
+        return dictionary
+
+
@dataclass(frozen=True)
 class Document:
    """
@ -106,6 +147,11 @@ class Document:
        content.
        """
        # Validate content_type
+        if self.content_type not in PYTHON_TYPES_FOR_CONTENT:
+            raise ValueError(
+                f"Content type unknown: '{self.content_type}'. "
+                f"Choose among: {', '.join(PYTHON_TYPES_FOR_CONTENT.keys())}"
+            )
        if not isinstance(self.content, PYTHON_TYPES_FOR_CONTENT[self.content_type]):
            raise ValueError(
                f"The type of content ({type(self.content)}) does not match the "
@ -139,11 +185,11 @@ class Document:
        """
        return asdict(self)

-    def to_json(self, **json_kwargs):
+    def to_json(self, json_encoder: Optional[Type[DocumentEncoder]] = None, **json_kwargs):
        """
        Saves the Document into a JSON string that can be later loaded back.
        """
-        return json.dumps(self.to_dict(), **json_kwargs)
+        return json.dumps(self.to_dict(), cls=json_encoder or DocumentEncoder, **json_kwargs)

    @classmethod
    def from_dict(cls, dictionary):
@ -153,11 +199,11 @@ class Document:
        return cls(**dictionary)

    @classmethod
-    def from_json(cls, data, **json_kwargs):
+    def from_json(cls, data, json_decoder: Optional[Type[DocumentDecoder]] = None, **json_kwargs):
        """
        Creates a new Document object from a JSON string.
        """
-        dictionary = json.loads(data, **json_kwargs)
+        dictionary = json.loads(data, cls=json_decoder or DocumentDecoder, **json_kwargs)
        return cls.from_dict(dictionary=dictionary)

    def flatten(self) -> Dict[str, Any]:
--- a/test/preview/dataclasses/test_document.py
+++ b/test/preview/dataclasses/test_document.py
@ -1,12 +1,13 @@
 from pathlib import Path
 import dataclasses
+import textwrap

 import pytest
 import pandas as pd
 import numpy as np

 from haystack.preview import Document
-from haystack.preview.dataclasses.document import _create_id
+from haystack.preview.dataclasses.document import _create_id, DocumentEncoder, DocumentDecoder


@pytest.mark.unit
@ -16,6 +17,58 @@ def test_document_is_immutable():
        doc.content = "won't work"


+@pytest.mark.unit
+def test_content_type_unknown():
+    """
+    A content type outside the supported ones is rejected when the Document is created.
+    """
+    expected_error = "Content type unknown: 'other'"
+    with pytest.raises(ValueError, match=expected_error):
+        Document(content="test content", content_type="other")
+
+
+@pytest.mark.unit
+def test_content_type_must_match_text():
+    """
+    String content is accepted with no content type or with "text", and rejected by table, image and audio.
+    """
+    Document(content="test content")
+    Document(content="test content", content_type="text")
+
+    for mismatching_type in ("table", "image", "audio"):
+        with pytest.raises(ValueError, match="does not match the content type"):
+            Document(content="test content", content_type=mismatching_type)
+
+
+@pytest.mark.unit
+def test_content_type_must_match_table():
+    """
+    Dataframe content is rejected by the text, image and audio content types.
+    """
+    for mismatching_type in ("text", "image", "audio"):
+        table = pd.DataFrame([1, 2])
+        with pytest.raises(ValueError, match="does not match the content type"):
+            Document(content=table, content_type=mismatching_type)
+
+
+@pytest.mark.unit
+def test_content_type_must_match_image_audio():
+    """
+    Path content is accepted by the image and audio content types and rejected by text and table.
+    """
+    this_file = Path(__file__)
+
+    # Mimetypes are not checked - yet
+    for accepted_type in ("image", "audio"):
+        Document(content=this_file, content_type=accepted_type)
+
+    for mismatching_type in ("text", "table"):
+        with pytest.raises(ValueError, match="does not match the content type"):
+            Document(content=this_file, content_type=mismatching_type)
+
+
+@pytest.mark.unit
+def test_id_hash_keys_require_metadata():
+    """
+    Using an `id_hash_keys` entry that is not in the Document's metadata raises a ValueError.
+    """
+    expected_error = "must be present in the metadata of the Document if you want to use it"
+    with pytest.raises(ValueError, match=expected_error):
+        Document(content="test content", id_hash_keys=["something"])
+
+
@pytest.mark.unit
 def test_init_document_same_meta_as_main_fields():
    """
@ -26,10 +79,9 @@ def test_init_document_same_meta_as_main_fields():


@pytest.mark.unit
-def test_simple_text_document_equality():
-    doc1 = Document(content="test content")
-    doc2 = Document(content="test content")
-    assert doc1 == doc2
+def test_basic_equality_type_mismatch():
+    doc = Document(content="test content")
+    assert doc != "test content"


@pytest.mark.unit
@ -74,6 +126,13 @@ def test_equality_with_simple_metadata():
    assert doc1 == doc2


+@pytest.mark.unit
+def test_equality_with_different_metadata():
+    """
+    Documents with the same content but a different set of metadata keys are not equal.
+    """
+    fewer_keys = Document(content="test content", metadata={"value": 1})
+    more_keys = Document(content="test content", metadata={"value": 1, "another": "value"})
+    assert fewer_keys != more_keys
+
+
@pytest.mark.unit
 def test_equality_with_nested_metadata():
    doc1 = Document(content="test content", metadata={"value": {"another": "value"}})
@ -248,6 +307,282 @@ def test_document_with_most_attributes_from_dict():
    )


+@pytest.mark.unit
+def test_default_text_document_to_json():
+    """
+    A text Document built with default values serializes to the expected 4-space indented JSON string.
+    """
+    doc_id = _create_id(classname=Document.__name__, content="test content")
+    doc_1 = Document(content="test content").to_json(indent=4).strip()
+    doc_2 = textwrap.dedent(
+        """    {
+        "id": \""""
+        + doc_id
+        + """\",
+        "content": "test content",
+        "content_type": "text",
+        "metadata": {},
+        "id_hash_keys": [],
+        "score": null,
+        "embedding": null
+    }"""
+    ).strip()
+    assert doc_1 == doc_2
+
+
+@pytest.mark.unit
+def test_default_text_document_from_json():
+    """
+    Loading the JSON of a default text Document gives a Document equal to one built directly.
+    """
+    doc_id = _create_id(classname=Document.__name__, content="test content")
+    doc_1 = Document(content="test content")
+    doc_2 = Document.from_json(
+        """{"id": \""""
+        + doc_id
+        + """\",
+        "content": "test content",
+        "content_type": "text",
+        "metadata": {},
+        "id_hash_keys": [],
+        "score": null,
+        "embedding": null
+    }"""
+    )
+    assert doc_1 == doc_2
+
+
+@pytest.mark.unit
+def test_default_table_document_to_json():
+    """
+    A table Document serializes its dataframe as an escaped JSON string inside the "content" field.
+    """
+    df = pd.DataFrame([1, 2])
+    doc_id = _create_id(classname=Document.__name__, content=df)
+    doc_1 = Document(content=df, content_type="table").to_json(indent=4).strip()
+    doc_2 = textwrap.dedent(
+        """    {
+        "id": \""""
+        + doc_id
+        + """\",
+        "content": \""""
+        + df.to_json().replace('"', '\\"')
+        + """\",
+        "content_type": "table",
+        "metadata": {},
+        "id_hash_keys": [],
+        "score": null,
+        "embedding": null
+    }"""
+    ).strip()
+    assert doc_1 == doc_2
+
+
+@pytest.mark.unit
+def test_default_table_document_from_json():
+    """
+    Loading the JSON of a table Document gives a Document equal to one built from the same dataframe.
+    """
+    df = pd.DataFrame([1, 2])
+    doc_id = _create_id(classname=Document.__name__, content=pd.DataFrame([1, 2]))
+    doc_1 = Document(content=df, content_type="table")
+    doc_2 = Document.from_json(
+        """{
+        "id": \""""
+        + doc_id
+        + """\",
+        "content": \""""
+        + pd.DataFrame([1, 2]).to_json().replace('"', '\\"')
+        + """\",
+        "content_type": "table",
+        "metadata": {},
+        "id_hash_keys": [],
+        "score": null,
+        "embedding": null
+    }"""
+    )
+    assert doc_1 == doc_2
+
+
+@pytest.mark.unit
+def test_default_image_document_to_json():
+    """
+    An image Document serializes its path as an absolute path string in the "content" field.
+    """
+    path = Path(__file__).parent / "test_files" / "apple.jpg"
+    doc_id = _create_id(classname=Document.__name__, content=path)
+    doc_1 = Document(content=path, content_type="image").to_json(indent=4).strip()
+    doc_2 = textwrap.dedent(
+        """    {
+        "id": \""""
+        + doc_id
+        + """\",
+        "content": \""""
+        # Backslashes (Windows paths) are escaped in the JSON output
+        + str(path.absolute()).replace("\\", "\\\\")
+        + """\",
+        "content_type": "image",
+        "metadata": {},
+        "id_hash_keys": [],
+        "score": null,
+        "embedding": null
+    }"""
+    ).strip()
+    assert doc_1 == doc_2
+
+
+@pytest.mark.unit
+def test_default_image_document_from_json():
+    """
+    Loading the JSON of an image Document gives a Document equal to one built from the same path.
+    """
+    path = Path(__file__).parent / "test_files" / "apple.jpg"
+    doc_id = _create_id(classname=Document.__name__, content=path)
+    doc_1 = Document(content=path, content_type="image")
+    doc_2 = Document.from_json(
+        """{"id": \""""
+        + doc_id
+        + """\",
+        "content": \""""
+        # Backslashes (Windows paths) must be escaped to form valid JSON
+        + str(path.absolute()).replace("\\", "\\\\")
+        + """\",
+        "content_type": "image",
+        "metadata": {},
+        "id_hash_keys": [],
+        "score": null,
+        "embedding": null
+    }"""
+    )
+    assert doc_1 == doc_2
+
+
+@pytest.mark.unit
+def test_full_document_to_json(tmp_path):
+    """
+    Serializes a Document whose metadata holds an arbitrary object and a path, plus a numpy embedding.
+
+    The expected JSON shows the object as the string returned by its `__repr__`, the path as an absolute
+    path string and the embedding as a list of numbers.
+    """
+    class TestClass:
+        def __repr__(self):
+            return "<the object>"
+
+    doc_id = _create_id(classname=Document.__name__, content="test content")
+    doc_1 = Document(
+        content="test content",
+        metadata={"some object": TestClass(), "a path": tmp_path / "test.txt"},
+        embedding=np.array([1, 2, 3, 4]),
+    )
+
+    doc_json = doc_1.to_json(indent=4).strip()
+    doc_2 = textwrap.dedent(
+        """    {
+        "id": \""""
+        + doc_id
+        + """\",
+        "content": "test content",
+        "content_type": "text",
+        "metadata": {
+            "some object": "<the object>",
+            "a path": \""""
+        + str((tmp_path / "test.txt").absolute()).replace("\\", "\\\\")
+        + """\"
+        },
+        "id_hash_keys": [],
+        "score": null,
+        "embedding": [
+            1,
+            2,
+            3,
+            4
+        ]
+    }"""
+    ).strip()
+    assert doc_json == doc_2
+
+
+@pytest.mark.unit
+def test_full_document_from_json(tmp_path):
+    """
+    Loads a Document with metadata and an embedding from JSON and compares it to one built directly.
+
+    The reference Document keeps the metadata path as a string and the embedding as a numpy array.
+    """
+    doc_id = _create_id(classname=Document.__name__, content="test content")
+    doc_1 = Document(
+        content="test content",
+        metadata={"a path": str(tmp_path / "test.txt")},  # Paths are not cast back to Path objects
+        embedding=np.array([1, 2, 3, 4]),
+    )
+    doc_2 = Document.from_json(
+        """{"id": \""""
+        + doc_id
+        + """\",
+        "content": "test content",
+        "content_type": "text",
+        "metadata": {
+            "a path": \""""
+        + str((tmp_path / "test.txt").absolute()).replace("\\", "\\\\")
+        + """\"
+        },
+        "id_hash_keys": [],
+        "score": null,
+        "embedding": [
+            1,
+            2,
+            3,
+            4
+        ]
+    }"""
+    )
+    assert doc_1 == doc_2
+
+
+@pytest.mark.unit
+def test_to_json_custom_encoder(tmp_path):
+    """
+    An encoder class passed as `json_encoder` is used by `to_json` to serialize the Document.
+    """
+    class SerializableTestClass:
+        ...
+
+    class TestEncoder(DocumentEncoder):
+        def default(self, obj):
+            # Handle the custom type here, delegate everything else to DocumentEncoder
+            if isinstance(obj, SerializableTestClass):
+                return "<<CUSTOM ENCODING>>"
+            return DocumentEncoder.default(self, obj)
+
+    doc_id = _create_id(classname=Document.__name__, content="test content")
+    doc = Document(content="test content", metadata={"some object": SerializableTestClass()})
+    doc_json = doc.to_json(indent=4, json_encoder=TestEncoder).strip()
+
+    assert (
+        doc_json
+        == textwrap.dedent(
+            """    {
+        "id": \""""
+            + doc_id
+            + """\",
+        "content": "test content",
+        "content_type": "text",
+        "metadata": {
+            "some object": "<<CUSTOM ENCODING>>"
+        },
+        "id_hash_keys": [],
+        "score": null,
+        "embedding": null
+    }"""
+        ).strip()
+    )
+
+
+@pytest.mark.unit
+def test_from_json_custom_decoder():
+    """
+    A decoder class passed as `json_decoder` is used by `from_json` to rebuild the Document.
+    """
+    class TestClass:
+        # All instances compare equal, so the decoded Document can be compared to the reference one
+        def __eq__(self, other):
+            return type(self) == type(other)
+
+    class TestDecoder(DocumentDecoder):
+        def __init__(self, *args, **kwargs):
+            # Replace the default decoding hook with the custom one defined below
+            super().__init__(object_hook=self.object_hook)
+
+        def object_hook(self, dictionary):
+            if "metadata" in dictionary:
+                for key, value in dictionary["metadata"].items():
+                    if value == "<<CUSTOM ENCODING>>":
+                        dictionary["metadata"][key] = TestClass()
+            return dictionary
+
+    doc_id = _create_id(classname=Document.__name__, content="test content")
+    doc = Document(content="test content", metadata={"some object": TestClass()})
+
+    assert doc == Document.from_json(
+        """    {
+        "id": \""""
+        + doc_id
+        + """\",
+        "content": "test content",
+        "content_type": "text",
+        "metadata": {
+            "some object": "<<CUSTOM ENCODING>>"
+        },
+        "id_hash_keys": [],
+        "score": null,
+        "embedding": null
+    }""",
+        json_decoder=TestDecoder,
+    )
+
+
+@pytest.mark.unit
 def test_flatten_text_document_no_meta():
    assert Document(content="test content").flatten() == {
        "id": _create_id(classname=Document.__name__, content="test content"),
@ -259,6 +594,7 @@ def test_flatten_text_document_no_meta():
    }


+@pytest.mark.unit
 def test_flatten_text_document():
    assert Document(content="test content", metadata={"name": "document name", "page": 123}).flatten() == {
        "id": _create_id(classname=Document.__name__, content="test content"),
@ -272,6 +608,7 @@ def test_flatten_text_document():
    }


+@pytest.mark.unit
 def test_flatten_table_document():
    df = pd.DataFrame([1, 2])
    flat = Document(content=df, content_type="table", metadata={"table-name": "table title", "section": 3}).flatten()
@ -289,6 +626,7 @@ def test_flatten_table_document():
    }


+@pytest.mark.unit
 def test_flatten_image_document():
    path = Path(__file__).parent / "test_files" / "apple.jpg"
    assert Document(