mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-09 05:37:25 +00:00
fix: Document v2 JSON serialization (#4863)
* fix json serialization * add missing markers * pylint * fix decoder bug * pylint * add some more tests * linting & windows * windows * windows * windows paths again
This commit is contained in:
parent
bffe2d8c19
commit
8fbfca9ebb
@ -1,4 +1,4 @@
|
||||
from typing import List, Any, Dict, Literal, Optional
|
||||
from typing import List, Any, Dict, Literal, Optional, Type
|
||||
|
||||
import json
|
||||
import hashlib
|
||||
@ -34,10 +34,10 @@ def _create_id(
|
||||
"""
|
||||
Creates a hash of the content given that acts as the document's ID.
|
||||
"""
|
||||
if not metadata:
|
||||
metadata = {}
|
||||
content_to_hash = f"{classname}:{content}"
|
||||
if id_hash_keys:
|
||||
if not metadata:
|
||||
raise ValueError("If 'id_hash_keys' is provided, you must provide 'metadata' too.")
|
||||
content_to_hash = ":".join([content_to_hash, *[str(metadata.get(key, "")) for key in id_hash_keys]])
|
||||
return hashlib.sha256(str(content_to_hash).encode("utf-8")).hexdigest()
|
||||
|
||||
@ -61,6 +61,47 @@ def _safe_equals(obj_1, obj_2) -> bool:
|
||||
return obj_1 == obj_2
|
||||
|
||||
|
||||
class DocumentEncoder(json.JSONEncoder):
    """
    JSON encoder for the non-primitive values a Document can carry.

    numpy arrays are emitted as lists, pandas dataframes as their own JSON string,
    paths as absolute path strings. Anything else the stock encoder rejects is
    serialized through `str()`.
    """

    def default(self, obj):
        """
        Returns a JSON-serializable stand-in for `obj`.

        Called by the json machinery only for objects it cannot serialize natively.
        """
        if isinstance(obj, Path):
            return str(obj.absolute())
        if isinstance(obj, pandas.DataFrame):
            return obj.to_json()
        if isinstance(obj, numpy.ndarray):
            return obj.tolist()

        # Last resort: let the base class try, then fall back to the string form.
        try:
            fallback = super().default(obj)
        except TypeError:
            fallback = str(obj)
        return fallback
|
||||
|
||||
|
||||
class DocumentDecoder(json.JSONDecoder):
    """
    Decodes more exotic datatypes like pandas dataframes or file paths.

    Table content is read back into a pandas dataframe, image and audio content
    into a `Path`, and non-empty embeddings into a numpy array.
    """

    # Content types whose serialized content is a filesystem path.
    _PATH_CONTENT_TYPES = ("image", "audio")

    def __init__(self, *_, object_hook=None, **kwargs):
        """
        :param object_hook: optional replacement for `document_decoder`.
        :param kwargs: any other `json.JSONDecoder` option (`parse_float`, `strict`, ...).
        """
        # json.JSONDecoder is keyword-only, so positional arguments are still ignored.
        # Keyword options are forwarded rather than silently discarded.
        super().__init__(object_hook=object_hook or self.document_decoder, **kwargs)

    def document_decoder(self, dictionary):
        """
        Object hook converting serialized content and embeddings back to rich types.

        The json module calls this for every decoded JSON object, nested ones
        included, so it must leave dictionaries it does not understand untouched.

        :param dictionary: the freshly decoded JSON object.
        :return: the same dictionary, with `content` and `embedding` converted in place.
        """
        # Imported locally to keep this block self-contained.
        from io import StringIO

        # Decode content types. Only string content is converted: a nested dict that
        # merely has a "content_type" key (e.g. inside metadata) must not crash.
        content = dictionary.get("content")
        if isinstance(content, str):
            content_type = dictionary.get("content_type")
            if content_type == "table":
                # Passing a literal JSON string to read_json is deprecated; wrap it.
                dictionary["content"] = pandas.read_json(StringIO(content))
            elif content_type in self._PATH_CONTENT_TYPES:
                dictionary["content"] = Path(content)

        # Decode embeddings (missing, null or empty values are left as they are)
        if dictionary.get("embedding"):
            dictionary["embedding"] = numpy.array(dictionary["embedding"])

        return dictionary
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class Document:
|
||||
"""
|
||||
@ -106,6 +147,11 @@ class Document:
|
||||
content.
|
||||
"""
|
||||
# Validate content_type
|
||||
if self.content_type not in PYTHON_TYPES_FOR_CONTENT:
|
||||
raise ValueError(
|
||||
f"Content type unknown: '{self.content_type}'. "
|
||||
f"Choose among: {', '.join(PYTHON_TYPES_FOR_CONTENT.keys())}"
|
||||
)
|
||||
if not isinstance(self.content, PYTHON_TYPES_FOR_CONTENT[self.content_type]):
|
||||
raise ValueError(
|
||||
f"The type of content ({type(self.content)}) does not match the "
|
||||
@ -139,11 +185,11 @@ class Document:
|
||||
"""
|
||||
return asdict(self)
|
||||
|
||||
def to_json(self, **json_kwargs):
|
||||
def to_json(self, json_encoder: Optional[Type[DocumentEncoder]] = None, **json_kwargs):
|
||||
"""
|
||||
Saves the Document into a JSON string that can be later loaded back.
|
||||
"""
|
||||
return json.dumps(self.to_dict(), **json_kwargs)
|
||||
return json.dumps(self.to_dict(), cls=json_encoder or DocumentEncoder, **json_kwargs)
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, dictionary):
|
||||
@ -153,11 +199,11 @@ class Document:
|
||||
return cls(**dictionary)
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, data, **json_kwargs):
|
||||
def from_json(cls, data, json_decoder: Optional[Type[DocumentDecoder]] = None, **json_kwargs):
|
||||
"""
|
||||
Creates a new Document object from a JSON string.
|
||||
"""
|
||||
dictionary = json.loads(data, **json_kwargs)
|
||||
dictionary = json.loads(data, cls=json_decoder or DocumentDecoder, **json_kwargs)
|
||||
return cls.from_dict(dictionary=dictionary)
|
||||
|
||||
def flatten(self) -> Dict[str, Any]:
|
||||
|
||||
@ -1,12 +1,13 @@
|
||||
from pathlib import Path
|
||||
import dataclasses
|
||||
import textwrap
|
||||
|
||||
import pytest
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
from haystack.preview import Document
|
||||
from haystack.preview.dataclasses.document import _create_id
|
||||
from haystack.preview.dataclasses.document import _create_id, DocumentEncoder, DocumentDecoder
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
@ -16,6 +17,58 @@ def test_document_is_immutable():
|
||||
doc.content = "won't work"
|
||||
|
||||
|
||||
@pytest.mark.unit
def test_content_type_unknown():
    """A content_type outside the supported set is rejected when the Document is built."""
    expected_error = "Content type unknown: 'other'"
    with pytest.raises(ValueError, match=expected_error):
        Document(content="test content", content_type="other")
|
||||
|
||||
|
||||
@pytest.mark.unit
def test_content_type_must_match_text():
    """String content is valid as text (default or explicit) and invalid for every other type."""
    Document(content="test content")
    Document(content="test content", content_type="text")

    for wrong_type in ("table", "image", "audio"):
        with pytest.raises(ValueError, match="does not match the content type"):
            Document(content="test content", content_type=wrong_type)
|
||||
|
||||
|
||||
@pytest.mark.unit
def test_content_type_must_match_table():
    """Dataframe content is rejected for every content type other than table."""
    for wrong_type in ("text", "image", "audio"):
        with pytest.raises(ValueError, match="does not match the content type"):
            Document(content=pd.DataFrame([1, 2]), content_type=wrong_type)
|
||||
|
||||
|
||||
@pytest.mark.unit
def test_content_type_must_match_image_audio():
    """Path content is valid for image and audio documents and invalid for text and table."""
    this_file = Path(__file__)

    # Mimetypes are not checked - yet
    for accepted_type in ("image", "audio"):
        Document(content=this_file, content_type=accepted_type)

    for wrong_type in ("text", "table"):
        with pytest.raises(ValueError, match="does not match the content type"):
            Document(content=this_file, content_type=wrong_type)
|
||||
|
||||
|
||||
@pytest.mark.unit
def test_id_hash_keys_require_metadata():
    """Naming an id_hash_key that is absent from the metadata raises a ValueError."""
    expected_error = "must be present in the metadata of the Document if you want to use it"
    with pytest.raises(ValueError, match=expected_error):
        Document(content="test content", id_hash_keys=["something"])
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_init_document_same_meta_as_main_fields():
|
||||
"""
|
||||
@ -26,10 +79,9 @@ def test_init_document_same_meta_as_main_fields():
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_simple_text_document_equality():
|
||||
doc1 = Document(content="test content")
|
||||
doc2 = Document(content="test content")
|
||||
assert doc1 == doc2
|
||||
def test_basic_equality_type_mismatch():
|
||||
doc = Document(content="test content")
|
||||
assert doc != "test content"
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
@ -74,6 +126,13 @@ def test_equality_with_simple_metadata():
|
||||
assert doc1 == doc2
|
||||
|
||||
|
||||
@pytest.mark.unit
def test_equality_with_different_metadata():
    """Documents with the same content but different metadata compare as different."""
    smaller_meta = {"value": 1}
    larger_meta = {"value": 1, "another": "value"}
    doc1 = Document(content="test content", metadata=smaller_meta)
    doc2 = Document(content="test content", metadata=larger_meta)
    assert doc1 != doc2
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_equality_with_nested_metadata():
|
||||
doc1 = Document(content="test content", metadata={"value": {"another": "value"}})
|
||||
@ -248,6 +307,282 @@ def test_document_with_most_attributes_from_dict():
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.unit
def test_default_text_document_to_json():
    """A default text Document serializes to the expected 4-space indented JSON."""
    import json

    # The expected output is produced by json.dumps from plain values, so it does
    # not depend on hand-written indentation or escaping.
    expected = {
        "id": _create_id(classname=Document.__name__, content="test content"),
        "content": "test content",
        "content_type": "text",
        "metadata": {},
        "id_hash_keys": [],
        "score": None,
        "embedding": None,
    }
    doc_json = Document(content="test content").to_json(indent=4).strip()
    assert doc_json == json.dumps(expected, indent=4)
|
||||
|
||||
|
||||
@pytest.mark.unit
def test_default_text_document_from_json():
    """A default text Document is rebuilt from its JSON representation."""
    import json

    # json.dumps takes care of quoting and escaping the payload.
    payload = json.dumps(
        {
            "id": _create_id(classname=Document.__name__, content="test content"),
            "content": "test content",
            "content_type": "text",
            "metadata": {},
            "id_hash_keys": [],
            "score": None,
            "embedding": None,
        }
    )
    doc_1 = Document(content="test content")
    doc_2 = Document.from_json(payload)
    assert doc_1 == doc_2
|
||||
|
||||
|
||||
@pytest.mark.unit
def test_default_table_document_to_json():
    """A table Document serializes its dataframe as an embedded JSON string."""
    import json

    df = pd.DataFrame([1, 2])
    # The dataframe is expected as a JSON *string* value; json.dumps escapes its
    # inner quotes, replacing the manual '"' -> '\\"' substitution.
    expected = {
        "id": _create_id(classname=Document.__name__, content=df),
        "content": df.to_json(),
        "content_type": "table",
        "metadata": {},
        "id_hash_keys": [],
        "score": None,
        "embedding": None,
    }
    doc_json = Document(content=df, content_type="table").to_json(indent=4).strip()
    assert doc_json == json.dumps(expected, indent=4)
|
||||
|
||||
|
||||
@pytest.mark.unit
def test_default_table_document_from_json():
    """A table Document is rebuilt, dataframe included, from its JSON representation."""
    import json

    df = pd.DataFrame([1, 2])
    # The dataframe travels as a JSON string value; json.dumps escapes it correctly.
    payload = json.dumps(
        {
            "id": _create_id(classname=Document.__name__, content=pd.DataFrame([1, 2])),
            "content": pd.DataFrame([1, 2]).to_json(),
            "content_type": "table",
            "metadata": {},
            "id_hash_keys": [],
            "score": None,
            "embedding": None,
        }
    )
    doc_1 = Document(content=df, content_type="table")
    doc_2 = Document.from_json(payload)
    assert doc_1 == doc_2
|
||||
|
||||
|
||||
@pytest.mark.unit
def test_default_image_document_to_json():
    """An image Document serializes its path as an absolute path string."""
    import json

    path = Path(__file__).parent / "test_files" / "apple.jpg"
    # json.dumps escapes backslashes, quotes and non-ASCII characters in the path,
    # which the manual backslash-only replacement did not cover.
    expected = {
        "id": _create_id(classname=Document.__name__, content=path),
        "content": str(path.absolute()),
        "content_type": "image",
        "metadata": {},
        "id_hash_keys": [],
        "score": None,
        "embedding": None,
    }
    doc_json = Document(content=path, content_type="image").to_json(indent=4).strip()
    assert doc_json == json.dumps(expected, indent=4)
|
||||
|
||||
|
||||
@pytest.mark.unit
def test_default_image_document_from_json():
    """An image Document is rebuilt, with a Path as content, from its JSON representation."""
    import json

    path = Path(__file__).parent / "test_files" / "apple.jpg"
    # json.dumps produces a valid payload for any path, not only for paths whose
    # sole special character is the backslash.
    payload = json.dumps(
        {
            "id": _create_id(classname=Document.__name__, content=path),
            "content": str(path.absolute()),
            "content_type": "image",
            "metadata": {},
            "id_hash_keys": [],
            "score": None,
            "embedding": None,
        }
    )
    doc_1 = Document(content=path, content_type="image")
    doc_2 = Document.from_json(payload)
    assert doc_1 == doc_2
|
||||
|
||||
|
||||
@pytest.mark.unit
def test_full_document_to_json(tmp_path):
    """Metadata objects, metadata paths and embeddings are all serialized to JSON."""
    import json

    class TestClass:
        def __repr__(self):
            return "<the object>"

    doc_1 = Document(
        content="test content",
        metadata={"some object": TestClass(), "a path": tmp_path / "test.txt"},
        embedding=np.array([1, 2, 3, 4]),
    )

    # Arbitrary objects are expected as their string form, paths as absolute path
    # strings and the embedding as a plain list. json.dumps handles the escaping of
    # the temporary path, which may contain backslashes or non-ASCII characters.
    expected = {
        "id": _create_id(classname=Document.__name__, content="test content"),
        "content": "test content",
        "content_type": "text",
        "metadata": {"some object": "<the object>", "a path": str((tmp_path / "test.txt").absolute())},
        "id_hash_keys": [],
        "score": None,
        "embedding": [1, 2, 3, 4],
    }
    doc_json = doc_1.to_json(indent=4).strip()
    assert doc_json == json.dumps(expected, indent=4)
|
||||
|
||||
|
||||
@pytest.mark.unit
def test_full_document_from_json(tmp_path):
    """Metadata and embeddings are restored when loading a Document from JSON."""
    import json

    doc_1 = Document(
        content="test content",
        metadata={"a path": str(tmp_path / "test.txt")},  # Paths are not cast back to Path objects
        embedding=np.array([1, 2, 3, 4]),
    )
    # json.dumps escapes the temporary path correctly on every platform.
    payload = json.dumps(
        {
            "id": _create_id(classname=Document.__name__, content="test content"),
            "content": "test content",
            "content_type": "text",
            "metadata": {"a path": str((tmp_path / "test.txt").absolute())},
            "id_hash_keys": [],
            "score": None,
            "embedding": [1, 2, 3, 4],
        }
    )
    doc_2 = Document.from_json(payload)
    assert doc_1 == doc_2
|
||||
|
||||
|
||||
@pytest.mark.unit
def test_to_json_custom_encoder(tmp_path):
    """A DocumentEncoder subclass passed as json_encoder controls the serialization."""
    import json

    class SerializableTestClass:
        ...

    class TestEncoder(DocumentEncoder):
        def default(self, obj):
            if isinstance(obj, SerializableTestClass):
                return "<<CUSTOM ENCODING>>"
            return DocumentEncoder.default(self, obj)

    doc = Document(content="test content", metadata={"some object": SerializableTestClass()})
    doc_json = doc.to_json(indent=4, json_encoder=TestEncoder).strip()

    # Expected output built from plain values, so indentation and quoting are exact.
    expected = {
        "id": _create_id(classname=Document.__name__, content="test content"),
        "content": "test content",
        "content_type": "text",
        "metadata": {"some object": "<<CUSTOM ENCODING>>"},
        "id_hash_keys": [],
        "score": None,
        "embedding": None,
    }
    assert doc_json == json.dumps(expected, indent=4)
|
||||
|
||||
|
||||
@pytest.mark.unit
def test_from_json_custom_decoder():
    """A DocumentDecoder subclass passed as json_decoder controls the deserialization."""
    import json

    class TestClass:
        def __eq__(self, other):
            return type(self) == type(other)

    class TestDecoder(DocumentDecoder):
        def __init__(self, *args, **kwargs):
            super().__init__(object_hook=self.object_hook)

        def object_hook(self, dictionary):
            if "metadata" in dictionary:
                for key, value in dictionary["metadata"].items():
                    if value == "<<CUSTOM ENCODING>>":
                        dictionary["metadata"][key] = TestClass()
            return dictionary

    doc = Document(content="test content", metadata={"some object": TestClass()})

    # json.dumps builds the payload, so no manual quoting is needed.
    payload = json.dumps(
        {
            "id": _create_id(classname=Document.__name__, content="test content"),
            "content": "test content",
            "content_type": "text",
            "metadata": {"some object": "<<CUSTOM ENCODING>>"},
            "id_hash_keys": [],
            "score": None,
            "embedding": None,
        }
    )
    assert doc == Document.from_json(payload, json_decoder=TestDecoder)
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_flatten_text_document_no_meta():
|
||||
assert Document(content="test content").flatten() == {
|
||||
"id": _create_id(classname=Document.__name__, content="test content"),
|
||||
@ -259,6 +594,7 @@ def test_flatten_text_document_no_meta():
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_flatten_text_document():
|
||||
assert Document(content="test content", metadata={"name": "document name", "page": 123}).flatten() == {
|
||||
"id": _create_id(classname=Document.__name__, content="test content"),
|
||||
@ -272,6 +608,7 @@ def test_flatten_text_document():
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_flatten_table_document():
|
||||
df = pd.DataFrame([1, 2])
|
||||
flat = Document(content=df, content_type="table", metadata={"table-name": "table title", "section": 3}).flatten()
|
||||
@ -289,6 +626,7 @@ def test_flatten_table_document():
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_flatten_image_document():
|
||||
path = Path(__file__).parent / "test_files" / "apple.jpg"
|
||||
assert Document(
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user