haystack/test/preview/dataclasses/test_document.py
Stefano Fiorucci ef40c7c728
refactor: make sure that Document's id_hash_keys has a valid value (#6112)
* fix handling id_hash_keys

* reno

* handle empty id_hash_keys in post_init

* fix

* reno

* test
2023-10-19 12:10:19 +02:00

448 lines
14 KiB
Python

import json
from pathlib import Path
import numpy as np
import pandas as pd
import pytest
from haystack.preview import Document
from haystack.preview.dataclasses.document import DocumentDecoder, DocumentEncoder
@pytest.mark.unit
@pytest.mark.parametrize(
    "doc,doc_str",
    [
        # One content field at a time.
        (Document(text="test text"), "text: 'test text'"),
        (Document(array=np.zeros((3, 7))), "array: (3, 7)"),
        (
            Document(dataframe=pd.DataFrame([["John", 25], ["Martha", 34]], columns=["name", "age"])),
            "dataframe: (2, 2)",
        ),
        (Document(blob=bytes("hello, test string".encode("utf-8"))), "blob: 18 bytes"),
        # All four content fields populated together.
        (
            Document(
                text="test text",
                array=np.zeros((3, 7)),
                dataframe=pd.DataFrame([["John", 25], ["Martha", 34]], columns=["name", "age"]),
                blob=bytes("hello, test string".encode("utf-8")),
            ),
            "text: 'test text', array: (3, 7), dataframe: (2, 2), blob: 18 bytes",
        ),
    ],
)
def test_document_str(doc, doc_str):
    """
    `str(doc)` is expected to show the id, the mimetype and a short summary of each populated content field.
    """
    expected = f"Document(id={doc.id}, mimetype: 'text/plain', {doc_str})"
    assert expected == str(doc)
@pytest.mark.unit
@pytest.mark.parametrize(
    "doc1_data,doc2_data",
    [
        # Differences in mime_type must not affect the id.
        ({"text": "test text"}, {"text": "test text", "mime_type": "text/plain"}),
        ({"text": "test text", "mime_type": "text/html"}, {"text": "test text", "mime_type": "text/plain"}),
        # Differences in metadata must not affect the id.
        ({"text": "test text"}, {"text": "test text", "metadata": {"path": Path(__file__)}}),
        (
            {"text": "test text", "metadata": {"path": Path(__file__).parent}},
            {"text": "test text", "metadata": {"path": Path(__file__)}},
        ),
        # Differences in score must not affect the id.
        ({"text": "test text"}, {"text": "test text", "score": 200}),
        ({"text": "test text", "score": 0}, {"text": "test text", "score": 200}),
        # Differences in embedding must not affect the id.
        ({"text": "test text"}, {"text": "test text", "embedding": np.array([1, 2, 3])}),
        (
            {"text": "test text", "embedding": np.array([100, 222, 345])},
            {"text": "test text", "embedding": np.array([1, 2, 3])},
        ),
        # Identical array / dataframe / blob content must give identical ids.
        ({"array": np.array(range(10))}, {"array": np.array(range(10))}),
        ({"dataframe": pd.DataFrame([1, 2, 3])}, {"dataframe": pd.DataFrame([1, 2, 3])}),
        ({"blob": b"some bytes"}, {"blob": b"some bytes"}),
    ],
)
def test_id_hash_keys_default_fields_equal_id(doc1_data, doc2_data):
    """
    With the default `id_hash_keys`, each pair of dictionaries must produce documents sharing the same id.
    """
    first, second = (Document.from_dict(data) for data in (doc1_data, doc2_data))
    assert first.id == second.id
@pytest.mark.unit
@pytest.mark.parametrize(
    "doc1_data,doc2_data",
    [
        # Each pair differs in exactly one content field.
        ({"text": "test text"}, {"text": "test text "}),
        ({"array": np.array(range(10))}, {"array": np.array(range(11))}),
        ({"dataframe": pd.DataFrame([1, 2, 3])}, {"dataframe": pd.DataFrame([1, 2, 3, 4])}),
        ({"blob": b"some bytes"}, {"blob": "something else".encode()}),
    ],
)
def test_id_hash_keys_default_fields_different_ids(doc1_data, doc2_data):
    """
    With the default `id_hash_keys`, documents whose text, array, dataframe or blob differ must get different ids.
    """
    first, second = (Document.from_dict(data) for data in (doc1_data, doc2_data))
    assert first.id != second.id
@pytest.mark.unit
def test_id_hash_keys_changes_id():
    """
    Passing custom `id_hash_keys` that include a metadata key must yield a different id than the defaults do.
    """
    default_keys_doc = Document(text="test text", metadata={"some-value": "value"})
    custom_keys_doc = Document(
        text="test text",
        metadata={"some-value": "value"},
        id_hash_keys=["text", "some-value"],
    )
    assert default_keys_doc.id != custom_keys_doc.id
@pytest.mark.unit
def test_id_hash_keys_field_may_be_missing(caplog):
    """
    Documents whose `id_hash_keys` name fields they do not have must share the same id,
    and the captured log must mention the missing keys of each document.
    """
    first = Document(text="test text", id_hash_keys=["something"])
    second = Document(text="test text", id_hash_keys=["something else"])
    assert first.id == second.id

    expected_messages = (
        "is missing the following id_hash_keys: ['something'].",
        "is missing the following id_hash_keys: ['something else'].",
    )
    for message in expected_messages:
        assert message in caplog.text
@pytest.mark.unit
def test_id_hash_keys_is_set_to_default_if_invalid():
    """
    `None` and an empty list are both expected to be replaced by the default `id_hash_keys`.
    """
    default_keys = ["text", "array", "dataframe", "blob"]
    for invalid_value in (None, []):
        doc = Document(text="test text", id_hash_keys=invalid_value)
        assert doc.id_hash_keys == default_keys
@pytest.mark.unit
def test_init_document_same_meta_as_main_fields():
    """
    A metadata key that collides with a main field name (here `score`) must be rejected with a ValueError.
    This is forbidden to prevent later issues with `Document.flatten()`.
    """
    clashing_metadata = {"score": "10/10"}
    with pytest.raises(ValueError, match="score"):
        Document(text="test text", metadata=clashing_metadata)
@pytest.mark.unit
def test_basic_equality_type_mismatch():
    """
    A Document must not compare equal to a plain string holding the same text.
    """
    document = Document(text="test text")
    assert document != "test text"
@pytest.mark.unit
def test_basic_equality_id():
    """
    Two documents built from the same text are equal; once their ids are forced apart they no longer are.
    """
    left = Document(text="test text")
    right = Document(text="test text")
    assert left == right

    # The ids are overwritten through object.__setattr__ rather than plain attribute assignment.
    for document, forced_id in ((left, "1234"), (right, "5678")):
        object.__setattr__(document, "id", forced_id)

    assert left != right
@pytest.mark.unit
def test_equality_with_metadata_with_objects():
    """
    Documents must compare equal when their metadata hold equal but distinct objects
    (a numpy array, a Path and an instance of a custom class).
    """

    class TestObject:
        def __eq__(self, other):
            # Always return an explicit bool: any two instances of exactly this class are equal.
            return type(self) == type(other)

    doc1 = Document(
        text="test text", metadata={"value": np.array([0, 1, 2]), "path": Path(__file__), "obj": TestObject()}
    )
    doc2 = Document(
        text="test text", metadata={"value": np.array([0, 1, 2]), "path": Path(__file__), "obj": TestObject()}
    )
    assert doc1 == doc2
@pytest.mark.unit
def test_empty_document_to_dict():
    """
    `to_dict()` on a Document built without arguments must return every field with its default value.
    """
    doc = Document()
    actual = doc.to_dict()
    expected = {
        # The id is compared against a freshly computed one rather than the stored attribute.
        "id": doc._create_id(),
        "text": None,
        "array": None,
        "dataframe": None,
        "blob": None,
        "mime_type": "text/plain",
        "metadata": {},
        "id_hash_keys": ["text", "array", "dataframe", "blob"],
        "score": None,
        "embedding": None,
    }
    assert actual == expected
@pytest.mark.unit
def test_empty_document_from_dict():
    """
    Building a Document from an empty dictionary must be equivalent to calling `Document()`.
    """
    restored = Document.from_dict({})
    assert restored == Document()
@pytest.mark.unit
def test_full_document_to_dict():
    """
    `to_dict()` on a fully populated Document must return every field unchanged.
    """
    doc = Document(
        text="test text",
        array=np.array([1, 2, 3]),
        dataframe=pd.DataFrame([10, 20, 30]),
        blob=b"some bytes",
        mime_type="application/pdf",
        metadata={"some": "values", "test": 10},
        id_hash_keys=["test"],
        score=0.99,
        embedding=np.zeros([10, 10]),
    )
    as_dict = doc.to_dict()

    # These four fields are popped and checked one by one, each with its own comparison,
    # before the remaining entries are compared as a plain dictionary.
    array_value = as_dict.pop("array")
    assert array_value.shape == doc.array.shape
    assert (array_value == doc.array).all()

    dataframe_value = as_dict.pop("dataframe")
    assert dataframe_value.equals(doc.dataframe)

    blob_value = as_dict.pop("blob")
    assert blob_value == doc.blob

    embedding_value = as_dict.pop("embedding")
    assert (embedding_value == doc.embedding).all()

    remaining_expected = {
        "id": doc.id,
        "text": "test text",
        "mime_type": "application/pdf",
        "metadata": {"some": "values", "test": 10},
        "id_hash_keys": ["test"],
        "score": 0.99,
    }
    assert as_dict == remaining_expected
@pytest.mark.unit
def test_document_with_most_attributes_from_dict():
    """
    `from_dict()` given every field except the id must equal a Document built with the same keyword arguments.
    """
    embedding = np.zeros([10, 10])
    payload = {
        "text": "test text",
        "array": np.array([1, 2, 3]),
        "dataframe": pd.DataFrame([10, 20, 30]),
        "blob": b"some bytes",
        "mime_type": "application/pdf",
        "metadata": {"some": "values", "test": 10},
        "id_hash_keys": ["test"],
        "score": 0.99,
        "embedding": embedding,
    }
    restored = Document.from_dict(payload)
    expected = Document(
        text="test text",
        array=np.array([1, 2, 3]),
        dataframe=pd.DataFrame([10, 20, 30]),
        blob=b"some bytes",
        mime_type="application/pdf",
        metadata={"some": "values", "test": 10},
        id_hash_keys=["test"],
        score=0.99,
        embedding=embedding,
    )
    assert restored == expected
@pytest.mark.unit
def test_empty_document_to_json():
    """
    `to_json()` on an empty Document must match the JSON dump of its default field values.
    """
    doc = Document()
    actual = doc.to_json()
    # Note that the expected payload has no "blob" entry.
    expected_fields = {
        "id": doc.id,
        "text": None,
        "array": None,
        "dataframe": None,
        "mime_type": "text/plain",
        "metadata": {},
        "id_hash_keys": ["text", "array", "dataframe", "blob"],
        "score": None,
        "embedding": None,
    }
    assert actual == json.dumps(expected_fields)
@pytest.mark.unit
def test_empty_document_from_json():
    """
    Parsing an empty JSON object must be equivalent to calling `Document()`.
    """
    restored = Document.from_json("{}")
    assert restored == Document()
@pytest.mark.unit
def test_full_document_to_json(tmp_path):
    """
    `to_json()` on a fully populated Document must match the expected JSON string,
    including the expected string forms of a custom object and of a Path stored in the metadata.
    """

    class TestClass:
        def __repr__(self):
            return "<the object>"

    file_path = tmp_path / "test.txt"
    doc_1 = Document(
        text="test text",
        array=np.array([1, 2, 3]),
        dataframe=pd.DataFrame([10, 20, 30]),
        blob=b"some bytes",
        mime_type="application/pdf",
        metadata={"some object": TestClass(), "a path": file_path},
        id_hash_keys=["test"],
        score=0.5,
        embedding=np.array([1, 2, 3, 4]),
    )
    actual = doc_1.to_json()
    # Note that the expected payload has no "blob" entry.
    expected_fields = {
        "id": doc_1.id,
        "text": "test text",
        "array": [1, 2, 3],
        "dataframe": '{"0":{"0":10,"1":20,"2":30}}',
        "mime_type": "application/pdf",
        "metadata": {"some object": "<the object>", "a path": str(file_path.absolute())},
        "id_hash_keys": ["test"],
        "score": 0.5,
        "embedding": [1, 2, 3, 4],
    }
    assert actual == json.dumps(expected_fields)
@pytest.mark.unit
def test_full_document_from_json(tmp_path):
    """
    `from_json()` on a fully populated JSON payload must rebuild the expected Document.
    """
    doc = Document.from_json(
        json.dumps(
            {
                "text": "test text",
                "array": [1, 2, 3],
                "dataframe": '{"0":{"0":10,"1":20,"2":30}}',
                "mime_type": "application/pdf",
                "metadata": {"some object": "<the object>", "a path": str((tmp_path / "test.txt").absolute())},
                "id_hash_keys": ["test"],
                "score": 0.5,
                "embedding": [1, 2, 3, 4],
            }
        )
    )
    assert doc == Document(
        text="test text",
        array=np.array([1, 2, 3]),
        dataframe=pd.DataFrame([10, 20, 30]),
        # The payload carries no blob.
        blob=None,
        mime_type="application/pdf",
        # Note the object serialization: the metadata values are the plain strings found in the JSON payload.
        metadata={"some object": "<the object>", "a path": str((tmp_path / "test.txt").absolute())},
        id_hash_keys=["test"],
        score=0.5,
        embedding=np.array([1, 2, 3, 4]),
    )
@pytest.mark.unit
def test_to_json_custom_encoder(tmp_path):
    """
    `to_json()` must use the encoder class given as `json_encoder` and forward extra keyword
    arguments such as `indent`.
    """

    class SerializableTestClass:
        """Marker type handled only by the custom encoder below."""

    class TestEncoder(DocumentEncoder):
        def default(self, obj):
            # Anything that is not the marker type is delegated to DocumentEncoder.
            if not isinstance(obj, SerializableTestClass):
                return super().default(obj)
            return "<<CUSTOM ENCODING>>"

    doc = Document(text="test text", metadata={"some object": SerializableTestClass()})
    doc_json = doc.to_json(indent=4, json_encoder=TestEncoder).strip()

    expected_fields = {
        "id": doc.id,
        "text": "test text",
        "array": None,
        "dataframe": None,
        "mime_type": "text/plain",
        "metadata": {"some object": "<<CUSTOM ENCODING>>"},
        "id_hash_keys": ["text", "array", "dataframe", "blob"],
        "score": None,
        "embedding": None,
    }
    assert doc_json == json.dumps(expected_fields, indent=4)
@pytest.mark.unit
def test_from_json_custom_decoder():
    """
    `from_json()` must use the decoder class given as `json_decoder`, so that a placeholder string
    in the metadata can be turned back into an object.
    """

    class TestClass:
        def __eq__(self, other):
            return type(self) == type(other)

    class TestDecoder(DocumentDecoder):
        def __init__(self, *args, **kwargs):
            # Positional and keyword arguments are accepted but not forwarded; only the hook is passed on.
            super().__init__(object_hook=self.object_hook)

        def object_hook(self, dictionary):
            # Dictionaries without a "metadata" entry are returned untouched.
            if "metadata" not in dictionary:
                return dictionary
            metadata = dictionary["metadata"]
            for key, value in metadata.items():
                if value == "<<CUSTOM ENCODING>>":
                    metadata[key] = TestClass()
            return dictionary

    doc = Document(text="test text", metadata={"some object": TestClass()})
    payload = json.dumps(
        {
            "id": doc.id,
            "text": "test text",
            "array": None,
            "dataframe": None,
            "mime_type": "text/plain",
            "metadata": {"some object": "<<CUSTOM ENCODING>>"},
            "id_hash_keys": ["text", "array", "dataframe", "blob"],
            "score": None,
            "embedding": None,
        }
    )
    restored = Document.from_json(payload, json_decoder=TestDecoder)
    assert doc == restored
@pytest.mark.unit
def test_flatten_document_no_meta():
    """
    `flatten()` on a Document without metadata must return the main fields only, with no `metadata` key.
    """
    doc = Document(text="test text")
    flattened = doc.flatten()
    expected = {
        "id": doc.id,
        "text": "test text",
        "array": None,
        "dataframe": None,
        "blob": None,
        "mime_type": "text/plain",
        "id_hash_keys": ["text", "array", "dataframe", "blob"],
        "score": None,
        "embedding": None,
    }
    assert flattened == expected
@pytest.mark.unit
def test_flatten_document_with_flat_meta():
    """
    `flatten()` must place each metadata key next to the main fields in the returned dictionary.
    """
    doc = Document(text="test text", metadata={"some-key": "a value", "another-key": "another value!"})
    flattened = doc.flatten()
    expected = {
        # Main fields.
        "id": doc.id,
        "text": "test text",
        "array": None,
        "dataframe": None,
        "blob": None,
        "mime_type": "text/plain",
        "id_hash_keys": ["text", "array", "dataframe", "blob"],
        "score": None,
        "embedding": None,
        # Metadata entries, lifted to the top level.
        "some-key": "a value",
        "another-key": "another value!",
    }
    assert flattened == expected
@pytest.mark.unit
def test_flatten_document_with_nested_meta():
    """
    `flatten()` must lift top-level metadata keys only: a nested dictionary value is expected to stay as it is.
    """
    doc = Document(text="test text", metadata={"some-key": "a value", "nested": {"key": 10, "key2": 50}})
    flattened = doc.flatten()
    expected = {
        # Main fields.
        "id": doc.id,
        "text": "test text",
        "array": None,
        "dataframe": None,
        "blob": None,
        "mime_type": "text/plain",
        "id_hash_keys": ["text", "array", "dataframe", "blob"],
        "score": None,
        "embedding": None,
        # Metadata entries; the nested dictionary is kept whole.
        "some-key": "a value",
        "nested": {"key": 10, "key2": 50},
    }
    assert flattened == expected