import dataclasses import json import textwrap from pathlib import Path import numpy as np import pandas as pd import pytest from haystack.preview import Document from haystack.preview.dataclasses.document import DocumentDecoder, DocumentEncoder @pytest.mark.unit @pytest.mark.parametrize( "doc,doc_str", [ (Document(text="test text"), "text: 'test text'"), (Document(array=np.zeros((3, 7))), "array: (3, 7)"), ( Document(dataframe=pd.DataFrame([["John", 25], ["Martha", 34]], columns=["name", "age"])), "dataframe: (2, 2)", ), (Document(blob=bytes("hello, test string".encode("utf-8"))), "blob: 18 bytes"), ( Document( text="test text", array=np.zeros((3, 7)), dataframe=pd.DataFrame([["John", 25], ["Martha", 34]], columns=["name", "age"]), blob=bytes("hello, test string".encode("utf-8")), ), "text: 'test text', array: (3, 7), dataframe: (2, 2), blob: 18 bytes", ), ], ) def test_document_str(doc, doc_str): assert f"Document(id={doc.id}, mimetype: 'text/plain', {doc_str})" == str(doc) @pytest.mark.unit @pytest.mark.parametrize( "doc1_data,doc2_data", [ [{"text": "test text"}, {"text": "test text", "mime_type": "text/plain"}], [{"text": "test text", "mime_type": "text/html"}, {"text": "test text", "mime_type": "text/plain"}], [{"text": "test text"}, {"text": "test text", "metadata": {"path": Path(__file__)}}], [ {"text": "test text", "metadata": {"path": Path(__file__).parent}}, {"text": "test text", "metadata": {"path": Path(__file__)}}, ], [{"text": "test text"}, {"text": "test text", "score": 200}], [{"text": "test text", "score": 0}, {"text": "test text", "score": 200}], [{"text": "test text"}, {"text": "test text", "embedding": np.array([1, 2, 3])}], [ {"text": "test text", "embedding": np.array([100, 222, 345])}, {"text": "test text", "embedding": np.array([1, 2, 3])}, ], [{"array": np.array(range(10))}, {"array": np.array(range(10))}], [{"dataframe": pd.DataFrame([1, 2, 3])}, {"dataframe": pd.DataFrame([1, 2, 3])}], [{"blob": b"some bytes"}, {"blob": b"some bytes"}], ], ) def test_id_hash_keys_default_fields_equal_id(doc1_data, doc2_data): doc1 = Document.from_dict(doc1_data) doc2 = Document.from_dict(doc2_data) assert doc1.id == doc2.id @pytest.mark.unit @pytest.mark.parametrize( "doc1_data,doc2_data", [ [{"text": "test text"}, {"text": "test text "}], [{"array": np.array(range(10))}, {"array": np.array(range(11))}], [{"dataframe": pd.DataFrame([1, 2, 3])}, {"dataframe": pd.DataFrame([1, 2, 3, 4])}], [{"blob": b"some bytes"}, {"blob": "something else".encode()}], ], ) def test_id_hash_keys_default_fields_different_ids(doc1_data, doc2_data): doc1 = Document.from_dict(doc1_data) doc2 = Document.from_dict(doc2_data) assert doc1.id != doc2.id @pytest.mark.unit def test_id_hash_keys_changes_id(): doc1 = Document(text="test text", metadata={"some-value": "value"}) doc2 = Document(text="test text", metadata={"some-value": "value"}, id_hash_keys=["text", "some-value"]) assert doc1.id != doc2.id @pytest.mark.unit def test_id_hash_keys_field_may_be_missing(caplog): doc1 = Document(text="test text", id_hash_keys=["something"]) doc2 = Document(text="test text", id_hash_keys=["something else"]) assert doc1.id == doc2.id assert "is missing the following id_hash_keys: ['something']." in caplog.text assert "is missing the following id_hash_keys: ['something else']." in caplog.text @pytest.mark.unit def test_init_document_same_meta_as_main_fields(): """ This is forbidden to prevent later issues with `Document.flatten()` """ with pytest.raises(ValueError, match="score"): Document(text="test text", metadata={"score": "10/10"}) @pytest.mark.unit def test_basic_equality_type_mismatch(): doc = Document(text="test text") assert doc != "test text" @pytest.mark.unit def test_basic_equality_id(): doc1 = Document(text="test text") doc2 = Document(text="test text") assert doc1 == doc2 object.__setattr__(doc1, "id", "1234") object.__setattr__(doc2, "id", "5678") assert doc1 != doc2 @pytest.mark.unit def test_equality_with_metadata_with_objects(): class TestObject: def __eq__(self, other): if type(self) == type(other): return True doc1 = Document( text="test text", metadata={"value": np.array([0, 1, 2]), "path": Path(__file__), "obj": TestObject()} ) doc2 = Document( text="test text", metadata={"value": np.array([0, 1, 2]), "path": Path(__file__), "obj": TestObject()} ) assert doc1 == doc2 @pytest.mark.unit def test_empty_document_to_dict(): doc = Document() assert doc.to_dict() == { "id": doc._create_id(), "text": None, "array": None, "dataframe": None, "blob": None, "mime_type": "text/plain", "metadata": {}, "id_hash_keys": ["text", "array", "dataframe", "blob"], "score": None, "embedding": None, } @pytest.mark.unit def test_empty_document_from_dict(): assert Document.from_dict({}) == Document() @pytest.mark.unit def test_full_document_to_dict(): doc = Document( text="test text", array=np.array([1, 2, 3]), dataframe=pd.DataFrame([10, 20, 30]), blob=b"some bytes", mime_type="application/pdf", metadata={"some": "values", "test": 10}, id_hash_keys=["test"], score=0.99, embedding=np.zeros([10, 10]), ) dictionary = doc.to_dict() array = dictionary.pop("array") assert array.shape == doc.array.shape and (array == doc.array).all() dataframe = dictionary.pop("dataframe") assert dataframe.equals(doc.dataframe) blob = dictionary.pop("blob") assert blob == doc.blob embedding = dictionary.pop("embedding") assert (embedding == doc.embedding).all() assert dictionary == { "id": doc.id, "text": "test text", "mime_type": "application/pdf", "metadata": {"some": "values", "test": 10}, "id_hash_keys": ["test"], "score": 0.99, } @pytest.mark.unit def test_document_with_most_attributes_from_dict(): embedding = np.zeros([10, 10]) assert Document.from_dict( { "text": "test text", "array": np.array([1, 2, 3]), "dataframe": pd.DataFrame([10, 20, 30]), "blob": b"some bytes", "mime_type": "application/pdf", "metadata": {"some": "values", "test": 10}, "id_hash_keys": ["test"], "score": 0.99, "embedding": embedding, } ) == Document( text="test text", array=np.array([1, 2, 3]), dataframe=pd.DataFrame([10, 20, 30]), blob=b"some bytes", mime_type="application/pdf", metadata={"some": "values", "test": 10}, id_hash_keys=["test"], score=0.99, embedding=embedding, ) @pytest.mark.unit def test_empty_document_to_json(): doc = Document() assert doc.to_json() == json.dumps( { "id": doc.id, "text": None, "array": None, "dataframe": None, "mime_type": "text/plain", "metadata": {}, "id_hash_keys": ["text", "array", "dataframe", "blob"], "score": None, "embedding": None, } ) @pytest.mark.unit def test_empty_document_from_json(): assert Document.from_json("{}") == Document() @pytest.mark.unit def test_full_document_to_json(tmp_path): class TestClass: def __repr__(self): return "" doc_1 = Document( text="test text", array=np.array([1, 2, 3]), dataframe=pd.DataFrame([10, 20, 30]), blob=b"some bytes", mime_type="application/pdf", metadata={"some object": TestClass(), "a path": tmp_path / "test.txt"}, id_hash_keys=["test"], score=0.5, embedding=np.array([1, 2, 3, 4]), ) assert doc_1.to_json() == json.dumps( { "id": doc_1.id, "text": "test text", "array": [1, 2, 3], "dataframe": '{"0":{"0":10,"1":20,"2":30}}', "mime_type": "application/pdf", "metadata": {"some object": "", "a path": str((tmp_path / "test.txt").absolute())}, "id_hash_keys": ["test"], "score": 0.5, "embedding": [1, 2, 3, 4], } ) @pytest.mark.unit def test_full_document_from_json(tmp_path): class TestClass: def __repr__(self): return "''" def __eq__(self, other): return type(self) == type(other) doc = Document.from_json( json.dumps( { "text": "test text", "array": [1, 2, 3], "dataframe": '{"0":{"0":10,"1":20,"2":30}}', "mime_type": "application/pdf", "metadata": {"some object": "", "a path": str((tmp_path / "test.txt").absolute())}, "id_hash_keys": ["test"], "score": 0.5, "embedding": [1, 2, 3, 4], } ) ) assert doc == Document( text="test text", array=np.array([1, 2, 3]), dataframe=pd.DataFrame([10, 20, 30]), blob=None, mime_type="application/pdf", # Note the object serialization metadata={"some object": "", "a path": str((tmp_path / "test.txt").absolute())}, id_hash_keys=["test"], score=0.5, embedding=np.array([1, 2, 3, 4]), ) @pytest.mark.unit def test_to_json_custom_encoder(tmp_path): class SerializableTestClass: ... class TestEncoder(DocumentEncoder): def default(self, obj): if isinstance(obj, SerializableTestClass): return "<>" return DocumentEncoder.default(self, obj) doc = Document(text="test text", metadata={"some object": SerializableTestClass()}) doc_json = doc.to_json(indent=4, json_encoder=TestEncoder).strip() assert doc_json == json.dumps( { "id": doc.id, "text": "test text", "array": None, "dataframe": None, "mime_type": "text/plain", "metadata": {"some object": "<>"}, "id_hash_keys": ["text", "array", "dataframe", "blob"], "score": None, "embedding": None, }, indent=4, ) @pytest.mark.unit def test_from_json_custom_decoder(): class TestClass: def __eq__(self, other): return type(self) == type(other) class TestDecoder(DocumentDecoder): def __init__(self, *args, **kwargs): super().__init__(object_hook=self.object_hook) def object_hook(self, dictionary): if "metadata" in dictionary: for key, value in dictionary["metadata"].items(): if value == "<>": dictionary["metadata"][key] = TestClass() return dictionary doc = Document(text="test text", metadata={"some object": TestClass()}) assert doc == Document.from_json( json.dumps( { "id": doc.id, "text": "test text", "array": None, "dataframe": None, "mime_type": "text/plain", "metadata": {"some object": "<>"}, "id_hash_keys": ["text", "array", "dataframe", "blob"], "score": None, "embedding": None, } ), json_decoder=TestDecoder, ) @pytest.mark.unit def test_flatten_document_no_meta(): doc = Document(text="test text") assert doc.flatten() == { "id": doc.id, "text": "test text", "array": None, "dataframe": None, "blob": None, "mime_type": "text/plain", "id_hash_keys": ["text", "array", "dataframe", "blob"], "score": None, "embedding": None, } @pytest.mark.unit def test_flatten_document_with_flat_meta(): doc = Document(text="test text", metadata={"some-key": "a value", "another-key": "another value!"}) assert doc.flatten() == { "id": doc.id, "text": "test text", "array": None, "dataframe": None, "blob": None, "mime_type": "text/plain", "id_hash_keys": ["text", "array", "dataframe", "blob"], "score": None, "embedding": None, "some-key": "a value", "another-key": "another value!", } @pytest.mark.unit def test_flatten_document_with_nested_meta(): doc = Document(text="test text", metadata={"some-key": "a value", "nested": {"key": 10, "key2": 50}}) assert doc.flatten() == { "id": doc.id, "text": "test text", "array": None, "dataframe": None, "blob": None, "mime_type": "text/plain", "id_hash_keys": ["text", "array", "dataframe", "blob"], "score": None, "embedding": None, "some-key": "a value", "nested": {"key": 10, "key2": 50}, }