haystack/test/preview/dataclasses/test_document.py

import json
from pathlib import Path

import numpy as np
import pandas as pd
import pytest

from haystack.preview import Document
from haystack.preview.dataclasses.document import DocumentDecoder, DocumentEncoder


@pytest.mark.unit
@pytest.mark.parametrize(
    "doc,doc_str",
    [
        (Document(text="test text"), "text: 'test text'"),
        (Document(array=np.zeros((3, 7))), "array: (3, 7)"),
        (
            Document(dataframe=pd.DataFrame([["John", 25], ["Martha", 34]], columns=["name", "age"])),
            "dataframe: (2, 2)",
        ),
        (Document(blob=bytes("hello, test string".encode("utf-8"))), "blob: 18 bytes"),
        (
            Document(
                text="test text",
                array=np.zeros((3, 7)),
                dataframe=pd.DataFrame([["John", 25], ["Martha", 34]], columns=["name", "age"]),
                blob=bytes("hello, test string".encode("utf-8")),
            ),
            "text: 'test text', array: (3, 7), dataframe: (2, 2), blob: 18 bytes",
        ),
    ],
)
def test_document_str(doc, doc_str):
    assert f"Document(id={doc.id}, mimetype: 'text/plain', {doc_str})" == str(doc)


@pytest.mark.unit
@pytest.mark.parametrize(
    "doc1_data,doc2_data",
    [
        [{"text": "test text"}, {"text": "test text", "mime_type": "text/plain"}],
        [{"text": "test text", "mime_type": "text/html"}, {"text": "test text", "mime_type": "text/plain"}],
        [{"text": "test text"}, {"text": "test text", "metadata": {"path": Path(__file__)}}],
        [
            {"text": "test text", "metadata": {"path": Path(__file__).parent}},
            {"text": "test text", "metadata": {"path": Path(__file__)}},
        ],
        [{"text": "test text"}, {"text": "test text", "score": 200}],
        [{"text": "test text", "score": 0}, {"text": "test text", "score": 200}],
        [{"text": "test text"}, {"text": "test text", "embedding": np.array([1, 2, 3])}],
        [
            {"text": "test text", "embedding": np.array([100, 222, 345])},
            {"text": "test text", "embedding": np.array([1, 2, 3])},
        ],
        [{"array": np.array(range(10))}, {"array": np.array(range(10))}],
        [{"dataframe": pd.DataFrame([1, 2, 3])}, {"dataframe": pd.DataFrame([1, 2, 3])}],
        [{"blob": b"some bytes"}, {"blob": b"some bytes"}],
    ],
)
def test_id_hash_keys_default_fields_equal_id(doc1_data, doc2_data):
    doc1 = Document.from_dict(doc1_data)
    doc2 = Document.from_dict(doc2_data)
    assert doc1.id == doc2.id


@pytest.mark.unit
@pytest.mark.parametrize(
    "doc1_data,doc2_data",
    [
        [{"text": "test text"}, {"text": "test text "}],
        [{"array": np.array(range(10))}, {"array": np.array(range(11))}],
        [{"dataframe": pd.DataFrame([1, 2, 3])}, {"dataframe": pd.DataFrame([1, 2, 3, 4])}],
        [{"blob": b"some bytes"}, {"blob": "something else".encode()}],
    ],
)
def test_id_hash_keys_default_fields_different_ids(doc1_data, doc2_data):
    doc1 = Document.from_dict(doc1_data)
    doc2 = Document.from_dict(doc2_data)
    assert doc1.id != doc2.id


@pytest.mark.unit
def test_id_hash_keys_changes_id():
    doc1 = Document(text="test text", metadata={"some-value": "value"})
    doc2 = Document(text="test text", metadata={"some-value": "value"}, id_hash_keys=["text", "some-value"])
    assert doc1.id != doc2.id


@pytest.mark.unit
def test_id_hash_keys_field_may_be_missing(caplog):
    doc1 = Document(text="test text", id_hash_keys=["something"])
    doc2 = Document(text="test text", id_hash_keys=["something else"])
    assert doc1.id == doc2.id
    assert "is missing the following id_hash_keys: ['something']." in caplog.text
    assert "is missing the following id_hash_keys: ['something else']." in caplog.text


@pytest.mark.unit
def test_id_hash_keys_is_set_to_default_if_invalid():
    doc = Document(text="test text", id_hash_keys=None)
    assert doc.id_hash_keys == ["text", "array", "dataframe", "blob"]

    doc = Document(text="test text", id_hash_keys=[])
    assert doc.id_hash_keys == ["text", "array", "dataframe", "blob"]


@pytest.mark.unit
def test_init_document_same_meta_as_main_fields():
    """
    This is forbidden to prevent later issues with `Document.flatten()`
    """
    with pytest.raises(ValueError, match="score"):
        Document(text="test text", metadata={"score": "10/10"})


@pytest.mark.unit
def test_basic_equality_type_mismatch():
    doc = Document(text="test text")
    assert doc != "test text"


@pytest.mark.unit
def test_basic_equality_id():
    doc1 = Document(text="test text")
    doc2 = Document(text="test text")

    assert doc1 == doc2

    object.__setattr__(doc1, "id", "1234")
    object.__setattr__(doc2, "id", "5678")

    assert doc1 != doc2


@pytest.mark.unit
def test_equality_with_metadata_with_objects():
    class TestObject:
        def __eq__(self, other):
            if type(self) == type(other):
                return True

    doc1 = Document(
        text="test text", metadata={"value": np.array([0, 1, 2]), "path": Path(__file__), "obj": TestObject()}
    )
    doc2 = Document(
        text="test text", metadata={"value": np.array([0, 1, 2]), "path": Path(__file__), "obj": TestObject()}
    )
    assert doc1 == doc2


@pytest.mark.unit
def test_empty_document_to_dict():
    doc = Document()
    assert doc.to_dict() == {
        "id": doc._create_id(),
        "text": None,
        "array": None,
        "dataframe": None,
        "blob": None,
        "mime_type": "text/plain",
        "metadata": {},
        "id_hash_keys": ["text", "array", "dataframe", "blob"],
        "score": None,
        "embedding": None,
    }


@pytest.mark.unit
def test_empty_document_from_dict():
    assert Document.from_dict({}) == Document()


@pytest.mark.unit
def test_full_document_to_dict():
    doc = Document(
        text="test text",
        array=np.array([1, 2, 3]),
        dataframe=pd.DataFrame([10, 20, 30]),
        blob=b"some bytes",
        mime_type="application/pdf",
        metadata={"some": "values", "test": 10},
        id_hash_keys=["test"],
        score=0.99,
        embedding=np.zeros([10, 10]),
    )
    dictionary = doc.to_dict()

    array = dictionary.pop("array")
    assert array.shape == doc.array.shape and (array == doc.array).all()

    dataframe = dictionary.pop("dataframe")
    assert dataframe.equals(doc.dataframe)

    blob = dictionary.pop("blob")
    assert blob == doc.blob

    embedding = dictionary.pop("embedding")
    assert (embedding == doc.embedding).all()

    assert dictionary == {
        "id": doc.id,
        "text": "test text",
        "mime_type": "application/pdf",
        "metadata": {"some": "values", "test": 10},
        "id_hash_keys": ["test"],
        "score": 0.99,
    }


@pytest.mark.unit
def test_document_with_most_attributes_from_dict():
    embedding = np.zeros([10, 10])
    assert Document.from_dict(
        {
            "text": "test text",
            "array": np.array([1, 2, 3]),
            "dataframe": pd.DataFrame([10, 20, 30]),
            "blob": b"some bytes",
            "mime_type": "application/pdf",
            "metadata": {"some": "values", "test": 10},
            "id_hash_keys": ["test"],
            "score": 0.99,
            "embedding": embedding,
        }
    ) == Document(
        text="test text",
        array=np.array([1, 2, 3]),
        dataframe=pd.DataFrame([10, 20, 30]),
        blob=b"some bytes",
        mime_type="application/pdf",
        metadata={"some": "values", "test": 10},
        id_hash_keys=["test"],
        score=0.99,
        embedding=embedding,
    )


@pytest.mark.unit
def test_empty_document_to_json():
    doc = Document()
    assert doc.to_json() == json.dumps(
        {
            "id": doc.id,
            "text": None,
            "array": None,
            "dataframe": None,
            "mime_type": "text/plain",
            "metadata": {},
            "id_hash_keys": ["text", "array", "dataframe", "blob"],
            "score": None,
            "embedding": None,
        }
    )


@pytest.mark.unit
def test_empty_document_from_json():
    assert Document.from_json("{}") == Document()


@pytest.mark.unit
def test_full_document_to_json(tmp_path):
    class TestClass:
        def __repr__(self):
            return "<the object>"

    doc_1 = Document(
        text="test text",
        array=np.array([1, 2, 3]),
        dataframe=pd.DataFrame([10, 20, 30]),
        blob=b"some bytes",
        mime_type="application/pdf",
        metadata={"some object": TestClass(), "a path": tmp_path / "test.txt"},
        id_hash_keys=["test"],
        score=0.5,
        embedding=np.array([1, 2, 3, 4]),
    )
    assert doc_1.to_json() == json.dumps(
        {
            "id": doc_1.id,
            "text": "test text",
            "array": [1, 2, 3],
            "dataframe": '{"0":{"0":10,"1":20,"2":30}}',
            "mime_type": "application/pdf",
            "metadata": {"some object": "<the object>", "a path": str((tmp_path / "test.txt").absolute())},
            "id_hash_keys": ["test"],
            "score": 0.5,
            "embedding": [1, 2, 3, 4],
        }
    )


@pytest.mark.unit
def test_full_document_from_json(tmp_path):
    class TestClass:
        def __repr__(self):
            return "'<the object>'"

        def __eq__(self, other):
            return type(self) == type(other)

    doc = Document.from_json(
        json.dumps(
            {
                "text": "test text",
                "array": [1, 2, 3],
                "dataframe": '{"0":{"0":10,"1":20,"2":30}}',
                "mime_type": "application/pdf",
                "metadata": {"some object": "<the object>", "a path": str((tmp_path / "test.txt").absolute())},
                "id_hash_keys": ["test"],
                "score": 0.5,
                "embedding": [1, 2, 3, 4],
            }
        )
    )
    assert doc == Document(
        text="test text",
        array=np.array([1, 2, 3]),
        dataframe=pd.DataFrame([10, 20, 30]),
        blob=None,
        mime_type="application/pdf",
        # Note the object serialization
        metadata={"some object": "<the object>", "a path": str((tmp_path / "test.txt").absolute())},
        id_hash_keys=["test"],
        score=0.5,
        embedding=np.array([1, 2, 3, 4]),
    )


@pytest.mark.unit
def test_to_json_custom_encoder(tmp_path):
    class SerializableTestClass:
        ...

    class TestEncoder(DocumentEncoder):
        def default(self, obj):
            if isinstance(obj, SerializableTestClass):
                return "<<CUSTOM ENCODING>>"
            return DocumentEncoder.default(self, obj)

    doc = Document(text="test text", metadata={"some object": SerializableTestClass()})
    doc_json = doc.to_json(indent=4, json_encoder=TestEncoder).strip()

    assert doc_json == json.dumps(
        {
            "id": doc.id,
            "text": "test text",
            "array": None,
            "dataframe": None,
            "mime_type": "text/plain",
            "metadata": {"some object": "<<CUSTOM ENCODING>>"},
            "id_hash_keys": ["text", "array", "dataframe", "blob"],
            "score": None,
            "embedding": None,
        },
        indent=4,
    )


@pytest.mark.unit
def test_from_json_custom_decoder():
    class TestClass:
        def __eq__(self, other):
            return type(self) == type(other)

    class TestDecoder(DocumentDecoder):
        def __init__(self, *args, **kwargs):
            super().__init__(object_hook=self.object_hook)

        def object_hook(self, dictionary):
            if "metadata" in dictionary:
                for key, value in dictionary["metadata"].items():
                    if value == "<<CUSTOM ENCODING>>":
                        dictionary["metadata"][key] = TestClass()
            return dictionary

    doc = Document(text="test text", metadata={"some object": TestClass()})

    assert doc == Document.from_json(
        json.dumps(
            {
                "id": doc.id,
                "text": "test text",
                "array": None,
                "dataframe": None,
                "mime_type": "text/plain",
                "metadata": {"some object": "<<CUSTOM ENCODING>>"},
                "id_hash_keys": ["text", "array", "dataframe", "blob"],
                "score": None,
                "embedding": None,
            }
        ),
        json_decoder=TestDecoder,
    )


@pytest.mark.unit
def test_flatten_document_no_meta():
    doc = Document(text="test text")
    assert doc.flatten() == {
        "id": doc.id,
        "text": "test text",
        "array": None,
        "dataframe": None,
        "blob": None,
        "mime_type": "text/plain",
        "id_hash_keys": ["text", "array", "dataframe", "blob"],
        "score": None,
        "embedding": None,
    }


@pytest.mark.unit
def test_flatten_document_with_flat_meta():
    doc = Document(text="test text", metadata={"some-key": "a value", "another-key": "another value!"})
    assert doc.flatten() == {
        "id": doc.id,
        "text": "test text",
        "array": None,
        "dataframe": None,
        "blob": None,
        "mime_type": "text/plain",
        "id_hash_keys": ["text", "array", "dataframe", "blob"],
        "score": None,
        "embedding": None,
        "some-key": "a value",
        "another-key": "another value!",
    }


@pytest.mark.unit
def test_flatten_document_with_nested_meta():
    doc = Document(text="test text", metadata={"some-key": "a value", "nested": {"key": 10, "key2": 50}})
    assert doc.flatten() == {
        "id": doc.id,
        "text": "test text",
        "array": None,
        "dataframe": None,
        "blob": None,
        "mime_type": "text/plain",
        "id_hash_keys": ["text", "array", "dataframe", "blob"],
        "score": None,
        "embedding": None,
        "some-key": "a value",
        "nested": {"key": 10, "key2": 50},
    }