haystack/test/preview/dataclasses/test_dataclasses.py
ZanSara 3a6db68408
feat: allow filtering documents on all fields (v2) (#4773)
* extend tests

* remove stray test

* pylint

* mypy

* review feedback

* fix tests

* fix last tests

* remove comment

* remove print statement

* pylint

* add flatten test

* remove direct acces/ direct write in docstore tests

* fix tests
2023-05-10 16:33:47 +02:00

229 lines
6.5 KiB
Python

from pathlib import Path
import pytest
import pandas as pd
import numpy as np
import pytest
from haystack.preview import Document
from haystack.preview.dataclasses.document import _create_id
@pytest.mark.unit
def test_init_document_same_meta_as_main_fields():
"""
This is forbidden to prevent later issues with `Document.flatten()`
"""
with pytest.raises(ValueError, match="score"):
Document(content="test content", metadata={"score": "10/10"})
@pytest.mark.unit
def test_default_text_document_to_dict():
assert Document(content="test content").to_dict() == {
"id": _create_id(classname=Document.__name__, content="test content"),
"content": "test content",
"content_type": "text",
"metadata": {},
"id_hash_keys": [],
"score": None,
"embedding": None,
}
@pytest.mark.unit
def test_default_text_document_from_dict():
assert Document.from_dict(
{
"id": _create_id(classname=Document.__name__, content="test content"),
"content": "test content",
"content_type": "text",
"metadata": {},
"id_hash_keys": [],
"score": None,
"embedding": None,
}
) == Document(content="test content")
@pytest.mark.unit
def test_default_table_document_to_dict():
df = pd.DataFrame([1, 2])
dictionary = Document(content=df, content_type="table").to_dict()
dataframe = dictionary.pop("content")
assert dataframe.equals(df)
assert dictionary == {
"id": _create_id(classname=Document.__name__, content=df),
"content_type": "table",
"metadata": {},
"id_hash_keys": [],
"score": None,
"embedding": None,
}
@pytest.mark.unit
def test_default_table_document_from_dict():
df = pd.DataFrame([1, 2])
assert Document.from_dict(
{
"id": _create_id(classname=Document.__name__, content=df),
"content": df,
"content_type": "table",
"metadata": {},
"id_hash_keys": [],
"score": None,
"embedding": None,
}
) == Document(content=df, content_type="table")
@pytest.mark.unit
def test_default_image_document_to_dict():
path = Path(__file__).parent / "test_files" / "apple.jpg"
assert Document(content=path, content_type="image").to_dict() == {
"id": _create_id(classname=Document.__name__, content=path),
"content": path,
"content_type": "image",
"metadata": {},
"id_hash_keys": [],
"score": None,
"embedding": None,
}
@pytest.mark.unit
def test_default_image_document_from_dict():
path = Path(__file__).parent / "test_files" / "apple.jpg"
assert Document.from_dict(
{
"id": _create_id(classname=Document.__name__, content=path),
"content": path,
"content_type": "image",
"metadata": {},
"id_hash_keys": [],
"score": None,
"embedding": None,
}
) == Document(content=path, content_type="image")
@pytest.mark.unit
def test_document_with_most_attributes_to_dict():
"""
This tests also id_hash_keys
"""
doc = Document(
content="test content",
content_type="text",
metadata={"some": "values", "test": 10},
id_hash_keys=["test"],
score=0.99,
embedding=np.zeros([10, 10]),
)
dictionary = doc.to_dict()
embedding = dictionary.pop("embedding")
assert (embedding == np.zeros([10, 10])).all()
assert dictionary == {
"id": _create_id(
classname=Document.__name__,
content="test content",
id_hash_keys=["test"],
metadata={"some": "values", "test": 10},
),
"content": "test content",
"content_type": "text",
"metadata": {"some": "values", "test": 10},
"id_hash_keys": ["test"],
"score": 0.99,
}
@pytest.mark.unit
def test_document_with_most_attributes_from_dict():
embedding = np.zeros([10, 10])
assert Document.from_dict(
{
"id": _create_id(
classname=Document.__name__,
content="test content",
id_hash_keys=["test"],
metadata={"some": "values", "test": 10},
),
"content": "test content",
"content_type": "text",
"metadata": {"some": "values", "test": 10},
"id_hash_keys": ["test"],
"score": 0.99,
"embedding": embedding,
}
) == Document(
content="test content",
content_type="text",
metadata={"some": "values", "test": 10},
id_hash_keys=["test"],
score=0.99,
embedding=embedding,
)
def test_flatten_text_document_no_meta():
assert Document(content="test content").flatten() == {
"id": _create_id(classname=Document.__name__, content="test content"),
"content": "test content",
"content_type": "text",
"id_hash_keys": [],
"score": None,
"embedding": None,
}
def test_flatten_text_document():
assert Document(content="test content", metadata={"name": "document name", "page": 123}).flatten() == {
"id": _create_id(classname=Document.__name__, content="test content"),
"content": "test content",
"content_type": "text",
"name": "document name",
"page": 123,
"id_hash_keys": [],
"score": None,
"embedding": None,
}
def test_flatten_table_document():
df = pd.DataFrame([1, 2])
flat = Document(content=df, content_type="table", metadata={"table-name": "table title", "section": 3}).flatten()
dataframe = flat.pop("content")
assert dataframe.equals(df)
assert flat == {
"id": _create_id(classname=Document.__name__, content=df),
"content_type": "table",
"table-name": "table title",
"section": 3,
"id_hash_keys": [],
"score": None,
"embedding": None,
}
def test_flatten_image_document():
path = Path(__file__).parent / "test_files" / "apple.jpg"
assert Document(
content=path, content_type="image", metadata={"image title": "The Apple", "year": 1993}
).flatten() == {
"id": _create_id(classname=Document.__name__, content=path),
"content": path,
"content_type": "image",
"image title": "The Apple",
"year": 1993,
"id_hash_keys": [],
"score": None,
"embedding": None,
}