haystack/test/preview/dataclasses/test_dataclasses.py

229 lines
6.5 KiB
Python
Raw Normal View History

from pathlib import Path
import pytest
import pandas as pd
import numpy as np
import pytest
from haystack.preview import Document
from haystack.preview.dataclasses.document import _create_id
@pytest.mark.unit
def test_init_document_same_meta_as_main_fields():
"""
This is forbidden to prevent later issues with `Document.flatten()`
"""
with pytest.raises(ValueError, match="score"):
Document(content="test content", metadata={"score": "10/10"})
@pytest.mark.unit
def test_default_text_document_to_dict():
assert Document(content="test content").to_dict() == {
"id": _create_id(classname=Document.__name__, content="test content"),
"content": "test content",
"content_type": "text",
"metadata": {},
"id_hash_keys": [],
"score": None,
"embedding": None,
}
@pytest.mark.unit
def test_default_text_document_from_dict():
assert Document.from_dict(
{
"id": _create_id(classname=Document.__name__, content="test content"),
"content": "test content",
"content_type": "text",
"metadata": {},
"id_hash_keys": [],
"score": None,
"embedding": None,
}
) == Document(content="test content")
@pytest.mark.unit
def test_default_table_document_to_dict():
df = pd.DataFrame([1, 2])
dictionary = Document(content=df, content_type="table").to_dict()
dataframe = dictionary.pop("content")
assert dataframe.equals(df)
assert dictionary == {
"id": _create_id(classname=Document.__name__, content=df),
"content_type": "table",
"metadata": {},
"id_hash_keys": [],
"score": None,
"embedding": None,
}
@pytest.mark.unit
def test_default_table_document_from_dict():
df = pd.DataFrame([1, 2])
assert Document.from_dict(
{
"id": _create_id(classname=Document.__name__, content=df),
"content": df,
"content_type": "table",
"metadata": {},
"id_hash_keys": [],
"score": None,
"embedding": None,
}
) == Document(content=df, content_type="table")
@pytest.mark.unit
def test_default_image_document_to_dict():
path = Path(__file__).parent / "test_files" / "apple.jpg"
assert Document(content=path, content_type="image").to_dict() == {
"id": _create_id(classname=Document.__name__, content=path),
"content": path,
"content_type": "image",
"metadata": {},
"id_hash_keys": [],
"score": None,
"embedding": None,
}
@pytest.mark.unit
def test_default_image_document_from_dict():
path = Path(__file__).parent / "test_files" / "apple.jpg"
assert Document.from_dict(
{
"id": _create_id(classname=Document.__name__, content=path),
"content": path,
"content_type": "image",
"metadata": {},
"id_hash_keys": [],
"score": None,
"embedding": None,
}
) == Document(content=path, content_type="image")
@pytest.mark.unit
def test_document_with_most_attributes_to_dict():
"""
This tests also id_hash_keys
"""
doc = Document(
content="test content",
content_type="text",
metadata={"some": "values", "test": 10},
id_hash_keys=["test"],
score=0.99,
embedding=np.zeros([10, 10]),
)
dictionary = doc.to_dict()
embedding = dictionary.pop("embedding")
assert (embedding == np.zeros([10, 10])).all()
assert dictionary == {
"id": _create_id(
classname=Document.__name__,
content="test content",
id_hash_keys=["test"],
metadata={"some": "values", "test": 10},
),
"content": "test content",
"content_type": "text",
"metadata": {"some": "values", "test": 10},
"id_hash_keys": ["test"],
"score": 0.99,
}
@pytest.mark.unit
def test_document_with_most_attributes_from_dict():
embedding = np.zeros([10, 10])
assert Document.from_dict(
{
"id": _create_id(
classname=Document.__name__,
content="test content",
id_hash_keys=["test"],
metadata={"some": "values", "test": 10},
),
"content": "test content",
"content_type": "text",
"metadata": {"some": "values", "test": 10},
"id_hash_keys": ["test"],
"score": 0.99,
"embedding": embedding,
}
) == Document(
content="test content",
content_type="text",
metadata={"some": "values", "test": 10},
id_hash_keys=["test"],
score=0.99,
embedding=embedding,
)
def test_flatten_text_document_no_meta():
assert Document(content="test content").flatten() == {
"id": _create_id(classname=Document.__name__, content="test content"),
"content": "test content",
"content_type": "text",
"id_hash_keys": [],
"score": None,
"embedding": None,
}
def test_flatten_text_document():
assert Document(content="test content", metadata={"name": "document name", "page": 123}).flatten() == {
"id": _create_id(classname=Document.__name__, content="test content"),
"content": "test content",
"content_type": "text",
"name": "document name",
"page": 123,
"id_hash_keys": [],
"score": None,
"embedding": None,
}
def test_flatten_table_document():
df = pd.DataFrame([1, 2])
flat = Document(content=df, content_type="table", metadata={"table-name": "table title", "section": 3}).flatten()
dataframe = flat.pop("content")
assert dataframe.equals(df)
assert flat == {
"id": _create_id(classname=Document.__name__, content=df),
"content_type": "table",
"table-name": "table title",
"section": 3,
"id_hash_keys": [],
"score": None,
"embedding": None,
}
def test_flatten_image_document():
path = Path(__file__).parent / "test_files" / "apple.jpg"
assert Document(
content=path, content_type="image", metadata={"image title": "The Apple", "year": 1993}
).flatten() == {
"id": _create_id(classname=Document.__name__, content=path),
"content": path,
"content_type": "image",
"image title": "The Apple",
"year": 1993,
"id_hash_keys": [],
"score": None,
"embedding": None,
}