mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-31 09:10:15 +00:00
* fix json serialization * add missing markers * pylint * fix decoder bug * pylint * add some more tests * linting & windows * windows * windows * windows paths again
644 lines
19 KiB
Python
644 lines
19 KiB
Python
import dataclasses
import json
import textwrap
from pathlib import Path

import numpy as np
import pandas as pd
import pytest

from haystack.preview import Document
from haystack.preview.dataclasses.document import _create_id, DocumentEncoder, DocumentDecoder
|
|
|
|
|
|
@pytest.mark.unit
def test_document_is_immutable():
    """Assigning to a field of an existing Document must raise FrozenInstanceError."""
    frozen_doc = Document(content="test content")
    with pytest.raises(dataclasses.FrozenInstanceError):
        frozen_doc.content = "won't work"
|
|
|
|
|
|
@pytest.mark.unit
def test_content_type_unknown():
    """An unrecognised content_type ("other") is rejected with a ValueError."""
    expected_message = "Content type unknown: 'other'"
    with pytest.raises(ValueError, match=expected_message):
        Document(content="test content", content_type="other")
|
|
|
|
|
|
@pytest.mark.unit
def test_content_type_must_match_text():
    """String content is accepted as text and rejected for every other content type."""
    # Both the implicit and the explicit "text" type must construct without raising.
    Document(content="test content")
    Document(content="test content", content_type="text")

    for mismatched_type in ("table", "image", "audio"):
        with pytest.raises(ValueError, match="does not match the content type"):
            Document(content="test content", content_type=mismatched_type)
|
|
|
|
|
|
@pytest.mark.unit
def test_content_type_must_match_table():
    """DataFrame content is rejected when declared as text, image or audio."""
    for mismatched_type in ("text", "image", "audio"):
        with pytest.raises(ValueError, match="does not match the content type"):
            Document(content=pd.DataFrame([1, 2]), content_type=mismatched_type)
|
|
|
|
|
|
@pytest.mark.unit
def test_content_type_must_match_image_audio():
    """Path content is accepted as image or audio and rejected as text or table."""
    # Mimetypes are not checked - yet
    for accepted_type in ("image", "audio"):
        Document(content=Path(__file__), content_type=accepted_type)

    for mismatched_type in ("text", "table"):
        with pytest.raises(ValueError, match="does not match the content type"):
            Document(content=Path(__file__), content_type=mismatched_type)
|
|
|
|
|
|
@pytest.mark.unit
def test_id_hash_keys_require_metadata():
    """Naming an id_hash_key that is absent from the metadata raises a ValueError."""
    expected_message = "must be present in the metadata of the Document if you want to use it"
    with pytest.raises(ValueError, match=expected_message):
        Document(content="test content", id_hash_keys=["something"])
|
|
|
|
|
|
@pytest.mark.unit
def test_init_document_same_meta_as_main_fields():
    """
    A metadata key named like a main Document field ("score" here) is rejected.

    This is forbidden to prevent later issues with `Document.flatten()`
    """
    clashing_metadata = {"score": "10/10"}
    with pytest.raises(ValueError, match="score"):
        Document(content="test content", metadata=clashing_metadata)
|
|
|
|
|
|
@pytest.mark.unit
def test_basic_equality_type_mismatch():
    """A Document is never equal to a plain string, even one matching its content."""
    text_document = Document(content="test content")
    plain_string = "test content"
    assert text_document != plain_string
|
|
|
|
|
|
@pytest.mark.unit
def test_simple_table_document_equality():
    """Two table Documents built from separate but identical DataFrames are equal."""

    def build_table_document():
        # A fresh DataFrame per call so the two Documents never share content objects.
        return Document(content=pd.DataFrame([1, 2]), content_type="table")

    assert build_table_document() == build_table_document()
|
|
|
|
|
|
@pytest.mark.unit
def test_simple_image_document_equality():
    """Two image Documents pointing at the same file path are equal."""

    def build_image_document():
        # The path is rebuilt on every call so each Document gets its own Path object.
        return Document(content=Path(__file__).parent / "test_files" / "apple.jpg", content_type="image")

    assert build_image_document() == build_image_document()
|
|
|
|
|
|
@pytest.mark.unit
def test_equality_with_embeddings():
    """Documents with equal content and element-wise equal embeddings are equal."""

    def build_document():
        # A new array per Document: equality must not rely on object identity.
        return Document(content="test content", embedding=np.array([10, 10]))

    assert build_document() == build_document()
|
|
|
|
|
|
@pytest.mark.unit
def test_equality_with_embeddings_shape_check():
    """Embeddings holding the same values in different shapes make Documents unequal."""
    flat_embedding = np.array([10, 10])
    nested_embedding = np.array([[[10, 10]]])

    flat_doc = Document(content="test content", embedding=flat_embedding)
    nested_doc = Document(content="test content", embedding=nested_embedding)

    assert flat_doc != nested_doc
|
|
|
|
|
|
@pytest.mark.unit
def test_equality_with_scores():
    """Documents with the same content and the same score are equal."""
    first, second = (Document(content="test content", score=100) for _ in range(2))
    assert first == second
|
|
|
|
|
|
@pytest.mark.unit
def test_equality_with_simple_metadata():
    """Documents with identical flat metadata are equal."""

    def build_document():
        # The metadata dict is recreated per call so the Documents do not share it.
        return Document(content="test content", metadata={"value": 1, "another": "value"})

    assert build_document() == build_document()
|
|
|
|
|
|
@pytest.mark.unit
def test_equality_with_different_metadata():
    """An extra metadata key is enough to make two Documents unequal."""
    smaller_metadata = {"value": 1}
    larger_metadata = {"value": 1, "another": "value"}

    smaller_doc = Document(content="test content", metadata=smaller_metadata)
    larger_doc = Document(content="test content", metadata=larger_metadata)

    assert smaller_doc != larger_doc
|
|
|
|
|
|
@pytest.mark.unit
def test_equality_with_nested_metadata():
    """Documents with identical nested metadata dictionaries are equal."""

    def build_document():
        # Fresh nested dicts on every call, so nothing is shared between the Documents.
        return Document(content="test content", metadata={"value": {"another": "value"}})

    assert build_document() == build_document()
|
|
|
|
|
|
@pytest.mark.unit
def test_equality_with_metadata_with_objects():
    """Documents are equal when their metadata holds equal arrays, paths and custom objects."""

    class TestObject:
        def __eq__(self, other):
            # Return an explicit boolean for every input instead of falling
            # through to None when the types differ.
            return type(self) is type(other)

    def build_document():
        # Every call creates new array, Path and TestObject instances, so the
        # comparison cannot succeed through object identity alone.
        return Document(
            content="test content",
            metadata={"value": np.array([0, 1, 2]), "path": Path(__file__), "obj": TestObject()},
        )

    assert build_document() == build_document()
|
|
|
|
|
|
@pytest.mark.unit
def test_default_text_document_to_dict():
    """to_dict() on a default text Document returns every field with its default value."""
    expected = {
        "id": _create_id(classname=Document.__name__, content="test content"),
        "content": "test content",
        "content_type": "text",
        "metadata": {},
        "id_hash_keys": [],
        "score": None,
        "embedding": None,
    }
    assert Document(content="test content").to_dict() == expected
|
|
|
|
|
|
@pytest.mark.unit
def test_default_text_document_from_dict():
    """from_dict() on a default text payload equals a directly constructed Document."""
    payload = {
        "id": _create_id(classname=Document.__name__, content="test content"),
        "content": "test content",
        "content_type": "text",
        "metadata": {},
        "id_hash_keys": [],
        "score": None,
        "embedding": None,
    }
    rebuilt = Document.from_dict(payload)
    assert rebuilt == Document(content="test content")
|
|
|
|
|
|
@pytest.mark.unit
def test_default_table_document_to_dict():
    """to_dict() on a table Document keeps the DataFrame as content plus default fields."""
    df = pd.DataFrame([1, 2])
    as_dict = Document(content=df, content_type="table").to_dict()

    # The DataFrame is checked separately with .equals(); the remaining plain
    # values are then compared as an ordinary dict.
    content = as_dict.pop("content")
    assert content.equals(df)

    expected_rest = {
        "id": _create_id(classname=Document.__name__, content=df),
        "content_type": "table",
        "metadata": {},
        "id_hash_keys": [],
        "score": None,
        "embedding": None,
    }
    assert as_dict == expected_rest
|
|
|
|
|
|
@pytest.mark.unit
def test_default_table_document_from_dict():
    """from_dict() on a table payload equals a directly constructed table Document."""

    def new_table():
        # Each use gets its own DataFrame instance with the same values.
        return pd.DataFrame([1, 2])

    payload = {
        "id": _create_id(classname=Document.__name__, content=new_table()),
        "content": new_table(),
        "content_type": "table",
        "metadata": {},
        "id_hash_keys": [],
        "score": None,
        "embedding": None,
    }
    rebuilt = Document.from_dict(payload)
    assert rebuilt == Document(content=new_table(), content_type="table")
|
|
|
|
|
|
@pytest.mark.unit
def test_default_image_document_to_dict():
    """to_dict() on an image Document keeps the Path as content plus default fields."""
    image_path = Path(__file__).parent / "test_files" / "apple.jpg"
    expected = {
        "id": _create_id(classname=Document.__name__, content=image_path),
        "content": image_path,
        "content_type": "image",
        "metadata": {},
        "id_hash_keys": [],
        "score": None,
        "embedding": None,
    }
    assert Document(content=image_path, content_type="image").to_dict() == expected
|
|
|
|
|
|
@pytest.mark.unit
def test_default_image_document_from_dict():
    """from_dict() on an image payload equals a directly constructed image Document."""
    image_path = Path(__file__).parent / "test_files" / "apple.jpg"
    payload = {
        "id": _create_id(classname=Document.__name__, content=image_path),
        "content": image_path,
        "content_type": "image",
        "metadata": {},
        "id_hash_keys": [],
        "score": None,
        "embedding": None,
    }
    rebuilt = Document.from_dict(payload)
    assert rebuilt == Document(content=image_path, content_type="image")
|
|
|
|
|
|
@pytest.mark.unit
def test_document_with_most_attributes_to_dict():
    """
    to_dict() returns every explicitly set attribute unchanged.

    This tests also id_hash_keys
    """
    doc = Document(
        content="test content",
        content_type="text",
        metadata={"some": "values", "test": 10},
        id_hash_keys=["test"],
        score=0.99,
        embedding=np.zeros([10, 10]),
    )
    dictionary = doc.to_dict()

    # array_equal compares shape as well as values; `(a == b).all()` would
    # broadcast and accept a scalar or a differently shaped array of zeros.
    embedding = dictionary.pop("embedding")
    assert np.array_equal(embedding, np.zeros([10, 10]))

    assert dictionary == {
        "id": _create_id(
            classname=Document.__name__,
            content="test content",
            id_hash_keys=["test"],
            metadata={"some": "values", "test": 10},
        ),
        "content": "test content",
        "content_type": "text",
        "metadata": {"some": "values", "test": 10},
        "id_hash_keys": ["test"],
        "score": 0.99,
    }
|
|
|
|
|
|
@pytest.mark.unit
def test_document_with_most_attributes_from_dict():
    """from_dict() with every attribute set equals the equivalent constructor call."""
    embedding = np.zeros([10, 10])
    payload = {
        "id": _create_id(
            classname=Document.__name__,
            content="test content",
            id_hash_keys=["test"],
            metadata={"some": "values", "test": 10},
        ),
        "content": "test content",
        "content_type": "text",
        "metadata": {"some": "values", "test": 10},
        "id_hash_keys": ["test"],
        "score": 0.99,
        "embedding": embedding,
    }
    expected = Document(
        content="test content",
        content_type="text",
        metadata={"some": "values", "test": 10},
        id_hash_keys=["test"],
        score=0.99,
        embedding=embedding,
    )
    assert Document.from_dict(payload) == expected
|
|
|
|
|
|
@pytest.mark.unit
def test_default_text_document_to_json():
    """to_json(indent=4) on a default text Document yields the expected JSON text."""
    doc_id = _create_id(classname=Document.__name__, content="test content")
    # dedent() strips the four-space margin shared by every line of the
    # concatenated literal, leaving JSON indented by four spaces per level.
    expected_json = textwrap.dedent(
        """    {
        "id": \""""
        + doc_id
        + """\",
        "content": "test content",
        "content_type": "text",
        "metadata": {},
        "id_hash_keys": [],
        "score": null,
        "embedding": null
    }"""
    ).strip()
    actual_json = Document(content="test content").to_json(indent=4).strip()
    assert actual_json == expected_json
|
|
|
|
|
|
@pytest.mark.unit
def test_default_text_document_from_json():
    """from_json() on a default text payload equals a directly constructed Document."""
    doc_id = _create_id(classname=Document.__name__, content="test content")
    json_payload = (
        """{"id": \""""
        + doc_id
        + """\",
        "content": "test content",
        "content_type": "text",
        "metadata": {},
        "id_hash_keys": [],
        "score": null,
        "embedding": null
        }"""
    )
    expected = Document(content="test content")
    assert expected == Document.from_json(json_payload)
|
|
|
|
|
|
@pytest.mark.unit
def test_default_table_document_to_json():
    """to_json(indent=4) on a table Document embeds the DataFrame as its own JSON string."""
    df = pd.DataFrame([1, 2])
    # json.dumps escapes the embedded DataFrame JSON itself, so the expected
    # text does not depend on hand-written quote escaping.
    expected_json = json.dumps(
        {
            "id": _create_id(classname=Document.__name__, content=df),
            "content": df.to_json(),
            "content_type": "table",
            "metadata": {},
            "id_hash_keys": [],
            "score": None,
            "embedding": None,
        },
        indent=4,
    )
    actual_json = Document(content=df, content_type="table").to_json(indent=4).strip()
    assert actual_json == expected_json
|
|
|
|
|
|
@pytest.mark.unit
def test_default_table_document_from_json():
    """from_json() on a table payload equals a directly constructed table Document."""
    df = pd.DataFrame([1, 2])
    # The payload is produced by json.dumps so the nested DataFrame JSON is
    # escaped correctly without manual quote handling.
    json_payload = json.dumps(
        {
            "id": _create_id(classname=Document.__name__, content=df),
            "content": df.to_json(),
            "content_type": "table",
            "metadata": {},
            "id_hash_keys": [],
            "score": None,
            "embedding": None,
        }
    )
    expected = Document(content=df, content_type="table")
    assert expected == Document.from_json(json_payload)
|
|
|
|
|
|
@pytest.mark.unit
def test_default_image_document_to_json():
    """to_json(indent=4) on an image Document writes the content as the absolute path string."""
    path = Path(__file__).parent / "test_files" / "apple.jpg"
    # json.dumps escapes backslashes (Windows paths) and any other special
    # character in the path, which manual backslash doubling did not cover.
    expected_json = json.dumps(
        {
            "id": _create_id(classname=Document.__name__, content=path),
            "content": str(path.absolute()),
            "content_type": "image",
            "metadata": {},
            "id_hash_keys": [],
            "score": None,
            "embedding": None,
        },
        indent=4,
    )
    actual_json = Document(content=path, content_type="image").to_json(indent=4).strip()
    assert actual_json == expected_json
|
|
|
|
|
|
@pytest.mark.unit
def test_default_image_document_from_json():
    """from_json() on an image payload equals a directly constructed image Document."""
    path = Path(__file__).parent / "test_files" / "apple.jpg"
    # json.dumps takes care of escaping the path (backslashes on Windows,
    # quotes, non-ASCII characters) instead of manual backslash doubling.
    json_payload = json.dumps(
        {
            "id": _create_id(classname=Document.__name__, content=path),
            "content": str(path.absolute()),
            "content_type": "image",
            "metadata": {},
            "id_hash_keys": [],
            "score": None,
            "embedding": None,
        }
    )
    expected = Document(content=path, content_type="image")
    assert expected == Document.from_json(json_payload)
|
|
|
|
|
|
@pytest.mark.unit
def test_full_document_to_json(tmp_path):
    """
    to_json(indent=4) serialises object metadata, path metadata and an array embedding.

    The expected text has the custom object as its repr, the path as its
    absolute string form and the embedding as a list of numbers.
    """

    class TestClass:
        def __repr__(self):
            return "<the object>"

    file_path = tmp_path / "test.txt"
    doc_1 = Document(
        content="test content",
        metadata={"some object": TestClass(), "a path": file_path},
        embedding=np.array([1, 2, 3, 4]),
    )
    doc_json = doc_1.to_json(indent=4).strip()

    # json.dumps escapes the temporary path correctly on every platform,
    # replacing the manual backslash doubling.
    expected_json = json.dumps(
        {
            "id": _create_id(classname=Document.__name__, content="test content"),
            "content": "test content",
            "content_type": "text",
            "metadata": {"some object": "<the object>", "a path": str(file_path.absolute())},
            "id_hash_keys": [],
            "score": None,
            "embedding": [1, 2, 3, 4],
        },
        indent=4,
    )
    assert doc_json == expected_json
|
|
|
|
|
|
@pytest.mark.unit
def test_full_document_from_json(tmp_path):
    """from_json() restores path metadata as a string and the embedding from a JSON list."""
    file_path = tmp_path / "test.txt"
    doc_1 = Document(
        content="test content",
        metadata={"a path": str(file_path)},  # Paths are not cast back to Path objects
        embedding=np.array([1, 2, 3, 4]),
    )
    # json.dumps escapes the temporary path correctly on every platform,
    # replacing the manual backslash doubling.
    json_payload = json.dumps(
        {
            "id": _create_id(classname=Document.__name__, content="test content"),
            "content": "test content",
            "content_type": "text",
            "metadata": {"a path": str(file_path.absolute())},
            "id_hash_keys": [],
            "score": None,
            "embedding": [1, 2, 3, 4],
        }
    )
    doc_2 = Document.from_json(json_payload)
    assert doc_1 == doc_2
|
|
|
|
|
|
@pytest.mark.unit
def test_to_json_custom_encoder(tmp_path):
    """to_json() uses the encoder class passed as json_encoder for metadata objects."""

    class SerializableTestClass:
        ...

    class TestEncoder(DocumentEncoder):
        def default(self, obj):
            # Anything that is not the marker class goes to the stock encoder.
            if not isinstance(obj, SerializableTestClass):
                return DocumentEncoder.default(self, obj)
            return "<<CUSTOM ENCODING>>"

    doc_id = _create_id(classname=Document.__name__, content="test content")
    # dedent() strips the four-space margin shared by every line of the
    # concatenated literal, leaving JSON indented by four spaces per level.
    expected_json = textwrap.dedent(
        """    {
        "id": \""""
        + doc_id
        + """\",
        "content": "test content",
        "content_type": "text",
        "metadata": {
            "some object": "<<CUSTOM ENCODING>>"
        },
        "id_hash_keys": [],
        "score": null,
        "embedding": null
    }"""
    ).strip()

    doc = Document(content="test content", metadata={"some object": SerializableTestClass()})
    doc_json = doc.to_json(indent=4, json_encoder=TestEncoder).strip()

    assert doc_json == expected_json
|
|
|
|
|
|
@pytest.mark.unit
def test_from_json_custom_decoder():
    """from_json() uses the decoder class passed as json_decoder to rebuild metadata objects."""

    class TestClass:
        def __eq__(self, other):
            return type(other) is type(self)

    class TestDecoder(DocumentDecoder):
        def __init__(self, *args, **kwargs):
            # NOTE(review): the received args/kwargs are not forwarded; only the
            # object hook is handed to the parent constructor.
            super().__init__(object_hook=self.object_hook)

        def object_hook(self, dictionary):
            # Replace every marker string in the metadata with a TestClass instance.
            if "metadata" in dictionary:
                metadata = dictionary["metadata"]
                marked_keys = [key for key, value in metadata.items() if value == "<<CUSTOM ENCODING>>"]
                for key in marked_keys:
                    metadata[key] = TestClass()
            return dictionary

    doc_id = _create_id(classname=Document.__name__, content="test content")
    expected = Document(content="test content", metadata={"some object": TestClass()})
    json_payload = (
        """ {
        "id": \""""
        + doc_id
        + """\",
        "content": "test content",
        "content_type": "text",
        "metadata": {
            "some object": "<<CUSTOM ENCODING>>"
        },
        "id_hash_keys": [],
        "score": null,
        "embedding": null
        }"""
    )

    assert expected == Document.from_json(json_payload, json_decoder=TestDecoder)
|
|
|
|
|
|
@pytest.mark.unit
def test_flatten_text_document_no_meta():
    """flatten() on a Document without metadata returns the main fields and no "metadata" key."""
    expected = {
        "id": _create_id(classname=Document.__name__, content="test content"),
        "content": "test content",
        "content_type": "text",
        "id_hash_keys": [],
        "score": None,
        "embedding": None,
    }
    assert Document(content="test content").flatten() == expected
|
|
|
|
|
|
@pytest.mark.unit
def test_flatten_text_document():
    """flatten() lifts each metadata entry to the top level next to the main fields."""
    doc = Document(content="test content", metadata={"name": "document name", "page": 123})
    expected = {
        "id": _create_id(classname=Document.__name__, content="test content"),
        "content": "test content",
        "content_type": "text",
        "name": "document name",
        "page": 123,
        "id_hash_keys": [],
        "score": None,
        "embedding": None,
    }
    assert doc.flatten() == expected
|
|
|
|
|
|
@pytest.mark.unit
def test_flatten_table_document():
    """flatten() on a table Document keeps the DataFrame and lifts metadata to the top level."""
    df = pd.DataFrame([1, 2])
    doc = Document(content=df, content_type="table", metadata={"table-name": "table title", "section": 3})
    flat = doc.flatten()

    # The DataFrame is checked separately with .equals(); the remaining plain
    # values are then compared as an ordinary dict.
    content = flat.pop("content")
    assert content.equals(df)

    expected_rest = {
        "id": _create_id(classname=Document.__name__, content=df),
        "content_type": "table",
        "table-name": "table title",
        "section": 3,
        "id_hash_keys": [],
        "score": None,
        "embedding": None,
    }
    assert flat == expected_rest
|
|
|
|
|
|
@pytest.mark.unit
def test_flatten_image_document():
    """flatten() on an image Document keeps the Path and lifts metadata to the top level."""
    image_path = Path(__file__).parent / "test_files" / "apple.jpg"
    doc = Document(
        content=image_path, content_type="image", metadata={"image title": "The Apple", "year": 1993}
    )
    expected = {
        "id": _create_id(classname=Document.__name__, content=image_path),
        "content": image_path,
        "content_type": "image",
        "image title": "The Apple",
        "year": 1993,
        "id_hash_keys": [],
        "score": None,
        "embedding": None,
    }
    assert doc.flatten() == expected
|