mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-07-28 03:12:54 +00:00

* using text hash as id to prevent document duplication. Also providing a way to customize it. * Add latest docstring and tutorial changes * Fixing duplicate value test when text is same * Adding test for duplicate ids in document store * Changing exception to generic Exception type * add exception for inmemory. update docstring Document. remove id_hash_keys from object attribute * Add latest docstring and tutorial changes * Add latest docstring and tutorial changes Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
25 lines
826 B
Python
25 lines
826 B
Python
from haystack import Document
|
|
|
|
|
|
def test_generate_doc_id_using_text():
    """Check how default document ids relate to the document text.

    Two documents built from the same text but with different ``meta``
    are expected to share an id, while a document built from a different
    text is expected to get a different id.
    """
    shared_text, other_text = "text1", "text2"

    # Same text, different meta.
    first = Document(text=shared_text, meta={"name": "doc1"})
    second = Document(text=shared_text, meta={"name": "doc2"})
    # Different text.
    third = Document(text=other_text, meta={"name": "doc3"})

    assert first.id == second.id
    assert first.id != third.id
|
|
|
|
|
|
def test_generate_doc_id_using_custom_list():
    """Check document ids when an explicit ``id_hash_keys`` list is passed.

    Two documents constructed with the same text and the same
    ``id_hash_keys`` list (but different ``meta``) are expected to share
    an id; a document constructed with a different text and a different
    ``id_hash_keys`` list is expected to get a different id.
    """
    shared_text, other_text = "text1", "text2"

    # Same text and same hash keys, different meta.
    first = Document(
        text=shared_text,
        meta={"name": "doc1"},
        id_hash_keys=["key1", shared_text],
    )
    second = Document(
        text=shared_text,
        meta={"name": "doc2"},
        id_hash_keys=["key1", shared_text],
    )
    # Different text and different hash keys.
    third = Document(
        text=other_text,
        meta={"name": "doc3"},
        id_hash_keys=["key1", other_text],
    )

    assert first.id == second.id
    assert first.id != third.id
|