haystack/test/test_schema.py
Lalit Pagaria f46b09c756
Using text hash as id to prevent document duplication (#1000)
* using text hash as id to prevent document duplication. Also providing a way to customize it.

* Add latest docstring and tutorial changes

* Fixing duplicate value test when text is same

* Adding test for duplicate ids in document store

* Changing exception to generic Exception type

* add exception for inmemory. update docstring Document. remove id_hash_keys from object attribute

* Add latest docstring and tutorial changes

* Add latest docstring and tutorial changes

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
2021-05-17 17:51:52 +02:00

25 lines
826 B
Python

from haystack import Document
def test_generate_doc_id_using_text():
    """Check that a Document's default id follows its text, not its meta.

    Two documents built from the same text but different ``meta`` must end
    up with equal ids, while a document built from different text must get
    a different id.
    """
    text1 = "text1"
    text2 = "text2"
    # Same text, different meta.
    doc1_text1 = Document(text=text1, meta={"name": "doc1"})
    doc2_text1 = Document(text=text1, meta={"name": "doc2"})
    # Different text.
    doc3_text2 = Document(text=text2, meta={"name": "doc3"})

    assert doc1_text1.id == doc2_text1.id
    assert doc1_text1.id != doc3_text2.id
def test_generate_doc_id_using_custom_list():
    """Check Document ids when an explicit ``id_hash_keys`` list is passed.

    Documents created with the same text and the same ``id_hash_keys`` (but
    different ``meta``) must have equal ids; a document created with a
    different text and a different ``id_hash_keys`` list must have a
    different id.
    """
    text1 = "text1"
    text2 = "text2"
    # Same text and same id_hash_keys, different meta.
    doc1_text1 = Document(text=text1, meta={"name": "doc1"}, id_hash_keys=["key1", text1])
    doc2_text1 = Document(text=text1, meta={"name": "doc2"}, id_hash_keys=["key1", text1])
    # Both the text and the id_hash_keys differ from the documents above.
    doc3_text2 = Document(text=text2, meta={"name": "doc3"}, id_hash_keys=["key1", text2])

    assert doc1_text1.id == doc2_text1.id
    assert doc1_text1.id != doc3_text2.id