Mirror of https://github.com/deepset-ai/haystack.git (synced 2025-10-11 07:58:12 +00:00)
Custom id hashing on documentstore level (#1910)
* adding dynamic id hashing
* Add latest docstring and tutorial changes
* added pr review
* Add latest docstring and tutorial changes
* fixed tests
* fix mypy error
* fix mypy issue
* ignore typing
* fixed correct check
* fixed tests
* try fixing the tests
* set id hash keys only if not none
* dont store id_hash_keys
* fix tests
* Add latest docstring and tutorial changes

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in: commit 6e8e3c68d9 (parent a846be99d1)
@@ -164,6 +164,27 @@ from disk and also indexed batchwise to the DocumentStore in order to prevent ou
 same question might be found in different contexts.
 - `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
 
+<a name="base.BaseDocumentStore.run"></a>
+#### run
+
+```python
+| run(documents: List[dict], index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, id_hash_keys: Optional[List[str]] = None)
+```
+
+Run requests of document stores
+
+Comment: We will gradually introduce the primitives. The document stores also accept dicts and parse them to documents.
+In the future, however, only documents themselves will be accepted. Parsing the dictionaries in the run function
+is therefore only an interim solution until the run function also accepts documents.
+
+**Arguments**:
+
+- `documents`: A list of dicts that are documents.
+- `headers`: Custom HTTP headers to pass to the document store client, if supported.
+- `index`: Optional name of index where the documents shall be written to.
+  If None, the DocumentStore's default index (self.index) will be used.
+- `id_hash_keys`: List of the fields that the hashes of the ids are generated from.
+
 <a name="base.get_batches_from_generator"></a>
 #### get\_batches\_from\_generator
 
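To make the extended signature concrete, here is a minimal sketch. It assumes a haystack 1.x environment where `InMemoryDocumentStore` is available and implements the abstract methods of `BaseDocumentStore`; the import path is an assumption, not part of this diff:

```python
from haystack.document_stores import InMemoryDocumentStore

store = InMemoryDocumentStore()

# run() still accepts plain dicts; they are parsed into Document objects internally,
# and the new id_hash_keys parameter is forwarded to Document.from_dict().
result, edge = store.run(documents=[{"content": "What is Haystack?"}], id_hash_keys=["content"])

assert edge == "output_1"  # run() returns an empty dict and the default output edge
assert len(store.get_all_documents()) == 1
```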
@@ -39,9 +39,10 @@ There's an easy option to convert from/to dicts via `from_dict()` and `to_dict`.
 In the range of [0,1], where 1 means extremely relevant.
 - `meta`: Meta fields for a document like name, url, or author in the form of a custom dict (any keys and values allowed).
 - `embedding`: Vector encoding of the text
-- `id_hash_keys`: Generate the document id from a custom list of strings.
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's attributes.
 If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
-not unique, you can provide custom strings here that will be used (e.g. ["filename_xy", "text_of_doc"].
+not unique, you can modify the metadata and pass e.g. "meta" to this field (e.g. ["content", "meta"]).
+In this case the id will be generated by using the content and the defined metadata.
 
 <a name="schema.Document.to_dict"></a>
 #### to\_dict
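The practical effect of the attribute-based keys, as a short sketch (`Document` as in `haystack.schema`, behaviour as described above):

```python
from haystack.schema import Document

# Same content, different meta: ids collide when hashing only over "content" ...
d1 = Document(content="Berlin is the capital of Germany.", meta={"source": "a"}, id_hash_keys=["content"])
d2 = Document(content="Berlin is the capital of Germany.", meta={"source": "b"}, id_hash_keys=["content"])
assert d1.id == d2.id

# ... but adding "meta" to id_hash_keys disambiguates the two documents.
d3 = Document(content="Berlin is the capital of Germany.", meta={"source": "a"}, id_hash_keys=["content", "meta"])
d4 = Document(content="Berlin is the capital of Germany.", meta={"source": "b"}, id_hash_keys=["content", "meta"])
assert d3.id != d4.id
```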
@@ -71,7 +72,7 @@ dict with content of the Document
 
 ```python
 | @classmethod
-| from_dict(cls, dict, field_map={})
+| from_dict(cls, dict, field_map={}, id_hash_keys=None)
 ```
 
 Create Document from dict. An optional field_map can be supplied to adjust for custom names of the keys in the
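A hedged usage sketch of the extended classmethod. The dict layout follows `to_dict()`'s output; note that `id_hash_keys` only takes effect when the dict carries no explicit `id` (see the `from_dict` changes further down):

```python
from haystack.schema import Document

doc = Document.from_dict(
    {"content": "Doc text", "meta": {"name": "doc1"}},
    id_hash_keys=["content", "meta"],  # applied because the dict has no explicit "id"
)
```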
@@ -7,6 +7,11 @@ from itertools import islice
 from abc import abstractmethod
 from pathlib import Path
 
+try:
+    from typing import Literal
+except ImportError:
+    from typing_extensions import Literal  # type: ignore
+
 from haystack.schema import Document, Label, MultiLabel
 from haystack.nodes.base import BaseComponent
 from haystack.errors import DuplicateDocumentError
@@ -342,9 +347,29 @@ class BaseDocumentStore(BaseComponent):
     @abstractmethod
     def delete_labels(self, index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None):
         pass
 
+    @abstractmethod
+    def _create_document_field_map(self) -> Dict:
+        pass
+
-    def run(self, documents: List[dict], index: Optional[str] = None, headers: Optional[Dict[str, str]] = None):  # type: ignore
-        self.write_documents(documents=documents, index=index, headers=headers)
+    def run(self, documents: List[dict], index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, id_hash_keys: Optional[List[str]] = None):  # type: ignore
+        """
+        Run requests of document stores
+
+        Comment: We will gradually introduce the primitives. The document stores also accept dicts and parse them to documents.
+        In the future, however, only documents themselves will be accepted. Parsing the dictionaries in the run function
+        is therefore only an interim solution until the run function also accepts documents.
+
+        :param documents: A list of dicts that are documents.
+        :param headers: Custom HTTP headers to pass to the document store client, if supported.
+        :param index: Optional name of index where the documents shall be written to.
+                      If None, the DocumentStore's default index (self.index) will be used.
+        :param id_hash_keys: List of the fields that the hashes of the ids are generated from.
+        """
+        field_map = self._create_document_field_map()
+        doc_objects = [Document.from_dict(d, field_map=field_map, id_hash_keys=id_hash_keys) for d in documents]
+        self.write_documents(documents=doc_objects, index=index, headers=headers)
         return {}, "output_1"
 
     @abstractmethod
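For orientation, a minimal sketch of what the new abstract method asks of a concrete store. The import path and the subclass name are assumptions for illustration only, and all other abstract methods of `BaseDocumentStore` are omitted:

```python
from typing import Dict

from haystack.document_stores.base import BaseDocumentStore


class MyDocumentStore(BaseDocumentStore):  # hypothetical subclass, other abstract methods omitted
    def _create_document_field_map(self) -> Dict:
        # No renaming needed: run() will call Document.from_dict() with an empty
        # field_map, so the dict keys are taken over as-is.
        return {}
```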
@@ -1,3 +1,4 @@
 
+from typing import Dict, Optional
 
 import requests
@@ -125,6 +126,12 @@ class GraphDBKnowledgeGraph(BaseKnowledgeGraph):
         results = self.query(sparql_query=sparql_query, index=index, headers=headers)
         return results
 
+    def _create_document_field_map(self) -> Dict:
+        """
+        There is no field mapping required
+        """
+        return {}
+
     def get_all_objects(self, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None):
         """
         Query the given index in the GraphDB instance for all its stored objects. Duplicates are not filtered.
@@ -225,6 +225,12 @@ class SQLDocumentStore(BaseDocumentStore):
                 batch_size=batch_size,
             )
             yield from result
 
+    def _create_document_field_map(self) -> Dict:
+        """
+        There is no field mapping required
+        """
+        return {}
+
     def _query(
         self,
@@ -79,9 +79,10 @@ class Document:
         In the range of [0,1], where 1 means extremely relevant.
         :param meta: Meta fields for a document like name, url, or author in the form of a custom dict (any keys and values allowed).
         :param embedding: Vector encoding of the text
-        :param id_hash_keys: Generate the document id from a custom list of strings.
+        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's attributes.
         If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
-        not unique, you can provide custom strings here that will be used (e.g. ["filename_xy", "text_of_doc"].
+        not unique, you can modify the metadata and pass e.g. "meta" to this field (e.g. ["content", "meta"]).
+        In this case the id will be generated by using the content and the defined metadata.
         """
 
         if content is None:
@@ -92,6 +93,14 @@ class Document:
         self.score = score
         self.meta = meta or {}
 
+        allowed_hash_key_attributes = ["content", "content_type", "score", "meta", "embedding"]
+
+        if id_hash_keys is not None:
+            if not set(id_hash_keys) <= set(allowed_hash_key_attributes):  # type: ignore
+                raise ValueError(f"You passed custom strings {id_hash_keys} to id_hash_keys which is deprecated. Supply instead a list of Document's attribute names that the id should be based on (e.g. {allowed_hash_key_attributes}). See https://github.com/deepset-ai/haystack/pull/1910 for details)")
+
         if embedding is not None:
             embedding = np.asarray(embedding)
         self.embedding = embedding
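The new validation can be seen in isolation with a sketch like the following (the error message is abbreviated in the comment):

```python
from haystack.schema import Document

try:
    # "key1" is not a Document attribute, so the old custom-string style now fails fast
    Document(content="some text", id_hash_keys=["key1"])
except ValueError as err:
    print(err)  # points to the allowed attribute names and PR #1910
```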
@@ -100,11 +109,30 @@ class Document:
         if id:
             self.id: str = str(id)
         else:
-            self.id: str = self._get_id(id_hash_keys)
+            self.id: str = self._get_id(id_hash_keys=id_hash_keys)
 
-    def _get_id(self, id_hash_keys):
-        final_hash_key = ":".join(id_hash_keys) if id_hash_keys else str(self.content)
-        return '{:02x}'.format(mmh3.hash128(final_hash_key, signed=False))
+    def _get_id(self, id_hash_keys: Optional[List[str]] = None):
+        """
+        Generate the id of a document by creating the hash of strings. By default the content of a document is
+        used to generate the hash. There are two ways of modifying the generated id of a document: either static keys
+        or a selection of the content.
+        :param id_hash_keys: Optional list of fields that should be dynamically used to generate the hash.
+        """
+        if id_hash_keys is None:
+            return '{:02x}'.format(mmh3.hash128(str(self.content), signed=False))
+
+        final_hash_key = ""
+        for attr in id_hash_keys:
+            final_hash_key += ":" + str(getattr(self, attr))
+
+        if final_hash_key == "":
+            raise ValueError("Can't create 'Document': 'id_hash_keys' must contain at least one of ['content', 'meta']")
+
+        return '{:02x}'.format(mmh3.hash128(final_hash_key, signed=False))
 
     def to_dict(self, field_map={}) -> Dict:
         """
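To make the hashing scheme concrete, the id derivation can be reproduced outside the class (`mmh3` is the hashing library the code above already uses; the values are illustrative):

```python
import mmh3

content = "Berlin is the capital of Germany."
meta = {"name": "doc1"}

# Default (id_hash_keys=None): hash over the stringified content only.
default_id = "{:02x}".format(mmh3.hash128(str(content), signed=False))

# With id_hash_keys=["content", "meta"]: each attribute is stringified and
# prefixed with ":" before hashing, mirroring the loop in _get_id().
final_hash_key = ":" + str(content) + ":" + str(meta)
custom_id = "{:02x}".format(mmh3.hash128(final_hash_key, signed=False))

print(default_id != custom_id)  # True: the meta dict now contributes to the id
```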
@@ -131,7 +159,7 @@ class Document:
         return _doc
 
     @classmethod
-    def from_dict(cls, dict, field_map={}):
+    def from_dict(cls, dict, field_map={}, id_hash_keys=None):
         """
         Create Document from dict. An optional field_map can be supplied to adjust for custom names of the keys in the
         input dict. This way you can work with standardized Document objects in Haystack, but adjust the format that
@@ -160,6 +188,9 @@ class Document:
             elif k in field_map:
                 k = field_map[k]
                 _new_doc[k] = v
 
+        if _doc.get("id") is None:
+            _new_doc["id_hash_keys"] = id_hash_keys
+
         # Convert list of rows to pd.DataFrame
         if _new_doc.get("content_type", None) == "table" and isinstance(_new_doc["content"], list):
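One consequence worth spelling out: an explicit id in the input dict takes precedence, because `id_hash_keys` is only attached when no id is present. A small sketch:

```python
from haystack.schema import Document

doc = Document.from_dict(
    {"id": "my-fixed-id", "content": "Doc text"},
    id_hash_keys=["content", "meta"],  # ignored here: the dict already carries an id
)
assert doc.id == "my-fixed-id"
```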
@@ -40,56 +40,57 @@ def test_init_elastic_client():
 
 
 def test_write_with_duplicate_doc_ids(document_store):
-    documents = [
+    duplicate_documents = [
         Document(
             content="Doc1",
-            id_hash_keys=["key1"]
+            id_hash_keys=["content"]
         ),
         Document(
-            content="Doc2",
-            id_hash_keys=["key1"]
+            content="Doc1",
+            id_hash_keys=["content"]
         )
     ]
-    document_store.write_documents(documents, duplicate_documents="skip")
+    document_store.write_documents(duplicate_documents, duplicate_documents="skip")
     assert len(document_store.get_all_documents()) == 1
     with pytest.raises(Exception):
-        document_store.write_documents(documents, duplicate_documents="fail")
+        document_store.write_documents(duplicate_documents, duplicate_documents="fail")
 
 
 @pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus", "weaviate"], indirect=True)
 def test_write_with_duplicate_doc_ids_custom_index(document_store):
-    documents = [
+    duplicate_documents = [
         Document(
             content="Doc1",
-            id_hash_keys=["key1"]
+            id_hash_keys=["content"]
         ),
         Document(
-            content="Doc2",
-            id_hash_keys=["key1"]
+            content="Doc1",
+            id_hash_keys=["content"]
         )
     ]
     document_store.delete_documents(index="haystack_custom_test")
-    document_store.write_documents(documents, index="haystack_custom_test", duplicate_documents="skip")
+    document_store.write_documents(duplicate_documents, index="haystack_custom_test", duplicate_documents="skip")
     assert len(document_store.get_all_documents(index="haystack_custom_test")) == 1
     with pytest.raises(DuplicateDocumentError):
-        document_store.write_documents(documents, index="haystack_custom_test", duplicate_documents="fail")
+        document_store.write_documents(duplicate_documents, index="haystack_custom_test", duplicate_documents="fail")
 
     # Weaviate manipulates document objects in-place when writing them to an index.
     # It generates a uuid based on the provided id and the index name where the document is added to.
     # We need to get rid of these generated uuids for this test and therefore reset the document objects.
     # As a result, the documents will receive a fresh uuid based on their id_hash_keys and a different index name.
     if isinstance(document_store, WeaviateDocumentStore):
-        documents = [
+        duplicate_documents = [
             Document(
                 content="Doc1",
-                id_hash_keys=["key1"]
+                id_hash_keys=["content"]
             ),
             Document(
-                content="Doc2",
-                id_hash_keys=["key1"]
+                content="Doc1",
+                id_hash_keys=["content"]
            )
        ]
        # writing to the default, empty index should still work
-        document_store.write_documents(documents, duplicate_documents="fail")
+        document_store.write_documents(duplicate_documents, duplicate_documents="fail")
 
 
 def test_get_all_documents_without_filters(document_store_with_docs):
@@ -105,17 +106,17 @@ def test_get_all_document_filter_duplicate_text_value(document_store):
         Document(
             content="Doc1",
             meta={"f1": "0"},
-            id_hash_keys=["Doc1", "1"]
+            id_hash_keys=["meta"]
         ),
         Document(
             content="Doc1",
             meta={"f1": "1", "meta_id": "0"},
-            id_hash_keys=["Doc1", "2"]
+            id_hash_keys=["meta"]
         ),
         Document(
             content="Doc2",
             meta={"f3": "0"},
-            id_hash_keys=["Doc2", "3"]
+            id_hash_keys=["meta"]
         )
     ]
     document_store.write_documents(documents)
@@ -124,6 +125,16 @@ def test_get_all_document_filter_duplicate_text_value(document_store):
     assert len(documents) == 1
     assert {d.meta["meta_id"] for d in documents} == {"0"}
 
+    documents = document_store.get_all_documents(filters={"f1": ["0"]})
+    assert documents[0].content == "Doc1"
+    assert len(documents) == 1
+    assert documents[0].meta.get("meta_id") is None
+
+    documents = document_store.get_all_documents(filters={"f3": ["0"]})
+    assert documents[0].content == "Doc2"
+    assert len(documents) == 1
+    assert documents[0].meta.get("meta_id") is None
+
 
 def test_get_all_documents_with_correct_filters(document_store_with_docs):
     documents = document_store_with_docs.get_all_documents(filters={"meta_field": ["test2"]})
@@ -1,4 +1,5 @@
 from haystack.schema import Document, Label, Answer, Span
+import pytest
 import numpy as np
 
 LABELS = [
@@ -152,9 +153,18 @@ def test_generate_doc_id_using_custom_list():
     text1 = "text1"
     text2 = "text2"
 
-    doc1_text1 = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["key1", text1])
-    doc2_text1 = Document(content=text1, meta={"name": "doc2"}, id_hash_keys=["key1", text1])
-    doc3_text2 = Document(content=text2, meta={"name": "doc3"}, id_hash_keys=["key1", text2])
+    doc1_meta1_id_by_content = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["content"])
+    doc1_meta2_id_by_content = Document(content=text1, meta={"name": "doc2"}, id_hash_keys=["content"])
+    assert doc1_meta1_id_by_content.id == doc1_meta2_id_by_content.id
 
-    assert doc1_text1.id == doc2_text1.id
+    doc1_meta1_id_by_content_and_meta = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["content", "meta"])
+    doc1_meta2_id_by_content_and_meta = Document(content=text1, meta={"name": "doc2"}, id_hash_keys=["content", "meta"])
+    assert doc1_meta1_id_by_content_and_meta.id != doc1_meta2_id_by_content_and_meta.id
+
+    doc1_text1 = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["content"])
+    doc3_text2 = Document(content=text2, meta={"name": "doc3"}, id_hash_keys=["content"])
     assert doc1_text1.id != doc3_text2.id
+
+    with pytest.raises(ValueError):
+        _ = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["content", "non_existing_field"])