mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-24 13:38:53 +00:00
refactor: make sure that Document's id_hash_keys has a valid value (#6112)
* fix handling id_hash_keys * reno * handle empty id_hash_keys in post_init * fix * reno * test
This commit is contained in:
parent
9f3b6512be
commit
ef40c7c728
@ -102,9 +102,6 @@ class AzureOCRDocumentConverter:
|
||||
else:
|
||||
text = result.content
|
||||
|
||||
if id_hash_keys:
|
||||
document = Document(text=text, id_hash_keys=id_hash_keys)
|
||||
else:
|
||||
document = Document(text=text)
|
||||
document = Document(text=text, id_hash_keys=id_hash_keys)
|
||||
|
||||
return document
|
||||
|
||||
@ -61,10 +61,7 @@ class TikaDocumentConverter:
|
||||
if not extracted_text:
|
||||
logger.warning("Skipping file at '%s' as Tika was not able to extract any content.", str(path))
|
||||
continue
|
||||
if id_hash_keys:
|
||||
document = Document(text=extracted_text, id_hash_keys=id_hash_keys)
|
||||
else:
|
||||
document = Document(text=extracted_text)
|
||||
document = Document(text=extracted_text, id_hash_keys=id_hash_keys)
|
||||
documents.append(document)
|
||||
except Exception as e:
|
||||
logger.error("Could not convert file at '%s' to Document. Error: %s", str(path), e)
|
||||
|
||||
@ -48,18 +48,20 @@ class DocumentDecoder(json.JSONDecoder):
|
||||
return dictionary
|
||||
|
||||
|
||||
def id_hash_keys_default_factory():
|
||||
"""
|
||||
Default factory for the id_hash_keys field of the Document dataclass.
|
||||
We need a callable instead of a default value, because mutable default values are not allowed.
|
||||
"""
|
||||
return ["text", "array", "dataframe", "blob"]
|
||||
|
||||
|
||||
@dataclass
|
||||
class Document:
|
||||
"""
|
||||
Base data class containing some data to be queried.
|
||||
Can contain text snippets, tables, and file paths to images or audios.
|
||||
Documents can be sorted by score, saved to/from dictionary and JSON, and are immutable.
|
||||
|
||||
Immutability is due to the fact that the document's ID depends on its content, so upon changing the content, also
|
||||
the ID should change. To avoid keeping IDs in sync with the content by using properties, and asking docstores to
|
||||
be aware of this corner case, we decide to make Documents immutable and remove the issue. If you need to modify a
|
||||
Document, consider using `to_dict()`, modifying the dict, and then create a new Document object using
|
||||
`Document.from_dict()`.
|
||||
Documents can be sorted by score and saved to/from dictionary and JSON.
|
||||
|
||||
:param id: Unique identifier for the document. When not set, it's generated based on the document's attributes (see id_hash_keys).
|
||||
:param text: Text of the document, if the document contains text.
|
||||
@ -84,7 +86,7 @@ class Document:
|
||||
blob: Optional[bytes] = field(default=None)
|
||||
mime_type: str = field(default="text/plain")
|
||||
metadata: Dict[str, Any] = field(default_factory=dict, hash=False)
|
||||
id_hash_keys: List[str] = field(default_factory=lambda: ["text", "array", "dataframe", "blob"], hash=False)
|
||||
id_hash_keys: List[str] = field(default_factory=id_hash_keys_default_factory, hash=False)
|
||||
score: Optional[float] = field(default=None, compare=False)
|
||||
embedding: Optional[numpy.ndarray] = field(default=None, repr=False)
|
||||
|
||||
@ -113,6 +115,10 @@ class Document:
|
||||
"""
|
||||
Generate the ID based on the init parameters.
|
||||
"""
|
||||
# if id_hash_keys is None or empty, use the default factory
|
||||
if not self.id_hash_keys:
|
||||
self.id_hash_keys = id_hash_keys_default_factory()
|
||||
|
||||
# Validate metadata
|
||||
for key in self.metadata:
|
||||
if key in [field.name for field in fields(self)]:
|
||||
|
||||
@ -0,0 +1,5 @@
|
||||
---
|
||||
preview:
|
||||
- |
|
||||
The Document dataclass checks if `id_hash_keys` is None or empty in
|
||||
__post_init__. If so, it uses the default factory to set a default valid value.
|
||||
@ -1,6 +1,4 @@
|
||||
import dataclasses
|
||||
import json
|
||||
import textwrap
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
@ -98,6 +96,15 @@ def test_id_hash_keys_field_may_be_missing(caplog):
|
||||
assert "is missing the following id_hash_keys: ['something else']." in caplog.text
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_id_hash_keys_is_set_to_default_if_invalid():
|
||||
doc = Document(text="test text", id_hash_keys=None)
|
||||
assert doc.id_hash_keys == ["text", "array", "dataframe", "blob"]
|
||||
|
||||
doc = Document(text="test text", id_hash_keys=[])
|
||||
assert doc.id_hash_keys == ["text", "array", "dataframe", "blob"]
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_init_document_same_meta_as_main_fields():
|
||||
"""
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user