refactor: make sure that Document's id_hash_keys has a valid value (#6112)

* fix handling id_hash_keys

* reno

* handle empty id_hash_keys in post_init

* fix

* reno

* test
This commit is contained in:
Stefano Fiorucci 2023-10-19 12:10:19 +02:00 committed by GitHub
parent 9f3b6512be
commit ef40c7c728
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 30 additions and 18 deletions

View File

@ -102,9 +102,6 @@ class AzureOCRDocumentConverter:
else:
text = result.content
if id_hash_keys:
document = Document(text=text, id_hash_keys=id_hash_keys)
else:
document = Document(text=text)
document = Document(text=text, id_hash_keys=id_hash_keys)
return document

View File

@ -61,10 +61,7 @@ class TikaDocumentConverter:
if not extracted_text:
logger.warning("Skipping file at '%s' as Tika was not able to extract any content.", str(path))
continue
if id_hash_keys:
document = Document(text=extracted_text, id_hash_keys=id_hash_keys)
else:
document = Document(text=extracted_text)
document = Document(text=extracted_text, id_hash_keys=id_hash_keys)
documents.append(document)
except Exception as e:
logger.error("Could not convert file at '%s' to Document. Error: %s", str(path), e)

View File

@ -48,18 +48,20 @@ class DocumentDecoder(json.JSONDecoder):
return dictionary
def id_hash_keys_default_factory():
"""
Default factory for the id_hash_keys field of the Document dataclass.
We need a callable instead of a default value, because mutable default values are not allowed.
"""
return ["text", "array", "dataframe", "blob"]
@dataclass
class Document:
"""
Base data class containing some data to be queried.
Can contain text snippets, tables, and file paths to images or audios.
Documents can be sorted by score, saved to/from dictionary and JSON, and are immutable.
Immutability is due to the fact that the document's ID depends on its content, so upon changing the content, also
the ID should change. To avoid keeping IDs in sync with the content by using properties, and asking docstores to
be aware of this corner case, we decide to make Documents immutable and remove the issue. If you need to modify a
Document, consider using `to_dict()`, modifying the dict, and then create a new Document object using
`Document.from_dict()`.
Documents can be sorted by score and saved to/from dictionary and JSON.
:param id: Unique identifier for the document. When not set, it's generated based on the document's attributes (see id_hash_keys).
:param text: Text of the document, if the document contains text.
@ -84,7 +86,7 @@ class Document:
blob: Optional[bytes] = field(default=None)
mime_type: str = field(default="text/plain")
metadata: Dict[str, Any] = field(default_factory=dict, hash=False)
id_hash_keys: List[str] = field(default_factory=lambda: ["text", "array", "dataframe", "blob"], hash=False)
id_hash_keys: List[str] = field(default_factory=id_hash_keys_default_factory, hash=False)
score: Optional[float] = field(default=None, compare=False)
embedding: Optional[numpy.ndarray] = field(default=None, repr=False)
@ -113,6 +115,10 @@ class Document:
"""
Generate the ID based on the init parameters.
"""
# if id_hash_keys is None or empty, use the default factory
if not self.id_hash_keys:
self.id_hash_keys = id_hash_keys_default_factory()
# Validate metadata
for key in self.metadata:
if key in [field.name for field in fields(self)]:

View File

@ -0,0 +1,5 @@
---
preview:
- |
The Document dataclass checks if `id_hash_keys` is None or empty in
__post_init__. If so, it uses the default factory to set a default valid value.

View File

@ -1,6 +1,4 @@
import dataclasses
import json
import textwrap
from pathlib import Path
import numpy as np
@ -98,6 +96,15 @@ def test_id_hash_keys_field_may_be_missing(caplog):
assert "is missing the following id_hash_keys: ['something else']." in caplog.text
@pytest.mark.unit
def test_id_hash_keys_is_set_to_default_if_invalid():
doc = Document(text="test text", id_hash_keys=None)
assert doc.id_hash_keys == ["text", "array", "dataframe", "blob"]
doc = Document(text="test text", id_hash_keys=[])
assert doc.id_hash_keys == ["text", "array", "dataframe", "blob"]
@pytest.mark.unit
def test_init_document_same_meta_as_main_fields():
"""