refactor: make sure that Document's id_hash_keys has a valid value (#6112)

* fix handling id_hash_keys * reno * handle empty id_hash_keys in post_init * fix * reno * test
2025-12-24 13:38:53 +00:00 · 2023-10-19 12:10:19 +02:00 · 2023-10-19 12:10:19 +02:00 · ef40c7c728
commit ef40c7c728
parent 9f3b6512be
5 changed files with 30 additions and 18 deletions
--- a/haystack/preview/components/file_converters/azure.py
+++ b/haystack/preview/components/file_converters/azure.py
@ -102,9 +102,6 @@ class AzureOCRDocumentConverter:
        else:
            text = result.content

-        if id_hash_keys:
-            document = Document(text=text, id_hash_keys=id_hash_keys)
-        else:
-            document = Document(text=text)
+        document = Document(text=text, id_hash_keys=id_hash_keys)

        return document
--- a/haystack/preview/components/file_converters/tika.py
+++ b/haystack/preview/components/file_converters/tika.py
@ -61,10 +61,7 @@ class TikaDocumentConverter:
                if not extracted_text:
                    logger.warning("Skipping file at '%s' as Tika was not able to extract any content.", str(path))
                    continue
-                if id_hash_keys:
-                    document = Document(text=extracted_text, id_hash_keys=id_hash_keys)
-                else:
-                    document = Document(text=extracted_text)
+                document = Document(text=extracted_text, id_hash_keys=id_hash_keys)
                documents.append(document)
            except Exception as e:
                logger.error("Could not convert file at '%s' to Document. Error: %s", str(path), e)
--- a/haystack/preview/dataclasses/document.py
+++ b/haystack/preview/dataclasses/document.py
@ -48,18 +48,20 @@ class DocumentDecoder(json.JSONDecoder):
        return dictionary


+def id_hash_keys_default_factory():
+    """
+    Default factory for the id_hash_keys field of the Document dataclass.
+    We need a callable instead of a default value, because mutable default values are not allowed.
+    """
+    return ["text", "array", "dataframe", "blob"]
+
+
@dataclass
 class Document:
    """
    Base data class containing some data to be queried.
    Can contain text snippets, tables, and file paths to images or audios.
-    Documents can be sorted by score, saved to/from dictionary and JSON, and are immutable.
-
-    Immutability is due to the fact that the document's ID depends on its content, so upon changing the content, also
-    the ID should change. To avoid keeping IDs in sync with the content by using properties, and asking docstores to
-    be aware of this corner case, we decide to make Documents immutable and remove the issue. If you need to modify a
-    Document, consider using `to_dict()`, modifying the dict, and then create a new Document object using
-    `Document.from_dict()`.
+    Documents can be sorted by score and saved to/from dictionary and JSON.

    :param id: Unique identifier for the document. When not set, it's generated based on the document's attributes (see id_hash_keys).
    :param text: Text of the document, if the document contains text.
@ -84,7 +86,7 @@ class Document:
    blob: Optional[bytes] = field(default=None)
    mime_type: str = field(default="text/plain")
    metadata: Dict[str, Any] = field(default_factory=dict, hash=False)
-    id_hash_keys: List[str] = field(default_factory=lambda: ["text", "array", "dataframe", "blob"], hash=False)
+    id_hash_keys: List[str] = field(default_factory=id_hash_keys_default_factory, hash=False)
    score: Optional[float] = field(default=None, compare=False)
    embedding: Optional[numpy.ndarray] = field(default=None, repr=False)

@ -113,6 +115,10 @@ class Document:
        """
        Generate the ID based on the init parameters.
        """
+        # if id_hash_keys is None or empty, use the default factory
+        if not self.id_hash_keys:
+            self.id_hash_keys = id_hash_keys_default_factory()
+
        # Validate metadata
        for key in self.metadata:
            if key in [field.name for field in fields(self)]:
--- a/releasenotes/notes/check-id-hash-keys-postinit-28b012cdeada2c1e.yaml
+++ b/releasenotes/notes/check-id-hash-keys-postinit-28b012cdeada2c1e.yaml
@ -0,0 +1,5 @@
+---
+preview:
+  - |
+    The Document dataclass checks if `id_hash_keys` is None or empty in
+    __post_init__. If so, it uses the default factory to set a default valid value.
--- a/test/preview/dataclasses/test_document.py
+++ b/test/preview/dataclasses/test_document.py
@ -1,6 +1,4 @@
-import dataclasses
 import json
-import textwrap
 from pathlib import Path

 import numpy as np
@ -98,6 +96,15 @@ def test_id_hash_keys_field_may_be_missing(caplog):
    assert "is missing the following id_hash_keys: ['something else']." in caplog.text


+@pytest.mark.unit
+def test_id_hash_keys_is_set_to_default_if_invalid():
+    doc = Document(text="test text", id_hash_keys=None)
+    assert doc.id_hash_keys == ["text", "array", "dataframe", "blob"]
+
+    doc = Document(text="test text", id_hash_keys=[])
+    assert doc.id_hash_keys == ["text", "array", "dataframe", "blob"]
+
+
@pytest.mark.unit
 def test_init_document_same_meta_as_main_fields():
    """