feat: optionally pass an id to the Document constructor (#5862)

* revert #5826
* do not use Optional
2025-08-28 10:26:27 +00:00 · 2023-09-22 11:09:59 +02:00 · 2023-09-22 11:09:59 +02:00 · a5a0dc9f87
commit a5a0dc9f87
parent cc4f95bf51
6 changed files with 12 additions and 9 deletions
--- a/haystack/preview/components/embedders/sentence_transformers_document_embedder.py
+++ b/haystack/preview/components/embedders/sentence_transformers_document_embedder.py
@@ -128,7 +128,6 @@ class SentenceTransformersDocumentEmbedder:
        for doc, emb in zip(documents, embeddings):
            doc_as_dict = doc.to_dict()
            doc_as_dict["embedding"] = emb
-            del doc_as_dict["id"]
            documents_with_embeddings.append(Document.from_dict(doc_as_dict))

        return {"documents": documents_with_embeddings}
--- a/haystack/preview/dataclasses/document.py
+++ b/haystack/preview/dataclasses/document.py
@@ -63,7 +63,7 @@ class Document:
    Document, consider using `to_dict()`, modifying the dict, and then create a new Document object using
    `Document.from_dict()`.

-    :param id: Unique identifier for the document. Generated based on the document's attributes (see id_hash_keys).
+    :param id: Unique identifier for the document. When not set, it's generated based on the document's attributes (see id_hash_keys).
    :param text: Text of the document, if the document contains text.
    :param array: Array of numbers associated with the document, if the document contains matrix data like image,
        audio, video, and such.
@@ -79,7 +79,7 @@ class Document:
    :param embedding: Vector representation of the document.
    """

-    id: str = field(default_factory=str, init=False)
+    id: str = field(default="")
    text: Optional[str] = field(default=None)
    array: Optional[numpy.ndarray] = field(default=None)
    dataframe: Optional[pandas.DataFrame] = field(default=None)
@@ -121,8 +121,8 @@ class Document:
                raise ValueError(f"Cannot name metadata fields as top-level document fields, like '{key}'.")

        # Note: we need to set the id this way because the dataclass is frozen. See the docstring.
-        hashed_content = self._create_id()
-        object.__setattr__(self, "id", hashed_content)
+        if self.id == "":
+            object.__setattr__(self, "id", self._create_id())

    def _create_id(self):
        """
--- a/haystack/preview/document_stores/memory/document_store.py
+++ b/haystack/preview/document_stores/memory/document_store.py
@@ -270,7 +270,6 @@ class MemoryDocumentStore:
            doc = all_documents[i]
            doc_fields = doc.to_dict()
            doc_fields["score"] = docs_scores[i]
-            del doc_fields["id"]
            return_document = Document(**doc_fields)
            return_documents.append(return_document)
        return return_documents
@@ -323,7 +322,6 @@ class MemoryDocumentStore:
            doc_fields["score"] = score
            if return_embedding is False:
                doc_fields["embedding"] = None
-            del doc_fields["id"]
            top_documents.append(Document(**doc_fields))

        return top_documents
--- a/releasenotes/notes/pass-id-to-doc-init-c6b44d30978f2d9f.yaml
+++ b/releasenotes/notes/pass-id-to-doc-init-c6b44d30978f2d9f.yaml
@@ -0,0 +1,5 @@
+---
+preview:
+  - |
+    Revert #5826 and optionally take the `id` in the Document
+    class constructor.
--- a/test/preview/dataclasses/test_document.py
+++ b/test/preview/dataclasses/test_document.py
@@ -380,6 +380,7 @@ def test_from_json_custom_decoder():
    assert doc == Document.from_json(
        json.dumps(
            {
+                "id": doc.id,
                "text": "test text",
                "array": None,
                "dataframe": None,
--- a/test/preview/testing/test_factory.py
+++ b/test/preview/testing/test_factory.py
@@ -32,7 +32,7 @@ def test_document_store_class_is_registered():

@pytest.mark.unit
 def test_document_store_class_with_documents():
-    doc = Document(text="This is a document")
+    doc = Document(id="fake_id", text="This is a document")
    MyStore = document_store_class("MyStore", documents=[doc])
    store = MyStore()
    assert store.count_documents() == 1
@@ -49,7 +49,7 @@ def test_document_store_class_with_documents_count():

@pytest.mark.unit
 def test_document_store_class_with_documents_and_documents_count():
-    doc = Document(text="This is a document")
+    doc = Document(id="fake_id", text="This is a document")
    MyStore = document_store_class("MyStore", documents=[doc], documents_count=100)
    store = MyStore()
    assert store.count_documents() == 100