From a5a0dc9f878ee9d5cbc4e2317fdfa8449f31e7bd Mon Sep 17 00:00:00 2001
From: Massimiliano Pippi <mpippi@gmail.com>
Date: Fri, 22 Sep 2023 11:09:59 +0200
Subject: [PATCH] feat: optionally pass an id to the Document constructor
 (#5862)

* revert #5826

* do not use Optional
---
 .../embedders/sentence_transformers_document_embedder.py  | 1 -
 haystack/preview/dataclasses/document.py                  | 8 ++++----
 haystack/preview/document_stores/memory/document_store.py | 2 --
 .../notes/pass-id-to-doc-init-c6b44d30978f2d9f.yaml       | 5 +++++
 test/preview/dataclasses/test_document.py                 | 1 +
 test/preview/testing/test_factory.py                      | 4 ++--
 6 files changed, 12 insertions(+), 9 deletions(-)
 create mode 100644 releasenotes/notes/pass-id-to-doc-init-c6b44d30978f2d9f.yaml

diff --git a/haystack/preview/components/embedders/sentence_transformers_document_embedder.py b/haystack/preview/components/embedders/sentence_transformers_document_embedder.py
index 82466d960..40ac170aa 100644
--- a/haystack/preview/components/embedders/sentence_transformers_document_embedder.py
+++ b/haystack/preview/components/embedders/sentence_transformers_document_embedder.py
@@ -128,7 +128,6 @@ class SentenceTransformersDocumentEmbedder:
         for doc, emb in zip(documents, embeddings):
             doc_as_dict = doc.to_dict()
             doc_as_dict["embedding"] = emb
-            del doc_as_dict["id"]
             documents_with_embeddings.append(Document.from_dict(doc_as_dict))
 
         return {"documents": documents_with_embeddings}
diff --git a/haystack/preview/dataclasses/document.py b/haystack/preview/dataclasses/document.py
index 6f8a3b7bc..eb0704b86 100644
--- a/haystack/preview/dataclasses/document.py
+++ b/haystack/preview/dataclasses/document.py
@@ -63,7 +63,7 @@ class Document:
     Document, consider using `to_dict()`, modifying the dict, and then create a new Document object using
     `Document.from_dict()`.
 
-    :param id: Unique identifier for the document. Generated based on the document's attributes (see id_hash_keys).
+    :param id: Unique identifier for the document. When not set, it's generated based on the document's attributes (see id_hash_keys).
     :param text: Text of the document, if the document contains text.
     :param array: Array of numbers associated with the document, if the document contains matrix data like image,
         audio, video, and such.
@@ -79,7 +79,7 @@ class Document:
     :param embedding: Vector representation of the document.
     """
 
-    id: str = field(default_factory=str, init=False)
+    id: str = field(default="")
     text: Optional[str] = field(default=None)
     array: Optional[numpy.ndarray] = field(default=None)
     dataframe: Optional[pandas.DataFrame] = field(default=None)
@@ -121,8 +121,8 @@ class Document:
                 raise ValueError(f"Cannot name metadata fields as top-level document fields, like '{key}'.")
 
         # Note: we need to set the id this way because the dataclass is frozen. See the docstring.
-        hashed_content = self._create_id()
-        object.__setattr__(self, "id", hashed_content)
+        if self.id == "":
+            object.__setattr__(self, "id", self._create_id())
 
     def _create_id(self):
         """
diff --git a/haystack/preview/document_stores/memory/document_store.py b/haystack/preview/document_stores/memory/document_store.py
index 2dc041c0f..917f35549 100644
--- a/haystack/preview/document_stores/memory/document_store.py
+++ b/haystack/preview/document_stores/memory/document_store.py
@@ -270,7 +270,6 @@ class MemoryDocumentStore:
             doc = all_documents[i]
             doc_fields = doc.to_dict()
             doc_fields["score"] = docs_scores[i]
-            del doc_fields["id"]
             return_document = Document(**doc_fields)
             return_documents.append(return_document)
         return return_documents
@@ -323,7 +322,6 @@ class MemoryDocumentStore:
             doc_fields["score"] = score
             if return_embedding is False:
                 doc_fields["embedding"] = None
-            del doc_fields["id"]
             top_documents.append(Document(**doc_fields))
 
         return top_documents
diff --git a/releasenotes/notes/pass-id-to-doc-init-c6b44d30978f2d9f.yaml b/releasenotes/notes/pass-id-to-doc-init-c6b44d30978f2d9f.yaml
new file mode 100644
index 000000000..b8318be9f
--- /dev/null
+++ b/releasenotes/notes/pass-id-to-doc-init-c6b44d30978f2d9f.yaml
@@ -0,0 +1,5 @@
+---
+preview:
+  - |
+    Revert #5826 and optionally take the `id` in the Document
+    class constructor.
diff --git a/test/preview/dataclasses/test_document.py b/test/preview/dataclasses/test_document.py
index 4e0706060..f53b146fe 100644
--- a/test/preview/dataclasses/test_document.py
+++ b/test/preview/dataclasses/test_document.py
@@ -380,6 +380,7 @@ def test_from_json_custom_decoder():
     assert doc == Document.from_json(
         json.dumps(
             {
+                "id": doc.id,
                 "text": "test text",
                 "array": None,
                 "dataframe": None,
diff --git a/test/preview/testing/test_factory.py b/test/preview/testing/test_factory.py
index 89aaf658f..9583318ef 100644
--- a/test/preview/testing/test_factory.py
+++ b/test/preview/testing/test_factory.py
@@ -32,7 +32,7 @@ def test_document_store_class_is_registered():
 
 @pytest.mark.unit
 def test_document_store_class_with_documents():
-    doc = Document(text="This is a document")
+    doc = Document(id="fake_id", text="This is a document")
     MyStore = document_store_class("MyStore", documents=[doc])
     store = MyStore()
     assert store.count_documents() == 1
@@ -49,7 +49,7 @@ def test_document_store_class_with_documents_count():
 
 @pytest.mark.unit
 def test_document_store_class_with_documents_and_documents_count():
-    doc = Document(text="This is a document")
+    doc = Document(id="fake_id", text="This is a document")
     MyStore = document_store_class("MyStore", documents=[doc], documents_count=100)
     store = MyStore()
     assert store.count_documents() == 100