From a5a0dc9f878ee9d5cbc4e2317fdfa8449f31e7bd Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Fri, 22 Sep 2023 11:09:59 +0200 Subject: [PATCH] feat: optionally pass an id to the Document constructor (#5862) * revert #5826 * do not use Optional --- .../embedders/sentence_transformers_document_embedder.py | 1 - haystack/preview/dataclasses/document.py | 8 ++++---- haystack/preview/document_stores/memory/document_store.py | 2 -- .../notes/pass-id-to-doc-init-c6b44d30978f2d9f.yaml | 5 +++++ test/preview/dataclasses/test_document.py | 1 + test/preview/testing/test_factory.py | 4 ++-- 6 files changed, 12 insertions(+), 9 deletions(-) create mode 100644 releasenotes/notes/pass-id-to-doc-init-c6b44d30978f2d9f.yaml diff --git a/haystack/preview/components/embedders/sentence_transformers_document_embedder.py b/haystack/preview/components/embedders/sentence_transformers_document_embedder.py index 82466d960..40ac170aa 100644 --- a/haystack/preview/components/embedders/sentence_transformers_document_embedder.py +++ b/haystack/preview/components/embedders/sentence_transformers_document_embedder.py @@ -128,7 +128,6 @@ class SentenceTransformersDocumentEmbedder: for doc, emb in zip(documents, embeddings): doc_as_dict = doc.to_dict() doc_as_dict["embedding"] = emb - del doc_as_dict["id"] documents_with_embeddings.append(Document.from_dict(doc_as_dict)) return {"documents": documents_with_embeddings} diff --git a/haystack/preview/dataclasses/document.py b/haystack/preview/dataclasses/document.py index 6f8a3b7bc..eb0704b86 100644 --- a/haystack/preview/dataclasses/document.py +++ b/haystack/preview/dataclasses/document.py @@ -63,7 +63,7 @@ class Document: Document, consider using `to_dict()`, modifying the dict, and then create a new Document object using `Document.from_dict()`. - :param id: Unique identifier for the document. Generated based on the document's attributes (see id_hash_keys). + :param id: Unique identifier for the document. When not set, it's generated based on the document's attributes (see id_hash_keys). :param text: Text of the document, if the document contains text. :param array: Array of numbers associated with the document, if the document contains matrix data like image, audio, video, and such. @@ -79,7 +79,7 @@ class Document: :param embedding: Vector representation of the document. """ - id: str = field(default_factory=str, init=False) + id: str = field(default="") text: Optional[str] = field(default=None) array: Optional[numpy.ndarray] = field(default=None) dataframe: Optional[pandas.DataFrame] = field(default=None) @@ -121,8 +121,8 @@ class Document: raise ValueError(f"Cannot name metadata fields as top-level document fields, like '{key}'.") # Note: we need to set the id this way because the dataclass is frozen. See the docstring. - hashed_content = self._create_id() - object.__setattr__(self, "id", hashed_content) + if self.id == "": + object.__setattr__(self, "id", self._create_id()) def _create_id(self): """ diff --git a/haystack/preview/document_stores/memory/document_store.py b/haystack/preview/document_stores/memory/document_store.py index 2dc041c0f..917f35549 100644 --- a/haystack/preview/document_stores/memory/document_store.py +++ b/haystack/preview/document_stores/memory/document_store.py @@ -270,7 +270,6 @@ class MemoryDocumentStore: doc = all_documents[i] doc_fields = doc.to_dict() doc_fields["score"] = docs_scores[i] - del doc_fields["id"] return_document = Document(**doc_fields) return_documents.append(return_document) return return_documents @@ -323,7 +322,6 @@ class MemoryDocumentStore: doc_fields["score"] = score if return_embedding is False: doc_fields["embedding"] = None - del doc_fields["id"] top_documents.append(Document(**doc_fields)) return top_documents diff --git a/releasenotes/notes/pass-id-to-doc-init-c6b44d30978f2d9f.yaml b/releasenotes/notes/pass-id-to-doc-init-c6b44d30978f2d9f.yaml new file mode 100644 index 000000000..b8318be9f --- /dev/null +++ b/releasenotes/notes/pass-id-to-doc-init-c6b44d30978f2d9f.yaml @@ -0,0 +1,5 @@ +--- +preview: + - | + Revert #5826 and optionally take the `id` in the Document + class constructor. diff --git a/test/preview/dataclasses/test_document.py b/test/preview/dataclasses/test_document.py index 4e0706060..f53b146fe 100644 --- a/test/preview/dataclasses/test_document.py +++ b/test/preview/dataclasses/test_document.py @@ -380,6 +380,7 @@ def test_from_json_custom_decoder(): assert doc == Document.from_json( json.dumps( { + "id": doc.id, "text": "test text", "array": None, "dataframe": None, diff --git a/test/preview/testing/test_factory.py b/test/preview/testing/test_factory.py index 89aaf658f..9583318ef 100644 --- a/test/preview/testing/test_factory.py +++ b/test/preview/testing/test_factory.py @@ -32,7 +32,7 @@ def test_document_store_class_is_registered(): @pytest.mark.unit def test_document_store_class_with_documents(): - doc = Document(text="This is a document") + doc = Document(id="fake_id", text="This is a document") MyStore = document_store_class("MyStore", documents=[doc]) store = MyStore() assert store.count_documents() == 1 @@ -49,7 +49,7 @@ def test_document_store_class_with_documents_count(): @pytest.mark.unit def test_document_store_class_with_documents_and_documents_count(): - doc = Document(text="This is a document") + doc = Document(id="fake_id", text="This is a document") MyStore = document_store_class("MyStore", documents=[doc], documents_count=100) store = MyStore() assert store.count_documents() == 100