diff --git a/haystack/preview/components/embedders/sentence_transformers_document_embedder.py b/haystack/preview/components/embedders/sentence_transformers_document_embedder.py index 40ac170aa..82466d960 100644 --- a/haystack/preview/components/embedders/sentence_transformers_document_embedder.py +++ b/haystack/preview/components/embedders/sentence_transformers_document_embedder.py @@ -128,6 +128,7 @@ class SentenceTransformersDocumentEmbedder: for doc, emb in zip(documents, embeddings): doc_as_dict = doc.to_dict() doc_as_dict["embedding"] = emb + del doc_as_dict["id"] documents_with_embeddings.append(Document.from_dict(doc_as_dict)) return {"documents": documents_with_embeddings} diff --git a/haystack/preview/dataclasses/document.py b/haystack/preview/dataclasses/document.py index e40d44118..6f8a3b7bc 100644 --- a/haystack/preview/dataclasses/document.py +++ b/haystack/preview/dataclasses/document.py @@ -63,8 +63,7 @@ class Document: Document, consider using `to_dict()`, modifying the dict, and then create a new Document object using `Document.from_dict()`. - :param id: Unique identifier for the document. Do not provide this value when initializing a document: it will be - generated based on the document's attributes (see id_hash_keys). + :param id: Unique identifier for the document. Generated based on the document's attributes (see id_hash_keys). :param text: Text of the document, if the document contains text. :param array: Array of numbers associated with the document, if the document contains matrix data like image, audio, video, and such. @@ -80,7 +79,7 @@ class Document: :param embedding: Vector representation of the document. """ - id: str = field(default_factory=str) + id: str = field(default_factory=str, init=False) text: Optional[str] = field(default=None) array: Optional[numpy.ndarray] = field(default=None) dataframe: Optional[pandas.DataFrame] = field(default=None) diff --git a/haystack/preview/document_stores/memory/document_store.py b/haystack/preview/document_stores/memory/document_store.py index 917f35549..2dc041c0f 100644 --- a/haystack/preview/document_stores/memory/document_store.py +++ b/haystack/preview/document_stores/memory/document_store.py @@ -270,6 +270,7 @@ class MemoryDocumentStore: doc = all_documents[i] doc_fields = doc.to_dict() doc_fields["score"] = docs_scores[i] + del doc_fields["id"] return_document = Document(**doc_fields) return_documents.append(return_document) return return_documents @@ -322,6 +323,7 @@ class MemoryDocumentStore: doc_fields["score"] = score if return_embedding is False: doc_fields["embedding"] = None + del doc_fields["id"] top_documents.append(Document(**doc_fields)) return top_documents diff --git a/releasenotes/notes/fix-document-constructor-18b9e5dd9522a918.yaml b/releasenotes/notes/fix-document-constructor-18b9e5dd9522a918.yaml new file mode 100644 index 000000000..022706315 --- /dev/null +++ b/releasenotes/notes/fix-document-constructor-18b9e5dd9522a918.yaml @@ -0,0 +1,5 @@ +--- +preview: + - | + Remove `id` parameter from `Document` constructor as it was ignored and a new one was generated anyway. + This is a backwards incompatible change. diff --git a/test/preview/dataclasses/test_document.py b/test/preview/dataclasses/test_document.py index f53b146fe..4e0706060 100644 --- a/test/preview/dataclasses/test_document.py +++ b/test/preview/dataclasses/test_document.py @@ -380,7 +380,6 @@ def test_from_json_custom_decoder(): assert doc == Document.from_json( json.dumps( { - "id": doc.id, "text": "test text", "array": None, "dataframe": None, diff --git a/test/preview/testing/test_factory.py b/test/preview/testing/test_factory.py index 9583318ef..89aaf658f 100644 --- a/test/preview/testing/test_factory.py +++ b/test/preview/testing/test_factory.py @@ -32,7 +32,7 @@ def test_document_store_class_is_registered(): @pytest.mark.unit def test_document_store_class_with_documents(): - doc = Document(id="fake_id", text="This is a document") + doc = Document(text="This is a document") MyStore = document_store_class("MyStore", documents=[doc]) store = MyStore() assert store.count_documents() == 1 @@ -49,7 +49,7 @@ def test_document_store_class_with_documents_count(): @pytest.mark.unit def test_document_store_class_with_documents_and_documents_count(): - doc = Document(id="fake_id", text="This is a document") + doc = Document(text="This is a document") MyStore = document_store_class("MyStore", documents=[doc], documents_count=100) store = MyStore() assert store.count_documents() == 100