mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-08-28 18:36:36 +00:00
feat: optionally pass an id to the Document constructor (#5862)
* revert #5826
* do not use Optional
This commit is contained in:
parent
cc4f95bf51
commit
a5a0dc9f87
@ -128,7 +128,6 @@ class SentenceTransformersDocumentEmbedder:
|
|||||||
for doc, emb in zip(documents, embeddings):
|
for doc, emb in zip(documents, embeddings):
|
||||||
doc_as_dict = doc.to_dict()
|
doc_as_dict = doc.to_dict()
|
||||||
doc_as_dict["embedding"] = emb
|
doc_as_dict["embedding"] = emb
|
||||||
del doc_as_dict["id"]
|
|
||||||
documents_with_embeddings.append(Document.from_dict(doc_as_dict))
|
documents_with_embeddings.append(Document.from_dict(doc_as_dict))
|
||||||
|
|
||||||
return {"documents": documents_with_embeddings}
|
return {"documents": documents_with_embeddings}
|
||||||
|
@ -63,7 +63,7 @@ class Document:
|
|||||||
Document, consider using `to_dict()`, modifying the dict, and then create a new Document object using
|
Document, consider using `to_dict()`, modifying the dict, and then create a new Document object using
|
||||||
`Document.from_dict()`.
|
`Document.from_dict()`.
|
||||||
|
|
||||||
:param id: Unique identifier for the document. Generated based on the document's attributes (see id_hash_keys).
|
:param id: Unique identifier for the document. When not set, it's generated based on the document's attributes (see id_hash_keys).
|
||||||
:param text: Text of the document, if the document contains text.
|
:param text: Text of the document, if the document contains text.
|
||||||
:param array: Array of numbers associated with the document, if the document contains matrix data like image,
|
:param array: Array of numbers associated with the document, if the document contains matrix data like image,
|
||||||
audio, video, and such.
|
audio, video, and such.
|
||||||
@ -79,7 +79,7 @@ class Document:
|
|||||||
:param embedding: Vector representation of the document.
|
:param embedding: Vector representation of the document.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
id: str = field(default_factory=str, init=False)
|
id: str = field(default="")
|
||||||
text: Optional[str] = field(default=None)
|
text: Optional[str] = field(default=None)
|
||||||
array: Optional[numpy.ndarray] = field(default=None)
|
array: Optional[numpy.ndarray] = field(default=None)
|
||||||
dataframe: Optional[pandas.DataFrame] = field(default=None)
|
dataframe: Optional[pandas.DataFrame] = field(default=None)
|
||||||
@ -121,8 +121,8 @@ class Document:
|
|||||||
raise ValueError(f"Cannot name metadata fields as top-level document fields, like '{key}'.")
|
raise ValueError(f"Cannot name metadata fields as top-level document fields, like '{key}'.")
|
||||||
|
|
||||||
# Note: we need to set the id this way because the dataclass is frozen. See the docstring.
|
# Note: we need to set the id this way because the dataclass is frozen. See the docstring.
|
||||||
hashed_content = self._create_id()
|
if self.id == "":
|
||||||
object.__setattr__(self, "id", hashed_content)
|
object.__setattr__(self, "id", self._create_id())
|
||||||
|
|
||||||
def _create_id(self):
|
def _create_id(self):
|
||||||
"""
|
"""
|
||||||
|
@ -270,7 +270,6 @@ class MemoryDocumentStore:
|
|||||||
doc = all_documents[i]
|
doc = all_documents[i]
|
||||||
doc_fields = doc.to_dict()
|
doc_fields = doc.to_dict()
|
||||||
doc_fields["score"] = docs_scores[i]
|
doc_fields["score"] = docs_scores[i]
|
||||||
del doc_fields["id"]
|
|
||||||
return_document = Document(**doc_fields)
|
return_document = Document(**doc_fields)
|
||||||
return_documents.append(return_document)
|
return_documents.append(return_document)
|
||||||
return return_documents
|
return return_documents
|
||||||
@ -323,7 +322,6 @@ class MemoryDocumentStore:
|
|||||||
doc_fields["score"] = score
|
doc_fields["score"] = score
|
||||||
if return_embedding is False:
|
if return_embedding is False:
|
||||||
doc_fields["embedding"] = None
|
doc_fields["embedding"] = None
|
||||||
del doc_fields["id"]
|
|
||||||
top_documents.append(Document(**doc_fields))
|
top_documents.append(Document(**doc_fields))
|
||||||
|
|
||||||
return top_documents
|
return top_documents
|
||||||
|
@ -0,0 +1,5 @@
|
|||||||
|
---
|
||||||
|
preview:
|
||||||
|
- |
|
||||||
|
Revert #5826 and optionally take the `id` in the Document
|
||||||
|
class constructor.
|
@ -380,6 +380,7 @@ def test_from_json_custom_decoder():
|
|||||||
assert doc == Document.from_json(
|
assert doc == Document.from_json(
|
||||||
json.dumps(
|
json.dumps(
|
||||||
{
|
{
|
||||||
|
"id": doc.id,
|
||||||
"text": "test text",
|
"text": "test text",
|
||||||
"array": None,
|
"array": None,
|
||||||
"dataframe": None,
|
"dataframe": None,
|
||||||
|
@ -32,7 +32,7 @@ def test_document_store_class_is_registered():
|
|||||||
|
|
||||||
@pytest.mark.unit
|
@pytest.mark.unit
|
||||||
def test_document_store_class_with_documents():
|
def test_document_store_class_with_documents():
|
||||||
doc = Document(text="This is a document")
|
doc = Document(id="fake_id", text="This is a document")
|
||||||
MyStore = document_store_class("MyStore", documents=[doc])
|
MyStore = document_store_class("MyStore", documents=[doc])
|
||||||
store = MyStore()
|
store = MyStore()
|
||||||
assert store.count_documents() == 1
|
assert store.count_documents() == 1
|
||||||
@ -49,7 +49,7 @@ def test_document_store_class_with_documents_count():
|
|||||||
|
|
||||||
@pytest.mark.unit
|
@pytest.mark.unit
|
||||||
def test_document_store_class_with_documents_and_documents_count():
|
def test_document_store_class_with_documents_and_documents_count():
|
||||||
doc = Document(text="This is a document")
|
doc = Document(id="fake_id", text="This is a document")
|
||||||
MyStore = document_store_class("MyStore", documents=[doc], documents_count=100)
|
MyStore = document_store_class("MyStore", documents=[doc], documents_count=100)
|
||||||
store = MyStore()
|
store = MyStore()
|
||||||
assert store.count_documents() == 100
|
assert store.count_documents() == 100
|
||||||
|
Loading…
x
Reference in New Issue
Block a user