feat: Add DocumentWriter v2 (#5435)

* add draft of WriteToStore and basic test

* add DocumentWriter implementation

* draft unit and integration tests

* add release note

* mock Store in unit tests

* pylint

* Update haystack/preview/components/writers/document_writer.py

Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>

* Remove unnecessary test

* Rework DocumentWriter to support new Component I/O definition

---------

Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
Co-authored-by: Silvano Cerza <silvanocerza@gmail.com>
This commit is contained in:
Julian Risch 2023-08-16 13:48:33 +02:00 committed by GitHub
parent d4c1a0508a
commit 22c7601729
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 83 additions and 0 deletions

View File

@ -2,3 +2,4 @@ from haystack.preview.components.audio.whisper_local import LocalWhisperTranscri
from haystack.preview.components.audio.whisper_remote import RemoteWhisperTranscriber
from haystack.preview.components.file_converters import TextFileToDocument
from haystack.preview.components.classifiers import FileExtensionClassifier
from haystack.preview.components.writers.document_writer import DocumentWriter

View File

@ -0,0 +1,42 @@
from typing import List, Optional
from haystack.preview import component, Document
from haystack.preview.document_stores import DocumentStoreAwareMixin, DocumentStore, DuplicatePolicy
@component
class DocumentWriter(DocumentStoreAwareMixin):
    """
    A component for writing documents to a DocumentStore.

    The store is not a constructor argument: `run` reads it from `self.document_store`,
    which must be assigned before the component is run.
    """

    supported_document_stores = [DocumentStore]  # type: ignore

    def __init__(self, policy: DuplicatePolicy = DuplicatePolicy.FAIL):
        """
        Create a DocumentWriter component.

        :param policy: The policy to use when encountering duplicate documents (default is DuplicatePolicy.FAIL).
            It is used by `run` whenever no policy is passed to `run` itself.
        """
        self.policy = policy

    def run(self, documents: List[Document], policy: Optional[DuplicatePolicy] = None):
        """
        Run DocumentWriter on the given input data.

        :param documents: A list of documents to write to the store.
        :param policy: The policy to use when encountering duplicate documents. If None, the policy given
            at construction time is used.
        :return: An empty dictionary; this component produces no output values.
        :raises ValueError: If no document store has been set on this component.
        """
        # Compare against None explicitly instead of relying on truthiness: a store object may
        # legitimately evaluate to False (for example if it defines __len__ and is empty) and
        # must not be mistaken for a missing store.
        if self.document_store is None:
            raise ValueError(
                "DocumentWriter needs a DocumentStore to run: set the DocumentStore instance to the self.document_store attribute."
            )

        # A policy passed at call time takes precedence over the one configured in __init__.
        effective_policy = self.policy if policy is None else policy
        self.document_store.write_documents(documents=documents, policy=effective_policy)
        return {}

View File

@ -0,0 +1,4 @@
---
features:
- |
Added new DocumentWriter component to Haystack v2 preview so that documents can be written to stores.

View File

@ -0,0 +1,36 @@
from unittest.mock import MagicMock
import pytest
from haystack.preview import Document
from haystack.preview.components.writers.document_writer import DocumentWriter
from haystack.preview.document_stores import DuplicatePolicy
from test.preview.components.base import BaseTestComponent
class TestDocumentWriter(BaseTestComponent):
    """Unit tests for DocumentWriter, using a mocked document store."""

    @pytest.mark.unit
    def test_run(self):
        """run() forwards the documents and the default FAIL policy to the store's write_documents."""
        component_under_test = DocumentWriter()
        docs_to_write = [
            Document(content="This is the text of a document."),
            Document(content="This is the text of another document."),
        ]

        store_mock = MagicMock()
        # NOTE(review): presumably the marker that lets the mock be accepted as a document store
        # when assigned below - confirm against DocumentStoreAwareMixin.
        store_mock.__haystack_document_store__ = True
        component_under_test.document_store = store_mock

        component_under_test.run(documents=docs_to_write)

        store_mock.write_documents.assert_called_once_with(documents=docs_to_write, policy=DuplicatePolicy.FAIL)

    @pytest.mark.unit
    def test_run_without_store(self):
        """run() raises ValueError with an explanatory message when no store has been assigned."""
        expected_message = (
            "DocumentWriter needs a DocumentStore to run: set the DocumentStore instance to the "
            "self.document_store attribute"
        )
        component_under_test = DocumentWriter()
        single_document = [Document(content="test")]

        with pytest.raises(ValueError, match=expected_message):
            component_under_test.run(documents=single_document)