diff --git a/e2e/__init__.py b/e2e/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/e2e/document_stores/conftest.py b/e2e/conftest.py similarity index 98% rename from e2e/document_stores/conftest.py rename to e2e/conftest.py index ebc044f02..81efd6332 100644 --- a/e2e/document_stores/conftest.py +++ b/e2e/conftest.py @@ -1,6 +1,7 @@ import os import uuid from contextlib import contextmanager +from pathlib import Path import pytest @@ -16,6 +17,9 @@ from haystack.document_stores import ( ) +SAMPLES_PATH = Path(__file__).parent.parent / "test" / "samples" + + @pytest.fixture def docs_all_formats(): return [ diff --git a/e2e/document_stores/test_cosine_similarity.py b/e2e/document_stores/test_cosine_similarity.py index e5d63633f..278a2b8e4 100644 --- a/e2e/document_stores/test_cosine_similarity.py +++ b/e2e/document_stores/test_cosine_similarity.py @@ -6,7 +6,7 @@ import numpy as np from haystack.schema import Document -from .conftest import document_store +from ..conftest import document_store DOCUMENTS = [ diff --git a/e2e/document_stores/test_similarity_score.py b/e2e/document_stores/test_similarity_score.py index 4ebb9611d..19cf6b971 100644 --- a/e2e/document_stores/test_similarity_score.py +++ b/e2e/document_stores/test_similarity_score.py @@ -3,7 +3,7 @@ import pytest from haystack.nodes import EmbeddingRetriever from haystack.pipelines import DocumentSearchPipeline -from .conftest import document_store +from ..conftest import document_store @pytest.mark.parametrize("name", ["memory", "faiss", "milvus", "weaviate", "elasticsearch"]) diff --git a/e2e/document_stores/test_update_embeddings.py b/e2e/document_stores/test_update_embeddings.py index d85b9eff9..e42355a79 100644 --- a/e2e/document_stores/test_update_embeddings.py +++ b/e2e/document_stores/test_update_embeddings.py @@ -4,7 +4,7 @@ import pandas as pd from haystack.nodes import EmbeddingRetriever, TableTextRetriever -from .conftest import document_store +from ..conftest import 
document_store @pytest.mark.parametrize("name", ["elasticsearch", "faiss", "memory", "milvus"]) diff --git a/e2e/nodes/__init__.py b/e2e/nodes/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/e2e/nodes/test_translator.py b/e2e/nodes/test_translator.py new file mode 100644 index 000000000..1e0764592 --- /dev/null +++ b/e2e/nodes/test_translator.py @@ -0,0 +1,13 @@ +from haystack import Document +from haystack.nodes import TransformersTranslator + + +def test_translator(): + en_to_de_translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-en-de") + + original = "I live in Berlin" + translation = "Ich lebe in Berlin" + + assert en_to_de_translator.translate(query=original) == translation + assert en_to_de_translator.translate(documents=[original])[0] == translation + assert en_to_de_translator.translate(documents=[Document(content=original)])[0].content == translation diff --git a/e2e/pipelines/__init__.py b/e2e/pipelines/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/e2e/pipelines/test_extractive_qa.py b/e2e/pipelines/test_extractive_qa.py new file mode 100644 index 000000000..24c43e9cd --- /dev/null +++ b/e2e/pipelines/test_extractive_qa.py @@ -0,0 +1,29 @@ +from haystack.nodes import TransformersTranslator, FARMReader, TfidfRetriever +from haystack.pipelines import ExtractiveQAPipeline, TranslationWrapperPipeline +from haystack.document_stores import InMemoryDocumentStore + + +def test_extractive_qa_answers_with_translator(docs): + en_to_de_translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-en-de") + de_to_en_translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-de-en") + + ds = InMemoryDocumentStore(use_bm25=False) + retriever = TfidfRetriever(document_store=ds) + reader = FARMReader( + model_name_or_path="deepset/bert-medium-squad2-distilled", use_gpu=False, top_k_per_sample=5, num_processes=0 + ) + ds.write_documents(docs) + + base_pipeline = 
ExtractiveQAPipeline(reader=reader, retriever=retriever) + pipeline = TranslationWrapperPipeline( + input_translator=de_to_en_translator, output_translator=en_to_de_translator, pipeline=base_pipeline + ) + + prediction = pipeline.run(query="Wer lebt in Berlin?", params={"Reader": {"top_k": 3}}) + assert prediction is not None + assert prediction["query"] == "Wer lebt in Berlin?" + assert "Carla" in prediction["answers"][0].answer + assert prediction["answers"][0].score <= 1 + assert prediction["answers"][0].score >= 0 + assert prediction["answers"][0].meta["meta_field"] == "test1" + assert prediction["answers"][0].context == "My name is Carla and I live in Berlin" diff --git a/e2e/pipelines/test_generative_qa.py b/e2e/pipelines/test_generative_qa.py new file mode 100644 index 000000000..a9dfa4535 --- /dev/null +++ b/e2e/pipelines/test_generative_qa.py @@ -0,0 +1,36 @@ +from haystack import Document +from haystack.pipelines import TranslationWrapperPipeline, GenerativeQAPipeline +from haystack.document_stores import InMemoryDocumentStore +from haystack.nodes import DensePassageRetriever, RAGenerator, TransformersTranslator + + +def test_generative_pipeline_with_translator(): + docs = [ + Document(content="The capital of Germany is the city state of Berlin."), + Document(content="Berlin is the capital and largest city of Germany by both area and population."), + ] + ds = InMemoryDocumentStore(use_bm25=True) + ds.write_documents(docs) + retriever = DensePassageRetriever( # Needs DPR or RAGenerator will throw an exception... 
+ document_store=ds, + query_embedding_model="facebook/dpr-question_encoder-single-nq-base", + passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base", + use_gpu=False, + embed_title=True, + ) + ds.update_embeddings(retriever=retriever) + rag_generator = RAGenerator( + model_name_or_path="facebook/rag-token-nq", generator_type="token", max_length=20, retriever=retriever + ) + en_to_de_translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-en-de") + de_to_en_translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-de-en") + + query = "Was ist die Hauptstadt der Bundesrepublik Deutschland?" + base_pipeline = GenerativeQAPipeline(retriever=retriever, generator=rag_generator) + pipeline = TranslationWrapperPipeline( + input_translator=de_to_en_translator, output_translator=en_to_de_translator, pipeline=base_pipeline + ) + output = pipeline.run(query=query, params={"Generator": {"top_k": 2}, "Retriever": {"top_k": 1}}) + answers = output["answers"] + assert len(answers) == 2 + assert "berlin" in answers[0].answer.lower() diff --git a/test/conftest.py b/test/conftest.py index 22668830e..75e7ae917 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -54,7 +54,6 @@ from haystack.nodes import ( TableReader, RCIReader, TransformersSummarizer, - TransformersTranslator, QuestionGenerator, PromptTemplate, ) @@ -555,16 +554,6 @@ def summarizer(): return TransformersSummarizer(model_name_or_path="sshleifer/distilbart-xsum-12-6", use_gpu=False) -@pytest.fixture -def en_to_de_translator(): - return TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-en-de") - - -@pytest.fixture -def de_to_en_translator(): - return TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-de-en") - - @pytest.fixture def reader_without_normalized_scores(): return FARMReader( diff --git a/test/nodes/test_extractor_translation.py b/test/nodes/test_extractor_translation.py deleted file mode 100644 index e8489136a..000000000 --- 
a/test/nodes/test_extractor_translation.py +++ /dev/null @@ -1,26 +0,0 @@ -import pytest - -from haystack.pipelines import TranslationWrapperPipeline, ExtractiveQAPipeline -from .test_summarizer import SPLIT_DOCS - - -# Keeping few (retriever,document_store,reader) combination to reduce test time -@pytest.mark.integration -@pytest.mark.summarizer -@pytest.mark.parametrize("retriever,document_store,reader", [("embedding", "memory", "farm")], indirect=True) -def test_extractive_qa_pipeline_with_translator( - document_store, retriever, reader, en_to_de_translator, de_to_en_translator -): - document_store.write_documents(SPLIT_DOCS) - document_store.update_embeddings(retriever=retriever) - - query = "Wo steht der Eiffelturm?" - base_pipeline = ExtractiveQAPipeline(retriever=retriever, reader=reader) - pipeline = TranslationWrapperPipeline( - input_translator=de_to_en_translator, output_translator=en_to_de_translator, pipeline=base_pipeline - ) - output = pipeline.run(query=query, params={"Retriever": {"top_k": 2}}) - assert len(output["documents"]) == 2 - answers_texts = [el.answer for el in output["answers"]] - - assert "Frankreich" in answers_texts diff --git a/test/nodes/test_generator.py b/test/nodes/test_generator.py index 22f4605a6..31455d5ff 100644 --- a/test/nodes/test_generator.py +++ b/test/nodes/test_generator.py @@ -6,32 +6,12 @@ import pytest from haystack.schema import Document from haystack.nodes.answer_generator import Seq2SeqGenerator, OpenAIAnswerGenerator -from haystack.pipelines import TranslationWrapperPipeline, GenerativeQAPipeline +from haystack.pipelines import GenerativeQAPipeline from haystack.nodes import PromptTemplate import logging -# Keeping few (retriever,document_store) combination to reduce test time -@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Causes OOM on windows github runner") -@pytest.mark.integration -@pytest.mark.generator -@pytest.mark.parametrize("retriever,document_store", [("embedding", "memory")], 
indirect=True) -def test_generator_pipeline_with_translator( - document_store, retriever, rag_generator, en_to_de_translator, de_to_en_translator, docs_with_true_emb -): - document_store.write_documents(docs_with_true_emb) - query = "Was ist die Hauptstadt der Bundesrepublik Deutschland?" - base_pipeline = GenerativeQAPipeline(retriever=retriever, generator=rag_generator) - pipeline = TranslationWrapperPipeline( - input_translator=de_to_en_translator, output_translator=en_to_de_translator, pipeline=base_pipeline - ) - output = pipeline.run(query=query, params={"Generator": {"top_k": 2}, "Retriever": {"top_k": 1}}) - answers = output["answers"] - assert len(answers) == 2 - assert "berlin" in answers[0].answer - - @pytest.mark.integration @pytest.mark.generator def test_rag_token_generator(rag_generator, docs_with_true_emb): diff --git a/test/nodes/test_translator.py b/test/nodes/test_translator.py index e1f2f478a..83a21dfb2 100644 --- a/test/nodes/test_translator.py +++ b/test/nodes/test_translator.py @@ -1,60 +1,115 @@ -from haystack.schema import Document - import pytest -EXPECTED_OUTPUT = "Ich lebe in Berlin" -INPUT = "I live in Berlin" - -DOCUMENT_INPUT = Document(content=INPUT) +import haystack +from haystack.schema import Document +from haystack.nodes import TransformersTranslator +ORIGINAL_TEXT = "TEST QUERY" +TRANSLATION = "MOCK TRANSLATION" + + +class MockTokenizer: + @classmethod + def from_pretrained(cls, *a, **k): + return cls() + + def __call__(self, *a, **k): + return self + + def to(self, *a, **k): + return {} + + def batch_decode(self, *a, **k): + return [TRANSLATION] + + +class MockModel: + @classmethod + def from_pretrained(cls, *a, **k): + return cls() + + def generate(self, *a, **k): + return None + + def to(self, *a, **k): + return None + + +@pytest.fixture +def mock_models(monkeypatch): + monkeypatch.setattr(haystack.nodes.translator.transformers, "AutoModelForSeq2SeqLM", MockModel) + monkeypatch.setattr(haystack.nodes.translator.transformers, 
"AutoTokenizer", MockTokenizer) + + +@pytest.fixture +def en_to_de_translator(mock_models) -> TransformersTranslator: + return TransformersTranslator(model_name_or_path="irrelevant/anyway") + + +@pytest.fixture +def de_to_en_translator(mock_models) -> TransformersTranslator: + return TransformersTranslator(model_name_or_path="irrelevant/anyway") + + +@pytest.mark.unit def test_translator_with_query(en_to_de_translator): - assert en_to_de_translator.translate(query=INPUT) == EXPECTED_OUTPUT + assert en_to_de_translator.translate(query=ORIGINAL_TEXT) == TRANSLATION +@pytest.mark.unit def test_translator_with_list(en_to_de_translator): - assert en_to_de_translator.translate(documents=[INPUT])[0] == EXPECTED_OUTPUT + assert en_to_de_translator.translate(documents=[ORIGINAL_TEXT])[0] == TRANSLATION +@pytest.mark.unit def test_translator_with_document(en_to_de_translator): - assert en_to_de_translator.translate(documents=[Document(content=INPUT)])[0].content == EXPECTED_OUTPUT + assert en_to_de_translator.translate(documents=[Document(content=ORIGINAL_TEXT)])[0].content == TRANSLATION -def test_translator_with_document_preserves_input(en_to_de_translator): - original_document = Document(content=INPUT) - en_to_de_translator.translate(documents=[original_document])[0] # pylint: disable=expression-not-assigned - assert original_document.content == INPUT +@pytest.mark.unit +def test_translator_with_document_preserves_original(en_to_de_translator): + original_document = Document(content=ORIGINAL_TEXT) + en_to_de_translator.translate(documents=[original_document]) + assert original_document.content == ORIGINAL_TEXT +@pytest.mark.unit def test_translator_with_dictionary(en_to_de_translator): - assert en_to_de_translator.translate(documents=[{"content": INPUT}])[0]["content"] == EXPECTED_OUTPUT + assert en_to_de_translator.translate(documents=[{"content": ORIGINAL_TEXT}])[0]["content"] == TRANSLATION -def test_translator_with_dictionary_preserves_input(en_to_de_translator): - 
original_document = {"content": INPUT} - en_to_de_translator.translate(documents=[original_document])[0] # pylint: disable=expression-not-assigned - assert original_document["content"] == INPUT +@pytest.mark.unit +def test_translator_with_dictionary_preserves_original(en_to_de_translator): + original_document = {"content": ORIGINAL_TEXT} + en_to_de_translator.translate(documents=[original_document]) + assert original_document["content"] == ORIGINAL_TEXT +@pytest.mark.unit def test_translator_with_dictionary_with_dict_key(en_to_de_translator): - assert en_to_de_translator.translate(documents=[{"key": INPUT}], dict_key="key")[0]["key"] == EXPECTED_OUTPUT + assert en_to_de_translator.translate(documents=[{"key": ORIGINAL_TEXT}], dict_key="key")[0]["key"] == TRANSLATION -def test_translator_with_empty_input(en_to_de_translator): +@pytest.mark.unit +def test_translator_with_empty_original(en_to_de_translator): with pytest.raises(AttributeError): en_to_de_translator.translate() +@pytest.mark.unit def test_translator_with_query_and_documents(en_to_de_translator): with pytest.raises(AttributeError): - en_to_de_translator.translate(query=INPUT, documents=[INPUT]) + en_to_de_translator.translate(query=ORIGINAL_TEXT, documents=[ORIGINAL_TEXT]) +@pytest.mark.unit def test_translator_with_dict_without_text_key(en_to_de_translator): with pytest.raises(AttributeError): - en_to_de_translator.translate(documents=[{"text1": INPUT}]) + en_to_de_translator.translate(documents=[{"text1": ORIGINAL_TEXT}]) +@pytest.mark.unit def test_translator_with_dict_with_non_string_value(en_to_de_translator): with pytest.raises(AttributeError): en_to_de_translator.translate(documents=[{"text": 123}]) diff --git a/test/pipelines/test_pipeline_extractive_qa.py b/test/pipelines/test_pipeline_extractive_qa.py index 5f6b0cab6..fcc91baa9 100644 --- a/test/pipelines/test_pipeline_extractive_qa.py +++ b/test/pipelines/test_pipeline_extractive_qa.py @@ -60,22 +60,3 @@ def 
test_extractive_qa_answers_single_result(reader, retriever_with_docs): prediction = pipeline.run(query=query, params={"Retriever": {"top_k": 1}, "Reader": {"top_k": 1}}) assert prediction is not None assert len(prediction["answers"]) == 1 - - -@pytest.mark.integration -@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True) -@pytest.mark.parametrize("reader", ["farm"], indirect=True) -def test_extractive_qa_answers_with_translator(reader, retriever_with_docs, en_to_de_translator, de_to_en_translator): - base_pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs) - pipeline = TranslationWrapperPipeline( - input_translator=de_to_en_translator, output_translator=en_to_de_translator, pipeline=base_pipeline - ) - - prediction = pipeline.run(query="Wer lebt in Berlin?", params={"Reader": {"top_k": 3}}) - assert prediction is not None - assert prediction["query"] == "Wer lebt in Berlin?" - assert "Carla" in prediction["answers"][0].answer - assert prediction["answers"][0].score <= 1 - assert prediction["answers"][0].score >= 0 - assert prediction["answers"][0].meta["meta_field"] == "test1" - assert prediction["answers"][0].context == "My name is Carla and I live in Berlin"