test: mock all Translator tests and move one to e2e (#4290)

* mock all translator tests and move one to e2e

* typo

* extract pipeline tests using translator

* remove duplicate test

* move generator test in e2e

* Update e2e/pipelines/test_extractive_qa.py

* pytest.mark.unit

* black

* remove model name as well

* remove unused fixture

* rename original and improve pipeline tests

* fixes

* pylint
This commit is contained in:
ZanSara 2023-03-01 14:52:05 +01:00 committed by GitHub
parent 7e0f9715ba
commit 165a0a5faa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 163 additions and 102 deletions

0
e2e/__init__.py Normal file
View File

View File

@ -1,6 +1,7 @@
import os
import uuid
from contextlib import contextmanager
from pathlib import Path
import pytest
@ -16,6 +17,9 @@ from haystack.document_stores import (
)
SAMPLES_PATH = Path(__file__).parent.parent / "test" / "samples"
@pytest.fixture
def docs_all_formats():
return [

View File

@ -6,7 +6,7 @@ import numpy as np
from haystack.schema import Document
from .conftest import document_store
from ..conftest import document_store
DOCUMENTS = [

View File

@ -3,7 +3,7 @@ import pytest
from haystack.nodes import EmbeddingRetriever
from haystack.pipelines import DocumentSearchPipeline
from .conftest import document_store
from ..conftest import document_store
@pytest.mark.parametrize("name", ["memory", "faiss", "milvus", "weaviate", "elasticsearch"])

View File

@ -4,7 +4,7 @@ import pandas as pd
from haystack.nodes import EmbeddingRetriever, TableTextRetriever
from .conftest import document_store
from ..conftest import document_store
@pytest.mark.parametrize("name", ["elasticsearch", "faiss", "memory", "milvus"])

0
e2e/nodes/__init__.py Normal file
View File

View File

@ -0,0 +1,13 @@
from haystack import Document
from haystack.nodes import TransformersTranslator
def test_translator():
    """E2E check of TransformersTranslator with a real Helsinki-NLP EN->DE model.

    Exercises all three input forms accepted by ``translate``: a bare query
    string, a list of raw strings, and a list of Document objects.
    """
    translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-en-de")
    source_text = "I live in Berlin"
    expected_translation = "Ich lebe in Berlin"
    assert translator.translate(query=source_text) == expected_translation
    assert translator.translate(documents=[source_text])[0] == expected_translation
    assert translator.translate(documents=[Document(content=source_text)])[0].content == expected_translation

View File

View File

@ -0,0 +1,29 @@
from haystack.nodes import TransformersTranslator, FARMReader, TfidfRetriever
from haystack.pipelines import ExtractiveQAPipeline, TranslationWrapperPipeline
from haystack.document_stores import InMemoryDocumentStore
def test_extractive_qa_answers_with_translator(docs):
    """E2E: ExtractiveQAPipeline wrapped with DE->EN input / EN->DE output translators.

    A German query is translated to English, answered over the English ``docs``
    fixture, and the top answer's text, score bounds, meta and context are checked.
    """
    output_translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-en-de")
    input_translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-de-en")
    document_store = InMemoryDocumentStore(use_bm25=False)
    retriever = TfidfRetriever(document_store=document_store)
    reader = FARMReader(
        model_name_or_path="deepset/bert-medium-squad2-distilled", use_gpu=False, top_k_per_sample=5, num_processes=0
    )
    document_store.write_documents(docs)

    wrapped_pipeline = TranslationWrapperPipeline(
        input_translator=input_translator,
        output_translator=output_translator,
        pipeline=ExtractiveQAPipeline(reader=reader, retriever=retriever),
    )
    prediction = wrapped_pipeline.run(query="Wer lebt in Berlin?", params={"Reader": {"top_k": 3}})

    assert prediction is not None
    assert prediction["query"] == "Wer lebt in Berlin?"
    top_answer = prediction["answers"][0]
    assert "Carla" in top_answer.answer
    assert 0 <= top_answer.score <= 1
    assert top_answer.meta["meta_field"] == "test1"
    assert top_answer.context == "My name is Carla and I live in Berlin"

View File

@ -0,0 +1,36 @@
from haystack import Document
from haystack.pipelines import TranslationWrapperPipeline, GenerativeQAPipeline
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import DensePassageRetriever, RAGenerator, TransformersTranslator
def test_generative_pipeline_with_translator():
    """E2E: GenerativeQAPipeline (RAG) wrapped with DE->EN input / EN->DE output translators."""
    documents = [
        Document(content="The capital of Germany is the city state of Berlin."),
        Document(content="Berlin is the capital and largest city of Germany by both area and population."),
    ]
    document_store = InMemoryDocumentStore(use_bm25=True)
    document_store.write_documents(documents)
    # Needs DPR or RAGenerator will throw an exception.
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        use_gpu=False,
        embed_title=True,
    )
    document_store.update_embeddings(retriever=retriever)
    generator = RAGenerator(
        model_name_or_path="facebook/rag-token-nq", generator_type="token", max_length=20, retriever=retriever
    )
    wrapped_pipeline = TranslationWrapperPipeline(
        input_translator=TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-de-en"),
        output_translator=TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-en-de"),
        pipeline=GenerativeQAPipeline(retriever=retriever, generator=generator),
    )
    output = wrapped_pipeline.run(
        query="Was ist die Hauptstadt der Bundesrepublik Deutschland?",
        params={"Generator": {"top_k": 2}, "Retriever": {"top_k": 1}},
    )
    answers = output["answers"]
    assert len(answers) == 2
    assert "berlin" in answers[0].answer.lower()

View File

@ -54,7 +54,6 @@ from haystack.nodes import (
TableReader,
RCIReader,
TransformersSummarizer,
TransformersTranslator,
QuestionGenerator,
PromptTemplate,
)
@ -555,16 +554,6 @@ def summarizer():
return TransformersSummarizer(model_name_or_path="sshleifer/distilbart-xsum-12-6", use_gpu=False)
@pytest.fixture
def en_to_de_translator():
return TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-en-de")
@pytest.fixture
def de_to_en_translator():
return TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-de-en")
@pytest.fixture
def reader_without_normalized_scores():
return FARMReader(

View File

@ -1,26 +0,0 @@
import pytest
from haystack.pipelines import TranslationWrapperPipeline, ExtractiveQAPipeline
from .test_summarizer import SPLIT_DOCS
# Keeping few (retriever,document_store,reader) combination to reduce test time
@pytest.mark.integration
@pytest.mark.summarizer
@pytest.mark.parametrize("retriever,document_store,reader", [("embedding", "memory", "farm")], indirect=True)
def test_extractive_qa_pipeline_with_translator(
document_store, retriever, reader, en_to_de_translator, de_to_en_translator
):
document_store.write_documents(SPLIT_DOCS)
document_store.update_embeddings(retriever=retriever)
query = "Wo steht der Eiffelturm?"
base_pipeline = ExtractiveQAPipeline(retriever=retriever, reader=reader)
pipeline = TranslationWrapperPipeline(
input_translator=de_to_en_translator, output_translator=en_to_de_translator, pipeline=base_pipeline
)
output = pipeline.run(query=query, params={"Retriever": {"top_k": 2}})
assert len(output["documents"]) == 2
answers_texts = [el.answer for el in output["answers"]]
assert "Frankreich" in answers_texts

View File

@ -6,32 +6,12 @@ import pytest
from haystack.schema import Document
from haystack.nodes.answer_generator import Seq2SeqGenerator, OpenAIAnswerGenerator
from haystack.pipelines import TranslationWrapperPipeline, GenerativeQAPipeline
from haystack.pipelines import GenerativeQAPipeline
from haystack.nodes import PromptTemplate
import logging
# Keeping few (retriever,document_store) combination to reduce test time
@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Causes OOM on windows github runner")
@pytest.mark.integration
@pytest.mark.generator
@pytest.mark.parametrize("retriever,document_store", [("embedding", "memory")], indirect=True)
def test_generator_pipeline_with_translator(
document_store, retriever, rag_generator, en_to_de_translator, de_to_en_translator, docs_with_true_emb
):
document_store.write_documents(docs_with_true_emb)
query = "Was ist die Hauptstadt der Bundesrepublik Deutschland?"
base_pipeline = GenerativeQAPipeline(retriever=retriever, generator=rag_generator)
pipeline = TranslationWrapperPipeline(
input_translator=de_to_en_translator, output_translator=en_to_de_translator, pipeline=base_pipeline
)
output = pipeline.run(query=query, params={"Generator": {"top_k": 2}, "Retriever": {"top_k": 1}})
answers = output["answers"]
assert len(answers) == 2
assert "berlin" in answers[0].answer
@pytest.mark.integration
@pytest.mark.generator
def test_rag_token_generator(rag_generator, docs_with_true_emb):

View File

@ -1,60 +1,115 @@
from haystack.schema import Document
import pytest
EXPECTED_OUTPUT = "Ich lebe in Berlin"
INPUT = "I live in Berlin"
DOCUMENT_INPUT = Document(content=INPUT)
import haystack
from haystack.schema import Document
from haystack.nodes import TransformersTranslator
# Text fed into the translator node by the unit tests below.
ORIGINAL_TEXT = "TEST QUERY"
# Canned output the mocked tokenizer decodes to, whatever the input was.
TRANSLATION = "MOCK TRANSLATION"


class MockTokenizer:
    """Stand-in for transformers.AutoTokenizer: nothing is downloaded, decoding is canned."""

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        # Mirror the transformers factory entry point; every argument is ignored.
        return cls()

    def __call__(self, *args, **kwargs):
        # "Tokenizing" hands back the tokenizer itself so a chained `.to(device)` still works.
        return self

    def to(self, *args, **kwargs):
        # The "encoded batch on a device" is just an empty dict — presumably so the
        # node can `**`-expand it into the model call; verify against the node code.
        return {}

    def batch_decode(self, *args, **kwargs):
        # Decoding always yields the single canned translation.
        return [TRANSLATION]
class MockModel:
    """Stand-in for transformers.AutoModelForSeq2SeqLM: inert `generate` and `to`."""

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        # Same factory entry point as the real model class; arguments are ignored.
        return cls()

    def generate(self, *args, **kwargs):
        # The generated ids are irrelevant: MockTokenizer.batch_decode ignores its input.
        return None

    def to(self, *args, **kwargs):
        # Device placement is a no-op for the mock.
        return None
@pytest.fixture
def mock_models(monkeypatch):
    """Swap the transformers classes loaded by the translator node for the local mocks."""
    transformers_ns = haystack.nodes.translator.transformers
    monkeypatch.setattr(transformers_ns, "AutoTokenizer", MockTokenizer)
    monkeypatch.setattr(transformers_ns, "AutoModelForSeq2SeqLM", MockModel)
@pytest.fixture
def en_to_de_translator(mock_models) -> TransformersTranslator:
    """Translator built on the mocked model/tokenizer; the model name is ignored by the mocks."""
    translator = TransformersTranslator(model_name_or_path="irrelevant/anyway")
    return translator
@pytest.fixture
def de_to_en_translator(mock_models) -> TransformersTranslator:
    """Second mocked translator; identical to en_to_de_translator since the mocks ignore the name."""
    translator = TransformersTranslator(model_name_or_path="irrelevant/anyway")
    return translator
@pytest.mark.unit
def test_translator_with_query(en_to_de_translator):
    """translate() on a bare query string returns the (mocked) translation."""
    # Stale pre-rename assertion referencing removed INPUT/EXPECTED_OUTPUT dropped.
    assert en_to_de_translator.translate(query=ORIGINAL_TEXT) == TRANSLATION
@pytest.mark.unit
def test_translator_with_list(en_to_de_translator):
    """Translating a list of raw strings returns the translated strings."""
    # Stale pre-rename assertion referencing removed INPUT/EXPECTED_OUTPUT dropped.
    assert en_to_de_translator.translate(documents=[ORIGINAL_TEXT])[0] == TRANSLATION
@pytest.mark.unit
def test_translator_with_document(en_to_de_translator):
    """Translating Document objects yields Documents carrying the translated content."""
    # Stale pre-rename assertion referencing removed INPUT/EXPECTED_OUTPUT dropped.
    assert en_to_de_translator.translate(documents=[Document(content=ORIGINAL_TEXT)])[0].content == TRANSLATION
@pytest.mark.unit
def test_translator_with_document_preserves_original(en_to_de_translator):
    """The caller's Document must not be mutated by translation."""
    # The older "_preserves_input" variant (using removed INPUT) was superseded by this rename.
    original_document = Document(content=ORIGINAL_TEXT)
    en_to_de_translator.translate(documents=[original_document])
    assert original_document.content == ORIGINAL_TEXT
@pytest.mark.unit
def test_translator_with_dictionary(en_to_de_translator):
    """Dict documents with a "content" key are translated in their "content" field."""
    # Stale pre-rename assertion referencing removed INPUT/EXPECTED_OUTPUT dropped.
    assert en_to_de_translator.translate(documents=[{"content": ORIGINAL_TEXT}])[0]["content"] == TRANSLATION
@pytest.mark.unit
def test_translator_with_dictionary_preserves_original(en_to_de_translator):
    """The caller's dict document must not be mutated by translation."""
    # The older "_preserves_input" variant (using removed INPUT) was superseded by this rename.
    original_document = {"content": ORIGINAL_TEXT}
    en_to_de_translator.translate(documents=[original_document])
    assert original_document["content"] == ORIGINAL_TEXT
@pytest.mark.unit
def test_translator_with_dictionary_with_dict_key(en_to_de_translator):
    """An explicit dict_key selects which field of the dict document is translated."""
    # Stale pre-rename assertion referencing removed INPUT/EXPECTED_OUTPUT dropped.
    assert en_to_de_translator.translate(documents=[{"key": ORIGINAL_TEXT}], dict_key="key")[0]["key"] == TRANSLATION
@pytest.mark.unit
def test_translator_with_empty_original(en_to_de_translator):
    """Calling translate() with neither query nor documents raises AttributeError."""
    # The old "test_translator_with_empty_input" def line was this test's pre-rename residue.
    with pytest.raises(AttributeError):
        en_to_de_translator.translate()
@pytest.mark.unit
def test_translator_with_query_and_documents(en_to_de_translator):
    """Supplying both query and documents at once is rejected with AttributeError."""
    # Stale pre-rename call line referencing removed INPUT dropped.
    with pytest.raises(AttributeError):
        en_to_de_translator.translate(query=ORIGINAL_TEXT, documents=[ORIGINAL_TEXT])
@pytest.mark.unit
def test_translator_with_dict_without_text_key(en_to_de_translator):
    """A dict document lacking the expected text key raises AttributeError."""
    # Stale pre-rename call line referencing removed INPUT dropped.
    with pytest.raises(AttributeError):
        en_to_de_translator.translate(documents=[{"text1": ORIGINAL_TEXT}])
@pytest.mark.unit
def test_translator_with_dict_with_non_string_value(en_to_de_translator):
    # A dict value that is not a string cannot be translated -> the node raises AttributeError.
    with pytest.raises(AttributeError):
        en_to_de_translator.translate(documents=[{"text": 123}])

View File

@ -60,22 +60,3 @@ def test_extractive_qa_answers_single_result(reader, retriever_with_docs):
prediction = pipeline.run(query=query, params={"Retriever": {"top_k": 1}, "Reader": {"top_k": 1}})
assert prediction is not None
assert len(prediction["answers"]) == 1
@pytest.mark.integration
@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
@pytest.mark.parametrize("reader", ["farm"], indirect=True)
def test_extractive_qa_answers_with_translator(reader, retriever_with_docs, en_to_de_translator, de_to_en_translator):
base_pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
pipeline = TranslationWrapperPipeline(
input_translator=de_to_en_translator, output_translator=en_to_de_translator, pipeline=base_pipeline
)
prediction = pipeline.run(query="Wer lebt in Berlin?", params={"Reader": {"top_k": 3}})
assert prediction is not None
assert prediction["query"] == "Wer lebt in Berlin?"
assert "Carla" in prediction["answers"][0].answer
assert prediction["answers"][0].score <= 1
assert prediction["answers"][0].score >= 0
assert prediction["answers"][0].meta["meta_field"] == "test1"
assert prediction["answers"][0].context == "My name is Carla and I live in Berlin"