haystack/test/test_generator.py
tstadel dde9d59271
fix pip backtracking issue (#2281)
* fix pip backtracking issue

* restrict azure-core version

* Remove the trailing comma

* Add skip_magic_trailing_comma in pyproject.toml for pydoc compatibility

* Pin pydoc-markdown _again_

Co-authored-by: Sara Zan <sarazanzo94@gmail.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
2022-03-07 19:25:33 +01:00

129 lines
5.5 KiB
Python

import sys
from typing import List
import numpy as np
import pytest
from haystack.schema import Document
from haystack.nodes.answer_generator import Seq2SeqGenerator
from haystack.pipelines import TranslationWrapperPipeline, GenerativeQAPipeline
from conftest import DOCS_WITH_EMBEDDINGS
# Keeping few (retriever,document_store) combination to reduce test time
@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Causes OOM on windows github runner")
@pytest.mark.slow
@pytest.mark.generator
@pytest.mark.parametrize("retriever,document_store", [("embedding", "memory")], indirect=True)
def test_generator_pipeline_with_translator(
document_store, retriever, rag_generator, en_to_de_translator, de_to_en_translator
):
document_store.write_documents(DOCS_WITH_EMBEDDINGS)
query = "Was ist die Hauptstadt der Bundesrepublik Deutschland?"
base_pipeline = GenerativeQAPipeline(retriever=retriever, generator=rag_generator)
pipeline = TranslationWrapperPipeline(
input_translator=de_to_en_translator, output_translator=en_to_de_translator, pipeline=base_pipeline
)
output = pipeline.run(query=query, params={"Generator": {"top_k": 2}, "Retriever": {"top_k": 1}})
answers = output["answers"]
assert len(answers) == 2
assert "berlin" in answers[0].answer
@pytest.mark.slow
@pytest.mark.generator
def test_rag_token_generator(rag_generator):
query = "What is capital of the Germany?"
generated_docs = rag_generator.predict(query=query, documents=DOCS_WITH_EMBEDDINGS, top_k=1)
answers = generated_docs["answers"]
assert len(answers) == 1
assert "berlin" in answers[0].answer
@pytest.mark.slow
@pytest.mark.generator
@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
@pytest.mark.parametrize("retriever", ["embedding"], indirect=True)
def test_generator_pipeline(document_store, retriever, rag_generator):
document_store.write_documents(DOCS_WITH_EMBEDDINGS)
query = "What is capital of the Germany?"
pipeline = GenerativeQAPipeline(retriever=retriever, generator=rag_generator)
output = pipeline.run(query=query, params={"Generator": {"top_k": 2}, "Retriever": {"top_k": 1}})
answers = output["answers"]
assert len(answers) == 2
assert "berlin" in answers[0].answer
@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Causes OOM on windows github runner")
@pytest.mark.slow
@pytest.mark.generator
@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
@pytest.mark.parametrize("retriever", ["retribert"], indirect=True)
@pytest.mark.embedding_dim(128)
def test_lfqa_pipeline(document_store, retriever, eli5_generator):
# reuse existing DOCS but regenerate embeddings with retribert
docs: List[Document] = []
for idx, d in enumerate(DOCS_WITH_EMBEDDINGS):
docs.append(Document(d.content, str(idx)))
document_store.write_documents(docs)
document_store.update_embeddings(retriever)
query = "Tell me about Berlin?"
pipeline = GenerativeQAPipeline(retriever=retriever, generator=eli5_generator)
output = pipeline.run(query=query, params={"top_k": 1})
answers = output["answers"]
assert len(answers) == 1
assert "Germany" in answers[0].answer
@pytest.mark.slow
@pytest.mark.generator
@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
@pytest.mark.parametrize("retriever", ["retribert"], indirect=True)
@pytest.mark.embedding_dim(128)
def test_lfqa_pipeline_unknown_converter(document_store, retriever):
# reuse existing DOCS but regenerate embeddings with retribert
docs: List[Document] = []
for idx, d in enumerate(DOCS_WITH_EMBEDDINGS):
docs.append(Document(d.content, str(idx)))
document_store.write_documents(docs)
document_store.update_embeddings(retriever)
seq2seq = Seq2SeqGenerator(model_name_or_path="patrickvonplaten/t5-tiny-random")
query = "Tell me about Berlin?"
pipeline = GenerativeQAPipeline(retriever=retriever, generator=seq2seq)
# raises exception as we don't have converter for "patrickvonplaten/t5-tiny-random" in Seq2SeqGenerator
with pytest.raises(Exception) as exception_info:
output = pipeline.run(query=query, params={"top_k": 1})
assert "doesn't have input converter registered for patrickvonplaten/t5-tiny-random" in str(exception_info.value)
@pytest.mark.slow
@pytest.mark.generator
@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
@pytest.mark.parametrize("retriever", ["retribert"], indirect=True)
@pytest.mark.embedding_dim(128)
def test_lfqa_pipeline_invalid_converter(document_store, retriever):
# reuse existing DOCS but regenerate embeddings with retribert
docs: List[Document] = []
for idx, d in enumerate(DOCS_WITH_EMBEDDINGS):
docs.append(Document(d.content, str(idx)))
document_store.write_documents(docs)
document_store.update_embeddings(retriever)
class _InvalidConverter:
def __call__(self, some_invalid_para: str, another_invalid_param: str) -> None:
pass
seq2seq = Seq2SeqGenerator(
model_name_or_path="patrickvonplaten/t5-tiny-random", input_converter=_InvalidConverter()
)
query = "This query will fail due to InvalidConverter used"
pipeline = GenerativeQAPipeline(retriever=retriever, generator=seq2seq)
# raises exception as we are using invalid method signature in _InvalidConverter
with pytest.raises(Exception) as exception_info:
output = pipeline.run(query=query, params={"top_k": 1})
assert "does not have a valid __call__ method signature" in str(exception_info.value)