# haystack/test/test_standard_pipelines.py

from pathlib import Path
from collections import defaultdict
import os
import math
import pytest
from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore
from haystack.pipelines import Pipeline, FAQPipeline, DocumentSearchPipeline, RootNode, MostSimilarDocumentsPipeline
from haystack.nodes import (
DensePassageRetriever,
BM25Retriever,
SklearnQueryClassifier,
TransformersQueryClassifier,
JoinDocuments,
)
from haystack.schema import Document
from .conftest import SAMPLES_PATH
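
# The tests below exercise Haystack's ready-made pipelines (FAQPipeline, DocumentSearchPipeline,
# MostSimilarDocumentsPipeline), the JoinDocuments node in its different join modes, query
# classification, and YAML-defined indexing/query pipelines against several document stores.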


@pytest.mark.parametrize(
    "retriever,document_store",
    [("embedding", "memory"), ("embedding", "faiss"), ("embedding", "milvus1"), ("embedding", "elasticsearch")],
    indirect=True,
)
def test_faq_pipeline(retriever, document_store):
    documents = [
        {"content": "How to test module-1?", "meta": {"source": "wiki1", "answer": "Using tests for module-1"}},
        {"content": "How to test module-2?", "meta": {"source": "wiki2", "answer": "Using tests for module-2"}},
        {"content": "How to test module-3?", "meta": {"source": "wiki3", "answer": "Using tests for module-3"}},
        {"content": "How to test module-4?", "meta": {"source": "wiki4", "answer": "Using tests for module-4"}},
        {"content": "How to test module-5?", "meta": {"source": "wiki5", "answer": "Using tests for module-5"}},
    ]
    document_store.write_documents(documents)
    document_store.update_embeddings(retriever)

    pipeline = FAQPipeline(retriever=retriever)
    output = pipeline.run(query="How to test this?", params={"Retriever": {"top_k": 3}})
    assert len(output["answers"]) == 3
    assert output["query"].startswith("How to")
    assert output["answers"][0].answer.startswith("Using tests")

    if isinstance(document_store, ElasticsearchDocumentStore):
        output = pipeline.run(
            query="How to test this?", params={"Retriever": {"filters": {"source": ["wiki2"]}, "top_k": 5}}
        )
        assert len(output["answers"]) == 1


@pytest.mark.parametrize("retriever", ["embedding"], indirect=True)
@pytest.mark.parametrize(
"document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate", "pinecone"], indirect=True
)
def test_document_search_pipeline(retriever, document_store):
documents = [
{"content": "Sample text for document-1", "meta": {"source": "wiki1"}},
{"content": "Sample text for document-2", "meta": {"source": "wiki2"}},
{"content": "Sample text for document-3", "meta": {"source": "wiki3"}},
{"content": "Sample text for document-4", "meta": {"source": "wiki4"}},
{"content": "Sample text for document-5", "meta": {"source": "wiki5"}},
]
document_store.write_documents(documents)
document_store.update_embeddings(retriever)
pipeline = DocumentSearchPipeline(retriever=retriever)
output = pipeline.run(query="How to test this?", params={"top_k": 4})
assert len(output.get("documents", [])) == 4
if isinstance(document_store, ElasticsearchDocumentStore):
output = pipeline.run(query="How to test this?", params={"filters": {"source": ["wiki2"]}, "top_k": 5})
assert len(output["documents"]) == 1


@pytest.mark.parametrize(
    "retriever,document_store",
    [("embedding", "faiss"), ("embedding", "milvus1"), ("embedding", "elasticsearch")],
    indirect=True,
)
def test_most_similar_documents_pipeline(retriever, document_store):
    documents = [
        {"id": "a", "content": "Sample text for document-1", "meta": {"source": "wiki1"}},
        {"id": "b", "content": "Sample text for document-2", "meta": {"source": "wiki2"}},
        {"content": "Sample text for document-3", "meta": {"source": "wiki3"}},
        {"content": "Sample text for document-4", "meta": {"source": "wiki4"}},
        {"content": "Sample text for document-5", "meta": {"source": "wiki5"}},
    ]
    document_store.write_documents(documents)
    document_store.update_embeddings(retriever)

    docs_id: list = ["a", "b"]
    pipeline = MostSimilarDocumentsPipeline(document_store=document_store)
    list_of_documents = pipeline.run(document_ids=docs_id)

    assert len(list_of_documents[0]) > 1
    assert isinstance(list_of_documents, list)
    assert len(list_of_documents) == len(docs_id)

    for another_list in list_of_documents:
        assert isinstance(another_list, list)
        for document in another_list:
            assert isinstance(document, Document)
            assert isinstance(document.id, str)
            assert isinstance(document.content, str)
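
# Note on test_most_similar_documents_pipeline above: the pipeline is queried with document ids
# only, so each entry in the returned list is expected to hold the documents most similar (by
# embedding) to one of the given ids, which is why every entry is itself a list of Documents.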


@pytest.mark.elasticsearch
@pytest.mark.parametrize("document_store_dot_product_with_docs", ["elasticsearch"], indirect=True)
def test_join_merge_no_weights(document_store_dot_product_with_docs):
    es = BM25Retriever(document_store=document_store_dot_product_with_docs)
    dpr = DensePassageRetriever(
        document_store=document_store_dot_product_with_docs,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        use_gpu=False,
    )
    document_store_dot_product_with_docs.update_embeddings(dpr)

    query = "Where does Carla live?"
    join_node = JoinDocuments(join_mode="merge")
    p = Pipeline()
    p.add_node(component=es, name="R1", inputs=["Query"])
    p.add_node(component=dpr, name="R2", inputs=["Query"])
    p.add_node(component=join_node, name="Join", inputs=["R1", "R2"])
    results = p.run(query=query)
    assert len(results["documents"]) == 5
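
# Note on test_join_merge_no_weights above: both retrievers run against the same small fixture, so
# after the "merge" join combines the results from R1 and R2 (deduplicating documents returned by
# both, as suggested by the assertion) the output is expected to contain the fixture's five documents.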


@pytest.mark.elasticsearch
@pytest.mark.parametrize("document_store_dot_product_with_docs", ["elasticsearch"], indirect=True)
def test_join_merge_with_weights(document_store_dot_product_with_docs):
    es = BM25Retriever(document_store=document_store_dot_product_with_docs)
    dpr = DensePassageRetriever(
        document_store=document_store_dot_product_with_docs,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        use_gpu=False,
    )
    document_store_dot_product_with_docs.update_embeddings(dpr)

    query = "Where does Carla live?"
    join_node = JoinDocuments(join_mode="merge", weights=[1000, 1], top_k_join=2)
    p = Pipeline()
    p.add_node(component=es, name="R1", inputs=["Query"])
    p.add_node(component=dpr, name="R2", inputs=["Query"])
    p.add_node(component=join_node, name="Join", inputs=["R1", "R2"])
    results = p.run(query=query)
    assert math.isclose(results["documents"][0].score, 0.5481393431183286, rel_tol=0.0001)
    assert len(results["documents"]) == 2
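
# Note on test_join_merge_with_weights above: weights=[1000, 1] makes the merged score almost
# entirely a function of the BM25 retriever (R1), and top_k_join=2 caps the joined result at two
# documents. The expected top score is a value precomputed for this fixture and model pair.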


@pytest.mark.elasticsearch
@pytest.mark.parametrize("document_store_dot_product_with_docs", ["elasticsearch"], indirect=True)
def test_join_concatenate(document_store_dot_product_with_docs):
    es = BM25Retriever(document_store=document_store_dot_product_with_docs)
    dpr = DensePassageRetriever(
        document_store=document_store_dot_product_with_docs,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        use_gpu=False,
    )
    document_store_dot_product_with_docs.update_embeddings(dpr)

    query = "Where does Carla live?"
    join_node = JoinDocuments(join_mode="concatenate")
    p = Pipeline()
    p.add_node(component=es, name="R1", inputs=["Query"])
    p.add_node(component=dpr, name="R2", inputs=["Query"])
    p.add_node(component=join_node, name="Join", inputs=["R1", "R2"])
    results = p.run(query=query)
    assert len(results["documents"]) == 5


@pytest.mark.elasticsearch
@pytest.mark.parametrize("document_store_dot_product_with_docs", ["elasticsearch"], indirect=True)
def test_join_concatenate_with_topk(document_store_dot_product_with_docs):
    es = BM25Retriever(document_store=document_store_dot_product_with_docs)
    dpr = DensePassageRetriever(
        document_store=document_store_dot_product_with_docs,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        use_gpu=False,
    )
    document_store_dot_product_with_docs.update_embeddings(dpr)

    query = "Where does Carla live?"
    join_node = JoinDocuments(join_mode="concatenate")
    p = Pipeline()
    p.add_node(component=es, name="R1", inputs=["Query"])
    p.add_node(component=dpr, name="R2", inputs=["Query"])
    p.add_node(component=join_node, name="Join", inputs=["R1", "R2"])
    one_result = p.run(query=query, params={"Join": {"top_k_join": 1}})
    two_results = p.run(query=query, params={"Join": {"top_k_join": 2}})
    assert len(one_result["documents"]) == 1
    assert len(two_results["documents"]) == 2


@pytest.mark.elasticsearch
@pytest.mark.parametrize("document_store_dot_product_with_docs", ["elasticsearch"], indirect=True)
@pytest.mark.parametrize("reader", ["farm"], indirect=True)
def test_join_with_reader(document_store_dot_product_with_docs, reader):
    es = BM25Retriever(document_store=document_store_dot_product_with_docs)
    dpr = DensePassageRetriever(
        document_store=document_store_dot_product_with_docs,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        use_gpu=False,
    )
    document_store_dot_product_with_docs.update_embeddings(dpr)

    query = "Where does Carla live?"
    join_node = JoinDocuments()
    p = Pipeline()
    p.add_node(component=es, name="R1", inputs=["Query"])
    p.add_node(component=dpr, name="R2", inputs=["Query"])
    p.add_node(component=join_node, name="Join", inputs=["R1", "R2"])
    p.add_node(component=reader, name="Reader", inputs=["Join"])
    results = p.run(query=query)
    # check whether correct answer is within top 2 predictions
    assert results["answers"][0].answer == "Berlin" or results["answers"][1].answer == "Berlin"


@pytest.mark.elasticsearch
@pytest.mark.parametrize("document_store_dot_product_with_docs", ["elasticsearch"], indirect=True)
def test_join_with_rrf(document_store_dot_product_with_docs):
    es = BM25Retriever(document_store=document_store_dot_product_with_docs)
    dpr = DensePassageRetriever(
        document_store=document_store_dot_product_with_docs,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        use_gpu=False,
    )
    document_store_dot_product_with_docs.update_embeddings(dpr)

    query = "Where does Carla live?"
    join_node = JoinDocuments(join_mode="reciprocal_rank_fusion")
    p = Pipeline()
    p.add_node(component=es, name="R1", inputs=["Query"])
    p.add_node(component=dpr, name="R2", inputs=["Query"])
    p.add_node(component=join_node, name="Join", inputs=["R1", "R2"])
    results = p.run(query=query)

    # list of precalculated expected results
    expected_scores = [
        0.03278688524590164,
        0.03200204813108039,
        0.03200204813108039,
        0.031009615384615385,
        0.031009615384615385,
    ]
    assert all([doc.score == expected_scores[idx] for idx, doc in enumerate(results["documents"])])
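
# Note on test_join_with_rrf above: the precalculated scores are consistent with reciprocal rank
# fusion computed as sum(1 / (k + rank)) over both retrievers' rankings, with k = 61 and zero-based
# ranks, e.g. 0.03278688524590164 == 1/61 + 1/61 (top rank in both lists) and
# 0.03200204813108039 == 1/62 + 1/63. This reading is inferred from the numbers themselves, not
# taken from the JoinDocuments implementation.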


def test_query_keyword_statement_classifier():
    class KeywordOutput(RootNode):
        outgoing_edges = 2

        def run(self, **kwargs):
            kwargs["output"] = "keyword"
            return kwargs, "output_1"

    class QuestionOutput(RootNode):
        outgoing_edges = 2

        def run(self, **kwargs):
            kwargs["output"] = "question"
            return kwargs, "output_2"

    pipeline = Pipeline()
    pipeline.add_node(name="SkQueryKeywordQuestionClassifier", component=SklearnQueryClassifier(), inputs=["Query"])
    pipeline.add_node(
        name="KeywordNode", component=KeywordOutput(), inputs=["SkQueryKeywordQuestionClassifier.output_2"]
    )
    pipeline.add_node(
        name="QuestionNode", component=QuestionOutput(), inputs=["SkQueryKeywordQuestionClassifier.output_1"]
    )
    output = pipeline.run(query="morse code")
    assert output["output"] == "keyword"

    output = pipeline.run(query="How old is John?")
    assert output["output"] == "question"

    pipeline = Pipeline()
    pipeline.add_node(
        name="TfQueryKeywordQuestionClassifier", component=TransformersQueryClassifier(), inputs=["Query"]
    )
    pipeline.add_node(
        name="KeywordNode", component=KeywordOutput(), inputs=["TfQueryKeywordQuestionClassifier.output_2"]
    )
    pipeline.add_node(
        name="QuestionNode", component=QuestionOutput(), inputs=["TfQueryKeywordQuestionClassifier.output_1"]
    )
    output = pipeline.run(query="morse code")
    assert output["output"] == "keyword"

    output = pipeline.run(query="How old is John?")
    assert output["output"] == "question"
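
# Note on test_query_keyword_statement_classifier above: for both the sklearn and the transformers
# query classifiers, the test wires output_1 to the "question" branch and output_2 to the "keyword"
# branch, so a keyword query like "morse code" is expected to surface as "keyword" and a natural
# language question as "question".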


@pytest.mark.elasticsearch
@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
def test_indexing_pipeline_with_classifier(document_store):
    # test correct load of indexing pipeline from yaml
    pipeline = Pipeline.load_from_yaml(
        SAMPLES_PATH / "pipeline" / "test_pipeline.yaml", pipeline_name="indexing_pipeline_with_classifier"
    )
    pipeline.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")

    # test correct load of query pipeline from yaml
    pipeline = Pipeline.load_from_yaml(SAMPLES_PATH / "pipeline" / "test_pipeline.yaml", pipeline_name="query_pipeline")
    prediction = pipeline.run(
        query="Who made the PDF specification?", params={"ESRetriever": {"top_k": 10}, "Reader": {"top_k": 3}}
    )
    assert prediction["query"] == "Who made the PDF specification?"
    assert prediction["answers"][0].answer == "Adobe Systems"
    assert prediction["answers"][0].meta["classification"]["label"] == "joy"
    assert "_debug" not in prediction.keys()


@pytest.mark.elasticsearch
@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
def test_query_pipeline_with_document_classifier(document_store):
    # test correct load of indexing pipeline from yaml
    pipeline = Pipeline.load_from_yaml(
        SAMPLES_PATH / "pipeline" / "test_pipeline.yaml", pipeline_name="indexing_pipeline"
    )
    pipeline.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")

    # test correct load of query pipeline from yaml
    pipeline = Pipeline.load_from_yaml(
        SAMPLES_PATH / "pipeline" / "test_pipeline.yaml", pipeline_name="query_pipeline_with_document_classifier"
    )
    prediction = pipeline.run(
        query="Who made the PDF specification?", params={"ESRetriever": {"top_k": 10}, "Reader": {"top_k": 3}}
    )
    assert prediction["query"] == "Who made the PDF specification?"
    assert prediction["answers"][0].answer == "Adobe Systems"
    assert prediction["answers"][0].meta["classification"]["label"] == "joy"
    assert "_debug" not in prediction.keys()


def test_existing_faiss_document_store():
    clean_faiss_document_store()

    pipeline = Pipeline.load_from_yaml(
        SAMPLES_PATH / "pipeline" / "test_pipeline_faiss_indexing.yaml", pipeline_name="indexing_pipeline"
    )
    pipeline.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")

    new_document_store = pipeline.get_document_store()
    new_document_store.save("existing_faiss_document_store")

    # test correct load of query pipeline from yaml
    pipeline = Pipeline.load_from_yaml(
        SAMPLES_PATH / "pipeline" / "test_pipeline_faiss_retrieval.yaml", pipeline_name="query_pipeline"
    )
    existing_document_store = pipeline.get_document_store()
    faiss_index = existing_document_store.faiss_indexes["document"]
    assert faiss_index.ntotal == 2

    prediction = pipeline.run(query="Who made the PDF specification?", params={"DPRRetriever": {"top_k": 10}})
    assert prediction["query"] == "Who made the PDF specification?"
    assert len(prediction["documents"]) == 2

    clean_faiss_document_store()
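
# Note on test_existing_faiss_document_store above: saving the store writes the FAISS index to
# "existing_faiss_document_store" plus a JSON config next to it, while the store's metadata lives
# in "faiss_document_store.db" (as suggested by the file names cleaned up below);
# clean_faiss_document_store() removes exactly those artifacts so consecutive runs start clean.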


def clean_faiss_document_store():
    if Path("existing_faiss_document_store").exists():
        os.remove("existing_faiss_document_store")
    if Path("existing_faiss_document_store.json").exists():
        os.remove("existing_faiss_document_store.json")
    if Path("faiss_document_store.db").exists():
        os.remove("faiss_document_store.db")