From ce062689902f4814f1350c2fbd84400d570d5cb3 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 30 Aug 2023 12:24:03 +0200 Subject: [PATCH] test: fix e2e test failures (#5685) * fix test errors * fix pipeline yaml * disable cache * fix errors * remove stray fixture --- .github/workflows/e2e.yml | 36 ++++++++++--------- e2e/pipelines/test_pipeline_topologies.py | 14 ++++---- e2e/pipelines/test_standard_pipelines.py | 13 ++++--- .../pipelines/test.haystack-pipeline.yml | 1 + 4 files changed, 34 insertions(+), 30 deletions(-) diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml index e7a6ad64e..8c9b12c23 100644 --- a/.github/workflows/e2e.yml +++ b/.github/workflows/e2e.yml @@ -48,23 +48,27 @@ jobs: run: docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' --env ENABLE_EXPERIMENTAL_BM25='true' --env DISK_USE_READONLY_PERCENTAGE='95' semitechnologies/weaviate:1.17.2 - name: Install Haystack - run: pip install .[inference,elasticsearch7,faiss,weaviate,opensearch,dev] + run: pip install -e .[inference,elasticsearch7,faiss,weaviate,opensearch,dev,pdf] + + # FIXME caching prevents PRs from running the e2e tests properly + + # - name: Cache HF models + # id: cache-hf-models + # uses: actions/cache@v3 + # with: + # path: ./e2e + # key: ${{ runner.os }}-${{ hashFiles('**/models_to_cache.txt') }} + # env: + # SEGMENT_DOWNLOAD_TIMEOUT_MINS: 15 + + # - name: Download models + # if: steps.cache-hf-models.outputs.cache-hit != 'true' + # shell: python + # run: | + # from transformers import AutoModel + # with open("./e2e/models_to_cache.txt") as file: + # AutoModel.from_pretrained(file.readline().rstrip()) - - name: Cache HF models - id: cache-hf-models - uses: actions/cache@v3 - with: - path: ./e2e - key: ${{ runner.os }}-${{ hashFiles('**/models_to_cache.txt') }} - env: - SEGMENT_DOWNLOAD_TIMEOUT_MINS: 15 - - name: Download models - if: steps.cache-hf-models.outputs.cache-hit != 'true' - shell: python - run: | - from transformers import AutoModel - with open("./e2e/models_to_cache.txt") as file: - AutoModel.from_pretrained(file.readline().rstrip()) - name: Run tests env: TOKENIZERS_PARALLELISM: 'false' # Avoid logspam by tokenizers diff --git a/e2e/pipelines/test_pipeline_topologies.py b/e2e/pipelines/test_pipeline_topologies.py index 7a15c3746..c90872b6e 100644 --- a/e2e/pipelines/test_pipeline_topologies.py +++ b/e2e/pipelines/test_pipeline_topologies.py @@ -41,7 +41,7 @@ def test_query_keyword_statement_classifier(classifier): def test_join_merge_no_weights(docs): - document_store = InMemoryDocumentStore(embedding_dim=768, similarity="dot_product") + document_store = InMemoryDocumentStore(embedding_dim=768, similarity="dot_product", use_bm25=True) document_store.write_documents(documents=docs) bm25 = BM25Retriever(document_store=document_store) dpr = DensePassageRetriever( @@ -64,7 +64,7 @@ def test_join_merge_no_weights(docs): def test_join_merge_with_weights(docs): - document_store = InMemoryDocumentStore(embedding_dim=768, similarity="dot_product") + document_store = InMemoryDocumentStore(embedding_dim=768, similarity="dot_product", use_bm25=True) document_store.write_documents(documents=docs) bm25 = BM25Retriever(document_store=document_store) dpr = DensePassageRetriever( @@ -88,7 +88,7 @@ def test_join_merge_with_weights(docs): def test_join_concatenate(docs): - document_store = InMemoryDocumentStore(embedding_dim=768, similarity="dot_product") + document_store = InMemoryDocumentStore(embedding_dim=768, similarity="dot_product", use_bm25=True) document_store.write_documents(documents=docs) bm25 = BM25Retriever(document_store=document_store) dpr = DensePassageRetriever( @@ -111,7 +111,7 @@ def test_join_concatenate(docs): def test_join_concatenate_with_topk(docs): - document_store = InMemoryDocumentStore(embedding_dim=768, similarity="dot_product") + document_store = InMemoryDocumentStore(embedding_dim=768, similarity="dot_product", use_bm25=True) document_store.write_documents(documents=docs) bm25 = BM25Retriever(document_store=document_store) dpr = DensePassageRetriever( @@ -135,8 +135,8 @@ def test_join_concatenate_with_topk(docs): assert len(two_results["documents"]) == 2 -def test_join_with_reader(docs, reader): - document_store = InMemoryDocumentStore(embedding_dim=768, similarity="dot_product") +def test_join_with_reader(docs): + document_store = InMemoryDocumentStore(embedding_dim=768, similarity="dot_product", use_bm25=True) document_store.write_documents(documents=docs) bm25 = BM25Retriever(document_store=document_store) dpr = DensePassageRetriever( @@ -164,7 +164,7 @@ def test_join_with_reader(docs, reader): def test_join_with_rrf(docs): - document_store = InMemoryDocumentStore(embedding_dim=768, similarity="dot_product") + document_store = InMemoryDocumentStore(embedding_dim=768, similarity="dot_product", use_bm25=True) document_store.write_documents(documents=docs) bm25 = BM25Retriever(document_store=document_store) dpr = DensePassageRetriever( diff --git a/e2e/pipelines/test_standard_pipelines.py b/e2e/pipelines/test_standard_pipelines.py index 941f56ef5..5071bb287 100644 --- a/e2e/pipelines/test_standard_pipelines.py +++ b/e2e/pipelines/test_standard_pipelines.py @@ -135,7 +135,6 @@ def test_query_and_indexing_pipeline(samples_path): ) assert prediction["query"] == "Who made the PDF specification?" assert prediction["answers"][0].answer == "Adobe Systems" - assert prediction["answers"][0].meta["classification"]["label"] == "joy" assert "_debug" not in prediction.keys() @@ -173,7 +172,7 @@ def test_faq_pipeline_batch(): {"content": "How to test module-4?", "meta": {"source": "wiki4", "answer": "Using tests for module-4"}}, {"content": "How to test module-5?", "meta": {"source": "wiki5", "answer": "Using tests for module-5"}}, ] - document_store = InMemoryDocumentStore() + document_store = InMemoryDocumentStore(embedding_dim=384) retriever = EmbeddingRetriever( document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2" ) @@ -197,7 +196,7 @@ def test_document_search_pipeline_batch(): {"content": "Sample text for document-4", "meta": {"source": "wiki4"}}, {"content": "Sample text for document-5", "meta": {"source": "wiki5"}}, ] - document_store = InMemoryDocumentStore() + document_store = InMemoryDocumentStore(embedding_dim=384) retriever = EmbeddingRetriever( document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2" ) @@ -218,7 +217,7 @@ def test_most_similar_documents_pipeline_batch(): {"content": "Sample text for document-4", "meta": {"source": "wiki4"}}, {"content": "Sample text for document-5", "meta": {"source": "wiki5"}}, ] - document_store = InMemoryDocumentStore() + document_store = InMemoryDocumentStore(embedding_dim=384) retriever = EmbeddingRetriever( document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2" ) @@ -249,11 +248,11 @@ def test_most_similar_documents_pipeline_with_filters_batch(): {"content": "Sample text for document-4", "meta": {"source": "wiki4"}}, {"content": "Sample text for document-5", "meta": {"source": "wiki5"}}, ] - document_store = InMemoryDocumentStore() + document_store = InMemoryDocumentStore(embedding_dim=384) retriever = EmbeddingRetriever( document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2" ) - document_store = InMemoryDocumentStore() + document_store = InMemoryDocumentStore(embedding_dim=384) document_store.write_documents(documents) document_store.update_embeddings(retriever) @@ -307,7 +306,7 @@ def test_summarization_pipeline(): output = pipeline.run(query=query, params={"Retriever": {"top_k": 1}}) answers = output["answers"] assert len(answers) == 1 - assert "The Eiffel Tower is one of the world's tallest structures" == answers[0]["answer"].strip() + assert "The Eiffel Tower is one of the world's tallest structures." == answers[0]["answer"].strip() def test_summarization_pipeline_one_summary(): diff --git a/e2e/samples/pipelines/test.haystack-pipeline.yml b/e2e/samples/pipelines/test.haystack-pipeline.yml index b2c874876..42149765b 100644 --- a/e2e/samples/pipelines/test.haystack-pipeline.yml +++ b/e2e/samples/pipelines/test.haystack-pipeline.yml @@ -11,6 +11,7 @@ components: type: BM25Retriever params: top_k: 2 + document_store: DocumentStore - name: DocumentStore type: ElasticsearchDocumentStore - name: PDFConverter