From ce062689902f4814f1350c2fbd84400d570d5cb3 Mon Sep 17 00:00:00 2001
From: ZanSara <sara.zanzottera@deepset.ai>
Date: Wed, 30 Aug 2023 12:24:03 +0200
Subject: [PATCH] test: fix e2e test failures (#5685)

* fix test errors

* fix pipeline yaml

* disable the HF models cache in the e2e workflow

* fix errors

* remove stray fixture
---
 .github/workflows/e2e.yml                     | 36 ++++++++++---------
 e2e/pipelines/test_pipeline_topologies.py     | 14 ++++----
 e2e/pipelines/test_standard_pipelines.py      | 13 ++++---
 .../pipelines/test.haystack-pipeline.yml      |  1 +
 4 files changed, 34 insertions(+), 30 deletions(-)

diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml
index e7a6ad64e..8c9b12c23 100644
--- a/.github/workflows/e2e.yml
+++ b/.github/workflows/e2e.yml
@@ -48,23 +48,27 @@ jobs:
       run: docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' --env ENABLE_EXPERIMENTAL_BM25='true' --env DISK_USE_READONLY_PERCENTAGE='95' semitechnologies/weaviate:1.17.2
 
     - name: Install Haystack
-      run: pip install .[inference,elasticsearch7,faiss,weaviate,opensearch,dev]
+      run: pip install -e .[inference,elasticsearch7,faiss,weaviate,opensearch,dev,pdf]
+
+    # FIXME: the HF model caching steps below are commented out because caching prevents PRs from running the e2e tests properly
+
+    # - name: Cache HF models
+    #   id: cache-hf-models
+    #   uses: actions/cache@v3
+    #   with:
+    #     path: ./e2e
+    #     key: ${{ runner.os }}-${{ hashFiles('**/models_to_cache.txt') }}
+    #   env:
+    #     SEGMENT_DOWNLOAD_TIMEOUT_MINS: 15
+
+    # - name: Download models
+    #   if: steps.cache-hf-models.outputs.cache-hit != 'true'
+    #   shell: python
+    #   run: |
+    #     from transformers import AutoModel
+    #     with open("./e2e/models_to_cache.txt") as file:
+    #       AutoModel.from_pretrained(file.readline().rstrip())
 
-    - name: Cache HF models
-      id: cache-hf-models
-      uses: actions/cache@v3
-      with:
-        path: ./e2e
-        key: ${{ runner.os }}-${{ hashFiles('**/models_to_cache.txt') }}
-      env:
-        SEGMENT_DOWNLOAD_TIMEOUT_MINS: 15
-    - name: Download models
-      if: steps.cache-hf-models.outputs.cache-hit != 'true'
-      shell: python
-      run: |
-        from transformers import AutoModel
-        with open("./e2e/models_to_cache.txt") as file:
-          AutoModel.from_pretrained(file.readline().rstrip())
     - name: Run tests
       env:
         TOKENIZERS_PARALLELISM: 'false'  # Avoid logspam by tokenizers
diff --git a/e2e/pipelines/test_pipeline_topologies.py b/e2e/pipelines/test_pipeline_topologies.py
index 7a15c3746..c90872b6e 100644
--- a/e2e/pipelines/test_pipeline_topologies.py
+++ b/e2e/pipelines/test_pipeline_topologies.py
@@ -41,7 +41,7 @@ def test_query_keyword_statement_classifier(classifier):
 
 
 def test_join_merge_no_weights(docs):
-    document_store = InMemoryDocumentStore(embedding_dim=768, similarity="dot_product")
+    document_store = InMemoryDocumentStore(embedding_dim=768, similarity="dot_product", use_bm25=True)
     document_store.write_documents(documents=docs)
     bm25 = BM25Retriever(document_store=document_store)
     dpr = DensePassageRetriever(
@@ -64,7 +64,7 @@ def test_join_merge_no_weights(docs):
 
 
 def test_join_merge_with_weights(docs):
-    document_store = InMemoryDocumentStore(embedding_dim=768, similarity="dot_product")
+    document_store = InMemoryDocumentStore(embedding_dim=768, similarity="dot_product", use_bm25=True)
     document_store.write_documents(documents=docs)
     bm25 = BM25Retriever(document_store=document_store)
     dpr = DensePassageRetriever(
@@ -88,7 +88,7 @@ def test_join_merge_with_weights(docs):
 
 
 def test_join_concatenate(docs):
-    document_store = InMemoryDocumentStore(embedding_dim=768, similarity="dot_product")
+    document_store = InMemoryDocumentStore(embedding_dim=768, similarity="dot_product", use_bm25=True)
     document_store.write_documents(documents=docs)
     bm25 = BM25Retriever(document_store=document_store)
     dpr = DensePassageRetriever(
@@ -111,7 +111,7 @@ def test_join_concatenate(docs):
 
 
 def test_join_concatenate_with_topk(docs):
-    document_store = InMemoryDocumentStore(embedding_dim=768, similarity="dot_product")
+    document_store = InMemoryDocumentStore(embedding_dim=768, similarity="dot_product", use_bm25=True)
     document_store.write_documents(documents=docs)
     bm25 = BM25Retriever(document_store=document_store)
     dpr = DensePassageRetriever(
@@ -135,8 +135,8 @@ def test_join_concatenate_with_topk(docs):
     assert len(two_results["documents"]) == 2
 
 
-def test_join_with_reader(docs, reader):
-    document_store = InMemoryDocumentStore(embedding_dim=768, similarity="dot_product")
+def test_join_with_reader(docs):
+    document_store = InMemoryDocumentStore(embedding_dim=768, similarity="dot_product", use_bm25=True)
     document_store.write_documents(documents=docs)
     bm25 = BM25Retriever(document_store=document_store)
     dpr = DensePassageRetriever(
@@ -164,7 +164,7 @@ def test_join_with_reader(docs, reader):
 
 
 def test_join_with_rrf(docs):
-    document_store = InMemoryDocumentStore(embedding_dim=768, similarity="dot_product")
+    document_store = InMemoryDocumentStore(embedding_dim=768, similarity="dot_product", use_bm25=True)
     document_store.write_documents(documents=docs)
     bm25 = BM25Retriever(document_store=document_store)
     dpr = DensePassageRetriever(
diff --git a/e2e/pipelines/test_standard_pipelines.py b/e2e/pipelines/test_standard_pipelines.py
index 941f56ef5..5071bb287 100644
--- a/e2e/pipelines/test_standard_pipelines.py
+++ b/e2e/pipelines/test_standard_pipelines.py
@@ -135,7 +135,6 @@ def test_query_and_indexing_pipeline(samples_path):
     )
     assert prediction["query"] == "Who made the PDF specification?"
     assert prediction["answers"][0].answer == "Adobe Systems"
-    assert prediction["answers"][0].meta["classification"]["label"] == "joy"
     assert "_debug" not in prediction.keys()
 
 
@@ -173,7 +172,7 @@ def test_faq_pipeline_batch():
         {"content": "How to test module-4?", "meta": {"source": "wiki4", "answer": "Using tests for module-4"}},
         {"content": "How to test module-5?", "meta": {"source": "wiki5", "answer": "Using tests for module-5"}},
     ]
-    document_store = InMemoryDocumentStore()
+    document_store = InMemoryDocumentStore(embedding_dim=384)
     retriever = EmbeddingRetriever(
         document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2"
     )
@@ -197,7 +196,7 @@ def test_document_search_pipeline_batch():
         {"content": "Sample text for document-4", "meta": {"source": "wiki4"}},
         {"content": "Sample text for document-5", "meta": {"source": "wiki5"}},
     ]
-    document_store = InMemoryDocumentStore()
+    document_store = InMemoryDocumentStore(embedding_dim=384)
     retriever = EmbeddingRetriever(
         document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2"
     )
@@ -218,7 +217,7 @@ def test_most_similar_documents_pipeline_batch():
         {"content": "Sample text for document-4", "meta": {"source": "wiki4"}},
         {"content": "Sample text for document-5", "meta": {"source": "wiki5"}},
     ]
-    document_store = InMemoryDocumentStore()
+    document_store = InMemoryDocumentStore(embedding_dim=384)
     retriever = EmbeddingRetriever(
         document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2"
     )
@@ -249,11 +248,11 @@ def test_most_similar_documents_pipeline_with_filters_batch():
         {"content": "Sample text for document-4", "meta": {"source": "wiki4"}},
         {"content": "Sample text for document-5", "meta": {"source": "wiki5"}},
     ]
-    document_store = InMemoryDocumentStore()
+    document_store = InMemoryDocumentStore(embedding_dim=384)
     retriever = EmbeddingRetriever(
         document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2"
     )
-    document_store = InMemoryDocumentStore()
+    document_store = InMemoryDocumentStore(embedding_dim=384)
     document_store.write_documents(documents)
     document_store.update_embeddings(retriever)
 
@@ -307,7 +306,7 @@ def test_summarization_pipeline():
     output = pipeline.run(query=query, params={"Retriever": {"top_k": 1}})
     answers = output["answers"]
     assert len(answers) == 1
-    assert "The Eiffel Tower is one of the world's tallest structures" == answers[0]["answer"].strip()
+    assert "The Eiffel Tower is one of the world's tallest structures." == answers[0]["answer"].strip()
 
 
 def test_summarization_pipeline_one_summary():
diff --git a/e2e/samples/pipelines/test.haystack-pipeline.yml b/e2e/samples/pipelines/test.haystack-pipeline.yml
index b2c874876..42149765b 100644
--- a/e2e/samples/pipelines/test.haystack-pipeline.yml
+++ b/e2e/samples/pipelines/test.haystack-pipeline.yml
@@ -11,6 +11,7 @@ components:
     type: BM25Retriever
     params:
       top_k: 2
+      document_store: DocumentStore
   - name: DocumentStore
     type: ElasticsearchDocumentStore
   - name: PDFConverter