From eeb29b568612366539b129ac30039ef79764bfd0 Mon Sep 17 00:00:00 2001
From: Julian Risch
Date: Thu, 20 Jul 2023 11:48:51 +0200
Subject: [PATCH] test: Re-activate end-to-end tests workflow (#5343)

* Install haystack with required extras

* remove whitespaces

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>

* Add sleep

* Add s for seconds

* Move container initialization in workflow

* Update e2e.yml

add nightly run

* use new folder for initial e2e test

* use file hash for caching and trigger on push to branch

* remove \n from model names read from file

* remove trigger on push to branch

---------

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
Co-authored-by: bogdankostic
---
 .github/workflows/e2e.yml                     | 37 ++++++++++---------
 e2e/document_search/__init__.py               |  0
 e2e/document_search/test_standard_pipeline.py | 35 ++++++++++++++++++
 e2e/models_to_cache.txt                       |  1 +
 4 files changed, 56 insertions(+), 17 deletions(-)
 create mode 100644 e2e/document_search/__init__.py
 create mode 100644 e2e/document_search/test_standard_pipeline.py
 create mode 100644 e2e/models_to_cache.txt

diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml
index 43536a5cf..e9f9f4048 100644
--- a/.github/workflows/e2e.yml
+++ b/.github/workflows/e2e.yml
@@ -2,7 +2,9 @@ name: end-to-end
 
 on:
-  workflow_dispatch:
+  workflow_dispatch: # Activate this workflow manually
+  schedule:
+    - cron: "0 0 * * *"
 
 env:
   PYTHON_VERSION: "3.8"
@@ -14,7 +16,7 @@ jobs:
       fail-fast: false # Avoid cancelling the others if one of these fails
       matrix:
         folder:
-          - "document_stores"
+          - "document_search"
 
     runs-on: ubuntu-latest
 
@@ -25,20 +27,6 @@ jobs:
       with:
         python-version: ${{ env.PYTHON_VERSION }}
-
-      - name: Cache HF models
-        id: cache-hf-models
-        uses: actions/cache@v3
-        with:
-          path: ~/.cache/huggingface/transformers/
-          key: hf-models
-        env:
-          SEGMENT_DOWNLOAD_TIMEOUT_MINS: 15
-
-      - name: Download models
-        if: steps.cache-hf-models.outputs.cache-hit != 'true'
-        run: |
-          python -c "from transformers import AutoModel;[AutoModel.from_pretrained(model_name) for model_name in ['vblagoje/dpr-ctx_encoder-single-lfqa-wiki', 'vblagoje/dpr-question_encoder-single-lfqa-wiki', 'facebook/dpr-question_encoder-single-nq-base', 'facebook/dpr-ctx_encoder-single-nq-base', 'elastic/distilbert-base-cased-finetuned-conll03-english', 'deepset/bert-medium-squad2-distilled']]"
 
       - name: Run Elasticsearch
         run: |
           docker run -d -p 9200:9200 -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx256m" elasticsearch:7.9.2
@@ -51,8 +39,23 @@
         run: docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' --env ENABLE_EXPERIMENTAL_BM25='true' --env DISK_USE_READONLY_PERCENTAGE='95' semitechnologies/weaviate:1.17.2
 
       - name: Install Haystack
-        run: pip install .
+        run: pip install .[inference,elasticsearch7,faiss,weaviate,opensearch,dev]
+      - name: Cache HF models
+        id: cache-hf-models
+        uses: actions/cache@v3
+        with:
+          path: ./e2e
+          key: ${{ runner.os }}-${{ hashFiles('**/models_to_cache.txt') }}
+        env:
+          SEGMENT_DOWNLOAD_TIMEOUT_MINS: 15
+      - name: Download models
+        if: steps.cache-hf-models.outputs.cache-hit != 'true'
+        shell: python
+        run: |
+          from transformers import AutoModel
+          with open("./e2e/models_to_cache.txt") as file:
+              AutoModel.from_pretrained(file.readline().rstrip())
 
       - name: Run tests
         env:
           TOKENIZERS_PARALLELISM: 'false' # Avoid logspam by tokenizers
diff --git a/e2e/document_search/__init__.py b/e2e/document_search/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/e2e/document_search/test_standard_pipeline.py b/e2e/document_search/test_standard_pipeline.py
new file mode 100644
index 000000000..5401e70f2
--- /dev/null
+++ b/e2e/document_search/test_standard_pipeline.py
@@ -0,0 +1,35 @@
+import pytest
+
+from haystack.nodes import EmbeddingRetriever
+from haystack.pipelines import DocumentSearchPipeline
+
+from ..conftest import document_store
+
+
+@pytest.mark.parametrize("document_store_name", ["memory", "faiss", "weaviate", "elasticsearch"])
+def test_document_search_standard_pipeline(document_store_name, docs, tmp_path):
+    """
+    Testing the DocumentSearchPipeline with most common parameters according to our template:
+    https://github.com/deepset-ai/templates/blob/main/pipelines/DenseDocSearch.yaml
+    The common multi-qa-mpnet-base-dot-v1 model is replaced with the very similar paraphrase-MiniLM-L3-v2,
+    which reduces runtime and model size by ~6x
+    """
+    with document_store(document_store_name, docs, tmp_path, embedding_dim=384) as ds:
+        retriever = EmbeddingRetriever(
+            document_store=ds, embedding_model="sentence-transformers/paraphrase-MiniLM-L3-v2"
+        )
+        ds.update_embeddings(retriever)
+        pipeline = DocumentSearchPipeline(retriever)
+        prediction = pipeline.run("Paul lives in New York")
+        scores = [document.score for document in prediction["documents"]]
+        assert [document.content for document in prediction["documents"]] == [
+            "My name is Paul and I live in New York",
+            "My name is Matteo and I live in Rome",
+            "My name is Christelle and I live in Paris",
+            "My name is Carla and I live in Berlin",
+            "My name is Camila and I live in Madrid",
+        ]
+        assert scores == pytest.approx(
+            [0.9149981737136841, 0.6895168423652649, 0.641706794500351, 0.6206043660640717, 0.5837393924593925],
+            abs=1e-3,
+        )
diff --git a/e2e/models_to_cache.txt b/e2e/models_to_cache.txt
new file mode 100644
index 000000000..e16bb0316
--- /dev/null
+++ b/e2e/models_to_cache.txt
@@ -0,0 +1 @@
+sentence-transformers/paraphrase-MiniLM-L3-v2
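
Note on the "Download models" step: file.readline() reads only the first line of e2e/models_to_cache.txt, which matches the single-entry file this patch adds. If the cache file ever lists several models, the step would need to iterate over every line instead. A minimal sketch of that variant, assuming one model name per line; the loop and blank-line handling are illustrative and not part of this patch:

    # Hypothetical multi-model variant of the "Download models" step.
    # Assumes e2e/models_to_cache.txt may list several models, one per line.
    from transformers import AutoModel

    with open("./e2e/models_to_cache.txt") as file:
        for line in file:
            model_name = line.rstrip()  # strip the trailing newline from each name
            if model_name:  # skip blank lines
                AutoModel.from_pretrained(model_name)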