test: Re-activate end-to-end tests workflow (#5343)

* Install haystack with required extras

* remove whitespaces

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>

* Add sleep

* Add s for seconds

* Move container initialization in workflow

* Update e2e.yml

add nightly run

* use new folder for initial e2e test

* use file hash for caching and trigger on push to branch

* remove \n from model names read from file

* remove trigger on push to branch

---------

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
Co-authored-by: bogdankostic <bogdankostic@web.de>
This commit is contained in:
Julian Risch 2023-07-20 11:48:51 +02:00 committed by GitHub
parent f7642e83ea
commit eeb29b5686
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 56 additions and 17 deletions

View File

@@ -2,7 +2,9 @@
name: end-to-end
on:
workflow_dispatch:
workflow_dispatch: # Activate this workflow manually
schedule:
- cron: "0 0 * * *"
env:
PYTHON_VERSION: "3.8"
@@ -14,7 +16,7 @@ jobs:
fail-fast: false # Avoid cancelling the others if one of these fails
matrix:
folder:
- "document_stores"
- "document_search"
runs-on: ubuntu-latest
@@ -25,20 +27,6 @@ jobs:
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Cache HF models
id: cache-hf-models
uses: actions/cache@v3
with:
path: ~/.cache/huggingface/transformers/
key: hf-models
env:
SEGMENT_DOWNLOAD_TIMEOUT_MINS: 15
- name: Download models
if: steps.cache-hf-models.outputs.cache-hit != 'true'
run: |
python -c "from transformers import AutoModel;[AutoModel.from_pretrained(model_name) for model_name in ['vblagoje/dpr-ctx_encoder-single-lfqa-wiki', 'vblagoje/dpr-question_encoder-single-lfqa-wiki', 'facebook/dpr-question_encoder-single-nq-base', 'facebook/dpr-ctx_encoder-single-nq-base', 'elastic/distilbert-base-cased-finetuned-conll03-english', 'deepset/bert-medium-squad2-distilled']]"
- name: Run Elasticsearch
run: |
docker run -d -p 9200:9200 -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx256m" elasticsearch:7.9.2
@@ -51,8 +39,23 @@ jobs:
run: docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' --env ENABLE_EXPERIMENTAL_BM25='true' --env DISK_USE_READONLY_PERCENTAGE='95' semitechnologies/weaviate:1.17.2
- name: Install Haystack
run: pip install .
run: pip install .[inference,elasticsearch7,faiss,weaviate,opensearch,dev]
- name: Cache HF models
id: cache-hf-models
uses: actions/cache@v3
with:
path: ./e2e
key: ${{ runner.os }}-${{ hashFiles('**/models_to_cache.txt') }}
env:
SEGMENT_DOWNLOAD_TIMEOUT_MINS: 15
- name: Download models
if: steps.cache-hf-models.outputs.cache-hit != 'true'
shell: python
run: |
from transformers import AutoModel
with open("./e2e/models_to_cache.txt") as file:
AutoModel.from_pretrained(file.readline().rstrip())
- name: Run tests
env:
TOKENIZERS_PARALLELISM: 'false' # Avoid logspam by tokenizers

View File

View File

@@ -0,0 +1,35 @@
import pytest
from haystack.nodes import EmbeddingRetriever
from haystack.pipelines import DocumentSearchPipeline
from ..conftest import document_store
@pytest.mark.parametrize("document_store_name", ["memory", "faiss", "weaviate", "elasticsearch"])
def test_document_search_standard_pipeline(document_store_name, docs, tmp_path):
    """
    Run the DocumentSearchPipeline with the most common parameters from our template:
    https://github.com/deepset-ai/templates/blob/main/pipelines/DenseDocSearch.yaml

    The template's multi-qa-mpnet-base-dot-v1 model is swapped for the very similar
    paraphrase-MiniLM-L3-v2, cutting runtime and model size by roughly 6x.
    """
    with document_store(document_store_name, docs, tmp_path, embedding_dim=384) as store:
        # Build a dense retriever on top of the parametrized document store
        # and index the fixture documents' embeddings before querying.
        dense_retriever = EmbeddingRetriever(
            document_store=store, embedding_model="sentence-transformers/paraphrase-MiniLM-L3-v2"
        )
        store.update_embeddings(dense_retriever)

        search_pipeline = DocumentSearchPipeline(dense_retriever)
        result = search_pipeline.run("Paul lives in New York")

        retrieved = result["documents"]
        contents = [doc.content for doc in retrieved]
        similarity_scores = [doc.score for doc in retrieved]

        # Documents must come back ranked by similarity to the query.
        expected_contents = [
            "My name is Paul and I live in New York",
            "My name is Matteo and I live in Rome",
            "My name is Christelle and I live in Paris",
            "My name is Carla and I live in Berlin",
            "My name is Camila and I live in Madrid",
        ]
        assert contents == expected_contents
        # Scores are model-dependent floats; compare with a small absolute tolerance.
        assert similarity_scores == pytest.approx(
            [0.9149981737136841, 0.6895168423652649, 0.641706794500351, 0.6206043660640717, 0.5837393924593925],
            abs=1e-3,
        )

1
e2e/models_to_cache.txt Normal file
View File

@@ -0,0 +1 @@
sentence-transformers/paraphrase-MiniLM-L3-v2