haystack/test/conftest.py
Malte Pietsch 8edeb844f7
Remove phi normalization from FAISS, support more index types, 3x speedup (#467)
* remove phi normalization

* add special case for hnsw

* rename vector_size to vector_dim

* fix loading. fix extra dim in tests

* switch to new ES syntax for vector similarity

* 3x sql speed up. cascade deletes. add train_index()

* add docstrings. remove vector_dim from load()

* delete docs from faiss and sql

* fix delete of docs in test

* relax type hint for faiss index

* rename metric to metric_type

Co-authored-by: lalitpagaria <19303690+lalitpagaria@users.noreply.github.com>
2020-10-06 16:09:56 +02:00

180 lines
7.1 KiB
Python

import os
import subprocess
import time
from subprocess import run
from sys import platform
import pytest
import requests
from elasticsearch import Elasticsearch
from haystack import Document
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.document_store.faiss import FAISSDocumentStore
from haystack.document_store.memory import InMemoryDocumentStore
from haystack.document_store.sql import SQLDocumentStore
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
@pytest.fixture(scope="session")
def elasticsearch_fixture():
# test if a ES cluster is already running. If not, download and start an ES instance locally.
try:
client = Elasticsearch(hosts=[{"host": "localhost", "port": "9200"}])
client.info()
except:
print("Starting Elasticsearch ...")
status = subprocess.run(
['docker rm haystack_test_elastic'],
shell=True
)
status = subprocess.run(
['docker run -d --name haystack_test_elastic -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.1'],
shell=True
)
if status.returncode:
raise Exception(
"Failed to launch Elasticsearch. Please check docker container logs.")
time.sleep(30)
@pytest.fixture(scope="session")
def tika_fixture():
try:
tika_url = "http://localhost:9998/tika"
ping = requests.get(tika_url)
if ping.status_code != 200:
raise Exception(
"Unable to connect Tika. Please check tika endpoint {0}.".format(tika_url))
except:
print("Starting Tika ...")
status = subprocess.run(
['docker run -d --name tika -p 9998:9998 apache/tika:1.24.1'],
shell=True
)
if status.returncode:
raise Exception(
"Failed to launch Tika. Please check docker container logs.")
time.sleep(30)
@pytest.fixture(scope="session")
def xpdf_fixture(tika_fixture):
verify_installation = run(["pdftotext"], shell=True)
if verify_installation.returncode == 127:
if platform.startswith("linux"):
platform_id = "linux"
sudo_prefix = "sudo"
elif platform.startswith("darwin"):
platform_id = "mac"
# For Mac, generally sudo need password in interactive console.
# But most of the cases current user already have permission to copy to /user/local/bin.
# Hence removing sudo requirement for Mac.
sudo_prefix = ""
else:
raise Exception(
"""Currently auto installation of pdftotext is not supported on {0} platform """.format(platform)
)
commands = """ wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-{0}-4.02.tar.gz &&
tar -xvf xpdf-tools-{0}-4.02.tar.gz &&
{1} cp xpdf-tools-{0}-4.02/bin64/pdftotext /usr/local/bin""".format(platform_id, sudo_prefix)
run([commands], shell=True)
verify_installation = run(["pdftotext -v"], shell=True)
if verify_installation.returncode == 127:
raise Exception(
"""pdftotext is not installed. It is part of xpdf or poppler-utils software suite.
You can download for your OS from here: https://www.xpdfreader.com/download.html."""
)
@pytest.fixture(params=["elasticsearch", "faiss", "memory", "sql"])
def document_store(request, test_docs_xs, elasticsearch_fixture):
return get_document_store(request.param)
@pytest.fixture()
def test_docs_xs():
return [
# current "dict" format for a document
{"text": "My name is Carla and I live in Berlin", "meta": {"meta_field": "test1", "name": "filename1"}},
# meta_field at the top level for backward compatibility
{"text": "My name is Paul and I live in New York", "meta_field": "test2", "name": "filename2"},
# Document object for a doc
Document(text="My name is Christelle and I live in Paris", meta={"meta_field": "test3", "name": "filename3"})
]
@pytest.fixture(params=["farm", "transformers"])
def reader(request):
if request.param == "farm":
return FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad",
use_gpu=False, top_k_per_sample=5, num_processes=0)
if request.param == "transformers":
return TransformersReader(model="distilbert-base-uncased-distilled-squad",
tokenizer="distilbert-base-uncased",
use_gpu=-1)
# TODO Fix bug in test_no_answer_output when using
# @pytest.fixture(params=["farm", "transformers"])
@pytest.fixture(params=["farm"])
def no_answer_reader(request):
if request.param == "farm":
return FARMReader(model_name_or_path="deepset/roberta-base-squad2",
use_gpu=False, top_k_per_sample=5, no_ans_boost=0, num_processes=0)
if request.param == "transformers":
return TransformersReader(model="deepset/roberta-base-squad2",
tokenizer="deepset/roberta-base-squad2",
use_gpu=-1, top_k_per_candidate=5)
@pytest.fixture()
def prediction(reader, test_docs_xs):
docs = [Document.from_dict(d) if isinstance(d, dict) else d for d in test_docs_xs]
prediction = reader.predict(question="Who lives in Berlin?", documents=docs, top_k=5)
return prediction
@pytest.fixture()
def no_answer_prediction(no_answer_reader, test_docs_xs):
docs = [Document.from_dict(d) if isinstance(d, dict) else d for d in test_docs_xs]
prediction = no_answer_reader.predict(question="What is the meaning of life?", documents=docs, top_k=5)
return prediction
@pytest.fixture(params=["elasticsearch", "faiss", "memory", "sql"])
def document_store_with_docs(request, test_docs_xs, elasticsearch_fixture):
document_store = get_document_store(request.param)
document_store.write_documents(test_docs_xs)
return document_store
@pytest.fixture(params=["elasticsearch", "faiss", "memory", "sql"])
def document_store(request, test_docs_xs, elasticsearch_fixture):
return get_document_store(request.param)
def get_document_store(document_store_type):
if document_store_type == "sql":
if os.path.exists("haystack_test.db"):
os.remove("haystack_test.db")
document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")
elif document_store_type == "memory":
document_store = InMemoryDocumentStore()
elif document_store_type == "elasticsearch":
# make sure we start from a fresh index
client = Elasticsearch()
client.indices.delete(index='haystack_test*', ignore=[404])
document_store = ElasticsearchDocumentStore(index="haystack_test")
elif document_store_type == "faiss":
if os.path.exists("haystack_test_faiss.db"):
os.remove("haystack_test_faiss.db")
document_store = FAISSDocumentStore(sql_url="sqlite:///haystack_test_faiss.db")
else:
raise Exception(f"No document store fixture for '{document_store_type}'")
return document_store