haystack/test/benchmarks/utils.py

import os
from haystack.document_store.sql import SQLDocumentStore
from haystack.document_store.memory import InMemoryDocumentStore
from haystack.document_store.elasticsearch import Elasticsearch, ElasticsearchDocumentStore
from haystack.document_store.faiss import FAISSDocumentStore
from haystack.retriever.sparse import ElasticsearchRetriever, TfidfRetriever
from haystack.retriever.dense import DensePassageRetriever
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
import logging
import subprocess
import time

from pathlib import Path
logger = logging.getLogger(__name__)


# Reader model names and reader backends referenced by the reader benchmarks.
reader_models = [
    "deepset/roberta-base-squad2",
    "deepset/minilm-uncased-squad2",
    "deepset/bert-base-cased-squad2",
    "deepset/bert-large-uncased-whole-word-masking-squad2",
    "deepset/xlm-roberta-large-squad2",
]
reader_types = ["farm"]

# Location (relative to the working directory) and file name of the reader evaluation data.
data_dir_reader = Path("../../data") / "squad20"
filename_reader = "dev-v2.0.json"

# Index names used when writing documents and labels to a document store.
doc_index = "eval_document"
label_index = "label"

# Maps the supported FAISS document store type names to the FAISS index factory string.
_FAISS_INDEX_TYPES = {"faiss_flat": "Flat", "faiss_hnsw": "HNSW"}


def _run_docker(*args):
    """Run ``docker <args>`` and return its exit code, or ``None`` if docker could not be executed."""
    try:
        return subprocess.run(["docker", *args]).returncode
    except OSError as err:
        # Best effort: a missing docker binary must not abort here, the caller decides what to do.
        logger.warning("Could not execute docker: %s", err)
        return None


def _launch_fresh_postgres():
    """(Re)create the ``haystack-postgres`` container and an empty ``haystack`` database in it.

    Failures are logged instead of raised so that the subsequent connection attempt
    reports the actual problem.
    """
    # Removing the old container fails harmlessly when none exists, so the exit code is ignored.
    _run_docker("rm", "-f", "haystack-postgres")
    time.sleep(1)
    started = _run_docker("run", "--name", "haystack-postgres", "-p", "5432:5432",
                          "-e", "POSTGRES_PASSWORD=password", "-d", "postgres")
    if started != 0:
        logger.warning("Starting the 'haystack-postgres' container failed (exit code: %s)", started)
    # Give the Postgres server a moment to come up before talking to it.
    time.sleep(3)
    # No "-it" here: psql runs a single command and requesting a TTY makes
    # "docker exec" fail when the benchmark itself is not attached to a terminal.
    created = _run_docker("exec", "haystack-postgres", "psql", "-U", "postgres",
                          "-c", "CREATE DATABASE haystack;")
    if created != 0:
        logger.warning("Creating the 'haystack' database failed (exit code: %s)", created)
    time.sleep(1)


def get_document_store(document_store_type, es_similarity='cosine'):
    """Create a document store of the requested type.

    TODO This method is taken from test/conftest.py but maybe should be within Haystack.
    Perhaps a class method of DocStore that just takes string for type of DocStore

    :param document_store_type: One of "sql", "memory", "elasticsearch", "faiss_flat" or "faiss_hnsw".
    :param es_similarity: Similarity passed to ElasticsearchDocumentStore; only used for "elasticsearch".
    :return: The newly created document store.
    :raises ValueError: If ``document_store_type`` is not one of the supported names.
    """
    if document_store_type == "sql":
        # Start from an empty SQLite file.
        if os.path.exists("haystack_test.db"):
            os.remove("haystack_test.db")
        document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore()
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        # NOTE(review): only indices matching 'haystack_test*' are deleted here, while the store
        # below uses the 'eval_document' index - confirm whether that index should be wiped too.
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(index="eval_document", similarity=es_similarity)
    elif document_store_type in _FAISS_INDEX_TYPES:
        index_type = _FAISS_INDEX_TYPES[document_store_type]
        # TEMP FIX for issue with deleting docs: always start from a brand-new Postgres instance.
        _launch_fresh_postgres()
        document_store = FAISSDocumentStore(sql_url="postgresql://postgres:password@localhost:5432/haystack",
                                            faiss_index_factory_str=index_type)
    else:
        raise ValueError(f"No document store fixture for '{document_store_type}'")
    return document_store

def get_retriever(retriever_name, doc_store):
    """Instantiate the retriever selected by ``retriever_name`` on top of ``doc_store``.

    :param retriever_name: "elastic", "tfidf" or "dpr".
    :param doc_store: Document store handed to the retriever.
    :return: The retriever instance, or None when ``retriever_name`` is not recognized.
    """
    retriever = None
    if retriever_name == "elastic":
        retriever = ElasticsearchRetriever(doc_store)
    elif retriever_name == "tfidf":
        retriever = TfidfRetriever(doc_store)
    elif retriever_name == "dpr":
        retriever = DensePassageRetriever(
            document_store=doc_store,
            query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
            passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
            use_gpu=True,
        )
    return retriever

def get_reader(reader_name, reader_type, max_seq_len=384):
    """Instantiate a reader of the given type for the given model.

    :param reader_name: Model name or path passed as first argument to the reader class.
    :param reader_type: "farm" (FARMReader) or "transformers" (TransformersReader).
    :param max_seq_len: Maximum sequence length forwarded to the reader.
    :return: The reader instance, created with ``top_k_per_candidate=4``.
    :raises ValueError: If ``reader_type`` is not one of the supported names.
    """
    if reader_type == "farm":
        reader_class = FARMReader
    elif reader_type == "transformers":
        reader_class = TransformersReader
    else:
        # Fail with a clear message instead of "'NoneType' object is not callable".
        raise ValueError(f"Unknown reader_type '{reader_type}', expected 'farm' or 'transformers'")
    return reader_class(reader_name, top_k_per_candidate=4, max_seq_len=max_seq_len)

def index_to_doc_store(doc_store, docs, retriever, labels=None):
    """Write documents (and optionally labels) to the document store and compute embeddings if needed.

    :param doc_store: Document store to write to.
    :param docs: Documents to index; written to the ``doc_index`` index.
    :param retriever: Retriever used to compute embeddings. Only retrievers that expose a callable
                      ``embed_passages`` trigger an embedding update; anything else (including None)
                      is ignored.
    :param labels: Optional labels; written to the ``label_index`` index when given.
    """
    doc_store.write_documents(docs, doc_index)
    if labels:
        doc_store.write_labels(labels, index=label_index)
    # The embedding update is skipped if the docs.embedding field is already populated with
    # precomputed embeddings. See the prepare_data() fn in the retriever benchmark script.
    # It is checked independently of `labels`, so supplying labels no longer suppresses it,
    # and an empty `docs` list is tolerated.
    can_embed = callable(getattr(retriever, "embed_passages", None))
    if can_embed and docs and docs[0].embedding is None:
        doc_store.update_embeddings(retriever, index=doc_index)
Create time and performance benchmarks for all readers and retrievers (#339) * add time and perf benchmark for es * Add retriever benchmarking * Add Reader benchmarking * add nq to squad conversion * add conversion stats * clean benchmarks * Add link to dataset * Update imports * add first support for neg psgs * Refactor test * set max_seq_len * cleanup benchmark * begin retriever speed benchmarking * Add support for retriever query index benchmarking * improve reader eval, retriever speed benchmarking * improve retriever speed benchmarking * Add retriever accuracy benchmark * Add neg doc shuffling * Add top_n * 3x speedup of SQL. add postgres docker run. make shuffle neg a param. add more logging * Add models to sweep * add option for faiss index type * remove unneeded line * change faiss to faiss_flat * begin automatic benchmark script * remove existing postgres docker for benchmarking * Add data processing scripts * Remove shuffle in script bc data already shuffled * switch hnsw setup from 256 to 128 * change es similarity to dot product by default * Error includes stack trace * Change ES default timeout * remove delete_docs() from timing for indexing * Add support for website export * update website on push to benchmarks * add complete benchmarks results * new json format * removed NaN as is not a valid json token * fix benchmarking for faiss hnsw queries. do sql calls in update_embeddings() as batches * update benchmarks for hnsw 128,20,80 * don't delete full index in delete_all_documents() * update texts for charts * update recall column for retriever * change scale and add units to desc * add units to legend * add axis titles. update desc * add html tags Co-authored-by: deepset <deepset@Crenolape.localdomain> Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai> Co-authored-by: PiffPaffM <markuspaff.mp@gmail.com> 2020-10-12 13:34:42 +02:00			`import os`
			`from haystack.document_store.sql import SQLDocumentStore`
			`from haystack.document_store.memory import InMemoryDocumentStore`
			`from haystack.document_store.elasticsearch import Elasticsearch, ElasticsearchDocumentStore`
			`from haystack.document_store.faiss import FAISSDocumentStore`
			`from haystack.retriever.sparse import ElasticsearchRetriever, TfidfRetriever`
			`from haystack.retriever.dense import DensePassageRetriever`
			`from haystack.reader.farm import FARMReader`
			`from haystack.reader.transformers import TransformersReader`
			`import logging`
			`import subprocess`
			`import time`

			`from pathlib import Path`
			`logger = logging.getLogger(__name__)`


			`reader_models = ["deepset/roberta-base-squad2", "deepset/minilm-uncased-squad2", "deepset/bert-base-cased-squad2", "deepset/bert-large-uncased-whole-word-masking-squad2", "deepset/xlm-roberta-large-squad2"]`
			`reader_types = ["farm"]`
			`data_dir_reader = Path("../../data/squad20")`
			`filename_reader = "dev-v2.0.json"`

			`doc_index = "eval_document"`
			`label_index = "label"`

			`def get_document_store(document_store_type, es_similarity='cosine'):`
			`""" TODO This method is taken from test/conftest.py but maybe should be within Haystack.`
			`Perhaps a class method of DocStore that just takes string for type of DocStore"""`
			`if document_store_type == "sql":`
			`if os.path.exists("haystack_test.db"):`
			`os.remove("haystack_test.db")`
			`document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")`
			`elif document_store_type == "memory":`
			`document_store = InMemoryDocumentStore()`
			`elif document_store_type == "elasticsearch":`
			`# make sure we start from a fresh index`
			`client = Elasticsearch()`
			`client.indices.delete(index='haystack_test*', ignore=[404])`
			`document_store = ElasticsearchDocumentStore(index="eval_document", similarity=es_similarity)`
			`elif document_store_type in("faiss_flat", "faiss_hnsw"):`
			`if document_store_type == "faiss_flat":`
			`index_type = "Flat"`
			`elif document_store_type == "faiss_hnsw":`
			`index_type = "HNSW"`

			`#TEMP FIX for issue with deleting docs`
			`# status = subprocess.run(`
			`# ['docker rm -f haystack-postgres'],`
			`# shell=True)`
			`# time.sleep(3)`
			`# try:`
			`# document_store = FAISSDocumentStore(sql_url="postgresql://postgres:password@localhost:5432/haystack",`
			`# faiss_index_factory_str=index_type)`
			`# except:`
			`# Launch a postgres instance & create empty DB`
			`# logger.info("Didn't find Postgres. Start a new instance...")`
			`status = subprocess.run(`
			`['docker rm -f haystack-postgres'],`
			`shell=True)`
			`time.sleep(1)`
			`status = subprocess.run(`
			`['docker run --name haystack-postgres -p 5432:5432 -e POSTGRES_PASSWORD=password -d postgres'],`
			`shell=True)`
			`time.sleep(3)`
			`status = subprocess.run(`
			`['docker exec -it haystack-postgres psql -U postgres -c "CREATE DATABASE haystack;"'], shell=True)`
			`time.sleep(1)`
			`document_store = FAISSDocumentStore(sql_url="postgresql://postgres:password@localhost:5432/haystack",`
			`faiss_index_factory_str=index_type)`

			`else:`
			`raise Exception(f"No document store fixture for '{document_store_type}'")`
			`return document_store`

			`def get_retriever(retriever_name, doc_store):`
			`if retriever_name == "elastic":`
			`return ElasticsearchRetriever(doc_store)`
			`if retriever_name == "tfidf":`
			`return TfidfRetriever(doc_store)`
			`if retriever_name == "dpr":`
			`return DensePassageRetriever(document_store=doc_store,`
			`query_embedding_model="facebook/dpr-question_encoder-single-nq-base",`
			`passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",`
			`use_gpu=True)`

			`def get_reader(reader_name, reader_type, max_seq_len=384):`
			`reader_class = None`
			`if reader_type == "farm":`
			`reader_class = FARMReader`
			`elif reader_type == "transformers":`
			`reader_class = TransformersReader`
			`return reader_class(reader_name, top_k_per_candidate=4, max_seq_len=max_seq_len)`

			`def index_to_doc_store(doc_store, docs, retriever, labels=None):`
			`doc_store.write_documents(docs, doc_index)`
			`if labels:`
			`doc_store.write_labels(labels, index=label_index)`
			`# these lines are not run if the docs.embedding field is already populated with precomputed embeddings`
			`# See the prepare_data() fn in the retriever benchmark script`
			`elif callable(getattr(retriever, "embed_passages", None)) and docs[0].embedding is None:`
			`doc_store.update_embeddings(retriever, index=doc_index)`