Merge branch 'master' into automate_benchmarks

Branden Chan 2020-10-27 11:50:49 +01:00
commit d5cb227909
25 changed files with 218 additions and 98 deletions

View File

@ -24,7 +24,7 @@ jobs:
- name: Run Elasticsearch
  uses: elastic/elastic-github-actions/elasticsearch@25ad91e35aeee806711d335fc9dec7927ae49bc6
  with:
    stack-version: 7.6.0
    stack-version: 7.9.2
- name: Run Apache Tika
  run: docker run -d -p 9998:9998 apache/tika:1.24.1

View File

@ -101,7 +101,7 @@ class BaseDocumentStore(ABC):
pass
@abstractmethod
def get_document_count(self, index: Optional[str] = None) -> int:
def get_document_count(self, filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int:
pass
@abstractmethod
@ -109,7 +109,8 @@ class BaseDocumentStore(ABC):
query_emb: List[float],
filters: Optional[Dict[str, List[str]]] = None,
top_k: int = 10,
index: Optional[str] = None) -> List[Document]:
index: Optional[str] = None,
return_embedding: Optional[bool] = None) -> List[Document]:
pass
@abstractmethod
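
For orientation, a minimal sketch of what the two extended signatures above ask of a concrete subclass. The class below is hypothetical and elides the other abstract methods; only the two signatures come from this diff.

from typing import Dict, List, Optional

from haystack import Document
from haystack.document_store.base import BaseDocumentStore

class ToyDocumentStore(BaseDocumentStore):
    """Hypothetical store, only to illustrate the two new signatures."""

    def __init__(self):
        self.docs: Dict[str, Document] = {}

    def get_document_count(self, filters: Optional[Dict[str, List[str]]] = None,
                           index: Optional[str] = None) -> int:
        # `filters` is new: restrict the count to docs whose meta matches every key
        docs = list(self.docs.values())
        if filters:
            docs = [d for d in docs if all(d.meta.get(k) in v for k, v in filters.items())]
        return len(docs)

    def query_by_embedding(self, query_emb: List[float],
                           filters: Optional[Dict[str, List[str]]] = None,
                           top_k: int = 10,
                           index: Optional[str] = None,
                           return_embedding: Optional[bool] = None) -> List[Document]:
        # `return_embedding` is new: strip `doc.embedding` from results when False
        ...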

View File

@ -1,6 +1,7 @@
import json
import logging
import time
from copy import deepcopy
from string import Template
from typing import List, Optional, Union, Dict, Any
from elasticsearch import Elasticsearch
@ -42,6 +43,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
refresh_type: str = "wait_for",
similarity="dot_product",
timeout=30,
return_embedding: Optional[bool] = True,
):
"""
A DocumentStore using Elasticsearch to store and query the documents for our search.
@ -80,6 +82,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
:param similarity: The similarity function used to compare document vectors. 'dot_product' is the default since it is
                   more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence BERT model.
:param timeout: Number of seconds after which an ElasticSearch request times out.
:param return_embedding: Whether to return document embeddings with the results
"""
@ -99,6 +102,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
self.embedding_dim = embedding_dim
self.excluded_meta_data = excluded_meta_data
self.faq_question_field = faq_question_field
self.return_embedding = return_embedding
self.custom_mapping = custom_mapping
self.index: str = index
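
As a quick illustration of the new flag (not part of the diff; host and index are placeholder values for a local Elasticsearch):

from haystack.document_store.elasticsearch import ElasticsearchDocumentStore

# return_embedding=True is the new default; pass False to keep responses small
# when you only need text and metadata.
document_store = ElasticsearchDocumentStore(host="localhost", index="document",
                                            return_embedding=False)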
@ -302,10 +306,25 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
body = {"doc": meta}
self.client.update(index=self.index, doc_type="_doc", id=id, body=body, refresh=self.refresh_type)
def get_document_count(self, index: Optional[str] = None) -> int:
    if index is None:
        index = self.index
    result = self.client.count(index=index)

def get_document_count(self, filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int:
    index = index or self.index

    body: dict = {"query": {"bool": {}}}
    if filters:
        filter_clause = []
        for key, values in filters.items():
            if not isinstance(values, list):
                raise ValueError(
                    f'Wrong filter format for key "{key}": Please provide a list of allowed values for each key. '
                    'Example: {"name": ["some", "more"], "category": ["only_one"]} ')
            filter_clause.append(
                {
                    "terms": {key: values}
                }
            )
        body["query"]["bool"]["filter"] = filter_clause

    result = self.client.count(index=index, body=body)
    count = result["count"]
    return count
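
A hedged sketch of what a filtered count now sends to Elasticsearch (the field name is made up for illustration):

document_store.get_document_count(filters={"category": ["news", "blog"]})

# builds roughly this count request body:
# {"query": {"bool": {"filter": [
#     {"terms": {"category": ["news", "blog"]}}
# ]}}}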
@ -431,10 +450,13 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
query_emb: np.array,
filters: Optional[Dict[str, List[str]]] = None,
top_k: int = 10,
index: Optional[str] = None) -> List[Document]:
index: Optional[str] = None,
return_embedding: Optional[bool] = None) -> List[Document]:
    if index is None:
        index = self.index

    if return_embedding is None:
        return_embedding = self.return_embedding

    if not self.embedding_field:
        raise RuntimeError("Please specify arg `embedding_field` in ElasticsearchDocumentStore()")
    else:
@ -445,7 +467,8 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
"script_score": {
"query": {"match_all": {}},
"script": {
"source": f"{self.similarity_fn_name}(params.query_vector,'{self.embedding_field}') + 1.0",
# offset score to ensure a positive range as required by Elasticsearch
"source": f"{self.similarity_fn_name}(params.query_vector,'{self.embedding_field}') + 1000",
"params": {
"query_vector": query_emb.tolist()
}
@ -463,8 +486,20 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
"terms": filters
}
excluded_meta_data: Optional[list] = None
if self.excluded_meta_data:
body["_source"] = {"excludes": self.excluded_meta_data}
excluded_meta_data = deepcopy(self.excluded_meta_data)
if return_embedding is True and self.embedding_field in excluded_meta_data:
excluded_meta_data.remove(self.embedding_field)
elif return_embedding is False and self.embedding_field not in excluded_meta_data:
excluded_meta_data.append(self.embedding_field)
elif return_embedding is False:
excluded_meta_data = [self.embedding_field]
if excluded_meta_data:
body["_source"] = {"excludes": excluded_meta_data}
logger.debug(f"Retriever query: {body}")
result = self.client.search(index=index, body=body, request_timeout=300)["hits"]["hits"]
@ -482,7 +517,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
    score = hit["_score"] if hit["_score"] else None
    if score:
        if adapt_score_for_embedding:
            score -= 1
            score -= 1000
            probability = (score + 1) / 2  # scaling probability from cosine similarity
        else:
            probability = float(expit(np.asarray(score / 8)))  # scaling probability from TFIDF/BM25
@ -495,7 +530,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
score=score,
probability=probability,
question=hit["_source"].get(self.faq_question_field),
embedding=hit["_source"].get(self.embedding_field)
embedding=hit["_source"].get(self.embedding_field, None)
)
return document
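
The two +1000 changes belong together: Elasticsearch rejects negative scores, so the similarity is shifted up by 1000 in the script query and shifted back down here before being scaled into a probability. A small worked example, assuming cosine similarity:

raw_similarity = 0.6              # cosine similarity lies in [-1, 1]
es_score = raw_similarity + 1000  # value Elasticsearch returns (always positive)

score = es_score - 1000           # undo the offset -> 0.6
probability = (score + 1) / 2     # map [-1, 1] onto [0, 1] -> 0.8
assert abs(probability - 0.8) < 1e-9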

View File

@ -36,6 +36,7 @@ class FAISSDocumentStore(SQLDocumentStore):
vector_dim: int = 768,
faiss_index_factory_str: str = "Flat",
faiss_index: Optional[faiss.swigfaiss.Index] = None,
return_embedding: Optional[bool] = True,
**kwargs,
):
"""
@ -61,6 +62,7 @@ class FAISSDocumentStore(SQLDocumentStore):
Benchmarks: XXX
:param faiss_index: Pass an existing FAISS Index, i.e. an empty one that you configured manually
or one with docs that you used in Haystack before and want to load again.
:param return_embedding: Whether to return document embeddings with the results
"""
self.vector_dim = vector_dim
@ -70,6 +72,7 @@ class FAISSDocumentStore(SQLDocumentStore):
self.faiss_index = self._create_new_index(vector_dim=self.vector_dim, index_factory=faiss_index_factory_str, **kwargs)
self.index_buffer_size = index_buffer_size
self.return_embedding = return_embedding
super().__init__(url=sql_url)
def _create_new_index(self, vector_dim: int, index_factory: str = "Flat", metric_type=faiss.METRIC_INNER_PRODUCT, **kwargs):
@ -184,9 +187,12 @@ class FAISSDocumentStore(SQLDocumentStore):
self.faiss_index.reset()
super().delete_all_documents(index=index)
def query_by_embedding(
self, query_emb: np.array, filters: Optional[dict] = None, top_k: int = 10, index: Optional[str] = None
) -> List[Document]:
def query_by_embedding(self,
query_emb: np.array,
filters: Optional[dict] = None,
top_k: int = 10,
index: Optional[str] = None,
return_embedding: Optional[bool] = None) -> List[Document]:
"""
Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
@ -195,6 +201,7 @@ class FAISSDocumentStore(SQLDocumentStore):
Example: {"name": ["some", "more"], "category": ["only_one"]}
:param top_k: How many documents to return
:param index: (SQL) index name for storing the docs and metadata
:param return_embedding: Whether to return document embeddings with the results
:return:
"""
if filters:
@ -202,6 +209,9 @@ class FAISSDocumentStore(SQLDocumentStore):
    if not self.faiss_index:
        raise Exception("No index exists. Use `update_embeddings()` to create an index.")

    if return_embedding is None:
        return_embedding = self.return_embedding
    index = index or self.index

    query_emb = query_emb.reshape(1, -1).astype(np.float32)
    score_matrix, vector_id_matrix = self.faiss_index.search(query_emb, top_k)
    vector_ids_for_query = [str(vector_id) for vector_id in vector_id_matrix[0] if vector_id != -1]
@ -213,6 +223,9 @@ class FAISSDocumentStore(SQLDocumentStore):
    for doc in documents:
        doc.score = scores_for_vector_ids[doc.meta["vector_id"]]  # type: ignore
        doc.probability = (doc.score + 1) / 2
        if return_embedding is True:
            doc.embedding = self.faiss_index.reconstruct(int(doc.meta["vector_id"]))

    return documents
def save(self, file_path: Union[str, Path]):
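
A usage sketch for the FAISS side (assumes documents and embeddings were already written; note that `Index.reconstruct` only recovers vectors for index types that store them in full, such as the default "Flat" factory):

import numpy as np
from haystack.document_store.faiss import FAISSDocumentStore

document_store = FAISSDocumentStore(sql_url="sqlite:///my_faiss.db", return_embedding=True)
# ... document_store.write_documents(...) and
#     document_store.update_embeddings(retriever) happen here ...

query_emb = np.random.rand(768).astype(np.float32)  # stand-in for a real DPR query vector
for doc in document_store.query_by_embedding(query_emb=query_emb, top_k=3):
    # embeddings are reconstructed from the FAISS index only because return_embedding=True
    print(doc.id, doc.probability, None if doc.embedding is None else doc.embedding.shape)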

View File

@ -1,4 +1,5 @@
from typing import Any, Dict, List, Optional, Union
from copy import deepcopy
from typing import Dict, List, Optional, Union
from uuid import uuid4
from collections import defaultdict
@ -10,17 +11,19 @@ from haystack.retriever.base import BaseRetriever
import logging
logger = logging.getLogger(__name__)
class InMemoryDocumentStore(BaseDocumentStore):
"""
In-memory document store
"""
def __init__(self, embedding_field: Optional[str] = None):
def __init__(self, embedding_field: Optional[str] = "embedding", return_embedding: bool = True):
self.indexes: Dict[str, Dict] = defaultdict(dict)
self.index: str = "document"
self.label_index: str = "label"
self.embedding_field: str = "embedding"
self.embedding_dim : int = 768
self.embedding_field: str = embedding_field if embedding_field is not None else "embedding"
self.embedding_dim: int = 768
self.return_embedding: bool = return_embedding
def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
"""
@ -68,7 +71,8 @@ class InMemoryDocumentStore(BaseDocumentStore):
query_emb: List[float],
filters: Optional[Dict[str, List[str]]] = None,
top_k: int = 10,
index: Optional[str] = None) -> List[Document]:
index: Optional[str] = None,
return_embedding: Optional[bool] = None) -> List[Document]:
from numpy import dot
from numpy.linalg import norm
@ -79,19 +83,28 @@ class InMemoryDocumentStore(BaseDocumentStore):
"use a different DocumentStore (e.g. ElasticsearchDocumentStore).")
    index = index or self.index
    if return_embedding is None:
        return_embedding = self.return_embedding
    if query_emb is None:
        return []

    candidate_docs = []
    for idx, doc in self.indexes[index].items():
        doc.score = dot(query_emb, doc.embedding) / (
        new_document = Document(
            id=doc.id,
            text=doc.text,
            meta=deepcopy(doc.meta)
        )
        new_document.embedding = doc.embedding if return_embedding is True else None
        score = dot(query_emb, doc.embedding) / (
            norm(query_emb) * norm(doc.embedding)
        )
        doc.probability = (doc.score + 1) / 2
        candidate_docs.append(doc)
        new_document.score = score
        new_document.probability = (score + 1) / 2
    return sorted(candidate_docs, key=lambda x: x.score, reverse=True)[0:top_k]
        candidate_docs.append(new_document)

    return sorted(candidate_docs, key=lambda x: x.score if x.score is not None else 0.0, reverse=True)[0:top_k]
def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None):
"""
@ -122,9 +135,9 @@ class InMemoryDocumentStore(BaseDocumentStore):
for doc, emb in zip(docs, embeddings):
self.indexes[index][doc.id].embedding = emb
def get_document_count(self, index: Optional[str] = None) -> int:
index = index or self.index
return len(self.indexes[index].items())
def get_document_count(self, filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int:
documents = self.get_all_documents(index=index, filters=filters)
return len(documents)
def get_label_count(self, index: Optional[str] = None) -> int:
index = index or self.label_index
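
Two things are worth noting in the in-memory change: results are now fresh `Document` copies, so scoring no longer mutates the stored originals, and the embedding is attached to the copy only on request. A minimal sketch, assuming plain Python lists are accepted as embeddings:

from haystack import Document
from haystack.document_store.memory import InMemoryDocumentStore

store = InMemoryDocumentStore(return_embedding=False)
store.write_documents([Document(text="hello world", embedding=[0.1] * 768)])

results = store.query_by_embedding(query_emb=[0.1] * 768, top_k=1)
assert results[0].embedding is None  # stripped, because return_embedding=False
assert results[0].score is not None  # score lives on the returned copy, not the stored doc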

View File

@ -196,9 +196,17 @@ class SQLDocumentStore(BaseDocumentStore):
self.write_documents(docs, index=doc_index)
self.write_labels(labels, index=label_index)
def get_document_count(self, index=None) -> int:
def get_document_count(self, filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int:
    index = index or self.index
    return self.session.query(DocumentORM).filter_by(index=index).count()
    query = self.session.query(DocumentORM).filter_by(index=index)

    if filters:
        query = query.join(MetaORM)
        for key, values in filters.items():
            query = query.filter(MetaORM.name == key, MetaORM.value.in_(values))

    count = query.count()
    return count
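
The SQL variant expresses the same filter semantics as a join against the metadata table. A usage sketch, assuming dict-style documents with a `meta` field are accepted here as elsewhere in the codebase:

from haystack.document_store.sql import SQLDocumentStore

store = SQLDocumentStore(url="sqlite:///example.db")
store.write_documents([
    {"text": "doc one", "meta": {"category": "news"}},
    {"text": "doc two", "meta": {"category": "blog"}},
])

assert store.get_document_count() == 2
# joins DocumentORM -> MetaORM and keeps rows where name == "category" and value in ("news",)
assert store.get_document_count(filters={"category": ["news"]}) == 1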
def get_label_count(self, index: Optional[str] = None) -> int:
index = index or self.index
@ -232,7 +240,8 @@ class SQLDocumentStore(BaseDocumentStore):
query_emb: List[float],
filters: Optional[dict] = None,
top_k: int = 10,
index: Optional[str] = None) -> List[Document]:
index: Optional[str] = None,
return_embedding: Optional[bool] = None) -> List[Document]:
    raise NotImplementedError("SQLDocumentStore does not currently support embedding queries. "
                              "Change the query type (e.g. by choosing a different retriever) "

View File

@ -32,11 +32,13 @@ def eval_data_from_file(filename: str, n_docs: Union[int, bool]=None) -> Tuple[L
with open(filename, "r") as file:
    data = json.load(file)
    if "title" not in data["data"][0]:
        logger.warning(f"No title information found for documents in QA file: {filename}")

    for document in data["data"][:n_docs]:
        # get all extra fields from document level (e.g. title)
        meta_doc = {k: v for k, v in document.items() if k not in ("paragraphs", "title")}
        for paragraph in document["paragraphs"]:
            cur_meta = {"name": document["title"]}
            cur_meta = {"name": document.get("title", None)}
            # all other fields from paragraph level
            meta_paragraph = {k: v for k, v in paragraph.items() if k not in ("qas", "context")}
            cur_meta.update(meta_paragraph)

View File

@ -33,7 +33,7 @@ def elasticsearch_fixture():
shell=True
)
status = subprocess.run(
['docker run -d --name haystack_test_elastic -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.1'],
['docker run -d --name haystack_test_elastic -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'],
shell=True
)
if status.returncode:
@ -152,20 +152,25 @@ def no_answer_prediction(no_answer_reader, test_docs_xs):
def document_store_with_docs(request, test_docs_xs, elasticsearch_fixture):
    document_store = get_document_store(request.param)
    document_store.write_documents(test_docs_xs)
    return document_store
    yield document_store
    if isinstance(document_store, FAISSDocumentStore):
        document_store.faiss_index.reset()

@pytest.fixture(params=["elasticsearch", "faiss", "memory", "sql"])
def document_store(request, test_docs_xs, elasticsearch_fixture):
    return get_document_store(request.param)
    document_store = get_document_store(request.param)
    yield document_store
    if isinstance(document_store, FAISSDocumentStore):
        document_store.faiss_index.reset()
@pytest.fixture(params=["es_filter_only", "elsticsearch", "dpr", "embedding", "tfidf"])
@pytest.fixture(params=["es_filter_only", "elasticsearch", "dpr", "embedding", "tfidf"])
def retriever(request, document_store):
return get_retriever(request.param, document_store)
@pytest.fixture(params=["es_filter_only", "elsticsearch", "dpr", "embedding", "tfidf"])
@pytest.fixture(params=["es_filter_only", "elasticsearch", "dpr", "embedding", "tfidf"])
def retriever_with_docs(request, document_store_with_docs):
return get_retriever(request.param, document_store_with_docs)
@ -176,16 +181,16 @@ def get_document_store(document_store_type):
        os.remove("haystack_test.db")
        document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore()
        document_store = InMemoryDocumentStore(return_embedding=False)
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(index="haystack_test")
        document_store = ElasticsearchDocumentStore(index="haystack_test", return_embedding=False)
    elif document_store_type == "faiss":
        if os.path.exists("haystack_test_faiss.db"):
            os.remove("haystack_test_faiss.db")
        document_store = FAISSDocumentStore(sql_url="sqlite:///haystack_test_faiss.db")
        document_store = FAISSDocumentStore(sql_url="sqlite:///haystack_test_faiss.db", return_embedding=False)
    else:
        raise Exception(f"No document store fixture for '{document_store_type}'")
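
The fixtures above switch from `return` to `yield` so that teardown code can run after each test; this is the standard pytest idiom, shown here on a self-contained toy resource:

import pytest

@pytest.fixture
def shared_list():
    data = []      # setup (in conftest.py: get_document_store + write_documents)
    yield data     # the test body runs while the fixture is suspended here
    data.clear()   # teardown (in conftest.py: faiss_index.reset() for FAISS stores)

def test_append(shared_list):
    shared_list.append(1)
    assert shared_list == [1]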
@ -206,7 +211,7 @@ def get_retriever(retriever_type, document_store):
        retriever = EmbeddingRetriever(document_store=document_store,
                                       embedding_model="deepset/sentence_bert",
                                       use_gpu=False)
    elif retriever_type == "elsticsearch":
    elif retriever_type == "elasticsearch":
        retriever = ElasticsearchRetriever(document_store=document_store)
    elif retriever_type == "es_filter_only":
        retriever = ElasticsearchFilterOnlyRetriever(document_store=document_store)

View File

@ -65,6 +65,19 @@ def test_get_documents_by_id(document_store_with_docs):
assert doc.text == documents[0].text
def test_get_document_count(document_store):
    documents = [
        {"text": "text1", "id": "1", "meta_field_for_count": "a"},
        {"text": "text2", "id": "2", "meta_field_for_count": "b"},
        {"text": "text3", "id": "3", "meta_field_for_count": "b"},
        {"text": "text4", "id": "4", "meta_field_for_count": "b"},
    ]
    document_store.write_documents(documents)
    assert document_store.get_document_count() == 4
    assert document_store.get_document_count(filters={"meta_field_for_count": ["a"]}) == 1
    assert document_store.get_document_count(filters={"meta_field_for_count": ["b"]}) == 3
def test_write_document_meta(document_store):
documents = [
{"text": "dict_without_meta", "id": "1"},

View File

@ -5,9 +5,11 @@ from haystack import Document
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
@pytest.mark.slow
@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory"], indirect=True)
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
def test_dpr_inmemory_retrieval(document_store, retriever):
@pytest.mark.parametrize("return_embedding", [True, False])
def test_dpr_retrieval(document_store, retriever, return_embedding):
documents = [
Document(
@ -31,12 +33,13 @@ def test_dpr_inmemory_retrieval(document_store, retriever):
)
]
    document_store.delete_all_documents(index="test_dpr")
    document_store.write_documents(documents, index="test_dpr")
    document_store.update_embeddings(retriever=retriever, index="test_dpr")
    time.sleep(2)

    document_store.return_embedding = return_embedding
    document_store.write_documents(documents)
    document_store.update_embeddings(retriever=retriever)

    docs_with_emb = document_store.get_all_documents(index="test_dpr")
    time.sleep(1)
    docs_with_emb = document_store.get_all_documents()

    # FAISSDocumentStore doesn't return embeddings, so these tests only work with ElasticsearchDocumentStore
    if isinstance(document_store, ElasticsearchDocumentStore):
@ -46,8 +49,13 @@ def test_dpr_inmemory_retrieval(document_store, retriever):
        assert (abs(docs_with_emb[2].embedding[0] - (-0.24695)) < 0.001)
        assert (abs(docs_with_emb[3].embedding[0] - (-0.08017)) < 0.001)
        assert (abs(docs_with_emb[4].embedding[0] - (-0.01534)) < 0.001)

    res = retriever.retrieve(query="Which philosopher attacked Schopenhauer?", index="test_dpr")
    res = retriever.retrieve(query="Which philosopher attacked Schopenhauer?")
    assert res[0].meta["name"] == "1"

    # clean up
    document_store.delete_all_documents(index="test_dpr")

    # test embedding
    if return_embedding is True:
        assert res[0].embedding is not None
    else:
        assert res[0].embedding is None

View File

@ -2,7 +2,7 @@ import pytest
@pytest.mark.parametrize("document_store_with_docs", [("elasticsearch")], indirect=True)
@pytest.mark.parametrize("retriever_with_docs", ["elsticsearch"], indirect=True)
@pytest.mark.parametrize("retriever_with_docs", ["elasticsearch"], indirect=True)
def test_elasticsearch_retrieval(retriever_with_docs, document_store_with_docs):
res = retriever_with_docs.retrieve(query="Who lives in Berlin?")
assert res[0].text == "My name is Carla and I live in Berlin"
@ -11,7 +11,7 @@ def test_elasticsearch_retrieval(retriever_with_docs, document_store_with_docs):
@pytest.mark.parametrize("document_store_with_docs", [("elasticsearch")], indirect=True)
@pytest.mark.parametrize("retriever_with_docs", ["elsticsearch"], indirect=True)
@pytest.mark.parametrize("retriever_with_docs", ["elasticsearch"], indirect=True)
def test_elasticsearch_retrieval_filters(retriever_with_docs, document_store_with_docs):
res = retriever_with_docs.retrieve(query="Who lives in Berlin?", filters={"name": ["filename1"]})
assert res[0].text == "My name is Carla and I live in Berlin"

View File

@ -2,6 +2,7 @@ import pytest
from haystack import Finder
@pytest.mark.slow
@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory"], indirect=True)
@pytest.mark.parametrize("retriever", ["embedding"], indirect=True)
def test_embedding_retriever(retriever, document_store):

View File

@ -61,7 +61,7 @@ def test_eval_reader(reader, document_store: BaseDocumentStore):
@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
@pytest.mark.parametrize("open_domain", [True, False])
@pytest.mark.parametrize("retriever", ["elsticsearch"], indirect=True)
@pytest.mark.parametrize("retriever", ["elasticsearch"], indirect=True)
def test_eval_elastic_retriever(document_store: BaseDocumentStore, open_domain, retriever):
# add eval data (SQUAD format)
document_store.delete_all_documents(index="test_eval_document")
@ -81,7 +81,7 @@ def test_eval_elastic_retriever(document_store: BaseDocumentStore, open_domain,
@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
@pytest.mark.parametrize("reader", ["farm"], indirect=True)
@pytest.mark.parametrize("retriever", ["elsticsearch"], indirect=True)
@pytest.mark.parametrize("retriever", ["elasticsearch"], indirect=True)
def test_eval_finder(document_store: BaseDocumentStore, reader, retriever):
finder = Finder(reader=reader, retriever=retriever)

View File

@ -76,6 +76,7 @@ def test_faiss_write_docs(document_store, index_buffer_size, batch_size):
check_data_correctness(documents_indexed, DOCUMENTS)
@pytest.mark.slow
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@pytest.mark.parametrize("index_buffer_size", [10_000, 2])

View File

@ -2,6 +2,7 @@ from haystack import Finder
import pytest
@pytest.mark.slow
@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
def test_finder_get_answers(reader, retriever_with_docs, document_store_with_docs):
finder = Finder(reader, retriever_with_docs)
@ -31,6 +32,7 @@ def test_finder_offsets(reader, retriever_with_docs, document_store_with_docs):
assert prediction["answers"][0]["context"][start:end] == prediction["answers"][0]["answer"]
@pytest.mark.slow
@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
def test_finder_get_answers_single_result(reader, retriever_with_docs, document_store_with_docs):
finder = Finder(reader, retriever_with_docs)

View File

@ -1,11 +1,12 @@
import math
import pytest
from haystack import Document
from haystack.reader.base import BaseReader
from haystack.reader.farm import FARMReader
def test_reader_basic(reader):
assert reader is not None
assert isinstance(reader, BaseReader)
@ -23,6 +24,7 @@ def test_output(prediction):
assert len(prediction["answers"]) == 5
@pytest.mark.slow
def test_no_answer_output(no_answer_prediction):
assert no_answer_prediction is not None
assert no_answer_prediction["question"] == "What is the meaning of life?"
@ -38,9 +40,12 @@ def test_no_answer_output(no_answer_prediction):
assert answers.count(None) == 1
assert len(no_answer_prediction["answers"]) == 5
# TODO Directly compare farm and transformers reader outputs
# TODO checks to see that model is responsive to input arguments e.g. context_window_size - topk
@pytest.mark.slow
def test_prediction_attributes(prediction):
# TODO FARM's prediction also has no_ans_gap
attributes_gold = ["question", "answers"]
@ -57,37 +62,42 @@ def test_answer_attributes(prediction):
assert ag in answer
def test_context_window_size(test_docs_xs):
    # TODO parametrize window_size and farm/transformers reader using pytest

@pytest.mark.slow
@pytest.mark.parametrize("reader", ["farm"], indirect=True)
@pytest.mark.parametrize("window_size", [10, 15, 20])
def test_context_window_size(reader, test_docs_xs, window_size):
    docs = [Document.from_dict(d) if isinstance(d, dict) else d for d in test_docs_xs]
    for window_size in [10, 15, 20]:
        farm_reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", num_processes=0,
                                 use_gpu=False, top_k_per_sample=5, no_ans_boost=None, context_window_size=window_size)
        prediction = farm_reader.predict(question="Who lives in Berlin?", documents=docs, top_k=5)
        for answer in prediction["answers"]:
            # If the extracted answer is larger than the context window, the context window is expanded.
            # If the extracted answer is odd in length, the resulting context window is one less than context_window_size
            # due to rounding (See FARM's QACandidate)
            # TODO Currently the behaviour of context_window_size in FARMReader and TransformerReader is different
            if len(answer["answer"]) <= window_size:
                assert len(answer["context"]) in [window_size, window_size-1]
            else:
                assert len(answer["answer"]) == len(answer["context"])
    # TODO Need to test transformers reader

    if isinstance(reader, FARMReader):
        reader.inferencer.model.prediction_heads[0].context_window_size = window_size

    prediction = reader.predict(question="Who lives in Berlin?", documents=docs, top_k=5)

    for answer in prediction["answers"]:
        # If the extracted answer is larger than the context window, the context window is expanded.
        # If the extracted answer is odd in length, the resulting context window is one less than context_window_size
        # due to rounding (See FARM's QACandidate)
        # TODO Currently the behaviour of context_window_size in FARMReader and TransformerReader is different
        if len(answer["answer"]) <= window_size:
            assert len(answer["context"]) in [window_size, window_size - 1]
        else:
            assert len(answer["answer"]) == len(answer["context"])

    # TODO Need to test transformers reader
    # TODO Currently the behaviour of context_window_size in FARMReader and TransformerReader is different
# TODO Currently the behaviour of context_window_size in FARMReader and TransformerReader is different
def test_top_k(test_docs_xs):
    # TODO parametrize top_k and farm/transformers reader using pytest
    # TODO transformers reader was crashing when tested on this

@pytest.mark.parametrize("reader", ["farm"], indirect=True)
@pytest.mark.parametrize("top_k", [2, 5, 10])
def test_top_k(reader, test_docs_xs, top_k):
    docs = [Document.from_dict(d) if isinstance(d, dict) else d for d in test_docs_xs]
    farm_reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", num_processes=0,
                             use_gpu=False, top_k_per_sample=4, no_ans_boost=None, top_k_per_candidate=4)
    for top_k in [2, 5, 10]:
        prediction = farm_reader.predict(question="Who lives in Berlin?", documents=docs, top_k=top_k)
        assert len(prediction["answers"]) == top_k

    reader.top_k_per_candidate = 4
    if isinstance(reader, FARMReader):
        reader.inferencer.model.prediction_heads[0].n_best = reader.top_k_per_candidate + 1
        try:
            reader.inferencer.model.prediction_heads[0].n_best_per_sample = 4
        except Exception:
            print("WARNING: Could not set `top_k_per_sample` in FARM. Please update FARM version.")

    prediction = reader.predict(question="Who lives in Berlin?", documents=docs, top_k=top_k)
    assert len(prediction["answers"]) == top_k

View File

@ -18,6 +18,7 @@ def get_test_client_and_override_dependencies(reader, document_store_with_docs):
return TestClient(app)
@pytest.mark.slow
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
@pytest.mark.parametrize("reader", ["farm"], indirect=True)
def test_query_api(reader, document_store_with_docs):

View File

@ -30,7 +30,8 @@
"#! pip install farm-haystack\n",
"\n",
"# Install the latest master of Haystack\n",
"!pip install git+https://github.com/deepset-ai/haystack.git"
"!pip install git+https://github.com/deepset-ai/haystack.git\n",
"!pip install urllib3==1.25.4"
]
},
{

View File

@ -25,7 +25,8 @@
"#! pip install farm-haystack\n",
"\n",
"# Install the latest master of Haystack\n",
"!pip install git+https://github.com/deepset-ai/haystack.git"
"!pip install git+https://github.com/deepset-ai/haystack.git\n",
"!pip install urllib3==1.25.4"
]
},
{

View File

@ -25,7 +25,8 @@
"#! pip install farm-haystack\n",
"\n",
"# Install the latest master of Haystack\n",
"!pip install git+https://github.com/deepset-ai/haystack.git"
"!pip install git+https://github.com/deepset-ai/haystack.git\n",
"!pip install urllib3==1.25.4"
]
},
{

View File

@ -33,7 +33,8 @@
"#! pip install farm-haystack\n",
"\n",
"# Install the latest master of Haystack\n",
"!pip install git+https://github.com/deepset-ai/haystack.git"
"!pip install git+https://github.com/deepset-ai/haystack.git\n",
"!pip install urllib3==1.25.4"
]
},
{
@ -70,7 +71,7 @@
"outputs": [],
"source": [
"# Recommended: Start Elasticsearch using Docker\n",
"# ! docker run -d -p 9200:9200 -e \"discovery.type=single-node\" elasticsearch:7.6.2"
"# ! docker run -d -p 9200:9200 -e \"discovery.type=single-node\" elasticsearch:7.9.2"
]
},
{
@ -80,13 +81,13 @@
"outputs": [],
"source": [
"# In Colab / No Docker environments: Start Elasticsearch from source\n",
"! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q\n",
"! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz\n",
"! chown -R daemon:daemon elasticsearch-7.6.2\n",
"! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q\n",
"! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz\n",
"! chown -R daemon:daemon elasticsearch-7.9.2\n",
"\n",
"import os\n",
"from subprocess import Popen, PIPE, STDOUT\n",
"es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],\n",
"es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],\n",
" stdout=PIPE, stderr=STDOUT,\n",
" preexec_fn=lambda: os.setuid(1) # as daemon\n",
" )\n",
@ -250,4 +251,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View File

@ -26,7 +26,7 @@ LAUNCH_ELASTICSEARCH=True
if LAUNCH_ELASTICSEARCH:
logging.info("Starting Elasticsearch ...")
status = subprocess.run(
['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2'], shell=True
['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'], shell=True
)
if status.returncode:
raise Exception("Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance"

View File

@ -46,7 +46,8 @@
"#! pip install farm-haystack\n",
"\n",
"# Install the latest master of Haystack\n",
"!pip install git+https://github.com/deepset-ai/haystack.git"
"!pip install git+https://github.com/deepset-ai/haystack.git\n",
"!pip install urllib3==1.25.4"
]
},
{
@ -64,13 +65,13 @@
"outputs": [],
"source": [
"# In Colab / No Docker environments: Start Elasticsearch from source\n",
"! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q\n",
"! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz\n",
"! chown -R daemon:daemon elasticsearch-7.6.2\n",
"! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q\n",
"! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz\n",
"! chown -R daemon:daemon elasticsearch-7.9.2\n",
"\n",
"import os\n",
"from subprocess import Popen, PIPE, STDOUT\n",
"es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],\n",
"es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],\n",
" stdout=PIPE, stderr=STDOUT,\n",
" preexec_fn=lambda: os.setuid(1) # as daemon\n",
" )\n",

View File

@ -34,7 +34,7 @@ device, n_gpu = initialize_device_settings(use_cuda=True)
if LAUNCH_ELASTICSEARCH:
logging.info("Starting Elasticsearch ...")
status = subprocess.run(
['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2'], shell=True
['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'], shell=True
)
if status.returncode:
raise Exception("Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance"

View File

@ -285,7 +285,8 @@
"#! pip install farm-haystack\n",
"\n",
"# Install the latest master of Haystack\n",
"!pip install git+https://github.com/deepset-ai/haystack.git"
"!pip install git+https://github.com/deepset-ai/haystack.git\n",
"!pip install urllib3==1.25.4"
]
},
{