mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-06 03:57:19 +00:00
Merge branch 'master' into automate_benchmarks
This commit is contained in:
commit
d5cb227909
2
.github/workflows/ci.yml
vendored
2
.github/workflows/ci.yml
vendored
@ -24,7 +24,7 @@ jobs:
|
||||
- name: Run Elasticsearch
|
||||
uses: elastic/elastic-github-actions/elasticsearch@25ad91e35aeee806711d335fc9dec7927ae49bc6
|
||||
with:
|
||||
stack-version: 7.6.0
|
||||
stack-version: 7.9.2
|
||||
|
||||
- name: Run Apache Tika
|
||||
run: docker run -d -p 9998:9998 apache/tika:1.24.1
|
||||
|
||||
@ -101,7 +101,7 @@ class BaseDocumentStore(ABC):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_document_count(self, index: Optional[str] = None) -> int:
|
||||
def get_document_count(self, filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
@ -109,7 +109,8 @@ class BaseDocumentStore(ABC):
|
||||
query_emb: List[float],
|
||||
filters: Optional[Optional[Dict[str, List[str]]]] = None,
|
||||
top_k: int = 10,
|
||||
index: Optional[str] = None) -> List[Document]:
|
||||
index: Optional[str] = None,
|
||||
return_embedding: Optional[bool] = None) -> List[Document]:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from copy import deepcopy
|
||||
from string import Template
|
||||
from typing import List, Optional, Union, Dict, Any
|
||||
from elasticsearch import Elasticsearch
|
||||
@ -42,6 +43,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
refresh_type: str = "wait_for",
|
||||
similarity="dot_product",
|
||||
timeout=30,
|
||||
return_embedding: Optional[bool] = True,
|
||||
):
|
||||
"""
|
||||
A DocumentStore using Elasticsearch to store and query the documents for our search.
|
||||
@ -80,6 +82,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
:param similarity: The similarity function used to compare document vectors. 'dot_product' is the default since it is
|
||||
more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence BERT model.
|
||||
:param timeout: Number of seconds after which an ElasticSearch request times out.
|
||||
:param return_embedding: Whether to return the document embeddings.
|
||||
|
||||
|
||||
"""
|
||||
@ -99,6 +102,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
self.embedding_dim = embedding_dim
|
||||
self.excluded_meta_data = excluded_meta_data
|
||||
self.faq_question_field = faq_question_field
|
||||
self.return_embedding = return_embedding
|
||||
|
||||
self.custom_mapping = custom_mapping
|
||||
self.index: str = index
|
||||
@ -302,10 +306,25 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
body = {"doc": meta}
|
||||
self.client.update(index=self.index, doc_type="_doc", id=id, body=body, refresh=self.refresh_type)
|
||||
|
||||
def get_document_count(self, index: Optional[str] = None) -> int:
|
||||
if index is None:
|
||||
index = self.index
|
||||
result = self.client.count(index=index)
|
||||
def get_document_count(self, filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int:
|
||||
index = index or self.index
|
||||
|
||||
body: dict = {"query": {"bool": {}}}
|
||||
if filters:
|
||||
filter_clause = []
|
||||
for key, values in filters.items():
|
||||
if type(values) != list:
|
||||
raise ValueError(
|
||||
f'Wrong filter format for key "{key}": Please provide a list of allowed values for each key. '
|
||||
'Example: {"name": ["some", "more"], "category": ["only_one"]} ')
|
||||
filter_clause.append(
|
||||
{
|
||||
"terms": {key: values}
|
||||
}
|
||||
)
|
||||
body["query"]["bool"]["filter"] = filter_clause
|
||||
|
||||
result = self.client.count(index=index, body=body)
|
||||
count = result["count"]
|
||||
return count
|
||||
|
||||
@ -431,10 +450,13 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
query_emb: np.array,
|
||||
filters: Optional[Dict[str, List[str]]] = None,
|
||||
top_k: int = 10,
|
||||
index: Optional[str] = None) -> List[Document]:
|
||||
index: Optional[str] = None,
|
||||
return_embedding: Optional[bool] = None) -> List[Document]:
|
||||
if index is None:
|
||||
index = self.index
|
||||
|
||||
return_embedding = return_embedding or self.return_embedding
|
||||
|
||||
if not self.embedding_field:
|
||||
raise RuntimeError("Please specify arg `embedding_field` in ElasticsearchDocumentStore()")
|
||||
else:
|
||||
@ -445,7 +467,8 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
"script_score": {
|
||||
"query": {"match_all": {}},
|
||||
"script": {
|
||||
"source": f"{self.similarity_fn_name}(params.query_vector,'{self.embedding_field}') + 1.0",
|
||||
# offset score to ensure a positive range as required by Elasticsearch
|
||||
"source": f"{self.similarity_fn_name}(params.query_vector,'{self.embedding_field}') + 1000",
|
||||
"params": {
|
||||
"query_vector": query_emb.tolist()
|
||||
}
|
||||
@ -463,8 +486,20 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
"terms": filters
|
||||
}
|
||||
|
||||
excluded_meta_data: Optional[list] = None
|
||||
|
||||
if self.excluded_meta_data:
|
||||
body["_source"] = {"excludes": self.excluded_meta_data}
|
||||
excluded_meta_data = deepcopy(self.excluded_meta_data)
|
||||
|
||||
if return_embedding is True and self.embedding_field in excluded_meta_data:
|
||||
excluded_meta_data.remove(self.embedding_field)
|
||||
elif return_embedding is False and self.embedding_field not in excluded_meta_data:
|
||||
excluded_meta_data.append(self.embedding_field)
|
||||
elif return_embedding is False:
|
||||
excluded_meta_data = [self.embedding_field]
|
||||
|
||||
if excluded_meta_data:
|
||||
body["_source"] = {"excludes": excluded_meta_data}
|
||||
|
||||
logger.debug(f"Retriever query: {body}")
|
||||
result = self.client.search(index=index, body=body, request_timeout=300)["hits"]["hits"]
|
||||
@ -482,7 +517,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
score = hit["_score"] if hit["_score"] else None
|
||||
if score:
|
||||
if adapt_score_for_embedding:
|
||||
score -= 1
|
||||
score -= 1000
|
||||
probability = (score + 1) / 2 # scaling probability from cosine similarity
|
||||
else:
|
||||
probability = float(expit(np.asarray(score / 8))) # scaling probability from TFIDF/BM25
|
||||
@ -495,7 +530,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
score=score,
|
||||
probability=probability,
|
||||
question=hit["_source"].get(self.faq_question_field),
|
||||
embedding=hit["_source"].get(self.embedding_field)
|
||||
embedding=hit["_source"].get(self.embedding_field, None)
|
||||
)
|
||||
return document
|
||||
|
||||
|
||||
@ -36,6 +36,7 @@ class FAISSDocumentStore(SQLDocumentStore):
|
||||
vector_dim: int = 768,
|
||||
faiss_index_factory_str: str = "Flat",
|
||||
faiss_index: Optional[faiss.swigfaiss.Index] = None,
|
||||
return_embedding: Optional[bool] = True,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
@ -61,6 +62,7 @@ class FAISSDocumentStore(SQLDocumentStore):
|
||||
Benchmarks: XXX
|
||||
:param faiss_index: Pass an existing FAISS Index, i.e. an empty one that you configured manually
|
||||
or one with docs that you used in Haystack before and want to load again.
|
||||
:param return_embedding: Whether to return the document embeddings.
|
||||
"""
|
||||
self.vector_dim = vector_dim
|
||||
|
||||
@ -70,6 +72,7 @@ class FAISSDocumentStore(SQLDocumentStore):
|
||||
self.faiss_index = self._create_new_index(vector_dim=self.vector_dim, index_factory=faiss_index_factory_str, **kwargs)
|
||||
|
||||
self.index_buffer_size = index_buffer_size
|
||||
self.return_embedding = return_embedding
|
||||
super().__init__(url=sql_url)
|
||||
|
||||
def _create_new_index(self, vector_dim: int, index_factory: str = "Flat", metric_type=faiss.METRIC_INNER_PRODUCT, **kwargs):
|
||||
@ -184,9 +187,12 @@ class FAISSDocumentStore(SQLDocumentStore):
|
||||
self.faiss_index.reset()
|
||||
super().delete_all_documents(index=index)
|
||||
|
||||
def query_by_embedding(
|
||||
self, query_emb: np.array, filters: Optional[dict] = None, top_k: int = 10, index: Optional[str] = None
|
||||
) -> List[Document]:
|
||||
def query_by_embedding(self,
|
||||
query_emb: np.array,
|
||||
filters: Optional[dict] = None,
|
||||
top_k: int = 10,
|
||||
index: Optional[str] = None,
|
||||
return_embedding: Optional[bool] = None) -> List[Document]:
|
||||
"""
|
||||
Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
|
||||
|
||||
@ -195,6 +201,7 @@ class FAISSDocumentStore(SQLDocumentStore):
|
||||
Example: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
:param top_k: How many documents to return
|
||||
:param index: (SQL) index name for storing the docs and metadata
|
||||
:param return_embedding: Whether to return the document embeddings.
|
||||
:return:
|
||||
"""
|
||||
if filters:
|
||||
@ -202,6 +209,9 @@ class FAISSDocumentStore(SQLDocumentStore):
|
||||
if not self.faiss_index:
|
||||
raise Exception("No index exists. Use 'update_embeddings()` to create an index.")
|
||||
|
||||
return_embedding = return_embedding or self.return_embedding
|
||||
index = index or self.index
|
||||
|
||||
query_emb = query_emb.reshape(1, -1).astype(np.float32)
|
||||
score_matrix, vector_id_matrix = self.faiss_index.search(query_emb, top_k)
|
||||
vector_ids_for_query = [str(vector_id) for vector_id in vector_id_matrix[0] if vector_id != -1]
|
||||
@ -213,6 +223,9 @@ class FAISSDocumentStore(SQLDocumentStore):
|
||||
for doc in documents:
|
||||
doc.score = scores_for_vector_ids[doc.meta["vector_id"]] # type: ignore
|
||||
doc.probability = (doc.score + 1) / 2
|
||||
if return_embedding is True:
|
||||
doc.embedding = self.faiss_index.reconstruct(int(doc.meta["vector_id"]))
|
||||
|
||||
return documents
|
||||
|
||||
def save(self, file_path: Union[str, Path]):
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
from copy import deepcopy
|
||||
from typing import Dict, List, Optional, Union
|
||||
from uuid import uuid4
|
||||
from collections import defaultdict
|
||||
|
||||
@ -10,17 +11,19 @@ from haystack.retriever.base import BaseRetriever
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class InMemoryDocumentStore(BaseDocumentStore):
|
||||
"""
|
||||
In-memory document store
|
||||
"""
|
||||
|
||||
def __init__(self, embedding_field: Optional[str] = None):
|
||||
def __init__(self, embedding_field: Optional[str] = "embedding", return_embedding: bool = True):
|
||||
self.indexes: Dict[str, Dict] = defaultdict(dict)
|
||||
self.index: str = "document"
|
||||
self.label_index: str = "label"
|
||||
self.embedding_field: str = "embedding"
|
||||
self.embedding_dim : int = 768
|
||||
self.embedding_field: str = embedding_field if embedding_field is not None else "embedding"
|
||||
self.embedding_dim: int = 768
|
||||
self.return_embedding: bool = return_embedding
|
||||
|
||||
def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
|
||||
"""
|
||||
@ -68,7 +71,8 @@ class InMemoryDocumentStore(BaseDocumentStore):
|
||||
query_emb: List[float],
|
||||
filters: Optional[Dict[str, List[str]]] = None,
|
||||
top_k: int = 10,
|
||||
index: Optional[str] = None) -> List[Document]:
|
||||
index: Optional[str] = None,
|
||||
return_embedding: Optional[bool] = None) -> List[Document]:
|
||||
|
||||
from numpy import dot
|
||||
from numpy.linalg import norm
|
||||
@ -79,19 +83,28 @@ class InMemoryDocumentStore(BaseDocumentStore):
|
||||
"use a different DocumentStore (e.g. ElasticsearchDocumentStore).")
|
||||
|
||||
index = index or self.index
|
||||
return_embedding = return_embedding or self.return_embedding
|
||||
|
||||
if query_emb is None:
|
||||
return []
|
||||
|
||||
candidate_docs = []
|
||||
for idx, doc in self.indexes[index].items():
|
||||
doc.score = dot(query_emb, doc.embedding) / (
|
||||
new_document = Document(
|
||||
id=doc.id,
|
||||
text=doc.text,
|
||||
meta=deepcopy(doc.meta)
|
||||
)
|
||||
new_document.embedding = doc.embedding if return_embedding is True else None
|
||||
score = dot(query_emb, doc.embedding) / (
|
||||
norm(query_emb) * norm(doc.embedding)
|
||||
)
|
||||
doc.probability = (doc.score + 1) / 2
|
||||
candidate_docs.append(doc)
|
||||
new_document.score = score
|
||||
new_document.probability = (score + 1) / 2
|
||||
|
||||
return sorted(candidate_docs, key=lambda x: x.score, reverse=True)[0:top_k]
|
||||
candidate_docs.append(new_document)
|
||||
|
||||
return sorted(candidate_docs, key=lambda x: x.score if x.score is not None else 0.0, reverse=True)[0:top_k]
|
||||
|
||||
def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None):
|
||||
"""
|
||||
@ -122,9 +135,9 @@ class InMemoryDocumentStore(BaseDocumentStore):
|
||||
for doc, emb in zip(docs, embeddings):
|
||||
self.indexes[index][doc.id].embedding = emb
|
||||
|
||||
def get_document_count(self, index: Optional[str] = None) -> int:
|
||||
index = index or self.index
|
||||
return len(self.indexes[index].items())
|
||||
def get_document_count(self, filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int:
|
||||
documents = self.get_all_documents(index=index, filters=filters)
|
||||
return len(documents)
|
||||
|
||||
def get_label_count(self, index: Optional[str] = None) -> int:
|
||||
index = index or self.label_index
|
||||
|
||||
@ -196,9 +196,17 @@ class SQLDocumentStore(BaseDocumentStore):
|
||||
self.write_documents(docs, index=doc_index)
|
||||
self.write_labels(labels, index=label_index)
|
||||
|
||||
def get_document_count(self, index=None) -> int:
|
||||
def get_document_count(self, filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int:
|
||||
index = index or self.index
|
||||
return self.session.query(DocumentORM).filter_by(index=index).count()
|
||||
query = self.session.query(DocumentORM).filter_by(index=index)
|
||||
|
||||
if filters:
|
||||
query = query.join(MetaORM)
|
||||
for key, values in filters.items():
|
||||
query = query.filter(MetaORM.name == key, MetaORM.value.in_(values))
|
||||
|
||||
count = query.count()
|
||||
return count
|
||||
|
||||
def get_label_count(self, index: Optional[str] = None) -> int:
|
||||
index = index or self.index
|
||||
@ -232,7 +240,8 @@ class SQLDocumentStore(BaseDocumentStore):
|
||||
query_emb: List[float],
|
||||
filters: Optional[dict] = None,
|
||||
top_k: int = 10,
|
||||
index: Optional[str] = None) -> List[Document]:
|
||||
index: Optional[str] = None,
|
||||
return_embedding: Optional[bool] = None) -> List[Document]:
|
||||
|
||||
raise NotImplementedError("SQLDocumentStore is currently not supporting embedding queries. "
|
||||
"Change the query type (e.g. by choosing a different retriever) "
|
||||
|
||||
@ -32,11 +32,13 @@ def eval_data_from_file(filename: str, n_docs: Union[int, bool]=None) -> Tuple[L
|
||||
|
||||
with open(filename, "r") as file:
|
||||
data = json.load(file)
|
||||
if "title" not in data["data"][0]:
|
||||
logger.warning(f"No title information found for documents in QA file: {filename}")
|
||||
for document in data["data"][:n_docs]:
|
||||
# get all extra fields from document level (e.g. title)
|
||||
meta_doc = {k: v for k, v in document.items() if k not in ("paragraphs", "title")}
|
||||
for paragraph in document["paragraphs"]:
|
||||
cur_meta = {"name": document["title"]}
|
||||
cur_meta = {"name": document.get("title", None)}
|
||||
# all other fields from paragraph level
|
||||
meta_paragraph = {k: v for k, v in paragraph.items() if k not in ("qas", "context")}
|
||||
cur_meta.update(meta_paragraph)
|
||||
|
||||
@ -33,7 +33,7 @@ def elasticsearch_fixture():
|
||||
shell=True
|
||||
)
|
||||
status = subprocess.run(
|
||||
['docker run -d --name haystack_test_elastic -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.1'],
|
||||
['docker run -d --name haystack_test_elastic -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'],
|
||||
shell=True
|
||||
)
|
||||
if status.returncode:
|
||||
@ -152,20 +152,25 @@ def no_answer_prediction(no_answer_reader, test_docs_xs):
|
||||
def document_store_with_docs(request, test_docs_xs, elasticsearch_fixture):
|
||||
document_store = get_document_store(request.param)
|
||||
document_store.write_documents(test_docs_xs)
|
||||
return document_store
|
||||
yield document_store
|
||||
if isinstance(document_store, FAISSDocumentStore):
|
||||
document_store.faiss_index.reset()
|
||||
|
||||
|
||||
@pytest.fixture(params=["elasticsearch", "faiss", "memory", "sql"])
|
||||
def document_store(request, test_docs_xs, elasticsearch_fixture):
|
||||
return get_document_store(request.param)
|
||||
document_store = get_document_store(request.param)
|
||||
yield document_store
|
||||
if isinstance(document_store, FAISSDocumentStore):
|
||||
document_store.faiss_index.reset()
|
||||
|
||||
|
||||
@pytest.fixture(params=["es_filter_only", "elsticsearch", "dpr", "embedding", "tfidf"])
|
||||
@pytest.fixture(params=["es_filter_only", "elasticsearch", "dpr", "embedding", "tfidf"])
|
||||
def retriever(request, document_store):
|
||||
return get_retriever(request.param, document_store)
|
||||
|
||||
|
||||
@pytest.fixture(params=["es_filter_only", "elsticsearch", "dpr", "embedding", "tfidf"])
|
||||
@pytest.fixture(params=["es_filter_only", "elasticsearch", "dpr", "embedding", "tfidf"])
|
||||
def retriever_with_docs(request, document_store_with_docs):
|
||||
return get_retriever(request.param, document_store_with_docs)
|
||||
|
||||
@ -176,16 +181,16 @@ def get_document_store(document_store_type):
|
||||
os.remove("haystack_test.db")
|
||||
document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")
|
||||
elif document_store_type == "memory":
|
||||
document_store = InMemoryDocumentStore()
|
||||
document_store = InMemoryDocumentStore(return_embedding=False)
|
||||
elif document_store_type == "elasticsearch":
|
||||
# make sure we start from a fresh index
|
||||
client = Elasticsearch()
|
||||
client.indices.delete(index='haystack_test*', ignore=[404])
|
||||
document_store = ElasticsearchDocumentStore(index="haystack_test")
|
||||
document_store = ElasticsearchDocumentStore(index="haystack_test", return_embedding=False)
|
||||
elif document_store_type == "faiss":
|
||||
if os.path.exists("haystack_test_faiss.db"):
|
||||
os.remove("haystack_test_faiss.db")
|
||||
document_store = FAISSDocumentStore(sql_url="sqlite:///haystack_test_faiss.db")
|
||||
document_store = FAISSDocumentStore(sql_url="sqlite:///haystack_test_faiss.db", return_embedding=False)
|
||||
else:
|
||||
raise Exception(f"No document store fixture for '{document_store_type}'")
|
||||
|
||||
@ -206,7 +211,7 @@ def get_retriever(retriever_type, document_store):
|
||||
retriever = EmbeddingRetriever(document_store=document_store,
|
||||
embedding_model="deepset/sentence_bert",
|
||||
use_gpu=False)
|
||||
elif retriever_type == "elsticsearch":
|
||||
elif retriever_type == "elasticsearch":
|
||||
retriever = ElasticsearchRetriever(document_store=document_store)
|
||||
elif retriever_type == "es_filter_only":
|
||||
retriever = ElasticsearchFilterOnlyRetriever(document_store=document_store)
|
||||
|
||||
@ -65,6 +65,19 @@ def test_get_documents_by_id(document_store_with_docs):
|
||||
assert doc.text == documents[0].text
|
||||
|
||||
|
||||
def test_get_document_count(document_store):
|
||||
documents = [
|
||||
{"text": "text1", "id": "1", "meta_field_for_count": "a"},
|
||||
{"text": "text2", "id": "2", "meta_field_for_count": "b"},
|
||||
{"text": "text3", "id": "3", "meta_field_for_count": "b"},
|
||||
{"text": "text4", "id": "4", "meta_field_for_count": "b"},
|
||||
]
|
||||
document_store.write_documents(documents)
|
||||
assert document_store.get_document_count() == 4
|
||||
assert document_store.get_document_count(filters={"meta_field_for_count": ["a"]}) == 1
|
||||
assert document_store.get_document_count(filters={"meta_field_for_count": ["b"]}) == 3
|
||||
|
||||
|
||||
def test_write_document_meta(document_store):
|
||||
documents = [
|
||||
{"text": "dict_without_meta", "id": "1"},
|
||||
|
||||
@ -5,9 +5,11 @@ from haystack import Document
|
||||
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory"], indirect=True)
|
||||
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
|
||||
def test_dpr_inmemory_retrieval(document_store, retriever):
|
||||
@pytest.mark.parametrize("return_embedding", [True, False])
|
||||
def test_dpr_retrieval(document_store, retriever, return_embedding):
|
||||
|
||||
documents = [
|
||||
Document(
|
||||
@ -31,12 +33,13 @@ def test_dpr_inmemory_retrieval(document_store, retriever):
|
||||
)
|
||||
]
|
||||
|
||||
document_store.delete_all_documents(index="test_dpr")
|
||||
document_store.write_documents(documents, index="test_dpr")
|
||||
document_store.update_embeddings(retriever=retriever, index="test_dpr")
|
||||
time.sleep(2)
|
||||
document_store.return_embedding = return_embedding
|
||||
document_store.write_documents(documents)
|
||||
document_store.update_embeddings(retriever=retriever)
|
||||
|
||||
docs_with_emb = document_store.get_all_documents(index="test_dpr")
|
||||
time.sleep(1)
|
||||
|
||||
docs_with_emb = document_store.get_all_documents()
|
||||
|
||||
# FAISSDocumentStore doesn't return embeddings, so these tests only work with ElasticsearchDocumentStore
|
||||
if isinstance(document_store, ElasticsearchDocumentStore):
|
||||
@ -46,8 +49,13 @@ def test_dpr_inmemory_retrieval(document_store, retriever):
|
||||
assert (abs(docs_with_emb[2].embedding[0] - (-0.24695)) < 0.001)
|
||||
assert (abs(docs_with_emb[3].embedding[0] - (-0.08017)) < 0.001)
|
||||
assert (abs(docs_with_emb[4].embedding[0] - (-0.01534)) < 0.001)
|
||||
res = retriever.retrieve(query="Which philosopher attacked Schopenhauer?", index="test_dpr")
|
||||
|
||||
res = retriever.retrieve(query="Which philosopher attacked Schopenhauer?")
|
||||
|
||||
assert res[0].meta["name"] == "1"
|
||||
|
||||
# clean up
|
||||
document_store.delete_all_documents(index="test_dpr")
|
||||
# test embedding
|
||||
if return_embedding is True:
|
||||
assert res[0].embedding is not None
|
||||
else:
|
||||
assert res[0].embedding is None
|
||||
|
||||
@ -2,7 +2,7 @@ import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize("document_store_with_docs", [("elasticsearch")], indirect=True)
|
||||
@pytest.mark.parametrize("retriever_with_docs", ["elsticsearch"], indirect=True)
|
||||
@pytest.mark.parametrize("retriever_with_docs", ["elasticsearch"], indirect=True)
|
||||
def test_elasticsearch_retrieval(retriever_with_docs, document_store_with_docs):
|
||||
res = retriever_with_docs.retrieve(query="Who lives in Berlin?")
|
||||
assert res[0].text == "My name is Carla and I live in Berlin"
|
||||
@ -11,7 +11,7 @@ def test_elasticsearch_retrieval(retriever_with_docs, document_store_with_docs):
|
||||
|
||||
|
||||
@pytest.mark.parametrize("document_store_with_docs", [("elasticsearch")], indirect=True)
|
||||
@pytest.mark.parametrize("retriever_with_docs", ["elsticsearch"], indirect=True)
|
||||
@pytest.mark.parametrize("retriever_with_docs", ["elasticsearch"], indirect=True)
|
||||
def test_elasticsearch_retrieval_filters(retriever_with_docs, document_store_with_docs):
|
||||
res = retriever_with_docs.retrieve(query="Who lives in Berlin?", filters={"name": ["filename1"]})
|
||||
assert res[0].text == "My name is Carla and I live in Berlin"
|
||||
|
||||
@ -2,6 +2,7 @@ import pytest
|
||||
from haystack import Finder
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory"], indirect=True)
|
||||
@pytest.mark.parametrize("retriever", ["embedding"], indirect=True)
|
||||
def test_embedding_retriever(retriever, document_store):
|
||||
|
||||
@ -61,7 +61,7 @@ def test_eval_reader(reader, document_store: BaseDocumentStore):
|
||||
|
||||
@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
|
||||
@pytest.mark.parametrize("open_domain", [True, False])
|
||||
@pytest.mark.parametrize("retriever", ["elsticsearch"], indirect=True)
|
||||
@pytest.mark.parametrize("retriever", ["elasticsearch"], indirect=True)
|
||||
def test_eval_elastic_retriever(document_store: BaseDocumentStore, open_domain, retriever):
|
||||
# add eval data (SQUAD format)
|
||||
document_store.delete_all_documents(index="test_eval_document")
|
||||
@ -81,7 +81,7 @@ def test_eval_elastic_retriever(document_store: BaseDocumentStore, open_domain,
|
||||
|
||||
@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
|
||||
@pytest.mark.parametrize("reader", ["farm"], indirect=True)
|
||||
@pytest.mark.parametrize("retriever", ["elsticsearch"], indirect=True)
|
||||
@pytest.mark.parametrize("retriever", ["elasticsearch"], indirect=True)
|
||||
def test_eval_finder(document_store: BaseDocumentStore, reader, retriever):
|
||||
finder = Finder(reader=reader, retriever=retriever)
|
||||
|
||||
|
||||
@ -76,6 +76,7 @@ def test_faiss_write_docs(document_store, index_buffer_size, batch_size):
|
||||
check_data_correctness(documents_indexed, DOCUMENTS)
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
|
||||
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
|
||||
@pytest.mark.parametrize("index_buffer_size", [10_000, 2])
|
||||
|
||||
@ -2,6 +2,7 @@ from haystack import Finder
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
|
||||
def test_finder_get_answers(reader, retriever_with_docs, document_store_with_docs):
|
||||
finder = Finder(reader, retriever_with_docs)
|
||||
@ -31,6 +32,7 @@ def test_finder_offsets(reader, retriever_with_docs, document_store_with_docs):
|
||||
assert prediction["answers"][0]["context"][start:end] == prediction["answers"][0]["answer"]
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
|
||||
def test_finder_get_answers_single_result(reader, retriever_with_docs, document_store_with_docs):
|
||||
finder = Finder(reader, retriever_with_docs)
|
||||
|
||||
@ -1,11 +1,12 @@
|
||||
import math
|
||||
|
||||
import pytest
|
||||
|
||||
from haystack import Document
|
||||
from haystack.reader.base import BaseReader
|
||||
from haystack.reader.farm import FARMReader
|
||||
|
||||
|
||||
|
||||
def test_reader_basic(reader):
|
||||
assert reader is not None
|
||||
assert isinstance(reader, BaseReader)
|
||||
@ -23,6 +24,7 @@ def test_output(prediction):
|
||||
assert len(prediction["answers"]) == 5
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_no_answer_output(no_answer_prediction):
|
||||
assert no_answer_prediction is not None
|
||||
assert no_answer_prediction["question"] == "What is the meaning of life?"
|
||||
@ -38,9 +40,12 @@ def test_no_answer_output(no_answer_prediction):
|
||||
assert answers.count(None) == 1
|
||||
assert len(no_answer_prediction["answers"]) == 5
|
||||
|
||||
|
||||
# TODO Directly compare farm and transformers reader outputs
|
||||
# TODO checks to see that model is responsive to input arguments e.g. context_window_size - topk
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_prediction_attributes(prediction):
|
||||
# TODO FARM's prediction also has no_ans_gap
|
||||
attributes_gold = ["question", "answers"]
|
||||
@ -57,37 +62,42 @@ def test_answer_attributes(prediction):
|
||||
assert ag in answer
|
||||
|
||||
|
||||
def test_context_window_size(test_docs_xs):
|
||||
# TODO parametrize window_size and farm/transformers reader using pytest
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.parametrize("reader", ["farm"], indirect=True)
|
||||
@pytest.mark.parametrize("window_size", [10, 15, 20])
|
||||
def test_context_window_size(reader, test_docs_xs, window_size):
|
||||
docs = [Document.from_dict(d) if isinstance(d, dict) else d for d in test_docs_xs]
|
||||
for window_size in [10, 15, 20]:
|
||||
farm_reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", num_processes=0,
|
||||
use_gpu=False, top_k_per_sample=5, no_ans_boost=None, context_window_size=window_size)
|
||||
prediction = farm_reader.predict(question="Who lives in Berlin?", documents=docs, top_k=5)
|
||||
for answer in prediction["answers"]:
|
||||
# If the extracted answer is larger than the context window, the context window is expanded.
|
||||
# If the extracted answer is odd in length, the resulting context window is one less than context_window_size
|
||||
# due to rounding (See FARM's QACandidate)
|
||||
# TODO Currently the behaviour of context_window_size in FARMReader and TransformerReader is different
|
||||
if len(answer["answer"]) <= window_size:
|
||||
assert len(answer["context"]) in [window_size, window_size-1]
|
||||
else:
|
||||
assert len(answer["answer"]) == len(answer["context"])
|
||||
|
||||
# TODO Need to test transformers reader
|
||||
if isinstance(reader, FARMReader):
|
||||
reader.inferencer.model.prediction_heads[0].context_window_size = window_size
|
||||
|
||||
prediction = reader.predict(question="Who lives in Berlin?", documents=docs, top_k=5)
|
||||
for answer in prediction["answers"]:
|
||||
# If the extracted answer is larger than the context window, the context window is expanded.
|
||||
# If the extracted answer is odd in length, the resulting context window is one less than context_window_size
|
||||
# due to rounding (See FARM's QACandidate)
|
||||
# TODO Currently the behaviour of context_window_size in FARMReader and TransformerReader is different
|
||||
if len(answer["answer"]) <= window_size:
|
||||
assert len(answer["context"]) in [window_size, window_size - 1]
|
||||
else:
|
||||
assert len(answer["answer"]) == len(answer["context"])
|
||||
|
||||
# TODO Need to test transformers reader
|
||||
# TODO Currently the behaviour of context_window_size in FARMReader and TransformerReader is different
|
||||
|
||||
|
||||
def test_top_k(test_docs_xs):
|
||||
# TODO parametrize top_k and farm/transformers reader using pytest
|
||||
# TODO transformers reader was crashing when tested on this
|
||||
|
||||
@pytest.mark.parametrize("reader", ["farm"], indirect=True)
|
||||
@pytest.mark.parametrize("top_k", [2, 5, 10])
|
||||
def test_top_k(reader, test_docs_xs, top_k):
|
||||
docs = [Document.from_dict(d) if isinstance(d, dict) else d for d in test_docs_xs]
|
||||
farm_reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", num_processes=0,
|
||||
use_gpu=False, top_k_per_sample=4, no_ans_boost=None, top_k_per_candidate=4)
|
||||
for top_k in [2, 5, 10]:
|
||||
prediction = farm_reader.predict(question="Who lives in Berlin?", documents=docs, top_k=top_k)
|
||||
assert len(prediction["answers"]) == top_k
|
||||
|
||||
|
||||
reader.top_k_per_candidate = 4
|
||||
if isinstance(reader, FARMReader):
|
||||
reader.inferencer.model.prediction_heads[0].n_best = reader.top_k_per_candidate + 1
|
||||
try:
|
||||
reader.inferencer.model.prediction_heads[0].n_best_per_sample = 4
|
||||
except:
|
||||
print("WARNING: Could not set `top_k_per_sample` in FARM. Please update FARM version.")
|
||||
|
||||
prediction = reader.predict(question="Who lives in Berlin?", documents=docs, top_k=top_k)
|
||||
assert len(prediction["answers"]) == top_k
|
||||
|
||||
@ -18,6 +18,7 @@ def get_test_client_and_override_dependencies(reader, document_store_with_docs):
|
||||
return TestClient(app)
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
|
||||
@pytest.mark.parametrize("reader", ["farm"], indirect=True)
|
||||
def test_query_api(reader, document_store_with_docs):
|
||||
|
||||
@ -30,7 +30,8 @@
|
||||
"#! pip install farm-haystack\n",
|
||||
"\n",
|
||||
"# Install the latest master of Haystack\n",
|
||||
"!pip install git+https://github.com/deepset-ai/haystack.git"
|
||||
"!pip install git+https://github.com/deepset-ai/haystack.git\n",
|
||||
"!pip install urllib3==1.25.4"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@ -25,7 +25,8 @@
|
||||
"#! pip install farm-haystack\n",
|
||||
"\n",
|
||||
"# Install the latest master of Haystack\n",
|
||||
"!pip install git+https://github.com/deepset-ai/haystack.git"
|
||||
"!pip install git+https://github.com/deepset-ai/haystack.git\n",
|
||||
"!pip install urllib3==1.25.4"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@ -25,7 +25,8 @@
|
||||
"#! pip install farm-haystack\n",
|
||||
"\n",
|
||||
"# Install the latest master of Haystack\n",
|
||||
"!pip install git+https://github.com/deepset-ai/haystack.git"
|
||||
"!pip install git+https://github.com/deepset-ai/haystack.git\n",
|
||||
"!pip install urllib3==1.25.4"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@ -33,7 +33,8 @@
|
||||
"#! pip install farm-haystack\n",
|
||||
"\n",
|
||||
"# Install the latest master of Haystack\n",
|
||||
"!pip install git+https://github.com/deepset-ai/haystack.git"
|
||||
"!pip install git+https://github.com/deepset-ai/haystack.git\n",
|
||||
"!pip install urllib3==1.25.4"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -70,7 +71,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Recommended: Start Elasticsearch using Docker\n",
|
||||
"# ! docker run -d -p 9200:9200 -e \"discovery.type=single-node\" elasticsearch:7.6.2"
|
||||
"# ! docker run -d -p 9200:9200 -e \"discovery.type=single-node\" elasticsearch:7.9.2"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -80,13 +81,13 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# In Colab / No Docker environments: Start Elasticsearch from source\n",
|
||||
"! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q\n",
|
||||
"! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz\n",
|
||||
"! chown -R daemon:daemon elasticsearch-7.6.2\n",
|
||||
"! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q\n",
|
||||
"! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz\n",
|
||||
"! chown -R daemon:daemon elasticsearch-7.9.2\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"from subprocess import Popen, PIPE, STDOUT\n",
|
||||
"es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],\n",
|
||||
"es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],\n",
|
||||
" stdout=PIPE, stderr=STDOUT,\n",
|
||||
" preexec_fn=lambda: os.setuid(1) # as daemon\n",
|
||||
" )\n",
|
||||
@ -250,4 +251,4 @@
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
}
|
||||
@ -26,7 +26,7 @@ LAUNCH_ELASTICSEARCH=True
|
||||
if LAUNCH_ELASTICSEARCH:
|
||||
logging.info("Starting Elasticsearch ...")
|
||||
status = subprocess.run(
|
||||
['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2'], shell=True
|
||||
['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'], shell=True
|
||||
)
|
||||
if status.returncode:
|
||||
raise Exception("Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance"
|
||||
|
||||
@ -46,7 +46,8 @@
|
||||
"#! pip install farm-haystack\n",
|
||||
"\n",
|
||||
"# Install the latest master of Haystack\n",
|
||||
"!pip install git+https://github.com/deepset-ai/haystack.git"
|
||||
"!pip install git+https://github.com/deepset-ai/haystack.git\n",
|
||||
"!pip install urllib3==1.25.4"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -64,13 +65,13 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# In Colab / No Docker environments: Start Elasticsearch from source\n",
|
||||
"! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q\n",
|
||||
"! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz\n",
|
||||
"! chown -R daemon:daemon elasticsearch-7.6.2\n",
|
||||
"! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q\n",
|
||||
"! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz\n",
|
||||
"! chown -R daemon:daemon elasticsearch-7.9.2\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"from subprocess import Popen, PIPE, STDOUT\n",
|
||||
"es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],\n",
|
||||
"es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],\n",
|
||||
" stdout=PIPE, stderr=STDOUT,\n",
|
||||
" preexec_fn=lambda: os.setuid(1) # as daemon\n",
|
||||
" )\n",
|
||||
|
||||
@ -34,7 +34,7 @@ device, n_gpu = initialize_device_settings(use_cuda=True)
|
||||
if LAUNCH_ELASTICSEARCH:
|
||||
logging.info("Starting Elasticsearch ...")
|
||||
status = subprocess.run(
|
||||
['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2'], shell=True
|
||||
['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'], shell=True
|
||||
)
|
||||
if status.returncode:
|
||||
raise Exception("Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance"
|
||||
|
||||
@ -285,7 +285,8 @@
|
||||
"#! pip install farm-haystack\n",
|
||||
"\n",
|
||||
"# Install the latest master of Haystack\n",
|
||||
"!pip install git+https://github.com/deepset-ai/haystack.git"
|
||||
"!pip install git+https://github.com/deepset-ai/haystack.git\n",
|
||||
"!pip install urllib3==1.25.4"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user