Add InMemoryDocumentStore (#76)

This commit is contained in:
Stan Kirdey 2020-04-27 12:54:12 -07:00 committed by GitHub
parent a78659f234
commit 6038d40a53
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 146 additions and 2 deletions

5
.gitignore vendored
View File

@ -1,3 +1,8 @@
# Local run files
qa.db
**/qa.db
**/*qa*.db
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

View File

@ -18,7 +18,7 @@ class BaseDocumentStore:
pass
@abstractmethod
def get_document_ids_by_tag(self, tag):
def get_document_ids_by_tags(self, tag):
pass
@abstractmethod
@ -38,3 +38,9 @@ class Document(BaseModel):
question: Optional[str] = Field(None, description="Question text for FAQs.")
query_score: Optional[int] = Field(None, description="Elasticsearch query score for a retrieved document")
meta: Optional[Dict[str, Optional[str]]] = Field(None, description="")
def __getitem__(self, item):
    """
    Allow dict-style access (``doc['text']``, ``doc['id']``) to two fields.

    :param item: Key to look up. Only 'text' and 'id' are recognised.
    :return: ``self.text`` for 'text', ``self.id`` for 'id'. Any other key
             yields None rather than raising.
    """
    if item == 'text':
        value = self.text
    elif item == 'id':
        value = self.id
    else:
        # Unknown keys fall through silently, exactly like a function without a return.
        value = None
    return value

View File

@ -0,0 +1,41 @@
from haystack.database.base import BaseDocumentStore, Document
class InMemoryDocumentStore(BaseDocumentStore):
    """
    In-memory document store.

    Documents are held in a plain dict (``self.docs``) keyed by the MD5 hex digest
    of each document's ``name`` concatenated with its ``text``.
    """

    def __init__(self):
        # Maps document id (MD5 hex digest) -> the raw document dict given to write_documents().
        self.docs = {}
        # Initialised here, but no method of this class fills or reads it yet.
        self.doc_tags = {}

    def write_documents(self, documents):
        """
        Store the given documents.

        :param documents: Iterable of dicts. Each dict needs a "name" and a "text" entry;
                          a document where either is missing or None is skipped silently.
        """
        import hashlib

        for document in documents:
            name = document.get("name", None)
            text = document.get("text", None)
            if name is None or text is None:
                continue

            # The id is derived from the content, so writing the same name/text pair
            # again overwrites the existing entry instead of adding a duplicate.
            # NOTE(review): name and text are joined without a separator, so
            # ("ab", "c") and ("a", "bc") map to the same id.
            signature = name + text
            doc_id = hashlib.md5(signature.encode("utf-8")).hexdigest()
            self.docs[doc_id] = document

    def get_document_by_id(self, id):
        """
        Return the stored document dict for ``id``.

        :param id: Document id, i.e. a key of ``self.docs``.
        :raises KeyError: If no document is stored under ``id``.
        """
        return self.docs[id]

    def get_document_ids_by_tags(self, tags):
        """
        Not implemented yet: always returns None.

        Intended format of ``tags`` is a dict, either
        {"tag-1": "value-1", "tag-2": "value-2" ...} or
        {"tag-1": ["value-1", "value-2"], "tag-2": ["value-3"] ...}
        """
        return None

    def get_document_count(self):
        """Return the number of stored documents."""
        return len(self.docs)

    def get_all_documents(self):
        """Return every stored document as a ``Document`` with id, text and name set."""
        return [
            Document(id=doc_id, text=document['text'], name=document['name'])
            for doc_id, document in self.docs.items()
        ]

View File

@ -2,6 +2,7 @@ import logging
from scipy.special import expit
import numpy as np
from haystack.database.base import Document
logger = logging.getLogger(__name__)
@ -37,7 +38,6 @@ class Finder:
results = {"question": question, "answers": []}
return results
# 2) Apply reader to get granular answer(s)
len_chars = sum([len(d.text) for d in documents])
logger.info(f"Reader is looking for detailed answer in {len_chars} chars ...")
results = self.reader.predict(question=question,

View File

@ -8,4 +8,5 @@ psycopg2-binary
sklearn
elasticsearch
elastic-apm
tox
# optional: sentence-transformers

40
test/test_finder.py Normal file
View File

@ -0,0 +1,40 @@
from haystack import Finder
from haystack.database.sql import SQLDocumentStore
from haystack.reader.transformers import TransformersReader
from haystack.retriever.tfidf import TfidfRetriever
def test_finder_get_answers():
    """Smoke test: Finder.get_answers returns a non-None result on a SQLDocumentStore."""
    sample_docs = [
        {"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1"},
        {"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2"},
        {"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3"},
    ]

    # File-backed SQLite database, created relative to the working directory.
    store = SQLDocumentStore(url="sqlite:///qa_test.db")
    store.write_documents(sample_docs)

    tfidf = TfidfRetriever(document_store=store)
    reader_kwargs = {
        "model": "distilbert-base-uncased-distilled-squad",
        "tokenizer": "distilbert-base-uncased",
        "use_gpu": -1,
    }
    qa_reader = TransformersReader(**reader_kwargs)
    qa_finder = Finder(qa_reader, tfidf)

    answers = qa_finder.get_answers(
        question="testing finder",
        top_k_retriever=10,
        top_k_reader=5,
    )
    # Only checks that something came back; the content is not inspected.
    assert answers is not None
def test_finder_get_answers_single_result():
    """Smoke test: Finder.get_answers with top_k_retriever=1 and top_k_reader=1 is non-None."""
    sample_docs = [
        {"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1"},
        {"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2"},
        {"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3"},
    ]

    # File-backed SQLite database, created relative to the working directory.
    store = SQLDocumentStore(url="sqlite:///qa_test.db")
    store.write_documents(sample_docs)

    tfidf = TfidfRetriever(document_store=store)
    reader_kwargs = {
        "model": "distilbert-base-uncased-distilled-squad",
        "tokenizer": "distilbert-base-uncased",
        "use_gpu": -1,
    }
    qa_reader = TransformersReader(**reader_kwargs)
    qa_finder = Finder(qa_reader, tfidf)

    answers = qa_finder.get_answers(
        question="testing finder",
        top_k_retriever=1,
        top_k_reader=1,
    )
    # Only checks that something came back; the content is not inspected.
    assert answers is not None

19
test/test_imports.py Normal file
View File

@ -0,0 +1,19 @@
def test_module_imports():
    """Check that the listed haystack entry points import and are not None."""
    from haystack import Finder
    from haystack.database.sql import SQLDocumentStore
    from haystack.indexing.cleaning import clean_wiki_text
    from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
    from haystack.reader.farm import FARMReader
    from haystack.reader.transformers import TransformersReader
    from haystack.retriever.tfidf import TfidfRetriever
    from haystack.utils import print_answers

    # Same objects, same order as the imports above.
    imported = (
        Finder,
        SQLDocumentStore,
        clean_wiki_text,
        write_documents_to_db,
        fetch_archive_from_http,
        FARMReader,
        TransformersReader,
        TfidfRetriever,
        print_answers,
    )
    for obj in imported:
        assert obj is not None

View File

@ -0,0 +1,22 @@
from haystack import Finder
from haystack.reader.transformers import TransformersReader
from haystack.retriever.tfidf import TfidfRetriever
def test_finder_get_answers_with_in_memory_store():
    """Smoke test: Finder.get_answers returns a non-None result on an InMemoryDocumentStore."""
    sample_docs = [
        {"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1"},
        {"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2"},
        {"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3"},
    ]

    from haystack.database.memory import InMemoryDocumentStore

    store = InMemoryDocumentStore()
    store.write_documents(sample_docs)

    tfidf = TfidfRetriever(document_store=store)
    reader_kwargs = {
        "model": "distilbert-base-uncased-distilled-squad",
        "tokenizer": "distilbert-base-uncased",
        "use_gpu": -1,
    }
    qa_reader = TransformersReader(**reader_kwargs)
    qa_finder = Finder(qa_reader, tfidf)

    answers = qa_finder.get_answers(
        question="testing finder",
        top_k_retriever=10,
        top_k_reader=5,
    )
    # Only checks that something came back; the content is not inspected.
    assert answers is not None

10
tox.ini Normal file
View File

@ -0,0 +1,10 @@
[tox]
requires = tox-venv
setuptools >= 30.0.0
envlist = py36,py37
[testenv]
changedir = test
deps = pytest
pandas
commands = pytest --basetemp="{envtmpdir}" {posargs}