Add InMemoryDocumentStore (#76)

2025-11-07 21:33:39 +00:00 · 2020-04-27 12:54:12 -07:00 · 2020-04-27 12:54:12 -07:00 · 6038d40a53
commit 6038d40a53
parent a78659f234
9 changed files with 146 additions and 2 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,8 @@
 # Local run files
 qa.db
 **/qa.db
 **/*qa*.db
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
--- a/haystack/database/base.py
+++ b/haystack/database/base.py
@ -18,7 +18,7 @@ class BaseDocumentStore:
        pass
    @abstractmethod
-    def get_document_ids_by_tag(self, tag):
+    def get_document_ids_by_tags(self, tag):
        pass
    @abstractmethod
@ -38,3 +38,9 @@ class Document(BaseModel):
    question: Optional[str] = Field(None, description="Question text for FAQs.")
    query_score: Optional[int] = Field(None, description="Elasticsearch query score for a retrieved document")
    meta: Optional[Dict[str, Optional[str]]] = Field(None, description="")
    def __getitem__(self, item):
        """
        Allow dict-style read access to the 'text' and 'id' fields (e.g. doc['text']).
        Any other key falls through both checks, so the method implicitly returns
        None instead of raising KeyError.
        """
        if item == 'text':
            return self.text
        if item == 'id':
            return self.id
--- a/haystack/database/memory.py
+++ b/haystack/database/memory.py
@ -0,0 +1,41 @@
 from haystack.database.base import BaseDocumentStore, Document
 class InMemoryDocumentStore(BaseDocumentStore):
    """
        In-memory document store.
        Keeps every written document in a plain dict keyed by an MD5 fingerprint
        of its name and text. Nothing is persisted; the contents live only as
        long as the instance does.
    """
    def __init__(self):
        # Maps document id (MD5 hex digest) -> the raw document dict as written.
        self.docs = {}
        # Reserved for tag lookups; nothing in this class populates it yet.
        self.doc_tags = {}
    def write_documents(self, documents):
        """
        Store the given documents, keyed by a hash of their name and text.
        Documents lacking a "name" or a "text" entry are skipped. Writing a
        document whose name and text match an already stored one overwrites
        that entry, because both map to the same id.
        :param documents: iterable of dicts, each expected to carry "name" and
                          "text" keys. ``None`` is treated like an empty iterable.
        """
        import hashlib
        if documents is None:
            return
        for document in documents:
            name = document.get("name")
            text = document.get("text")
            if name is None or text is None:
                continue
            # MD5 serves only as a content fingerprint here, not for security.
            doc_id = hashlib.md5((name + text).encode("utf-8")).hexdigest()
            self.docs[doc_id] = document
    def get_document_by_id(self, id):
        """
        Return the stored document dict for ``id``.
        :param id: id (hex digest) under which the document was written.
        :raises KeyError: if no document is stored under that id.
        """
        return self.docs[id]
    def get_document_ids_by_tags(self, tags):
        """
        Not implemented yet for the in-memory store; always returns None.
        The tag dict is expected in one of these formats:
        {"tag-1": "value-1", "tag-2": "value-2" ...}
        {"tag-1": ["value-1", "value-2"], "tag-2": ["value-3"] ...}
        """
        # TODO: implement tag filtering (self.doc_tags is reserved for it).
        return None
    def get_document_count(self):
        """Return the number of documents currently held in the store."""
        return len(self.docs)
    def get_all_documents(self):
        """
        Return every stored document as a ``Document`` built from its id,
        text and name.
        """
        return [Document(id=doc_id, text=doc['text'], name=doc['name'])
                for doc_id, doc in self.docs.items()]
--- a/haystack/finder.py
+++ b/haystack/finder.py
@ -2,6 +2,7 @@ import logging
 from scipy.special import expit
 import numpy as np
 from haystack.database.base import Document
 logger = logging.getLogger(__name__)
@ -37,7 +38,6 @@ class Finder:
            results = {"question": question, "answers": []}
            return results
        # 2) Apply reader to get granular answer(s)
        len_chars = sum([len(d.text) for d in documents])
        logger.info(f"Reader is looking for detailed answer in {len_chars} chars ...")
        results = self.reader.predict(question=question,
--- a/requirements.txt
+++ b/requirements.txt
@ -8,4 +8,5 @@ psycopg2-binary
 sklearn
 elasticsearch
 elastic-apm
 tox
 # optional: sentence-transformers
--- a/test/test_finder.py
+++ b/test/test_finder.py
@ -0,0 +1,40 @@
 from haystack import Finder
 from haystack.database.sql import SQLDocumentStore
 from haystack.reader.transformers import TransformersReader
 from haystack.retriever.tfidf import TfidfRetriever
 def test_finder_get_answers():
    """Index three documents in a SQLite store and check that the Finder returns a prediction."""
    store = SQLDocumentStore(url="sqlite:///qa_test.db")
    store.write_documents([
        {"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1"},
        {"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2"},
        {"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3"},
    ])
    tfidf_retriever = TfidfRetriever(document_store=store)
    qa_reader = TransformersReader(
        model="distilbert-base-uncased-distilled-squad",
        tokenizer="distilbert-base-uncased",
        use_gpu=-1,
    )
    # Ask for more retriever hits than there are documents; only the presence
    # of a result is checked, not its content.
    answers = Finder(qa_reader, tfidf_retriever).get_answers(
        question="testing finder",
        top_k_retriever=10,
        top_k_reader=5,
    )
    assert answers is not None
 def test_finder_get_answers_single_result():
    """Same pipeline as the multi-result test, but limited to one retrieved document and one answer."""
    sample_docs = [
        {"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1"},
        {"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2"},
        {"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3"},
    ]
    sql_store = SQLDocumentStore(url="sqlite:///qa_test.db")
    sql_store.write_documents(sample_docs)
    reader_kwargs = {
        "model": "distilbert-base-uncased-distilled-squad",
        "tokenizer": "distilbert-base-uncased",
        "use_gpu": -1,
    }
    doc_retriever = TfidfRetriever(document_store=sql_store)
    answer_reader = TransformersReader(**reader_kwargs)
    pipeline = Finder(answer_reader, doc_retriever)
    # top_k of 1 on both stages exercises the single-result path.
    outcome = pipeline.get_answers(question="testing finder", top_k_retriever=1, top_k_reader=1)
    assert outcome is not None
--- a/test/test_imports.py
+++ b/test/test_imports.py
@ -0,0 +1,19 @@
 def test_module_imports():
    """Smoke test: the listed haystack modules import and expose the expected names."""
    from haystack import Finder
    from haystack.database.sql import SQLDocumentStore
    from haystack.indexing.cleaning import clean_wiki_text
    from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
    from haystack.reader.farm import FARMReader
    from haystack.reader.transformers import TransformersReader
    from haystack.retriever.tfidf import TfidfRetriever
    from haystack.utils import print_answers
    # A failed import raises before reaching this point; the checks below
    # additionally guard against a name being bound to None.
    imported_names = (
        Finder,
        SQLDocumentStore,
        clean_wiki_text,
        write_documents_to_db,
        fetch_archive_from_http,
        FARMReader,
        TransformersReader,
        TfidfRetriever,
        print_answers,
    )
    for imported in imported_names:
        assert imported is not None
--- a/test/test_in_memory_store.py
+++ b/test/test_in_memory_store.py
@ -0,0 +1,22 @@
 from haystack import Finder
 from haystack.reader.transformers import TransformersReader
 from haystack.retriever.tfidf import TfidfRetriever
 def test_finder_get_answers_with_in_memory_store():
    """Run the Finder against an InMemoryDocumentStore and check that a prediction is returned."""
    from haystack.database.memory import InMemoryDocumentStore
    memory_store = InMemoryDocumentStore()
    memory_store.write_documents([
        {"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1"},
        {"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2"},
        {"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3"},
    ])
    tfidf_retriever = TfidfRetriever(document_store=memory_store)
    qa_reader = TransformersReader(
        model="distilbert-base-uncased-distilled-squad",
        tokenizer="distilbert-base-uncased",
        use_gpu=-1,
    )
    qa_finder = Finder(qa_reader, tfidf_retriever)
    # Only the presence of a result is verified, not the answers themselves.
    found = qa_finder.get_answers(
        question="testing finder",
        top_k_retriever=10,
        top_k_reader=5,
    )
    assert found is not None
--- a/tox.ini
+++ b/tox.ini
@ -0,0 +1,10 @@
 [tox]
 requires = tox-venv
           setuptools >= 30.0.0
 envlist = py36,py37
 [testenv]
 changedir = test
 deps = pytest
       pandas
 commands = pytest --basetemp="{envtmpdir}" {posargs}