Add embedding query for InMemoryDocumentStore

2025-11-13 16:43:44 +00:00 · 2020-05-18 05:47:41 -07:00 · 2020-05-18 05:47:41 -07:00 · bf8e506c45
commit bf8e506c45
parent 5eee61a47b
5 changed files with 69 additions and 3 deletions
--- a/haystack/database/base.py
+++ b/haystack/database/base.py
@ -25,6 +25,10 @@ class BaseDocumentStore:
    def get_document_count(self):
        """
        Base-class placeholder: performs no work and returns None.

        NOTE(review): judging by the name, concrete stores presumably return the
        number of documents they hold - confirm against the subclasses.
        """
    @abstractmethod
    def query_by_embedding(self, query_emb, top_k=10, candidate_doc_ids=None):
        """
        Abstract hook for embedding-based retrieval; this base version has no
        behaviour of its own and returns None.

        :param query_emb: Embedding of the query (not used by this stub).
        :param top_k: Defaults to 10 (not used by this stub).
        :param candidate_doc_ids: Defaults to None (not used by this stub).
        """
 class Document(BaseModel):
    id: str = Field(..., description="_id field from Elasticsearch")
--- a/haystack/database/memory.py
+++ b/haystack/database/memory.py
@ -51,6 +51,33 @@ class InMemoryDocumentStore(BaseDocumentStore):
    def get_document_by_id(self, id):
        """
        Fetch the entry stored under `id` in `self.docs`.

        :param id: Key of the wanted entry in `self.docs`.
        :return: The object `self.docs` holds under that key, returned as-is.

        A missing key is not handled here; the lookup error raised by
        `self.docs` (KeyError for a plain dict) propagates to the caller.
        """
        stored = self.docs
        return stored[id]
    def _convert_memory_hit_to_document(self, hit, doc_id=None) -> Document:
        """
        Wrap an in-memory hit into a `Document`.

        :param hit: Indexable pair; `hit[0]` is the stored entry (read through `.get`)
                    and `hit[1]` is the score assigned to it.
        :param doc_id: Value passed on as the Document's `id`.
        :return: Document carrying the entry's 'text' (None if absent), its 'meta'
                 (empty dict if absent) and the score as `query_score`.
        """
        payload = hit[0]
        return Document(
            id=doc_id,
            text=payload.get('text'),
            meta=payload.get('meta', {}),
            query_score=hit[1],
        )
    def query_by_embedding(self, query_emb, top_k=10, candidate_doc_ids=None) -> [Document]:
        """
        Rank the stored documents by cosine similarity to a query embedding.

        :param query_emb: Embedding vector of the query. If None, an empty list is returned.
        :param top_k: Maximum number of documents to return.
        :param candidate_doc_ids: Optional collection of document ids. When it is not None,
                                  only documents whose id is contained in it are scored
                                  (an empty collection therefore yields no results).
        :return: At most `top_k` Documents ordered by descending `query_score`.
                 Documents that lack the configured embedding field are skipped, and a
                 zero-length vector on either side gives a score of 0.0.
        """
        # Kept as function-local imports, exactly as before.
        # NOTE(review): presumably lazy so that config is only read at query time - confirm.
        from haystack.api import config
        from numpy import dot
        from numpy.linalg import norm

        embedding_field_name = config.EMBEDDING_FIELD_NAME
        if embedding_field_name is None or query_emb is None:
            return []

        # The query norm does not depend on the document, so compute it once.
        query_norm = norm(query_emb)
        allowed_ids = None if candidate_doc_ids is None else set(candidate_doc_ids)

        candidate_docs = []
        for idx, doc in self.docs.items():
            if allowed_ids is not None and idx not in allowed_ids:
                continue
            doc_emb = doc.get(embedding_field_name)
            if doc_emb is None:
                # No embedding stored for this document: it cannot be scored.
                continue
            denominator = query_norm * norm(doc_emb)
            # Guard against division by zero, which would yield NaN/inf and make
            # the ordering below unreliable.
            score = float(dot(query_emb, doc_emb) / denominator) if denominator else 0.0
            candidate_docs.append(self._convert_memory_hit_to_document((doc, score), doc_id=idx))

        return sorted(candidate_docs, key=lambda x: x.query_score, reverse=True)[0:top_k]
    def get_document_ids_by_tags(self, tags):
        """
        The format for the dict is {"tag-1": "value-1", "tag-2": "value-2" ...}
--- a/haystack/retriever/tfidf.py
+++ b/haystack/retriever/tfidf.py
@ -72,8 +72,6 @@ class TfidfRetriever(BaseRetriever):
        # get scores
        indices_and_scores = self._calc_scores(query)
        print(indices_and_scores)
        # rank paragraphs
        df_sliced = self.df.loc[indices_and_scores.keys()]
        df_sliced = df_sliced[:top_k]
--- a/test/test_db.py
+++ b/test/test_db.py
@ -20,7 +20,6 @@ def test_elasticsearch_write_read(elasticsearch_fixture):
    write_documents_to_db(document_store=document_store, document_dir="samples/docs")
    sleep(2)  # wait for documents to be available for query
    documents = document_store.get_all_documents()
    print(documents)
    assert len(documents) == 2
    assert documents[0].id
    assert documents[0].text
--- a/test/test_faq_retriever.py
+++ b/test/test_faq_retriever.py
@ -0,0 +1,38 @@
 from haystack import Finder
 def test_faq_retriever_in_memory_store(monkeypatch):
    """
    FAQ-style retrieval against the in-memory store: index one real question plus
    ten identical filler questions with their embeddings, then check that asking a
    similar question with `top_k_retriever=1` yields exactly one answer.
    """
    # Set before the haystack imports below.
    # NOTE(review): presumably haystack's config reads this env var - confirm.
    monkeypatch.setenv("EMBEDDING_FIELD_NAME", "embedding")

    from haystack.database.memory import InMemoryDocumentStore
    from haystack.retriever.elasticsearch import EmbeddingRetriever

    document_store = InMemoryDocumentStore()

    documents = [
        {'name': 'How to test this library?', 'text': 'By running tox in the command line!', 'meta': {'question': 'How to test this library?'}},
    ]
    # Ten filler entries; each iteration builds a fresh dict so no objects are shared.
    documents += [
        {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}}
        for _ in range(10)
    ]

    retriever = EmbeddingRetriever(document_store=document_store, embedding_model="deepset/sentence_bert", gpu=False)

    # Embed each entry's question and store the vector on the entry itself.
    for entry in documents:
        question = entry['meta']['question']
        entry['embedding'] = retriever.create_embedding([question])[0]
    document_store.write_documents(documents)

    finder = Finder(reader=None, retriever=retriever)
    prediction = finder.get_answers_via_similar_questions(question="How to test this?", top_k_retriever=1)

    answers = prediction.get('answers', [])
    assert len(answers) == 1