From 72a3b70d7a794f0ec8e0dc257fe3d1e6d0a4547d Mon Sep 17 00:00:00 2001
From: Stan Kirdey <skirdey@users.noreply.github.com>
Date: Thu, 14 May 2020 13:12:25 -0700
Subject: [PATCH] Add filtering by tags for InMemoryDocumentStore (#108)

---
 README.rst                   |  5 +++
 haystack/database/base.py    |  7 +---
 haystack/database/memory.py  | 45 +++++++++++++++++++++--
 test/test_db.py              |  1 +
 test/test_document.py        |  1 -
 test/test_farm_reader.py     |  2 --
 test/test_in_memory_store.py | 70 ++++++++++++++++++++++++++++++++++--
 7 files changed, 117 insertions(+), 14 deletions(-)

diff --git a/README.rst b/README.rst
index f24453f03..47c72d1da 100644
--- a/README.rst
+++ b/README.rst
@@ -223,3 +223,8 @@ You will find the Swagger API documentation at http://127.0.0.1:80/docs
 * Coming soon: more file formats for document upload, metrics for label quality ...
 
 .. image:: https://raw.githubusercontent.com/deepset-ai/haystack/master/docs/img/annotation_tool.png
+
+
+7. Development
+-------------------
+* Unit tests are executed by running ``tox``
\ No newline at end of file
diff --git a/haystack/database/base.py b/haystack/database/base.py
index 731f1ad57..5ede723a9 100644
--- a/haystack/database/base.py
+++ b/haystack/database/base.py
@@ -38,9 +38,4 @@ class Document(BaseModel):
     question: Optional[str] = Field(None, description="Question text for FAQs.")
     query_score: Optional[float] = Field(None, description="Elasticsearch query score for a retrieved document")
     meta: Optional[Dict[str, Any]] = Field(None, description="")
-
-    def __getitem__(self, item):
-        if item == 'text':
-            return self.text
-        if item == 'id':
-            return self.id
+    tags: Optional[Dict[str, Any]] = Field(None, description="Tags that allow filtering of the data")
diff --git a/haystack/database/memory.py b/haystack/database/memory.py
index 5b552cf84..fb7abf070 100644
--- a/haystack/database/memory.py
+++ b/haystack/database/memory.py
@@ -12,6 +12,10 @@ class InMemoryDocumentStore(BaseDocumentStore):
 
     def write_documents(self, documents):
         import hashlib
+
+        if documents is None:
+            return
+
         for document in documents:
             name = document.get("name", None)
             text = document.get("text", None)
@@ -20,10 +24,30 @@ class InMemoryDocumentStore(BaseDocumentStore):
                 continue
 
             signature = name + text
+
             hash = hashlib.md5(signature.encode("utf-8")).hexdigest()
 
             self.docs[hash] = document
 
+            tags = document.get('tags', [])
+
+            self._map_tags_to_ids(hash, tags)
+
+    def _map_tags_to_ids(self, hash, tags):
+        if isinstance(tags, list):
+            for tag in tags:
+                if isinstance(tag, dict):
+                    tag_keys = tag.keys()
+                    for tag_key in tag_keys:
+                        tag_values = tag.get(tag_key, [])
+                        if tag_values:
+                            for tag_value in tag_values:
+                                comp_key = str((tag_key, tag_value))
+                                if comp_key in self.doc_tags:
+                                    self.doc_tags[comp_key].append(hash)
+                                else:
+                                    self.doc_tags[comp_key] = [hash]
+
     def get_document_by_id(self, id):
         return self.docs[id]
 
@@ -32,10 +56,27 @@ class InMemoryDocumentStore(BaseDocumentStore):
         The format for the dict is {"tag-1": "value-1", "tag-2": "value-2" ...}
         The format for the dict is {"tag-1": ["value-1","value-2"], "tag-2": ["value-3]" ...}
         """
-        pass
+        if not isinstance(tags, list):
+            tags = [tags]
+        result = self._find_ids_by_tags(tags)
+        return result
+
+    def _find_ids_by_tags(self, tags):
+        result = []
+        for tag in tags:
+            tag_keys = tag.keys()
+            for tag_key in tag_keys:
+                tag_values = tag.get(tag_key, None)
+                if tag_values:
+                    for tag_value in tag_values:
+                        comp_key = str((tag_key, tag_value))
+                        doc_ids = self.doc_tags.get(comp_key, [])
+                        for doc_id in doc_ids:
+                            result.append(self.docs.get(doc_id))
+        return result
 
     def get_document_count(self):
         return len(self.docs.items())
 
     def get_all_documents(self):
-        return [Document(id=item[0], text=item[1]['text'], name=item[1]['name']) for item in self.docs.items()]
\ No newline at end of file
+        return [Document(id=item[0], text=item[1]['text'], name=item[1]['name'], meta=item[1].get('meta', {})) for item in self.docs.items()]
diff --git a/test/test_db.py b/test/test_db.py
index 89eba0f37..5977ddbcf 100644
--- a/test/test_db.py
+++ b/test/test_db.py
@@ -20,6 +20,7 @@ def test_elasticsearch_write_read(elasticsearch_fixture):
     write_documents_to_db(document_store=document_store, document_dir="samples/docs")
     sleep(2)  # wait for documents to be available for query
     documents = document_store.get_all_documents()
+    print(documents)
     assert len(documents) == 2
     assert documents[0].id
     assert documents[0].text
diff --git a/test/test_document.py b/test/test_document.py
index deb540c49..1592ec541 100644
--- a/test/test_document.py
+++ b/test/test_document.py
@@ -4,4 +4,3 @@ from haystack.database.base import Document
 def test_document_data_access():
     doc = Document(id=1, text="test")
     assert doc.text == "test"
-    assert doc['text'] == "test"
diff --git a/test/test_farm_reader.py b/test/test_farm_reader.py
index bf8be081c..31e338b5d 100644
--- a/test/test_farm_reader.py
+++ b/test/test_farm_reader.py
@@ -1,5 +1,3 @@
-import pytest
-
 from haystack.reader.farm import FARMReader
 
 
diff --git a/test/test_in_memory_store.py b/test/test_in_memory_store.py
index 316dbadec..d78f57ba2 100644
--- a/test/test_in_memory_store.py
+++ b/test/test_in_memory_store.py
@@ -5,14 +5,15 @@ from haystack.retriever.tfidf import TfidfRetriever
 
 def test_finder_get_answers_with_in_memory_store():
     test_docs = [
-        {"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1"},
-        {"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2"},
-        {"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3"}
+        {"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}},
+        {"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2", 'meta': {'url': 'url'}},
+        {"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}}
     ]
 
     from haystack.database.memory import InMemoryDocumentStore
     document_store = InMemoryDocumentStore()
     document_store.write_documents(test_docs)
+
     retriever = TfidfRetriever(document_store=document_store)
     reader = TransformersReader(model="distilbert-base-uncased-distilled-squad",
                                 tokenizer="distilbert-base-uncased", use_gpu=-1)
@@ -20,3 +21,66 @@ def test_finder_get_answers_with_in_memory_store():
     prediction = finder.get_answers(question="testing finder", top_k_retriever=10,
                                     top_k_reader=5)
     assert prediction is not None
+
+
+def test_memory_store_get_by_tags():
+    test_docs = [
+        {"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}},
+        {"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2", 'meta': {'url': None}},
+        {"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}}
+    ]
+
+    from haystack.database.memory import InMemoryDocumentStore
+    document_store = InMemoryDocumentStore()
+    document_store.write_documents(test_docs)
+
+    docs = document_store.get_document_ids_by_tags({'has_url': 'false'})
+
+    assert docs == []
+
+
+def test_memory_store_get_by_tag_lists_union():
+    test_docs = [
+        {"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}, 'tags': [{'tag2': ["1"]}]},
+        {"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2", 'meta': {'url': None}, 'tags': [{'tag1': ['1']}]},
+        {"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}, 'tags': [{'tag2': ["1", "2"]}]}
+    ]
+
+    from haystack.database.memory import InMemoryDocumentStore
+    document_store = InMemoryDocumentStore()
+    document_store.write_documents(test_docs)
+
+    docs = document_store.get_document_ids_by_tags({'tag2': ["1"]})
+
+    assert docs == [
+        {'name': 'testing the finder 1', 'text': 'testing the finder with pyhton unit test 1', 'meta': {'url': 'url'}, 'tags': [{'tag2': ['1']}]},
+        {'name': 'testing the finder 3', 'text': 'testing the finder with pyhton unit test 3', 'meta': {'url': 'url'}, 'tags': [{'tag2': ['1', '2']}]}
+    ]
+
+
+def test_memory_store_get_by_tag_lists_non_existent_tag():
+    test_docs = [
+        {"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}, 'tags': [{'tag1': ["1"]}]},
+    ]
+    from haystack.database.memory import InMemoryDocumentStore
+    document_store = InMemoryDocumentStore()
+    document_store.write_documents(test_docs)
+    docs = document_store.get_document_ids_by_tags({'tag1': ["3"]})
+    assert docs == []
+
+
+def test_memory_store_get_by_tag_lists_disjoint():
+    test_docs = [
+        {"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}, 'tags': [{'tag1': ["1"]}]},
+        {"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2", 'meta': {'url': None}, 'tags': [{'tag2': ['1']}]},
+        {"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}, 'tags': [{'tag3': ["1", "2"]}]},
+        {"name": "testing the finder 4", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}, 'tags': [{'tag3': ["1", "3"]}]}
+    ]
+
+    from haystack.database.memory import InMemoryDocumentStore
+    document_store = InMemoryDocumentStore()
+    document_store.write_documents(test_docs)
+
+    docs = document_store.get_document_ids_by_tags({'tag3': ["3"]})
+
+    assert docs == [{'name': 'testing the finder 4', 'text': 'testing the finder with pyhton unit test 3', 'meta': {'url': 'url'}, 'tags': [{'tag3': ['1', '3']}]}]