mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-08 13:54:31 +00:00
83 lines
2.7 KiB
Python
83 lines
2.7 KiB
Python
from haystack.database.base import BaseDocumentStore, Document
|
|
|
|
|
|
class InMemoryDocumentStore(BaseDocumentStore):
    """
    In-memory document store.

    Documents are kept as plain dicts in ``self.docs``, keyed by an id that is
    derived from the document's name and text. A second dict, ``self.doc_tags``,
    indexes document ids by ``(tag key, tag value)`` pairs so that documents can
    be looked up by tag.
    """

    def __init__(self):
        # document id (MD5 hex digest of name + text) -> document dict as written
        self.docs = {}
        # str((tag_key, tag_value)) -> list of document ids carrying that tag value
        self.doc_tags = {}

    def write_documents(self, documents):
        """
        Store documents and index their tags.

        :param documents: iterable of dicts. Each dict needs a ``"name"`` and a
                          ``"text"`` entry; entries missing either one are
                          skipped. An optional ``"tags"`` entry (list of dicts
                          mapping a tag key to its value or values) is added to
                          the tag index. ``None`` is accepted and does nothing.
        :return: None
        """
        # Kept as a function-level import so this class does not depend on a
        # change to the file's import block.
        import hashlib

        if documents is None:
            return

        for document in documents:
            name = document.get("name", None)
            text = document.get("text", None)

            if name is None or text is None:
                continue

            # The id is a content hash, so writing the same name/text again
            # overwrites the existing entry instead of creating a new one.
            signature = name + text
            doc_id = hashlib.md5(signature.encode("utf-8")).hexdigest()

            self.docs[doc_id] = document
            self._map_tags_to_ids(doc_id, document.get("tags", []))

    @staticmethod
    def _as_value_list(values):
        """
        Normalize the value side of a tag dict to something safe to iterate.

        Falsy input yields an empty list. A single string (or any other
        non-iterable scalar) is wrapped in a list so that it is treated as one
        value instead of being iterated character by character. Any other
        iterable is returned unchanged.
        """
        if not values:
            return []
        if isinstance(values, (str, bytes)):
            return [values]
        try:
            iter(values)
        except TypeError:
            return [values]
        return values

    def _map_tags_to_ids(self, hash, tags):
        """
        Add a document id to the tag index for every tag value it carries.

        :param hash: id of the document (key into ``self.docs``).
        :param tags: list of dicts of the form ``{"tag-1": ["value-1", "value-2"]}``
                     or ``{"tag-1": "value-1"}``. Anything that is not a list,
                     and any list entry that is not a dict, is ignored.
        """
        if not isinstance(tags, list):
            return

        for tag in tags:
            if not isinstance(tag, dict):
                continue
            for tag_key, tag_values in tag.items():
                for tag_value in self._as_value_list(tag_values):
                    comp_key = str((tag_key, tag_value))
                    doc_ids = self.doc_tags.setdefault(comp_key, [])
                    # Re-writing a document must not register its id twice,
                    # otherwise tag lookups return the document repeatedly.
                    if hash not in doc_ids:
                        doc_ids.append(hash)

    def get_document_by_id(self, id):
        """
        Return the stored document dict for the given id.

        :param id: document id as generated by ``write_documents``.
        :raises KeyError: if no document with this id is stored.
        """
        return self.docs[id]

    def get_document_ids_by_tags(self, tags):
        """
        Look up documents by tag.

        The format for a tag dict is either
        ``{"tag-1": "value-1", "tag-2": "value-2", ...}`` or
        ``{"tag-1": ["value-1", "value-2"], "tag-2": ["value-3"], ...}``.

        :param tags: a single tag dict or a list of tag dicts.
        :return: list with one entry per (tag value, matching document) pair.
                 NOTE: despite the method name, the entries are the stored
                 document dicts, not their ids. A document that matches several
                 of the requested tag values appears once per match.
        """
        if not isinstance(tags, list):
            tags = [tags]
        return self._find_ids_by_tags(tags)

    def _find_ids_by_tags(self, tags):
        """
        Collect the stored documents registered under any of the given tags.

        :param tags: list of tag dicts (see ``get_document_ids_by_tags``).
                     Entries that are not dicts are ignored, mirroring
                     ``_map_tags_to_ids``.
        :return: list of stored document dicts, in lookup order.
        """
        result = []
        for tag in tags:
            if not isinstance(tag, dict):
                continue
            for tag_key, tag_values in tag.items():
                for tag_value in self._as_value_list(tag_values):
                    comp_key = str((tag_key, tag_value))
                    for doc_id in self.doc_tags.get(comp_key, []):
                        result.append(self.docs.get(doc_id))
        return result

    def get_document_count(self):
        """Return the number of stored documents."""
        return len(self.docs)

    def get_all_documents(self):
        """
        Return every stored document as a ``Document`` object.

        ``id``, ``text`` and ``name`` are taken from the stored entry; ``meta``
        falls back to an empty dict when the stored document has none.
        """
        return [
            Document(id=doc_id, text=doc["text"], name=doc["name"], meta=doc.get("meta", {}))
            for doc_id, doc in self.docs.items()
        ]