diff --git a/haystack/database/base.py b/haystack/database/base.py index 3a1fbb7a1..33e913fa4 100644 --- a/haystack/database/base.py +++ b/haystack/database/base.py @@ -12,10 +12,9 @@ class Document(BaseModel): description="id for the source file the document was created from. In the case when a large file is divided " "across multiple Elasticsearch documents, this id can be used to reference original source file.", ) - # name: Optional[str] = Field(None, description="Title of the document") question: Optional[str] = Field(None, description="Question text for FAQs.") query_score: Optional[float] = Field(None, description="Elasticsearch query score for a retrieved document") - meta: Dict[str, Any] = Field({}, description="") + meta: Dict[str, Any] = Field({}, description="Meta fields for a document like name, url, or author.") tags: Optional[Dict[str, Any]] = Field(None, description="Tags that allow filtering of the data") @@ -30,8 +29,11 @@ class BaseDocumentStore(ABC): """ Indexes documents for later queries. - :param documents: List of dictionaries in the format {"name": ", "text": ""}. - Optionally, further fields can be supplied depending on the child class. + :param documents: List of dictionaries. + Default format: {"text": ""} + Optionally: Include meta data via {"text": "", + "meta":{"name": ", "author": "somebody", ...}} + It can be used for filtering and is accessible in the responses of the Finder. :return: None """ diff --git a/haystack/database/elasticsearch.py b/haystack/database/elasticsearch.py index 15d941355..303d3cc65 100644 --- a/haystack/database/elasticsearch.py +++ b/haystack/database/elasticsearch.py @@ -117,9 +117,9 @@ class ElasticsearchDocumentStore(BaseDocumentStore): Indexes documents for later queries in Elasticsearch. :param documents: List of dictionaries. - Default format: {"name": ", "text": ""} - Optionally: Include meta data via {"name": ", - "text": "", "meta":{"author": "somebody", ...}} + Default format: {"text": ""} + Optionally: Include meta data via {"text": "", + "meta":{"name": ", "author": "somebody", ...}} It can be used for filtering and is accessible in the responses of the Finder. Advanced: If you are using your own Elasticsearch mapping, the key names in the dictionary should be changed to what you have set for self.text_field and self.name_field . diff --git a/haystack/database/memory.py b/haystack/database/memory.py index 9dbaf1b81..555aa3c44 100644 --- a/haystack/database/memory.py +++ b/haystack/database/memory.py @@ -18,9 +18,9 @@ class InMemoryDocumentStore(BaseDocumentStore): """ Indexes documents for later queries. - :param documents: List of dictionaries in the format {"name": ", "text": ""}. + :param documents: List of dictionaries in the format {"text": ""}. Optionally, you can also supply "tags": ["one-tag", "another-one"] - or additional meta data via "meta": {"author": "someone", "url":"some-url" ...} + or additional meta data via "meta": {"name": ", "author": "someone", "url":"some-url" ...} :return: None """ @@ -30,19 +30,21 @@ class InMemoryDocumentStore(BaseDocumentStore): return for document in documents: - name = document.get("name", None) - text = document.get("text", None) + text = document["text"] + if "meta" not in document.keys(): + document["meta"] = {} + for k, v in document.items(): # put additional fields other than text in meta + if k not in ["text", "meta", "tags"]: + document["meta"][k] = v - if name is None or text is None: - continue + if not text: + raise Exception("A document cannot have empty text field.") - signature = name + text - - hash = hashlib.md5(signature.encode("utf-8")).hexdigest() + hash = hashlib.md5(text.encode("utf-8")).hexdigest() self.docs[hash] = document - tags = document.get('tags', []) + tags = document.get("tags", []) self._map_tags_to_ids(hash, tags) @@ -65,12 +67,12 @@ class InMemoryDocumentStore(BaseDocumentStore): document = self._convert_memory_hit_to_document(self.docs[id], doc_id=id) return document - def _convert_memory_hit_to_document(self, hit: Tuple[Any, Any], doc_id: Optional[str] = None) -> Document: + def _convert_memory_hit_to_document(self, hit: Dict[str, Any], doc_id: Optional[str] = None) -> Document: document = Document( id=doc_id, - text=hit[0].get('text', None), - meta=hit[0].get('meta', {}), - query_score=hit[1], + text=hit.get("text", None), + meta=hit.get("meta", {}), + query_score=hit.get("query_score", None), ) return document @@ -89,14 +91,21 @@ class InMemoryDocumentStore(BaseDocumentStore): "use a different DocumentStore (e.g. ElasticsearchDocumentStore).") if self.embedding_field is None: - return [] + raise Exception( + "To use query_by_embedding() 'embedding field' must " + "be specified when initializing the document store." + ) if query_emb is None: return [] - candidate_docs = [self._convert_memory_hit_to_document( - (doc, dot(query_emb, doc[self.embedding_field]) / (norm(query_emb) * norm(doc[self.embedding_field]))), doc_id=idx) for idx, doc in self.docs.items() - ] + candidate_docs = [] + for idx, hit in self.docs.items(): + hit["query_score"] = dot(query_emb, hit[self.embedding_field]) / ( + norm(query_emb) * norm(hit[self.embedding_field]) + ) + _doc = self._convert_memory_hit_to_document(hit=hit, doc_id=idx) + candidate_docs.append(_doc) return sorted(candidate_docs, key=lambda x: x.query_score, reverse=True)[0:top_k] @@ -139,4 +148,7 @@ class InMemoryDocumentStore(BaseDocumentStore): return len(self.docs.items()) def get_all_documents(self) -> List[Document]: - return [Document(id=item[0], text=item[1]['text'], name=item[1]['name'], meta=item[1].get('meta', {})) for item in self.docs.items()] + return [ + Document(id=item[0], text=item[1]["text"], meta=item[1].get("meta", {})) + for item in self.docs.items() + ] diff --git a/haystack/database/sql.py b/haystack/database/sql.py index eaf2e188f..ecb568a4b 100644 --- a/haystack/database/sql.py +++ b/haystack/database/sql.py @@ -20,7 +20,6 @@ class ORMBase(Base): class Document(ORMBase): __tablename__ = "document" - name = Column(String) text = Column(String) meta_data = Column(PickleType) @@ -96,14 +95,19 @@ class SQLDocumentStore(BaseDocumentStore): """ Indexes documents for later queries. - :param documents: List of dictionaries in the format {"name": ", "text": ""}. + :param documents: List of dictionaries in the format {"text": ""}. Optionally, you can also supply meta data via "meta": {"author": "someone", "url":"some-url" ...} :return: None """ for doc in documents: - row = Document(name=doc["name"], text=doc["text"], meta_data=doc.get("meta", {})) + if "meta" not in doc.keys(): + doc["meta"] = {} + for k, v in doc.items(): # put additional fields other than text in meta + if k not in ["text", "meta", "tags"]: + doc["meta"][k] = v + row = Document(text=doc["text"], meta_data=doc.get("meta", {})) self.session.add(row) self.session.commit() diff --git a/haystack/indexing/utils.py b/haystack/indexing/utils.py index 9f8dd1edf..068e9e635 100644 --- a/haystack/indexing/utils.py +++ b/haystack/indexing/utils.py @@ -48,9 +48,9 @@ def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None, for para in text.split("\n\n"): if not para.strip(): # skip empty paragraphs continue - documents.append({"name": path.name, "text": para}) + documents.append({"text": para, "meta": {"name": path.name}}) else: - documents.append({"name": path.name, "text": text}) + documents.append({"text": text, "meta": {"name": path.name}}) return documents diff --git a/test/conftest.py b/test/conftest.py index 336244d51..b286fb71c 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -54,9 +54,10 @@ def xpdf_fixture(): @pytest.fixture() def test_docs_xs(): return [ - {"name": "filename1", "text": "My name is Carla and I live in Berlin", "meta": {"meta_field": "test1"}}, - {"name": "filename2", "text": "My name is Paul and I live in New York", "meta": {"meta_field": "test2"}}, - {"name": "filename3", "text": "My name is Christelle and I live in Paris", "meta": {"meta_field": "test3"}} + {"text": "My name is Carla and I live in Berlin", "meta": {"meta_field": "test1", "name": "filename1"}}, + {"text": "My name is Paul and I live in New York", "meta": {"meta_field": "test2", "name": "filename2"}}, + {"text": "My name is Christelle and I live in Paris", "meta_field": "test3", "meta": {"name": "filename3"}} + # last doc has meta_field at the top level for backward compatibility ] diff --git a/test/test_db.py b/test/test_db.py index 2a0f3ca42..ce83c7c52 100644 --- a/test/test_db.py +++ b/test/test_db.py @@ -1,27 +1,12 @@ -from time import sleep - -from haystack.database.elasticsearch import ElasticsearchDocumentStore -from haystack.database.sql import SQLDocumentStore -from haystack.indexing.utils import convert_files_to_dicts +from haystack.database.base import Document -def test_sql_write_read(): - sql_document_store = SQLDocumentStore() - documents = convert_files_to_dicts(dir_path="samples/docs") - sql_document_store.write_documents(documents) - documents = sql_document_store.get_all_documents() - assert len(documents) == 2 - doc = sql_document_store.get_document_by_id("1") - assert doc.id - assert doc.text - - -def test_elasticsearch_write_read(elasticsearch_fixture): - document_store = ElasticsearchDocumentStore() - documents = convert_files_to_dicts(dir_path="samples/docs") - document_store.write_documents(documents) - sleep(2) # wait for documents to be available for query - documents = document_store.get_all_documents() - assert len(documents) == 2 - assert documents[0].id - assert documents[0].text +def test_get_all_documents(document_store_with_docs): + documents = document_store_with_docs.get_all_documents() + assert all(isinstance(d, Document) for d in documents) + assert len(documents) == 3 + assert {d.meta["name"] for d in documents} == {"filename1", "filename2", "filename3"} + assert {d.meta["meta_field"] for d in documents} == {"test1", "test2", "test3"} + doc = document_store_with_docs.get_document_by_id(documents[0].id) + assert doc.id == documents[0].id + assert doc.text == documents[0].text diff --git a/test/test_dpr_retriever.py b/test/test_dpr_retriever.py index fbeb2bd7b..34f965e80 100644 --- a/test/test_dpr_retriever.py +++ b/test/test_dpr_retriever.py @@ -1,20 +1,8 @@ -from haystack.retriever.dpr_utils import download_dpr - -def test_dpr_passage_encoder(): - from haystack.retriever.dense import DensePassageRetriever - - passage = ["Let's encode this one"] - retriever = DensePassageRetriever(document_store=None, embedding_model="dpr-bert-base-nq", gpu=False) - emb = retriever.embed_passages(passage)[0] - assert(emb.shape[0] == 768) - assert(emb[0]-0.52872 < 0.001) +from haystack.database.memory import InMemoryDocumentStore +from haystack.retriever.dense import DensePassageRetriever def test_dpr_inmemory_retrieval(): - - from haystack.database.memory import InMemoryDocumentStore - from haystack.retriever.dense import DensePassageRetriever - document_store = InMemoryDocumentStore(embedding_field="embedding") documents = [ @@ -27,8 +15,13 @@ def test_dpr_inmemory_retrieval(): embedded = [] for doc in documents: - doc['embedding'] = retriever.embed_passages([doc['text']])[0] + embedding = retriever.embed_passages([doc['text']])[0] + doc['embedding'] = embedding embedded.append(doc) + + assert (embedding.shape[0] == 768) + assert (embedding[0] - 0.52872 < 0.001) + document_store.write_documents(embedded) res = retriever.retrieve(query="Which philosopher attacked Schopenhauer?") diff --git a/test/test_faq_retriever.py b/test/test_faq_retriever.py index ed931ce6d..9e54dc675 100644 --- a/test/test_faq_retriever.py +++ b/test/test_faq_retriever.py @@ -9,17 +9,17 @@ def test_faq_retriever_in_memory_store(): document_store = InMemoryDocumentStore(embedding_field="embedding") documents = [ - {'name': 'How to test this library?', 'text': 'By running tox in the command line!', 'meta': {'question': 'How to test this library?'}}, - {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}}, - {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}}, - {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}}, - {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}}, - {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}}, - {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}}, - {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}}, - {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}}, - {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}}, - {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}}, + {'text': 'By running tox in the command line!', 'meta': {'name': 'How to test this library?', 'question': 'How to test this library?'}}, + {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}}, + {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}}, + {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}}, + {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}}, + {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}}, + {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}}, + {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}}, + {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}}, + {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}}, + {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}}, ] retriever = EmbeddingRetriever(document_store=document_store, embedding_model="deepset/sentence_bert", gpu=False) diff --git a/test/test_in_memory_store.py b/test/test_in_memory_store.py index 9b8953f6a..dd5bac7d6 100644 --- a/test/test_in_memory_store.py +++ b/test/test_in_memory_store.py @@ -5,9 +5,9 @@ from haystack.retriever.sparse import TfidfRetriever def test_finder_get_answers_with_in_memory_store(): test_docs = [ - {"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}}, - {"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2", 'meta': {'url': 'url'}}, - {"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}} + {"text": "testing the finder with pyhton unit test 1", 'meta': {"name": "testing the finder 1", 'url': 'url'}}, + {"text": "testing the finder with pyhton unit test 2", 'meta': {"name": "testing the finder 2", 'url': 'url'}}, + {"text": "testing the finder with pyhton unit test 3", 'meta': {"name": "testing the finder 3", 'url': 'url'}} ] from haystack.database.memory import InMemoryDocumentStore @@ -25,9 +25,9 @@ def test_finder_get_answers_with_in_memory_store(): def test_memory_store_get_by_tags(): test_docs = [ - {"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}}, - {"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2", 'meta': {'url': None}}, - {"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}} + {"text": "testing the finder with pyhton unit test 1", 'meta': {"name": "testing the finder 1", 'url': 'url'}}, + {"text": "testing the finder with pyhton unit test 2", 'meta': {"name": "testing the finder 2", 'url': None}}, + {"text": "testing the finder with pyhton unit test 3", 'meta': {"name": "testing the finder 3", 'url': 'url'}} ] from haystack.database.memory import InMemoryDocumentStore @@ -41,9 +41,9 @@ def test_memory_store_get_by_tags(): def test_memory_store_get_by_tag_lists_union(): test_docs = [ - {"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}, 'tags': [{'tag2': ["1"]}]}, - {"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2", 'meta': {'url': None}, 'tags': [{'tag1': ['1']}]}, - {"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}, 'tags': [{'tag2': ["1", "2"]}]} + {"text": "testing the finder with pyhton unit test 1", 'meta': {"name": "testing the finder 1", 'url': 'url'}, 'tags': [{'tag2': ["1"]}]}, + {"text": "testing the finder with pyhton unit test 2", 'meta': {"name": "testing the finder 2", 'url': None}, 'tags': [{'tag1': ['1']}]}, + {"text": "testing the finder with pyhton unit test 3", 'meta': {"name": "testing the finder 3", 'url': 'url'}, 'tags': [{'tag2': ["1", "2"]}]} ] from haystack.database.memory import InMemoryDocumentStore @@ -53,14 +53,14 @@ def test_memory_store_get_by_tag_lists_union(): docs = document_store.get_document_ids_by_tags({'tag2': ["1"]}) assert docs == [ - {'name': 'testing the finder 1', 'text': 'testing the finder with pyhton unit test 1', 'meta': {'url': 'url'}, 'tags': [{'tag2': ['1']}]}, - {'name': 'testing the finder 3', 'text': 'testing the finder with pyhton unit test 3', 'meta': {'url': 'url'}, 'tags': [{'tag2': ['1', '2']}]} + {'text': 'testing the finder with pyhton unit test 1', 'meta': {'name': 'testing the finder 1', 'url': 'url'}, 'tags': [{'tag2': ['1']}]}, + {'text': 'testing the finder with pyhton unit test 3', 'meta': {'name': 'testing the finder 3', 'url': 'url'}, 'tags': [{'tag2': ['1', '2']}]} ] def test_memory_store_get_by_tag_lists_non_existent_tag(): test_docs = [ - {"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}, 'tags': [{'tag1': ["1"]}]}, + {"text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url', "name": "testing the finder 1"}, 'tags': [{'tag1': ["1"]}]}, ] from haystack.database.memory import InMemoryDocumentStore document_store = InMemoryDocumentStore() @@ -71,10 +71,10 @@ def test_memory_store_get_by_tag_lists_non_existent_tag(): def test_memory_store_get_by_tag_lists_disjoint(): test_docs = [ - {"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}, 'tags': [{'tag1': ["1"]}]}, - {"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2", 'meta': {'url': None}, 'tags': [{'tag2': ['1']}]}, - {"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}, 'tags': [{'tag3': ["1", "2"]}]}, - {"name": "testing the finder 4", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}, 'tags': [{'tag3': ["1", "3"]}]} + {"text": "testing the finder with pyhton unit test 1", 'meta': {"name": "testing the finder 1", 'url': 'url'}, 'tags': [{'tag1': ["1"]}]}, + {"text": "testing the finder with pyhton unit test 2", 'meta': {"name": "testing the finder 2", 'url': None}, 'tags': [{'tag2': ['1']}]}, + {"text": "testing the finder with pyhton unit test 3", 'meta': {"name": "testing the finder 3", 'url': 'url'}, 'tags': [{'tag3': ["1", "2"]}]}, + {"text": "testing the finder with pyhton unit test 3", 'meta': {"name": "testing the finder 4", 'url': 'url'}, 'tags': [{'tag3': ["1", "3"]}]} ] from haystack.database.memory import InMemoryDocumentStore @@ -83,4 +83,4 @@ def test_memory_store_get_by_tag_lists_disjoint(): docs = document_store.get_document_ids_by_tags({'tag3': ["3"]}) - assert docs == [{'name': 'testing the finder 4', 'text': 'testing the finder with pyhton unit test 3', 'meta': {'url': 'url'}, 'tags': [{'tag3': ['1', '3']}]}] + assert docs == [{'text': 'testing the finder with pyhton unit test 3', 'meta': {'name': 'testing the finder 4', 'url': 'url'}, 'tags': [{'tag3': ['1', '3']}]}] diff --git a/test/test_reader.py b/test/test_reader.py index ecad69526..b0482282e 100644 --- a/test/test_reader.py +++ b/test/test_reader.py @@ -12,7 +12,7 @@ def test_reader_basic(reader): def test_output(reader, test_docs_xs): docs = [] for d in test_docs_xs: - doc = Document(id=d["name"], text=d["text"], meta=d["meta"]) + doc = Document(id=d["meta"]["name"], text=d["text"], meta=d["meta"]) docs.append(doc) results = reader.predict(question="Who lives in Berlin?", documents=docs, top_k=5) assert results is not None diff --git a/test/test_tfidf_retriever.py b/test/test_tfidf_retriever.py index 126e4ff80..fffcdde03 100644 --- a/test/test_tfidf_retriever.py +++ b/test/test_tfidf_retriever.py @@ -16,4 +16,13 @@ def test_tfidf_retriever(): retriever = TfidfRetriever(document_store) retriever.fit() - assert retriever.retrieve("godzilla", top_k=1) == [Document(id='0', text='godzilla says hello', external_source_id=None, question=None, query_score=None, meta={})] + assert retriever.retrieve("godzilla", top_k=1) == [ + Document( + id='0', + text='godzilla says hello', + external_source_id=None, + question=None, + query_score=None, + meta={"name": "testing the finder 1"}, + ) + ]