diff --git a/README.rst b/README.rst
index 88d08c4d2..e51702180 100644
--- a/README.rst
+++ b/README.rst
@@ -65,21 +65,26 @@ Usage
 
 .. image:: https://raw.githubusercontent.com/deepset-ai/haystack/master/docs/img/code_snippet_usage.png
 
-Configuration
--------------
-The configuration can be supplied in a :code:`qa_config.py` placed in the PYTHONPATH. Alternatively, the :code:`DATABASE_URL` can also be set as an environment variable.
-
 Deployment
 ==========
 
-SQL Backend
------------
-The database ORM layer is implemented using SQLAlchemy library. By default, it uses the file-based SQLite database. For large scale deployments, the configuration can be changed to use other compatible databases like PostgreSQL or MySQL.
+Haystack has an extensible document store layer.
+There are currently implementations for Elasticsearch and SQL (see :code:`haystack.database.elasticsearch.ElasticsearchDocumentStore` and :code:`haystack.database.sql.SQLDocumentStore`).
 
 Elasticsearch Backend
-----------------------
-(Coming soon)
+---------------------
+Elasticsearch is recommended for large-scale deployments. The documents can optionally be chunked into smaller units (e.g., paragraphs) before indexing to make the results returned by the Retriever more granular and accurate.
+Retrievers can access an Elasticsearch index to find the relevant paragraphs (or documents) for a query. The default :code:`ElasticsearchRetriever` uses Elasticsearch's native scoring (BM25), but can be extended easily with custom implementations.
+
+You can get started by running a single Elasticsearch node using Docker::
+
+    docker run -d -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" elasticsearch:7.5.1
+
+SQL Backend
+-----------
+The SQL backend is mainly meant to simplify the first development steps. By default, a local file-based SQLite database is initialized.
+However, if you prefer a PostgreSQL or MySQL backend for production, you can easily switch since our implementation is based on SQLAlchemy.
 
 REST API
 --------
 
diff --git a/docs/img/code_snippet_usage.png b/docs/img/code_snippet_usage.png
index 89a280c8f..a10d49153 100644
Binary files a/docs/img/code_snippet_usage.png and b/docs/img/code_snippet_usage.png differ
diff --git a/haystack/__init__.py b/haystack/__init__.py
index e230e7420..916fe5259 100644
--- a/haystack/__init__.py
+++ b/haystack/__init__.py
@@ -1,8 +1,7 @@
-from haystack.retriever.tfidf import TfidfRetriever
-from haystack.reader.farm import FARMReader
 import logging
 
 import pandas as pd
+
 pd.options.display.max_colwidth = 80
 
 logger = logging.getLogger(__name__)
@@ -21,8 +20,6 @@ class Finder:
 
     def __init__(self, reader, retriever):
        self.retriever = retriever
-        self.retriever.fit()
-
        self.reader = reader
 
    def get_answers(self, question, top_k_reader=1, top_k_retriever=10, filters=None):
@@ -39,7 +36,7 @@ class Finder:
 
         # 1) Optional: reduce the search space via document tags
         if filters:
-            candidate_doc_ids = self.retriever.datastore.get_document_ids_by_tags(filters)
+            candidate_doc_ids = self.retriever.document_store.get_document_ids_by_tags(filters)
         else:
             candidate_doc_ids = None
 
diff --git a/haystack/database/base.py b/haystack/database/base.py
index 4ad69ca6b..56f5271a2 100644
--- a/haystack/database/base.py
+++ b/haystack/database/base.py
@@ -3,7 +3,7 @@ from abc import abstractmethod
 
 class BaseDocumentStore:
     """
-    Base class for implementing DataStores.
+    Base class for implementing Document Stores.
""" @abstractmethod diff --git a/haystack/database/elasticsearch.py b/haystack/database/elasticsearch.py new file mode 100644 index 000000000..7fb9d23ab --- /dev/null +++ b/haystack/database/elasticsearch.py @@ -0,0 +1,85 @@ +from elasticsearch import Elasticsearch +from elasticsearch_dsl import Search, Document as ESDoc, Text, connections +from haystack.database.base import BaseDocumentStore + + +class Document(ESDoc): + name = Text() + text = Text() + tags = Text() + + class Index: + name = "document" + + +class ElasticsearchDocumentStore(BaseDocumentStore): + def __init__(self, host="localhost", username="", password="", index="document"): + self.client = Elasticsearch(hosts=[{"host": host}], http_auth=(username, password)) + self.connections = connections.create_connection(hosts=[{"host": host}], http_auth=(username, password)) + Document.init() # create mapping if not exists. + self.index = index + + def get_document_by_id(self, id): + query = {"filter": {"term": {"_id": id}}} + result = self.client.search(index=self.index, body=query)["hits"]["hits"] + if result: + document = {"id": result["_id"], "name": result["name"], "text": result["text"]} + else: + document = None + return document + + def get_document_ids_by_tags(self, tags): + query = { + "query": { + "bool": { + "should": [ + { + "terms": { + "tags": tags + } + } + ] + } + } + } + result = self.client.search(index=self.index, body=query)["hits"]["hits"] + documents = [] + for hit in result: + documents.append({"id": hit["_id"], "name": hit["name"], "text": hit["text"]}) + return documents + + def write_documents(self, documents): + for doc in documents: + d = Document( + name=doc["name"], + text=doc["text"], + document_id=doc.get("document_id", None), + tags=doc.get("tags", None), + ) + d.save() + + def get_document_count(self): + s = Search(using=self.client, index=self.index) + return s.count() + + def get_all_documents(self): + search = Search(using=self.client, index=self.index) + documents = [] + for hit in search: + documents.append( + { + "id": hit.meta["id"], + "name": hit["name"], + "text": hit["text"], + } + ) + return documents + + def query(self, query, top_k=10): + search = Search(using=self.client, index=self.index).query("match", text=query)[:top_k].execute() + paragraphs = [] + meta_data = [] + for hit in search: + paragraphs.append(hit["text"]) + meta_data.append({"paragraph_id": hit.meta["id"], "document_id": hit["document_id"]}) + return paragraphs, meta_data diff --git a/haystack/database/sql.py b/haystack/database/sql.py index 4101e447f..466172dc1 100644 --- a/haystack/database/sql.py +++ b/haystack/database/sql.py @@ -83,9 +83,8 @@ class SQLDocumentStore(BaseDocumentStore): GROUP BY dt.document_id """ tag_filters = [] - for tag, value in tags.items(): - if value: - tag_filters.append(f"SUM(CASE WHEN t.value='{value}' THEN 1 ELSE 0 END) > 0") + for tag in tags: + tag_filters.append(f"SUM(CASE WHEN t.value='{tag}' THEN 1 ELSE 0 END) > 0") final_query = f"{query} HAVING {' AND '.join(tag_filters)});" query_results = self.session.execute(final_query) diff --git a/haystack/indexing/io.py b/haystack/indexing/io.py index cae24a0c3..0c8f22b2e 100644 --- a/haystack/indexing/io.py +++ b/haystack/indexing/io.py @@ -8,7 +8,7 @@ import zipfile logger = logging.getLogger(__name__) -def write_documents_to_db(datastore, document_dir, clean_func=None, only_empty_db=False): +def write_documents_to_db(document_store, document_dir, clean_func=None, only_empty_db=False, split_paragrahs=False): """ Write all text files(.txt) 
@@ -22,28 +22,42 @@ def write_documents_to_db(datastore, document_dir, clean_func=None, only_empty_d
 
     # check if db has already docs
     if only_empty_db:
-        n_docs = datastore.get_document_count()
+        n_docs = document_store.get_document_count()
         if n_docs > 0:
             logger.info(f"Skip writing documents since DB already contains {n_docs} docs ... "
                         "(Disable `only_empty_db`, if you want to add docs anyway.)")
             return None
 
     # read and add docs
-    documents_to_write = []
+    docs_to_index = []
+    doc_id = 1
     for path in file_paths:
         with open(path) as doc:
             text = doc.read()
             if clean_func:
                 text = clean_func(text)
 
-            documents_to_write.append(
-                {
-                    "name": path.name,
-                    "text": text,
-                }
-            )
-    datastore.write_documents(documents_to_write)
-    logger.info(f"Wrote {len(documents_to_write)} docs to DB")
+            if split_paragraphs:
+                for para in text.split("\n\n"):
+                    if not para.strip():  # skip empty paragraphs
+                        continue
+                    docs_to_index.append(
+                        {
+                            "name": path.name,
+                            "text": para,
+                            "document_id": doc_id
+                        }
+                    )
+                    doc_id += 1
+            else:
+                docs_to_index.append(
+                    {
+                        "name": path.name,
+                        "text": text,
+                    }
+                )
+    document_store.write_documents(docs_to_index)
+    logger.info(f"Wrote {len(docs_to_index)} docs to DB")
 
 
 def fetch_archive_from_http(url, output_dir, proxies=None):
diff --git a/haystack/retriever/base.py b/haystack/retriever/base.py
new file mode 100644
index 000000000..ce3c29cb9
--- /dev/null
+++ b/haystack/retriever/base.py
@@ -0,0 +1,7 @@
+from abc import ABC, abstractmethod
+
+
+class BaseRetriever(ABC):
+    @abstractmethod
+    def retrieve(self, query, candidate_doc_ids=None, top_k=1):
+        pass
diff --git a/haystack/retriever/elasticsearch.py b/haystack/retriever/elasticsearch.py
new file mode 100644
index 000000000..d80ef6a72
--- /dev/null
+++ b/haystack/retriever/elasticsearch.py
@@ -0,0 +1,9 @@
+from haystack.retriever.base import BaseRetriever
+
+
+class ElasticsearchRetriever(BaseRetriever):
+    def __init__(self, document_store):
+        self.document_store = document_store
+
+    def retrieve(self, query, candidate_doc_ids=None, top_k=10):
+        return self.document_store.query(query, top_k)
diff --git a/haystack/retriever/tfidf.py b/haystack/retriever/tfidf.py
index 9b5b0c9ef..27a9d4279 100644
--- a/haystack/retriever/tfidf.py
+++ b/haystack/retriever/tfidf.py
@@ -1,7 +1,8 @@
-from abc import ABC, abstractmethod
-from collections import OrderedDict, namedtuple
 import logging
+from collections import OrderedDict, namedtuple
+
 import pandas as pd
+from haystack.retriever.base import BaseRetriever
 from sklearn.feature_extraction.text import TfidfVectorizer
 
 
@@ -11,20 +12,6 @@ logger = logging.getLogger(__name__)
 Paragraph = namedtuple("Paragraph", ["paragraph_id", "document_id", "text"])
 
 
-class BaseRetriever(ABC):
-    @abstractmethod
-    def _get_all_paragraphs(self):
-        pass
-
-    @abstractmethod
-    def retrieve(self, query, candidate_doc_ids=None, top_k=1):
-        pass
-
-    @abstractmethod
-    def fit(self):
-        pass
-
-
 class TfidfRetriever(BaseRetriever):
     """
     Read all documents from a SQL backend.
@@ -35,7 +22,7 @@ class TfidfRetriever(BaseRetriever):
     It uses sklearn's TfidfVectorizer to compute a tf-idf matrix.
""" - def __init__(self, datastore): + def __init__(self, document_store): self.vectorizer = TfidfVectorizer( lowercase=True, stop_words=None, @@ -43,7 +30,7 @@ class TfidfRetriever(BaseRetriever): ngram_range=(1, 1), ) - self.datastore = datastore + self.document_store = document_store self.paragraphs = self._get_all_paragraphs() self.df = None self.fit() @@ -52,12 +39,11 @@ class TfidfRetriever(BaseRetriever): """ Split the list of documents in paragraphs """ - documents = self.datastore.get_all_documents() + documents = self.document_store.get_all_documents() paragraphs = [] p_id = 0 for doc in documents: - _pgs = [d for d in doc["text"].splitlines() if d.strip()] for p in doc["text"].split("\n\n"): if not p.strip(): # skip empty paragraphs continue diff --git a/qa_config.py b/qa_config.py deleted file mode 100644 index 39862eb9b..000000000 --- a/qa_config.py +++ /dev/null @@ -1 +0,0 @@ -DATABASE_URL = "sqlite:///qa.db" diff --git a/test/test_db.py b/test/test_db.py index c980c8f4e..a96694e1c 100644 --- a/test/test_db.py +++ b/test/test_db.py @@ -3,9 +3,9 @@ from haystack.indexing.io import write_documents_to_db def test_db_write_read(): - sql_datastore = SQLDocumentStore() - write_documents_to_db(datastore=sql_datastore, document_dir="samples/docs") - documents = sql_datastore.get_all_documents() + sql_document_store = SQLDocumentStore() + write_documents_to_db(document_store=sql_document_store, document_dir="samples/docs") + documents = sql_document_store.get_all_documents() assert len(documents) == 2 - doc = sql_datastore.get_document_by_id("1") + doc = sql_document_store.get_document_by_id("1") assert doc.keys() == {"id", "name", "text", "tags"} diff --git a/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb b/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb index f4839e7ea..8bdbccbe2 100644 --- a/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb +++ b/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb @@ -86,12 +86,12 @@ "# The documents can be stored in different types of \"DocumentStores\".\n", "# For dev we suggest a light-weight SQL DB\n", "# For production we suggest elasticsearch\n", - "datastore = SQLDocumentStore(url=\"sqlite:///qa.db\")\n", + "document_store = SQLDocumentStore(url=\"sqlite:///qa.db\")\n", "\n", "# Now, let's write the docs to our DB.\n", "# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)\n", "# It must take a str as input, and return a str.\n", - "write_documents_to_db(datastore=datastore, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True)" + "write_documents_to_db(document_store=document_store, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True)" ] }, { @@ -122,7 +122,7 @@ "source": [ "# A retriever identifies the k most promising chunks of text that might contain the answer for our question\n", "# Retrievers use some simple but fast algorithm, here: TF-IDF\n", - "retriever = TfidfRetriever(datastore=datastore)" + "retriever = TfidfRetriever(document_store=document_store)" ] }, { diff --git a/tutorials/Tutorial1_Basic_QA_Pipeline.py b/tutorials/Tutorial1_Basic_QA_Pipeline.py index 410d118e9..52ed87b00 100755 --- a/tutorials/Tutorial1_Basic_QA_Pipeline.py +++ b/tutorials/Tutorial1_Basic_QA_Pipeline.py @@ -20,18 +20,18 @@ fetch_archive_from_http(url=s3_url, output_dir=doc_dir) # The documents can be stored in different types of "DocumentStores". 
 # For dev we suggest a light-weight SQL DB
 # For production we suggest elasticsearch
-datastore = SQLDocumentStore(url="sqlite:///qa.db")
+document_store = SQLDocumentStore(url="sqlite:///qa.db")
 
 # Now, let's write the docs to our DB.
 # You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
 # It must take a str as input, and return a str.
-write_documents_to_db(datastore=datastore, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True)
+write_documents_to_db(document_store=document_store, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True)
 
 ## Initalize Reader, Retriever & Finder
 
 # A retriever identifies the k most promising chunks of text that might contain the answer for our question
 # Retrievers use some simple but fast algorithm, here: TF-IDF
-retriever = TfidfRetriever(datastore=datastore)
+retriever = TfidfRetriever(document_store=document_store)
 
 # A reader scans the text chunks in detail and extracts the k best answers
 # Reader use more powerful but slower deep learning models
diff --git a/tutorials/Tutorial2_Finetune_a_model_on_your_data.py b/tutorials/Tutorial2_Finetune_a_model_on_your_data.py
index 9345cc21c..84dfe485c 100755
--- a/tutorials/Tutorial2_Finetune_a_model_on_your_data.py
+++ b/tutorials/Tutorial2_Finetune_a_model_on_your_data.py
@@ -27,14 +27,14 @@ fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
 
 
 # Init Document store & write docs to it
-datastore = SQLDocumentStore(url="sqlite:///qa.db")
-write_documents_to_db(datastore=datastore, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True)
+document_store = SQLDocumentStore(url="sqlite:///qa.db")
+write_documents_to_db(document_store=document_store, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True)
 
 ## Initalize Reader, Retriever & Finder
 
 # A retriever identifies the k most promising chunks of text that might contain the answer for our question
 # Retrievers use some simple but fast algorithm, here: TF-IDF
-retriever = TfidfRetriever(datastore=datastore)
+retriever = TfidfRetriever(document_store=document_store)
 
 # The Finder sticks together retriever and retriever in a pipeline to answer our actual questions
 finder = Finder(reader, retriever)
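
For reference, a minimal sketch (not part of the diff itself) of how the pieces introduced above fit together: the new ElasticsearchDocumentStore, the new ElasticsearchRetriever, and write_documents_to_db with paragraph splitting. It assumes an Elasticsearch node started with the Docker command from the README; the document directory and query string are only illustrative, and the commented-out Finder/reader lines follow the pattern of the existing tutorials rather than an API confirmed by this diff::

    from haystack.database.elasticsearch import ElasticsearchDocumentStore
    from haystack.indexing.io import write_documents_to_db
    from haystack.retriever.elasticsearch import ElasticsearchRetriever

    # Index local .txt files; split_paragraphs=True writes one entry per paragraph,
    # which makes the retrieved units more granular.
    document_store = ElasticsearchDocumentStore(host="localhost", index="document")
    write_documents_to_db(document_store=document_store, document_dir="samples/docs", split_paragraphs=True)

    # BM25-based retrieval against the Elasticsearch index (illustrative query).
    retriever = ElasticsearchRetriever(document_store=document_store)
    paragraphs, meta_data = retriever.retrieve("Who wrote the sample documents?", top_k=3)

    # To extract answers, combine the retriever with a reader via the Finder,
    # as in the existing tutorials (reader construction omitted here):
    # from haystack import Finder
    # finder = Finder(reader, retriever)
    # prediction = finder.get_answers(question="...", top_k_retriever=10)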