Add minimal example for ES (#19)

Malte Pietsch 2020-02-10 18:10:18 +01:00 committed by GitHub
parent b009ec24ef
commit d33ef9c345
3 changed files with 42 additions and 4 deletions

haystack/indexing/io.py

@@ -8,7 +8,7 @@ import zipfile
 
 logger = logging.getLogger(__name__)
 
 
-def write_documents_to_db(document_store, document_dir, clean_func=None, only_empty_db=False, split_paragrahs=False):
+def write_documents_to_db(document_store, document_dir, clean_func=None, only_empty_db=False, split_paragraphs=False):
     """
     Write all text files(.txt) in the sub-directories of the given path to the connected database.
@@ -37,7 +37,7 @@ def write_documents_to_db(document_store, document_dir, clean_func=None, only_empty_db=False, split_paragrahs=False):
 
             if clean_func:
                 text = clean_func(text)
-            if split_paragrahs:
+            if split_paragraphs:
                 for para in text.split("\n\n"):
                     if not para.strip():  # skip empty paragraphs
                         continue
@@ -54,6 +54,7 @@ def write_documents_to_db(document_store, document_dir, clean_func=None, only_empty_db=False, split_paragrahs=False):
                 {
                     "name": path.name,
                     "text": text,
+                    "document_id": doc_id
                 }
             )
     document_store.write_documents(docs_to_index)
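For context, a minimal standalone sketch of the indexing change above: each non-empty "\n\n"-separated paragraph becomes its own document dict, now carrying a document_id. The enumerate-based doc_id and the sample strings are illustrative assumptions; the hunk only shows the field being written.

text = "Winterfell is the seat of House Stark.\n\nArya Stark trains in Braavos.\n\n"
docs_to_index = []
for doc_id, para in enumerate(text.split("\n\n")):
    if not para.strip():  # skip empty paragraphs, as in the diff
        continue
    docs_to_index.append({"name": "example.txt", "text": para, "document_id": doc_id})
print(docs_to_index)  # two dicts; the trailing empty chunk is dropped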

requirements.txt

@@ -1,9 +1,10 @@
 # FARM (incl. transformers 2.3.0 with pipelines)
-#farm -e git+https://github.com/deepset-ai/FARM.git@1d30237b037050ef0ac5516f427443cdd18a4d43
--e git://github.com/deepset-ai/FARM.git@e6224bd87ee50f3ff8bb23415e7f3a1b4793a257#egg=farm
+farm==0.4.1
 fastapi
 uvicorn
 flask_sqlalchemy
 pandas
 psycopg2-binary
 sklearn
+elasticsearch
+elasticsearch_dsl
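The two new entries add the official Elasticsearch Python clients used by the document store in the tutorial below. As a quick sanity check before indexing, assuming the single-node Docker instance from the tutorial is running on the client's default localhost:9200:

from elasticsearch import Elasticsearch

es = Elasticsearch(["http://localhost:9200"])
print(es.ping())  # True once the cluster is reachable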

View File

@@ -0,0 +1,36 @@
+from haystack import Finder
+from haystack.database.elasticsearch import ElasticsearchDocumentStore
+from haystack.indexing.cleaning import clean_wiki_text
+from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
+from haystack.reader.transformers import TransformersReader
+from haystack.retriever.elasticsearch import ElasticsearchRetriever
+from haystack.utils import print_answers
+
+# Our pipeline remains very similar to the one in Tutorial 1, where we had a SQL backend
+# (https://github.com/deepset-ai/haystack/blob/master/tutorials/Tutorial1_Basic_QA_Pipeline.py)
+# We therefore only highlight the three key differences here
+
+# Get documents (same as in Tutorial 1)
+doc_dir = "data/article_txt_got"
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
+fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
+
+# Difference 1: Initialize a document store for Elasticsearch
+# This requires a running Elasticsearch instance. To run one locally you can execute:
+# docker run -d -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" elasticsearch:7.5.1
+document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")
+
+# Difference 2: Split docs into paragraphs before indexing (set split_paragraphs=True)
+write_documents_to_db(document_store=document_store, document_dir=doc_dir, clean_func=clean_wiki_text,
+                      only_empty_db=True, split_paragraphs=True)
+
+# Difference 3: Use the native Elasticsearch implementation of BM25 as a Retriever
+retriever = ElasticsearchRetriever(document_store=document_store)
+
+# Init reader & use Finder to get answers (same as in Tutorial 1)
+reader = TransformersReader(model="deepset/bert-base-cased-squad2", tokenizer="deepset/bert-base-cased-squad2", use_gpu=-1)
+finder = Finder(reader, retriever)
+
+prediction = finder.get_answers(question="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5)
+
+print_answers(prediction, details="minimal")
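For reference, the prediction returned by finder.get_answers() can also be inspected by hand instead of via print_answers; the key names below are an assumption based on Haystack's answer format of that era, not something this diff pins down.

# "answers" holds one dict per candidate; "answer" and "probability" are assumed keys
for answer in prediction["answers"]:
    print(answer["answer"], "| probability:", answer["probability"])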