From 6aaf69d3f40f42e4bdd4abd5883d371a1ba1fbd5 Mon Sep 17 00:00:00 2001
From: Malte Pietsch <malte.pietsch@deepset.ai>
Date: Tue, 2 Jun 2020 17:59:40 +0200
Subject: [PATCH] Add custom port to ElasticsearchDocumentStore and remove
 outdated tag_fields arg (#129)

---
 haystack/api/config.py               |  1 +
 haystack/api/elasticsearch_client.py |  4 +-
 haystack/database/elasticsearch.py   | 66 +++++++++++++++++++---------
 3 files changed, 49 insertions(+), 22 deletions(-)

diff --git a/haystack/api/config.py b/haystack/api/config.py
index 93f76dcca..f5e73e4df 100644
--- a/haystack/api/config.py
+++ b/haystack/api/config.py
@@ -9,6 +9,7 @@ CONCURRENT_REQUEST_PER_WORKER = int(os.getenv("CONCURRENT_REQUEST_PER_WORKER", 4
 
 # DB
 DB_HOST = os.getenv("DB_HOST", "localhost")
+DB_PORT = int(os.getenv("DB_PORT", 9200))
 DB_USER = os.getenv("DB_USER", "")
 DB_PW = os.getenv("DB_PW", "")
 DB_INDEX = os.getenv("DB_INDEX", "document")
diff --git a/haystack/api/elasticsearch_client.py b/haystack/api/elasticsearch_client.py
index f56aaa38b..94531a963 100644
--- a/haystack/api/elasticsearch_client.py
+++ b/haystack/api/elasticsearch_client.py
@@ -1,7 +1,7 @@
 from elasticsearch import Elasticsearch
 
-from haystack.api.config import DB_HOST, DB_USER, DB_PW
+from haystack.api.config import DB_HOST, DB_USER, DB_PW, DB_PORT, ES_CONN_SCHEME
 
 elasticsearch_client = Elasticsearch(
-    hosts=[{"host": DB_HOST}], http_auth=(DB_USER, DB_PW), scheme="http", ca_certs=False, verify_certs=False
+    hosts=[{"host": DB_HOST, "port": DB_PORT}], http_auth=(DB_USER, DB_PW), scheme=ES_CONN_SCHEME, ca_certs=False, verify_certs=False
 )
diff --git a/haystack/database/elasticsearch.py b/haystack/database/elasticsearch.py
index 42e2a6741..c11bb7b1e 100644
--- a/haystack/database/elasticsearch.py
+++ b/haystack/database/elasticsearch.py
@@ -1,7 +1,7 @@
 import json
 import logging
 from string import Template
-
+from typing import Union
 from elasticsearch import Elasticsearch
 from elasticsearch.helpers import bulk, scan
 
@@ -13,25 +13,52 @@ logger = logging.getLogger(__name__)
 class ElasticsearchDocumentStore(BaseDocumentStore):
     def __init__(
         self,
-        host="localhost",
-        username="",
-        password="",
-        index="document",
-        search_fields="text",
-        text_field="text",
-        name_field="name",
-        external_source_id_field="external_source_id",
-        tag_fields=None,
-        embedding_field=None,
-        embedding_dim=None,
-        custom_mapping=None,
-        excluded_meta_data=None,
-        scheme="http",
-        ca_certs=False,
-        verify_certs=True,
-        create_index=True
+        host: str = "localhost",
+        port: int = 9200,
+        username: str = "",
+        password: str = "",
+        index: str = "document",
+        search_fields: Union[str,list] = "text",
+        text_field: str = "text",
+        name_field: str = "name",
+        external_source_id_field: str = "external_source_id",
+        embedding_field: str = None,
+        embedding_dim: str = None,
+        custom_mapping: dict = None,
+        excluded_meta_data: list = None,
+        scheme: str = "http",
+        ca_certs: bool = False,
+        verify_certs: bool = True,
+        create_index: bool = True
     ):
-        self.client = Elasticsearch(hosts=[{"host": host}], http_auth=(username, password),
+        """
+        A DocumentStore using Elasticsearch to store and query the documents for our search.
+
+            * Keeps all the logic to store and query documents from Elastic, incl. mapping of fields, adding filters or boosts to your queries, and storing embeddings
+            * You can either use an existing Elasticsearch index or create a new one via haystack
+            * Retrievers operate on top of this DocumentStore to find the relevant documents for a query
+
+        :param host: url of elasticsearch
+        :param port: port of elasticsearch
+        :param username: username
+        :param password: password
+        :param index: Name of index in elasticsearch to use. If not existing yet, we will create one.
+        :param search_fields: Name of fields used by ElasticsearchRetriever to find matches in the docs to our incoming query (using elastic's multi_match query), e.g. ["title", "full_text"]
+        :param text_field: Name of field that might contain the answer and will therefore be passed to the Reader Model (e.g. "full_text").
+                           If no Reader is used (e.g. in FAQ-Style QA) the plain content of this field will just be returned.
+        :param name_field: Name of field that contains the title of the the doc
+        :param external_source_id_field: If you have an external id (= non-elasticsearch) that identifies your documents, you can specify it here.
+        :param embedding_field: Name of field containing an embedding vector (Only needed when using the EmbeddingRetriever on top)
+        :param embedding_dim: Dimensionality of embedding vector (Only needed when using the EmbeddingRetriever on top)
+        :param custom_mapping: If you want to use your own custom mapping for creating a new index in Elasticsearch, you can supply it here as a dictionary.
+        :param excluded_meta_data: Name of fields in Elasticsearch that should not be returned (e.g. [field_one, field_two]).
+                                   Helpful if you have fields with long, irrelevant content that you don't want to display in results (e.g. embedding vectors).
+        :param scheme: 'https' or 'http', protocol used to connect to your elasticsearch instance
+        :param ca_certs: Root certificates for SSL
+        :param verify_certs: Whether to be strict about ca certificates
+        :param create_index: Whether to try creating a new index (If the index of that name is already existing, we will just continue in any case)
+        """
+        self.client = Elasticsearch(hosts=[{"host": host, "port": port}], http_auth=(username, password),
                                     scheme=scheme, ca_certs=ca_certs, verify_certs=verify_certs)
 
         # if no custom_mapping is supplied, use the default mapping
@@ -62,7 +89,6 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
         self.search_fields = search_fields
         self.text_field = text_field
         self.name_field = name_field
-        self.tag_fields = tag_fields
         self.external_source_id_field = external_source_id_field
         self.embedding_field = embedding_field
         self.excluded_meta_data = excluded_meta_data