Add basic tutorial for FAQ-based QA & batch comp. of embeddings (#98)

* Add basic tutorial for FAQ-based QA and switch to batch computation of embeddings * update readme & haystack version in tutorial
2026-01-06 03:57:19 +00:00 · 2020-05-07 10:19:26 +02:00 · 2020-05-07 10:19:26 +02:00 · a431a94b04
commit a431a94b04
parent f1b1793e29
6 changed files with 352 additions and 14 deletions
--- a/README.rst
+++ b/README.rst
@ -58,6 +58,7 @@ Resources
 - Tutorial 1  - Basic QA Pipeline: `Jupyter notebook  <https://github.com/deepset-ai/haystack/blob/master/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb>`__  or `Colab <https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb>`__
 - Tutorial 2  - Fine-tuning a model on own data: `Jupyter notebook <https://github.com/deepset-ai/haystack/blob/master/tutorials/Tutorial2_Finetune_a_model_on_your_data.ipynb>`__ or `Colab <https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial2_Finetune_a_model_on_your_data.ipynb>`__
 - Tutorial 3  - Basic QA Pipeline without Elasticsearch: `Jupyter notebook <https://github.com/deepset-ai/haystack/blob/master/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.ipynb>`__ or `Colab <https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.ipynb>`__
+- Tutorial 4  - FAQ-style QA: `Jupyter notebook <https://github.com/deepset-ai/haystack/blob/master/tutorials/Tutorial4_FAQ_style_QA.ipynb>`__ or `Colab <https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial4_FAQ_style_QA.ipynb>`__

 Quick Start
 ===========
--- a/haystack/database/base.py
+++ b/haystack/database/base.py
@ -36,7 +36,7 @@ class Document(BaseModel):
    )
    # name: Optional[str] = Field(None, description="Title of the document")
    question: Optional[str] = Field(None, description="Question text for FAQs.")
-    query_score: Optional[int] = Field(None, description="Elasticsearch query score for a retrieved document")
+    query_score: Optional[float] = Field(None, description="Elasticsearch query score for a retrieved document")
    meta: Optional[Dict[str, Optional[str]]] = Field(None, description="")

    def __getitem__(self, item):
--- a/haystack/finder.py
+++ b/haystack/finder.py
@ -85,7 +85,7 @@ class Finder:
        # 3) Format response
        for doc in documents:
            #TODO proper calibration of pseudo probabilities
-            cur_answer = {"question": doc.question, "answer": doc.text, "context": doc.text,
+            cur_answer = {"question": doc.meta["question"], "answer": doc.text, "context": doc.text,
                          "score": doc.query_score, "offset_start": 0, "offset_end": len(doc.text),
                          }
            if self.retriever.embedding_model:
--- a/haystack/retriever/elasticsearch.py
+++ b/haystack/retriever/elasticsearch.py
@ -74,7 +74,8 @@ class EmbeddingRetriever(BaseRetriever):
        logger.info(f"Init retriever using embeddings of model {embedding_model}")
        if model_format == "farm" or model_format == "transformers":
            self.embedding_model = Inferencer.load(
-                embedding_model, task_type="embeddings", gpu=gpu, batch_size=4, max_seq_len=512
+                embedding_model, task_type="embeddings", extraction_strategy=self.pooling_strategy,
+                extraction_layer=self.emb_extraction_layer, gpu=gpu, batch_size=4, max_seq_len=512, num_processes=0
            )

        elif model_format == "sentence_transformers":
@ -87,21 +88,28 @@ class EmbeddingRetriever(BaseRetriever):
            raise NotImplementedError

    def retrieve(self, query: str, candidate_doc_ids: [str] = None, top_k: int = 10) -> [Document]:
-        query_emb = self.create_embedding(query)
-        documents = self.document_store.query_by_embedding(query_emb, top_k, candidate_doc_ids)
+        query_emb = self.create_embedding(texts=[query])
+        documents = self.document_store.query_by_embedding(query_emb[0], top_k, candidate_doc_ids)

        return documents

-    def create_embedding(self, text):
+    def create_embedding(self, texts: [str]):
+        """
+        Create embeddings for each text in a list of texts using the retriever's model (`self.embedding_model`)
+        :param texts: texts to embed
+        :return: list of embeddings (one per input text). Each embedding is a list of floats.
+        """
+
+        # for backward compatibility: cast pure str input
+        if type(texts) == str:
+            texts = [texts]
+        assert type(texts) == list, "Expecting a list of texts, i.e. create_embeddings(texts=['text1',...])"
+
        if self.model_format == "farm":
-            res = self.embedding_model.extract_vectors(
-                dicts=[{"text": text}],
-                extraction_strategy=self.pooling_strategy,
-                extraction_layer=self.emb_extraction_layer,
-            )
-            emb = list(res[0]["vec"])
+            res = self.embedding_model.inference_from_dicts(dicts=[{"text": t} for t in texts])
+            emb = [list(r["vec"]) for r in res] #cast from numpy
        elif self.model_format == "sentence_transformers":
            # texts is already a list of strings, which is the input sentence-transformers expects
-            res = self.embedding_model.encode([text])  # get back list of numpy embedding vectors
-            emb = res[0].tolist()
+            res = self.embedding_model.encode(texts)  # get back list of numpy embedding vectors
+            emb = [list(r) for r in res] #cast from numpy
        return emb
--- a/tutorials/Tutorial4_FAQ_style_QA.ipynb
+++ b/tutorials/Tutorial4_FAQ_style_QA.ipynb
@ -0,0 +1,249 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## \"FAQ-Style QA\": Utilizing existing FAQs for Question Answering\n",
+    "\n",
+    "While *extractive Question Answering* works on pure texts and is therefore more generalizable, there's also a common alternative that utilizes existing FAQ data.\n",
+    "\n",
+    "Pros:\n",
+    "- Very fast at inference time\n",
+    "- Utilize existing FAQ data\n",
+    "- Quite good control over answers\n",
+    "\n",
+    "Cons:\n",
+    "- Generalizability: We can only answer questions that are similar to existing ones in FAQ\n",
+    "\n",
+    "In some use cases, a combination of extractive QA and FAQ-style can also be an interesting option.\n",
+    "\n",
+    "*Use this [link](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial4_FAQ_style_QA.ipynb) to open the notebook in Google Colab.*\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#TODO\n",
+    "! pip install git+git://github.com/deepset-ai/haystack.git@319e238f4652a05a95f02fa4cd19ef406440a789\n",
+    "#! pip install farm-haystack"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "pycharm": {
+     "is_executing": false
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from haystack import Finder\n",
+    "from haystack.database.elasticsearch import ElasticsearchDocumentStore\n",
+    "\n",
+    "from haystack.retriever.elasticsearch import EmbeddingRetriever\n",
+    "from haystack.utils import print_answers\n",
+    "import pandas as pd\n",
+    "import requests\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Start an Elasticsearch server\n",
+    "You can start Elasticsearch on your local machine using Docker. If Docker is not readily available in your environment (e.g., in Colab notebooks), then you can manually download and execute Elasticsearch from source."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Recommended: Start Elasticsearch using Docker\n",
+    "# ! docker run -d -p 9200:9200 -e \"discovery.type=single-node\" elasticsearch:7.6.2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# In Colab / No Docker environments: Start Elasticsearch from source\n",
+    "! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q\n",
+    "! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz\n",
+    "! chown -R daemon:daemon elasticsearch-7.6.2\n",
+    "\n",
+    "import os\n",
+    "from subprocess import Popen, PIPE, STDOUT\n",
+    "es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],\n",
+    "                   stdout=PIPE, stderr=STDOUT,\n",
+    "                   preexec_fn=lambda: os.setuid(1)  # as daemon\n",
+    "                  )\n",
+    "# wait until ES has started\n",
+    "! sleep 30\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "### Init the DocumentStore\n",
+    "In contrast to Tutorial 1 (extractive QA), we:\n",
+    "\n",
+    "* specify the name of our `text_field` in Elasticsearch that we want to return as an answer\n",
+    "* specify the name of our `embedding_field` in Elasticsearch where we'll store the embedding of our question and that is used later for calculating our similarity to the incoming user question\n",
+    "* set `excluded_meta_data=[\"question_emb\"]` so that we don't return the huge embedding vectors in our search results"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "04/28/2020 12:27:32 - INFO - elasticsearch -   PUT http://localhost:9200/document [status:400 request:0.010s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from haystack.database.elasticsearch import ElasticsearchDocumentStore\n",
+    "document_store = ElasticsearchDocumentStore(host=\"localhost\", username=\"\", password=\"\",\n",
+    "                                            index=\"document\",\n",
+    "                                            text_field=\"answer\",\n",
+    "                                            embedding_field=\"question_emb\",\n",
+    "                                            embedding_dim=768,\n",
+    "                                            excluded_meta_data=[\"question_emb\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "### Create a Retriever using embeddings\n",
+    "Instead of retrieving via Elasticsearch's plain BM25, we want to use vector similarity of the questions (user question vs. FAQ ones).\n",
+    "We can use the `EmbeddingRetriever` for this purpose and specify a model that we use for the embeddings."
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "retriever = EmbeddingRetriever(document_store=document_store, embedding_model=\"deepset/sentence_bert\", gpu=False)"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "### Prepare & Index FAQ data\n",
+    "We create a pandas dataframe containing some FAQ data (i.e. curated pairs of question + answer) and index those in Elasticsearch.\n",
+    "Here: We download some question-answer pairs related to COVID-19"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "# Download\n",
+    "temp = requests.get(\"https://raw.githubusercontent.com/deepset-ai/COVID-QA/master/data/faqs/faq_covidbert.csv\")\n",
+    "open('small_faq_covid.csv', 'wb').write(temp.content)\n",
+    "\n",
+    "# Get dataframe with columns \"question\", \"answer\" and some custom metadata\n",
+    "df = pd.read_csv(\"small_faq_covid.csv\")\n",
+    "# Minimal cleaning\n",
+    "df.fillna(value=\"\", inplace=True)\n",
+    "df[\"question\"] = df[\"question\"].apply(lambda x: x.strip())\n",
+    "print(df.head())\n",
+    "\n",
+    "# Get embeddings for our questions from the FAQs\n",
+    "questions = list(df[\"question\"].values)\n",
+    "df[\"question_emb\"] = retriever.create_embedding(texts=questions)\n",
+    "\n",
+    "# Convert Dataframe to list of dicts and index them in our DocumentStore\n",
+    "docs_to_index = df.to_dict(orient=\"records\")\n",
+    "document_store.write_documents(docs_to_index)"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "### Ask questions\n",
+    "Initialize a Finder (this time without a reader) and ask questions"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "finder = Finder(reader=None, retriever=retriever)\n",
+    "prediction = finder.get_answers_via_similar_questions(question=\"How is the virus spreading?\", top_k_retriever=10)\n",
+    "print_answers(prediction, details=\"all\")"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/tutorials/Tutorial4_FAQ_style_QA.py
+++ b/tutorials/Tutorial4_FAQ_style_QA.py
@ -0,0 +1,80 @@
+from haystack import Finder
+from haystack.database.elasticsearch import ElasticsearchDocumentStore
+
+from haystack.retriever.elasticsearch import EmbeddingRetriever
+from haystack.utils import print_answers
+import pandas as pd
+import requests
+import logging
+import subprocess
+import time
+## "FAQ-Style QA": Utilizing existing FAQs for Question Answering
+
+# While *extractive Question Answering* works on pure texts and is therefore more generalizable, there's also a common alternative that utilizes existing FAQ data.
+#
+# Pros:
+# - Very fast at inference time
+# - Utilize existing FAQ data
+# - Quite good control over answers
+#
+# Cons:
+# - Generalizability: We can only answer questions that are similar to existing ones in FAQ
+#
+# In some use cases, a combination of extractive QA and FAQ-style can also be an interesting option.
+LAUNCH_ELASTICSEARCH=True
+
+if LAUNCH_ELASTICSEARCH:
+    logging.info("Starting Elasticsearch ...")
+    status = subprocess.run(
+        ['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2'], shell=True
+    )
+    if status.returncode:
+        raise Exception("Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance"
+                        "then set LAUNCH_ELASTICSEARCH in the script to False.")
+    time.sleep(15)
+
+### Init the DocumentStore
+# In contrast to Tutorial 1 (extractive QA), we:
+#
+# * specify the name of our `text_field` in Elasticsearch that we want to return as an answer
+# * specify the name of our `embedding_field` in Elasticsearch where we'll store the embedding of our question and that is used later for calculating our similarity to the incoming user question
+# * set `excluded_meta_data=["question_emb"]` so that we don't return the huge embedding vectors in our search results
+
+document_store = ElasticsearchDocumentStore(host="localhost", username="", password="",
+                                            index="document",
+                                            text_field="answer",
+                                            embedding_field="question_emb",
+                                            embedding_dim=768,
+                                            excluded_meta_data=["question_emb"])
+
+### Create a Retriever using embeddings
+# Instead of retrieving via Elasticsearch's plain BM25, we want to use vector similarity of the questions (user question vs. FAQ ones).
+# We can use the `EmbeddingRetriever` for this purpose and specify a model that we use for the embeddings.
+#
+retriever = EmbeddingRetriever(document_store=document_store, embedding_model="deepset/sentence_bert", gpu=False)
+
+# Download a csv containing some FAQ data
+# Here: Some question-answer pairs related to COVID-19
+temp = requests.get("https://raw.githubusercontent.com/deepset-ai/COVID-QA/master/data/faqs/faq_covidbert.csv")
+open('small_faq_covid.csv', 'wb').write(temp.content)
+
+# Get dataframe with columns "question", "answer" and some custom metadata
+df = pd.read_csv("small_faq_covid.csv")
+# Minimal cleaning
+df.fillna(value="", inplace=True)
+df["question"] = df["question"].apply(lambda x: x.strip())
+print(df.head())
+
+# Get embeddings for our questions from the FAQs
+questions = list(df["question"].values)
+df["question_emb"] = retriever.create_embedding(texts=questions)
+
+# Convert Dataframe to list of dicts and index them in our DocumentStore
+docs_to_index = df.to_dict(orient="records")
+document_store.write_documents(docs_to_index)
+
+
+# Init a Finder (this time without a reader) and ask a question
+finder = Finder(reader=None, retriever=retriever)
+prediction = finder.get_answers_via_similar_questions(question="How is the virus spreading?", top_k_retriever=10)
+print_answers(prediction, details="all")