Update tutorials (#200)
* fix link in readme. update installation in tutorials
* update haystack version to latest master
* add basic documentation for input to write_documents()
* add docstring for sqldocumentstore
* comment out docker in notebook
parent ff7e35581b
commit fe33a481ad
@@ -64,7 +64,7 @@ Resources
 - Tutorial 1 - Basic QA Pipeline: `Jupyter notebook <https://github.com/deepset-ai/haystack/blob/master/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb>`_ or `Colab <https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb>`_
 - Tutorial 2 - Fine-tuning a model on own data: `Jupyter notebook <https://github.com/deepset-ai/haystack/blob/master/tutorials/Tutorial2_Finetune_a_model_on_your_data.ipynb>`_ or `Colab <https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial2_Finetune_a_model_on_your_data.ipynb>`_
-- Tutorial 3 - Basic QA Pipeline without Elasticsearch: `Jupyter notebook <https://github.com/deepset-ai/haystack/blob/master/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.ipynb>`_ or `Colab <https://colab.research.google.com/github/deepset-ai/haystack/blob/update-tutorials/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.ipynb>`_
+- Tutorial 3 - Basic QA Pipeline without Elasticsearch: `Jupyter notebook <https://github.com/deepset-ai/haystack/blob/master/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.ipynb>`_ or `Colab <https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.ipynb>`_
 - Tutorial 4 - FAQ-style QA: `Jupyter notebook <https://github.com/deepset-ai/haystack/blob/master/tutorials/Tutorial4_FAQ_style_QA.ipynb>`__ or `Colab <https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial4_FAQ_style_QA.ipynb>`__
 - Tutorial 5 - Evaluation of the whole QA-Pipeline: `Jupyter notebook <https://github.com/deepset-ai/haystack/blob/master/tutorials/Tutorial5_Evaluation.ipynb>`_ or `Colab <https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial5_Evaluation.ipynb>`_
 - Tutorial 6 - Better Retrievers via "Dense Passage Retrieval": `Jupyter notebook <https://github.com/deepset-ai/haystack/blob/master/tutorials/Tutorial6_Better_Retrieval_via_DPR.ipynb>`_ or `Colab <https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial6_Better_Retrieval_via_DPR.ipynb>`_
@@ -27,6 +27,14 @@ class BaseDocumentStore(ABC):
     @abstractmethod
     def write_documents(self, documents: List[dict]):
+        """
+        Indexes documents for later queries.
+
+        :param documents: List of dictionaries in the format {"name": "<some-document-name>", "text": "<the-actual-text>"}.
+                          Optionally, further fields can be supplied depending on the child class.
+
+        :return: None
+        """
         pass

     @abstractmethod
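The new base-class docstring fixes the minimal input contract for every document store. A quick illustration of that contract (a hedged sketch: the in-memory store and its import path are assumptions inferred from the other hunks in this commit, not something this hunk shows):

    # Minimal sketch of the write_documents() input contract described above.
    # Import path is assumed, mirroring haystack.database.elasticsearch seen below.
    from haystack.database.memory import InMemoryDocumentStore

    document_store = InMemoryDocumentStore()
    documents = [
        {"name": "doc1.txt", "text": "Haystack is a framework for neural question answering."},
        {"name": "doc2.txt", "text": "A document store indexes texts for later queries."},
    ]
    document_store.write_documents(documents)  # returns None; the documents are now queryable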
@@ -113,6 +113,17 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
         return doc_ids

     def write_documents(self, documents: List[dict]):
+        """
+        Indexes documents for later queries in Elasticsearch.
+
+        :param documents: List of dictionaries.
+                          Default format: {"name": "<some-document-name>", "text": "<the-actual-text>"}
+                          Optionally: You can add more key-value pairs that will be indexed as fields in
+                          Elasticsearch and can be accessed later for filtering or shown in the responses of the Finder.
+                          Advanced: If you are using your own Elasticsearch mapping, the key names in the dictionary
+                          should be changed to what you have set for self.text_field and self.name_field.
+        :return: None
+        """
         for doc in documents:
             doc["_op_type"] = "create"
             doc["_index"] = self.index
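Because every extra key is indexed as an Elasticsearch field, documents can carry filterable attributes directly. A minimal sketch, reusing a document name that appears in this commit's notebook output (the extra field names are invented for illustration):

    # Sketch: extra keys become Elasticsearch fields (field names here are illustrative).
    documents = [
        {"name": "73_A_Man_Without_Honor.txt",
         "text": "'A Man Without Honor' is the seventh episode of the second season...",
         "season": 2,          # illustrative custom field, usable for filtering later
         "kind": "episode"},   # illustrative custom field, can be shown in Finder responses
    ]
    document_store.write_documents(documents)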
@@ -15,6 +15,15 @@ class InMemoryDocumentStore(BaseDocumentStore):
         self.index = None

     def write_documents(self, documents: List[dict]):
+        """
+        Indexes documents for later queries.
+
+        :param documents: List of dictionaries in the format {"name": "<some-document-name>", "text": "<the-actual-text>"}.
+                          Optionally, you can also supply "tags": ["one-tag", "another-one"]
+                          or additional metadata via "meta": {"author": "someone", "url": "some-url", ...}
+
+        :return: None
+        """
         import hashlib

         if documents is None:
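For the in-memory store, the same dicts additionally accept the "tags" and "meta" keys exactly as the docstring lists them; a short sketch:

    # Sketch: optional "tags" and "meta" keys for the in-memory store.
    documents = [
        {"name": "article_1.txt",
         "text": "Some text you want to query later.",
         "tags": ["one-tag", "another-one"],
         "meta": {"author": "someone", "url": "some-url"}},
    ]
    document_store.write_documents(documents)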
@@ -93,6 +93,15 @@ class SQLDocumentStore(BaseDocumentStore):
         return doc_ids

     def write_documents(self, documents: List[dict]):
+        """
+        Indexes documents for later queries.
+
+        :param documents: List of dictionaries in the format {"name": "<some-document-name>", "text": "<the-actual-text>"}.
+                          Optionally, you can also supply metadata via "meta": {"author": "someone", "url": "some-url", ...}
+
+        :return: None
+        """
+
         for doc in documents:
             row = Document(name=doc["name"], text=doc["text"], meta_data=doc.get("meta", {}))
             self.session.add(row)
@@ -26,18 +26,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "! pip install git+git://github.com/deepset-ai/haystack.git@ef9e4f4467a2e265bad72b048a1a3186e40969b1\n",
+    "! pip install git+https://github.com/deepset-ai/haystack.git\n",
     "#! pip install farm-haystack"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "pycharm": {
-     "is_executing": false
-    }
-   },
+   "execution_count": 2,
+   "metadata": {},
    "outputs": [],
    "source": [
     "from haystack import Finder\n",
@@ -68,12 +64,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0ae423cd9c30d6f02ca2073e430d4e1f4403d88b8ec316411ec4c198bad3d416\r\n"
+     ]
+    }
+   ],
    "source": [
     "# Recommended: Start Elasticsearch using Docker\n",
-    "# ! docker run -d -p 9200:9200 -e \"discovery.type=single-node\" elasticsearch:7.6.2"
+    "#! docker run -d -p 9200:9200 -e \"discovery.type=single-node\" elasticsearch:7.6.2"
    ]
   },
   {
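With the Docker command now commented out, readers may be running Elasticsearch some other way, so it helps to confirm the instance is reachable before indexing. A hedged sketch using requests, which these notebooks already import elsewhere (the version check simply mirrors the 7.6.2 image referenced above):

    # Sketch: verify Elasticsearch answers on the default port before writing documents.
    import requests

    response = requests.get("http://localhost:9200")
    print(response.json()["version"]["number"])  # e.g. "7.6.2"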
@@ -99,7 +103,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 7,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
@@ -110,7 +114,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "04/28/2020 12:27:32 - INFO - elasticsearch - PUT http://localhost:9200/document [status:400 request:0.010s]\n"
+     "07/07/2020 10:41:47 - INFO - elasticsearch - PUT http://localhost:9200/document [status:200 request:0.364s]\n"
     ]
    }
   ],
@@ -138,7 +142,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 8,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
@@ -149,11 +153,20 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "04/28/2020 12:19:09 - INFO - haystack.indexing.io - Found data stored in `data/article_txt_got`. Delete this first if you really want to fetch new data.\n",
-     "04/28/2020 12:19:09 - INFO - elasticsearch - POST http://localhost:9200/_count [status:200 request:0.066s]\n",
-     "04/28/2020 12:19:10 - INFO - elasticsearch - POST http://localhost:9200/_bulk [status:200 request:0.603s]\n",
-     "04/28/2020 12:19:10 - INFO - elasticsearch - POST http://localhost:9200/_bulk [status:200 request:0.040s]\n",
-     "04/28/2020 12:19:10 - INFO - haystack.indexing.io - Wrote 517 docs to DB\n"
+     "07/07/2020 10:41:48 - INFO - haystack.indexing.utils - Found data stored in `data/article_txt_got`. Delete this first if you really want to fetch new data.\n",
+     "07/07/2020 10:41:48 - INFO - elasticsearch - POST http://localhost:9200/_bulk [status:200 request:0.461s]\n",
+     "07/07/2020 10:41:49 - INFO - elasticsearch - POST http://localhost:9200/_bulk [status:200 request:0.259s]\n",
+     "07/07/2020 10:41:49 - INFO - elasticsearch - POST http://localhost:9200/_bulk [status:200 request:0.205s]\n",
+     "07/07/2020 10:41:49 - INFO - elasticsearch - POST http://localhost:9200/_bulk [status:200 request:0.158s]\n",
+     "07/07/2020 10:41:49 - INFO - elasticsearch - POST http://localhost:9200/_bulk [status:200 request:0.126s]\n",
+     "07/07/2020 10:41:49 - INFO - elasticsearch - POST http://localhost:9200/_bulk [status:200 request:0.095s]\n"
     ]
    },
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "[{'name': '384_Maelor_Targaryen.txt', 'text': '#REDIRECT The Princess and the Queen'}, {'name': '314_Pypar.txt', 'text': \"#REDIRECT List of Game of Thrones characters#Night's Watch\"}, {'name': '73_A_Man_Without_Honor.txt', 'text': '\"\\'\\'\\'A Man Without Honor\\'\\'\\'\" is the seventh episode of the second season of HBO\\'s medieval fantasy television series \\'\\'Game of Thrones\\'\\'.\\nThe episode is written by series co-creators David Benioff and D. B. Weiss and directed, for the second time in this season, by David Nutter. It premiered on May 13, 2012.\\nThe name of the episode comes from Catelyn Stark\\'s assessment of Ser Jaime Lannister: \"You are a man without honor,\" after he kills a member of his own family to attempt escape.'}]\n"
+    ]
+   }
   ],
@@ -169,6 +182,15 @@
     "# It must take a str as input, and return a str.\n",
     "dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)\n",
     "\n",
+    "# We now have a list of dictionaries that we can write to our document store.\n",
+    "# If your texts come from a different source (e.g. a DB), you can of course skip convert_files_to_dicts() and create the dictionaries yourself.\n",
+    "# The default format here is: {\"name\": \"<some-document-name>\", \"text\": \"<the-actual-text>\"}\n",
+    "# (Optionally: you can also add more key-value-pairs here, that will be indexed as fields in Elasticsearch and\n",
+    "# can be accessed later for filtering or shown in the responses of the Finder)\n",
+    "\n",
+    "# Let's have a look at the first 3 entries:\n",
+    "print(dicts[:3])\n",
+    "\n",
     "# Now, let's write the dicts containing documents to our DB.\n",
     "document_store.write_documents(dicts)"
    ]
@@ -189,8 +211,9 @@
    "**Alternatives:**\n",
    "\n",
    "- Customize the `ElasticsearchRetriever` with custom queries (e.g. boosting) and filters\n",
+    "- Use `TfidfRetriever` in combination with a SQL or InMemory Document store for simple prototyping and debugging\n",
    "- Use `EmbeddingRetriever` to find candidate documents based on the similarity of embeddings (e.g. created via Sentence-BERT)\n",
-    "- Use `TfidfRetriever` in combination with a SQL or InMemory Document store for simple prototyping and debugging"
+    "- Use `DensePassageRetriever` to use different embedding models for passage and query (see Tutorial 6)"
   ]
  },
  {
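The new bullet defers to Tutorial 6 for DensePassageRetriever; consistent with the sparse/dense split elsewhere in this commit, it would live in haystack.retriever.dense. A hedged sketch (both the import path and the constructor arguments are assumptions about this era's API, not shown in this diff; Tutorial 6 has the authoritative setup):

    # Assumed path, mirroring EmbeddingRetriever's move to haystack.retriever.dense below.
    from haystack.retriever.dense import DensePassageRetriever

    retriever = DensePassageRetriever(document_store=document_store,
                                      embedding_model="dpr-bert-base-nq",  # assumed model name
                                      use_gpu=False)                       # assumed flag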
@@ -199,7 +222,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from haystack.retriever.elasticsearch import ElasticsearchRetriever\n",
+    "from haystack.retriever.sparse import ElasticsearchRetriever\n",
     "retriever = ElasticsearchRetriever(document_store=document_store)"
    ]
   },
@@ -216,7 +239,7 @@
    "source": [
     "# Alternative: An in-memory TfidfRetriever based on Pandas dataframes for building quick-prototypes with SQLite document store.\n",
     "\n",
-    "# from haystack.retriever.tfidf import TfidfRetriever\n",
+    "# from haystack.retriever.sparse import TfidfRetriever\n",
     "# retriever = TfidfRetriever(document_store=document_store)"
    ]
   },
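Taken together, the retriever imports updated throughout these notebooks follow a single pattern: backend-named modules were replaced by a sparse/dense split. Summarizing the new paths exactly as they appear in the hunks of this commit:

    # New import paths introduced by this commit:
    from haystack.retriever.sparse import ElasticsearchRetriever, TfidfRetriever  # previously .elasticsearch / .tfidf
    from haystack.retriever.dense import EmbeddingRetriever                       # previously .elasticsearch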
@@ -19,7 +19,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "! pip install git+git://github.com/deepset-ai/haystack.git@ef9e4f4467a2e265bad72b048a1a3186e40969b1"
+    "! pip install git+https://github.com/deepset-ai/haystack.git"
    ]
   },
   {
@@ -149,4 +149,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}
@@ -19,7 +19,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "! pip install git+git://github.com/deepset-ai/haystack.git@ef9e4f4467a2e265bad72b048a1a3186e40969b1\n",
+    "! pip install git+https://github.com/deepset-ai/haystack.git\n",
     "#! pip install farm-haystack"
    ]
   },
@@ -115,6 +115,12 @@
     "# It must take a str as input, and return a str.\n",
     "dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)\n",
     "\n",
+    "# We now have a list of dictionaries that we can write to our document store.\n",
+    "# If your texts come from a different source (e.g. a DB), you can of course skip convert_files_to_dicts() and create the dictionaries yourself.\n",
+    "# The default format here is: {\"name\": \"<some-document-name>\", \"text\": \"<the-actual-text>\"}\n",
+    "\n",
+    "# Let's have a look at the first 3 entries:\n",
+    "print(dicts[:3])\n",
     "# Now, let's write the docs to our DB.\n",
     "document_store.write_documents(dicts)"
    ]
@@ -153,7 +159,7 @@
    "source": [
     "# An in-memory TfidfRetriever based on Pandas dataframes\n",
     "\n",
-    "from haystack.retriever.tfidf import TfidfRetriever\n",
+    "from haystack.retriever.sparse import TfidfRetriever\n",
     "retriever = TfidfRetriever(document_store=document_store)"
    ]
   },
@@ -28,7 +28,7 @@
    "outputs": [],
    "source": [
     "#TODO\n",
-    "! pip install git+git://github.com/deepset-ai/haystack.git@ef9e4f4467a2e265bad72b048a1a3186e40969b1\n",
+    "! pip install git+https://github.com/deepset-ai/haystack.git\n",
     "#! pip install farm-haystack"
    ]
   },
@@ -45,7 +45,7 @@
     "from haystack import Finder\n",
     "from haystack.database.elasticsearch import ElasticsearchDocumentStore\n",
     "\n",
-    "from haystack.retriever.elasticsearch import EmbeddingRetriever\n",
+    "from haystack.retriever.dense import EmbeddingRetriever\n",
     "from haystack.utils import print_answers\n",
     "import pandas as pd\n",
     "import requests\n"
@@ -1584,7 +1584,7 @@
   },
   "source": [
    "# install haystack\n",
-   "! pip install git+git://github.com/deepset-ai/haystack.git@ef9e4f4467a2e265bad72b048a1a3186e40969b1"
+   "! pip install git+https://github.com/deepset-ai/haystack.git"
   ],
   "execution_count": 0,
   "outputs": []
@@ -1741,7 +1741,7 @@
   },
   "source": [
    "# Initialize Retriever\n",
-   "from haystack.retriever.elasticsearch import ElasticsearchRetriever\n",
+   "from haystack.retriever.sparse import ElasticsearchRetriever\n",
    "\n",
    "retriever = ElasticsearchRetriever(document_store=document_store)"
   ],
@@ -79,7 +79,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "! pip install git+git://github.com/deepset-ai/haystack.git@8a9f97fad37241b0101c4561d10a49f2fbc6ee52"
+    "! pip install git+https://github.com/deepset-ai/haystack.git"
    ]
   },
   {