Update Tutorial 6

This commit is contained in:
Malte Pietsch 2020-07-03 16:06:46 +02:00
parent 8a9f97fad3
commit c36f8c991e
2 changed files with 25 additions and 62 deletions

View File

@ -65,37 +65,9 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fri Jul 3 09:43:18 2020 \r\n",
"+-----------------------------------------------------------------------------+\r\n",
"| NVIDIA-SMI 435.21 Driver Version: 435.21 CUDA Version: 10.1 |\r\n",
"|-------------------------------+----------------------+----------------------+\r\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\r\n",
"|===============================+======================+======================|\r\n",
"| 0 GeForce 940MX Off | 00000000:02:00.0 Off | N/A |\r\n",
"| N/A 41C P0 N/A / N/A | 567MiB / 2004MiB | 4% Default |\r\n",
"+-------------------------------+----------------------+----------------------+\r\n",
" \r\n",
"+-----------------------------------------------------------------------------+\r\n",
"| Processes: GPU Memory |\r\n",
"| GPU PID Type Process name Usage |\r\n",
"|=============================================================================|\r\n",
"| 0 1507 G /usr/lib/xorg/Xorg 212MiB |\r\n",
"| 0 1735 G /usr/bin/gnome-shell 85MiB |\r\n",
"| 0 3310 G ...uest-channel-token=10103706267471532991 48MiB |\r\n",
"| 0 3507 G ...AAAAAAAAAAAACAAAAAAAAAA= --shared-files 195MiB |\r\n",
"| 0 22962 G ...p/pycharm-professional/201/jbr/bin/java 22MiB |\r\n",
"+-----------------------------------------------------------------------------+\r\n"
]
}
],
"outputs": [],
"source": [
"# Make sure you have a GPU running\n",
"!nvidia-smi"
@ -107,7 +79,7 @@
"metadata": {},
"outputs": [],
"source": [
"! pip install git+git://github.com/deepset-ai/haystack.git@07ecfb60b944d9682f6d50317a15ffe5501ae456"
"! pip install git+git://github.com/deepset-ai/haystack.git@8a9f97fad37241b0101c4561d10a49f2fbc6ee52"
]
},
{
@ -153,9 +125,9 @@
],
"source": [
"# Recommended: Start Elasticsearch using Docker\n",
"! docker run -d -p 9200:9200 -e \"discovery.type=single-node\" elasticsearch:7.6.2\n",
"#! docker run -d -p 9200:9200 -e \"discovery.type=single-node\" elasticsearch:7.6.2\n",
"# wait until ES has started\n",
"! sleep 30"
"#! sleep 30"
]
},
{
@ -165,18 +137,18 @@
"outputs": [],
"source": [
"# In Colab / No Docker environments: Start Elasticsearch from source\n",
"#! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q\n",
"#! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz\n",
"#! chown -R daemon:daemon elasticsearch-7.6.2\n",
"! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q\n",
"! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz\n",
"! chown -R daemon:daemon elasticsearch-7.6.2\n",
"\n",
"#import os\n",
"#from subprocess import Popen, PIPE, STDOUT\n",
"#es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],\n",
"# stdout=PIPE, stderr=STDOUT,\n",
"# preexec_fn=lambda: os.setuid(1) # as daemon\n",
"# )\n",
"import os\n",
"from subprocess import Popen, PIPE, STDOUT\n",
"es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],\n",
" stdout=PIPE, stderr=STDOUT,\n",
" preexec_fn=lambda: os.setuid(1) # as daemon\n",
" )\n",
"# wait until ES has started\n",
"#! sleep 30"
"! sleep 30"
]
},
{
@ -340,7 +312,7 @@
" do_lower_case=True, gpu=True)\n",
"\n",
"# Important: \n",
"# Now that we have the DPR initialized, we need to call update_embeddings() to iterate over all \n",
"# Now that we have the DPR initialized, we need to call update_embeddings() to iterate over all\n",
"# previously indexed documents and update their embedding representation. \n",
"# While this can be a time consuming operation (depending on corpus size), it only needs to be done once. \n",
"# At query time, we only need to embed the query and compare it to the existing doc embeddings, which is very fast.\n",
@ -396,7 +368,7 @@
"# Load a local model or any of the QA models on\n",
"# Hugging Face's model hub (https://huggingface.co/models)\n",
"\n",
"reader = FARMReader(model_name_or_path=\"deepset/roberta-base-squad2\", use_gpu=False)"
"reader = FARMReader(model_name_or_path=\"deepset/roberta-base-squad2\", use_gpu=True)"
]
},
{
@ -459,15 +431,8 @@
"source": [
"# You can configure how many candidates the reader and retriever shall return\n",
"# The higher top_k_retriever, the better (but also the slower) your answers. \n",
"prediction = finder.get_answers(question=\"Who created the Dothraki vocabulary?\", top_k_retriever=10, top_k_reader=5)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"prediction = finder.get_answers(question=\"Who created the Dothraki vocabulary?\", top_k_retriever=10, top_k_reader=5)\n",
"\n",
"#prediction = finder.get_answers(question=\"Who is the father of Arya Stark?\", top_k_retriever=10, top_k_reader=5)\n",
"#prediction = finder.get_answers(question=\"Who is the sister of Sansa?\", top_k_retriever=10, top_k_reader=5)"
]
@ -516,13 +481,6 @@
"source": [
"print_answers(prediction, details=\"minimal\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {

View File

@ -48,6 +48,11 @@ document_store.write_documents(dicts[:16])
### Retriever
retriever = DensePassageRetriever(document_store=document_store, embedding_model="dpr-bert-base-nq",
do_lower_case=True, gpu=True)
# Important:
# Now that we have the DPR initialized, we need to call update_embeddings() to iterate over all
# previously indexed documents and update their embedding representation.
# While this can be a time consuming operation (depending on corpus size), it only needs to be done once.
# At query time, we only need to embed the query and compare it to the existing doc embeddings, which is very fast.
document_store.update_embeddings(retriever)
### Reader