diff --git a/tutorials/Tutorial6_Better_Retrieval_via_DPR.ipynb b/tutorials/Tutorial6_Better_Retrieval_via_DPR.ipynb index ca2408613..1b81cf037 100644 --- a/tutorials/Tutorial6_Better_Retrieval_via_DPR.ipynb +++ b/tutorials/Tutorial6_Better_Retrieval_via_DPR.ipynb @@ -65,37 +65,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fri Jul 3 09:43:18 2020 \r\n", - "+-----------------------------------------------------------------------------+\r\n", - "| NVIDIA-SMI 435.21 Driver Version: 435.21 CUDA Version: 10.1 |\r\n", - "|-------------------------------+----------------------+----------------------+\r\n", - "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n", - "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\r\n", - "|===============================+======================+======================|\r\n", - "| 0 GeForce 940MX Off | 00000000:02:00.0 Off | N/A |\r\n", - "| N/A 41C P0 N/A / N/A | 567MiB / 2004MiB | 4% Default |\r\n", - "+-------------------------------+----------------------+----------------------+\r\n", - " \r\n", - "+-----------------------------------------------------------------------------+\r\n", - "| Processes: GPU Memory |\r\n", - "| GPU PID Type Process name Usage |\r\n", - "|=============================================================================|\r\n", - "| 0 1507 G /usr/lib/xorg/Xorg 212MiB |\r\n", - "| 0 1735 G /usr/bin/gnome-shell 85MiB |\r\n", - "| 0 3310 G ...uest-channel-token=10103706267471532991 48MiB |\r\n", - "| 0 3507 G ...AAAAAAAAAAAACAAAAAAAAAA= --shared-files 195MiB |\r\n", - "| 0 22962 G ...p/pycharm-professional/201/jbr/bin/java 22MiB |\r\n", - "+-----------------------------------------------------------------------------+\r\n" - ] - } - ], + "outputs": [], "source": [ "# Make sure you have a GPU running\n", "!nvidia-smi" @@ -107,7 +79,7 @@ "metadata": {}, 
"outputs": [], "source": [ - "! pip install git+git://github.com/deepset-ai/haystack.git@07ecfb60b944d9682f6d50317a15ffe5501ae456" + "! pip install git+git://github.com/deepset-ai/haystack.git@8a9f97fad37241b0101c4561d10a49f2fbc6ee52" ] }, { @@ -153,9 +125,9 @@ ], "source": [ "# Recommended: Start Elasticsearch using Docker\n", - "! docker run -d -p 9200:9200 -e \"discovery.type=single-node\" elasticsearch:7.6.2\n", + "#! docker run -d -p 9200:9200 -e \"discovery.type=single-node\" elasticsearch:7.6.2\n", "# wait until ES has started\n", - "! sleep 30" + "#! sleep 30" ] }, { @@ -165,18 +137,18 @@ "outputs": [], "source": [ "# In Colab / No Docker environments: Start Elasticsearch from source\n", - "#! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q\n", - "#! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz\n", - "#! chown -R daemon:daemon elasticsearch-7.6.2\n", + "! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q\n", + "! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz\n", + "! chown -R daemon:daemon elasticsearch-7.6.2\n", "\n", - "#import os\n", - "#from subprocess import Popen, PIPE, STDOUT\n", - "#es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],\n", - "# stdout=PIPE, stderr=STDOUT,\n", - "# preexec_fn=lambda: os.setuid(1) # as daemon\n", - "# )\n", + "import os\n", + "from subprocess import Popen, PIPE, STDOUT\n", + "es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],\n", + " stdout=PIPE, stderr=STDOUT,\n", + " preexec_fn=lambda: os.setuid(1) # as daemon\n", + " )\n", "# wait until ES has started\n", - "#! sleep 30" + "! 
sleep 30" ] }, { @@ -340,7 +312,7 @@ " do_lower_case=True, gpu=True)\n", "\n", "# Important: \n", - "# Now that we have the DPR initialized, we need to call update_embeddings() to iterate over all \n", + "# Now that we have the DPR initialized, we need to call update_embeddings() to iterate over all\n", "# previously indexed documents and update their embedding representation. \n", "# While this can be a time consuming operation (depending on corpus size), it only needs to be done once. \n", "# At query time, we only need to embed the query and compare it the existing doc embeddings which is very fast.\n", @@ -396,7 +368,7 @@ "# Load a local model or any of the QA models on\n", "# Hugging Face's model hub (https://huggingface.co/models)\n", "\n", - "reader = FARMReader(model_name_or_path=\"deepset/roberta-base-squad2\", use_gpu=False)" + "reader = FARMReader(model_name_or_path=\"deepset/roberta-base-squad2\", use_gpu=True)" ] }, { @@ -459,15 +431,8 @@ "source": [ "# You can configure how many candidates the reader and retriever shall return\n", "# The higher top_k_retriever, the better (but also the slower) your answers. 
\n", - "prediction = finder.get_answers(question=\"Who created the Dothraki vocabulary?\", top_k_retriever=10, top_k_reader=5)" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ + "prediction = finder.get_answers(question=\"Who created the Dothraki vocabulary?\", top_k_retriever=10, top_k_reader=5)\n", + "\n", "#prediction = finder.get_answers(question=\"Who is the father of Arya Stark?\", top_k_retriever=10, top_k_reader=5)\n", "#prediction = finder.get_answers(question=\"Who is the sister of Sansa?\", top_k_retriever=10, top_k_reader=5)" ] @@ -516,13 +481,6 @@ "source": [ "print_answers(prediction, details=\"minimal\")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -546,4 +504,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/tutorials/Tutorial6_Better_Retrieval_via_DPR.py b/tutorials/Tutorial6_Better_Retrieval_via_DPR.py index 24b6eebbc..47e3f6490 100755 --- a/tutorials/Tutorial6_Better_Retrieval_via_DPR.py +++ b/tutorials/Tutorial6_Better_Retrieval_via_DPR.py @@ -48,6 +48,11 @@ document_store.write_documents(dicts[:16]) ### Retriever retriever = DensePassageRetriever(document_store=document_store, embedding_model="dpr-bert-base-nq", do_lower_case=True, gpu=True) +# Important: +# Now that we have the DPR initialized, we need to call update_embeddings() to iterate over all +# previously indexed documents and update their embedding representation. +# While this can be a time consuming operation (depending on corpus size), it only needs to be done once. +# At query time, we only need to embed the query and compare it to the existing doc embeddings which is very fast. document_store.update_embeddings(retriever) ### Reader