Update Tutorial 6

This commit is contained in:
Malte Pietsch 2020-07-03 16:06:46 +02:00
parent 8a9f97fad3
commit c36f8c991e
2 changed files with 25 additions and 62 deletions

View File

@ -65,37 +65,9 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fri Jul 3 09:43:18 2020 \r\n",
"+-----------------------------------------------------------------------------+\r\n",
"| NVIDIA-SMI 435.21 Driver Version: 435.21 CUDA Version: 10.1 |\r\n",
"|-------------------------------+----------------------+----------------------+\r\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\r\n",
"|===============================+======================+======================|\r\n",
"| 0 GeForce 940MX Off | 00000000:02:00.0 Off | N/A |\r\n",
"| N/A 41C P0 N/A / N/A | 567MiB / 2004MiB | 4% Default |\r\n",
"+-------------------------------+----------------------+----------------------+\r\n",
" \r\n",
"+-----------------------------------------------------------------------------+\r\n",
"| Processes: GPU Memory |\r\n",
"| GPU PID Type Process name Usage |\r\n",
"|=============================================================================|\r\n",
"| 0 1507 G /usr/lib/xorg/Xorg 212MiB |\r\n",
"| 0 1735 G /usr/bin/gnome-shell 85MiB |\r\n",
"| 0 3310 G ...uest-channel-token=10103706267471532991 48MiB |\r\n",
"| 0 3507 G ...AAAAAAAAAAAACAAAAAAAAAA= --shared-files 195MiB |\r\n",
"| 0 22962 G ...p/pycharm-professional/201/jbr/bin/java 22MiB |\r\n",
"+-----------------------------------------------------------------------------+\r\n"
]
}
],
"outputs": [],
"source": [
"# Make sure you have a GPU running\n",
"!nvidia-smi"
@ -107,7 +79,7 @@
"metadata": {},
"outputs": [],
"source": [
"! pip install git+git://github.com/deepset-ai/haystack.git@07ecfb60b944d9682f6d50317a15ffe5501ae456"
"! pip install git+git://github.com/deepset-ai/haystack.git@8a9f97fad37241b0101c4561d10a49f2fbc6ee52"
]
},
{
@ -153,9 +125,9 @@
],
"source": [
"# Recommended: Start Elasticsearch using Docker\n",
"! docker run -d -p 9200:9200 -e \"discovery.type=single-node\" elasticsearch:7.6.2\n",
"#! docker run -d -p 9200:9200 -e \"discovery.type=single-node\" elasticsearch:7.6.2\n",
"# wait until ES has started\n",
"! sleep 30"
"#! sleep 30"
]
},
{
@ -165,18 +137,18 @@
"outputs": [],
"source": [
"# In Colab / No Docker environments: Start Elasticsearch from source\n",
"#! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q\n",
"#! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz\n",
"#! chown -R daemon:daemon elasticsearch-7.6.2\n",
"! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q\n",
"! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz\n",
"! chown -R daemon:daemon elasticsearch-7.6.2\n",
"\n",
"#import os\n",
"#from subprocess import Popen, PIPE, STDOUT\n",
"#es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],\n",
"# stdout=PIPE, stderr=STDOUT,\n",
"# preexec_fn=lambda: os.setuid(1) # as daemon\n",
"# )\n",
"import os\n",
"from subprocess import Popen, PIPE, STDOUT\n",
"es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],\n",
" stdout=PIPE, stderr=STDOUT,\n",
" preexec_fn=lambda: os.setuid(1) # as daemon\n",
" )\n",
"# wait until ES has started\n",
"#! sleep 30"
"! sleep 30"
]
},
{
@ -340,7 +312,7 @@
" do_lower_case=True, gpu=True)\n",
"\n",
"# Important: \n",
"# Now that we have the DPR initialized, we need to call update_embeddings() to iterate over all \n",
"# Now that we have the DPR initialized, we need to call update_embeddings() to iterate over all\n",
"# previously indexed documents and update their embedding representation. \n",
"# While this can be a time consuming operation (depending on corpus size), it only needs to be done once. \n",
"# At query time, we only need to embed the query and compare it to the existing doc embeddings, which is very fast.\n",
@ -396,7 +368,7 @@
"# Load a local model or any of the QA models on\n",
"# Hugging Face's model hub (https://huggingface.co/models)\n",
"\n",
"reader = FARMReader(model_name_or_path=\"deepset/roberta-base-squad2\", use_gpu=False)"
"reader = FARMReader(model_name_or_path=\"deepset/roberta-base-squad2\", use_gpu=True)"
]
},
{
@ -459,15 +431,8 @@
"source": [
"# You can configure how many candidates the reader and retriever shall return\n",
"# The higher top_k_retriever, the better (but also the slower) your answers. \n",
"prediction = finder.get_answers(question=\"Who created the Dothraki vocabulary?\", top_k_retriever=10, top_k_reader=5)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"prediction = finder.get_answers(question=\"Who created the Dothraki vocabulary?\", top_k_retriever=10, top_k_reader=5)\n",
"\n",
"#prediction = finder.get_answers(question=\"Who is the father of Arya Stark?\", top_k_retriever=10, top_k_reader=5)\n",
"#prediction = finder.get_answers(question=\"Who is the sister of Sansa?\", top_k_retriever=10, top_k_reader=5)"
]
@ -516,13 +481,6 @@
"source": [
"print_answers(prediction, details=\"minimal\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {

View File

@ -48,6 +48,11 @@ document_store.write_documents(dicts[:16])
### Retriever
retriever = DensePassageRetriever(document_store=document_store, embedding_model="dpr-bert-base-nq",
do_lower_case=True, gpu=True)
# Important:
# Now that we have the DPR initialized, we need to call update_embeddings() to iterate over all
# previously indexed documents and update their embedding representation.
# While this can be a time consuming operation (depending on corpus size), it only needs to be done once.
# At query time, we only need to embed the query and compare it to the existing doc embeddings, which is very fast.
document_store.update_embeddings(retriever)
### Reader