Replace DPR with EmbeddingRetriever in Tutorial 11 (#2287)

* images for tutorial 11 in .github folder for easy access

* ipynb: changed DPR to EmbeddingRetriever, incl. new graphs of pipelines

* Update Documentation & Code Style

* moved images into correct folder

* removed images path

* Update Documentation & Code Style

* fixed debugging run of p_classifier

* Update Documentation & Code Style

* Revert debug param change

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: brandenchan <brandenchan@icloud.com>
This commit is contained in:
mkkuemmel 2022-03-15 08:30:00 +01:00 committed by GitHub
parent 85571cdd15
commit a1040a17b2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 57 additions and 40 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 30 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

View File

@ -100,7 +100,7 @@ to perform Open Domain Question Answering.
from haystack import Pipeline from haystack import Pipeline
from haystack.utils import launch_es from haystack.utils import launch_es
from haystack.document_stores import ElasticsearchDocumentStore from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import ElasticsearchRetriever, DensePassageRetriever, FARMReader from haystack.nodes import ElasticsearchRetriever, EmbeddingRetriever, FARMReader
# Initialize DocumentStore and index documents # Initialize DocumentStore and index documents
@ -113,9 +113,14 @@ document_store.write_documents(got_dicts)
es_retriever = ElasticsearchRetriever(document_store=document_store) es_retriever = ElasticsearchRetriever(document_store=document_store)
# Initialize dense retriever # Initialize dense retriever
dpr_retriever = DensePassageRetriever(document_store) embedding_retriever = EmbeddingRetriever(
document_store.update_embeddings(dpr_retriever, update_existing_embeddings=False) document_store,
model_format="sentence_transformers",
embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
)
document_store.update_embeddings(embedding_retriever, update_existing_embeddings=False)
# Initialize reader
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2") reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
``` ```
@ -163,7 +168,7 @@ document_store.return_embedding = True
rag_generator = RAGenerator() rag_generator = RAGenerator()
# Generative QA # Generative QA
p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=dpr_retriever) p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=embedding_retriever)
res = p_generator.run(query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}}) res = p_generator.run(query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}})
print_answers(res, details="minimum") print_answers(res, details="minimum")
@ -214,12 +219,12 @@ p_extractive.draw("pipeline_extractive.png")
``` ```
Pipelines offer a very simple way to ensemble together different components. Pipelines offer a very simple way to ensemble together different components.
In this example, we are going to combine the power of a `DensePassageRetriever` In this example, we are going to combine the power of an `EmbeddingRetriever`
with the keyword-based `ElasticsearchRetriever`. with the keyword-based `ElasticsearchRetriever`.
See our [documentation](https://haystack.deepset.ai/docs/latest/retrievermd) to understand why See our [documentation](https://haystack.deepset.ai/docs/latest/retrievermd) to understand why
we might want to combine a dense and sparse retriever. we might want to combine a dense and sparse retriever.
![image](https://user-images.githubusercontent.com/1563902/102451782-7bd80400-4039-11eb-9046-01b002a783f8.png) ![image]()
Here we use a `JoinDocuments` node so that the predictions from each retriever can be merged together. Here we use a `JoinDocuments` node so that the predictions from each retriever can be merged together.
@ -230,16 +235,16 @@ from haystack.pipelines import JoinDocuments
# Create ensembled pipeline # Create ensembled pipeline
p_ensemble = Pipeline() p_ensemble = Pipeline()
p_ensemble.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"]) p_ensemble.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"])
p_ensemble.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["Query"]) p_ensemble.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["Query"])
p_ensemble.add_node( p_ensemble.add_node(
component=JoinDocuments(join_mode="concatenate"), name="JoinResults", inputs=["ESRetriever", "DPRRetriever"] component=JoinDocuments(join_mode="concatenate"), name="JoinResults", inputs=["ESRetriever", "EmbeddingRetriever"]
) )
p_ensemble.add_node(component=reader, name="Reader", inputs=["JoinResults"]) p_ensemble.add_node(component=reader, name="Reader", inputs=["JoinResults"])
p_ensemble.draw("pipeline_ensemble.png") p_ensemble.draw("pipeline_ensemble.png")
# Run pipeline # Run pipeline
res = p_ensemble.run( res = p_ensemble.run(
query="Who is the father of Arya Stark?", params={"DPRRetriever": {"top_k": 5}, "ESRetriever": {"top_k": 5}} query="Who is the father of Arya Stark?", params={"EmbeddingRetriever": {"top_k": 5}, "ESRetriever": {"top_k": 5}}
) )
print_answers(res, details="minimum") print_answers(res, details="minimum")
``` ```
@ -277,10 +282,10 @@ class CustomNode(BaseComponent):
## Decision Nodes ## Decision Nodes
Decision Nodes help you route your data so that only certain branches of your `Pipeline` are run. Decision Nodes help you route your data so that only certain branches of your `Pipeline` are run.
One popular use case for such query classifiers is routing keyword queries to Elasticsearch and questions to DPR + Reader. One popular use case for such query classifiers is routing keyword queries to Elasticsearch and questions to EmbeddingRetriever + Reader.
With this approach you keep optimal speed and simplicity for keywords while going deep with transformers when it's most helpful. With this approach you keep optimal speed and simplicity for keywords while going deep with transformers when it's most helpful.
![image](https://user-images.githubusercontent.com/1563902/102452199-41229b80-403a-11eb-9365-7038697e7c3e.png) ![image]()
Though this looks very similar to the ensembled pipeline shown above, Though this looks very similar to the ensembled pipeline shown above,
the key difference is that only one of the retrievers is run for each request. the key difference is that only one of the retrievers is run for each request.
@ -304,13 +309,13 @@ class CustomQueryClassifier(BaseComponent):
p_classifier = Pipeline() p_classifier = Pipeline()
p_classifier.add_node(component=CustomQueryClassifier(), name="QueryClassifier", inputs=["Query"]) p_classifier.add_node(component=CustomQueryClassifier(), name="QueryClassifier", inputs=["Query"])
p_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_1"]) p_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_1"])
p_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_2"]) p_classifier.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["QueryClassifier.output_2"])
p_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"]) p_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "EmbeddingRetriever"])
p_classifier.draw("pipeline_classifier.png") p_classifier.draw("pipeline_classifier.png")
# Run only the dense retriever on the full sentence query # Run only the dense retriever on the full sentence query
res_1 = p_classifier.run(query="Who is the father of Arya Stark?") res_1 = p_classifier.run(query="Who is the father of Arya Stark?")
print("DPR Results" + "\n" + "=" * 15) print("Embedding Retriever Results" + "\n" + "=" * 15)
print_answers(res_1) print_answers(res_1)
# Run only the sparse retriever on a keyword based query # Run only the sparse retriever on a keyword based query

View File

@ -210,7 +210,7 @@
"from haystack import Pipeline\n", "from haystack import Pipeline\n",
"from haystack.utils import launch_es\n", "from haystack.utils import launch_es\n",
"from haystack.document_stores import ElasticsearchDocumentStore\n", "from haystack.document_stores import ElasticsearchDocumentStore\n",
"from haystack.nodes import ElasticsearchRetriever, DensePassageRetriever, FARMReader\n", "from haystack.nodes import ElasticsearchRetriever, EmbeddingRetriever, FARMReader\n",
"\n", "\n",
"\n", "\n",
"# Initialize DocumentStore and index documents\n", "# Initialize DocumentStore and index documents\n",
@ -223,9 +223,14 @@
"es_retriever = ElasticsearchRetriever(document_store=document_store)\n", "es_retriever = ElasticsearchRetriever(document_store=document_store)\n",
"\n", "\n",
"# Initialize dense retriever\n", "# Initialize dense retriever\n",
"dpr_retriever = DensePassageRetriever(document_store)\n", "embedding_retriever = EmbeddingRetriever(\n",
"document_store.update_embeddings(dpr_retriever, update_existing_embeddings=False)\n", " document_store,\n",
" model_format=\"sentence_transformers\",\n",
" embedding_model=\"sentence-transformers/multi-qa-mpnet-base-dot-v1\",\n",
")\n",
"document_store.update_embeddings(embedding_retriever, update_existing_embeddings=False)\n",
"\n", "\n",
"# Initialize reader\n",
"reader = FARMReader(model_name_or_path=\"deepset/roberta-base-squad2\")" "reader = FARMReader(model_name_or_path=\"deepset/roberta-base-squad2\")"
] ]
}, },
@ -324,7 +329,7 @@
"rag_generator = RAGenerator()\n", "rag_generator = RAGenerator()\n",
"\n", "\n",
"# Generative QA\n", "# Generative QA\n",
"p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=dpr_retriever)\n", "p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=embedding_retriever)\n",
"res = p_generator.run(query=\"Who is the father of Arya Stark?\", params={\"Retriever\": {\"top_k\": 10}})\n", "res = p_generator.run(query=\"Who is the father of Arya Stark?\", params={\"Retriever\": {\"top_k\": 10}})\n",
"print_answers(res, details=\"minimum\")\n", "print_answers(res, details=\"minimum\")\n",
"\n", "\n",
@ -428,12 +433,12 @@
}, },
"source": [ "source": [
"Pipelines offer a very simple way to ensemble together different components.\n", "Pipelines offer a very simple way to ensemble together different components.\n",
"In this example, we are going to combine the power of a `DensePassageRetriever`\n", "In this example, we are going to combine the power of an `EmbeddingRetriever`\n",
"with the keyword based `ElasticsearchRetriever`.\n", "with the keyword based `ElasticsearchRetriever`.\n",
"See our [documentation](https://haystack.deepset.ai/docs/latest/retrievermd) to understand why\n", "See our [documentation](https://haystack.deepset.ai/docs/latest/retrievermd) to understand why\n",
"we might want to combine a dense and sparse retriever.\n", "we might want to combine a dense and sparse retriever.\n",
"\n", "\n",
"![image](https://user-images.githubusercontent.com/1563902/102451782-7bd80400-4039-11eb-9046-01b002a783f8.png)\n", "![image]()\n",
"\n", "\n",
"Here we use a `JoinDocuments` node so that the predictions from each retriever can be merged together." "Here we use a `JoinDocuments` node so that the predictions from each retriever can be merged together."
] ]
@ -454,16 +459,16 @@
"# Create ensembled pipeline\n", "# Create ensembled pipeline\n",
"p_ensemble = Pipeline()\n", "p_ensemble = Pipeline()\n",
"p_ensemble.add_node(component=es_retriever, name=\"ESRetriever\", inputs=[\"Query\"])\n", "p_ensemble.add_node(component=es_retriever, name=\"ESRetriever\", inputs=[\"Query\"])\n",
"p_ensemble.add_node(component=dpr_retriever, name=\"DPRRetriever\", inputs=[\"Query\"])\n", "p_ensemble.add_node(component=embedding_retriever, name=\"EmbeddingRetriever\", inputs=[\"Query\"])\n",
"p_ensemble.add_node(\n", "p_ensemble.add_node(\n",
" component=JoinDocuments(join_mode=\"concatenate\"), name=\"JoinResults\", inputs=[\"ESRetriever\", \"DPRRetriever\"]\n", " component=JoinDocuments(join_mode=\"concatenate\"), name=\"JoinResults\", inputs=[\"ESRetriever\", \"EmbeddingRetriever\"]\n",
")\n", ")\n",
"p_ensemble.add_node(component=reader, name=\"Reader\", inputs=[\"JoinResults\"])\n", "p_ensemble.add_node(component=reader, name=\"Reader\", inputs=[\"JoinResults\"])\n",
"p_ensemble.draw(\"pipeline_ensemble.png\")\n", "p_ensemble.draw(\"pipeline_ensemble.png\")\n",
"\n", "\n",
"# Run pipeline\n", "# Run pipeline\n",
"res = p_ensemble.run(\n", "res = p_ensemble.run(\n",
" query=\"Who is the father of Arya Stark?\", params={\"DPRRetriever\": {\"top_k\": 5}, \"ESRetriever\": {\"top_k\": 5}}\n", " query=\"Who is the father of Arya Stark?\", params={\"EmbeddingRetriever\": {\"top_k\": 5}, \"ESRetriever\": {\"top_k\": 5}}\n",
")\n", ")\n",
"print_answers(res, details=\"minimum\")" "print_answers(res, details=\"minimum\")"
] ]
@ -529,10 +534,10 @@
"## Decision Nodes\n", "## Decision Nodes\n",
"\n", "\n",
"Decision Nodes help you route your data so that only certain branches of your `Pipeline` are run.\n", "Decision Nodes help you route your data so that only certain branches of your `Pipeline` are run.\n",
"One popular use case for such query classifiers is routing keyword queries to Elasticsearch and questions to DPR + Reader.\n", "One popular use case for such query classifiers is routing keyword queries to Elasticsearch and questions to EmbeddingRetriever + Reader.\n",
"With this approach you keep optimal speed and simplicity for keywords while going deep with transformers when it's most helpful.\n", "With this approach you keep optimal speed and simplicity for keywords while going deep with transformers when it's most helpful.\n",
"\n", "\n",
"![image](https://user-images.githubusercontent.com/1563902/102452199-41229b80-403a-11eb-9365-7038697e7c3e.png)\n", "![image]()\n",
"\n", "\n",
"Though this looks very similar to the ensembled pipeline shown above,\n", "Though this looks very similar to the ensembled pipeline shown above,\n",
"the key difference is that only one of the retrievers is run for each request.\n", "the key difference is that only one of the retrievers is run for each request.\n",
@ -566,13 +571,13 @@
"p_classifier = Pipeline()\n", "p_classifier = Pipeline()\n",
"p_classifier.add_node(component=CustomQueryClassifier(), name=\"QueryClassifier\", inputs=[\"Query\"])\n", "p_classifier.add_node(component=CustomQueryClassifier(), name=\"QueryClassifier\", inputs=[\"Query\"])\n",
"p_classifier.add_node(component=es_retriever, name=\"ESRetriever\", inputs=[\"QueryClassifier.output_1\"])\n", "p_classifier.add_node(component=es_retriever, name=\"ESRetriever\", inputs=[\"QueryClassifier.output_1\"])\n",
"p_classifier.add_node(component=dpr_retriever, name=\"DPRRetriever\", inputs=[\"QueryClassifier.output_2\"])\n", "p_classifier.add_node(component=embedding_retriever, name=\"EmbeddingRetriever\", inputs=[\"QueryClassifier.output_2\"])\n",
"p_classifier.add_node(component=reader, name=\"QAReader\", inputs=[\"ESRetriever\", \"DPRRetriever\"])\n", "p_classifier.add_node(component=reader, name=\"QAReader\", inputs=[\"ESRetriever\", \"EmbeddingRetriever\"])\n",
"p_classifier.draw(\"pipeline_classifier.png\")\n", "p_classifier.draw(\"pipeline_classifier.png\")\n",
"\n", "\n",
"# Run only the dense retriever on the full sentence query\n", "# Run only the dense retriever on the full sentence query\n",
"res_1 = p_classifier.run(query=\"Who is the father of Arya Stark?\")\n", "res_1 = p_classifier.run(query=\"Who is the father of Arya Stark?\")\n",
"print(\"DPR Results\" + \"\\n\" + \"=\" * 15)\n", "print(\"Embedding Retriever Results\" + \"\\n\" + \"=\" * 15)\n",
"print_answers(res_1)\n", "print_answers(res_1)\n",
"\n", "\n",
"# Run only the sparse retriever on a keyword based query\n", "# Run only the sparse retriever on a keyword based query\n",
@ -772,4 +777,4 @@
}, },
"nbformat": 4, "nbformat": 4,
"nbformat_minor": 2 "nbformat_minor": 2
} }

View File

@ -11,7 +11,7 @@ from haystack import Pipeline
from haystack.document_stores import ElasticsearchDocumentStore from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import ( from haystack.nodes import (
ElasticsearchRetriever, ElasticsearchRetriever,
DensePassageRetriever, EmbeddingRetriever,
FARMReader, FARMReader,
RAGenerator, RAGenerator,
BaseComponent, BaseComponent,
@ -39,8 +39,12 @@ def tutorial11_pipelines():
es_retriever = ElasticsearchRetriever(document_store=document_store) es_retriever = ElasticsearchRetriever(document_store=document_store)
# Initialize dense retriever # Initialize dense retriever
dpr_retriever = DensePassageRetriever(document_store) embedding_retriever = EmbeddingRetriever(
document_store.update_embeddings(dpr_retriever, update_existing_embeddings=False) document_store,
model_format="sentence_transformers",
embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
)
document_store.update_embeddings(embedding_retriever, update_existing_embeddings=False)
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2") reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
@ -83,7 +87,7 @@ def tutorial11_pipelines():
# Generative QA # Generative QA
query = "Who is the father of Arya Stark?" query = "Who is the father of Arya Stark?"
p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=dpr_retriever) p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=embedding_retriever)
res = p_generator.run(query=query, params={"Retriever": {"top_k": 10}}) res = p_generator.run(query=query, params={"Retriever": {"top_k": 10}})
print() print()
print_answers(res, details="minimum") print_answers(res, details="minimum")
@ -129,9 +133,11 @@ def tutorial11_pipelines():
# Create ensembled pipeline # Create ensembled pipeline
p_ensemble = Pipeline() p_ensemble = Pipeline()
p_ensemble.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"]) p_ensemble.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"])
p_ensemble.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["Query"]) p_ensemble.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["Query"])
p_ensemble.add_node( p_ensemble.add_node(
component=JoinDocuments(join_mode="concatenate"), name="JoinResults", inputs=["ESRetriever", "DPRRetriever"] component=JoinDocuments(join_mode="concatenate"),
name="JoinResults",
inputs=["ESRetriever", "EmbeddingRetriever"],
) )
p_ensemble.add_node(component=reader, name="Reader", inputs=["JoinResults"]) p_ensemble.add_node(component=reader, name="Reader", inputs=["JoinResults"])
p_ensemble.draw("pipeline_ensemble.png") p_ensemble.draw("pipeline_ensemble.png")
@ -139,7 +145,8 @@ def tutorial11_pipelines():
# Run pipeline # Run pipeline
query = "Who is the father of Arya Stark?" query = "Who is the father of Arya Stark?"
res = p_ensemble.run( res = p_ensemble.run(
query="Who is the father of Arya Stark?", params={"ESRetriever": {"top_k": 5}, "DPRRetriever": {"top_k": 5}} query="Who is the father of Arya Stark?",
params={"ESRetriever": {"top_k": 5}, "EmbeddingRetriever": {"top_k": 5}},
) )
print("\nQuery: ", query) print("\nQuery: ", query)
print("Answers:") print("Answers:")
@ -167,8 +174,8 @@ def tutorial11_pipelines():
p_classifier = Pipeline() p_classifier = Pipeline()
p_classifier.add_node(component=CustomQueryClassifier(), name="QueryClassifier", inputs=["Query"]) p_classifier.add_node(component=CustomQueryClassifier(), name="QueryClassifier", inputs=["Query"])
p_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_1"]) p_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_1"])
p_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_2"]) p_classifier.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["QueryClassifier.output_2"])
p_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"]) p_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "EmbeddingRetriever"])
p_classifier.draw("pipeline_classifier.png") p_classifier.draw("pipeline_classifier.png")
# Run only the dense retriever on the full sentence query # Run only the dense retriever on the full sentence query
@ -176,7 +183,7 @@ def tutorial11_pipelines():
res_1 = p_classifier.run(query=query) res_1 = p_classifier.run(query=query)
print() print()
print("\nQuery: ", query) print("\nQuery: ", query)
print(" * DPR Answers:") print(" * Embedding Retriever Answers:")
print_answers(res_1, details="minimum") print_answers(res_1, details="minimum")
# Run only the sparse retriever on a keyword based query # Run only the sparse retriever on a keyword based query
@ -198,7 +205,7 @@ def tutorial11_pipelines():
# 2) You can provide `debug` as a parameter when running your pipeline # 2) You can provide `debug` as a parameter when running your pipeline
result = p_classifier.run(query="Who is the father of Arya Stark?", params={"ESRetriever": {"debug": True}}) result = p_classifier.run(query="Who is the father of Arya Stark?", params={"ESRetriever": {"debug": True}})
# 3) You can provide the `debug` paramter to all nodes in your pipeline # 3) You can provide the `debug` parameter to all nodes in your pipeline
result = p_classifier.run(query="Who is the father of Arya Stark?", params={"debug": True}) result = p_classifier.run(query="Who is the father of Arya Stark?", params={"debug": True})
pprint(result["_debug"]) pprint(result["_debug"])