Mirror of https://github.com/deepset-ai/haystack.git
Replace DPR with EmbeddingRetriever in Tutorial 11 (#2287)
* images for tutorial 11 in .github folder for easy access
* ipynb: changed DPR to EmbeddingRetriever, incl. new graphs of pipelines
* Update Documentation & Code Style
* moved images into correct folder
* removed images path
* Update Documentation & Code Style
* fixed debugging run of p_classifier
* Update Documentation & Code Style
* Revert debug param change
* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: brandenchan <brandenchan@icloud.com>
Parent: 85571cdd15
Commit: a1040a17b2
New files (binary, not shown):
docs/_src/img/tutorial11_custompipelines_pipeline_ensemble.png (30 KiB)
docs/_src/img/tutorial11_decision_nodes_pipeline_classifier.png (32 KiB)
Tutorial 11 Markdown page:

@@ -100,7 +100,7 @@ to perform Open Domain Question Answering.
 from haystack import Pipeline
 from haystack.utils import launch_es
 from haystack.document_stores import ElasticsearchDocumentStore
-from haystack.nodes import ElasticsearchRetriever, DensePassageRetriever, FARMReader
+from haystack.nodes import ElasticsearchRetriever, EmbeddingRetriever, FARMReader
 
 
 # Initialize DocumentStore and index documents
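An aside for readers applying this hunk by hand: the imports above assume Elasticsearch is already running and indexed. A minimal setup sketch in the Haystack 1.x style this tutorial uses elsewhere (host, credentials, and index name are the tutorial defaults, shown here as assumptions):

```
from haystack.utils import launch_es
from haystack.document_stores import ElasticsearchDocumentStore

# Start a local Elasticsearch container (requires Docker) and give it time to boot.
launch_es()

# Connect to the instance; empty credentials and the "document" index are defaults.
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")
```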
@@ -113,9 +113,14 @@ document_store.write_documents(got_dicts)
 es_retriever = ElasticsearchRetriever(document_store=document_store)
 
 # Initialize dense retriever
-dpr_retriever = DensePassageRetriever(document_store)
-document_store.update_embeddings(dpr_retriever, update_existing_embeddings=False)
+embedding_retriever = EmbeddingRetriever(
+    document_store,
+    model_format="sentence_transformers",
+    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
+)
+document_store.update_embeddings(embedding_retriever, update_existing_embeddings=False)
 
 # Initialize reader
 reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
 ```
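What the swap buys: `DensePassageRetriever` needs separate query and passage encoders, while `EmbeddingRetriever` embeds queries and documents with one sentence-transformers model, here `multi-qa-mpnet-base-dot-v1`. A quick sanity-check sketch, not part of the diff, using the standard Haystack 1.x `retrieve` method:

```
# Ask the new retriever for candidates directly, bypassing any pipeline.
candidate_docs = embedding_retriever.retrieve(query="Who is the father of Arya Stark?", top_k=5)
for doc in candidate_docs:
    # `score` is the embedding similarity; `meta` holds whatever fields were indexed.
    print(doc.score, doc.meta.get("name"))
```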
@@ -163,7 +168,7 @@ document_store.return_embedding = True
 rag_generator = RAGenerator()
 
 # Generative QA
-p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=dpr_retriever)
+p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=embedding_retriever)
 res = p_generator.run(query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}})
 print_answers(res, details="minimum")
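Note that `params={"Retriever": {"top_k": 10}}` survives the swap unchanged, because `GenerativeQAPipeline` registers whichever retriever it receives under the node name `Retriever`. A small inspection sketch, assuming the Haystack 1.x result format with `Answer` objects:

```
# The generated answers live under "answers"; each is an Answer object.
for answer in res["answers"]:
    print(answer.answer)
```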
@@ -214,12 +219,12 @@ p_extractive.draw("pipeline_extractive.png")
 ```
 
 Pipelines offer a very simple way to ensemble together different components.
-In this example, we are going to combine the power of a `DensePassageRetriever`
+In this example, we are going to combine the power of an `EmbeddingRetriever`
 with the keyword based `ElasticsearchRetriever`.
 See our [documentation](https://haystack.deepset.ai/docs/latest/retrievermd) to understand why
 we might want to combine a dense and sparse retriever.
 
-![image]()
+![image]()
 
 Here we use a `JoinDocuments` node so that the predictions from each retriever can be merged together.
@@ -230,16 +235,16 @@ from haystack.pipelines import JoinDocuments
 # Create ensembled pipeline
 p_ensemble = Pipeline()
 p_ensemble.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"])
-p_ensemble.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["Query"])
+p_ensemble.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["Query"])
 p_ensemble.add_node(
-    component=JoinDocuments(join_mode="concatenate"), name="JoinResults", inputs=["ESRetriever", "DPRRetriever"]
+    component=JoinDocuments(join_mode="concatenate"), name="JoinResults", inputs=["ESRetriever", "EmbeddingRetriever"]
 )
 p_ensemble.add_node(component=reader, name="Reader", inputs=["JoinResults"])
 p_ensemble.draw("pipeline_ensemble.png")
 
 # Run pipeline
 res = p_ensemble.run(
-    query="Who is the father of Arya Stark?", params={"DPRRetriever": {"top_k": 5}, "ESRetriever": {"top_k": 5}}
+    query="Who is the father of Arya Stark?", params={"EmbeddingRetriever": {"top_k": 5}, "ESRetriever": {"top_k": 5}}
 )
 print_answers(res, details="minimum")
 ```
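`join_mode="concatenate"` simply appends the two result lists. If score-based fusion is preferred, `JoinDocuments` also supports a merge mode; a sketch assuming the Haystack 1.x signature (the weights are illustrative):

```
from haystack.pipelines import JoinDocuments

# Merge by weighted score instead of concatenating; weights follow the order
# of the input nodes ("ESRetriever" first, "EmbeddingRetriever" second).
join_node = JoinDocuments(join_mode="merge", weights=[0.3, 0.7], top_k_join=5)
```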
@@ -277,10 +282,10 @@ class CustomNode(BaseComponent):
 ## Decision Nodes
 
 Decision Nodes help you route your data so that only certain branches of your `Pipeline` are run.
-One popular use case for such query classifiers is routing keyword queries to Elasticsearch and questions to DPR + Reader.
+One popular use case for such query classifiers is routing keyword queries to Elasticsearch and questions to EmbeddingRetriever + Reader.
 With this approach you keep optimal speed and simplicity for keywords while going deep with transformers when it's most helpful.
 
-![image]()
+![image]()
 
 Though this looks very similar to the ensembled pipeline shown above,
 the key difference is that only one of the retrievers is run for each request.
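The hunk below only rewires the classifier's outputs; the `CustomQueryClassifier` body sits outside the changed lines. For context, a minimal sketch of such a node in Haystack 1.x (the question-mark heuristic mirrors the tutorial; treat the exact body as illustrative):

```
from haystack.nodes import BaseComponent

class CustomQueryClassifier(BaseComponent):
    outgoing_edges = 2

    def run(self, query: str):
        # Natural-language questions go to output_2 (EmbeddingRetriever);
        # keyword queries go to output_1 (ElasticsearchRetriever).
        if "?" in query:
            return {}, "output_2"
        return {}, "output_1"
```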
@@ -304,13 +309,13 @@ class CustomQueryClassifier(BaseComponent):
 p_classifier = Pipeline()
 p_classifier.add_node(component=CustomQueryClassifier(), name="QueryClassifier", inputs=["Query"])
 p_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_1"])
-p_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_2"])
-p_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"])
+p_classifier.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["QueryClassifier.output_2"])
+p_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "EmbeddingRetriever"])
 p_classifier.draw("pipeline_classifier.png")
 
 # Run only the dense retriever on the full sentence query
 res_1 = p_classifier.run(query="Who is the father of Arya Stark?")
-print("DPR Results" + "\n" + "=" * 15)
+print("Embedding Retriever Results" + "\n" + "=" * 15)
 print_answers(res_1)
 
 # Run only the sparse retriever on a keyword based query
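The hunk cuts off right at the keyword branch; for completeness, a sketch of the call the tutorial makes next (the query string is illustrative):

```
# A query without a question mark is routed to the sparse retriever only.
res_2 = p_classifier.run(query="arya stark father")
print("ES Results" + "\n" + "=" * 15)
print_answers(res_2)
```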
Tutorial 11 notebook (.ipynb):

@@ -210,7 +210,7 @@
 "from haystack import Pipeline\n",
 "from haystack.utils import launch_es\n",
 "from haystack.document_stores import ElasticsearchDocumentStore\n",
-"from haystack.nodes import ElasticsearchRetriever, DensePassageRetriever, FARMReader\n",
+"from haystack.nodes import ElasticsearchRetriever, EmbeddingRetriever, FARMReader\n",
 "\n",
 "\n",
 "# Initialize DocumentStore and index documents\n",
@@ -223,9 +223,14 @@
 "es_retriever = ElasticsearchRetriever(document_store=document_store)\n",
 "\n",
 "# Initialize dense retriever\n",
-"dpr_retriever = DensePassageRetriever(document_store)\n",
-"document_store.update_embeddings(dpr_retriever, update_existing_embeddings=False)\n",
+"embedding_retriever = EmbeddingRetriever(\n",
+"    document_store,\n",
+"    model_format=\"sentence_transformers\",\n",
+"    embedding_model=\"sentence-transformers/multi-qa-mpnet-base-dot-v1\",\n",
+")\n",
+"document_store.update_embeddings(embedding_retriever, update_existing_embeddings=False)\n",
 "\n",
 "# Initialize reader\n",
 "reader = FARMReader(model_name_or_path=\"deepset/roberta-base-squad2\")"
 ]
 },
@@ -324,7 +329,7 @@
 "rag_generator = RAGenerator()\n",
 "\n",
 "# Generative QA\n",
-"p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=dpr_retriever)\n",
+"p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=embedding_retriever)\n",
 "res = p_generator.run(query=\"Who is the father of Arya Stark?\", params={\"Retriever\": {\"top_k\": 10}})\n",
 "print_answers(res, details=\"minimum\")\n",
 "\n",
@@ -428,12 +433,12 @@
 },
 "source": [
 "Pipelines offer a very simple way to ensemble together different components.\n",
-"In this example, we are going to combine the power of a `DensePassageRetriever`\n",
+"In this example, we are going to combine the power of an `EmbeddingRetriever`\n",
 "with the keyword based `ElasticsearchRetriever`.\n",
 "See our [documentation](https://haystack.deepset.ai/docs/latest/retrievermd) to understand why\n",
 "we might want to combine a dense and sparse retriever.\n",
 "\n",
-"![image]()\n",
+"![image]()\n",
 "\n",
 "Here we use a `JoinDocuments` node so that the predictions from each retriever can be merged together."
 ]
@@ -454,16 +459,16 @@
 "# Create ensembled pipeline\n",
 "p_ensemble = Pipeline()\n",
 "p_ensemble.add_node(component=es_retriever, name=\"ESRetriever\", inputs=[\"Query\"])\n",
-"p_ensemble.add_node(component=dpr_retriever, name=\"DPRRetriever\", inputs=[\"Query\"])\n",
+"p_ensemble.add_node(component=embedding_retriever, name=\"EmbeddingRetriever\", inputs=[\"Query\"])\n",
 "p_ensemble.add_node(\n",
-"    component=JoinDocuments(join_mode=\"concatenate\"), name=\"JoinResults\", inputs=[\"ESRetriever\", \"DPRRetriever\"]\n",
+"    component=JoinDocuments(join_mode=\"concatenate\"), name=\"JoinResults\", inputs=[\"ESRetriever\", \"EmbeddingRetriever\"]\n",
 ")\n",
 "p_ensemble.add_node(component=reader, name=\"Reader\", inputs=[\"JoinResults\"])\n",
 "p_ensemble.draw(\"pipeline_ensemble.png\")\n",
 "\n",
 "# Run pipeline\n",
 "res = p_ensemble.run(\n",
-"    query=\"Who is the father of Arya Stark?\", params={\"DPRRetriever\": {\"top_k\": 5}, \"ESRetriever\": {\"top_k\": 5}}\n",
+"    query=\"Who is the father of Arya Stark?\", params={\"EmbeddingRetriever\": {\"top_k\": 5}, \"ESRetriever\": {\"top_k\": 5}}\n",
 ")\n",
 "print_answers(res, details=\"minimum\")"
 ]
@@ -529,10 +534,10 @@
 "## Decision Nodes\n",
 "\n",
 "Decision Nodes help you route your data so that only certain branches of your `Pipeline` are run.\n",
-"One popular use case for such query classifiers is routing keyword queries to Elasticsearch and questions to DPR + Reader.\n",
+"One popular use case for such query classifiers is routing keyword queries to Elasticsearch and questions to EmbeddingRetriever + Reader.\n",
 "With this approach you keep optimal speed and simplicity for keywords while going deep with transformers when it's most helpful.\n",
 "\n",
-"![image]()\n",
+"![image]()\n",
 "\n",
 "Though this looks very similar to the ensembled pipeline shown above,\n",
 "the key difference is that only one of the retrievers is run for each request.\n",
@@ -566,13 +571,13 @@
 "p_classifier = Pipeline()\n",
 "p_classifier.add_node(component=CustomQueryClassifier(), name=\"QueryClassifier\", inputs=[\"Query\"])\n",
 "p_classifier.add_node(component=es_retriever, name=\"ESRetriever\", inputs=[\"QueryClassifier.output_1\"])\n",
-"p_classifier.add_node(component=dpr_retriever, name=\"DPRRetriever\", inputs=[\"QueryClassifier.output_2\"])\n",
-"p_classifier.add_node(component=reader, name=\"QAReader\", inputs=[\"ESRetriever\", \"DPRRetriever\"])\n",
+"p_classifier.add_node(component=embedding_retriever, name=\"EmbeddingRetriever\", inputs=[\"QueryClassifier.output_2\"])\n",
+"p_classifier.add_node(component=reader, name=\"QAReader\", inputs=[\"ESRetriever\", \"EmbeddingRetriever\"])\n",
 "p_classifier.draw(\"pipeline_classifier.png\")\n",
 "\n",
 "# Run only the dense retriever on the full sentence query\n",
 "res_1 = p_classifier.run(query=\"Who is the father of Arya Stark?\")\n",
-"print(\"DPR Results\" + \"\\n\" + \"=\" * 15)\n",
+"print(\"Embedding Retriever Results\" + \"\\n\" + \"=\" * 15)\n",
 "print_answers(res_1)\n",
 "\n",
 "# Run only the sparse retriever on a keyword based query\n",
@@ -772,4 +777,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 2
-}
+}
Tutorial 11 Python script (.py):

@@ -11,7 +11,7 @@ from haystack import Pipeline
 from haystack.document_stores import ElasticsearchDocumentStore
 from haystack.nodes import (
     ElasticsearchRetriever,
-    DensePassageRetriever,
+    EmbeddingRetriever,
     FARMReader,
     RAGenerator,
     BaseComponent,
@@ -39,8 +39,12 @@ def tutorial11_pipelines():
     es_retriever = ElasticsearchRetriever(document_store=document_store)
 
     # Initialize dense retriever
-    dpr_retriever = DensePassageRetriever(document_store)
-    document_store.update_embeddings(dpr_retriever, update_existing_embeddings=False)
+    embedding_retriever = EmbeddingRetriever(
+        document_store,
+        model_format="sentence_transformers",
+        embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
+    )
+    document_store.update_embeddings(embedding_retriever, update_existing_embeddings=False)
 
     reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
 
@@ -83,7 +87,7 @@ def tutorial11_pipelines():
 
     # Generative QA
     query = "Who is the father of Arya Stark?"
-    p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=dpr_retriever)
+    p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=embedding_retriever)
     res = p_generator.run(query=query, params={"Retriever": {"top_k": 10}})
     print()
     print_answers(res, details="minimum")
@@ -129,9 +133,11 @@ def tutorial11_pipelines():
     # Create ensembled pipeline
     p_ensemble = Pipeline()
     p_ensemble.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"])
-    p_ensemble.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["Query"])
+    p_ensemble.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["Query"])
     p_ensemble.add_node(
-        component=JoinDocuments(join_mode="concatenate"), name="JoinResults", inputs=["ESRetriever", "DPRRetriever"]
+        component=JoinDocuments(join_mode="concatenate"),
+        name="JoinResults",
+        inputs=["ESRetriever", "EmbeddingRetriever"],
     )
     p_ensemble.add_node(component=reader, name="Reader", inputs=["JoinResults"])
     p_ensemble.draw("pipeline_ensemble.png")
@@ -139,7 +145,8 @@ def tutorial11_pipelines():
     # Run pipeline
     query = "Who is the father of Arya Stark?"
     res = p_ensemble.run(
-        query="Who is the father of Arya Stark?", params={"ESRetriever": {"top_k": 5}, "DPRRetriever": {"top_k": 5}}
+        query="Who is the father of Arya Stark?",
+        params={"ESRetriever": {"top_k": 5}, "EmbeddingRetriever": {"top_k": 5}},
     )
     print("\nQuery: ", query)
     print("Answers:")
@@ -167,8 +174,8 @@ def tutorial11_pipelines():
     p_classifier = Pipeline()
     p_classifier.add_node(component=CustomQueryClassifier(), name="QueryClassifier", inputs=["Query"])
     p_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_1"])
-    p_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_2"])
-    p_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"])
+    p_classifier.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["QueryClassifier.output_2"])
+    p_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "EmbeddingRetriever"])
     p_classifier.draw("pipeline_classifier.png")
 
     # Run only the dense retriever on the full sentence query
@@ -176,7 +183,7 @@ def tutorial11_pipelines():
     res_1 = p_classifier.run(query=query)
     print()
     print("\nQuery: ", query)
-    print(" * DPR Answers:")
+    print(" * Embedding Retriever Answers:")
     print_answers(res_1, details="minimum")
 
     # Run only the sparse retriever on a keyword based query
@@ -198,7 +205,7 @@ def tutorial11_pipelines():
     # 2) You can provide `debug` as a parameter when running your pipeline
     result = p_classifier.run(query="Who is the father of Arya Stark?", params={"ESRetriever": {"debug": True}})
 
-    # 3) You can provide the `debug` paramter to all nodes in your pipeline
+    # 3) You can provide the `debug` parameter to all nodes in your pipeline
     result = p_classifier.run(query="Who is the father of Arya Stark?", params={"debug": True})
 
     pprint(result["_debug"])
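The numbering in this last hunk implies an option 1) just above the changed lines; in Haystack 1.x that is setting `debug` on the node itself (shown here as an assumption, since those lines sit outside the hunk):

```
from pprint import pprint

# 1) Set the `debug` attribute directly on a node instance (assumed option 1
#    from the hunk's numbering; lines outside the diff).
es_retriever.debug = True

result = p_classifier.run(query="Who is the father of Arya Stark?")

# Inputs and outputs recorded per node land under the "_debug" key.
pprint(result["_debug"])
```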