diff --git a/docs/_src/img/tutorial11_custompipelines_pipeline_ensemble.png b/docs/_src/img/tutorial11_custompipelines_pipeline_ensemble.png
new file mode 100644
index 000000000..56b58cbe3
Binary files /dev/null and b/docs/_src/img/tutorial11_custompipelines_pipeline_ensemble.png differ
diff --git a/docs/_src/img/tutorial11_decision_nodes_pipeline_classifier.png b/docs/_src/img/tutorial11_decision_nodes_pipeline_classifier.png
new file mode 100644
index 000000000..28a4eb4d3
Binary files /dev/null and b/docs/_src/img/tutorial11_decision_nodes_pipeline_classifier.png differ
diff --git a/docs/_src/tutorials/tutorials/11.md b/docs/_src/tutorials/tutorials/11.md
index b47517052..b6a30ec02 100644
--- a/docs/_src/tutorials/tutorials/11.md
+++ b/docs/_src/tutorials/tutorials/11.md
@@ -100,7 +100,7 @@ to perform Open Domain Question Answering.
 from haystack import Pipeline
 from haystack.utils import launch_es
 from haystack.document_stores import ElasticsearchDocumentStore
-from haystack.nodes import ElasticsearchRetriever, DensePassageRetriever, FARMReader
+from haystack.nodes import ElasticsearchRetriever, EmbeddingRetriever, FARMReader
 
 
 # Initialize DocumentStore and index documents
@@ -113,9 +113,14 @@ document_store.write_documents(got_dicts)
 es_retriever = ElasticsearchRetriever(document_store=document_store)
 
 # Initialize dense retriever
-dpr_retriever = DensePassageRetriever(document_store)
-document_store.update_embeddings(dpr_retriever, update_existing_embeddings=False)
+embedding_retriever = EmbeddingRetriever(
+    document_store,
+    model_format="sentence_transformers",
+    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
+)
+document_store.update_embeddings(embedding_retriever, update_existing_embeddings=False)
 
+# Initialize reader
 reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
 ```
 
@@ -163,7 +168,7 @@ document_store.return_embedding = True
 rag_generator = RAGenerator()
 
 # Generative QA
-p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=dpr_retriever)
+p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=embedding_retriever)
 res = p_generator.run(query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}})
 print_answers(res, details="minimum")
 
@@ -214,12 +219,12 @@ p_extractive.draw("pipeline_extractive.png")
 ```
 
 Pipelines offer a very simple way to ensemble together different components.
-In this example, we are going to combine the power of a `DensePassageRetriever`
+In this example, we are going to combine the power of an `EmbeddingRetriever`
 with the keyword based `ElasticsearchRetriever`.
 See our [documentation](https://haystack.deepset.ai/docs/latest/retrievermd) to understand why
 we might want to combine a dense and sparse retriever.
 
-![image](https://user-images.githubusercontent.com/1563902/102451782-7bd80400-4039-11eb-9046-01b002a783f8.png)
+![image](../../img/tutorial11_custompipelines_pipeline_ensemble.png)
 
 Here we use a `JoinDocuments` node so that the predictions from each retriever
 can be merged together.
@@ -230,16 +235,16 @@ from haystack.pipelines import JoinDocuments
 # Create ensembled pipeline
 p_ensemble = Pipeline()
 p_ensemble.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"])
-p_ensemble.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["Query"])
+p_ensemble.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["Query"])
 p_ensemble.add_node(
-    component=JoinDocuments(join_mode="concatenate"), name="JoinResults", inputs=["ESRetriever", "DPRRetriever"]
+    component=JoinDocuments(join_mode="concatenate"), name="JoinResults", inputs=["ESRetriever", "EmbeddingRetriever"]
 )
 p_ensemble.add_node(component=reader, name="Reader", inputs=["JoinResults"])
 p_ensemble.draw("pipeline_ensemble.png")
 
 # Run pipeline
 res = p_ensemble.run(
-    query="Who is the father of Arya Stark?", params={"DPRRetriever": {"top_k": 5}, "ESRetriever": {"top_k": 5}}
+    query="Who is the father of Arya Stark?", params={"EmbeddingRetriever": {"top_k": 5}, "ESRetriever": {"top_k": 5}}
 )
 print_answers(res, details="minimum")
 ```
@@ -277,10 +282,10 @@ class CustomNode(BaseComponent):
 ## Decision Nodes
 
 Decision Nodes help you route your data so that only certain branches of your `Pipeline` are run.
-One popular use case for such query classifiers is routing keyword queries to Elasticsearch and questions to DPR + Reader.
+One popular use case for such query classifiers is routing keyword queries to Elasticsearch and questions to EmbeddingRetriever + Reader.
 With this approach you keep optimal speed and simplicity for keywords while going deep with transformers when it's most helpful.
 
-![image](https://user-images.githubusercontent.com/1563902/102452199-41229b80-403a-11eb-9365-7038697e7c3e.png)
+![image](../../img/tutorial11_decision_nodes_pipeline_classifier.png)
 
 Though this looks very similar to the ensembled pipeline shown above,
 the key difference is that only one of the retrievers is run for each request.
@@ -304,13 +309,13 @@ class CustomQueryClassifier(BaseComponent):
 p_classifier = Pipeline()
 p_classifier.add_node(component=CustomQueryClassifier(), name="QueryClassifier", inputs=["Query"])
 p_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_1"])
-p_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_2"])
-p_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"])
+p_classifier.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["QueryClassifier.output_2"])
+p_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "EmbeddingRetriever"])
 p_classifier.draw("pipeline_classifier.png")
 
 # Run only the dense retriever on the full sentence query
 res_1 = p_classifier.run(query="Who is the father of Arya Stark?")
-print("DPR Results" + "\n" + "=" * 15)
+print("Embedding Retriever Results" + "\n" + "=" * 15)
 print_answers(res_1)
 
 # Run only the sparse retriever on a keyword based query
diff --git a/tutorials/Tutorial11_Pipelines.ipynb b/tutorials/Tutorial11_Pipelines.ipynb
index 12ab3ed3b..6b94fd263 100644
--- a/tutorials/Tutorial11_Pipelines.ipynb
+++ b/tutorials/Tutorial11_Pipelines.ipynb
@@ -210,7 +210,7 @@
     "from haystack import Pipeline\n",
     "from haystack.utils import launch_es\n",
     "from haystack.document_stores import ElasticsearchDocumentStore\n",
-    "from haystack.nodes import ElasticsearchRetriever, DensePassageRetriever, FARMReader\n",
+    "from haystack.nodes import ElasticsearchRetriever, EmbeddingRetriever, FARMReader\n",
     "\n",
     "\n",
     "# Initialize DocumentStore and index documents\n",
@@ -223,9 +223,14 @@
     "es_retriever = ElasticsearchRetriever(document_store=document_store)\n",
     "\n",
     "# Initialize dense retriever\n",
-    "dpr_retriever = DensePassageRetriever(document_store)\n",
-    "document_store.update_embeddings(dpr_retriever, update_existing_embeddings=False)\n",
+    "embedding_retriever = EmbeddingRetriever(\n",
+    "    document_store,\n",
+    "    model_format=\"sentence_transformers\",\n",
+    "    embedding_model=\"sentence-transformers/multi-qa-mpnet-base-dot-v1\",\n",
+    ")\n",
+    "document_store.update_embeddings(embedding_retriever, update_existing_embeddings=False)\n",
     "\n",
+    "# Initialize reader\n",
     "reader = FARMReader(model_name_or_path=\"deepset/roberta-base-squad2\")"
    ]
   },
@@ -324,7 +329,7 @@
     "rag_generator = RAGenerator()\n",
     "\n",
     "# Generative QA\n",
-    "p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=dpr_retriever)\n",
+    "p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=embedding_retriever)\n",
     "res = p_generator.run(query=\"Who is the father of Arya Stark?\", params={\"Retriever\": {\"top_k\": 10}})\n",
     "print_answers(res, details=\"minimum\")\n",
     "\n",
@@ -428,12 +433,12 @@ },
    "source": [
     "Pipelines offer a very simple way to ensemble together different components.\n",
-    "In this example, we are going to combine the power of a `DensePassageRetriever`\n",
+    "In this example, we are going to combine the power of an `EmbeddingRetriever`\n",
     "with the keyword based `ElasticsearchRetriever`.\n",
     "See our [documentation](https://haystack.deepset.ai/docs/latest/retrievermd) to understand why\n",
     "we might want to combine a dense and sparse retriever.\n",
     "\n",
-    "![image](https://user-images.githubusercontent.com/1563902/102451782-7bd80400-4039-11eb-9046-01b002a783f8.png)\n",
+    "![image](../docs/_src/img/tutorial11_custompipelines_pipeline_ensemble.png)\n",
     "\n",
     "Here we use a `JoinDocuments` node so that the predictions from each retriever can be merged together."
    ]
   },
@@ -454,16 +459,16 @@
     "# Create ensembled pipeline\n",
     "p_ensemble = Pipeline()\n",
     "p_ensemble.add_node(component=es_retriever, name=\"ESRetriever\", inputs=[\"Query\"])\n",
-    "p_ensemble.add_node(component=dpr_retriever, name=\"DPRRetriever\", inputs=[\"Query\"])\n",
+    "p_ensemble.add_node(component=embedding_retriever, name=\"EmbeddingRetriever\", inputs=[\"Query\"])\n",
     "p_ensemble.add_node(\n",
-    "    component=JoinDocuments(join_mode=\"concatenate\"), name=\"JoinResults\", inputs=[\"ESRetriever\", \"DPRRetriever\"]\n",
+    "    component=JoinDocuments(join_mode=\"concatenate\"), name=\"JoinResults\", inputs=[\"ESRetriever\", \"EmbeddingRetriever\"]\n",
     ")\n",
     "p_ensemble.add_node(component=reader, name=\"Reader\", inputs=[\"JoinResults\"])\n",
     "p_ensemble.draw(\"pipeline_ensemble.png\")\n",
     "\n",
     "# Run pipeline\n",
     "res = p_ensemble.run(\n",
-    "    query=\"Who is the father of Arya Stark?\", params={\"DPRRetriever\": {\"top_k\": 5}, \"ESRetriever\": {\"top_k\": 5}}\n",
+    "    query=\"Who is the father of Arya Stark?\", params={\"EmbeddingRetriever\": {\"top_k\": 5}, \"ESRetriever\": {\"top_k\": 5}}\n",
     ")\n",
     "print_answers(res, details=\"minimum\")"
    ]
@@ -529,10 +534,10 @@ "source": [
     "## Decision Nodes\n",
     "\n",
     "Decision Nodes help you route your data so that only certain branches of your `Pipeline` are run.\n",
-    "One popular use case for such query classifiers is routing keyword queries to Elasticsearch and questions to DPR + Reader.\n",
+    "One popular use case for such query classifiers is routing keyword queries to Elasticsearch and questions to EmbeddingRetriever + Reader.\n",
     "With this approach you keep optimal speed and simplicity for keywords while going deep with transformers when it's most helpful.\n",
     "\n",
-    "![image](https://user-images.githubusercontent.com/1563902/102452199-41229b80-403a-11eb-9365-7038697e7c3e.png)\n",
+    "![image](../docs/_src/img/tutorial11_decision_nodes_pipeline_classifier.png)\n",
     "\n",
     "Though this looks very similar to the ensembled pipeline shown above,\n",
     "the key difference is that only one of the retrievers is run for each request.\n",
@@ -566,13 +571,13 @@
     "p_classifier = Pipeline()\n",
     "p_classifier.add_node(component=CustomQueryClassifier(), name=\"QueryClassifier\", inputs=[\"Query\"])\n",
     "p_classifier.add_node(component=es_retriever, name=\"ESRetriever\", inputs=[\"QueryClassifier.output_1\"])\n",
-    "p_classifier.add_node(component=dpr_retriever, name=\"DPRRetriever\", inputs=[\"QueryClassifier.output_2\"])\n",
-    "p_classifier.add_node(component=reader, name=\"QAReader\", inputs=[\"ESRetriever\", \"DPRRetriever\"])\n",
+    "p_classifier.add_node(component=embedding_retriever, name=\"EmbeddingRetriever\", inputs=[\"QueryClassifier.output_2\"])\n",
+    "p_classifier.add_node(component=reader, name=\"QAReader\", inputs=[\"ESRetriever\", \"EmbeddingRetriever\"])\n",
     "p_classifier.draw(\"pipeline_classifier.png\")\n",
     "\n",
     "# Run only the dense retriever on the full sentence query\n",
     "res_1 = p_classifier.run(query=\"Who is the father of Arya Stark?\")\n",
-    "print(\"DPR Results\" + \"\\n\" + \"=\" * 15)\n",
+    "print(\"Embedding Retriever Results\" + \"\\n\" + \"=\" * 15)\n",
     "print_answers(res_1)\n",
     "\n",
     "# Run only the sparse retriever on a keyword based query\n",
@@ -772,4 +777,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}
\ No newline at end of file
diff --git a/tutorials/Tutorial11_Pipelines.py b/tutorials/Tutorial11_Pipelines.py
index be4522c54..a8413d5a5 100644
--- a/tutorials/Tutorial11_Pipelines.py
+++ b/tutorials/Tutorial11_Pipelines.py
@@ -11,7 +11,7 @@ from haystack import Pipeline
from haystack.document_stores import ElasticsearchDocumentStore from haystack.nodes import ( ElasticsearchRetriever, - DensePassageRetriever, + EmbeddingRetriever, FARMReader, RAGenerator, BaseComponent, @@ -39,8 +39,12 @@ def tutorial11_pipelines(): es_retriever = ElasticsearchRetriever(document_store=document_store) # Initialize dense retriever - dpr_retriever = DensePassageRetriever(document_store) - document_store.update_embeddings(dpr_retriever, update_existing_embeddings=False) + embedding_retriever = EmbeddingRetriever( + document_store, + model_format="sentence_transformers", + embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1", + ) + document_store.update_embeddings(embedding_retriever, update_existing_embeddings=False) reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2") @@ -83,7 +87,7 @@ def tutorial11_pipelines(): # Generative QA query = "Who is the father of Arya Stark?" - p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=dpr_retriever) + p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=embedding_retriever) res = p_generator.run(query=query, params={"Retriever": {"top_k": 10}}) print() print_answers(res, details="minimum") @@ -129,9 +133,11 @@ def tutorial11_pipelines(): # Create ensembled pipeline p_ensemble = Pipeline() p_ensemble.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"]) - p_ensemble.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["Query"]) + p_ensemble.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["Query"]) p_ensemble.add_node( - component=JoinDocuments(join_mode="concatenate"), name="JoinResults", inputs=["ESRetriever", "DPRRetriever"] + component=JoinDocuments(join_mode="concatenate"), + name="JoinResults", + inputs=["ESRetriever", "EmbeddingRetriever"], ) p_ensemble.add_node(component=reader, name="Reader", inputs=["JoinResults"]) p_ensemble.draw("pipeline_ensemble.png") @@ -139,7 +145,8 @@ def tutorial11_pipelines(): # Run pipeline query = "Who is the father of Arya Stark?" 
res = p_ensemble.run( - query="Who is the father of Arya Stark?", params={"ESRetriever": {"top_k": 5}, "DPRRetriever": {"top_k": 5}} + query="Who is the father of Arya Stark?", + params={"ESRetriever": {"top_k": 5}, "EmbeddingRetriever": {"top_k": 5}}, ) print("\nQuery: ", query) print("Answers:") @@ -167,8 +174,8 @@ def tutorial11_pipelines(): p_classifier = Pipeline() p_classifier.add_node(component=CustomQueryClassifier(), name="QueryClassifier", inputs=["Query"]) p_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_1"]) - p_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_2"]) - p_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"]) + p_classifier.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["QueryClassifier.output_2"]) + p_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "EmbeddingRetriever"]) p_classifier.draw("pipeline_classifier.png") # Run only the dense retriever on the full sentence query @@ -176,7 +183,7 @@ def tutorial11_pipelines(): res_1 = p_classifier.run(query=query) print() print("\nQuery: ", query) - print(" * DPR Answers:") + print(" * Embedding Retriever Answers:") print_answers(res_1, details="minimum") # Run only the sparse retriever on a keyword based query @@ -198,7 +205,7 @@ def tutorial11_pipelines(): # 2) You can provide `debug` as a parameter when running your pipeline result = p_classifier.run(query="Who is the father of Arya Stark?", params={"ESRetriever": {"debug": True}}) - # 3) You can provide the `debug` paramter to all nodes in your pipeline + # 3) You can provide the `debug` parameter to all nodes in your pipeline result = p_classifier.run(query="Who is the father of Arya Stark?", params={"debug": True}) pprint(result["_debug"])