diff --git a/docs/_src/tutorials/tutorials/1.md b/docs/_src/tutorials/tutorials/1.md index 0acbfb90a..8d7cc1de4 100644 --- a/docs/_src/tutorials/tutorials/1.md +++ b/docs/_src/tutorials/tutorials/1.md @@ -44,13 +44,11 @@ Make sure you enable the GPU runtime to experience decent speed in this tutorial # Install the latest master of Haystack !pip install git+https://github.com/deepset-ai/haystack.git !pip install urllib3==1.25.4 -!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html ``` ```python -from haystack import Finder from haystack.preprocessor.cleaning import clean_wiki_text from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http from haystack.reader.farm import FARMReader @@ -74,19 +72,19 @@ You can start Elasticsearch on your local machine instance using Docker. If Dock ```python # Recommended: Start Elasticsearch using Docker -#! docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2 +#! docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2 ``` ```python # In Colab / No Docker environments: Start Elasticsearch from source -! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q -! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz -! chown -R daemon:daemon elasticsearch-7.6.2 +! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q +! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz +! chown -R daemon:daemon elasticsearch-7.9.2 import os from subprocess import Popen, PIPE, STDOUT -es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'], +es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon ) @@ -206,13 +204,17 @@ reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=Tr # reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1) ``` -### Finder +### Pipeline -The Finder sticks together reader and retriever in a pipeline to answer our actual questions. +With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline. +Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases. +To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions. +You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd). ```python -finder = Finder(reader, retriever) +from haystack.pipeline import ExtractiveQAPipeline +pipe = ExtractiveQAPipeline(reader, retriever) ``` ## Voilà! Ask a question! @@ -221,13 +223,13 @@ finder = Finder(reader, retriever) ```python # You can configure how many candidates the reader and retriever shall return # The higher top_k_retriever, the better (but also the slower) your answers. 
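# Note on the change just below: pipe.run replaces finder.get_answers and
# takes the question via `query` instead of `question`; top_k_retriever and
# top_k_reader keep their meaning. A minimal sketch of inspecting the result,
# assuming the returned dict carries an "answers" list of dicts with "answer"
# and "context" keys (the shape the FAQPipeline touched in this change builds):
# for ans in prediction["answers"]:
#     print(ans["answer"], "|", ans["context"])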
-prediction = finder.get_answers(question="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5) +prediction = pipe.run(query="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5) ``` ```python -# prediction = finder.get_answers(question="Who created the Dothraki vocabulary?", top_k_reader=5) -# prediction = finder.get_answers(question="Who is the sister of Sansa?", top_k_reader=5) +# prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5) +# prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5) ``` diff --git a/docs/_src/tutorials/tutorials/2.md b/docs/_src/tutorials/tutorials/2.md index f17799108..5cbf95f5b 100644 --- a/docs/_src/tutorials/tutorials/2.md +++ b/docs/_src/tutorials/tutorials/2.md @@ -39,8 +39,6 @@ Make sure you enable the GPU runtime to experience decent speed in this tutorial # Install the latest master of Haystack !pip install git+https://github.com/deepset-ai/haystack.git !pip install urllib3==1.25.4 -!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html - ``` diff --git a/docs/_src/tutorials/tutorials/3.md b/docs/_src/tutorials/tutorials/3.md index 50dbc2d24..3273a075e 100644 --- a/docs/_src/tutorials/tutorials/3.md +++ b/docs/_src/tutorials/tutorials/3.md @@ -39,8 +39,6 @@ Make sure you enable the GPU runtime to experience decent speed in this tutorial # Install the latest master of Haystack !pip install git+https://github.com/deepset-ai/haystack.git !pip install urllib3==1.25.4 -!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html - ``` @@ -153,13 +151,17 @@ reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=Tr # reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1) ``` -### Finder +### Pipeline -The Finder sticks together reader and retriever in a pipeline to answer our actual questions. +With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline. +Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases. +To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions. +You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd). ```python -finder = Finder(reader, retriever) +from haystack.pipeline import ExtractiveQAPipeline +pipe = ExtractiveQAPipeline(reader, retriever) ``` ## Voilà! Ask a question! @@ -167,14 +169,14 @@ finder = Finder(reader, retriever) ```python # You can configure how many candidates the reader and retriever shall return -# The higher top_k_retriever, the better (but also the slower) your answers. -prediction = finder.get_answers(question="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5) +# The higher top_k_retriever, the better (but also the slower) your answers. 
+prediction = pipe.run(query="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5) ``` ```python -# prediction = finder.get_answers(question="Who created the Dothraki vocabulary?", top_k_reader=5) -# prediction = finder.get_answers(question="Who is the sister of Sansa?", top_k_reader=5) +# prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5) +# prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5) ``` diff --git a/docs/_src/tutorials/tutorials/4.md b/docs/_src/tutorials/tutorials/4.md index e8a782917..470f2df82 100644 --- a/docs/_src/tutorials/tutorials/4.md +++ b/docs/_src/tutorials/tutorials/4.md @@ -47,8 +47,6 @@ Make sure you enable the GPU runtime to experience decent speed in this tutorial # Install the latest master of Haystack !pip install git+https://github.com/deepset-ai/haystack.git !pip install urllib3==1.25.4 -!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html - ``` @@ -136,7 +134,7 @@ print(df.head()) # Get embeddings for our questions from the FAQs questions = list(df["question"].values) df["question_emb"] = retriever.embed_queries(texts=questions) -df = df.rename(columns={"answer": "text"}) +df = df.rename(columns={"question": "text"}) # Convert Dataframe to list of dicts and index them in our DocumentStore docs_to_index = df.to_dict(orient="records") @@ -144,11 +142,18 @@ document_store.write_documents(docs_to_index) ``` ### Ask questions -Initialize a Finder (this time without a reader) and ask questions +Initialize a Pipeline (this time without a reader) and ask questions ```python -finder = Finder(reader=None, retriever=retriever) -prediction = finder.get_answers_via_similar_questions(question="How is the virus spreading?", top_k_retriever=10) -print_answers(prediction, details="all") +from haystack.pipeline import FAQPipeline +pipe = FAQPipeline(retriever=retriever) +``` + + +```python +prediction = pipe.run(query="How is the virus spreading?", top_k_retriever=10) +print_answers(prediction, details="all") + + ``` diff --git a/docs/_src/tutorials/tutorials/5.md b/docs/_src/tutorials/tutorials/5.md index d1bb11e31..61d7a5853 100644 --- a/docs/_src/tutorials/tutorials/5.md +++ b/docs/_src/tutorials/tutorials/5.md @@ -38,8 +38,6 @@ You can start Elasticsearch on your local machine instance using Docker. 
If Dock # Install the latest master of Haystack !pip install git+https://github.com/deepset-ai/haystack.git !pip install urllib3==1.25.4 -!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html - ``` @@ -177,7 +175,6 @@ print("Reader F1-Score:", reader_eval_results["f1"]) ```python # Evaluate combination of Reader and Retriever through Finder -# Evaluate combination of Reader and Retriever through Finder finder_eval_results = finder.eval(top_k_retriever=1, top_k_reader=10, label_index=label_index, doc_index=doc_index) finder.print_eval_results(finder_eval_results) ``` diff --git a/docs/_src/tutorials/tutorials/6.md b/docs/_src/tutorials/tutorials/6.md index 73b373efb..0c298e80c 100644 --- a/docs/_src/tutorials/tutorials/6.md +++ b/docs/_src/tutorials/tutorials/6.md @@ -80,8 +80,6 @@ Make sure you enable the GPU runtime to experience decent speed in this tutorial # Install the latest master of Haystack !pip install git+https://github.com/deepset-ai/haystack.git !pip install urllib3==1.25.4 -!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html - ``` @@ -179,33 +177,31 @@ Here we use a FARMReader with the *deepset/roberta-base-squad2* model (see: http reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True) ``` -#### Finder +### Pipeline -The Finder sticks together reader and retriever in a pipeline to answer our actual questions. +With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline. +Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases. +To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions. +You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd). ```python -finder = Finder(reader, retriever) +from haystack.pipeline import ExtractiveQAPipeline +pipe = ExtractiveQAPipeline(reader, retriever) ``` -### Voilà! Ask a question! +## Voilà! Ask a question! ```python # You can configure how many candidates the reader and retriever shall return -# The higher top_k_retriever, the better (but also the slower) your answers. -prediction = finder.get_answers(question="Who created the Dothraki vocabulary?", top_k_retriever=10, top_k_reader=5) - -#prediction = finder.get_answers(question="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5) -#prediction = finder.get_answers(question="Who is the sister of Sansa?", top_k_retriever=10, top_k_reader=5) +# The higher top_k_retriever, the better (but also the slower) your answers. 
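# Worth noting at this step: the predefined ExtractiveQAPipeline is reused
# here unchanged even though the retriever is now a DensePassageRetriever
# rather than an Elasticsearch one. The pipeline only wires nodes together,
# so any retriever/reader pair exposing the same interface can be swapped in
# without touching the rest of the graph.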
+prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_retriever=10, top_k_reader=5) ``` ```python print_answers(prediction, details="minimal") -``` -```python - ``` diff --git a/docs/_src/tutorials/tutorials/7.md b/docs/_src/tutorials/tutorials/7.md index 992805010..c3f97854f 100644 --- a/docs/_src/tutorials/tutorials/7.md +++ b/docs/_src/tutorials/tutorials/7.md @@ -37,8 +37,6 @@ Here are the packages and imports that we'll need: ```python !pip install git+https://github.com/deepset-ai/haystack.git !pip install urllib3==1.25.4 -!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html - ``` @@ -188,3 +186,14 @@ for question in QUESTIONS: answers = predicted_result["answers"] print(f'Generated answer is \'{answers[0]["answer"]}\' for the question = \'{question}\'') ``` + + +```python +# Or alternatively use the Pipeline class +from haystack.pipeline import GenerativeQAPipeline + +pipe = GenerativeQAPipeline(generator=generator, retriever=retriever) +for question in QUESTIONS: + res = pipe.run(query=question, top_k_generator=1, top_k_retriever=5) + print(res) +``` diff --git a/docs/_src/tutorials/tutorials/8.md b/docs/_src/tutorials/tutorials/8.md index 2bafefe78..b89105edf 100644 --- a/docs/_src/tutorials/tutorials/8.md +++ b/docs/_src/tutorials/tutorials/8.md @@ -36,9 +36,8 @@ This tutorial will show you all the tools that Haystack provides to help you cas # Install the latest master of Haystack !pip install git+https://github.com/deepset-ai/haystack.git -!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html -!wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.02.tar.gz -!tar -xvf xpdf-tools-linux-4.02.tar.gz && sudo cp xpdf-tools-linux-4.02/bin64/pdftotext /usr/local/bin +!wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.03.tar.gz +!tar -xvf xpdf-tools-linux-4.03.tar.gz && sudo cp xpdf-tools-linux-4.03/bin64/pdftotext /usr/local/bin ``` diff --git a/docs/_src/tutorials/tutorials/9.md b/docs/_src/tutorials/tutorials/9.md index fe48c528c..c8e09594a 100644 --- a/docs/_src/tutorials/tutorials/9.md +++ b/docs/_src/tutorials/tutorials/9.md @@ -21,7 +21,6 @@ This tutorial will guide you through the steps required to create a retriever th # Install the latest master of Haystack !pip install git+https://github.com/deepset-ai/haystack.git -!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html ``` diff --git a/haystack/pipeline.py b/haystack/pipeline.py index d2ce30e65..f05eb7a6c 100644 --- a/haystack/pipeline.py +++ b/haystack/pipeline.py @@ -434,7 +434,7 @@ class FAQPipeline(BaseStandardPipeline): results: Dict = {"query": query, "answers": []} for doc in documents: - # TODO proper calibratation of pseudo probabilities + # TODO proper calibration of pseudo probabilities cur_answer = { "query": doc.text, "answer": doc.meta["answer"], @@ -448,7 +448,6 @@ class FAQPipeline(BaseStandardPipeline): } results["answers"].append(cur_answer) - return results diff --git a/haystack/retriever/squad_to_dpr.py b/haystack/retriever/squad_to_dpr.py index e4bd39138..b298320df 100644 --- a/haystack/retriever/squad_to_dpr.py +++ b/haystack/retriever/squad_to_dpr.py @@ -98,7 +98,7 @@ class HaystackDocumentStore: if not es.ping(): logging.info("Starting Elasticsearch ...") status = subprocess.run( - ['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2'], 
shell=True + ['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'], shell=True ) if status.returncode: raise Exception( diff --git a/run_docker_gpu.sh b/run_docker_gpu.sh index cbfc98291..1c2523a82 100755 --- a/run_docker_gpu.sh +++ b/run_docker_gpu.sh @@ -6,7 +6,7 @@ # # To use GPU with Docker, ensure nvidia-docker(https://github.com/NVIDIA/nvidia-docker) is installed. -docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.1 +docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2 # alternative: for a demo you can also use this elasticsearch image with already indexed GoT articles #docker run -d -p 9200:9200 -e "discovery.type=single-node" deepset/elasticsearch-game-of-thrones diff --git a/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb b/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb index 952bb1a53..b1594d2d5 100644 --- a/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb +++ b/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb @@ -60,8 +60,7 @@ "\n", "# Install the latest master of Haystack\n", "!pip install git+https://github.com/deepset-ai/haystack.git\n", - "!pip install urllib3==1.25.4\n", - "!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html\n" + "!pip install urllib3==1.25.4\n" ] }, { @@ -70,7 +69,6 @@ "metadata": {}, "outputs": [], "source": [ - "from haystack import Finder\n", "from haystack.preprocessor.cleaning import clean_wiki_text\n", "from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http\n", "from haystack.reader.farm import FARMReader\n", @@ -111,7 +109,7 @@ ], "source": [ "# Recommended: Start Elasticsearch using Docker\n", - "#! docker run -d -p 9200:9200 -e \"discovery.type=single-node\" elasticsearch:7.6.2" + "#! docker run -d -p 9200:9200 -e \"discovery.type=single-node\" elasticsearch:7.9.2" ] }, { @@ -121,13 +119,13 @@ "outputs": [], "source": [ "# In Colab / No Docker environments: Start Elasticsearch from source\n", - "! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q\n", - "! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz\n", - "! chown -R daemon:daemon elasticsearch-7.6.2\n", + "! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q\n", + "! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz\n", + "! chown -R daemon:daemon elasticsearch-7.9.2\n", "\n", "import os\n", "from subprocess import Popen, PIPE, STDOUT\n", - "es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],\n", + "es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],\n", " stdout=PIPE, stderr=STDOUT,\n", " preexec_fn=lambda: os.setuid(1) # as daemon\n", " )\n", @@ -359,9 +357,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Finder\n", + "### Pipeline\n", "\n", - "The Finder sticks together reader and retriever in a pipeline to answer our actual questions. " + "With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline.\n", + "Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.\n", + "To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions.\n", + "You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd)." 
] }, { @@ -374,7 +375,8 @@ }, "outputs": [], "source": [ - "finder = Finder(reader, retriever)" + "from haystack.pipeline import ExtractiveQAPipeline\n", + "pipe = ExtractiveQAPipeline(reader, retriever)" ] }, { @@ -406,7 +408,7 @@ "source": [ "# You can configure how many candidates the reader and retriever shall return\n", "# The higher top_k_retriever, the better (but also the slower) your answers. \n", - "prediction = finder.get_answers(question=\"Who is the father of Arya Stark?\", top_k_retriever=10, top_k_reader=5)" + "prediction = pipe.run(query=\"Who is the father of Arya Stark?\", top_k_retriever=10, top_k_reader=5)" ] }, { @@ -415,8 +417,8 @@ "metadata": {}, "outputs": [], "source": [ - "# prediction = finder.get_answers(question=\"Who created the Dothraki vocabulary?\", top_k_reader=5)\n", - "# prediction = finder.get_answers(question=\"Who is the sister of Sansa?\", top_k_reader=5)" + "# prediction = pipe.run(query=\"Who created the Dothraki vocabulary?\", top_k_reader=5)\n", + "# prediction = pipe.run(query=\"Who is the sister of Sansa?\", top_k_reader=5)" ] }, { diff --git a/tutorials/Tutorial1_Basic_QA_Pipeline.py b/tutorials/Tutorial1_Basic_QA_Pipeline.py index 8ae8f9f9c..15c614177 100755 --- a/tutorials/Tutorial1_Basic_QA_Pipeline.py +++ b/tutorials/Tutorial1_Basic_QA_Pipeline.py @@ -48,7 +48,7 @@ def tutorial1_basic_qa_pipeline(): if LAUNCH_ELASTICSEARCH: logging.info("Starting Elasticsearch ...") status = subprocess.run( - ['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2'], shell=True + ['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'], shell=True ) if status.returncode: raise Exception("Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance" @@ -138,20 +138,20 @@ def tutorial1_basic_qa_pipeline(): # reader = TransformersReader( # model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1) - # ### Finder - # - # The Finder sticks together reader and retriever in a pipeline to answer our actual questions. + # ### Pipeline + # + # With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline. + # Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases. + # To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions. + # You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd). + from haystack.pipeline import ExtractiveQAPipeline + pipe = ExtractiveQAPipeline(reader, retriever) + + ## Voilà! Ask a question! + prediction = pipe.run(query="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5) - finder = Finder(reader, retriever) - - # ## Voilà! Ask a question! - # You can configure how many candidates the reader and retriever shall return - # The higher top_k_retriever, the better (but also the slower) your answers. 
- prediction = finder.get_answers(question="Who is the father of Sansa Stark?", top_k_retriever=10, top_k_reader=5) - - - # prediction = finder.get_answers(question="Who created the Dothraki vocabulary?", top_k_reader=5) - # prediction = finder.get_answers(question="Who is the sister of Sansa?", top_k_reader=5) + # prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5) + # prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5) print_answers(prediction, details="minimal") diff --git a/tutorials/Tutorial2_Finetune_a_model_on_your_data.ipynb b/tutorials/Tutorial2_Finetune_a_model_on_your_data.ipynb index 0182cfe36..50d154a17 100644 --- a/tutorials/Tutorial2_Finetune_a_model_on_your_data.ipynb +++ b/tutorials/Tutorial2_Finetune_a_model_on_your_data.ipynb @@ -56,8 +56,7 @@ "\n", "# Install the latest master of Haystack\n", "!pip install git+https://github.com/deepset-ai/haystack.git\n", - "!pip install urllib3==1.25.4\n", - "!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html\n" + "!pip install urllib3==1.25.4" ] }, { diff --git a/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.ipynb b/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.ipynb index 7843982c8..f1fbbce6b 100644 --- a/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.ipynb +++ b/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.ipynb @@ -56,8 +56,7 @@ "\n", "# Install the latest master of Haystack\n", "!pip install git+https://github.com/deepset-ai/haystack.git\n", - "!pip install urllib3==1.25.4\n", - "!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html\n" + "!pip install urllib3==1.25.4" ] }, { @@ -278,9 +277,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Finder\n", + "### Pipeline\n", "\n", - "The Finder sticks together reader and retriever in a pipeline to answer our actual questions. " + "With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline.\n", + "Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.\n", + "To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions.\n", + "You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd)." ] }, { @@ -293,7 +295,8 @@ }, "outputs": [], "source": [ - "finder = Finder(reader, retriever)" + "from haystack.pipeline import ExtractiveQAPipeline\n", + "pipe = ExtractiveQAPipeline(reader, retriever)" ] }, { @@ -340,8 +343,8 @@ ], "source": [ "# You can configure how many candidates the reader and retriever shall return\n", - "# The higher top_k_retriever, the better (but also the slower) your answers. 
\n", - "prediction = finder.get_answers(question=\"Who is the father of Arya Stark?\", top_k_retriever=10, top_k_reader=5)" + "# The higher top_k_retriever, the better (but also the slower) your answers.\n", + "prediction = pipe.run(query=\"Who is the father of Arya Stark?\", top_k_retriever=10, top_k_reader=5)" ] }, { @@ -350,8 +353,8 @@ "metadata": {}, "outputs": [], "source": [ - "# prediction = finder.get_answers(question=\"Who created the Dothraki vocabulary?\", top_k_reader=5)\n", - "# prediction = finder.get_answers(question=\"Who is the sister of Sansa?\", top_k_reader=5)" + "# prediction = pipe.run(query=\"Who created the Dothraki vocabulary?\", top_k_reader=5)\n", + "# prediction = pipe.run(query=\"Who is the sister of Sansa?\", top_k_reader=5)" ] }, { diff --git a/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.py b/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.py index f5393edf6..51c1d7a62 100644 --- a/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.py +++ b/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.py @@ -91,18 +91,20 @@ def tutorial3_basic_qa_pipeline_without_elasticsearch(): # Alternative: # reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1) - # ### Finder + # ### Pipeline # - # The Finder sticks together reader and retriever in a pipeline to answer our actual questions. - finder = Finder(reader, retriever) + # With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline. + # Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases. + # To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions. + # You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd). + from haystack.pipeline import ExtractiveQAPipeline + pipe = ExtractiveQAPipeline(reader, retriever) - # ## Voilà! Ask a question! + ## Voilà! Ask a question! + prediction = pipe.run(query="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5) - # You can configure how many candidates the reader and retriever shall return - # The higher top_k_retriever, the better (but also the slower) your answers. 
- prediction = finder.get_answers(question="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5) - # prediction = finder.get_answers(question="Who created the Dothraki vocabulary?", top_k_reader=5) - # prediction = finder.get_answers(question="Who is the sister of Sansa?", top_k_reader=5) + # prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5) + # prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5) print_answers(prediction, details="minimal") diff --git a/tutorials/Tutorial4_FAQ_style_QA.ipynb b/tutorials/Tutorial4_FAQ_style_QA.ipynb index 172fd49d9..bd5acb6a3 100644 --- a/tutorials/Tutorial4_FAQ_style_QA.ipynb +++ b/tutorials/Tutorial4_FAQ_style_QA.ipynb @@ -64,8 +64,7 @@ "\n", "# Install the latest master of Haystack\n", "!pip install git+https://github.com/deepset-ai/haystack.git\n", - "!pip install urllib3==1.25.4\n", - "!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html\n" + "!pip install urllib3==1.25.4" ] }, { @@ -221,7 +220,7 @@ "# Get embeddings for our questions from the FAQs\n", "questions = list(df[\"question\"].values)\n", "df[\"question_emb\"] = retriever.embed_queries(texts=questions)\n", - "df = df.rename(columns={\"answer\": \"text\"})\n", + "df = df.rename(columns={\"question\": \"text\"})\n", "\n", "# Convert Dataframe to list of dicts and index them in our DocumentStore\n", "docs_to_index = df.to_dict(orient=\"records\")\n", @@ -238,7 +237,7 @@ "cell_type": "markdown", "source": [ "### Ask questions\n", - "Initialize a Finder (this time without a reader) and ask questions" + "Initialize a Pipeline (this time without a reader) and ask questions" ], "metadata": { "collapsed": false @@ -249,9 +248,8 @@ "execution_count": null, "outputs": [], "source": [ - "finder = Finder(reader=None, retriever=retriever)\n", - "prediction = finder.get_answers_via_similar_questions(question=\"How is the virus spreading?\", top_k_retriever=10)\n", - "print_answers(prediction, details=\"all\")" + "from haystack.pipeline import FAQPipeline\n", + "pipe = FAQPipeline(retriever=retriever)" ], "metadata": { "collapsed": false, @@ -259,6 +257,22 @@ "name": "#%%\n" } } + }, + { + "cell_type": "code", + "source": [ + "prediction = pipe.run(query=\"How is the virus spreading?\", top_k_retriever=10)\n", + "print_answers(prediction, details=\"all\")\n", + "\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "execution_count": null, + "outputs": [] } ], "metadata": { diff --git a/tutorials/Tutorial4_FAQ_style_QA.py b/tutorials/Tutorial4_FAQ_style_QA.py index 87bebdb81..0cf3c9a9b 100755 --- a/tutorials/Tutorial4_FAQ_style_QA.py +++ b/tutorials/Tutorial4_FAQ_style_QA.py @@ -24,7 +24,7 @@ def tutorial4_faq_style_qa(): # - Generalizability: We can only answer questions that are similar to existing ones in FAQ # # In some use cases, a combination of extractive QA and FAQ-style can also be an interesting option. - LAUNCH_ELASTICSEARCH=True + LAUNCH_ELASTICSEARCH=False if LAUNCH_ELASTICSEARCH: logging.info("Starting Elasticsearch ...") @@ -34,7 +34,7 @@ def tutorial4_faq_style_qa(): if status.returncode: raise Exception("Failed to launch Elasticsearch. 
If you want to connect to an existing Elasticsearch instance" "then set LAUNCH_ELASTICSEARCH in the script to False.") - time.sleep(15) + time.sleep(30) ### Init the DocumentStore # In contrast to Tutorial 1 (extractive QA), we: @@ -71,16 +71,18 @@ def tutorial4_faq_style_qa(): # Get embeddings for our questions from the FAQs questions = list(df["question"].values) df["question_emb"] = retriever.embed_queries(texts=questions) - df = df.rename(columns={"answer": "text"}) + df = df.rename(columns={"question": "text"}) # Convert Dataframe to list of dicts and index them in our DocumentStore docs_to_index = df.to_dict(orient="records") document_store.write_documents(docs_to_index) + # Initialize a Pipeline (this time without a reader) and ask questions - # Init reader & and use Finder to get answer (same as in Tutorial 1) - finder = Finder(reader=None, retriever=retriever) - prediction = finder.get_answers_via_similar_questions(question="How is the virus spreading?", top_k_retriever=10) + from haystack.pipeline import FAQPipeline + pipe = FAQPipeline(retriever=retriever) + + prediction = pipe.run(query="How is the virus spreading?", top_k_retriever=10) print_answers(prediction, details="all") diff --git a/tutorials/Tutorial5_Evaluation.ipynb b/tutorials/Tutorial5_Evaluation.ipynb index 184c7dc7f..b7b030919 100644 --- a/tutorials/Tutorial5_Evaluation.ipynb +++ b/tutorials/Tutorial5_Evaluation.ipynb @@ -77,8 +77,7 @@ "\n", "# Install the latest master of Haystack\n", "!pip install git+https://github.com/deepset-ai/haystack.git\n", - "!pip install urllib3==1.25.4\n", - "!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html\n" + "!pip install urllib3==1.25.4" ] }, { @@ -469,7 +468,6 @@ }, "outputs": [], "source": [ - "# Evaluate combination of Reader and Retriever through Finder\n", "# Evaluate combination of Reader and Retriever through Finder\n", "finder_eval_results = finder.eval(top_k_retriever=1, top_k_reader=10, label_index=label_index, doc_index=doc_index)\n", "finder.print_eval_results(finder_eval_results)" diff --git a/tutorials/Tutorial6_Better_Retrieval_via_DPR.ipynb b/tutorials/Tutorial6_Better_Retrieval_via_DPR.ipynb index d712f4df2..a1945905a 100644 --- a/tutorials/Tutorial6_Better_Retrieval_via_DPR.ipynb +++ b/tutorials/Tutorial6_Better_Retrieval_via_DPR.ipynb @@ -286,8 +286,7 @@ "\n", "# Install the latest master of Haystack\n", "!pip install git+https://github.com/deepset-ai/haystack.git\n", - "!pip install urllib3==1.25.4\n", - "!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html\n" + "!pip install urllib3==1.25.4" ] }, { @@ -647,9 +646,12 @@ "id": "unhLD18yA6OF" }, "source": [ - "#### Finder\n", + "### Pipeline\n", "\n", - "The Finder sticks together reader and retriever in a pipeline to answer our actual questions. " + "With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline.\n", + "Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.\n", + "To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions.\n", + "You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd)." 
] }, { @@ -662,7 +664,8 @@ }, "outputs": [], "source": [ - "finder = Finder(reader, retriever)" + "from haystack.pipeline import ExtractiveQAPipeline\n", + "pipe = ExtractiveQAPipeline(reader, retriever)" ] }, { @@ -672,7 +675,7 @@ "id": "bXlBBxKXA6OL" }, "source": [ - "### Voilà! Ask a question!" + "## Voilà! Ask a question!" ] }, { @@ -712,65 +715,8 @@ ], "source": [ "# You can configure how many candidates the reader and retriever shall return\n", - "# The higher top_k_retriever, the better (but also the slower) your answers. \n", - "prediction = finder.get_answers(question=\"Who created the Dothraki vocabulary?\", top_k_retriever=10, top_k_reader=5)\n", - "\n", - "#prediction = finder.get_answers(question=\"Who is the father of Arya Stark?\", top_k_retriever=10, top_k_reader=5)\n", - "#prediction = finder.get_answers(question=\"Who is the sister of Sansa?\", top_k_retriever=10, top_k_reader=5)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 561 - }, - "colab_type": "code", - "id": "N70FgfkwA6OQ", - "outputId": "9419c75d-181c-4ef6-cea8-b328a503f19a", - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ { 'answer': 'David J. Peterson',\n", - " 'context': '\\n'\n", - " '===Valyrian===\\n'\n", - " 'David J. Peterson, who created the Dothraki language for '\n", - " 'the first season of the show, was entrusted by the '\n", - " 'producers to design a new '},\n", - " { 'answer': 'David Peterson',\n", - " 'context': '\\n'\n", - " '==Phonology and romanization==\\n'\n", - " 'David Peterson has said, \"You know, most people probably '\n", - " \"don't really know what Arabic actually sounds like, so to \"\n", - " 'an '},\n", - " { 'answer': 'books',\n", - " 'context': 'ints. First, the language had to match the uses already '\n", - " 'put down in the books. Secondly, it had to be easily '\n", - " 'pronounceable or learnable by the actors'},\n", - " { 'answer': \"'''Nevakhi vekha ha maan: Rekke, m'aresakea norethi fitte.'\",\n", - " 'context': '\\n'\n", - " '==Sample==\\n'\n", - " \": '''Nevakhi vekha ha maan: Rekke, m'aresakea norethi \"\n", - " \"fitte.'''\\n\"\n", - " ': seat. exist. for there. with.coward. hair. short\\n'\n", - " \": ''There is a place f\"},\n", - " { 'answer': 'Tyrion',\n", - " 'context': 'ding, as well as his nephew Joffrey, the new king, as '\n", - " 'civil war begins. 
Tyrion struggles to strengthen and '\n", - " 'protect the city and family who hate him an'}]\n" - ] - } - ], - "source": [ - "print_answers(prediction, details=\"minimal\")" + "# The higher top_k_retriever, the better (but also the slower) your answers.\n", + "prediction = pipe.run(query=\"Who created the Dothraki vocabulary?\", top_k_retriever=10, top_k_reader=5)" ] }, { @@ -778,7 +724,10 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "print_answers(prediction, details=\"minimal\")\n", + "\n" + ] } ], "metadata": { diff --git a/tutorials/Tutorial6_Better_Retrieval_via_DPR.py b/tutorials/Tutorial6_Better_Retrieval_via_DPR.py index f538239bc..2d4c40333 100755 --- a/tutorials/Tutorial6_Better_Retrieval_via_DPR.py +++ b/tutorials/Tutorial6_Better_Retrieval_via_DPR.py @@ -52,18 +52,15 @@ def tutorial6_better_retrieval_via_dpr(): # Hugging Face's model hub (https://huggingface.co/models) reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True) - ### Finder - # The Finder sticks together reader and retriever in a pipeline to answer our actual questions. - finder = Finder(reader, retriever) + ### Pipeline + from haystack.pipeline import ExtractiveQAPipeline + pipe = ExtractiveQAPipeline(reader, retriever) - ### Voilà! Ask a question! - # You can configure how many candidates the reader and retriever shall return - # The higher top_k_retriever, the better (but also the slower) your answers. - prediction = finder.get_answers(question="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5) + ## Voilà! Ask a question! + prediction = pipe.run(query="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5) - - # prediction = finder.get_answers(question="Who created the Dothraki vocabulary?", top_k_reader=5) - # prediction = finder.get_answers(question="Who is the sister of Sansa?", top_k_reader=5) + # prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5) + # prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5) print_answers(prediction, details="minimal") diff --git a/tutorials/Tutorial7_RAG_Generator.ipynb b/tutorials/Tutorial7_RAG_Generator.ipynb index 499a25fde..84962f289 100644 --- a/tutorials/Tutorial7_RAG_Generator.ipynb +++ b/tutorials/Tutorial7_RAG_Generator.ipynb @@ -62,8 +62,7 @@ "outputs": [], "source": [ "!pip install git+https://github.com/deepset-ai/haystack.git\n", - "!pip install urllib3==1.25.4\n", - "!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html\n" + "!pip install urllib3==1.25.4" ], "metadata": { "collapsed": false, @@ -322,6 +321,26 @@ "name": "#%%\n" } } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# Or alternatively use the Pipeline class\n", + "from haystack.pipeline import GenerativeQAPipeline\n", + "\n", + "pipe = GenerativeQAPipeline(generator=generator, retriever=retriever)\n", + "for question in QUESTIONS:\n", + " res = pipe.run(query=question, top_k_generator=1, top_k_retriever=5)\n", + " print(res)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } } ], "metadata": { diff --git a/tutorials/Tutorial7_RAG_Generator.py b/tutorials/Tutorial7_RAG_Generator.py index f3e66e508..aec8be924 100644 --- a/tutorials/Tutorial7_RAG_Generator.py +++ b/tutorials/Tutorial7_RAG_Generator.py @@ -111,6 +111,12 @@ def tutorial7_rag_generator(): answers = predicted_result["answers"] print(f'Generated 
answer is \'{answers[0]["answer"]}\' for the question = \'{question}\'') + # Or alternatively use the Pipeline class + from haystack.pipeline import GenerativeQAPipeline + pipe = GenerativeQAPipeline(generator=generator, retriever=retriever) + for question in QUESTIONS: + res = pipe.run(query=question, top_k_generator=1, top_k_retriever=5) + print(res) if __name__ == "__main__": tutorial7_rag_generator() diff --git a/tutorials/Tutorial8_Preprocessing.ipynb b/tutorials/Tutorial8_Preprocessing.ipynb index 13024d01f..70d4533cb 100644 --- a/tutorials/Tutorial8_Preprocessing.ipynb +++ b/tutorials/Tutorial8_Preprocessing.ipynb @@ -59,9 +59,8 @@ "\n", "# Install the latest master of Haystack\n", "!pip install git+https://github.com/deepset-ai/haystack.git\n", - "!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html\n", - "!wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.02.tar.gz\n", - "!tar -xvf xpdf-tools-linux-4.02.tar.gz && sudo cp xpdf-tools-linux-4.02/bin64/pdftotext /usr/local/bin" + "!wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.03.tar.gz\n", + "!tar -xvf xpdf-tools-linux-4.03.tar.gz && sudo cp xpdf-tools-linux-4.03/bin64/pdftotext /usr/local/bin" ], "metadata": { "collapsed": false, @@ -497,15 +496,6 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" - }, - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "source": [], - "metadata": { - "collapsed": false - } - } } }, "nbformat": 4, diff --git a/tutorials/Tutorial9_DPR_training.ipynb b/tutorials/Tutorial9_DPR_training.ipynb index 3cd4cdb1f..2b630dc39 100644 --- a/tutorials/Tutorial9_DPR_training.ipynb +++ b/tutorials/Tutorial9_DPR_training.ipynb @@ -26,8 +26,7 @@ "#! pip install farm-haystack\n", "\n", "# Install the latest master of Haystack\n", - "!pip install git+https://github.com/deepset-ai/haystack.git\n", - "!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html" + "!pip install git+https://github.com/deepset-ai/haystack.git" ], "metadata": { "collapsed": false, @@ -403,15 +402,6 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" - }, - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "source": [], - "metadata": { - "collapsed": false - } - } } }, "nbformat": 4,
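The tutorials in this change repeatedly describe `Pipelines` as customizable DAGs. As a rough sketch of what that means beyond the predefined classes, the `ExtractiveQAPipeline` introduced above could be rebuilt by hand with the underlying `Pipeline` class. This is a sketch, not part of the commit: the `add_node` signature, the `"Query"` root, and the node names are assumptions based on the `haystack/pipeline.py` module touched in this diff, and `retriever`, `reader`, and `print_answers` are the objects built earlier in the tutorials.

```python
# A hedged sketch: hand-building the two-node DAG that ExtractiveQAPipeline
# wraps. Assumes Pipeline.add_node(component, name, inputs) with the graph
# rooted at a "Query" node, and that run() forwards the top_k_* parameters
# to the matching nodes.
from haystack.pipeline import Pipeline

pipe = Pipeline()
pipe.add_node(component=retriever, name="Retriever", inputs=["Query"])
pipe.add_node(component=reader, name="Reader", inputs=["Retriever"])

prediction = pipe.run(query="Who is the father of Arya Stark?",
                      top_k_retriever=10, top_k_reader=5)
print_answers(prediction, details="minimal")
```

Swapping the reader node for a generator would give the same shape as the `GenerativeQAPipeline` used in Tutorial 7.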