Improve tutorials' output (#1694)

* Modify __str__ and __repr__ for Document and Answer

* Rename QueryClassifier in Tutorial11

* Improve the output of tutorial1

* Make the output of Tutorial8 a bit less dense

* Add a print_questions util to print the output of question generating pipelines

* Replace custom printing with the new utility in Tutorial13

* Ensure all output is printed with minimal details in Tutorial14 and add some titles

* Minor change to print_answers

* Make tutorial3's output the same as tutorial1

* Add __repr__ to Answer and fix to_dict()

* Fix a bug in the Document and Answer's __str__ method

* Improve print_answers, print_documents and print_questions

* Using print_answers in Tutorial7 and fixing typo in the utils

* Remove duplicate line in Tutorial12

* Use print_answers in Tutorial4

* Add explanation of what the documents in the output of the basic QA pipeline are

* Move the fields constant into print_answers

* Normalize all 'minimal' to 'minimum' (they were mixed up)

* Improve the sample output to include all fields from Document and Answer

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Sara Zan 2021-11-09 15:09:26 +01:00 committed by GitHub
parent 861522b6b1
commit 91cafb49bb
27 changed files with 484 additions and 184 deletions
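
The user-facing thread running through these diffs is the reworked `print_answers` utility: the accepted `details` values are now `"minimum"`, `"medium"`, and `"all"` (the old `"minimal"` is gone), and an optional `max_text_len` trims long contexts. A minimal sketch of the new call, using a hand-built results dict as a stand-in for real pipeline output (the field values are invented, not taken from the diffs):

```python
from haystack.schema import Answer
from haystack.utils import print_answers

# Hand-built stand-in for the dict returned by ExtractiveQAPipeline.run()
results = {
    "query": "Who is the father of Arya Stark?",
    "answers": [Answer(answer="Eddard", score=0.99, context="...Eddard 'Ned' Stark is the father of Arya...")],
}

print_answers(results, details="minimum")  # answer + context only
print_answers(results, details="medium")   # adds the score
print_answers(results, details="all")      # full Answer objects
```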

View File

@@ -237,7 +237,35 @@ prediction = pipe.run(
 ```python
-print_answers(prediction, details="minimal")
+# Now you can either print the object directly...
+from pprint import pprint
+
+pprint(prediction)
+
+# Sample output:
+# {
+#     'answers': [ <Answer: answer='Eddard', type='extractive', score=0.9919578731060028, offsets_in_document=[{'start': 608, 'end': 615}], offsets_in_context=[{'start': 72, 'end': 79}], document_id='cc75f739897ecbf8c14657b13dda890e', meta={'name': '454_Music_of_Game_of_Thrones.txt'}, context='...'>,
+#                  <Answer: answer='Ned', type='extractive', score=0.9767240881919861, offsets_in_document=[{'start': 3687, 'end': 3801}], offsets_in_context=[{'start': 18, 'end': 132}], document_id='9acf17ec9083c4022f69eb4a37187080', meta={'name': '454_Music_of_Game_of_Thrones.txt'}, context='...'>,
+#                  ...
+#                ],
+#     'documents': [ <Document: content_type='text', score=0.8034909798951382, meta={'name': '332_Sansa_Stark.txt'}, embedding=None, id='d1f36ec7170e4c46cde65787fe125dfe', content='\n===\'\'A Game of Thrones\'\'===\nSansa Stark begins the novel by being betrothed to Crown ...'>,
+#                    <Document: content_type='text', score=0.8002150354529785, meta={'name': '191_Gendry.txt'}, embedding=None, id='dd4e070a22896afa81748d6510006d2', content='\n===Season 2===\nGendry travels North with Yoren and other Night's Watch recruits, including Arya ...'>,
+#                    ...
+#                  ],
+#     'no_ans_gap': 11.688868522644043,
+#     'node_id': 'Reader',
+#     'params': {'Reader': {'top_k': 5}, 'Retriever': {'top_k': 5}},
+#     'query': 'Who is the father of Arya Stark?',
+#     'root_node': 'Query'
+# }
+```
+
+```python
+# ...or use a util to simplify the output
+# Change `minimum` to `medium` or `all` to raise the level of detail
+print_answers(prediction, details="minimum")
 ```
 
 ## About us

View File

@@ -296,7 +296,7 @@ Below, we define a very naive `QueryClassifier` and show how to use it:
 ```python
-class QueryClassifier(BaseComponent):
+class CustomQueryClassifier(BaseComponent):
     outgoing_edges = 2
 
     def run(self, query: str):
@@ -307,7 +307,7 @@ class QueryClassifier(BaseComponent):
 
 # Here we build the pipeline
 p_classifier = Pipeline()
-p_classifier.add_node(component=QueryClassifier(), name="QueryClassifier", inputs=["Query"])
+p_classifier.add_node(component=CustomQueryClassifier(), name="QueryClassifier", inputs=["Query"])
 p_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_1"])
 p_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_2"])
 p_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"])
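
Since the hunks above elide the classifier's `run()` body, here is a runnable sketch of what such a naive component looks like. The routing heuristic (natural-language questions to `output_2`/DPR, keyword queries to `output_1`/Elasticsearch) is an assumption inferred from the pipeline wiring, not copied from the diff:

```python
from haystack.nodes import BaseComponent

class CustomQueryClassifier(BaseComponent):
    outgoing_edges = 2

    def run(self, query: str):
        # Assumed heuristic: queries phrased as questions go to the dense
        # retriever (output_2), bare keywords to Elasticsearch (output_1).
        if "?" in query:
            return {}, "output_2"
        return {}, "output_1"
```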

View File

@@ -42,7 +42,8 @@ from tqdm import tqdm
 from haystack.nodes import QuestionGenerator, ElasticsearchRetriever, FARMReader
 from haystack.document_stores import ElasticsearchDocumentStore
 from haystack.pipelines import QuestionGenerationPipeline, RetrieverQuestionGenerationPipeline, QuestionAnswerGenerationPipeline
-from haystack.utils import launch_es
+from haystack.utils import launch_es, print_questions
 ```
 
 Let's start an Elasticsearch instance with one of the options below:
@@ -98,9 +99,11 @@ which the document can answer.
 ```python
 question_generation_pipeline = QuestionGenerationPipeline(question_generator)
-for document in document_store:
+for idx, document in enumerate(document_store):
+    print(f"\n * Generating questions for document {idx}: {document.content[:100]}...\n")
     result = question_generation_pipeline.run(documents=[document])
-    pprint(result)
+    print_questions(result)
 ```
 
 ## Retriever Question Generation Pipeline
@@ -111,8 +114,10 @@ This pipeline takes a query as input. It retrieves relevant documents and then generates
 ```python
 retriever = ElasticsearchRetriever(document_store=document_store)
 rqg_pipeline = RetrieverQuestionGenerationPipeline(retriever, question_generator)
+
+print(f"\n * Generating questions for documents matching the query 'Arya Stark'\n")
 result = rqg_pipeline.run(query="Arya Stark")
-pprint(result)
+print_questions(result)
 ```
 
 ## Question Answer Generation Pipeline
@@ -124,9 +129,11 @@ a Reader model
 ```python
 reader = FARMReader("deepset/roberta-base-squad2")
 qag_pipeline = QuestionAnswerGenerationPipeline(question_generator, reader)
-for document in tqdm(document_store):
+for idx, document in enumerate(tqdm(document_store)):
+    print(f"\n * Generating questions and answers for document {idx}: {document.content[:100]}...\n")
     result = qag_pipeline.run(documents=[document])
-    pprint(result)
+    print_questions(result)
 ```
 
 ## About us

View File

@@ -161,14 +161,14 @@ res_1 = sklearn_keyword_classifier.run(
     query="Who is the father of Arya Stark?"
 )
 print("DPR Results" + "\n" + "="*15)
-print_answers(res_1)
+print_answers(res_1, details="minimum")
 
 # Run only the sparse retriever on a keyword based query
 res_2 = sklearn_keyword_classifier.run(
     query="arya stark father"
 )
 print("ES Results" + "\n" + "="*15)
-print_answers(res_2)
+print_answers(res_2, details="minimum")
 ```
@@ -180,14 +180,14 @@ res_3 = sklearn_keyword_classifier.run(
     query="which country was jon snow filmed ?"
 )
 print("DPR Results" + "\n" + "="*15)
-print_answers(res_3)
+print_answers(res_3, details="minimum")
 
 # Run only the sparse retriever on a keyword based query
 res_4 = sklearn_keyword_classifier.run(
     query="jon snow country"
 )
 print("ES Results" + "\n" + "="*15)
-print_answers(res_4)
+print_answers(res_4, details="minimum")
 ```
@@ -197,14 +197,14 @@ res_5 = sklearn_keyword_classifier.run(
     query="who are the younger brothers of arya stark ?"
 )
 print("DPR Results" + "\n" + "="*15)
-print_answers(res_5)
+print_answers(res_5, details="minimum")
 
 # Run only the sparse retriever on a keyword based query
 res_6 = sklearn_keyword_classifier.run(
     query="arya stark younger brothers"
 )
 print("ES Results" + "\n" + "="*15)
-print_answers(res_6)
+print_answers(res_6, details="minimum")
 ```
 
 ## Transformer Keyword vs Question/Statement Classifier
@@ -234,14 +234,14 @@ res_1 = transformer_keyword_classifier.run(
     query="Who is the father of Arya Stark?"
 )
 print("DPR Results" + "\n" + "="*15)
-print_answers(res_1)
+print_answers(res_1, details="minimum")
 
 # Run only the sparse retriever on a keyword based query
 res_2 = transformer_keyword_classifier.run(
     query="arya stark father"
 )
 print("ES Results" + "\n" + "="*15)
-print_answers(res_2)
+print_answers(res_2, details="minimum")
 ```
@@ -253,14 +253,14 @@ res_3 = transformer_keyword_classifier.run(
     query="which country was jon snow filmed ?"
 )
 print("DPR Results" + "\n" + "="*15)
-print_answers(res_3)
+print_answers(res_3, details="minimum")
 
 # Run only the sparse retriever on a keyword based query
 res_4 = transformer_keyword_classifier.run(
     query="jon snow country"
 )
 print("ES Results" + "\n" + "="*15)
-print_answers(res_4)
+print_answers(res_4, details="minimum")
 ```
@@ -270,14 +270,14 @@ res_5 = transformer_keyword_classifier.run(
     query="who are the younger brothers of arya stark ?"
 )
 print("DPR Results" + "\n" + "="*15)
-print_answers(res_5)
+print_answers(res_5, details="minimum")
 
 # Run only the sparse retriever on a keyword based query
 res_6 = transformer_keyword_classifier.run(
     query="arya stark younger brothers"
 )
 print("ES Results" + "\n" + "="*15)
-print_answers(res_6)
+print_answers(res_6, details="minimum")
 ```
 
 ## Question vs Statement Classifier
@@ -305,14 +305,14 @@ res_1 = transformer_question_classifier.run(
     query="Who is the father of Arya Stark?"
 )
 print("DPR Results" + "\n" + "="*15)
-print_answers(res_1)
+print_answers(res_1, details="minimum")
 
 # Show only DPR results
 res_2 = transformer_question_classifier.run(
     query="Arya Stark was the daughter of a Lord."
 )
 print("ES Results" + "\n" + "="*15)
-res_2
+print_answers(res_2, details="minimum")
 ```
 
 ## Standalone Query Classifier

View File

@@ -182,7 +182,34 @@ prediction = pipe.run(
 ```python
-print_answers(prediction, details="minimal")
+# Now you can either print the object directly...
+from pprint import pprint
+
+pprint(prediction)
+
+# Sample output:
+# {
+#     'answers': [ <Answer: answer='Eddard', type='extractive', score=0.9919578731060028, offsets_in_document=[{'start': 608, 'end': 615}], offsets_in_context=[{'start': 72, 'end': 79}], document_id='cc75f739897ecbf8c14657b13dda890e', meta={'name': '454_Music_of_Game_of_Thrones.txt'}, context='...'>,
+#                  <Answer: answer='Ned', type='extractive', score=0.9767240881919861, offsets_in_document=[{'start': 3687, 'end': 3801}], offsets_in_context=[{'start': 18, 'end': 132}], document_id='9acf17ec9083c4022f69eb4a37187080', meta={'name': '454_Music_of_Game_of_Thrones.txt'}, context='...'>,
+#                  ...
+#                ],
+#     'documents': [ <Document: content_type='text', score=0.8034909798951382, meta={'name': '332_Sansa_Stark.txt'}, embedding=None, id='d1f36ec7170e4c46cde65787fe125dfe', content='\n===\'\'A Game of Thrones\'\'===\nSansa Stark begins the novel by being betrothed to Crown ...'>,
+#                    <Document: content_type='text', score=0.8002150354529785, meta={'name': '191_Gendry.txt'}, embedding=None, id='dd4e070a22896afa81748d6510006d2', content='\n===Season 2===\nGendry travels North with Yoren and other Night's Watch recruits, including Arya ...'>,
+#                    ...
+#                  ],
+#     'no_ans_gap': 11.688868522644043,
+#     'node_id': 'Reader',
+#     'params': {'Reader': {'top_k': 5}, 'Retriever': {'top_k': 5}},
+#     'query': 'Who is the father of Arya Stark?',
+#     'root_node': 'Query'
+# }
+```
+
+```python
+# ...or use a util to simplify the output
+# Change `minimum` to `medium` or `all` to raise the level of detail
+print_answers(prediction, details="minimum")
 ```
 
 ## About us

View File

@@ -155,12 +155,10 @@ pipe = FAQPipeline(retriever=retriever)
 ```python
+from haystack.utils import print_answers
+
 prediction = pipe.run(query="How is the virus spreading?", params={"Retriever": {"top_k": 10}})
-for a in prediction["answers"]:
-    print(f"Answer: {a.answer}")
-    print(f"Question: {a.meta['query']}")
-    print(f"Score: {a.score}")
-    print("---------------------")
+print_answers(prediction, details="medium")
 ```
 
 ## About us

View File

@@ -193,11 +193,12 @@ for question in QUESTIONS:
 ```python
 # Or alternatively use the Pipeline class
 from haystack.pipelines import GenerativeQAPipeline
+from haystack.utils import print_answers
 
 pipe = GenerativeQAPipeline(generator=generator, retriever=retriever)
 for question in QUESTIONS:
     res = pipe.run(query=question, params={"Generator": {"top_k": 1}, "Retriever": {"top_k": 5}})
-    print(res)
+    print_answers(res, details="minimum")
 ```
 
 ## About us

View File

@@ -186,10 +186,13 @@ class Document:
                 getattr(other, 'id_hash_keys', None) == self.id_hash_keys)
 
     def __repr__(self):
-        return str(self.to_dict())
+        return f"<Document: {str(self.to_dict())}>"
 
     def __str__(self):
-        return f"content: {self.content[:100]} {'[...]' if len(self.content) > 100 else ''}"
+        # In some cases, self.content is None (therefore not subscriptable)
+        if not self.content:
+            return f"<Document: id={self.id}, content=None>"
+        return f"<Document: id={self.id}, content='{self.content[:100]} {'...' if len(self.content) > 100 else ''}'>"
 
     def __lt__(self, other):
         """ Enable sorting of Documents by score """
@@ -262,7 +265,13 @@ class Answer:
         return self.score < other.score
 
     def __str__(self):
-        return f"answer: {self.answer} \nscore: {self.score} \ncontext: {self.context}"
+        # self.context might be None (therefore not subscriptable)
+        if not self.context:
+            return f"<Answer: answer='{self.answer}', score={self.score}, context=None>"
+        return f"<Answer: answer='{self.answer}', score={self.score}, context='{self.context[:50]}{'...' if len(self.context) > 50 else ''}'>"
+
+    def __repr__(self):
+        return f"<Answer {asdict(self)}>"
 
     def to_dict(self):
         return asdict(self)
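
To illustrate the effect of the new string representations, a small sketch (the field values are invented; the expected output in the comments follows the `__str__`/`__repr__` bodies above):

```python
from haystack.schema import Answer, Document

doc = Document(content="Arya Stark is the third child of Eddard and Catelyn Stark. " * 3)
ans = Answer(answer="Eddard", score=0.97, context="the third child of Eddard and Catelyn Stark")

print(str(doc))   # <Document: id=..., content='Arya Stark is the third child of Eddard and Cately ...'>
print(str(ans))   # <Answer: answer='Eddard', score=0.97, context='the third child of Eddard and Catelyn Stark'>
print(repr(ans))  # <Answer {'answer': 'Eddard', 'type': 'extractive', 'score': 0.97, ...}>
```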

View File

@@ -16,6 +16,7 @@ from haystack.utils.doc_store import (
 from haystack.utils.export_utils import (
     print_answers,
     print_documents,
+    print_questions,
     export_answers_to_csv,
     convert_labels_to_squad,
 )

View File

@@ -1,12 +1,8 @@
 from typing import Dict, Any, List, Optional
-import io
-import re
-import time
 import json
 import pprint
 import logging
-import subprocess
 
 import pandas as pd
 from collections import defaultdict
@@ -16,58 +12,98 @@ from haystack.document_stores.sql import DocumentORM
 logger = logging.getLogger(__name__)
 
-def print_answers(results: dict, details: str = "all"):
+
+def print_answers(results: dict, details: str = "all", max_text_len: Optional[int] = None):
     """
-    Utilitiy function to print results of Haystack pipelines
+    Utility function to print results of Haystack pipelines
+
     :param results: Results from a pipeline
-    :param details: One of ["minimum", "medium", "all]. Defining the level of details to print.
+    :param details: One of "minimum", "medium", "all". Defines the level of detail to print.
+    :param max_text_len: Shorten lengthy text fields to the maximum allowed length. Set to
+                         None to not cut long text.
     :return: None
     """
-    # TODO: unify the output format of Generator and Reader so that this function doesn't have the try/except
-    # Or implement a class method like PredReader.print() and PredGenerator.print() that handles all this functionality.
-    # This default case is when the answers come from a Reader
-    try:
-        answers = results["answers"]
-        pp = pprint.PrettyPrinter(indent=4)
-        if details in ("minimal", "medium"):
-            if details == "minimal":
-                keys_to_keep = set(["answer", "context"])
-            elif details == "medium":
-                keys_to_keep = set(["answer", "context", "score"])
-
-            # filter the results
-            filtered_answers = []
-            for ans in answers:
-                filtered_answers.append({k: getattr(ans, k) for k in keys_to_keep})
-            pp.pprint(filtered_answers)
-        else:
-            pp.pprint(results)
-
-    # This fall back case is when the answers come from a Generator
-    except:
-        if details == "minimal":
-            print(f"Query: {results['query']}")
-            for a in results["answers"]:
-                print(f"Answer: {a['answer']}")
-        else:
-            pp.pprint(results)
-
-
-def print_documents(results: dict, max_text_len: Optional[int] = None, print_meta: bool = False):
-    print(f"Query: {results['query']}")
-    pp = pprint.PrettyPrinter(indent=4)
-    for d in results["documents"]:
-        print()
-        new_text = d.content[:max_text_len]
-        if len(new_text) != len(d.content):
-            new_text += "..."
-        results = {
-            "name": d.meta.get("name", None),
-            "content": new_text
-        }
-        if print_meta:
-            results["meta"] = d.meta
-        pp.pprint(results)
+    # Defines the fields to keep in the Answer for each detail level
+    fields_to_keep_by_level = {
+        "minimum": ["answer", "context"],
+        "medium": ["answer", "context", "score"]
+    }
+
+    if "answers" not in results.keys():
+        raise ValueError("The results object does not seem to come from a Reader: "
+                         f"it does not contain the 'answers' key, but only: {results.keys()}. "
+                         "Try print_documents or print_questions.")
+
+    if "query" in results.keys():
+        print(f"\nQuery: {results['query']}\nAnswers:")
+
+    answers = results["answers"]
+    pp = pprint.PrettyPrinter(indent=4)
+
+    # Filter the results by detail level
+    filtered_answers = []
+    if details in fields_to_keep_by_level.keys():
+        for ans in answers:
+            filtered_answers.append({k: getattr(ans, k) for k in fields_to_keep_by_level[details]})
+    elif details == "all":
+        filtered_answers = answers
+    else:
+        logger.warning(f"print_answers received details='{details}', which was not understood. "
+                       "Valid values are 'minimum', 'medium', and 'all'. Using 'all'.")
+        filtered_answers = answers
+
+    # Shorten long text fields
+    if max_text_len is not None:
+        for ans in filtered_answers:
+            if isinstance(ans, dict) and "context" in ans and len(ans["context"]) > max_text_len:
+                ans["context"] = ans["context"][:max_text_len] + "..."
+
+    pp.pprint(filtered_answers)
+
+
+def print_documents(results: dict, max_text_len: Optional[int] = None, print_name: bool = True, print_meta: bool = False):
+    """
+    Utility that prints a compressed representation of the documents returned by a pipeline.
+
+    :param max_text_len: Shorten the document's content to a maximum number of chars. If None, does not cut.
+    :param print_name: Whether to print the document's name (from the metadata) or not.
+    :param print_meta: Whether to print the document's metadata or not.
+    """
+    print(f"\nQuery: {results['query']}\n")
+    pp = pprint.PrettyPrinter(indent=4)
+
+    for doc in results["documents"]:
+        content = doc.content
+        if max_text_len:
+            content = doc.content[:max_text_len] + ("..." if len(doc.content) > max_text_len else "")
+        results = {"content": content}
+        if print_name:
+            results["name"] = doc.meta.get("name", None)
+        if print_meta:
+            results["meta"] = doc.meta
+        pp.pprint(results)
+        print()
+
+
+def print_questions(results: dict):
+    """
+    Utility to print the output of a question generating pipeline in a readable format.
+    """
+    if "generated_questions" in results.keys():
+        print("\nGenerated questions:")
+        for result in results["generated_questions"]:
+            for question in result["questions"]:
+                print(f" - {question}")
+
+    elif "results" in results.keys():
+        print("\nGenerated pairs:")
+        for pair in results["results"]:
+            print(f" - Q: {pair['query']}")
+            for answer in pair["answers"]:
+                print(f"      A: {answer.answer}")
+
+    else:
+        raise ValueError("This object does not seem to be the output "
+                         "of a question generating pipeline: it contains neither "
+                         f"'generated_questions' nor 'results', but only: {results.keys()}. "
+                         "Try `print_answers` or `print_documents`.")
 
 
 def export_answers_to_csv(agg_results: list, output_file):
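
A sketch of how the reworked utilities are meant to be called, with hand-built results dicts standing in for real pipeline output (the shapes are inferred from the code above; the content is invented):

```python
from haystack.schema import Document
from haystack.utils import print_documents, print_questions

doc_results = {
    "query": "Arya Stark father",
    "documents": [Document(content="Eddard Stark is the father of Arya Stark. " * 5)],
}
# Truncates each document's content to 50 chars and skips the name and metadata
print_documents(doc_results, max_text_len=50, print_name=False, print_meta=False)

# Stand-in for the output of a QuestionGenerationPipeline run
question_results = {"generated_questions": [{"questions": ["Who is the father of Arya Stark?"]}]}
print_questions(question_results)
```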

View File

@@ -547,7 +547,7 @@
    "cell_type": "code",
    "execution_count": null,
    "source": [
-    "class QueryClassifier(BaseComponent):\n",
+    "class CustomQueryClassifier(BaseComponent):\n",
     "    outgoing_edges = 2\n",
     "\n",
     "    def run(self, query: str):\n",
@@ -558,7 +558,7 @@
     "\n",
     "# Here we build the pipeline\n",
     "p_classifier = Pipeline()\n",
-    "p_classifier.add_node(component=QueryClassifier(), name=\"QueryClassifier\", inputs=[\"Query\"])\n",
+    "p_classifier.add_node(component=CustomQueryClassifier(), name=\"QueryClassifier\", inputs=[\"Query\"])\n",
     "p_classifier.add_node(component=es_retriever, name=\"ESRetriever\", inputs=[\"QueryClassifier.output_1\"])\n",
     "p_classifier.add_node(component=dpr_retriever, name=\"DPRRetriever\", inputs=[\"QueryClassifier.output_2\"])\n",
     "p_classifier.add_node(component=reader, name=\"QAReader\", inputs=[\"ESRetriever\", \"DPRRetriever\"])\n",

View File

@@ -2,7 +2,7 @@ from haystack.utils import clean_wiki_text, print_answers, print_documents, fetc
 from pprint import pprint
 from haystack import Pipeline
 from haystack.document_stores import ElasticsearchDocumentStore
-from haystack.nodes import ElasticsearchRetriever, DensePassageRetriever, FARMReader, RAGenerator, JoinDocuments
+from haystack.nodes import ElasticsearchRetriever, DensePassageRetriever, FARMReader, RAGenerator, BaseComponent, JoinDocuments
 from haystack.pipelines import ExtractiveQAPipeline, DocumentSearchPipeline, GenerativeQAPipeline
@@ -35,33 +35,44 @@ def tutorial11_pipelines():
     reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
 
-    ######################
-    # Prebuilt Pipelines #
-    ######################
-
-    # Extractive QA Pipeline
-    ########################
-
+    print()
+    print("######################")
+    print("# Prebuilt Pipelines #")
+    print("######################")
+
+    print()
+    print("# Extractive QA Pipeline")
+    print("########################")
+
+    query = "Who is the father of Arya Stark?"
     p_extractive_premade = ExtractiveQAPipeline(reader=reader, retriever=es_retriever)
     res = p_extractive_premade.run(
-        query="Who is the father of Arya Stark?",
+        query=query,
         params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}},
     )
-    print_answers(res, details="minimal")
-
-    # Document Search Pipeline
-    ##########################
-
+    print("\nQuery: ", query)
+    print("Answers:")
+    print_answers(res, details="minimum")
+
+    print()
+    print("# Document Search Pipeline")
+    print("##########################")
+
+    query = "Who is the father of Arya Stark?"
     p_retrieval = DocumentSearchPipeline(es_retriever)
     res = p_retrieval.run(
-        query="Who is the father of Arya Stark?",
+        query=query,
         params={"Retriever": {"top_k": 10}},
     )
+    print()
     print_documents(res, max_text_len=200)
 
-    # Generator Pipeline
-    ##########################
+    print()
+    print("# Generator Pipeline")
+    print("####################")
 
     # We set this to True so that the document store returns document embeddings
     # with each document, this is needed by the Generator
@@ -73,11 +84,12 @@ def tutorial11_pipelines():
     # Generative QA
     p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=dpr_retriever)
     res = p_generator.run(
-        query="Who is the father of Arya Stark?",
+        query=query,
         params={"Retriever": {"top_k": 10}},
     )
-    print_answers(res, details="minimal")
+    print()
+    print_answers(res, details="minimum")
 
     # We are setting this to False so that in later pipelines,
     # we get a cleaner printout
@@ -91,12 +103,14 @@ def tutorial11_pipelines():
     p_retrieval.draw("pipeline_retrieval.png")
     p_generator.draw("pipeline_generator.png")
 
-    ####################
-    # Custom Pipelines #
-    ####################
-
-    # Extractive QA Pipeline
-    ########################
+    print()
+    print("####################")
+    print("# Custom Pipelines #")
+    print("####################")
+
+    print()
+    print("# Extractive QA Pipeline")
+    print("########################")
 
     # Custom built extractive QA pipeline
     p_extractive = Pipeline()
@@ -104,15 +118,20 @@ def tutorial11_pipelines():
     p_extractive.add_node(component=reader, name="Reader", inputs=["Retriever"])
 
     # Now we can run it
+    query = "Who is the father of Arya Stark?"
     res = p_extractive.run(
-        query="Who is the father of Arya Stark?",
+        query=query,
         params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}},
     )
-    print_answers(res, details="minimal")
+    print("\nQuery: ", query)
+    print("Answers:")
+    print_answers(res, details="minimum")
+
     p_extractive.draw("pipeline_extractive.png")
 
-    # Ensembled Retriever Pipeline
-    ##############################
+    print()
+    print("# Ensembled Retriever Pipeline")
+    print("##############################")
 
     # Create ensembled pipeline
     p_ensemble = Pipeline()
@@ -123,22 +142,27 @@ def tutorial11_pipelines():
     p_ensemble.draw("pipeline_ensemble.png")
 
     # Run pipeline
+    query = "Who is the father of Arya Stark?"
     res = p_ensemble.run(
         query="Who is the father of Arya Stark?",
         params={"ESRetriever": {"top_k": 5}, "DPRRetriever": {"top_k": 5}},
     )
-    print_answers(res, details="minimal")
+    print("\nQuery: ", query)
+    print("Answers:")
+    print_answers(res, details="minimum")
 
-    # Query Classification Pipeline
-    ###############################
+    print()
+    print("# Query Classification Pipeline")
+    print("###############################")
 
     # Decision Nodes help you route your data so that only certain branches of your `Pipeline` are run.
     # Though this looks very similar to the ensembled pipeline shown above,
     # the key difference is that only one of the retrievers is run for each request.
     # By contrast both retrievers are always run in the ensembled approach.
-    class QueryClassifier():
+    class CustomQueryClassifier(BaseComponent):
         outgoing_edges = 2
 
         def run(self, query):
@@ -149,25 +173,32 @@ def tutorial11_pipelines():
     # Here we build the pipeline
     p_classifier = Pipeline()
-    p_classifier.add_node(component=QueryClassifier(), name="QueryClassifier", inputs=["Query"])
+    p_classifier.add_node(component=CustomQueryClassifier(), name="QueryClassifier", inputs=["Query"])
     p_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_1"])
     p_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_2"])
     p_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"])
     p_classifier.draw("pipeline_classifier.png")
 
     # Run only the dense retriever on the full sentence query
+    query = "Who is the father of Arya Stark?"
     res_1 = p_classifier.run(
-        query="Who is the father of Arya Stark?",
+        query=query,
     )
-    print("DPR Results" + "\n" + "="*15)
-    print_answers(res_1)
+    print()
+    print("\nQuery: ", query)
+    print(" * DPR Answers:")
+    print_answers(res_1, details="minimum")
 
     # Run only the sparse retriever on a keyword based query
+    query = "Arya Stark father"
     res_2 = p_classifier.run(
-        query="Arya Stark father",
+        query=query,
     )
-    print("ES Results" + "\n" + "="*15)
-    print_answers(res_2)
+    print()
+    print("\nQuery: ", query)
+    print(" * ES Answers:")
+    print_answers(res_2, details="minimum")
 
 
 if __name__ == "__main__":

View File

@@ -91,7 +91,6 @@ def tutorial12_lfqa():
     print(f"Query: {query_2}")
     print(f"Answer: {result_2['answers'][0]}")
     print()
-    pipe.run(query=query_2, params={"Retriever": {"top_k": 1}})
 
 
 if __name__ == "__main__":

View File

@@ -66,7 +66,7 @@
     "from haystack.nodes import QuestionGenerator, ElasticsearchRetriever, FARMReader\n",
     "from haystack.document_stores import ElasticsearchDocumentStore\n",
     "from haystack.pipelines import QuestionGenerationPipeline, RetrieverQuestionGenerationPipeline, QuestionAnswerGenerationPipeline\n",
-    "from haystack.utils import launch_es"
+    "from haystack.utils import launch_es, print_questions\n"
    ],
    "outputs": [],
    "metadata": {
@@ -188,9 +188,11 @@
    "execution_count": null,
    "source": [
     "question_generation_pipeline = QuestionGenerationPipeline(question_generator)\n",
-    "for document in document_store:\n",
+    "for idx, document in enumerate(document_store):\n",
+    "\n",
+    "    print(f\"\\n * Generating questions for document {idx}: {document.content[:100]}...\\n\")\n",
     "    result = question_generation_pipeline.run(documents=[document])\n",
-    "    pprint(result)"
+    "    print_questions(result)"
    ],
    "outputs": [],
    "metadata": {
@@ -220,8 +222,10 @@
    "source": [
     "retriever = ElasticsearchRetriever(document_store=document_store)\n",
     "rqg_pipeline = RetrieverQuestionGenerationPipeline(retriever, question_generator)\n",
+    "\n",
+    "print(f\"\\n * Generating questions for documents matching the query 'Arya Stark'\\n\")\n",
     "result = rqg_pipeline.run(query=\"Arya Stark\")\n",
-    "pprint(result)"
+    "print_questions(result)"
    ],
    "outputs": [],
    "metadata": {
@@ -252,9 +256,11 @@
    "source": [
     "reader = FARMReader(\"deepset/roberta-base-squad2\")\n",
     "qag_pipeline = QuestionAnswerGenerationPipeline(question_generator, reader)\n",
-    "for document in tqdm(document_store):\n",
+    "for idx, document in enumerate(tqdm(document_store)):\n",
+    "\n",
+    "    print(f\"\\n * Generating questions and answers for document {idx}: {document.content[:100]}...\\n\")\n",
     "    result = qag_pipeline.run(documents=[document])\n",
-    "    pprint(result)"
+    "    print_questions(result)"
    ],
    "outputs": [],
    "metadata": {

View File

@@ -3,7 +3,7 @@ from pprint import pprint
 from haystack.nodes import QuestionGenerator, ElasticsearchRetriever, FARMReader
 from haystack.document_stores import ElasticsearchDocumentStore
 from haystack.pipelines import QuestionGenerationPipeline, RetrieverQuestionGenerationPipeline, QuestionAnswerGenerationPipeline
-from haystack.utils import launch_es
+from haystack.utils import launch_es, print_questions
 
 """
 This is a bare bones tutorial showing what is possible with the QuestionGenerator Node which automatically generates
@@ -34,20 +34,31 @@ which the document can answer.
 """
 
 # QuestionGenerationPipeline
+print("\nQuestionGenerationPipeline")
+print("==========================")
+
 question_generation_pipeline = QuestionGenerationPipeline(question_generator)
-for document in document_store:
+for idx, document in enumerate(document_store):
+    print(f"\n * Generating questions for document {idx}: {document.content[:100]}...\n")
     result = question_generation_pipeline.run(documents=[document])
-    pprint(result)
+    print_questions(result)
 
 """
 This pipeline takes a query as input. It retrieves relevant documents and then generates questions based on these.
 """
 
 # RetrieverQuestionGenerationPipeline
+print("\nRetrieverQuestionGenerationPipeline")
+print("===================================")
+
 retriever = ElasticsearchRetriever(document_store=document_store)
 rqg_pipeline = RetrieverQuestionGenerationPipeline(retriever, question_generator)
+
+print(f"\n * Generating questions for documents matching the query 'Arya Stark'\n")
 result = rqg_pipeline.run(query="Arya Stark")
-pprint(result)
+print_questions(result)
 
 """
 This pipeline takes a document as input, generates questions on it, and attempts to answer these questions using
@@ -55,11 +66,17 @@ a Reader model
 """
 
 # QuestionAnswerGenerationPipeline
+print("\nQuestionAnswerGenerationPipeline")
+print("================================")
+
 reader = FARMReader("deepset/roberta-base-squad2")
 qag_pipeline = QuestionAnswerGenerationPipeline(question_generator, reader)
-for document in tqdm(document_store):
+for idx, document in enumerate(tqdm(document_store)):
+    print(f"\n * Generating questions and answers for document {idx}: {document.content[:100]}...\n")
     result = qag_pipeline.run(documents=[document])
-    pprint(result)
+    print_questions(result)
 
 # This Haystack script was made with love by deepset in Berlin, Germany
 # Haystack: https://github.com/deepset-ai/haystack

View File

@@ -1567,14 +1567,14 @@
     "    query=\"Who is the father of Arya Stark?\"\n",
     ")\n",
     "print(\"DPR Results\" + \"\\n\" + \"=\"*15)\n",
-    "print_answers(res_1)\n",
+    "print_answers(res_1, details=\"minimum\")\n",
     "\n",
     "# Run only the sparse retriever on a keyword based query\n",
     "res_2 = sklearn_keyword_classifier.run(\n",
     "    query=\"arya stark father\"\n",
     ")\n",
     "print(\"ES Results\" + \"\\n\" + \"=\"*15)\n",
-    "print_answers(res_2)\n"
+    "print_answers(res_2, details=\"minimum\")\n"
    ],
    "outputs": [],
    "metadata": {
@@ -1591,14 +1591,14 @@
     "    query=\"which country was jon snow filmed ?\"\n",
     ")\n",
     "print(\"DPR Results\" + \"\\n\" + \"=\"*15)\n",
-    "print_answers(res_3)\n",
+    "print_answers(res_3, details=\"minimum\")\n",
     "\n",
     "# Run only the sparse retriever on a keyword based query\n",
     "res_4 = sklearn_keyword_classifier.run(\n",
     "    query=\"jon snow country\"\n",
     ")\n",
     "print(\"ES Results\" + \"\\n\" + \"=\"*15)\n",
-    "print_answers(res_4)"
+    "print_answers(res_4, details=\"minimum\")"
    ],
    "outputs": [],
    "metadata": {
@@ -1614,14 +1614,14 @@
     "    query=\"who are the younger brothers of arya stark ?\"\n",
     ")\n",
     "print(\"DPR Results\" + \"\\n\" + \"=\"*15)\n",
-    "print_answers(res_5)\n",
+    "print_answers(res_5, details=\"minimum\")\n",
     "\n",
     "# Run only the sparse retriever on a keyword based query\n",
     "res_6 = sklearn_keyword_classifier.run(\n",
     "    query=\"arya stark younger brothers\"\n",
     ")\n",
     "print(\"ES Results\" + \"\\n\" + \"=\"*15)\n",
-    "print_answers(res_6)"
+    "print_answers(res_6, details=\"minimum\")"
    ],
    "outputs": [],
    "metadata": {
@@ -1670,14 +1670,14 @@
     "    query=\"Who is the father of Arya Stark?\"\n",
     ")\n",
     "print(\"DPR Results\" + \"\\n\" + \"=\"*15)\n",
-    "print_answers(res_1)\n",
+    "print_answers(res_1, details=\"minimum\")\n",
     "\n",
     "# Run only the sparse retriever on a keyword based query\n",
     "res_2 = transformer_keyword_classifier.run(\n",
     "    query=\"arya stark father\"\n",
     ")\n",
     "print(\"ES Results\" + \"\\n\" + \"=\"*15)\n",
-    "print_answers(res_2)\n"
+    "print_answers(res_2, details=\"minimum\")\n"
    ],
    "outputs": [],
    "metadata": {
@@ -1694,14 +1694,14 @@
     "    query=\"which country was jon snow filmed ?\"\n",
     ")\n",
     "print(\"DPR Results\" + \"\\n\" + \"=\"*15)\n",
-    "print_answers(res_3)\n",
+    "print_answers(res_3, details=\"minimum\")\n",
     "\n",
     "# Run only the sparse retriever on a keyword based query\n",
     "res_4 = transformer_keyword_classifier.run(\n",
     "    query=\"jon snow country\"\n",
     ")\n",
     "print(\"ES Results\" + \"\\n\" + \"=\"*15)\n",
-    "print_answers(res_4)"
+    "print_answers(res_4, details=\"minimum\")"
    ],
    "outputs": [],
    "metadata": {
@@ -1717,14 +1717,14 @@
     "    query=\"who are the younger brothers of arya stark ?\"\n",
     ")\n",
     "print(\"DPR Results\" + \"\\n\" + \"=\"*15)\n",
-    "print_answers(res_5)\n",
+    "print_answers(res_5, details=\"minimum\")\n",
     "\n",
     "# Run only the sparse retriever on a keyword based query\n",
     "res_6 = transformer_keyword_classifier.run(\n",
     "    query=\"arya stark younger brothers\"\n",
     ")\n",
     "print(\"ES Results\" + \"\\n\" + \"=\"*15)\n",
-    "print_answers(res_6)"
+    "print_answers(res_6, details=\"minimum\")"
    ],
    "outputs": [],
    "metadata": {
@@ -1771,14 +1771,14 @@
     "    query=\"Who is the father of Arya Stark?\"\n",
     ")\n",
     "print(\"DPR Results\" + \"\\n\" + \"=\"*15)\n",
-    "print_answers(res_1)\n",
+    "print_answers(res_1, details=\"minimum\")\n",
     "\n",
     "# Show only DPR results\n",
     "res_2 = transformer_question_classifier.run(\n",
     "    query=\"Arya Stark was the daughter of a Lord.\"\n",
     ")\n",
     "print(\"ES Results\" + \"\\n\" + \"=\"*15)\n",
-    "res_2"
+    "print_answers(res_2, details=\"minimum\")"
    ],
    "outputs": [],
    "metadata": {

View File

@@ -35,7 +35,9 @@ def tutorial14_query_classifier():
     reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
 
+    print()
+    print("Sklearn keyword classifier")
+    print("==========================")
+
     # Here we build the pipeline
     sklearn_keyword_classifier = Pipeline()
     sklearn_keyword_classifier.add_node(component=SklearnQueryClassifier(), name="QueryClassifier", inputs=["Query"])
@@ -48,44 +50,53 @@ def tutorial14_query_classifier():
     res_1 = sklearn_keyword_classifier.run(
         query="Who is the father of Arya Stark?",
     )
+    print("\n===============================")
     print("DPR Results" + "\n" + "="*15)
-    print_answers(res_1)
+    print_answers(res_1, details="minimum")
 
     # Run only the sparse retriever on a keyword based query
     res_2 = sklearn_keyword_classifier.run(
         query="arya stark father",
     )
+    print("\n===============================")
     print("ES Results" + "\n" + "="*15)
-    print_answers(res_2)
+    print_answers(res_2, details="minimum")
 
     # Run only the dense retriever on the full sentence query
     res_3 = sklearn_keyword_classifier.run(
         query="which country was jon snow filmed ?",
     )
+    print("\n===============================")
     print("DPR Results" + "\n" + "="*15)
-    print_answers(res_3)
+    print_answers(res_3, details="minimum")
 
     # Run only the sparse retriever on a keyword based query
     res_4 = sklearn_keyword_classifier.run(
         query="jon snow country",
     )
+    print("\n===============================")
     print("ES Results" + "\n" + "="*15)
-    print_answers(res_4)
+    print_answers(res_4, details="minimum")
 
     # Run only the dense retriever on the full sentence query
     res_5 = sklearn_keyword_classifier.run(
         query="who are the younger brothers of arya stark ?",
     )
+    print("\n===============================")
     print("DPR Results" + "\n" + "="*15)
-    print_answers(res_5)
+    print_answers(res_5, details="minimum")
 
     # Run only the sparse retriever on a keyword based query
     res_6 = sklearn_keyword_classifier.run(
         query="arya stark younger brothers",
     )
+    print("\n===============================")
     print("ES Results" + "\n" + "="*15)
-    print_answers(res_6)
+    print_answers(res_6, details="minimum")
 
+    print()
+    print("Transformer keyword classifier")
+    print("==============================")
+
     # Here we build the pipeline
     transformer_keyword_classifier = Pipeline()
     transformer_keyword_classifier.add_node(component=TransformersQueryClassifier(), name="QueryClassifier", inputs=["Query"])
@@ -98,43 +109,53 @@ def tutorial14_query_classifier():
     res_1 = transformer_keyword_classifier.run(
         query="Who is the father of Arya Stark?",
     )
+    print("\n===============================")
     print("DPR Results" + "\n" + "="*15)
-    print_answers(res_1)
+    print_answers(res_1, details="minimum")
 
     # Run only the sparse retriever on a keyword based query
     res_2 = transformer_keyword_classifier.run(
         query="arya stark father",
     )
+    print("\n===============================")
     print("ES Results" + "\n" + "="*15)
-    print_answers(res_2)
+    print_answers(res_2, details="minimum")
 
     # Run only the dense retriever on the full sentence query
     res_3 = transformer_keyword_classifier.run(
         query="which country was jon snow filmed ?",
     )
+    print("\n===============================")
     print("DPR Results" + "\n" + "="*15)
-    print_answers(res_3)
+    print_answers(res_3, details="minimum")
 
     # Run only the sparse retriever on a keyword based query
     res_4 = transformer_keyword_classifier.run(
         query="jon snow country",
     )
+    print("\n===============================")
     print("ES Results" + "\n" + "="*15)
-    print_answers(res_4)
+    print_answers(res_4, details="minimum")
 
     # Run only the dense retriever on the full sentence query
     res_5 = transformer_keyword_classifier.run(
         query="who are the younger brothers of arya stark ?",
     )
+    print("\n===============================")
     print("DPR Results" + "\n" + "="*15)
-    print_answers(res_5)
+    print_answers(res_5, details="minimum")
 
     # Run only the sparse retriever on a keyword based query
     res_6 = transformer_keyword_classifier.run(
         query="arya stark younger brothers",
     )
+    print("\n===============================")
     print("ES Results" + "\n" + "="*15)
-    print_answers(res_6)
+    print_answers(res_6, details="minimum")
 
+    print()
+    print("Transformer question classifier")
+    print("===============================")
+
     # Here we build the pipeline
     transformer_question_classifier = Pipeline()
@@ -147,15 +168,17 @@ def tutorial14_query_classifier():
     res_1 = transformer_question_classifier.run(
         query="Who is the father of Arya Stark?",
     )
+    print("\n===============================")
     print("DPR Results" + "\n" + "="*15)
-    print_answers(res_1)
+    print_answers(res_1, details="minimum")
 
     # Show only DPR results
     res_2 = transformer_question_classifier.run(
         query="Arya Stark was the daughter of a Lord.",
     )
+    print("\n===============================")
     print("ES Results" + "\n" + "="*15)
-    res_2
+    print_answers(res_2, details="minimum")
 
     # Here we create the keyword vs question/statement query classifier

View File

@@ -368,7 +368,38 @@
    "cell_type": "code",
    "execution_count": null,
    "source": [
-    "print_answers(prediction, details=\"minimal\")"
+    "# Now you can either print the object directly...\n",
+    "from pprint import pprint\n",
+    "\n",
+    "pprint(prediction)\n",
+    "\n",
+    "# Sample output:\n",
+    "# {\n",
+    "#     'answers': [ <Answer: answer='Eddard', type='extractive', score=0.9919578731060028, offsets_in_document=[{'start': 608, 'end': 615}], offsets_in_context=[{'start': 72, 'end': 79}], document_id='cc75f739897ecbf8c14657b13dda890e', meta={'name': '454_Music_of_Game_of_Thrones.txt'}, context='...'>,\n",
+    "#                  <Answer: answer='Ned', type='extractive', score=0.9767240881919861, offsets_in_document=[{'start': 3687, 'end': 3801}], offsets_in_context=[{'start': 18, 'end': 132}], document_id='9acf17ec9083c4022f69eb4a37187080', meta={'name': '454_Music_of_Game_of_Thrones.txt'}, context='...'>,\n",
+    "#                  ...\n",
+    "#                ],\n",
+    "#     'documents': [ <Document: content_type='text', score=0.8034909798951382, meta={'name': '332_Sansa_Stark.txt'}, embedding=None, id='d1f36ec7170e4c46cde65787fe125dfe', content='\\n===\\'\\'A Game of Thrones\\'\\'===\\nSansa Stark begins the novel by being betrothed to Crown ...'>,\n",
+    "#                    <Document: content_type='text', score=0.8002150354529785, meta={'name': '191_Gendry.txt'}, embedding=None, id='dd4e070a22896afa81748d6510006d2', content='\\n===Season 2===\\nGendry travels North with Yoren and other Night's Watch recruits, including Arya ...'>,\n",
+    "#                    ...\n",
+    "#                  ],\n",
+    "#     'no_ans_gap': 11.688868522644043,\n",
+    "#     'node_id': 'Reader',\n",
+    "#     'params': {'Reader': {'top_k': 5}, 'Retriever': {'top_k': 5}},\n",
+    "#     'query': 'Who is the father of Arya Stark?',\n",
+    "#     'root_node': 'Query'\n",
+    "# }\n"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "# ...or use a util to simplify the output\n",
+    "# Change `minimum` to `medium` or `all` to raise the level of detail\n",
+    "print_answers(prediction, details=\"minimum\")"
    ],
    "outputs": [],
    "metadata": {

View File

@@ -134,7 +134,37 @@ def tutorial1_basic_qa_pipeline():
     # prediction = pipe.run(query="Who created the Dothraki vocabulary?", params={"Reader": {"top_k": 5}})
     # prediction = pipe.run(query="Who is the sister of Sansa?", params={"Reader": {"top_k": 5}})
 
-    print_answers(prediction, details="minimal")
+    # Now you can either print the object directly
+    print("\n\nRaw object:\n")
+    from pprint import pprint
+    pprint(prediction)
+
+    # Sample output:
+    # {
+    #     'answers': [ <Answer: answer='Eddard', type='extractive', score=0.9919578731060028, offsets_in_document=[{'start': 608, 'end': 615}], offsets_in_context=[{'start': 72, 'end': 79}], document_id='cc75f739897ecbf8c14657b13dda890e', meta={'name': '454_Music_of_Game_of_Thrones.txt'}, context='...'>,
+    #                  <Answer: answer='Ned', type='extractive', score=0.9767240881919861, offsets_in_document=[{'start': 3687, 'end': 3801}], offsets_in_context=[{'start': 18, 'end': 132}], document_id='9acf17ec9083c4022f69eb4a37187080', meta={'name': '454_Music_of_Game_of_Thrones.txt'}, context='...'>,
+    #                  ...
+    #                ],
+    #     'documents': [ <Document: content_type='text', score=0.8034909798951382, meta={'name': '332_Sansa_Stark.txt'}, embedding=None, id='d1f36ec7170e4c46cde65787fe125dfe', content='\n===\'\'A Game of Thrones\'\'===\nSansa Stark begins the novel by being betrothed to Crown ...'>,
+    #                    <Document: content_type='text', score=0.8002150354529785, meta={'name': '191_Gendry.txt'}, embedding=None, id='dd4e070a22896afa81748d6510006d2', content='\n===Season 2===\nGendry travels North with Yoren and other Night's Watch recruits, including Arya ...'>,
+    #                    ...
+    #                  ],
+    #     'no_ans_gap': 11.688868522644043,
+    #     'node_id': 'Reader',
+    #     'params': {'Reader': {'top_k': 5}, 'Retriever': {'top_k': 5}},
+    #     'query': 'Who is the father of Arya Stark?',
+    #     'root_node': 'Query'
+    # }
+
+    # Note that the documents contained in the above object are the documents filtered by the Retriever from
+    # the document store. Although the answers were extracted from these documents, it's possible that many
+    # answers were taken from a single one of them, and that some of the documents were not the source of any answer.
+
+    # Or use a util to simplify the output
+    # Change `minimum` to `medium` or `all` to raise the level of detail
+    print("\n\nSimplified output:\n")
+    print_answers(prediction, details="minimum")
 
 
 if __name__ == "__main__":

View File

@@ -358,11 +358,42 @@
 "outputs": [],
 "metadata": {}
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"source": [
+"# Now you can either print the object directly...\n",
+"from pprint import pprint\n",
+"\n",
+"pprint(prediction)\n",
+"\n",
+"# Sample output: \n",
+"# {\n",
+"# 'answers': [ <Answer: answer='Eddard', type='extractive', score=0.9919578731060028, offsets_in_document=[{'start': 608, 'end': 615}], offsets_in_context=[{'start': 72, 'end': 79}], document_id='cc75f739897ecbf8c14657b13dda890e', meta={'name': '454_Music_of_Game_of_Thrones.txt'}, context='...' >,\n",
+"# <Answer: answer='Ned', type='extractive', score=0.9767240881919861, offsets_in_document=[{'start': 3687, 'end': 3801}], offsets_in_context=[{'start': 18, 'end': 132}], document_id='9acf17ec9083c4022f69eb4a37187080', meta={'name': '454_Music_of_Game_of_Thrones.txt'}, context='...' >,\n",
+"# ...\n",
+"# ]\n",
+"# 'documents': [ <Document: content_type='text', score=0.8034909798951382, meta={'name': '332_Sansa_Stark.txt'}, embedding=None, id='d1f36ec7170e4c46cde65787fe125dfe', content='\\n===\\'\\'A Game of Thrones\\'\\'===\\nSansa Stark begins the novel by being betrothed to Crown ...'>,\n",
+"# <Document: content_type='text', score=0.8002150354529785, meta={'name': '191_Gendry.txt'}, embedding=None, id='dd4e070a22896afa81748d6510006d2', content='\\n===Season 2===\\nGendry travels North with Yoren and other Night's Watch recruits, including Arya ...'>,\n",
+"# ...\n",
+"# ],\n",
+"# 'no_ans_gap': 11.688868522644043,\n",
+"# 'node_id': 'Reader',\n",
+"# 'params': {'Reader': {'top_k': 5}, 'Retriever': {'top_k': 5}},\n",
+"# 'query': 'Who is the father of Arya Stark?',\n",
+"# 'root_node': 'Query'\n",
+"# }"
+],
+"outputs": [],
+"metadata": {}
+},
 {
 "cell_type": "code",
 "execution_count": 11,
 "source": [
-"print_answers(prediction, details=\"minimal\")"
+"# ...or use a util to simplify the output\n",
+"# Change `minimum` to `medium` or `all` to raise the level of detail\n",
+"print_answers(prediction, details=\"minimum\")"
 ],
 "outputs": [
 {
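
Note: `details` now accepts three levels, normalized to `minimum`/`medium`/`all` across all tutorials. A quick sketch of the options; exactly which `Answer` fields each level prints is an assumption, not spelled out in this diff:

```python
from haystack.utils import print_answers

print_answers(prediction, details="minimum")  # tersest view of each answer
print_answers(prediction, details="medium")   # assumed: adds fields such as the score
print_answers(prediction, details="all")      # assumed: every field of the Answer objects
```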

View File

@@ -101,7 +101,36 @@ def tutorial3_basic_qa_pipeline_without_elasticsearch():
 # prediction = pipe.run(query="Who created the Dothraki vocabulary?", params={"Reader": {"top_k": 5}})
 # prediction = pipe.run(query="Who is the sister of Sansa?", params={"Reader": {"top_k": 5}})
-print_answers(prediction, details="minimal")
+# Now you can either print the object directly
+print("\n\nRaw object:\n")
+from pprint import pprint
+pprint(prediction)
+# Sample output:
+# {
+# 'answers': [ <Answer: answer='Eddard', type='extractive', score=0.9919578731060028, offsets_in_document=[{'start': 608, 'end': 615}], offsets_in_context=[{'start': 72, 'end': 79}], document_id='cc75f739897ecbf8c14657b13dda890e', meta={'name': '454_Music_of_Game_of_Thrones.txt'}, context='...' >,
+# <Answer: answer='Ned', type='extractive', score=0.9767240881919861, offsets_in_document=[{'start': 3687, 'end': 3801}], offsets_in_context=[{'start': 18, 'end': 132}], document_id='9acf17ec9083c4022f69eb4a37187080', meta={'name': '454_Music_of_Game_of_Thrones.txt'}, context='...' >,
+# ...
+# ]
+# 'documents': [ <Document: content_type='text', score=0.8034909798951382, meta={'name': '332_Sansa_Stark.txt'}, embedding=None, id='d1f36ec7170e4c46cde65787fe125dfe', content='\n===\'\'A Game of Thrones\'\'===\nSansa Stark begins the novel by being betrothed to Crown ...'>,
+# <Document: content_type='text', score=0.8002150354529785, meta={'name': '191_Gendry.txt'}, embedding=None, id='dd4e070a22896afa81748d6510006d2', content='\n===Season 2===\nGendry travels North with Yoren and other Night's Watch recruits, including Arya ...'>,
+# ...
+# ],
+# 'no_ans_gap': 11.688868522644043,
+# 'node_id': 'Reader',
+# 'params': {'Reader': {'top_k': 5}, 'Retriever': {'top_k': 5}},
+# 'query': 'Who is the father of Arya Stark?',
+# 'root_node': 'Query'
+# }
+# Note that the documents contained in the above object are the documents filtered by the Retriever from
+# the document store. Although the answers were extracted from these documents, it's possible that many
+# answers were taken from a single one of them, and that some of the documents were not the source of any answer.
+# Or use a util to simplify the output
+# Change `minimum` to `medium` or `all` to raise the level of detail
+print("\n\nSimplified output:\n")
+print_answers(prediction, details="minimum")
 if __name__ == "__main__":

View File

@@ -265,12 +265,10 @@
 "cell_type": "code",
 "execution_count": null,
 "source": [
-"from haystack.utils import print_answers\n",
-"\n",
 "prediction = pipe.run(query=\"How is the virus spreading?\", params={\"Retriever\": {\"top_k\": 10}})\n",
-"for a in prediction[\"answers\"]:\n",
-"    print(f\"Answer: {a.answer}\")\n",
-"    print(f\"Question: {a.meta['query']}\")\n",
-"    print(f\"Score: {a.score}\")\n",
-"    print(\"---------------------\")"
+"print_answers(prediction, details=\"medium\")"
 ],
 "outputs": [],
 "metadata": {

View File

@@ -1,7 +1,7 @@
 from haystack.document_stores import ElasticsearchDocumentStore
 from haystack.nodes import EmbeddingRetriever
-from haystack.utils import launch_es
+from haystack.utils import launch_es, print_answers
 import pandas as pd
 import requests
 import logging
@@ -72,11 +72,7 @@ def tutorial4_faq_style_qa():
 pipe = FAQPipeline(retriever=retriever)
 prediction = pipe.run(query="How is the virus spreading?", params={"Retriever": {"top_k": 10}})
-for a in prediction["answers"]:
-    print(f"Answer: {a.answer}")
-    print(f"Question: {a.meta['query']}")
-    print(f"Score: {a.score}")
-    print("---------------------")
+print_answers(prediction, details="medium")
 if __name__ == "__main__":

View File

@@ -67,7 +67,7 @@ def tutorial6_better_retrieval_via_dpr():
 # prediction = pipe.run(query="Who created the Dothraki vocabulary?", params={"Reader": {"top_k": 5}})
 # prediction = pipe.run(query="Who is the sister of Sansa?", params={"Reader": {"top_k": 5}})
-print_answers(prediction, details="minimal")
+print_answers(prediction, details="minimum")
 if __name__ == "__main__":

View File

@@ -330,11 +330,12 @@
 "source": [
 "# Or alternatively use the Pipeline class\n",
 "from haystack.pipelines import GenerativeQAPipeline\n",
+"from haystack.utils import print_answers\n",
 "\n",
 "pipe = GenerativeQAPipeline(generator=generator, retriever=retriever)\n",
 "for question in QUESTIONS:\n",
 "    res = pipe.run(query=question, params={\"Generator\": {\"top_k\": 1}, \"Retriever\": {\"top_k\": 5}})\n",
-"    print(res)"
+"    print_answers(res, details=\"minimum\")"
 ],
 "outputs": [],
 "metadata": {

View File

@@ -4,6 +4,7 @@ import pandas as pd
 from haystack import Document
 from haystack.document_stores import FAISSDocumentStore
 from haystack.nodes import RAGenerator, DensePassageRetriever
+from haystack.utils import print_answers
 def tutorial7_rag_generator():
@@ -35,7 +36,6 @@ def tutorial7_rag_generator():
 )
 )
-# Initialize FAISS document store to documents and corresponding index for embeddings
 # Set `return_embedding` to `True`, so generator doesn't have to perform re-embedding
 document_store = FAISSDocumentStore(
@@ -108,14 +108,14 @@
 # Print your answer
 answers = predicted_result["answers"]
-print(f'Generated answer is \'{answers[0]["answer"]}\' for the question = \'{question}\'')
+print(f' -> Generated answer is \'{answers[0]["answer"]}\' for the question = \'{question}\'')
 # Or alternatively use the Pipeline class
 from haystack.pipelines import GenerativeQAPipeline
 pipe = GenerativeQAPipeline(generator=generator, retriever=retriever)
 for question in QUESTIONS:
     res = pipe.run(query=question, params={"Generator": {"top_k": 1}, "Retriever": {"top_k": 5}})
-    print(res)
+    print_answers(res, details="minimum")
 if __name__ == "__main__":
     tutorial7_rag_generator()
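
Note: the pipeline's raw result is still available if the simplified output is too terse. A sketch of both output styles for a single question, assuming the `pipe` and `QUESTIONS` defined earlier in this tutorial:

```python
# Sketch, not part of this commit: raw result vs. simplified printout.
res = pipe.run(query=QUESTIONS[0], params={"Generator": {"top_k": 1}, "Retriever": {"top_k": 5}})
print(res)                             # full dict: query, answers, documents, ...
print_answers(res, details="minimum")  # just the generated answers
```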

View File

@@ -80,7 +80,7 @@ def tutorial8_preprocessing():
 split_respect_sentence_boundary=True
 )
 docs_default = preprocessor.process(doc_txt)
-print(f"n_docs_input: 1\nn_docs_output: {len(docs_default)}")
+print(f"\nn_docs_input: 1\nn_docs_output: {len(docs_default)}")
 """
 ## Cleaning
@@ -101,13 +101,14 @@
 preprocessor_nrsb = PreProcessor(split_respect_sentence_boundary=False)
 docs_nrsb = preprocessor_nrsb.process(doc_txt)
-print("RESPECTING SENTENCE BOUNDARY")
+print("\nRESPECTING SENTENCE BOUNDARY:")
 end_text = docs_default[0]["content"][-50:]
 print("End of document: \"..." + end_text + "\"")
+print()
-print("NOT RESPECTING SENTENCE BOUNDARY")
+print("\nNOT RESPECTING SENTENCE BOUNDARY:")
 end_text_nrsb = docs_nrsb[0]["content"][-50:]
 print("End of document: \"..." + end_text_nrsb + "\"")
+print()
 """
 A commonly used strategy to split long documents, especially in the field of Question Answering,