mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-08-31 03:46:30 +00:00
Improve tutorials' output (#1694)
* Modify __str__ and __repr__ for Document and Answer * Rename QueryClassifier in Tutorial11 * Improve the output of tutorial1 * Make the output of Tutorial8 a bit less dense * Add a print_questions util to print the output of question generating pipelines * Replace custom printing with the new utility in Tutorial13 * Ensure all output is printed with minimal details in Tutorial14 and add some titles * Minor change to print_answers * Make tutorial3's output the same as tutorial1 * Add __repr__ to Answer and fix to_dict() * Fix a bug in the Document and Answer's __str__ method * Improve print_answers, print_documents and print_questions * Using print_answers in Tutorial7 and fixing typo in the utils * Remove duplicate line in Tutorial12 * Use print_answers in Tutorial4 * Add explanation of what the documents in the output of the basic QA pipeline are * Move the fields constant into print_answers * Normalize all 'minimal' to 'minimum' (they were mixed up) * Improve the sample output to include all fields from Document and Answer Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
parent
861522b6b1
commit
91cafb49bb
@ -237,7 +237,35 @@ prediction = pipe.run(
|
||||
|
||||
|
||||
```python
|
||||
print_answers(prediction, details="minimal")
|
||||
# Now you can either print the object directly...
|
||||
from pprint import pprint
|
||||
|
||||
pprint(prediction)
|
||||
|
||||
# Sample output:
|
||||
# {
|
||||
# 'answers': [ <Answer: answer='Eddard', type='extractive', score=0.9919578731060028, offsets_in_document=[{'start': 608, 'end': 615}], offsets_in_context=[{'start': 72, 'end': 79}], document_id='cc75f739897ecbf8c14657b13dda890e', meta={'name': '454_Music_of_Game_of_Thrones.txt'}, context='...' >,
|
||||
# <Answer: answer='Ned', type='extractive', score=0.9767240881919861, offsets_in_document=[{'start': 3687, 'end': 3801}], offsets_in_context=[{'start': 18, 'end': 132}], document_id='9acf17ec9083c4022f69eb4a37187080', meta={'name': '454_Music_of_Game_of_Thrones.txt'}, context='...' >,
|
||||
# ...
|
||||
# ]
|
||||
# 'documents': [ <Document: content_type='text', score=0.8034909798951382, meta={'name': '332_Sansa_Stark.txt'}, embedding=None, id='d1f36ec7170e4c46cde65787fe125dfe', content='\n===\'\'A Game of Thrones\'\'===\nSansa Stark begins the novel by being betrothed to Crown ...'>,
|
||||
# <Document: content_type='text', score=0.8002150354529785, meta={'name': '191_Gendry.txt'}, embedding=None, id='dd4e070a22896afa81748d6510006d2', content='\n===Season 2===\nGendry travels North with Yoren and other Night\'s Watch recruits, including Arya ...'>,
|
||||
# ...
|
||||
# ],
|
||||
# 'no_ans_gap': 11.688868522644043,
|
||||
# 'node_id': 'Reader',
|
||||
# 'params': {'Reader': {'top_k': 5}, 'Retriever': {'top_k': 5}},
|
||||
# 'query': 'Who is the father of Arya Stark?',
|
||||
# 'root_node': 'Query'
|
||||
# }
|
||||
|
||||
```
|
||||
|
||||
|
||||
```python
|
||||
# ...or use a util to simplify the output
|
||||
# Change `minimum` to `medium` or `all` to raise the level of detail
|
||||
print_answers(prediction, details="minimum")
|
||||
```
|
||||
|
||||
## About us
|
||||
|
@ -296,7 +296,7 @@ Below, we define a very naive `QueryClassifier` and show how to use it:
|
||||
|
||||
|
||||
```python
|
||||
class QueryClassifier(BaseComponent):
|
||||
class CustomQueryClassifier(BaseComponent):
|
||||
outgoing_edges = 2
|
||||
|
||||
def run(self, query: str):
|
||||
@ -307,7 +307,7 @@ class QueryClassifier(BaseComponent):
|
||||
|
||||
# Here we build the pipeline
|
||||
p_classifier = Pipeline()
|
||||
p_classifier.add_node(component=QueryClassifier(), name="QueryClassifier", inputs=["Query"])
|
||||
p_classifier.add_node(component=CustomQueryClassifier(), name="QueryClassifier", inputs=["Query"])
|
||||
p_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_1"])
|
||||
p_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_2"])
|
||||
p_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"])
|
||||
|
@ -42,7 +42,8 @@ from tqdm import tqdm
|
||||
from haystack.nodes import QuestionGenerator, ElasticsearchRetriever, FARMReader
|
||||
from haystack.document_stores import ElasticsearchDocumentStore
|
||||
from haystack.pipelines import QuestionGenerationPipeline, RetrieverQuestionGenerationPipeline, QuestionAnswerGenerationPipeline
|
||||
from haystack.utils import launch_es
|
||||
from haystack.utils import launch_es, print_questions
|
||||
|
||||
```
|
||||
|
||||
Let's start an Elasticsearch instance with one of the options below:
|
||||
@ -98,9 +99,11 @@ which the the document can answer.
|
||||
|
||||
```python
|
||||
question_generation_pipeline = QuestionGenerationPipeline(question_generator)
|
||||
for document in document_store:
|
||||
for idx, document in enumerate(document_store):
|
||||
|
||||
print(f"\n * Generating questions for document {idx}: {document.content[:100]}...\n")
|
||||
result = question_generation_pipeline.run(documents=[document])
|
||||
pprint(result)
|
||||
print_questions(result)
|
||||
```
|
||||
|
||||
## Retriever Question Generation Pipeline
|
||||
@ -111,8 +114,10 @@ This pipeline takes a query as input. It retrieves relevant documents and then g
|
||||
```python
|
||||
retriever = ElasticsearchRetriever(document_store=document_store)
|
||||
rqg_pipeline = RetrieverQuestionGenerationPipeline(retriever, question_generator)
|
||||
|
||||
print(f"\n * Generating questions for documents matching the query 'Arya Stark'\n")
|
||||
result = rqg_pipeline.run(query="Arya Stark")
|
||||
pprint(result)
|
||||
print_questions(result)
|
||||
```
|
||||
|
||||
## Question Answer Generation Pipeline
|
||||
@ -124,9 +129,11 @@ a Reader model
|
||||
```python
|
||||
reader = FARMReader("deepset/roberta-base-squad2")
|
||||
qag_pipeline = QuestionAnswerGenerationPipeline(question_generator, reader)
|
||||
for document in tqdm(document_store):
|
||||
for idx, document in enumerate(tqdm(document_store)):
|
||||
|
||||
print(f"\n * Generating questions and answers for document {idx}: {document.content[:100]}...\n")
|
||||
result = qag_pipeline.run(documents=[document])
|
||||
pprint(result)
|
||||
print_questions(result)
|
||||
```
|
||||
|
||||
## About us
|
||||
|
@ -161,14 +161,14 @@ res_1 = sklearn_keyword_classifier.run(
|
||||
query="Who is the father of Arya Stark?"
|
||||
)
|
||||
print("DPR Results" + "\n" + "="*15)
|
||||
print_answers(res_1)
|
||||
print_answers(res_1, details="minimum")
|
||||
|
||||
# Run only the sparse retriever on a keyword based query
|
||||
res_2 = sklearn_keyword_classifier.run(
|
||||
query="arya stark father"
|
||||
)
|
||||
print("ES Results" + "\n" + "="*15)
|
||||
print_answers(res_2)
|
||||
print_answers(res_2, details="minimum")
|
||||
|
||||
```
|
||||
|
||||
@ -180,14 +180,14 @@ res_3 = sklearn_keyword_classifier.run(
|
||||
query="which country was jon snow filmed ?"
|
||||
)
|
||||
print("DPR Results" + "\n" + "="*15)
|
||||
print_answers(res_3)
|
||||
print_answers(res_3, details="minimum")
|
||||
|
||||
# Run only the sparse retriever on a keyword based query
|
||||
res_4 = sklearn_keyword_classifier.run(
|
||||
query="jon snow country"
|
||||
)
|
||||
print("ES Results" + "\n" + "="*15)
|
||||
print_answers(res_4)
|
||||
print_answers(res_4, details="minimum")
|
||||
```
|
||||
|
||||
|
||||
@ -197,14 +197,14 @@ res_5 = sklearn_keyword_classifier.run(
|
||||
query="who are the younger brothers of arya stark ?"
|
||||
)
|
||||
print("DPR Results" + "\n" + "="*15)
|
||||
print_answers(res_5)
|
||||
print_answers(res_5, details="minimum")
|
||||
|
||||
# Run only the sparse retriever on a keyword based query
|
||||
res_6 = sklearn_keyword_classifier.run(
|
||||
query="arya stark younger brothers"
|
||||
)
|
||||
print("ES Results" + "\n" + "="*15)
|
||||
print_answers(res_6)
|
||||
print_answers(res_6, details="minimum")
|
||||
```
|
||||
|
||||
## Transformer Keyword vs Question/Statement Classifier
|
||||
@ -234,14 +234,14 @@ res_1 = transformer_keyword_classifier.run(
|
||||
query="Who is the father of Arya Stark?"
|
||||
)
|
||||
print("DPR Results" + "\n" + "="*15)
|
||||
print_answers(res_1)
|
||||
print_answers(res_1, details="minimum")
|
||||
|
||||
# Run only the sparse retriever on a keyword based query
|
||||
res_2 = transformer_keyword_classifier.run(
|
||||
query="arya stark father"
|
||||
)
|
||||
print("ES Results" + "\n" + "="*15)
|
||||
print_answers(res_2)
|
||||
print_answers(res_2, details="minimum")
|
||||
|
||||
```
|
||||
|
||||
@ -253,14 +253,14 @@ res_3 = transformer_keyword_classifier.run(
|
||||
query="which country was jon snow filmed ?"
|
||||
)
|
||||
print("DPR Results" + "\n" + "="*15)
|
||||
print_answers(res_3)
|
||||
print_answers(res_3, details="minimum")
|
||||
|
||||
# Run only the sparse retriever on a keyword based query
|
||||
res_4 = transformer_keyword_classifier.run(
|
||||
query="jon snow country"
|
||||
)
|
||||
print("ES Results" + "\n" + "="*15)
|
||||
print_answers(res_4)
|
||||
print_answers(res_4, details="minimum")
|
||||
```
|
||||
|
||||
|
||||
@ -270,14 +270,14 @@ res_5 = transformer_keyword_classifier.run(
|
||||
query="who are the younger brothers of arya stark ?"
|
||||
)
|
||||
print("DPR Results" + "\n" + "="*15)
|
||||
print_answers(res_5)
|
||||
print_answers(res_5, details="minimum")
|
||||
|
||||
# Run only the sparse retriever on a keyword based query
|
||||
res_6 = transformer_keyword_classifier.run(
|
||||
query="arya stark younger brothers"
|
||||
)
|
||||
print("ES Results" + "\n" + "="*15)
|
||||
print_answers(res_6)
|
||||
print_answers(res_6, details="minimum")
|
||||
```
|
||||
|
||||
## Question vs Statement Classifier
|
||||
@ -305,14 +305,14 @@ res_1 = transformer_question_classifier.run(
|
||||
query="Who is the father of Arya Stark?"
|
||||
)
|
||||
print("DPR Results" + "\n" + "="*15)
|
||||
print_answers(res_1)
|
||||
print_answers(res_1, details="minimum")
|
||||
|
||||
# Show only DPR results
|
||||
res_2 = transformer_question_classifier.run(
|
||||
query="Arya Stark was the daughter of a Lord."
|
||||
)
|
||||
print("ES Results" + "\n" + "="*15)
|
||||
res_2
|
||||
print_answers(res_2, details="minimum")
|
||||
```
|
||||
|
||||
## Standalone Query Classifier
|
||||
|
@ -182,7 +182,34 @@ prediction = pipe.run(
|
||||
|
||||
|
||||
```python
|
||||
print_answers(prediction, details="minimal")
|
||||
# Now you can either print the object directly...
|
||||
from pprint import pprint
|
||||
|
||||
pprint(prediction)
|
||||
|
||||
# Sample output:
|
||||
# {
|
||||
# 'answers': [ <Answer: answer='Eddard', type='extractive', score=0.9919578731060028, offsets_in_document=[{'start': 608, 'end': 615}], offsets_in_context=[{'start': 72, 'end': 79}], document_id='cc75f739897ecbf8c14657b13dda890e', meta={'name': '454_Music_of_Game_of_Thrones.txt'}, context='...' >,
|
||||
# <Answer: answer='Ned', type='extractive', score=0.9767240881919861, offsets_in_document=[{'start': 3687, 'end': 3801}], offsets_in_context=[{'start': 18, 'end': 132}], document_id='9acf17ec9083c4022f69eb4a37187080', meta={'name': '454_Music_of_Game_of_Thrones.txt'}, context='...' >,
|
||||
# ...
|
||||
# ]
|
||||
# 'documents': [ <Document: content_type='text', score=0.8034909798951382, meta={'name': '332_Sansa_Stark.txt'}, embedding=None, id='d1f36ec7170e4c46cde65787fe125dfe', content='\n===\'\'A Game of Thrones\'\'===\nSansa Stark begins the novel by being betrothed to Crown ...'>,
|
||||
# <Document: content_type='text', score=0.8002150354529785, meta={'name': '191_Gendry.txt'}, embedding=None, id='dd4e070a22896afa81748d6510006d2', content='\n===Season 2===\nGendry travels North with Yoren and other Night\'s Watch recruits, including Arya ...'>,
|
||||
# ...
|
||||
# ],
|
||||
# 'no_ans_gap': 11.688868522644043,
|
||||
# 'node_id': 'Reader',
|
||||
# 'params': {'Reader': {'top_k': 5}, 'Retriever': {'top_k': 5}},
|
||||
# 'query': 'Who is the father of Arya Stark?',
|
||||
# 'root_node': 'Query'
|
||||
# }
|
||||
```
|
||||
|
||||
|
||||
```python
|
||||
# ...or use a util to simplify the output
|
||||
# Change `minimum` to `medium` or `all` to raise the level of detail
|
||||
print_answers(prediction, details="minimum")
|
||||
```
|
||||
|
||||
## About us
|
||||
|
@ -155,12 +155,10 @@ pipe = FAQPipeline(retriever=retriever)
|
||||
|
||||
|
||||
```python
|
||||
from haystack.utils import print_answers
|
||||
|
||||
prediction = pipe.run(query="How is the virus spreading?", params={"Retriever": {"top_k": 10}})
|
||||
for a in prediction["answers"]:
|
||||
print(f"Answer: {a.answer}")
|
||||
print(f"Question: {a.meta['query']}")
|
||||
print(f"Score: {a.score}")
|
||||
print("---------------------")
|
||||
print_answers(prediction, details="medium")
|
||||
```
|
||||
|
||||
## About us
|
||||
|
@ -193,11 +193,12 @@ for question in QUESTIONS:
|
||||
```python
|
||||
# Or alternatively use the Pipeline class
|
||||
from haystack.pipelines import GenerativeQAPipeline
|
||||
from haystack.utils import print_answers
|
||||
|
||||
pipe = GenerativeQAPipeline(generator=generator, retriever=retriever)
|
||||
for question in QUESTIONS:
|
||||
res = pipe.run(query=question, params={"Generator": {"top_k": 1}, "Retriever": {"top_k": 5}})
|
||||
print(res)
|
||||
print_answers(res, details="minimum")
|
||||
```
|
||||
|
||||
## About us
|
||||
|
@ -186,10 +186,13 @@ class Document:
|
||||
getattr(other, 'id_hash_keys', None) == self.id_hash_keys)
|
||||
|
||||
def __repr__(self):
|
||||
return str(self.to_dict())
|
||||
return f"<Document: {str(self.to_dict())}>"
|
||||
|
||||
def __str__(self):
|
||||
return f"content: {self.content[:100]} {'[...]' if len(self.content) > 100 else ''}"
|
||||
# In some cases, self.content is None (therefore not subscriptable)
|
||||
if not self.content:
|
||||
return f"<Document: id={self.id}, content=None>"
|
||||
return f"<Document: id={self.id}, content='{self.content[:100]} {'...' if len(self.content) > 100 else ''}'>"
|
||||
|
||||
def __lt__(self, other):
|
||||
""" Enable sorting of Documents by score """
|
||||
@ -262,7 +265,13 @@ class Answer:
|
||||
return self.score < other.score
|
||||
|
||||
def __str__(self):
|
||||
return f"answer: {self.answer} \nscore: {self.score} \ncontext: {self.context}"
|
||||
# self.context might be None (therefore not subscriptable)
|
||||
if not self.context:
|
||||
return f"<Answer: answer='{self.answer}', score={self.score}, context=None>"
|
||||
return f"<Answer: answer='{self.answer}', score={self.score}, context='{self.context[:50]}{'...' if len(self.context) > 50 else ''}'>"
|
||||
|
||||
def __repr__(self):
|
||||
return f"<Answer {asdict(self)}>"
|
||||
|
||||
def to_dict(self):
|
||||
return asdict(self)
|
||||
|
@ -16,6 +16,7 @@ from haystack.utils.doc_store import (
|
||||
from haystack.utils.export_utils import (
|
||||
print_answers,
|
||||
print_documents,
|
||||
print_questions,
|
||||
export_answers_to_csv,
|
||||
convert_labels_to_squad,
|
||||
)
|
||||
|
@ -1,12 +1,8 @@
|
||||
from typing import Dict, Any, List, Optional
|
||||
|
||||
import io
|
||||
import re
|
||||
import time
|
||||
import json
|
||||
import pprint
|
||||
import logging
|
||||
import subprocess
|
||||
import pandas as pd
|
||||
from collections import defaultdict
|
||||
|
||||
@ -16,58 +12,98 @@ from haystack.document_stores.sql import DocumentORM
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
|
||||
def _shorten_text(text, max_text_len: int):
    """Return `text` cut to `max_text_len` characters plus "..." if it was longer; non-strings pass through."""
    if isinstance(text, str) and len(text) > max_text_len:
        return text[:max_text_len] + "..."
    return text


def print_answers(results: dict, details: str = "all", max_text_len: Optional[int] = None):
    """
    Utility function to print results of Haystack pipelines

    :param results: Results from a pipeline. Must contain the 'answers' key; the 'query' is printed too if present.
    :param details: One of "minimum", "medium", "all". Defining the level of details to print.
                    Any other value logs a warning and falls back to "all".
    :param max_text_len: shorten the answers' `context` to the maximum allowed length. Set to
                         None to not cut long text.
    :raises ValueError: if `results` does not contain the 'answers' key.
    :return: None
    """
    # Local import: `copy` is not part of this module's top-level imports.
    import copy

    # Defines the fields to keep in the Answer for each detail level
    fields_to_keep_by_level = {
        "minimum": ["answer", "context"],
        "medium": ["answer", "context", "score"]
    }

    if "answers" not in results:
        raise ValueError("The results object does not seem to come from a Reader: "
                         f"it does not contain the 'answers' key, but only: {results.keys()}. "
                         "Try print_documents or print_questions.")

    if "query" in results:
        print(f"\nQuery: {results['query']}\nAnswers:")

    answers = results["answers"]
    pp = pprint.PrettyPrinter(indent=4)

    # Filter the results by detail level
    if details in fields_to_keep_by_level:
        fields = fields_to_keep_by_level[details]
        filtered_answers = [{field: getattr(ans, field) for field in fields} for ans in answers]
    else:
        if details != "all":
            logging.warning(f"print_answers received details='{details}', which was not understood. "
                            "Valid values are 'minimum', 'medium', and 'all'. Using 'all'.")
        filtered_answers = answers

    # Shorten long text fields. Work on what will actually be printed (dicts for the
    # filtered levels, answer objects for "all") and on copies, so that the caller's
    # pipeline output is never modified by a printing utility.
    if max_text_len is not None:
        shortened_answers = []
        for ans in filtered_answers:
            if isinstance(ans, dict):
                if "context" in ans:
                    ans = {**ans, "context": _shorten_text(ans["context"], max_text_len)}
            elif hasattr(ans, "context"):
                ans = copy.copy(ans)
                ans.context = _shorten_text(ans.context, max_text_len)
            shortened_answers.append(ans)
        filtered_answers = shortened_answers

    pp.pprint(filtered_answers)
|
||||
|
||||
|
||||
def print_documents(results: dict, max_text_len: Optional[int] = None, print_name: bool = True, print_meta: bool = False):
    """
    Utility that prints a compressed representation of the documents returned by a pipeline.

    :param results: Results from a pipeline. Must contain the 'documents' key; the 'query' is printed too if present.
    :param max_text_len: shorten the document's content to a maximum number of chars. if None, does not cut.
    :param print_name: whether to print the document's name (from the metadata) or not.
    :param print_meta: whether to print the document's metadata or not.
    :return: None
    """
    if "query" in results:
        print(f"\nQuery: {results['query']}\n")
    pp = pprint.PrettyPrinter(indent=4)

    for doc in results["documents"]:
        content = doc.content
        # The content can be None (or not text at all): only strings are shortened.
        if max_text_len and isinstance(content, str) and len(content) > max_text_len:
            content = content[:max_text_len] + "..."

        # Use a dedicated local instead of rebinding the `results` parameter.
        summary = {"content": content}
        if print_name:
            summary["name"] = (doc.meta or {}).get("name", None)
        if print_meta:
            summary["meta"] = doc.meta
        pp.pprint(summary)
        print()
|
||||
|
||||
|
||||
def print_questions(results: dict):
    """
    Utility to print the output of a question generating pipeline in a readable format.

    Two shapes are understood:
    - a 'generated_questions' key: a list of entries, each with a 'questions' list;
    - a 'results' key: a list of pairs, each with a 'query' and a list of 'answers'
      (objects exposing an `answer` attribute).

    :param results: the output of a question generating pipeline.
    :raises ValueError: if `results` contains neither 'generated_questions' nor 'results'.
    :return: None
    """
    if "generated_questions" in results:
        print("\nGenerated questions:")
        for result in results["generated_questions"]:
            for question in result["questions"]:
                print(f" - {question}")

    elif "results" in results:
        print("\nGenerated pairs:")
        for pair in results["results"]:
            print(f" - Q:{pair['query']}")
            for answer in pair["answers"]:
                print(f" A: {answer.answer}")

    else:
        raise ValueError("This object does not seem to be the output "
                         "of a question generating pipeline: it contains neither "
                         f"'generated_questions' nor 'results', but only: {results.keys()}. "
                         "Try `print_answers` or `print_documents`.")
|
||||
|
||||
|
||||
def export_answers_to_csv(agg_results: list, output_file):
|
||||
|
@ -547,7 +547,7 @@
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"source": [
|
||||
"class QueryClassifier(BaseComponent):\n",
|
||||
"class CustomQueryClassifier(BaseComponent):\n",
|
||||
" outgoing_edges = 2\n",
|
||||
"\n",
|
||||
" def run(self, query: str):\n",
|
||||
@ -558,7 +558,7 @@
|
||||
"\n",
|
||||
"# Here we build the pipeline\n",
|
||||
"p_classifier = Pipeline()\n",
|
||||
"p_classifier.add_node(component=QueryClassifier(), name=\"QueryClassifier\", inputs=[\"Query\"])\n",
|
||||
"p_classifier.add_node(component=CustomQueryClassifier(), name=\"QueryClassifier\", inputs=[\"Query\"])\n",
|
||||
"p_classifier.add_node(component=es_retriever, name=\"ESRetriever\", inputs=[\"QueryClassifier.output_1\"])\n",
|
||||
"p_classifier.add_node(component=dpr_retriever, name=\"DPRRetriever\", inputs=[\"QueryClassifier.output_2\"])\n",
|
||||
"p_classifier.add_node(component=reader, name=\"QAReader\", inputs=[\"ESRetriever\", \"DPRRetriever\"])\n",
|
||||
|
@ -2,7 +2,7 @@ from haystack.utils import clean_wiki_text, print_answers, print_documents, fetc
|
||||
from pprint import pprint
|
||||
from haystack import Pipeline
|
||||
from haystack.document_stores import ElasticsearchDocumentStore
|
||||
from haystack.nodes import ElasticsearchRetriever, DensePassageRetriever, FARMReader, RAGenerator, JoinDocuments
|
||||
from haystack.nodes import ElasticsearchRetriever, DensePassageRetriever, FARMReader, RAGenerator, BaseComponent, JoinDocuments
|
||||
from haystack.pipelines import ExtractiveQAPipeline, DocumentSearchPipeline, GenerativeQAPipeline
|
||||
|
||||
|
||||
@ -35,33 +35,44 @@ def tutorial11_pipelines():
|
||||
|
||||
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
|
||||
|
||||
######################
|
||||
# Prebuilt Pipelines #
|
||||
######################
|
||||
print()
|
||||
print("######################")
|
||||
print("# Prebuilt Pipelines #")
|
||||
print("######################")
|
||||
|
||||
# Extractive QA Pipeline
|
||||
########################
|
||||
print()
|
||||
print("# Extractive QA Pipeline")
|
||||
print("########################")
|
||||
|
||||
query="Who is the father of Arya Stark?"
|
||||
p_extractive_premade = ExtractiveQAPipeline(reader=reader, retriever=es_retriever)
|
||||
res = p_extractive_premade.run(
|
||||
query="Who is the father of Arya Stark?",
|
||||
query=query,
|
||||
params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}},
|
||||
)
|
||||
print_answers(res, details="minimal")
|
||||
print("\nQuery: ", query)
|
||||
print("Answers:")
|
||||
print_answers(res, details="minimum")
|
||||
|
||||
# Document Search Pipeline
|
||||
##########################
|
||||
|
||||
print()
|
||||
print("# Document Search Pipeline")
|
||||
print("##########################")
|
||||
|
||||
query="Who is the father of Arya Stark?"
|
||||
p_retrieval = DocumentSearchPipeline(es_retriever)
|
||||
res = p_retrieval.run(
|
||||
query="Who is the father of Arya Stark?",
|
||||
query=query,
|
||||
params={"Retriever": {"top_k": 10}},
|
||||
|
||||
)
|
||||
print()
|
||||
print_documents(res, max_text_len=200)
|
||||
|
||||
# Generator Pipeline
|
||||
##########################
|
||||
|
||||
print()
|
||||
print("# Generator Pipeline")
|
||||
print("####################")
|
||||
|
||||
# We set this to True so that the document store returns document embeddings
|
||||
# with each document, this is needed by the Generator
|
||||
@ -73,11 +84,12 @@ def tutorial11_pipelines():
|
||||
# Generative QA
|
||||
p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=dpr_retriever)
|
||||
res = p_generator.run(
|
||||
query="Who is the father of Arya Stark?",
|
||||
query=query,
|
||||
params={"Retriever": {"top_k": 10}},
|
||||
|
||||
)
|
||||
print_answers(res, details="minimal")
|
||||
print()
|
||||
print_answers(res, details="minimum")
|
||||
|
||||
# We are setting this to False so that in later pipelines,
|
||||
# we get a cleaner printout
|
||||
@ -91,12 +103,14 @@ def tutorial11_pipelines():
|
||||
p_retrieval.draw("pipeline_retrieval.png")
|
||||
p_generator.draw("pipeline_generator.png")
|
||||
|
||||
####################
|
||||
# Custom Pipelines #
|
||||
####################
|
||||
print()
|
||||
print("####################")
|
||||
print("# Custom Pipelines #")
|
||||
print("####################")
|
||||
|
||||
# Extractive QA Pipeline
|
||||
########################
|
||||
print()
|
||||
print("# Extractive QA Pipeline")
|
||||
print("########################")
|
||||
|
||||
# Custom built extractive QA pipeline
|
||||
p_extractive = Pipeline()
|
||||
@ -104,16 +118,21 @@ def tutorial11_pipelines():
|
||||
p_extractive.add_node(component=reader, name="Reader", inputs=["Retriever"])
|
||||
|
||||
# Now we can run it
|
||||
query="Who is the father of Arya Stark?"
|
||||
res = p_extractive.run(
|
||||
query="Who is the father of Arya Stark?",
|
||||
query=query,
|
||||
params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}},
|
||||
)
|
||||
print_answers(res, details="minimal")
|
||||
print("\nQuery: ", query)
|
||||
print("Answers:")
|
||||
print_answers(res, details="minimum")
|
||||
p_extractive.draw("pipeline_extractive.png")
|
||||
|
||||
# Ensembled Retriever Pipeline
|
||||
##############################
|
||||
|
||||
print()
|
||||
print("# Ensembled Retriever Pipeline")
|
||||
print("##############################")
|
||||
|
||||
# Create ensembled pipeline
|
||||
p_ensemble = Pipeline()
|
||||
p_ensemble.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"])
|
||||
@ -123,22 +142,27 @@ def tutorial11_pipelines():
|
||||
p_ensemble.draw("pipeline_ensemble.png")
|
||||
|
||||
# Run pipeline
|
||||
query="Who is the father of Arya Stark?"
|
||||
res = p_ensemble.run(
|
||||
query="Who is the father of Arya Stark?",
|
||||
params={"ESRetriever": {"top_k": 5}, "DPRRetriever": {"top_k": 5}},
|
||||
|
||||
)
|
||||
print_answers(res, details="minimal")
|
||||
print("\nQuery: ", query)
|
||||
print("Answers:")
|
||||
print_answers(res, details="minimum")
|
||||
|
||||
# Query Classification Pipeline
|
||||
###############################
|
||||
|
||||
print()
|
||||
print("# Query Classification Pipeline")
|
||||
print("###############################")
|
||||
|
||||
# Decision Nodes help you route your data so that only certain branches of your `Pipeline` are run.
|
||||
# Though this looks very similar to the ensembled pipeline shown above,
|
||||
# the key difference is that only one of the retrievers is run for each request.
|
||||
# By contrast both retrievers are always run in the ensembled approach.
|
||||
|
||||
class QueryClassifier():
|
||||
class CustomQueryClassifier(BaseComponent):
|
||||
outgoing_edges = 2
|
||||
|
||||
def run(self, query):
|
||||
@ -149,25 +173,32 @@ def tutorial11_pipelines():
|
||||
|
||||
# Here we build the pipeline
|
||||
p_classifier = Pipeline()
|
||||
p_classifier.add_node(component=QueryClassifier(), name="QueryClassifier", inputs=["Query"])
|
||||
p_classifier.add_node(component=CustomQueryClassifier(), name="QueryClassifier", inputs=["Query"])
|
||||
p_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_1"])
|
||||
p_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_2"])
|
||||
p_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"])
|
||||
p_classifier.draw("pipeline_classifier.png")
|
||||
|
||||
# Run only the dense retriever on the full sentence query
|
||||
query="Who is the father of Arya Stark?"
|
||||
res_1 = p_classifier.run(
|
||||
query="Who is the father of Arya Stark?",
|
||||
query=query,
|
||||
)
|
||||
print("DPR Results" + "\n" + "="*15)
|
||||
print_answers(res_1)
|
||||
print()
|
||||
print("\nQuery: ", query)
|
||||
print(" * DPR Answers:")
|
||||
print_answers(res_1, details="minimum")
|
||||
|
||||
|
||||
# Run only the sparse retriever on a keyword based query
|
||||
query="Arya Stark father"
|
||||
res_2 = p_classifier.run(
|
||||
query="Arya Stark father",
|
||||
query=query,
|
||||
)
|
||||
print("ES Results" + "\n" + "="*15)
|
||||
print_answers(res_2)
|
||||
print()
|
||||
print("\nQuery: ", query)
|
||||
print(" * ES Answers:")
|
||||
print_answers(res_2, details="minimum")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -91,7 +91,6 @@ def tutorial12_lfqa():
|
||||
print(f"Query: {query_2}")
|
||||
print(f"Answer: {result_2['answers'][0]}")
|
||||
print()
|
||||
pipe.run(query=query_2, params={"Retriever": {"top_k": 1}})
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -66,7 +66,7 @@
|
||||
"from haystack.nodes import QuestionGenerator, ElasticsearchRetriever, FARMReader\n",
|
||||
"from haystack.document_stores import ElasticsearchDocumentStore\n",
|
||||
"from haystack.pipelines import QuestionGenerationPipeline, RetrieverQuestionGenerationPipeline, QuestionAnswerGenerationPipeline\n",
|
||||
"from haystack.utils import launch_es"
|
||||
"from haystack.utils import launch_es, print_questions\n"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
@ -188,9 +188,11 @@
|
||||
"execution_count": null,
|
||||
"source": [
|
||||
"question_generation_pipeline = QuestionGenerationPipeline(question_generator)\n",
|
||||
"for document in document_store:\n",
|
||||
"for idx, document in enumerate(document_store):\n",
|
||||
" \n",
|
||||
" print(f\"\\n * Generating questions for document {idx}: {document.content[:100]}...\\n\")\n",
|
||||
" result = question_generation_pipeline.run(documents=[document])\n",
|
||||
" pprint(result)"
|
||||
" print_questions(result)"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
@ -220,8 +222,10 @@
|
||||
"source": [
|
||||
"retriever = ElasticsearchRetriever(document_store=document_store)\n",
|
||||
"rqg_pipeline = RetrieverQuestionGenerationPipeline(retriever, question_generator)\n",
|
||||
"\n",
|
||||
"print(f\"\\n * Generating questions for documents matching the query 'Arya Stark'\\n\")\n",
|
||||
"result = rqg_pipeline.run(query=\"Arya Stark\")\n",
|
||||
"pprint(result)"
|
||||
"print_questions(result)"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
@ -252,9 +256,11 @@
|
||||
"source": [
|
||||
"reader = FARMReader(\"deepset/roberta-base-squad2\")\n",
|
||||
"qag_pipeline = QuestionAnswerGenerationPipeline(question_generator, reader)\n",
|
||||
"for document in tqdm(document_store):\n",
|
||||
"for idx, document in enumerate(tqdm(document_store)):\n",
|
||||
"\n",
|
||||
" print(f\"\\n * Generating questions and answers for document {idx}: {document.content[:100]}...\\n\")\n",
|
||||
" result = qag_pipeline.run(documents=[document])\n",
|
||||
" pprint(result)"
|
||||
" print_questions(result)"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
|
@ -3,7 +3,7 @@ from pprint import pprint
|
||||
from haystack.nodes import QuestionGenerator, ElasticsearchRetriever, FARMReader
|
||||
from haystack.document_stores import ElasticsearchDocumentStore
|
||||
from haystack.pipelines import QuestionGenerationPipeline, RetrieverQuestionGenerationPipeline, QuestionAnswerGenerationPipeline
|
||||
from haystack.utils import launch_es
|
||||
from haystack.utils import launch_es, print_questions
|
||||
|
||||
"""
|
||||
This is a bare bones tutorial showing what is possible with the QuestionGenerator Node which automatically generates
|
||||
@ -34,20 +34,31 @@ which the the document can answer.
|
||||
"""
|
||||
|
||||
# QuestionGenerationPipeline
|
||||
print("\nQuestionGenerationPipeline")
|
||||
print("==========================")
|
||||
|
||||
question_generation_pipeline = QuestionGenerationPipeline(question_generator)
|
||||
for document in document_store:
|
||||
result = question_generation_pipeline.run(documents=[document])
|
||||
pprint(result)
|
||||
for idx, document in enumerate(document_store):
|
||||
|
||||
print(f"\n * Generating questions for document {idx}: {document.content[:100]}...\n")
|
||||
result = question_generation_pipeline.run(documents=[document])
|
||||
print_questions(result)
|
||||
|
||||
"""
|
||||
This pipeline takes a query as input. It retrieves relevant documents and then generates questions based on these.
|
||||
"""
|
||||
|
||||
# RetrieverQuestionGenerationPipeline
|
||||
print("\nRetrieverQuestionGenerationPipeline")  # leading newline separates this section title from earlier output
|
||||
print("==================================")
|
||||
|
||||
retriever = ElasticsearchRetriever(document_store=document_store)
|
||||
rqg_pipeline = RetrieverQuestionGenerationPipeline(retriever, question_generator)
|
||||
|
||||
print(f"\n * Generating questions for documents matching the query 'Arya Stark'\n")
|
||||
result = rqg_pipeline.run(query="Arya Stark")
|
||||
pprint(result)
|
||||
print_questions(result)
|
||||
|
||||
|
||||
"""
|
||||
This pipeline takes a document as input, generates questions on it, and attempts to answer these questions using
|
||||
@ -55,11 +66,17 @@ a Reader model
|
||||
"""
|
||||
|
||||
# QuestionAnswerGenerationPipeline
|
||||
print("\nQuestionAnswerGenerationPipeline")  # leading newline separates this section title from earlier output
|
||||
print("===============================")
|
||||
|
||||
reader = FARMReader("deepset/roberta-base-squad2")
|
||||
qag_pipeline = QuestionAnswerGenerationPipeline(question_generator, reader)
|
||||
for document in tqdm(document_store):
|
||||
for idx, document in enumerate(tqdm(document_store)):
|
||||
|
||||
print(f"\n * Generating questions and answers for document {idx}: {document.content[:100]}...\n")
|
||||
result = qag_pipeline.run(documents=[document])
|
||||
pprint(result)
|
||||
print_questions(result)
|
||||
|
||||
|
||||
# This Haystack script was made with love by deepset in Berlin, Germany
|
||||
# Haystack: https://github.com/deepset-ai/haystack
|
||||
|
@ -1567,14 +1567,14 @@
|
||||
" query=\"Who is the father of Arya Stark?\"\n",
|
||||
")\n",
|
||||
"print(\"DPR Results\" + \"\\n\" + \"=\"*15)\n",
|
||||
"print_answers(res_1)\n",
|
||||
"print_answers(res_1, details=\"minimum\")\n",
|
||||
"\n",
|
||||
"# Run only the sparse retriever on a keyword based query\n",
|
||||
"res_2 = sklearn_keyword_classifier.run(\n",
|
||||
" query=\"arya stark father\"\n",
|
||||
")\n",
|
||||
"print(\"ES Results\" + \"\\n\" + \"=\"*15)\n",
|
||||
"print_answers(res_2)\n"
|
||||
"print_answers(res_2, details=\"minimum\")\n"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
@ -1591,14 +1591,14 @@
|
||||
" query=\"which country was jon snow filmed ?\"\n",
|
||||
")\n",
|
||||
"print(\"DPR Results\" + \"\\n\" + \"=\"*15)\n",
|
||||
"print_answers(res_3)\n",
|
||||
"print_answers(res_3, details=\"minimum\")\n",
|
||||
"\n",
|
||||
"# Run only the sparse retriever on a keyword based query\n",
|
||||
"res_4 = sklearn_keyword_classifier.run(\n",
|
||||
" query=\"jon snow country\"\n",
|
||||
")\n",
|
||||
"print(\"ES Results\" + \"\\n\" + \"=\"*15)\n",
|
||||
"print_answers(res_4)"
|
||||
"print_answers(res_4, details=\"minimum\")"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
@ -1614,14 +1614,14 @@
|
||||
" query=\"who are the younger brothers of arya stark ?\"\n",
|
||||
")\n",
|
||||
"print(\"DPR Results\" + \"\\n\" + \"=\"*15)\n",
|
||||
"print_answers(res_5)\n",
|
||||
"print_answers(res_5, details=\"minimum\")\n",
|
||||
"\n",
|
||||
"# Run only the sparse retriever on a keyword based query\n",
|
||||
"res_6 = sklearn_keyword_classifier.run(\n",
|
||||
" query=\"arya stark younger brothers\"\n",
|
||||
")\n",
|
||||
"print(\"ES Results\" + \"\\n\" + \"=\"*15)\n",
|
||||
"print_answers(res_6)"
|
||||
"print_answers(res_6, details=\"minimum\")"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
@ -1670,14 +1670,14 @@
|
||||
" query=\"Who is the father of Arya Stark?\"\n",
|
||||
")\n",
|
||||
"print(\"DPR Results\" + \"\\n\" + \"=\"*15)\n",
|
||||
"print_answers(res_1)\n",
|
||||
"print_answers(res_1, details=\"minimum\")\n",
|
||||
"\n",
|
||||
"# Run only the sparse retriever on a keyword based query\n",
|
||||
"res_2 = transformer_keyword_classifier.run(\n",
|
||||
" query=\"arya stark father\"\n",
|
||||
")\n",
|
||||
"print(\"ES Results\" + \"\\n\" + \"=\"*15)\n",
|
||||
"print_answers(res_2)\n"
|
||||
"print_answers(res_2, details=\"minimum\")\n"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
@ -1694,14 +1694,14 @@
|
||||
" query=\"which country was jon snow filmed ?\"\n",
|
||||
")\n",
|
||||
"print(\"DPR Results\" + \"\\n\" + \"=\"*15)\n",
|
||||
"print_answers(res_3)\n",
|
||||
"print_answers(res_3, details=\"minimum\")\n",
|
||||
"\n",
|
||||
"# Run only the sparse retriever on a keyword based query\n",
|
||||
"res_4 = transformer_keyword_classifier.run(\n",
|
||||
" query=\"jon snow country\"\n",
|
||||
")\n",
|
||||
"print(\"ES Results\" + \"\\n\" + \"=\"*15)\n",
|
||||
"print_answers(res_4)"
|
||||
"print_answers(res_4, details=\"minimum\")"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
@ -1717,14 +1717,14 @@
|
||||
" query=\"who are the younger brothers of arya stark ?\"\n",
|
||||
")\n",
|
||||
"print(\"DPR Results\" + \"\\n\" + \"=\"*15)\n",
|
||||
"print_answers(res_5)\n",
|
||||
"print_answers(res_5, details=\"minimum\")\n",
|
||||
"\n",
|
||||
"# Run only the sparse retriever on a keyword based query\n",
|
||||
"res_6 = transformer_keyword_classifier.run(\n",
|
||||
" query=\"arya stark younger brothers\"\n",
|
||||
")\n",
|
||||
"print(\"ES Results\" + \"\\n\" + \"=\"*15)\n",
|
||||
"print_answers(res_6)"
|
||||
"print_answers(res_6, details=\"minimum\")"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
@ -1771,14 +1771,14 @@
|
||||
" query=\"Who is the father of Arya Stark?\"\n",
|
||||
")\n",
|
||||
"print(\"DPR Results\" + \"\\n\" + \"=\"*15)\n",
|
||||
"print_answers(res_1)\n",
|
||||
"print_answers(res_1, details=\"minimum\")\n",
|
||||
"\n",
|
||||
"# Show only DPR results\n",
|
||||
"res_2 = transformer_question_classifier.run(\n",
|
||||
" query=\"Arya Stark was the daughter of a Lord.\"\n",
|
||||
")\n",
|
||||
"print(\"ES Results\" + \"\\n\" + \"=\"*15)\n",
|
||||
"res_2"
|
||||
"print_answers(res_2, details=\"minimum\")"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
|
@ -35,7 +35,9 @@ def tutorial14_query_classifier():
|
||||
|
||||
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
|
||||
|
||||
|
||||
print()
|
||||
print("Sklearn keyword classifier")
|
||||
print("==========================")
|
||||
# Here we build the pipeline
|
||||
sklearn_keyword_classifier = Pipeline()
|
||||
sklearn_keyword_classifier.add_node(component=SklearnQueryClassifier(), name="QueryClassifier", inputs=["Query"])
|
||||
@ -48,44 +50,53 @@ def tutorial14_query_classifier():
|
||||
res_1 = sklearn_keyword_classifier.run(
|
||||
query="Who is the father of Arya Stark?",
|
||||
)
|
||||
print("\n===============================")
|
||||
print("DPR Results" + "\n" + "="*15)
|
||||
print_answers(res_1)
|
||||
print_answers(res_1, details="minimum")
|
||||
|
||||
# Run only the sparse retriever on a keyword based query
|
||||
res_2 = sklearn_keyword_classifier.run(
|
||||
query="arya stark father",
|
||||
)
|
||||
print("\n===============================")
|
||||
print("ES Results" + "\n" + "="*15)
|
||||
print_answers(res_2)
|
||||
print_answers(res_2, details="minimum")
|
||||
|
||||
# Run only the dense retriever on the full sentence query
|
||||
res_3 = sklearn_keyword_classifier.run(
|
||||
query="which country was jon snow filmed ?",
|
||||
)
|
||||
print("\n===============================")
|
||||
print("DPR Results" + "\n" + "="*15)
|
||||
print_answers(res_3)
|
||||
print_answers(res_3, details="minimum")
|
||||
|
||||
# Run only the sparse retriever on a keyword based query
|
||||
res_4 = sklearn_keyword_classifier.run(
|
||||
query="jon snow country",
|
||||
)
|
||||
print("\n===============================")
|
||||
print("ES Results" + "\n" + "="*15)
|
||||
print_answers(res_4)
|
||||
print_answers(res_4, details="minimum")
|
||||
|
||||
# Run only the dense retriever on the full sentence query
|
||||
res_5 = sklearn_keyword_classifier.run(
|
||||
query="who are the younger brothers of arya stark ?",
|
||||
)
|
||||
print("\n===============================")
|
||||
print("DPR Results" + "\n" + "="*15)
|
||||
print_answers(res_5)
|
||||
print_answers(res_5, details="minimum")
|
||||
|
||||
# Run only the sparse retriever on a keyword based query
|
||||
res_6 = sklearn_keyword_classifier.run(
|
||||
query="arya stark younger brothers",
|
||||
)
|
||||
print("\n===============================")
|
||||
print("ES Results" + "\n" + "="*15)
|
||||
print_answers(res_6)
|
||||
print_answers(res_6, details="minimum")
|
||||
|
||||
print()
|
||||
print("Transformer keyword classifier")
|
||||
print("==============================")
|
||||
# Here we build the pipeline
|
||||
transformer_keyword_classifier = Pipeline()
|
||||
transformer_keyword_classifier.add_node(component=TransformersQueryClassifier(), name="QueryClassifier", inputs=["Query"])
|
||||
@ -98,43 +109,53 @@ def tutorial14_query_classifier():
|
||||
res_1 = transformer_keyword_classifier.run(
|
||||
query="Who is the father of Arya Stark?",
|
||||
)
|
||||
print("\n===============================")
|
||||
print("DPR Results" + "\n" + "="*15)
|
||||
print_answers(res_1)
|
||||
print_answers(res_1, details="minimum")
|
||||
|
||||
# Run only the sparse retriever on a keyword based query
|
||||
res_2 = transformer_keyword_classifier.run(
|
||||
query="arya stark father",
|
||||
)
|
||||
print("\n===============================")
|
||||
print("ES Results" + "\n" + "="*15)
|
||||
print_answers(res_2)
|
||||
print_answers(res_2, details="minimum")
|
||||
|
||||
# Run only the dense retriever on the full sentence query
|
||||
res_3 = transformer_keyword_classifier.run(
|
||||
query="which country was jon snow filmed ?",
|
||||
)
|
||||
print("\n===============================")
|
||||
print("DPR Results" + "\n" + "="*15)
|
||||
print_answers(res_3)
|
||||
print_answers(res_3, details="minimum")
|
||||
|
||||
# Run only the sparse retriever on a keyword based query
|
||||
res_4 = transformer_keyword_classifier.run(
|
||||
query="jon snow country",
|
||||
)
|
||||
print("\n===============================")
|
||||
print("ES Results" + "\n" + "="*15)
|
||||
print_answers(res_4)
|
||||
print_answers(res_4, details="minimum")
|
||||
|
||||
# Run only the dense retriever on the full sentence query
|
||||
res_5 = transformer_keyword_classifier.run(
|
||||
query="who are the younger brothers of arya stark ?",
|
||||
)
|
||||
print("\n===============================")
|
||||
print("DPR Results" + "\n" + "="*15)
|
||||
print_answers(res_5)
|
||||
print_answers(res_5, details="minimum")
|
||||
|
||||
# Run only the sparse retriever on a keyword based query
|
||||
res_6 = transformer_keyword_classifier.run(
|
||||
query="arya stark younger brothers",
|
||||
)
|
||||
print("\n===============================")
|
||||
print("ES Results" + "\n" + "="*15)
|
||||
print_answers(res_6)
|
||||
print_answers(res_6, details="minimum")
|
||||
|
||||
print()
|
||||
print("Transformer question classifier")
|
||||
print("===============================")
|
||||
|
||||
# Here we build the pipeline
|
||||
transformer_question_classifier = Pipeline()
|
||||
@ -147,15 +168,17 @@ def tutorial14_query_classifier():
|
||||
res_1 = transformer_question_classifier.run(
|
||||
query="Who is the father of Arya Stark?",
|
||||
)
|
||||
print("\n===============================")
|
||||
print("DPR Results" + "\n" + "="*15)
|
||||
print_answers(res_1)
|
||||
print_answers(res_1, details="minimum")
|
||||
|
||||
# Show only DPR results
|
||||
res_2 = transformer_question_classifier.run(
|
||||
query="Arya Stark was the daughter of a Lord.",
|
||||
)
|
||||
print("\n===============================")
|
||||
print("ES Results" + "\n" + "="*15)
|
||||
res_2
|
||||
print_answers(res_2, details="minimum")
|
||||
|
||||
# Here we create the keyword vs question/statement query classifier
|
||||
|
||||
|
@ -368,7 +368,38 @@
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"source": [
|
||||
"print_answers(prediction, details=\"minimal\")"
|
||||
"# Now you can either print the object directly...\n",
|
||||
"from pprint import pprint\n",
|
||||
"\n",
|
||||
"pprint(prediction)\n",
|
||||
"\n",
|
||||
"# Sample output: \n",
|
||||
"# {\n",
|
||||
"# 'answers': [ <Answer: answer='Eddard', type='extractive', score=0.9919578731060028, offsets_in_document=[{'start': 608, 'end': 615}], offsets_in_context=[{'start': 72, 'end': 79}], document_id='cc75f739897ecbf8c14657b13dda890e', meta={'name': '454_Music_of_Game_of_Thrones.txt'}}, context='...' >,\n",
|
||||
"# <Answer: answer='Ned', type='extractive', score=0.9767240881919861, offsets_in_document=[{'start': 3687, 'end': 3801}], offsets_in_context=[{'start': 18, 'end': 132}], document_id='9acf17ec9083c4022f69eb4a37187080', meta={'name': '454_Music_of_Game_of_Thrones.txt'}}, context='...' >,\n",
|
||||
"# ...\n",
|
||||
"# ]\n",
|
||||
"# 'documents': [ <Document: content_type='text', score=0.8034909798951382, meta={'name': '332_Sansa_Stark.txt'}, embedding=None, id=d1f36ec7170e4c46cde65787fe125dfe', content='\\n===\\'\\'A Game of Thrones\\'\\'===\\nSansa Stark begins the novel by being betrothed to Crown ...'>,\n",
|
||||
"# <Document: content_type='text', score=0.8002150354529785, meta={'name': '191_Gendry.txt'}, embedding=None, id='dd4e070a22896afa81748d6510006d2', 'content='\\n===Season 2===\\nGendry travels North with Yoren and other Night's Watch recruits, including Arya ...'>,\n",
|
||||
"# ...\n",
|
||||
"# ],\n",
|
||||
"# 'no_ans_gap': 11.688868522644043,\n",
|
||||
"# 'node_id': 'Reader',\n",
|
||||
"# 'params': {'Reader': {'top_k': 5}, 'Retriever': {'top_k': 5}},\n",
|
||||
"# 'query': 'Who is the father of Arya Stark?',\n",
|
||||
"# 'root_node': 'Query'\n",
|
||||
"# }\n"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"source": [
|
||||
"# ...or use a util to simplify the output\n",
|
||||
"# Change `minimum` to `medium` or `all` to raise the level of detail\n",
|
||||
"print_answers(prediction, details=\"minimum\")"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
|
@ -134,7 +134,37 @@ def tutorial1_basic_qa_pipeline():
|
||||
# prediction = pipe.run(query="Who created the Dothraki vocabulary?", params={"Reader": {"top_k": 5}})
|
||||
# prediction = pipe.run(query="Who is the sister of Sansa?", params={"Reader": {"top_k": 5}})
|
||||
|
||||
print_answers(prediction, details="minimal")
|
||||
# Now you can either print the object directly
|
||||
print("\n\nRaw object:\n")
|
||||
from pprint import pprint
|
||||
pprint(prediction)
|
||||
|
||||
# Sample output:
|
||||
# {
|
||||
# 'answers': [ <Answer: answer='Eddard', type='extractive', score=0.9919578731060028, offsets_in_document=[{'start': 608, 'end': 615}], offsets_in_context=[{'start': 72, 'end': 79}], document_id='cc75f739897ecbf8c14657b13dda890e', meta={'name': '454_Music_of_Game_of_Thrones.txt'}, context='...' >,
|
||||
# <Answer: answer='Ned', type='extractive', score=0.9767240881919861, offsets_in_document=[{'start': 3687, 'end': 3801}], offsets_in_context=[{'start': 18, 'end': 132}], document_id='9acf17ec9083c4022f69eb4a37187080', meta={'name': '454_Music_of_Game_of_Thrones.txt'}, context='...' >,
|
||||
# ...
|
||||
# ],
|
||||
# 'documents': [ <Document: content_type='text', score=0.8034909798951382, meta={'name': '332_Sansa_Stark.txt'}, embedding=None, id='d1f36ec7170e4c46cde65787fe125dfe', content='\n===\'\'A Game of Thrones\'\'===\nSansa Stark begins the novel by being betrothed to Crown ...'>,
|
||||
# <Document: content_type='text', score=0.8002150354529785, meta={'name': '191_Gendry.txt'}, embedding=None, id='dd4e070a22896afa81748d6510006d2', content='\n===Season 2===\nGendry travels North with Yoren and other Night's Watch recruits, including Arya ...'>,
|
||||
# ...
|
||||
# ],
|
||||
# 'no_ans_gap': 11.688868522644043,
|
||||
# 'node_id': 'Reader',
|
||||
# 'params': {'Reader': {'top_k': 5}, 'Retriever': {'top_k': 5}},
|
||||
# 'query': 'Who is the father of Arya Stark?',
|
||||
# 'root_node': 'Query'
|
||||
# }
|
||||
|
||||
# Note that the documents contained in the above object are the documents filtered by the Retriever from
|
||||
# the document store. Although the answers were extracted from these documents, it's possible that many
|
||||
# answers were taken from a single one of them, and that some of the documents were not the source of any answer.
|
||||
|
||||
# Or use a util to simplify the output
|
||||
# Change `minimum` to `medium` or `all` to raise the level of detail
|
||||
print("\n\nSimplified output:\n")
|
||||
print_answers(prediction, details="minimum")
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -358,11 +358,42 @@
|
||||
"outputs": [],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"source": [
|
||||
"# Now you can either print the object directly...\n",
|
||||
"from pprint import pprint\n",
|
||||
"\n",
|
||||
"pprint(prediction)\n",
|
||||
"\n",
|
||||
"# Sample output: \n",
|
||||
"# {\n",
|
||||
"# 'answers': [ <Answer: answer='Eddard', type='extractive', score=0.9919578731060028, offsets_in_document=[{'start': 608, 'end': 615}], offsets_in_context=[{'start': 72, 'end': 79}], document_id='cc75f739897ecbf8c14657b13dda890e', meta={'name': '454_Music_of_Game_of_Thrones.txt'}}, context='...' >,\n",
|
||||
"# <Answer: answer='Ned', type='extractive', score=0.9767240881919861, offsets_in_document=[{'start': 3687, 'end': 3801}], offsets_in_context=[{'start': 18, 'end': 132}], document_id='9acf17ec9083c4022f69eb4a37187080', meta={'name': '454_Music_of_Game_of_Thrones.txt'}}, context='...' >,\n",
|
||||
"# ...\n",
|
||||
"# ]\n",
|
||||
"# 'documents': [ <Document: content_type='text', score=0.8034909798951382, meta={'name': '332_Sansa_Stark.txt'}, embedding=None, id=d1f36ec7170e4c46cde65787fe125dfe', content='\\n===\\'\\'A Game of Thrones\\'\\'===\\nSansa Stark begins the novel by being betrothed to Crown ...'>,\n",
|
||||
"# <Document: content_type='text', score=0.8002150354529785, meta={'name': '191_Gendry.txt'}, embedding=None, id='dd4e070a22896afa81748d6510006d2', 'content='\\n===Season 2===\\nGendry travels North with Yoren and other Night's Watch recruits, including Arya ...'>,\n",
|
||||
"# ...\n",
|
||||
"# ],\n",
|
||||
"# 'no_ans_gap': 11.688868522644043,\n",
|
||||
"# 'node_id': 'Reader',\n",
|
||||
"# 'params': {'Reader': {'top_k': 5}, 'Retriever': {'top_k': 5}},\n",
|
||||
"# 'query': 'Who is the father of Arya Stark?',\n",
|
||||
"# 'root_node': 'Query'\n",
|
||||
"# }"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"source": [
|
||||
"print_answers(prediction, details=\"minimal\")"
|
||||
"# ...or use a util to simplify the output\n",
|
||||
"# Change `minimum` to `medium` or `all` to raise the level of detail\n",
|
||||
"print_answers(prediction, details=\"minimum\")"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -101,7 +101,36 @@ def tutorial3_basic_qa_pipeline_without_elasticsearch():
|
||||
# prediction = pipe.run(query="Who created the Dothraki vocabulary?", params={"Reader": {"top_k": 5}})
|
||||
# prediction = pipe.run(query="Who is the sister of Sansa?", params={"Reader": {"top_k": 5}})
|
||||
|
||||
print_answers(prediction, details="minimal")
|
||||
# Now you can either print the object directly
|
||||
print("\n\nRaw object:\n")
|
||||
from pprint import pprint
|
||||
pprint(prediction)
|
||||
|
||||
# Sample output:
|
||||
# {
|
||||
# 'answers': [ <Answer: answer='Eddard', type='extractive', score=0.9919578731060028, offsets_in_document=[{'start': 608, 'end': 615}], offsets_in_context=[{'start': 72, 'end': 79}], document_id='cc75f739897ecbf8c14657b13dda890e', meta={'name': '454_Music_of_Game_of_Thrones.txt'}, context='...' >,
|
||||
# <Answer: answer='Ned', type='extractive', score=0.9767240881919861, offsets_in_document=[{'start': 3687, 'end': 3801}], offsets_in_context=[{'start': 18, 'end': 132}], document_id='9acf17ec9083c4022f69eb4a37187080', meta={'name': '454_Music_of_Game_of_Thrones.txt'}, context='...' >,
|
||||
# ...
|
||||
# ],
|
||||
# 'documents': [ <Document: content_type='text', score=0.8034909798951382, meta={'name': '332_Sansa_Stark.txt'}, embedding=None, id='d1f36ec7170e4c46cde65787fe125dfe', content='\n===\'\'A Game of Thrones\'\'===\nSansa Stark begins the novel by being betrothed to Crown ...'>,
|
||||
# <Document: content_type='text', score=0.8002150354529785, meta={'name': '191_Gendry.txt'}, embedding=None, id='dd4e070a22896afa81748d6510006d2', content='\n===Season 2===\nGendry travels North with Yoren and other Night's Watch recruits, including Arya ...'>,
|
||||
# ...
|
||||
# ],
|
||||
# 'no_ans_gap': 11.688868522644043,
|
||||
# 'node_id': 'Reader',
|
||||
# 'params': {'Reader': {'top_k': 5}, 'Retriever': {'top_k': 5}},
|
||||
# 'query': 'Who is the father of Arya Stark?',
|
||||
# 'root_node': 'Query'
|
||||
# }
|
||||
|
||||
# Note that the documents contained in the above object are the documents filtered by the Retriever from
|
||||
# the document store. Although the answers were extracted from these documents, it's possible that many
|
||||
# answers were taken from a single one of them, and that some of the documents were not the source of any answer.
|
||||
|
||||
# Or use a util to simplify the output
|
||||
# Change `minimum` to `medium` or `all` to raise the level of detail
|
||||
print("\n\nSimplified output:\n")
|
||||
print_answers(prediction, details="minimum")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -265,12 +265,10 @@
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"source": [
|
||||
"from haystack.utils import print_answers\n",
|
||||
"\n",
|
||||
"prediction = pipe.run(query=\"How is the virus spreading?\", params={\"Retriever\": {\"top_k\": 10}})\n",
|
||||
"for a in prediction[\"answers\"]:\n",
|
||||
" print(f\"Answer: {a.answer}\")\n",
|
||||
" print(f\"Question: {a.meta['query']}\")\n",
|
||||
" print(f\"Score: {a.score}\")\n",
|
||||
" print(\"---------------------\")"
|
||||
"print_answers(prediction, details=\"medium\")"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
|
@ -1,7 +1,7 @@
|
||||
from haystack.document_stores import ElasticsearchDocumentStore
|
||||
|
||||
from haystack.nodes import EmbeddingRetriever
|
||||
from haystack.utils import launch_es
|
||||
from haystack.utils import launch_es, print_answers
|
||||
import pandas as pd
|
||||
import requests
|
||||
import logging
|
||||
@ -66,17 +66,13 @@ def tutorial4_faq_style_qa():
|
||||
docs_to_index = df.to_dict(orient="records")
|
||||
document_store.write_documents(docs_to_index)
|
||||
|
||||
# Initialize a Pipeline (this time without a reader) and ask questions
|
||||
# Initialize a Pipeline (this time without a reader) and ask questions
|
||||
|
||||
from haystack.pipelines import FAQPipeline
|
||||
pipe = FAQPipeline(retriever=retriever)
|
||||
|
||||
prediction = pipe.run(query="How is the virus spreading?", params={"Retriever": {"top_k": 10}})
|
||||
for a in prediction["answers"]:
|
||||
print(f"Answer: {a.answer}")
|
||||
print(f"Question: {a.meta['query']}")
|
||||
print(f"Score: {a.score}")
|
||||
print("---------------------")
|
||||
print_answers(prediction, details="medium")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -67,7 +67,7 @@ def tutorial6_better_retrieval_via_dpr():
|
||||
# prediction = pipe.run(query="Who created the Dothraki vocabulary?", params={"Reader": {"top_k": 5}})
|
||||
# prediction = pipe.run(query="Who is the sister of Sansa?", params={"Reader": {"top_k": 5}})
|
||||
|
||||
print_answers(prediction, details="minimal")
|
||||
print_answers(prediction, details="minimum")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -330,11 +330,12 @@
|
||||
"source": [
|
||||
"# Or alternatively use the Pipeline class\n",
|
||||
"from haystack.pipelines import GenerativeQAPipeline\n",
|
||||
"from haystack.utils import print_answers\n",
|
||||
"\n",
|
||||
"pipe = GenerativeQAPipeline(generator=generator, retriever=retriever)\n",
|
||||
"for question in QUESTIONS:\n",
|
||||
" res = pipe.run(query=question, params={\"Generator\": {\"top_k\": 1}, \"Retriever\": {\"top_k\": 5}})\n",
|
||||
" print(res)"
|
||||
" print_answers(res, details=\"minimum\")"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
|
@ -4,6 +4,7 @@ import pandas as pd
|
||||
from haystack import Document
|
||||
from haystack.document_stores import FAISSDocumentStore
|
||||
from haystack.nodes import RAGenerator, DensePassageRetriever
|
||||
from haystack.utils import print_answers
|
||||
|
||||
|
||||
def tutorial7_rag_generator():
|
||||
@ -35,7 +36,6 @@ def tutorial7_rag_generator():
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
# Initialize FAISS document store to hold documents and the corresponding index for embeddings
|
||||
# Set `return_embedding` to `True`, so generator doesn't have to perform re-embedding
|
||||
document_store = FAISSDocumentStore(
|
||||
@ -108,14 +108,14 @@ def tutorial7_rag_generator():
|
||||
|
||||
# Print your answer
|
||||
answers = predicted_result["answers"]
|
||||
print(f'Generated answer is \'{answers[0]["answer"]}\' for the question = \'{question}\'')
|
||||
print(f' -> Generated answer is \'{answers[0]["answer"]}\' for the question = \'{question}\'')
|
||||
|
||||
# Or alternatively use the Pipeline class
|
||||
from haystack.pipelines import GenerativeQAPipeline
|
||||
pipe = GenerativeQAPipeline(generator=generator, retriever=retriever)
|
||||
for question in QUESTIONS:
|
||||
res = pipe.run(query=question, params={"Generator": {"top_k": 1}, "Retriever": {"top_k": 5}})
|
||||
print(res)
|
||||
print_answers(res, details="minimum")
|
||||
|
||||
if __name__ == "__main__":
|
||||
tutorial7_rag_generator()
|
||||
|
@ -80,7 +80,7 @@ def tutorial8_preprocessing():
|
||||
split_respect_sentence_boundary=True
|
||||
)
|
||||
docs_default = preprocessor.process(doc_txt)
|
||||
print(f"n_docs_input: 1\nn_docs_output: {len(docs_default)}")
|
||||
print(f"\nn_docs_input: 1\nn_docs_output: {len(docs_default)}")
|
||||
|
||||
"""
|
||||
## Cleaning
|
||||
@ -101,13 +101,14 @@ def tutorial8_preprocessing():
|
||||
preprocessor_nrsb = PreProcessor(split_respect_sentence_boundary=False)
|
||||
docs_nrsb = preprocessor_nrsb.process(doc_txt)
|
||||
|
||||
print("RESPECTING SENTENCE BOUNDARY")
|
||||
print("\nRESPECTING SENTENCE BOUNDARY:")
|
||||
end_text = docs_default[0]["content"][-50:]
|
||||
print("End of document: \"..." + end_text + "\"")
|
||||
print()
|
||||
print("NOT RESPECTING SENTENCE BOUNDARY")
|
||||
|
||||
print("\nNOT RESPECTING SENTENCE BOUNDARY:")
|
||||
end_text_nrsb = docs_nrsb[0]["content"][-50:]
|
||||
print("End of document: \"..." + end_text_nrsb + "\"")
|
||||
print()
|
||||
|
||||
"""
|
||||
A commonly used strategy to split long documents, especially in the field of Question Answering,
|
||||
|
Loading…
x
Reference in New Issue
Block a user