Add QuestionGenerator (#1267)
* Create basic Question Generation
* Split texts into 50 word chunks
* Allow prompt to be changed
* Implement iteration functionality in DS
* Add docstrings, create pipelines
* Make pipelines work
* Add comments
* Add tests
* Add tutorials and docs
* Add doc string

This commit is contained in:
parent 363be65a78
commit 937247d628
45 docs/_src/usage/usage/question_generator.md Normal file
@@ -0,0 +1,45 @@
<!---
title: "Question Generator"
metaTitle: "Question Generator"
metaDescription: ""
slug: "/docs/question_generator"
date: "2020-11-05"
id: "questiongeneratormd"
--->

# Question Generator

<div class="recommendation">

**Running examples**

Have a look at our [tutorial notebook](/docs/latest/tutorial13md) if you'd like to start trying out Question Generation straight away!

</div>

The Question Generation module is used to generate SQuAD-style questions on a given document.

This module is useful when it comes to labelling in a new domain. It can be used to quickly generate questions for an
annotator to answer. If used in conjunction with a trained Reader model, you can automatically generate question-answer
pairs. High-impact annotations can then be created by having a human annotator look over these pairs and correct the wrong predictions.

Question generation is also a good way to make large documents more navigable. Generated questions can
quickly give the user a sense of what information is contained within the document, thus acting as a kind of summarization.

To initialize a question generator, simply call:

```python
from haystack.question_generator import QuestionGenerator

question_generator = QuestionGenerator()
```

This loads the [`valhalla/t5-base-e2e-qg`](https://huggingface.co/valhalla/t5-base-e2e-qg) model by default, which is a T5 model trained on SQuAD for question generation.

To run the node in isolation, simply use the `generate()` method:

```python
result = question_generator.generate(text="Nirvana was an American rock band formed in Aberdeen, Washington in 1987.")
```

Otherwise, the node can be used in a pipeline, where its `run()` method will be called.
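For example, a minimal sketch of running the node inside the `QuestionGenerationPipeline` that ships with this feature (the output keys follow the node's `run()` implementation):

```python
from haystack import Document
from haystack.pipeline import QuestionGenerationPipeline
from haystack.question_generator import QuestionGenerator

question_generator = QuestionGenerator()
pipeline = QuestionGenerationPipeline(question_generator)

document = Document(text="Nirvana was an American rock band formed in Aberdeen, Washington in 1987.")
result = pipeline.run(documents=[document])

# One entry per input document, each holding the questions generated for it
print(result["generated_questions"][0]["questions"])
```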
@@ -22,6 +22,7 @@ class BaseDocumentStore(BaseComponent):
    label_index: Optional[str]
    similarity: Optional[str]
    duplicate_documents_options: tuple = ('skip', 'overwrite', 'fail')
    ids_iterator = None

    @abstractmethod
    def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None,
@@ -66,6 +67,20 @@ class BaseDocumentStore(BaseComponent):
        """
        pass

    def __iter__(self):
        if not self.ids_iterator:
            self.ids_iterator = [x.id for x in self.get_all_documents()]
        return self

    def __next__(self):
        if len(self.ids_iterator) == 0:
            raise StopIteration
        else:
            curr_id = self.ids_iterator[0]
            ret = self.get_document_by_id(curr_id)
            self.ids_iterator = self.ids_iterator[1:]
            return ret

    @abstractmethod
    def get_all_labels(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> List[Label]:
        pass
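These two methods make any document store directly iterable, yielding one `Document` at a time. A minimal usage sketch, assuming an `ElasticsearchDocumentStore` that already contains documents (the tutorial below relies on exactly this pattern):

```python
from haystack.document_store import ElasticsearchDocumentStore

document_store = ElasticsearchDocumentStore()  # assumes a running Elasticsearch instance with documents written

# __iter__() gathers all document ids once; __next__() then fetches one Document per id
for document in document_store:
    print(document.id, document.text[:50])
```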
@@ -8,6 +8,7 @@ from pathlib import Path
from typing import List, Optional, Dict, Union, Any
import pickle
import urllib
from functools import wraps

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline

@@ -591,6 +592,70 @@ class TranslationWrapperPipeline(BaseStandardPipeline):
        return output


class QuestionGenerationPipeline(BaseStandardPipeline):
    """
    A simple pipeline that takes documents as input and generates
    questions that it thinks can be answered by the documents.
    """
    def __init__(self, question_generator):
        self.pipeline = Pipeline()
        self.pipeline.add_node(component=question_generator, name="QuestionGenerator", inputs=["Query"])

    def run(self, documents, **kwargs):
        kwargs["documents"] = documents
        output = self.pipeline.run(**kwargs)
        return output


class RetrieverQuestionGenerationPipeline(BaseStandardPipeline):
    """
    A simple pipeline that takes a query as input, performs retrieval, and then generates
    questions that it thinks can be answered by the retrieved documents.
    """
    def __init__(self, retriever, question_generator):
        self.pipeline = Pipeline()
        self.pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
        self.pipeline.add_node(component=question_generator, name="Question Generator", inputs=["Retriever"])

    def run(self, query, **kwargs):
        kwargs["query"] = query
        output = self.pipeline.run(**kwargs)
        return output


class QuestionAnswerGenerationPipeline(BaseStandardPipeline):
    """
    This is a pipeline which takes a document as input, generates questions that the model thinks can be answered by
    this document, and then performs question answering on these questions using that single document.
    """
    def __init__(self, question_generator, reader):
        question_generator.run = self.formatting_wrapper(question_generator.run)
        # Overwrite reader.run so it can handle the batch of questions passed on by the QuestionGenerator
        reader.run = reader.run_batch
        self.pipeline = Pipeline()
        self.pipeline.add_node(component=question_generator, name="QuestionGenerator", inputs=["Query"])
        self.pipeline.add_node(component=reader, name="Reader", inputs=["QuestionGenerator"])

    # This is used to format the output of the QuestionGenerator so that its questions are ready to be answered by the Reader
    def formatting_wrapper(self, fn):
        @wraps(fn)
        def wrapper(*args, **kwargs):
            output, output_stream = fn(*args, **kwargs)
            questions = output["generated_questions"][0]["questions"]
            documents = output["documents"]
            query_doc_list = []
            for q in questions:
                query_doc_list.append({"queries": q, "docs": documents})
            kwargs["query_doc_list"] = query_doc_list
            return kwargs, output_stream
        return wrapper

    def run(self, document, **kwargs):
        kwargs["documents"] = [document]
        output = self.pipeline.run(**kwargs)
        return output


class RootNode:
    outgoing_edges = 1
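A usage sketch for the new `QuestionAnswerGenerationPipeline`, assuming the models download successfully; the result shape follows `BaseReader.run_batch()` (shown further down): one result dict per generated question.

```python
from haystack import Document
from haystack.pipeline import QuestionAnswerGenerationPipeline
from haystack.question_generator import QuestionGenerator
from haystack.reader import FARMReader

question_generator = QuestionGenerator()
reader = FARMReader("deepset/roberta-base-squad2")
qag_pipeline = QuestionAnswerGenerationPipeline(question_generator, reader)

document = Document(text="Nirvana was an American rock band formed in Aberdeen, Washington in 1987.")
results = qag_pipeline.run(document=document)

# Each entry holds one generated question and the Reader's answers to it
for r in results:
    if r["answers"]:
        print(r["query"], "->", r["answers"][0]["answer"])
```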
1 haystack/question_generator/__init__.py Normal file
@@ -0,0 +1 @@
from haystack.question_generator.question_generator import QuestionGenerator
104 haystack/question_generator/question_generator.py Normal file
@@ -0,0 +1,104 @@
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from haystack import BaseComponent
from haystack.preprocessor import PreProcessor


class QuestionGenerator(BaseComponent):
    """The Question Generator takes only a document as input and outputs questions that it thinks can be
    answered by this document. In our current implementation, input texts are split into chunks of 50 words
    with a 10-word overlap. This is because the default model `valhalla/t5-base-e2e-qg` seems to generate only
    about 3 questions per passage, regardless of length. Our approach prioritizes the creation of more questions
    over processing efficiency (T5 is able to digest much more than 50 words at once). The returned questions
    generally come in an order dictated by the order of their answers, i.e. earlier questions in the list generally
    come from earlier in the document."""

    outgoing_edges = 1

    def __init__(self,
                 model_name_or_path="valhalla/t5-base-e2e-qg",
                 model_version=None,
                 num_beams=4,
                 max_length=256,
                 no_repeat_ngram_size=3,
                 length_penalty=1.5,
                 early_stopping=True,
                 split_length=50,
                 split_overlap=10,
                 prompt="generate questions:"):
        """
        Uses the valhalla/t5-base-e2e-qg model by default. This class supports any question generation model that is
        implemented as a Seq2SeqLM in HuggingFace Transformers. Note that this style of question generation (where the only input
        is a document) is sometimes referred to as end-to-end question generation. Answer-supervised question
        generation is not currently supported.
        """
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        self.set_config(
            model_name_or_path=model_name_or_path, model_version=model_version,
            max_length=max_length, num_beams=num_beams, no_repeat_ngram_size=no_repeat_ngram_size,
            length_penalty=length_penalty, early_stopping=early_stopping, split_length=split_length,
            split_overlap=split_overlap
        )
        self.num_beams = num_beams
        self.max_length = max_length
        self.no_repeat_ngram_size = no_repeat_ngram_size
        self.length_penalty = length_penalty
        self.early_stopping = early_stopping
        self.split_length = split_length
        self.split_overlap = split_overlap
        self.preprocessor = PreProcessor()
        self.prompt = prompt

    def run(self, **kwargs):
        documents = kwargs["documents"]
        generated_questions = []
        for d in documents:
            questions = self.generate(d.text)
            curr_dict = {"document_id": d.id,
                         "document_sample": d.text[:200],
                         "questions": questions}
            generated_questions.append(curr_dict)
        output = {"generated_questions": generated_questions, **kwargs}
        return output, "output_1"

    def generate(self, text):
        # Perform splitting because T5 has a maximum input length
        # Also, the model currently seems to generate only about 3 questions, all for the beginning section of the text
        split_texts_dict = self.preprocessor.split(
            document={"text": text},
            split_by="word",
            split_respect_sentence_boundary=False,
            split_overlap=self.split_overlap,
            split_length=self.split_length
        )
        split_texts = [x["text"] for x in split_texts_dict]
        ret = []
        for split_text in split_texts:
            if self.prompt not in split_text:
                split_text = self.prompt + " " + split_text
            tokenized = self.tokenizer([split_text], return_tensors="pt")
            input_ids = tokenized["input_ids"]
            attention_mask = tokenized["attention_mask"]  # necessary if padding is enabled so the model won't attend to pad tokens
            tokens_output = self.model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                num_beams=self.num_beams,
                max_length=self.max_length,
                no_repeat_ngram_size=self.no_repeat_ngram_size,
                length_penalty=self.length_penalty,
                early_stopping=self.early_stopping,
            )

            string_output = self.tokenizer.decode(tokens_output[0])
            string_output = string_output.replace("<pad>", "").replace("</s>", "")
            questions_string = string_output.split("<sep>")
            questions = [x for x in questions_string if x]

            # Doing this instead of using a set to maintain order, since the generated questions seem to have answers
            # that occur in order in the text
            for q in questions:
                if q not in ret:
                    ret.append(q)
        return ret
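For orientation, a sketch of what `run()` returns, with keys taken from the implementation above (the generated questions themselves depend on the model):

```python
from haystack import Document
from haystack.question_generator import QuestionGenerator

question_generator = QuestionGenerator()
docs = [Document(text="Python is an interpreted, high-level, general-purpose programming language.")]

# run() returns the output dict plus the name of the outgoing edge
output, edge = question_generator.run(documents=docs)

# output["generated_questions"] holds one entry per input document:
#   {"document_id": ..., "document_sample": <first 200 chars>, "questions": [...]}
print(output["generated_questions"][0]["questions"])
```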
@@ -2,7 +2,7 @@ import numpy as np
from scipy.special import expit
from abc import ABC, abstractmethod
from copy import deepcopy
-from typing import List, Optional, Sequence
+from typing import List, Optional, Sequence, Dict
from functools import wraps
from time import perf_counter

@@ -69,6 +69,21 @@ class BaseReader(BaseComponent):
        results.update(**kwargs)
        return results, "output_1"

    def run_batch(self, query_doc_list: List[Dict], top_k_reader: Optional[int] = None, **kwargs):
        """An unoptimized implementation of running Reader queries in batch."""
        self.query_count += len(query_doc_list)
        results = []
        if query_doc_list:
            for qd in query_doc_list:
                q = qd["queries"]
                docs = qd["docs"]
                predict = self.timing(self.predict, "query_time")
                result = predict(query=q, documents=docs, top_k=top_k_reader)
                results.append(result)
        else:
            results = [{"answers": [], "query": ""}]
        return results, "output_1"

    def timing(self, fn, attr_name):
        """Wrapper method used to time functions."""
        @wraps(fn)
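For clarity, a sketch of the `query_doc_list` structure that `run_batch()` consumes, assuming an initialized Reader and a list of `Document` objects named `documents`; the example questions are hypothetical. This mirrors what `QuestionAnswerGenerationPipeline.formatting_wrapper()` builds above.

```python
query_doc_list = [
    {"queries": "Who created Python?", "docs": documents},
    {"queries": "When was Python first released?", "docs": documents},
]

# Returns one result dict per query, plus the outgoing edge name
results, edge = reader.run_batch(query_doc_list=query_doc_list, top_k_reader=3)
```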
@@ -32,6 +32,7 @@ from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.summarizer.transformers import TransformersSummarizer
from haystack.translator import TransformersTranslator
from haystack.question_generator import QuestionGenerator


def pytest_addoption(parser):
@@ -233,6 +234,11 @@ def rag_generator():
    )


@pytest.fixture(scope="module")
def question_generator():
    return QuestionGenerator(model_name_or_path="valhalla/t5-small-e2e-qg")


@pytest.fixture(scope="module")
def eli5_generator():
    return Seq2SeqGenerator(model_name_or_path="yjernite/bart_eli5")
36 test/test_question_generator.py Normal file
@@ -0,0 +1,36 @@
from haystack.pipeline import QuestionAnswerGenerationPipeline, QuestionGenerationPipeline, RetrieverQuestionGenerationPipeline
from haystack import Document
import pytest


text = 'The Living End are an Australian punk rockabilly band from Melbourne, formed in 1994. Since 2002, the line-up consists of Chris Cheney (vocals, guitar), Scott Owen (double bass, vocals), and Andy Strachan (drums). The band rose to fame in 1997 after the release of their EP Second Solution / Prisoner of Society, which peaked at No. 4 on the Australian ARIA Singles Chart. They have released eight studio albums, two of which reached the No. 1 spot on the ARIA Albums Chart: The Living End (October 1998) and State of Emergency (February 2006). They have also achieved chart success in the U.S. and the United Kingdom. The Band was nominated 27 times and won five awards at the Australian ARIA Music Awards ceremonies: "Highest Selling Single" for Second Solution / Prisoner of Society (1998), "Breakthrough Artist – Album" and "Best Group" for The Living End (1999), as well as "Best Rock Album" for White Noise (2008) and The Ending Is Just the Beginning Repeating (2011). In October 2010, their debut album was listed in the book "100 Best Australian Albums". Australian musicologist Ian McFarlane described the group as "one of Australia’s premier rock acts. By blending a range of styles (punk, rockabilly and flat out rock) with great success, The Living End has managed to produce anthemic choruses and memorable songs in abundance".'
document = Document(text=text)
query = "Living End"


def test_qg_pipeline(question_generator):
    p = QuestionGenerationPipeline(question_generator)
    result = p.run(documents=[document])
    keys = list(result)
    assert "generated_questions" in keys
    assert len(result["generated_questions"][0]["questions"]) > 0


@pytest.mark.parametrize("retriever,document_store", [("elasticsearch", "elasticsearch")], indirect=True)
def test_rqg_pipeline(question_generator, retriever):
    retriever.document_store.write_documents([{"text": text}])
    p = RetrieverQuestionGenerationPipeline(retriever, question_generator)
    result = p.run(query)
    keys = list(result)
    assert "generated_questions" in keys
    assert len(result["generated_questions"][0]["questions"]) > 0


@pytest.mark.parametrize("reader", ["farm"], indirect=True)
def test_qag_pipeline(question_generator, reader):
    p = QuestionAnswerGenerationPipeline(question_generator, reader)
    result = p.run(document=document)
    assert len(result) > 0
    assert result[0]["query"]
    assert len(result[0]["answers"]) > 0
    assert result[0]["answers"][0]["answer"]
213 tutorials/Tutorial13_Question_generation.ipynb Normal file
@@ -0,0 +1,213 @@
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "collapsed": true,
        "pycharm": {
          "name": "#%% md\n"
        }
      },
      "source": [
        "# Question Generation\n",
        "\n",
        "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial13_Question_generation.ipynb)\n",
        "\n",
        "This is a bare-bones tutorial showing what is possible with the QuestionGenerator Nodes and Pipelines, which automatically\n",
        "generate questions which the question generation model thinks can be answered by a given document."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "outputs": [],
      "source": [
        "from haystack.question_generator import QuestionGenerator\n",
        "from haystack.utils import launch_es\n",
        "from haystack.document_store import ElasticsearchDocumentStore\n",
        "from haystack.retriever import ElasticsearchRetriever\n",
        "from pprint import pprint\n",
        "from haystack.reader import FARMReader\n",
        "from tqdm import tqdm\n",
        "from haystack.pipeline import QuestionGenerationPipeline, RetrieverQuestionGenerationPipeline, QuestionAnswerGenerationPipeline"
      ],
      "metadata": {
        "collapsed": false,
        "pycharm": {
          "name": "#%%\n"
        }
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "outputs": [],
      "source": [
        "# Start Elasticsearch service via Docker\n",
        "launch_es()\n",
        "\n",
        "text1 = \"Python is an interpreted, high-level, general-purpose programming language. Created by Guido van Rossum and first released in 1991, Python's design philosophy emphasizes code readability with its notable use of significant whitespace.\"\n",
        "text2 = \"Princess Arya Stark is the third child and second daughter of Lord Eddard Stark and his wife, Lady Catelyn Stark. She is the sister of the incumbent Westerosi monarchs, Sansa, Queen in the North, and Brandon, King of the Andals and the First Men. After narrowly escaping the persecution of House Stark by House Lannister, Arya is trained as a Faceless Man at the House of Black and White in Braavos, using her abilities to avenge her family. Upon her return to Westeros, she exacts retribution for the Red Wedding by exterminating the Frey male line.\"\n",
        "text3 = \"Dry Cleaning are an English post-punk band who formed in South London in 2018.[3] The band is composed of vocalist Florence Shaw, guitarist Tom Dowse, bassist Lewis Maynard and drummer Nick Buxton. They are noted for their use of spoken word primarily in lieu of sung vocals, as well as their unconventional lyrics. Their musical stylings have been compared to Wire, Magazine and Joy Division.[4] The band released their debut single, 'Magic of Meghan' in 2019. Shaw wrote the song after going through a break-up and moving out of her former partner's apartment the same day that Meghan Markle and Prince Harry announced they were engaged.[5] This was followed by the release of two EPs that year: Sweet Princess in August and Boundary Road Snacks and Drinks in October. The band were included as part of the NME 100 of 2020,[6] as well as DIY magazine's Class of 2020.[7] The band signed to 4AD in late 2020 and shared a new single, 'Scratchcard Lanyard'.[8] In February 2021, the band shared details of their debut studio album, New Long Leg. They also shared the single 'Strong Feelings'.[9] The album, which was produced by John Parish, was released on 2 April 2021.[10]\"\n",
        "\n",
        "docs = [{\"text\": text1},\n",
        "        {\"text\": text2},\n",
        "        {\"text\": text3}]\n",
        "\n",
        "# Initialize document store and write in the documents\n",
        "document_store = ElasticsearchDocumentStore()\n",
        "document_store.write_documents(docs)\n",
        "\n",
        "# Initialize Question Generator\n",
        "question_generator = QuestionGenerator()"
      ],
      "metadata": {
        "collapsed": false,
        "pycharm": {
          "name": "#%%\n"
        }
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Question Generation Pipeline\n",
        "\n",
        "The most basic version of a question generator pipeline takes a document as input and outputs generated questions\n",
        "which the document can answer."
      ],
      "metadata": {
        "collapsed": false,
        "pycharm": {
          "name": "#%% md\n"
        }
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "outputs": [],
      "source": [
        "question_generation_pipeline = QuestionGenerationPipeline(question_generator)\n",
        "for document in document_store:\n",
        "    result = question_generation_pipeline.run(documents=[document])\n",
        "    pprint(result)"
      ],
      "metadata": {
        "collapsed": false,
        "pycharm": {
          "name": "#%%\n"
        }
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Retriever Question Generation Pipeline\n",
        "\n",
        "This pipeline takes a query as input. It retrieves relevant documents and then generates questions based on these."
      ],
      "metadata": {
        "collapsed": false,
        "pycharm": {
          "name": "#%% md\n"
        }
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "outputs": [],
      "source": [
        "retriever = ElasticsearchRetriever(document_store=document_store)\n",
        "rqg_pipeline = RetrieverQuestionGenerationPipeline(retriever, question_generator)\n",
        "result = rqg_pipeline.run(query=\"Arya Stark\")\n",
        "pprint(result)"
      ],
      "metadata": {
        "collapsed": false,
        "pycharm": {
          "name": "#%%\n"
        }
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Question Answer Generation Pipeline\n",
        "\n",
        "This pipeline takes a document as input, generates questions on it, and attempts to answer these questions using\n",
        "a Reader model."
      ],
      "metadata": {
        "collapsed": false,
        "pycharm": {
          "name": "#%% md\n"
        }
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "outputs": [],
      "source": [
        "reader = FARMReader(\"deepset/roberta-base-squad2\")\n",
        "qag_pipeline = QuestionAnswerGenerationPipeline(question_generator, reader)\n",
        "for document in tqdm(document_store):\n",
        "    result = qag_pipeline.run(document=document)\n",
        "    pprint(result)"
      ],
      "metadata": {
        "collapsed": false,
        "pycharm": {
          "name": "#%%\n"
        }
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## About us\n",
        "\n",
        "This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany\n",
        "\n",
        "We bring NLP to the industry via open source!\n",
        "Our focus: Industry specific language models & large scale QA systems.\n",
        "\n",
        "Some of our other work:\n",
        "- [German BERT](https://deepset.ai/german-bert)\n",
        "- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad)\n",
        "- [FARM](https://github.com/deepset-ai/FARM)\n",
        "\n",
        "Get in touch:\n",
        "[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai)\n",
        "\n",
        "By the way: [we're hiring!](https://apply.workable.com/deepset/)"
      ],
      "metadata": {
        "collapsed": false
      }
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 2
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython2",
      "version": "2.7.6"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
68 tutorials/Tutorial13_Question_generation.py Normal file
@@ -0,0 +1,68 @@
from haystack.question_generator import QuestionGenerator
from haystack.utils import launch_es
from haystack.document_store import ElasticsearchDocumentStore
from haystack.retriever import ElasticsearchRetriever
from pprint import pprint
from haystack.reader import FARMReader
from tqdm import tqdm
from haystack.pipeline import QuestionGenerationPipeline, RetrieverQuestionGenerationPipeline, QuestionAnswerGenerationPipeline

"""
This is a bare-bones tutorial showing what is possible with the QuestionGenerator Node, which automatically generates
questions which the model thinks can be answered by a given document.
"""

# Start Elasticsearch service via Docker
launch_es()

text1 = "Python is an interpreted, high-level, general-purpose programming language. Created by Guido van Rossum and first released in 1991, Python's design philosophy emphasizes code readability with its notable use of significant whitespace."
text2 = "Princess Arya Stark is the third child and second daughter of Lord Eddard Stark and his wife, Lady Catelyn Stark. She is the sister of the incumbent Westerosi monarchs, Sansa, Queen in the North, and Brandon, King of the Andals and the First Men. After narrowly escaping the persecution of House Stark by House Lannister, Arya is trained as a Faceless Man at the House of Black and White in Braavos, using her abilities to avenge her family. Upon her return to Westeros, she exacts retribution for the Red Wedding by exterminating the Frey male line."
text3 = "Dry Cleaning are an English post-punk band who formed in South London in 2018.[3] The band is composed of vocalist Florence Shaw, guitarist Tom Dowse, bassist Lewis Maynard and drummer Nick Buxton. They are noted for their use of spoken word primarily in lieu of sung vocals, as well as their unconventional lyrics. Their musical stylings have been compared to Wire, Magazine and Joy Division.[4] The band released their debut single, 'Magic of Meghan' in 2019. Shaw wrote the song after going through a break-up and moving out of her former partner's apartment the same day that Meghan Markle and Prince Harry announced they were engaged.[5] This was followed by the release of two EPs that year: Sweet Princess in August and Boundary Road Snacks and Drinks in October. The band were included as part of the NME 100 of 2020,[6] as well as DIY magazine's Class of 2020.[7] The band signed to 4AD in late 2020 and shared a new single, 'Scratchcard Lanyard'.[8] In February 2021, the band shared details of their debut studio album, New Long Leg. They also shared the single 'Strong Feelings'.[9] The album, which was produced by John Parish, was released on 2 April 2021.[10]"

docs = [{"text": text1},
        {"text": text2},
        {"text": text3}]

# Initialize document store and write in the documents
document_store = ElasticsearchDocumentStore()
document_store.write_documents(docs)

# Initialize Question Generator
question_generator = QuestionGenerator()

"""
The most basic version of a question generator pipeline takes a document as input and outputs generated questions
which the document can answer.
"""

# QuestionGenerationPipeline
question_generation_pipeline = QuestionGenerationPipeline(question_generator)
for document in document_store:
    result = question_generation_pipeline.run(documents=[document])
    pprint(result)

"""
This pipeline takes a query as input. It retrieves relevant documents and then generates questions based on these.
"""

# RetrieverQuestionGenerationPipeline
retriever = ElasticsearchRetriever(document_store=document_store)
rqg_pipeline = RetrieverQuestionGenerationPipeline(retriever, question_generator)
result = rqg_pipeline.run(query="Arya Stark")
pprint(result)

"""
This pipeline takes a document as input, generates questions on it, and attempts to answer these questions using
a Reader model.
"""

# QuestionAnswerGenerationPipeline
reader = FARMReader("deepset/roberta-base-squad2")
qag_pipeline = QuestionAnswerGenerationPipeline(question_generator, reader)
for document in tqdm(document_store):
    result = qag_pipeline.run(document=document)
    pprint(result)

# This Haystack script was made with love by deepset in Berlin, Germany
# Haystack: https://github.com/deepset-ai/haystack
# deepset: https://deepset.ai/