mirror of https://github.com/deepset-ai/haystack.git
fix: strip whitespaces safely from FARMReader's answers (#3526)

* remove .strip()
* check for right-side offset
* return the whitespace-cleaned answer
* lstrip, not rstrip :D
* remove int
* left_offset
* slightly refactor reader fixture
* extend test_output
parent e6b7109164
commit 43b24fd1a7
@@ -201,13 +201,18 @@ class QACandidate:
         # final_text can be an empty string if start_t points to the very final token of the passage
         # final_text can be a whitespace if there is a whitespace token in the text, e.g.,
         # if the original text contained multiple consecutive whitespaces
-        if len(final_text.strip()) > 0:
-            final_text = final_text.strip()
-        else:
+        cleaned_final_text = final_text.strip()
+        if not cleaned_final_text:
             return "", 0, 0
-        end_ch = int(start_ch + len(final_text))
+
+        # Adjust the offsets in case of whitespace at the beginning of the answer
+        left_offset = len(final_text) - len(final_text.lstrip())
+        if left_offset:
+            start_ch = start_ch + left_offset
+
+        end_ch = start_ch + len(cleaned_final_text)
 
-        return final_text, start_ch, end_ch
+        return cleaned_final_text, start_ch, end_ch
 
     def to_doc_level(self, start: int, end: int):
         """
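The heart of the patch is the offset bookkeeping above: stripping whitespace from the answer text must move the character offsets with it. A minimal, self-contained sketch of the same logic (plain Python; the function name is illustrative, not Haystack's API):

def strip_answer_span(final_text, start_ch):
    # Offset-safe strip: remove surrounding whitespace from the answer while
    # keeping start/end character positions consistent with the passage.
    cleaned_final_text = final_text.strip()
    if not cleaned_final_text:
        return "", 0, 0
    # Leading whitespace shifts the start offset to the right by its length ...
    left_offset = len(final_text) - len(final_text.lstrip())
    start_ch = start_ch + left_offset
    # ... and the end offset is recomputed from the cleaned length, which
    # also absorbs any trailing whitespace.
    end_ch = start_ch + len(cleaned_final_text)
    return cleaned_final_text, start_ch, end_ch

# " Carla " starting at character 10: the cleaned span is characters 11-16.
assert strip_answer_span(" Carla ", 10) == ("Carla", 11, 16)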
@@ -720,40 +720,6 @@ def indexing_document_classifier():
     )
 
 
-# TODO Fix bug in test_no_answer_output when using
-# @pytest.fixture(params=["farm", "transformers"])
-@pytest.fixture(params=["farm"])
-def no_answer_reader(request):
-    if request.param == "farm":
-        return FARMReader(
-            model_name_or_path="deepset/bert-medium-squad2-distilled",
-            use_gpu=False,
-            top_k_per_sample=5,
-            no_ans_boost=0,
-            return_no_answer=True,
-            num_processes=0,
-        )
-    if request.param == "transformers":
-        return TransformersReader(
-            model_name_or_path="deepset/bert-medium-squad2-distilled",
-            tokenizer="deepset/bert-medium-squad2-distilled",
-            use_gpu=-1,
-            top_k_per_candidate=5,
-        )
-
-
-@pytest.fixture
-def prediction(reader, docs):
-    prediction = reader.predict(query="Who lives in Berlin?", documents=docs, top_k=5)
-    return prediction
-
-
-@pytest.fixture
-def no_answer_prediction(no_answer_reader, docs):
-    prediction = no_answer_reader.predict(query="What is the meaning of life?", documents=docs, top_k=5)
-    return prediction
-
-
 @pytest.fixture(params=["es_filter_only", "elasticsearch", "dpr", "embedding", "tfidf", "table_text_retriever"])
 def retriever(request, document_store):
     return get_retriever(request.param, document_store)
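The fixtures removed here move into the reader test module (see the hunks below), so the reader tests no longer depend on conftest.py. For reference, a minimal sketch of the parametrized-fixture mechanism that no_answer_reader relies on (illustrative names, not part of the commit):

import pytest

@pytest.fixture(params=["farm"])  # a second param would run every dependent test twice
def backend(request):
    # request.param takes each value from params in turn
    return request.param

def test_backend(backend):
    assert backend == "farm"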
@@ -7,14 +7,21 @@ from haystack.modeling.infer import QAInferencer
 from haystack.modeling.data_handler.inputs import QAInput, Question
 
 
+DOC_TEXT = """Twilight Princess was released to universal critical acclaim and commercial success. \
+It received perfect scores from major publications such as 1UP.com, Computer and Video Games, \
+Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators \
+GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii \
+version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called \
+it one of the greatest games ever created."""
+
+
 @pytest.fixture()
 def span_inference_result(bert_base_squad2, caplog=None):
     if caplog:
         caplog.set_level(logging.CRITICAL)
     obj_input = [
         QAInput(
-            doc_text="Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created.",
-            questions=Question("Who counted the game among the best ever made?", uid="best_id_ever"),
+            doc_text=DOC_TEXT, questions=Question("Who counted the game among the best ever made?", uid="best_id_ever")
         )
     ]
     result = bert_base_squad2.inference_from_objects(obj_input, return_json=False)[0]
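A detail worth noting in the DOC_TEXT refactor above: each trailing backslash is a line continuation inside the string literal, so no newline characters are inserted and the constant stays identical to the removed one-line string (given the spacing shown). A quick standalone check:

# The backslash before each line break suppresses the newline, so the
# fragments are joined into a single-line string.
s = """one \
two"""
assert s == "one two"
assert "\n" not in s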
@@ -27,7 +34,13 @@ def no_answer_inference_result(bert_base_squad2, caplog=None):
         caplog.set_level(logging.CRITICAL)
     obj_input = [
         QAInput(
-            doc_text='The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by Peru with 13%, Colombia with 10%, and with minor amounts in Venezuela, Ecuador, Bolivia, Guyana, Suriname and French Guiana. States or departments in four nations contain "Amazonas" in their names. The Amazon represents over half of the planet\'s remaining rainforests, and comprises the largest and most biodiverse tract of tropical rainforest in the world, with an estimated 390 billion individual trees divided into 16,000 species.',
+            doc_text="""\
+The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by
+Peru with 13%, Colombia with 10%, and with minor amounts in Venezuela, Ecuador, Bolivia, Guyana,
+Suriname and French Guiana. States or departments in four nations contain "Amazonas" in their names.
+The Amazon represents over half of the planet's remaining rainforests, and comprises the largest
+and most biodiverse tract of tropical rainforest in the world, with an estimated 390 billion individual
+trees divided into 16,000 species.""",
             questions=Question(
                 "The Amazon represents less than half of the planets remaining what?", uid="best_id_ever"
             ),
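By contrast, the Amazon passage above escapes only the first line break (the backslash right after the opening quotes), so the remaining line breaks stay part of the string; unlike DOC_TEXT, this literal is not identical to the removed one-line string. An illustration of the semantics:

# Only the newline immediately after the opening quotes is suppressed;
# later line breaks remain in the string.
t = """\
one
two"""
assert t == "one\ntwo"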
@@ -38,17 +51,9 @@
 
 
 def test_inference_different_inputs(bert_base_squad2):
-    qa_format_1 = [
-        {
-            "questions": ["Who counted the game among the best ever made?"],
-            "text": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created.",
-        }
-    ]
+    qa_format_1 = [{"questions": ["Who counted the game among the best ever made?"], "text": DOC_TEXT}]
     q = Question(text="Who counted the game among the best ever made?")
-    qa_format_2 = QAInput(
-        questions=[q],
-        doc_text="Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created.",
-    )
+    qa_format_2 = QAInput(questions=[q], doc_text=DOC_TEXT)
 
     result1 = bert_base_squad2.inference_from_dicts(dicts=qa_format_1)
     result2 = bert_base_squad2.inference_from_objects(objects=[qa_format_2])
@@ -60,8 +65,7 @@ def test_span_inference_result_ranking_by_confidence(bert_base_squad2, caplog=No
         caplog.set_level(logging.CRITICAL)
     obj_input = [
         QAInput(
-            doc_text="Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created.",
-            questions=Question("Who counted the game among the best ever made?", uid="best_id_ever"),
+            doc_text=DOC_TEXT, questions=Question("Who counted the game among the best ever made?", uid="best_id_ever")
         )
     ]
 
@@ -7,7 +7,29 @@ from haystack.modeling.data_handler.inputs import QAInput, Question
 
 from haystack.schema import Document, Answer
 from haystack.nodes.reader.base import BaseReader
-from haystack.nodes.reader.farm import FARMReader
+from haystack.nodes import FARMReader, TransformersReader
+
+
+# TODO Fix bug in test_no_answer_output when using
+# @pytest.fixture(params=["farm", "transformers"])
+@pytest.fixture(params=["farm"])
+def no_answer_reader(request):
+    if request.param == "farm":
+        return FARMReader(
+            model_name_or_path="deepset/bert-medium-squad2-distilled",
+            use_gpu=False,
+            top_k_per_sample=5,
+            no_ans_boost=0,
+            return_no_answer=True,
+            num_processes=0,
+        )
+    if request.param == "transformers":
+        return TransformersReader(
+            model_name_or_path="deepset/bert-medium-squad2-distilled",
+            tokenizer="deepset/bert-medium-squad2-distilled",
+            use_gpu=-1,
+            top_k_per_candidate=5,
+        )
 
 
 def test_reader_basic(reader):
@@ -15,14 +37,17 @@ def test_reader_basic(reader):
     assert isinstance(reader, BaseReader)
 
 
-def test_output(prediction):
+def test_output(reader, docs):
+    prediction = reader.predict(query="Who lives in Berlin?", documents=docs, top_k=5)
     assert prediction is not None
     assert prediction["query"] == "Who lives in Berlin?"
     assert prediction["answers"][0].answer == "Carla"
     assert prediction["answers"][0].offsets_in_context[0].start == 11
     assert prediction["answers"][0].offsets_in_context[0].end == 16
-    assert prediction["answers"][0].score <= 1
-    assert prediction["answers"][0].score >= 0
+    assert prediction["answers"][0].offsets_in_document[0].start == 11
+    assert prediction["answers"][0].offsets_in_document[0].end == 16
     assert prediction["answers"][0].type == "extractive"
+    assert 0 <= prediction["answers"][0].score <= 1
     assert prediction["answers"][0].context == "My name is Carla and I live in Berlin"
     assert len(prediction["answers"]) == 5
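The expected offsets in test_output follow directly from Python slicing over the answer's context, with an inclusive start and exclusive end. A quick check of the numbers:

context = "My name is Carla and I live in Berlin"
# offsets_in_context: start == 11 (inclusive), end == 16 (exclusive)
assert context[11:16] == "Carla"
assert context.index("Carla") == 11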
@@ -80,7 +105,8 @@ def test_output_batch_multiple_queries_multiple_doc_lists(reader, docs):
 
 
 @pytest.mark.integration
-def test_no_answer_output(no_answer_prediction):
+def test_no_answer_output(no_answer_reader, docs):
+    no_answer_prediction = no_answer_reader.predict(query="What is the meaning of life?", documents=docs, top_k=5)
     assert no_answer_prediction is not None
     assert no_answer_prediction["query"] == "What is the meaning of life?"
     assert math.isclose(no_answer_prediction["no_ans_gap"], 0.9094805717468262, rel_tol=0.0001)
@@ -96,18 +122,6 @@ def test_no_answer_output(no_answer_prediction):
     assert len(no_answer_prediction["answers"]) == 5
 
 
-# TODO Directly compare farm and transformers reader outputs
-# TODO checks to see that model is responsive to input arguments e.g. context_window_size - topk
-
-
-@pytest.mark.integration
-def test_prediction_attributes(prediction):
-    # TODO FARM's prediction also has no_ans_gap
-    attributes_gold = ["query", "answers"]
-    for ag in attributes_gold:
-        assert ag in prediction
-
-
 @pytest.mark.integration
 def test_model_download_options():
     # download disabled and model is not cached locally
@@ -115,15 +129,6 @@ def test_model_download_options():
     impossible_reader = FARMReader("mfeb/albert-xxlarge-v2-squad2", local_files_only=True, num_processes=0)
 
 
-def test_answer_attributes(prediction):
-    # TODO Transformers answer also has meta key
-    answer = prediction["answers"][0]
-    assert type(answer) == Answer
-    attributes_gold = ["answer", "score", "context", "offsets_in_context", "offsets_in_document", "type"]
-    for ag in attributes_gold:
-        assert getattr(answer, ag, None) is not None
-
-
 @pytest.mark.integration
 @pytest.mark.parametrize("reader", ["farm"], indirect=True)
 @pytest.mark.parametrize("window_size", [10, 15, 20])