# haystack/test/nodes/test_question_generator.py

from typing import List

import pytest

from haystack.pipelines import (
    QuestionAnswerGenerationPipeline,
    QuestionGenerationPipeline,
    RetrieverQuestionGenerationPipeline,
)
from haystack.nodes.question_generator import QuestionGenerator
from haystack.schema import Document


# Sample passage about the band "The Living End". It is wrapped in `document` right below and
# serves as the first input of the tests in this file.
text = (
    "The Living End are an Australian punk rockabilly band from Melbourne, formed in 1994. Since 2002, "
    "the line-up consists of Chris Cheney (vocals, guitar), Scott Owen (double bass, vocals), and Andy "
    "Strachan (drums). The band rose to fame in 1997 after the release of their EP Second Solution / Prisoner "
    "of Society, which peaked at No. 4 on the Australian ARIA Singles Chart. They have released eight studio "
    "albums, two of which reached the No. 1 spot on the ARIA Albums Chart: The Living End (October 1998) and "
    "State of Emergency (February 2006). They have also achieved chart success in the U.S. and the United "
    "Kingdom. The Band was nominated 27 times and won five awards at the Australian ARIA Music Awards "
    'ceremonies: "Highest Selling Single" for Second Solution / Prisoner of Society (1998), "Breakthrough '
    'Artist – Album" and "Best Group" for The Living End (1999), as well as "Best Rock Album" for White '
    "Noise (2008) and The Ending Is Just the Beginning Repeating (2011). In October 2010, their debut album "
    'was listed in the book "100 Best Australian Albums". Australian musicologist Ian McFarlane described '
    'the group as "one of Australia’s premier rock acts. By blending a range of styles (punk, rockabilly '
    "and flat out rock) with great success, The Living End has managed to produce anthemic choruses and "
    'memorable songs in abundance".'
)
# Document built from `text`; the tests pass it to the pipelines and pass its `.content` to
# `QuestionGenerator.generate_batch`.
document = Document(content=text)
# Query string handed to `RetrieverQuestionGenerationPipeline.run` in `test_rqg_pipeline`.
query = "Living End"
# Keywords expected in the questions generated from `document`: `verify_questions` requires every
# question to contain at least one of them. The check there is a case-sensitive substring test,
# which is presumably why both "band"/"Band" and "music"/"Music" are listed.
# NOTE(review): "drummer", "dialect" and "year" do not occur verbatim in `text`; presumably they
# cover wording the model tends to produce -- confirm before pruning the list.
keywords = [
    "Australian",
    "punk",
    "drummer",
    "Living",
    "band",
    "Band",
    "Second",
    "album",
    "albums",
    "dialect",
    "music",
    "book",
    "group",
    "produce",
    "Music",
    "Awards",
    "year",
    "released",
]
# Sample passage about Berlin. It is wrapped in `document_2` right below and serves as the second
# input of the multi-document tests in this file.
text_2 = (
    "Berlin straddles the banks of the Spree, which flows into the Havel (a tributary of the Elbe) in the "
    "western borough of Spandau. Among the city's main topographical features are the many lakes in the western "
    "and southeastern boroughs formed by the Spree, Havel and Dahme, the largest of which is Lake Müggelsee. "
    "Due to its location in the European Plain, Berlin is influenced by a temperate seasonal climate. About "
    "one-third of the city's area is composed of forests, parks, gardens, rivers, canals and lakes. The city "
    "lies in the Central German dialect area, the Berlin dialect being a variant of the Lusatian-New Marchian "
    "dialects."
)
# Document built from `text_2`; always used together with `document` as the second input.
document_2 = Document(content=text_2)
# Keywords expected in the questions generated from `document_2`: `verify_questions` requires
# every question to contain at least one of them (case-sensitive substring test).
keywords_2 = [
    "Berlin",
    "Elbe",
    "Spandau",
    "Spree",
    "boroughs",
    "lakes",
    "largest",
    "seasonal",
    "climate",
    "city",
    "dialect",
    "German",
]


def test_qg_pipeline(question_generator):
    """Run ``QuestionGenerationPipeline`` on a single document.

    The output must expose a ``"generated_questions"`` entry whose first item holds at least one question.

    :param question_generator: Question generator node; expected to be a pytest fixture defined outside this file.
    """
    pipeline = QuestionGenerationPipeline(question_generator)
    output = pipeline.run(documents=[document])

    output_keys = list(output)
    assert "generated_questions" in output_keys

    first_entry = output["generated_questions"][0]
    assert len(first_entry["questions"]) > 0


def test_qg_pipeline_non_default_params():
    """Run ``QuestionGenerationPipeline`` with ``num_queries_per_doc=2`` on two documents.

    Checks the shape of the pipeline output, the exact number of questions produced per document, and that
    each document's questions mention that document's keywords (i.e. questions are not attributed to the
    wrong document).
    """
    generator = QuestionGenerator(model_name_or_path="valhalla/t5-small-e2e-qg", num_queries_per_doc=2)
    pipeline = QuestionGenerationPipeline(generator)
    output = pipeline.run(documents=[document, document_2])

    # Overall structure of the pipeline output.
    assert isinstance(output, dict)
    assert "generated_questions" in output
    assert "documents" in output

    generated = output["generated_questions"]
    returned_docs = output["documents"]
    assert isinstance(generated, list)
    assert isinstance(returned_docs, list)
    assert len(generated) == 2
    assert len(returned_docs) == 2

    # Exact question counts expected for each of the two documents.
    assert len(generated[0]["questions"]) == 26
    assert len(generated[1]["questions"]) == 12

    # Entry 0 must be about the Australian punk band, entry 1 about Berlin.
    for entry, expected_keywords in zip(generated, (keywords, keywords_2)):
        verify_questions(entry["questions"], expected_keywords)


@pytest.mark.parametrize(
    "split_length, num_queries_per_doc",
    [(50, 1), (50, 2), (50, 3), (100, 1), (100, 2), (100, 3)],
)
def test_qa_generator_non_default_params(split_length, num_queries_per_doc):
    """Call ``QuestionGenerator.generate_batch`` directly with non-default parameters.

    For every parametrized combination, one non-empty list of questions must come back per input text, and
    each list must mention the keywords of the text it was generated from.

    :param split_length: Value forwarded to ``QuestionGenerator(split_length=...)``.
    :param num_queries_per_doc: Value forwarded to ``QuestionGenerator(num_queries_per_doc=...)``.
    """
    generator = QuestionGenerator(
        model_name_or_path="valhalla/t5-small-e2e-qg",
        split_length=split_length,
        num_queries_per_doc=num_queries_per_doc,
    )
    input_texts = [document.content, document_2.content]
    questions_per_text = generator.generate_batch(texts=input_texts)

    assert isinstance(questions_per_text, list)
    assert len(questions_per_text) == 2

    band_questions, berlin_questions = questions_per_text
    assert isinstance(band_questions, list)
    assert isinstance(berlin_questions, list)
    assert len(band_questions) > 0
    assert len(berlin_questions) > 0

    # The first text is about the Australian punk band, the second one about Berlin.
    verify_questions(band_questions, keywords)
    verify_questions(berlin_questions, keywords_2)


@pytest.mark.parametrize("retriever,document_store", [("tfidf", "memory")], indirect=True)
def test_rqg_pipeline(question_generator, retriever):
    """Run ``RetrieverQuestionGenerationPipeline`` for a query against a store holding one document.

    The parametrization is indirect, so ``"tfidf"`` and ``"memory"`` are handed to the ``retriever`` and
    ``document_store`` fixtures rather than to this function.

    :param question_generator: Question generator node; expected to be a pytest fixture defined outside this file.
    :param retriever: Retriever fixture; its ``document_store`` receives the sample document.
    """
    store = retriever.document_store
    store.write_documents([document])

    pipeline = RetrieverQuestionGenerationPipeline(retriever, question_generator)
    output = pipeline.run(query)

    output_keys = list(output)
    assert "generated_questions" in output_keys

    first_entry = output["generated_questions"][0]
    assert len(first_entry["questions"]) > 0


@pytest.mark.parametrize("reader", ["farm"], indirect=True)
def test_qag_pipeline(question_generator, reader):
    """Run ``QuestionAnswerGenerationPipeline`` on a single document.

    The output must contain equally long, non-empty ``"queries"`` and ``"answers"`` entries, and the first
    answer of the first query must carry an answer value.

    :param question_generator: Question generator node; expected to be a pytest fixture defined outside this file.
    :param reader: Reader fixture, parametrized indirectly with ``"farm"``.
    """
    pipeline = QuestionAnswerGenerationPipeline(question_generator, reader)
    output = pipeline.run(documents=[document])

    assert "queries" in output
    assert "answers" in output

    answers = output["answers"]
    assert len(output["queries"]) == len(answers)
    assert len(answers) > 0

    top_answer = answers[0][0]
    assert top_answer.answer is not None


def verify_questions(questions: List[str], question_keywords: List[str]):
    """Assert that every question mentions at least one of the expected keywords.

    Matching is a case-sensitive substring test (``word in question``), so a keyword also matches inside a
    longer word. An empty ``questions`` list passes without any check.

    :param questions: Generated questions to check.
    :param question_keywords: Keywords of which at least one must occur in each question.
    :raises AssertionError: If a question contains none of the keywords; the message names the offending
        question and the keywords that were expected.
    """
    for q in questions:
        # A bare `assert any(<generator>)` would not reveal which question failed, so say it explicitly.
        assert any(
            word in q for word in question_keywords
        ), f"Question {q!r} contains none of the expected keywords {question_keywords!r}"
-												fix: QuestionGenerator generates wrong document questions for non-default `num_queries_per_doc` parameter (#3381)


											
										
										
											2022-10-14 12:08:30 +02:00
+								from typing import List
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								import pytest
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								from haystack.pipelines import (
 								    QuestionAnswerGenerationPipeline,
 								    QuestionGenerationPipeline,
 								    RetrieverQuestionGenerationPipeline,
 								)
-												Simplify `language_modeling.py` and `tokenization.py` (#2703)

* Simplification of language_model.py and tokenization.py to remove code duplication

Co-authored-by: vblagoje <dovlex@gmail.com>
											
										
										
											2022-07-22 16:29:30 +02:00
+								from haystack.nodes.question_generator import QuestionGenerator
-												Refactoring of the `haystack` package (#1624)

* Files moved, imports all broken

* Fix most imports and docstrings into

* Fix the paths to the modules in the API docs

* Add latest docstring and tutorial changes

* Add a few pipelines that were lost in the imports

* Fix a bunch of mypy warnings

* Add latest docstring and tutorial changes

* Create a file_classifier module

* Add docs for file_classifier

* Fixed most circular imports, now the REST API can start

* Add latest docstring and tutorial changes

* Tackling more mypy issues

* Reintroduce  from FARM and fix last mypy issues hopefully

* Re-enable old-style imports

* Fix some more import from the top-level  package in an attempt to sort out circular imports

* Fix some imports in tests to new-style to prevent failed class equalities from breaking tests

* Change document_store into document_stores

* Update imports in tutorials

* Add latest docstring and tutorial changes

* Probably fixes summarizer tests

* Improve the old-style import allowing module imports (should work)

* Try to fix the docs

* Remove dedicated KnowledgeGraph page from autodocs

* Remove dedicated GraphRetriever page from autodocs

* Fix generate_docstrings.sh with an updated list of yaml files to look for

* Fix some more modules in the docs

* Fix the document stores docs too

* Fix a small issue on Tutorial14

* Add latest docstring and tutorial changes

* Add deprecation warning to old-style imports

* Remove stray folder and import Dict into dense.py

* Change import path for MLFlowLogger

* Add old loggers path to the import path aliases

* Fix debug output of convert_ipynb.py

* Fix circular import on BaseRetriever

* Missed one merge block

* re-run tutorial 5

* Fix imports in tutorial 5

* Re-enable squad_to_dpr CLI from the root package and move get_batches_from_generator into document_stores.base

* Add latest docstring and tutorial changes

* Fix typo in utils __init__

* Fix a few more imports

* Fix benchmarks too

* New-style imports in test_knowledge_graph

* Rollback setup.py

* Rollback squad_to_dpr too

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2021-10-25 15:50:23 +02:00
+								from haystack.schema import Document
-												Add QuestionGenerator (#1267)

* Create basic Question Generation

* Split texts into 50 word chunks

* Allow prompt to be changed

* Implement iteration functionality in DS

* Add docstrings, create pipelines

* Make pipelines work

* Add comments

* Add tests

* Add tutorials and docs

* Add doc string
											
										
										
											2021-07-26 17:20:43 +02:00
-												fix: QuestionGenerator generates wrong document questions for non-default `num_queries_per_doc` parameter (#3381)


											
										
										
											2022-10-14 12:08:30 +02:00
+								text = (
 								    "The Living End are an Australian punk rockabilly band from Melbourne, formed in 1994. Since 2002, "
 								    "the line-up consists of Chris Cheney (vocals, guitar), Scott Owen (double bass, vocals), and Andy "
 								    "Strachan (drums). The band rose to fame in 1997 after the release of their EP Second Solution / Prisoner "
 								    "of Society, which peaked at No. 4 on the Australian ARIA Singles Chart. They have released eight studio "
 								    "albums, two of which reached the No. 1 spot on the ARIA Albums Chart: The Living End (October 1998) and "
 								    "State of Emergency (February 2006). They have also achieved chart success in the U.S. and the United "
 								    "Kingdom. The Band was nominated 27 times and won five awards at the Australian ARIA Music Awards "
 								    'ceremonies: "Highest Selling Single" for Second Solution / Prisoner of Society (1998), "Breakthrough '
 								    'Artist – Album" and "Best Group" for The Living End (1999), as well as "Best Rock Album" for White '
 								    "Noise (2008) and The Ending Is Just the Beginning Repeating (2011). In October 2010, their debut album "
 								    'was listed in the book "100 Best Australian Albums". Australian musicologist Ian McFarlane described '
 								    'the group as "one of Australia’s premier rock acts. By blending a range of styles (punk, rockabilly '
 								    "and flat out rock) with great success, The Living End has managed to produce anthemic choruses and "
 								    'memorable songs in abundance".'
 								)
-												Redesign primitives - `Document`, `Answer`, `Label`  (#1398)

* first draft / notes on new primitives

* wip label / feedback refactor

* rename doc.text -> doc.content. add doc.content_type

* add datatype for content

* remove faq_question_field from ES and weaviate. rename text_field -> content_field in docstores. update tutorials for content field

* update converters for . Add warning for empty

* rename label.question -> label.query. Allow sorting of Answers.

* WIP primitives

* update ui/reader for new Answer format

* Improve Label. First refactoring of MultiLabel. Adjust eval code

* fixed workflow conflict with introducing new one (#1472)

* Add latest docstring and tutorial changes

* make add_eval_data() work again

* fix reader formats. WIP fix _extract_docs_and_labels_from_dict

* fix test reader

* Add latest docstring and tutorial changes

* fix another test case for reader

* fix mypy in farm reader.eval()

* fix mypy in farm reader.eval()

* WIP ORM refactor

* Add latest docstring and tutorial changes

* fix mypy weaviate

* make label and multilabel dataclasses

* bump mypy env in CI to python 3.8

* WIP refactor Label ORM

* WIP refactor Label ORM

* simplify tests for individual doc stores

* WIP refactoring markers of tests

* test alternative approach for tests with existing parametrization

* WIP refactor ORMs

* fix skip logic of already parametrized tests

* fix weaviate behaviour in tests - not parametrizing it in our general test cases.

* Add latest docstring and tutorial changes

* fix some tests

* remove sql from document_store_types

* fix markers for generator and pipeline test

* remove inmemory marker

* remove unneeded elasticsearch markers

* add dataclasses-json dependency. adjust ORM to just store JSON repr

* ignore type as dataclasses_json seems to miss functionality here

* update readme and contributing.md

* update contributing

* adjust example

* fix duplicate doc handling for custom index

* Add latest docstring and tutorial changes

* fix some ORM issues. fix get_all_labels_aggregated.

* update drop flags where get_all_labels_aggregated() was used before

* Add latest docstring and tutorial changes

* add to_json(). add + fix tests

* fix no_answer handling in label / multilabel

* fix duplicate docs in memory doc store. change primary key for sql doc table

* fix mypy issues

* fix mypy issues

* haystack/retriever/base.py

* fix test_write_document_meta[elastic]

* fix test_elasticsearch_custom_fields

* fix test_labels[elastic]

* fix crawler

* fix converter

* fix docx converter

* fix preprocessor

* fix test_utils

* fix tfidf retriever. fix selection of docstore in tests with multiple fixtures / parameterizations

* Add latest docstring and tutorial changes

* fix crawler test. fix ocrconverter attribute

* fix test_elasticsearch_custom_query

* fix generator pipeline

* fix ocr converter

* fix ragenerator

* Add latest docstring and tutorial changes

* fix test_load_and_save_yaml for elasticsearch

* fixes for pipeline tests

* fix faq pipeline

* fix pipeline tests

* Add latest docstring and tutorial changes

* fix weaviate

* Add latest docstring and tutorial changes

* trigger CI

* satisfy mypy

* Add latest docstring and tutorial changes

* satisfy mypy

* Add latest docstring and tutorial changes

* trigger CI

* fix question generation test

* fix ray. fix Q-generation

* fix translator test

* satisfy mypy

* wip refactor feedback rest api

* fix rest api feedback endpoint

* fix doc classifier

* remove relation of Labels -> Docs in SQL ORM

* fix faiss/milvus tests

* fix doc classifier test

* fix eval test

* fixing eval issues

* Add latest docstring and tutorial changes

* fix mypy

* WIP replace dataclasses-json with manual serialization

* Add latest docstring and tutorial changes

* revert to dataclass-json serialization for now. remove debug prints.

* update docstrings

* fix extractor. fix Answer Span init

* fix api test

* keep meta data of answers in reader.run()

* fix meta handling

* address review feedback

* Add latest docstring and tutorial changes

* make document=None for open domain labels

* add import

* fix print utils

* fix rest api

* address review feedback

* Add latest docstring and tutorial changes

* fix mypy

Co-authored-by: Markus Paff <markuspaff.mp@gmail.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2021-10-13 14:23:23 +02:00
+								document = Document(content=text)
-												Add QuestionGenerator (#1267)

* Create basic Question Generation

* Split texts into 50 word chunks

* Allow prompt to be changed

* Implement iteration functionality in DS

* Add docstrings, create pipelines

* Make pipelines work

* Add comments

* Add tests

* Add tutorials and docs

* Add doc string
											
										
										
											2021-07-26 17:20:43 +02:00
+								query = "Living End"
-												fix: QuestionGenerator generates wrong document questions for non-default `num_queries_per_doc` parameter (#3381)


											
										
										
											2022-10-14 12:08:30 +02:00
+								keywords = [
 								    "Australian",
 								    "punk",
 								    "drummer",
 								    "Living",
 								    "band",
 								    "Band",
 								    "Second",
 								    "album",
 								    "albums",
 								    "dialect",
 								    "music",
 								    "book",
 								    "group",
 								    "produce",
 								    "Music",
 								    "Awards",
 								    "year",
 								    "released",
 								]
 								text_2 = (
 								    "Berlin straddles the banks of the Spree, which flows into the Havel (a tributary of the Elbe) in the "
 								    "western borough of Spandau. Among the city's main topographical features are the many lakes in the western "
 								    "and southeastern boroughs formed by the Spree, Havel and Dahme, the largest of which is Lake Müggelsee. "
 								    "Due to its location in the European Plain, Berlin is influenced by a temperate seasonal climate. About "
 								    "one-third of the city's area is composed of forests, parks, gardens, rivers, canals and lakes. The city "
 								    "lies in the Central German dialect area, the Berlin dialect being a variant of the Lusatian-New Marchian "
 								    "dialects."
 								)
 								document_2 = Document(content=text_2)
 								keywords_2 = [
 								    "Berlin",
 								    "Elbe",
 								    "Spandau",
 								    "Spree",
 								    "boroughs",
 								    "lakes",
 								    "largest",
 								    "seasonal",
 								    "climate",
 								    "city",
 								    "dialect",
 								    "German",
 								]
-												Add QuestionGenerator (#1267)

* Create basic Question Generation

* Split texts into 50 word chunks

* Allow prompt to be changed

* Implement iteration functionality in DS

* Add docstrings, create pipelines

* Make pipelines work

* Add comments

* Add tests

* Add tutorials and docs

* Add doc string
											
										
										
											2021-07-26 17:20:43 +02:00
 								def test_qg_pipeline(question_generator):
 								    p = QuestionGenerationPipeline(question_generator)
 								    result = p.run(documents=[document])
 								    keys = list(result)
 								    assert "generated_questions" in keys
 								    assert len(result["generated_questions"][0]["questions"]) > 0
-												fix: QuestionGenerator generates wrong document questions for non-default `num_queries_per_doc` parameter (#3381)


											
										
										
											2022-10-14 12:08:30 +02:00
+								def test_qg_pipeline_non_default_params():
 								    question_generator = QuestionGenerator(model_name_or_path="valhalla/t5-small-e2e-qg", num_queries_per_doc=2)
 								    p = QuestionGenerationPipeline(question_generator)
 								    result = p.run(documents=[document, document_2])
 								    assert isinstance(result, dict)
 								    assert "generated_questions" in result
 								    assert "documents" in result
 								    assert isinstance(result["generated_questions"], list)
 								    assert isinstance(result["documents"], list)
 								    assert len(result["generated_questions"]) == 2
 								    assert len(result["documents"]) == 2
 								    assert len(result["generated_questions"][0]["questions"]) == 26
 								    assert len(result["generated_questions"][1]["questions"]) == 12
 								    # first list of questions should be about Australian punk band
 								    verify_questions(result["generated_questions"][0]["questions"], keywords)
 								    # second list of questions should be about Berlin
 								    verify_questions(result["generated_questions"][1]["questions"], keywords_2)
 								@pytest.mark.parametrize("split_length, num_queries_per_doc", [(50, 1), (50, 2), (50, 3), (100, 1), (100, 2), (100, 3)])
 								def test_qa_generator_non_default_params(split_length, num_queries_per_doc):
 								    question_generator = QuestionGenerator(
 								        model_name_or_path="valhalla/t5-small-e2e-qg",
 								        split_length=split_length,
 								        num_queries_per_doc=num_queries_per_doc,
 								    )
 								    questions = question_generator.generate_batch(texts=[document.content, document_2.content])
 								    assert isinstance(questions, list)
 								    assert len(questions) == 2
 								    assert isinstance(questions[0], list)
 								    assert isinstance(questions[1], list)
 								    assert len(questions[0]) > 0
 								    assert len(questions[1]) > 0
 								    # first list of questions should be about Australian punk band
 								    verify_questions(questions[0], keywords)
 								    # second list of questions should be about Berlin
 								    verify_questions(questions[1], keywords_2)
-												Redesign primitives - `Document`, `Answer`, `Label`  (#1398)

* first draft / notes on new primitives

* wip label / feedback refactor

* rename doc.text -> doc.content. add doc.content_type

* add datatype for content

* remove faq_question_field from ES and weaviate. rename text_field -> content_field in docstores. update tutorials for content field

* update converters for . Add warning for empty

* rename label.question -> label.query. Allow sorting of Answers.

* WIP primitives

* update ui/reader for new Answer format

* Improve Label. First refactoring of MultiLabel. Adjust eval code

* fixed workflow conflict with introducing new one (#1472)

* Add latest docstring and tutorial changes

* make add_eval_data() work again

* fix reader formats. WIP fix _extract_docs_and_labels_from_dict

* fix test reader

* Add latest docstring and tutorial changes

* fix another test case for reader

* fix mypy in farm reader.eval()

* fix mypy in farm reader.eval()

* WIP ORM refactor

* Add latest docstring and tutorial changes

* fix mypy weaviate

* make label and multilabel dataclasses

* bump mypy env in CI to python 3.8

* WIP refactor Label ORM

* WIP refactor Label ORM

* simplify tests for individual doc stores

* WIP refactoring markers of tests

* test alternative approach for tests with existing parametrization

* WIP refactor ORMs

* fix skip logic of already parametrized tests

* fix weaviate behaviour in tests - not parametrizing it in our general test cases.

* Add latest docstring and tutorial changes

* fix some tests

* remove sql from document_store_types

* fix markers for generator and pipeline test

* remove inmemory marker

* remove unneeded elasticsearch markers

* add dataclasses-json dependency. adjust ORM to just store JSON repr

* ignore type as dataclasses_json seems to miss functionality here

* update readme and contributing.md

* update contributing

* adjust example

* fix duplicate doc handling for custom index

* Add latest docstring and tutorial changes

* fix some ORM issues. fix get_all_labels_aggregated.

* update drop flags where get_all_labels_aggregated() was used before

* Add latest docstring and tutorial changes

* add to_json(). add + fix tests

* fix no_answer handling in label / multilabel

* fix duplicate docs in memory doc store. change primary key for sql doc table

* fix mypy issues

* fix mypy issues

* haystack/retriever/base.py

* fix test_write_document_meta[elastic]

* fix test_elasticsearch_custom_fields

* fix test_labels[elastic]

* fix crawler

* fix converter

* fix docx converter

* fix preprocessor

* fix test_utils

* fix tfidf retriever. fix selection of docstore in tests with multiple fixtures / parameterizations

* Add latest docstring and tutorial changes

* fix crawler test. fix ocrconverter attribute

* fix test_elasticsearch_custom_query

* fix generator pipeline

* fix ocr converter

* fix ragenerator

* Add latest docstring and tutorial changes

* fix test_load_and_save_yaml for elasticsearch

* fixes for pipeline tests

* fix faq pipeline

* fix pipeline tests

* Add latest docstring and tutorial changes

* fix weaviate

* Add latest docstring and tutorial changes

* trigger CI

* satisfy mypy

* Add latest docstring and tutorial changes

* satisfy mypy

* Add latest docstring and tutorial changes

* trigger CI

* fix question generation test

* fix ray. fix Q-generation

* fix translator test

* satisfy mypy

* wip refactor feedback rest api

* fix rest api feedback endpoint

* fix doc classifier

* remove relation of Labels -> Docs in SQL ORM

* fix faiss/milvus tests

* fix doc classifier test

* fix eval test

* fixing eval issues

* Add latest docstring and tutorial changes

* fix mypy

* WIP replace dataclasses-json with manual serialization

* Add latest docstring and tutorial changes

* revert to dataclass-json serialization for now. remove debug prints.

* update docstrings

* fix extractor. fix Answer Span init

* fix api test

* keep meta data of answers in reader.run()

* fix meta handling

* address review feedback

* Add latest docstring and tutorial changes

* make document=None for open domain labels

* add import

* fix print utils

* fix rest api

* address review feedback

* Add latest docstring and tutorial changes

* fix mypy

Co-authored-by: Markus Paff <markuspaff.mp@gmail.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2021-10-13 14:23:23 +02:00
+								@pytest.mark.parametrize("retriever,document_store", [("tfidf", "memory")], indirect=True)
-												Add QuestionGenerator (#1267)

* Create basic Question Generation

* Split texts into 50 word chunks

* Allow prompt to be changed

* Implement iteration functionality in DS

* Add docstrings, create pipelines

* Make pipelines work

* Add comments

* Add tests

* Add tutorials and docs

* Add doc string
											
										
										
											2021-07-26 17:20:43 +02:00
+								def test_rqg_pipeline(question_generator, retriever):
-												Redesign primitives - `Document`, `Answer`, `Label`  (#1398)

* first draft / notes on new primitives

* wip label / feedback refactor

* rename doc.text -> doc.content. add doc.content_type

* add datatype for content

* remove faq_question_field from ES and weaviate. rename text_field -> content_field in docstores. update tutorials for content field

* update converters for . Add warning for empty

* rename label.question -> label.query. Allow sorting of Answers.

* WIP primitives

* update ui/reader for new Answer format

* Improve Label. First refactoring of MultiLabel. Adjust eval code

* fixed workflow conflict with introducing new one (#1472)

* Add latest docstring and tutorial changes

* make add_eval_data() work again

* fix reader formats. WIP fix _extract_docs_and_labels_from_dict

* fix test reader

* Add latest docstring and tutorial changes

* fix another test case for reader

* fix mypy in farm reader.eval()

* fix mypy in farm reader.eval()

* WIP ORM refactor

* Add latest docstring and tutorial changes

* fix mypy weaviate

* make label and multilabel dataclasses

* bump mypy env in CI to python 3.8

* WIP refactor Label ORM

* WIP refactor Label ORM

* simplify tests for individual doc stores

* WIP refactoring markers of tests

* test alternative approach for tests with existing parametrization

* WIP refactor ORMs

* fix skip logic of already parametrized tests

* fix weaviate behaviour in tests - not parametrizing it in our general test cases.

* Add latest docstring and tutorial changes

* fix some tests

* remove sql from document_store_types

* fix markers for generator and pipeline test

* remove inmemory marker

* remove unneeded elasticsearch markers

* add dataclasses-json dependency. adjust ORM to just store JSON repr

* ignore type as dataclasses_json seems to miss functionality here

* update readme and contributing.md

* update contributing

* adjust example

* fix duplicate doc handling for custom index

* Add latest docstring and tutorial changes

* fix some ORM issues. fix get_all_labels_aggregated.

* update drop flags where get_all_labels_aggregated() was used before

* Add latest docstring and tutorial changes

* add to_json(). add + fix tests

* fix no_answer handling in label / multilabel

* fix duplicate docs in memory doc store. change primary key for sql doc table

* fix mypy issues

* fix mypy issues

* haystack/retriever/base.py

* fix test_write_document_meta[elastic]

* fix test_elasticsearch_custom_fields

* fix test_labels[elastic]

* fix crawler

* fix converter

* fix docx converter

* fix preprocessor

* fix test_utils

* fix tfidf retriever. fix selection of docstore in tests with multiple fixtures / parameterizations

* Add latest docstring and tutorial changes

* fix crawler test. fix ocrconverter attribute

* fix test_elasticsearch_custom_query

* fix generator pipeline

* fix ocr converter

* fix ragenerator

* Add latest docstring and tutorial changes

* fix test_load_and_save_yaml for elasticsearch

* fixes for pipeline tests

* fix faq pipeline

* fix pipeline tests

* Add latest docstring and tutorial changes

* fix weaviate

* Add latest docstring and tutorial changes

* trigger CI

* satisfy mypy

* Add latest docstring and tutorial changes

* satisfy mypy

* Add latest docstring and tutorial changes

* trigger CI

* fix question generation test

* fix ray. fix Q-generation

* fix translator test

* satisfy mypy

* wip refactor feedback rest api

* fix rest api feedback endpoint

* fix doc classifier

* remove relation of Labels -> Docs in SQL ORM

* fix faiss/milvus tests

* fix doc classifier test

* fix eval test

* fixing eval issues

* Add latest docstring and tutorial changes

* fix mypy

* WIP replace dataclasses-json with manual serialization

* Add latest docstring and tutorial changes

* revert to dataclass-json serialization for now. remove debug prints.

* update docstrings

* fix extractor. fix Answer Span init

* fix api test

* keep meta data of answers in reader.run()

* fix meta handling

* address review feedback

* Add latest docstring and tutorial changes

* make document=None for open domain labels

* add import

* fix print utils

* fix rest api

* address review feedback

* Add latest docstring and tutorial changes

* fix mypy

Co-authored-by: Markus Paff <markuspaff.mp@gmail.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2021-10-13 14:23:23 +02:00
+								    retriever.document_store.write_documents([document])
-												Add QuestionGenerator (#1267)

* Create basic Question Generation

* Split texts into 50 word chunks

* Allow prompt to be changed

* Implement iteration functionality in DS

* Add docstrings, create pipelines

* Make pipelines work

* Add comments

* Add tests

* Add tutorials and docs

* Add doc string
											
										
										
											2021-07-26 17:20:43 +02:00
+								    p = RetrieverQuestionGenerationPipeline(retriever, question_generator)
 								    result = p.run(query)
 								    keys = list(result)
 								    assert "generated_questions" in keys
 								    assert len(result["generated_questions"][0]["questions"]) > 0
 								@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 								def test_qag_pipeline(question_generator, reader):
 								    p = QuestionAnswerGenerationPipeline(question_generator, reader)
-												Add `run_batch` method to all nodes and `Pipeline` to allow batch querying (#2481)

* Add run_batch methods for batch querying

* Update Documentation & Code Style

* Fix mypy

* Update Documentation & Code Style

* Fix mypy

* Fix linter

* Fix tests

* Update Documentation & Code Style

* Fix tests

* Update Documentation & Code Style

* Fix mypy

* Fix rest api test

* Update Documentation & Code Style

* Add Doc strings

* Update Documentation & Code Style

* Add batch_size as attribute to nodes supporting batching

* Adapt error messages

* Adapt type of filters in retrievers

* Revert change about truncation_warning in summarizer

* Unify multiple_doc_lists tests

* Use smaller models in extractor tests

* Add return types to JoinAnswers and RouteDocuments

* Adapt return statements in reader's run_batch method

* Allow list of filters

* Adapt error messages

* Update Documentation & Code Style

* Fix tests

* Fix mypy

* Adapt print_questions

* Remove disabling warning about too many public methods

* Add flag for pylint to disable warning about too many public methods in pipelines/base.py and document_stores/base.py

* Add type check

* Update Documentation & Code Style

* Adapt tutorial 11

* Update Documentation & Code Style

* Add query_batch method for DCDocStore

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-05-11 11:11:00 +02:00
+								    results = p.run(documents=[document])
 								    assert "queries" in results
 								    assert "answers" in results
 								    assert len(results["queries"]) == len(results["answers"])
 								    assert len(results["answers"]) > 0
 								    assert results["answers"][0][0].answer is not None
-												fix: QuestionGenerator generates wrong document questions for non-default `num_queries_per_doc` parameter (#3381)


											
										
										
											2022-10-14 12:08:30 +02:00
 								def verify_questions(questions: List[str], question_keywords: List[str]):
 								    for q in questions:
 								        assert any(word in q for word in question_keywords)