haystack/test/nodes/test_label_generator.py

from pathlib import Path

import pytest

from haystack.document_stores import BaseDocumentStore
from haystack.nodes import QuestionGenerator, EmbeddingRetriever, PseudoLabelGenerator
from test.conftest import DOCS_WITH_EMBEDDINGS


@pytest.mark.generator
@pytest.mark.integration
@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
@pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True)
def test_pseudo_label_generator(
    document_store: BaseDocumentStore,
    retriever: EmbeddingRetriever,
    question_generator: QuestionGenerator,
    tmp_path: Path,
):
    """Check the labels produced by ``PseudoLabelGenerator.run`` when built from a question generator.

    The test writes ``DOCS_WITH_EMBEDDINGS`` to the document store, runs the
    generator over every stored document, and verifies that the output holds
    a non-empty ``"gpl_labels"`` collection whose items each expose the
    ``question``, ``pos_doc``, ``neg_doc`` and ``score`` keys.
    """
    expected_keys = ("question", "pos_doc", "neg_doc", "score")

    document_store.write_documents(DOCS_WITH_EMBEDDINGS)
    label_generator = PseudoLabelGenerator(question_generator, retriever)

    # ``run`` hands back a pair; only its first element is inspected below.
    output, _second = label_generator.run(documents=document_store.get_all_documents())
    assert "gpl_labels" in output

    collected = []
    for label in output["gpl_labels"]:
        assert all(key in label for key in expected_keys)
        collected.append(label)

    assert collected


@pytest.mark.generator
@pytest.mark.integration
@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
@pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True)
def test_pseudo_label_generator_batch(
    document_store: BaseDocumentStore,
    retriever: EmbeddingRetriever,
    question_generator: QuestionGenerator,
    tmp_path: Path,
):
    """Check the labels produced by ``PseudoLabelGenerator.run_batch`` when built from a question generator.

    The test writes ``DOCS_WITH_EMBEDDINGS`` to the document store, calls
    ``run_batch`` on every stored document, and verifies that the output holds
    a non-empty ``"gpl_labels"`` collection whose items each expose the
    ``question``, ``pos_doc``, ``neg_doc`` and ``score`` keys.

    It carries the ``integration`` marker, matching the non-batch variant of
    this test, so both are selected by the same marker expression.
    """
    required_keys = ("question", "pos_doc", "neg_doc", "score")

    document_store.write_documents(DOCS_WITH_EMBEDDINGS)
    psg = PseudoLabelGenerator(question_generator, retriever)

    # ``run_batch`` returns a pair; the second element is not needed here.
    output, _ = psg.run_batch(documents=document_store.get_all_documents())
    assert "gpl_labels" in output

    gpl_labels = list(output["gpl_labels"])
    assert gpl_labels, "expected at least one generated GPL label"
    for item in gpl_labels:
        missing = [key for key in required_keys if key not in item]
        assert not missing, f"GPL label is missing keys: {missing}"


@pytest.mark.generator
@pytest.mark.integration
@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
@pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True)
def test_pseudo_label_generator_using_question_document_pairs(
    document_store: BaseDocumentStore, retriever: EmbeddingRetriever, tmp_path: Path
):
    """Check ``PseudoLabelGenerator.run`` when it is built from ready-made question/document pairs.

    A list of ``{"question", "document"}`` dicts is passed to the generator in
    place of a question generator. The output must hold a non-empty
    ``"gpl_labels"`` collection whose items each expose the ``question``,
    ``pos_doc``, ``neg_doc`` and ``score`` keys.
    """
    expected_keys = ("question", "pos_doc", "neg_doc", "score")
    # Both questions refer to the same passage.
    berlin_passage = "Berlin is the capital and largest city of Germany by both area and population."
    question_document_pairs = [
        {"question": "What is the capital of Germany?", "document": berlin_passage},
        {
            "question": "What is the largest city in Germany by population and area?",
            "document": berlin_passage,
        },
    ]

    document_store.write_documents(DOCS_WITH_EMBEDDINGS)
    label_generator = PseudoLabelGenerator(question_document_pairs, retriever)

    # ``run`` hands back a pair; only its first element is inspected below.
    output, _second = label_generator.run(documents=document_store.get_all_documents())
    assert "gpl_labels" in output

    collected = []
    for label in output["gpl_labels"]:
        assert all(key in label for key in expected_keys)
        collected.append(label)

    assert collected


@pytest.mark.generator
@pytest.mark.integration
@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
@pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True)
def test_pseudo_label_generator_using_question_document_pairs_batch(
    document_store: BaseDocumentStore, retriever: EmbeddingRetriever, tmp_path: Path
):
    """Check ``PseudoLabelGenerator.run_batch`` when it is built from ready-made question/document pairs.

    A list of ``{"question", "document"}`` dicts is passed to the generator in
    place of a question generator. The output must hold a non-empty
    ``"gpl_labels"`` collection whose items each expose the ``question``,
    ``pos_doc``, ``neg_doc`` and ``score`` keys.

    It carries the ``integration`` marker, matching the non-batch variant of
    this test, so both are selected by the same marker expression.
    """
    required_keys = ("question", "pos_doc", "neg_doc", "score")

    document_store.write_documents(DOCS_WITH_EMBEDDINGS)
    docs = [
        {
            "question": "What is the capital of Germany?",
            "document": "Berlin is the capital and largest city of Germany by both area and population.",
        },
        {
            "question": "What is the largest city in Germany by population and area?",
            "document": "Berlin is the capital and largest city of Germany by both area and population.",
        },
    ]
    psg = PseudoLabelGenerator(docs, retriever)

    # ``run_batch`` returns a pair; the second element is not needed here.
    output, _ = psg.run_batch(documents=document_store.get_all_documents())
    assert "gpl_labels" in output

    gpl_labels = list(output["gpl_labels"])
    assert gpl_labels, "expected at least one generated GPL label"
    for item in gpl_labels:
        missing = [key for key in required_keys if key not in item]
        assert not missing, f"GPL label is missing keys: {missing}"


@pytest.mark.generator
@pytest.mark.integration
@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
@pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True)
def test_training_and_save(retriever: EmbeddingRetriever, tmp_path: Path):
    """Train the retriever on two hand-written GPL examples and save it under ``tmp_path``.

    Each example carries the ``question``, ``pos_doc``, ``neg_doc`` and
    ``score`` keys, the same keys the pseudo-label tests in this module check
    for. The test makes no assertions of its own: it passes when neither
    ``retriever.train`` nor ``retriever.save`` raises.
    """
    train_examples = [
        {
            "question": "What is the capital of Germany?",
            "pos_doc": "Berlin is the capital and largest city of Germany by both area and population.",
            "neg_doc": "The capital of Germany is the city state of Berlin.",
            "score": -2.2788997,
        },
        {
            "question": "What is the largest city in Germany by population and area?",
            "pos_doc": "Berlin is the capital and largest city of Germany by both area and population.",
            "neg_doc": "The capital of Germany is the city state of Berlin.",
            "score": 7.0911007,
        },
    ]
    retriever.train(train_examples)
    # NOTE(review): presumably ``save`` writes model files into ``tmp_path``;
    # consider asserting on the directory contents once that is confirmed.
    retriever.save(tmp_path)
Add Generative Pseudo Labeling (#2388) 2022-06-02 16:12:47 +02:00			`from pathlib import Path`

			`import pytest`

Add GPL API docs, unit tests update (#2634) * Update test_label_generator.py * GPL increase default batch size to 16 * GPL - API docs * GPL - split unit tests * Make devs aware of multilingual GPL * Create separate train/save test Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> 2022-06-10 05:25:28 -04:00			`from haystack.document_stores import BaseDocumentStore`
Add Generative Pseudo Labeling (#2388) 2022-06-02 16:12:47 +02:00			`from haystack.nodes import QuestionGenerator, EmbeddingRetriever, PseudoLabelGenerator`
			`from test.conftest import DOCS_WITH_EMBEDDINGS`


			`@pytest.mark.generator`
[CI Refactoring] Workflow refactoring (#2576) * Unify CI tests (from #2466) * Update Documentation & Code Style * Change folder names * Fix markers list * Remove marker 'slow', replaced with 'integration' * Soften children check * Start ES first so it has time to boot while Python is setup * Run the full workflow * Try to make pip upgrade on Windows * Set KG tests as integration * Update Documentation & Code Style * typo * faster pylint * Make Pylint use the cache * filter diff files for pylint * debug pylint statement * revert pylint changes * Remove path from asserted log (fails on Windows) * Skip preprocessor test on Windows * Tackling Windows specific failures * Fix pytest command for windows suites * Remove \ from command * Move poppler test into integration * Skip opensearch test on windows * Add tolerance in reader sas score for Windows * Another pytorch approx * Raise time limit for unit tests :( * Skip poppler test on Windows CI * Specify to pull with FF only in docs check * temporarily run the docs check immediately * Allow merge commit for now * Try without fetch depth * Accelerating test * Accelerating test * Add repository and ref alongside fetch-depth * Separate out code&docs check from tests * Use setup-python cache * Delete custom action * Remove the pull step in the docs check, will find a way to run on bot commits * Add requirements.txt in .github for caching * Actually install dependencies * Change deps group for pylint * Unclear why the requirements.txt is still required :/ * Fix the code check python setup * Install all deps for pylint * Make the autoformat check depend on tests and doc updates workflows * Try installing dependencies in another order * Try again to install the deps * quoting the paths * Ad back the requirements * Try again to install rest_api and ui * Change deps group * Duplicate haystack install line * See if the cache is the problem * Disable also in mypy, who knows * split the install step * Split install step everywhere * 
Revert "Separate out code&docs check from tests" This reverts commit 1cd59b15ffc5b984e1d642dcbf4c8ccc2bb6c9bd. * Add back the action * Proactive support for audio (see text2speech branch) * Fix label generator tests * Remove install of libsndfile1 on win temporarily * exclude audio tests on win * install ffmpeg for integration tests Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> 2022-06-07 09:23:03 +02:00			`@pytest.mark.integration`
Add Generative Pseudo Labeling (#2388) 2022-06-02 16:12:47 +02:00			`@pytest.mark.parametrize("document_store", ["memory"], indirect=True)`
			`@pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True)`
			`def test_pseudo_label_generator(`
Add GPL API docs, unit tests update (#2634) * Update test_label_generator.py * GPL increase default batch size to 16 * GPL - API docs * GPL - split unit tests * Make devs aware of multilingual GPL * Create separate train/save test Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> 2022-06-10 05:25:28 -04:00			`document_store: BaseDocumentStore,`
			`retriever: EmbeddingRetriever,`
			`question_generator: QuestionGenerator,`
			`tmp_path: Path,`
Add Generative Pseudo Labeling (#2388) 2022-06-02 16:12:47 +02:00			`):`
			`document_store.write_documents(DOCS_WITH_EMBEDDINGS)`
			`psg = PseudoLabelGenerator(question_generator, retriever)`
			`train_examples = []`
Add GPL API docs, unit tests update (#2634) * Update test_label_generator.py * GPL increase default batch size to 16 * GPL - API docs * GPL - split unit tests * Make devs aware of multilingual GPL * Create separate train/save test Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> 2022-06-10 05:25:28 -04:00			`output, pipe_id = psg.run(documents=document_store.get_all_documents())`
			`assert "gpl_labels" in output`
			`for item in output["gpl_labels"]:`
			`assert "question" in item and "pos_doc" in item and "neg_doc" in item and "score" in item`
			`train_examples.append(item)`

			`assert len(train_examples) > 0`


			`@pytest.mark.slow`
			`@pytest.mark.generator`
			`@pytest.mark.parametrize("document_store", ["memory"], indirect=True)`
			`@pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True)`
			`def test_pseudo_label_generator_batch(`
			`document_store: BaseDocumentStore,`
			`retriever: EmbeddingRetriever,`
			`question_generator: QuestionGenerator,`
			`tmp_path: Path,`
			`):`
			`document_store.write_documents(DOCS_WITH_EMBEDDINGS)`
			`psg = PseudoLabelGenerator(question_generator, retriever)`
			`train_examples = []`

			`output, pipe_id = psg.run_batch(documents=document_store.get_all_documents())`
			`assert "gpl_labels" in output`
			`for item in output["gpl_labels"]:`
			`assert "question" in item and "pos_doc" in item and "neg_doc" in item and "score" in item`
			`train_examples.append(item)`
Add Generative Pseudo Labeling (#2388) 2022-06-02 16:12:47 +02:00
			`assert len(train_examples) > 0`


			`@pytest.mark.generator`
[CI Refactoring] Workflow refactoring (#2576) * Unify CI tests (from #2466) * Update Documentation & Code Style * Change folder names * Fix markers list * Remove marker 'slow', replaced with 'integration' * Soften children check * Start ES first so it has time to boot while Python is setup * Run the full workflow * Try to make pip upgrade on Windows * Set KG tests as integration * Update Documentation & Code Style * typo * faster pylint * Make Pylint use the cache * filter diff files for pylint * debug pylint statement * revert pylint changes * Remove path from asserted log (fails on Windows) * Skip preprocessor test on Windows * Tackling Windows specific failures * Fix pytest command for windows suites * Remove \ from command * Move poppler test into integration * Skip opensearch test on windows * Add tolerance in reader sas score for Windows * Another pytorch approx * Raise time limit for unit tests :( * Skip poppler test on Windows CI * Specify to pull with FF only in docs check * temporarily run the docs check immediately * Allow merge commit for now * Try without fetch depth * Accelerating test * Accelerating test * Add repository and ref alongside fetch-depth * Separate out code&docs check from tests * Use setup-python cache * Delete custom action * Remove the pull step in the docs check, will find a way to run on bot commits * Add requirements.txt in .github for caching * Actually install dependencies * Change deps group for pylint * Unclear why the requirements.txt is still required :/ * Fix the code check python setup * Install all deps for pylint * Make the autoformat check depend on tests and doc updates workflows * Try installing dependencies in another order * Try again to install the deps * quoting the paths * Ad back the requirements * Try again to install rest_api and ui * Change deps group * Duplicate haystack install line * See if the cache is the problem * Disable also in mypy, who knows * split the install step * Split install step everywhere * 
Revert "Separate out code&docs check from tests" This reverts commit 1cd59b15ffc5b984e1d642dcbf4c8ccc2bb6c9bd. * Add back the action * Proactive support for audio (see text2speech branch) * Fix label generator tests * Remove install of libsndfile1 on win temporarily * exclude audio tests on win * install ffmpeg for integration tests Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> 2022-06-07 09:23:03 +02:00			`@pytest.mark.integration`
Add Generative Pseudo Labeling (#2388) 2022-06-02 16:12:47 +02:00			`@pytest.mark.parametrize("document_store", ["memory"], indirect=True)`
			`@pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True)`
			`def test_pseudo_label_generator_using_question_document_pairs(`
Add GPL API docs, unit tests update (#2634) * Update test_label_generator.py * GPL increase default batch size to 16 * GPL - API docs * GPL - split unit tests * Make devs aware of multilingual GPL * Create separate train/save test Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> 2022-06-10 05:25:28 -04:00			`document_store: BaseDocumentStore, retriever: EmbeddingRetriever, tmp_path: Path`
Add Generative Pseudo Labeling (#2388) 2022-06-02 16:12:47 +02:00			`):`
			`document_store.write_documents(DOCS_WITH_EMBEDDINGS)`
			`docs = [`
			`{`
			`"question": "What is the capital of Germany?",`
			`"document": "Berlin is the capital and largest city of Germany by both area and population.",`
			`},`
			`{`
			`"question": "What is the largest city in Germany by population and area?",`
			`"document": "Berlin is the capital and largest city of Germany by both area and population.",`
			`},`
			`]`
			`psg = PseudoLabelGenerator(docs, retriever)`
			`train_examples = []`
Add GPL API docs, unit tests update (#2634) * Update test_label_generator.py * GPL increase default batch size to 16 * GPL - API docs * GPL - split unit tests * Make devs aware of multilingual GPL * Create separate train/save test Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> 2022-06-10 05:25:28 -04:00			`output, pipe_id = psg.run(documents=document_store.get_all_documents())`
			`assert "gpl_labels" in output`
			`for item in output["gpl_labels"]:`
			`assert "question" in item and "pos_doc" in item and "neg_doc" in item and "score" in item`
			`train_examples.append(item)`
Add Generative Pseudo Labeling (#2388) 2022-06-02 16:12:47 +02:00
			`assert len(train_examples) > 0`

Add GPL API docs, unit tests update (#2634) * Update test_label_generator.py * GPL increase default batch size to 16 * GPL - API docs * GPL - split unit tests * Make devs aware of multilingual GPL * Create separate train/save test Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> 2022-06-10 05:25:28 -04:00
			`@pytest.mark.slow`
			`@pytest.mark.generator`
			`@pytest.mark.parametrize("document_store", ["memory"], indirect=True)`
			`@pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True)`
			`def test_pseudo_label_generator_using_question_document_pairs_batch(`
			`document_store: BaseDocumentStore, retriever: EmbeddingRetriever, tmp_path: Path`
			`):`
			`document_store.write_documents(DOCS_WITH_EMBEDDINGS)`
			`docs = [`
			`{`
			`"question": "What is the capital of Germany?",`
			`"document": "Berlin is the capital and largest city of Germany by both area and population.",`
			`},`
			`{`
			`"question": "What is the largest city in Germany by population and area?",`
			`"document": "Berlin is the capital and largest city of Germany by both area and population.",`
			`},`
			`]`
			`psg = PseudoLabelGenerator(docs, retriever)`
			`train_examples = []`

			`output, pipe_id = psg.run_batch(documents=document_store.get_all_documents())`
			`assert "gpl_labels" in output`
			`for item in output["gpl_labels"]:`
			`assert "question" in item and "pos_doc" in item and "neg_doc" in item and "score" in item`
			`train_examples.append(item)`

			`assert len(train_examples) > 0`


			`@pytest.mark.slow`
			`@pytest.mark.generator`
			`@pytest.mark.parametrize("document_store", ["memory"], indirect=True)`
			`@pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True)`
			`def test_training_and_save(retriever: EmbeddingRetriever, tmp_path: Path):`
			`train_examples = [`
			`{`
			`"question": "What is the capital of Germany?",`
			`"pos_doc": "Berlin is the capital and largest city of Germany by both area and population.",`
			`"neg_doc": "The capital of Germany is the city state of Berlin.",`
			`"score": -2.2788997,`
			`},`
			`{`
			`"question": "What is the largest city in Germany by population and area?",`
			`"pos_doc": "Berlin is the capital and largest city of Germany by both area and population.",`
			`"neg_doc": "The capital of Germany is the city state of Berlin.",`
			`"score": 7.0911007,`
			`},`
			`]`
Add Generative Pseudo Labeling (#2388) 2022-06-02 16:12:47 +02:00			`retriever.train(train_examples)`
			`retriever.save(tmp_path)`