haystack/test/nodes/test_label_generator.py
Sara Zan 54518ac790
[CI Refactoring] Refactor Document fixtures in tests (#2577)
* Refactor document fixtures

* Add embedding files

* Update Documentation & Code Style

* Indentation issue

* Update Documentation & Code Style

* Fix type conversion in conftest.py

* Update Documentation & Code Style

* mypy on sql.py

* mypy on crawler.py

* mypy on pinecone.py

* Adapt retriever tests

* Update Documentation & Code Style

* mypy on crawler.py

* Update Documentation & Code Style

* mypy on crawler.py again

* Update Documentation & Code Style

* mypy fix was too rough

* Fix some more tests

* Update Documentation & Code Style

* Skip meaningless test on FilterRetriever

* Make embedding values less specific

* Update Documentation & Code Style

* Use stable IDs in retriever tests that depend on it

* Remove needless fixtures

* docs_with_ids

* Update Documentation & Code Style

* Typo

* Fix retriever tests

* Fix reader tests

* Update Documentation & Code Style

* Workaround #2626

* Update Documentation & Code Style

* Fix label generator tests

* Reorder vectors

* remove print

* Update Documentation & Code Style

* Update Documentation & Code Style

* git tags leftover

* Update Documentation & Code Style

* fix last failing test

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
2022-06-10 18:22:48 +02:00

136 lines
5.2 KiB
Python

from typing import List
from pathlib import Path
import pytest
from haystack import Document
from haystack.document_stores import BaseDocumentStore
from haystack.nodes import QuestionGenerator, EmbeddingRetriever, PseudoLabelGenerator
@pytest.mark.generator
@pytest.mark.integration
@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
@pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True)
def test_pseudo_label_generator(
    document_store: BaseDocumentStore,
    retriever: EmbeddingRetriever,
    question_generator: QuestionGenerator,
    docs_with_true_emb: List[Document],
):
    """Check the labels produced by ``PseudoLabelGenerator.run``.

    The fixture documents are written to the document store, a
    ``PseudoLabelGenerator`` is built from the question generator and the
    retriever, and ``run`` is called on every stored document. The test
    passes when the output has a ``gpl_labels`` entry that holds at least one
    label and every label contains the ``question``, ``pos_doc``,
    ``neg_doc`` and ``score`` keys.
    """
    document_store.write_documents(docs_with_true_emb)
    label_generator = PseudoLabelGenerator(question_generator, retriever)

    # run() returns a pair; only the first element (the output dict) is inspected.
    result, _ = label_generator.run(documents=document_store.get_all_documents())
    assert "gpl_labels" in result

    required_keys = ("question", "pos_doc", "neg_doc", "score")
    collected_labels = []
    for label in result["gpl_labels"]:
        assert all(key in label for key in required_keys)
        collected_labels.append(label)

    # An empty label list would make the per-label checks vacuous.
    assert len(collected_labels) > 0
@pytest.mark.generator
@pytest.mark.integration
@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
@pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True)
def test_pseudo_label_generator_batch(
    document_store: BaseDocumentStore,
    retriever: EmbeddingRetriever,
    question_generator: QuestionGenerator,
    docs_with_true_emb: List[Document],
):
    """Check the labels produced by ``PseudoLabelGenerator.run_batch``.

    Same scenario as the single-run test, but the generator is driven
    through ``run_batch``. The output must expose a non-empty ``gpl_labels``
    collection whose entries all contain the ``question``, ``pos_doc``,
    ``neg_doc`` and ``score`` keys.
    """
    document_store.write_documents(docs_with_true_emb)
    label_generator = PseudoLabelGenerator(question_generator, retriever)

    # run_batch() returns a pair; only the first element (the output dict) is inspected.
    result, _ = label_generator.run_batch(documents=document_store.get_all_documents())
    assert "gpl_labels" in result

    required_keys = ("question", "pos_doc", "neg_doc", "score")
    collected_labels = []
    for label in result["gpl_labels"]:
        assert all(key in label for key in required_keys)
        collected_labels.append(label)

    # An empty label list would make the per-label checks vacuous.
    assert len(collected_labels) > 0
@pytest.mark.generator
@pytest.mark.integration
@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
@pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True)
def test_pseudo_label_generator_using_question_document_pairs(
    document_store: BaseDocumentStore, retriever: EmbeddingRetriever, docs_with_true_emb: List[Document]
):
    """Check ``PseudoLabelGenerator.run`` when given question/document pairs.

    Instead of a question generator, the ``PseudoLabelGenerator`` receives a
    list of dicts with ``question`` and ``document`` keys. After running it
    on every stored document, the output must contain a non-empty
    ``gpl_labels`` collection whose entries all carry the ``question``,
    ``pos_doc``, ``neg_doc`` and ``score`` keys.
    """
    document_store.write_documents(docs_with_true_emb)

    # Both questions refer to the same passage.
    berlin_passage = "Berlin is the capital and largest city of Germany by both area and population."
    question_doc_pairs = [
        {"question": "What is the capital of Germany?", "document": berlin_passage},
        {
            "question": "What is the largest city in Germany by population and area?",
            "document": berlin_passage,
        },
    ]
    label_generator = PseudoLabelGenerator(question_doc_pairs, retriever)

    # run() returns a pair; only the first element (the output dict) is inspected.
    result, _ = label_generator.run(documents=document_store.get_all_documents())
    assert "gpl_labels" in result

    required_keys = ("question", "pos_doc", "neg_doc", "score")
    collected_labels = []
    for label in result["gpl_labels"]:
        assert all(key in label for key in required_keys)
        collected_labels.append(label)

    # An empty label list would make the per-label checks vacuous.
    assert len(collected_labels) > 0
@pytest.mark.generator
@pytest.mark.integration
@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
@pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True)
def test_pseudo_label_generator_using_question_document_pairs_batch(
    document_store: BaseDocumentStore, retriever: EmbeddingRetriever, docs_with_true_emb: List[Document]
):
    """Check ``PseudoLabelGenerator.run_batch`` when given question/document pairs.

    The ``PseudoLabelGenerator`` is built from a list of dicts with
    ``question`` and ``document`` keys rather than a question generator, and
    is driven through ``run_batch``. The output must contain a non-empty
    ``gpl_labels`` collection whose entries all carry the ``question``,
    ``pos_doc``, ``neg_doc`` and ``score`` keys.
    """
    document_store.write_documents(docs_with_true_emb)

    # Both questions refer to the same passage.
    berlin_passage = "Berlin is the capital and largest city of Germany by both area and population."
    question_doc_pairs = [
        {"question": "What is the capital of Germany?", "document": berlin_passage},
        {
            "question": "What is the largest city in Germany by population and area?",
            "document": berlin_passage,
        },
    ]
    label_generator = PseudoLabelGenerator(question_doc_pairs, retriever)

    # run_batch() returns a pair; only the first element (the output dict) is inspected.
    result, _ = label_generator.run_batch(documents=document_store.get_all_documents())
    assert "gpl_labels" in result

    required_keys = ("question", "pos_doc", "neg_doc", "score")
    collected_labels = []
    for label in result["gpl_labels"]:
        assert all(key in label for key in required_keys)
        collected_labels.append(label)

    # An empty label list would make the per-label checks vacuous.
    assert len(collected_labels) > 0
@pytest.mark.generator
@pytest.mark.integration
@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
@pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True)
def test_training_and_save(retriever: EmbeddingRetriever, tmp_path: Path):
    """Train the retriever on two hand-written examples and save it to ``tmp_path``.

    Each example is a dict with ``question``, ``pos_doc``, ``neg_doc`` and
    ``score`` keys. The test makes no explicit assertions: it passes as long
    as neither ``retriever.train`` nor ``retriever.save`` raises.

    NOTE(review): ``document_store`` is parametrized although it is not an
    argument of this test; presumably the ``retriever`` fixture depends on
    it. Confirm against conftest.
    """
    positive_passage = "Berlin is the capital and largest city of Germany by both area and population."
    negative_passage = "The capital of Germany is the city state of Berlin."
    scored_questions = [
        ("What is the capital of Germany?", -2.2788997),
        ("What is the largest city in Germany by population and area?", 7.0911007),
    ]

    # Both examples share the same positive and negative passages.
    train_examples = [
        {
            "question": question,
            "pos_doc": positive_passage,
            "neg_doc": negative_passage,
            "score": score,
        }
        for question, score in scored_questions
    ]

    retriever.train(train_examples)
    retriever.save(tmp_path)