mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-08-20 22:48:45 +00:00
Add GPL API docs, unit tests update (#2634)
* Update test_label_generator.py * GPL increase default batch size to 16 * GPL - API docs * GPL - split unit tests * Make devs aware of multilingual GPL * Create separate train/save test Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
parent
f90649fab1
commit
b13c32eb9c
171
docs/_src/api/api/pseudo_label_generator.md
Normal file
171
docs/_src/api/api/pseudo_label_generator.md
Normal file
@ -0,0 +1,171 @@
|
|||||||
|
<a id="pseudo_label_generator"></a>
|
||||||
|
|
||||||
|
# Module pseudo\_label\_generator
|
||||||
|
|
||||||
|
<a id="pseudo_label_generator.PseudoLabelGenerator"></a>
|
||||||
|
|
||||||
|
## PseudoLabelGenerator
|
||||||
|
|
||||||
|
```python
|
||||||
|
class PseudoLabelGenerator(BaseComponent)
|
||||||
|
```
|
||||||
|
|
||||||
|
PseudoLabelGenerator is a component that creates Generative Pseudo Labeling (GPL) training data for the
|
||||||
|
training of dense retrievers.
|
||||||
|
|
||||||
|
GPL is an unsupervised domain adaptation method for the training of dense retrievers. It is based on question
|
||||||
|
generation and pseudo labeling with powerful cross-encoders. To train a domain-adapted model, it needs access
|
||||||
|
to an unlabeled target corpus, usually through a DocumentStore, and a Retriever to mine for negatives.
|
||||||
|
|
||||||
|
For more details, see [GPL](https://github.com/UKPLab/gpl).
|
||||||
|
|
||||||
|
For example:
|
||||||
|
|
||||||
|
|
||||||
|
```python
|
||||||
|
| document_store = DocumentStore(...)
|
||||||
|
| retriever = Retriever(...)
|
||||||
|
| qg = QuestionGenerator(model_name_or_path="doc2query/msmarco-t5-base-v1")
|
||||||
|
| psg = PseudoLabelGenerator(qg, retriever)
|
||||||
|
| output, output_id = psg.run(documents=document_store.get_all_documents())
|
||||||
|
|
|
||||||
|
```
|
||||||
|
|
||||||
|
**Notes**:
|
||||||
|
|
||||||
|
|
||||||
|
While the NLP researchers trained the default question
|
||||||
|
[generation](https://huggingface.co/doc2query/msmarco-t5-base-v1) and the cross
|
||||||
|
[encoder](https://huggingface.co/cross-encoder/ms-marco-MiniLM-L-6-v2) models on
|
||||||
|
the English language corpus, we can also use the language-specific question generation and
|
||||||
|
cross-encoder models in the target language of our choice to apply GPL to documents in languages
|
||||||
|
other than English.
|
||||||
|
|
||||||
|
As of this writing, the German language question
|
||||||
|
[generation](https://huggingface.co/ml6team/mt5-small-german-query-generation) and the cross
|
||||||
|
[encoder](https://huggingface.co/ml6team/cross-encoder-mmarco-german-distilbert-base) models are
|
||||||
|
already available, as well as question [generation](https://huggingface.co/doc2query/msmarco-14langs-mt5-base-v1)
|
||||||
|
and the cross [encoder](https://huggingface.co/cross-encoder/mmarco-mMiniLMv2-L12-H384-v1)
|
||||||
|
models trained on fourteen languages.
|
||||||
|
|
||||||
|
<a id="pseudo_label_generator.PseudoLabelGenerator.__init__"></a>
|
||||||
|
|
||||||
|
#### PseudoLabelGenerator.\_\_init\_\_
|
||||||
|
|
||||||
|
```python
|
||||||
|
def __init__(question_producer: Union[QuestionGenerator, List[Dict[str, str]]], retriever: BaseRetriever, cross_encoder_model_name_or_path: str = "cross-encoder/ms-marco-MiniLM-L-6-v2", max_questions_per_document: int = 3, top_k: int = 50, batch_size: int = 16, progress_bar: bool = True)
|
||||||
|
```
|
||||||
|
|
||||||
|
Loads the cross-encoder model and prepares PseudoLabelGenerator.
|
||||||
|
|
||||||
|
**Arguments**:
|
||||||
|
|
||||||
|
- `question_producer` (`Union[QuestionGenerator, List[Dict[str, str]]]`): The question producer used to generate questions or a list of already produced
|
||||||
|
questions/document pairs in a Dictionary format {"question": "question text ...", "document": "document text ..."}.
|
||||||
|
- `retriever` (`BaseRetriever`): The Retriever used to query document stores.
|
||||||
|
- `cross_encoder_model_name_or_path` (`str (optional)`): The path to the cross encoder model, defaults to
|
||||||
|
`cross-encoder/ms-marco-MiniLM-L-6-v2`.
|
||||||
|
- `max_questions_per_document` (`int`): The max number of questions generated per document, defaults to 3.
|
||||||
|
- `top_k` (`int (optional)`): The number of answers retrieved for each question, defaults to 50.
|
||||||
|
- `batch_size` (`int (optional)`): The number of documents to process at a time.
|
||||||
|
|
||||||
|
<a id="pseudo_label_generator.PseudoLabelGenerator.generate_questions"></a>
|
||||||
|
|
||||||
|
#### PseudoLabelGenerator.generate\_questions
|
||||||
|
|
||||||
|
```python
|
||||||
|
def generate_questions(documents: List[Document], batch_size: Optional[int] = None) -> List[Dict[str, str]]
|
||||||
|
```
|
||||||
|
|
||||||
|
It takes a list of documents and generates a list of question-document pairs.
|
||||||
|
|
||||||
|
**Arguments**:
|
||||||
|
|
||||||
|
- `documents` (`List[Document]`): A list of documents to generate questions from.
|
||||||
|
- `batch_size` (`Optional[int]`): The number of documents to process at a time.
|
||||||
|
|
||||||
|
**Returns**:
|
||||||
|
|
||||||
|
A list of question-document pairs.
|
||||||
|
|
||||||
|
<a id="pseudo_label_generator.PseudoLabelGenerator.mine_negatives"></a>
|
||||||
|
|
||||||
|
#### PseudoLabelGenerator.mine\_negatives
|
||||||
|
|
||||||
|
```python
|
||||||
|
def mine_negatives(question_doc_pairs: List[Dict[str, str]], batch_size: Optional[int] = None) -> List[Dict[str, str]]
|
||||||
|
```
|
||||||
|
|
||||||
|
Given a list of question and positive document pairs, this function returns a list of question/positive document/negative document
|
||||||
|
|
||||||
|
dictionaries.
|
||||||
|
|
||||||
|
**Arguments**:
|
||||||
|
|
||||||
|
- `question_doc_pairs` (`List[Dict[str, str]]`): A list of question/positive document pairs.
|
||||||
|
- `batch_size` (`int (optional)`): The number of queries to run in a batch.
|
||||||
|
|
||||||
|
**Returns**:
|
||||||
|
|
||||||
|
A list of dictionaries, where each dictionary contains the question, positive document,
|
||||||
|
and negative document.
|
||||||
|
|
||||||
|
<a id="pseudo_label_generator.PseudoLabelGenerator.generate_margin_scores"></a>
|
||||||
|
|
||||||
|
#### PseudoLabelGenerator.generate\_margin\_scores
|
||||||
|
|
||||||
|
```python
|
||||||
|
def generate_margin_scores(mined_negatives: List[Dict[str, str]], batch_size: Optional[int] = None) -> List[Dict]
|
||||||
|
```
|
||||||
|
|
||||||
|
Given a list of mined negatives, this function predicts the score margin between the positive and negative document using
|
||||||
|
|
||||||
|
the cross-encoder.
|
||||||
|
|
||||||
|
The function returns a list of examples, where each example is a dictionary with the following keys:
|
||||||
|
|
||||||
|
* question: The question string.
|
||||||
|
* pos_doc: Positive document string (the document containing the answer).
|
||||||
|
* neg_doc: Negative document string (the document that doesn't contain the answer).
|
||||||
|
* score: The margin between the score for question-positive document pair and the score for question-negative document pair.
|
||||||
|
|
||||||
|
**Arguments**:
|
||||||
|
|
||||||
|
- `mined_negatives` (`List[Dict[str, str]]`): The list of mined negatives.
|
||||||
|
- `batch_size` (`int (optional)`): The number of mined negative lists to run in a batch.
|
||||||
|
|
||||||
|
**Returns**:
|
||||||
|
|
||||||
|
A list of dictionaries, each of which has the following keys:
|
||||||
|
- question: The question string
|
||||||
|
- pos_doc: Positive document string
|
||||||
|
- neg_doc: Negative document string
|
||||||
|
- score: The score margin
|
||||||
|
|
||||||
|
<a id="pseudo_label_generator.PseudoLabelGenerator.generate_pseudo_labels"></a>
|
||||||
|
|
||||||
|
#### PseudoLabelGenerator.generate\_pseudo\_labels
|
||||||
|
|
||||||
|
```python
|
||||||
|
def generate_pseudo_labels(documents: List[Document], batch_size: Optional[int] = None) -> Tuple[dict, str]
|
||||||
|
```
|
||||||
|
|
||||||
|
Given a list of documents, this function generates a list of question-document pairs, mines for negatives, and
|
||||||
|
|
||||||
|
scores a positive/negative margin with cross-encoder. The output is the training data for the
|
||||||
|
adaptation of dense retriever models.
|
||||||
|
|
||||||
|
**Arguments**:
|
||||||
|
|
||||||
|
- `documents` (`List[Document]`): The list of documents to mine negatives from.
|
||||||
|
- `batch_size` (`Optional[int]`): The number of documents to process in a batch.
|
||||||
|
|
||||||
|
**Returns**:
|
||||||
|
|
||||||
|
A dictionary with a single key 'gpl_labels' representing a list of dictionaries, where each
|
||||||
|
dictionary contains the following keys:
|
||||||
|
- question: The question string.
|
||||||
|
- pos_doc: Positive document for the given question.
|
||||||
|
- neg_doc: Negative document for the given question.
|
||||||
|
- score: The margin between the score for question-positive document pair and the score for question-negative document pair.
|
||||||
|
|
20
docs/_src/api/pydoc/pseudo-label-generator.yml
Normal file
20
docs/_src/api/pydoc/pseudo-label-generator.yml
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
loaders:
|
||||||
|
- type: python
|
||||||
|
search_path: [../../../../haystack/nodes/label_generator]
|
||||||
|
modules: ['pseudo_label_generator']
|
||||||
|
ignore_when_discovered: ['__init__']
|
||||||
|
processors:
|
||||||
|
- type: filter
|
||||||
|
expression:
|
||||||
|
documented_only: true
|
||||||
|
do_not_filter_modules: false
|
||||||
|
skip_empty_modules: true
|
||||||
|
- type: smart
|
||||||
|
- type: crossref
|
||||||
|
renderer:
|
||||||
|
type: markdown
|
||||||
|
descriptive_class_title: false
|
||||||
|
descriptive_module_title: true
|
||||||
|
add_method_class_prefix: true
|
||||||
|
add_member_class_prefix: false
|
||||||
|
filename: pseudo_label_generator.md
|
@ -3256,7 +3256,7 @@
|
|||||||
},
|
},
|
||||||
"batch_size": {
|
"batch_size": {
|
||||||
"title": "Batch Size",
|
"title": "Batch Size",
|
||||||
"default": 4,
|
"default": 16,
|
||||||
"type": "integer"
|
"type": "integer"
|
||||||
},
|
},
|
||||||
"progress_bar": {
|
"progress_bar": {
|
||||||
|
@ -30,6 +30,24 @@ class PseudoLabelGenerator(BaseComponent):
|
|||||||
| output, output_id = psg.run(documents=document_store.get_all_documents())
|
| output, output_id = psg.run(documents=document_store.get_all_documents())
|
||||||
|
|
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Note:
|
||||||
|
|
||||||
|
While the NLP researchers trained the default question
|
||||||
|
[generation](https://huggingface.co/doc2query/msmarco-t5-base-v1) and the cross
|
||||||
|
[encoder](https://huggingface.co/cross-encoder/ms-marco-MiniLM-L-6-v2) models on
|
||||||
|
the English language corpus, we can also use the language-specific question generation and
|
||||||
|
cross-encoder models in the target language of our choice to apply GPL to documents in languages
|
||||||
|
other than English.
|
||||||
|
|
||||||
|
As of this writing, the German language question
|
||||||
|
[generation](https://huggingface.co/ml6team/mt5-small-german-query-generation) and the cross
|
||||||
|
[encoder](https://huggingface.co/ml6team/cross-encoder-mmarco-german-distilbert-base) models are
|
||||||
|
already available, as well as question [generation](https://huggingface.co/doc2query/msmarco-14langs-mt5-base-v1)
|
||||||
|
and the cross [encoder](https://huggingface.co/cross-encoder/mmarco-mMiniLMv2-L12-H384-v1)
|
||||||
|
models trained on fourteen languages.
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@ -39,7 +57,7 @@ class PseudoLabelGenerator(BaseComponent):
|
|||||||
cross_encoder_model_name_or_path: str = "cross-encoder/ms-marco-MiniLM-L-6-v2",
|
cross_encoder_model_name_or_path: str = "cross-encoder/ms-marco-MiniLM-L-6-v2",
|
||||||
max_questions_per_document: int = 3,
|
max_questions_per_document: int = 3,
|
||||||
top_k: int = 50,
|
top_k: int = 50,
|
||||||
batch_size: int = 4,
|
batch_size: int = 16,
|
||||||
progress_bar: bool = True,
|
progress_bar: bool = True,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
|
@ -2,6 +2,7 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from haystack.document_stores import BaseDocumentStore
|
||||||
from haystack.nodes import QuestionGenerator, EmbeddingRetriever, PseudoLabelGenerator
|
from haystack.nodes import QuestionGenerator, EmbeddingRetriever, PseudoLabelGenerator
|
||||||
from test.conftest import DOCS_WITH_EMBEDDINGS
|
from test.conftest import DOCS_WITH_EMBEDDINGS
|
||||||
|
|
||||||
@ -11,21 +12,44 @@ from test.conftest import DOCS_WITH_EMBEDDINGS
|
|||||||
@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
|
@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
|
||||||
@pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True)
|
@pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True)
|
||||||
def test_pseudo_label_generator(
|
def test_pseudo_label_generator(
|
||||||
document_store, retriever: EmbeddingRetriever, question_generator: QuestionGenerator, tmp_path: Path
|
document_store: BaseDocumentStore,
|
||||||
|
retriever: EmbeddingRetriever,
|
||||||
|
question_generator: QuestionGenerator,
|
||||||
|
tmp_path: Path,
|
||||||
):
|
):
|
||||||
document_store.write_documents(DOCS_WITH_EMBEDDINGS)
|
document_store.write_documents(DOCS_WITH_EMBEDDINGS)
|
||||||
psg = PseudoLabelGenerator(question_generator, retriever)
|
psg = PseudoLabelGenerator(question_generator, retriever)
|
||||||
train_examples = []
|
train_examples = []
|
||||||
for idx, doc in enumerate(document_store):
|
output, pipe_id = psg.run(documents=document_store.get_all_documents())
|
||||||
output, stream = psg.run(documents=[doc])
|
assert "gpl_labels" in output
|
||||||
assert "gpl_labels" in output
|
for item in output["gpl_labels"]:
|
||||||
for item in output["gpl_labels"]:
|
assert "question" in item and "pos_doc" in item and "neg_doc" in item and "score" in item
|
||||||
assert "question" in item and "pos_doc" in item and "neg_doc" in item and "score" in item
|
train_examples.append(item)
|
||||||
train_examples.append(item)
|
|
||||||
|
assert len(train_examples) > 0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.slow
|
||||||
|
@pytest.mark.generator
|
||||||
|
@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
|
||||||
|
@pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True)
|
||||||
|
def test_pseudo_label_generator_batch(
|
||||||
|
document_store: BaseDocumentStore,
|
||||||
|
retriever: EmbeddingRetriever,
|
||||||
|
question_generator: QuestionGenerator,
|
||||||
|
tmp_path: Path,
|
||||||
|
):
|
||||||
|
document_store.write_documents(DOCS_WITH_EMBEDDINGS)
|
||||||
|
psg = PseudoLabelGenerator(question_generator, retriever)
|
||||||
|
train_examples = []
|
||||||
|
|
||||||
|
output, pipe_id = psg.run_batch(documents=document_store.get_all_documents())
|
||||||
|
assert "gpl_labels" in output
|
||||||
|
for item in output["gpl_labels"]:
|
||||||
|
assert "question" in item and "pos_doc" in item and "neg_doc" in item and "score" in item
|
||||||
|
train_examples.append(item)
|
||||||
|
|
||||||
assert len(train_examples) > 0
|
assert len(train_examples) > 0
|
||||||
retriever.train(train_examples)
|
|
||||||
retriever.save(tmp_path)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.generator
|
@pytest.mark.generator
|
||||||
@ -33,7 +57,7 @@ def test_pseudo_label_generator(
|
|||||||
@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
|
@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
|
||||||
@pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True)
|
@pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True)
|
||||||
def test_pseudo_label_generator_using_question_document_pairs(
|
def test_pseudo_label_generator_using_question_document_pairs(
|
||||||
document_store, retriever: EmbeddingRetriever, tmp_path: Path
|
document_store: BaseDocumentStore, retriever: EmbeddingRetriever, tmp_path: Path
|
||||||
):
|
):
|
||||||
document_store.write_documents(DOCS_WITH_EMBEDDINGS)
|
document_store.write_documents(DOCS_WITH_EMBEDDINGS)
|
||||||
docs = [
|
docs = [
|
||||||
@ -48,15 +72,63 @@ def test_pseudo_label_generator_using_question_document_pairs(
|
|||||||
]
|
]
|
||||||
psg = PseudoLabelGenerator(docs, retriever)
|
psg = PseudoLabelGenerator(docs, retriever)
|
||||||
train_examples = []
|
train_examples = []
|
||||||
for idx, doc in enumerate(document_store):
|
output, pipe_id = psg.run(documents=document_store.get_all_documents())
|
||||||
# the documents passed here are ignored as we provided source documents in the constructor
|
assert "gpl_labels" in output
|
||||||
output, stream = psg.run(documents=[doc])
|
for item in output["gpl_labels"]:
|
||||||
assert "gpl_labels" in output
|
assert "question" in item and "pos_doc" in item and "neg_doc" in item and "score" in item
|
||||||
for item in output["gpl_labels"]:
|
train_examples.append(item)
|
||||||
assert "question" in item and "pos_doc" in item and "neg_doc" in item and "score" in item
|
|
||||||
train_examples.append(item)
|
|
||||||
|
|
||||||
assert len(train_examples) > 0
|
assert len(train_examples) > 0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.slow
|
||||||
|
@pytest.mark.generator
|
||||||
|
@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
|
||||||
|
@pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True)
|
||||||
|
def test_pseudo_label_generator_using_question_document_pairs_batch(
|
||||||
|
document_store: BaseDocumentStore, retriever: EmbeddingRetriever, tmp_path: Path
|
||||||
|
):
|
||||||
|
document_store.write_documents(DOCS_WITH_EMBEDDINGS)
|
||||||
|
docs = [
|
||||||
|
{
|
||||||
|
"question": "What is the capital of Germany?",
|
||||||
|
"document": "Berlin is the capital and largest city of Germany by both area and population.",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question": "What is the largest city in Germany by population and area?",
|
||||||
|
"document": "Berlin is the capital and largest city of Germany by both area and population.",
|
||||||
|
},
|
||||||
|
]
|
||||||
|
psg = PseudoLabelGenerator(docs, retriever)
|
||||||
|
train_examples = []
|
||||||
|
|
||||||
|
output, pipe_id = psg.run_batch(documents=document_store.get_all_documents())
|
||||||
|
assert "gpl_labels" in output
|
||||||
|
for item in output["gpl_labels"]:
|
||||||
|
assert "question" in item and "pos_doc" in item and "neg_doc" in item and "score" in item
|
||||||
|
train_examples.append(item)
|
||||||
|
|
||||||
|
assert len(train_examples) > 0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.slow
|
||||||
|
@pytest.mark.generator
|
||||||
|
@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
|
||||||
|
@pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True)
|
||||||
|
def test_training_and_save(retriever: EmbeddingRetriever, tmp_path: Path):
|
||||||
|
train_examples = [
|
||||||
|
{
|
||||||
|
"question": "What is the capital of Germany?",
|
||||||
|
"pos_doc": "Berlin is the capital and largest city of Germany by both area and population.",
|
||||||
|
"neg_doc": "The capital of Germany is the city state of Berlin.",
|
||||||
|
"score": -2.2788997,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question": "What is the largest city in Germany by population and area?",
|
||||||
|
"pos_doc": "Berlin is the capital and largest city of Germany by both area and population.",
|
||||||
|
"neg_doc": "The capital of Germany is the city state of Berlin.",
|
||||||
|
"score": 7.0911007,
|
||||||
|
},
|
||||||
|
]
|
||||||
retriever.train(train_examples)
|
retriever.train(train_examples)
|
||||||
retriever.save(tmp_path)
|
retriever.save(tmp_path)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user