diff --git a/docs/_src/api/api/pseudo_label_generator.md b/docs/_src/api/api/pseudo_label_generator.md
new file mode 100644
index 000000000..f4b970c01
--- /dev/null
+++ b/docs/_src/api/api/pseudo_label_generator.md
@@ -0,0 +1,171 @@

# Module pseudo\_label\_generator

## PseudoLabelGenerator

```python
class PseudoLabelGenerator(BaseComponent)
```

PseudoLabelGenerator is a component that creates Generative Pseudo Labeling (GPL) training data for the
training of dense retrievers.

GPL is an unsupervised domain adaptation method for training dense retrievers. It is based on question
generation and pseudo labeling with powerful cross-encoders. To train a domain-adapted model, it needs access
to an unlabeled target corpus, usually through a DocumentStore, and a Retriever to mine negatives.

For more details, see [GPL](https://github.com/UKPLab/gpl).

For example:


```python
| document_store = DocumentStore(...)
| retriever = Retriever(...)
| qg = QuestionGenerator(model_name_or_path="doc2query/msmarco-t5-base-v1")
| psg = PseudoLabelGenerator(qg, retriever)
| output, output_id = psg.run(documents=document_store.get_all_documents())
|
```

**Notes**:

The default question [generation](https://huggingface.co/doc2query/msmarco-t5-base-v1) and
cross-[encoder](https://huggingface.co/cross-encoder/ms-marco-MiniLM-L-6-v2) models were trained
on an English corpus. To apply GPL to documents in languages other than English, use question
generation and cross-encoder models trained on your target language.

As of this writing, German question [generation](https://huggingface.co/ml6team/mt5-small-german-query-generation)
and cross-[encoder](https://huggingface.co/ml6team/cross-encoder-mmarco-german-distilbert-base) models are
already available, as are question [generation](https://huggingface.co/doc2query/msmarco-14langs-mt5-base-v1)
and cross-[encoder](https://huggingface.co/cross-encoder/mmarco-mMiniLMv2-L12-H384-v1) models
trained on fourteen languages.



#### PseudoLabelGenerator.\_\_init\_\_

```python
def __init__(question_producer: Union[QuestionGenerator, List[Dict[str, str]]], retriever: BaseRetriever, cross_encoder_model_name_or_path: str = "cross-encoder/ms-marco-MiniLM-L-6-v2", max_questions_per_document: int = 3, top_k: int = 50, batch_size: int = 16, progress_bar: bool = True)
```

Loads the cross-encoder model and prepares PseudoLabelGenerator.

**Arguments**:

- `question_producer` (`Union[QuestionGenerator, List[Dict[str, str]]]`): The question producer used to generate questions, or a list of already produced
question/document pairs in the dictionary format {"question": "question text ...", "document": "document text ..."}.
- `retriever` (`BaseRetriever`): The Retriever used to query document stores.
- `cross_encoder_model_name_or_path` (`str (optional)`): The path to the cross-encoder model, defaults to
`cross-encoder/ms-marco-MiniLM-L-6-v2`.
- `max_questions_per_document` (`int`): The maximum number of questions generated per document, defaults to 3.
- `top_k` (`int (optional)`): The number of answers retrieved for each question, defaults to 50.
- `batch_size` (`int (optional)`): The number of documents to process at a time, defaults to 16.
- `progress_bar` (`bool (optional)`): Whether to show a progress bar, defaults to True.
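Because `question_producer` also accepts ready-made pairs, you can skip question generation entirely. A minimal sketch (assuming an already initialized `retriever`; the variable name `question_doc_pairs` and the pair shown are illustrative, mirroring the dictionary format described above):

```python
from haystack.nodes import PseudoLabelGenerator

# Illustrative pre-generated question/document pairs; any list of
# {"question": ..., "document": ...} dictionaries works as a question_producer.
question_doc_pairs = [
    {
        "question": "What is the capital of Germany?",
        "document": "Berlin is the capital and largest city of Germany by both area and population.",
    }
]

psg = PseudoLabelGenerator(question_producer=question_doc_pairs, retriever=retriever)
```

This variant is useful when you already have curated queries for your corpus; PseudoLabelGenerator then only mines negatives and scores margins.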


#### PseudoLabelGenerator.generate\_questions

```python
def generate_questions(documents: List[Document], batch_size: Optional[int] = None) -> List[Dict[str, str]]
```

Takes a list of documents and generates a list of question-document pairs.

**Arguments**:

- `documents` (`List[Document]`): A list of documents to generate questions from.
- `batch_size` (`Optional[int]`): The number of documents to process at a time.

**Returns**:

A list of question-document pairs.



#### PseudoLabelGenerator.mine\_negatives

```python
def mine_negatives(question_doc_pairs: List[Dict[str, str]], batch_size: Optional[int] = None) -> List[Dict[str, str]]
```

Given a list of question and positive document pairs, this function returns a list of question/positive document/negative document dictionaries.

**Arguments**:

- `question_doc_pairs` (`List[Dict[str, str]]`): A list of question/positive document pairs.
- `batch_size` (`int (optional)`): The number of queries to run in a batch.

**Returns**:

A list of dictionaries, where each dictionary contains the question, positive document,
and negative document.



#### PseudoLabelGenerator.generate\_margin\_scores

```python
def generate_margin_scores(mined_negatives: List[Dict[str, str]], batch_size: Optional[int] = None) -> List[Dict]
```

Given a list of mined negatives, this function predicts the score margin between the positive and negative document using the cross-encoder.

The function returns a list of examples, where each example is a dictionary with the following keys:

* question: The question string.
* pos_doc: Positive document string (the document containing the answer).
* neg_doc: Negative document string (the document that doesn't contain the answer).
* score: The margin between the score of the question-positive document pair and the score of the question-negative document pair.

**Arguments**:

- `mined_negatives` (`List[Dict[str, str]]`): The list of mined negatives.
- `batch_size` (`int (optional)`): The number of mined negatives to process in a batch.

**Returns**:

A list of dictionaries, each of which has the following keys:
- question: The question string
- pos_doc: Positive document string
- neg_doc: Negative document string
- score: The score margin



#### PseudoLabelGenerator.generate\_pseudo\_labels

```python
def generate_pseudo_labels(documents: List[Document], batch_size: Optional[int] = None) -> Tuple[dict, str]
```

Given a list of documents, this function generates a list of question-document pairs, mines for negatives, and
scores the positive/negative margin with the cross-encoder. The output is the training data for the
adaptation of dense retriever models.

**Arguments**:

- `documents` (`List[Document]`): The list of documents to mine negatives from.
- `batch_size` (`Optional[int]`): The number of documents to process in a batch.

**Returns**:

A dictionary with a single key 'gpl_labels' representing a list of dictionaries, where each
dictionary contains the following keys:
- question: The question string.
- pos_doc: Positive document for the given question.
- neg_doc: Negative document for the given question.
- score: The margin between the score of the question-positive document pair and the score of the question-negative document pair.
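To show how this output feeds retriever adaptation, here is a minimal end-to-end sketch (assuming `document_store`, `retriever`, and `psg` are set up as in the example at the top of this page; the save path is illustrative). The `retriever.train`/`retriever.save` calls mirror the usage in the accompanying tests:

```python
# Generate GPL training data: question generation -> negative mining -> margin scoring.
output, _ = psg.run(documents=document_store.get_all_documents())

# Each label is a dict with the keys: question, pos_doc, neg_doc, score.
train_examples = output["gpl_labels"]

# Adapt the dense retriever on the pseudo labels and persist it.
retriever.train(train_examples)
retriever.save("adapted_retriever")
```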
diff --git a/docs/_src/api/pydoc/pseudo-label-generator.yml b/docs/_src/api/pydoc/pseudo-label-generator.yml
new file mode 100644
index 000000000..f21f5a388
--- /dev/null
+++ b/docs/_src/api/pydoc/pseudo-label-generator.yml
@@ -0,0 +1,20 @@
+loaders:
+  - type: python
+    search_path: [../../../../haystack/nodes/label_generator]
+    modules: ['pseudo_label_generator']
+    ignore_when_discovered: ['__init__']
+processors:
+  - type: filter
+    expression:
+    documented_only: true
+    do_not_filter_modules: false
+    skip_empty_modules: true
+  - type: smart
+  - type: crossref
+renderer:
+  type: markdown
+  descriptive_class_title: false
+  descriptive_module_title: true
+  add_method_class_prefix: true
+  add_member_class_prefix: false
+  filename: pseudo_label_generator.md
diff --git a/haystack/json-schemas/haystack-pipeline-master.schema.json b/haystack/json-schemas/haystack-pipeline-master.schema.json
index 4cacfe7d2..33a713936 100644
--- a/haystack/json-schemas/haystack-pipeline-master.schema.json
+++ b/haystack/json-schemas/haystack-pipeline-master.schema.json
@@ -3256,7 +3256,7 @@
         },
         "batch_size": {
             "title": "Batch Size",
-            "default": 4,
+            "default": 16,
             "type": "integer"
         },
         "progress_bar": {
diff --git a/haystack/nodes/label_generator/pseudo_label_generator.py b/haystack/nodes/label_generator/pseudo_label_generator.py
index 1a33cd417..2f15d7c7d 100644
--- a/haystack/nodes/label_generator/pseudo_label_generator.py
+++ b/haystack/nodes/label_generator/pseudo_label_generator.py
@@ -30,6 +30,24 @@ class PseudoLabelGenerator(BaseComponent):
    | output, output_id = psg.run(documents=document_store.get_all_documents())
    |
    ```
+
+    Note:
+
+    The default question [generation](https://huggingface.co/doc2query/msmarco-t5-base-v1) and
+    cross-[encoder](https://huggingface.co/cross-encoder/ms-marco-MiniLM-L-6-v2) models were trained
+    on an English corpus. To apply GPL to documents in languages other than English, use question
+    generation and cross-encoder models trained on your target language.
+
+    As of this writing, German question [generation](https://huggingface.co/ml6team/mt5-small-german-query-generation)
+    and cross-[encoder](https://huggingface.co/ml6team/cross-encoder-mmarco-german-distilbert-base) models are
+    already available, as are question [generation](https://huggingface.co/doc2query/msmarco-14langs-mt5-base-v1)
+    and cross-[encoder](https://huggingface.co/cross-encoder/mmarco-mMiniLMv2-L12-H384-v1) models
+    trained on fourteen languages.
+
+
    """

    def __init__(
@@ -39,7 +57,7 @@ class PseudoLabelGenerator(BaseComponent):
        cross_encoder_model_name_or_path: str = "cross-encoder/ms-marco-MiniLM-L-6-v2",
        max_questions_per_document: int = 3,
        top_k: int = 50,
-        batch_size: int = 4,
+        batch_size: int = 16,
        progress_bar: bool = True,
    ):
        """
diff --git a/test/nodes/test_label_generator.py b/test/nodes/test_label_generator.py
index 6cf45a024..25d7fbe01 100644
--- a/test/nodes/test_label_generator.py
+++ b/test/nodes/test_label_generator.py
@@ -2,6 +2,7 @@ from pathlib import Path

 import pytest

+from haystack.document_stores import BaseDocumentStore
 from haystack.nodes import QuestionGenerator, EmbeddingRetriever, PseudoLabelGenerator
 from test.conftest import DOCS_WITH_EMBEDDINGS

@@ -11,21 +12,44 @@ from test.conftest import DOCS_WITH_EMBEDDINGS
 @pytest.mark.parametrize("document_store", ["memory"], indirect=True)
 @pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True)
 def test_pseudo_label_generator(
-    document_store, retriever: EmbeddingRetriever, question_generator: QuestionGenerator, tmp_path: Path
+    document_store: BaseDocumentStore,
+    retriever: EmbeddingRetriever,
+    question_generator: QuestionGenerator,
+    tmp_path: Path,
 ):
     document_store.write_documents(DOCS_WITH_EMBEDDINGS)
     psg = PseudoLabelGenerator(question_generator, retriever)
     train_examples = []

-    for idx, doc in enumerate(document_store):
-        output, stream = psg.run(documents=[doc])
-        assert "gpl_labels" in output
-        for item in output["gpl_labels"]:
-            assert "question" in item and "pos_doc" in item and "neg_doc" in item and "score" in item
-            train_examples.append(item)
+    output, pipe_id = psg.run(documents=document_store.get_all_documents())
+    assert "gpl_labels" in output
+    for item in output["gpl_labels"]:
+        assert "question" in item and "pos_doc" in item and "neg_doc" in item and "score" in item
+        train_examples.append(item)
+
+    assert len(train_examples) > 0
+
+
+@pytest.mark.slow
+@pytest.mark.generator
+@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
+@pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True)
+def test_pseudo_label_generator_batch(
+    document_store: BaseDocumentStore,
+    retriever: EmbeddingRetriever,
+    question_generator: QuestionGenerator,
+    tmp_path: Path,
+):
+    document_store.write_documents(DOCS_WITH_EMBEDDINGS)
+    psg = PseudoLabelGenerator(question_generator, retriever)
+    train_examples = []
+
+    output, pipe_id = psg.run_batch(documents=document_store.get_all_documents())
+    assert "gpl_labels" in output
+    for item in output["gpl_labels"]:
+        assert "question" in item and "pos_doc" in item and "neg_doc" in item and "score" in item
+        train_examples.append(item)

     assert len(train_examples) > 0
-    retriever.train(train_examples)
-    retriever.save(tmp_path)


 @pytest.mark.generator
@@ -33,7 +57,7 @@ def test_pseudo_label_generator(
 @pytest.mark.parametrize("document_store", ["memory"], indirect=True)
 @pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True)
 def test_pseudo_label_generator_using_question_document_pairs(
-    document_store, retriever: EmbeddingRetriever, tmp_path: Path
+    document_store: BaseDocumentStore, retriever: EmbeddingRetriever, tmp_path: Path
 ):
     document_store.write_documents(DOCS_WITH_EMBEDDINGS)
     docs = [
@@ -48,15 +72,63 @@ def test_pseudo_label_generator_using_question_document_pairs(
     ]
     psg = PseudoLabelGenerator(docs, retriever)
     train_examples = []
-    for idx, doc in enumerate(document_store):
-        # the documents passed here are ignored as we provided source
documents in the constructor - output, stream = psg.run(documents=[doc]) - assert "gpl_labels" in output - for item in output["gpl_labels"]: - assert "question" in item and "pos_doc" in item and "neg_doc" in item and "score" in item - train_examples.append(item) + output, pipe_id = psg.run(documents=document_store.get_all_documents()) + assert "gpl_labels" in output + for item in output["gpl_labels"]: + assert "question" in item and "pos_doc" in item and "neg_doc" in item and "score" in item + train_examples.append(item) assert len(train_examples) > 0 + +@pytest.mark.slow +@pytest.mark.generator +@pytest.mark.parametrize("document_store", ["memory"], indirect=True) +@pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True) +def test_pseudo_label_generator_using_question_document_pairs_batch( + document_store: BaseDocumentStore, retriever: EmbeddingRetriever, tmp_path: Path +): + document_store.write_documents(DOCS_WITH_EMBEDDINGS) + docs = [ + { + "question": "What is the capital of Germany?", + "document": "Berlin is the capital and largest city of Germany by both area and population.", + }, + { + "question": "What is the largest city in Germany by population and area?", + "document": "Berlin is the capital and largest city of Germany by both area and population.", + }, + ] + psg = PseudoLabelGenerator(docs, retriever) + train_examples = [] + + output, pipe_id = psg.run_batch(documents=document_store.get_all_documents()) + assert "gpl_labels" in output + for item in output["gpl_labels"]: + assert "question" in item and "pos_doc" in item and "neg_doc" in item and "score" in item + train_examples.append(item) + + assert len(train_examples) > 0 + + +@pytest.mark.slow +@pytest.mark.generator +@pytest.mark.parametrize("document_store", ["memory"], indirect=True) +@pytest.mark.parametrize("retriever", ["embedding_sbert"], indirect=True) +def test_training_and_save(retriever: EmbeddingRetriever, tmp_path: Path): + train_examples = [ + { + "question": "What is the capital of Germany?", + "pos_doc": "Berlin is the capital and largest city of Germany by both area and population.", + "neg_doc": "The capital of Germany is the city state of Berlin.", + "score": -2.2788997, + }, + { + "question": "What is the largest city in Germany by population and area?", + "pos_doc": "Berlin is the capital and largest city of Germany by both area and population.", + "neg_doc": "The capital of Germany is the city state of Berlin.", + "score": 7.0911007, + }, + ] retriever.train(train_examples) retriever.save(tmp_path)