diff --git a/docs/_src/tutorials/tutorials/2.md b/docs/_src/tutorials/tutorials/2.md index 7075bffd3..48854daac 100644 --- a/docs/_src/tutorials/tutorials/2.md +++ b/docs/_src/tutorials/tutorials/2.md @@ -92,6 +92,44 @@ reader.save(directory="my_model") new_reader = FARMReader(model_name_or_path="my_model") ``` +## Distill your model +In this case, we have used "distilbert-base-uncased" as our base model. This model was trained using a process called distillation. In this process, a bigger model is trained first and is then used to train a smaller model, which increases the smaller model's accuracy. This is why "distilbert-base-uncased" can achieve quite competitive performance while being very small. + +Sometimes, however, you can't use an already distilled model and have to distil one yourself. For this case, Haystack provides [distillation features](https://haystack.deepset.ai/guides/model-distillation). + +### Augmenting your training data +To get the most out of model distillation, we recommend increasing the size of your training data by using data augmentation. You can do this by running the [`augment_squad.py` script](https://github.com/deepset-ai/haystack/blob/master/haystack/utils/augment_squad.py): + + +```python +# Downloading script +!wget https://raw.githubusercontent.com/deepset-ai/haystack/master/haystack/utils/augment_squad.py +# Just replace the path with your dataset and adjust the output path +!python augment_squad.py --squad_path data/squad20/dev-v2.0.json --output_path augmented_dataset.json --multiplication_factor 2 +``` + +In this case, we use a multiplication factor of 2 to keep this example lightweight. Usually you would use a factor like 20 depending on the size of your training data. Augmenting this small dataset with a multiplication factor of 2 should take about 5 to 10 minutes to run on one V100 GPU. 
+ +### Running distillation +Distillation in Haystack is done in two steps: First, you run intermediate layer distillation on the augmented dataset to ensure the two models behave similarly. After that, you run the prediction layer distillation on the non-augmented dataset to optimize the model for your specific task. + +If you want, you can leave out the intermediate layer distillation step and only run the prediction layer distillation. This way you also do not need to perform data augmentation. However, this will make the model significantly less accurate. + + +```python +# Loading a fine-tuned model as teacher, e.g. "deepset/bert-base-uncased-squad2" +teacher = FARMReader(model_name_or_path="my_model", use_gpu=True) + +# You can use any pre-trained language model as student that uses the same tokenizer as the teacher model. +# The number of layers in the teacher model also needs to be a multiple of the number of layers in the student. +student = FARMReader(model_name_or_path="huawei-noah/TinyBERT_General_6L_768D", use_gpu=True) + +student.distil_intermediate_layers_from(teacher, data_dir=".", train_filename="augmented_dataset.json", use_gpu=True)  # augment_squad.py wrote this file to the working directory +student.distil_prediction_layer_from(teacher, data_dir="data/squad20", train_filename="dev-v2.0.json", use_gpu=True) + +student.save(directory="my_distilled_model") +``` + ## About us This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany diff --git a/haystack/utils/augment_squad.py b/haystack/utils/augment_squad.py index 3cfdd1083..b2026e093 100644 --- a/haystack/utils/augment_squad.py +++ b/haystack/utils/augment_squad.py @@ -177,7 +177,7 @@ def augment(word_id_mapping: dict, id_word_mapping: dict, vectors: np.ndarray, m new_texts.append(" ".join(new_text)) return new_texts -def augment_squad(model: str, tokenizer: str, squad_path: Path, output_path: Path, +def augment_squad(squad_path: Path, output_path: Path, 
model: str = "bert-base-uncased", tokenizer: str = "bert-base-uncased", glove_path: Path = Path("glove.txt"), multiplication_factor: int = 20, word_possibilities: int = 20, replace_probability: float = 0.4, device: str = "cpu:0", batch_size: int = 16): """Loads a squad dataset, augments the contexts, and saves the result in SQuAD format.""" diff --git a/test/test_utils.py b/test/test_utils.py index c82296eef..654d15917 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -21,8 +21,8 @@ def test_squad_augmentation(): output = Path("samples/squad/tiny_augmented.json") glove_path = Path("samples/glove/tiny.txt") # dummy glove file, will not even be use when augmenting tiny.json multiplication_factor = 5 - augment_squad("distilbert-base-uncased", "distilbert-base-uncased", input_, output, - glove_path, multiplication_factor=multiplication_factor) + augment_squad(model="distilbert-base-uncased", tokenizer="distilbert-base-uncased", squad_path=input_, output_path=output, + glove_path=glove_path, multiplication_factor=multiplication_factor) original_squad = SquadData.from_file(input_) augmented_squad = SquadData.from_file(output) - assert original_squad.count(unit="paragraph") == augmented_squad.count(unit="paragraph") * multiplication_factor \ No newline at end of file + assert original_squad.count(unit="paragraph") == augmented_squad.count(unit="paragraph") * multiplication_factor diff --git a/tutorials/Tutorial2_Finetune_a_model_on_your_data.ipynb b/tutorials/Tutorial2_Finetune_a_model_on_your_data.ipynb index de007121e..1a86a22e6 100644 --- a/tutorials/Tutorial2_Finetune_a_model_on_your_data.ipynb +++ b/tutorials/Tutorial2_Finetune_a_model_on_your_data.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "metadata": {}, "source": [ "# Fine-tuning a Model on Your Own Data\n", "\n", @@ -12,11 +13,13 @@ "While this varies by domain, we saw that ~ 2000 examples can easily increase performance by +5-20%.\n", "\n", "This tutorial shows you how to fine-tune a 
pretrained model on your own dataset." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": { + "collapsed": false + }, "source": [ "### Prepare environment\n", "\n", @@ -25,29 +28,28 @@ "**Runtime -> Change Runtime type -> Hardware accelerator -> GPU**\n", "\n", "" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, - "source": [ - "# Make sure you have a GPU running\n", - "!nvidia-smi" - ], - "outputs": [], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } - } + }, + "outputs": [], + "source": [ + "# Make sure you have a GPU running\n", + "!nvidia-smi" + ] }, { "cell_type": "code", "execution_count": 1, + "metadata": {}, + "outputs": [], "source": [ "# Install the latest release of Haystack in your own environment \n", "#! pip install farm-haystack\n", @@ -58,26 +60,25 @@ "\n", "# If you run this notebook on Google Colab, you might need to\n", "# restart the runtime after installing haystack." - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, - "source": [ - "from haystack.nodes import FARMReader" - ], - "outputs": [], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } - } + }, + "outputs": [], + "source": [ + "from haystack.nodes import FARMReader" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "\n", "## Create Training Data\n", @@ -100,22 +101,20 @@ "**Recommendation**: Run training on a GPU.\n", "If you are using Colab: Enable this in the menu \"Runtime\" > \"Change Runtime type\" > Select \"GPU\" in dropdown.\n", "Then change the `use_gpu` arguments below to `True`" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 2, - "source": [ - "reader = FARMReader(model_name_or_path=\"distilbert-base-uncased-distilled-squad\", use_gpu=True)\n", - "data_dir = \"data/squad20\"\n", - "# data_dir = \"PATH/TO_YOUR/TRAIN_DATA\" \n", - "reader.train(data_dir=data_dir, train_filename=\"dev-v2.0.json\", 
use_gpu=True, n_epochs=1, save_dir=\"my_model\")" - ], + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "04/28/2020 14:39:27 - INFO - farm.utils - device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None\n", "04/28/2020 14:39:27 - INFO - farm.infer - Could not find `distilbert-base-uncased-distilled-squad` locally. Try to download from model hub ...\n", @@ -130,45 +129,108 @@ ] } ], - "metadata": { - "pycharm": { - "name": "#%%\n" - } - } + "source": [ + "reader = FARMReader(model_name_or_path=\"distilbert-base-uncased-distilled-squad\", use_gpu=True)\n", + "data_dir = \"data/squad20\"\n", + "# data_dir = \"PATH/TO_YOUR/TRAIN_DATA\" \n", + "reader.train(data_dir=data_dir, train_filename=\"dev-v2.0.json\", use_gpu=True, n_epochs=1, save_dir=\"my_model\")" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], "source": [ "# Saving the model happens automatically at the end of training into the `save_dir` you specified\n", "# However, you could also save a reader manually again via:\n", "reader.save(directory=\"my_model\")" - ], - "outputs": [], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + ] }, { "cell_type": "code", "execution_count": null, - "source": [ - "# If you want to load it at a later point, just do:\n", - "new_reader = FARMReader(model_name_or_path=\"my_model\")" - ], - "outputs": [], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } - } + }, + "outputs": [], + "source": [ + "# If you want to load it at a later point, just do:\n", + "new_reader = FARMReader(model_name_or_path=\"my_model\")" + ] }, { "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Distill your model\n", + "In this case, we have used \"distilbert-base-uncased\" as our base model. 
This model was trained using a process called distillation. In this process, a bigger model is trained first and is then used to train a smaller model, which increases the smaller model's accuracy. This is why \"distilbert-base-uncased\" can achieve quite competitive performance while being very small.\n", + "\n", + "Sometimes, however, you can't use an already distilled model and have to distil one yourself. For this case, Haystack provides [distillation features](https://haystack.deepset.ai/guides/model-distillation).\n", + "\n", + "### Augmenting your training data\n", + "To get the most out of model distillation, we recommend increasing the size of your training data by using data augmentation. You can do this by running the [`augment_squad.py` script](https://github.com/deepset-ai/haystack/blob/master/haystack/utils/augment_squad.py):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Downloading script\n", + "!wget https://raw.githubusercontent.com/deepset-ai/haystack/master/haystack/utils/augment_squad.py\n", + "# Just replace the path with your dataset and adjust the output path\n", + "!python augment_squad.py --squad_path data/squad20/dev-v2.0.json --output_path augmented_dataset.json --multiplication_factor 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this case, we use a multiplication factor of 2 to keep this example lightweight. Usually you would use a factor like 20 depending on the size of your training data. Augmenting this small dataset with a multiplication factor of 2 should take about 5 to 10 minutes to run on one V100 GPU.\n", + "\n", + "### Running distillation\n", + "Distillation in Haystack is done in two steps: First, you run intermediate layer distillation on the augmented dataset to ensure the two models behave similarly. 
After that, you run the prediction layer distillation on the non-augmented dataset to optimize the model for your specific task.\n", + "\n", + "If you want, you can leave out the intermediate layer distillation step and only run the prediction layer distillation. This way you also do not need to perform data augmentation. However, this will make the model significantly less accurate." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Loading a fine-tuned model as teacher, e.g. \"deepset/bert-base-uncased-squad2\"\n", + "teacher = FARMReader(model_name_or_path=\"my_model\", use_gpu=True)\n", + "\n", + "# You can use any pre-trained language model as student that uses the same tokenizer as the teacher model.\n", + "# The number of layers in the teacher model also needs to be a multiple of the number of layers in the student.\n", + "student = FARMReader(model_name_or_path=\"huawei-noah/TinyBERT_General_6L_768D\", use_gpu=True)\n", + "\n", + "student.distil_intermediate_layers_from(teacher, data_dir=\".\", train_filename=\"augmented_dataset.json\", use_gpu=True)  # augment_squad.py wrote this file to the working directory\n", + "student.distil_prediction_layer_from(teacher, data_dir=\"data/squad20\", train_filename=\"dev-v2.0.json\", use_gpu=True)\n", + "\n", + "student.save(directory=\"my_distilled_model\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "## About us\n", "\n", @@ -186,13 +248,7 @@ "[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai)\n", "\n", "By the way: [we're hiring!](https://www.deepset.ai/jobs)" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } + ] } ], "metadata": { diff --git 
a/tutorials/Tutorial2_Finetune_a_model_on_your_data.py b/tutorials/Tutorial2_Finetune_a_model_on_your_data.py index 1add3f684..a5eab4ba1 100755 --- a/tutorials/Tutorial2_Finetune_a_model_on_your_data.py +++ b/tutorials/Tutorial2_Finetune_a_model_on_your_data.py @@ -8,6 +8,9 @@ # This tutorial shows you how to fine-tune a pretrained model on your own dataset. from haystack.nodes import FARMReader +from haystack.utils import augment_squad + +from pathlib import Path def tutorial2_finetune_a_model_on_your_data(): @@ -47,10 +50,52 @@ def tutorial2_finetune_a_model_on_your_data(): # If you want to load it at a later point, just do: new_reader = FARMReader(model_name_or_path="my_model") + # ## Distill your model + # In this case, we have used "distilbert-base-uncased" as our base model. + # This model was trained using a process called distillation. + # In this process, a bigger model is trained first and is then used to train a smaller model, which increases the smaller model's accuracy. + # This is why "distilbert-base-uncased" can achieve quite competitive performance while being very small. + # + # Sometimes, however, you can't use an already distilled model and have to distil one yourself. + # For this case, Haystack provides [distillation features](https://haystack.deepset.ai/guides/model-distillation). + # distil() + + +def distil(): + # ### Augmenting your training data + # To get the most out of model distillation, we recommend increasing the size of your training data by using data augmentation. + # You can do this by running the [`augment_squad.py` script](https://github.com/deepset-ai/haystack/blob/master/haystack/utils/augment_squad.py): + # # Just replace dataset.json with the name of your dataset and adjust the output path + augment_squad.main(squad_path=Path("dataset.json"), output_path=Path("augmented_dataset.json"), multiplication_factor=2) + # In this case, we use a multiplication factor of 2 to keep this example lightweight. 
+ # Usually you would use a factor like 20 depending on the size of your training data. + # Augmenting this small dataset with a multiplication factor of 2 should take about 5 to 10 minutes to run on one V100 GPU. + + + # ### Running distillation + # Distillation in Haystack is done in two steps: + # First, you run intermediate layer distillation on the augmented dataset to ensure the two models behave similarly. + # After that, you run the prediction layer distillation on the non-augmented dataset to optimize the model for your specific task. + + # If you want, you can leave out the intermediate layer distillation step and only run the prediction layer distillation. + # This way you also do not need to perform data augmentation. However, this will make the model significantly less accurate. + + # Loading a fine-tuned model as teacher, e.g. "deepset/bert-base-uncased-squad2" + teacher = FARMReader(model_name_or_path="my_model", use_gpu=True) + + # You can use any pre-trained language model as student that uses the same tokenizer as the teacher model. + # The number of layers in the teacher model also needs to be a multiple of the number of layers in the student. + student = FARMReader(model_name_or_path="huawei-noah/TinyBERT_General_6L_768D", use_gpu=True) + + student.distil_intermediate_layers_from(teacher, data_dir=".", train_filename="augmented_dataset.json", use_gpu=True)  # the augmentation step above wrote this file to the working directory + student.distil_prediction_layer_from(teacher, data_dir="data/squad20", train_filename="dev-v2.0.json", use_gpu=True) + + student.save(directory="my_distilled_model") + if __name__ == "__main__": tutorial2_finetune_a_model_on_your_data() # This Haystack script was made with love by deepset in Berlin, Germany # Haystack: https://github.com/deepset-ai/haystack -# deepset: https://deepset.ai/ \ No newline at end of file +# deepset: https://deepset.ai/