feat: add gradient accumulation in FARMReader (#2925)

* expose gradient accumulation to train function of FARMReader

* add documentation for gradient accumulation

* Update Documentation & Code Style

* doc string improvements

Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>

* doc string improvements

Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>

* doc string improvements

Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Julian Risch <julian.risch@deepset.ai>
Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>
MichelBartels 2022-08-08 18:42:21 +02:00 committed by GitHub
parent 82448efa4f
commit c91316e862
2 changed files with 20 additions and 3 deletions

View File

@@ -107,7 +107,7 @@ Additional information can be found here https://huggingface.co/transformers/mai
#### FARMReader.train
```python
def train(data_dir: str, train_filename: str, dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, devices: List[torch.device] = [], batch_size: int = 10, n_epochs: int = 2, learning_rate: float = 1e-5, max_seq_len: Optional[int] = None, warmup_proportion: float = 0.2, dev_split: float = 0, evaluate_every: int = 300, save_dir: Optional[str] = None, num_processes: Optional[int] = None, use_amp: str = None, checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, caching: bool = False, cache_path: Path = Path("cache/data_silo"))
def train(data_dir: str, train_filename: str, dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, devices: List[torch.device] = [], batch_size: int = 10, n_epochs: int = 2, learning_rate: float = 1e-5, max_seq_len: Optional[int] = None, warmup_proportion: float = 0.2, dev_split: float = 0, evaluate_every: int = 300, save_dir: Optional[str] = None, num_processes: Optional[int] = None, use_amp: str = None, checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, caching: bool = False, cache_path: Path = Path("cache/data_silo"), grad_acc_steps: int = 1)
```
Fine-tune a model on a QA dataset. Options:
@@ -156,6 +156,7 @@ checkpoint, a subdirectory with the name epoch_{epoch_num}_step_{step_num} is cr
- `caching`: Whether or not to use caching for the preprocessed dataset
- `cache_path`: Path to cache the preprocessed dataset
- `processor`: The processor to use for preprocessing. If None, the default SquadProcessor is used.
- `grad_acc_steps`: The number of batches over which to accumulate gradients before performing an optimizer step (weight update). This effectively multiplies the batch size by `grad_acc_steps` without increasing memory usage.
**Returns**:
@@ -166,7 +167,7 @@ None
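
A minimal usage sketch of the new parameter (the model name and data paths below are placeholders, not part of this commit): with `batch_size=5` and `grad_acc_steps=4`, gradients are accumulated over four batches, so each weight update sees an effective batch size of 20 while only five samples are held in memory at a time.

```python
from haystack.nodes import FARMReader

# Placeholder model and data paths -- adjust to your own setup.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

reader.train(
    data_dir="data/squad20",           # placeholder directory
    train_filename="train-v2.0.json",  # placeholder filename
    n_epochs=1,
    batch_size=5,
    grad_acc_steps=4,  # accumulate gradients over 4 batches per optimizer step
)
```
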
#### FARMReader.distil\_prediction\_layer\_from
```python
def distil_prediction_layer_from(teacher_model: "FARMReader", data_dir: str, train_filename: str, dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, devices: List[torch.device] = [], student_batch_size: int = 10, teacher_batch_size: Optional[int] = None, n_epochs: int = 2, learning_rate: float = 3e-5, max_seq_len: Optional[int] = None, warmup_proportion: float = 0.2, dev_split: float = 0, evaluate_every: int = 300, save_dir: Optional[str] = None, num_processes: Optional[int] = None, use_amp: str = None, checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, caching: bool = False, cache_path: Path = Path("cache/data_silo"), distillation_loss_weight: float = 0.5, distillation_loss: Union[str, Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = "kl_div", temperature: float = 1.0)
def distil_prediction_layer_from(teacher_model: "FARMReader", data_dir: str, train_filename: str, dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, devices: List[torch.device] = [], student_batch_size: int = 10, teacher_batch_size: Optional[int] = None, n_epochs: int = 2, learning_rate: float = 3e-5, max_seq_len: Optional[int] = None, warmup_proportion: float = 0.2, dev_split: float = 0, evaluate_every: int = 300, save_dir: Optional[str] = None, num_processes: Optional[int] = None, use_amp: str = None, checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, caching: bool = False, cache_path: Path = Path("cache/data_silo"), distillation_loss_weight: float = 0.5, distillation_loss: Union[str, Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = "kl_div", temperature: float = 1.0, grad_acc_steps: int = 1)
```
Fine-tune a model on a QA dataset using logit-based distillation. You need to provide a teacher model that is already finetuned on the dataset
@@ -234,6 +235,7 @@ checkpoint, a subdirectory with the name epoch_{epoch_num}_step_{step_num} is cr
- `tinybert_learning_rate`: Learning rate to use when training the student model with the TinyBERT loss function.
- `tinybert_train_filename`: Filename of training data to use when training the student model with the TinyBERT loss function. To best follow the original paper, this should be an augmented version of the training data created using the augment_squad.py script. If not specified, the training data from the original training is used.
- `processor`: The processor to use for preprocessing. If None, the default SquadProcessor is used.
- `grad_acc_steps`: The number of batches over which to accumulate gradients before performing an optimizer step (weight update). This effectively multiplies the batch size by `grad_acc_steps` without increasing memory usage.
**Returns**:
@@ -244,7 +246,7 @@ None
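
The same parameter is available for prediction-layer distillation. An illustrative sketch (teacher and student model names are placeholders; the teacher must already be fine-tuned on the target QA dataset):

```python
from haystack.nodes import FARMReader

# Placeholder models -- any fine-tuned QA teacher and smaller student will do.
teacher = FARMReader(model_name_or_path="deepset/roberta-large-squad2")
student = FARMReader(model_name_or_path="prajjwal1/bert-medium")

student.distil_prediction_layer_from(
    teacher_model=teacher,
    data_dir="data/squad20",           # placeholder directory
    train_filename="train-v2.0.json",  # placeholder filename
    student_batch_size=5,
    grad_acc_steps=4,  # accumulate gradients over 4 student batches per optimizer step
)
```
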
#### FARMReader.distil\_intermediate\_layers\_from
```python
def distil_intermediate_layers_from(teacher_model: "FARMReader", data_dir: str, train_filename: str, dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, devices: List[torch.device] = [], batch_size: int = 10, n_epochs: int = 5, learning_rate: float = 5e-5, max_seq_len: Optional[int] = None, warmup_proportion: float = 0.2, dev_split: float = 0, evaluate_every: int = 300, save_dir: Optional[str] = None, num_processes: Optional[int] = None, use_amp: str = None, checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, caching: bool = False, cache_path: Path = Path("cache/data_silo"), distillation_loss: Union[str, Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = "mse", temperature: float = 1.0, processor: Optional[Processor] = None)
def distil_intermediate_layers_from(teacher_model: "FARMReader", data_dir: str, train_filename: str, dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, devices: List[torch.device] = [], batch_size: int = 10, n_epochs: int = 5, learning_rate: float = 5e-5, max_seq_len: Optional[int] = None, warmup_proportion: float = 0.2, dev_split: float = 0, evaluate_every: int = 300, save_dir: Optional[str] = None, num_processes: Optional[int] = None, use_amp: str = None, checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, caching: bool = False, cache_path: Path = Path("cache/data_silo"), distillation_loss: Union[str, Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = "mse", temperature: float = 1.0, processor: Optional[Processor] = None, grad_acc_steps: int = 1)
```
The first stage of distillation finetuning as described in the TinyBERT paper:
@@ -304,6 +306,7 @@ checkpoint, a subdirectory with the name epoch_{epoch_num}_step_{step_num} is cr
- `distillation_loss`: Specifies how teacher and model logits should be compared. Can either be a string ("mse" for mean squared error or "kl_div" for kl divergence loss) or a callable loss function (needs to have named parameters student_logits and teacher_logits)
- `temperature`: The temperature for distillation. A higher temperature will result in less certainty of teacher outputs. A lower temperature means more certainty. A temperature of 1.0 does not change the certainty of the model.
- `processor`: The processor to use for preprocessing. If None, the default SquadProcessor is used.
- `grad_acc_steps`: The number of batches over which to accumulate gradients before performing an optimizer step (weight update). This effectively multiplies the batch size by `grad_acc_steps` without increasing memory usage.
**Returns**:

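For context, the code changes below simply pass `grad_acc_steps` through to Haystack's optimizer setup and Trainer. The accumulation pattern itself is roughly the following; this is an illustrative PyTorch sketch, not the actual Trainer implementation (which is not part of this diff):

```python
import torch

def train_epoch(model: torch.nn.Module,
                data_loader: torch.utils.data.DataLoader,
                optimizer: torch.optim.Optimizer,
                grad_acc_steps: int = 1):
    """Backpropagate every batch, but step the optimizer only every
    grad_acc_steps batches, mimicking a grad_acc_steps-times larger batch size."""
    model.train()
    optimizer.zero_grad()
    for step, batch in enumerate(data_loader, start=1):
        loss = model(**batch)               # assumes the model returns a scalar loss
        (loss / grad_acc_steps).backward()  # scale so the accumulated gradient matches one big batch
        if step % grad_acc_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
```
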
View File

@@ -187,6 +187,7 @@ class FARMReader(BaseReader):
temperature: float = 1.0,
tinybert: bool = False,
processor: Optional[Processor] = None,
grad_acc_steps: int = 1,
):
if dev_filename:
dev_split = 0
@@ -263,6 +264,7 @@ class FARMReader(BaseReader):
n_epochs=n_epochs,
device=devices[0],
use_amp=use_amp,
grad_acc_steps=grad_acc_steps,
)
# 4. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
if tinybert:
@@ -283,6 +285,7 @@ class FARMReader(BaseReader):
checkpoint_root_dir=Path(checkpoint_root_dir),
checkpoint_every=checkpoint_every,
checkpoints_to_keep=checkpoints_to_keep,
grad_acc_steps=grad_acc_steps,
)
elif (
@@ -305,6 +308,7 @@ class FARMReader(BaseReader):
distillation_loss=distillation_loss,
distillation_loss_weight=distillation_loss_weight,
temperature=temperature,
grad_acc_steps=grad_acc_steps,
)
else:
trainer = Trainer.create_or_load_checkpoint(
@@ -321,6 +325,7 @@ class FARMReader(BaseReader):
checkpoint_root_dir=Path(checkpoint_root_dir),
checkpoint_every=checkpoint_every,
checkpoints_to_keep=checkpoints_to_keep,
grad_acc_steps=grad_acc_steps,
)
# 5. Let it grow!
@@ -350,6 +355,7 @@ class FARMReader(BaseReader):
checkpoints_to_keep: int = 3,
caching: bool = False,
cache_path: Path = Path("cache/data_silo"),
grad_acc_steps: int = 1,
):
"""
Fine-tune a model on a QA dataset. Options:
@@ -395,6 +401,7 @@ class FARMReader(BaseReader):
:param caching: Whether or not to use caching for the preprocessed dataset
:param cache_path: Path to cache the preprocessed dataset
:param processor: The processor to use for preprocessing. If None, the default SquadProcessor is used.
:param grad_acc_steps: The number of batches over which to accumulate gradients before performing an optimizer step (weight update). This effectively multiplies the batch size by grad_acc_steps without increasing memory usage.
:return: None
"""
return self._training_procedure(
@@ -419,6 +426,7 @@ class FARMReader(BaseReader):
checkpoints_to_keep=checkpoints_to_keep,
caching=caching,
cache_path=cache_path,
grad_acc_steps=grad_acc_steps,
)
def distil_prediction_layer_from(
@@ -449,6 +457,7 @@ class FARMReader(BaseReader):
distillation_loss_weight: float = 0.5,
distillation_loss: Union[str, Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = "kl_div",
temperature: float = 1.0,
grad_acc_steps: int = 1,
):
"""
Fine-tune a model on a QA dataset using logit-based distillation. You need to provide a teacher model that is already finetuned on the dataset
@@ -513,6 +522,7 @@ class FARMReader(BaseReader):
:param tinybert_learning_rate: Learning rate to use when training the student model with the TinyBERT loss function.
:param tinybert_train_filename: Filename of training data to use when training the student model with the TinyBERT loss function. To best follow the original paper, this should be an augmented version of the training data created using the augment_squad.py script. If not specified, the training data from the original training is used.
:param processor: The processor to use for preprocessing. If None, the default SquadProcessor is used.
:param grad_acc_steps: The number of batches over which to accumulate gradients before performing an optimizer step (weight update). This effectively multiplies the batch size by grad_acc_steps without increasing memory usage.
:return: None
"""
return self._training_procedure(
@@ -542,6 +552,7 @@ class FARMReader(BaseReader):
distillation_loss_weight=distillation_loss_weight,
distillation_loss=distillation_loss,
temperature=temperature,
grad_acc_steps=grad_acc_steps,
)
def distil_intermediate_layers_from(
@@ -571,6 +582,7 @@ class FARMReader(BaseReader):
distillation_loss: Union[str, Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = "mse",
temperature: float = 1.0,
processor: Optional[Processor] = None,
grad_acc_steps: int = 1,
):
"""
The first stage of distillation finetuning as described in the TinyBERT paper:
@@ -627,6 +639,7 @@ class FARMReader(BaseReader):
:param distillation_loss: Specifies how teacher and model logits should be compared. Can either be a string ("mse" for mean squared error or "kl_div" for kl divergence loss) or a callable loss function (needs to have named parameters student_logits and teacher_logits)
:param temperature: The temperature for distillation. A higher temperature will result in less certainty of teacher outputs. A lower temperature means more certainty. A temperature of 1.0 does not change the certainty of the model.
:param processor: The processor to use for preprocessing. If None, the default SquadProcessor is used.
:param grad_acc_steps: The number of batches over which to accumulate gradients before performing an optimizer step (weight update). This effectively multiplies the batch size by grad_acc_steps without increasing memory usage.
:return: None
"""
return self._training_procedure(
@@ -657,6 +670,7 @@ class FARMReader(BaseReader):
temperature=temperature,
tinybert=True,
processor=processor,
grad_acc_steps=grad_acc_steps,
)
def update_parameters(