diff --git a/docs/_src/api/api/ranker.md b/docs/_src/api/api/ranker.md index 6e7db33cd..e9de4491b 100644 --- a/docs/_src/api/api/ranker.md +++ b/docs/_src/api/api/ranker.md @@ -92,7 +92,7 @@ p.add_node(component=ranker, name="Ranker", inputs=["ESRetriever"]) #### \_\_init\_\_ ```python -def __init__(model_name_or_path: Union[str, Path], model_version: Optional[str] = None, top_k: int = 10, use_gpu: bool = True, devices: Optional[List[Union[int, str, torch.device]]] = None) +def __init__(model_name_or_path: Union[str, Path], model_version: Optional[str] = None, top_k: int = 10, use_gpu: bool = True, devices: Optional[List[Union[str, torch.device]]] = None) ``` **Arguments**: @@ -103,7 +103,10 @@ See https://huggingface.co/cross-encoder for full list of available models - `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. - `top_k`: The maximum number of documents to return - `use_gpu`: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available. -- `devices`: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. ["cuda:0"]). +- `devices`: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones +The strings will be converted into pytorch devices, so use the string notation described here: +https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device +(e.g. ["cuda:0"]). diff --git a/docs/_src/api/api/reader.md b/docs/_src/api/api/reader.md index bdc70da8d..8e280d260 100644 --- a/docs/_src/api/api/reader.md +++ b/docs/_src/api/api/reader.md @@ -398,7 +398,7 @@ Dict containing query and answers #### eval\_on\_file ```python -def eval_on_file(data_dir: str, test_filename: str, device: Optional[str] = None) +def eval_on_file(data_dir: Union[Path, str], test_filename: str, device: Optional[Union[str, torch.device]] = None) ``` Performs evaluation on a SQuAD-formatted file. @@ -410,16 +410,18 @@ Returns a dict containing the following metrics: **Arguments**: -- `data_dir` (`Path or str`): The directory in which the test set can be found -- `test_filename` (`str`): The name of the file containing the test data in SQuAD format. -- `device` (`str`): The device on which the tensors should be processed. Choose from "cpu" and "cuda" or use the Reader's device by default. +- `data_dir`: The directory in which the test set can be found +- `test_filename`: The name of the file containing the test data in SQuAD format. +- `device`: The device on which the tensors should be processed. +Choose from torch.device("cpu") and torch.device("cuda") (or simply "cpu" or "cuda") +or use the Reader's device by default. #### eval ```python -def eval(document_store: BaseDocumentStore, device: Optional[str] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold-label", calibrate_conf_scores: bool = False) +def eval(document_store: BaseDocumentStore, device: Optional[Union[str, torch.device]] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold-label", calibrate_conf_scores: bool = False) ``` Performs evaluation on evaluation documents in the DocumentStore. @@ -432,7 +434,9 @@ Returns a dict containing the following metrics: **Arguments**: - `document_store`: DocumentStore containing the evaluation documents -- `device`: The device on which the tensors should be processed. 
Choose from "cpu" and "cuda" or use the Reader's device by default. +- `device`: The device on which the tensors should be processed. +Choose from torch.device("cpu") and torch.device("cuda") (or simply "cpu" or "cuda") +or use the Reader's device by default. - `label_index`: Index/Table name where labeled questions are stored - `doc_index`: Index/Table name where documents that are used for evaluation are stored - `label_origin`: Field name where the gold labels are stored @@ -443,7 +447,7 @@ Returns a dict containing the following metrics: #### calibrate\_confidence\_scores ```python -def calibrate_confidence_scores(document_store: BaseDocumentStore, device: Optional[str] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold_label") +def calibrate_confidence_scores(document_store: BaseDocumentStore, device: Optional[Union[str, torch.device]] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold_label") ``` Calibrates confidence scores on evaluation documents in the DocumentStore. @@ -451,7 +455,9 @@ Calibrates confidence scores on evaluation documents in the DocumentStore. **Arguments**: - `document_store`: DocumentStore containing the evaluation documents -- `device`: The device on which the tensors should be processed. Choose from "cpu" and "cuda" or use the Reader's device by default. +- `device`: The device on which the tensors should be processed. +Choose from torch.device("cpu") and torch.device("cuda") (or simply "cpu" or "cuda") +or use the Reader's device by default. - `label_index`: Index/Table name where labeled questions are stored - `doc_index`: Index/Table name where documents that are used for evaluation are stored - `label_origin`: Field name where the gold labels are stored diff --git a/docs/_src/api/api/retriever.md b/docs/_src/api/api/retriever.md index 0a3e638b0..90e7db44f 100644 --- a/docs/_src/api/api/retriever.md +++ b/docs/_src/api/api/retriever.md @@ -312,7 +312,7 @@ Karpukhin, Vladimir, et al. 
(2020): "Dense Passage Retrieval for Open-Domain Que #### \_\_init\_\_ ```python -def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "facebook/dpr-question_encoder-single-nq-base", passage_embedding_model: Union[Path, str] = "facebook/dpr-ctx_encoder-single-nq-base", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, infer_tokenizer_classes: bool = False, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[int, str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None) +def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "facebook/dpr-question_encoder-single-nq-base", passage_embedding_model: Union[Path, str] = "facebook/dpr-ctx_encoder-single-nq-base", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, infer_tokenizer_classes: bool = False, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None) ``` Init the Retriever incl. the two encoder models from a local or remote model checkpoint. @@ -362,8 +362,11 @@ Options: `dot_product` (Default) or `cosine` Increase if errors like "encoded data exceeds max_size ..." come up - `progress_bar`: Whether to show a tqdm progress bar or not. Can be helpful to disable in production deployments to keep the logs clean. -- `devices`: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. ["cuda:0"]). -As multi-GPU training is currently not implemented for DPR, training will only use the first device provided in this list. +- `devices`: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones +These strings will be converted into pytorch devices, so use the string notation described here: +https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device +(e.g. ["cuda:0"]). Note: as multi-GPU training is currently not implemented for DPR, training +will only use the first device provided in this list. - `use_auth_token`: API token used to download private models from Huggingface. If this parameter is set to `True`, the local token will be used, which must be previously created via `transformer-cli login`. Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained @@ -520,7 +523,7 @@ Kostić, Bogdan, et al. 
(2021): "Multi-modal Retrieval of Tables and Texts Using #### \_\_init\_\_ ```python -def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-question_encoder", passage_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-passage_encoder", table_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-table_encoder", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, infer_tokenizer_classes: bool = False, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[int, str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None) +def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-question_encoder", passage_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-passage_encoder", table_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-table_encoder", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, infer_tokenizer_classes: bool = False, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None) ``` Init the Retriever incl. the two encoder models from a local or remote model checkpoint. @@ -556,8 +559,11 @@ Options: `dot_product` (Default) or `cosine` Increase if errors like "encoded data exceeds max_size ..." come up - `progress_bar`: Whether to show a tqdm progress bar or not. Can be helpful to disable in production deployments to keep the logs clean. -- `devices`: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. ["cuda:0"]). -As multi-GPU training is currently not implemented for DPR, training will only use the first device provided in this list. +- `devices`: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones +These strings will be converted into pytorch devices, so use the string notation described here: +https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device +(e.g. ["cuda:0"]). Note: as multi-GPU training is currently not implemented for TableTextRetriever, +training will only use the first device provided in this list. - `use_auth_token`: API token used to download private models from Huggingface. If this parameter is set to `True`, the local token will be used, which must be previously created via `transformer-cli login`. 
Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained @@ -695,7 +701,7 @@ class EmbeddingRetriever(BaseRetriever) #### \_\_init\_\_ ```python -def __init__(document_store: BaseDocumentStore, embedding_model: str, model_version: Optional[str] = None, use_gpu: bool = True, batch_size: int = 32, max_seq_len: int = 512, model_format: str = "farm", pooling_strategy: str = "reduce_mean", emb_extraction_layer: int = -1, top_k: int = 10, progress_bar: bool = True, devices: Optional[List[Union[int, str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None) +def __init__(document_store: BaseDocumentStore, embedding_model: str, model_version: Optional[str] = None, use_gpu: bool = True, batch_size: int = 32, max_seq_len: int = 512, model_format: str = "farm", pooling_strategy: str = "reduce_mean", emb_extraction_layer: int = -1, top_k: int = 10, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None) ``` **Arguments**: @@ -721,8 +727,11 @@ Options: Default: -1 (very last layer). - `top_k`: How many documents to return per query. - `progress_bar`: If true displays progress bar during embedding. -- `devices`: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. ["cuda:0"]). -As multi-GPU training is currently not implemented for DPR, training will only use the first device provided in this list. +- `devices`: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones +These strings will be converted into pytorch devices, so use the string notation described here: +https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device +(e.g. ["cuda:0"]). Note: As multi-GPU training is currently not implemented for EmbeddingRetriever, +training will only use the first device provided in this list. - `use_auth_token`: API token used to download private models from Huggingface. If this parameter is set to `True`, the local token will be used, which must be previously created via `transformer-cli login`. 
Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained diff --git a/haystack/modeling/conversion/transformers.py b/haystack/modeling/conversion/transformers.py index 1aa7d4645..80a7b71c6 100644 --- a/haystack/modeling/conversion/transformers.py +++ b/haystack/modeling/conversion/transformers.py @@ -1,11 +1,13 @@ import logging from typing import Union +import torch from transformers import AutoModelForQuestionAnswering from haystack.modeling.model import adaptive_model as am from haystack.modeling.model.language_model import LanguageModel from haystack.modeling.model.prediction_head import QuestionAnsweringHead +from haystack.modeling.data_handler.processor import Processor logger = logging.getLogger(__name__) @@ -46,10 +48,10 @@ class Converter: @staticmethod def convert_from_transformers( model_name_or_path, - device, - revision=None, - task_type=None, - processor=None, + device: Union[str, torch.device], + revision: str = None, + task_type: str = "question_answering", + processor: Processor = None, use_auth_token: Union[bool, str] = None, **kwargs, ): @@ -65,14 +67,10 @@ class Converter: - deepset/bert-large-uncased-whole-word-masking-squad2 See https://huggingface.co/models for full list - :param device: "cpu" or "cuda" + :param device: torch.device("cpu") or torch.device("cuda") :param revision: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. - :type revision: str - :param task_type: One of : - - 'question_answering' - More tasks coming soon ... - :param processor: populates prediction head with information coming from tasks - :type processor: Processor + Right now accepts only 'question_answering'. + :param processor: populates prediction head with information coming from tasks. :return: AdaptiveModel """ diff --git a/haystack/modeling/data_handler/data_silo.py b/haystack/modeling/data_handler/data_silo.py index ae2a51c52..e6f8eff3d 100644 --- a/haystack/modeling/data_handler/data_silo.py +++ b/haystack/modeling/data_handler/data_silo.py @@ -785,7 +785,7 @@ class DistillationDataSilo(DataSilo): self, teacher_model: "FARMReader", teacher_batch_size: int, - device: str, + device: torch.device, processor: Processor, batch_size: int, eval_batch_size: Optional[int] = None, diff --git a/haystack/modeling/evaluation/eval.py b/haystack/modeling/evaluation/eval.py index 6d757940e..1b92f2a8f 100644 --- a/haystack/modeling/evaluation/eval.py +++ b/haystack/modeling/evaluation/eval.py @@ -20,11 +20,11 @@ class Evaluator: Handles evaluation of a given model over a specified dataset. """ - def __init__(self, data_loader: torch.utils.data.DataLoader, tasks, device: str, report: bool = True): + def __init__(self, data_loader: torch.utils.data.DataLoader, tasks, device: torch.device, report: bool = True): """ :param data_loader: The PyTorch DataLoader that will return batches of data from the evaluation dataset :param tesks: - :param device: The device on which the tensors should be processed. Choose from "cpu" and "cuda". + :param device: The device on which the tensors should be processed. Choose from torch.device("cpu") and torch.device("cuda"). :param report: Whether an eval report should be generated (e.g. classification report per class). 
""" self.data_loader = data_loader diff --git a/haystack/modeling/infer.py b/haystack/modeling/infer.py index 7fc2bf5ea..fcd304339 100644 --- a/haystack/modeling/infer.py +++ b/haystack/modeling/infer.py @@ -128,7 +128,7 @@ class Inferencer: use_fast: bool = True, tokenizer_args: Dict = None, multithreading_rust: bool = True, - devices: Optional[List[Union[int, str, torch.device]]] = None, + devices: Optional[List[torch.device]] = None, use_auth_token: Union[bool, str] = None, **kwargs, ): diff --git a/haystack/modeling/model/adaptive_model.py b/haystack/modeling/model/adaptive_model.py index d1f4a6cbe..1bd885ce6 100644 --- a/haystack/modeling/model/adaptive_model.py +++ b/haystack/modeling/model/adaptive_model.py @@ -169,7 +169,7 @@ class AdaptiveModel(nn.Module, BaseAdaptiveModel): prediction_heads: List[PredictionHead], embeds_dropout_prob: float, lm_output_types: Union[str, List[str]], - device: str, + device: torch.device, loss_aggregation_fn: Optional[Callable] = None, ): """ @@ -182,7 +182,7 @@ class AdaptiveModel(nn.Module, BaseAdaptiveModel): "per_sequence", a single embedding will be extracted to represent the full input sequence. Can either be a single string, or a list of strings, one for each prediction head. - :param device: The device on which this model will operate. Either "cpu" or "cuda". + :param device: The device on which this model will operate. Either torch.device("cpu") or torch.device("cuda"). :param loss_aggregation_fn: Function to aggregate the loss of multiple prediction heads. Input: loss_per_head (list of tensors), global_step (int), batch (dict) Output: aggregated loss (tensor) @@ -258,13 +258,13 @@ class AdaptiveModel(nn.Module, BaseAdaptiveModel): # Need to save config and pipeline @classmethod - def load( # type: ignore + def load( # type: ignore cls, - load_dir: Union[str, Path], # type: ignore - device: str, # type: ignore - strict: bool = True, # type: ignore - lm_name: Optional[str] = None, # type: ignore - processor: Optional[Processor] = None, # type: ignore + load_dir: Union[str, Path], + device: Union[str, torch.device], + strict: bool = True, + lm_name: Optional[str] = None, + processor: Optional[Processor] = None, ): """ Loads an AdaptiveModel from a directory. The directory must contain: @@ -277,12 +277,13 @@ class AdaptiveModel(nn.Module, BaseAdaptiveModel): * vocab.txt vocab file for language model, turning text to Wordpiece Tokens :param load_dir: Location where the AdaptiveModel is stored. - :param device: To which device we want to sent the model, either cpu or cuda. + :param device: To which device we want to sent the model, either torch.device("cpu") or torch.device("cuda"). :param lm_name: The name to assign to the loaded language model. :param strict: Whether to strictly enforce that the keys loaded from saved model match the ones in the PredictionHead (see torch.nn.module.load_state_dict()). :param processor: Processor to populate prediction head with information coming from tasks. 
""" + device = torch.device(device) # Language Model if lm_name: language_model = LanguageModel.load(load_dir, haystack_lm_name=lm_name) @@ -489,9 +490,9 @@ class AdaptiveModel(nn.Module, BaseAdaptiveModel): def convert_from_transformers( cls, model_name_or_path: Union[str, Path], - device: str, + device: torch.device, revision: Optional[str] = None, - task_type: Optional[str] = None, + task_type: str = "question_answering", processor: Optional[Processor] = None, use_auth_token: Optional[Union[bool, str]] = None, **kwargs, @@ -509,12 +510,9 @@ class AdaptiveModel(nn.Module, BaseAdaptiveModel): See https://huggingface.co/models for full list :param revision: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. - :param device: "cpu" or "cuda" - :param task_type: One of : - - 'question_answering' - More tasks coming soon ... + :param device: On which hardware the conversion should take place. Choose from torch.device("cpu") or torch.device("cuda") + :param task_type: 'question_answering'. More tasks coming soon ... :param processor: Processor to populate prediction head with information coming from tasks. - :type processor: Processor :return: AdaptiveModel """ return conv.Converter.convert_from_transformers( @@ -570,7 +568,7 @@ class AdaptiveModel(nn.Module, BaseAdaptiveModel): tokenizer_name_or_path=model_name, task_type=task_type, max_seq_len=256, doc_stride=128, use_fast=True ) processor.save(output_path) - model = AdaptiveModel.convert_from_transformers(model_name, device="cpu", task_type=task_type) + model = AdaptiveModel.convert_from_transformers(model_name, device=torch.device("cpu"), task_type=task_type) model.save(output_path) os.remove(output_path / "language_model.bin") # remove the actual PyTorch model(only configs are required) @@ -617,14 +615,14 @@ class ONNXAdaptiveModel(BaseAdaptiveModel): language_model_class: str, language: str, prediction_heads: List[PredictionHead], - device: str, + device: torch.device, ): """ :param onnx_session: ? # TODO :param language_model_class: Class of LanguageModel :param langauge: Language the model is trained for. :param prediction_heads: A list of models that take embeddings and return logits for a given task. - :param device: The device on which this model will operate. Either "cpu" or "cuda". + :param device: The device on which this model will operate. Either torch.device("cpu") or torch.device("cuda"). """ import onnxruntime @@ -642,13 +640,14 @@ class ONNXAdaptiveModel(BaseAdaptiveModel): self.device = device @classmethod - def load(cls, load_dir: Union[str, Path], device: str, **kwargs): # type: ignore + def load(cls, load_dir: Union[str, Path], device: Union[str, torch.device], **kwargs): # type: ignore """ Loads an ONNXAdaptiveModel from a directory. :param load_dir: Location where the ONNXAdaptiveModel is stored. - :param device: The device on which this model will operate. Either "cpu" or "cuda". + :param device: The device on which this model will operate. Either torch.device("cpu") or torch.device("cuda"). """ + device = torch.device(device) load_dir = Path(load_dir) import onnxruntime @@ -657,7 +656,11 @@ class ONNXAdaptiveModel(BaseAdaptiveModel): sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED # Use OpenMP optimizations. Only useful for CPU, has little impact for GPUs. 
        sess_options.intra_op_num_threads = multiprocessing.cpu_count()
-        onnx_session = onnxruntime.InferenceSession(str(load_dir / "model.onnx"), sess_options)
+
+        providers = kwargs.get(
+            "providers", ["CPUExecutionProvider"] if device.type == "cpu" else ["CUDAExecutionProvider"]
+        )
+        onnx_session = onnxruntime.InferenceSession(str(load_dir / "model.onnx"), sess_options, providers=providers)
 
         # Prediction heads
         _, ph_config_files = cls._get_prediction_head_files(load_dir, strict=False)
diff --git a/haystack/modeling/model/biadaptive_model.py b/haystack/modeling/model/biadaptive_model.py
index 35b528bbb..fb8a2f2bf 100644
--- a/haystack/modeling/model/biadaptive_model.py
+++ b/haystack/modeling/model/biadaptive_model.py
@@ -36,7 +36,7 @@ class BiAdaptiveModel(nn.Module):
         language_model2: LanguageModel,
         prediction_heads: List[PredictionHead],
         embeds_dropout_prob: float = 0.1,
-        device: str = "cuda",
+        device: torch.device = torch.device("cuda"),
         lm1_output_types: Union[str, List[str]] = ["per_sequence"],
         lm2_output_types: Union[str, List[str]] = ["per_sequence"],
         loss_aggregation_fn: Optional[Callable] = None,
@@ -57,7 +57,7 @@ class BiAdaptiveModel(nn.Module):
                                 "per_sequence", a single embedding will be extracted to represent the full
                                 input sequence. Can either be a single string, or a list of strings,
                                 one for each prediction head.
-        :param device: The device on which this model will operate. Either "cpu" or "cuda".
+        :param device: The device on which this model will operate. Either torch.device("cpu") or torch.device("cuda").
         :param loss_aggregation_fn: Function to aggregate the loss of multiple prediction heads.
                                     Input: loss_per_head (list of tensors), global_step (int), batch (dict)
                                     Output: aggregated loss (tensor)
@@ -108,7 +108,7 @@ class BiAdaptiveModel(nn.Module):
     def load(
         cls,
         load_dir: Path,
-        device: str,
+        device: torch.device,
         strict: bool = False,
         lm1_name: str = "lm1",
         lm2_name: str = "lm2",
@@ -130,7 +130,7 @@ class BiAdaptiveModel(nn.Module):
         * special_tokens_map.json
 
         :param load_dir: Location where adaptive model is stored.
-        :param device: To which device we want to sent the model, either cpu or cuda.
+        :param device: To which device we want to send the model, either torch.device("cpu") or torch.device("cuda").
         :param lm1_name: The name to assign to the first loaded language model (for encoding queries).
         :param lm2_name: The name to assign to the second loaded language model (for encoding context/passages).
         :param strict: Whether to strictly enforce that the keys loaded from saved model match the ones in
@@ -432,8 +432,8 @@ class BiAdaptiveModel(nn.Module):
         cls,
         model_name_or_path1: Union[str, Path],
         model_name_or_path2: Union[str, Path],
-        device: str,
-        task_type: str,
+        device: torch.device,
+        task_type: str = "text_similarity",
         processor: Optional[Processor] = None,
         similarity_function: str = "dot_product",
     ):
@@ -451,9 +451,8 @@ class BiAdaptiveModel(nn.Module):
         Exemplary public names:
         - facebook/dpr-ctx_encoder-single-nq-base
         - deepset/bert-large-uncased-whole-word-masking-squad2
-        :param device: "cpu" or "cuda"
-        :param task_type: 'text_similarity'
-                          More tasks coming soon ...
+        :param device: On which hardware the conversion is going to run. Either torch.device("cpu") or torch.device("cuda").
+        :param task_type: 'text_similarity'. More tasks coming soon ...
:param processor: populates prediction head with information coming from tasks :type processor: Processor :return: AdaptiveModel diff --git a/haystack/modeling/model/optimization.py b/haystack/modeling/model/optimization.py index d7ca41b8b..37ec1160c 100644 --- a/haystack/modeling/model/optimization.py +++ b/haystack/modeling/model/optimization.py @@ -1,5 +1,5 @@ # TODO analyse if this optimization is needed or whether we can use HF transformers code -from typing import Dict, Any +from typing import Dict, Any, Optional import inspect import logging @@ -73,7 +73,7 @@ def initialize_optimizer( model: AdaptiveModel, n_batches: int, n_epochs: int, - device, + device: torch.device, learning_rate: float, optimizer_opts: Dict[Any, Any] = None, schedule_opts: Dict[Any, Any] = None, @@ -90,7 +90,7 @@ def initialize_optimizer( :param model: model to optimize (e.g. trimming weights to fp16 / mixed precision) :param n_batches: number of batches for training :param n_epochs: number of epochs for training - :param device: + :param device: Which hardware will be used by the optimizer. Either torch.device("cpu") or torch.device("cuda"). :param learning_rate: Learning rate :param optimizer_opts: Dict to customize the optimizer. Choose any optimizer available from torch.optim, apex.optimizers or transformers.optimization by supplying the class name and the parameters for the constructor. @@ -295,14 +295,20 @@ def get_scheduler(optimizer, opts): return scheduler -def optimize_model(model, device, local_rank, optimizer=None, distributed=False, use_amp=None): +def optimize_model( + model: "AdaptiveModel", + device: torch.device, + local_rank: int, + optimizer=None, + distributed: Optional[bool] = False, + use_amp: Optional[str] = None, +): """ Wraps MultiGPU or distributed usage around a model No support for ONNX models :param model: model to optimize (e.g. trimming weights to fp16 / mixed precision) - :type model: AdaptiveModel - :param device: either gpu or cpu, get the device from initialize_device_settings() + :param device: either torch.device("cpu") or torch.device("cuda"). Get the device from `initialize_device_settings()` :param distributed: Whether training on distributed machines :param local_rank: rank of the machine in a distributed setting :param use_amp: Optimization level of nvidia's automatic mixed precision (AMP). The higher the level, the faster the model. diff --git a/haystack/modeling/model/triadaptive_model.py b/haystack/modeling/model/triadaptive_model.py index 789dbd0ed..0de639a02 100644 --- a/haystack/modeling/model/triadaptive_model.py +++ b/haystack/modeling/model/triadaptive_model.py @@ -43,7 +43,7 @@ class TriAdaptiveModel(nn.Module): language_model3: LanguageModel, prediction_heads: List[PredictionHead], embeds_dropout_prob: float = 0.1, - device: str = "cuda", + device: torch.device = torch.device("cuda"), lm1_output_types: Union[str, List[str]] = ["per_sequence"], lm2_output_types: Union[str, List[str]] = ["per_sequence"], lm3_output_types: Union[str, List[str]] = ["per_sequence"], @@ -71,7 +71,7 @@ class TriAdaptiveModel(nn.Module): "per_sequence", a single embedding will be extracted to represent the full input sequence. Can either be a single string, or a list of strings, one for each prediction head. - :param device: The device on which this model will operate. Either "cpu" or "cuda". + :param device: The device on which this model will operate. Either torch.device("cpu") or torch.device("cuda"). 
:param loss_aggregation_fn: Function to aggregate the loss of multiple prediction heads.
                                    Input: loss_per_head (list of tensors), global_step (int), batch (dict)
                                    Output: aggregated loss (tensor)
@@ -129,7 +129,7 @@ class TriAdaptiveModel(nn.Module):
     def load(
         cls,
         load_dir: Path,
-        device: str,
+        device: torch.device,
         strict: bool = False,
         lm1_name: str = "lm1",
         lm2_name: str = "lm2",
@@ -155,7 +155,7 @@ class TriAdaptiveModel(nn.Module):
         * special_tokens_map.json
 
         :param load_dir: Location where the TriAdaptiveModel is stored.
-        :param device: To which device we want to sent the model, either cpu or cuda.
+        :param device: To which device we want to send the model, either torch.device("cpu") or torch.device("cuda").
         :param lm1_name: The name to assign to the first loaded language model (for encoding queries).
         :param lm2_name: The name to assign to the second loaded language model (for encoding context/passages).
         :param lm3_name: The name to assign to the second loaded language model (for encoding tables).
diff --git a/haystack/modeling/training/base.py b/haystack/modeling/training/base.py
index e2a213ff4..6554fa96f 100644
--- a/haystack/modeling/training/base.py
+++ b/haystack/modeling/training/base.py
@@ -126,7 +126,7 @@ class Trainer:
         data_silo: DataSilo,
         epochs: int,
         n_gpu: int,
-        device,
+        device: torch.device,
         lr_schedule=None,
         evaluate_every: int = 100,
         eval_report: bool = True,
@@ -152,7 +152,7 @@ class Trainer:
         :param data_silo: A DataSilo object that will contain the train, dev and test datasets as PyTorch DataLoaders
         :param epochs: How many times the training procedure will loop through the train dataset
         :param n_gpu: The number of gpus available for training and evaluation.
-        :param device: The device on which the train, dev and test tensors should be hosted. Choose from "cpu" and "cuda".
+        :param device: The device on which the train, dev and test tensors should be hosted. Choose from torch.device("cpu") and torch.device("cuda").
         :param lr_schedule: An optional scheduler object that can regulate the learning rate of the optimizer
         :param evaluate_every: Perform dev set evaluation after this many steps of training.
         :param eval_report: If evaluate_every is not 0, specifies if an eval report should be generated when evaluating
@@ -660,7 +660,7 @@ class DistillationTrainer(Trainer):
         data_silo: DistillationDataSilo,
         epochs: int,
         n_gpu: int,
-        device: str,
+        device: torch.device,
         lr_schedule: Optional["_LRScheduler"] = None,
         evaluate_every: int = 100,
         eval_report: bool = True,
@@ -691,7 +691,7 @@ class DistillationTrainer(Trainer):
         :param data_silo: A DataSilo object that will contain the train, dev and test datasets as PyTorch DataLoaders
         :param epochs: How many times the training procedure will loop through the train dataset
         :param n_gpu: The number of gpus available for training and evaluation.
-        :param device: The device on which the train, dev and test tensors should be hosted. Choose from "cpu" and "cuda".
+        :param device: The device on which the train, dev and test tensors should be hosted. Choose from torch.device("cpu") and torch.device("cuda").
         :param lr_schedule: An optional scheduler object that can regulate the learning rate of the optimizer
         :param evaluate_every: Perform dev set evaluation after this many steps of training.
:param eval_report: If evaluate_every is not 0, specifies if an eval report should be generated when evaluating @@ -833,7 +833,7 @@ class TinyBERTDistillationTrainer(Trainer): :param data_silo: A DataSilo object that will contain the train, dev and test datasets as PyTorch DataLoaders :param epochs: How many times the training procedure will loop through the train dataset :param n_gpu: The number of gpus available for training and evaluation. - :param device: The device on which the train, dev and test tensors should be hosted. Choose from "cpu" and "cuda". + :param device: The device on which the train, dev and test tensors should be hosted. Choose from torch.device("cpu") and torch.device("cuda"). :param lr_schedule: An optional scheduler object that can regulate the learning rate of the optimizer :param evaluate_every: Perform dev set evaluation after this many steps of training. :param eval_report: If evaluate_every is not 0, specifies if an eval report should be generated when evaluating diff --git a/haystack/nodes/ranker/sentence_transformers.py b/haystack/nodes/ranker/sentence_transformers.py index 67defe5fe..e471f4dbb 100644 --- a/haystack/nodes/ranker/sentence_transformers.py +++ b/haystack/nodes/ranker/sentence_transformers.py @@ -41,7 +41,7 @@ class SentenceTransformersRanker(BaseRanker): model_version: Optional[str] = None, top_k: int = 10, use_gpu: bool = True, - devices: Optional[List[Union[int, str, torch.device]]] = None, + devices: Optional[List[Union[str, torch.device]]] = None, ): """ :param model_name_or_path: Directory of a saved model or the name of a public model e.g. @@ -50,16 +50,20 @@ class SentenceTransformersRanker(BaseRanker): :param model_version: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. :param top_k: The maximum number of documents to return :param use_gpu: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available. - :param devices: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. ["cuda:0"]). + :param devices: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones + The strings will be converted into pytorch devices, so use the string notation described here: + https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device + (e.g. ["cuda:0"]). """ super().__init__() self.top_k = top_k if devices is not None: - self.devices = devices + self.devices = [torch.device(device) for device in devices] else: self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=True) + self.transformer_model = AutoModelForSequenceClassification.from_pretrained( pretrained_model_name_or_path=model_name_or_path, revision=model_version ) diff --git a/haystack/nodes/reader/farm.py b/haystack/nodes/reader/farm.py index ccecc772f..73a6bfccf 100644 --- a/haystack/nodes/reader/farm.py +++ b/haystack/nodes/reader/farm.py @@ -777,7 +777,9 @@ class FARMReader(BaseReader): return result - def eval_on_file(self, data_dir: str, test_filename: str, device: Optional[str] = None): + def eval_on_file( + self, data_dir: Union[Path, str], test_filename: str, device: Optional[Union[str, torch.device]] = None + ): """ Performs evaluation on a SQuAD-formatted file. 
Returns a dict containing the following metrics: @@ -786,14 +788,16 @@ class FARMReader(BaseReader): - "top_n_accuracy": Proportion of predicted answers that overlap with correct answer :param data_dir: The directory in which the test set can be found - :type data_dir: Path or str :param test_filename: The name of the file containing the test data in SQuAD format. - :type test_filename: str - :param device: The device on which the tensors should be processed. Choose from "cpu" and "cuda" or use the Reader's device by default. - :type device: str + :param device: The device on which the tensors should be processed. + Choose from torch.device("cpu") and torch.device("cuda") (or simply "cpu" or "cuda") + or use the Reader's device by default. """ if device is None: device = self.devices[0] + else: + device = torch.device(device) + eval_processor = SquadProcessor( tokenizer=self.inferencer.processor.tokenizer, max_seq_len=self.inferencer.processor.max_seq_len, @@ -822,7 +826,7 @@ class FARMReader(BaseReader): def eval( self, document_store: BaseDocumentStore, - device: Optional[str] = None, + device: Optional[Union[str, torch.device]] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold-label", @@ -836,7 +840,9 @@ class FARMReader(BaseReader): - "top_n_accuracy": Proportion of predicted answers that overlap with correct answer :param document_store: DocumentStore containing the evaluation documents - :param device: The device on which the tensors should be processed. Choose from "cpu" and "cuda" or use the Reader's device by default. + :param device: The device on which the tensors should be processed. + Choose from torch.device("cpu") and torch.device("cuda") (or simply "cpu" or "cuda") + or use the Reader's device by default. :param label_index: Index/Table name where labeled questions are stored :param doc_index: Index/Table name where documents that are used for evaluation are stored :param label_origin: Field name where the gold labels are stored @@ -844,6 +850,9 @@ class FARMReader(BaseReader): """ if device is None: device = self.devices[0] + else: + device = torch.device(device) + if self.top_k_per_candidate != 4: logger.info( f"Performing Evaluation using top_k_per_candidate = {self.top_k_per_candidate} \n" @@ -1012,7 +1021,7 @@ class FARMReader(BaseReader): def calibrate_confidence_scores( self, document_store: BaseDocumentStore, - device: Optional[str] = None, + device: Optional[Union[str, torch.device]] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold_label", @@ -1021,7 +1030,9 @@ class FARMReader(BaseReader): Calibrates confidence scores on evaluation documents in the DocumentStore. :param document_store: DocumentStore containing the evaluation documents - :param device: The device on which the tensors should be processed. Choose from "cpu" and "cuda" or use the Reader's device by default. + :param device: The device on which the tensors should be processed. + Choose from torch.device("cpu") and torch.device("cuda") (or simply "cpu" or "cuda") + or use the Reader's device by default. 
:param label_index: Index/Table name where labeled questions are stored :param doc_index: Index/Table name where documents that are used for evaluation are stored :param label_origin: Field name where the gold labels are stored diff --git a/haystack/nodes/retriever/dense.py b/haystack/nodes/retriever/dense.py index 5d56245fc..6bd11aba1 100644 --- a/haystack/nodes/retriever/dense.py +++ b/haystack/nodes/retriever/dense.py @@ -54,7 +54,7 @@ class DensePassageRetriever(BaseRetriever): similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, - devices: Optional[List[Union[int, str, torch.device]]] = None, + devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, ): """ @@ -102,8 +102,11 @@ class DensePassageRetriever(BaseRetriever): Increase if errors like "encoded data exceeds max_size ..." come up :param progress_bar: Whether to show a tqdm progress bar or not. Can be helpful to disable in production deployments to keep the logs clean. - :param devices: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. ["cuda:0"]). - As multi-GPU training is currently not implemented for DPR, training will only use the first device provided in this list. + :param devices: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones + These strings will be converted into pytorch devices, so use the string notation described here: + https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device + (e.g. ["cuda:0"]). Note: as multi-GPU training is currently not implemented for DPR, training + will only use the first device provided in this list. :param use_auth_token: API token used to download private models from Huggingface. If this parameter is set to `True`, the local token will be used, which must be previously created via `transformer-cli login`. Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained @@ -111,7 +114,7 @@ class DensePassageRetriever(BaseRetriever): super().__init__() if devices is not None: - self.devices = devices + self.devices = [torch.device(device) for device in devices] else: self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=True) @@ -193,7 +196,7 @@ class DensePassageRetriever(BaseRetriever): embeds_dropout_prob=0.1, lm1_output_types=["per_sequence"], lm2_output_types=["per_sequence"], - device=str(self.devices[0]), + device=self.devices[0], ) self.model.connect_heads_with_processor(self.processor.tasks, require_labels=False) @@ -548,7 +551,7 @@ class TableTextRetriever(BaseRetriever): similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, - devices: Optional[List[Union[int, str, torch.device]]] = None, + devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, ): """ @@ -582,8 +585,11 @@ class TableTextRetriever(BaseRetriever): Increase if errors like "encoded data exceeds max_size ..." come up :param progress_bar: Whether to show a tqdm progress bar or not. Can be helpful to disable in production deployments to keep the logs clean. - :param devices: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. ["cuda:0"]). 
- As multi-GPU training is currently not implemented for DPR, training will only use the first device provided in this list. + :param devices: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones + These strings will be converted into pytorch devices, so use the string notation described here: + https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device + (e.g. ["cuda:0"]). Note: as multi-GPU training is currently not implemented for TableTextRetriever, + training will only use the first device provided in this list. :param use_auth_token: API token used to download private models from Huggingface. If this parameter is set to `True`, the local token will be used, which must be previously created via `transformer-cli login`. Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained @@ -591,7 +597,7 @@ class TableTextRetriever(BaseRetriever): super().__init__() if devices is not None: - self.devices = devices + self.devices = [torch.device(device) for device in devices] else: self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=True) @@ -699,7 +705,7 @@ class TableTextRetriever(BaseRetriever): lm1_output_types=["per_sequence"], lm2_output_types=["per_sequence"], lm3_output_types=["per_sequence"], - device=str(self.devices[0]), + device=self.devices[0], ) self.model.connect_heads_with_processor(self.processor.tasks, require_labels=False) @@ -1075,7 +1081,7 @@ class EmbeddingRetriever(BaseRetriever): emb_extraction_layer: int = -1, top_k: int = 10, progress_bar: bool = True, - devices: Optional[List[Union[int, str, torch.device]]] = None, + devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, ): """ @@ -1101,8 +1107,11 @@ class EmbeddingRetriever(BaseRetriever): Default: -1 (very last layer). :param top_k: How many documents to return per query. :param progress_bar: If true displays progress bar during embedding. - :param devices: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. ["cuda:0"]). - As multi-GPU training is currently not implemented for DPR, training will only use the first device provided in this list. + :param devices: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones + These strings will be converted into pytorch devices, so use the string notation described here: + https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device + (e.g. ["cuda:0"]). Note: As multi-GPU training is currently not implemented for EmbeddingRetriever, + training will only use the first device provided in this list. :param use_auth_token: API token used to download private models from Huggingface. If this parameter is set to `True`, the local token will be used, which must be previously created via `transformer-cli login`. 
Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained @@ -1110,7 +1119,7 @@ class EmbeddingRetriever(BaseRetriever): super().__init__() if devices is not None: - self.devices = devices + self.devices = [torch.device(device) for device in devices] else: self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=True) diff --git a/haystack/utils/augment_squad.py b/haystack/utils/augment_squad.py index 35a79f50b..e9be4ac8b 100644 --- a/haystack/utils/augment_squad.py +++ b/haystack/utils/augment_squad.py @@ -23,6 +23,7 @@ Arguments: model: Huggingface MLM model identifier. """ +from typing import Tuple, List, Union import torch from torch.nn import functional as F @@ -37,13 +38,14 @@ import argparse import json import logging from tqdm import tqdm -from typing import Tuple, List logger = logging.getLogger(__name__) def load_glove( - glove_path: Path = Path("glove.txt"), vocab_size: int = 100_000, device: str = "cpu:0" + glove_path: Path = Path("glove.txt"), + vocab_size: int = 100_000, + device: Union[str, torch.device] = torch.device("cpu:0"), ) -> Tuple[dict, dict, torch.Tensor]: """Loads the GloVe vectors and returns a mapping from words to their GloVe vector indices and the other way around.""" @@ -112,8 +114,9 @@ def get_replacements( text: str, word_possibilities: int = 20, batch_size: int = 16, - device: str = "cpu:0", + device: torch.device = torch.device("cpu:0"), ) -> List[List[str]]: + """Returns a list of possible replacements for each word in the text.""" input_ids, words, word_subword_mapping = tokenize_and_extract_words(text, tokenizer) @@ -179,8 +182,9 @@ def augment( word_possibilities: int = 20, replace_probability: float = 0.4, batch_size: int = 16, - device: str = "cpu:0", + device: Union[str, torch.device] = torch.device("cpu:0"), ) -> List[str]: + device = torch.device(device) # returns a list of different augmented versions of the text replacements = get_replacements( glove_word_id_mapping=word_id_mapping, @@ -211,16 +215,17 @@ def augment( def augment_squad( squad_path: Path, output_path: Path, + glove_path: Path = Path("glove.txt"), model: str = "bert-base-uncased", tokenizer: str = "bert-base-uncased", - glove_path: Path = Path("glove.txt"), multiplication_factor: int = 20, word_possibilities: int = 20, replace_probability: float = 0.4, - device: str = "cpu:0", + device: Union[str, torch.device] = "cpu:0", batch_size: int = 16, ): """Loads a squad dataset, augments the contexts, and saves the result in SQuAD format.""" + device = torch.device(device) # loading model and tokenizer transformers_model = AutoModelForMaskedLM.from_pretrained(model) transformers_model.to(device)
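
Supplementary note (not part of the patch above): a minimal sketch of the device-handling convention these changes converge on. Device arguments may be passed either as strings or as `torch.device` objects, and lists of devices are normalized with `torch.device(...)`, exactly as the patched retriever and ranker constructors now do. The commented Haystack call and the model name below are illustrative assumptions, not taken from the diff.

```python
import torch

# Devices can be supplied as strings or as torch.device objects;
# strings are normalized the same way the patched constructors do it.
requested = ["cuda:0", torch.device("cpu")]
devices = [torch.device(d) for d in requested]

print(devices)          # [device(type='cuda', index=0), device(type='cpu')]
print(devices[1].type)  # 'cpu' -- downstream code can branch on .type,
                        # as the ONNX provider selection above does

# Hypothetical usage against the updated signatures (requires haystack;
# the model name is only an example):
# ranker = SentenceTransformersRanker(
#     model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2",
#     devices=["cuda:0"],  # or [torch.device("cuda:0")]
# )
```

Normalizing to `torch.device` at the boundary is what lets the rest of the code rely on attributes such as `.type` and `.index` instead of parsing strings.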