feat: take the list of models to cache instead of hardcoding one (#3060)

* take the list of models to cache as an input

* let nltk find the cache dir on its own
This commit was authored by:
Massimiliano Pippi, 2022-08-18 11:55:29 +02:00, committed by GitHub
parent 1027ab3624
commit af24ffae55
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -1,21 +1,25 @@
import logging
def cache_models(models=None):
    """
    Small function that caches models and other data.
    Used only in the Dockerfile to include these caches in the images.

    :param models: list of Hugging Face model identifiers to download and
        cache. Defaults to ``["deepset/roberta-base-squad2"]`` so existing
        callers of the zero-argument form keep their previous behavior.
    """
    # Backward compat after adding the `models` param: the old version
    # always cached exactly this one model.
    if models is None:
        models = ["deepset/roberta-base-squad2"]

    # Download the punkt tokenizer data. No explicit download_dir: let
    # nltk resolve its cache directory on its own.
    logging.info("Caching punkt data")
    import nltk

    nltk.download("punkt")

    # Cache each requested model (tokenizer + weights) so the Docker
    # image ships with them pre-downloaded.
    import transformers

    for model_to_cache in models:
        logging.info(f"Caching {model_to_cache}")
        transformers.AutoTokenizer.from_pretrained(model_to_cache)
        transformers.AutoModel.from_pretrained(model_to_cache)