From 9c3d9b4885a8a9a07a9c4cb733fde7b879ea106b Mon Sep 17 00:00:00 2001
From: Sara Zan
Date: Tue, 11 Jan 2022 16:37:45 +0100
Subject: [PATCH] Add models to demo docker image (#1978)

* Add utility to cache models and nltk data & modify Dockerfiles to use it

* Fix punkt data not being cached
---
 Dockerfile               |  5 ++---
 Dockerfile-GPU           |  7 ++++---
 haystack/utils/docker.py | 18 ++++++++++++++++++
 3 files changed, 24 insertions(+), 6 deletions(-)
 create mode 100644 haystack/utils/docker.py

diff --git a/Dockerfile b/Dockerfile
index d4fab6923..f3e2a3fdd 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -18,11 +18,10 @@ COPY haystack /home/user/haystack
 
 # install as a package
 COPY setup.py requirements.txt README.md /home/user/
+RUN pip install --upgrade pip
 RUN pip install -r requirements.txt
 RUN pip install -e .
-
-# download punkt tokenizer to be included in image
-RUN python3 -c "import nltk;nltk.download('punkt', download_dir='/usr/nltk_data')"
+RUN python3 -c "from haystack.utils.docker import cache_models;cache_models()"
 
 # create folder for /file-upload API endpoint with write permissions, this might be adjusted depending on FILE_UPLOAD_PATH
 RUN mkdir -p /home/user/file-upload
diff --git a/Dockerfile-GPU b/Dockerfile-GPU
index 8a0edbf4a..b7a582f9e 100644
--- a/Dockerfile-GPU
+++ b/Dockerfile-GPU
@@ -37,15 +37,13 @@ RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1
 
 # Copy package setup files
 COPY setup.py requirements.txt README.md /home/user/
+RUN pip install --upgrade pip
 RUN echo "Install required packages" && \
     # Install PyTorch for CUDA 11
     pip3 install torch==1.10.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html && \
     # Install from requirements.txt
     pip3 install -r requirements.txt
 
-# download punkt tokenizer to be included in image
-RUN python3 -c "import nltk;nltk.download('punkt', download_dir='/usr/nltk_data')"
-
 # copy saved models
 COPY README.md models* /home/user/models/
 
@@ -58,6 +56,9 @@ COPY haystack /home/user/haystack
 # Install package
 RUN pip3 install -e .
 
+# Cache Roberta and NLTK data
+RUN python3 -c "from haystack.utils.docker import cache_models;cache_models()"
+
 # optional : copy sqlite db if needed for testing
 #COPY qa.db /home/user/
 
diff --git a/haystack/utils/docker.py b/haystack/utils/docker.py
new file mode 100644
index 000000000..faa075638
--- /dev/null
+++ b/haystack/utils/docker.py
@@ -0,0 +1,18 @@
+import logging
+
+def cache_models():
+    """
+    Small function that caches models and other data.
+    Used only in the Dockerfile to include these caches in the images.
+    """
+    # download punkt tokenizer
+    logging.info("Caching punkt data")
+    import nltk
+    nltk.download('punkt', download_dir='/root/nltk_data')
+
+    # Cache roberta-base-squad2 model
+    logging.info("Caching deepset/roberta-base-squad2")
+    import transformers
+    model_to_cache='deepset/roberta-base-squad2'
+    transformers.AutoTokenizer.from_pretrained(model_to_cache)
+    transformers.AutoModel.from_pretrained(model_to_cache)
\ No newline at end of file
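
Below is a minimal smoke-test sketch (not part of the patch) for checking that the caches baked in by cache_models() are actually picked up inside a built image. The script name check_cache.py and the offline-mode check are assumptions for illustration; the /root/nltk_data path and the deepset/roberta-base-squad2 model name are taken from the code above.

# check_cache.py -- hypothetical smoke test, e.g. run as:
#   docker run <image> python3 check_cache.py
import os

# Refuse network access so a missing cache fails loudly instead of re-downloading.
os.environ["TRANSFORMERS_OFFLINE"] = "1"

import nltk

# cache_models() stores punkt under /root/nltk_data; make sure NLTK searches there.
nltk.data.path.append("/root/nltk_data")
nltk.data.find("tokenizers/punkt")  # raises LookupError if punkt was not cached

from transformers import AutoModel, AutoTokenizer

model_name = "deepset/roberta-base-squad2"  # same model cached by cache_models()
AutoTokenizer.from_pretrained(model_name)
AutoModel.from_pretrained(model_name)

print(f"punkt and {model_name} loaded from the image cache")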