From 9c3d9b4885a8a9a07a9c4cb733fde7b879ea106b Mon Sep 17 00:00:00 2001
From: Sara Zan
Date: Tue, 11 Jan 2022 16:37:45 +0100
Subject: [PATCH] Add models to demo docker image (#1978)

* Add utility to cache models and nltk data & modify Dockerfiles to use it

* Fix punkt data not being cached
---
 Dockerfile               |  5 ++---
 Dockerfile-GPU           |  7 ++++---
 haystack/utils/docker.py | 18 ++++++++++++++++++
 3 files changed, 24 insertions(+), 6 deletions(-)
 create mode 100644 haystack/utils/docker.py

diff --git a/Dockerfile b/Dockerfile
index d4fab6923..f3e2a3fdd 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -18,11 +18,10 @@ COPY haystack /home/user/haystack
 
 # install as a package
 COPY setup.py requirements.txt README.md /home/user/
+RUN pip install --upgrade pip
 RUN pip install -r requirements.txt
 RUN pip install -e .
-
-# download punkt tokenizer to be included in image
-RUN python3 -c "import nltk;nltk.download('punkt', download_dir='/usr/nltk_data')"
+RUN python3 -c "from haystack.utils.docker import cache_models;cache_models()"
 
 # create folder for /file-upload API endpoint with write permissions, this might be adjusted depending on FILE_UPLOAD_PATH
 RUN mkdir -p /home/user/file-upload
diff --git a/Dockerfile-GPU b/Dockerfile-GPU
index 8a0edbf4a..b7a582f9e 100644
--- a/Dockerfile-GPU
+++ b/Dockerfile-GPU
@@ -37,15 +37,13 @@ RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1
 
 # Copy package setup files
 COPY setup.py requirements.txt README.md /home/user/
+RUN pip install --upgrade pip
 RUN echo "Install required packages" && \
     # Install PyTorch for CUDA 11
     pip3 install torch==1.10.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html && \
     # Install from requirements.txt
     pip3 install -r requirements.txt
 
-# download punkt tokenizer to be included in image
-RUN python3 -c "import nltk;nltk.download('punkt', download_dir='/usr/nltk_data')"
-
 # copy saved models
 COPY README.md models* /home/user/models/
 
@@ -58,6 +56,9 @@ COPY haystack /home/user/haystack
 # Install package
 RUN pip3 install -e .
 
+# Cache Roberta and NLTK data
+RUN python3 -c "from haystack.utils.docker import cache_models;cache_models()"
+
 # optional : copy sqlite db if needed for testing
 #COPY qa.db /home/user/
 
diff --git a/haystack/utils/docker.py b/haystack/utils/docker.py
new file mode 100644
index 000000000..faa075638
--- /dev/null
+++ b/haystack/utils/docker.py
@@ -0,0 +1,18 @@
+import logging
+
+def cache_models():
+    """
+    Small function that caches models and other data.
+    Used only in the Dockerfile to include these caches in the images.
+    """
+    # download punkt tokenizer
+    logging.info("Caching punkt data")
+    import nltk
+    nltk.download('punkt', download_dir='/root/nltk_data')
+
+    # Cache roberta-base-squad2 model
+    logging.info("Caching deepset/roberta-base-squad2")
+    import transformers
+    model_to_cache='deepset/roberta-base-squad2'
+    transformers.AutoTokenizer.from_pretrained(model_to_cache)
+    transformers.AutoModel.from_pretrained(model_to_cache)
\ No newline at end of file
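
Below is a minimal smoke-test sketch (not part of the patch) for checking that the caches baked in by cache_models() are actually picked up inside a built image. The script name check_cache.py and the offline-mode check are assumptions for illustration; the /root/nltk_data path and the deepset/roberta-base-squad2 model name are taken from the code above.

# check_cache.py -- hypothetical smoke test, e.g. run as:
#   docker run <image> python3 check_cache.py
import os

# Refuse network access so a missing cache fails loudly instead of re-downloading.
os.environ["TRANSFORMERS_OFFLINE"] = "1"

import nltk

# cache_models() stores punkt under /root/nltk_data; make sure NLTK searches there.
nltk.data.path.append("/root/nltk_data")
nltk.data.find("tokenizers/punkt")  # raises LookupError if punkt was not cached

from transformers import AutoModel, AutoTokenizer

model_name = "deepset/roberta-base-squad2"  # same model cached by cache_models()
AutoTokenizer.from_pretrained(model_name)
AutoModel.from_pretrained(model_name)

print(f"punkt and {model_name} loaded from the image cache")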