diff --git a/docker/Dockerfile.base b/docker/Dockerfile.base index 6be9a57ba..f6cd183bc 100644 --- a/docker/Dockerfile.base +++ b/docker/Dockerfile.base @@ -34,6 +34,7 @@ RUN pip install --upgrade pip && \ pip install --no-cache-dir .${haystack_extras} && \ pip install --no-cache-dir ./rest_api + FROM $base_immage AS final COPY --from=build-image /opt/venv /opt/venv @@ -43,7 +44,7 @@ RUN apt-get update && apt-get install -y libfontconfig && rm -rf /var/lib/apt/li ENV PATH="/opt/venv/bin:$PATH" -# Importing Haystack will generate and persist the json schema, we do this here for two reasons: +# The JSON schema is lazily generated at first usage, but we do it explicitly here for two reasons: # - the schema will be already there when the container runs, saving the generation overhead when a container starts # - derived images don't need to write the schema and can run with lower user privileges -RUN python3 -c "import haystack" +RUN python3 -c "from haystack.utils.docker import cache_schema; cache_schema()" diff --git a/haystack/utils/docker.py b/haystack/utils/docker.py index 5cdc16f9c..a27056845 100644 --- a/haystack/utils/docker.py +++ b/haystack/utils/docker.py @@ -1,5 +1,6 @@ import logging from typing import List, Union, Optional +from haystack.nodes._json_schema import load_schema def cache_models(models: Optional[List[str]] = None, use_auth_token: Optional[Union[str, bool]] = None): @@ -31,3 +32,15 @@ def cache_models(models: Optional[List[str]] = None, use_auth_token: Optional[Un logging.info("Caching %s", model_to_cache) transformers.AutoTokenizer.from_pretrained(model_to_cache, use_auth_token=use_auth_token) transformers.AutoModel.from_pretrained(model_to_cache, use_auth_token=use_auth_token) + + +def cache_schema(): + """ + Generate and persist Haystack JSON schema. + + The schema is lazily generated at first usage, but this might not work in Docker containers + when the user running Haystack doesn't have write permissions on the Python installation. 
By + calling this function at Docker image build time, the schema is generated once and for all. + """ + # Calling load_schema() will generate the schema as a side effect + load_schema()