feat: utility function to explicitly invoke JSON schema generation (#3798)

* explicitly cache the JSON schema

* fix import path

* move to final
This commit is contained in:
Massimiliano Pippi 2023-01-02 17:06:24 +01:00 committed by GitHub
parent bebd6b26ec
commit 19c7725319
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 16 additions and 2 deletions

View File

@ -34,6 +34,7 @@ RUN pip install --upgrade pip && \
pip install --no-cache-dir .${haystack_extras} && \
pip install --no-cache-dir ./rest_api
FROM $base_immage AS final
COPY --from=build-image /opt/venv /opt/venv
@ -43,7 +44,7 @@ RUN apt-get update && apt-get install -y libfontconfig && rm -rf /var/lib/apt/li
ENV PATH="/opt/venv/bin:$PATH"
# Importing Haystack will generate and persist the JSON schema; we do this here for two reasons:
# The JSON schema is lazily generated at first usage, but we do it explicitly here for two reasons:
# - the schema will be already there when the container runs, saving the generation overhead when a container starts
# - derived images don't need to write the schema and can run with lower user privileges
RUN python3 -c "import haystack"
RUN python3 -c "from haystack.utils.docker import cache_schema; cache_schema()"

View File

@ -1,5 +1,6 @@
import logging
from typing import List, Union, Optional
from haystack.nodes._json_schema import load_schema
def cache_models(models: Optional[List[str]] = None, use_auth_token: Optional[Union[str, bool]] = None):
@ -31,3 +32,15 @@ def cache_models(models: Optional[List[str]] = None, use_auth_token: Optional[Un
logging.info("Caching %s", model_to_cache)
transformers.AutoTokenizer.from_pretrained(model_to_cache, use_auth_token=use_auth_token)
transformers.AutoModel.from_pretrained(model_to_cache, use_auth_token=use_auth_token)
def cache_schema():
    """
    Build the Haystack JSON schema and persist it ahead of time.

    Haystack normally creates the schema lazily, the first time it is needed. Inside a
    Docker container that may not work when the user running Haystack lacks write
    permissions on the Python installation. Invoking this function while the Docker
    image is being built generates the schema once and for all.
    """
    # The return value is deliberately discarded: load_schema() is called here only
    # because generating the schema is a side effect of loading it.
    load_schema()