mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-17 01:58:23 +00:00
build: cache nltk models into the docker image (#4118)
* separated nltk cache * separated nltk caching * fixed pylint lazy log error * using model name as default value
This commit is contained in:
parent
ec72dd73fc
commit
d27f372b67
@ -47,3 +47,7 @@ ENV PATH="/opt/venv/bin:$PATH"
|
|||||||
# - the schema will be already there when the container runs, saving the generation overhead when a container starts
|
# - the schema will be already there when the container runs, saving the generation overhead when a container starts
|
||||||
# - derived images don't need to write the schema and can run with lower user privileges
|
# - derived images don't need to write the schema and can run with lower user privileges
|
||||||
RUN python3 -c "from haystack.utils.docker import cache_schema; cache_schema()"
|
RUN python3 -c "from haystack.utils.docker import cache_schema; cache_schema()"
|
||||||
|
|
||||||
|
# Haystack Preprocessor uses NLTK punkt model to divide text into a list of sentences.
|
||||||
|
# We cache these models for a seamless user experience.
|
||||||
|
RUN python3 -c "from haystack.utils.docker import cache_nltk_model; cache_nltk_model()"
|
||||||
|
|||||||
@ -3,6 +3,13 @@ from typing import List, Union, Optional
|
|||||||
from haystack.nodes._json_schema import load_schema
|
from haystack.nodes._json_schema import load_schema
|
||||||
|
|
||||||
|
|
||||||
|
def cache_nltk_model(model: str = "punkt"):
    """Download an NLTK data package into the local NLTK cache.

    :param model: Name of the NLTK data package to fetch (defaults to
        ``"punkt"``, the sentence tokenizer model).

    NOTE(review): intended to be invoked at Docker image build time so the
    model is already present when a container starts — confirm against the
    Dockerfile that calls it.
    """
    # Lazy import: nltk is only needed when caching is actually requested.
    import nltk

    logging.info("Caching %s model...", model)
    nltk.download(model)
|
||||||
|
|
||||||
|
|
||||||
def cache_models(models: Optional[List[str]] = None, use_auth_token: Optional[Union[str, bool]] = None):
|
def cache_models(models: Optional[List[str]] = None, use_auth_token: Optional[Union[str, bool]] = None):
|
||||||
"""
|
"""
|
||||||
Small function that caches models and other data.
|
Small function that caches models and other data.
|
||||||
@ -19,12 +26,6 @@ def cache_models(models: Optional[List[str]] = None, use_auth_token: Optional[Un
|
|||||||
if models is None:
|
if models is None:
|
||||||
models = ["deepset/roberta-base-squad2"]
|
models = ["deepset/roberta-base-squad2"]
|
||||||
|
|
||||||
# download punkt tokenizer
|
|
||||||
logging.info("Caching punkt data")
|
|
||||||
import nltk
|
|
||||||
|
|
||||||
nltk.download("punkt")
|
|
||||||
|
|
||||||
# Cache models
|
# Cache models
|
||||||
import transformers
|
import transformers
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user