mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-02 02:39:51 +00:00
48 lines
1.6 KiB
Docker
48 lines
1.6 KiB
Docker
# Images for the two stages. Neither ARG has a default, so both must be
# supplied at build time (e.g. with --build-arg).
ARG build_image
ARG base_image

# --- Stage 1: build Haystack into a virtualenv ------------------------------
FROM ${build_image} AS build-image

# Declared as ARG (not ENV) so it only affects this build stage.
ARG DEBIAN_FRONTEND=noninteractive
# Value handed to `git clone --branch` further down in this stage.
ARG haystack_version
# Appended directly to `.` in the `pip install` further down in this stage
# (presumably an extras specifier; may be empty).
ARG haystack_extras
|
|
|
|
# Packages needed while building in this stage: git for the clone below, plus
# a compiler toolchain and the libxml2/libxslt development packages
# (presumably for Python packages that compile native extensions).
# The apt package lists are removed in the same layer that created them so
# they are not stored in the layer.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential \
        git \
        libxml2-dev \
        libxslt1-dev && \
    rm -rf /var/lib/apt/lists/*
|
|
|
|
# Shallow clone the Haystack repo; the packages are installed from these local
# sources. The build argument is quoted so the shell passes it to git as a
# single word, without word splitting or glob expansion.
RUN git clone --depth=1 --branch="${haystack_version}" https://github.com/deepset-ai/haystack.git /opt/haystack
WORKDIR /opt/haystack
|
|
|
|
# Everything gets installed into a virtualenv under /opt/venv, so the next
# build stage can pick it up with a single COPY.
RUN python3 -m venv --system-site-packages /opt/venv
# Put the virtualenv first on PATH so that later `pip`/`python3` calls in this
# stage resolve to /opt/venv/bin.
ENV PATH="/opt/venv/bin:${PATH}"
|
|
|
|
# Install Haystack (plus any requested extras) and the REST API package from
# the cloned sources into the virtualenv.
# - `.${haystack_extras}` is quoted: an extras spec such as `.[name]` is a
#   glob pattern to the shell and must reach pip verbatim.
# - --no-cache-dir is passed on every pip call so no download cache is
#   written into the layer.
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir ".${haystack_extras}" && \
    pip install --no-cache-dir ./rest_api
|
|
|
|
|
|
# --- Stage 2: runtime image --------------------------------------------------
FROM ${base_image} AS final

# Bring over the virtualenv populated in the build stage.
COPY --from=build-image /opt/venv /opt/venv
# Take /opt/pdftotext from the deepset/xpdf image.
# NOTE(review): the image is referenced by the mutable `latest` tag; consider
# pinning a specific tag or digest for reproducible builds.
COPY --from=deepset/xpdf:latest /opt/pdftotext /usr/local/bin
|
|
|
|
# pdftotext requires the fontconfig runtime.
# --no-install-recommends keeps optional recommended packages out of the
# final image; the apt package lists are removed in the same layer.
RUN apt-get update && \
    apt-get install -y --no-install-recommends libfontconfig && \
    rm -rf /var/lib/apt/lists/*
|
|
|
|
# Put the copied virtualenv first on PATH in the final image.
ENV PATH="/opt/venv/bin:${PATH}"
|
|
|
|
# The JSON schema would otherwise be generated lazily on first use. It is
# generated at build time instead, for two reasons:
# - the schema is already present when a container starts, so startup does
#   not pay the generation overhead
# - derived images do not have to write the schema and can therefore run with
#   lower user privileges
RUN python3 -c \
    "from haystack.utils.docker import cache_schema; cache_schema()"
|
|
|
|
# The Haystack Preprocessor uses the NLTK punkt model to split text into a
# list of sentences. The models are cached in the image for a seamless user
# experience.
RUN python3 -c \
    "from haystack.utils.docker import cache_nltk_model; cache_nltk_model()"
|