refactoring: reimplement Docker strategy (#3162)

* setup base images

* add cpu flavor

* use the same Dockerfile for cpu and gpu

* better naming, add docs

* add docker workflow

* add missing image input

* change cwd for bake

* also push api images

* try conditional tagging for releases

* revert testing code

* update docker readme

* document variable override

* use Python 3.10

* allow empty HAYSTACK_EXTRAS

* Apply suggestions from code review

Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai>

* remove repo description step, can't make it work so far

* add docs to the last step as it's tricky

* manage tags for the newest images

* tests are passing, checking in the last bit

Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai>
This commit is contained in:
Massimiliano Pippi 2022-09-12 16:33:56 +02:00 committed by GitHub
parent 21aedc644f
commit 64b0c43885
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 287 additions and 0 deletions

93
.github/workflows/docker_release.yml vendored Normal file
View File

@ -0,0 +1,93 @@
# Builds the Haystack Docker images with `docker buildx bake` (definitions live
# in docker/docker-bake.hcl) and pushes them to Docker Hub.
name: Release Docker images

on:
  workflow_dispatch:
  push:
    branches:
      - main
    tags:
      - v*

env:
  DOCKER_REPO_NAME: deepset/haystack

jobs:
  build-and-push:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2

      - name: Login to DockerHub
        uses: docker/login-action@v1
        with:
          username: ${{ secrets.DOCKER_HUB_USER }}
          password: ${{ secrets.DOCKER_HUB_TOKEN }}

      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v4
        with:
          # Action inputs are not processed by a shell, so `$DOCKER_REPO_NAME`
          # would be passed literally; read the value from the `env` context.
          images: ${{ env.DOCKER_REPO_NAME }}

      - name: Build base images
        uses: docker/bake-action@v2
        env:
          IMAGE_TAG_SUFFIX: ${{ steps.meta.outputs.version }}
          # Build the sources of the ref that triggered the workflow. Without
          # this the bake file falls back to its default (`main`), so an image
          # tagged with a release version would contain whatever is on main.
          HAYSTACK_VERSION: ${{ github.ref_name }}
        with:
          workdir: docker
          targets: base
          push: true

      - name: Build api images
        uses: docker/bake-action@v2
        env:
          IMAGE_TAG_SUFFIX: ${{ steps.meta.outputs.version }}
          BASE_IMAGE_TAG_SUFFIX: ${{ steps.meta.outputs.version }}
        with:
          workdir: docker
          targets: api
          push: true

      - name: Get latest version of Haystack
        id: latest-version
        uses: pozetroninc/github-action-get-latest-release@master
        if: startsWith(github.ref, 'refs/tags/')
        with:
          repository: ${{ github.repository }}
          excludes: prerelease, draft

      - name: Compare current version with latest
        uses: madhead/semver-utils@latest
        id: version
        if: startsWith(github.ref, 'refs/tags/')
        with:
          # Version being built
          version: ${{ github.ref_name }}
          # Compare to latest
          compare-to: ${{ steps.latest-version.outputs.release }}

      - name: Use latest
        if: steps.version.outputs.comparison-result == '>'
        # Pass the values through the environment and quote them: this step only
        # runs when the result is `>`, and pasting a bare `>` into the script
        # yields `echo >;`, a shell syntax error that fails the job.
        env:
          COMPARISON_RESULT: ${{ steps.version.outputs.comparison-result }}
          LATEST_RELEASE: ${{ steps.latest-version.outputs.release }}
        run: |
          echo "$COMPARISON_RESULT"
          echo "$LATEST_RELEASE"

      # This step should only run when we release a new minor, so
      # that we can tag the most recent image without the version number.
      # For example, if the previous step builds `deepset/haystack:cpu-1.8.0`,
      # this builds `deepset/haystack:cpu`
      - name: Build api images no version in tag
        uses: docker/bake-action@v2
        if: steps.version.outputs.comparison-result == '>'
        env:
          IMAGE_TAG_SUFFIX: ${{ steps.meta.outputs.version }}
          BASE_IMAGE_TAG_SUFFIX: ${{ steps.meta.outputs.version }}
        with:
          workdir: docker
          targets: api-latest
          push: true

13
docker/Dockerfile.api Normal file
View File

@ -0,0 +1,13 @@
# REST API image, layered on top of a Haystack base image.
# `base_image_tag` selects the `deepset/haystack` tag to build from; it has no
# default here, so it must be supplied as a build argument.
ARG base_image_tag
FROM deepset/haystack:${base_image_tag}

# Upload folder for the /file-upload API endpoint, writable by any user
RUN mkdir -p /opt/file-upload \
    && chmod 777 /opt/file-upload

# Tell rest_api which folder to use for uploads
ENV FILE_UPLOAD_PATH="/opt/file-upload"

EXPOSE 8000

# Start the rest_api application under gunicorn with a single uvicorn worker
CMD ["gunicorn", \
     "rest_api.application:app", \
     "-b", "0.0.0.0", \
     "-k", "uvicorn.workers.UvicornWorker", \
     "--workers", "1", \
     "--timeout", "180"]

38
docker/Dockerfile.base Normal file
View File

@ -0,0 +1,38 @@
# Two-stage build: Haystack is installed into a virtualenv in a build stage,
# then the virtualenv and the pdftotext binary are copied into the final stage.
#
# build_image: image used for the build stage.
# base_immage: image used for the final stage. The misspelling is the argument
#              name docker-bake.hcl passes, so it is kept as is.
ARG build_image
ARG base_immage

FROM $build_image AS build-image

# haystack_version: branch or tag of the Haystack repo to clone.
# haystack_extras:  pip extras appended to the local install, e.g. "[ocr]".
# torch_scatter:    URL handed to `pip install -f` to locate torch-scatter wheels.
ARG haystack_version
ARG haystack_extras
ARG torch_scatter

RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        curl \
        gcc \
        git \
        libtesseract-dev \
        poppler-utils \
        tesseract-ocr \
    && rm -rf /var/lib/apt/lists/*

# Install PDF converter.
# --fail makes curl exit non-zero on an HTTP error instead of saving the error
# page as the archive; the archive and the extracted tree are removed in the
# same layer that created them.
RUN curl --fail --location --remote-name https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz && \
    tar -xf xpdf-tools-linux-4.04.tar.gz && \
    cp xpdf-tools-linux-4.04/bin64/pdftotext /opt && \
    rm -rf xpdf-tools-linux-4.04 xpdf-tools-linux-4.04.tar.gz

# Shallow clone Haystack repo, we'll install from the local sources
RUN git clone --depth=1 --branch="${haystack_version}" https://github.com/deepset-ai/haystack.git /opt/haystack
WORKDIR /opt/haystack

# Use a virtualenv we can copy over the next build stage
RUN python -m venv --system-site-packages /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# The extras spec is quoted because `[...]` is a glob bracket expression to the
# shell and could otherwise be expanded against files in the working directory.
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir ".${haystack_extras}" && \
    pip install --no-cache-dir ./rest_api && \
    pip install --no-cache-dir torch-scatter -f "$torch_scatter"

FROM $base_immage AS final

# NOTE(review): only the virtualenv and pdftotext are carried over. The apt
# packages installed in the build stage (tesseract-ocr, poppler-utils, ...) are
# not copied, so they are available here only if the base image already ships
# them - confirm nothing needs them at runtime.
COPY --from=build-image /opt/venv /opt/venv
COPY --from=build-image /opt/pdftotext /usr/local/bin
ENV PATH="/opt/venv/bin:$PATH"

49
docker/README.md Normal file
View File

@ -0,0 +1,49 @@
# Haystack Docker image
Haystack is an end-to-end framework that enables you to build powerful and production-ready
pipelines for different search use cases. The Docker image comes with a web service
configured to serve Haystack's `rest_api` to ease pipelines' deployments in containerized
environments.
Start the Docker container binding the TCP port `8000` locally:
```sh
docker run -p 8000:8000 deepset/haystack
```
If you need the container to access other services available on the host:
```sh
docker run -p 8000:8000 --network="host" deepset/haystack
```
## Image variants
The Docker image comes in two variants:
- `haystack:cpu-<version>`: this image is smaller but doesn't support GPU
- `haystack:gpu-<version>`: this image comes with the CUDA runtime and is capable of running on GPUs
## Image development
Images are built with BuildKit and we use `bake` to orchestrate the process.
You can build a specific image by running:
```sh
docker buildx bake gpu
```
You can override any `variable` defined in the `docker-bake.hcl` file and build custom
images, for example if you want to use a branch from the Haystack repo:
```sh
HAYSTACK_VERSION=mybranch_or_tag BASE_IMAGE_TAG_SUFFIX=latest docker buildx bake gpu --no-cache
```
# License
View [license information](https://github.com/deepset-ai/haystack/blob/main/LICENSE) for
the software contained in this image.
As with all Docker images, these likely also contain other software which may be under
other licenses (such as Bash, etc from the base distribution, along with any direct or
indirect dependencies of the primary software being contained).
As for any pre-built image usage, it is the image user's responsibility to ensure that any
use of this image complies with any relevant licenses for all software contained within.

94
docker/docker-bake.hcl Normal file
View File

@ -0,0 +1,94 @@
# Branch or tag of the Haystack repo to build; forwarded to Dockerfile.base
# as the `haystack_version` build argument.
variable "HAYSTACK_VERSION" {
  default = "main"
}

# Not referenced by any target in this file.
variable "GITHUB_REF" {
  default = ""
}

# Repository part of every image tag produced by this file.
variable "IMAGE_NAME" {
  default = "deepset/haystack"
}

# Suffix appended to the tags of the images being built.
variable "IMAGE_TAG_SUFFIX" {
  default = "local"
}

# Tag suffix of the base image that the api images (cpu, gpu) build FROM.
variable "BASE_IMAGE_TAG_SUFFIX" {
  default = "local"
}

# pip extras to install; when left empty each base target uses its own default list.
variable "HAYSTACK_EXTRAS" {
  default = ""
}
# Base images: Haystack installed into a virtualenv, no web service command.
group "base" {
  targets = [
    "base",
    "base-gpu",
  ]
}

# Images serving the REST API, tagged with a version suffix.
group "api" {
  targets = [
    "cpu",
    "gpu",
  ]
}

# Same REST API images, tagged without a version suffix.
group "api-latest" {
  targets = [
    "cpu-latest",
    "gpu-latest",
  ]
}

# Every base and versioned api image (the *-latest targets are not included).
group "all" {
  targets = [
    "base",
    "base-gpu",
    "cpu",
    "gpu",
  ]
}
# Empty target; no other target in this file inherits from it.
# NOTE(review): presumably a placeholder for the bake definition generated by
# docker/metadata-action - confirm it is still needed.
target "docker-metadata-action" {}
# CPU base image: both stages of Dockerfile.base use python:3.10-slim.
target "base" {
  dockerfile = "Dockerfile.base"
  tags       = ["${IMAGE_NAME}:base-${IMAGE_TAG_SUFFIX}"]

  args = {
    build_image      = "python:3.10-slim"
    base_immage      = "python:3.10-slim"
    haystack_version = HAYSTACK_VERSION
    # Fall back to the default extras when HAYSTACK_EXTRAS is empty
    haystack_extras  = notequal("", HAYSTACK_EXTRAS) ? HAYSTACK_EXTRAS : "[docstores,crawler,preprocessing,ocr,onnx,beir]"
    torch_scatter    = "https://data.pyg.org/whl/torch-1.12.0+cpu.html"
  }
}
# GPU base image: both stages of Dockerfile.base use the PyTorch CUDA runtime image.
target "base-gpu" {
  dockerfile = "Dockerfile.base"
  tags       = ["${IMAGE_NAME}:base-gpu-${IMAGE_TAG_SUFFIX}"]

  args = {
    build_image      = "pytorch/pytorch:1.12.1-cuda11.3-cudnn8-runtime"
    base_immage      = "pytorch/pytorch:1.12.1-cuda11.3-cudnn8-runtime"
    haystack_version = HAYSTACK_VERSION
    # Fall back to the default GPU extras when HAYSTACK_EXTRAS is empty
    haystack_extras  = notequal("", HAYSTACK_EXTRAS) ? HAYSTACK_EXTRAS : "[docstores-gpu,crawler,preprocessing,ocr,onnx-gpu,beir]"
    torch_scatter    = "https://data.pyg.org/whl/torch-1.12.1%2Bcu113.html"
  }
}
# REST API image built from Dockerfile.api on top of the CPU base image.
target "cpu" {
  dockerfile = "Dockerfile.api"
  tags       = ["${IMAGE_NAME}:cpu-${IMAGE_TAG_SUFFIX}"]

  args = {
    base_image_tag = "base-${BASE_IMAGE_TAG_SUFFIX}"
  }
}
# Same build as `cpu`; only the tag differs (no version suffix).
target "cpu-latest" {
  inherits = ["cpu"]
  tags     = ["${IMAGE_NAME}:cpu"]
}
# REST API image built from Dockerfile.api on top of the GPU base image;
# built for linux/amd64 only.
target "gpu" {
  dockerfile = "Dockerfile.api"
  tags       = ["${IMAGE_NAME}:gpu-${IMAGE_TAG_SUFFIX}"]
  platforms  = ["linux/amd64"]

  args = {
    base_image_tag = "base-gpu-${BASE_IMAGE_TAG_SUFFIX}"
  }
}
# Same build as `gpu`; only the tag differs (no version suffix).
target "gpu-latest" {
  inherits = ["gpu"]
  tags     = ["${IMAGE_NAME}:gpu"]
}