ci: Move xpdf build into separate container (#4199)

* Create Dockerfile and hcl config to build Xpdf

* Create workflow to build Xpdf Docker image

* Update Dockerfile.base to not build Xpdf

* Fix CWD removal and arg casing

* Fix ARG setting
This commit is contained in:
Silvano Cerza 2023-02-20 14:58:11 +01:00 committed by GitHub
parent aaa1522c45
commit 30cdb81f19
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 75 additions and 15 deletions

39
.github/workflows/xpdf_release.yml vendored Normal file
View File

@ -0,0 +1,39 @@
# Builds the Xpdf image described by docker/docker-bake-xpdf.hcl and pushes it
# to Docker Hub. Runs only when the bake file or the Xpdf Dockerfile changes on
# the main branch.
name: Xpdf Docker image release

on:
  push:
    branches:
      - main
    paths:
      - docker/docker-bake-xpdf.hcl
      - docker/Dockerfile.xpdf

jobs:
  publish-xpdf-image:
    runs-on: ubuntu-latest
    env:
      # NOTE(review): no step in this workflow reads this variable; the image
      # name pushed is the one hard-coded in the bake file's `tags`.
      DOCKER_REPO_NAME: deepset/xpdf

    steps:
      - name: Checkout
        uses: actions/checkout@v3

      # QEMU lets the amd64 runner build the non-native platforms requested by
      # the bake target.
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v2

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2

      # v2 to match the other docker/* actions used here; v1 runs on the
      # deprecated Node 12 runtime.
      - name: Login to DockerHub
        uses: docker/login-action@v2
        with:
          username: ${{ secrets.DOCKER_HUB_USER }}
          password: ${{ secrets.DOCKER_HUB_TOKEN }}

      # `files` is resolved relative to `workdir`, so the bake file and the
      # Dockerfile it references are both looked up inside docker/.
      - name: Build and publish Xpdf image
        uses: docker/bake-action@v2
        with:
          files: "docker-bake-xpdf.hcl"
          workdir: docker
          targets: xpdf
          push: true

View File

@ -7,20 +7,6 @@ ARG DEBIAN_FRONTEND=noninteractive
ARG haystack_version
ARG haystack_extras
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential gcc git curl cmake \
tesseract-ocr libtesseract-dev poppler-utils
# Install PDF converter
RUN curl -O https://dl.xpdfreader.com/xpdf-4.04.tar.gz && \
tar -xvf xpdf-4.04.tar.gz && \
cd xpdf-4.04 && \
cmake . && \
make && \
cp xpdf/pdftotext /opt && \
cd .. && \
rm -rf xpdf-4.04
# Shallow clone Haystack repo, we'll install from the local sources
RUN git clone --depth=1 --branch=${haystack_version} https://github.com/deepset-ai/haystack.git /opt/haystack
WORKDIR /opt/haystack
@ -37,7 +23,8 @@ RUN pip install --upgrade pip && \
FROM $base_image AS final
COPY --from=build-image /opt/venv /opt/venv
COPY --from=build-image /opt/pdftotext /usr/local/bin
COPY --from=deepset/xpdf:latest /opt/pdftotext /usr/local/bin
# pdftotext requires fontconfig runtime
RUN apt-get update && apt-get install -y libfontconfig && rm -rf /var/lib/apt/lists/*

22
docker/Dockerfile.xpdf Normal file
View File

@ -0,0 +1,22 @@
# Builds the Xpdf `pdftotext` binary from source and leaves it at /opt/pdftotext
# so other images can pull it in with `COPY --from=...`.

# Pinned instead of `latest` so rebuilds do not silently change the toolchain
# or the glibc the binary is linked against.
FROM ubuntu:22.04

# Build-time only: keeps apt from prompting (e.g. for tzdata) during the build.
ARG DEBIAN_FRONTEND=noninteractive

# ca-certificates is listed explicitly because --no-install-recommends skips
# it, and curl needs it to verify the HTTPS download below.
# NOTE(review): git, libtesseract-dev, poppler-utils and tesseract-ocr do not
# look required to compile Xpdf; they are kept to preserve the original
# package set - confirm before trimming.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        gcc \
        git \
        libtesseract-dev \
        poppler-utils \
        tesseract-ocr && \
    rm -rf /var/lib/apt/lists/*

# Xpdf release to build. The bake file passes this explicitly; the default only
# keeps a plain `docker build` from requesting a malformed URL.
ARG xpdf_version=4.04

# Download, build and clean up in a single layer so neither the tarball nor the
# source tree ends up in the image. `-f` makes curl fail on an HTTP error
# instead of saving the error page for tar to choke on.
RUN curl -fLO https://dl.xpdfreader.com/xpdf-${xpdf_version}.tar.gz && \
    tar -xvf xpdf-${xpdf_version}.tar.gz && \
    cd xpdf-${xpdf_version} && \
    cmake . && \
    make && \
    cp xpdf/pdftotext /opt && \
    cd .. && \
    rm -rf xpdf-${xpdf_version} xpdf-${xpdf_version}.tar.gz

View File

@ -0,0 +1,12 @@
# Xpdf release to build. Can be overridden from the environment, e.g.
#   XPDF_VERSION=4.04 docker buildx bake -f docker-bake-xpdf.hcl xpdf
variable "XPDF_VERSION" {
  default = "4.04"
}

# Multi-platform image that carries the compiled pdftotext binary.
target "xpdf" {
  dockerfile = "Dockerfile.xpdf"
  platforms  = ["linux/amd64", "linux/arm64"]
  tags       = ["deepset/xpdf:latest"]

  # Forwarded to the `xpdf_version` ARG declared in Dockerfile.xpdf.
  args = {
    xpdf_version = "${XPDF_VERSION}"
  }
}