ci: publish amd and arm images (#404)

This commit is contained in:
ryannikolaidis 2023-03-29 00:02:39 -07:00 committed by GitHub
parent 09b52b4fc4
commit 65fec954ba
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 170 additions and 119 deletions

6
.dockerignore Normal file
View File

@ -0,0 +1,6 @@
.git
.vscode
__pycache__
*.pyc
*.pyo
*.bak

View File

@ -6,88 +6,120 @@ on:
- main
# Environment variables read by the jobs below, either as shell variables
# (e.g. $PIP_VERSION, $DOCKER_BUILD_REPOSITORY) or via the `env` context.
# NOTE(review): this view shows removed and added diff lines together — confirm which keys
# actually remain in the workflow file after this commit.
env:
DOCKER_REGISTRY: "quay.io"
DOCKER_NAMESPACE: "unstructured-io"
IMAGE_PLATFORMS: linux/amd64
PACKAGE: "unstructured"
DOCKER_REPOSITORY: quay.io/unstructured-io/unstructured
DOCKER_BUILD_REPOSITORY: quay.io/unstructured-io/build-unstructured
PIP_VERSION: "22.2.1"
PYTHON_VERSION: "3.8"
NLTK_DATA: ${{ github.workspace }}/nltk_data
TEST_IMAGE_NAME: "unstructured-dev"
jobs:
# NOTE(review): `setup:` and `set-short-sha:` appear to be the removed and added job names
# shown side by side in this diff view — confirm against the actual workflow file.
setup:
# Publishes a `short_sha` job output taken from the `set_short_sha` step; the build and
# publish jobs read it through `needs.set-short-sha.outputs.short_sha`.
set-short-sha:
runs-on: ubuntu-latest
outputs:
short_sha: ${{ steps.set_short_sha.outputs.short_sha }}
steps:
- uses: actions/checkout@v3
- uses: actions/cache@v3
id: virtualenv-cache
with:
path: |
.venv
nltk_data
key: unstructured-${{ runner.os }}-${{ hashFiles('requirements/*.txt') }}
- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v4
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Setup virtual environment (no cache hit)
if: steps.virtualenv-cache.outputs.cache-hit != 'true'
run: |
python${{ env.PYTHON_VERSION }} -m venv .venv
source .venv/bin/activate
make install-ci
build:
# Expose the first seven characters of the commit SHA as the `short_sha` step output.
# The value is appended to the $GITHUB_OUTPUT file because the `::set-output` workflow
# command is deprecated.
- name: Set Short SHA
  id: set_short_sha
  run: echo "short_sha=$(echo ${{ github.sha }} | cut -c1-7)" >> "$GITHUB_OUTPUT"
# Job: builds the linux/amd64 image with buildx (using $DOCKER_BUILD_REPOSITORY:amd as the
# cache source), runs `make docker-test` against it, and pushes it to
# $DOCKER_BUILD_REPOSITORY tagged amd-$SHORT_SHA.
# NOTE(review): the two `needs:` entries, the repeated buildx setup, the virtualenv cache
# step and the back-to-back `- name:` lines look like removed and added diff lines shown
# together — confirm against the actual workflow file.
build-amd:
runs-on: ubuntu-latest
needs: [setup]
needs: set-short-sha
env:
SHORT_SHA: ${{ needs.set-short-sha.outputs.short_sha }}
steps:
- uses: docker/setup-buildx-action@v1
- name: Checkout code
uses: actions/checkout@v3
- name: Set virtualenv cache
uses: actions/cache@v3
id: virtualenv-cache
with:
path: |
.venv
nltk_data
key: unstructured-${{ runner.os }}-${{ hashFiles('requirements/*.txt') }}
- name: Set up QEMU
uses: docker/setup-qemu-action@v1
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
- name: Login to Quay.io
uses: docker/login-action@v1
with:
registry: quay.io
username: ${{ secrets.QUAY_IO_ROBOT_USERNAME }}
password: ${{ secrets.QUAY_IO_ROBOT_TOKEN }}
- name: Build and push Docker image
- name: Build AMD image
run: |
DOCKER_BUILDKIT=1 docker buildx build --platform=linux/amd64 --load \
--build-arg PIP_VERSION=$PIP_VERSION \
--build-arg BUILDKIT_INLINE_CACHE=1 \
--progress plain \
--cache-from $DOCKER_BUILD_REPOSITORY:amd \
-t $DOCKER_BUILD_REPOSITORY:amd-$SHORT_SHA .
- name: Test AMD image
run: |
DOCKER_PLATFORM="linux/amd64" DOCKER_IMAGE="$DOCKER_BUILD_REPOSITORY:amd-$SHORT_SHA" make docker-test
- name: Push AMD image
run: |
# write to the build repository to cache for the publish-images job
docker push $DOCKER_BUILD_REPOSITORY:amd-$SHORT_SHA
# Job: builds the linux/arm64 image with buildx (using $DOCKER_BUILD_REPOSITORY:arm as the
# cache source), runs a single test module against it under QEMU emulation, and pushes it
# to $DOCKER_BUILD_REPOSITORY tagged arm-$SHORT_SHA.
# NOTE(review): this job uses docker/setup-qemu-action@v2 while another step in this
# workflow references @v1 — confirm whether the versions are meant to differ.
build-arm:
runs-on: ubuntu-latest
needs: set-short-sha
env:
SHORT_SHA: ${{ needs.set-short-sha.outputs.short_sha }}
steps:
- uses: docker/setup-buildx-action@v1
- name: Checkout code
uses: actions/checkout@v3
- name: Login to Quay.io
uses: docker/login-action@v1
with:
registry: quay.io
username: ${{ secrets.QUAY_IO_ROBOT_USERNAME }}
password: ${{ secrets.QUAY_IO_ROBOT_TOKEN }}
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
- name: Build ARM image
run: |
DOCKER_BUILDKIT=1 docker buildx build --platform=linux/arm64 --load \
--build-arg PIP_VERSION=$PIP_VERSION \
--build-arg BUILDKIT_INLINE_CACHE=1 \
--progress plain \
--cache-from $DOCKER_BUILD_REPOSITORY:arm \
-t $DOCKER_BUILD_REPOSITORY:arm-$SHORT_SHA .
- name: Test ARM image
run: |
# only run a subset of tests on ARM, since they take a long time with emulation
DOCKER_PLATFORM="linux/arm64" DOCKER_IMAGE="$DOCKER_BUILD_REPOSITORY:arm-$SHORT_SHA" make docker-test TEST_NAME=partition/test_text.py
- name: Push ARM image
run: |
# write to the build repository to cache for the publish-images job
docker push $DOCKER_BUILD_REPOSITORY:arm-$SHORT_SHA
# Job: runs after set-short-sha, build-amd and build-arm. It pulls the per-commit amd/arm
# images from $DOCKER_BUILD_REPOSITORY, re-tags and pushes them as `amd` and `arm`, then
# creates and pushes multi-arch manifests in $DOCKER_REPOSITORY for `latest`, $SHORT_SHA
# and the package version read from unstructured/__version__.py.
# NOTE(review): the GIT_SHA/IMAGE_NAME assignments, the `docker buildx create`/`build`
# command inside the manifest step, and the trailing "Test image"/"Push image" steps appear
# to be removed lines from the previous single-arch build shown alongside the added ones —
# confirm against the actual workflow file.
publish-images:
runs-on: ubuntu-latest
needs: [set-short-sha, build-amd, build-arm]
env:
SHORT_SHA: ${{ needs.set-short-sha.outputs.short_sha }}
steps:
- uses: docker/setup-buildx-action@v1
- name: Checkout code
uses: actions/checkout@v3
- name: Login to Quay.io
uses: docker/login-action@v1
with:
registry: quay.io
username: ${{ secrets.QUAY_IO_ROBOT_USERNAME }}
password: ${{ secrets.QUAY_IO_ROBOT_TOKEN }}
- name: Pull AMD image
run: |
docker pull $DOCKER_BUILD_REPOSITORY:amd-$SHORT_SHA
- name: Pull ARM image
run: |
docker pull $DOCKER_BUILD_REPOSITORY:arm-$SHORT_SHA
- name: Push latest build tags for AMD and ARM
run: |
# these are used to construct the final manifest but also cache-from in subsequent runs
docker tag $DOCKER_BUILD_REPOSITORY:amd-$SHORT_SHA $DOCKER_BUILD_REPOSITORY:amd
docker push $DOCKER_BUILD_REPOSITORY:amd
docker tag $DOCKER_BUILD_REPOSITORY:arm-$SHORT_SHA $DOCKER_BUILD_REPOSITORY:arm
docker push $DOCKER_BUILD_REPOSITORY:arm
- name: Push multiarch manifest
run: |
docker manifest create ${DOCKER_REPOSITORY}:latest $DOCKER_BUILD_REPOSITORY:amd $DOCKER_BUILD_REPOSITORY:arm
docker manifest push $DOCKER_REPOSITORY:latest
docker manifest create ${DOCKER_REPOSITORY}:$SHORT_SHA $DOCKER_BUILD_REPOSITORY:amd $DOCKER_BUILD_REPOSITORY:arm
docker manifest push $DOCKER_REPOSITORY:$SHORT_SHA
VERSION=$(grep -Po '(?<=__version__ = ")[^"]*' unstructured/__version__.py)
GIT_SHA=$(git rev-parse --short HEAD)
IMAGE_NAME=${{ env.PACKAGE }}
docker buildx create --use --driver=docker-container
docker buildx build --platform=${{ env.IMAGE_PLATFORMS }} --provenance=false --load \
--cache-to type=gha,scope=$GITHUB_REF_NAME-$IMAGE_NAME \
--cache-from type=gha,scope=$GITHUB_REF_NAME-$IMAGE_NAME \
--build-arg PIP_VERSION=${{ env.PIP_VERSION }} \
--progress plain \
-t ${{ env.TEST_IMAGE_NAME }}:latest \
-t ${{ env.DOCKER_REGISTRY }}/${{ env.DOCKER_NAMESPACE }}/${IMAGE_NAME}:${GIT_SHA} \
-t ${{ env.DOCKER_REGISTRY }}/${{ env.DOCKER_NAMESPACE }}/${IMAGE_NAME}:${VERSION} \
-t ${{ env.DOCKER_REGISTRY }}/${{ env.DOCKER_NAMESPACE }}/${IMAGE_NAME}:latest .
docker manifest create ${DOCKER_REPOSITORY}:$VERSION $DOCKER_BUILD_REPOSITORY:amd $DOCKER_BUILD_REPOSITORY:arm
docker manifest push $DOCKER_REPOSITORY:$VERSION
- name: Test image
run: |
source .venv/bin/activate
make docker-test
- name: Push image
run: |
docker image push --all-tags ${{ env.DOCKER_REGISTRY }}/${{ env.DOCKER_NAMESPACE }}/${{ env.PACKAGE }}

View File

@ -3,70 +3,79 @@
FROM centos:centos7.9.2009
ARG PIP_VERSION
ARG UNSTRUCTURED
# Install dependency packages
RUN yum -y update && \
yum -y install poppler-utils xz-devel which
# Enable the EPEL repository
RUN yum install -y epel-release && yum clean all
# Install pandoc
RUN yum install -y pandoc && yum clean all
# Note(austin) Get a recent tesseract from this repo
# See https://tesseract-ocr.github.io/tessdoc/Installation.html
# PDF and images:
RUN yum-config-manager --add-repo https://download.opensuse.org/repositories/home:/Alexander_Pozdnyakov/CentOS_7/ && \
rpm --import https://build.opensuse.org/projects/home:Alexander_Pozdnyakov/public_key && \
yum -y update && \
yum -y install tesseract
# Note(yuming): Install gcc & g++ ≥ 5.4 for Detectron2 requirement
RUN yum -y update
RUN yum -y install centos-release-scl
RUN yum -y install devtoolset-7-gcc*
SHELL [ "/usr/bin/scl", "enable", "devtoolset-7"]
RUN yum -y update && \
# MS Office docs:
yum -y install poppler-utils xz-devel wget tar curl make which && \
yum install -y epel-release && \
yum install -y pandoc && \
yum -y install libreoffice && \
yum -y install openssl-devel bzip2-devel libffi-devel make git sqlite-devel && \
yum clean all
# Install gcc & g++ ≥ 8 for Tesseract and Detectron2
RUN yum -y install centos-release-scl && \
yum -y install devtoolset-9-gcc* && \
yum clean all
SHELL [ "/usr/bin/scl", "enable", "devtoolset-9"]
# Install Tesseract
# Builds Leptonica 1.75.1 and autoconf-archive 2017.09.28 from downloaded tarballs, then
# builds Tesseract from a shallow git clone under devtoolset-9, copies the tessdata
# *.traineddata files into /usr/local/share/tessdata, and finally removes the source trees,
# the packages installed at the start of this step, and the yum/tmp caches.
# NOTE(review): `$sudo` is not defined anywhere in the visible lines; presumably it expands
# to an empty string so the commands run directly — confirm.
# NOTE(review): autoconf-archive is fetched over plain http with no checksum verification —
# consider an https source.
RUN set -ex && \
$sudo yum install -y opencv opencv-devel opencv-python perl-core clang libpng-devel libtiff-devel libwebp-devel libjpeg-turbo-devel git-core libtool pkgconfig xz && \
wget https://github.com/DanBloomberg/leptonica/releases/download/1.75.1/leptonica-1.75.1.tar.gz && \
tar -xzvf leptonica-1.75.1.tar.gz && \
cd leptonica-1.75.1 || exit && \
./configure && make && $sudo make install && \
cd .. && \
wget http://mirror.squ.edu.om/gnu/autoconf-archive/autoconf-archive-2017.09.28.tar.xz && \
tar -xvf autoconf-archive-2017.09.28.tar.xz && \
cd autoconf-archive-2017.09.28 || exit && \
./configure && make && $sudo make install && \
$sudo cp m4/* /usr/share/aclocal && \
cd .. && \
git clone --depth 1 https://github.com/tesseract-ocr/tesseract.git tesseract-ocr && \
cd tesseract-ocr || exit && \
export PKG_CONFIG_PATH=/usr/local/lib/pkgconfig && \
scl enable devtoolset-9 -- sh -c './autogen.sh && ./configure && make && make install' && \
cd .. && \
git clone https://github.com/tesseract-ocr/tessdata.git && \
$sudo cp tessdata/*.traineddata /usr/local/share/tessdata && \
$sudo rm -rf /tesseract-ocr /tessdata /autoconf-archive-2017.09.28* /leptonica-1.75.1* && \
$sudo yum -y remove opencv opencv-devel opencv-python perl-core clang libpng-devel libtiff-devel libwebp-devel libjpeg-turbo-devel git-core libtool && \
$sudo rm -rf /var/cache/yum/* && \
$sudo rm -rf /tmp/* && \
yum clean all
# Install Python
# Downloads the CPython 3.8.15 source tarball, builds it with --enable-optimizations,
# installs it via `make altinstall`, deletes the sources, and symlinks
# /usr/local/bin/python3 to python3.8.
# NOTE(review): the two near-identical `ln -s` lines (with and without a trailing `&& \`)
# look like the removed and added versions of the same line in this diff view — confirm
# that only one remains in the actual Dockerfile.
RUN yum -y install openssl-devel bzip2-devel libffi-devel make git sqlite-devel && \
curl -O https://www.python.org/ftp/python/3.8.15/Python-3.8.15.tgz && tar -xzf Python-3.8.15.tgz && \
cd Python-3.8.15/ && ./configure --enable-optimizations && make altinstall && \
cd .. && rm -rf Python-3.8.15* && \
ln -s /usr/local/bin/python3.8 /usr/local/bin/python3
ln -s /usr/local/bin/python3.8 /usr/local/bin/python3 && \
$sudo yum -y remove openssl-devel bzip2-devel libffi-devel make sqlite-devel && \
$sudo rm -rf /var/cache/yum/* && \
yum clean all
# create a home directory
# Set up environment
ENV HOME /home/
WORKDIR ${HOME}
RUN mkdir ${HOME}/.ssh && chmod go-rwx ${HOME}/.ssh \
&& ssh-keyscan -t rsa github.com >> /home/.ssh/known_hosts
ENV PYTHONPATH="${PYTHONPATH}:${HOME}"
ENV PATH="/home/usr/.local/bin:${PATH}"
# Copy and install Unstructured
COPY requirements requirements
RUN python3.8 -m pip install pip==${PIP_VERSION} && \
pip install --no-cache -r requirements/base.txt && \
pip install --no-cache -r requirements/test.txt && \
pip install --no-cache -r requirements/huggingface.txt && \
pip install --no-cache -r requirements/dev.txt && \
pip install --no-cache -r requirements/local-inference.txt && \
pip install --no-cache "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"
COPY example-docs example-docs
COPY requirements/base.txt requirements-base.txt
COPY requirements/test.txt requirements-test.txt
COPY requirements/huggingface.txt requirements-huggingface.txt
COPY requirements/dev.txt requirements-dev.txt
# PDFs and images
COPY requirements/local-inference.txt requirements-local-inference.txt
RUN python3.8 -m pip install pip==${PIP_VERSION} \
&& pip install --no-cache -r requirements-base.txt \
&& pip install --no-cache -r requirements-test.txt \
&& pip install --no-cache -r requirements-huggingface.txt \
&& pip install --no-cache -r requirements-dev.txt \
# PDFs and images
&& pip install --no-cache -r requirements-local-inference.txt \
# PDFs
&& pip install --no-cache "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"
COPY unstructured unstructured
CMD ["/bin/bash"]

View File

@ -186,16 +186,20 @@ check-coverage:
# Docker targets are provided for convenience only and are not required in a standard development environment
# Defaults for the docker targets; `?=` only assigns when the variable is not already set,
# so both can be overridden from the environment or the make command line.
DOCKER_PLATFORM ?= linux/amd64
DOCKER_IMAGE ?= unstructured-dev:latest
# Builds the image by delegating to scripts/docker-build.sh, passing PIP_VERSION through.
.PHONY: docker-build
docker-build:
PIP_VERSION=${PIP_VERSION} ./scripts/docker-build.sh
# Starts an interactive, throwaway container from the image.
# NOTE(review): the hard-coded and the variable-based `docker run` lines in the two targets
# below look like removed and added diff lines shown together — confirm against the actual
# Makefile.
.PHONY: docker-start-bash
docker-start-bash:
docker run --platform linux/amd64 -ti --rm unstructured-dev:latest
docker run --platform $(DOCKER_PLATFORM) -ti --rm $(DOCKER_IMAGE)
# Runs pytest inside the image with the local test_unstructured directory mounted at
# /home/test_unstructured. When TEST_NAME is set it is passed to pytest as `-k $(TEST_NAME)`.
.PHONY: docker-test
docker-test:
docker run --platform linux/amd64 --rm \
-v ${CURRENT_DIR}/test_unstructured:/home/test_unstructured unstructured-dev:latest \
bash -c "pytest test_unstructured"
docker run --platform $(DOCKER_PLATFORM) --rm \
-v ${CURRENT_DIR}/test_unstructured:/home/test_unstructured \
$(DOCKER_IMAGE) \
bash -c "pytest $(if $(TEST_NAME),-k $(TEST_NAME),) test_unstructured"