mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
ci: publish amd and arm images (#404)
This commit is contained in:
parent
09b52b4fc4
commit
65fec954ba
6
.dockerignore
Normal file
6
.dockerignore
Normal file
@ -0,0 +1,6 @@
|
||||
.git
|
||||
.vscode
|
||||
__pycache__
|
||||
*.pyc
|
||||
*.pyo
|
||||
*.bak
|
164
.github/workflows/docker-publish.yml
vendored
164
.github/workflows/docker-publish.yml
vendored
@ -6,88 +6,120 @@ on:
|
||||
- main
|
||||
|
||||
env:
|
||||
DOCKER_REGISTRY: "quay.io"
|
||||
DOCKER_NAMESPACE: "unstructured-io"
|
||||
IMAGE_PLATFORMS: linux/amd64
|
||||
PACKAGE: "unstructured"
|
||||
DOCKER_REPOSITORY: quay.io/unstructured-io/unstructured
|
||||
DOCKER_BUILD_REPOSITORY: quay.io/unstructured-io/build-unstructured
|
||||
PIP_VERSION: "22.2.1"
|
||||
PYTHON_VERSION: "3.8"
|
||||
NLTK_DATA: ${{ github.workspace }}/nltk_data
|
||||
TEST_IMAGE_NAME: "unstructured-dev"
|
||||
|
||||
jobs:
|
||||
setup:
|
||||
set-short-sha:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
short_sha: ${{ steps.set_short_sha.outputs.short_sha }}
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/cache@v3
|
||||
id: virtualenv-cache
|
||||
with:
|
||||
path: |
|
||||
.venv
|
||||
nltk_data
|
||||
key: unstructured-${{ runner.os }}-${{ hashFiles('requirements/*.txt') }}
|
||||
- name: Set up Python ${{ env.PYTHON_VERSION }}
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: ${{ env.PYTHON_VERSION }}
|
||||
- name: Setup virtual environment (no cache hit)
|
||||
if: steps.virtualenv-cache.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
python${{ env.PYTHON_VERSION }} -m venv .venv
|
||||
source .venv/bin/activate
|
||||
make install-ci
|
||||
|
||||
build:
|
||||
- name: Set Short SHA
|
||||
id: set_short_sha
|
||||
run: echo "::set-output name=short_sha::$(echo ${{ github.sha }} | cut -c1-7)"
|
||||
|
||||
build-amd:
|
||||
runs-on: ubuntu-latest
|
||||
needs: [setup]
|
||||
needs: set-short-sha
|
||||
env:
|
||||
SHORT_SHA: ${{ needs.set-short-sha.outputs.short_sha }}
|
||||
steps:
|
||||
- uses: docker/setup-buildx-action@v1
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Set virtualenv cache
|
||||
uses: actions/cache@v3
|
||||
id: virtualenv-cache
|
||||
with:
|
||||
path: |
|
||||
.venv
|
||||
nltk_data
|
||||
key: unstructured-${{ runner.os }}-${{ hashFiles('requirements/*.txt') }}
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v1
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
|
||||
- name: Login to Quay.io
|
||||
uses: docker/login-action@v1
|
||||
with:
|
||||
registry: quay.io
|
||||
username: ${{ secrets.QUAY_IO_ROBOT_USERNAME }}
|
||||
password: ${{ secrets.QUAY_IO_ROBOT_TOKEN }}
|
||||
|
||||
- name: Build and push Docker image
|
||||
- name: Build AMD image
|
||||
run: |
|
||||
DOCKER_BUILDKIT=1 docker buildx build --platform=linux/amd64 --load \
|
||||
--build-arg PIP_VERSION=$PIP_VERSION \
|
||||
--build-arg BUILDKIT_INLINE_CACHE=1 \
|
||||
--progress plain \
|
||||
--cache-from $DOCKER_BUILD_REPOSITORY:amd \
|
||||
-t $DOCKER_BUILD_REPOSITORY:amd-$SHORT_SHA .
|
||||
- name: Test AMD image
|
||||
run: |
|
||||
DOCKER_PLATFORM="linux/amd64" DOCKER_IMAGE="$DOCKER_BUILD_REPOSITORY:amd-$SHORT_SHA" make docker-test
|
||||
- name: Push AMD image
|
||||
run: |
|
||||
# write to the build repository to cache for the publish-images job
|
||||
docker push $DOCKER_BUILD_REPOSITORY:amd-$SHORT_SHA
|
||||
build-arm:
|
||||
runs-on: ubuntu-latest
|
||||
needs: set-short-sha
|
||||
env:
|
||||
SHORT_SHA: ${{ needs.set-short-sha.outputs.short_sha }}
|
||||
steps:
|
||||
- uses: docker/setup-buildx-action@v1
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v3
|
||||
- name: Login to Quay.io
|
||||
uses: docker/login-action@v1
|
||||
with:
|
||||
registry: quay.io
|
||||
username: ${{ secrets.QUAY_IO_ROBOT_USERNAME }}
|
||||
password: ${{ secrets.QUAY_IO_ROBOT_TOKEN }}
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v2
|
||||
- name: Build ARM image
|
||||
run: |
|
||||
DOCKER_BUILDKIT=1 docker buildx build --platform=linux/arm64 --load \
|
||||
--build-arg PIP_VERSION=$PIP_VERSION \
|
||||
--build-arg BUILDKIT_INLINE_CACHE=1 \
|
||||
--progress plain \
|
||||
--cache-from $DOCKER_BUILD_REPOSITORY:arm \
|
||||
-t $DOCKER_BUILD_REPOSITORY:arm-$SHORT_SHA .
|
||||
- name: Test ARM image
|
||||
run: |
|
||||
# only run a subset of tests on ARM, since they take a long time with emulation
|
||||
DOCKER_PLATFORM="linux/arm64" DOCKER_IMAGE="$DOCKER_BUILD_REPOSITORY:arm-$SHORT_SHA" make docker-test TEST_NAME=partition/test_text.py
|
||||
- name: Push ARM image
|
||||
run: |
|
||||
# write to the build repository to cache for the publish-images job
|
||||
docker push $DOCKER_BUILD_REPOSITORY:arm-$SHORT_SHA
|
||||
publish-images:
|
||||
runs-on: ubuntu-latest
|
||||
needs: [set-short-sha, build-amd, build-arm]
|
||||
env:
|
||||
SHORT_SHA: ${{ needs.set-short-sha.outputs.short_sha }}
|
||||
steps:
|
||||
- uses: docker/setup-buildx-action@v1
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v3
|
||||
- name: Login to Quay.io
|
||||
uses: docker/login-action@v1
|
||||
with:
|
||||
registry: quay.io
|
||||
username: ${{ secrets.QUAY_IO_ROBOT_USERNAME }}
|
||||
password: ${{ secrets.QUAY_IO_ROBOT_TOKEN }}
|
||||
- name: Pull AMD image
|
||||
run: |
|
||||
docker pull $DOCKER_BUILD_REPOSITORY:amd-$SHORT_SHA
|
||||
- name: Pull ARM image
|
||||
run: |
|
||||
docker pull $DOCKER_BUILD_REPOSITORY:arm-$SHORT_SHA
|
||||
- name: Push latest build tags for AMD and ARM
|
||||
run: |
|
||||
# these tags are used to construct the final manifest and are also reused as --cache-from sources in subsequent runs
|
||||
docker tag $DOCKER_BUILD_REPOSITORY:amd-$SHORT_SHA $DOCKER_BUILD_REPOSITORY:amd
|
||||
docker push $DOCKER_BUILD_REPOSITORY:amd
|
||||
docker tag $DOCKER_BUILD_REPOSITORY:arm-$SHORT_SHA $DOCKER_BUILD_REPOSITORY:arm
|
||||
docker push $DOCKER_BUILD_REPOSITORY:arm
|
||||
- name: Push multiarch manifest
|
||||
run: |
|
||||
docker manifest create ${DOCKER_REPOSITORY}:latest $DOCKER_BUILD_REPOSITORY:amd $DOCKER_BUILD_REPOSITORY:arm
|
||||
docker manifest push $DOCKER_REPOSITORY:latest
|
||||
docker manifest create ${DOCKER_REPOSITORY}:$SHORT_SHA $DOCKER_BUILD_REPOSITORY:amd $DOCKER_BUILD_REPOSITORY:arm
|
||||
docker manifest push $DOCKER_REPOSITORY:$SHORT_SHA
|
||||
VERSION=$(grep -Po '(?<=__version__ = ")[^"]*' unstructured/__version__.py)
|
||||
GIT_SHA=$(git rev-parse --short HEAD)
|
||||
IMAGE_NAME=${{ env.PACKAGE }}
|
||||
docker buildx create --use --driver=docker-container
|
||||
docker buildx build --platform=${{ env.IMAGE_PLATFORMS }} --provenance=false --load \
|
||||
--cache-to type=gha,scope=$GITHUB_REF_NAME-$IMAGE_NAME \
|
||||
--cache-from type=gha,scope=$GITHUB_REF_NAME-$IMAGE_NAME \
|
||||
--build-arg PIP_VERSION=${{ env.PIP_VERSION }} \
|
||||
--progress plain \
|
||||
-t ${{ env.TEST_IMAGE_NAME }}:latest \
|
||||
-t ${{ env.DOCKER_REGISTRY }}/${{ env.DOCKER_NAMESPACE }}/${IMAGE_NAME}:${GIT_SHA} \
|
||||
-t ${{ env.DOCKER_REGISTRY }}/${{ env.DOCKER_NAMESPACE }}/${IMAGE_NAME}:${VERSION} \
|
||||
-t ${{ env.DOCKER_REGISTRY }}/${{ env.DOCKER_NAMESPACE }}/${IMAGE_NAME}:latest .
|
||||
docker manifest create ${DOCKER_REPOSITORY}:$VERSION $DOCKER_BUILD_REPOSITORY:amd $DOCKER_BUILD_REPOSITORY:arm
|
||||
docker manifest push $DOCKER_REPOSITORY:$VERSION
|
||||
|
||||
- name: Test image
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
make docker-test
|
||||
|
||||
- name: Push image
|
||||
run: |
|
||||
docker image push --all-tags ${{ env.DOCKER_REGISTRY }}/${{ env.DOCKER_NAMESPACE }}/${{ env.PACKAGE }}
|
||||
|
||||
|
107
Dockerfile
107
Dockerfile
@ -3,70 +3,79 @@
|
||||
FROM centos:centos7.9.2009
|
||||
|
||||
ARG PIP_VERSION
|
||||
ARG UNSTRUCTURED
|
||||
|
||||
# Install dependency packages
|
||||
RUN yum -y update && \
|
||||
yum -y install poppler-utils xz-devel which
|
||||
|
||||
# Enable the EPEL repository
|
||||
RUN yum install -y epel-release && yum clean all
|
||||
|
||||
# Install pandoc
|
||||
RUN yum install -y pandoc && yum clean all
|
||||
|
||||
# Note(austin) Get a recent tesseract from this repo
|
||||
# See https://tesseract-ocr.github.io/tessdoc/Installation.html
|
||||
# PDF and images:
|
||||
RUN yum-config-manager --add-repo https://download.opensuse.org/repositories/home:/Alexander_Pozdnyakov/CentOS_7/ && \
|
||||
rpm --import https://build.opensuse.org/projects/home:Alexander_Pozdnyakov/public_key && \
|
||||
yum -y update && \
|
||||
yum -y install tesseract
|
||||
|
||||
# Note(yuming): Install gcc & g++ ≥ 5.4 for Detectron2 requirement
|
||||
RUN yum -y update
|
||||
RUN yum -y install centos-release-scl
|
||||
RUN yum -y install devtoolset-7-gcc*
|
||||
SHELL [ "/usr/bin/scl", "enable", "devtoolset-7"]
|
||||
|
||||
RUN yum -y update && \
|
||||
# MS Office docs:
|
||||
yum -y install poppler-utils xz-devel wget tar curl make which && \
|
||||
yum install -y epel-release && \
|
||||
yum install -y pandoc && \
|
||||
yum -y install libreoffice && \
|
||||
yum -y install openssl-devel bzip2-devel libffi-devel make git sqlite-devel && \
|
||||
yum clean all
|
||||
|
||||
# Install gcc & g++ ≥ 8 for Tesseract and Detectron2
|
||||
RUN yum -y install centos-release-scl && \
|
||||
yum -y install devtoolset-9-gcc* && \
|
||||
yum clean all
|
||||
|
||||
SHELL [ "/usr/bin/scl", "enable", "devtoolset-9"]
|
||||
|
||||
# Install Tesseract
|
||||
RUN set -ex && \
|
||||
$sudo yum install -y opencv opencv-devel opencv-python perl-core clang libpng-devel libtiff-devel libwebp-devel libjpeg-turbo-devel git-core libtool pkgconfig xz && \
|
||||
wget https://github.com/DanBloomberg/leptonica/releases/download/1.75.1/leptonica-1.75.1.tar.gz && \
|
||||
tar -xzvf leptonica-1.75.1.tar.gz && \
|
||||
cd leptonica-1.75.1 || exit && \
|
||||
./configure && make && $sudo make install && \
|
||||
cd .. && \
|
||||
wget http://mirror.squ.edu.om/gnu/autoconf-archive/autoconf-archive-2017.09.28.tar.xz && \
|
||||
tar -xvf autoconf-archive-2017.09.28.tar.xz && \
|
||||
cd autoconf-archive-2017.09.28 || exit && \
|
||||
./configure && make && $sudo make install && \
|
||||
$sudo cp m4/* /usr/share/aclocal && \
|
||||
cd .. && \
|
||||
git clone --depth 1 https://github.com/tesseract-ocr/tesseract.git tesseract-ocr && \
|
||||
cd tesseract-ocr || exit && \
|
||||
export PKG_CONFIG_PATH=/usr/local/lib/pkgconfig && \
|
||||
scl enable devtoolset-9 -- sh -c './autogen.sh && ./configure && make && make install' && \
|
||||
cd .. && \
|
||||
git clone https://github.com/tesseract-ocr/tessdata.git && \
|
||||
$sudo cp tessdata/*.traineddata /usr/local/share/tessdata && \
|
||||
$sudo rm -rf /tesseract-ocr /tessdata /autoconf-archive-2017.09.28* /leptonica-1.75.1* && \
|
||||
$sudo yum -y remove opencv opencv-devel opencv-python perl-core clang libpng-devel libtiff-devel libwebp-devel libjpeg-turbo-devel git-core libtool && \
|
||||
$sudo rm -rf /var/cache/yum/* && \
|
||||
$sudo rm -rf /tmp/* && \
|
||||
yum clean all
|
||||
|
||||
# Install Python
|
||||
RUN yum -y install openssl-devel bzip2-devel libffi-devel make git sqlite-devel && \
|
||||
curl -O https://www.python.org/ftp/python/3.8.15/Python-3.8.15.tgz && tar -xzf Python-3.8.15.tgz && \
|
||||
cd Python-3.8.15/ && ./configure --enable-optimizations && make altinstall && \
|
||||
cd .. && rm -rf Python-3.8.15* && \
|
||||
ln -s /usr/local/bin/python3.8 /usr/local/bin/python3
|
||||
ln -s /usr/local/bin/python3.8 /usr/local/bin/python3 && \
|
||||
$sudo yum -y remove openssl-devel bzip2-devel libffi-devel make sqlite-devel && \
|
||||
$sudo rm -rf /var/cache/yum/* && \
|
||||
yum clean all
|
||||
|
||||
# create a home directory
|
||||
# Set up environment
|
||||
ENV HOME /home/
|
||||
|
||||
WORKDIR ${HOME}
|
||||
RUN mkdir ${HOME}/.ssh && chmod go-rwx ${HOME}/.ssh \
|
||||
&& ssh-keyscan -t rsa github.com >> /home/.ssh/known_hosts
|
||||
|
||||
ENV PYTHONPATH="${PYTHONPATH}:${HOME}"
|
||||
ENV PATH="/home/usr/.local/bin:${PATH}"
|
||||
|
||||
# Copy and install Unstructured
|
||||
COPY requirements requirements
|
||||
|
||||
RUN python3.8 -m pip install pip==${PIP_VERSION} && \
|
||||
pip install --no-cache -r requirements/base.txt && \
|
||||
pip install --no-cache -r requirements/test.txt && \
|
||||
pip install --no-cache -r requirements/huggingface.txt && \
|
||||
pip install --no-cache -r requirements/dev.txt && \
|
||||
pip install --no-cache -r requirements/local-inference.txt && \
|
||||
pip install --no-cache "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"
|
||||
|
||||
COPY example-docs example-docs
|
||||
|
||||
COPY requirements/base.txt requirements-base.txt
|
||||
COPY requirements/test.txt requirements-test.txt
|
||||
COPY requirements/huggingface.txt requirements-huggingface.txt
|
||||
COPY requirements/dev.txt requirements-dev.txt
|
||||
# PDFs and images
|
||||
COPY requirements/local-inference.txt requirements-local-inference.txt
|
||||
|
||||
|
||||
RUN python3.8 -m pip install pip==${PIP_VERSION} \
|
||||
&& pip install --no-cache -r requirements-base.txt \
|
||||
&& pip install --no-cache -r requirements-test.txt \
|
||||
&& pip install --no-cache -r requirements-huggingface.txt \
|
||||
&& pip install --no-cache -r requirements-dev.txt \
|
||||
# PDFs and images
|
||||
&& pip install --no-cache -r requirements-local-inference.txt \
|
||||
# PDFs
|
||||
&& pip install --no-cache "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"
|
||||
|
||||
COPY unstructured unstructured
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
|
12
Makefile
12
Makefile
@ -186,16 +186,20 @@ check-coverage:
|
||||
|
||||
# Docker targets are provided for convenience only and are not required in a standard development environment
|
||||
|
||||
DOCKER_PLATFORM ?= linux/amd64
|
||||
DOCKER_IMAGE ?= unstructured-dev:latest
|
||||
|
||||
.PHONY: docker-build
|
||||
docker-build:
|
||||
PIP_VERSION=${PIP_VERSION} ./scripts/docker-build.sh
|
||||
|
||||
.PHONY: docker-start-bash
|
||||
docker-start-bash:
|
||||
docker run --platform linux/amd64 -ti --rm unstructured-dev:latest
|
||||
docker run --platform $(DOCKER_PLATFORM) -ti --rm $(DOCKER_IMAGE)
|
||||
|
||||
.PHONY: docker-test
|
||||
docker-test:
|
||||
docker run --platform linux/amd64 --rm \
|
||||
-v ${CURRENT_DIR}/test_unstructured:/home/test_unstructured unstructured-dev:latest \
|
||||
bash -c "pytest test_unstructured"
|
||||
docker run --platform $(DOCKER_PLATFORM) --rm \
|
||||
-v ${CURRENT_DIR}/test_unstructured:/home/test_unstructured \
|
||||
$(DOCKER_IMAGE) \
|
||||
bash -c "pytest $(if $(TEST_NAME),-k $(TEST_NAME),) test_unstructured"
|
||||
|
Loading…
x
Reference in New Issue
Block a user