From b47e6e9fdc205f97ce379afa11d7f89c7392310c Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Tue, 18 Jun 2024 05:15:44 -0700 Subject: [PATCH] refactor: remove download packages step (#3225) This PR removes the download packages step, because those packages are already installed in the base images. It also updates the base `wolfi` image because the original base image can no longer be found: https://github.com/Unstructured-IO/unstructured/actions/runs/9555654898/job/26339587945 --- .github/workflows/ci.yml | 1 - .github/workflows/docker-publish.yml | 1 - .gitignore | 3 --- CHANGELOG.md | 2 +- Dockerfile-amd64 | 2 +- Makefile | 4 ---- scripts/docker-dl-packages.sh | 23 ----------------------- 7 files changed, 2 insertions(+), 34 deletions(-) delete mode 100755 scripts/docker-dl-packages.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fd58600ab..8fbda89e6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -499,7 +499,6 @@ jobs: - name: Test Dockerfile run: | echo "UNS_API_KEY=${{ secrets.UNS_API_KEY }}" > uns_test_env_file - make docker-dl-packages make docker-build make docker-test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true - name: Scan image diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index b0d877eff..1bcd489c2 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -47,7 +47,6 @@ jobs: password: ${{ secrets.QUAY_IO_ROBOT_TOKEN }} - name: Build images run: | - make docker-dl-packages ARCH=$(cut -d "/" -f2 <<< ${{ matrix.docker-platform }}) DOCKER_BUILDKIT=1 docker buildx build --platform=$ARCH --load \ -f Dockerfile-$ARCH \ diff --git a/.gitignore b/.gitignore index d9af439bc..5b3aba670 100644 --- a/.gitignore +++ b/.gitignore @@ -204,6 +204,3 @@ examples/**/output/ outputdiff.txt metricsdiff.txt - -# APK packages for the docker build -docker-packages/* diff --git a/CHANGELOG.md b/CHANGELOG.md index 0bd419514..b3a6adf49 
100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,7 +21,7 @@ ### Fixes -* **Remove deprecated `overwrite_schema` kwarg from Delta Table connector.**. The `overwrite_schema` kwarg is deprecated in `deltalake>=0.18.0`. `schema_mode=` should be used now instead. `schema_mode="overwrite"` is equivalent to `overwrite_schema=True` and `schema_mode="merge"` is equivalent to `overwrite_schema="False"`. `schema_mode` defaults to `None`. You can also now specify `engine`, which defaults to `"pyarrow"`. You need to specify `enginer="rust"` to use `"schema_mode"`. +* **Remove deprecated `overwrite_schema` kwarg from Delta Table connector.** The `overwrite_schema` kwarg is deprecated in `deltalake>=0.18.0`. `schema_mode=` should be used now instead. `schema_mode="overwrite"` is equivalent to `overwrite_schema=True` and `schema_mode="merge"` is equivalent to `overwrite_schema="False"`. `schema_mode` defaults to `None`. You can also now specify `engine`, which defaults to `"pyarrow"`. You need to specify `enginer="rust"` to use `"schema_mode"`. * **Fix passing parameters to python-client** - Remove parsing list arguments to strings in passing arguments to python-client in Ingest workflow and `partition_via_api` * **table metric bug fix** get_element_level_alignment()now will find all the matched indices in predicted table data instead of only returning the first match in the case of multiple matches for the same gt string. * **fsspec connector path/permissions bug** V2 fsspec connectors were failing when defined relative filepaths had leading slash. This strips that slash to guarantee the relative path never has it. 
diff --git a/Dockerfile-amd64 b/Dockerfile-amd64 index 2caf5cf90..115775933 100644 --- a/Dockerfile-amd64 +++ b/Dockerfile-amd64 @@ -1,4 +1,4 @@ -FROM quay.io/unstructured-io/base-images:wolfi-base@sha256:6c00a236c648ffdaf196ccbc446f5c6cc9eb4e3ab9e437178abcfac710b2b373 +FROM quay.io/unstructured-io/base-images:wolfi-base@sha256:863fd5b87e780dacec62b97c2db2aeda7f770fcf9b045b29f53ec1ddbe607b4d USER root diff --git a/Makefile b/Makefile index 1071a648c..41890430d 100644 --- a/Makefile +++ b/Makefile @@ -460,10 +460,6 @@ DOCKER_IMAGE ?= unstructured:dev docker-build: PIP_VERSION=${PIP_VERSION} DOCKER_IMAGE_NAME=${DOCKER_IMAGE} ./scripts/docker-build.sh -.PHONY: docker-dl-packages -docker-dl-packages: - @scripts/docker-dl-packages.sh - .PHONY: docker-start-bash docker-start-bash: docker run -ti --rm ${DOCKER_IMAGE} diff --git a/scripts/docker-dl-packages.sh b/scripts/docker-dl-packages.sh deleted file mode 100755 index b50b400e2..000000000 --- a/scripts/docker-dl-packages.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -files=( - "libreoffice-7.6.5-r0.apk" - "libreoffice-24-24.2.4.1-r0.67f8e014.apk" - "openjpeg-2.5.0-r0.apk" - "poppler-23.09.0-r0.apk" - "leptonica-1.83.0-r0.apk" - "pandoc-3.1.8-r0.apk" - "tesseract-5.3.2-r0.apk" - "nltk_data.tgz" - -) - -directory="docker-packages" -mkdir -p "${directory}" - -for file in "${files[@]}"; do - echo "Downloading ${file}" - wget "https://utic-public-cf.s3.amazonaws.com/$file" -P "$directory" -done - -echo "Downloads complete."