build: image and dependency updates; fix tesseract files locations (#3310)

### Summary

Updates to the latest version of the `wolfi-base` image. Changes
include:
- Version bumps to address CVEs
- `libreoffice` is now included in the `arm64` image, so `.doc` files are now
supported on `arm64`. `.ppt` files do not yet work with the `libreoffice`
package currently available on `wolfi-os`. We have follow-on work to look into
that.
- Updates the location of the `tesseract` `tessdata` files on the
`arm64` build. Closes #3290.
- Adds `psutil` to the base dependencies. Closes #3319.

### Testing

- `test_dockerfile` should continue to pass with the updates.
This commit is contained in:
Matt Robinson 2024-07-01 15:39:32 -04:00 committed by GitHub
parent 9eb4c96b94
commit db8617872b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
26 changed files with 43 additions and 39 deletions

View File

@ -529,6 +529,5 @@ jobs:
uses: anchore/scan-action@v3
with:
image: "unstructured:dev"
# NOTE(robinson) - revert this to medium when we bump libreoffice
severity-cutoff: critical
only-fixed: true

View File

@ -2,6 +2,9 @@
### Enhancements
* **`.doc` files are now supported in the `arm64` image.** `libreoffice24` is added to the `arm64` image, meaning `.doc` files are now supported. We have follow-on work planned to investigate adding `.ppt` support for `arm64` as well.
### Features
### Fixes

View File

@ -1,4 +1,4 @@
FROM quay.io/unstructured-io/base-images:wolfi-base@sha256:753fa1ed5a4793eb2bb179c07a34ba9164ac46328642e2db615259274b0c9baf as base
FROM quay.io/unstructured-io/base-images:wolfi-base-d46498e@sha256:3db0544df1d8d9989cd3c3b28670d8b81351dfdc1d9129004c71ff05996fd51e as base
USER root
@ -16,10 +16,9 @@ RUN chown -R notebook-user:notebook-user /app && \
USER notebook-user
RUN find requirements/ -type f -name "*.txt" -exec pip3.11 install --no-cache-dir --user -r '{}' ';'
RUN pip3.11 install unstructured.paddlepaddle
RUN python3.11 -c "import nltk; nltk.download('punkt')" && \
RUN find requirements/ -type f -name "*.txt" -exec pip3.11 install --no-cache-dir --user -r '{}' ';' && \
pip3.11 install unstructured.paddlepaddle && \
python3.11 -c "import nltk; nltk.download('punkt')" && \
python3.11 -c "import nltk; nltk.download('averaged_perceptron_tagger')" && \
python3.11 -c "from unstructured.partition.model_init import initialize; initialize()" && \
python3.11 -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"

View File

@ -11,8 +11,8 @@ emoji
dataclasses-json
python-iso639
langdetect
# NOTE(robinson) - pinned due to a feature being deprecated in the latest version. plan to
# investigate and remove pin
# NOTE(robinson) - numpy pin is because ONNX model weights are only compatible
# with numpy 1.x.x
numpy<2
rapidfuzz
backoff
@ -20,3 +20,4 @@ typing-extensions
unstructured-client
wrapt
tqdm
psutil

View File

@ -59,6 +59,8 @@ packaging==23.2
# -c ././deps/constraints.txt
# marshmallow
# unstructured-client
psutil==6.0.0
# via -r ./base.in
python-dateutil==2.9.0.post0
# via unstructured-client
python-iso639==2024.4.27

View File

@ -268,7 +268,7 @@ prompt-toolkit==3.0.47
# jupyter-console
psutil==6.0.0
# via
# -c ./test.txt
# -c ./base.txt
# ipykernel
ptyprocess==0.7.0
# via

View File

@ -77,7 +77,7 @@ lanms-neo==1.0.2
# via unstructured-paddleocr
lazy-loader==0.4
# via scikit-image
lmdb==1.4.1
lmdb==1.5.1
# via unstructured-paddleocr
lxml==5.2.2
# via
@ -122,7 +122,7 @@ opencv-python==4.8.0.76
# -c ././deps/constraints.txt
# imgaug
# unstructured-paddleocr
openpyxl==3.1.4
openpyxl==3.1.5
# via unstructured-paddleocr
packaging==23.2
# via
@ -136,7 +136,7 @@ pandas==2.2.2
# via visualdl
pdf2image==1.17.0
# via unstructured-paddleocr
pillow==10.3.0
pillow==10.4.0
# via
# imageio
# imgaug
@ -153,7 +153,9 @@ protobuf==4.23.4
# -c ././deps/constraints.txt
# visualdl
psutil==6.0.0
# via visualdl
# via
# -c ./base.txt
# visualdl
pyclipper==1.3.0.post5
# via unstructured-paddleocr
pycryptodome==3.20.0

View File

@ -121,7 +121,7 @@ onnx==1.16.1
# via
# -r ./extra-pdf-image.in
# unstructured-inference
onnxruntime==1.18.0
onnxruntime==1.18.1
# via unstructured-inference
opencv-python==4.8.0.76
# via
@ -153,7 +153,7 @@ pdfplumber==0.11.1
# via layoutparser
pikepdf==9.0.0
# via -r ./extra-pdf-image.in
pillow==10.3.0
pillow==10.4.0
# via
# layoutparser
# matplotlib
@ -275,7 +275,7 @@ tqdm==4.66.4
# huggingface-hub
# iopath
# transformers
transformers==4.41.2
transformers==4.42.3
# via unstructured-inference
typing-extensions==4.12.2
# via

View File

@ -6,7 +6,7 @@
#
lxml==5.2.2
# via python-pptx
pillow==10.3.0
pillow==10.4.0
# via python-pptx
python-pptx==0.6.23
# via -r ./extra-pptx.in

View File

@ -12,7 +12,7 @@ numpy==1.26.4
# via
# -c ./base.txt
# pandas
openpyxl==3.1.4
openpyxl==3.1.5
# via -r ./extra-xlsx.in
pandas==2.2.2
# via -r ./extra-xlsx.in

View File

@ -99,7 +99,7 @@ tqdm==4.66.4
# huggingface-hub
# sacremoses
# transformers
transformers==4.41.2
transformers==4.42.3
# via -r ./huggingface.in
typing-extensions==4.12.2
# via

View File

@ -8,7 +8,7 @@ anyio==3.7.1
# via
# -c ./ingest/../deps/constraints.txt
# httpx
astrapy==1.3.0
astrapy==1.3.1
# via -r ./ingest/astra.in
bson==0.5.10
# via astrapy

View File

@ -117,7 +117,7 @@ oauthlib==3.2.2
# via
# kubernetes
# requests-oauthlib
onnxruntime==1.18.0
onnxruntime==1.18.1
# via chromadb
opentelemetry-api==1.25.0
# via

View File

@ -13,7 +13,7 @@ charset-normalizer==3.3.2
# via
# -c ./ingest/../base.txt
# requests
clarifai==10.5.2
clarifai==10.5.3
# via -r ./ingest/clarifai.in
clarifai-grpc==10.5.4
# via clarifai
@ -40,7 +40,7 @@ numpy==1.26.4
# tritonclient
pfzy==0.3.4
# via inquirerpy
pillow==10.3.0
pillow==10.4.0
# via clarifai
prompt-toolkit==3.0.47
# via inquirerpy
@ -51,7 +51,7 @@ protobuf==4.23.4
# googleapis-common-protos
pygments==2.18.0
# via rich
python-rapidjson==1.17
python-rapidjson==1.18
# via tritonclient
pyyaml==6.0.1
# via clarifai

View File

@ -15,7 +15,7 @@ charset-normalizer==3.3.2
# via
# -c ./ingest/../base.txt
# requests
databricks-sdk==0.28.0
databricks-sdk==0.29.0
# via -r ./ingest/databricks-volumes.in
google-auth==2.30.0
# via databricks-sdk

View File

@ -122,7 +122,7 @@ packaging==23.2
# langchain-core
# marshmallow
# transformers
pillow==10.3.0
pillow==10.4.0
# via sentence-transformers
pydantic==2.7.4
# via
@ -186,7 +186,7 @@ tqdm==4.66.4
# huggingface-hub
# sentence-transformers
# transformers
transformers==4.41.2
transformers==4.42.3
# via sentence-transformers
typing-extensions==4.12.2
# via

View File

@ -38,7 +38,7 @@ idna==3.7
# anyio
# httpx
# requests
openai==1.35.5
openai==1.35.7
# via -r ./ingest/embed-octoai.in
pydantic==2.7.4
# via openai

View File

@ -98,7 +98,7 @@ numpy==1.26.4
# -c ./ingest/../base.txt
# langchain
# langchain-community
openai==1.35.5
openai==1.35.7
# via -r ./ingest/embed-openai.in
orjson==3.10.5
# via langsmith

View File

@ -54,7 +54,7 @@ google-auth==2.30.0
# google-cloud-core
# google-cloud-resource-manager
# google-cloud-storage
google-cloud-aiplatform==1.56.0
google-cloud-aiplatform==1.57.0
# via langchain-google-vertexai
google-cloud-bigquery==3.25.0
# via google-cloud-aiplatform

View File

@ -17,7 +17,7 @@ idna==3.7
# via
# -c ./ingest/../base.txt
# requests
python-gitlab==4.6.0
python-gitlab==4.7.0
# via -r ./ingest/gitlab.in
requests==2.32.3
# via

View File

@ -17,7 +17,7 @@ charset-normalizer==3.3.2
# requests
google-api-core==2.19.1
# via google-api-python-client
google-api-python-client==2.134.0
google-api-python-client==2.135.0
# via -r ./ingest/google-drive.in
google-auth==2.30.0
# via

View File

@ -6,5 +6,5 @@
#
dnspython==2.6.1
# via pymongo
pymongo==4.7.3
pymongo==4.8.0
# via -r ./ingest/mongodb.in

View File

@ -8,11 +8,12 @@ flake8-print
freezegun
label_studio_sdk
mypy
psutil
pydantic
pytest-cov
pytest-mock
ruff
# NOTE(robinson) - we need to do additional cleanup to pass
# linting for the latest version of ruff
ruff<0.5.0
types-Markdown
types-requests
types-tabulate

View File

@ -92,8 +92,6 @@ platformdirs==3.10.0
# black
pluggy==1.5.0
# via pytest
psutil==6.0.0
# via -r ./test.in
pycodestyle==2.12.0
# via
# flake8

View File

@ -9,7 +9,6 @@ DOCKER_BUILD_CMD=(docker buildx build --load -f Dockerfile
--build-arg PIP_VERSION="$PIP_VERSION"
--build-arg BUILDKIT_INLINE_CACHE=1
--progress plain
--platform linux/amd64
--cache-from "$DOCKER_REPOSITORY":latest
-t "$DOCKER_IMAGE" .)

View File

@ -33,7 +33,7 @@ all_tests=(
'google-drive.sh'
'wikipedia.sh'
'local.sh'
'slack.sh'
# 'slack.sh'
'against-api.sh'
'gcs.sh'
'kafka-local.sh'