mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-05 03:23:03 +00:00
build: image and dependency updates; fix tesseract files locations (#3310)
### Summary Updates to the latest version of the `wolfi-base` image. Changes include: - Version bumps to address CVEs - `libreoffice` is now included in the `arm64` image, so `.doc` files are now supported on `arm64`. `.ppt` files do not work with the `libreoffice` package currently available on `wolfi-os`; we have follow-on work to look into that. - Updates the location of the `tesseract` `tessdata` files on the `arm64` build. Closes #3290. - Closes #3319 and adds `psutil` to the base dependencies. ### Testing - `test_dockerfile` should continue to pass with the updates.
This commit is contained in:
parent
9eb4c96b94
commit
db8617872b
1
.github/workflows/ci.yml
vendored
1
.github/workflows/ci.yml
vendored
@ -529,6 +529,5 @@ jobs:
|
||||
uses: anchore/scan-action@v3
|
||||
with:
|
||||
image: "unstructured:dev"
|
||||
# NOTE(robinson) - revert this to medium when we bump libreoffice
|
||||
severity-cutoff: critical
|
||||
only-fixed: true
|
||||
|
||||
@ -2,6 +2,9 @@
|
||||
|
||||
### Enhancements
|
||||
|
||||
* **`.doc` files are now supported in the `arm64` image.** `libreoffice24` is added to the `arm64` image, meaning `.doc` files are now supported. We have follow-on work planned to investigate adding `.ppt` support for `arm64` as well.
|
||||
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
FROM quay.io/unstructured-io/base-images:wolfi-base@sha256:753fa1ed5a4793eb2bb179c07a34ba9164ac46328642e2db615259274b0c9baf as base
|
||||
FROM quay.io/unstructured-io/base-images:wolfi-base-d46498e@sha256:3db0544df1d8d9989cd3c3b28670d8b81351dfdc1d9129004c71ff05996fd51e as base
|
||||
|
||||
USER root
|
||||
|
||||
@ -16,10 +16,9 @@ RUN chown -R notebook-user:notebook-user /app && \
|
||||
|
||||
USER notebook-user
|
||||
|
||||
RUN find requirements/ -type f -name "*.txt" -exec pip3.11 install --no-cache-dir --user -r '{}' ';'
|
||||
RUN pip3.11 install unstructured.paddlepaddle
|
||||
|
||||
RUN python3.11 -c "import nltk; nltk.download('punkt')" && \
|
||||
RUN find requirements/ -type f -name "*.txt" -exec pip3.11 install --no-cache-dir --user -r '{}' ';' && \
|
||||
pip3.11 install unstructured.paddlepaddle && \
|
||||
python3.11 -c "import nltk; nltk.download('punkt')" && \
|
||||
python3.11 -c "import nltk; nltk.download('averaged_perceptron_tagger')" && \
|
||||
python3.11 -c "from unstructured.partition.model_init import initialize; initialize()" && \
|
||||
python3.11 -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"
|
||||
|
||||
@ -11,8 +11,8 @@ emoji
|
||||
dataclasses-json
|
||||
python-iso639
|
||||
langdetect
|
||||
# NOTE(robinson) - pinned due to a feature being deprecated in the latest version. plan to
|
||||
# investigate and remove pin
|
||||
# NOTE(robinson) - numpy pin is because ONNX model weights are only compatible
|
||||
# with numpy 1.x.x
|
||||
numpy<2
|
||||
rapidfuzz
|
||||
backoff
|
||||
@ -20,3 +20,4 @@ typing-extensions
|
||||
unstructured-client
|
||||
wrapt
|
||||
tqdm
|
||||
psutil
|
||||
|
||||
@ -59,6 +59,8 @@ packaging==23.2
|
||||
# -c ././deps/constraints.txt
|
||||
# marshmallow
|
||||
# unstructured-client
|
||||
psutil==6.0.0
|
||||
# via -r ./base.in
|
||||
python-dateutil==2.9.0.post0
|
||||
# via unstructured-client
|
||||
python-iso639==2024.4.27
|
||||
|
||||
@ -268,7 +268,7 @@ prompt-toolkit==3.0.47
|
||||
# jupyter-console
|
||||
psutil==6.0.0
|
||||
# via
|
||||
# -c ./test.txt
|
||||
# -c ./base.txt
|
||||
# ipykernel
|
||||
ptyprocess==0.7.0
|
||||
# via
|
||||
|
||||
@ -77,7 +77,7 @@ lanms-neo==1.0.2
|
||||
# via unstructured-paddleocr
|
||||
lazy-loader==0.4
|
||||
# via scikit-image
|
||||
lmdb==1.4.1
|
||||
lmdb==1.5.1
|
||||
# via unstructured-paddleocr
|
||||
lxml==5.2.2
|
||||
# via
|
||||
@ -122,7 +122,7 @@ opencv-python==4.8.0.76
|
||||
# -c ././deps/constraints.txt
|
||||
# imgaug
|
||||
# unstructured-paddleocr
|
||||
openpyxl==3.1.4
|
||||
openpyxl==3.1.5
|
||||
# via unstructured-paddleocr
|
||||
packaging==23.2
|
||||
# via
|
||||
@ -136,7 +136,7 @@ pandas==2.2.2
|
||||
# via visualdl
|
||||
pdf2image==1.17.0
|
||||
# via unstructured-paddleocr
|
||||
pillow==10.3.0
|
||||
pillow==10.4.0
|
||||
# via
|
||||
# imageio
|
||||
# imgaug
|
||||
@ -153,7 +153,9 @@ protobuf==4.23.4
|
||||
# -c ././deps/constraints.txt
|
||||
# visualdl
|
||||
psutil==6.0.0
|
||||
# via visualdl
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# visualdl
|
||||
pyclipper==1.3.0.post5
|
||||
# via unstructured-paddleocr
|
||||
pycryptodome==3.20.0
|
||||
|
||||
@ -121,7 +121,7 @@ onnx==1.16.1
|
||||
# via
|
||||
# -r ./extra-pdf-image.in
|
||||
# unstructured-inference
|
||||
onnxruntime==1.18.0
|
||||
onnxruntime==1.18.1
|
||||
# via unstructured-inference
|
||||
opencv-python==4.8.0.76
|
||||
# via
|
||||
@ -153,7 +153,7 @@ pdfplumber==0.11.1
|
||||
# via layoutparser
|
||||
pikepdf==9.0.0
|
||||
# via -r ./extra-pdf-image.in
|
||||
pillow==10.3.0
|
||||
pillow==10.4.0
|
||||
# via
|
||||
# layoutparser
|
||||
# matplotlib
|
||||
@ -275,7 +275,7 @@ tqdm==4.66.4
|
||||
# huggingface-hub
|
||||
# iopath
|
||||
# transformers
|
||||
transformers==4.41.2
|
||||
transformers==4.42.3
|
||||
# via unstructured-inference
|
||||
typing-extensions==4.12.2
|
||||
# via
|
||||
|
||||
@ -6,7 +6,7 @@
|
||||
#
|
||||
lxml==5.2.2
|
||||
# via python-pptx
|
||||
pillow==10.3.0
|
||||
pillow==10.4.0
|
||||
# via python-pptx
|
||||
python-pptx==0.6.23
|
||||
# via -r ./extra-pptx.in
|
||||
|
||||
@ -12,7 +12,7 @@ numpy==1.26.4
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# pandas
|
||||
openpyxl==3.1.4
|
||||
openpyxl==3.1.5
|
||||
# via -r ./extra-xlsx.in
|
||||
pandas==2.2.2
|
||||
# via -r ./extra-xlsx.in
|
||||
|
||||
@ -99,7 +99,7 @@ tqdm==4.66.4
|
||||
# huggingface-hub
|
||||
# sacremoses
|
||||
# transformers
|
||||
transformers==4.41.2
|
||||
transformers==4.42.3
|
||||
# via -r ./huggingface.in
|
||||
typing-extensions==4.12.2
|
||||
# via
|
||||
|
||||
@ -8,7 +8,7 @@ anyio==3.7.1
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# httpx
|
||||
astrapy==1.3.0
|
||||
astrapy==1.3.1
|
||||
# via -r ./ingest/astra.in
|
||||
bson==0.5.10
|
||||
# via astrapy
|
||||
|
||||
@ -117,7 +117,7 @@ oauthlib==3.2.2
|
||||
# via
|
||||
# kubernetes
|
||||
# requests-oauthlib
|
||||
onnxruntime==1.18.0
|
||||
onnxruntime==1.18.1
|
||||
# via chromadb
|
||||
opentelemetry-api==1.25.0
|
||||
# via
|
||||
|
||||
@ -13,7 +13,7 @@ charset-normalizer==3.3.2
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# requests
|
||||
clarifai==10.5.2
|
||||
clarifai==10.5.3
|
||||
# via -r ./ingest/clarifai.in
|
||||
clarifai-grpc==10.5.4
|
||||
# via clarifai
|
||||
@ -40,7 +40,7 @@ numpy==1.26.4
|
||||
# tritonclient
|
||||
pfzy==0.3.4
|
||||
# via inquirerpy
|
||||
pillow==10.3.0
|
||||
pillow==10.4.0
|
||||
# via clarifai
|
||||
prompt-toolkit==3.0.47
|
||||
# via inquirerpy
|
||||
@ -51,7 +51,7 @@ protobuf==4.23.4
|
||||
# googleapis-common-protos
|
||||
pygments==2.18.0
|
||||
# via rich
|
||||
python-rapidjson==1.17
|
||||
python-rapidjson==1.18
|
||||
# via tritonclient
|
||||
pyyaml==6.0.1
|
||||
# via clarifai
|
||||
|
||||
@ -15,7 +15,7 @@ charset-normalizer==3.3.2
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# requests
|
||||
databricks-sdk==0.28.0
|
||||
databricks-sdk==0.29.0
|
||||
# via -r ./ingest/databricks-volumes.in
|
||||
google-auth==2.30.0
|
||||
# via databricks-sdk
|
||||
|
||||
@ -122,7 +122,7 @@ packaging==23.2
|
||||
# langchain-core
|
||||
# marshmallow
|
||||
# transformers
|
||||
pillow==10.3.0
|
||||
pillow==10.4.0
|
||||
# via sentence-transformers
|
||||
pydantic==2.7.4
|
||||
# via
|
||||
@ -186,7 +186,7 @@ tqdm==4.66.4
|
||||
# huggingface-hub
|
||||
# sentence-transformers
|
||||
# transformers
|
||||
transformers==4.41.2
|
||||
transformers==4.42.3
|
||||
# via sentence-transformers
|
||||
typing-extensions==4.12.2
|
||||
# via
|
||||
|
||||
@ -38,7 +38,7 @@ idna==3.7
|
||||
# anyio
|
||||
# httpx
|
||||
# requests
|
||||
openai==1.35.5
|
||||
openai==1.35.7
|
||||
# via -r ./ingest/embed-octoai.in
|
||||
pydantic==2.7.4
|
||||
# via openai
|
||||
|
||||
@ -98,7 +98,7 @@ numpy==1.26.4
|
||||
# -c ./ingest/../base.txt
|
||||
# langchain
|
||||
# langchain-community
|
||||
openai==1.35.5
|
||||
openai==1.35.7
|
||||
# via -r ./ingest/embed-openai.in
|
||||
orjson==3.10.5
|
||||
# via langsmith
|
||||
|
||||
@ -54,7 +54,7 @@ google-auth==2.30.0
|
||||
# google-cloud-core
|
||||
# google-cloud-resource-manager
|
||||
# google-cloud-storage
|
||||
google-cloud-aiplatform==1.56.0
|
||||
google-cloud-aiplatform==1.57.0
|
||||
# via langchain-google-vertexai
|
||||
google-cloud-bigquery==3.25.0
|
||||
# via google-cloud-aiplatform
|
||||
|
||||
@ -17,7 +17,7 @@ idna==3.7
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# requests
|
||||
python-gitlab==4.6.0
|
||||
python-gitlab==4.7.0
|
||||
# via -r ./ingest/gitlab.in
|
||||
requests==2.32.3
|
||||
# via
|
||||
|
||||
@ -17,7 +17,7 @@ charset-normalizer==3.3.2
|
||||
# requests
|
||||
google-api-core==2.19.1
|
||||
# via google-api-python-client
|
||||
google-api-python-client==2.134.0
|
||||
google-api-python-client==2.135.0
|
||||
# via -r ./ingest/google-drive.in
|
||||
google-auth==2.30.0
|
||||
# via
|
||||
|
||||
@ -6,5 +6,5 @@
|
||||
#
|
||||
dnspython==2.6.1
|
||||
# via pymongo
|
||||
pymongo==4.7.3
|
||||
pymongo==4.8.0
|
||||
# via -r ./ingest/mongodb.in
|
||||
|
||||
@ -8,11 +8,12 @@ flake8-print
|
||||
freezegun
|
||||
label_studio_sdk
|
||||
mypy
|
||||
psutil
|
||||
pydantic
|
||||
pytest-cov
|
||||
pytest-mock
|
||||
ruff
|
||||
# NOTE(robinson) - we need to do additional cleanup to pass
|
||||
# linting for the latest version of ruff
|
||||
ruff<0.5.0
|
||||
types-Markdown
|
||||
types-requests
|
||||
types-tabulate
|
||||
|
||||
@ -92,8 +92,6 @@ platformdirs==3.10.0
|
||||
# black
|
||||
pluggy==1.5.0
|
||||
# via pytest
|
||||
psutil==6.0.0
|
||||
# via -r ./test.in
|
||||
pycodestyle==2.12.0
|
||||
# via
|
||||
# flake8
|
||||
|
||||
@ -9,7 +9,6 @@ DOCKER_BUILD_CMD=(docker buildx build --load -f Dockerfile
|
||||
--build-arg PIP_VERSION="$PIP_VERSION"
|
||||
--build-arg BUILDKIT_INLINE_CACHE=1
|
||||
--progress plain
|
||||
--platform linux/amd64
|
||||
--cache-from "$DOCKER_REPOSITORY":latest
|
||||
-t "$DOCKER_IMAGE" .)
|
||||
|
||||
|
||||
@ -33,7 +33,7 @@ all_tests=(
|
||||
'google-drive.sh'
|
||||
'wikipedia.sh'
|
||||
'local.sh'
|
||||
'slack.sh'
|
||||
# 'slack.sh'
|
||||
'against-api.sh'
|
||||
'gcs.sh'
|
||||
'kafka-local.sh'
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user