unstructured/Dockerfile-amd64
Matt Robinson 059fc64bd9
build: apk add libreoffice24 (#3065)
### Summary

Switches to installing `libreoffice` from the Wolfi repository and
upgrades the `libreoffice` version to `libreoffice==24.x.x`. Resolves a
medium vulnerability in the old `libreoffice` version. Security scanning
with `anchore/grype` was also added to the `test_dockerfile` job.
Requirements were bumped to resolve a vulnerability in the `requests`
library.

### Testing

`test_dockerfile` passes with the updates.
2024-05-21 18:54:16 +00:00

45 lines
1.7 KiB
Plaintext

# Base image: Chainguard's Wolfi base image.
# NOTE(review): `latest` is a moving tag, so rebuilds are not reproducible;
# consider pinning to a digest (cgr.dev/chainguard/wolfi-base@sha256:...).
FROM cgr.dev/chainguard/wolfi-base:latest
# All relative paths in the instructions below resolve against /app.
WORKDIR /app
# Root is required for the apk installs and the chown performed further down;
# the build switches to the nonroot user after those steps.
USER root
# Local .apk files that the apk RUN step installs with --allow-untrusted.
COPY ./docker-packages/*.apk packages/
# Requirements files consumed by the pip install step.
COPY ./requirements requirements/
# Library sources, tests and example documents.
# NOTE(review): these are copied before the apk/pip layers, so any change to
# them invalidates the cache for every later RUN. The model-initialisation step
# imports `unstructured`, so this COPY has to stay ahead of that step.
COPY unstructured unstructured
COPY test_unstructured test_unstructured
COPY example-docs example-docs
# Install OS-level dependencies in a single layer (still running as root).
#   1. Refresh the apk index and install the base tooling from the repositories.
#   2. Install the locally supplied .apk files from packages/ one at a time;
#      --allow-untrusted skips signature verification for those files.
#   3. Install libreoffice, bash and libmagic from the repositories. This stays
#      after the local packages to preserve the original install order.
#   4. Move the tesseract `configs` and `tessconfigs` directories from
#      /share/tessdata into /usr/local/share/tessdata (the TESSDATA_PREFIX set
#      at the end of this Dockerfile).
#   5. Expose soffice.bin as both `libreoffice` and `soffice` on /usr/bin and
#      make sure it is executable. Only the real binary is chmod'ed: chmod on
#      the two symlinks would resolve to the same file, so those calls were
#      redundant.
#   6. Delete the apk index cache in this same RUN so it is not baked into the
#      image layer.
RUN apk update && apk add py3.11-pip mesa-gl glib cmake && \
  apk add --allow-untrusted packages/pandoc-3.1.8-r0.apk && \
  apk add --allow-untrusted packages/poppler-23.09.0-r0.apk && \
  apk add --allow-untrusted packages/leptonica-1.83.0-r0.apk && \
  apk add --allow-untrusted packages/tesseract-5.3.2-r0.apk && \
  apk add libreoffice bash libmagic && \
  mv /share/tessdata/configs /share/tessdata/tessconfigs /usr/local/share/tessdata/ && \
  ln -s /usr/lib/libreoffice/program/soffice.bin /usr/bin/libreoffice && \
  ln -s /usr/lib/libreoffice/program/soffice.bin /usr/bin/soffice && \
  chmod +x /usr/lib/libreoffice/program/soffice.bin && \
  rm -rf /var/cache/apk/*
# Hand the application tree to the unprivileged user, then drop root for every
# remaining build step and for the running container.
RUN chown -R nonroot:nonroot /app
USER nonroot
# Install every *.txt file found under requirements/ with its own pip
# invocation, as a per-user (--user) install and without keeping pip's cache.
# NOTE(review): find's `-exec ... ';'` does not propagate pip's exit status, so
# a failed install does not fail the build. Confirm whether that best-effort
# behaviour is intended before tightening it.
RUN find requirements/ -type f -name "*.txt" -exec pip3.11 install --no-cache-dir --user -r '{}' ';'
# Same flags as the requirements install above: --no-cache-dir keeps pip's
# download cache out of the layer, and --user makes the per-user install
# explicit instead of relying on pip's fallback for non-root users.
RUN pip3.11 install --no-cache-dir --user unstructured.paddlepaddle
# Run the one-off initialisation steps at build time, as the nonroot user:
#   - download the NLTK `punkt` and `averaged_perceptron_tagger` resources;
#   - call initialize() from unstructured.partition.model_init
#     (presumably loads/caches the default models - confirm against that module);
#   - initialise the table transformer with the
#     microsoft/table-transformer-structure-recognition model.
RUN python3.11 -c "import nltk; nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')" && \
  python3.11 -c "from unstructured.partition.model_init import initialize; initialize()" && \
  python3.11 -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; table_model = UnstructuredTableTransformerModel(); table_model.initialize('microsoft/table-transformer-structure-recognition')"
# Runtime environment.
#   PATH            - appends /home/nonroot/.local/bin (presumably where the
#                     `pip install --user` steps place console scripts for the
#                     nonroot user - confirm the user's home directory).
#   TESSDATA_PREFIX - the directory the apk RUN step moves the tesseract
#                     configs/tessconfigs into.
ENV PATH="${PATH}:/home/nonroot/.local/bin" \
    TESSDATA_PREFIX=/usr/local/share/tessdata
# Default to an interactive bash shell (exec form, no wrapping /bin/sh -c).
CMD ["/bin/bash"]