2025-01-08 14:00:13 -08:00
|
|
|
# Single build stage, named `base`, built on the unstructured-io base image hosted on quay.io.
# NOTE(review): `wolfi-base-latest` looks like a floating tag, so two builds of this file may start
# from different base contents — consider pinning a versioned tag or digest.
FROM quay.io/unstructured-io/base-images:wolfi-base-latest AS base
|
|
|
|
|
2025-06-17 02:32:06 -04:00
|
|
|
# Name of the Python interpreter used by the build steps below (symlink target, NLTK download,
# model initialisation). Override with `--build-arg PYTHON=...`.
ARG PYTHON=python3.12
|
2025-01-23 11:11:38 -06:00
|
|
|
# pip command line, derived from PYTHON so packages are installed with the same interpreter.
ARG PIP="${PYTHON} -m pip"
|
2023-03-14 13:40:01 -07:00
|
|
|
|
2024-05-15 18:53:15 -04:00
|
|
|
# Run the following steps as root; the build switches to notebook-user further down.
USER root
|
2023-03-14 13:40:01 -07:00
|
|
|
|
2024-06-14 16:41:27 -04:00
|
|
|
# Working directory for the rest of the build; relative COPY destinations and RUN paths resolve here.
WORKDIR /app
|
|
|
|
|
2024-05-16 20:22:10 -04:00
|
|
|
# Copy the requirements/, unstructured/, test_unstructured/ and example-docs/ trees from the build
# context into the working directory. The requirement files are copied first.
COPY requirements/ requirements/
COPY unstructured unstructured
COPY test_unstructured test_unstructured
COPY example-docs example-docs
|
2023-03-14 13:40:01 -07:00
|
|
|
|
2024-06-26 15:14:55 +02:00
|
|
|
# Root-only setup, done in a single layer:
#   1. give notebook-user ownership of /app and everything under it,
#   2. install the Ubuntu font, fontconfig and git, then rebuild the font cache,
#   3. make sure /usr/bin/python3 exists, linking it to the selected $PYTHON when it does not.
# Step 3 is an explicit `if` on purpose. Written as `a && b && [ -e f ] || ln ...` the list parses as
# `(a && b && [ -e f ]) || ln ...`, so a failed chown / apk add / fc-cache would fall through to
# `ln -s` and the whole step could still exit 0.
RUN chown -R notebook-user:notebook-user /app && \
    apk add --no-cache font-ubuntu fontconfig git && \
    fc-cache -fv && \
    if [ ! -e /usr/bin/python3 ]; then ln -s "/usr/bin/$PYTHON" /usr/bin/python3; fi
|
2024-06-14 16:41:27 -04:00
|
|
|
|
|
|
|
# Drop root: everything from here on (pip installs, data and model downloads, the default command)
# runs as notebook-user.
USER notebook-user
|
2024-05-15 18:53:15 -04:00
|
|
|
|
2025-01-23 11:11:38 -06:00
|
|
|
# Environment shared by the build steps below and by the running container.
# PATH is extended before any `pip install --user` runs: this avoids pip's warning logs and also
# avoids issues with packages that need to be compiled during installation.
# NLTK_DATA is the directory the NLTK downloader is pointed at later in this file.
ENV PATH="${PATH}:/home/notebook-user/.local/bin" \
    TESSDATA_PREFIX=/usr/local/share/tessdata \
    NLTK_DATA=/home/notebook-user/nltk_data
|
|
|
|
|
|
|
|
# Install Python dependencies and download required NLTK packages
|
build: remove test and dev deps from docker image (#3969)
Removed the dependencies contained in `test.txt`, `dev.txt`, and
`constraints.txt` from the things that get installed in the docker
image. In order to keep testing the image (running the tests), I added a
step to the `docker-test` make target to install `test.txt` and
`dev.txt`. Thus we presumably get a smaller image (probably not much
smaller), reduce the dependency chain of our images, and have less
exposure to vulnerabilities while still testing as robustly as before.
Incidentally, I removed the `Dockerfile` for our ubuntu image, since it
made reference to non-existent make targets, which tells me it's stale
and wasn't being used.
### Review:
- Reviewer should ensure the dev and test dependencies are not being
installed in the docker image. One way to check is to check the logs in
CI, and note, e.g. that
[this](https://github.com/Unstructured-IO/unstructured/actions/runs/14112971425/job/39536304012#step:3:1700)
is the first reference to `pytest` in the docker build and test logs,
after the image build is completed.
- Reviewer should ensure docker image is still being tested in CI and is
passing.
2025-03-27 13:41:11 -05:00
|
|
|
# Install every requirements file except test.txt, dev.txt and constraints.txt, then download the
# NLTK packages into ${NLTK_DATA} and run the unstructured / unstructured_inference initialisers
# (presumably so model files are fetched at build time rather than at first use — confirm).
#
# pip is driven through `sh -c '...' sh {} +` instead of `-exec ... ';'`: find does not treat a
# non-zero exit from a `;`-terminated -exec as an error, so a failed install would not fail the
# build. The loop still installs one file at a time, stops at the first failure, and the `+` form
# makes find return that failure. $PIP and $req are expanded by the inner shell; build args are
# exported into the environment of RUN commands.
RUN find requirements/ -type f -name "*.txt" \
      ! -name "test.txt" ! -name "dev.txt" ! -name "constraints.txt" \
      -exec sh -c 'for req in "$@"; do $PIP install --no-cache-dir --user -r "$req" || exit 1; done' sh '{}' + && \
    mkdir -p "${NLTK_DATA}" && \
    $PYTHON -m nltk.downloader -d "${NLTK_DATA}" punkt_tab averaged_perceptron_tagger_eng && \
    $PYTHON -c "from unstructured.partition.model_init import initialize; initialize()" && \
    $PYTHON -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"
|
2024-05-15 18:53:15 -04:00
|
|
|
|
2025-06-04 14:52:58 -04:00
|
|
|
# Set only after the build-time initialisation steps above have run.
# NOTE(review): presumably this puts the Hugging Face Hub client in offline mode so the container
# relies on files cached during the build instead of the network — confirm.
ENV HF_HUB_OFFLINE=1
|
|
|
|
|
2023-03-14 13:40:01 -07:00
|
|
|
# Default command: start bash. Exec (JSON-array) form, so bash is started directly rather than
# through `/bin/sh -c`.
CMD ["/bin/bash"]
|