2025-01-08 14:00:13 -08:00
|
|
|
# Build stage `base`, started from the Unstructured wolfi base image.
# NOTE(review): the tag ends in `-latest`, so it presumably floats — consider
# pinning a version or digest for reproducible builds.
FROM quay.io/unstructured-io/base-images:wolfi-base-latest AS base
|
|
|
|
|
|
|
|
# Interpreter name used by the RUN steps (as $PYTHON) and as the target of the
# /usr/bin/python3 symlink; override with `--build-arg PYTHON=...`.
ARG PYTHON=python3.11
|
2025-01-23 11:11:38 -06:00
|
|
|
# pip is invoked as a module of the selected interpreter. The value contains
# spaces, so RUN steps expand $PIP unquoted to let the shell split it into words.
ARG PIP="${PYTHON} -m pip"
|
2023-03-14 13:40:01 -07:00
|
|
|
|
2024-05-15 18:53:15 -04:00
|
|
|
# Become root for the ownership change, package install, and /usr/bin symlink
# performed by the RUN step that follows the COPY instructions.
USER root
|
2023-03-14 13:40:01 -07:00
|
|
|
|
2024-06-14 16:41:27 -04:00
|
|
|
# Relative COPY destinations and relative paths in later RUN steps resolve
# against /app.
WORKDIR /app
|
|
|
|
|
2024-05-16 20:22:10 -04:00
|
|
|
# Copy the requirement lists, the `unstructured` package, `test_unstructured`,
# and `example-docs` into the working directory.
# --chown assigns ownership at copy time, so a later recursive chown does not
# have to rewrite every copied file (which would duplicate them in a new layer).
COPY --chown=notebook-user:notebook-user ./requirements requirements/
COPY --chown=notebook-user:notebook-user unstructured unstructured
COPY --chown=notebook-user:notebook-user test_unstructured test_unstructured
COPY --chown=notebook-user:notebook-user example-docs example-docs
|
2023-03-14 13:40:01 -07:00
|
|
|
|
2024-06-26 15:14:55 +02:00
|
|
|
# Root-only setup: give /app to notebook-user, install the Ubuntu font and git,
# rebuild the font cache, and make sure /usr/bin/python3 exists.
# The symlink check is an explicit `if` rather than a trailing `test || ln`:
# in `a && b && c || d` the shell runs `d` whenever ANY earlier command fails,
# so a failed chown/apk/fc-cache would have been masked by a successful `ln -s`.
# `--no-cache` keeps the apk index out of the image layer.
RUN chown -R notebook-user:notebook-user /app && \
    apk add --no-cache font-ubuntu git && \
    fc-cache -fv && \
    if [ ! -e /usr/bin/python3 ]; then \
      ln -s "/usr/bin/${PYTHON}" /usr/bin/python3; \
    fi
|
2024-06-14 16:41:27 -04:00
|
|
|
|
|
|
|
# Drop privileges: the remaining RUN step and the default command run as
# notebook-user.
USER notebook-user
|
2024-05-15 18:53:15 -04:00
|
|
|
|
2025-01-23 11:11:38 -06:00
|
|
|
# Environment shared by the remaining build steps and by running containers.
# PATH is extended before the pip install so that user-level scripts are found;
# this avoids warning logs and also avoids issues with packages that need to
# compile during installation.
ENV PATH="${PATH}:/home/notebook-user/.local/bin" \
    TESSDATA_PREFIX=/usr/local/share/tessdata \
    NLTK_DATA=/home/notebook-user/nltk_data
|
|
|
|
|
|
|
|
# Install Python dependencies, download the required NLTK packages, and run the
# model initialisation calls at build time.
# Requirement files are installed one at a time through a loop that aborts on
# the first failure: `find -exec ... ';'` does not propagate the exit status of
# the command it runs, so a failed `pip install` would otherwise go unnoticed.
# $PIP and $PYTHON are expanded unquoted on purpose ($PIP holds several words).
# pip's stdin is redirected so it cannot consume the file list from the pipe.
RUN find requirements/ -type f -name "*.txt" | \
      while IFS= read -r req; do \
        $PIP install --no-cache-dir --user -r "$req" </dev/null || exit 1; \
      done && \
    mkdir -p "${NLTK_DATA}" && \
    $PYTHON -m nltk.downloader -d "${NLTK_DATA}" punkt_tab averaged_perceptron_tagger_eng && \
    $PYTHON -c "from unstructured.partition.model_init import initialize; initialize()" && \
    $PYTHON -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"
|
2024-05-15 18:53:15 -04:00
|
|
|
|
2023-03-14 13:40:01 -07:00
|
|
|
# Default command: start bash (exec form, so no wrapping `/bin/sh -c`).
CMD ["/bin/bash"]
|