unstructured/Dockerfile

48 lines
1.8 KiB
Docker
Raw Normal View History

2023-03-14 13:40:01 -07:00
# syntax=docker/dockerfile:1
# Base stage: common environment shared by every later stage of this build.
FROM quay.io/unstructured-io/base-images:rocky8.7-3 AS base

# pip version pinned by the dependency stage; there is no default, so it must be
# supplied at build time with --build-arg PIP_VERSION=<version>.
ARG PIP_VERSION

# Set up environment
ENV HOME=/home/
WORKDIR ${HOME}

# Create a private ~/.ssh directory and record GitHub's RSA host key in
# known_hosts, using ${HOME} consistently for every path.
RUN mkdir -p ${HOME}/.ssh && chmod go-rwx ${HOME}/.ssh \
  && ssh-keyscan -t rsa github.com >> ${HOME}/.ssh/known_hosts

# Append the home directory to the Python module search path.
ENV PYTHONPATH="${PYTHONPATH}:${HOME}"
# NOTE(review): HOME is /home/, so a per-user install would land in
# /home/.local/bin rather than /home/usr/.local/bin - confirm this path is intended.
ENV PATH="/home/usr/.local/bin:${PATH}"
# Dependency stage: installs every Python requirement set on top of the base stage.
FROM base AS deps

# Re-declared so the build argument is explicitly in scope for this stage.
ARG PIP_VERSION

# Copy and install Unstructured
COPY requirements requirements

# The "Development Tools" group is installed, used and removed within this one
# RUN so the toolchain does not persist in the resulting layer. Every install
# goes through `python3.8 -m pip` so that the pip pinned on the first line is
# the one that performs all subsequent installs, and `--no-cache-dir` keeps
# pip's download cache out of the layer. The requirement files are installed
# one at a time, in the original order.
RUN python3.8 -m pip install --no-cache-dir pip==${PIP_VERSION} && \
  dnf -y groupinstall "Development Tools" && \
  python3.8 -m pip install --no-cache-dir -r requirements/base.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/test.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/huggingface.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/dev.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/ingest-azure.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/ingest-box.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/ingest-github.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/ingest-gitlab.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/ingest-google-drive.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/ingest-reddit.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/ingest-s3.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/ingest-slack.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/ingest-wikipedia.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/local-inference.txt && \
  dnf -y groupremove "Development Tools" && \
  dnf clean all
# Download the NLTK data packages 'punkt' and 'averaged_perceptron_tagger' at
# build time. nltk.download() signals failure by returning False instead of
# raising, so the interpreter's exit status is derived from the return values:
# any failed download makes this RUN step (and therefore the build) fail rather
# than silently producing an image without the data.
RUN python3.8 -c "import sys, nltk; sys.exit(not all([nltk.download(pkg) for pkg in ('punkt', 'averaged_perceptron_tagger')]))"
# Final stage: layers the sample documents and the library source on top of the
# installed dependencies.
FROM deps AS code

# Sample documents first, then the package source, both copied into the
# working directory under their original names.
COPY example-docs example-docs
COPY unstructured unstructured

# Call the ingest doc processor's initialize() once during the build.
# NOTE(review): presumably this pre-loads resources so they are baked into the
# image instead of being fetched on first use - confirm against initialize().
RUN python3.8 -c "from unstructured.ingest.doc_processor.generalized import initialize; initialize()"

# Default to a shell when the container is started without a command.
CMD ["/bin/bash"]