unstructured/Dockerfile
Matt Robinson 331c7faf38
build(deps): split up dependencies by document type (#986)
* split dependencies by document type

* make pip-compile with new requirements

* add extra requirements to setup.py

* add in all docs; re pip-compile

* extra for all docs

* add pandas to xlsx

* dependency requires for tsv and csv

* handling for doc, docx and odt

* dependency check for pypandoc

* required dependencies for pandoc files

* xml and html

* markdown

* msg

* add in pdf

* add in pptx

* add in excel

* add lxml as base req

* extra all docs for local inference

* local inference installs all

* pin pillow version

* fixes for plain text tests

* fixes for doc

* update make commands

* changelog and version

* add xlrd

* update pip-compile

* pin numpy for python 3.8 support

* more constraints

* constraint on scipy

* update install docs

* constrain ipython

* add outlook to pip-compile

* more ipython constraints

* add extras to dockerfile

* pin office365 client

* few doc tweaks

* types as strings

* last pip-compile

* re pip-compile

* make tidy

* make tidy
2023-08-01 11:31:13 -04:00

56 lines
2.3 KiB
Docker

# syntax=docker/dockerfile:1
# Stage "base": shared environment (HOME, working directory, SSH known_hosts,
# PYTHONPATH and PATH) that later stages build on.
FROM quay.io/unstructured-io/base-images:rocky8.7-3 AS base

# pip version to install; supplied via --build-arg and used by the
# `pip install pip==${PIP_VERSION}` step. No default is defined.
ARG PIP_VERSION

# Set up environment
ENV HOME=/home/
WORKDIR ${HOME}

# Create a private .ssh directory and record GitHub's RSA host key so SSH
# connections to github.com do not stop at the host-key prompt.
RUN mkdir ${HOME}/.ssh && chmod go-rwx ${HOME}/.ssh \
  && ssh-keyscan -t rsa github.com >> ${HOME}/.ssh/known_hosts

# Append HOME to whatever PYTHONPATH the base image already defines.
ENV PYTHONPATH="${PYTHONPATH}:${HOME}"
# Put /home/usr/.local/bin ahead of the inherited PATH.
# NOTE(review): with HOME=/home/ a per-user install would presumably land in
# /home/.local/bin rather than /home/usr/.local/bin - confirm this path.
ENV PATH="/home/usr/.local/bin:${PATH}"
# Stage "deps": installs the Python requirements and the NLTK data packages.
FROM base AS deps

# Copy and install Unstructured
COPY requirements requirements

# Pin pip, then install every requirements file in the original order, one
# file per invocation. All installs go through `python3.8 -m pip` so packages
# land in the same interpreter whose pip was just pinned and which the later
# steps run. "Development Tools" is installed and removed inside this single
# RUN, so the group is not left installed when the layer is committed.
RUN python3.8 -m pip install pip==${PIP_VERSION} && \
  dnf -y groupinstall "Development Tools" && \
  python3.8 -m pip install --no-cache-dir -r requirements/base.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/test.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/huggingface.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/dev.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/ingest-azure.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/ingest-box.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/ingest-github.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/ingest-gitlab.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/ingest-google-drive.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/ingest-reddit.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/ingest-s3.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/ingest-slack.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/ingest-wikipedia.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/extra-csv.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/extra-docx.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/extra-markdown.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/extra-msg.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/extra-odt.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/extra-pandoc.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/extra-pdf-image.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/extra-pptx.txt && \
  python3.8 -m pip install --no-cache-dir -r requirements/extra-xlsx.txt && \
  dnf -y groupremove "Development Tools" && \
  dnf clean all

# Download the NLTK 'punkt' and 'averaged_perceptron_tagger' packages.
# nltk.download() reports failure by returning False rather than raising, so
# the result is turned into the exit status: a failed download fails the build
# instead of leaving the data missing.
RUN python3.8 -c "import sys, nltk; sys.exit(0 if all(nltk.download(pkg) for pkg in ('punkt', 'averaged_perceptron_tagger')) else 1)"
# Stage "code": adds the example documents and the unstructured package
# source on top of the dependency stage.
FROM deps AS code

COPY example-docs example-docs
COPY unstructured unstructured

# Run the ingest doc processor's initialize() once at build time.
# NOTE(review): presumably this pre-loads or downloads resources so containers
# do not have to at startup - confirm against initialize().
RUN python3.8 -c "from unstructured.ingest.doc_processor.generalized import initialize; initialize()"

# Default to an interactive shell.
CMD ["/bin/bash"]