mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-28 07:33:36 +00:00
* split dependencies by document type * make pip-compile with new requirements * add extra requirements to setup.py * add in all docs; re pip-compile * extra for all docs * add pandas to xlsx * dependency requires for tsv and csv * handling for doc, docx and odt * dependency check for pypandoc * required dependencies for pandoc files * xml and html * markdown * msg * add in pdf * add in pptx * add in excel * add lxml as base req * extra all docs for local inference * local inference installs all * pin pillow version * fixes for plain text tests * fixes for doc * update make commands * changelog and version * add xlrd * update pip-compile * pin numpy for python 3.8 support * more constraints * constraint on scipy * update install docs * constrain ipython * add outlook to pip-compile * more ipython constraints * add extras to dockerfile * pin office365 client * few doc tweaks * types as strings * last pip-compile * re pip-compile * make tidy * make tidy
56 lines
2.3 KiB
Docker
56 lines
2.3 KiB
Docker
# syntax=docker/dockerfile:1

# Base stage: shared environment (HOME, working directory, SSH known hosts,
# PYTHONPATH and PATH) on top of the Rocky Linux 8.7 base image.
FROM quay.io/unstructured-io/base-images:rocky8.7-3 AS base

# Version of pip to install; has no default, so supply it at build time with
# --build-arg PIP_VERSION=<version>.
ARG PIP_VERSION

# Set up environment
ENV HOME=/home/
WORKDIR ${HOME}

# Create an SSH directory that only the owner can access and record GitHub's
# RSA host key in known_hosts. Because HOME ends with a slash, ${HOME}/.ssh
# and /home/.ssh refer to the same directory.
RUN mkdir ${HOME}/.ssh && chmod go-rwx ${HOME}/.ssh \
  && ssh-keyscan -t rsa github.com >> /home/.ssh/known_hosts

# Append HOME to the module search path.
ENV PYTHONPATH="${PYTHONPATH}:${HOME}"
# Put /home/usr/.local/bin ahead of the existing PATH entries.
# NOTE(review): presumably where user-level pip scripts land - confirm the path.
ENV PATH="/home/usr/.local/bin:${PATH}"
|
|
|
|
# Dependency stage: installs every Python requirement set and the NLTK data
# on top of the base stage.
FROM base AS deps

# Re-declare the build argument so that it is in scope in this stage as well;
# ARG values are scoped to the stage that declares them.
ARG PIP_VERSION

# Copy and install Unstructured
COPY requirements requirements

# Pin pip, install the compiler toolchain, install each requirements file in
# turn, then remove the toolchain and the dnf caches in the same layer so they
# do not add to the image size. --no-cache-dir keeps pip's download cache out
# of the layer.
RUN python3.8 -m pip install --no-cache-dir pip==${PIP_VERSION} && \
  dnf -y groupinstall "Development Tools" && \
  pip install --no-cache-dir -r requirements/base.txt && \
  pip install --no-cache-dir -r requirements/test.txt && \
  pip install --no-cache-dir -r requirements/huggingface.txt && \
  pip install --no-cache-dir -r requirements/dev.txt && \
  pip install --no-cache-dir -r requirements/ingest-azure.txt && \
  pip install --no-cache-dir -r requirements/ingest-box.txt && \
  pip install --no-cache-dir -r requirements/ingest-github.txt && \
  pip install --no-cache-dir -r requirements/ingest-gitlab.txt && \
  pip install --no-cache-dir -r requirements/ingest-google-drive.txt && \
  pip install --no-cache-dir -r requirements/ingest-reddit.txt && \
  pip install --no-cache-dir -r requirements/ingest-s3.txt && \
  pip install --no-cache-dir -r requirements/ingest-slack.txt && \
  pip install --no-cache-dir -r requirements/ingest-wikipedia.txt && \
  pip install --no-cache-dir -r requirements/extra-csv.txt && \
  pip install --no-cache-dir -r requirements/extra-docx.txt && \
  pip install --no-cache-dir -r requirements/extra-markdown.txt && \
  pip install --no-cache-dir -r requirements/extra-msg.txt && \
  pip install --no-cache-dir -r requirements/extra-odt.txt && \
  pip install --no-cache-dir -r requirements/extra-pandoc.txt && \
  pip install --no-cache-dir -r requirements/extra-pdf-image.txt && \
  pip install --no-cache-dir -r requirements/extra-pptx.txt && \
  pip install --no-cache-dir -r requirements/extra-xlsx.txt && \
  dnf -y groupremove "Development Tools" && \
  dnf clean all

# Download the NLTK 'punkt' and 'averaged_perceptron_tagger' resources at
# build time.
RUN python3.8 -c "import nltk; nltk.download('punkt')" && \
  python3.8 -c "import nltk; nltk.download('averaged_perceptron_tagger')"
|
|
|
|
# Final stage: adds the sample documents and the library source on top of the
# installed dependencies.
FROM deps AS code

# Each directory is copied to a same-named directory under the working directory.
COPY example-docs example-docs
COPY unstructured unstructured

# Run the ingest doc processor's initialize() once during the build.
# NOTE(review): presumably this pre-loads or downloads resources so they are
# already present in the image - confirm against the function's implementation.
RUN python3.8 -c "from unstructured.ingest.doc_processor.generalized import initialize; initialize()"

# Default to an interactive shell (exec form).
CMD ["/bin/bash"]
|