diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 000000000..de13482eb
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,74 @@
+# syntax=docker/dockerfile:1
+
+FROM centos:centos7.9.2009
+
+# PIP_VERSION is required (pass it with --build-arg); it pins the pip release installed below.
+ARG PIP_VERSION
+ARG UNSTRUCTURED
+
+# Each yum layer ends with `yum clean all` so the package cache is not baked into the image.
+RUN yum -y update && \
+  yum -y install poppler-utils which xz-devel && \
+  yum clean all
+
+# Note(austin) Get a recent tesseract from this repo
+# See https://tesseract-ocr.github.io/tessdoc/Installation.html
+# PDF and images:
+RUN yum-config-manager --add-repo https://download.opensuse.org/repositories/home:/Alexander_Pozdnyakov/CentOS_7/ && \
+  rpm --import https://build.opensuse.org/projects/home:Alexander_Pozdnyakov/public_key && \
+  yum -y update && \
+  yum -y install tesseract && \
+  yum clean all
+
+# Note(yuming): Install gcc & g++ ≥ 5.4 for Detectron2 requirement
+# centos-release-scl is installed first because devtoolset-7 is installed from the repo it adds.
+RUN yum -y update && \
+  yum -y install centos-release-scl && \
+  yum -y install 'devtoolset-7-gcc*' && \
+  yum clean all
+# From here on, shell-form RUN commands are executed through `scl enable devtoolset-7`.
+SHELL [ "/usr/bin/scl", "enable", "devtoolset-7"]
+
+# Build Python 3.8 from source; the tarball and build tree are removed in the same layer.
+RUN yum -y update && \
+  # MS Office docs:
+  yum -y install libreoffice && \
+  yum -y install bzip2-devel git libffi-devel make openssl-devel sqlite-devel && \
+  yum clean all && \
+  # -f makes curl fail on an HTTP error instead of saving the error page as the tarball
+  curl -fsSLO https://www.python.org/ftp/python/3.8.15/Python-3.8.15.tgz && tar -xzf Python-3.8.15.tgz && \
+  cd Python-3.8.15/ && ./configure --enable-optimizations && make altinstall && \
+  cd .. && rm -rf Python-3.8.15* && \
+  ln -s /usr/local/bin/python3.8 /usr/local/bin/python3
+
+# create a home directory
+ENV HOME=/home/
+
+WORKDIR ${HOME}
+RUN mkdir ${HOME}/.ssh && chmod go-rwx ${HOME}/.ssh \
+  && ssh-keyscan -t rsa github.com >> ${HOME}/.ssh/known_hosts
+
+ENV PYTHONPATH="${PYTHONPATH}:${HOME}"
+ENV PATH="/home/usr/.local/bin:${PATH}"
+
+COPY requirements/base.txt requirements-base.txt
+COPY requirements/huggingface.txt requirements-huggingface.txt
+COPY requirements/dev.txt requirements-dev.txt
+# PDFs and images
+COPY requirements/local-inference.txt requirements-local-inference.txt
+
+# `python3.8 -m pip` is used throughout so every install targets the interpreter built above.
+RUN python3.8 -m pip install --no-cache-dir "pip==${PIP_VERSION}" \
+  && python3.8 -m pip install --no-cache-dir -r requirements-base.txt \
+  && python3.8 -m pip install --no-cache-dir -r requirements-huggingface.txt \
+  && python3.8 -m pip install --no-cache-dir -r requirements-dev.txt \
+  # PDFs and images
+  && python3.8 -m pip install --no-cache-dir -r requirements-local-inference.txt \
+  # PDFs
+  && python3.8 -m pip install --no-cache-dir "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"
+
+# Copied after the dependency install so edits to docs or source do not invalidate the pip layers.
+COPY example-docs example-docs
+COPY unstructured unstructured
+
+CMD ["/bin/bash"]
diff --git a/Makefile b/Makefile
index b1a71e5be..03caed394 100644
--- a/Makefile
+++ b/Makefile
@@ -178,3 +178,18 @@ version-sync:
 .PHONY: check-coverage
 check-coverage:
 	coverage report --fail-under=95
+
+##########
+# Docker #
+##########
+
+# Docker targets are provided for convenience only and are not required in a standard development environment
+
+
+.PHONY: docker-build
+docker-build:
+	PIP_VERSION=${PIP_VERSION} ./scripts/docker-build.sh
+
+.PHONY: docker-start-bash
+docker-start-bash:
+	docker run --platform linux/amd64 -ti --rm unstructured-dev:latest
diff --git a/README.md b/README.md
index 198544d3b..9ad4c19b6 100644
--- a/README.md
+++ b/README.md
@@ -81,6 +81,32 @@ elements = partition("example-docs/layout-parser-paper.pdf")
 print("\n\n".join([str(el) for el in elements]))
 ```
 
+## :dizzy: Instructions for using the docker image
+
+The following instructions are intended to help you get up and running using docker to interact with `unstructured`.
+
+If you only plan on parsing one type of data you can speed up building the image by commenting out some
+of the packages/requirements necessary for other data types. See the `Dockerfile` to find out which lines are necessary
+for your use case.
+
+See [here](https://docs.docker.com/get-docker/) if you don't already have docker installed on your machine.
+
+```bash
+make docker-build
+
+# this will drop you into a bash shell inside a container running the docker image
+make docker-start-bash
+
+# this will drop you into a python console so you can run the partition functions below
+python3
+
+>>> from unstructured.partition.pdf import partition_pdf
+>>> elements = partition_pdf(filename="example-docs/layout-parser-paper-fast.pdf")
+
+>>> from unstructured.partition.text import partition_text
+>>> elements = partition_text(filename="example-docs/fake-text.txt")
+```
+
 ## :coffee: Installation Instructions for Local Development
 
 
diff --git a/scripts/docker-build.sh b/scripts/docker-build.sh
new file mode 100755
index 000000000..f01ce8b54
--- /dev/null
+++ b/scripts/docker-build.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+set -euo pipefail
+
+# Fail fast with a clear message: an unset or empty PIP_VERSION would otherwise
+# only surface later as a failing `pip install pip==` step inside the image build.
+: "${PIP_VERSION:?PIP_VERSION must be set to the pip release to install in the image}"
+
+DOCKER_BUILDKIT=1 docker buildx build --load --platform=linux/amd64 -f Dockerfile \
+  --build-arg PIP_VERSION="$PIP_VERSION" \
+  --progress plain \
+  -t unstructured-dev:latest .