mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-11-01 18:29:58 +00:00
91 lines
2.6 KiB
Docker
91 lines
2.6 KiB
Docker
# OCRmyPDF
|
|
#
|
|
# VERSION 3.2
|
|
# Base image: Debian stretch. The Ghostscript step further down exists
# specifically to work around an issue in this release's libgs9 package.
FROM debian:stretch
|
|
# MAINTAINER is a deprecated instruction; the maintainer is recorded as
# image metadata with LABEL instead.
LABEL maintainer="James R. Barlow <jim@purplerock.ca>"
|
|
|
|
# Create the unprivileged "docker" account that the container runs as,
# then create its home directory explicitly and hand ownership to it.
RUN set -e; \
    useradd docker; \
    mkdir /home/docker; \
    chown docker:docker /home/docker
|
|
|
|
# Update the system and install our dependencies.
# If this command takes too long, Docker Hub's automated build will time out,
# so the installation is done in portions. This portion covers locale support
# and the Python runtime with its packaging tools.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        locales \
        python3 \
        python3-pil \
        python3-pip \
        python3-reportlab \
        python3-venv \
        python3-wheel
|
|
|
|
# OCR and PDF command-line tools, including the Tesseract language packs for
# German, English, French and Spanish.
# The package index is refreshed in this same layer: an install that relies
# on an index fetched by an earlier, possibly cached, layer can fail against
# a stale index.
RUN apt-get update && apt-get install -y --no-install-recommends \
    poppler-utils \
    qpdf \
    tesseract-ocr \
    tesseract-ocr-deu \
    tesseract-ocr-eng \
    tesseract-ocr-fra \
    tesseract-ocr-spa \
    unpaper
|
|
|
|
# Build toolchain: a C compiler plus the libffi and Python development headers
# (presumably needed to compile Python packages installed by pip later on;
# gcc is removed again in the cleanup step).
# The package index is refreshed in this same layer so the install never
# depends on an index left behind by an earlier, possibly cached, layer.
RUN apt-get update && apt-get install -qy --no-install-recommends \
    gcc \
    libffi-dev \
    libpython3-dev
|
|
|
|
# Install Ghostscript from Debian sid to work around a JPEG 2000 issue in
# Debian stretch libgs9 or gs 9.16~dfsg-2.1.
# The replacement sources.list presumably makes the sid suite visible to apt
# (its contents are not part of this file); the index must be refreshed after
# swapping it in, before the sid package can be selected.
COPY ./share/etc-apt-sources.list /etc/apt/sources.list

RUN apt-get update \
    && apt-get install --yes ghostscript/sid
|
|
|
|
|
|
# Enforce UTF-8: regenerate the locale data and make C.UTF-8 the system
# default language setting.
# Borrowed from https://index.docker.io/u/crosbymichael/python/
RUN dpkg-reconfigure locales \
    && locale-gen C.UTF-8 \
    && /usr/sbin/update-locale LANG=C.UTF-8
|
|
# Force the C.UTF-8 locale for every process in the image.
# Written in key=value form; the space-separated ENV form is legacy syntax.
ENV LC_ALL=C.UTF-8
|
|
|
|
|
|
# Set up a Python virtual environment at /appenv that can see all of the
# system packages, so we can rely on the platform packages rather than
# pulling in GCC and compiling them.
# Uses "python3 -m venv" rather than the deprecated "pyvenv" wrapper script.
# The two-pass sequence (plain environment first, then again with
# --system-site-packages) is kept exactly as before.
# NOTE(review): the first pass looks like a workaround to get pip bootstrapped
# into the environment -- confirm whether a single invocation would suffice.
RUN python3 -m venv /appenv \
    && python3 -m venv --system-site-packages /appenv
|
|
|
|
# Copy the whole build context into /application/. Later steps read
# test_requirements.txt and docker-wrapper.sh from this directory.
COPY . /application/
|
|
|
|
# Replace the stock Tesseract 3.04.00 PDF font with the improved sharp2.ttf,
# which resolves rendering issues in many PDF viewers.
# Discussion is in https://github.com/tesseract-ocr/tesseract/issues/182
COPY ./share/sharp2.ttf /usr/share/tesseract-ocr/tessdata/pdf.ttf

# Owner read/write, everyone else read-only.
RUN chmod 0644 /usr/share/tesseract-ocr/tessdata/pdf.ttf
|
|
|
|
# Set this here to force a docker version, allowing non-tagged versions to
|
|
# be built
|
|
# ENV SETUPTOOLS_SCM_PRETEND_VERSION=v3.3.0
|
|
|
|
# Install the application and its dependencies into the /appenv environment.
# In this arrangement Pillow and reportlab are provided by the system.
# Even though ocrmypdf is locally present, pull it from PyPI because
# Docker Hub and setuptools_scm clash.
# Every step is chained with && so that a failure to activate the environment
# aborts the build instead of letting pip run outside it, and every pip call
# uses --no-cache-dir so no download cache is left behind in the layer.
RUN . /appenv/bin/activate \
    && pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir ocrmypdf \
    && pip install --no-cache-dir -r /application/test_requirements.txt
|
|
|
|
# Remove the junk: uninstall the compiler, drop packages that are no longer
# needed, and clear apt's caches and index plus temporary and root-owned
# scratch files.
# Done as a single RUN so the cleanup adds one layer instead of three.
# Note that files created by earlier layers still occupy space in those
# layers; this only keeps them out of the final filesystem view.
RUN apt-get remove -qy gcc \
    && apt-get autoremove -y \
    && apt-get clean -y \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* /root/*
|
|
|
|
# Drop root: everything from here on, including the entrypoint, runs as the
# unprivileged "docker" user.
USER docker
|
|
# Start in the unprivileged user's home directory.
WORKDIR /home/docker
|
|
|
|
# Runtime settings read by OCRmyPDF and its test suite:
# a test output location under /tmp, and a flag set to 1 for the sharp TTF
# (presumably signalling that the sharp2.ttf font replacement is in place).
ENV OCRMYPDF_TEST_OUTPUT=/tmp/test-output \
    OCRMYPDF_SHARP_TTF=1
|
|
|
|
# Must use the array (exec) form of ENTRYPOINT: the non-array (shell) form
# does not append the arguments given on the "docker run" command line.
# The line must contain nothing but the JSON array; any trailing characters
# make it invalid JSON and Docker then treats it as shell form.
ENTRYPOINT ["/application/docker-wrapper.sh"]