olmocr/Dockerfile

73 lines
3.2 KiB
Docker
Raw Normal View History

2025-06-02 18:13:22 +00:00
# CUDA version used to select the base image tag. It is declared before FROM
# so that it can be referenced in the FROM line itself.
ARG CUDA_VERSION=12.8.1
# Base image: NVIDIA's CUDA "devel" image on Ubuntu 20.04, pinned to linux/amd64.
FROM --platform=linux/amd64 nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
2025-05-21 10:57:04 -07:00
2025-06-02 18:34:47 +00:00
# Python version installed from the deadsnakes PPA and made the default interpreter.
ARG PYTHON_VERSION=3.12
# Needs to be repeated below the FROM, or else it's not picked up
# (an ARG declared before FROM is only visible to the FROM line).
ARG CUDA_VERSION=12.8.1
# Prevent interactive apt/debconf prompts during the build. This is an ARG
# rather than an ENV so that it is exported to every RUN step in this stage
# without being baked into the environment of containers started from the image.
ARG DEBIAN_FRONTEND=noninteractive
# From original VLLM dockerfile https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile
# Install Python and other dependencies:
#   1. Preseed the tzdata answers (America / Los_Angeles) so apt never asks for a timezone.
#   2. Install base tooling, then add the deadsnakes PPA, retrying up to three
#      times with a 5s pause after each failed attempt.
#   3. Install python${PYTHON_VERSION} and register it as the `python3` and
#      `python` alternatives, and point python3-config at the matching script.
#   4. Bootstrap pip. get-pip.py is downloaded to a file first (with -f) so a
#      failed download stops the build instead of being masked by a pipe.
#   5. Remove the apt package lists in this same layer to keep the image small;
#      later steps run `apt-get update` themselves before installing.
#   6. Print the interpreter and pip versions as a sanity check.
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl sudo python3-apt \
    && for i in 1 2 3; do \
        add-apt-repository -y ppa:deadsnakes/ppa && break || \
        { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
    done \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
    && update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1 \
    && update-alternatives --set python /usr/bin/python${PYTHON_VERSION} \
    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
    && curl -fsS https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \
    && python${PYTHON_VERSION} /tmp/get-pip.py \
    && rm -f /tmp/get-pip.py \
    && rm -rf /var/lib/apt/lists/* \
    && python3 --version && python3 -m pip --version
# Install uv for faster pip installs.
# This step is performed by pip (uv is not available yet), so the BuildKit
# cache mount targets pip's cache directory: downloads are reused across
# builds and are not written into the image layer.
RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install uv
# olmOCR Specific Installs
# Install poppler-utils and fonts, with a workaround for the update-notifier issue.
# The Microsoft core fonts EULA is pre-accepted through debconf so the
# installer does not prompt.
RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections && \
    apt-get update -y && \
    apt-get install -y --no-install-recommends poppler-utils fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools && \
    # Temporarily point python3 at the distribution's Python 3.8 for the installer
    ln -sf /usr/bin/python3.8 /usr/bin/python3 && \
    apt-get install -y --no-install-recommends ttf-mscorefonts-installer && \
    # Restore python3 to the python${PYTHON_VERSION} alternative
    update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} && \
    # Remove the apt package lists in the same layer to keep the image small
    rm -rf /var/lib/apt/lists/*
2025-06-02 18:47:34 +00:00
# Install some helper utilities for things like the benchmark.
# Packages are listed one per line in alphabetical order, and the apt package
# lists are removed in the same layer so they do not add to the image size.
RUN apt-get update -y && apt-get install -y --no-install-recommends \
        curl \
        git \
        git-lfs \
        unzip \
        wget \
    && rm -rf /var/lib/apt/lists/*
2025-05-21 10:57:04 -07:00
# Run Python with unbuffered stdout/stderr so output appears as soon as it is written.
ENV PYTHONUNBUFFERED=1
2025-05-29 16:12:06 +00:00
2025-05-21 10:57:04 -07:00
# All relative paths in the COPY and RUN instructions below resolve against /root.
WORKDIR /root
# Copy only the project metadata and the version module at this point; the rest
# of the package source is copied after the dependency installs further down,
# so source edits do not invalidate the dependency layers.
COPY pyproject.toml pyproject.toml
COPY olmocr/version.py olmocr/version.py
2025-06-02 18:39:32 +00:00
# Needed to resolve setuptools dependencies.
# With "unsafe-best-match", uv considers candidate versions from every
# configured index (including any --extra-index-url) instead of stopping at
# the first index that provides a given package.
ENV UV_INDEX_STRATEGY="unsafe-best-match"
2025-06-02 22:52:28 +00:00
# Install the project (editable) into the system interpreter, with the PyTorch
# cu128 wheel index available as an extra index.
# NOTE(review): the ".[gpu]" and ".[bench]" installs below are not editable and
# may replace this editable install — confirm whether -e is still needed here.
RUN uv pip install --system --no-cache -e . --extra-index-url https://download.pytorch.org/whl/cu128
# Install the project's "gpu" extra, again resolving against the cu128 index.
RUN uv pip install --system --no-cache ".[gpu]" --extra-index-url https://download.pytorch.org/whl/cu128
# Install the pinned FlashInfer wheel directly from its URL. --no-cache matches
# the other install steps so uv's download cache is not stored in the layer.
RUN uv pip install --system --no-cache https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.5%2Bcu128torch2.7-cp38-abi3-linux_x86_64.whl
# Install the project's "bench" extra.
RUN uv pip install --system --no-cache ".[bench]"
2025-05-22 14:28:40 -07:00
# Install the operating-system packages Playwright's browsers need, then remove
# any apt package lists fetched along the way in the same layer so they do not
# add to the image size.
RUN playwright install-deps \
    && rm -rf /var/lib/apt/lists/*
# Download the Chromium browser build used by Playwright.
RUN playwright install chromium
2025-05-21 10:57:04 -07:00
# Copy the olmocr package source into the working directory. This comes after
# the dependency installs, so source changes only rebuild from this layer on.
COPY olmocr olmocr
2025-05-28 14:35:23 -07:00
# Copy the repository's scripts directory into the working directory.
COPY scripts scripts
2025-05-21 10:57:04 -07:00
# Smoke test: the build fails here if the olmocr.pipeline module cannot be
# run with --help (non-zero exit status aborts the build).
RUN python3 -m olmocr.pipeline --help