New trainer launch script cleanups

2025-11-02 11:04:25 +00:00 · 2025-06-25 23:05:32 +00:00 · 2025-06-25 23:05:32 +00:00 · d7e5037192
commit d7e5037192
parent 91e7b5ce3f
11 changed files with 65 additions and 200 deletions
--- a/gantry-requirements.txt
+++ b/gantry-requirements.txt
@ -1,35 +0,0 @@
-torchvision
-cached-path
-smart_open
-pypdf
-pypdfium2
-lingua-language-detector
-Pillow
-ruff
-mypy>=1.0,<1.5
-black>=23.0,<24.0
-isort>=5.12,<5.13
-pytest
-pytest-sphinx
-pytest-cov
-twine>=1.11.0
-build
-setuptools
-wheel
-Sphinx>=4.3.0,<7.1.0
-furo==2023.7.26
-myst-parser>=1.0,<2.1
-sphinx-copybutton==0.5.2
-sphinx-autobuild==2021.3.14
-sphinx-autodoc-typehints==1.23.3
-packaging
-necessary
-accelerate>=0.34.2
-datasets==3.0.0
-peft
-wandb
-omegaconf
-s3fs
-transformers>=4.45.1
-bitsandbytes
-ftfy
--- a/gantry-train-requirements.txt
+++ b/gantry-train-requirements.txt
--- a/scripts/beaker/Dockerfile-inference
+++ b/scripts/beaker/Dockerfile-inference
@ -1,49 +0,0 @@
-FROM --platform=linux/amd64 nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04
-
-RUN apt-get update -y && apt-get install -y software-properties-common \
-    && add-apt-repository ppa:deadsnakes/ppa \
-    && apt-get -y update
-
-# Install requirements specific to pdfs
-RUN apt-get update && apt-get -y install python3-apt
-RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections
-RUN apt-get update -y && apt-get install -y poppler-utils ttf-mscorefonts-installer msttcorefonts fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools
-
-RUN apt-get update -y && apt-get install -y --no-install-recommends \
-    git \
-    python3.11 \
-    python3.11-dev \
-    python3.11-distutils \
-    ca-certificates \
-    build-essential \
-    curl \
-    unzip
-
-RUN rm -rf /var/lib/apt/lists/* \
-    && unlink /usr/bin/python3 \
-    && ln -s /usr/bin/python3.11 /usr/bin/python3 \
-    && ln -s /usr/bin/python3 /usr/bin/python \
-    && curl -sS https://bootstrap.pypa.io/get-pip.py | python \
-    && pip3 install -U pip    
-
-RUN apt-get update && apt-get -y install python3.11-venv 
-ADD --chmod=755 https://astral.sh/uv/install.sh /install.sh
-RUN /install.sh && rm /install.sh
-
-ENV PYTHONUNBUFFERED=1
-WORKDIR /root
-COPY pyproject.toml pyproject.toml
-COPY olmocr/version.py olmocr/version.py
-
-RUN /root/.local/bin/uv pip install --system --no-cache -e .
-
-RUN /root/.local/bin/uv pip install --system --no-cache sgl-kernel==0.0.3.post1 --force-reinstall --no-deps
-RUN /root/.local/bin/uv pip install --system --no-cache "sglang[all]==0.4.2" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/
-
-COPY olmocr olmocr
-
-WORKDIR /root
-COPY olmocr olmocr
-
-RUN python3 -m sglang.launch_server --help
-RUN python3 -m olmocr.pipeline --help
--- a/scripts/beaker/Dockerfile-tagging
+++ b/scripts/beaker/Dockerfile-tagging
@ -1,49 +0,0 @@
-FROM --platform=linux/amd64 nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04
-
-RUN apt-get update -y && apt-get install -y software-properties-common \
-    && add-apt-repository ppa:deadsnakes/ppa \
-    && apt-get -y update
-
-# Install requirements specific to pdfs
-RUN apt-get update && apt-get -y install python3-apt
-RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections
-RUN apt-get update -y && apt-get install -y poppler-utils ttf-mscorefonts-installer msttcorefonts fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools
-
-RUN apt-get update -y && apt-get install -y --no-install-recommends \
-    git \
-    python3.11 \
-    python3.11-dev \
-    python3.11-distutils \
-    ca-certificates \
-    build-essential \
-    curl \
-    unzip
-
-RUN rm -rf /var/lib/apt/lists/* \
-    && unlink /usr/bin/python3 \
-    && ln -s /usr/bin/python3.11 /usr/bin/python3 \
-    && ln -s /usr/bin/python3 /usr/bin/python \
-    && curl -sS https://bootstrap.pypa.io/get-pip.py | python \
-    && pip3 install -U pip    
-
-RUN apt-get update && apt-get -y install python3.11-venv 
-ADD --chmod=755 https://astral.sh/uv/install.sh /install.sh
-RUN /install.sh && rm /install.sh
-
-ENV PYTHONUNBUFFERED=1
-WORKDIR /root
-COPY pyproject.toml pyproject.toml
-COPY olmocr/version.py olmocr/version.py
-
-RUN /root/.local/bin/uv pip install --system --no-cache -e .
-
-RUN /root/.local/bin/uv pip install --system --no-cache vllm==0.8.2
-
-
-WORKDIR /root
-COPY olmocr olmocr
-COPY scripts scripts
-
-RUN vllm --help
-RUN python3 -m olmocr.pipeline --help
-RUN python scripts/tagging_pipeline.py --help
--- a/scripts/beaker/Dockerfile-train
+++ b/scripts/beaker/Dockerfile-train
@ -1,26 +0,0 @@
-FROM gcr.io/ai2-beaker-core/public/cqgl31u2ba5vrtuc91og:latest
-
-# Update the package list and install libaio-dev and gnupg2
-RUN apt update && apt-get install -y libaio-dev gnupg2
-
-# Add NVIDIA package repository keys
-RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \
-    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub \
-    && apt-get -y update
-
-# Set up the NVIDIA CUDA repository
-RUN apt-get install -y software-properties-common \
-    && add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /" \
-    && apt-get update
-
-# Install CUDA toolkit and nvcc 12.1
-RUN apt-get install -y cuda-nvcc-12-1
-
-# Get flash attention setup
-RUN pip install flash-attn --no-build-isolation
-
-# Install PDF utilities
-RUN apt-get install -y poppler-utils
-RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections
-RUN apt-get install -y ttf-mscorefonts-installer msttcorefonts fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools
-
--- a/scripts/birr/config/qwen2-vl-7b-pdf-weka.yaml
+++ b/scripts/birr/config/qwen2-vl-7b-pdf-weka.yaml
@ -1,41 +0,0 @@
-model:
-  # full fine tune
-  name_or_path: weka://oe-data-default/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/best_bf16/
-  #name_or_path: s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/checkpoint-9500/bf16/
-  vlm: true
-
-  # necessary to prevent random crashes, until vllm fixes some bugs
-  num_scheduler_steps: 1
-
-format:
-  add_generation_prompt: true
-
-generate:
-  # The model's max context length is 8096, but around 1500 tokens are reserved for the image itself
-  max_context_length: 6500
-  temperature: 0.8
-  top_p: 1.0
-  drop_long_outputs: false
-
-
-pipeline:
-  sqs_queue_name: jake-pdf
-  num_workers: 3
-  generation_batch_size: 256
-  tokenization_batch_size: 64
-  output_serializer: default
-  target_bucket: ai2-oe-data
-  target_object_prefix: [your username]/pdfworkspaces/s2orc_3200k_v2/inference_outputs
-  allowed_restarts_per_predictor: 10
-
-task:
-  budget: ai2/oe-data
-  workspace: ai2/oe-data-model-based-cleanup
-  name: qwen2vl-schedsteps-bg
-  replicas: 128
-  priority: LOW
-  gpu_count: 1
-  cluster:
-    - ai2/jupiter-cirrascale-2
-    - ai2/saturn-cirrascale
-
--- a/scripts/train/newtrainer-gantry.sh
+++ b/scripts/train/newtrainer-gantry.sh
@ -0,0 +1,65 @@
+#!/usr/bin/env bash
+
+set -e
+
+# Use conda environment Python if available, otherwise use system Python
+if [ -n "$CONDA_PREFIX" ]; then
+    PYTHON="$CONDA_PREFIX/bin/python"
+    echo "Using conda Python from: $CONDA_PREFIX"
+else
+    PYTHON="python"
+    echo "Warning: No conda environment detected, using system Python"
+fi
+
+# Get version from version.py
+VERSION=$($PYTHON -c 'import olmocr.version; print(olmocr.version.VERSION)')
+echo "OlmOCR version: $VERSION"
+
+# Get first 10 characters of git hash
+GIT_HASH=$(git rev-parse HEAD | cut -c1-10)
+echo "Git hash: $GIT_HASH"
+
+# Get current git branch name
+GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD)
+echo "Git branch: $GIT_BRANCH"
+
+# Create full image tag
+IMAGE_TAG="olmocr-benchmark-${VERSION}-${GIT_HASH}"
+echo "Building Docker image with tag: $IMAGE_TAG"
+
+# Build the Docker image
+echo "Building Docker image..."
+docker build --platform linux/amd64 -f ./Dockerfile -t $IMAGE_TAG .
+
+# Get Beaker username
+BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
+echo "Beaker user: $BEAKER_USER"
+
+# Push image to beaker
+echo "Trying to push image to Beaker..."
+if ! beaker image create --workspace ai2/oe-data-pdf --name $IMAGE_TAG $IMAGE_TAG 2>/dev/null; then
+    echo "Warning: Beaker image with tag $IMAGE_TAG already exists. Using existing image."
+fi
+
+gantry run \
+    --description "${run_name}"\
+    --task-name "${run_name}"\
+    --allow-dirty \
+    --host-networking \
+    --workspace ai2/olmocr \
+    --beaker-image $IMAGE_TAG \
+    --pip gantry-train-requirements.txt \
+    --priority normal \
+    --gpus 8 \
+    --preemptible \
+    --cluster "ai2/jupiter-cirrascale-2" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \
+    --env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --yes \
+    -- /bin/bash -c "source scripts/beaker/jupiter-ib.sh && python -m olmocr.train.train --config olmocr/train/configs/example_config.yaml"
--- a/scripts/train/qwen25vl-7b-gantry.sh
+++ b/scripts/train/qwen25vl-7b-gantry.sh
--- a/scripts/train/qwen2vl-2b-gantry.sh
+++ b/scripts/train/qwen2vl-2b-gantry.sh
--- a/scripts/train/qwen2vl-7b-gantry.sh
+++ b/scripts/train/qwen2vl-7b-gantry.sh
--- a/scripts/train/qwen2vl-7b-lora-gantry.sh
+++ b/scripts/train/qwen2vl-7b-lora-gantry.sh