olmocr/scripts/train/newtrainer-beaker.sh

#!/bin/bash

set -e

# Parse command line arguments
CONFIG="olmocr/train/configs/qwen25_vl_b100_x1_default.yaml"
DATASET="s3://ai2-oe-data/jakep/olmocr/olmOCR-mix-0825"
SKIP_DOCKER_BUILD=false
PREEMPTIBLE=false

while [[ $# -gt 0 ]]; do
    case $1 in
        --config)
            CONFIG="$2"
            shift 2
            ;;
        --dataset)
            DATASET="$2"
            shift 2
            ;;
        --skip-docker-build)
            SKIP_DOCKER_BUILD=true
            shift
            ;;
        --preemptible)
            PREEMPTIBLE=true
            shift
            ;;
        *)
            echo "Unknown option: $1"
            echo "Usage: $0 [--config CONFIG_PATH] [--dataset DATASET_PATH] [--skip-docker-build] [--preemptible]"
            exit 1
            ;;
    esac
done

echo "Using config: $CONFIG"
echo "Using dataset: $DATASET"

# Use conda environment Python if available, otherwise use system Python
if [ -n "$CONDA_PREFIX" ]; then
    PYTHON="$CONDA_PREFIX/bin/python"
    echo "Using conda Python from: $CONDA_PREFIX"
else
    PYTHON="python"
    echo "Warning: No conda environment detected, using system Python"
fi

# Get version from version.py
VERSION=$($PYTHON -c 'import olmocr.version; print(olmocr.version.VERSION)')
echo "OlmOCR version: $VERSION"

# Get first 10 characters of git hash
GIT_HASH=$(git rev-parse HEAD | cut -c1-10)
echo "Git hash: $GIT_HASH"

# Get current git branch name
GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD)
echo "Git branch: $GIT_BRANCH"

# Create full image tag
IMAGE_TAG="olmocr-train-${VERSION}-${GIT_HASH}"
echo "Building Docker image with tag: $IMAGE_TAG"

# Build and push Docker image if not skipping
if [ "$SKIP_DOCKER_BUILD" = false ]; then
    echo "Building Docker image..."
    docker build --platform linux/amd64 -f ./Dockerfile -t $IMAGE_TAG .
    
    # Push image to beaker
    echo "Trying to push image to Beaker..."
    if ! beaker image create --workspace ai2/oe-data-pdf --name $IMAGE_TAG $IMAGE_TAG 2>/dev/null; then
        echo "Warning: Beaker image with tag $IMAGE_TAG already exists. Using existing image."
    fi
else
    echo "Skipping Docker build as requested"
fi

# Get Beaker username
BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
echo "Beaker user: $BEAKER_USER"

# Create Python script to run beaker experiment
cat << 'EOF' > /tmp/run_training_experiment.py
import sys
from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints, EnvVar, DataMount

# Get image tag, beaker user, git branch, git hash, config, dataset, and preemptible from command line
image_tag = sys.argv[1]
beaker_user = sys.argv[2]
git_branch = sys.argv[3]
git_hash = sys.argv[4]
config = sys.argv[5]
dataset = sys.argv[6]
preemptible = sys.argv[7] == "true"

# Initialize Beaker client
b = Beaker.from_env(default_workspace="ai2/olmocr")

# Build the training command
# Extract the dataset name from the S3 path for the local directory
dataset_name = dataset.rstrip('/').split('/')[-1]
commands = [
    "pip install .[train]",
    "pip install transformers==4.52.4",
    "pip install flash-attn==2.8.0.post2 --no-build-isolation",
    "pip install s5cmd",
    f"s5cmd sync {dataset}/processed_* /data/{dataset_name}/",
    f"python -m olmocr.train.train --config {config}"
]

# Build task spec
task_spec = TaskSpec(
    name="olmocr-training",
    image=ImageSource(beaker=f"{beaker_user}/{image_tag}"),
    command=[
        "bash", "-c",
        " && ".join(commands)
    ],
    context=TaskContext(
        priority=Priority.normal,
        preemptible=preemptible,
    ),
    resources=TaskResources(
        gpu_count=1,
        shared_memory="10GiB"
    ),
    constraints=Constraints(cluster=["ai2/titan-cirrascale"]),
    result=ResultSpec(path="/noop-results"),
    env_vars=[
        EnvVar(name="LOG_FILTER_TYPE", value="local_rank0_only"),
        EnvVar(name="OMP_NUM_THREADS", value="8"),
        EnvVar(name="BEAKER_USER_ID", value=beaker_user),
        EnvVar(name="AWS_ACCESS_KEY_ID", secret="ALLENNLP_AWS_ACCESS_KEY_ID"),
        EnvVar(name="AWS_SECRET_ACCESS_KEY", secret="ALLENNLP_AWS_SECRET_ACCESS_KEY"),
        EnvVar(name="WANDB_API_KEY", secret="JAKE_WANDB_API_KEY")
    ],
    datasets=[
        DataMount.new(mount_path="/weka/oe-data-default", weka="oe-data-default"),
        DataMount.new(mount_path="/weka/oe-training-default", weka="oe-training-default"),
    ]
)

# Create experiment spec
experiment_spec = ExperimentSpec(
    description=f"OlmOCR Training Run - Branch: {git_branch}, Commit: {git_hash}",
    budget="ai2/oe-base",
    tasks=[task_spec],
)

# Create the experiment
experiment = b.experiment.create(spec=experiment_spec, workspace="ai2/olmocr")
print(f"Created training experiment: {experiment.id}")
print(f"View at: https://beaker.org/ex/{experiment.id}")
EOF

# Run the Python script to create the experiment
echo "Creating Beaker experiment..."
$PYTHON /tmp/run_training_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH "$CONFIG" "$DATASET" $PREEMPTIBLE

# Clean up temporary file
rm /tmp/run_training_experiment.py

echo "Training experiment submitted successfully!"
Basic atttempt to run trainer script 2025-06-25 23:22:59 +00:00			`#!/bin/bash`
New trainer launch script cleanups 2025-06-25 23:05:32 +00:00
			`set -e`

New trainer launch script for beaker 2025-07-01 01:43:38 +00:00			`# Parse command line arguments`
			`CONFIG="olmocr/train/configs/qwen25_vl_b100_x1_default.yaml"`
Going to train on the new transcripts data 2025-09-08 22:30:40 +00:00			`DATASET="s3://ai2-oe-data/jakep/olmocr/olmOCR-mix-0825"`
New trainer launch script for beaker 2025-07-01 01:43:38 +00:00			`SKIP_DOCKER_BUILD=false`
Adding some preempt flags 2025-08-05 18:00:46 +00:00			`PREEMPTIBLE=false`
New trainer launch script for beaker 2025-07-01 01:43:38 +00:00
			`while [[ $# -gt 0 ]]; do`
			`case $1 in`
			`--config)`
			`CONFIG="$2"`
			`shift 2`
			`;;`
Going to train on the new transcripts data 2025-09-08 22:30:40 +00:00			`--dataset)`
			`DATASET="$2"`
			`shift 2`
			`;;`
New trainer launch script for beaker 2025-07-01 01:43:38 +00:00			`--skip-docker-build)`
			`SKIP_DOCKER_BUILD=true`
			`shift`
			`;;`
Adding some preempt flags 2025-08-05 18:00:46 +00:00			`--preemptible)`
			`PREEMPTIBLE=true`
			`shift`
			`;;`
New trainer launch script for beaker 2025-07-01 01:43:38 +00:00			`*)`
			`echo "Unknown option: $1"`
Going to train on the new transcripts data 2025-09-08 22:30:40 +00:00			`echo "Usage: $0 [--config CONFIG_PATH] [--dataset DATASET_PATH] [--skip-docker-build] [--preemptible]"`
New trainer launch script for beaker 2025-07-01 01:43:38 +00:00			`exit 1`
			`;;`
			`esac`
			`done`

			`echo "Using config: $CONFIG"`
Going to train on the new transcripts data 2025-09-08 22:30:40 +00:00			`echo "Using dataset: $DATASET"`
New trainer launch script for beaker 2025-07-01 01:43:38 +00:00
New trainer launch script cleanups 2025-06-25 23:05:32 +00:00			`# Use conda environment Python if available, otherwise use system Python`
			`if [ -n "$CONDA_PREFIX" ]; then`
			`PYTHON="$CONDA_PREFIX/bin/python"`
			`echo "Using conda Python from: $CONDA_PREFIX"`
			`else`
			`PYTHON="python"`
			`echo "Warning: No conda environment detected, using system Python"`
			`fi`

			`# Get version from version.py`
			`VERSION=$($PYTHON -c 'import olmocr.version; print(olmocr.version.VERSION)')`
			`echo "OlmOCR version: $VERSION"`

			`# Get first 10 characters of git hash`
			`GIT_HASH=$(git rev-parse HEAD \| cut -c1-10)`
			`echo "Git hash: $GIT_HASH"`

			`# Get current git branch name`
			`GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD)`
			`echo "Git branch: $GIT_BRANCH"`

			`# Create full image tag`
Basic atttempt to run trainer script 2025-06-25 23:22:59 +00:00			`IMAGE_TAG="olmocr-train-${VERSION}-${GIT_HASH}"`
New trainer launch script cleanups 2025-06-25 23:05:32 +00:00			`echo "Building Docker image with tag: $IMAGE_TAG"`

New trainer launch script for beaker 2025-07-01 01:43:38 +00:00			`# Build and push Docker image if not skipping`
			`if [ "$SKIP_DOCKER_BUILD" = false ]; then`
			`echo "Building Docker image..."`
			`docker build --platform linux/amd64 -f ./Dockerfile -t $IMAGE_TAG .`

			`# Push image to beaker`
			`echo "Trying to push image to Beaker..."`
			`if ! beaker image create --workspace ai2/oe-data-pdf --name $IMAGE_TAG $IMAGE_TAG 2>/dev/null; then`
			`echo "Warning: Beaker image with tag $IMAGE_TAG already exists. Using existing image."`
			`fi`
			`else`
			`echo "Skipping Docker build as requested"`
			`fi`
New trainer launch script cleanups 2025-06-25 23:05:32 +00:00
			`# Get Beaker username`
			`BEAKER_USER=$(beaker account whoami --format json \| jq -r '.[0].name')`
			`echo "Beaker user: $BEAKER_USER"`

Better script 2025-07-01 01:40:55 +00:00			`# Create Python script to run beaker experiment`
			`cat << 'EOF' > /tmp/run_training_experiment.py`
			`import sys`
			`from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints, EnvVar, DataMount`

Going to train on the new transcripts data 2025-09-08 22:30:40 +00:00			`# Get image tag, beaker user, git branch, git hash, config, dataset, and preemptible from command line`
Better script 2025-07-01 01:40:55 +00:00			`image_tag = sys.argv[1]`
			`beaker_user = sys.argv[2]`
			`git_branch = sys.argv[3]`
			`git_hash = sys.argv[4]`
New trainer launch script for beaker 2025-07-01 01:43:38 +00:00			`config = sys.argv[5]`
Going to train on the new transcripts data 2025-09-08 22:30:40 +00:00			`dataset = sys.argv[6]`
			`preemptible = sys.argv[7] == "true"`
Better script 2025-07-01 01:40:55 +00:00
			`# Initialize Beaker client`
			`b = Beaker.from_env(default_workspace="ai2/olmocr")`

			`# Build the training command`
Going to train on the new transcripts data 2025-09-08 22:30:40 +00:00			`# Extract the dataset name from the S3 path for the local directory`
			`dataset_name = dataset.rstrip('/').split('/')[-1]`
Better script 2025-07-01 01:40:55 +00:00			`commands = [`
Cleaning up some training requirements installation steps 2025-08-05 19:42:46 +00:00			`"pip install .[train]",`
Better script 2025-07-01 01:40:55 +00:00			`"pip install transformers==4.52.4",`
			`"pip install flash-attn==2.8.0.post2 --no-build-isolation",`
Copying preprocessed files to local ssd in trainer script 2025-08-04 22:18:38 +00:00			`"pip install s5cmd",`
Going to train on the new transcripts data 2025-09-08 22:30:40 +00:00			`f"s5cmd sync {dataset}/processed_* /data/{dataset_name}/",`
New trainer launch script for beaker 2025-07-01 01:43:38 +00:00			`f"python -m olmocr.train.train --config {config}"`
Better script 2025-07-01 01:40:55 +00:00			`]`

			`# Build task spec`
			`task_spec = TaskSpec(`
			`name="olmocr-training",`
			`image=ImageSource(beaker=f"{beaker_user}/{image_tag}"),`
			`command=[`
			`"bash", "-c",`
			`" && ".join(commands)`
			`],`
			`context=TaskContext(`
			`priority=Priority.normal,`
Adding some preempt flags 2025-08-05 18:00:46 +00:00			`preemptible=preemptible,`
Better script 2025-07-01 01:40:55 +00:00			`),`
			`resources=TaskResources(`
			`gpu_count=1,`
			`shared_memory="10GiB"`
			`),`
			`constraints=Constraints(cluster=["ai2/titan-cirrascale"]),`
			`result=ResultSpec(path="/noop-results"),`
			`env_vars=[`
			`EnvVar(name="LOG_FILTER_TYPE", value="local_rank0_only"),`
			`EnvVar(name="OMP_NUM_THREADS", value="8"),`
			`EnvVar(name="BEAKER_USER_ID", value=beaker_user),`
Copying preprocessed files to local ssd in trainer script 2025-08-04 22:18:38 +00:00			`EnvVar(name="AWS_ACCESS_KEY_ID", secret="ALLENNLP_AWS_ACCESS_KEY_ID"),`
			`EnvVar(name="AWS_SECRET_ACCESS_KEY", secret="ALLENNLP_AWS_SECRET_ACCESS_KEY"),`
Better script 2025-07-01 01:40:55 +00:00			`EnvVar(name="WANDB_API_KEY", secret="JAKE_WANDB_API_KEY")`
			`],`
			`datasets=[`
			`DataMount.new(mount_path="/weka/oe-data-default", weka="oe-data-default"),`
			`DataMount.new(mount_path="/weka/oe-training-default", weka="oe-training-default"),`
			`]`
			`)`

			`# Create experiment spec`
			`experiment_spec = ExperimentSpec(`
			`description=f"OlmOCR Training Run - Branch: {git_branch}, Commit: {git_hash}",`
Using new budget code 2025-08-06 16:31:08 +00:00			`budget="ai2/oe-base",`
Better script 2025-07-01 01:40:55 +00:00			`tasks=[task_spec],`
			`)`

			`# Create the experiment`
			`experiment = b.experiment.create(spec=experiment_spec, workspace="ai2/olmocr")`
			`print(f"Created training experiment: {experiment.id}")`
			`print(f"View at: https://beaker.org/ex/{experiment.id}")`
			`EOF`

			`# Run the Python script to create the experiment`
			`echo "Creating Beaker experiment..."`
Going to train on the new transcripts data 2025-09-08 22:30:40 +00:00			`$PYTHON /tmp/run_training_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH "$CONFIG" "$DATASET" $PREEMPTIBLE`
Better script 2025-07-01 01:40:55 +00:00
			`# Clean up temporary file`
			`rm /tmp/run_training_experiment.py`

			`echo "Training experiment submitted successfully!"`