2025-06-25 23:22:59 +00:00
|
|
|
#!/bin/bash
#
# Builds the olmocr training Docker image, pushes it to Beaker, and launches
# a training job through gantry.

# Abort on the first failing command. pipefail is needed as well: the script
# captures the output of pipelines (e.g. `beaker ... | jq ...`), and without
# it a failure on the left-hand side of a pipe would be silently ignored.
set -eo pipefail

# Pick the interpreter used to query the package version: the Python of the
# active conda environment when CONDA_PREFIX is set, otherwise whatever
# `python` resolves to on PATH.
if [[ -n "${CONDA_PREFIX:-}" ]]; then
  PYTHON="${CONDA_PREFIX}/bin/python"
  echo "Using conda Python from: ${CONDA_PREFIX}"
else
  PYTHON="python"
  # Diagnostics go to stderr so they do not mix with regular output.
  echo "Warning: No conda environment detected, using system Python" >&2
fi
|
|
|
|
|
|
|
|
# Read the package version by importing olmocr.version with the selected
# interpreter. The interpreter path is quoted so a conda prefix containing
# spaces still works.
VERSION=$("$PYTHON" -c 'import olmocr.version; print(olmocr.version.VERSION)')
if [[ -z "$VERSION" ]]; then
  # An empty version would produce a malformed image tag further down.
  echo "Error: could not determine OlmOCR version" >&2
  exit 1
fi
echo "OlmOCR version: $VERSION"

# First 10 characters of the current commit hash. The truncation is done with
# parameter expansion rather than a `| cut` pipeline so that a failing
# `git rev-parse` aborts the script instead of yielding an empty hash.
GIT_HASH=$(git rev-parse HEAD)
GIT_HASH=${GIT_HASH:0:10}
echo "Git hash: $GIT_HASH"

# Current branch name; only reported here, it is not part of the image tag.
GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD)
echo "Git branch: $GIT_BRANCH"

# Full image tag: olmocr-train-<version>-<short hash>
IMAGE_TAG="olmocr-train-${VERSION}-${GIT_HASH}"
echo "Building Docker image with tag: $IMAGE_TAG"
|
|
|
|
|
|
|
|
# Build the Docker image for linux/amd64 from ./Dockerfile, tagged with
# IMAGE_TAG.
echo "Building Docker image..."
docker build --platform linux/amd64 -f ./Dockerfile -t "$IMAGE_TAG" .

# Look up the Beaker account name; it is used later to build the
# <user>/<image> reference.
BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
if [[ -z "$BEAKER_USER" || "$BEAKER_USER" == "null" ]]; then
  # jq -r prints the literal string "null" when the field is missing, and
  # nothing at all when beaker produced no output.
  echo "Error: could not determine Beaker user from 'beaker account whoami'" >&2
  exit 1
fi
echo "Beaker user: $BEAKER_USER"

# Push the image to Beaker. A non-zero exit is treated as "image already
# exists" and the script carries on with the existing image.
# NOTE(review): stderr is discarded, so any other failure (auth, network,
# missing workspace) is reported with the same warning -- confirm this
# best-effort behavior is intended.
echo "Trying to push image to Beaker..."
if ! beaker image create --workspace ai2/oe-data-pdf --name "$IMAGE_TAG" "$IMAGE_TAG" 2>/dev/null; then
  echo "Warning: Beaker image with tag $IMAGE_TAG already exists. Using existing image."
fi
|
|
|
|
|
|
|
|
# Name used for both the gantry description and the task name. run_name is
# not assigned anywhere in the visible part of this script, so it is taken
# from the caller's environment when present and otherwise falls back to the
# image tag instead of expanding to an empty string.
# NOTE(review): confirm whether callers are expected to export run_name.
run_name="${run_name:-$IMAGE_TAG}"

# Launch the training job on Beaker via gantry, using the image referenced as
# <beaker user>/<image tag>. Everything after `--` is the command run inside
# the container: it installs flash-attn and then starts olmocr.train.train
# with the given config.
# BEAKER_USER_ID reuses the account name already stored in BEAKER_USER rather
# than invoking `beaker account whoami` a second time.
gantry run \
  --description "${run_name}" \
  --task-name "${run_name}" \
  --allow-dirty \
  --host-networking \
  --workspace ai2/olmocr \
  --beaker-image "${BEAKER_USER}/${IMAGE_TAG}" \
  --pip gantry-train-requirements.txt \
  --priority normal \
  --gpus 1 \
  --preemptible \
  --cluster "ai2/titan-cirrascale" \
  --budget ai2/oe-data \
  --env LOG_FILTER_TYPE=local_rank0_only \
  --env OMP_NUM_THREADS=8 \
  --env BEAKER_USER_ID="${BEAKER_USER}" \
  --env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
  --env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
  --env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \
  --weka oe-data-default:/weka/oe-data-default \
  --weka oe-training-default:/weka/oe-training-default \
  --shared-memory 10GiB \
  --yes \
  -- /bin/bash -c "pip install flash-attn==2.8.0.post2 --no-build-isolation && python -m olmocr.train.train --config olmocr/train/configs/qwen25_vl_b100_x1_default_image_1600.yaml"
|