#!/bin/bash
#
# olmocr/scripts/train/newtrainer-gantry.sh
#
# Builds the olmocr training Docker image, uploads it to Beaker, and launches
# a training job with `gantry run`.
#
# Expects to be run from the directory that contains ./Dockerfile (that
# directory is used as the Docker build context), with python, git, docker,
# beaker, jq and gantry available on PATH.
#
# Environment:
#   CONDA_PREFIX  optional; when set, "$CONDA_PREFIX/bin/python" is used to
#                 read the olmocr version instead of the system "python".
#   run_name      used as the gantry description and task name.

# Abort on the first failing command, and treat a pipeline as failed when any
# of its stages fails, so a failing producer (e.g. `beaker ... | jq ...`)
# cannot be hidden by a succeeding consumer.
set -eo pipefail

# Use conda environment Python if available, otherwise use system Python.
# The ":-" default keeps this check safe even if the caller runs with `set -u`.
if [[ -n "${CONDA_PREFIX:-}" ]]; then
  PYTHON="$CONDA_PREFIX/bin/python"
  echo "Using conda Python from: $CONDA_PREFIX"
else
  PYTHON="python"
  echo "Warning: No conda environment detected, using system Python"
fi
# Get version from version.py.
# "$PYTHON" is quoted so an interpreter path containing spaces still works.
VERSION=$("$PYTHON" -c 'import olmocr.version; print(olmocr.version.VERSION)')
echo "OlmOCR version: $VERSION"

# Get first 10 characters of git hash.
# The hash is captured first and truncated with a substring expansion rather
# than piping through `cut`, so a failing `git rev-parse` fails the assignment
# (and aborts under `set -e`) instead of silently yielding an empty hash.
GIT_HASH=$(git rev-parse HEAD)
GIT_HASH=${GIT_HASH:0:10}
echo "Git hash: $GIT_HASH"

# Get current git branch name
GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD)
echo "Git branch: $GIT_BRANCH"

# Create full image tag: olmocr-train-<version>-<short git hash>
IMAGE_TAG="olmocr-train-${VERSION}-${GIT_HASH}"
echo "Building Docker image with tag: $IMAGE_TAG"
# Build the Docker image for linux/amd64 from ./Dockerfile, using the current
# directory as the build context.
echo "Building Docker image..."
docker build --platform linux/amd64 -f ./Dockerfile -t "$IMAGE_TAG" .

# Get Beaker username
BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
# `jq -r` prints the literal string "null" when the field is missing, and
# prints nothing on empty input; either way the image reference built from
# this value later would be invalid, so stop here with a clear message.
if [[ -z "$BEAKER_USER" || "$BEAKER_USER" == "null" ]]; then
  echo "Error: could not determine Beaker user from 'beaker account whoami'" >&2
  exit 1
fi
echo "Beaker user: $BEAKER_USER"

# Push image to beaker.
# This step is deliberately best-effort: a failure is assumed to mean the image
# was already uploaded. stderr is captured (not discarded) so that if the
# failure had a different cause, the real error is still visible.
echo "Trying to push image to Beaker..."
push_log=$(mktemp)
if ! beaker image create --workspace ai2/oe-data-pdf --name "$IMAGE_TAG" "$IMAGE_TAG" 2>"$push_log"; then
  echo "Warning: Beaker image with tag $IMAGE_TAG already exists. Using existing image."
  if [[ -s "$push_log" ]]; then
    echo "beaker image create reported:" >&2
    cat -- "$push_log" >&2
  fi
fi
rm -f -- "$push_log"
# Launch the training job on Beaker via gantry.
#
# run_name is taken from the environment; when it is unset or empty, fall back
# to the image tag so the job never gets an empty description / task name.
run_name="${run_name:-$IMAGE_TAG}"

# Arguments are collected in an array so each value stays a single word
# regardless of its content, and so options can be commented individually.
gantry_args=(
  --description "$run_name"
  --task-name "$run_name"
  --allow-dirty
  --host-networking
  --workspace ai2/olmocr
  # Image created by `beaker image create --name "$IMAGE_TAG"` for this user.
  --beaker-image "$BEAKER_USER/$IMAGE_TAG"
  --pip gantry-train-requirements.txt
  --priority normal
  --gpus 1
  --preemptible
  --cluster "ai2/titan-cirrascale"
  --budget ai2/oe-data
  --env LOG_FILTER_TYPE=local_rank0_only
  --env OMP_NUM_THREADS=8
  # Reuse the username resolved earlier instead of querying Beaker again.
  --env "BEAKER_USER_ID=$BEAKER_USER"
  --env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID
  --env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY
  # NOTE(review): this secret name looks user-specific; confirm it exists in
  # the target workspace before relying on it.
  --env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY
  --weka oe-data-default:/weka/oe-data-default
  --weka oe-training-default:/weka/oe-training-default
  --shared-memory 10GiB
  --yes
)

# Everything after "--" is the command executed inside the job: install
# flash-attn, then start training with the example config.
gantry run "${gantry_args[@]}" \
  -- /bin/bash -c "pip install flash-attn==2.8.0.post2 --no-build-isolation && python -m olmocr.train.train --config olmocr/train/configs/example_config.yaml"