olmocr/scripts/train/newtrainer-frontier.sh

31 lines
934 B
Bash
Raw Normal View History

#!/usr/bin/env bash
# Slurm batch job that launches olmocr training (python -m olmocr.train.train)
# on a single node with a two-hour time limit.
#
# Submit with `sbatch` from the repository root: the config path passed to the
# trainer and the log path below are both relative to the submission directory.
# NOTE: Slurm does not create the directory for the -o file, so `logs/` must
# already exist when the job is submitted.
#SBATCH -A csc652
#SBATCH -J olmocr-train
#SBATCH -o logs/%j.out
#SBATCH -N 1
#SBATCH -t 02:00:00

# Start from the default module set, then load the modules this job relies on.
module reset
module load PrgEnv-gnu
module load olcf-container-tools
module load apptainer-enable-mpi

# Run in offline mode to make sure it doesn't timeout loading the model
export TRANSFORMERS_OFFLINE=1
export HF_DATASETS_OFFLINE=1
export HF_HUB_OFFLINE=1

# Project-shared Hugging Face caches; in offline mode everything is read from
# these locations.
export HF_DATASETS_CACHE="/lustre/orion/csc652/proj-shared/huggingface-shared/datasets"
export HF_HUB_CACHE="/lustre/orion/csc652/proj-shared/huggingface-shared/hub"

# Was getting MIOpen errors with caching, had to disable for now
export MIOPEN_DISABLE_CACHE=1

# Activate the training environment; stop here if that fails so the job never
# runs the trainer with whatever python happens to be on PATH.
source activate /lustre/orion/csc652/proj-shared/jakep/conda_env_312_olmocr_train || {
  printf 'error: could not activate the olmocr training conda environment\n' >&2
  exit 1
}

# The job's exit status is the trainer's exit status (last command).
python -m olmocr.train.train --config olmocr/train/configs/example_config.yaml