Some cleanup stuff

This commit is contained in:
Jake Poznanski 2025-06-30 21:24:35 +00:00
parent e86511e11b
commit 5c2d69a3d7
4 changed files with 12 additions and 6 deletions

View File

@@ -10,7 +10,7 @@ model:
trust_remote_code: true
torch_dtype: auto
use_flash_attention: true
attn_implementation: sdpa
attn_implementation: flash_attention_2
# LoRA settings (disabled by default)
use_lora: false
@@ -65,9 +65,11 @@ training:
per_device_train_batch_size: 1
per_device_eval_batch_size: 1
gradient_accumulation_steps: 8
gradient_checkpointing: False
# Learning rate
learning_rate: 2e-5
learning_rate: 1e-6
lr_scheduler_type: cosine
warmup_ratio: 0.1

View File

@@ -162,9 +162,13 @@ def main():
total_eval_samples = sum(len(dataset) for dataset in eval_datasets.values())
logger.info(f"Total evaluation samples across {len(eval_datasets)} datasets: {total_eval_samples}")
# Construct full output directory by appending run_name to base output_dir
full_output_dir = os.path.join(config.training.output_dir, config.run_name)
logger.info(f"Setting output directory to: {full_output_dir}")
# Set up training arguments
training_args = TrainingArguments(
output_dir=config.training.output_dir,
output_dir=full_output_dir,
num_train_epochs=config.training.num_train_epochs,
per_device_train_batch_size=config.training.per_device_train_batch_size,
per_device_eval_batch_size=config.training.per_device_eval_batch_size,

View File

@@ -37,7 +37,7 @@ dependencies = [
"boto3",
"httpx",
"torch>=2.7.0",
"transformers>=4.51.1",
"transformers==4.52.4",
"img2pdf",
"beaker-py",
]

View File

@@ -52,7 +52,7 @@ gantry run \
--priority normal \
--gpus 1 \
--preemptible \
--cluster "ai2/jupiter-cirrascale-2" \
--cluster "ai2/titan-cirrascale" \
--budget ai2/oe-data \
--env LOG_FILTER_TYPE=local_rank0_only \
--env OMP_NUM_THREADS=8 \
@@ -64,4 +64,4 @@ gantry run \
--weka oe-training-default:/weka/oe-training-default \
--shared-memory 10GiB \
--yes \
-- /bin/bash -c "source scripts/beaker/jupiter-ib.sh && python -m olmocr.train.train --config olmocr/train/configs/example_config.yaml"
-- /bin/bash -c "pip install flash-attn==2.8.0.post2 --no-build-isolation && python -m olmocr.train.train --config olmocr/train/configs/example_config.yaml"