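Training cleanup: switch to flash_attention_2, disable gradient checkpointing, lower learning rate to 1e-6, write outputs under output_dir/run_name, pin transformers==4.52.4, and run on the titan cluster with flash-attn installed at job start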

2025-10-17 11:12:33 +00:00 · 2025-06-30 21:24:35 +00:00 · 2025-06-30 21:24:35 +00:00 · 5c2d69a3d7
commit 5c2d69a3d7
parent e86511e11b
4 changed files with 12 additions and 6 deletions
--- a/olmocr/train/configs/example_config.yaml
+++ b/olmocr/train/configs/example_config.yaml
@@ -10,7 +10,7 @@ model:
  trust_remote_code: true
  torch_dtype: auto
  use_flash_attention: true
-  attn_implementation: sdpa
+  attn_implementation: flash_attention_2
  
  # LoRA settings (disabled by default)
  use_lora: false
@@ -65,9 +65,11 @@ training:
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 8
+
+  gradient_checkpointing: False
  
  # Learning rate
-  learning_rate: 2e-5
+  learning_rate: 1e-6
  lr_scheduler_type: cosine
  warmup_ratio: 0.1
  
--- a/olmocr/train/train.py
+++ b/olmocr/train/train.py
@@ -162,9 +162,13 @@ def main():
    total_eval_samples = sum(len(dataset) for dataset in eval_datasets.values())
    logger.info(f"Total evaluation samples across {len(eval_datasets)} datasets: {total_eval_samples}")

+    # Construct full output directory by appending run_name to base output_dir
+    full_output_dir = os.path.join(config.training.output_dir, config.run_name)
+    logger.info(f"Setting output directory to: {full_output_dir}")
+
    # Set up training arguments
    training_args = TrainingArguments(
-        output_dir=config.training.output_dir,
+        output_dir=full_output_dir,
        num_train_epochs=config.training.num_train_epochs,
        per_device_train_batch_size=config.training.per_device_train_batch_size,
        per_device_eval_batch_size=config.training.per_device_eval_batch_size,
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,7 +37,7 @@ dependencies = [
  "boto3",
  "httpx",
  "torch>=2.7.0",
-  "transformers>=4.51.1",
+  "transformers==4.52.4",
  "img2pdf",
  "beaker-py",
 ]
--- a/scripts/train/newtrainer-gantry.sh
+++ b/scripts/train/newtrainer-gantry.sh
@@ -52,7 +52,7 @@ gantry run \
    --priority normal \
    --gpus 1 \
    --preemptible \
-    --cluster "ai2/jupiter-cirrascale-2" \
+    --cluster "ai2/titan-cirrascale" \
    --budget ai2/oe-data \
    --env LOG_FILTER_TYPE=local_rank0_only \
    --env OMP_NUM_THREADS=8 \
@@ -64,4 +64,4 @@ gantry run \
    --weka oe-training-default:/weka/oe-training-default \
    --shared-memory 10GiB \
    --yes \
-    -- /bin/bash -c "source scripts/beaker/jupiter-ib.sh && python -m olmocr.train.train --config olmocr/train/configs/example_config.yaml"
+    -- /bin/bash -c "pip install flash-attn==2.8.0.post2 --no-build-isolation && python -m olmocr.train.train --config olmocr/train/configs/example_config.yaml"