Frontier configs

2025-12-27 15:14:43 +00:00 · 2025-06-30 17:43:30 +00:00 · 2025-06-30 17:43:30 +00:00 · 656dbef833
commit 656dbef833
parent e2f2d36e4f
2 changed files with 93 additions and 1 deletions
--- a/olmocr/train/configs/example_config_frontier.yaml
+++ b/olmocr/train/configs/example_config_frontier.yaml
@ -0,0 +1,92 @@
+# Example OlmOCR training configuration (Frontier variant)
+
+# Project metadata
+project_name: olmocr-qwen-vl-training  # presumably the W&B project (report_to lists wandb) — confirm
+run_name: qwen2.5-vl-7b-finetune  # presumably the run label within that project — confirm
+
+# Model: base checkpoint, load options, and LoRA adapter settings
+model:
+  name: Qwen/Qwen2.5-VL-7B-Instruct
+  trust_remote_code: true
+  torch_dtype: auto
+  use_flash_attention: true  # NOTE(review): enabled while attn_implementation below selects sdpa
+  attn_implementation: sdpa  # NOTE(review): confirm which of these two settings the trainer honors
+
+  # LoRA settings; use_lora is false, so LoRA is disabled in this config
+  use_lora: false
+  lora_rank: 8
+  lora_alpha: 32
+  lora_dropout: 0.1
+  lora_target_modules:
+    - q_proj
+    - v_proj
+    - k_proj
+    - o_proj
+
+# Datasets: train and eval sources, each with a name, a root_dir and a pipeline
+dataset:
+  # The first train entry anchors its step list as &basic_pipeline; later entries alias it
+  train:
+    - name: processed_01_books_train_iabooks
+      root_dir: /lustre/orion/csc652/proj-shared/jakep/olmocr_mix/processed_01_books_train_iabooks/
+      pipeline: &basic_pipeline
+        - name: FrontMatterParser
+          front_matter_class: PageResponse
+        - name: PDFRenderer
+          target_longest_image_dim: 1024
+        - name: StaticLengthDocumentAnchoring
+          target_anchor_text_len: 1000
+        - name: FinetuningPrompt
+        - name: FrontMatterOutputFormat
+        - name: InstructUserMessages
+        - name: Tokenizer
+          masking_index: -100
+          end_of_message_token: '<|im_end|>'
+    # The large s2pdf training set is left out for now so loading stays fast while debugging
+    # - name: processed_00_documents_train_s2pdf
+    #   root_dir: /lustre/orion/csc652/proj-shared/jakep/olmocr_mix/processed_00_documents_train_s2pdf/
+    #   pipeline: *basic_pipeline
+
+  eval:
+    - name: processed_00_documents_eval_s2pdf
+      root_dir: /lustre/orion/csc652/proj-shared/jakep/olmocr_mix/processed_00_documents_eval_s2pdf/
+      pipeline: *basic_pipeline
+    - name: processed_01_books_eval_iabooks
+      root_dir: /lustre/orion/csc652/proj-shared/jakep/olmocr_mix/processed_01_books_eval_iabooks/
+      pipeline: *basic_pipeline
+
+
+# Training configuration
+training:
+  # NOTE(review): /weka path, while every dataset root_dir is under /lustre/orion — confirm it is valid here
+  output_dir: /weka/oe-data-default/jakep/olmocr-trainer/
+  num_train_epochs: 1
+
+  # Batch size and accumulation
+  per_device_train_batch_size: 1
+  per_device_eval_batch_size: 1
+  gradient_accumulation_steps: 8
+
+  # Learning rate
+  learning_rate: 2.0e-5  # keep the ".0": YAML 1.1 loaders (e.g. PyYAML) read a bare 2e-5 as a string
+  lr_scheduler_type: cosine
+  warmup_ratio: 0.1
+
+  # Optimization
+  optim: adamw_torch
+  weight_decay: 0.01
+  max_grad_norm: 1.0
+
+  # Evaluation and checkpointing; the best-model metric embeds an eval dataset name from dataset.eval
+  evaluation_strategy: steps
+  eval_steps: 500
+  save_strategy: steps
+  save_steps: 500
+  save_total_limit: 3
+  load_best_model_at_end: true
+  metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
+  greater_is_better: false
+
+  report_to:
+    - wandb
+ 
--- a/scripts/train/newtrainer-frontier.sh
+++ b/scripts/train/newtrainer-frontier.sh
@ -29,4 +29,4 @@ source activate /lustre/orion/csc652/proj-shared/jakep/conda_env_312_olmocr_trai
 export TRANSFORMERS_OFFLINE=1
 export HF_DATASETS_OFFLINE=1

-python -m olmocr.train.train --config olmocr/train/configs/example_config.yaml
+python -m olmocr.train.train --config olmocr/train/configs/example_config_frontier.yaml