mirror of https://github.com/allenai/olmocr.git, synced 2025-10-12 00:32:45 +00:00

commit acf8bff554 ("LRs")
parent 5d7245f887
olmocr/train/configs/qwen25_vl_olmocrv2_1epoch_muon2e-3.yaml (new file, 106 lines)
@@ -0,0 +1,106 @@
# Example OlmOCR Training Configuration

# Project metadata
project_name: olmocr-qwen-vl-training
run_name: qwen2.5-vl-7b-olmocrv2_1epoch_muon_2e-3

# Model configuration
model:
  name: Qwen/Qwen2.5-VL-7B-Instruct
  trust_remote_code: true
  torch_dtype: bfloat16
  use_flash_attention: true
  attn_implementation: flash_attention_2

  # LoRA settings (disabled by default)
  use_lora: false
  # lora_rank: 8
  # lora_alpha: 32
  # lora_dropout: 0.1
  # lora_target_modules:
  #   - q_proj
  #   - v_proj
  #   - k_proj
  #   - o_proj

# Dataset configuration
dataset:
  train:
    - name: processed_01_books_train_iabooks
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_train_iabooks/
      pipeline: &basic_pipeline
        - name: FrontMatterParser
          front_matter_class: PageResponse
        - name: PDFRenderer
          target_longest_image_dim: 1288
        - name: NewYamlFinetuningPromptWithNoAnchoring
        - name: FrontMatterOutputFormat
        - name: InstructUserMessages
        - name: Tokenizer
          masking_index: -100
          end_of_message_token: "<|im_end|>"
    - name: processed_00_documents_train_s2pdf
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
      pipeline: *basic_pipeline

  eval:
    - name: processed_00_documents_eval_s2pdf
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_eval_s2pdf/
      pipeline: *basic_pipeline
    - name: processed_01_books_eval_iabooks
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_eval_iabooks/
      pipeline: *basic_pipeline

# Training configuration
training:
  output_dir: /weka/oe-data-default/jakep/olmocr-trainer/
  num_train_epochs: 1

  # Batch size and accumulation
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 32

  gradient_checkpointing: False

  collator_max_token_len: 8192

  # Learning rate
  learning_rate: 2e-3
  lr_scheduler_type: linear
  warmup_ratio: 0.1

  # Optimization
  optim: muon
  weight_decay: 0.01
  max_grad_norm: 1.0

  # Muon optimizer specific settings
  muon_momentum: 0.95
  muon_lr_multiplier_head: 4.4    # 0.22 / 0.05 = 4.4
  muon_lr_multiplier_embed: 12.0  # 0.6 / 0.05 = 12.0
  muon_lr_multiplier_scalar: 0.8  # 0.04 / 0.05 = 0.8

  # Adam parameters for non-muon groups
  adam_beta1: 0.8
  adam_beta2: 0.95
  adam_epsilon: 1e-10

  seed: 300
  data_seed: 301

  # Evaluation and checkpointing
  evaluation_strategy: steps
  eval_steps: 500
  save_strategy: steps
  save_steps: 500
  save_total_limit: 5
  load_best_model_at_end: false # Needs to be false because it has a problem restoring checkpoints for some reason
  metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
  greater_is_better: false

  report_to:
    - wandb
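The three muon_lr_multiplier_* settings scale the base learning rate per parameter group. Below is a minimal sketch of the per-group rates this implies for the 2e-3 config; the group names are illustrative, and the actual parameter grouping is decided by olmocr's Muon optimizer setup, not by this snippet:

base_lr = 2e-3  # training.learning_rate in this config

multipliers = {
    "head": 4.4,    # muon_lr_multiplier_head
    "embed": 12.0,  # muon_lr_multiplier_embed
    "scalar": 0.8,  # muon_lr_multiplier_scalar
    "hidden": 1.0,  # assumption: remaining matrix parameters run at the base LR
}

for group, mult in multipliers.items():
    # e.g. head: 2e-3 * 4.4 = 8.8e-3, embed: 2e-3 * 12.0 = 2.4e-2
    print(f"{group}: {base_lr * mult:.1e}")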
olmocr/train/configs/qwen25_vl_olmocrv2_1epoch_muon2e-4.yaml (new file, 106 lines)
@@ -0,0 +1,106 @@
# Example OlmOCR Training Configuration

# Project metadata
project_name: olmocr-qwen-vl-training
run_name: qwen2.5-vl-7b-olmocrv2_1epoch_muon_2e-4

# Model configuration
model:
  name: Qwen/Qwen2.5-VL-7B-Instruct
  trust_remote_code: true
  torch_dtype: bfloat16
  use_flash_attention: true
  attn_implementation: flash_attention_2

  # LoRA settings (disabled by default)
  use_lora: false
  # lora_rank: 8
  # lora_alpha: 32
  # lora_dropout: 0.1
  # lora_target_modules:
  #   - q_proj
  #   - v_proj
  #   - k_proj
  #   - o_proj

# Dataset configuration
dataset:
  train:
    - name: processed_01_books_train_iabooks
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_train_iabooks/
      pipeline: &basic_pipeline
        - name: FrontMatterParser
          front_matter_class: PageResponse
        - name: PDFRenderer
          target_longest_image_dim: 1288
        - name: NewYamlFinetuningPromptWithNoAnchoring
        - name: FrontMatterOutputFormat
        - name: InstructUserMessages
        - name: Tokenizer
          masking_index: -100
          end_of_message_token: "<|im_end|>"
    - name: processed_00_documents_train_s2pdf
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
      pipeline: *basic_pipeline

  eval:
    - name: processed_00_documents_eval_s2pdf
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_eval_s2pdf/
      pipeline: *basic_pipeline
    - name: processed_01_books_eval_iabooks
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_eval_iabooks/
      pipeline: *basic_pipeline

# Training configuration
training:
  output_dir: /weka/oe-data-default/jakep/olmocr-trainer/
  num_train_epochs: 1

  # Batch size and accumulation
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 32

  gradient_checkpointing: False

  collator_max_token_len: 8192

  # Learning rate
  learning_rate: 2e-4
  lr_scheduler_type: linear
  warmup_ratio: 0.1

  # Optimization
  optim: muon
  weight_decay: 0.01
  max_grad_norm: 1.0

  # Muon optimizer specific settings
  muon_momentum: 0.95
  muon_lr_multiplier_head: 4.4    # 0.22 / 0.05 = 4.4
  muon_lr_multiplier_embed: 12.0  # 0.6 / 0.05 = 12.0
  muon_lr_multiplier_scalar: 0.8  # 0.04 / 0.05 = 0.8

  # Adam parameters for non-muon groups
  adam_beta1: 0.8
  adam_beta2: 0.95
  adam_epsilon: 1e-10

  seed: 300
  data_seed: 301

  # Evaluation and checkpointing
  evaluation_strategy: steps
  eval_steps: 500
  save_strategy: steps
  save_steps: 500
  save_total_limit: 5
  load_best_model_at_end: false # Needs to be false because it has a problem restoring checkpoints for some reason
  metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
  greater_is_better: false

  report_to:
    - wandb
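The two new files differ only in run_name and the base learning rate. A minimal sketch that verifies this, assuming PyYAML is installed and the script is run from the repository root:

import yaml

paths = [
    "olmocr/train/configs/qwen25_vl_olmocrv2_1epoch_muon2e-3.yaml",
    "olmocr/train/configs/qwen25_vl_olmocrv2_1epoch_muon2e-4.yaml",
]
a, b = (yaml.safe_load(open(p)) for p in paths)

def diff(x, y, prefix=""):
    # Walk both configs in parallel and print leaf values that differ.
    if isinstance(x, dict) and isinstance(y, dict):
        for k in sorted(set(x) | set(y)):
            diff(x.get(k), y.get(k), f"{prefix}{k}.")
    elif isinstance(x, list) and isinstance(y, list) and len(x) == len(y):
        for i, (u, v) in enumerate(zip(x, y)):
            diff(u, v, f"{prefix}{i}.")
    elif x != y:
        print(f"{prefix[:-1]}: {x!r} != {y!r}")

diff(a, b)
# Expected: only run_name and training.learning_rate are reported.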
@@ -2,7 +2,7 @@
 
 # Project metadata
 project_name: olmocr-qwen-vl-training
-run_name: qwen2.5-vl-7b-olmocrv2_1epoch_muon
+run_name: qwen2.5-vl-7b-olmocrv2_1epoch_muon_2e-5
 
 # Model configuration
 model: