Jake Poznanski 2025-07-28 16:27:41 +00:00
parent 5d7245f887
commit acf8bff554
3 changed files with 213 additions and 1 deletion

View File

@@ -0,0 +1,106 @@
# Example OlmOCR Training Configuration

# Project metadata
project_name: olmocr-qwen-vl-training
run_name: qwen2.5-vl-7b-olmocrv2_1epoch_muon_2e-3

# Model configuration
model:
  name: Qwen/Qwen2.5-VL-7B-Instruct
  trust_remote_code: true
  torch_dtype: bfloat16
  use_flash_attention: true
  attn_implementation: flash_attention_2

  # LoRA settings (disabled by default)
  use_lora: false
  # lora_rank: 8
  # lora_alpha: 32
  # lora_dropout: 0.1
  # lora_target_modules:
  #   - q_proj
  #   - v_proj
  #   - k_proj
  #   - o_proj

# Dataset configuration
dataset:
  train:
    - name: processed_01_books_train_iabooks
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_train_iabooks/
      pipeline: &basic_pipeline
        - name: FrontMatterParser
          front_matter_class: PageResponse
        - name: PDFRenderer
          target_longest_image_dim: 1288
        - name: NewYamlFinetuningPromptWithNoAnchoring
        - name: FrontMatterOutputFormat
        - name: InstructUserMessages
        - name: Tokenizer
          masking_index: -100
          end_of_message_token: "<|im_end|>"
    - name: processed_00_documents_train_s2pdf
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
      pipeline: *basic_pipeline

  eval:
    - name: processed_00_documents_eval_s2pdf
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_eval_s2pdf/
      pipeline: *basic_pipeline
    - name: processed_01_books_eval_iabooks
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_eval_iabooks/
      pipeline: *basic_pipeline

# Training configuration
training:
  output_dir: /weka/oe-data-default/jakep/olmocr-trainer/
  num_train_epochs: 1

  # Batch size and accumulation
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 32
  gradient_checkpointing: false
  collator_max_token_len: 8192

  # Learning rate
  learning_rate: 2e-3
  lr_scheduler_type: linear
  warmup_ratio: 0.1

  # Optimization
  optim: muon
  weight_decay: 0.01
  max_grad_norm: 1.0

  # Muon optimizer specific settings
  muon_momentum: 0.95
  muon_lr_multiplier_head: 4.4     # 0.22 / 0.05 = 4.4
  muon_lr_multiplier_embed: 12.0   # 0.6 / 0.05 = 12.0
  muon_lr_multiplier_scalar: 0.8   # 0.04 / 0.05 = 0.8

  # Adam parameters for non-Muon groups
  adam_beta1: 0.8
  adam_beta2: 0.95
  adam_epsilon: 1e-10

  seed: 300
  data_seed: 301

  # Evaluation and checkpointing
  evaluation_strategy: steps
  eval_steps: 500
  save_strategy: steps
  save_steps: 500
  save_total_limit: 5
  load_best_model_at_end: false  # Must stay false: restoring the best checkpoint currently fails
  metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
  greater_is_better: false

  report_to:
    - wandb
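(Note: the muon_lr_multiplier_* comments above encode per-group learning-rate scaling relative to the base learning_rate. The trainer's actual parameter-group assignment is not part of this commit; the Python sketch below is illustrative only, assuming each multiplier simply scales the base rate for its group.)

import yaml

# Illustrative only: derive per-group learning rates from the settings above.
# The real trainer's group-assignment logic is not shown in this commit.
config_text = """
training:
  learning_rate: 2.0e-3
  muon_lr_multiplier_head: 4.4
  muon_lr_multiplier_embed: 12.0
  muon_lr_multiplier_scalar: 0.8
"""

cfg = yaml.safe_load(config_text)["training"]
base_lr = cfg["learning_rate"]

group_lrs = {
    "hidden (Muon)": base_lr,                                    # 2.0e-3
    "head":          base_lr * cfg["muon_lr_multiplier_head"],   # 8.8e-3
    "embed":         base_lr * cfg["muon_lr_multiplier_embed"],  # 2.4e-2
    "scalar":        base_lr * cfg["muon_lr_multiplier_scalar"], # 1.6e-3
}

for group, lr in group_lrs.items():
    print(f"{group:>14}: {lr:.1e}")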

View File

@@ -0,0 +1,106 @@
# Example OlmOCR Training Configuration

# Project metadata
project_name: olmocr-qwen-vl-training
run_name: qwen2.5-vl-7b-olmocrv2_1epoch_muon_2e-4

# Model configuration
model:
  name: Qwen/Qwen2.5-VL-7B-Instruct
  trust_remote_code: true
  torch_dtype: bfloat16
  use_flash_attention: true
  attn_implementation: flash_attention_2

  # LoRA settings (disabled by default)
  use_lora: false
  # lora_rank: 8
  # lora_alpha: 32
  # lora_dropout: 0.1
  # lora_target_modules:
  #   - q_proj
  #   - v_proj
  #   - k_proj
  #   - o_proj

# Dataset configuration
dataset:
  train:
    - name: processed_01_books_train_iabooks
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_train_iabooks/
      pipeline: &basic_pipeline
        - name: FrontMatterParser
          front_matter_class: PageResponse
        - name: PDFRenderer
          target_longest_image_dim: 1288
        - name: NewYamlFinetuningPromptWithNoAnchoring
        - name: FrontMatterOutputFormat
        - name: InstructUserMessages
        - name: Tokenizer
          masking_index: -100
          end_of_message_token: "<|im_end|>"
    - name: processed_00_documents_train_s2pdf
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
      pipeline: *basic_pipeline

  eval:
    - name: processed_00_documents_eval_s2pdf
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_eval_s2pdf/
      pipeline: *basic_pipeline
    - name: processed_01_books_eval_iabooks
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_eval_iabooks/
      pipeline: *basic_pipeline

# Training configuration
training:
  output_dir: /weka/oe-data-default/jakep/olmocr-trainer/
  num_train_epochs: 1

  # Batch size and accumulation
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 32
  gradient_checkpointing: false
  collator_max_token_len: 8192

  # Learning rate
  learning_rate: 2e-4
  lr_scheduler_type: linear
  warmup_ratio: 0.1

  # Optimization
  optim: muon
  weight_decay: 0.01
  max_grad_norm: 1.0

  # Muon optimizer specific settings
  muon_momentum: 0.95
  muon_lr_multiplier_head: 4.4     # 0.22 / 0.05 = 4.4
  muon_lr_multiplier_embed: 12.0   # 0.6 / 0.05 = 12.0
  muon_lr_multiplier_scalar: 0.8   # 0.04 / 0.05 = 0.8

  # Adam parameters for non-Muon groups
  adam_beta1: 0.8
  adam_beta2: 0.95
  adam_epsilon: 1e-10

  seed: 300
  data_seed: 301

  # Evaluation and checkpointing
  evaluation_strategy: steps
  eval_steps: 500
  save_strategy: steps
  save_steps: 500
  save_total_limit: 5
  load_best_model_at_end: false  # Must stay false: restoring the best checkpoint currently fails
  metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
  greater_is_better: false

  report_to:
    - wandb
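(Note: this file is identical to the previous one except for learning_rate — 2e-4 instead of 2e-3 — and the run_name edit in the next file points to a 2e-5 variant, so the commit reads as a learning-rate sweep. The sketch below just works out the effective global batch size implied by the shared batch settings; the GPU count is an assumption, as the configs do not record the world size.)

# Effective global batch size implied by the shared training settings.
per_device_train_batch_size = 1
gradient_accumulation_steps = 32
num_gpus = 8  # assumption: world size is not stated in these configs

effective_batch_size = (
    per_device_train_batch_size * gradient_accumulation_steps * num_gpus
)
print(effective_batch_size)  # 256 sequences per optimizer step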

View File

@@ -2,7 +2,7 @@
 # Project metadata
 project_name: olmocr-qwen-vl-training
-run_name: qwen2.5-vl-7b-olmocrv2_1epoch_muon
+run_name: qwen2.5-vl-7b-olmocrv2_1epoch_muon_2e-5
 # Model configuration
 model: