diff --git a/olmocr/train/config.py b/olmocr/train/config.py
index 0241378..1747b48 100644
--- a/olmocr/train/config.py
+++ b/olmocr/train/config.py
@@ -155,6 +155,7 @@ class TrainingConfig:
     learning_rate: float = 2e-5
     lr_scheduler_type: str = "cosine"
     warmup_ratio: float = 0.1
+    lr_scheduler_kwargs: Dict[str, Any] = field(default_factory=dict)
 
     # Optimization
     optim: str = "adamw_torch"
diff --git a/olmocr/train/configs/example_config_frontier.yaml b/olmocr/train/configs/example_config_frontier.yaml
deleted file mode 100644
index f1e0f37..0000000
--- a/olmocr/train/configs/example_config_frontier.yaml
+++ /dev/null
@@ -1,92 +0,0 @@
-# Example OlmOCR Training Configuration
-
-# Project metadata
-project_name: olmocr-qwen-vl-training
-run_name: qwen2.5-vl-7b-finetune
-
-# Model configuration
-model:
-  name: Qwen/Qwen2.5-VL-7B-Instruct
-  trust_remote_code: true
-  torch_dtype: auto
-  use_flash_attention: true
-  attn_implementation: sdpa
-
-  # LoRA settings (disabled by default)
-  use_lora: false
-  lora_rank: 8
-  lora_alpha: 32
-  lora_dropout: 0.1
-  lora_target_modules:
-    - q_proj
-    - v_proj
-    - k_proj
-    - o_proj
-
-# Dataset configuration
-dataset:
-
-  train:
-    - name: processed_01_books_train_iabooks
-      root_dir: /lustre/orion/csc652/proj-shared/jakep/olmocr_mix/processed_01_books_train_iabooks/
-      pipeline: &basic_pipeline
-        - name: FrontMatterParser
-          front_matter_class: PageResponse
-        - name: PDFRenderer
-          target_longest_image_dim: 1024
-        - name: StaticLengthDocumentAnchoring
-          target_anchor_text_len: 1000
-        - name: FinetuningPrompt
-        - name: FrontMatterOutputFormat
-        - name: InstructUserMessages
-        - name: Tokenizer
-          masking_index: -100
-          end_of_message_token: "<|im_end|>"
-    # Not putting in big bulk of data to speed up loading for debugging for now
-    # - name: processed_00_documents_train_s2pdf
-    #   root_dir: /lustre/orion/csc652/proj-shared/jakep/olmocr_mix/processed_00_documents_train_s2pdf/
-    #   pipeline: *basic_pipeline
-
-  eval:
-    - name: processed_00_documents_eval_s2pdf
-      root_dir: /lustre/orion/csc652/proj-shared/jakep/olmocr_mix/processed_00_documents_eval_s2pdf/
-      pipeline: *basic_pipeline
-    - name: processed_01_books_eval_iabooks
-      root_dir: /lustre/orion/csc652/proj-shared/jakep/olmocr_mix/processed_01_books_eval_iabooks/
-      pipeline: *basic_pipeline
-
-
-# Training configuration
-training:
-  output_dir: /lustre/orion/csc652/proj-shared/jakep/olmocr-trainer/
-  num_train_epochs: 1
-
-  # Batch size and accumulation
-  per_device_train_batch_size: 1
-  per_device_eval_batch_size: 1
-  gradient_accumulation_steps: 8
-
-  # Learning rate
-  learning_rate: 2e-5
-  lr_scheduler_type: cosine
-  warmup_ratio: 0.1
-
-  # Optimization
-  optim: adamw_torch
-  weight_decay: 0.01
-  max_grad_norm: 1.0
-
-
-  # Evaluation and checkpointing
-  evaluation_strategy: steps
-  eval_steps: 500
-  save_strategy: steps
-  save_steps: 500
-  save_total_limit: 3
-  load_best_model_at_end: true
-  metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
-  greater_is_better: false
-
-  report_to:
-    - wandb
-  
\ No newline at end of file
diff --git a/olmocr/train/configs/example_config.yaml b/olmocr/train/configs/qwen25_vl_b100_x1_day3_json_wsd.yaml
similarity index 77%
rename from olmocr/train/configs/example_config.yaml
rename to olmocr/train/configs/qwen25_vl_b100_x1_day3_json_wsd.yaml
index 2c1aa78..42c32ba 100644
--- a/olmocr/train/configs/example_config.yaml
+++ b/olmocr/train/configs/qwen25_vl_b100_x1_day3_json_wsd.yaml
@@ -2,7 +2,7 @@
 
 # Project metadata
 project_name: olmocr-qwen-vl-training
-run_name: qwen2.5-vl-7b-finetune
+run_name: qwen2.5-vl-7b-finetune-day3-json
 
 # Model configuration
 model:
@@ -35,17 +35,16 @@ dataset:
         - name: PDFRenderer
           target_longest_image_dim: 1024
         - name: StaticLengthDocumentAnchoring
-          target_anchor_text_len: 1000
+          target_anchor_text_len: 3000
         - name: FinetuningPrompt
-        - name: FrontMatterOutputFormat
+        - name: JSONOutputFormat
         - name: InstructUserMessages
         - name: Tokenizer
           masking_index: -100
           end_of_message_token: "<|im_end|>"
-    # Not putting in big bulk of data to speed up loading for debugging for now
-    # - name: processed_00_documents_train_s2pdf
-    #   root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
-    #   pipeline: *basic_pipeline
+    - name: processed_00_documents_train_s2pdf
+      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
+      pipeline: *basic_pipeline
 
   eval:
     - name: processed_00_documents_eval_s2pdf
@@ -56,6 +55,7 @@
       pipeline: *basic_pipeline
 
 
+
 # Training configuration
 training:
   output_dir: /weka/oe-data-default/jakep/olmocr-trainer/
@@ -64,13 +64,17 @@ training:
   # Batch size and accumulation
   per_device_train_batch_size: 1
   per_device_eval_batch_size: 1
-  gradient_accumulation_steps: 8
+  gradient_accumulation_steps: 32
   gradient_checkpointing: False
+
+  collator_max_token_len: 8192
 
 
   # Learning rate
-  learning_rate: 1e-6
-  lr_scheduler_type: cosine
+  learning_rate: 2e-5
+  lr_scheduler_type: warmup_stable_decay
+  lr_scheduler_kwargs:
+    num_stable_steps: 4000
   warmup_ratio: 0.1
 
   # Optimization
@@ -84,8 +88,8 @@
   eval_steps: 500
   save_strategy: steps
   save_steps: 500
-  save_total_limit: 3
-  load_best_model_at_end: true
+  save_total_limit: 5
+  load_best_model_at_end: false # Needs to be false because it has a problem restoring checkpoints for some reason
   metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
   greater_is_better: false
 
diff --git a/olmocr/train/train.py b/olmocr/train/train.py
index 212cf93..c1156df 100644
--- a/olmocr/train/train.py
+++ b/olmocr/train/train.py
@@ -202,6 +202,7 @@ def main():
         learning_rate=float(config.training.learning_rate),
         lr_scheduler_type=config.training.lr_scheduler_type,
         warmup_ratio=config.training.warmup_ratio,
+        lr_scheduler_kwargs=config.training.lr_scheduler_kwargs,
         optim=config.training.optim,
         adam_beta1=config.training.adam_beta1,
         adam_beta2=config.training.adam_beta2,
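
Note on the scheduler change above: the new lr_scheduler_kwargs config field and the switch from cosine to warmup_stable_decay (WSD) change the shape of the learning-rate curve. Below is a minimal, self-contained sketch of that shape for intuition only; the wsd_lr helper and the warmup/decay step counts are illustrative assumptions (only learning_rate: 2e-5 and num_stable_steps: 4000 come from the config), and the real schedule is built by transformers from lr_scheduler_type plus lr_scheduler_kwargs, not by this code.

def wsd_lr(step: int, base_lr: float, warmup: int, stable: int, decay: int) -> float:
    # Linear warmup to base_lr, hold it flat for `stable` steps, then decay toward 0.
    # (Linear decay here is purely for illustration; the library's decay shape may differ.)
    if step < warmup:
        return base_lr * step / max(1, warmup)
    if step < warmup + stable:
        return base_lr
    done = step - warmup - stable
    return base_lr * max(0.0, 1.0 - done / max(1, decay))

if __name__ == "__main__":
    # Hypothetical step counts for a short run: 600 warmup, 4000 stable, 1400 decay.
    for s in (0, 300, 600, 2000, 4600, 5300, 6000):
        print(s, wsd_lr(s, 2e-5, warmup=600, stable=4000, decay=1400))

One property of WSD worth noting: every checkpoint taken during the flat stable phase is trained at the same learning rate, which pairs naturally with the periodic save_steps: 500 checkpointing in this config.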