Adding WSD (warmup-stable-decay) as a learning-rate scheduler option

Jake Poznanski 2025-07-09 22:35:57 +00:00
parent 69581cca23
commit 336b000416
4 changed files with 18 additions and 104 deletions

View File

@@ -155,6 +155,7 @@ class TrainingConfig:
learning_rate: float = 2e-5
lr_scheduler_type: str = "cosine"
warmup_ratio: float = 0.1
+ lr_scheduler_kwargs: Dict[str, Any] = field(default_factory=dict)
# Optimization
optim: str = "adamw_torch"
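
The new lr_scheduler_kwargs field gives YAML configs a place to pass scheduler-specific options straight through to the trainer. A minimal sketch of the field in isolation, assuming only the fields shown in this hunk; the example values are the WSD settings introduced later in this commit:

from dataclasses import dataclass, field
from typing import Any, Dict

@dataclass
class TrainingConfig:
    # Trimmed to the scheduler-related fields visible in this hunk.
    learning_rate: float = 2e-5
    lr_scheduler_type: str = "cosine"
    warmup_ratio: float = 0.1
    # New: extra keyword arguments forwarded to the LR scheduler.
    lr_scheduler_kwargs: Dict[str, Any] = field(default_factory=dict)

# Example: the warmup-stable-decay setup used by the config changes below.
cfg = TrainingConfig(
    lr_scheduler_type="warmup_stable_decay",
    lr_scheduler_kwargs={"num_stable_steps": 4000},
)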

View File

@@ -1,92 +0,0 @@
# Example OlmOCR Training Configuration
# Project metadata
project_name: olmocr-qwen-vl-training
run_name: qwen2.5-vl-7b-finetune
# Model configuration
model:
name: Qwen/Qwen2.5-VL-7B-Instruct
trust_remote_code: true
torch_dtype: auto
use_flash_attention: true
attn_implementation: sdpa
# LoRA settings (disabled by default)
use_lora: false
lora_rank: 8
lora_alpha: 32
lora_dropout: 0.1
lora_target_modules:
- q_proj
- v_proj
- k_proj
- o_proj
# Dataset configuration
dataset:
train:
- name: processed_01_books_train_iabooks
root_dir: /lustre/orion/csc652/proj-shared/jakep/olmocr_mix/processed_01_books_train_iabooks/
pipeline: &basic_pipeline
- name: FrontMatterParser
front_matter_class: PageResponse
- name: PDFRenderer
target_longest_image_dim: 1024
- name: StaticLengthDocumentAnchoring
target_anchor_text_len: 1000
- name: FinetuningPrompt
- name: FrontMatterOutputFormat
- name: InstructUserMessages
- name: Tokenizer
masking_index: -100
end_of_message_token: "<|im_end|>"
# Not putting in big bulk of data to speed up loading for debugging for now
# - name: processed_00_documents_train_s2pdf
# root_dir: /lustre/orion/csc652/proj-shared/jakep/olmocr_mix/processed_00_documents_train_s2pdf/
# pipeline: *basic_pipeline
eval:
- name: processed_00_documents_eval_s2pdf
root_dir: /lustre/orion/csc652/proj-shared/jakep/olmocr_mix/processed_00_documents_eval_s2pdf/
pipeline: *basic_pipeline
- name: processed_01_books_eval_iabooks
root_dir: /lustre/orion/csc652/proj-shared/jakep/olmocr_mix/processed_01_books_eval_iabooks/
pipeline: *basic_pipeline
# Training configuration
training:
output_dir: /lustre/orion/csc652/proj-shared/jakep/olmocr-trainer/
num_train_epochs: 1
# Batch size and accumulation
per_device_train_batch_size: 1
per_device_eval_batch_size: 1
gradient_accumulation_steps: 8
# Learning rate
learning_rate: 2e-5
lr_scheduler_type: cosine
warmup_ratio: 0.1
# Optimization
optim: adamw_torch
weight_decay: 0.01
max_grad_norm: 1.0
# Evaluation and checkpointing
evaluation_strategy: steps
eval_steps: 500
save_strategy: steps
save_steps: 500
save_total_limit: 3
load_best_model_at_end: true
metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
greater_is_better: false
report_to:
- wandb
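
The deleted example above (like the config modified below) shares one pipeline definition across datasets with a YAML anchor (&basic_pipeline) and alias (*basic_pipeline). A minimal sketch of how a YAML loader resolves that reuse; the dataset names here are placeholders, not the real ones:

import yaml

doc = yaml.safe_load("""
train:
  - name: dataset_a
    pipeline: &basic_pipeline
      - name: FrontMatterParser
      - name: PDFRenderer
eval:
  - name: dataset_b
    pipeline: *basic_pipeline
""")

# The alias expands to the same pipeline list, so every dataset entry
# referencing *basic_pipeline sees identical processing steps.
assert doc["eval"][0]["pipeline"] == doc["train"][0]["pipeline"]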

View File

@@ -2,7 +2,7 @@
# Project metadata
project_name: olmocr-qwen-vl-training
- run_name: qwen2.5-vl-7b-finetune
+ run_name: qwen2.5-vl-7b-finetune-day3-json
# Model configuration
model:
@@ -35,17 +35,16 @@ dataset:
- name: PDFRenderer
target_longest_image_dim: 1024
- name: StaticLengthDocumentAnchoring
- target_anchor_text_len: 1000
+ target_anchor_text_len: 3000
- name: FinetuningPrompt
- - name: FrontMatterOutputFormat
+ - name: JSONOutputFormat
- name: InstructUserMessages
- name: Tokenizer
masking_index: -100
end_of_message_token: "<|im_end|>"
- # Not putting in big bulk of data to speed up loading for debugging for now
- # - name: processed_00_documents_train_s2pdf
- # root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
- # pipeline: *basic_pipeline
+ - name: processed_00_documents_train_s2pdf
+ root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
+ pipeline: *basic_pipeline
eval:
- name: processed_00_documents_eval_s2pdf
@@ -56,6 +55,7 @@ dataset:
pipeline: *basic_pipeline
# Training configuration
training:
output_dir: /weka/oe-data-default/jakep/olmocr-trainer/
@@ -64,13 +64,17 @@ training:
# Batch size and accumulation
per_device_train_batch_size: 1
per_device_eval_batch_size: 1
- gradient_accumulation_steps: 8
+ gradient_accumulation_steps: 32
+ gradient_checkpointing: False
+ collator_max_token_len: 8192
# Learning rate
- learning_rate: 1e-6
- lr_scheduler_type: cosine
+ learning_rate: 2e-5
+ lr_scheduler_type: warmup_stable_decay
+ lr_scheduler_kwargs:
+ num_stable_steps: 4000
warmup_ratio: 0.1
# Optimization
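
The scheduler change above swaps cosine decay for warmup-stable-decay (WSD): the learning rate ramps up over the warmup fraction, holds at the peak for num_stable_steps, then decays over the remaining steps. An illustrative sketch of that shape using a plain LambdaLR; this is not the transformers implementation, and apart from num_stable_steps: 4000 the step counts are made-up examples:

import torch
from torch.optim.lr_scheduler import LambdaLR

def wsd_lambda(step, num_warmup_steps=500, num_stable_steps=4000, num_decay_steps=1500):
    # Linear warmup to the peak learning rate.
    if step < num_warmup_steps:
        return step / max(1, num_warmup_steps)
    # Stable plateau at the peak for num_stable_steps.
    if step < num_warmup_steps + num_stable_steps:
        return 1.0
    # Linear decay to zero over the remaining steps.
    done = step - num_warmup_steps - num_stable_steps
    return max(0.0, 1.0 - done / max(1, num_decay_steps))

model = torch.nn.Linear(4, 4)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = LambdaLR(optimizer, lr_lambda=wsd_lambda)
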
@@ -84,8 +88,8 @@ training:
eval_steps: 500
save_strategy: steps
save_steps: 500
- save_total_limit: 3
- load_best_model_at_end: true
+ save_total_limit: 5
+ load_best_model_at_end: false # Needs to be false because it has a problem restoring checkpoints for some reason
metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
greater_is_better: false

View File

@@ -202,6 +202,7 @@ def main():
learning_rate=float(config.training.learning_rate),
lr_scheduler_type=config.training.lr_scheduler_type,
warmup_ratio=config.training.warmup_ratio,
+ lr_scheduler_kwargs=config.training.lr_scheduler_kwargs,
optim=config.training.optim,
adam_beta1=config.training.adam_beta1,
adam_beta2=config.training.adam_beta2,
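
With the new field threaded through train.py, the Hugging Face TrainingArguments receives the scheduler kwargs directly. A minimal sketch of the resulting arguments for the WSD setup in this commit, assuming a transformers release that supports the warmup_stable_decay scheduler type and the lr_scheduler_kwargs field; output_dir is a placeholder:

from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="/tmp/olmocr-trainer",  # placeholder path
    learning_rate=2e-5,
    lr_scheduler_type="warmup_stable_decay",
    lr_scheduler_kwargs={"num_stable_steps": 4000},
    warmup_ratio=0.1,
    optim="adamw_torch",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=32,
)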