Mirror of https://github.com/allenai/olmocr.git, synced 2025-10-29 17:05:18 +00:00

Adding wsd as an option

This commit is contained in:
  parent 69581cca23
  commit 336b000416
@@ -155,6 +155,7 @@ class TrainingConfig:
     learning_rate: float = 2e-5
     lr_scheduler_type: str = "cosine"
     warmup_ratio: float = 0.1
+    lr_scheduler_kwargs: Dict[str, Any] = field(default_factory=dict)
 
     # Optimization
     optim: str = "adamw_torch"
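For context, "wsd" in the commit title refers to a warmup-stable-decay learning-rate schedule: the rate ramps up during warmup, holds at its peak for a fixed number of stable steps, then decays toward the end of training. The new `lr_scheduler_kwargs` field lets scheduler-specific values such as the number of stable steps be passed through from the config. A minimal sketch of the schedule shape, with a linear decay assumed purely for illustration (the exact decay curve used by the training library may differ):

    def wsd_lr_multiplier(step: int, num_warmup_steps: int, num_stable_steps: int, num_decay_steps: int) -> float:
        """Illustrative warmup-stable-decay multiplier applied to the peak learning rate."""
        if step < num_warmup_steps:
            # Warmup: ramp linearly from 0 up to the peak rate.
            return step / max(1, num_warmup_steps)
        if step < num_warmup_steps + num_stable_steps:
            # Stable: hold the peak rate.
            return 1.0
        # Decay: ramp back down to 0 (linear shape assumed for this sketch).
        decay_step = step - num_warmup_steps - num_stable_steps
        return max(0.0, 1.0 - decay_step / max(1, num_decay_steps))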
@@ -1,92 +0,0 @@
-# Example OlmOCR Training Configuration
-
-# Project metadata
-project_name: olmocr-qwen-vl-training
-run_name: qwen2.5-vl-7b-finetune
-
-# Model configuration
-model:
-  name: Qwen/Qwen2.5-VL-7B-Instruct
-  trust_remote_code: true
-  torch_dtype: auto
-  use_flash_attention: true
-  attn_implementation: sdpa
-
-  # LoRA settings (disabled by default)
-  use_lora: false
-  lora_rank: 8
-  lora_alpha: 32
-  lora_dropout: 0.1
-  lora_target_modules:
-    - q_proj
-    - v_proj
-    - k_proj
-    - o_proj
-
-# Dataset configuration
-dataset:
-
-  train:
-    - name: processed_01_books_train_iabooks
-      root_dir: /lustre/orion/csc652/proj-shared/jakep/olmocr_mix/processed_01_books_train_iabooks/
-      pipeline: &basic_pipeline
-        - name: FrontMatterParser
-          front_matter_class: PageResponse
-        - name: PDFRenderer
-          target_longest_image_dim: 1024
-        - name: StaticLengthDocumentAnchoring
-          target_anchor_text_len: 1000
-        - name: FinetuningPrompt
-        - name: FrontMatterOutputFormat
-        - name: InstructUserMessages
-        - name: Tokenizer
-          masking_index: -100
-          end_of_message_token: "<|im_end|>"
-    # Not putting in big bulk of data to speed up loading for debugging for now
-    # - name: processed_00_documents_train_s2pdf
-    #   root_dir: /lustre/orion/csc652/proj-shared/jakep/olmocr_mix/processed_00_documents_train_s2pdf/
-    #   pipeline: *basic_pipeline
-
-  eval:
-    - name: processed_00_documents_eval_s2pdf
-      root_dir: /lustre/orion/csc652/proj-shared/jakep/olmocr_mix/processed_00_documents_eval_s2pdf/
-      pipeline: *basic_pipeline
-    - name: processed_01_books_eval_iabooks
-      root_dir: /lustre/orion/csc652/proj-shared/jakep/olmocr_mix/processed_01_books_eval_iabooks/
-      pipeline: *basic_pipeline
-
-
-# Training configuration
-training:
-  output_dir: /lustre/orion/csc652/proj-shared/jakep/olmocr-trainer/
-  num_train_epochs: 1
-
-  # Batch size and accumulation
-  per_device_train_batch_size: 1
-  per_device_eval_batch_size: 1
-  gradient_accumulation_steps: 8
-
-  # Learning rate
-  learning_rate: 2e-5
-  lr_scheduler_type: cosine
-  warmup_ratio: 0.1
-
-  # Optimization
-  optim: adamw_torch
-  weight_decay: 0.01
-  max_grad_norm: 1.0
-
-
-  # Evaluation and checkpointing
-  evaluation_strategy: steps
-  eval_steps: 500
-  save_strategy: steps
-  save_steps: 500
-  save_total_limit: 3
-  load_best_model_at_end: true
-  metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
-  greater_is_better: false
-
-  report_to:
-    - wandb
-
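The file removed above was the example YAML training configuration for the lustre setup. One detail worth noting is the `&basic_pipeline` anchor and the `*basic_pipeline` aliases: these are ordinary YAML anchors, so every dataset entry that references the alias receives the same pipeline list when the file is parsed. A minimal sketch of reading such a config with PyYAML follows; the file path is a placeholder, and the project may well use its own config loader rather than raw PyYAML:

    import yaml

    with open("example_config.yaml") as f:  # placeholder path
        cfg = yaml.safe_load(f)

    # Anchors/aliases are resolved at load time, so each entry that used
    # *basic_pipeline now holds the full pipeline list.
    print(cfg["dataset"]["eval"][0]["pipeline"][0])  # {'name': 'FrontMatterParser', ...}

    # Note: PyYAML loads 2e-5 as the string '2e-5' under YAML 1.1 rules,
    # which is presumably why the training code wraps it in float(...).
    print(cfg["training"]["learning_rate"])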
@@ -2,7 +2,7 @@
 # Project metadata
 project_name: olmocr-qwen-vl-training
-run_name: qwen2.5-vl-7b-finetune
+run_name: qwen2.5-vl-7b-finetune-day3-json
 
 # Model configuration
 model:
@@ -35,17 +35,16 @@ dataset:
         - name: PDFRenderer
           target_longest_image_dim: 1024
         - name: StaticLengthDocumentAnchoring
-          target_anchor_text_len: 1000
+          target_anchor_text_len: 3000
         - name: FinetuningPrompt
-        - name: FrontMatterOutputFormat
+        - name: JSONOutputFormat
         - name: InstructUserMessages
         - name: Tokenizer
           masking_index: -100
           end_of_message_token: "<|im_end|>"
-    # Not putting in big bulk of data to speed up loading for debugging for now
-    # - name: processed_00_documents_train_s2pdf
-    #   root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
-    #   pipeline: *basic_pipeline
+    - name: processed_00_documents_train_s2pdf
+      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
+      pipeline: *basic_pipeline
 
   eval:
     - name: processed_00_documents_eval_s2pdf
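Swapping the FrontMatterOutputFormat step for JSONOutputFormat (alongside the longer 3000-character anchor text and the now-uncommented s2pdf training split) changes the target text the model is trained to emit. Purely as a hypothetical illustration of that difference, with field names assumed rather than taken from the repo's actual PageResponse schema:

    import json

    # Hypothetical page fields; the real ones come from olmocr's PageResponse.
    page = {"primary_language": "en", "is_rotation_valid": True, "natural_text": "Hello world."}

    # JSON-style target, roughly what a JSONOutputFormat-style step would produce.
    json_target = json.dumps(page, ensure_ascii=False)

    # Front-matter-style target: a YAML-like header followed by the page text.
    front_matter_target = (
        "---\n"
        f"primary_language: {page['primary_language']}\n"
        f"is_rotation_valid: {page['is_rotation_valid']}\n"
        "---\n"
        f"{page['natural_text']}"
    )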
@@ -56,6 +55,7 @@ dataset:
       pipeline: *basic_pipeline
 
+
 
 # Training configuration
 training:
   output_dir: /weka/oe-data-default/jakep/olmocr-trainer/
@@ -64,13 +64,17 @@ training:
   # Batch size and accumulation
   per_device_train_batch_size: 1
   per_device_eval_batch_size: 1
-  gradient_accumulation_steps: 8
+  gradient_accumulation_steps: 32
 
   gradient_checkpointing: False
+
+  collator_max_token_len: 8192
 
   # Learning rate
-  learning_rate: 1e-6
-  lr_scheduler_type: cosine
+  learning_rate: 2e-5
+  lr_scheduler_type: warmup_stable_decay
+  lr_scheduler_kwargs:
+    num_stable_steps: 4000
   warmup_ratio: 0.1
 
   # Optimization
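Two of the changes above interact: raising gradient_accumulation_steps from 8 to 32 quadruples the number of sequences per optimizer step, and the new warmup_stable_decay schedule then holds the peak 2e-5 learning rate for 4000 of those larger steps. A quick back-of-envelope for the effective batch size, assuming an 8-GPU node purely for illustration (the world size is not stated anywhere in this diff):

    per_device_train_batch_size = 1
    gradient_accumulation_steps = 32
    world_size = 8  # assumption for illustration only

    effective_batch = per_device_train_batch_size * gradient_accumulation_steps * world_size
    print(effective_batch)  # 256 sequences per optimizer step under this assumption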
@@ -84,8 +88,8 @@ training:
   eval_steps: 500
   save_strategy: steps
   save_steps: 500
-  save_total_limit: 3
-  load_best_model_at_end: true
+  save_total_limit: 5
+  load_best_model_at_end: false # Needs to be false because it has a problem restoring checkpoints for some reason
   metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
   greater_is_better: false
 
@@ -202,6 +202,7 @@ def main():
         learning_rate=float(config.training.learning_rate),
         lr_scheduler_type=config.training.lr_scheduler_type,
         warmup_ratio=config.training.warmup_ratio,
+        lr_scheduler_kwargs=config.training.lr_scheduler_kwargs,
         optim=config.training.optim,
         adam_beta1=config.training.adam_beta1,
         adam_beta2=config.training.adam_beta2,
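With `lr_scheduler_kwargs` now forwarded from the config, the values set in the YAML above reach the Hugging Face TrainingArguments roughly as follows. This is a standalone sketch rather than the project's actual call: in the real code the values come from the parsed config object, and the warmup_stable_decay scheduler together with lr_scheduler_kwargs assumes a sufficiently recent transformers release that ships both.

    from transformers import TrainingArguments

    args = TrainingArguments(
        output_dir="out",                                # placeholder
        learning_rate=2e-5,
        lr_scheduler_type="warmup_stable_decay",
        lr_scheduler_kwargs={"num_stable_steps": 4000},  # forwarded to the scheduler factory
        warmup_ratio=0.1,
        gradient_accumulation_steps=32,
        optim="adamw_torch",
    )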