From c075f3071f2f9c23f50eba87d8b2a78f72886c5b Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Sat, 16 Aug 2025 17:31:42 +0000 Subject: [PATCH] New configs for new data --- .../qwen25_vl_olmocrv3_1epoch.yaml | 0 .../qwen25_vl_olmocrv3_rotation_1epoch.yaml | 0 .../qwen25_vl_olmocrv3_rotation_2epoch.yaml | 0 ...vl_olmocrv3_rotation_2epoch_resumable.yaml | 0 ...5_vl_olmocrv3_rotation_augment_1epoch.yaml | 0 ...rv3_rotation_augment_1epoch_resumable.yaml | 0 ...rv3_rotation_augment_2epoch_resumable.yaml | 0 ...qwen25_vl_olmocrv3_rotation_localtest.yaml | 0 ...5_vl_olmocrv3_rotation_tokflip_1epoch.yaml | 0 .../qwen25_vl_olmocrv4_rotation_1epoch.yaml | 105 ++++++++++++++++++ scripts/train/newtrainer-beaker.sh | 2 +- 11 files changed, 106 insertions(+), 1 deletion(-) rename olmocr/train/configs/{ => v0.3.0}/qwen25_vl_olmocrv3_1epoch.yaml (100%) rename olmocr/train/configs/{ => v0.3.0}/qwen25_vl_olmocrv3_rotation_1epoch.yaml (100%) rename olmocr/train/configs/{ => v0.3.0}/qwen25_vl_olmocrv3_rotation_2epoch.yaml (100%) rename olmocr/train/configs/{ => v0.3.0}/qwen25_vl_olmocrv3_rotation_2epoch_resumable.yaml (100%) rename olmocr/train/configs/{ => v0.3.0}/qwen25_vl_olmocrv3_rotation_augment_1epoch.yaml (100%) rename olmocr/train/configs/{ => v0.3.0}/qwen25_vl_olmocrv3_rotation_augment_1epoch_resumable.yaml (100%) rename olmocr/train/configs/{ => v0.3.0}/qwen25_vl_olmocrv3_rotation_augment_2epoch_resumable.yaml (100%) rename olmocr/train/configs/{ => v0.3.0}/qwen25_vl_olmocrv3_rotation_localtest.yaml (100%) rename olmocr/train/configs/{ => v0.3.0}/qwen25_vl_olmocrv3_rotation_tokflip_1epoch.yaml (100%) create mode 100644 olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_rotation_1epoch.yaml diff --git a/olmocr/train/configs/qwen25_vl_olmocrv3_1epoch.yaml b/olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_1epoch.yaml similarity index 100% rename from olmocr/train/configs/qwen25_vl_olmocrv3_1epoch.yaml rename to olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_1epoch.yaml diff --git a/olmocr/train/configs/qwen25_vl_olmocrv3_rotation_1epoch.yaml b/olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_1epoch.yaml similarity index 100% rename from olmocr/train/configs/qwen25_vl_olmocrv3_rotation_1epoch.yaml rename to olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_1epoch.yaml diff --git a/olmocr/train/configs/qwen25_vl_olmocrv3_rotation_2epoch.yaml b/olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_2epoch.yaml similarity index 100% rename from olmocr/train/configs/qwen25_vl_olmocrv3_rotation_2epoch.yaml rename to olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_2epoch.yaml diff --git a/olmocr/train/configs/qwen25_vl_olmocrv3_rotation_2epoch_resumable.yaml b/olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_2epoch_resumable.yaml similarity index 100% rename from olmocr/train/configs/qwen25_vl_olmocrv3_rotation_2epoch_resumable.yaml rename to olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_2epoch_resumable.yaml diff --git a/olmocr/train/configs/qwen25_vl_olmocrv3_rotation_augment_1epoch.yaml b/olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_augment_1epoch.yaml similarity index 100% rename from olmocr/train/configs/qwen25_vl_olmocrv3_rotation_augment_1epoch.yaml rename to olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_augment_1epoch.yaml diff --git a/olmocr/train/configs/qwen25_vl_olmocrv3_rotation_augment_1epoch_resumable.yaml b/olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_augment_1epoch_resumable.yaml similarity index 100% rename from olmocr/train/configs/qwen25_vl_olmocrv3_rotation_augment_1epoch_resumable.yaml rename to olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_augment_1epoch_resumable.yaml diff --git a/olmocr/train/configs/qwen25_vl_olmocrv3_rotation_augment_2epoch_resumable.yaml b/olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_augment_2epoch_resumable.yaml similarity index 100% rename from olmocr/train/configs/qwen25_vl_olmocrv3_rotation_augment_2epoch_resumable.yaml rename to olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_augment_2epoch_resumable.yaml diff --git a/olmocr/train/configs/qwen25_vl_olmocrv3_rotation_localtest.yaml b/olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_localtest.yaml similarity index 100% rename from olmocr/train/configs/qwen25_vl_olmocrv3_rotation_localtest.yaml rename to olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_localtest.yaml diff --git a/olmocr/train/configs/qwen25_vl_olmocrv3_rotation_tokflip_1epoch.yaml b/olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_tokflip_1epoch.yaml similarity index 100% rename from olmocr/train/configs/qwen25_vl_olmocrv3_rotation_tokflip_1epoch.yaml rename to olmocr/train/configs/v0.3.0/qwen25_vl_olmocrv3_rotation_tokflip_1epoch.yaml diff --git a/olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_rotation_1epoch.yaml b/olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_rotation_1epoch.yaml new file mode 100644 index 0000000..ad213fc --- /dev/null +++ b/olmocr/train/configs/v0.4.0/qwen25_vl_olmocrv4_rotation_1epoch.yaml @@ -0,0 +1,105 @@ +# Example OlmOCR Training Configuration with Torch Compile + +# Project metadata +project_name: olmocr-qwen-vl-training +run_name: qwen2.5-vl-7b-olmocrv4_1epoch_baseline + +# Model configuration +model: + name: Qwen/Qwen2.5-VL-7B-Instruct + trust_remote_code: true + torch_dtype: bfloat16 + use_flash_attention: true + attn_implementation: flash_attention_2 + + # LoRA settings (disabled by default) + use_lora: false + # lora_rank: 8 + # lora_alpha: 32 + # lora_dropout: 0.1 + # lora_target_modules: + # - q_proj + # - v_proj + # - k_proj + # - o_proj + +# Dataset configuration +dataset: + + train: + - name: processed_01_books_train_iabooks + root_dir: /data/olmOCR-mix-0825/processed_01_books_train_iabooks/ + pipeline: &basic_pipeline + - name: FrontMatterParser + front_matter_class: PageResponse + - name: FilterOutRotatedDocuments + - name: PDFRenderer + target_longest_image_dim: 1288 + - name: RotationAugmentation + probability: 0.002 + - name: NewYamlFinetuningPromptWithNoAnchoring + - name: FrontMatterOutputFormat + - name: InstructUserMessages + prompt_first: true + - name: Tokenizer + masking_index: -100 + end_of_message_token: "<|im_end|>" + - name: processed_00_documents_train_s2pdf + root_dir: /data/olmOCR-mix-0825/processed_00_documents_train_s2pdf/ + pipeline: *basic_pipeline + + eval: + - name: processed_00_documents_eval_s2pdf + root_dir: /data/olmOCR-mix-0825/processed_00_documents_eval_s2pdf/ + pipeline: *basic_pipeline + - name: processed_01_books_eval_iabooks + root_dir: /data/olmOCR-mix-0825/processed_01_books_eval_iabooks/ + pipeline: *basic_pipeline + + +# Training configuration +training: + output_dir: /weka/oe-data-default/jakep/olmocr-trainer/ + num_train_epochs: 1 + + # Batch size and accumulation + per_device_train_batch_size: 1 + per_device_eval_batch_size: 1 + gradient_accumulation_steps: 32 + + gradient_checkpointing: False + + collator_max_token_len: 8192 + + # Learning rate + learning_rate: 2e-5 + lr_scheduler_type: linear + warmup_ratio: 0.1 + + # Optimization + optim: adamw_torch + weight_decay: 0.01 + max_grad_norm: 1.0 + + # Torch compile settings + torch_compile: true + torch_compile_backend: inductor + torch_compile_mode: default + torch_compile_fullgraph: false + torch_compile_dynamic: false + + seed: 300 + data_seed: 301 + + # Evaluation and checkpointing + evaluation_strategy: steps + eval_steps: 500 + save_strategy: steps + save_steps: 500 + save_total_limit: 5 + load_best_model_at_end: false # Needs to be false because it has a problem restoring checkpoints for some reason + metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss + greater_is_better: false + + report_to: + - wandb \ No newline at end of file diff --git a/scripts/train/newtrainer-beaker.sh b/scripts/train/newtrainer-beaker.sh index fdee000..f08d216 100755 --- a/scripts/train/newtrainer-beaker.sh +++ b/scripts/train/newtrainer-beaker.sh @@ -96,7 +96,7 @@ commands = [ "pip install transformers==4.52.4", "pip install flash-attn==2.8.0.post2 --no-build-isolation", "pip install s5cmd", - "s5cmd sync s3://ai2-oe-data/jakep/olmocr/olmOCR-mix-0225/preprocessed_v0_2_3/* /data/olmOCR-mix-0225/", + "s5cmd sync s3://ai2-oe-data/jakep/olmocr/olmOCR-mix-0825/processed_* /data/olmOCR-mix-0825/", f"python -m olmocr.train.train --config {config}" ]