From 7dca33db60e3dbb69e6ee2ae15370c5cdace2302 Mon Sep 17 00:00:00 2001
From: Jake Poznanski
Date: Tue, 5 Aug 2025 16:34:46 +0000
Subject: [PATCH] Getting things ready for a bit more augmentation

---
 olmocr/train/config.py                        |   2 +-
 ...5_vl_olmocrv3_rotation_tokflip_1epoch.yaml | 107 ++++++++++++++++++
 olmocr/train/dataloader.py                    |  11 +-
 pyproject.toml                                |   1 +
 4 files changed, 112 insertions(+), 9 deletions(-)
 create mode 100644 olmocr/train/configs/qwen25_vl_olmocrv3_rotation_tokflip_1epoch.yaml

diff --git a/olmocr/train/config.py b/olmocr/train/config.py
index efe7f8c..d28d597 100644
--- a/olmocr/train/config.py
+++ b/olmocr/train/config.py
@@ -386,7 +386,7 @@ class Config:
 
         elif step_name == "PDFRenderer":
             steps.append(
-                PDFRenderer(target_longest_image_dim=step_config.get("target_longest_image_dim", 1024), image_transform=None)  # Can be extended later
+                PDFRenderer(target_longest_image_dim=step_config.get("target_longest_image_dim", 1024))
             )
 
         elif step_name == "StaticLengthDocumentAnchoring":
diff --git a/olmocr/train/configs/qwen25_vl_olmocrv3_rotation_tokflip_1epoch.yaml b/olmocr/train/configs/qwen25_vl_olmocrv3_rotation_tokflip_1epoch.yaml
new file mode 100644
index 0000000..000867f
--- /dev/null
+++ b/olmocr/train/configs/qwen25_vl_olmocrv3_rotation_tokflip_1epoch.yaml
@@ -0,0 +1,107 @@
+# Example OlmOCR Training Configuration with Torch Compile
+
+# Project metadata
+project_name: olmocr-qwen-vl-training
+run_name: qwen2.5-vl-7b-olmocrv3_1epoch_prompt_first_rotation_tokflip
+
+# Model configuration
+model:
+  name: Qwen/Qwen2.5-VL-7B-Instruct
+  trust_remote_code: true
+  torch_dtype: bfloat16
+  use_flash_attention: true
+  attn_implementation: flash_attention_2
+
+  # LoRA settings (disabled by default)
+  use_lora: false
+  # lora_rank: 8
+  # lora_alpha: 32
+  # lora_dropout: 0.1
+  # lora_target_modules:
+  #   - q_proj
+  #   - v_proj
+  #   - k_proj
+  #   - o_proj
+
+# Dataset configuration
+dataset:
+
+  train:
+    - name: processed_01_books_train_iabooks
+      root_dir: /data/olmOCR-mix-0225/processed_01_books_train_iabooks/
+      pipeline: &basic_pipeline
+        - name: FrontMatterParser
+          front_matter_class: PageResponse
+        - name: FilterOutRotatedDocuments
+        - name: PDFRenderer
+          target_longest_image_dim: 1288
+        - name: RotationAugmentation
+          probability: 0.002
+        - name: NewYamlFinetuningPromptWithNoAnchoring
+        - name: FrontMatterOutputFormat
+        - name: InstructUserMessages
+          prompt_first: true
+        - name: Tokenizer
+          masking_index: -100
+          end_of_message_token: "<|im_end|>"
+        - name: RandomTokenFlipper
+          token_flip_rate: 0.0001
+    - name: processed_00_documents_train_s2pdf
+      root_dir: /data/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
+      pipeline: *basic_pipeline
+
+  eval:
+    - name: processed_00_documents_eval_s2pdf
+      root_dir: /data/olmOCR-mix-0225/processed_00_documents_eval_s2pdf/
+      pipeline: *basic_pipeline
+    - name: processed_01_books_eval_iabooks
+      root_dir: /data/olmOCR-mix-0225/processed_01_books_eval_iabooks/
+      pipeline: *basic_pipeline
+
+
+# Training configuration
+training:
+  output_dir: /weka/oe-data-default/jakep/olmocr-trainer/
+  num_train_epochs: 1
+
+  # Batch size and accumulation
+  per_device_train_batch_size: 1
+  per_device_eval_batch_size: 1
+  gradient_accumulation_steps: 32
+
+  gradient_checkpointing: False
+
+  collator_max_token_len: 8192
+
+  # Learning rate
+  learning_rate: 2e-5
+  lr_scheduler_type: linear
+  warmup_ratio: 0.1
+
+  # Optimization
+  optim: adamw_torch
+  weight_decay: 0.01
+  max_grad_norm: 1.0
+
+  # Torch compile settings
+  torch_compile: true
+  torch_compile_backend: inductor
+  torch_compile_mode: default
+  torch_compile_fullgraph: false
+  torch_compile_dynamic: false
+
+  seed: 300
+  data_seed: 301
+
+  # Evaluation and checkpointing
+  evaluation_strategy: steps
+  eval_steps: 500
+  save_strategy: steps
+  save_steps: 500
+  save_total_limit: 5
+  load_best_model_at_end: false # Needs to be false because it has a problem restoring checkpoints for some reason
+  metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
+  greater_is_better: false
+
+  report_to:
+    - wandb
\ No newline at end of file
diff --git a/olmocr/train/dataloader.py b/olmocr/train/dataloader.py
index b4493a7..7565106 100644
--- a/olmocr/train/dataloader.py
+++ b/olmocr/train/dataloader.py
@@ -281,7 +281,6 @@ class PDFRenderer(PipelineStep):
     """Pipeline step that renders PDF to image."""
 
     target_longest_image_dim: int
-    image_transform: Optional[Callable] = None
 
     def __call__(self, sample: Sample) -> Sample:
         """Render PDF to image."""
@@ -290,10 +289,6 @@ class PDFRenderer(PipelineStep):
         png_bytes = base64.b64decode(base64_png)
         image = Image.open(BytesIO(png_bytes))
 
-        # Apply transform if provided
-        if self.image_transform:
-            image = self.image_transform(image)
-
         # Update sample
         sample["image"] = image
 
@@ -524,6 +519,7 @@ class FilterOutRotatedDocuments(PipelineStep):
         return sample
 
 
+@dataclass(frozen=True, slots=True)
 class InstructUserMessages(PipelineStep):
     """Creates instruction-following messages format for training."""
 
@@ -670,20 +666,19 @@ class RandomTokenFlipper(PipelineStep):
 class MarkdownPDFDocumentDataset(BaseMarkdownPDFDataset):
     """Dataset that includes front matter parsing and PDF rendering by default."""
 
-    def __init__(self, root_dir: str | PathLike, target_longest_image_dim: int, image_transform=None, front_matter_class=None):
+    def __init__(self, root_dir: str | PathLike, target_longest_image_dim: int, front_matter_class=None):
         """
         Initialize the dataset with default pipeline steps.
 
         Args:
             root_dir: Path to the root folder containing processed markdown and PDF files
             target_longest_image_dim: Target dimension for the longest side of the image
-            image_transform: Optional transform to apply to the PDF images
            front_matter_class: Optional dataclass type to validate front matter against
         """
         # Create default pipeline steps
         pipeline_steps = [
             FrontMatterParser(front_matter_class),
-            PDFRenderer(target_longest_image_dim, image_transform),
+            PDFRenderer(target_longest_image_dim),
             StaticLengthDocumentAnchoring(target_anchor_text_len=6000),
             FinetuningPrompt(),
             FrontMatterOutputFormat(),
diff --git a/pyproject.toml b/pyproject.toml
index 19b2754..df8dc76 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -106,6 +106,7 @@ train = [
     "s3fs",
     "necessary",
     "einops",
+    "augraphy",
 ]
 
 elo = [
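
For context on the dataloader change above: with the image_transform hook removed, image augmentation is meant to come from dedicated pipeline steps instead (the new YAML wires in RotationAugmentation before prompting and RandomTokenFlipper after tokenization). A minimal usage sketch of the updated MarkdownPDFDocumentDataset constructor follows; the root_dir path is a placeholder, and indexing assumes the usual PyTorch-style Dataset interface:

    from olmocr.train.dataloader import MarkdownPDFDocumentDataset

    # image_transform no longer exists as a constructor argument; the default
    # pipeline renders each PDF page straight to a PIL image.
    dataset = MarkdownPDFDocumentDataset(
        root_dir="/data/olmOCR-mix-0225/processed_00_documents_train_s2pdf/",  # placeholder path
        target_longest_image_dim=1288,
    )
    sample = dataset[0]  # assumed PyTorch-style __getitem__; sample["image"] holds the rendered page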
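
The RotationAugmentation step referenced in the YAML (probability: 0.002) does not appear in this patch, so the following is only a plausible sketch of such a step under the same PipelineStep calling convention, not the repository's implementation; the restriction to right-angle rotations is an assumption:

    import random
    from dataclasses import dataclass

    @dataclass(frozen=True, slots=True)
    class RotationAugmentationSketch:
        """Illustrative only: rarely rotate the rendered page image."""
        probability: float = 0.002

        def __call__(self, sample: dict) -> dict:
            if random.random() < self.probability:
                # Assumed right-angle rotations, mirroring the kind of rotated
                # pages that FilterOutRotatedDocuments screens out of the data.
                angle = random.choice([90, 180, 270])
                sample["image"] = sample["image"].rotate(angle, expand=True)
            return sample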
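
Likewise, RandomTokenFlipper's body only appears as diff context here; this is a rough sketch of what a token_flip_rate: 0.0001 knob plausibly controls, with the restriction to label-masked positions (the YAML's masking_index: -100) being an assumption about intent rather than confirmed behavior:

    import random
    from dataclasses import dataclass

    @dataclass(frozen=True, slots=True)
    class RandomTokenFlipperSketch:
        """Illustrative only: corrupt a tiny fraction of input tokens with random ids."""
        vocab_size: int
        token_flip_rate: float = 0.0001

        def __call__(self, sample: dict) -> dict:
            input_ids = list(sample["input_ids"])
            for i, label in enumerate(sample["labels"]):
                # Assumption: only flip positions masked out of the loss (-100),
                # so the supervised target text itself is never corrupted.
                if label == -100 and random.random() < self.token_flip_rate:
                    input_ids[i] = random.randrange(self.vocab_size)
            sample["input_ids"] = input_ids
            return sample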