mirror of https://github.com/allenai/olmocr.git (synced 2025-10-12 16:52:20 +00:00)

Getting things ready for a bit more augmentation

commit 7dca33db60
parent 55f8ba0ac0
@@ -386,7 +386,7 @@ class Config:
 
         elif step_name == "PDFRenderer":
             steps.append(
-                PDFRenderer(target_longest_image_dim=step_config.get("target_longest_image_dim", 1024), image_transform=None)  # Can be extended later
+                PDFRenderer(target_longest_image_dim=step_config.get("target_longest_image_dim", 1024))
             )
 
         elif step_name == "StaticLengthDocumentAnchoring":
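For context, the hunk above is part of a step factory in Config: each elif branch builds one pipeline step, pulling its parameters from step_config with a fallback default. Below is a minimal, self-contained sketch of that dispatch pattern; the stand-in dataclasses and the build_steps helper are illustrative only, not the repo's actual code.

# Illustrative sketch: stand-in dataclasses mimic the real pipeline steps, and
# build_steps mirrors the elif-dispatch shown in the diff above.
from dataclasses import dataclass

@dataclass(frozen=True)
class PDFRenderer:
    target_longest_image_dim: int = 1024

@dataclass(frozen=True)
class StaticLengthDocumentAnchoring:
    target_anchor_text_len: int = 6000

def build_steps(step_configs: list[dict]) -> list:
    steps = []
    for step_config in step_configs:
        step_name = step_config["name"]
        if step_name == "PDFRenderer":
            # Missing keys fall back to defaults via dict.get, as in the hunk.
            steps.append(PDFRenderer(target_longest_image_dim=step_config.get("target_longest_image_dim", 1024)))
        elif step_name == "StaticLengthDocumentAnchoring":
            steps.append(StaticLengthDocumentAnchoring(target_anchor_text_len=step_config.get("target_anchor_text_len", 6000)))
        else:
            raise ValueError(f"Unknown pipeline step: {step_name}")
    return steps

print(build_steps([{"name": "PDFRenderer", "target_longest_image_dim": 1288}]))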
@@ -0,0 +1,107 @@
+# Example OlmOCR Training Configuration with Torch Compile
+
+# Project metadata
+project_name: olmocr-qwen-vl-training
+run_name: qwen2.5-vl-7b-olmocrv3_1epoch_prompt_first_rotation_tokflip
+
+# Model configuration
+model:
+  name: Qwen/Qwen2.5-VL-7B-Instruct
+  trust_remote_code: true
+  torch_dtype: bfloat16
+  use_flash_attention: true
+  attn_implementation: flash_attention_2
+
+  # LoRA settings (disabled by default)
+  use_lora: false
+  # lora_rank: 8
+  # lora_alpha: 32
+  # lora_dropout: 0.1
+  # lora_target_modules:
+  #   - q_proj
+  #   - v_proj
+  #   - k_proj
+  #   - o_proj
+
+# Dataset configuration
+dataset:
+
+  train:
+    - name: processed_01_books_train_iabooks
+      root_dir: /data/olmOCR-mix-0225/processed_01_books_train_iabooks/
+      pipeline: &basic_pipeline
+        - name: FrontMatterParser
+          front_matter_class: PageResponse
+        - name: FilterOutRotatedDocuments
+        - name: PDFRenderer
+          target_longest_image_dim: 1288
+        - name: RotationAugmentation
+          probability: 0.002
+        - name: NewYamlFinetuningPromptWithNoAnchoring
+        - name: FrontMatterOutputFormat
+        - name: InstructUserMessages
+          prompt_first: true
+        - name: Tokenizer
+          masking_index: -100
+          end_of_message_token: "<|im_end|>"
+        - name: RandomTokenFlipper
+          token_flip_rate: 0.0001
+    - name: processed_00_documents_train_s2pdf
+      root_dir: /data/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
+      pipeline: *basic_pipeline
+
+  eval:
+    - name: processed_00_documents_eval_s2pdf
+      root_dir: /data/olmOCR-mix-0225/processed_00_documents_eval_s2pdf/
+      pipeline: *basic_pipeline
+    - name: processed_01_books_eval_iabooks
+      root_dir: /data/olmOCR-mix-0225/processed_01_books_eval_iabooks/
+      pipeline: *basic_pipeline
+
+
+# Training configuration
+training:
+  output_dir: /weka/oe-data-default/jakep/olmocr-trainer/
+  num_train_epochs: 1
+
+  # Batch size and accumulation
+  per_device_train_batch_size: 1
+  per_device_eval_batch_size: 1
+  gradient_accumulation_steps: 32
+
+  gradient_checkpointing: False
+
+  collator_max_token_len: 8192
+
+  # Learning rate
+  learning_rate: 2e-5
+  lr_scheduler_type: linear
+  warmup_ratio: 0.1
+
+  # Optimization
+  optim: adamw_torch
+  weight_decay: 0.01
+  max_grad_norm: 1.0
+
+  # Torch compile settings
+  torch_compile: true
+  torch_compile_backend: inductor
+  torch_compile_mode: default
+  torch_compile_fullgraph: false
+  torch_compile_dynamic: false
+
+  seed: 300
+  data_seed: 301
+
+  # Evaluation and checkpointing
+  evaluation_strategy: steps
+  eval_steps: 500
+  save_strategy: steps
+  save_steps: 500
+  save_total_limit: 5
+  load_best_model_at_end: false  # Needs to be false because it has a problem restoring checkpoints for some reason
+  metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
+  greater_is_better: false
+
+  report_to:
+    - wandb
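One detail worth noting in the new config file: pipeline: &basic_pipeline defines a YAML anchor that every other dataset entry reuses via *basic_pipeline, so all splits share a single pipeline definition. Below is a minimal sketch of how a standard YAML loader resolves that; the inline snippet uses hypothetical entry names and PyYAML's safe_load, while the repo's own config loader may differ.

# Minimal sketch: YAML anchors/aliases are expanded by the loader itself, so
# every dataset entry ends up with the same fully-resolved pipeline list.
import yaml

config_text = """
train:
  - name: books            # hypothetical entry names for illustration
    pipeline: &basic_pipeline
      - name: PDFRenderer
        target_longest_image_dim: 1288
      - name: RandomTokenFlipper
        token_flip_rate: 0.0001
  - name: documents
    pipeline: *basic_pipeline
"""

config = yaml.safe_load(config_text)
assert config["train"][0]["pipeline"] == config["train"][1]["pipeline"]
print(config["train"][1]["pipeline"][0]["name"])  # PDFRenderer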
@@ -281,7 +281,6 @@ class PDFRenderer(PipelineStep):
     """Pipeline step that renders PDF to image."""
 
     target_longest_image_dim: int
-    image_transform: Optional[Callable] = None
 
     def __call__(self, sample: Sample) -> Sample:
         """Render PDF to image."""
@@ -290,10 +289,6 @@
         png_bytes = base64.b64decode(base64_png)
         image = Image.open(BytesIO(png_bytes))
 
-        # Apply transform if provided
-        if self.image_transform:
-            image = self.image_transform(image)
-
         # Update sample
         sample["image"] = image
 
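The two hunks above drop the optional image_transform hook from PDFRenderer, so the step now only decodes the rendered page and stores it on the sample. Here is a self-contained sketch of that decode-and-store contract; the sample key name and the tiny generated PNG are assumptions for illustration, since the real step renders an actual PDF page.

# Sketch of the contract suggested by the diff: read base64-encoded PNG bytes,
# decode them with Pillow, and store the image back under sample["image"].
import base64
from io import BytesIO
from PIL import Image

def render_step(sample: dict) -> dict:
    png_bytes = base64.b64decode(sample["base64_png"])  # key name is assumed
    sample["image"] = Image.open(BytesIO(png_bytes))
    return sample

# Build a throwaway 8x8 PNG so the sketch runs end to end.
buf = BytesIO()
Image.new("RGB", (8, 8), "white").save(buf, format="PNG")
sample = render_step({"base64_png": base64.b64encode(buf.getvalue())})
print(sample["image"].size)  # (8, 8)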
@@ -524,6 +519,7 @@ class FilterOutRotatedDocuments(PipelineStep):
         return sample
 
+
 
 @dataclass(frozen=True, slots=True)
 class InstructUserMessages(PipelineStep):
     """Creates instruction-following messages format for training."""
@@ -670,20 +666,19 @@ class RandomTokenFlipper(PipelineStep):
 class MarkdownPDFDocumentDataset(BaseMarkdownPDFDataset):
     """Dataset that includes front matter parsing and PDF rendering by default."""
 
-    def __init__(self, root_dir: str | PathLike, target_longest_image_dim: int, image_transform=None, front_matter_class=None):
+    def __init__(self, root_dir: str | PathLike, target_longest_image_dim: int, front_matter_class=None):
         """
         Initialize the dataset with default pipeline steps.
 
         Args:
             root_dir: Path to the root folder containing processed markdown and PDF files
             target_longest_image_dim: Target dimension for the longest side of the image
-            image_transform: Optional transform to apply to the PDF images
            front_matter_class: Optional dataclass type to validate front matter against
         """
         # Create default pipeline steps
         pipeline_steps = [
             FrontMatterParser(front_matter_class),
-            PDFRenderer(target_longest_image_dim, image_transform),
+            PDFRenderer(target_longest_image_dim),
             StaticLengthDocumentAnchoring(target_anchor_text_len=6000),
             FinetuningPrompt(),
             FrontMatterOutputFormat(),
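With image_transform removed from the default pipeline, the dataset constructor now takes only the root directory, the target image dimension, and an optional front-matter class. A hedged usage sketch follows; the import paths and the location of PageResponse are assumptions and may not match the repository layout.

# Usage sketch of the updated constructor; the import paths below are assumed,
# not verified against the repository.
from olmocr.train.dataloader import MarkdownPDFDocumentDataset  # assumed path
from olmocr.prompts import PageResponse                          # assumed path

dataset = MarkdownPDFDocumentDataset(
    root_dir="/data/olmOCR-mix-0225/processed_00_documents_train_s2pdf/",
    target_longest_image_dim=1288,
    front_matter_class=PageResponse,
)
print(len(dataset))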
@@ -106,6 +106,7 @@ train = [
     "s3fs",
     "necessary",
     "einops",
+    "augraphy",
 ]
 
 elo = [
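The new augraphy dependency lines up with the commit message: augraphy generates document-noise augmentations (ink bleed, paper texture, scanner artifacts) that an image-augmentation step could use later. A heavily hedged sketch of its stock pipeline follows; default_augraphy_pipeline is augraphy's documented entry point, but the exact return type can differ between versions, so treat this as an assumption rather than the repo's usage.

# Hedged sketch: run augraphy's stock augmentation pipeline over a blank page.
# Depending on the installed augraphy version, the pipeline call may return the
# augmented array directly or a dict of intermediate results.
import numpy as np
from augraphy import default_augraphy_pipeline

page = np.full((512, 512, 3), 255, dtype=np.uint8)  # plain white "scan"
pipeline = default_augraphy_pipeline()
augmented = pipeline(page)
print(type(augmented))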