Add some new rotation tests to a branch of the bench

2025-10-27 16:12:13 +00:00 · 2025-08-25 16:25:00 +00:00 · 2025-08-25 16:25:00 +00:00 · 55b7101d7e
commit 55b7101d7e
parent 5c6225b227
3 changed files with 203 additions and 11 deletions
--- a/olmocr/bench/scripts/rotate_pdfs.py
+++ b/olmocr/bench/scripts/rotate_pdfs.py
@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+import json
+import os
+import random
+import shutil
+from pathlib import Path
+from collections import defaultdict
+
+def main():
+    """Sample 10% of the benchmark PDF groups into rotated.jsonl and pdfs/rotated/."""
+    # Set paths (relative to the current working directory)
+    bench_data_dir = Path("./olmOCR-bench/bench_data")
+    pdfs_dir = Path("./olmOCR-bench/bench_data/pdfs")
+    rotated_pdfs_dir = pdfs_dir / "rotated"
+    output_jsonl = Path("rotated.jsonl")
+
+    # Create rotated directory if it doesn't exist
+    rotated_pdfs_dir.mkdir(parents=True, exist_ok=True)
+
+    # Load all JSONL files and group by PDF
+    pdf_groups = defaultdict(list)
+    print("Loading JSONL files...")
+    for jsonl_file in bench_data_dir.glob("*.jsonl"):
+        print(f"  Reading {jsonl_file}")
+        with open(jsonl_file, 'r') as f:
+            for line in f:
+                try:
+                    data = json.loads(line.strip())
+                except json.JSONDecodeError:
+                    continue  # best-effort: blank or malformed lines are skipped
+                if isinstance(data, dict) and 'pdf' in data:  # only JSON objects carry a 'pdf' key
+                    pdf_groups[data['pdf']].append(data)
+
+    print(f"Found {len(pdf_groups)} unique PDF groups")
+
+    # random.sample() raises ValueError when asked for 1 item from an empty population
+    if not pdf_groups:
+        print("No PDF groups found; nothing to select")
+        return
+
+    # Randomly select 10% of PDF groups (always at least one)
+    num_to_select = max(1, int(len(pdf_groups) * 0.1))
+    selected_pdfs = random.sample(list(pdf_groups.keys()), num_to_select)
+
+    print(f"Selected {num_to_select} PDF groups (10% of total)")
+
+    # Write selected entries to rotated.jsonl
+    print(f"Writing selected entries to {output_jsonl}")
+    with open(output_jsonl, 'w') as f:
+        for pdf_name in selected_pdfs:
+            for entry in pdf_groups[pdf_name]:
+                f.write(json.dumps(entry) + '\n')
+
+    # Copy corresponding PDF files (flattened: only the basename is kept)
+    print("Copying PDF files to rotated directory...")
+    copied_count = 0
+    missing_count = 0
+
+    for pdf_name in selected_pdfs:
+        print(pdf_name)
+        source_path = pdfs_dir / pdf_name
+        if source_path.is_file():  # a directory or missing path counts as not found
+            dest_path = rotated_pdfs_dir / os.path.basename(pdf_name)
+            print(f"  Copying {source_path} -> {dest_path}")
+            shutil.copy2(source_path, dest_path)
+            copied_count += 1
+        else:
+            print(f"  Warning: PDF not found: {pdf_name}")
+            missing_count += 1
+
+    print("\nSummary:")
+    print(f"  Total PDF groups: {len(pdf_groups)}")
+    print(f"  Selected groups: {num_to_select}")
+    print(f"  PDFs copied: {copied_count}")
+    if missing_count > 0:
+        print(f"  PDFs not found: {missing_count}")
+    print(f"  Output JSONL: {output_jsonl}")
+    print(f"  Rotated PDFs directory: {rotated_pdfs_dir}")
+if __name__ == "__main__":
+    main()
--- a/olmocr/bench/scripts/rotate_pdfs_random.sh
+++ b/olmocr/bench/scripts/rotate_pdfs_random.sh
@ -0,0 +1,67 @@
+#!/bin/bash
+# Rotate every PDF in a directory in place by a random 90/180/270 degrees using qpdf.
+# Usage: rotate_pdfs_random.sh [PDF_DIR]
+
+# Directory containing PDFs to rotate (optional first argument overrides the default)
+PDF_DIR="${1:-/home/ubuntu/olmocr/olmOCR-bench-0825/bench_data/pdfs/rotated}"
+
+# Check if directory exists
+if [ ! -d "$PDF_DIR" ]; then
+    echo "Error: Directory $PDF_DIR does not exist"
+    exit 1
+fi
+
+# Check if qpdf is installed (preferred for PDF rotation)
+if ! command -v qpdf &> /dev/null; then
+    echo "qpdf is not installed. Installing..."
+    if ! { sudo apt-get update && sudo apt-get install -y qpdf; }; then
+        echo "Error: could not install qpdf"
+        exit 1
+    fi
+fi
+
+# Counters for processed files, and the candidate rotation angles
+total=0
+success=0
+failed=0
+angles=(90 180 270)
+
+echo "Processing PDFs in $PDF_DIR"
+echo "----------------------------------------"
+
+# Collect the PDFs up front; nullglob makes an unmatched pattern expand to nothing
+shopt -s nullglob
+pdf_files=("$PDF_DIR"/*.pdf)
+shopt -u nullglob
+if [ ${#pdf_files[@]} -eq 0 ]; then
+    echo "No PDF files found in $PDF_DIR"
+    exit 1
+fi
+
+for pdf_file in "${pdf_files[@]}"; do
+    # Skip anything that matches *.pdf but is not a regular file (e.g. a directory)
+    [ -f "$pdf_file" ] || continue
+    filename=$(basename "$pdf_file")
+    # Randomly select rotation angle (90, 180, or 270)
+    rotation=${angles[$RANDOM % ${#angles[@]}]}
+    echo "Rotating $filename by $rotation degrees..."
+    # Rotate into a temporary file so a failed run never clobbers the original
+    temp_file="${pdf_file}.tmp"
+    if qpdf "$pdf_file" "$temp_file" --rotate=+"$rotation" && mv "$temp_file" "$pdf_file"; then
+        echo "  ✓ Successfully rotated $filename by $rotation degrees"
+        success=$((success + 1))
+    else
+        echo "  ✗ Failed to rotate $filename"
+        rm -f "$temp_file"
+        failed=$((failed + 1))
+    fi
+    total=$((total + 1))
+done
+
+echo "----------------------------------------"
+echo "Summary:"
+echo "  Total PDFs processed: $total"
+echo "  Successfully rotated: $success"
+if [ "$failed" -gt 0 ]; then
+    echo "  Failed: $failed"
+fi
--- a/scripts/run_benchmark.sh
+++ b/scripts/run_benchmark.sh
@ -10,15 +10,25 @@ set -e

 # Parse command line arguments
 MODEL=""
+B200_MODE=""
+BENCH_BRANCH=""
 while [[ $# -gt 0 ]]; do
    case $1 in
        --model)
            MODEL="$2"
            shift 2
            ;;
+        --b200)
+            B200_MODE="true"
+            shift
+            ;;
+        --benchbranch)
+            BENCH_BRANCH="$2"
+            shift 2
+            ;;
        *)
            echo "Unknown option: $1"
-            echo "Usage: $0 [--model MODEL_NAME]"
+            echo "Usage: $0 [--model MODEL_NAME] [--b200] [--benchbranch BRANCH_NAME]"
            exit 1
            ;;
    esac
@ -78,12 +88,27 @@ cat << 'EOF' > /tmp/run_benchmark_experiment.py
 import sys
 from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints, EnvVar

-# Get image tag, beaker user, git branch, git hash, and optional model from command line
+# Get image tag, beaker user, git branch, git hash, optional model, b200 mode, and bench branch from command line
 image_tag = sys.argv[1]
 beaker_user = sys.argv[2]
 git_branch = sys.argv[3]
 git_hash = sys.argv[4]
-model = sys.argv[5] if len(sys.argv) > 5 else None
+model = None
+b200_mode = False
+bench_branch = None
+
+# Parse remaining arguments: optional flags plus an optional positional model name
+extra_args = iter(sys.argv[5:])
+for arg in extra_args:
+    if arg == "--b200":
+        b200_mode = True
+    elif arg == "--benchbranch":
+        # The flag consumes the next argument; fail clearly instead of with an IndexError
+        bench_branch = next(extra_args, None)
+        if bench_branch is None:
+            sys.exit("--benchbranch requires a branch name")
+    else:
+        model = arg

 # Initialize Beaker client
 b = Beaker.from_env(default_workspace="ai2/olmocr")
@ -111,8 +136,14 @@ if has_aws_creds:
        "mkdir -p ~/.aws",
        'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials'
    ])
+
+# Build git clone command with optional branch (shell-quoted: the command string runs in a shell)
+import shlex
+git_clone_cmd = "git clone https://huggingface.co/datasets/allenai/olmOCR-bench"
+git_clone_cmd += f" -b {shlex.quote(bench_branch)}" if bench_branch else ""
+
 commands.extend([
-    "git clone https://huggingface.co/datasets/allenai/olmOCR-bench",
+    git_clone_cmd,
    "cd olmOCR-bench && git lfs pull && cd ..",
    pipeline_cmd,
    "python olmocr/bench/scripts/workspace_to_bench.py localworkspace/ olmOCR-bench/bench_data/olmocr --bench-path ./olmOCR-bench/",
@ -132,7 +163,7 @@ task_spec_args = {
        preemptible=True,
    ),
    "resources": TaskResources(gpu_count=1),
-    "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
+    "constraints": Constraints(cluster=["ai2/titan-cirrascale"] if b200_mode else ["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
    "result": ResultSpec(path="/noop-results"),
 }

@ -181,9 +212,9 @@ perf_task_spec_args = {
        priority=Priority.normal,
        preemptible=True,
    ),
-    # Need to reserve all 8 gpus for performance spec or else benchmark results can be off
-    "resources": TaskResources(gpu_count=8),
-    "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
+    # Need to reserve all 8 gpus for performance spec or else benchmark results can be off (1 for b200 mode)
+    "resources": TaskResources(gpu_count=1 if b200_mode else 8),
+    "constraints": Constraints(cluster=["ai2/titan-cirrascale"] if b200_mode else ["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
    "result": ResultSpec(path="/noop-results"),
 }

@ -208,13 +239,27 @@ EOF

 # Run the Python script to create the experiments
 echo "Creating Beaker experiments..."
+
+# Build the command as an array so every argument stays a single word (no eval re-parsing)
+CMD=($PYTHON /tmp/run_benchmark_experiment.py "$IMAGE_TAG" "$BEAKER_USER" "$GIT_BRANCH" "$GIT_HASH")  # $PYTHON unquoted on purpose: may be multi-word
+
 if [ -n "$MODEL" ]; then
    echo "Using model: $MODEL"
-    $PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH "$MODEL"
-else
-    $PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH
+    CMD+=("$MODEL")
 fi

+if [ -n "$B200_MODE" ]; then
+    echo "Using B200 mode: ai2/titan-cirrascale cluster with 1 GPU for perf task"
+    CMD+=(--b200)
+fi
+
+if [ -n "$BENCH_BRANCH" ]; then
+    echo "Using bench branch: $BENCH_BRANCH"
+    CMD+=(--benchbranch "$BENCH_BRANCH")
+fi
+
+"${CMD[@]}"
+
 # Clean up temporary file
 rm /tmp/run_benchmark_experiment.py