Add some new rotation tests to a branch of the bench

2025-10-29 00:52:31 +00:00 · 2025-08-25 16:25:00 +00:00 · 2025-08-25 16:25:00 +00:00 · 55b7101d7e
commit 55b7101d7e
parent 5c6225b227
3 changed files with 203 additions and 11 deletions
--- a/olmocr/bench/scripts/rotate_pdfs.py
+++ b/olmocr/bench/scripts/rotate_pdfs.py
@ -0,0 +1,80 @@
 #!/usr/bin/env python3
 import json
 import os
 import random
 import shutil
 from pathlib import Path
 from collections import defaultdict
 def main():
    """Sample ~10% of the benchmark's PDF groups and stage them for rotation.

    Reads every ``*.jsonl`` file in ``./olmOCR-bench/bench_data``, groups the
    entries by their ``pdf`` field, randomly picks 10% of the groups (at least
    one), writes the picked entries to ``rotated.jsonl`` in the current
    directory and copies the matching PDFs into ``pdfs/rotated``.
    The selection is unseeded, so every run picks a different subset.
    """
    # Set paths
    bench_data_dir = Path("./olmOCR-bench/bench_data")
    pdfs_dir = Path("./olmOCR-bench/bench_data/pdfs")
    rotated_pdfs_dir = pdfs_dir / "rotated"
    output_jsonl = Path("rotated.jsonl")
    # Create rotated directory if it doesn't exist
    rotated_pdfs_dir.mkdir(parents=True, exist_ok=True)
    # Load all JSONL files and group by PDF
    pdf_groups = defaultdict(list)
    print("Loading JSONL files...")
    for jsonl_file in bench_data_dir.glob("*.jsonl"):
        print(f"  Reading {jsonl_file}")
        with open(jsonl_file, 'r', encoding="utf-8") as f:
            for line in f:
                # Blank and malformed lines are skipped on purpose (best effort).
                try:
                    data = json.loads(line.strip())
                except json.JSONDecodeError:
                    continue
                # Only JSON objects can carry a 'pdf' key; a bare list such as
                # ["pdf"] would pass the membership test and then fail on lookup.
                if isinstance(data, dict) and 'pdf' in data:
                    pdf_groups[data['pdf']].append(data)
    print(f"Found {len(pdf_groups)} unique PDF groups")
    if not pdf_groups:
        # random.sample() raises ValueError when asked for 1 item out of 0.
        print("No PDF groups found; nothing to select")
        return
    # Randomly select 10% of PDF groups
    num_to_select = max(1, int(len(pdf_groups) * 0.1))
    selected_pdfs = random.sample(list(pdf_groups), num_to_select)
    print(f"Selected {num_to_select} PDF groups (10% of total)")
    # Write selected entries to rotated.jsonl
    print(f"Writing selected entries to {output_jsonl}")
    with open(output_jsonl, 'w', encoding="utf-8") as f:
        for pdf_name in selected_pdfs:
            for entry in pdf_groups[pdf_name]:
                f.write(json.dumps(entry) + '\n')
    # Copy corresponding PDF files
    print("Copying PDF files to rotated directory...")
    copied_count = 0
    missing_count = 0
    # Destination file name -> source entry that produced it, to spot clashes
    copied_from = {}
    for pdf_name in selected_pdfs:
        print(pdf_name)
        source_path = pdfs_dir / pdf_name
        if not source_path.exists():
            print(f"  Warning: PDF not found: {pdf_name}")
            missing_count += 1
            continue
        # Copies are flattened into one directory, so only the base name is kept.
        dest_path = rotated_pdfs_dir / os.path.basename(pdf_name)
        earlier = copied_from.setdefault(dest_path.name, pdf_name)
        if earlier != pdf_name:
            print(f"  Warning: {pdf_name} overwrites the copy of {earlier} (same file name)")
        print(f"  Copying {source_path} -> {dest_path}")
        shutil.copy2(source_path, dest_path)
        copied_count += 1
    print("\nSummary:")
    print(f"  Total PDF groups: {len(pdf_groups)}")
    print(f"  Selected groups: {num_to_select}")
    print(f"  PDFs copied: {copied_count}")
    if missing_count > 0:
        print(f"  PDFs not found: {missing_count}")
    print(f"  Output JSONL: {output_jsonl}")
    print(f"  Rotated PDFs directory: {rotated_pdfs_dir}")
 if __name__ == "__main__":
    main()
--- a/olmocr/bench/scripts/rotate_pdfs_random.sh
+++ b/olmocr/bench/scripts/rotate_pdfs_random.sh
@ -0,0 +1,67 @@
 #!/bin/bash
 # Rotate every PDF in a directory, in place, by a random angle of 90, 180 or 270 degrees.
 #
 # Usage: rotate_pdfs_random.sh [PDF_DIR]
 #   PDF_DIR  directory containing the PDFs to rotate (defaults to the path below)
 #
 # Exits non-zero when the directory is missing, holds no PDFs, qpdf cannot be
 # installed, or at least one rotation failed.
 # Directory containing PDFs to rotate
 PDF_DIR="${1:-/home/ubuntu/olmocr/olmOCR-bench-0825/bench_data/pdfs/rotated}"
 # Check if directory exists
 if [ ! -d "$PDF_DIR" ]; then
    echo "Error: Directory $PDF_DIR does not exist"
    exit 1
 fi
 # Check if qpdf is installed (preferred for PDF rotation)
 if ! command -v qpdf &> /dev/null; then
    echo "qpdf is not installed. Installing..."
    if ! { sudo apt-get update && sudo apt-get install -y qpdf; }; then
        echo "Error: failed to install qpdf"
        exit 1
    fi
 fi
 # Counter for processed files
 total=0
 success=0
 failed=0
 # Candidate rotation angles (loop-invariant)
 angles=(90 180 270)
 echo "Processing PDFs in $PDF_DIR"
 echo "----------------------------------------"
 # Collect the PDFs up front; nullglob makes an unmatched pattern expand to nothing
 shopt -s nullglob
 pdf_files=("$PDF_DIR"/*.pdf)
 shopt -u nullglob
 if [ ${#pdf_files[@]} -eq 0 ]; then
    echo "No PDF files found in $PDF_DIR"
    exit 1
 fi
 # Process each PDF file
 for pdf_file in "${pdf_files[@]}"; do
    # Skip anything that is not a regular file (e.g. a directory named *.pdf)
    if [ ! -f "$pdf_file" ]; then
        continue
    fi
    # Get filename
    filename=$(basename "$pdf_file")
    # Randomly select rotation angle (90, 180, or 270)
    rotation=${angles[$RANDOM % ${#angles[@]}]}
    echo "Rotating $filename by $rotation degrees..."
    # Create temporary file for rotated PDF
    temp_file="${pdf_file}.tmp"
    # Rotate the PDF using qpdf
    qpdf "$pdf_file" "$temp_file" --rotate="+$rotation"
    status=$?
    # qpdf exits with 3 when it only emitted warnings; the output file is still written
    if [ "$status" -eq 0 ] || [ "$status" -eq 3 ]; then
        # Replace original with rotated version
        mv "$temp_file" "$pdf_file"
        echo "  ✓ Successfully rotated $filename by $rotation degrees"
        success=$((success + 1))
    else
        echo "  ✗ Failed to rotate $filename"
        rm -f "$temp_file"
        failed=$((failed + 1))
    fi
    total=$((total + 1))
 done
 echo "----------------------------------------"
 echo "Summary:"
 echo "  Total PDFs processed: $total"
 echo "  Successfully rotated: $success"
 if [ "$failed" -gt 0 ]; then
    echo "  Failed: $failed"
    # Let callers detect a partially rotated set
    exit 1
 fi
--- a/scripts/run_benchmark.sh
+++ b/scripts/run_benchmark.sh
@ -10,15 +10,25 @@ set -e
 # Parse command line arguments
 MODEL=""
 B200_MODE=""
 BENCH_BRANCH=""
 while [[ $# -gt 0 ]]; do
    case $1 in
        --model)
            MODEL="$2"
            shift 2
            ;;
        --b200)
            B200_MODE="true"
            shift
            ;;
        --benchbranch)
            BENCH_BRANCH="$2"
            shift 2
            ;;
        *)
            echo "Unknown option: $1"
-            echo "Usage: $0 [--model MODEL_NAME]"
+            echo "Usage: $0 [--model MODEL_NAME] [--b200] [--benchbranch BRANCH_NAME]"
            exit 1
            ;;
    esac
@ -78,12 +88,27 @@ cat << 'EOF' > /tmp/run_benchmark_experiment.py
 import sys
 from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints, EnvVar
-# Get image tag, beaker user, git branch, git hash, and optional model from command line
+# Get image tag, beaker user, git branch, git hash, optional model, b200 mode, and bench branch from command line
 image_tag = sys.argv[1]
 beaker_user = sys.argv[2]
 git_branch = sys.argv[3]
 git_hash = sys.argv[4]
-model = sys.argv[5] if len(sys.argv) > 5 else None
+model = None
 b200_mode = False
 bench_branch = None
 # Parse remaining arguments
 def parse_optional_args(args):
    """Parse the optional trailing arguments of the launcher.

    ``args`` is the list of command-line tokens after the four required
    positionals. ``--b200`` turns B200 mode on, ``--benchbranch NAME`` sets the
    bench branch, and any other token is taken as the model (the last one wins).
    Returns a ``(model, b200_mode, bench_branch)`` tuple and exits with an error
    message when ``--benchbranch`` is the last token and has no value.
    """
    parsed_model = None
    parsed_b200 = False
    parsed_branch = None
    idx = 0
    while idx < len(args):
        token = args[idx]
        if token == "--b200":
            parsed_b200 = True
            idx += 1
        elif token == "--benchbranch":
            if idx + 1 >= len(args):
                # Previously this surfaced as a bare IndexError traceback.
                raise SystemExit("error: --benchbranch requires a branch name")
            parsed_branch = args[idx + 1]
            idx += 2
        else:
            parsed_model = token
            idx += 1
    return parsed_model, parsed_b200, parsed_branch
 model, b200_mode, bench_branch = parse_optional_args(sys.argv[5:])
 # Initialize Beaker client
 b = Beaker.from_env(default_workspace="ai2/olmocr")
@ -111,8 +136,14 @@ if has_aws_creds:
        "mkdir -p ~/.aws",
        'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials'
    ])
 # Build git clone command with optional branch
 import shlex  # local import: only needed to quote the branch name below
 git_clone_cmd = "git clone https://huggingface.co/datasets/allenai/olmOCR-bench"
 if bench_branch:
    # The command is run as a shell string, so quote the user-supplied branch;
    # plain branch names are returned unchanged by shlex.quote.
    git_clone_cmd += f" -b {shlex.quote(bench_branch)}"
 commands.extend([
-    "git clone https://huggingface.co/datasets/allenai/olmOCR-bench",
+    git_clone_cmd,
    "cd olmOCR-bench && git lfs pull && cd ..",
    pipeline_cmd,
    "python olmocr/bench/scripts/workspace_to_bench.py localworkspace/ olmOCR-bench/bench_data/olmocr --bench-path ./olmOCR-bench/",
@ -132,7 +163,7 @@ task_spec_args = {
        preemptible=True,
    ),
    "resources": TaskResources(gpu_count=1),
-    "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
+    "constraints": Constraints(cluster=["ai2/titan-cirrascale"] if b200_mode else ["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
    "result": ResultSpec(path="/noop-results"),
 }
@ -181,9 +212,9 @@ perf_task_spec_args = {
        priority=Priority.normal,
        preemptible=True,
    ),
-    # Need to reserve all 8 gpus for performance spec or else benchmark results can be off
+    # Need to reserve all 8 gpus for performance spec or else benchmark results can be off (1 for b200 mode)
-    "resources": TaskResources(gpu_count=8),
+    "resources": TaskResources(gpu_count=1 if b200_mode else 8),
-    "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
+    "constraints": Constraints(cluster=["ai2/titan-cirrascale"] if b200_mode else ["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
    "result": ResultSpec(path="/noop-results"),
 }
@ -208,13 +239,27 @@ EOF
 # Run the Python script to create the experiments
 echo "Creating Beaker experiments..."
 # Build command with appropriate arguments
 CMD="$PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH"
 if [ -n "$MODEL" ]; then
    echo "Using model: $MODEL"
-    $PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH "$MODEL"
+    CMD="$CMD $MODEL"
 else
    $PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH
 fi
 if [ -n "$B200_MODE" ]; then
    echo "Using B200 mode: ai2/titan-cirrascale cluster with 1 GPU for perf task"
    CMD="$CMD --b200"
 fi
 if [ -n "$BENCH_BRANCH" ]; then
    echo "Using bench branch: $BENCH_BRANCH"
    CMD="$CMD --benchbranch $BENCH_BRANCH"
 fi
 eval $CMD
 # Clean up temporary file
 rm /tmp/run_benchmark_experiment.py