From 55b7101d7ebca2b2ef6066ccecd351a3b52c7d3e Mon Sep 17 00:00:00 2001
From: Jake Poznanski
Date: Mon, 25 Aug 2025 16:25:00 +0000
Subject: [PATCH] Add some new rotation tests to a branch of the bench

- rotate_pdfs.py: sample ~10% of the bench PDF groups into rotated.jsonl
  and copy their PDFs into bench_data/pdfs/rotated.
- rotate_pdfs_random.sh: rotate every PDF in a directory in place by a
  random 90/180/270 degrees using qpdf.
- run_benchmark.sh: add --b200 (titan cluster, 1 GPU for the perf task)
  and --benchbranch (clone a specific olmOCR-bench branch) options.

---
 olmocr/bench/scripts/rotate_pdfs.py        | 80 ++++++++++++++++++++++
 olmocr/bench/scripts/rotate_pdfs_random.sh | 67 ++++++++++++++++++
 scripts/run_benchmark.sh                   | 67 +++++++++++++++---
 3 files changed, 203 insertions(+), 11 deletions(-)
 create mode 100755 olmocr/bench/scripts/rotate_pdfs.py
 create mode 100755 olmocr/bench/scripts/rotate_pdfs_random.sh

diff --git a/olmocr/bench/scripts/rotate_pdfs.py b/olmocr/bench/scripts/rotate_pdfs.py
new file mode 100755
index 0000000..71b4d0a
--- /dev/null
+++ b/olmocr/bench/scripts/rotate_pdfs.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+import json
+import os
+import random
+import shutil
+from pathlib import Path
+from collections import defaultdict
+
+def main():
+    """Sample ~10% of the bench PDF groups into rotated.jsonl and copy their PDFs into pdfs/rotated."""
+    bench_data_dir = Path("./olmOCR-bench/bench_data")
+    pdfs_dir = Path("./olmOCR-bench/bench_data/pdfs")
+    rotated_pdfs_dir = pdfs_dir / "rotated"
+    output_jsonl = Path("rotated.jsonl")
+
+    # Create rotated directory if it doesn't exist
+    rotated_pdfs_dir.mkdir(parents=True, exist_ok=True)
+
+    # Load all JSONL files and group by PDF
+    pdf_groups = defaultdict(list)
+
+    print("Loading JSONL files...")
+    for jsonl_file in bench_data_dir.glob("*.jsonl"):
+        print(f" Reading {jsonl_file}")
+        with open(jsonl_file, 'r') as f:
+            for line in f:
+                try:
+                    data = json.loads(line.strip())
+                    if 'pdf' in data:
+                        pdf_groups[data['pdf']].append(data)
+                except json.JSONDecodeError:
+                    continue
+
+    print(f"Found {len(pdf_groups)} unique PDF groups")
+
+    # Randomly select 10% of PDF groups (at least one, but never more than exist)
+    num_to_select = min(len(pdf_groups), max(1, int(len(pdf_groups) * 0.1)))
+    selected_pdfs = random.sample(list(pdf_groups.keys()), num_to_select)
+
+    print(f"Selected {num_to_select} PDF groups (10% of total)")
+
+    # Write selected entries to rotated.jsonl
+    print(f"Writing selected entries to {output_jsonl}")
+    with open(output_jsonl, 'w') as f:
+        for pdf_name in selected_pdfs:
+            for entry in pdf_groups[pdf_name]:
+                f.write(json.dumps(entry) + '\n')
+
+    # Copy corresponding PDF files
+    print("Copying PDF files to rotated directory...")
+    copied_count = 0
+    missing_count = 0
+
+    for pdf_name in selected_pdfs:
+        # Resolve the PDF relative to the pdfs directory; the copy is flattened to its basename
+        pdf_found = False
+        print(pdf_name)
+        source_path = pdfs_dir / pdf_name
+        if source_path.exists():
+            dest_path = rotated_pdfs_dir / os.path.basename(pdf_name)
+            print(f" Copying {source_path} -> {dest_path}")
+            shutil.copy2(source_path, dest_path)
+            copied_count += 1
+            pdf_found = True
+
+        if not pdf_found:
+            print(f" Warning: PDF not found: {pdf_name}")
+            missing_count += 1
+
+    print("\nSummary:")
+    print(f" Total PDF groups: {len(pdf_groups)}")
+    print(f" Selected groups: {num_to_select}")
+    print(f" PDFs copied: {copied_count}")
+    if missing_count > 0:
+        print(f" PDFs not found: {missing_count}")
+    print(f" Output JSONL: {output_jsonl}")
+    print(f" Rotated PDFs directory: {rotated_pdfs_dir}")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/olmocr/bench/scripts/rotate_pdfs_random.sh b/olmocr/bench/scripts/rotate_pdfs_random.sh
new file mode 100755
index 0000000..a3f636b
--- /dev/null
+++ b/olmocr/bench/scripts/rotate_pdfs_random.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+
+# Directory containing PDFs to rotate (an optional first argument overrides the default)
+PDF_DIR="${1:-/home/ubuntu/olmocr/olmOCR-bench-0825/bench_data/pdfs/rotated}"
+
+# Check if directory exists
+if [ ! -d "$PDF_DIR" ]; then
+    echo "Error: Directory $PDF_DIR does not exist"
+    exit 1
+fi
+
+# Check if qpdf is installed (preferred for PDF rotation)
+if ! command -v qpdf &> /dev/null; then
+    echo "qpdf is not installed. Installing..."
+    sudo apt-get update && sudo apt-get install -y qpdf
+fi
+
+# Counter for processed files
+total=0
+success=0
+failed=0
+
+echo "Processing PDFs in $PDF_DIR"
+echo "----------------------------------------"
+
+# Process each PDF file
+for pdf_file in "$PDF_DIR"/*.pdf; do
+    # Check if any PDF files exist
+    if [ ! -f "$pdf_file" ]; then
+        echo "No PDF files found in $PDF_DIR"
+        exit 1
+    fi
+
+    # Get filename
+    filename=$(basename "$pdf_file")
+
+    # Randomly select rotation angle (90, 180, or 270)
+    angles=(90 180 270)
+    rotation=${angles[$RANDOM % ${#angles[@]}]}
+
+    echo "Rotating $filename by $rotation degrees..."
+
+    # Create temporary file for rotated PDF
+    temp_file="${pdf_file}.tmp"
+
+    # Rotate the PDF using qpdf
+    if qpdf "$pdf_file" "$temp_file" --rotate=+$rotation; then
+        # Replace original with rotated version
+        mv "$temp_file" "$pdf_file"
+        echo " ✓ Successfully rotated $filename by $rotation degrees"
+        ((success++))
+    else
+        echo " ✗ Failed to rotate $filename"
+        rm -f "$temp_file"
+        ((failed++))
+    fi
+
+    ((total++))
+done
+
+echo "----------------------------------------"
+echo "Summary:"
+echo " Total PDFs processed: $total"
+echo " Successfully rotated: $success"
+if [ $failed -gt 0 ]; then
+    echo " Failed: $failed"
+fi
\ No newline at end of file
diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh
index 0c9863a..33fc049 100755
--- a/scripts/run_benchmark.sh
+++ b/scripts/run_benchmark.sh
@@ -10,15 +10,25 @@ set -e
 
 # Parse command line arguments
 MODEL=""
+B200_MODE=""
+BENCH_BRANCH=""
 while [[ $# -gt 0 ]]; do
     case $1 in
         --model)
             MODEL="$2"
             shift 2
             ;;
+        --b200)
+            B200_MODE="true"
+            shift
+            ;;
+        --benchbranch)
+            BENCH_BRANCH="$2"
+            shift 2
+            ;;
         *)
             echo "Unknown option: $1"
-            echo "Usage: $0 [--model MODEL_NAME]"
+            echo "Usage: $0 [--model MODEL_NAME] [--b200] [--benchbranch BRANCH_NAME]"
             exit 1
             ;;
     esac
@@ -78,12 +88,27 @@ cat << 'EOF' > /tmp/run_benchmark_experiment.py
 import sys
 from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints, EnvVar
 
-# Get image tag, beaker user, git branch, git hash, and optional model from command line
+# Get image tag, beaker user, git branch, git hash, optional model, b200 mode, and bench branch from command line
 image_tag = sys.argv[1]
 beaker_user = sys.argv[2]
 git_branch = sys.argv[3]
 git_hash = sys.argv[4]
-model = sys.argv[5] if len(sys.argv) > 5 else None
+model = None
+b200_mode = False
+bench_branch = None
+
+# Parse remaining arguments
+arg_idx = 5
+while arg_idx < len(sys.argv):
+    if sys.argv[arg_idx] == "--b200":
+        b200_mode = True
+        arg_idx += 1
+    elif sys.argv[arg_idx] == "--benchbranch":
+        bench_branch = sys.argv[arg_idx + 1]
+        arg_idx += 2
+    else:
+        model = sys.argv[arg_idx]
+        arg_idx += 1
 
 # Initialize Beaker client
 b = Beaker.from_env(default_workspace="ai2/olmocr")
@@ -111,8 +136,14 @@ if has_aws_creds:
         "mkdir -p ~/.aws",
         'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials'
     ])
+
+# Build git clone command with optional branch
+git_clone_cmd = "git clone https://huggingface.co/datasets/allenai/olmOCR-bench"
+if bench_branch:
+    git_clone_cmd += f" -b {bench_branch}"
+
 commands.extend([
-    "git clone https://huggingface.co/datasets/allenai/olmOCR-bench",
+    git_clone_cmd,
     "cd olmOCR-bench && git lfs pull && cd ..",
     pipeline_cmd,
     "python olmocr/bench/scripts/workspace_to_bench.py localworkspace/ olmOCR-bench/bench_data/olmocr --bench-path ./olmOCR-bench/",
@@ -132,7 +163,7 @@ task_spec_args = {
         preemptible=True,
     ),
     "resources": TaskResources(gpu_count=1),
-    "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
+    "constraints": Constraints(cluster=["ai2/titan-cirrascale"] if b200_mode else ["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
     "result": ResultSpec(path="/noop-results"),
 }
 
@@ -181,9 +212,9 @@ perf_task_spec_args = {
         priority=Priority.normal,
         preemptible=True,
     ),
-    # Need to reserve all 8 gpus for performance spec or else benchmark results can be off
-    "resources": TaskResources(gpu_count=8),
-    "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
+    # Need to reserve all 8 gpus for performance spec or else benchmark results can be off (1 for b200 mode)
+    "resources": TaskResources(gpu_count=1 if b200_mode else 8),
+    "constraints": Constraints(cluster=["ai2/titan-cirrascale"] if b200_mode else ["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
     "result": ResultSpec(path="/noop-results"),
 }
 
@@ -208,13 +239,27 @@ EOF
 
 # Run the Python script to create the experiments
 echo "Creating Beaker experiments..."
+
+# Build the command as an array so user-supplied values are passed verbatim (no eval re-parsing)
+CMD=($PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH)
+
 if [ -n "$MODEL" ]; then
     echo "Using model: $MODEL"
-    $PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH "$MODEL"
-else
-    $PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH
+    CMD+=("$MODEL")
 fi
 
+if [ -n "$B200_MODE" ]; then
+    echo "Using B200 mode: ai2/titan-cirrascale cluster with 1 GPU for perf task"
+    CMD+=(--b200)
+fi
+
+if [ -n "$BENCH_BRANCH" ]; then
+    echo "Using bench branch: $BENCH_BRANCH"
+    CMD+=(--benchbranch "$BENCH_BRANCH")
+fi
+
+"${CMD[@]}"
+
 # Clean up temporary file
 rm /tmp/run_benchmark_experiment.py
 