Add some new rotation tests to a branch of the bench

This commit is contained in:
Jake Poznanski 2025-08-25 16:25:00 +00:00
parent 5c6225b227
commit 55b7101d7e
3 changed files with 203 additions and 11 deletions

View File

@ -0,0 +1,80 @@
#!/usr/bin/env python3
import json
import os
import random
import shutil
from pathlib import Path
from collections import defaultdict
def _load_pdf_groups(bench_data_dir):
    """Group the entries of every ``*.jsonl`` file in *bench_data_dir* by PDF.

    Args:
        bench_data_dir: Directory whose top-level ``*.jsonl`` files are read.

    Returns:
        A ``defaultdict(list)`` mapping each entry's ``"pdf"`` value to the
        list of parsed entries that carry it.
    """
    pdf_groups = defaultdict(list)
    for jsonl_file in bench_data_dir.glob("*.jsonl"):
        print(f" Reading {jsonl_file}")
        with open(jsonl_file, "r", encoding="utf-8") as f:
            for line in f:
                try:
                    data = json.loads(line.strip())
                except json.JSONDecodeError:
                    # Blank or malformed lines are skipped, not fatal.
                    continue
                # A line can be valid JSON without being an object (a bare
                # number, string, or list); only objects can carry a "pdf" key.
                if isinstance(data, dict) and "pdf" in data:
                    pdf_groups[data["pdf"]].append(data)
    return pdf_groups


def _copy_selected_pdfs(selected_pdfs, pdfs_dir, rotated_pdfs_dir):
    """Copy each selected PDF from *pdfs_dir* into *rotated_pdfs_dir*.

    Args:
        selected_pdfs: PDF names, interpreted relative to *pdfs_dir*.
        pdfs_dir: Directory the source PDFs are looked up in.
        rotated_pdfs_dir: Existing directory that receives the copies.

    Returns:
        A ``(copied_count, missing_count)`` tuple.
    """
    copied_count = 0
    missing_count = 0
    for pdf_name in selected_pdfs:
        print(pdf_name)
        source_path = pdfs_dir / pdf_name
        # is_file() rather than exists(): a directory would make copy2 fail.
        if not source_path.is_file():
            print(f" Warning: PDF not found: {pdf_name}")
            missing_count += 1
            continue
        # NOTE(review): only the basename is kept, so two selected PDFs with
        # the same file name in different subdirectories overwrite each other.
        dest_path = rotated_pdfs_dir / os.path.basename(pdf_name)
        print(f" Copying {source_path} -> {dest_path}")
        shutil.copy2(source_path, dest_path)
        copied_count += 1
    return copied_count, missing_count


def main():
    """Sample 10% of the benchmark's PDF groups for the rotation test set.

    Reads every ``*.jsonl`` file under ``./olmOCR-bench/bench_data``, groups
    the entries by their ``"pdf"`` field, randomly picks 10% of the groups
    (at least one), writes the picked entries to ``rotated.jsonl`` in the
    current directory, and copies the matching PDFs into
    ``bench_data/pdfs/rotated``.

    Returns early, without touching ``rotated.jsonl``, when no entries with a
    ``"pdf"`` field were found.
    """
    # Set paths
    bench_data_dir = Path("./olmOCR-bench/bench_data")
    pdfs_dir = Path("./olmOCR-bench/bench_data/pdfs")
    rotated_pdfs_dir = pdfs_dir / "rotated"
    output_jsonl = Path("rotated.jsonl")

    # Create rotated directory if it doesn't exist
    rotated_pdfs_dir.mkdir(parents=True, exist_ok=True)

    # Load all JSONL files and group by PDF
    print("Loading JSONL files...")
    pdf_groups = _load_pdf_groups(bench_data_dir)
    print(f"Found {len(pdf_groups)} unique PDF groups")

    # random.sample() raises ValueError when asked for one item from an empty
    # population, so stop here when there is nothing to choose from.
    if not pdf_groups:
        print("No PDF groups found; nothing to select.")
        return

    # Randomly select 10% of PDF groups
    num_to_select = max(1, int(len(pdf_groups) * 0.1))
    selected_pdfs = random.sample(list(pdf_groups), num_to_select)
    print(f"Selected {num_to_select} PDF groups (10% of total)")

    # Write selected entries to rotated.jsonl
    # NOTE(review): entries are written unchanged, so their "pdf" field still
    # names the original location, not the copy under rotated/ - confirm
    # that a later step rewrites it.
    print(f"Writing selected entries to {output_jsonl}")
    with open(output_jsonl, "w", encoding="utf-8") as f:
        for pdf_name in selected_pdfs:
            for entry in pdf_groups[pdf_name]:
                f.write(json.dumps(entry) + "\n")

    # Copy corresponding PDF files
    print("Copying PDF files to rotated directory...")
    copied_count, missing_count = _copy_selected_pdfs(
        selected_pdfs, pdfs_dir, rotated_pdfs_dir
    )

    print("\nSummary:")
    print(f" Total PDF groups: {len(pdf_groups)}")
    print(f" Selected groups: {num_to_select}")
    print(f" PDFs copied: {copied_count}")
    if missing_count > 0:
        print(f" PDFs not found: {missing_count}")
    print(f" Output JSONL: {output_jsonl}")
    print(f" Rotated PDFs directory: {rotated_pdfs_dir}")


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,67 @@
#!/bin/bash
# Rotate every PDF in a directory, in place, by a random multiple of 90 degrees.
#
# Usage: <this script> [PDF_DIR]
#   PDF_DIR  Directory containing the PDFs to rotate. Defaults to the
#            benchmark checkout path below when omitted.
#
# Each *.pdf file is rotated by 90, 180 or 270 degrees (picked at random per
# file) with qpdf, and the original file is replaced by the rotated one.
#
# Exit status: 0 when every PDF was rotated; 1 when the directory is missing,
# qpdf cannot be installed, no PDFs are found, or any rotation failed.

# Directory containing PDFs to rotate
PDF_DIR="${1:-/home/ubuntu/olmocr/olmOCR-bench-0825/bench_data/pdfs/rotated}"

# Check if directory exists
if [ ! -d "$PDF_DIR" ]; then
    echo "Error: Directory $PDF_DIR does not exist"
    exit 1
fi

# Check if qpdf is installed (preferred for PDF rotation)
if ! command -v qpdf &> /dev/null; then
    echo "qpdf is not installed. Installing..."
    # Stop here if the install fails; otherwise every rotation below would
    # fail with "command not found".
    if ! { sudo apt-get update && sudo apt-get install -y qpdf; }; then
        echo "Error: failed to install qpdf"
        exit 1
    fi
fi

# Counter for processed files
total=0
success=0
failed=0

echo "Processing PDFs in $PDF_DIR"
echo "----------------------------------------"

# Collect the PDFs up front. nullglob makes an unmatched pattern expand to
# nothing, so "no PDFs" is detected once here and not inside the loop.
shopt -s nullglob
pdf_files=("$PDF_DIR"/*.pdf)
shopt -u nullglob

if [ "${#pdf_files[@]}" -eq 0 ]; then
    echo "No PDF files found in $PDF_DIR"
    exit 1
fi

# Candidate rotation angles (90, 180, or 270)
angles=(90 180 270)

# Process each PDF file
for pdf_file in "${pdf_files[@]}"; do
    # Skip anything named *.pdf that is not a regular file (e.g. a directory).
    [ -f "$pdf_file" ] || continue

    # Get filename
    filename=$(basename "$pdf_file")

    # Randomly select rotation angle
    rotation=${angles[$RANDOM % ${#angles[@]}]}
    echo "Rotating $filename by $rotation degrees..."

    # Create temporary file for rotated PDF
    temp_file="${pdf_file}.tmp"

    # Rotate the PDF using qpdf. With no page range given, the rotation
    # applies to every page.
    qpdf "$pdf_file" "$temp_file" --rotate="+$rotation"
    status=$?

    # qpdf exits 3 when it wrote the output but emitted warnings; treat that
    # like success so slightly malformed PDFs are still rotated.
    if { [ "$status" -eq 0 ] || [ "$status" -eq 3 ]; } \
        && [ -s "$temp_file" ] \
        && mv "$temp_file" "$pdf_file"; then
        # Original has been replaced with the rotated version
        echo " ✓ Successfully rotated $filename by $rotation degrees"
        success=$((success + 1))
    else
        echo " ✗ Failed to rotate $filename"
        rm -f "$temp_file"
        failed=$((failed + 1))
    fi
    total=$((total + 1))
done

echo "----------------------------------------"
echo "Summary:"
echo " Total PDFs processed: $total"
echo " Successfully rotated: $success"
if [ "$failed" -gt 0 ]; then
    echo " Failed: $failed"
    # Report the failure to callers that chain on the exit status.
    exit 1
fi

View File

@ -10,15 +10,25 @@ set -e
# Parse command line arguments
MODEL=""
B200_MODE=""
BENCH_BRANCH=""
while [[ $# -gt 0 ]]; do
case $1 in
--model)
MODEL="$2"
shift 2
;;
--b200)
B200_MODE="true"
shift
;;
--benchbranch)
BENCH_BRANCH="$2"
shift 2
;;
*)
echo "Unknown option: $1"
echo "Usage: $0 [--model MODEL_NAME]"
echo "Usage: $0 [--model MODEL_NAME] [--b200] [--benchbranch BRANCH_NAME]"
exit 1
;;
esac
@ -78,12 +88,27 @@ cat << 'EOF' > /tmp/run_benchmark_experiment.py
import sys
from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints, EnvVar
# Get image tag, beaker user, git branch, git hash, optional model, b200 mode, and bench branch from command line
# The first four arguments are required and positional.
image_tag = sys.argv[1]
beaker_user = sys.argv[2]
git_branch = sys.argv[3]
git_hash = sys.argv[4]

# Optional settings; each keeps its default unless supplied after the
# four positional arguments.
model = None
b200_mode = False
bench_branch = None

# Parse remaining arguments: "--b200" is a flag, "--benchbranch" consumes the
# argument that follows it, and any other token is taken as the model name.
arg_idx = 5
while arg_idx < len(sys.argv):
    arg = sys.argv[arg_idx]
    if arg == "--b200":
        b200_mode = True
        arg_idx += 1
    elif arg == "--benchbranch":
        # Fail with a readable message, not an IndexError, when the
        # branch name is missing.
        if arg_idx + 1 >= len(sys.argv):
            sys.exit("Error: --benchbranch requires a branch name")
        bench_branch = sys.argv[arg_idx + 1]
        arg_idx += 2
    else:
        model = arg
        arg_idx += 1
# Initialize Beaker client
b = Beaker.from_env(default_workspace="ai2/olmocr")
@ -111,8 +136,14 @@ if has_aws_creds:
"mkdir -p ~/.aws",
'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials'
])
# Build git clone command with optional branch
git_clone_cmd = "git clone https://huggingface.co/datasets/allenai/olmOCR-bench"
if bench_branch:
git_clone_cmd += f" -b {bench_branch}"
commands.extend([
"git clone https://huggingface.co/datasets/allenai/olmOCR-bench",
git_clone_cmd,
"cd olmOCR-bench && git lfs pull && cd ..",
pipeline_cmd,
"python olmocr/bench/scripts/workspace_to_bench.py localworkspace/ olmOCR-bench/bench_data/olmocr --bench-path ./olmOCR-bench/",
@ -132,7 +163,7 @@ task_spec_args = {
preemptible=True,
),
"resources": TaskResources(gpu_count=1),
"constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
"constraints": Constraints(cluster=["ai2/titan-cirrascale"] if b200_mode else ["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
"result": ResultSpec(path="/noop-results"),
}
@ -181,9 +212,9 @@ perf_task_spec_args = {
priority=Priority.normal,
preemptible=True,
),
# Need to reserve all 8 gpus for performance spec or else benchmark results can be off
"resources": TaskResources(gpu_count=8),
"constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
# Need to reserve all 8 gpus for performance spec or else benchmark results can be off (1 for b200 mode)
"resources": TaskResources(gpu_count=1 if b200_mode else 8),
"constraints": Constraints(cluster=["ai2/titan-cirrascale"] if b200_mode else ["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
"result": ResultSpec(path="/noop-results"),
}
@ -208,13 +239,27 @@ EOF
# Run the Python script to create the experiments
echo "Creating Beaker experiments..."

# Build the command as an array so each value reaches Python as exactly one
# argument. A string run through eval is re-parsed by the shell, which splits
# values on whitespace and executes any metacharacters they contain.
# $PYTHON is left unquoted so an interpreter command made of several words
# still splits into separate arguments.
CMD=($PYTHON /tmp/run_benchmark_experiment.py "$IMAGE_TAG" "$BEAKER_USER" "$GIT_BRANCH" "$GIT_HASH")

if [ -n "$MODEL" ]; then
    echo "Using model: $MODEL"
    CMD+=("$MODEL")
fi

if [ -n "$B200_MODE" ]; then
    echo "Using B200 mode: ai2/titan-cirrascale cluster with 1 GPU for perf task"
    CMD+=(--b200)
fi

if [ -n "$BENCH_BRANCH" ]; then
    echo "Using bench branch: $BENCH_BRANCH"
    CMD+=(--benchbranch "$BENCH_BRANCH")
fi

# Launch exactly once, with the arguments assembled above.
"${CMD[@]}"

# Clean up temporary file
rm /tmp/run_benchmark_experiment.py