mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-27 16:12:13 +00:00
Add some new rotation tests to a branch of the bench
This commit is contained in:
parent
5c6225b227
commit
55b7101d7e
80
olmocr/bench/scripts/rotate_pdfs.py
Executable file
80
olmocr/bench/scripts/rotate_pdfs.py
Executable file
@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env python3
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
def main():
    """Sample 10% of the benchmark's PDF groups and stage them for rotation.

    Reads every ``*.jsonl`` file in ``./olmOCR-bench/bench_data``, groups the
    entries by their ``pdf`` field, randomly picks 10% of the groups (at
    least one), writes the picked entries to ``rotated.jsonl`` in the current
    working directory, and copies the matching PDFs into
    ``./olmOCR-bench/bench_data/pdfs/rotated``.

    The selection uses the module-level ``random`` generator without seeding
    it here, so the chosen subset is not reproducible across runs.

    Lines that are blank, are not valid JSON, are not JSON objects, or have
    no string ``pdf`` field are skipped. If no PDF groups are found the
    function reports that and returns without writing ``rotated.jsonl``.
    """
    # Set paths (all relative to the current working directory)
    bench_data_dir = Path("./olmOCR-bench/bench_data")
    pdfs_dir = Path("./olmOCR-bench/bench_data/pdfs")
    rotated_pdfs_dir = pdfs_dir / "rotated"
    output_jsonl = Path("rotated.jsonl")

    # Create rotated directory if it doesn't exist
    rotated_pdfs_dir.mkdir(parents=True, exist_ok=True)

    # Load all JSONL files and group their entries by PDF
    pdf_groups = defaultdict(list)

    print("Loading JSONL files...")
    # Sorted so files are read (and groups are built) in a stable order.
    for jsonl_file in sorted(bench_data_dir.glob("*.jsonl")):
        print(f" Reading {jsonl_file}")
        with open(jsonl_file, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # A valid JSON line may still be a number, list or null;
                # only objects with a string 'pdf' can be grouped and copied.
                if isinstance(data, dict) and isinstance(data.get("pdf"), str):
                    pdf_groups[data["pdf"]].append(data)

    print(f"Found {len(pdf_groups)} unique PDF groups")

    # random.sample() raises ValueError when asked for 1 item from an empty
    # population, so bail out cleanly when there is nothing to select.
    if not pdf_groups:
        print("No PDF groups found; nothing to select")
        return

    # Randomly select 10% of PDF groups (always at least one)
    num_to_select = max(1, int(len(pdf_groups) * 0.1))
    selected_pdfs = random.sample(list(pdf_groups), num_to_select)

    print(f"Selected {num_to_select} PDF groups (10% of total)")

    # Write selected entries to rotated.jsonl
    print(f"Writing selected entries to {output_jsonl}")
    with open(output_jsonl, "w", encoding="utf-8") as f:
        for pdf_name in selected_pdfs:
            for entry in pdf_groups[pdf_name]:
                f.write(json.dumps(entry) + "\n")

    # Copy corresponding PDF files
    print("Copying PDF files to rotated directory...")
    copied_count = 0
    missing_count = 0

    for pdf_name in selected_pdfs:
        print(pdf_name)
        source_path = pdfs_dir / pdf_name

        if not source_path.is_file():
            print(f" Warning: PDF not found: {pdf_name}")
            missing_count += 1
            continue

        # NOTE: only the base name is kept, so two selected PDFs that share a
        # file name in different subdirectories end up at the same destination.
        dest_path = rotated_pdfs_dir / os.path.basename(pdf_name)
        print(f" Copying {source_path} -> {dest_path}")
        shutil.copy2(source_path, dest_path)
        copied_count += 1

    print("\nSummary:")
    print(f" Total PDF groups: {len(pdf_groups)}")
    print(f" Selected groups: {num_to_select}")
    print(f" PDFs copied: {copied_count}")
    if missing_count > 0:
        print(f" PDFs not found: {missing_count}")
    print(f" Output JSONL: {output_jsonl}")
    print(f" Rotated PDFs directory: {rotated_pdfs_dir}")


if __name__ == "__main__":
    main()
|
||||
67
olmocr/bench/scripts/rotate_pdfs_random.sh
Executable file
67
olmocr/bench/scripts/rotate_pdfs_random.sh
Executable file
@ -0,0 +1,67 @@
|
||||
#!/bin/bash
#
# Rotate every PDF in a directory, in place, by a randomly chosen angle
# (90, 180 or 270 degrees) using qpdf.
#
# Usage: rotate_pdfs_random.sh [PDF_DIR]
#   PDF_DIR  Directory containing the PDFs to rotate. When omitted, the
#            default path below is used.
#
# Exits 1 if the directory does not exist, qpdf is unavailable, or the
# directory contains no PDFs. Individual rotation failures are counted and
# reported in the summary.

# Directory containing PDFs to rotate (optional first argument overrides the default)
PDF_DIR="${1:-/home/ubuntu/olmocr/olmOCR-bench-0825/bench_data/pdfs/rotated}"

# Check if directory exists
if [ ! -d "$PDF_DIR" ]; then
    echo "Error: Directory $PDF_DIR does not exist"
    exit 1
fi

# Check if qpdf is installed (preferred for PDF rotation)
if ! command -v qpdf &> /dev/null; then
    echo "qpdf is not installed. Installing..."
    sudo apt-get update && sudo apt-get install -y qpdf

    # The install may have failed (no sudo, no network, non-apt system);
    # stop here instead of failing once per PDF below.
    if ! command -v qpdf &> /dev/null; then
        echo "Error: qpdf could not be installed"
        exit 1
    fi
fi

# Counter for processed files
total=0
success=0
failed=0

# Candidate rotation angles; defined once rather than on every iteration
angles=(90 180 270)

echo "Processing PDFs in $PDF_DIR"
echo "----------------------------------------"

# Collect the PDFs up front. With nullglob an unmatched pattern expands to
# nothing, so "no PDFs" can be detected before the loop starts.
shopt -s nullglob
pdf_files=("$PDF_DIR"/*.pdf)
shopt -u nullglob

if [ ${#pdf_files[@]} -eq 0 ]; then
    echo "No PDF files found in $PDF_DIR"
    exit 1
fi

# Process each PDF file
for pdf_file in "${pdf_files[@]}"; do
    # Skip anything that is not a regular file (e.g. a directory named *.pdf)
    if [ ! -f "$pdf_file" ]; then
        continue
    fi

    # Get filename
    filename=$(basename "$pdf_file")

    # Randomly select rotation angle (90, 180, or 270)
    rotation=${angles[$RANDOM % ${#angles[@]}]}

    echo "Rotating $filename by $rotation degrees..."

    # Create temporary file for rotated PDF
    temp_file="${pdf_file}.tmp"

    # Rotate the PDF using qpdf; the leading "+" adds to any existing rotation
    if qpdf "$pdf_file" "$temp_file" "--rotate=+${rotation}"; then
        # Replace original with rotated version
        mv "$temp_file" "$pdf_file"
        echo " ✓ Successfully rotated $filename by $rotation degrees"
        # Plain assignment: ((success++)) returns status 1 when the old value
        # is 0, which would abort the script if 'set -e' were ever enabled.
        success=$((success + 1))
    else
        echo " ✗ Failed to rotate $filename"
        rm -f "$temp_file"
        failed=$((failed + 1))
    fi

    total=$((total + 1))
done

echo "----------------------------------------"
echo "Summary:"
echo " Total PDFs processed: $total"
echo " Successfully rotated: $success"
if [ "$failed" -gt 0 ]; then
    echo " Failed: $failed"
fi
|
||||
@ -10,15 +10,25 @@ set -e
|
||||
|
||||
# Parse command line arguments
|
||||
MODEL=""
|
||||
B200_MODE=""
|
||||
BENCH_BRANCH=""
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--model)
|
||||
MODEL="$2"
|
||||
shift 2
|
||||
;;
|
||||
--b200)
|
||||
B200_MODE="true"
|
||||
shift
|
||||
;;
|
||||
--benchbranch)
|
||||
BENCH_BRANCH="$2"
|
||||
shift 2
|
||||
;;
|
||||
*)
|
||||
echo "Unknown option: $1"
|
||||
echo "Usage: $0 [--model MODEL_NAME]"
|
||||
echo "Usage: $0 [--model MODEL_NAME] [--b200] [--benchbranch BRANCH_NAME]"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
@ -78,12 +88,27 @@ cat << 'EOF' > /tmp/run_benchmark_experiment.py
|
||||
import sys
|
||||
from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints, EnvVar
|
||||
|
||||
# Get image tag, beaker user, git branch, git hash, and optional model from command line
|
||||
# Get image tag, beaker user, git branch, git hash, optional model, b200 mode, and bench branch from command line
|
||||
image_tag = sys.argv[1]
|
||||
beaker_user = sys.argv[2]
|
||||
git_branch = sys.argv[3]
|
||||
git_hash = sys.argv[4]
|
||||
model = sys.argv[5] if len(sys.argv) > 5 else None
|
||||
model = None
|
||||
b200_mode = False
|
||||
bench_branch = None
|
||||
|
||||
# Parse remaining arguments
|
||||
arg_idx = 5
|
||||
while arg_idx < len(sys.argv):
|
||||
if sys.argv[arg_idx] == "--b200":
|
||||
b200_mode = True
|
||||
arg_idx += 1
|
||||
elif sys.argv[arg_idx] == "--benchbranch":
|
||||
bench_branch = sys.argv[arg_idx + 1]
|
||||
arg_idx += 2
|
||||
else:
|
||||
model = sys.argv[arg_idx]
|
||||
arg_idx += 1
|
||||
|
||||
# Initialize Beaker client
|
||||
b = Beaker.from_env(default_workspace="ai2/olmocr")
|
||||
@ -111,8 +136,14 @@ if has_aws_creds:
|
||||
"mkdir -p ~/.aws",
|
||||
'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials'
|
||||
])
|
||||
|
||||
# Build git clone command with optional branch
|
||||
git_clone_cmd = "git clone https://huggingface.co/datasets/allenai/olmOCR-bench"
|
||||
if bench_branch:
|
||||
git_clone_cmd += f" -b {bench_branch}"
|
||||
|
||||
commands.extend([
|
||||
"git clone https://huggingface.co/datasets/allenai/olmOCR-bench",
|
||||
git_clone_cmd,
|
||||
"cd olmOCR-bench && git lfs pull && cd ..",
|
||||
pipeline_cmd,
|
||||
"python olmocr/bench/scripts/workspace_to_bench.py localworkspace/ olmOCR-bench/bench_data/olmocr --bench-path ./olmOCR-bench/",
|
||||
@ -132,7 +163,7 @@ task_spec_args = {
|
||||
preemptible=True,
|
||||
),
|
||||
"resources": TaskResources(gpu_count=1),
|
||||
"constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
|
||||
"constraints": Constraints(cluster=["ai2/titan-cirrascale"] if b200_mode else ["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
|
||||
"result": ResultSpec(path="/noop-results"),
|
||||
}
|
||||
|
||||
@ -181,9 +212,9 @@ perf_task_spec_args = {
|
||||
priority=Priority.normal,
|
||||
preemptible=True,
|
||||
),
|
||||
# Need to reserve all 8 gpus for performance spec or else benchmark results can be off
|
||||
"resources": TaskResources(gpu_count=8),
|
||||
"constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
|
||||
# Need to reserve all 8 gpus for performance spec or else benchmark results can be off (1 for b200 mode)
|
||||
"resources": TaskResources(gpu_count=1 if b200_mode else 8),
|
||||
"constraints": Constraints(cluster=["ai2/titan-cirrascale"] if b200_mode else ["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
|
||||
"result": ResultSpec(path="/noop-results"),
|
||||
}
|
||||
|
||||
@ -208,13 +239,27 @@ EOF
|
||||
|
||||
# Run the Python script to create the experiments
|
||||
echo "Creating Beaker experiments..."
|
||||
|
||||
# Build command with appropriate arguments
|
||||
CMD="$PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH"
|
||||
|
||||
if [ -n "$MODEL" ]; then
|
||||
echo "Using model: $MODEL"
|
||||
$PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH "$MODEL"
|
||||
else
|
||||
$PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH
|
||||
CMD="$CMD $MODEL"
|
||||
fi
|
||||
|
||||
if [ -n "$B200_MODE" ]; then
|
||||
echo "Using B200 mode: ai2/titan-cirrascale cluster with 1 GPU for perf task"
|
||||
CMD="$CMD --b200"
|
||||
fi
|
||||
|
||||
if [ -n "$BENCH_BRANCH" ]; then
|
||||
echo "Using bench branch: $BENCH_BRANCH"
|
||||
CMD="$CMD --benchbranch $BENCH_BRANCH"
|
||||
fi
|
||||
|
||||
eval $CMD
|
||||
|
||||
# Clean up temporary file
|
||||
rm /tmp/run_benchmark_experiment.py
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user