mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-29 00:52:31 +00:00
Add some new rotation tests to a branch of the bench
This commit is contained in:
parent
5c6225b227
commit
55b7101d7e
80
olmocr/bench/scripts/rotate_pdfs.py
Executable file
80
olmocr/bench/scripts/rotate_pdfs.py
Executable file
@ -0,0 +1,80 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
import shutil
|
||||||
|
from pathlib import Path
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
def _load_pdf_groups(bench_data_dir):
    """Group benchmark entries from every ``*.jsonl`` file by their ``pdf`` field.

    Args:
        bench_data_dir: Directory whose top-level ``*.jsonl`` files are read.

    Returns:
        A ``defaultdict(list)`` mapping each ``pdf`` value to the list of
        decoded JSON objects that reference it, in file order.
    """
    pdf_groups = defaultdict(list)

    # Sorted so that the grouping order does not depend on directory listing order.
    for jsonl_file in sorted(bench_data_dir.glob("*.jsonl")):
        print(f"  Reading {jsonl_file}")
        with open(jsonl_file, "r", encoding="utf-8") as f:
            for line_number, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    # Blank lines are normal padding, not errors.
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    print(f"  Warning: skipping malformed JSON at {jsonl_file}:{line_number}")
                    continue

                # Valid JSON is not necessarily an object; only objects with a
                # non-empty string "pdf" can be grouped and later used as a path.
                pdf_name = data.get("pdf") if isinstance(data, dict) else None
                if isinstance(pdf_name, str) and pdf_name:
                    pdf_groups[pdf_name].append(data)

    return pdf_groups


def main(seed=None):
    """Pick a random 10% of the benchmark PDFs and stage them for rotation.

    Reads every ``*.jsonl`` file in ``./olmOCR-bench/bench_data``, groups the
    entries by their ``pdf`` field, randomly selects 10% of the groups (at
    least one), writes the selected entries to ``rotated.jsonl`` in the current
    directory and copies the matching PDFs into ``bench_data/pdfs/rotated``.

    Args:
        seed: Optional seed for a reproducible selection. When ``None`` the
            module-level ``random`` generator is used, as before.
    """
    # Set paths
    bench_data_dir = Path("./olmOCR-bench/bench_data")
    pdfs_dir = bench_data_dir / "pdfs"
    rotated_pdfs_dir = pdfs_dir / "rotated"
    output_jsonl = Path("rotated.jsonl")

    # Create rotated directory if it doesn't exist
    rotated_pdfs_dir.mkdir(parents=True, exist_ok=True)

    # Load all JSONL files and group by PDF
    print("Loading JSONL files...")
    pdf_groups = _load_pdf_groups(bench_data_dir)
    print(f"Found {len(pdf_groups)} unique PDF groups")

    if not pdf_groups:
        # random.sample() raises ValueError when asked for 1 item out of 0.
        print(f"No entries with a 'pdf' field found in {bench_data_dir}; nothing to do")
        return

    # Randomly select 10% of PDF groups (always at least one)
    num_to_select = max(1, int(len(pdf_groups) * 0.1))
    candidates = sorted(pdf_groups)
    sampler = random if seed is None else random.Random(seed)
    selected_pdfs = sampler.sample(candidates, num_to_select)

    print(f"Selected {num_to_select} PDF groups (10% of total)")

    # Write selected entries to rotated.jsonl
    # NOTE(review): entries are written unchanged, so their "pdf" value still
    # names the original file rather than the copy under rotated/ -- confirm
    # that this is what the downstream benchmark expects.
    print(f"Writing selected entries to {output_jsonl}")
    with open(output_jsonl, "w", encoding="utf-8") as f:
        for pdf_name in selected_pdfs:
            for entry in pdf_groups[pdf_name]:
                f.write(json.dumps(entry) + "\n")

    # Copy corresponding PDF files
    print("Copying PDF files to rotated directory...")
    copied_count = 0
    missing_count = 0
    used_destinations = set()

    for pdf_name in selected_pdfs:
        print(pdf_name)
        source_path = pdfs_dir / pdf_name

        # is_file() rather than exists(): a directory cannot be copied with copy2.
        if not source_path.is_file():
            print(f"  Warning: PDF not found: {pdf_name}")
            missing_count += 1
            continue

        # Copies are flattened into one directory, so PDFs from different
        # subdirectories that share a file name end up at the same destination.
        dest_path = rotated_pdfs_dir / Path(pdf_name).name
        if dest_path in used_destinations:
            print(f"  Warning: {dest_path} already written by another PDF; overwriting")
        used_destinations.add(dest_path)

        print(f"  Copying {source_path} -> {dest_path}")
        shutil.copy2(source_path, dest_path)
        copied_count += 1

    print("\nSummary:")
    print(f"  Total PDF groups: {len(pdf_groups)}")
    print(f"  Selected groups: {num_to_select}")
    print(f"  PDFs copied: {copied_count}")
    if missing_count > 0:
        print(f"  PDFs not found: {missing_count}")
    print(f"  Output JSONL: {output_jsonl}")
    print(f"  Rotated PDFs directory: {rotated_pdfs_dir}")


if __name__ == "__main__":
    main()
|
||||||
67
olmocr/bench/scripts/rotate_pdfs_random.sh
Executable file
67
olmocr/bench/scripts/rotate_pdfs_random.sh
Executable file
@ -0,0 +1,67 @@
|
|||||||
|
#!/bin/bash
#
# Rotate every PDF in a directory, in place, by a randomly chosen angle of
# 90, 180 or 270 degrees, using qpdf.
#
# Usage: rotate_pdfs_random.sh [PDF_DIR]
#   PDF_DIR  Directory containing the PDFs to rotate. Defaults to the path
#            below when omitted.

# Directory containing PDFs to rotate (first argument overrides the default)
PDF_DIR="${1:-/home/ubuntu/olmocr/olmOCR-bench-0825/bench_data/pdfs/rotated}"

# Check if directory exists
if [ ! -d "$PDF_DIR" ]; then
    echo "Error: Directory $PDF_DIR does not exist"
    exit 1
fi

# Check if qpdf is installed (preferred for PDF rotation)
if ! command -v qpdf &> /dev/null; then
    echo "qpdf is not installed. Installing..."
    # Stop here if the install fails; otherwise every rotation below would fail.
    if ! { sudo apt-get update && sudo apt-get install -y qpdf; }; then
        echo "Error: failed to install qpdf"
        exit 1
    fi
fi

# Counter for processed files
total=0
success=0
failed=0

# Candidate rotation angles; defined once, outside the loop
angles=(90 180 270)

echo "Processing PDFs in $PDF_DIR"
echo "----------------------------------------"

# Collect the PDFs up front. nullglob makes an unmatched pattern expand to
# nothing instead of the literal "*.pdf" string.
shopt -s nullglob
pdf_files=("$PDF_DIR"/*.pdf)
shopt -u nullglob

# Check if any PDF files exist
if [ ${#pdf_files[@]} -eq 0 ]; then
    echo "No PDF files found in $PDF_DIR"
    exit 1
fi

# Process each PDF file
for pdf_file in "${pdf_files[@]}"; do
    # Skip anything that merely looks like a PDF (e.g. a directory named x.pdf)
    if [ ! -f "$pdf_file" ]; then
        continue
    fi

    # Get filename
    filename=$(basename "$pdf_file")

    # Randomly select rotation angle (90, 180, or 270)
    rotation=${angles[$RANDOM % ${#angles[@]}]}

    echo "Rotating $filename by $rotation degrees..."

    # Create temporary file for rotated PDF
    temp_file="${pdf_file}.tmp"

    # Rotate the PDF using qpdf
    qpdf "$pdf_file" "$temp_file" --rotate="+$rotation"
    status=$?

    # qpdf exits with 3 when it finished with warnings but no errors; the
    # output file is still produced in that case, so treat it as success.
    if [ "$status" -eq 0 ] || { [ "$status" -eq 3 ] && [ -s "$temp_file" ]; }; then
        # Replace original with rotated version
        if mv "$temp_file" "$pdf_file"; then
            echo "  ✓ Successfully rotated $filename by $rotation degrees"
            success=$((success + 1))
        else
            echo "  ✗ Failed to rotate $filename"
            rm -f "$temp_file"
            failed=$((failed + 1))
        fi
    else
        echo "  ✗ Failed to rotate $filename"
        rm -f "$temp_file"
        failed=$((failed + 1))
    fi

    total=$((total + 1))
done

echo "----------------------------------------"
echo "Summary:"
echo "  Total PDFs processed: $total"
echo "  Successfully rotated: $success"
if [ "$failed" -gt 0 ]; then
    echo "  Failed: $failed"
fi
|
||||||
@ -10,15 +10,25 @@ set -e
|
|||||||
|
|
||||||
# Parse command line arguments
|
# Parse command line arguments
|
||||||
MODEL=""
|
MODEL=""
|
||||||
|
B200_MODE=""
|
||||||
|
BENCH_BRANCH=""
|
||||||
while [[ $# -gt 0 ]]; do
|
while [[ $# -gt 0 ]]; do
|
||||||
case $1 in
|
case $1 in
|
||||||
--model)
|
--model)
|
||||||
MODEL="$2"
|
MODEL="$2"
|
||||||
shift 2
|
shift 2
|
||||||
;;
|
;;
|
||||||
|
--b200)
|
||||||
|
B200_MODE="true"
|
||||||
|
shift
|
||||||
|
;;
|
||||||
|
--benchbranch)
|
||||||
|
BENCH_BRANCH="$2"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
*)
|
*)
|
||||||
echo "Unknown option: $1"
|
echo "Unknown option: $1"
|
||||||
echo "Usage: $0 [--model MODEL_NAME]"
|
echo "Usage: $0 [--model MODEL_NAME] [--b200] [--benchbranch BRANCH_NAME]"
|
||||||
exit 1
|
exit 1
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
@ -78,12 +88,27 @@ cat << 'EOF' > /tmp/run_benchmark_experiment.py
|
|||||||
import sys
|
import sys
|
||||||
from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints, EnvVar
|
from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints, EnvVar
|
||||||
|
|
||||||
# Get image tag, beaker user, git branch, git hash, and optional model from command line
|
# Get image tag, beaker user, git branch, git hash, optional model, b200 mode, and bench branch from command line
|
||||||
image_tag = sys.argv[1]
|
image_tag = sys.argv[1]
|
||||||
beaker_user = sys.argv[2]
|
beaker_user = sys.argv[2]
|
||||||
git_branch = sys.argv[3]
|
git_branch = sys.argv[3]
|
||||||
git_hash = sys.argv[4]
|
git_hash = sys.argv[4]
|
||||||
model = sys.argv[5] if len(sys.argv) > 5 else None
|
model = None
|
||||||
|
b200_mode = False
|
||||||
|
bench_branch = None
|
||||||
|
|
||||||
|
# Parse remaining arguments
|
||||||
|
arg_idx = 5
|
||||||
|
while arg_idx < len(sys.argv):
|
||||||
|
if sys.argv[arg_idx] == "--b200":
|
||||||
|
b200_mode = True
|
||||||
|
arg_idx += 1
|
||||||
|
elif sys.argv[arg_idx] == "--benchbranch":
|
||||||
|
bench_branch = sys.argv[arg_idx + 1]
|
||||||
|
arg_idx += 2
|
||||||
|
else:
|
||||||
|
model = sys.argv[arg_idx]
|
||||||
|
arg_idx += 1
|
||||||
|
|
||||||
# Initialize Beaker client
|
# Initialize Beaker client
|
||||||
b = Beaker.from_env(default_workspace="ai2/olmocr")
|
b = Beaker.from_env(default_workspace="ai2/olmocr")
|
||||||
@ -111,8 +136,14 @@ if has_aws_creds:
|
|||||||
"mkdir -p ~/.aws",
|
"mkdir -p ~/.aws",
|
||||||
'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials'
|
'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials'
|
||||||
])
|
])
|
||||||
|
|
||||||
|
# Build git clone command with optional branch
|
||||||
|
git_clone_cmd = "git clone https://huggingface.co/datasets/allenai/olmOCR-bench"
|
||||||
|
if bench_branch:
|
||||||
|
git_clone_cmd += f" -b {bench_branch}"
|
||||||
|
|
||||||
commands.extend([
|
commands.extend([
|
||||||
"git clone https://huggingface.co/datasets/allenai/olmOCR-bench",
|
git_clone_cmd,
|
||||||
"cd olmOCR-bench && git lfs pull && cd ..",
|
"cd olmOCR-bench && git lfs pull && cd ..",
|
||||||
pipeline_cmd,
|
pipeline_cmd,
|
||||||
"python olmocr/bench/scripts/workspace_to_bench.py localworkspace/ olmOCR-bench/bench_data/olmocr --bench-path ./olmOCR-bench/",
|
"python olmocr/bench/scripts/workspace_to_bench.py localworkspace/ olmOCR-bench/bench_data/olmocr --bench-path ./olmOCR-bench/",
|
||||||
@ -132,7 +163,7 @@ task_spec_args = {
|
|||||||
preemptible=True,
|
preemptible=True,
|
||||||
),
|
),
|
||||||
"resources": TaskResources(gpu_count=1),
|
"resources": TaskResources(gpu_count=1),
|
||||||
"constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
|
"constraints": Constraints(cluster=["ai2/titan-cirrascale"] if b200_mode else ["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
|
||||||
"result": ResultSpec(path="/noop-results"),
|
"result": ResultSpec(path="/noop-results"),
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -181,9 +212,9 @@ perf_task_spec_args = {
|
|||||||
priority=Priority.normal,
|
priority=Priority.normal,
|
||||||
preemptible=True,
|
preemptible=True,
|
||||||
),
|
),
|
||||||
# Need to reserve all 8 gpus for performance spec or else benchmark results can be off
|
# Need to reserve all 8 gpus for performance spec or else benchmark results can be off (1 for b200 mode)
|
||||||
"resources": TaskResources(gpu_count=8),
|
"resources": TaskResources(gpu_count=1 if b200_mode else 8),
|
||||||
"constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
|
"constraints": Constraints(cluster=["ai2/titan-cirrascale"] if b200_mode else ["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
|
||||||
"result": ResultSpec(path="/noop-results"),
|
"result": ResultSpec(path="/noop-results"),
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -208,13 +239,27 @@ EOF
|
|||||||
|
|
||||||
# Run the Python script to create the experiments
|
# Run the Python script to create the experiments
|
||||||
echo "Creating Beaker experiments..."
|
echo "Creating Beaker experiments..."
|
||||||
|
|
||||||
|
# Build command with appropriate arguments
|
||||||
|
CMD="$PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH"
|
||||||
|
|
||||||
if [ -n "$MODEL" ]; then
|
if [ -n "$MODEL" ]; then
|
||||||
echo "Using model: $MODEL"
|
echo "Using model: $MODEL"
|
||||||
$PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH "$MODEL"
|
CMD="$CMD $MODEL"
|
||||||
else
|
|
||||||
$PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ -n "$B200_MODE" ]; then
|
||||||
|
echo "Using B200 mode: ai2/titan-cirrascale cluster with 1 GPU for perf task"
|
||||||
|
CMD="$CMD --b200"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -n "$BENCH_BRANCH" ]; then
|
||||||
|
echo "Using bench branch: $BENCH_BRANCH"
|
||||||
|
CMD="$CMD --benchbranch $BENCH_BRANCH"
|
||||||
|
fi
|
||||||
|
|
||||||
|
eval $CMD
|
||||||
|
|
||||||
# Clean up temporary file
|
# Clean up temporary file
|
||||||
rm /tmp/run_benchmark_experiment.py
|
rm /tmp/run_benchmark_experiment.py
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user