From acc2687f214f5fa2642a0b6001285c933691d963 Mon Sep 17 00:00:00 2001 From: aman-17 Date: Wed, 28 May 2025 14:35:23 -0700 Subject: [PATCH 01/13] Updated dockerfile and added a file --- Dockerfile | 3 + olmocr/bench/workspace_to_bench.py | 228 +++++++++++++++++++++++++++++ 2 files changed, 231 insertions(+) create mode 100644 olmocr/bench/workspace_to_bench.py diff --git a/Dockerfile b/Dockerfile index 856b7c8..472c708 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,6 +16,7 @@ RUN apt-get update -y && apt-get install -y --no-install-recommends \ ca-certificates \ build-essential \ curl \ + wget \ unzip RUN rm -rf /var/lib/apt/lists/* \ @@ -41,8 +42,10 @@ RUN playwright install-deps RUN playwright install chromium COPY olmocr olmocr +COPY scripts scripts WORKDIR /root COPY olmocr olmocr +COPY scripts scripts RUN python3 -m sglang.launch_server --help RUN python3 -m olmocr.pipeline --help \ No newline at end of file diff --git a/olmocr/bench/workspace_to_bench.py b/olmocr/bench/workspace_to_bench.py new file mode 100644 index 0000000..ce13969 --- /dev/null +++ b/olmocr/bench/workspace_to_bench.py @@ -0,0 +1,228 @@ +""" +Convert JSONL files to Markdown files and handle missing PDFs +Usage: + python workspace_to_benchmark.py localworkspace ./markdown_output --bench-path ../olmOCR-bench/ +""" + +import json +import sys +import argparse +from pathlib import Path +from collections import defaultdict + + +def load_jsonl_files(input_dir): + """Load all JSONL files from the input directory.""" + jsonl_files = list(Path(input_dir).glob("*.jsonl")) + if not jsonl_files: + print(f"No JSONL files found in {input_dir}") + return [] + + print(f"Found {len(jsonl_files)} JSONL files: {[f.name for f in jsonl_files]}") + return jsonl_files + + +def parse_jsonl_entries(jsonl_files): + """Parse all JSONL files and extract entries with text and metadata.""" + all_entries = [] + pdf_sources = set() + + for jsonl_file in jsonl_files: + print(f"Processing {jsonl_file.name}...") + + with open(jsonl_file, 'r', encoding='utf-8') as f: + for line_num, line in enumerate(f, 1): + line = line.strip() + if not line: + continue + + try: + entry = json.loads(line) + text = entry.get('text', '') + metadata = entry.get('metadata', {}) + source_file = metadata.get('Source-File', '') + + if source_file: + pdf_sources.add(source_file) + + all_entries.append({ + 'text': text, + 'source_file': source_file, + 'metadata': metadata, + 'entry': entry + }) + + except json.JSONDecodeError as e: + print(f"Error parsing line {line_num} in {jsonl_file.name}: {e}") + continue + + print(f"Loaded {len(all_entries)} entries from JSONL files") + print(f"Found {len(pdf_sources)} unique PDF sources") + + return all_entries, pdf_sources + + +def get_subdir_and_pdf_name(source_file_path): + """Extract subdirectory and PDF filename from source file path.""" + if not source_file_path: + return None, None + + path_parts = Path(source_file_path).parts + + try: + pdfs_index = path_parts.index('pdfs') + if pdfs_index + 1 < len(path_parts): + subdir = path_parts[pdfs_index + 1] + pdf_name = Path(source_file_path).stem + return subdir, pdf_name + except ValueError: + pass + + return None, None + + +def create_markdown_files(entries, output_dir): + """Create markdown files from JSONL entries in subdir/{pdf_name}.md format.""" + output_path = Path(output_dir) + + subdir_pdf_to_entries = defaultdict(list) + + for entry in entries: + subdir, pdf_name = get_subdir_and_pdf_name(entry['source_file']) + if subdir and pdf_name: + key = (subdir, pdf_name) + subdir_pdf_to_entries[key].append(entry) + + created_files = set() + + for (subdir, pdf_name), pdf_entries in subdir_pdf_to_entries.items(): + subdir_path = output_path / subdir + subdir_path.mkdir(parents=True, exist_ok=True) + + md_filename = f"{pdf_name}_pg1_repeat1.md" + md_filepath = subdir_path / md_filename + combined_text = [] + + for entry in pdf_entries: + text = entry['text'] + if text.strip(): + source_file = entry['source_file'] + combined_text.append(text) + + with open(md_filepath, 'w', encoding='utf-8') as f: + f.write('\n'.join(combined_text)) + + created_files.add((subdir, pdf_name)) + print(f"Created: {subdir}/{md_filename}_pg1_repeat1") + + print(f"Created {len(created_files)} markdown files from JSONL data") + return created_files + + +def find_missing_pdfs(pdf_sources, created_files, base_bench_path): + """Find PDFs that exist in directories but are missing from JSONL data.""" + subdirs = set() + + for source_file in pdf_sources: + if not source_file: + continue + + subdir, _ = get_subdir_and_pdf_name(source_file) + if subdir: + subdirs.add(subdir) + + print(f"Found PDF subdirectories: {sorted(subdirs)}") + + missing_pdfs = [] + + for subdir in subdirs: + pdf_dir = Path(base_bench_path) / "bench_data" / "pdfs" / subdir + + if not pdf_dir.exists(): + print(f"Warning: Directory {pdf_dir} does not exist") + continue + + pdf_files = list(pdf_dir.glob("*.pdf")) + print(f"Found {len(pdf_files)} PDF files in {subdir}/") + + for pdf_file in pdf_files: + pdf_name = pdf_file.stem + + if (subdir, pdf_name) not in created_files: + missing_pdfs.append({ + 'pdf_name': pdf_name, + 'full_path': pdf_file, + 'subdir': subdir + }) + + print(f"Found {len(missing_pdfs)} missing PDFs") + return missing_pdfs + + +def create_blank_markdown_files(missing_pdfs, output_dir): + """Create blank markdown files for missing PDFs in subdir/{pdf_name}.md format.""" + output_path = Path(output_dir) + + for missing_pdf in missing_pdfs: + subdir = missing_pdf['subdir'] + pdf_name = missing_pdf['pdf_name'] + + subdir_path = output_path / subdir + subdir_path.mkdir(parents=True, exist_ok=True) + + md_filename = f"{pdf_name}_pg1_repeat1.md" + md_filepath = subdir_path / md_filename + + content = "" + + with open(md_filepath, 'w', encoding='utf-8') as f: + f.write(content) + + print(f"Created blank: {subdir}/{md_filename}_pg1_repeat1") + + print(f"Created {len(missing_pdfs)} blank markdown files for missing PDFs") + + +def main(): + parser = argparse.ArgumentParser(description="Convert JSONL files to Markdown and handle missing PDFs") + parser.add_argument("workspace_dir", help="Your workspace directory") + parser.add_argument("output_dir", nargs='?', default="./markdown_output", + help="Output directory for markdown files (default: ./markdown_output)") + parser.add_argument("--bench-path", default="../olmOCR-bench", + help="Path to olmOCR-bench directory (default: ../olmOCR-bench)") + + args = parser.parse_args() + input_dir = args.workspace_dir + "/results" + input_dir = Path(input_dir) + output_dir = Path(args.output_dir) + bench_path = Path(args.bench_path) + + if not input_dir.exists(): + print(f"Error: Input directory {input_dir} does not exist") + sys.exit(1) + + jsonl_files = load_jsonl_files(input_dir) + if not jsonl_files: + sys.exit(1) + + entries, pdf_sources = parse_jsonl_entries(jsonl_files) + if not entries: + print("No entries found in JSONL files") + sys.exit(1) + + created_files = create_markdown_files(entries, output_dir) + + missing_pdfs = find_missing_pdfs(pdf_sources, created_files, bench_path) + + if missing_pdfs: + create_blank_markdown_files(missing_pdfs, output_dir) + + print(f"\nSummary:") + print(f"Created {len(created_files)} markdown files from JSONL data") + print(f"Created {len(missing_pdfs)} blank markdown files for missing PDFs") + print(f"Total markdown files: {len(created_files) + len(missing_pdfs)}") + print(f"Output directory: {output_dir.absolute()}") + + +if __name__ == "__main__": + main() From cd5db7f281ed3e4ef40e5ba9bd86258669743187 Mon Sep 17 00:00:00 2001 From: aman-17 Date: Wed, 28 May 2025 14:42:07 -0700 Subject: [PATCH 02/13] fixed style and lint --- olmocr/bench/workspace_to_bench.py | 141 +++++++++++++---------------- 1 file changed, 65 insertions(+), 76 deletions(-) diff --git a/olmocr/bench/workspace_to_bench.py b/olmocr/bench/workspace_to_bench.py index ce13969..19fa4ae 100644 --- a/olmocr/bench/workspace_to_bench.py +++ b/olmocr/bench/workspace_to_bench.py @@ -4,11 +4,11 @@ Usage: python workspace_to_benchmark.py localworkspace ./markdown_output --bench-path ../olmOCR-bench/ """ +import argparse import json import sys -import argparse -from pathlib import Path from collections import defaultdict +from pathlib import Path def load_jsonl_files(input_dir): @@ -17,7 +17,7 @@ def load_jsonl_files(input_dir): if not jsonl_files: print(f"No JSONL files found in {input_dir}") return [] - + print(f"Found {len(jsonl_files)} JSONL files: {[f.name for f in jsonl_files]}") return jsonl_files @@ -26,39 +26,34 @@ def parse_jsonl_entries(jsonl_files): """Parse all JSONL files and extract entries with text and metadata.""" all_entries = [] pdf_sources = set() - + for jsonl_file in jsonl_files: print(f"Processing {jsonl_file.name}...") - - with open(jsonl_file, 'r', encoding='utf-8') as f: + + with open(jsonl_file, "r", encoding="utf-8") as f: for line_num, line in enumerate(f, 1): line = line.strip() if not line: continue - + try: entry = json.loads(line) - text = entry.get('text', '') - metadata = entry.get('metadata', {}) - source_file = metadata.get('Source-File', '') - + text = entry.get("text", "") + metadata = entry.get("metadata", {}) + source_file = metadata.get("Source-File", "") + if source_file: pdf_sources.add(source_file) - - all_entries.append({ - 'text': text, - 'source_file': source_file, - 'metadata': metadata, - 'entry': entry - }) - + + all_entries.append({"text": text, "source_file": source_file, "metadata": metadata, "entry": entry}) + except json.JSONDecodeError as e: print(f"Error parsing line {line_num} in {jsonl_file.name}: {e}") continue - + print(f"Loaded {len(all_entries)} entries from JSONL files") print(f"Found {len(pdf_sources)} unique PDF sources") - + return all_entries, pdf_sources @@ -68,33 +63,33 @@ def get_subdir_and_pdf_name(source_file_path): return None, None path_parts = Path(source_file_path).parts - + try: - pdfs_index = path_parts.index('pdfs') + pdfs_index = path_parts.index("pdfs") if pdfs_index + 1 < len(path_parts): subdir = path_parts[pdfs_index + 1] - pdf_name = Path(source_file_path).stem + pdf_name = Path(source_file_path).stem return subdir, pdf_name except ValueError: pass - + return None, None def create_markdown_files(entries, output_dir): """Create markdown files from JSONL entries in subdir/{pdf_name}.md format.""" output_path = Path(output_dir) - + subdir_pdf_to_entries = defaultdict(list) - + for entry in entries: - subdir, pdf_name = get_subdir_and_pdf_name(entry['source_file']) + subdir, pdf_name = get_subdir_and_pdf_name(entry["source_file"]) if subdir and pdf_name: key = (subdir, pdf_name) subdir_pdf_to_entries[key].append(entry) - + created_files = set() - + for (subdir, pdf_name), pdf_entries in subdir_pdf_to_entries.items(): subdir_path = output_path / subdir subdir_path.mkdir(parents=True, exist_ok=True) @@ -102,19 +97,19 @@ def create_markdown_files(entries, output_dir): md_filename = f"{pdf_name}_pg1_repeat1.md" md_filepath = subdir_path / md_filename combined_text = [] - + for entry in pdf_entries: - text = entry['text'] + text = entry["text"] if text.strip(): - source_file = entry['source_file'] + source_file = entry["source_file"] combined_text.append(text) - - with open(md_filepath, 'w', encoding='utf-8') as f: - f.write('\n'.join(combined_text)) - + + with open(md_filepath, "w", encoding="utf-8") as f: + f.write("\n".join(combined_text)) + created_files.add((subdir, pdf_name)) print(f"Created: {subdir}/{md_filename}_pg1_repeat1") - + print(f"Created {len(created_files)} markdown files from JSONL data") return created_files @@ -122,39 +117,35 @@ def create_markdown_files(entries, output_dir): def find_missing_pdfs(pdf_sources, created_files, base_bench_path): """Find PDFs that exist in directories but are missing from JSONL data.""" subdirs = set() - + for source_file in pdf_sources: if not source_file: continue - + subdir, _ = get_subdir_and_pdf_name(source_file) if subdir: subdirs.add(subdir) - + print(f"Found PDF subdirectories: {sorted(subdirs)}") - + missing_pdfs = [] - + for subdir in subdirs: pdf_dir = Path(base_bench_path) / "bench_data" / "pdfs" / subdir - + if not pdf_dir.exists(): print(f"Warning: Directory {pdf_dir} does not exist") continue - + pdf_files = list(pdf_dir.glob("*.pdf")) print(f"Found {len(pdf_files)} PDF files in {subdir}/") - + for pdf_file in pdf_files: - pdf_name = pdf_file.stem - + pdf_name = pdf_file.stem + if (subdir, pdf_name) not in created_files: - missing_pdfs.append({ - 'pdf_name': pdf_name, - 'full_path': pdf_file, - 'subdir': subdir - }) - + missing_pdfs.append({"pdf_name": pdf_name, "full_path": pdf_file, "subdir": subdir}) + print(f"Found {len(missing_pdfs)} missing PDFs") return missing_pdfs @@ -162,61 +153,59 @@ def find_missing_pdfs(pdf_sources, created_files, base_bench_path): def create_blank_markdown_files(missing_pdfs, output_dir): """Create blank markdown files for missing PDFs in subdir/{pdf_name}.md format.""" output_path = Path(output_dir) - + for missing_pdf in missing_pdfs: - subdir = missing_pdf['subdir'] - pdf_name = missing_pdf['pdf_name'] - + subdir = missing_pdf["subdir"] + pdf_name = missing_pdf["pdf_name"] + subdir_path = output_path / subdir subdir_path.mkdir(parents=True, exist_ok=True) - + md_filename = f"{pdf_name}_pg1_repeat1.md" md_filepath = subdir_path / md_filename - + content = "" - - with open(md_filepath, 'w', encoding='utf-8') as f: + + with open(md_filepath, "w", encoding="utf-8") as f: f.write(content) - + print(f"Created blank: {subdir}/{md_filename}_pg1_repeat1") - + print(f"Created {len(missing_pdfs)} blank markdown files for missing PDFs") def main(): parser = argparse.ArgumentParser(description="Convert JSONL files to Markdown and handle missing PDFs") parser.add_argument("workspace_dir", help="Your workspace directory") - parser.add_argument("output_dir", nargs='?', default="./markdown_output", - help="Output directory for markdown files (default: ./markdown_output)") - parser.add_argument("--bench-path", default="../olmOCR-bench", - help="Path to olmOCR-bench directory (default: ../olmOCR-bench)") - + parser.add_argument("output_dir", nargs="?", default="./markdown_output", help="Output directory for markdown files (default: ./markdown_output)") + parser.add_argument("--bench-path", default="../olmOCR-bench", help="Path to olmOCR-bench directory (default: ../olmOCR-bench)") + args = parser.parse_args() input_dir = args.workspace_dir + "/results" input_dir = Path(input_dir) output_dir = Path(args.output_dir) bench_path = Path(args.bench_path) - + if not input_dir.exists(): print(f"Error: Input directory {input_dir} does not exist") sys.exit(1) - + jsonl_files = load_jsonl_files(input_dir) if not jsonl_files: sys.exit(1) - + entries, pdf_sources = parse_jsonl_entries(jsonl_files) if not entries: print("No entries found in JSONL files") sys.exit(1) - + created_files = create_markdown_files(entries, output_dir) - + missing_pdfs = find_missing_pdfs(pdf_sources, created_files, bench_path) - + if missing_pdfs: create_blank_markdown_files(missing_pdfs, output_dir) - + print(f"\nSummary:") print(f"Created {len(created_files)} markdown files from JSONL data") print(f"Created {len(missing_pdfs)} blank markdown files for missing PDFs") From 8a6309366389523c7c7d8faa39fc15f9d5aefa64 Mon Sep 17 00:00:00 2001 From: aman-17 Date: Wed, 28 May 2025 14:45:07 -0700 Subject: [PATCH 03/13] fixed lint --- olmocr/bench/workspace_to_bench.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/olmocr/bench/workspace_to_bench.py b/olmocr/bench/workspace_to_bench.py index 19fa4ae..3f61b4f 100644 --- a/olmocr/bench/workspace_to_bench.py +++ b/olmocr/bench/workspace_to_bench.py @@ -101,7 +101,7 @@ def create_markdown_files(entries, output_dir): for entry in pdf_entries: text = entry["text"] if text.strip(): - source_file = entry["source_file"] + # source_file = entry["source_file"] combined_text.append(text) with open(md_filepath, "w", encoding="utf-8") as f: @@ -206,7 +206,7 @@ def main(): if missing_pdfs: create_blank_markdown_files(missing_pdfs, output_dir) - print(f"\nSummary:") + print("\nSummary:") print(f"Created {len(created_files)} markdown files from JSONL data") print(f"Created {len(missing_pdfs)} blank markdown files for missing PDFs") print(f"Total markdown files: {len(created_files) + len(missing_pdfs)}") From ce616c6514211f49571f5249eb8230a590b2f386 Mon Sep 17 00:00:00 2001 From: aman-17 Date: Wed, 28 May 2025 19:01:01 -0700 Subject: [PATCH 04/13] addressed Jake's comments --- Dockerfile | 1 - olmocr/bench/{ => scripts}/workspace_to_bench.py | 0 2 files changed, 1 deletion(-) rename olmocr/bench/{ => scripts}/workspace_to_bench.py (100%) diff --git a/Dockerfile b/Dockerfile index 472c708..e51ef36 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,7 +45,6 @@ COPY olmocr olmocr COPY scripts scripts WORKDIR /root COPY olmocr olmocr -COPY scripts scripts RUN python3 -m sglang.launch_server --help RUN python3 -m olmocr.pipeline --help \ No newline at end of file diff --git a/olmocr/bench/workspace_to_bench.py b/olmocr/bench/scripts/workspace_to_bench.py similarity index 100% rename from olmocr/bench/workspace_to_bench.py rename to olmocr/bench/scripts/workspace_to_bench.py From 8347e384fd0e46584fce508efd9caaf962244436 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 29 May 2025 16:12:06 +0000 Subject: [PATCH 05/13] I think this fixes up the docker file --- Dockerfile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index e51ef36..3d5786e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -31,6 +31,7 @@ ADD --chmod=755 https://astral.sh/uv/install.sh /install.sh RUN /install.sh && rm /install.sh ENV PYTHONUNBUFFERED=1 + WORKDIR /root COPY pyproject.toml pyproject.toml COPY olmocr/version.py olmocr/version.py @@ -41,10 +42,7 @@ RUN /root/.local/bin/uv pip install --system --no-cache ".[bench]" RUN playwright install-deps RUN playwright install chromium COPY olmocr olmocr - COPY scripts scripts -WORKDIR /root -COPY olmocr olmocr RUN python3 -m sglang.launch_server --help RUN python3 -m olmocr.pipeline --help \ No newline at end of file From 475cc1c3a42c678b79d021132662893bf5837d7a Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 29 May 2025 17:08:05 +0000 Subject: [PATCH 06/13] Working on runner script --- scripts/run_benchmark.sh | 73 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100755 scripts/run_benchmark.sh diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh new file mode 100755 index 0000000..2cffdfc --- /dev/null +++ b/scripts/run_benchmark.sh @@ -0,0 +1,73 @@ +#!/bin/bash + +set -e + +# Get version from version.py +VERSION=$(python -c 'import olmocr.version; print(olmocr.version.VERSION)') +echo "OlmOCR version: $VERSION" + +# Get first 10 characters of git hash +GIT_HASH=$(git rev-parse HEAD | cut -c1-10) +echo "Git hash: $GIT_HASH" + +# Create full image tag +IMAGE_TAG="olmocr-benchmark-${VERSION}-${GIT_HASH}" +echo "Building Docker image with tag: $IMAGE_TAG" + +# Build the Docker image +echo "Building Docker image..." +docker build --platform linux/amd64 -f ./Dockerfile -t $IMAGE_TAG . + +# Push image to beaker +echo "Pushing image to Beaker..." +beaker image create --workspace ai2/oe-data-pdf --name $IMAGE_TAG $IMAGE_TAG + +# Create Python script to run beaker experiment +cat << 'EOF' > /tmp/run_benchmark_experiment.py +import sys +from beaker import Beaker, ExperimentSpec, TaskSpec, TaskResources, ImageSource, Priority, Constraints + +# Get image tag from command line +image_tag = sys.argv[1] + +# Initialize Beaker client +b = Beaker.from_env(default_workspace="ai2/oe-data-pdf") + +# Create experiment spec +experiment_spec = ExperimentSpec( + description="OlmOCR Benchmark Run", + budget="ai2/oe-data", + tasks=[ + TaskSpec( + name="olmocr-benchmark", + image=ImageSource(beaker=f"ai2/oe-data-pdf/{image_tag}"), + command=[ + "bash", "-c", + " && ".join([ + "huggingface-cli download --repo-type dataset --resume-download allenai/olmOCR-bench --local-dir ./olmOCR-bench", + "python -m olmocr.pipeline ./localworkspace --markdown --pdfs './olmOCR-bench/bench_data/pdfs/**/*.pdf'", + "python olmocr/bench/scripts/workspace_to_bench.py localworkspace/ olmOCR-bench/bench_data/markdown_output --bench-path ./olmOCR-bench/", + "python -m olmocr.bench.benchmark --dir ./olmOCR-bench/bench_data" + ]) + ], + resources=TaskResources(gpu_count=1), + #constraints=Constraint(cluster=["ai2/pluto-cirrascale", "ai2/jupiter-cirrascale"]), + priority=Priority.normal, + ) + ], +) + +# Create the experiment +experiment = b.experiment.create(spec=experiment_spec, workspace="ai2/oe-data-pdf") +print(f"Created experiment: {experiment.id}") +print(f"View at: https://beaker.org/ex/{experiment.id}") +EOF + +# Run the Python script to create the experiment +echo "Creating Beaker experiment..." +python /tmp/run_benchmark_experiment.py $IMAGE_TAG + +# Clean up temporary file +rm /tmp/run_benchmark_experiment.py + +echo "Benchmark experiment submitted successfully!" \ No newline at end of file From ff31faebe4c364a2733e13aaabb6fcae485a4f74 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 29 May 2025 17:12:41 +0000 Subject: [PATCH 07/13] Runner improvements --- scripts/run_benchmark.sh | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh index 2cffdfc..7072147 100755 --- a/scripts/run_benchmark.sh +++ b/scripts/run_benchmark.sh @@ -2,8 +2,17 @@ set -e +# Use conda environment Python if available, otherwise use system Python +if [ -n "$CONDA_PREFIX" ]; then + PYTHON="$CONDA_PREFIX/bin/python" + echo "Using conda Python from: $CONDA_PREFIX" +else + PYTHON="python" + echo "Warning: No conda environment detected, using system Python" +fi + # Get version from version.py -VERSION=$(python -c 'import olmocr.version; print(olmocr.version.VERSION)') +VERSION=$($PYTHON -c 'import olmocr.version; print(olmocr.version.VERSION)') echo "OlmOCR version: $VERSION" # Get first 10 characters of git hash @@ -25,7 +34,7 @@ beaker image create --workspace ai2/oe-data-pdf --name $IMAGE_TAG $IMAGE_TAG # Create Python script to run beaker experiment cat << 'EOF' > /tmp/run_benchmark_experiment.py import sys -from beaker import Beaker, ExperimentSpec, TaskSpec, TaskResources, ImageSource, Priority, Constraints +from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints # Get image tag from command line image_tag = sys.argv[1] @@ -50,9 +59,13 @@ experiment_spec = ExperimentSpec( "python -m olmocr.bench.benchmark --dir ./olmOCR-bench/bench_data" ]) ], + context=TaskContext( + priority=Priority.normal, + preemptible=True, + ), resources=TaskResources(gpu_count=1), #constraints=Constraint(cluster=["ai2/pluto-cirrascale", "ai2/jupiter-cirrascale"]), - priority=Priority.normal, + result=ResultSpec(path="/noop-results"), ) ], ) @@ -65,7 +78,7 @@ EOF # Run the Python script to create the experiment echo "Creating Beaker experiment..." -python /tmp/run_benchmark_experiment.py $IMAGE_TAG +$PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG # Clean up temporary file rm /tmp/run_benchmark_experiment.py From 06988ac5333459566c08a7fff5c7d9965c959756 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 29 May 2025 17:18:12 +0000 Subject: [PATCH 08/13] Image fixes --- scripts/run_benchmark.sh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh index 7072147..df0275d 100755 --- a/scripts/run_benchmark.sh +++ b/scripts/run_benchmark.sh @@ -27,6 +27,10 @@ echo "Building Docker image with tag: $IMAGE_TAG" echo "Building Docker image..." docker build --platform linux/amd64 -f ./Dockerfile -t $IMAGE_TAG . +# Get Beaker username +BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name') +echo "Beaker user: $BEAKER_USER" + # Push image to beaker echo "Pushing image to Beaker..." beaker image create --workspace ai2/oe-data-pdf --name $IMAGE_TAG $IMAGE_TAG @@ -36,8 +40,9 @@ cat << 'EOF' > /tmp/run_benchmark_experiment.py import sys from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints -# Get image tag from command line +# Get image tag and beaker user from command line image_tag = sys.argv[1] +beaker_user = sys.argv[2] # Initialize Beaker client b = Beaker.from_env(default_workspace="ai2/oe-data-pdf") @@ -49,7 +54,7 @@ experiment_spec = ExperimentSpec( tasks=[ TaskSpec( name="olmocr-benchmark", - image=ImageSource(beaker=f"ai2/oe-data-pdf/{image_tag}"), + image=ImageSource(beaker=f"{beaker_user}/{image_tag}"), command=[ "bash", "-c", " && ".join([ @@ -78,7 +83,7 @@ EOF # Run the Python script to create the experiment echo "Creating Beaker experiment..." -$PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG +$PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER # Clean up temporary file rm /tmp/run_benchmark_experiment.py From e8e6b6cb17b8ceac42bf4512ed53896d5307d22a Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 29 May 2025 17:19:36 +0000 Subject: [PATCH 09/13] More fixes --- scripts/run_benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh index df0275d..58cdcb0 100755 --- a/scripts/run_benchmark.sh +++ b/scripts/run_benchmark.sh @@ -69,7 +69,7 @@ experiment_spec = ExperimentSpec( preemptible=True, ), resources=TaskResources(gpu_count=1), - #constraints=Constraint(cluster=["ai2/pluto-cirrascale", "ai2/jupiter-cirrascale"]), + constraints=Constraint(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale"]), result=ResultSpec(path="/noop-results"), ) ], From 15e00642126a95c735b10aefd2003a5f249dc045 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 29 May 2025 17:20:32 +0000 Subject: [PATCH 10/13] More fixes --- scripts/run_benchmark.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh index 58cdcb0..6127c20 100755 --- a/scripts/run_benchmark.sh +++ b/scripts/run_benchmark.sh @@ -45,7 +45,7 @@ image_tag = sys.argv[1] beaker_user = sys.argv[2] # Initialize Beaker client -b = Beaker.from_env(default_workspace="ai2/oe-data-pdf") +b = Beaker.from_env(default_workspace="ai2/olmocr") # Create experiment spec experiment_spec = ExperimentSpec( @@ -69,14 +69,14 @@ experiment_spec = ExperimentSpec( preemptible=True, ), resources=TaskResources(gpu_count=1), - constraints=Constraint(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale"]), + constraints=Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale"]), result=ResultSpec(path="/noop-results"), ) ], ) # Create the experiment -experiment = b.experiment.create(spec=experiment_spec, workspace="ai2/oe-data-pdf") +experiment = b.experiment.create(spec=experiment_spec, workspace="ai2/olmocr") print(f"Created experiment: {experiment.id}") print(f"View at: https://beaker.org/ex/{experiment.id}") EOF From 45e0ae59dce5c1c2f6379fbe6af5ee2054a08112 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 29 May 2025 17:21:58 +0000 Subject: [PATCH 11/13] omg --- scripts/run_benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh index 6127c20..f204d02 100755 --- a/scripts/run_benchmark.sh +++ b/scripts/run_benchmark.sh @@ -69,7 +69,7 @@ experiment_spec = ExperimentSpec( preemptible=True, ), resources=TaskResources(gpu_count=1), - constraints=Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale"]), + constraints=Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]), result=ResultSpec(path="/noop-results"), ) ], From 129412cdb01899f7aaf3a4f0368e668ba8873dec Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 29 May 2025 17:38:00 +0000 Subject: [PATCH 12/13] Git lfs for more reliable downloads --- Dockerfile | 1 + scripts/run_benchmark.sh | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 3d5786e..1591a71 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,6 +10,7 @@ RUN apt-get update -y && apt-get install -y poppler-utils ttf-mscorefonts-instal RUN apt-get update -y && apt-get install -y --no-install-recommends \ git \ + git-lfs \ python3.11 \ python3.11-dev \ python3.11-distutils \ diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh index f204d02..7e0e06e 100755 --- a/scripts/run_benchmark.sh +++ b/scripts/run_benchmark.sh @@ -58,7 +58,8 @@ experiment_spec = ExperimentSpec( command=[ "bash", "-c", " && ".join([ - "huggingface-cli download --repo-type dataset --resume-download allenai/olmOCR-bench --local-dir ./olmOCR-bench", + "git clone https://huggingface.co/datasets/allenai/olmOCR-bench", + "cd olmOCR-bench && git lfs pull && cd ..", "python -m olmocr.pipeline ./localworkspace --markdown --pdfs './olmOCR-bench/bench_data/pdfs/**/*.pdf'", "python olmocr/bench/scripts/workspace_to_bench.py localworkspace/ olmOCR-bench/bench_data/markdown_output --bench-path ./olmOCR-bench/", "python -m olmocr.bench.benchmark --dir ./olmOCR-bench/bench_data" From 01c4a561d3caad3b2a062c669c9f2352ebe2655b Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 29 May 2025 17:58:11 +0000 Subject: [PATCH 13/13] Script fixes --- scripts/run_benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh index 7e0e06e..bdebaeb 100755 --- a/scripts/run_benchmark.sh +++ b/scripts/run_benchmark.sh @@ -60,7 +60,7 @@ experiment_spec = ExperimentSpec( " && ".join([ "git clone https://huggingface.co/datasets/allenai/olmOCR-bench", "cd olmOCR-bench && git lfs pull && cd ..", - "python -m olmocr.pipeline ./localworkspace --markdown --pdfs './olmOCR-bench/bench_data/pdfs/**/*.pdf'", + "python -m olmocr.pipeline ./localworkspace --markdown --pdfs ./olmOCR-bench/bench_data/pdfs/**/*.pdf", "python olmocr/bench/scripts/workspace_to_bench.py localworkspace/ olmOCR-bench/bench_data/markdown_output --bench-path ./olmOCR-bench/", "python -m olmocr.bench.benchmark --dir ./olmOCR-bench/bench_data" ])