From 945f1d84fe9e612c0a18603b46aa6363ae070f68 Mon Sep 17 00:00:00 2001
From: aman-17 <amanrangapur@gmail.com>
Date: Wed, 28 May 2025 21:08:06 +0000
Subject: [PATCH] added beaker bench tests

---
 olmocr/bench/workspace_to_benchmark.py | 228 +++++++++++++++++++++++++
 scripts/beaker/benchmark.sh            |  18 ++
 scripts/beaker/benchmark.yaml          |  26 +++
 3 files changed, 272 insertions(+)
 create mode 100644 olmocr/bench/workspace_to_benchmark.py
 create mode 100644 scripts/beaker/benchmark.sh
 create mode 100644 scripts/beaker/benchmark.yaml

diff --git a/olmocr/bench/workspace_to_benchmark.py b/olmocr/bench/workspace_to_benchmark.py
new file mode 100644
index 0000000..ce13969
--- /dev/null
+++ b/olmocr/bench/workspace_to_benchmark.py
@@ -0,0 +1,228 @@
+"""
+Convert JSONL files to Markdown files and handle missing PDFs
+Usage:
+    python workspace_to_benchmark.py localworkspace ./markdown_output --bench-path ../olmOCR-bench/
+"""
+
+import json
+import sys
+import argparse
+from pathlib import Path
+from collections import defaultdict
+
+
+def load_jsonl_files(input_dir):
+    """Load all JSONL files from the input directory."""
+    jsonl_files = list(Path(input_dir).glob("*.jsonl"))
+    if not jsonl_files:
+        print(f"No JSONL files found in {input_dir}")
+        return []
+    
+    print(f"Found {len(jsonl_files)} JSONL files: {[f.name for f in jsonl_files]}")
+    return jsonl_files
+
+
+def parse_jsonl_entries(jsonl_files):
+    """Parse all JSONL files and extract entries with text and metadata."""
+    all_entries = []
+    pdf_sources = set()
+    
+    for jsonl_file in jsonl_files:
+        print(f"Processing {jsonl_file.name}...")
+        
+        with open(jsonl_file, 'r', encoding='utf-8') as f:
+            for line_num, line in enumerate(f, 1):
+                line = line.strip()
+                if not line:
+                    continue
+                    
+                try:
+                    entry = json.loads(line)
+                    text = entry.get('text', '')
+                    metadata = entry.get('metadata', {})
+                    source_file = metadata.get('Source-File', '')
+                    
+                    if source_file:
+                        pdf_sources.add(source_file)
+                        
+                    all_entries.append({
+                        'text': text,
+                        'source_file': source_file,
+                        'metadata': metadata,
+                        'entry': entry
+                    })
+                    
+                except json.JSONDecodeError as e:
+                    print(f"Error parsing line {line_num} in {jsonl_file.name}: {e}")
+                    continue
+    
+    print(f"Loaded {len(all_entries)} entries from JSONL files")
+    print(f"Found {len(pdf_sources)} unique PDF sources")
+    
+    return all_entries, pdf_sources
+
+
+def get_subdir_and_pdf_name(source_file_path):
+    """Extract subdirectory and PDF filename from source file path."""
+    if not source_file_path:
+        return None, None
+
+    path_parts = Path(source_file_path).parts
+    
+    try:
+        pdfs_index = path_parts.index('pdfs')
+        if pdfs_index + 1 < len(path_parts):
+            subdir = path_parts[pdfs_index + 1]
+            pdf_name = Path(source_file_path).stem  
+            return subdir, pdf_name
+    except ValueError:
+        pass
+    
+    return None, None
+
+
+def create_markdown_files(entries, output_dir):
+    """Create markdown files from JSONL entries in subdir/{pdf_name}.md format."""
+    output_path = Path(output_dir)
+    
+    subdir_pdf_to_entries = defaultdict(list)
+    
+    for entry in entries:
+        subdir, pdf_name = get_subdir_and_pdf_name(entry['source_file'])
+        if subdir and pdf_name:
+            key = (subdir, pdf_name)
+            subdir_pdf_to_entries[key].append(entry)
+    
+    created_files = set()
+    
+    for (subdir, pdf_name), pdf_entries in subdir_pdf_to_entries.items():
+        subdir_path = output_path / subdir
+        subdir_path.mkdir(parents=True, exist_ok=True)
+
+        md_filename = f"{pdf_name}_pg1_repeat1.md"
+        md_filepath = subdir_path / md_filename
+        combined_text = []
+        
+        for entry in pdf_entries:
+            text = entry['text']
+            if text.strip():
+                source_file = entry['source_file']
+                combined_text.append(text)
+        
+        with open(md_filepath, 'w', encoding='utf-8') as f:
+            f.write('\n'.join(combined_text))
+        
+        created_files.add((subdir, pdf_name))
+        print(f"Created: {subdir}/{md_filename}_pg1_repeat1")
+    
+    print(f"Created {len(created_files)} markdown files from JSONL data")
+    return created_files
+
+
+def find_missing_pdfs(pdf_sources, created_files, base_bench_path):
+    """Find PDFs that exist in directories but are missing from JSONL data."""
+    subdirs = set()
+    
+    for source_file in pdf_sources:
+        if not source_file:
+            continue
+            
+        subdir, _ = get_subdir_and_pdf_name(source_file)
+        if subdir:
+            subdirs.add(subdir)
+    
+    print(f"Found PDF subdirectories: {sorted(subdirs)}")
+    
+    missing_pdfs = []
+    
+    for subdir in subdirs:
+        pdf_dir = Path(base_bench_path) / "bench_data" / "pdfs" / subdir
+        
+        if not pdf_dir.exists():
+            print(f"Warning: Directory {pdf_dir} does not exist")
+            continue
+            
+        pdf_files = list(pdf_dir.glob("*.pdf"))
+        print(f"Found {len(pdf_files)} PDF files in {subdir}/")
+        
+        for pdf_file in pdf_files:
+            pdf_name = pdf_file.stem 
+            
+            if (subdir, pdf_name) not in created_files:
+                missing_pdfs.append({
+                    'pdf_name': pdf_name,
+                    'full_path': pdf_file,
+                    'subdir': subdir
+                })
+    
+    print(f"Found {len(missing_pdfs)} missing PDFs")
+    return missing_pdfs
+
+
+def create_blank_markdown_files(missing_pdfs, output_dir):
+    """Create blank markdown files for missing PDFs in subdir/{pdf_name}.md format."""
+    output_path = Path(output_dir)
+    
+    for missing_pdf in missing_pdfs:
+        subdir = missing_pdf['subdir']
+        pdf_name = missing_pdf['pdf_name']
+        
+        subdir_path = output_path / subdir
+        subdir_path.mkdir(parents=True, exist_ok=True)
+        
+        md_filename = f"{pdf_name}_pg1_repeat1.md"
+        md_filepath = subdir_path / md_filename
+        
+        content = ""
+        
+        with open(md_filepath, 'w', encoding='utf-8') as f:
+            f.write(content)
+        
+        print(f"Created blank: {subdir}/{md_filename}_pg1_repeat1")
+    
+    print(f"Created {len(missing_pdfs)} blank markdown files for missing PDFs")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Convert JSONL files to Markdown and handle missing PDFs")
+    parser.add_argument("workspace_dir", help="Your workspace directory")
+    parser.add_argument("output_dir", nargs='?', default="./markdown_output", 
+                       help="Output directory for markdown files (default: ./markdown_output)")
+    parser.add_argument("--bench-path", default="../olmOCR-bench", 
+                       help="Path to olmOCR-bench directory (default: ../olmOCR-bench)")
+    
+    args = parser.parse_args()
+    input_dir = args.workspace_dir + "/results"
+    input_dir = Path(input_dir)
+    output_dir = Path(args.output_dir)
+    bench_path = Path(args.bench_path)
+    
+    if not input_dir.exists():
+        print(f"Error: Input directory {input_dir} does not exist")
+        sys.exit(1)
+    
+    jsonl_files = load_jsonl_files(input_dir)
+    if not jsonl_files:
+        sys.exit(1)
+    
+    entries, pdf_sources = parse_jsonl_entries(jsonl_files)
+    if not entries:
+        print("No entries found in JSONL files")
+        sys.exit(1)
+    
+    created_files = create_markdown_files(entries, output_dir)
+    
+    missing_pdfs = find_missing_pdfs(pdf_sources, created_files, bench_path)
+    
+    if missing_pdfs:
+        create_blank_markdown_files(missing_pdfs, output_dir)
+    
+    print(f"\nSummary:")
+    print(f"Created {len(created_files)} markdown files from JSONL data")
+    print(f"Created {len(missing_pdfs)} blank markdown files for missing PDFs")
+    print(f"Total markdown files: {len(created_files) + len(missing_pdfs)}")
+    print(f"Output directory: {output_dir.absolute()}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/beaker/benchmark.sh b/scripts/beaker/benchmark.sh
new file mode 100644
index 0000000..18ac099
--- /dev/null
+++ b/scripts/beaker/benchmark.sh
@@ -0,0 +1,18 @@
+# download olmocr-bench dataset
+apt install git-lfs
+git lfs install
+git clone https://huggingface.co/datasets/allenai/olmOCR-bench
+cd olmOCR-bench/
+git lfs pull
+cd ..
+
+python -m olmocr.pipeline ./localworkspace --markdown --pdfs ./olmOCR-bench/bench_data/pdfs/arxiv_math/*.pdf
+python -m olmocr.pipeline ./localworkspace --markdown --pdfs ./olmOCR-bench/bench_data/pdfs/headers_footers/*.pdf
+python -m olmocr.pipeline ./localworkspace --markdown --pdfs ./olmOCR-bench/bench_data/pdfs/long_tiny_text/*.pdf
+python -m olmocr.pipeline ./localworkspace --markdown --pdfs ./olmOCR-bench/bench_data/pdfs/multi_column/*.pdf
+python -m olmocr.pipeline ./localworkspace --markdown --pdfs ./olmOCR-bench/bench_data/pdfs/old_scans/*.pdf
+python -m olmocr.pipeline ./localworkspace --markdown --pdfs ./olmOCR-bench/bench_data/pdfs/old_scans_math/*.pdf
+python -m olmocr.pipeline ./localworkspace --markdown --pdfs ./olmOCR-bench/bench_data/pdfs/tables/*.pdf
+
+python olmocr/bench/workspace_to_benchmark.py localworkspace/ olmOCR-bench/bench_data/markdown_output --bench-path ./olmOCR-bench/
+python -m olmocr.bench.benchmark --dir ./olmOCR-bench/bench_data
\ No newline at end of file
diff --git a/scripts/beaker/benchmark.yaml b/scripts/beaker/benchmark.yaml
new file mode 100644
index 0000000..a10c79e
--- /dev/null
+++ b/scripts/beaker/benchmark.yaml
@@ -0,0 +1,26 @@
+version: v2
+budget: ai2/oe-training
+description: Test benchmark results
+retry:
+  allowedTaskRetries: 3
+tasks:
+  - name: testing-olmocr-bench-results
+    image:
+      beaker: amanr/olmocr_latest
+    command:
+      - /bin/bash
+      - /new_mount_path/unshard_upload_hf_gs_augusta.sh
+    resources:
+      gpuCount: 1
+    datasets:
+      - mountPath: /new_mount_path/
+        source:
+          beaker: amanr/01
+      - mountPath: /data/
+        source:
+          hostPath: /data/
+    context:
+      preemptible: true
+    constraints:
+      cluster:
+        - ai2/augusta-google-1