From f21ff08c2f9bc0b4e6cc34208416114c71af584d Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Wed, 4 Jun 2025 23:10:14 -0700
Subject: [PATCH 01/18] Fix marker benchmarks

---
 README.md                          | 26 ++++++++++----------
 olmocr/bench/README.md             | 38 ++++++++++++++++++++----------
 olmocr/bench/runners/run_marker.py | 20 ++++++++++++++--
 olmocr/bench/tests.py              |  2 ++
 scripts/pareto_plot.py             |  8 +++----
 5 files changed, 62 insertions(+), 32 deletions(-)
diff --git a/README.md b/README.md
index 6e95c54..ee0040b 100644
--- a/README.md
+++ b/README.md
@@ -61,18 +61,6 @@ We also ship a comprehensive benchmark suite covering over 7,000 test cases acro
     </tr>
   </thead>
   <tbody>
-    <tr>
-      <td align="left">Marker v1.6.2</td>
-      <td align="center">24.3</td>
-      <td align="center">22.1</td>
-      <td align="center">69.8</td>
-      <td align="center">24.3</td>
-      <td align="center">87.1</td>
-      <td align="center">71.0</td>
-      <td align="center">76.9</td>
-      <td align="center"><strong>99.5</strong></td>
-      <td align="center">59.4 ± 1.1</td>
-    </tr>
     <tr>
       <td align="left">MinerU v1.3.10</td>
       <td align="center">75.4</td>
@@ -87,7 +75,7 @@ We also ship a comprehensive benchmark suite covering over 7,000 test cases acro
     </tr>
     <tr>
       <td align="left">Mistral OCR API</td>
-      <td align="center"><strong>77.2</strong></td>
+      <td align="center">77.2</td>
       <td align="center">67.5</td>
       <td align="center">60.6</td>
       <td align="center">29.3</td>
@@ -97,6 +85,18 @@ We also ship a comprehensive benchmark suite covering over 7,000 test cases acro
       <td align="center">99.4</td>
       <td align="center">72.0 ± 1.1</td>
     </tr>
+    <tr>
+      <td align="left">Marker v1.7.4 (hybrid)</td>
+      <td align="center"><strong>77.7</strong></td>
+      <td align="center">71.2</td>
+      <td align="center"><strong>78.1</strong></td>
+      <td align="center">32.3</td>
+      <td align="center">83.4</td>
+      <td align="center">73.8</td>
+      <td align="center">79.0</td>
+      <td align="center">99.2</td>
+      <td align="center">74.3 ± 1.1</td>
+    </tr>
     <tr>
       <td align="left">olmOCR v0.1.68 (pipeline.py)</td>
       <td align="center">75.6</td>
diff --git a/olmocr/bench/README.md b/olmocr/bench/README.md
index 8cd0f72..8ab5f26 100644
--- a/olmocr/bench/README.md
+++ b/olmocr/bench/README.md
@@ -46,16 +46,28 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o
       <td align="center">48.3 ± 1.1</td>
     </tr>
     <tr>
-      <td align="left">Marker v1.6.2</td>
-      <td align="center">24.3</td>
-      <td align="center">22.1</td>
-      <td align="center">69.8</td>
-      <td align="center">24.3</td>
-      <td align="center">87.1</td>
-      <td align="center">71.0</td>
-      <td align="center">76.9</td>
-      <td align="center"><strong>99.5</strong></td>
-      <td align="center">59.4 ± 1.1</td>
+      <td align="left">Marker v1.7.4 (base)</td>
+      <td align="center"><strong>77.7</strong></td>
+      <td align="center">59.6</td>
+      <td align="center">57.9</td>
+      <td align="center">27.8</td>
+      <td align="center">85.3</td>
+      <td align="center">73.5</td>
+      <td align="center">78.7</td>
+      <td align="center">99.1</td>
+      <td align="center">70.0 ± 1.1</td>
+    </tr>
+    <tr>
+      <td align="left">Marker v1.7.4 (hybrid)</td>
+      <td align="center"><strong>77.7</strong></td>
+      <td align="center">71.2</td>
+      <td align="center"><strong>78.1</strong></td>
+      <td align="center">32.3</td>
+      <td align="center">83.4</td>
+      <td align="center">73.8</td>
+      <td align="center">79.0</td>
+      <td align="center">99.2</td>
+      <td align="center">74.3 ± 1.1</td>
     </tr>
     <tr>
       <td align="left">MinerU v1.3.10</td>
@@ -71,14 +83,14 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o
     </tr>
     <tr>
       <td align="left">Mistral OCR API</td>
-      <td align="center"><strong>77.2</strong></td>
+      <td align="center">77.2</td>
       <td align="center">67.5</td>
       <td align="center">60.6</td>
       <td align="center">29.3</td>
       <td align="center">93.6</td>
       <td align="center">71.3</td>
       <td align="center">77.1</td>
-      <td align="center">99.4</td>
+      <td align="center"><strong>99.4</strong></td>
       <td align="center">72.0 ± 1.1</td>
     </tr>
     <tr>
@@ -121,7 +133,7 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o
       <td align="left">Gemini Flash 2 (Anchored)</td>
       <td align="center">54.5</td>
       <td align="center">56.1</td>
-      <td align="center"><strong>72.1</strong></td>
+      <td align="center">72.1</td>
       <td align="center">34.2</td>
       <td align="center">64.7</td>
       <td align="center">61.5</td>
diff --git a/olmocr/bench/runners/run_marker.py b/olmocr/bench/runners/run_marker.py
index 58733cd..594a03d 100644
--- a/olmocr/bench/runners/run_marker.py
+++ b/olmocr/bench/runners/run_marker.py
@@ -4,6 +4,7 @@ import tempfile
 from marker.converters.pdf import PdfConverter
 from marker.models import create_model_dict
 from marker.output import text_from_rendered
+from marker.config.parser import ConfigParser
 from pypdf import PdfReader, PdfWriter
 
 _marker_converter = None
@@ -12,13 +13,28 @@ _marker_converter = None
 def run_marker(pdf_path: str, page_num: int = 1) -> str:
     global _marker_converter
 
+    google_key_exists = os.getenv("GOOGLE_API_KEY") is not None
+
     if _marker_converter is None:
         # Create a configuration dictionary with the necessary settings
         config = {
-            "texify_inline_spans": True,  # This enables conversion of inline math to LaTeX
+            "format_lines": True,  # This enables conversion of inline math to LaTeX
+            "use_llm": google_key_exists, # Activate LLM mode if google key is specified
+            "disable_tqdm": True,  # Disable tqdm for cleaner output
+            "recognition_batch_size": 256,
+            "layout_batch_size": 48,
+            "detection_batch_size": 48,
+            "equation_batch_size": 64,
+            "table_rec_batch_size": 48,
+            "ocr_error_batch_size": 64,
         }
+        config_parser = ConfigParser(config)
 
-        _marker_converter = PdfConverter(artifact_dict=create_model_dict(), config=config)
+        _marker_converter = PdfConverter(
+            artifact_dict=create_model_dict(),
+            config=config_parser.generate_config_dict(),
+            llm_service=config_parser.get_llm_service(),
+        )
 
     # Extract the specific page from the PDF
     pdf_to_process = pdf_path
diff --git a/olmocr/bench/tests.py b/olmocr/bench/tests.py
index ec87313..320d31a 100644
--- a/olmocr/bench/tests.py
+++ b/olmocr/bench/tests.py
@@ -123,6 +123,8 @@ def normalize_text(md_content: str) -> str:
     # Remove markdown bold formatting (** or __ for bold)
     md_content = re.sub(r"\*\*(.*?)\*\*", r"\1", md_content)
     md_content = re.sub(r"__(.*?)__", r"\1", md_content)
+    md_content = re.sub(r"</?b>", "", md_content)  # Remove <b> tags if they exist
+    md_content = re.sub(r"</?i>", "", md_content)  # Remove <i> tags if they exist
 
     # Remove markdown italics formatting (* or _ for italics)
     md_content = re.sub(r"\*(.*?)\*", r"\1", md_content)
diff --git a/scripts/pareto_plot.py b/scripts/pareto_plot.py
index d3806df..a7a2d03 100644
--- a/scripts/pareto_plot.py
+++ b/scripts/pareto_plot.py
@@ -64,7 +64,7 @@ data = {
         "MinerU",
         "Gemini Flash 2",
         "Gemini Flash 2 (Batch)",
-        "Marker v1.6.2",
+        "Marker v1.7.4",
         "Ours",
         "Qwen 2 VL",
         "Qwen 2.5 VL",
@@ -77,7 +77,7 @@ data = {
         61.5,  # MinerU
         63.8,  # Gemini Flash 2 (Anchored)
         63.8,  # Same performance for batch
-        59.4,  # marker v1.6.2
+        74.3,  # marker v1.7.4 hybrid
         77.4,  # Ours (performance is the same across hardware)
         31.5,  # Qwen2VL
         65.5,  # Qwen2.5VL
@@ -94,7 +94,7 @@ model_categories = {
     "MinerU": "Open Source Tool",
     "Gemini Flash 2": "Commercial VLM",
     "Gemini Flash 2 (Batch)": "Commercial VLM",
-    "Marker v1.6.2": "Open Source Tool",
+    "Marker v1.7.4": "Open Source Tool",
     "Ours": "Ours",
     "Qwen 2 VL": "Open VLM",
     "Qwen 2.5 VL": "Open VLM",
@@ -132,7 +132,7 @@ model_label_offsets = {
     "MinerU": [-15, -20],
     "Gemini Flash 2": [-10, 10],
     "Gemini Flash 2 (Batch)": [-50, -15],
-    "Marker v1.6.2": [-35, -20],
+    "Marker v1.7.4": [-35, -20],
     "Ours": [-20, 10],
     "Qwen 2 VL": [-35, 10],
     "Qwen 2.5 VL": [-35, 10],

From 9ffbe8df46de52f1f9eb4bc45186d1254368a71e Mon Sep 17 00:00:00 2001
From: Jake Poznanski <jakep@allenai.org>
Date: Thu, 5 Jun 2025 15:58:19 +0000
Subject: [PATCH 02/18] Adding quick stats percentage done check

---
 olmocr/pipeline.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py
index f65e4fe..f804a5b 100644
--- a/olmocr/pipeline.py
+++ b/olmocr/pipeline.py
@@ -909,6 +909,7 @@ def print_stats(args, root_work_queue):
             logger.warning(f"Error processing {s3_path}: {e}")
             return 0, 0, 0, 0, 0, set(), 0, 0
 
+    print(f"\nCompleted work items {completed_items:,} out of {total_items:,}: {completed_items/max(total_items, 1)*100:.2f}%")  # max() guards an empty work queue
     print("\nProcessing output files...")
     docs_total = 0
     input_tokens_total = 0

From 267f52bd79c17538fac845200dd5b5738b543a3b Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Fri, 6 Jun 2025 13:47:29 -0400
Subject: [PATCH 03/18] Update marker cost

---
 olmocr/bench/runners/run_marker.py | 2 +-
 scripts/pareto_plot.py             | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/olmocr/bench/runners/run_marker.py b/olmocr/bench/runners/run_marker.py
index 594a03d..6d15643 100644
--- a/olmocr/bench/runners/run_marker.py
+++ b/olmocr/bench/runners/run_marker.py
@@ -18,7 +18,7 @@ def run_marker(pdf_path: str, page_num: int = 1) -> str:
     if _marker_converter is None:
         # Create a configuration dictionary with the necessary settings
         config = {
-            "format_lines": True,  # This enables conversion of inline math to LaTeX
+            "force_ocr": True,  # This enables conversion of inline math to LaTeX
             "use_llm": google_key_exists, # Activate LLM mode if google key is specified
             "disable_tqdm": True,  # Disable tqdm for cleaner output
             "recognition_batch_size": 256,
diff --git a/scripts/pareto_plot.py b/scripts/pareto_plot.py
index a7a2d03..9c4bbc7 100644
--- a/scripts/pareto_plot.py
+++ b/scripts/pareto_plot.py
@@ -69,7 +69,7 @@ data = {
         "Qwen 2 VL",
         "Qwen 2.5 VL",
     ],
-    COST_COLUMN_NAME: [12480, 6240, 1000, 596, 499, 249, 235, 178, 178, 178],  # Same cost as Ours  # Same cost as Ours
+    COST_COLUMN_NAME: [12480, 6240, 1000, 596, 499, 249, 75, 178, 178, 178],  # Same cost as Ours  # Same cost as Ours
     PERF_COLUMN_NAME: [
         69.9,  # GPT-4o (Anchored)
         69.9,  # Same performance for batch
@@ -77,7 +77,7 @@ data = {
         61.5,  # MinerU
         63.8,  # Gemini Flash 2 (Anchored)
         63.8,  # Same performance for batch
-        74.3,  # marker v1.7.4 hybrid
+        70.0,  # marker v1.7.4 base
         77.4,  # Ours (performance is the same across hardware)
         31.5,  # Qwen2VL
         65.5,  # Qwen2.5VL

From 02574447202277675c9e85e00e44da86dded115d Mon Sep 17 00:00:00 2001
From: Jake Poznanski <jakep@allenai.org>
Date: Fri, 6 Jun 2025 18:52:01 +0000
Subject: [PATCH 04/18] Ok, cleaner retry pattern for model downloading

---
 olmocr/pipeline.py | 42 +++++++++++++++++++++++++++---------------
 1 file changed, 27 insertions(+), 15 deletions(-)

diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py
index f804a5b..4f7d8f5 100644
--- a/olmocr/pipeline.py
+++ b/olmocr/pipeline.py
@@ -708,19 +708,31 @@ async def sglang_server_ready():
     raise Exception("sglang server did not become ready after waiting.")
 
 
-async def download_model(model_name_or_path: str):
-    if model_name_or_path.startswith("s3://") or model_name_or_path.startswith("gs://") or model_name_or_path.startswith("weka://"):
-        logger.info(f"Downloading model directory from '{model_name_or_path}'")
-        model_cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "olmocr", "model")
-        download_directory([model_name_or_path], model_cache_dir)
-        return model_cache_dir
-    elif os.path.isabs(model_name_or_path) and os.path.isdir(model_name_or_path):
-        logger.info(f"Using local model path at '{model_name_or_path}'")
-        return model_name_or_path
-    else:
-        logger.info(f"Downloading model with hugging face '{model_name_or_path}'")
-        snapshot_download(repo_id=model_name_or_path)
-        return model_name_or_path
+async def download_model(model_name_or_path: str, max_retries: int = 5):
+    for retry in range(max_retries):
+        try:
+            if model_name_or_path.startswith("s3://") or model_name_or_path.startswith("gs://") or model_name_or_path.startswith("weka://"):
+                logger.info(f"Downloading model directory from '{model_name_or_path}'")
+                model_cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "olmocr", "model")
+                # Delete existing model cache directory if it exists
+                if os.path.exists(model_cache_dir):
+                    shutil.rmtree(model_cache_dir)
+                download_directory([model_name_or_path], model_cache_dir)
+                return model_cache_dir
+            elif os.path.isabs(model_name_or_path) and os.path.isdir(model_name_or_path):
+                logger.info(f"Using local model path at '{model_name_or_path}'")
+                return model_name_or_path
+            else:
+                logger.info(f"Downloading model with hugging face '{model_name_or_path}'")
+                snapshot_download(repo_id=model_name_or_path)
+                return model_name_or_path
+        except Exception:
+            if retry == max_retries - 1:
+                raise  # Raise on final attempt and fail the job
+            # Jittered exponential backoff: compute the delay once so the logged value is the one actually slept
+            sleep_time = random.randrange(10, 30) * 2**retry
+            logger.exception(f"Could not download model, sleeping for {sleep_time} seconds to retry ({retry + 1}/{max_retries})")
+            await asyncio.sleep(sleep_time)
 
 
 async def metrics_reporter(work_queue):
@@ -1037,8 +1049,8 @@ async def main():
 
         # Wait a little bit so that not all beaker jobs in a task start at the same time and download the model at the same time
         replica_count = int(os.environ.get("BEAKER_REPLICA_COUNT", "1"))
-        interval = 10 if (replica_count - 1) * 10 <= 240 else 240 / max(1, replica_count - 1)
-        sleep_time = int(int(os.environ.get("BEAKER_REPLICA_RANK", "0")) * interval)
+        interval = 10 if (replica_count - 1) * 10 <= 30 else 30 / max(1, replica_count - 1)
+        sleep_time = int(os.environ.get("BEAKER_REPLICA_RANK", "0")) * interval
         logger.info(f"Beaker job sleeping for {sleep_time} seconds to stagger model downloads")
         await asyncio.sleep(sleep_time)
 

From cbc4580b72bb4a1613d80aa0d5f3aa6075ad414e Mon Sep 17 00:00:00 2001
From: Jake Poznanski <jakep@allenai.org>
Date: Thu, 12 Jun 2025 17:21:21 +0000
Subject: [PATCH 05/18] Fixing #240

---
 olmocr/pipeline.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py
index 4f7d8f5..7ad2b3d 100644
--- a/olmocr/pipeline.py
+++ b/olmocr/pipeline.py
@@ -329,7 +329,7 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path:
 
 
 async def process_pdf(args, worker_id: int, pdf_orig_path: str):
-    with tempfile.NamedTemporaryFile("wb+", suffix=".pdf") as tf:
+    with tempfile.NamedTemporaryFile("wb+", suffix=".pdf", delete=False) as tf:
         try:
             data = await asyncio.to_thread(lambda: get_s3_bytes_with_backoff(pdf_s3, pdf_orig_path))
             tf.write(data)
@@ -347,6 +347,7 @@ async def process_pdf(args, worker_id: int, pdf_orig_path: str):
             tf.write(convert_image_to_pdf_bytes(tf.name))
             tf.flush()
 
+    try:
         try:
             reader = PdfReader(tf.name)
             num_pages = reader.get_num_pages()
@@ -398,7 +399,9 @@ async def process_pdf(args, worker_id: int, pdf_orig_path: str):
             # You can't build a dolma doc with even 1 failed page, so just get out of here
             # However, you don't want to propagate an exception higher up and cancel the entire work_group
             return None
-
+    finally:
+        if os.path.exists(tf.name):
+            os.unlink(tf.name)
 
 def build_dolma_document(pdf_orig_path, page_results):
     # Build the document text and page spans

From af7aaef60520e2e2a4241d09f839ee100cae8695 Mon Sep 17 00:00:00 2001
From: Jake Poznanski <jakep@allenai.org>
Date: Thu, 12 Jun 2025 20:07:17 +0000
Subject: [PATCH 06/18] Run marker script

---
 olmocr/bench/runners/run_marker.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/olmocr/bench/runners/run_marker.py b/olmocr/bench/runners/run_marker.py
index 6d15643..d444408 100644
--- a/olmocr/bench/runners/run_marker.py
+++ b/olmocr/bench/runners/run_marker.py
@@ -13,13 +13,11 @@ _marker_converter = None
 def run_marker(pdf_path: str, page_num: int = 1) -> str:
     global _marker_converter
 
-    google_key_exists = os.getenv("GOOGLE_API_KEY") is not None
-
     if _marker_converter is None:
         # Create a configuration dictionary with the necessary settings
         config = {
             "force_ocr": True,  # This enables conversion of inline math to LaTeX
-            "use_llm": google_key_exists, # Activate LLM mode if google key is specified
+            "use_llm": False, # We would prefer to run just plain marker for reporting bench results, not hybrid mode
             "disable_tqdm": True,  # Disable tqdm for cleaner output
             "recognition_batch_size": 256,
             "layout_batch_size": 48,
@@ -33,7 +31,6 @@ def run_marker(pdf_path: str, page_num: int = 1) -> str:
         _marker_converter = PdfConverter(
             artifact_dict=create_model_dict(),
             config=config_parser.generate_config_dict(),
-            llm_service=config_parser.get_llm_service(),
         )
 
     # Extract the specific page from the PDF

From 9787d007b99c6413a522b9ed9c0e961951b39dc8 Mon Sep 17 00:00:00 2001
From: Jake Poznanski <jakep@allenai.org>
Date: Thu, 12 Jun 2025 21:02:46 +0000
Subject: [PATCH 07/18] Pulling in bigger benchmark script from vllm branch to
 main

---
 scripts/run_benchmark.sh | 190 ++++++++++++++++++++++++++++++++-------
 1 file changed, 156 insertions(+), 34 deletions(-)

diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh
index e7667b0..4d00b70 100755
--- a/scripts/run_benchmark.sh
+++ b/scripts/run_benchmark.sh
@@ -1,7 +1,39 @@
 #!/bin/bash
 
+# Runs an olmocr-bench run using the full pipeline (no fallback)
+#  Without model parameter (default behavior): uses the default model from Hugging Face
+#   ./scripts/run_benchmark.sh
+#  With model parameter: for testing custom models
+#   ./scripts/run_benchmark.sh --model your-model-name
+
 set -e
 
+# Parse command line arguments
+MODEL=""
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model)
+            MODEL="$2"
+            shift 2
+            ;;
+        *)
+            echo "Unknown option: $1"
+            echo "Usage: $0 [--model MODEL_NAME]"
+            exit 1
+            ;;
+    esac
+done
+
+# Check for uncommitted changes
+if ! git diff-index --quiet HEAD --; then
+    echo "Error: There are uncommitted changes in the repository."
+    echo "Please commit or stash your changes before running the benchmark."
+    echo ""
+    echo "Uncommitted changes:"
+    git status --short
+    exit 1
+fi
+
 # Use conda environment Python if available, otherwise use system Python
 if [ -n "$CONDA_PREFIX" ]; then
     PYTHON="$CONDA_PREFIX/bin/python"
@@ -36,63 +68,153 @@ BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
 echo "Beaker user: $BEAKER_USER"
 
 # Push image to beaker
-echo "Pushing image to Beaker..."
-beaker image create --workspace ai2/oe-data-pdf --name $IMAGE_TAG $IMAGE_TAG
+echo "Trying to push image to Beaker..."
+if ! beaker image create --workspace ai2/oe-data-pdf --name $IMAGE_TAG $IMAGE_TAG 2>/dev/null; then
+    echo "Warning: Beaker image with tag $IMAGE_TAG already exists. Using existing image."
+fi
 
 # Create Python script to run beaker experiment
 cat << 'EOF' > /tmp/run_benchmark_experiment.py
 import sys
-from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints
+from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints, EnvVar
 
-# Get image tag, beaker user, git branch, and git hash from command line
+# Get image tag, beaker user, git branch, git hash, and optional model from command line
 image_tag = sys.argv[1]
 beaker_user = sys.argv[2]
 git_branch = sys.argv[3]
 git_hash = sys.argv[4]
+model = sys.argv[5] if len(sys.argv) > 5 else None
 
 # Initialize Beaker client
 b = Beaker.from_env(default_workspace="ai2/olmocr")
 
-# Create experiment spec
+# Build the pipeline command with optional model parameter
+pipeline_cmd = "python -m olmocr.pipeline ./localworkspace --markdown --pdfs ./olmOCR-bench/bench_data/pdfs/**/*.pdf"
+if model:
+    pipeline_cmd += f" --model {model}"
+
+# Check if AWS credentials secret exists
+aws_creds_secret = f"{beaker_user}-AWS_CREDENTIALS_FILE"
+try:
+    # Try to get the secret to see if it exists
+    b.secret.get(aws_creds_secret, workspace="ai2/olmocr")
+    has_aws_creds = True
+    print(f"Found AWS credentials secret: {aws_creds_secret}")
+except Exception:  # not a bare except: let KeyboardInterrupt/SystemExit propagate
+    has_aws_creds = False
+    print(f"AWS credentials secret not found: {aws_creds_secret}")
+
+# First experiment: Original benchmark job
+commands = []
+if has_aws_creds:
+    commands.extend([
+        "mkdir -p ~/.aws",
+        'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials'
+    ])
+commands.extend([
+    "git clone https://huggingface.co/datasets/allenai/olmOCR-bench",
+    "cd olmOCR-bench && git lfs pull && cd ..",
+    pipeline_cmd,
+    "python olmocr/bench/scripts/workspace_to_bench.py localworkspace/ olmOCR-bench/bench_data/olmocr --bench-path ./olmOCR-bench/",
+    "python -m olmocr.bench.benchmark --dir ./olmOCR-bench/bench_data"
+])
+
+# Build task spec with optional env vars
+task_spec_args = {
+    "name": "olmocr-benchmark",
+    "image": ImageSource(beaker=f"{beaker_user}/{image_tag}"),
+    "command": [
+        "bash", "-c",
+        " && ".join(commands)
+    ],
+    "context": TaskContext(
+        priority=Priority.normal,
+        preemptible=True,
+    ),
+    "resources": TaskResources(gpu_count=1),
+    "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
+    "result": ResultSpec(path="/noop-results"),
+}
+
+# Add env vars if AWS credentials exist
+if has_aws_creds:
+    task_spec_args["env_vars"] = [
+        EnvVar(name="AWS_CREDENTIALS_FILE", secret=aws_creds_secret)
+    ]
+
+# Create first experiment spec
 experiment_spec = ExperimentSpec(
     description=f"OlmOCR Benchmark Run - Branch: {git_branch}, Commit: {git_hash}",
     budget="ai2/oe-data",
-    tasks=[
-        TaskSpec(
-            name="olmocr-benchmark",
-            image=ImageSource(beaker=f"{beaker_user}/{image_tag}"),
-            command=[
-                "bash", "-c",
-                " && ".join([
-                    "git clone https://huggingface.co/datasets/allenai/olmOCR-bench",
-                    "cd olmOCR-bench && git lfs pull && cd ..",
-                    "python -m olmocr.pipeline ./localworkspace --markdown --pdfs ./olmOCR-bench/bench_data/pdfs/**/*.pdf",
-                    "python olmocr/bench/scripts/workspace_to_bench.py localworkspace/ olmOCR-bench/bench_data/olmocr --bench-path ./olmOCR-bench/",
-                    "python -m olmocr.bench.benchmark --dir ./olmOCR-bench/bench_data"
-                ])
-            ],
-            context=TaskContext(
-                priority=Priority.normal,
-                preemptible=True,
-            ),
-            resources=TaskResources(gpu_count=1),
-            constraints=Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
-            result=ResultSpec(path="/noop-results"),
-        )
-    ],
+    tasks=[TaskSpec(**task_spec_args)],
 )
 
-# Create the experiment
+# Create the first experiment
 experiment = b.experiment.create(spec=experiment_spec, workspace="ai2/olmocr")
-print(f"Created experiment: {experiment.id}")
+print(f"Created benchmark experiment: {experiment.id}")
 print(f"View at: https://beaker.org/ex/{experiment.id}")
+print("-------")
+print("")
+
+# Second experiment: Performance test job
+perf_pipeline_cmd = "python -m olmocr.pipeline ./localworkspace --markdown --pdfs s3://ai2-oe-data/jakep/olmocr/olmOCR-mix-0225/benchmark_set/*.pdf"
+if model:
+    perf_pipeline_cmd += f" --model {model}"
+
+perf_commands = []
+if has_aws_creds:
+    perf_commands.extend([
+        "mkdir -p ~/.aws",
+        'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials'
+    ])
+perf_commands.append(perf_pipeline_cmd)
+
+# Build performance task spec
+perf_task_spec_args = {
+    "name": "olmocr-performance",
+    "image": ImageSource(beaker=f"{beaker_user}/{image_tag}"),
+    "command": [
+        "bash", "-c",
+        " && ".join(perf_commands)
+    ],
+    "context": TaskContext(
+        priority=Priority.normal,
+        preemptible=True,
+    ),
+    "resources": TaskResources(gpu_count=1),
+    "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
+    "result": ResultSpec(path="/noop-results"),
+}
+
+# Add env vars if AWS credentials exist
+if has_aws_creds:
+    perf_task_spec_args["env_vars"] = [
+        EnvVar(name="AWS_CREDENTIALS_FILE", secret=aws_creds_secret)
+    ]
+
+# Create performance experiment spec
+perf_experiment_spec = ExperimentSpec(
+    description=f"OlmOCR Performance Test - Branch: {git_branch}, Commit: {git_hash}",
+    budget="ai2/oe-data",
+    tasks=[TaskSpec(**perf_task_spec_args)],
+)
+
+# Create the performance experiment
+perf_experiment = b.experiment.create(spec=perf_experiment_spec, workspace="ai2/olmocr")
+print(f"Created performance experiment: {perf_experiment.id}")
+print(f"View at: https://beaker.org/ex/{perf_experiment.id}")
 EOF
 
-# Run the Python script to create the experiment
-echo "Creating Beaker experiment..."
-$PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH
+# Run the Python script to create the experiments
+echo "Creating Beaker experiments..."
+if [ -n "$MODEL" ]; then
+    echo "Using model: $MODEL"
+    $PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH "$MODEL"
+else
+    $PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH
+fi
 
 # Clean up temporary file
 rm /tmp/run_benchmark_experiment.py
 
-echo "Benchmark experiment submitted successfully!"
\ No newline at end of file
+echo "Benchmark experiments submitted successfully!"
\ No newline at end of file

From 044874a634a5a65942d67aa182789175cc240bc9 Mon Sep 17 00:00:00 2001
From: Jake Poznanski <jakep@allenai.org>
Date: Thu, 12 Jun 2025 21:12:58 +0000
Subject: [PATCH 08/18] Adding marker benchmark

---
 scripts/run_marker_benchmark.sh | 197 ++++++++++++++++++++++++++++++++
 1 file changed, 197 insertions(+)
 create mode 100644 scripts/run_marker_benchmark.sh

diff --git a/scripts/run_marker_benchmark.sh b/scripts/run_marker_benchmark.sh
new file mode 100644
index 0000000..a21d4a9
--- /dev/null
+++ b/scripts/run_marker_benchmark.sh
@@ -0,0 +1,197 @@
+#!/bin/bash
+
+# Runs marker benchmark, measuring both olmOCR-bench performance and per document processing performance
+#   ./scripts/run_marker_benchmark.sh
+#   ./scripts/run_marker_benchmark.sh 1.7.5
+
+set -e
+
+# Parse command line arguments
+MARKER_VERSION="${1:-1.7.5}"
+echo "Using marker version: $MARKER_VERSION"
+
+# Check for uncommitted changes
+if ! git diff-index --quiet HEAD --; then
+    echo "Error: There are uncommitted changes in the repository."
+    echo "Please commit or stash your changes before running the benchmark."
+    echo ""
+    echo "Uncommitted changes:"
+    git status --short
+    exit 1
+fi
+
+# Use conda environment Python if available, otherwise use system Python
+if [ -n "$CONDA_PREFIX" ]; then
+    PYTHON="$CONDA_PREFIX/bin/python"
+    echo "Using conda Python from: $CONDA_PREFIX"
+else
+    PYTHON="python"
+    echo "Warning: No conda environment detected, using system Python"
+fi
+
+# Get version from version.py
+VERSION=$($PYTHON -c 'import olmocr.version; print(olmocr.version.VERSION)')
+echo "OlmOCR version: $VERSION"
+
+# Get first 10 characters of git hash
+GIT_HASH=$(git rev-parse HEAD | cut -c1-10)
+echo "Git hash: $GIT_HASH"
+
+# Get current git branch name
+GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD)
+echo "Git branch: $GIT_BRANCH"
+
+# Create full image tag
+IMAGE_TAG="olmocr-benchmark-${VERSION}-${GIT_HASH}"
+echo "Building Docker image with tag: $IMAGE_TAG"
+
+# Build the Docker image (tag is derived from the olmocr version and the current commit)
+echo "Building Docker image..."
+docker build --platform linux/amd64 -f ./Dockerfile -t "$IMAGE_TAG" .
+
+# Get Beaker username
+BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
+echo "Beaker user: $BEAKER_USER"
+
+# Push image to Beaker (best effort; stderr is left visible so the real failure reason is shown)
+echo "Trying to push image to Beaker..."
+if ! beaker image create --workspace ai2/oe-data-pdf --name "$IMAGE_TAG" "$IMAGE_TAG"; then
+    echo "Warning: Could not create Beaker image $IMAGE_TAG (see error above); assuming it already exists and using the existing image."
+fi
+
+# Create Python script to run beaker experiment
+cat << 'EOF' > /tmp/run_benchmark_experiment.py
+import sys
+from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints, EnvVar
+
+# Get image tag, beaker user, git branch, git hash, and marker version from command line
+image_tag = sys.argv[1]
+beaker_user = sys.argv[2]
+git_branch = sys.argv[3]
+git_hash = sys.argv[4]
+marker_version = sys.argv[5]
+
+# Initialize Beaker client
+b = Beaker.from_env(default_workspace="ai2/olmocr")
+
+
+# Check if AWS credentials secret exists
+aws_creds_secret = f"{beaker_user}-AWS_CREDENTIALS_FILE"
+try:
+    # Probe for the per-user secret; any lookup error is treated as "no AWS credentials"
+    b.secret.get(aws_creds_secret, workspace="ai2/olmocr")
+    has_aws_creds = True
+    print(f"Found AWS credentials secret: {aws_creds_secret}")
+except Exception:
+    has_aws_creds = False
+    print(f"AWS credentials secret not found: {aws_creds_secret}")
+
+# First experiment: Original benchmark job
+commands = []
+if has_aws_creds:
+    commands.extend([
+        "mkdir -p ~/.aws",
+        'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials'
+    ])
+commands.extend([
+    "git clone https://huggingface.co/datasets/allenai/olmOCR-bench",
+    "cd olmOCR-bench && git lfs pull && cd ..",
+    f"pip install marker=={marker_version}",
+    "python -m olmocr.bench.convert marker --dir ./olmOCR-bench/bench_data",
+    "python -m olmocr.bench.benchmark --dir ./olmOCR-bench/bench_data"
+])
+
+# Build task spec with optional env vars
+task_spec_args = {
+    "name": "marker-benchmark",
+    "image": ImageSource(beaker=f"{beaker_user}/{image_tag}"),
+    "command": [
+        "bash", "-c",
+        " && ".join(commands)
+    ],
+    "context": TaskContext(
+        priority=Priority.normal,
+        preemptible=True,
+    ),
+    "resources": TaskResources(gpu_count=1),
+    "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
+    "result": ResultSpec(path="/noop-results"),
+}
+
+# Add env vars if AWS credentials exist
+if has_aws_creds:
+    task_spec_args["env_vars"] = [
+        EnvVar(name="AWS_CREDENTIALS_FILE", secret=aws_creds_secret)
+    ]
+
+# Create first experiment spec
+experiment_spec = ExperimentSpec(
+    description=f"Marker {marker_version} Benchmark Run - Branch: {git_branch}, Commit: {git_hash}",
+    budget="ai2/oe-data",
+    tasks=[TaskSpec(**task_spec_args)],
+)
+
+# Create the first experiment
+experiment = b.experiment.create(spec=experiment_spec, workspace="ai2/olmocr")
+print(f"Created benchmark experiment: {experiment.id}")
+print(f"View at: https://beaker.org/ex/{experiment.id}")
+print("-------")
+print("")
+
+
+perf_commands = []
+if has_aws_creds:
+    perf_commands.extend([
+        "mkdir -p ~/.aws",
+        'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials'
+    ])
+perf_commands.extend([
+    f"pip install marker=={marker_version}",
+    "s5cmd cp s3://ai2-oe-data/jakep/olmocr/olmOCR-mix-0225/benchmark_set/* /root/olmOCR-mix-0225_benchmark_set/",
+    "marker --force_ocr /root/olmOCR-mix-0225_benchmark_set/ --output_dir /root/olmOCR-mix-0225_benchmark_set_marker --workers 8"
+])
+
+# Build performance task spec
+perf_task_spec_args = {
+    "name": "marker-performance",
+    "image": ImageSource(beaker=f"{beaker_user}/{image_tag}"),
+    "command": [
+        "bash", "-c",
+        " && ".join(perf_commands)
+    ],
+    "context": TaskContext(
+        priority=Priority.normal,
+        preemptible=True,
+    ),
+    "resources": TaskResources(gpu_count=1),
+    "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
+    "result": ResultSpec(path="/noop-results"),
+}
+
+# Add env vars if AWS credentials exist
+if has_aws_creds:
+    perf_task_spec_args["env_vars"] = [
+        EnvVar(name="AWS_CREDENTIALS_FILE", secret=aws_creds_secret)
+    ]
+
+# Create performance experiment spec
+perf_experiment_spec = ExperimentSpec(
+    description=f"Marker {marker_version} Performance Test - Branch: {git_branch}, Commit: {git_hash}",
+    budget="ai2/oe-data",
+    tasks=[TaskSpec(**perf_task_spec_args)],
+)
+
+# Create the performance experiment
+perf_experiment = b.experiment.create(spec=perf_experiment_spec, workspace="ai2/olmocr")
+print(f"Created performance experiment: {perf_experiment.id}")
+print(f"View at: https://beaker.org/ex/{perf_experiment.id}")
+EOF
+
+# Run the Python script to create the experiments (arguments quoted so each stays a single argv entry)
+echo "Creating Beaker experiments..."
+"$PYTHON" /tmp/run_benchmark_experiment.py "$IMAGE_TAG" "$BEAKER_USER" "$GIT_BRANCH" "$GIT_HASH" "$MARKER_VERSION"
+
+# Clean up temporary file
+rm /tmp/run_benchmark_experiment.py
+
+echo "Benchmark experiments submitted successfully!"
\ No newline at end of file

From f8dfd857652017d3eee75d6e22936989b4599d93 Mon Sep 17 00:00:00 2001
From: Jake Poznanski <jakep@allenai.org>
Date: Thu, 12 Jun 2025 21:13:31 +0000
Subject: [PATCH 09/18] Script

---
 scripts/run_marker_benchmark.sh | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 scripts/run_marker_benchmark.sh

diff --git a/scripts/run_marker_benchmark.sh b/scripts/run_marker_benchmark.sh
old mode 100644
new mode 100755

From 548187902b7fbde45553266dbedd52703e4c0304 Mon Sep 17 00:00:00 2001
From: Jake Poznanski <jakep@allenai.org>
Date: Thu, 12 Jun 2025 21:14:00 +0000
Subject: [PATCH 10/18] Ignore

---
 .gitignore         | 1 +
 olmocr/pipeline.py | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 2a1c30a..cf93ea3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,6 +20,7 @@ olmOCR-bench/*
 table_data*/
 /synth*/
 dolma_samples/*
+old_train/
 /*.html
 scoreelo.csv
 debug.log
diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py
index 7ad2b3d..e869316 100644
--- a/olmocr/pipeline.py
+++ b/olmocr/pipeline.py
@@ -403,6 +403,7 @@ async def process_pdf(args, worker_id: int, pdf_orig_path: str):
         if os.path.exists(tf.name):
             os.unlink(tf.name)
 
+
 def build_dolma_document(pdf_orig_path, page_results):
     # Build the document text and page spans
     document_text = ""
@@ -711,7 +712,7 @@ async def sglang_server_ready():
     raise Exception("sglang server did not become ready after waiting.")
 
 
-async def download_model(model_name_or_path: str, max_retries: int=5):
+async def download_model(model_name_or_path: str, max_retries: int = 5):
     for retry in range(max_retries):
         try:
             if model_name_or_path.startswith("s3://") or model_name_or_path.startswith("gs://") or model_name_or_path.startswith("weka://"):
@@ -731,7 +732,7 @@ async def download_model(model_name_or_path: str, max_retries: int=5):
                 return model_name_or_path
         except Exception:
             if retry == max_retries - 1:
-                raise # Raise on final attempt and fail the job
+                raise  # Raise on final attempt and fail the job
 
             sleep_time = random.randrange(2, 20) * 2**retry
             logger.exception(f"Could not download model, sleeping for {sleep_time} seconds to retry ({retry + 1}/{max_retries})")

From 4bfcfce7672ba2a2fb973574b66a4a67048abb04 Mon Sep 17 00:00:00 2001
From: Jake Poznanski <jakep@allenai.org>
Date: Thu, 12 Jun 2025 21:18:58 +0000
Subject: [PATCH 11/18] Actually install the right thing

---
 scripts/run_marker_benchmark.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/run_marker_benchmark.sh b/scripts/run_marker_benchmark.sh
index a21d4a9..9c559d3 100755
--- a/scripts/run_marker_benchmark.sh
+++ b/scripts/run_marker_benchmark.sh
@@ -96,7 +96,7 @@ if has_aws_creds:
 commands.extend([
     "git clone https://huggingface.co/datasets/allenai/olmOCR-bench",
     "cd olmOCR-bench && git lfs pull && cd ..",
-    f"pip install marker=={marker_version}",
+    f"pip install marker-pdf=={marker_version}",
     "python -m olmocr.bench.convert marker --dir ./olmOCR-bench/bench_data",
     "python -m olmocr.bench.benchmark --dir ./olmOCR-bench/bench_data"
 ])
@@ -146,7 +146,7 @@ if has_aws_creds:
         'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials'
     ])
 perf_commands.extend([
-    f"pip install marker=={marker_version}",
+    f"pip install marker-pdf=={marker_version}",
     "s5cmd cp s3://ai2-oe-data/jakep/olmocr/olmOCR-mix-0225/benchmark_set/* /root/olmOCR-mix-0225_benchmark_set/",
     "marker --force_ocr /root/olmOCR-mix-0225_benchmark_set/ --output_dir /root/olmOCR-mix-0225_benchmark_set_marker --workers 8"
 ])

From 0f3b45c1a38f0cfec596d31610f6df377a77cae5 Mon Sep 17 00:00:00 2001
From: Jake Poznanski <jakep@allenai.org>
Date: Thu, 12 Jun 2025 21:19:17 +0000
Subject: [PATCH 12/18] Add time

---
 scripts/run_marker_benchmark.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/run_marker_benchmark.sh b/scripts/run_marker_benchmark.sh
index 9c559d3..e8178a5 100755
--- a/scripts/run_marker_benchmark.sh
+++ b/scripts/run_marker_benchmark.sh
@@ -148,7 +148,7 @@ if has_aws_creds:
 perf_commands.extend([
     f"pip install marker-pdf=={marker_version}",
     "s5cmd cp s3://ai2-oe-data/jakep/olmocr/olmOCR-mix-0225/benchmark_set/* /root/olmOCR-mix-0225_benchmark_set/",
-    "marker --force_ocr /root/olmOCR-mix-0225_benchmark_set/ --output_dir /root/olmOCR-mix-0225_benchmark_set_marker --workers 8"
+    "time marker --force_ocr /root/olmOCR-mix-0225_benchmark_set/ --output_dir /root/olmOCR-mix-0225_benchmark_set_marker --workers 8"
 ])
 
 # Build performance task spec

From 59e0a1ccb0bae5efa45e23d295ee5fa477743958 Mon Sep 17 00:00:00 2001
From: Jake Poznanski <jakep@allenai.org>
Date: Thu, 12 Jun 2025 21:23:53 +0000
Subject: [PATCH 13/18] Marker wants newer torchvision

---
 scripts/run_marker_benchmark.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/scripts/run_marker_benchmark.sh b/scripts/run_marker_benchmark.sh
index e8178a5..3a84b0e 100755
--- a/scripts/run_marker_benchmark.sh
+++ b/scripts/run_marker_benchmark.sh
@@ -97,6 +97,7 @@ commands.extend([
     "git clone https://huggingface.co/datasets/allenai/olmOCR-bench",
     "cd olmOCR-bench && git lfs pull && cd ..",
     f"pip install marker-pdf=={marker_version}",
+    "pip install --upgrade torchvision",
     "python -m olmocr.bench.convert marker --dir ./olmOCR-bench/bench_data",
     "python -m olmocr.bench.benchmark --dir ./olmOCR-bench/bench_data"
 ])
@@ -147,6 +148,7 @@ if has_aws_creds:
     ])
 perf_commands.extend([
     f"pip install marker-pdf=={marker_version}",
+    "pip install --upgrade torchvision",
     "s5cmd cp s3://ai2-oe-data/jakep/olmocr/olmOCR-mix-0225/benchmark_set/* /root/olmOCR-mix-0225_benchmark_set/",
     "time marker --force_ocr /root/olmOCR-mix-0225_benchmark_set/ --output_dir /root/olmOCR-mix-0225_benchmark_set_marker --workers 8"
 ])

From fc06797bec8c1a238e8f0fe81e3530f3dbe6c9f3 Mon Sep 17 00:00:00 2001
From: Jake Poznanski <jakep@allenai.org>
Date: Thu, 12 Jun 2025 21:29:39 +0000
Subject: [PATCH 14/18] aws cli

---
 scripts/run_marker_benchmark.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/run_marker_benchmark.sh b/scripts/run_marker_benchmark.sh
index 3a84b0e..ad56e66 100755
--- a/scripts/run_marker_benchmark.sh
+++ b/scripts/run_marker_benchmark.sh
@@ -149,7 +149,7 @@ if has_aws_creds:
 perf_commands.extend([
     f"pip install marker-pdf=={marker_version}",
     "pip install --upgrade torchvision",
-    "s5cmd cp s3://ai2-oe-data/jakep/olmocr/olmOCR-mix-0225/benchmark_set/* /root/olmOCR-mix-0225_benchmark_set/",
+    "aws s3 cp --recursive s3://ai2-oe-data/jakep/olmocr/olmOCR-mix-0225/benchmark_set/ /root/olmOCR-mix-0225_benchmark_set/",
     "time marker --force_ocr /root/olmOCR-mix-0225_benchmark_set/ --output_dir /root/olmOCR-mix-0225_benchmark_set_marker --workers 8"
 ])
 

From fcd8bbec92182caea6bc434b3471fe31e130ed63 Mon Sep 17 00:00:00 2001
From: Jake Poznanski <jakep@allenai.org>
Date: Thu, 12 Jun 2025 21:38:28 +0000
Subject: [PATCH 15/18] Install aws cli

---
 scripts/run_marker_benchmark.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/run_marker_benchmark.sh b/scripts/run_marker_benchmark.sh
index ad56e66..05d1179 100755
--- a/scripts/run_marker_benchmark.sh
+++ b/scripts/run_marker_benchmark.sh
@@ -149,6 +149,7 @@ if has_aws_creds:
 perf_commands.extend([
     f"pip install marker-pdf=={marker_version}",
     "pip install --upgrade torchvision",
+    "pip install awscli",
     "aws s3 cp --recursive s3://ai2-oe-data/jakep/olmocr/olmOCR-mix-0225/benchmark_set/ /root/olmOCR-mix-0225_benchmark_set/",
     "time marker --force_ocr /root/olmOCR-mix-0225_benchmark_set/ --output_dir /root/olmOCR-mix-0225_benchmark_set_marker --workers 8"
 ])

From 3da6e2d58799a1eb2b24ed1fba24d2becb4fe811 Mon Sep 17 00:00:00 2001
From: Jake Poznanski <jakep@allenai.org>
Date: Thu, 12 Jun 2025 22:23:41 +0000
Subject: [PATCH 16/18] Pareto plot update, keep cost the same for now

---
 scripts/pareto_plot.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/scripts/pareto_plot.py b/scripts/pareto_plot.py
index 9c4bbc7..7c04a70 100644
--- a/scripts/pareto_plot.py
+++ b/scripts/pareto_plot.py
@@ -64,12 +64,12 @@ data = {
         "MinerU",
         "Gemini Flash 2",
         "Gemini Flash 2 (Batch)",
-        "Marker v1.7.4",
+        "Marker v1.7.5",
         "Ours",
         "Qwen 2 VL",
         "Qwen 2.5 VL",
     ],
-    COST_COLUMN_NAME: [12480, 6240, 1000, 596, 499, 249, 75, 178, 178, 178],  # Same cost as Ours  # Same cost as Ours
+    COST_COLUMN_NAME: [12480, 6240, 1000, 596, 499, 249, 235, 178, 178, 178],  # Same cost as Ours
     PERF_COLUMN_NAME: [
         69.9,  # GPT-4o (Anchored)
         69.9,  # Same performance for batch
@@ -77,7 +77,7 @@ data = {
         61.5,  # MinerU
         63.8,  # Gemini Flash 2 (Anchored)
         63.8,  # Same performance for batch
-        70.0,  # marker v1.7.4 base
+        70.1,  # marker v1.7.5 base
         77.4,  # Ours (performance is the same across hardware)
         31.5,  # Qwen2VL
         65.5,  # Qwen2.5VL
@@ -94,7 +94,7 @@ model_categories = {
     "MinerU": "Open Source Tool",
     "Gemini Flash 2": "Commercial VLM",
     "Gemini Flash 2 (Batch)": "Commercial VLM",
-    "Marker v1.7.4": "Open Source Tool",
+    "Marker v1.7.5": "Open Source Tool",
     "Ours": "Ours",
     "Qwen 2 VL": "Open VLM",
     "Qwen 2.5 VL": "Open VLM",
@@ -132,7 +132,7 @@ model_label_offsets = {
     "MinerU": [-15, -20],
     "Gemini Flash 2": [-10, 10],
     "Gemini Flash 2 (Batch)": [-50, -15],
-    "Marker v1.7.4": [-35, -20],
+    "Marker v1.7.5": [-20, 15],
     "Ours": [-20, 10],
     "Qwen 2 VL": [-35, 10],
     "Qwen 2.5 VL": [-35, 10],

From f273de6e6ec36c1c75eed6ec9935bc3df09987f2 Mon Sep 17 00:00:00 2001
From: Jake Poznanski <jakep@allenai.org>
Date: Thu, 12 Jun 2025 15:32:09 -0700
Subject: [PATCH 17/18] Update README.md

Update the Marker entry to v1.7.5, which I ran locally (base configuration only for now).
---
 olmocr/bench/README.md | 35 ++++++++++-------------------------
 1 file changed, 10 insertions(+), 25 deletions(-)

diff --git a/olmocr/bench/README.md b/olmocr/bench/README.md
index 8ab5f26..65c29bb 100644
--- a/olmocr/bench/README.md
+++ b/olmocr/bench/README.md
@@ -37,7 +37,7 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o
       <td align="left">GOT OCR</td>
       <td align="center">52.7</td>
       <td align="center">52.0</td>
-      <td align="center">0.2</td>
+      <td align="center">0.20</td>
       <td align="center">22.1</td>
       <td align="center">93.6</td>
       <td align="center">42.0</td>
@@ -46,28 +46,16 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o
       <td align="center">48.3 ± 1.1</td>
     </tr>
     <tr>
-      <td align="left">Marker v1.7.4 (base)</td>
-      <td align="center"><strong>77.7</strong></td>
-      <td align="center">59.6</td>
+      <td align="left">Marker v1.7.5 (base)</td>
+      <td align="center">76.0</td>
       <td align="center">57.9</td>
+      <td align="center">57.6</td>
       <td align="center">27.8</td>
-      <td align="center">85.3</td>
-      <td align="center">73.5</td>
-      <td align="center">78.7</td>
+      <td align="center">84.9</td>
+      <td align="center">72.9</td>
+      <td align="center">84.6</td>
       <td align="center">99.1</td>
-      <td align="center">70.0 ± 1.1</td>
-    </tr>
-    <tr>
-      <td align="left">Marker v1.7.4 (hybrid)</td>
-      <td align="center"><strong>77.7</strong></td>
-      <td align="center">71.2</td>
-      <td align="center"><strong>78.1</strong></td>
-      <td align="center">32.3</td>
-      <td align="center">83.4</td>
-      <td align="center">73.8</td>
-      <td align="center">79.0</td>
-      <td align="center">99.2</td>
-      <td align="center">74.3 ± 1.1</td>
+      <td align="center">70.1 ± 1.1</td>
     </tr>
     <tr>
       <td align="left">MinerU v1.3.10</td>
@@ -83,7 +71,7 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o
     </tr>
     <tr>
       <td align="left">Mistral OCR API</td>
-      <td align="center">77.2</td>
+      <td align="center"><strong>77.2</strong></td>
       <td align="center">67.5</td>
       <td align="center">60.6</td>
       <td align="center">29.3</td>
@@ -169,7 +157,7 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o
       <td align="left">olmOCR v0.1.68 (No Anchor)</td>
       <td align="center">72.1</td>
       <td align="center">74.7</td>
-      <td align="center">71.5</td>
+      <td align="center"><strong>71.5</strong></td>
       <td align="center">43.7</td>
       <td align="center">91.6</td>
       <td align="center">78.5</td>
@@ -300,6 +288,3 @@ We have an internal data annotation tool that can be used to review the question
 ```bash
 python -m olmocr.bench.review_app --port 5000 --debug ./olmOCR-bench/bench_data/multi_column.jsonl --force
 ```
-
-
-

From 37090e2801e489c57b0f445acde3ce795352344f Mon Sep 17 00:00:00 2001
From: Jake Poznanski <jakep@allenai.org>
Date: Thu, 12 Jun 2025 22:35:08 +0000
Subject: [PATCH 18/18] Go back to workers 1 in marker test script

---
 scripts/run_marker_benchmark.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/run_marker_benchmark.sh b/scripts/run_marker_benchmark.sh
index 05d1179..332a3f0 100755
--- a/scripts/run_marker_benchmark.sh
+++ b/scripts/run_marker_benchmark.sh
@@ -151,7 +151,9 @@ perf_commands.extend([
     "pip install --upgrade torchvision",
     "pip install awscli",
     "aws s3 cp --recursive s3://ai2-oe-data/jakep/olmocr/olmOCR-mix-0225/benchmark_set/ /root/olmOCR-mix-0225_benchmark_set/",
-    "time marker --force_ocr /root/olmOCR-mix-0225_benchmark_set/ --output_dir /root/olmOCR-mix-0225_benchmark_set_marker --workers 8"
+    # Tried --workers 8 (command kept below for reference), but the run took far too long, so --workers is omitted
+    #"time marker --force_ocr /root/olmOCR-mix-0225_benchmark_set/ --output_dir /root/olmOCR-mix-0225_benchmark_set_marker --workers 8"
+    "time marker --force_ocr /root/olmOCR-mix-0225_benchmark_set/ --output_dir /root/olmOCR-mix-0225_benchmark_set_marker"
 ])
 
 # Build performance task spec