From cb4f23dc0ca926c0afe88085074bf566b5348fbe Mon Sep 17 00:00:00 2001 From: Tong Liang Date: Sat, 16 Aug 2025 21:48:07 -0400 Subject: [PATCH 01/40] Fix typo in README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b4c1327..fdda60b 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ Features: - (Based on a 7B parameter VLM, so it requires a GPU) ### News - - August 13, 2024 - v0.3.0 - [New model release](https://huggingface.co/allenai/olmOCR-7B-0825-FP8), fixes auto-rotation detection, and hallucinations on blank documents. + - August 13, 2025 - v0.3.0 - [New model release](https://huggingface.co/allenai/olmOCR-7B-0825-FP8), fixes auto-rotation detection, and hallucinations on blank documents. - July 24, 2025 - v0.2.1 - [New model release](https://huggingface.co/allenai/olmOCR-7B-0725-FP8), scores 3 points higher on [olmOCR-Bench](https://github.com/allenai/olmocr/tree/main/olmocr/bench), also runs significantly faster because it's default FP8, and needs much fewer retries per document. - July 23, 2025 - v0.2.0 - New cleaned up [trainer code](https://github.com/allenai/olmocr/tree/main/olmocr/train), makes it much simpler to train olmOCR models yourself. - June 17, 2025 - v0.1.75 - Switch from sglang to vllm based inference pipeline, updated docker image to CUDA 12.8. From b8a2b92174350dfbfdaf00015446567764988a59 Mon Sep 17 00:00:00 2001 From: Haydn Jones Date: Wed, 20 Aug 2025 19:21:38 -0400 Subject: [PATCH 02/40] External vLLM --- olmocr/pipeline.py | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 640a0aa..73d6603 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -213,7 +213,10 @@ async def apost(url, json_data): async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: str, page_num: int) -> PageResult: - COMPLETION_URL = f"http://localhost:{BASE_SERVER_PORT}/v1/chat/completions" + if args.external_vllm_url: + COMPLETION_URL = f"{args.external_vllm_url.rstrip('/')}/v1/chat/completions" + else: + COMPLETION_URL = f"http://localhost:{BASE_SERVER_PORT}/v1/chat/completions" MAX_RETRIES = args.max_page_retries MODEL_MAX_CONTEXT = 16384 TEMPERATURE_BY_ATTEMPT = [0.1, 0.1, 0.2, 0.3, 0.5, 0.8, 0.9, 1.0] @@ -607,6 +610,7 @@ async def vllm_server_task(model_name_or_path, args, semaphore, unknown_args=Non if unknown_args: cmd.extend(unknown_args) + breakpoint() proc = await asyncio.create_subprocess_exec( *cmd, stdout=asyncio.subprocess.PIPE, @@ -730,10 +734,13 @@ async def vllm_server_host(model_name_or_path, args, semaphore, unknown_args=Non sys.exit(1) -async def vllm_server_ready(): +async def vllm_server_ready(args): max_attempts = 300 delay_sec = 1 - url = f"http://localhost:{BASE_SERVER_PORT}/v1/models" + if args.external_vllm_url: + url = f"{args.external_vllm_url.rstrip('/')}/v1/models" + else: + url = f"http://localhost:{BASE_SERVER_PORT}/v1/models" for attempt in range(1, max_attempts + 1): try: @@ -1069,6 +1076,9 @@ async def main(): vllm_group.add_argument("--tensor-parallel-size", "-tp", type=int, default=1, help="Tensor parallel size for vLLM") vllm_group.add_argument("--data-parallel-size", "-dp", type=int, default=1, help="Data parallel size for vLLM") vllm_group.add_argument("--port", type=int, default=30024, help="Port to use for the VLLM server") + vllm_group.add_argument( + "--external-vllm-url", type=str, help="URL of external vLLM server (e.g., http://hostname:port). 
If provided, skips spawning local vLLM instance" + ) # Beaker/job running stuff beaker_group = parser.add_argument_group("beaker/cluster execution") @@ -1207,12 +1217,17 @@ async def main(): # If you get this far, then you are doing inference and need a GPU # check_sglang_version() - check_torch_gpu_available() + if not args.external_vllm_url: + check_torch_gpu_available() logger.info(f"Starting pipeline with PID {os.getpid()}") # Download the model before you do anything else - model_name_or_path = await download_model(args.model) + if args.external_vllm_url: + logger.info(f"Using external vLLM server at {args.external_vllm_url}") + model_name_or_path = None + else: + model_name_or_path = await download_model(args.model) # Initialize the work queue qsize = await work_queue.initialize_queue() @@ -1226,9 +1241,12 @@ async def main(): # As soon as one worker is no longer saturating the gpu, the next one can start sending requests semaphore = asyncio.Semaphore(1) - vllm_server = asyncio.create_task(vllm_server_host(model_name_or_path, args, semaphore, unknown_args)) + # Start local vLLM instance if not using external one + vllm_server = None + if not args.external_vllm_url: + vllm_server = asyncio.create_task(vllm_server_host(model_name_or_path, args, semaphore, unknown_args)) - await vllm_server_ready() + await vllm_server_ready(args) metrics_task = asyncio.create_task(metrics_reporter(work_queue)) @@ -1241,11 +1259,16 @@ async def main(): # Wait for all worker tasks to finish await asyncio.gather(*worker_tasks) - vllm_server.cancel() + # Cancel vLLM server if it was started + if vllm_server is not None: + vllm_server.cancel() metrics_task.cancel() # Wait for cancelled tasks to complete - await asyncio.gather(vllm_server, metrics_task, return_exceptions=True) + tasks_to_wait = [metrics_task] + if vllm_server is not None: + tasks_to_wait.append(vllm_server) + await asyncio.gather(*tasks_to_wait, return_exceptions=True) # Output final metrics summary metrics_summary = metrics.get_metrics_summary() From b34c3611e1101830d7245c40174f3d358462dc9b Mon Sep 17 00:00:00 2001 From: Haydn Jones Date: Wed, 20 Aug 2025 19:22:48 -0400 Subject: [PATCH 03/40] oopsy woopsy --- olmocr/pipeline.py | 1 - 1 file changed, 1 deletion(-) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 73d6603..143664d 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -610,7 +610,6 @@ async def vllm_server_task(model_name_or_path, args, semaphore, unknown_args=Non if unknown_args: cmd.extend(unknown_args) - breakpoint() proc = await asyncio.create_subprocess_exec( *cmd, stdout=asyncio.subprocess.PIPE, From 261c722f561e06f100174ebd17e5b1b0aea5c1e2 Mon Sep 17 00:00:00 2001 From: Haydn Jones Date: Thu, 21 Aug 2025 17:49:07 -0400 Subject: [PATCH 04/40] Update README + arg name --- README.md | 21 +++++++++++++++++++-- olmocr/pipeline.py | 18 +++++++++--------- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index fdda60b..024b641 100644 --- a/README.md +++ b/README.md @@ -196,6 +196,20 @@ python -m olmocr.pipeline ./localworkspace --markdown --pdfs tests/gnarly_pdfs/* With the addition of the `--markdown` flag, results will be stored as markdown files inside of `./localworkspace/markdown/`. 
+### Using External vLLM Server + +If you have a vLLM server already running elsewhere (or any inference platform implementing the relevant subset of the OpenAI API), you can point olmOCR to use it instead of spawning a local instance: + +```bash +# Use external vLLM server instead of local one +python -m olmocr.pipeline ./localworkspace --server http://remote-server:8000 --markdown --pdfs tests/gnarly_pdfs/*.pdf +``` + +The served model name should be `olmocr`. An example vLLM launch command would be: +```bash +vllm serve allenai/olmOCR-7B-0825-FP8 --served-model-name olmocr --max-model-len 16384 +``` + #### Viewing Results The `./localworkspace/` workspace folder will then have both [Dolma](https://github.com/allenai/dolma) and markdown files (if using `--markdown`). @@ -271,7 +285,7 @@ python -m olmocr.pipeline ./localworkspace --markdown --pdfs olmocr-sample.pdf python -m olmocr.pipeline --help usage: pipeline.py [-h] [--pdfs [PDFS ...]] [--model MODEL] [--workspace_profile WORKSPACE_PROFILE] [--pdf_profile PDF_PROFILE] [--pages_per_group PAGES_PER_GROUP] [--max_page_retries MAX_PAGE_RETRIES] [--max_page_error_rate MAX_PAGE_ERROR_RATE] [--workers WORKERS] [--apply_filter] [--stats] [--markdown] [--target_longest_image_dim TARGET_LONGEST_IMAGE_DIM] [--target_anchor_text_len TARGET_ANCHOR_TEXT_LEN] [--guided_decoding] [--gpu-memory-utilization GPU_MEMORY_UTILIZATION] [--max_model_len MAX_MODEL_LEN] - [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--data-parallel-size DATA_PARALLEL_SIZE] [--port PORT] [--beaker] [--beaker_workspace BEAKER_WORKSPACE] [--beaker_cluster BEAKER_CLUSTER] [--beaker_gpus BEAKER_GPUS] [--beaker_priority BEAKER_PRIORITY] + [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--data-parallel-size DATA_PARALLEL_SIZE] [--port PORT] [--server SERVER] [--beaker] [--beaker_workspace BEAKER_WORKSPACE] [--beaker_cluster BEAKER_CLUSTER] [--beaker_gpus BEAKER_GPUS] [--beaker_priority BEAKER_PRIORITY] workspace Manager for running millions of PDFs through a batch inference pipeline @@ -303,7 +317,7 @@ options: Maximum amount of anchor text to use (characters), not used for new models --guided_decoding Enable guided decoding for model YAML type outputs -VLLM Forwarded arguments: +VLLM arguments: --gpu-memory-utilization GPU_MEMORY_UTILIZATION Fraction of VRAM vLLM may pre-allocate for KV-cache (passed through to vllm serve). --max_model_len MAX_MODEL_LEN @@ -313,6 +327,9 @@ VLLM Forwarded arguments: --data-parallel-size DATA_PARALLEL_SIZE, -dp DATA_PARALLEL_SIZE Data parallel size for vLLM --port PORT Port to use for the VLLM server + --server SERVER URL of external vLLM (or other compatible provider) + server (e.g., http://hostname:port). 
If provided, + skips spawning local vLLM instance beaker/cluster execution: --beaker Submit this job to beaker instead of running locally diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 143664d..78ca51b 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -213,8 +213,8 @@ async def apost(url, json_data): async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: str, page_num: int) -> PageResult: - if args.external_vllm_url: - COMPLETION_URL = f"{args.external_vllm_url.rstrip('/')}/v1/chat/completions" + if args.server: + COMPLETION_URL = f"{args.server.rstrip('/')}/v1/chat/completions" else: COMPLETION_URL = f"http://localhost:{BASE_SERVER_PORT}/v1/chat/completions" MAX_RETRIES = args.max_page_retries @@ -736,8 +736,8 @@ async def vllm_server_host(model_name_or_path, args, semaphore, unknown_args=Non async def vllm_server_ready(args): max_attempts = 300 delay_sec = 1 - if args.external_vllm_url: - url = f"{args.external_vllm_url.rstrip('/')}/v1/models" + if args.server: + url = f"{args.server.rstrip('/')}/v1/models" else: url = f"http://localhost:{BASE_SERVER_PORT}/v1/models" @@ -1076,7 +1076,7 @@ async def main(): vllm_group.add_argument("--data-parallel-size", "-dp", type=int, default=1, help="Data parallel size for vLLM") vllm_group.add_argument("--port", type=int, default=30024, help="Port to use for the VLLM server") vllm_group.add_argument( - "--external-vllm-url", type=str, help="URL of external vLLM server (e.g., http://hostname:port). If provided, skips spawning local vLLM instance" + "--server", type=str, help="URL of external vLLM (or other compatible provider) server (e.g., http://hostname:port). If provided, skips spawning local vLLM instance" ) # Beaker/job running stuff @@ -1216,14 +1216,14 @@ async def main(): # If you get this far, then you are doing inference and need a GPU # check_sglang_version() - if not args.external_vllm_url: + if not args.server: check_torch_gpu_available() logger.info(f"Starting pipeline with PID {os.getpid()}") # Download the model before you do anything else - if args.external_vllm_url: - logger.info(f"Using external vLLM server at {args.external_vllm_url}") + if args.server: + logger.info(f"Using external server at {args.server}") model_name_or_path = None else: model_name_or_path = await download_model(args.model) @@ -1242,7 +1242,7 @@ async def main(): # Start local vLLM instance if not using external one vllm_server = None - if not args.external_vllm_url: + if not args.server: vllm_server = asyncio.create_task(vllm_server_host(model_name_or_path, args, semaphore, unknown_args)) await vllm_server_ready(args) From 2c638366489ad989919da84381d9b39c8458f824 Mon Sep 17 00:00:00 2001 From: Haydn Jones Date: Sat, 23 Aug 2025 20:07:05 -0400 Subject: [PATCH 05/40] Black and mock --- olmocr/pipeline.py | 4 +++- tests/test_pipeline.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 78ca51b..04a2170 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -1076,7 +1076,9 @@ async def main(): vllm_group.add_argument("--data-parallel-size", "-dp", type=int, default=1, help="Data parallel size for vLLM") vllm_group.add_argument("--port", type=int, default=30024, help="Port to use for the VLLM server") vllm_group.add_argument( - "--server", type=str, help="URL of external vLLM (or other compatible provider) server (e.g., http://hostname:port). 
If provided, skips spawning local vLLM instance" + "--server", + type=str, + help="URL of external vLLM (or other compatible provider) server (e.g., http://hostname:port). If provided, skips spawning local vLLM instance", ) # Beaker/job running stuff diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index e0d69d9..600753d 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -192,6 +192,7 @@ class MockArgs: max_page_retries: int = 8 target_longest_image_dim: int = 1288 guided_decoding: bool = False + server: str | None = None class TestRotationCorrection: From c7aa217281eccb7bc5a921757b2e2b1c9df761e0 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 25 Aug 2025 20:12:10 +0000 Subject: [PATCH 06/40] Scripts to run benchmarks better --- .gitignore | 3 ++ scripts/run_benchmark.sh | 68 +++++++++++++++++++++++++++++++++------- 2 files changed, 60 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index da29165..8f9d32b 100644 --- a/.gitignore +++ b/.gitignore @@ -22,8 +22,11 @@ table_data*/ /synth*/ dolma_samples/* old_train/ +filtered_items/ +filtered_items_prefilter/ augraphy_cache/ /*.html +html_templates*/ scoreelo.csv debug.log birrpipeline-debug.log diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh index 0c9863a..8725cb7 100755 --- a/scripts/run_benchmark.sh +++ b/scripts/run_benchmark.sh @@ -10,15 +10,25 @@ set -e # Parse command line arguments MODEL="" +B200_MODE="" +BENCH_BRANCH="" while [[ $# -gt 0 ]]; do case $1 in --model) MODEL="$2" shift 2 ;; + --b200) + B200_MODE="true" + shift + ;; + --benchbranch) + BENCH_BRANCH="$2" + shift 2 + ;; *) echo "Unknown option: $1" - echo "Usage: $0 [--model MODEL_NAME]" + echo "Usage: $0 [--model MODEL_NAME] [--b200] [--benchbranch BRANCH_NAME]" exit 1 ;; esac @@ -78,12 +88,27 @@ cat << 'EOF' > /tmp/run_benchmark_experiment.py import sys from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints, EnvVar -# Get image tag, beaker user, git branch, git hash, and optional model from command line +# Get image tag, beaker user, git branch, git hash, optional model, b200 mode, and bench branch from command line image_tag = sys.argv[1] beaker_user = sys.argv[2] git_branch = sys.argv[3] git_hash = sys.argv[4] -model = sys.argv[5] if len(sys.argv) > 5 else None +model = None +b200_mode = False +bench_branch = None + +# Parse remaining arguments +arg_idx = 5 +while arg_idx < len(sys.argv): + if sys.argv[arg_idx] == "--b200": + b200_mode = True + arg_idx += 1 + elif sys.argv[arg_idx] == "--benchbranch": + bench_branch = sys.argv[arg_idx + 1] + arg_idx += 2 + else: + model = sys.argv[arg_idx] + arg_idx += 1 # Initialize Beaker client b = Beaker.from_env(default_workspace="ai2/olmocr") @@ -111,11 +136,18 @@ if has_aws_creds: "mkdir -p ~/.aws", 'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials' ]) + +# Build git clone command with optional branch +git_clone_cmd = "git clone https://huggingface.co/datasets/allenai/olmOCR-bench" +if bench_branch: + git_clone_cmd += f" -b {bench_branch}" + commands.extend([ - "git clone https://huggingface.co/datasets/allenai/olmOCR-bench", + git_clone_cmd, "cd olmOCR-bench && git lfs pull && cd ..", pipeline_cmd, "python olmocr/bench/scripts/workspace_to_bench.py localworkspace/ olmOCR-bench/bench_data/olmocr --bench-path ./olmOCR-bench/", + "aws s3 cp --recursive localworkspace/ s3://ai2-oe-data/jakep/olmocr-bench-runs/$BEAKER_WORKLOAD_ID/", "python -m olmocr.bench.benchmark --dir 
./olmOCR-bench/bench_data" ]) @@ -132,7 +164,7 @@ task_spec_args = { preemptible=True, ), "resources": TaskResources(gpu_count=1), - "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]), + "constraints": Constraints(cluster=["ai2/titan-cirrascale"] if b200_mode else ["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]), "result": ResultSpec(path="/noop-results"), } @@ -181,9 +213,9 @@ perf_task_spec_args = { priority=Priority.normal, preemptible=True, ), - # Need to reserve all 8 gpus for performance spec or else benchmark results can be off - "resources": TaskResources(gpu_count=8), - "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]), + # Need to reserve all 8 gpus for performance spec or else benchmark results can be off (1 for b200 mode) + "resources": TaskResources(gpu_count=1 if b200_mode else 8), + "constraints": Constraints(cluster=["ai2/titan-cirrascale"] if b200_mode else ["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]), "result": ResultSpec(path="/noop-results"), } @@ -208,13 +240,27 @@ EOF # Run the Python script to create the experiments echo "Creating Beaker experiments..." + +# Build command with appropriate arguments +CMD="$PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH" + if [ -n "$MODEL" ]; then echo "Using model: $MODEL" - $PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH "$MODEL" -else - $PYTHON /tmp/run_benchmark_experiment.py $IMAGE_TAG $BEAKER_USER $GIT_BRANCH $GIT_HASH + CMD="$CMD $MODEL" fi +if [ -n "$B200_MODE" ]; then + echo "Using B200 mode: ai2/titan-cirrascale cluster with 1 GPU for perf task" + CMD="$CMD --b200" +fi + +if [ -n "$BENCH_BRANCH" ]; then + echo "Using bench branch: $BENCH_BRANCH" + CMD="$CMD --benchbranch $BENCH_BRANCH" +fi + +eval $CMD + # Clean up temporary file rm /tmp/run_benchmark_experiment.py From ad33672781f7cef03949d36e0aab46545d0365b7 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 25 Aug 2025 21:04:53 +0000 Subject: [PATCH 07/40] fix --- scripts/run_benchmark.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh index 8725cb7..e0eb6e4 100755 --- a/scripts/run_benchmark.sh +++ b/scripts/run_benchmark.sh @@ -147,7 +147,8 @@ commands.extend([ "cd olmOCR-bench && git lfs pull && cd ..", pipeline_cmd, "python olmocr/bench/scripts/workspace_to_bench.py localworkspace/ olmOCR-bench/bench_data/olmocr --bench-path ./olmOCR-bench/", - "aws s3 cp --recursive localworkspace/ s3://ai2-oe-data/jakep/olmocr-bench-runs/$BEAKER_WORKLOAD_ID/", + "pip install s5cmd", + "s5cmd cp localworkspace/ s3://ai2-oe-data/jakep/olmocr-bench-runs/$BEAKER_WORKLOAD_ID/", "python -m olmocr.bench.benchmark --dir ./olmOCR-bench/bench_data" ]) From 6be12c2e06fe6036a968bfd05daca2ca29e4d3ed Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 25 Aug 2025 22:01:24 +0000 Subject: [PATCH 08/40] Baseline tests for blanks --- olmocr/bench/tests.py | 75 ++++++++++++++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 19 deletions(-) diff --git a/olmocr/bench/tests.py b/olmocr/bench/tests.py index 320d31a..dc461f0 100644 --- a/olmocr/bench/tests.py +++ b/olmocr/bench/tests.py @@ -5,7 +5,7 @@ import unicodedata from concurrent.futures import ThreadPoolExecutor, as_completed from dataclasses import asdict, dataclass, field from enum import Enum -from typing import List, Optional, Set, Tuple +from typing import Dict, List, Optional, Set, 
Tuple, Union import numpy as np from bs4 import BeautifulSoup @@ -130,7 +130,7 @@ def normalize_text(md_content: str) -> str: md_content = re.sub(r"\*(.*?)\*", r"\1", md_content) md_content = re.sub(r"_(.*?)_", r"\1", md_content) - # Convert down to a consistent unicode form, so é == e + accent, unicode forms + # Convert down to a consistent unicode form, so é == e + accent, unicode forms md_content = unicodedata.normalize("NFC", md_content) # Dictionary of characters to replace: keys are fancy characters, values are ASCII equivalents, unicode micro with greek mu comes up often enough too @@ -867,11 +867,22 @@ class BaselineTest(BasePDFTest): """ + max_length: Optional[int] = None # Used to implement blank page checks + max_repeats: int = 30 check_disallowed_characters: bool = True def run(self, content: str) -> Tuple[bool, str]: - if len("".join(c for c in content if c.isalnum()).strip()) == 0: + base_content_len = len("".join(c for c in content if c.isalnum()).strip()) + + # If this a blank page check, then it short circuits the rest of the checks + if self.max_length is not None: + if base_content_len > self.max_length: + return False, f"{base_content_len} characters were output for a page we expected to be blank" + else: + return True, "" + + if base_content_len == 0: return False, "The text contains no alpha numeric characters" # Makes sure that the content has no egregious repeated ngrams at the end, which indicate a degradation of quality @@ -965,6 +976,45 @@ class MathTest(BasePDFTest): return False, f"No match found for {self.math} anywhere in content" +def load_single_test(data: Union[str, Dict]) -> BasePDFTest: + """ + Load a single test from a JSON line string or JSON object. + + Args: + data: Either a JSON string to parse or a dictionary containing test data. + + Returns: + A test object of the appropriate type. + + Raises: + ValidationError: If the test type is unknown or data is invalid. + json.JSONDecodeError: If the string cannot be parsed as JSON. + """ + # Handle JSON string input + if isinstance(data, str): + data = data.strip() + if not data: + raise ValueError("Empty string provided") + data = json.loads(data) + + # Process the test data + test_type = data.get("type") + if test_type in {TestType.PRESENT.value, TestType.ABSENT.value}: + test = TextPresenceTest(**data) + elif test_type == TestType.ORDER.value: + test = TextOrderTest(**data) + elif test_type == TestType.TABLE.value: + test = TableTest(**data) + elif test_type == TestType.MATH.value: + test = MathTest(**data) + elif test_type == TestType.BASELINE.value: + test = BaselineTest(**data) + else: + raise ValidationError(f"Unknown test type: {test_type}") + + return test + + def load_tests(jsonl_file: str) -> List[BasePDFTest]: """ Load tests from a JSONL file using parallel processing with a ThreadPoolExecutor. @@ -976,7 +1026,7 @@ def load_tests(jsonl_file: str) -> List[BasePDFTest]: A list of test objects. """ - def process_line(line_tuple: Tuple[int, str]) -> Optional[Tuple[int, BasePDFTest]]: + def process_line_with_number(line_tuple: Tuple[int, str]) -> Optional[Tuple[int, BasePDFTest]]: """ Process a single line from the JSONL file and return a tuple of (line_number, test object). Returns None for empty lines. 
@@ -987,20 +1037,7 @@ def load_tests(jsonl_file: str) -> List[BasePDFTest]: return None try: - data = json.loads(line) - test_type = data.get("type") - if test_type in {TestType.PRESENT.value, TestType.ABSENT.value}: - test = TextPresenceTest(**data) - elif test_type == TestType.ORDER.value: - test = TextOrderTest(**data) - elif test_type == TestType.TABLE.value: - test = TableTest(**data) - elif test_type == TestType.MATH.value: - test = MathTest(**data) - elif test_type == TestType.BASELINE.value: - test = BaselineTest(**data) - else: - raise ValidationError(f"Unknown test type: {test_type}") + test = load_single_test(line) return (line_number, test) except json.JSONDecodeError as e: print(f"Error parsing JSON on line {line_number}: {e}") @@ -1021,7 +1058,7 @@ def load_tests(jsonl_file: str) -> List[BasePDFTest]: # Use a ThreadPoolExecutor to process each line in parallel. with ThreadPoolExecutor(max_workers=min(os.cpu_count() or 1, 64)) as executor: # Submit all tasks concurrently. - futures = {executor.submit(process_line, item): item[0] for item in lines} + futures = {executor.submit(process_line_with_number, item): item[0] for item in lines} # Use tqdm to show progress as futures complete. for future in tqdm(as_completed(futures), total=len(futures), desc="Loading tests"): result = future.result() From 3eec58012c7b96f0880a8ec5f89b2bf5af2655a2 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Tue, 26 Aug 2025 17:52:50 +0000 Subject: [PATCH 09/40] Docker ignore --- .dockerignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.dockerignore b/.dockerignore index 6f6d633..919c2da 100644 --- a/.dockerignore +++ b/.dockerignore @@ -9,6 +9,9 @@ __pycache__ # Let's not copy any bash scripts from the scripts folder over, otherwise trashing the docker image too much with recent changes scripts/*.sh +scripts/**/*.sh # Nor copy any olmocr bench files -olmOCR-bench/ \ No newline at end of file +olmOCR-bench/ +olmOCR-bench*/ +html_templates*/ \ No newline at end of file From 03c7479a17014f4e104756c863a22678c9ae26c2 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Wed, 27 Aug 2025 16:33:37 +0000 Subject: [PATCH 10/40] VLLM version bump --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 163c5eb..694c0de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,7 @@ Changelog = "https://github.com/allenai/olmocr/blob/main/CHANGELOG.md" [project.optional-dependencies] gpu = [ - "vllm==0.10.0" + "vllm==0.10.1.1" ] dev = [ From 27792664bfdb5c6a2a1a457206ff396edd7ac5e7 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Wed, 27 Aug 2025 16:35:51 +0000 Subject: [PATCH 11/40] Transformers version bump needed also --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 694c0de..370be61 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ dependencies = [ "boto3", "httpx", "torch>=2.7.0", - "transformers==4.53.2", + "transformers==4.55.2", "img2pdf", "beaker-py", ] From edd098093b31de31431e2c9a95ad4b53f7b4efe2 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Wed, 27 Aug 2025 18:55:26 +0000 Subject: [PATCH 12/40] Reverting version changes that broke, vllm 0.10.1 is not good --- olmOCR-bench-0825 | 1 + olmOCR-bench-snapshot-082225 | 1 + pyproject.toml | 4 ++-- 3 files changed, 4 insertions(+), 2 deletions(-) create mode 160000 olmOCR-bench-0825 create mode 160000 olmOCR-bench-snapshot-082225 diff --git a/olmOCR-bench-0825 
b/olmOCR-bench-0825 new file mode 160000 index 0000000..a0100ab --- /dev/null +++ b/olmOCR-bench-0825 @@ -0,0 +1 @@ +Subproject commit a0100ab4cce52d7419cc09cce21aa42226118df2 diff --git a/olmOCR-bench-snapshot-082225 b/olmOCR-bench-snapshot-082225 new file mode 160000 index 0000000..eaa8289 --- /dev/null +++ b/olmOCR-bench-snapshot-082225 @@ -0,0 +1 @@ +Subproject commit eaa828947384ffce68f08c223a0f5f4e2f2df624 diff --git a/pyproject.toml b/pyproject.toml index 370be61..163c5eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ dependencies = [ "boto3", "httpx", "torch>=2.7.0", - "transformers==4.55.2", + "transformers==4.53.2", "img2pdf", "beaker-py", ] @@ -51,7 +51,7 @@ Changelog = "https://github.com/allenai/olmocr/blob/main/CHANGELOG.md" [project.optional-dependencies] gpu = [ - "vllm==0.10.1.1" + "vllm==0.10.0" ] dev = [ From f3cdc78b4f4a524889e3f9a48bd3b99330c928aa Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Sun, 31 Aug 2025 03:12:30 +0000 Subject: [PATCH 13/40] Pushing new version --- olmocr/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/olmocr/version.py b/olmocr/version.py index bf1c1af..79a242e 100644 --- a/olmocr/version.py +++ b/olmocr/version.py @@ -2,7 +2,7 @@ _MAJOR = "0" _MINOR = "3" # On main and in a nightly release the patch should be one ahead of the last # released build. -_PATCH = "3" +_PATCH = "4" # This is mainly for nightly builds which have the suffix ".dev$DATE". See # https://semver.org/#is-v123-a-semantic-version for the semantics. _SUFFIX = "" From 56b08d5aa4edd9de841b24184411a90e92a9e065 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Sun, 31 Aug 2025 03:12:39 +0000 Subject: [PATCH 14/40] Bump version to v0.3.4 for release --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 86f22d1..f711eb7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +## [v0.3.4](https://github.com/allenai/olmocr/releases/tag/v0.3.4) - 2025-08-31 + ## [v0.3.3](https://github.com/allenai/olmocr/releases/tag/v0.3.3) - 2025-08-15 ## [v0.3.2](https://github.com/allenai/olmocr/releases/tag/v0.3.2) - 2025-08-14 From c720c02d832d0e56e81fad7b83022996254c7040 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Tue, 2 Sep 2025 06:45:24 +0000 Subject: [PATCH 15/40] Cleaning up repo a bit --- .gitignore | 2 ++ olmOCR-bench-0825 | 1 - olmOCR-bench-snapshot-082225 | 1 - 3 files changed, 2 insertions(+), 2 deletions(-) delete mode 160000 olmOCR-bench-0825 delete mode 160000 olmOCR-bench-snapshot-082225 diff --git a/.gitignore b/.gitignore index 8f9d32b..1b40340 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,8 @@ old_train/ gpt4otestset_output/* pdfs/* olmOCR-bench/* +olmOCR-bench-0825/ +olmOCR-bench-snapshot-082225/ table_data*/ /synth*/ dolma_samples/* diff --git a/olmOCR-bench-0825 b/olmOCR-bench-0825 deleted file mode 160000 index a0100ab..0000000 --- a/olmOCR-bench-0825 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a0100ab4cce52d7419cc09cce21aa42226118df2 diff --git a/olmOCR-bench-snapshot-082225 b/olmOCR-bench-snapshot-082225 deleted file mode 160000 index eaa8289..0000000 --- a/olmOCR-bench-snapshot-082225 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit eaa828947384ffce68f08c223a0f5f4e2f2df624 From 8f88a98e5d10d9b2fc8bdba18a88f2fb62ce78fc Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 4 Sep 2025 22:15:55 +0000 Subject: [PATCH 16/40] prepare checkpoint script fixes 
--- olmocr/train/prepare_checkpoint.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/olmocr/train/prepare_checkpoint.py b/olmocr/train/prepare_checkpoint.py index 1ebe4c6..e8b6443 100755 --- a/olmocr/train/prepare_checkpoint.py +++ b/olmocr/train/prepare_checkpoint.py @@ -31,6 +31,7 @@ Examples: import argparse import concurrent.futures +import fnmatch import json import os import shutil @@ -59,11 +60,20 @@ TOKENIZER_FILES = ["chat_template.json", "merges.txt", "preprocessor_config.json SUPPORTED_ARCHITECTURES = ["Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration"] # Files to exclude from copying (training-related files) -EXCLUDED_FILES = {"optimizer.pt", "scheduler.pt", "rng_state.pth", "trainer_state.json", "training_args.bin"} +# Supports exact matches and glob patterns +EXCLUDED_FILES = {"optimizer.pt", "scheduler.pt", "rng_state.pth", "trainer_state.json", "training_args.bin", "*.pt", "*.pth"} s3_client = boto3.client("s3") +def should_exclude_file(filename: str) -> bool: + """Check if a file should be excluded based on EXCLUDED_FILES patterns.""" + for pattern in EXCLUDED_FILES: + if fnmatch.fnmatch(filename, pattern): + return True + return False + + def is_s3_path(path: str) -> bool: """Check if a path is an S3 path.""" return path.startswith("s3://") @@ -123,7 +133,7 @@ def copy_local_to_local(source_dir: str, dest_dir: str) -> None: files_to_copy = [] for root, _, files in os.walk(source_dir): for file in files: - if file in EXCLUDED_FILES: + if should_exclude_file(file): print(f"Skipping excluded file: {file}") continue src_path = os.path.join(root, file) @@ -164,7 +174,7 @@ def copy_s3_to_local(source_bucket: str, source_prefix: str, dest_dir: str) -> N continue filename = os.path.basename(key) - if filename in EXCLUDED_FILES: + if should_exclude_file(filename): print(f"Skipping excluded file: {filename}") continue @@ -187,7 +197,7 @@ def copy_local_to_s3(source_dir: str, dest_bucket: str, dest_prefix: str) -> Non upload_tasks = [] for root, _, files in os.walk(source_dir): for file in files: - if file in EXCLUDED_FILES: + if should_exclude_file(file): print(f"Skipping excluded file: {file}") continue local_path = os.path.join(root, file) @@ -218,7 +228,7 @@ def copy_s3_to_s3(source_bucket: str, source_prefix: str, dest_bucket: str, dest continue filename = os.path.basename(key) - if filename in EXCLUDED_FILES: + if should_exclude_file(filename): print(f"Skipping excluded file: {filename}") continue From fe425fde209b14d2801f885c45e9ec129f16c586 Mon Sep 17 00:00:00 2001 From: Charitarth Chugh <37895518+charitarthchugh@users.noreply.github.com> Date: Thu, 25 Sep 2025 14:29:49 -0400 Subject: [PATCH 17/40] Add chunked prefill and limit mm per prompt options --- olmocr/pipeline.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 04a2170..65ea7f1 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -599,6 +599,8 @@ async def vllm_server_task(model_name_or_path, args, semaphore, unknown_args=Non str(args.tensor_parallel_size), "--data-parallel-size", str(args.data_parallel_size), + "--enable-chunked-prefill", + "--limit-mm-per-prompt '{\"video\": 0}'" ] if args.gpu_memory_utilization is not None: From 7fe3f65de7da70e0bedf42462dc2abadf9750499 Mon Sep 17 00:00:00 2001 From: aman-17 Date: Fri, 26 Sep 2025 11:06:51 -0700 Subject: [PATCH 18/40] added support for deepinfra --- DEEPINFRA_SETUP.md | 64 +++++++++++++++++++++++++++++++++++++ olmocr/pipeline.py | 78 
+++++++++++++++++++++++++++++++++++----------- 2 files changed, 124 insertions(+), 18 deletions(-) create mode 100644 DEEPINFRA_SETUP.md diff --git a/DEEPINFRA_SETUP.md b/DEEPINFRA_SETUP.md new file mode 100644 index 0000000..5e18381 --- /dev/null +++ b/DEEPINFRA_SETUP.md @@ -0,0 +1,64 @@ +# Using olmOCR with DeepInfra + +This guide explains how to use olmOCR with DeepInfra's hosted API service for cloud-based inference. + +## Prerequisites + +1. **DeepInfra Account**: Sign up at https://deepinfra.com/ +2. **API Key**: Get your API key from the DeepInfra dashboard +3. **olmOCR**: Ensure you have the modified version with authentication support + +## Setup + +### 1. Get your DeepInfra API Key + +1. Log in to https://deepinfra.com/ +2. Navigate to your dashboard +3. Generate or copy your API key +4. Store it securely (recommended: as an environment variable) + +```bash +export DEEPINFRA_API_KEY="your-api-key-here" +``` + +### 2. Usage + +Run olmOCR with the DeepInfra server endpoint: + +```bash +python -m olmocr.pipeline ./localworkspace \ + --server https://api.deepinfra.com/v1/openai \ + --api_key $DEEPINFRA_API_KEY \ + --model allenai/olmOCR-7B-0725-FP8 \ + --markdown \ + --pdfs path/to/your/*.pdf +``` + +### Command Line Arguments + +- `--server`: DeepInfra's OpenAI-compatible endpoint: `https://api.deepinfra.com/v1/openai` +- `--api_key`: Your DeepInfra API key (or use environment variable) +- `--model`: The model identifier on DeepInfra: `allenai/olmOCR-7B-0725-FP8` +- Other arguments work the same as with local inference + +### Example with S3 Storage + +For large-scale processing with S3: + +```bash +python -m olmocr.pipeline s3://your-bucket/workspace \ + --server https://api.deepinfra.com/v1/openai \ + --api_key $DEEPINFRA_API_KEY \ + --model allenai/olmOCR-7B-0725-FP8 \ + --pdfs s3://your-bucket/pdfs/*.pdf \ + --workers 10 \ + --markdown +``` + +## Pricing + +As of 2024, DeepInfra charges for the olmOCR model: +- Input tokens: ~$0.27 per million tokens +- Output tokens: ~$0.81 per million tokens + +Check current pricing at: https://deepinfra.com/pricing \ No newline at end of file diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 04a2170..2b57a94 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -11,6 +11,7 @@ import os import random import re import shutil +import ssl import sys import tempfile import time @@ -104,7 +105,7 @@ class PageResult: is_fallback: bool -async def build_page_query(local_pdf_path: str, page: int, target_longest_image_dim: int, image_rotation: int = 0) -> dict: +async def build_page_query(local_pdf_path: str, page: int, target_longest_image_dim: int, image_rotation: int = 0, model_name: str = "olmocr") -> dict: MAX_TOKENS = 4500 assert image_rotation in [0, 90, 180, 270], "Invalid image rotation provided in build_page_query" @@ -132,7 +133,7 @@ async def build_page_query(local_pdf_path: str, page: int, target_longest_image_ image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8") return { - "model": "olmocr", + "model": model_name, "messages": [ { "role": "user", @@ -151,25 +152,44 @@ async def build_page_query(local_pdf_path: str, page: int, target_longest_image_ # It feels strange perhaps, but httpx and aiohttp are very complex beasts # Ex. 
the sessionpool in httpcore has 4 different locks in it, and I've noticed # that at the scale of 100M+ requests, that they deadlock in different strange ways -async def apost(url, json_data): +async def apost(url, json_data, api_key=None): parsed_url = urlparse(url) host = parsed_url.hostname - port = parsed_url.port or 80 + # Default to 443 for HTTPS, 80 for HTTP + if parsed_url.scheme == 'https': + port = parsed_url.port or 443 + use_ssl = True + else: + port = parsed_url.port or 80 + use_ssl = False path = parsed_url.path or "/" writer = None try: - reader, writer = await asyncio.open_connection(host, port) + if use_ssl: + ssl_context = ssl.create_default_context() + reader, writer = await asyncio.open_connection(host, port, ssl=ssl_context) + else: + reader, writer = await asyncio.open_connection(host, port) json_payload = json.dumps(json_data) - request = ( - f"POST {path} HTTP/1.1\r\n" - f"Host: {host}\r\n" - f"Content-Type: application/json\r\n" - f"Content-Length: {len(json_payload)}\r\n" - f"Connection: close\r\n\r\n" - f"{json_payload}" - ) + + # Build request headers + headers = [ + f"POST {path} HTTP/1.1", + f"Host: {host}", + f"Content-Type: application/json", + f"Content-Length: {len(json_payload)}", + ] + + # Add Authorization header if API key is provided + if api_key: + headers.append(f"Authorization: Bearer {api_key}") + + headers.append("Connection: close") + + # Construct the full request + request = "\r\n".join(headers) + "\r\n\r\n" + json_payload writer.write(request.encode()) await writer.drain() @@ -214,7 +234,13 @@ async def apost(url, json_data): async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: str, page_num: int) -> PageResult: if args.server: - COMPLETION_URL = f"{args.server.rstrip('/')}/v1/chat/completions" + server_url = args.server.rstrip('/') + # Check if the server URL already contains '/v1/openai' (DeepInfra case) + if '/v1/openai' in server_url: + COMPLETION_URL = f"{server_url}/chat/completions" + else: + COMPLETION_URL = f"{server_url}/v1/chat/completions" + logger.debug(f"Using completion URL: {COMPLETION_URL}") else: COMPLETION_URL = f"http://localhost:{BASE_SERVER_PORT}/v1/chat/completions" MAX_RETRIES = args.max_page_retries @@ -227,11 +253,14 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: while attempt < MAX_RETRIES: lookup_attempt = min(attempt, len(TEMPERATURE_BY_ATTEMPT) - 1) + # Use the model name from args if provided, otherwise default to 'olmocr' + model_name = getattr(args, 'model', 'olmocr') if args.server else 'olmocr' query = await build_page_query( pdf_local_path, page_num, args.target_longest_image_dim, image_rotation=cumulative_rotation, + model_name=model_name, ) # Change temperature as number of attempts increases to overcome repetition issues at expense of quality query["temperature"] = TEMPERATURE_BY_ATTEMPT[lookup_attempt] @@ -245,7 +274,9 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: logger.debug(f"Built page query for {pdf_orig_path}-{page_num}") try: - status_code, response_body = await apost(COMPLETION_URL, json_data=query) + # Pass API key if provided + api_key = getattr(args, 'api_key', None) + status_code, response_body = await apost(COMPLETION_URL, json_data=query, api_key=api_key) if status_code == 400: raise ValueError(f"Got BadRequestError from server: {response_body}, skipping this response") @@ -737,14 +768,24 @@ async def vllm_server_ready(args): max_attempts = 300 delay_sec = 1 if args.server: - url = 
f"{args.server.rstrip('/')}/v1/models" + # Check if the server URL already contains '/v1/openai' (DeepInfra case) + server_url = args.server.rstrip('/') + if '/v1/openai' in server_url: + url = f"{server_url}/models" + else: + url = f"{server_url}/v1/models" else: url = f"http://localhost:{BASE_SERVER_PORT}/v1/models" for attempt in range(1, max_attempts + 1): try: + # Add authentication headers if API key is provided + headers = {} + if args.server and hasattr(args, 'api_key') and args.api_key: + headers['Authorization'] = f'Bearer {args.api_key}' + async with httpx.AsyncClient() as session: - response = await session.get(url) + response = await session.get(url, headers=headers) if response.status_code == 200: logger.info("vllm server is ready.") @@ -1064,7 +1105,8 @@ async def main(): parser.add_argument("--target_longest_image_dim", type=int, help="Dimension on longest side to use for rendering the pdf pages", default=1288) parser.add_argument("--target_anchor_text_len", type=int, help="Maximum amount of anchor text to use (characters), not used for new models", default=-1) parser.add_argument("--guided_decoding", action="store_true", help="Enable guided decoding for model YAML type outputs") - + parser.add_argument('--api_key', type=str, default=None, help='API key for authenticated remote servers (e.g., DeepInfra)') + vllm_group = parser.add_argument_group( "VLLM arguments", "These arguments are passed to vLLM. Any unrecognized arguments are also automatically forwarded to vLLM." ) From 2a5792e5ed9044f14577706f1e4ca942448a5bbf Mon Sep 17 00:00:00 2001 From: aman-17 Date: Fri, 26 Sep 2025 13:29:48 -0700 Subject: [PATCH 19/40] add if else for vllm local usage bug for api argument --- olmocr/pipeline.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 2b57a94..90ffce0 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -253,8 +253,13 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: while attempt < MAX_RETRIES: lookup_attempt = min(attempt, len(TEMPERATURE_BY_ATTEMPT) - 1) - # Use the model name from args if provided, otherwise default to 'olmocr' - model_name = getattr(args, 'model', 'olmocr') if args.server else 'olmocr' + # For external servers (like DeepInfra), use the model name from args + # For local inference, always use 'olmocr' + if args.server and hasattr(args, 'model'): + model_name = args.model + else: + model_name = 'olmocr' + query = await build_page_query( pdf_local_path, page_num, @@ -274,8 +279,11 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: logger.debug(f"Built page query for {pdf_orig_path}-{page_num}") try: - # Pass API key if provided - api_key = getattr(args, 'api_key', None) + # Pass API key only for external servers that need authentication + if args.server and hasattr(args, 'api_key'): + api_key = args.api_key + else: + api_key = None status_code, response_body = await apost(COMPLETION_URL, json_data=query, api_key=api_key) if status_code == 400: From 90589e16de31dfc742bd53959100eaa2bea9824d Mon Sep 17 00:00:00 2001 From: aman-17 Date: Fri, 26 Sep 2025 13:56:34 -0700 Subject: [PATCH 20/40] Added deepinfra usage to readme --- DEEPINFRA_SETUP.md | 64 ---------------------------------------------- README.md | 20 +++++++++++++++ 2 files changed, 20 insertions(+), 64 deletions(-) delete mode 100644 DEEPINFRA_SETUP.md diff --git a/DEEPINFRA_SETUP.md b/DEEPINFRA_SETUP.md deleted file mode 100644 index 
5e18381..0000000 --- a/DEEPINFRA_SETUP.md +++ /dev/null @@ -1,64 +0,0 @@ -# Using olmOCR with DeepInfra - -This guide explains how to use olmOCR with DeepInfra's hosted API service for cloud-based inference. - -## Prerequisites - -1. **DeepInfra Account**: Sign up at https://deepinfra.com/ -2. **API Key**: Get your API key from the DeepInfra dashboard -3. **olmOCR**: Ensure you have the modified version with authentication support - -## Setup - -### 1. Get your DeepInfra API Key - -1. Log in to https://deepinfra.com/ -2. Navigate to your dashboard -3. Generate or copy your API key -4. Store it securely (recommended: as an environment variable) - -```bash -export DEEPINFRA_API_KEY="your-api-key-here" -``` - -### 2. Usage - -Run olmOCR with the DeepInfra server endpoint: - -```bash -python -m olmocr.pipeline ./localworkspace \ - --server https://api.deepinfra.com/v1/openai \ - --api_key $DEEPINFRA_API_KEY \ - --model allenai/olmOCR-7B-0725-FP8 \ - --markdown \ - --pdfs path/to/your/*.pdf -``` - -### Command Line Arguments - -- `--server`: DeepInfra's OpenAI-compatible endpoint: `https://api.deepinfra.com/v1/openai` -- `--api_key`: Your DeepInfra API key (or use environment variable) -- `--model`: The model identifier on DeepInfra: `allenai/olmOCR-7B-0725-FP8` -- Other arguments work the same as with local inference - -### Example with S3 Storage - -For large-scale processing with S3: - -```bash -python -m olmocr.pipeline s3://your-bucket/workspace \ - --server https://api.deepinfra.com/v1/openai \ - --api_key $DEEPINFRA_API_KEY \ - --model allenai/olmOCR-7B-0725-FP8 \ - --pdfs s3://your-bucket/pdfs/*.pdf \ - --workers 10 \ - --markdown -``` - -## Pricing - -As of 2024, DeepInfra charges for the olmOCR model: -- Input tokens: ~$0.27 per million tokens -- Output tokens: ~$0.81 per million tokens - -Check current pricing at: https://deepinfra.com/pricing \ No newline at end of file diff --git a/README.md b/README.md index 024b641..e385e70 100644 --- a/README.md +++ b/README.md @@ -249,6 +249,26 @@ For example: ```bash python -m olmocr.pipeline s3://my_s3_bucket/pdfworkspaces/exampleworkspace --pdfs s3://my_s3_bucket/jakep/gnarly_pdfs/*.pdf --beaker --beaker_gpus 4 ``` +### Using DeepInfra +Signup at [DeepInfra](https://deepinfra.com/) and get your API key from the DeepInfra dashboard. +Store the API key as an environment variable. 
+```bash +export DEEPINFRA_API_KEY="your-api-key-here" +``` +#### Run olmOCR with the DeepInfra server endpoint: +```bash +python -m olmocr.pipeline ./localworkspace \ + --server https://api.deepinfra.com/v1/openai \ + --api_key $DEEPINFRA_API_KEY \ + --model allenai/olmOCR-7B-0725-FP8 \ + --markdown \ + --pdfs path/to/your/*.pdf +``` +- `--server`: DeepInfra's OpenAI-compatible endpoint: `https://api.deepinfra.com/v1/openai` +- `--api_key`: Your DeepInfra API key +- `--model`: The model identifier on DeepInfra: `allenai/olmOCR-7B-0725-FP8` +- Other arguments work the same as with local inference + ### Using Docker From e7ae5e6240c34a22bb32ba51fd59743c6be784f5 Mon Sep 17 00:00:00 2001 From: aman-17 Date: Fri, 26 Sep 2025 13:58:34 -0700 Subject: [PATCH 21/40] fixed style --- olmocr/pipeline.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 90ffce0..b64fb60 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -156,7 +156,7 @@ async def apost(url, json_data, api_key=None): parsed_url = urlparse(url) host = parsed_url.hostname # Default to 443 for HTTPS, 80 for HTTP - if parsed_url.scheme == 'https': + if parsed_url.scheme == "https": port = parsed_url.port or 443 use_ssl = True else: @@ -234,9 +234,9 @@ async def apost(url, json_data, api_key=None): async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: str, page_num: int) -> PageResult: if args.server: - server_url = args.server.rstrip('/') + server_url = args.server.rstrip("/") # Check if the server URL already contains '/v1/openai' (DeepInfra case) - if '/v1/openai' in server_url: + if "/v1/openai" in server_url: COMPLETION_URL = f"{server_url}/chat/completions" else: COMPLETION_URL = f"{server_url}/v1/chat/completions" @@ -255,10 +255,10 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: lookup_attempt = min(attempt, len(TEMPERATURE_BY_ATTEMPT) - 1) # For external servers (like DeepInfra), use the model name from args # For local inference, always use 'olmocr' - if args.server and hasattr(args, 'model'): + if args.server and hasattr(args, "model"): model_name = args.model else: - model_name = 'olmocr' + model_name = "olmocr" query = await build_page_query( pdf_local_path, @@ -280,7 +280,7 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: try: # Pass API key only for external servers that need authentication - if args.server and hasattr(args, 'api_key'): + if args.server and hasattr(args, "api_key"): api_key = args.api_key else: api_key = None @@ -777,8 +777,8 @@ async def vllm_server_ready(args): delay_sec = 1 if args.server: # Check if the server URL already contains '/v1/openai' (DeepInfra case) - server_url = args.server.rstrip('/') - if '/v1/openai' in server_url: + server_url = args.server.rstrip("/") + if "/v1/openai" in server_url: url = f"{server_url}/models" else: url = f"{server_url}/v1/models" @@ -789,8 +789,8 @@ async def vllm_server_ready(args): try: # Add authentication headers if API key is provided headers = {} - if args.server and hasattr(args, 'api_key') and args.api_key: - headers['Authorization'] = f'Bearer {args.api_key}' + if args.server and hasattr(args, "api_key") and args.api_key: + headers["Authorization"] = f"Bearer {args.api_key}" async with httpx.AsyncClient() as session: response = await session.get(url, headers=headers) @@ -1113,8 +1113,8 @@ async def main(): parser.add_argument("--target_longest_image_dim", type=int, 
help="Dimension on longest side to use for rendering the pdf pages", default=1288) parser.add_argument("--target_anchor_text_len", type=int, help="Maximum amount of anchor text to use (characters), not used for new models", default=-1) parser.add_argument("--guided_decoding", action="store_true", help="Enable guided decoding for model YAML type outputs") - parser.add_argument('--api_key', type=str, default=None, help='API key for authenticated remote servers (e.g., DeepInfra)') - + parser.add_argument("--api_key", type=str, default=None, help="API key for authenticated remote servers (e.g., DeepInfra)") + vllm_group = parser.add_argument_group( "VLLM arguments", "These arguments are passed to vLLM. Any unrecognized arguments are also automatically forwarded to vLLM." ) From 556ff26d585c860194c887c5042eadf5d2878bfb Mon Sep 17 00:00:00 2001 From: aman-17 Date: Fri, 26 Sep 2025 14:08:40 -0700 Subject: [PATCH 22/40] fixed lint, style, ruff --- olmocr/pipeline.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index b64fb60..1c5febb 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -174,7 +174,6 @@ async def apost(url, json_data, api_key=None): json_payload = json.dumps(json_data) - # Build request headers headers = [ f"POST {path} HTTP/1.1", f"Host: {host}", @@ -182,18 +181,15 @@ async def apost(url, json_data, api_key=None): f"Content-Length: {len(json_payload)}", ] - # Add Authorization header if API key is provided if api_key: headers.append(f"Authorization: Bearer {api_key}") headers.append("Connection: close") - # Construct the full request request = "\r\n".join(headers) + "\r\n\r\n" + json_payload writer.write(request.encode()) await writer.drain() - # Read status line status_line = await reader.readline() if not status_line: raise ConnectionError("No response from server") @@ -279,7 +275,7 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: logger.debug(f"Built page query for {pdf_orig_path}-{page_num}") try: - # Pass API key only for external servers that need authentication + # Passing API key only for external servers that need authentication if args.server and hasattr(args, "api_key"): api_key = args.api_key else: @@ -787,7 +783,6 @@ async def vllm_server_ready(args): for attempt in range(1, max_attempts + 1): try: - # Add authentication headers if API key is provided headers = {} if args.server and hasattr(args, "api_key") and args.api_key: headers["Authorization"] = f"Bearer {args.api_key}" From 359abef6547dec814c4cbaaa57b1ef4e26641888 Mon Sep 17 00:00:00 2001 From: aman-17 Date: Fri, 26 Sep 2025 14:19:22 -0700 Subject: [PATCH 23/40] updated pytests --- tests/test_pipeline.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 600753d..71a2f3c 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -268,9 +268,9 @@ This is the corrected text from the document.""" build_page_query_calls = [] original_build_page_query = build_page_query - async def mock_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation=0): + async def mock_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation=0, model_name="olmocr"): build_page_query_calls.append(image_rotation) - return await original_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation) + return await original_build_page_query(local_pdf_path, page, 
target_longest_image_dim, image_rotation, model_name) with patch("olmocr.pipeline.apost", side_effect=mock_apost): with patch("olmocr.pipeline.tracker", mock_tracker): @@ -376,9 +376,9 @@ Document is now correctly oriented after 180 degree rotation.""" build_page_query_calls = [] original_build_page_query = build_page_query - async def mock_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation=0): + async def mock_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation=0, model_name="olmocr"): build_page_query_calls.append(image_rotation) - return await original_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation) + return await original_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation, model_name) with patch("olmocr.pipeline.apost", side_effect=mock_apost): with patch("olmocr.pipeline.tracker", mock_tracker): @@ -482,9 +482,9 @@ Document correctly oriented at 90 degrees total rotation.""" build_page_query_calls = [] original_build_page_query = build_page_query - async def mock_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation=0): + async def mock_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation=0, model_name="olmocr"): build_page_query_calls.append(image_rotation) - return await original_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation) + return await original_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation, model_name) with patch("olmocr.pipeline.apost", side_effect=mock_apost): with patch("olmocr.pipeline.tracker", mock_tracker): From f3c4073395a967e038b0ab092b1a6d8ed12adcb5 Mon Sep 17 00:00:00 2001 From: aman-17 Date: Fri, 26 Sep 2025 14:25:25 -0700 Subject: [PATCH 24/40] added Api_key argument to pipeline pytests --- tests/test_pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 71a2f3c..1541639 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -209,7 +209,7 @@ class TestRotationCorrection: # Counter to track number of API calls call_count = 0 - async def mock_apost(url, json_data): + async def mock_apost(url, json_data, api_key=None): nonlocal call_count call_count += 1 @@ -311,7 +311,7 @@ This is the corrected text from the document.""" # Counter to track number of API calls call_count = 0 - async def mock_apost(url, json_data): + async def mock_apost(url, json_data, api_key=None): nonlocal call_count call_count += 1 @@ -420,7 +420,7 @@ Document is now correctly oriented after 180 degree rotation.""" # Counter to track number of API calls call_count = 0 - async def mock_apost(url, json_data): + async def mock_apost(url, json_data, api_key=None): nonlocal call_count call_count += 1 From 9c750903cef946c6cdaff9e5ec4197c831cde970 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 29 Sep 2025 17:06:14 +0000 Subject: [PATCH 25/40] Ignore files --- .dockerignore | 3 ++- .gitignore | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.dockerignore b/.dockerignore index 919c2da..0f3fbb2 100644 --- a/.dockerignore +++ b/.dockerignore @@ -14,4 +14,5 @@ scripts/**/*.sh # Nor copy any olmocr bench files olmOCR-bench/ olmOCR-bench*/ -html_templates*/ \ No newline at end of file +html_templates*/ +olmocr-synthmix-*/ \ No newline at end of file diff --git a/.gitignore b/.gitignore index 1b40340..77b9cb3 100644 --- a/.gitignore +++ b/.gitignore 
@@ -18,8 +18,6 @@ old_train/
 gpt4otestset_output/*
 pdfs/*
 olmOCR-bench/*
-olmOCR-bench-0825/
-olmOCR-bench-snapshot-082225/
 table_data*/
 /synth*/
 dolma_samples/*
@@ -29,6 +27,7 @@ filtered_items_prefilter/
 augraphy_cache/
 /*.html
 html_templates*/
+olmocr-synthmix*/
 scoreelo.csv
 debug.log
 birrpipeline-debug.log

From 0c6d88986352862381b29a4d2396aea6f8f1a7df Mon Sep 17 00:00:00 2001
From: Jake Poznanski
Date: Mon, 29 Sep 2025 17:26:22 +0000
Subject: [PATCH 26/40] Adding retry code on 429 errors from external providers

---
 README.md          | 2 ++
 olmocr/pipeline.py | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/README.md b/README.md
index e385e70..23a87a4 100644
--- a/README.md
+++ b/README.md
@@ -260,12 +260,14 @@ export DEEPINFRA_API_KEY="your-api-key-here"
 python -m olmocr.pipeline ./localworkspace \
     --server https://api.deepinfra.com/v1/openai \
     --api_key $DEEPINFRA_API_KEY \
+    --pages_per_group 100 \
     --model allenai/olmOCR-7B-0725-FP8 \
     --markdown \
     --pdfs path/to/your/*.pdf
 ```
 - `--server`: DeepInfra's OpenAI-compatible endpoint: `https://api.deepinfra.com/v1/openai`
 - `--api_key`: Your DeepInfra API key
+- `--pages_per_group`: You may want a smaller number of pages per group as many external providers have lower concurrent request limits
 - `--model`: The model identifier on DeepInfra: `allenai/olmOCR-7B-0725-FP8`
 - Other arguments work the same as with local inference
 
diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py
index 1c5febb..fb3893f 100644
--- a/olmocr/pipeline.py
+++ b/olmocr/pipeline.py
@@ -284,6 +284,8 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path:
 
         if status_code == 400:
             raise ValueError(f"Got BadRequestError from server: {response_body}, skipping this response")
+        elif status_code == 429:
+            raise ConnectionError(f"Too many requests, doing exponential backoff")
         elif status_code == 500:
             raise ValueError(f"Got InternalServerError from server: {response_body}, skipping this response")
         elif status_code != 200:

From a0bc5a46908d9f32d9c8360c7de1a07ca7588a25 Mon Sep 17 00:00:00 2001
From: Jake Poznanski
Date: Mon, 29 Sep 2025 17:29:28 +0000
Subject: [PATCH 27/40] Deepinfra readme

---
 README.md | 44 +++++++++++++++++++++++---------------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 23a87a4..d12af55 100644
--- a/README.md
+++ b/README.md
@@ -210,6 +210,29 @@ The served model name should be `olmocr`. An example vLLM launch command would b
 vllm serve allenai/olmOCR-7B-0825-FP8 --served-model-name olmocr --max-model-len 16384
 ```
 
+#### Run olmOCR with the DeepInfra server endpoint:
+Signup at [DeepInfra](https://deepinfra.com/) and get your API key from the DeepInfra dashboard.
+Store the API key as an environment variable.
+```bash
+export DEEPINFRA_API_KEY="your-api-key-here"
+```
+
+```bash
+python -m olmocr.pipeline ./localworkspace \
+    --server https://api.deepinfra.com/v1/openai \
+    --api_key $DEEPINFRA_API_KEY \
+    --pages_per_group 100 \
+    --model allenai/olmOCR-7B-0725-FP8 \
+    --markdown \
+    --pdfs path/to/your/*.pdf
+```
+- `--server`: DeepInfra's OpenAI-compatible endpoint: `https://api.deepinfra.com/v1/openai`
+- `--api_key`: Your DeepInfra API key
+- `--pages_per_group`: You may want a smaller number of pages per group as many external providers have lower concurrent request limits
+- `--model`: The model identifier on DeepInfra: `allenai/olmOCR-7B-0725-FP8`
+- Other arguments work the same as with local inference
+
+
 #### Viewing Results
 
 The `./localworkspace/` workspace folder will then have both [Dolma](https://github.com/allenai/dolma) and markdown files (if using `--markdown`).
@@ -249,27 +272,6 @@ For example:
 ```bash
 python -m olmocr.pipeline s3://my_s3_bucket/pdfworkspaces/exampleworkspace --pdfs s3://my_s3_bucket/jakep/gnarly_pdfs/*.pdf --beaker --beaker_gpus 4
 ```
-### Using DeepInfra
-Signup at [DeepInfra](https://deepinfra.com/) and get your API key from the DeepInfra dashboard.
-Store the API key as an environment variable.
-```bash
-export DEEPINFRA_API_KEY="your-api-key-here"
-```
-#### Run olmOCR with the DeepInfra server endpoint:
-```bash
-python -m olmocr.pipeline ./localworkspace \
-    --server https://api.deepinfra.com/v1/openai \
-    --api_key $DEEPINFRA_API_KEY \
-    --pages_per_group 100 \
-    --model allenai/olmOCR-7B-0725-FP8 \
-    --markdown \
-    --pdfs path/to/your/*.pdf
-```
-- `--server`: DeepInfra's OpenAI-compatible endpoint: `https://api.deepinfra.com/v1/openai`
-- `--api_key`: Your DeepInfra API key
-- `--pages_per_group`: You may want a smaller number of pages per group as many external providers have lower concurrent request limits
-- `--model`: The model identifier on DeepInfra: `allenai/olmOCR-7B-0725-FP8`
-- Other arguments work the same as with local inference
 
 ### Using Docker
 
From c587eb90506c3ca8093fa85b1f5698b5c5cf5ee1 Mon Sep 17 00:00:00 2001
From: Jake Poznanski
Date: Mon, 29 Sep 2025 17:36:41 +0000
Subject: [PATCH 28/40] Ugh, release script adds all files by default

---
 olmocr/version.py  | 2 +-
 scripts/release.sh | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/olmocr/version.py b/olmocr/version.py
index 79a242e..cee5284 100644
--- a/olmocr/version.py
+++ b/olmocr/version.py
@@ -2,7 +2,7 @@ _MAJOR = "0"
 _MINOR = "3"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "4"
+_PATCH = "6"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""
diff --git a/scripts/release.sh b/scripts/release.sh
index dc5ab60..718dc29 100755
--- a/scripts/release.sh
+++ b/scripts/release.sh
@@ -68,7 +68,6 @@ read -p "Creating new release for $TAG. Do you want to continue? 
[Y/n] " prompt if [[ $prompt == "y" || $prompt == "Y" || $prompt == "yes" || $prompt == "Yes" ]]; then python scripts/prepare_changelog.py - git add -A git commit -m "Bump version to $TAG for release" || true && git push echo "Creating new git tag $TAG" git tag "$TAG" -m "$TAG" From fb1ef9e38af7f9dcfae2b8d8cf77663224c43f8a Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 29 Sep 2025 17:37:14 +0000 Subject: [PATCH 29/40] Release script fix --- scripts/release.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/release.sh b/scripts/release.sh index 718dc29..ef30083 100755 --- a/scripts/release.sh +++ b/scripts/release.sh @@ -68,6 +68,7 @@ read -p "Creating new release for $TAG. Do you want to continue? [Y/n] " prompt if [[ $prompt == "y" || $prompt == "Y" || $prompt == "yes" || $prompt == "Yes" ]]; then python scripts/prepare_changelog.py + git add CHANGELOG.md git commit -m "Bump version to $TAG for release" || true && git push echo "Creating new git tag $TAG" git tag "$TAG" -m "$TAG" From 8982bae756c738ad63962b0a60c304b037665197 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 29 Sep 2025 17:37:25 +0000 Subject: [PATCH 30/40] Bump version to v0.3.6 for release --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f711eb7..09fd707 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +## [v0.3.6](https://github.com/allenai/olmocr/releases/tag/v0.3.6) - 2025-09-29 + ## [v0.3.4](https://github.com/allenai/olmocr/releases/tag/v0.3.4) - 2025-08-31 ## [v0.3.3](https://github.com/allenai/olmocr/releases/tag/v0.3.3) - 2025-08-15 From f4356de0910ad3243ca57ced27ef730ec5434150 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 29 Sep 2025 17:56:03 +0000 Subject: [PATCH 31/40] deepinfra readme improved --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d12af55..08c0e81 100644 --- a/README.md +++ b/README.md @@ -222,14 +222,14 @@ python -m olmocr.pipeline ./localworkspace \ --server https://api.deepinfra.com/v1/openai \ --api_key $DEEPINFRA_API_KEY \ --pages_per_group 100 \ - --model allenai/olmOCR-7B-0725-FP8 \ + --model allenai/olmOCR-7B-0825 \ --markdown \ --pdfs path/to/your/*.pdf ``` - `--server`: DeepInfra's OpenAI-compatible endpoint: `https://api.deepinfra.com/v1/openai` - `--api_key`: Your DeepInfra API key - `--pages_per_group`: You may want a smaller number of pages per group as many external provides have lower concurrent request limits -- `--model`: The model identifier on DeepInfra: `allenai/olmOCR-7B-0725-FP8` +- `--model`: The model identifier on DeepInfra: `allenai/olmOCR-7B-0825` - Other arguments work the same as with local inference From 9feb41af82f40a0358614a896dcbaa6793cb7ea8 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 6 Oct 2025 18:57:16 +0000 Subject: [PATCH 32/40] New docker file approach for vllm 0.11 --- Dockerfile | 88 ++++++++++++++++++++++---------------------------- pyproject.toml | 2 +- 2 files changed, 40 insertions(+), 50 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2ac06b0..515ecd7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,62 +1,52 @@ -ARG CUDA_VERSION=12.8.1 -FROM --platform=linux/amd64 nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 +FROM vllm/vllm-openai:v0.11.0 -# Needs to be repeated below the FROM, or else it's not picked up -ARG PYTHON_VERSION=3.12 -ARG CUDA_VERSION=12.8.1 +ENV 
PYTHON_VERSION=3.12 +ENV CUSTOM_PY="/usr/bin/python${PYTHON_VERSION}" -# Set environment variable to prevent interactive prompts -ENV DEBIAN_FRONTEND=noninteractive - -# From original VLLM dockerfile https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile -# Install Python and other dependencies -RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ - && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ - && apt-get update -y \ - && apt-get install -y ccache software-properties-common git curl sudo python3-apt \ - && for i in 1 2 3; do \ - add-apt-repository -y ppa:deadsnakes/ppa && break || \ - { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \ - done \ - && apt-get update -y \ - && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv - -# olmOCR Specific Installs - Install fonts BEFORE changing Python version -RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections && \ +# Workaround for installing fonts, which are needed for good rendering of documents +RUN DIST_PY=$(ls /usr/bin/python3.[0-9]* | sort -V | head -n1) && \ + # If a python alternative scheme already exists, remember its value so we \ + # can restore it later; otherwise, we will restore to CUSTOM_PY when we \ + # are done. \ + if update-alternatives --query python3 >/dev/null 2>&1; then \ + ORIGINAL_PY=$(update-alternatives --query python3 | awk -F": " '/Value:/ {print $2}'); \ + else \ + ORIGINAL_PY=$CUSTOM_PY; \ + fi && \ + # ---- APT operations that require the distro python3 ------------------- \ + echo "Temporarily switching python3 alternative to ${DIST_PY} so that APT scripts use the distro‑built Python runtime." 
&& \ + update-alternatives --install /usr/bin/python3 python3 ${DIST_PY} 1 && \ + update-alternatives --set python3 ${DIST_PY} && \ + update-alternatives --install /usr/bin/python python ${DIST_PY} 1 && \ + update-alternatives --set python ${DIST_PY} && \ apt-get update -y && \ - apt-get install -y --no-install-recommends poppler-utils fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools ttf-mscorefonts-installer - -# Now update Python alternatives -RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ - && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ - && update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1 \ - && update-alternatives --set python /usr/bin/python${PYTHON_VERSION} \ - && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ - && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ - && python3 --version && python3 -m pip --version - -# Install uv for faster pip installs -RUN --mount=type=cache,target=/root/.cache/uv python3 -m pip install uv - -# Install some helper utilities for things like the benchmark -RUN apt-get update -y && apt-get install -y --no-install-recommends \ - git \ - git-lfs \ - curl \ - wget \ - unzip - -ENV PYTHONUNBUFFERED=1 + apt-get remove -y python3-blinker || true && \ + # Pre‑seed the Microsoft Core Fonts EULA so the build is non‑interactive \ + echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + python3-apt \ + update-notifier-common \ + poppler-utils \ + fonts-crosextra-caladea \ + fonts-crosextra-carlito \ + gsfonts \ + lcdf-typetools \ + ttf-mscorefonts-installer && \ + # ---- Restore the original / custom Python alternative ----------------- \ + echo "Restoring python3 alternative to ${ORIGINAL_PY}" && \ + update-alternatives --install /usr/bin/python3 python3 ${ORIGINAL_PY} 1 && \ + update-alternatives --set python3 ${ORIGINAL_PY} && \ + update-alternatives --install /usr/bin/python python ${ORIGINAL_PY} 1 || true && \ + update-alternatives --set python ${ORIGINAL_PY} || true && \ + # Ensure pip is available for the restored Python \ + curl -sS https://bootstrap.pypa.io/get-pip.py | ${ORIGINAL_PY} # keep the build context clean WORKDIR /build COPY . 
/build - # Needed to resolve setuptools dependencies ENV UV_INDEX_STRATEGY="unsafe-best-match" -RUN uv pip install --system --no-cache ".[gpu]" --extra-index-url https://download.pytorch.org/whl/cu128 -RUN uv pip install --system https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl RUN uv pip install --system --no-cache ".[bench]" RUN playwright install-deps diff --git a/pyproject.toml b/pyproject.toml index 163c5eb..3c40b75 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,7 @@ Changelog = "https://github.com/allenai/olmocr/blob/main/CHANGELOG.md" [project.optional-dependencies] gpu = [ - "vllm==0.10.0" + "vllm==0.11.0" ] dev = [ From 9b517a02be7529f71aca399f589acd42aeeb79c8 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 6 Oct 2025 19:47:19 +0000 Subject: [PATCH 33/40] Git lfs in docker image --- Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 515ecd7..7db0fea 100644 --- a/Dockerfile +++ b/Dockerfile @@ -31,7 +31,8 @@ RUN DIST_PY=$(ls /usr/bin/python3.[0-9]* | sort -V | head -n1) && \ fonts-crosextra-carlito \ gsfonts \ lcdf-typetools \ - ttf-mscorefonts-installer && \ + ttf-mscorefonts-installer \ + git git-lfs curl wget unzip && \ # ---- Restore the original / custom Python alternative ----------------- \ echo "Restoring python3 alternative to ${ORIGINAL_PY}" && \ update-alternatives --install /usr/bin/python3 python3 ${ORIGINAL_PY} 1 && \ From 81be6f5c1f4d59511b7ea874408bbf4fc0fc2c19 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 6 Oct 2025 19:52:55 +0000 Subject: [PATCH 34/40] Transformers version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3c40b75..52142a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ dependencies = [ "boto3", "httpx", "torch>=2.7.0", - "transformers==4.53.2", + "transformers==4.55.2", "img2pdf", "beaker-py", ] From c75f5b98a1d46cf0b75595491faf2c9e67626542 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 6 Oct 2025 20:26:41 +0000 Subject: [PATCH 35/40] Cleaning up pr 341 arguments to match with vllm 0.11, which only has V1 engine and thus always does chunked prefill. And fixes arg syntax --- olmocr/pipeline.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 40f72aa..b388c68 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -636,8 +636,7 @@ async def vllm_server_task(model_name_or_path, args, semaphore, unknown_args=Non str(args.tensor_parallel_size), "--data-parallel-size", str(args.data_parallel_size), - "--enable-chunked-prefill", - "--limit-mm-per-prompt '{\"video\": 0}'" + "--limit-mm-per-prompt", "{\"video\": 0}" # Disabling video encoder saves RAM that you can put towards the KV cache, thanks @charitarthchugh ] if args.gpu_memory_utilization is not None: From 1951a849ec57cd8319ec8b88080345f3270c557a Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 6 Oct 2025 21:10:00 +0000 Subject: [PATCH 36/40] Version bump with new vllm --- olmocr/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/olmocr/version.py b/olmocr/version.py index cee5284..02091a9 100644 --- a/olmocr/version.py +++ b/olmocr/version.py @@ -2,7 +2,7 @@ _MAJOR = "0" _MINOR = "3" # On main and in a nightly release the patch should be one ahead of the last # released build. 
-_PATCH = "6" +_PATCH = "7" # This is mainly for nightly builds which have the suffix ".dev$DATE". See # https://semver.org/#is-v123-a-semantic-version for the semantics. _SUFFIX = "" From 9c7c670f1fd7d4e07ff02a5500606fb084fe72d1 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 6 Oct 2025 21:10:07 +0000 Subject: [PATCH 37/40] Bump version to v0.3.7 for release --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 09fd707..c089fea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +## [v0.3.7](https://github.com/allenai/olmocr/releases/tag/v0.3.7) - 2025-10-06 + ## [v0.3.6](https://github.com/allenai/olmocr/releases/tag/v0.3.6) - 2025-09-29 ## [v0.3.4](https://github.com/allenai/olmocr/releases/tag/v0.3.4) - 2025-08-31 From 7fe756fe6357aedf0beb89055c5c88ea7793a2cd Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 6 Oct 2025 21:10:32 +0000 Subject: [PATCH 38/40] Formatting --- olmocr/pipeline.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index b388c68..4d3d1cb 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -636,7 +636,8 @@ async def vllm_server_task(model_name_or_path, args, semaphore, unknown_args=Non str(args.tensor_parallel_size), "--data-parallel-size", str(args.data_parallel_size), - "--limit-mm-per-prompt", "{\"video\": 0}" # Disabling video encoder saves RAM that you can put towards the KV cache, thanks @charitarthchugh + "--limit-mm-per-prompt", + '{"video": 0}', # Disabling video encoder saves RAM that you can put towards the KV cache, thanks @charitarthchugh ] if args.gpu_memory_utilization is not None: From e12941a608c59d2026ec1da1b04bade0b06e68dd Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 6 Oct 2025 21:46:10 +0000 Subject: [PATCH 39/40] Version bump --- olmocr/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/olmocr/version.py b/olmocr/version.py index 02091a9..3bbaff7 100644 --- a/olmocr/version.py +++ b/olmocr/version.py @@ -2,7 +2,7 @@ _MAJOR = "0" _MINOR = "3" # On main and in a nightly release the patch should be one ahead of the last # released build. -_PATCH = "7" +_PATCH = "8" # This is mainly for nightly builds which have the suffix ".dev$DATE". See # https://semver.org/#is-v123-a-semantic-version for the semantics. _SUFFIX = "" From c89787183a1f9b008d5ef643aa8822115ef4dbfd Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 6 Oct 2025 21:46:18 +0000 Subject: [PATCH 40/40] Bump version to v0.3.8 for release --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c089fea..a1d37d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +## [v0.3.8](https://github.com/allenai/olmocr/releases/tag/v0.3.8) - 2025-10-06 + ## [v0.3.7](https://github.com/allenai/olmocr/releases/tag/v0.3.7) - 2025-10-06 ## [v0.3.6](https://github.com/allenai/olmocr/releases/tag/v0.3.6) - 2025-09-29
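
A note on the retry behavior referenced in PATCH 26/40 above: on an HTTP 429 the pipeline raises `ConnectionError` with a "doing exponential backoff" message so that the retry logic wrapping the request in `process_page` can wait before resending the page; that retry loop itself is outside the quoted hunks. The snippet below is only a generic, illustrative sketch of exponential backoff with jitter under that assumption; `call_with_backoff`, `send_request`, and the delay constants are hypothetical names, not olmOCR's actual implementation.

```python
import asyncio
import random


async def call_with_backoff(send_request, max_retries: int = 8, base_delay: float = 1.0, max_delay: float = 60.0):
    """Retry an async request with exponential backoff, e.g. after an HTTP 429.

    `send_request` is a hypothetical zero-argument coroutine function that performs
    one completion request and raises ConnectionError on 429s or transport failures.
    """
    for attempt in range(max_retries):
        try:
            return await send_request()
        except ConnectionError:
            # Sleep 1s, 2s, 4s, ... (scaled by random jitter), capped at max_delay, then retry.
            delay = min(max_delay, base_delay * (2**attempt) * (0.5 + random.random()))
            await asyncio.sleep(delay)
    raise ConnectionError(f"Request still failing after {max_retries} attempts")
```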