From fe425fde209b14d2801f885c45e9ec129f16c586 Mon Sep 17 00:00:00 2001 From: Charitarth Chugh <37895518+charitarthchugh@users.noreply.github.com> Date: Thu, 25 Sep 2025 14:29:49 -0400 Subject: [PATCH] Add chunked prefill and limit mm per prompt options --- olmocr/pipeline.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 04a2170..65ea7f1 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -599,6 +599,9 @@ async def vllm_server_task(model_name_or_path, args, semaphore, unknown_args=Non str(args.tensor_parallel_size), "--data-parallel-size", str(args.data_parallel_size), + "--enable-chunked-prefill", + "--limit-mm-per-prompt", + '{"video": 0}', ] if args.gpu_memory_utilization is not None: