From fe425fde209b14d2801f885c45e9ec129f16c586 Mon Sep 17 00:00:00 2001 From: Charitarth Chugh <37895518+charitarthchugh@users.noreply.github.com> Date: Thu, 25 Sep 2025 14:29:49 -0400 Subject: [PATCH] Add chunked prefill and limit mm per prompt options --- olmocr/pipeline.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 04a2170..65ea7f1 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -599,6 +599,9 @@ async def vllm_server_task(model_name_or_path, args, semaphore, unknown_args=Non str(args.tensor_parallel_size), "--data-parallel-size", str(args.data_parallel_size), + "--enable-chunked-prefill", + "--limit-mm-per-prompt", + '{"video": 0}', ] if args.gpu_memory_utilization is not None: