Add --enable-chunked-prefill and --limit-mm-per-prompt options to the vLLM server command

This commit is contained in:
Charitarth Chugh 2025-09-25 14:29:49 -04:00 committed by GitHub
parent 8f88a98e5d
commit fe425fde20
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -599,6 +599,8 @@ async def vllm_server_task(model_name_or_path, args, semaphore, unknown_args=Non
str(args.tensor_parallel_size), str(args.tensor_parallel_size),
"--data-parallel-size", "--data-parallel-size",
str(args.data_parallel_size), str(args.data_parallel_size),
"--enable-chunked-prefill",
"--limit-mm-per-prompt '{\"video\": 0}'"
] ]
if args.gpu_memory_utilization is not None: if args.gpu_memory_utilization is not None: