From 261c722f561e06f100174ebd17e5b1b0aea5c1e2 Mon Sep 17 00:00:00 2001
From: Haydn Jones
Date: Thu, 21 Aug 2025 17:49:07 -0400
Subject: [PATCH] Update README + arg name

---
 README.md          | 21 +++++++++++++++++++--
 olmocr/pipeline.py | 18 +++++++++---------
 2 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index fdda60b..024b641 100644
--- a/README.md
+++ b/README.md
@@ -196,6 +196,20 @@ python -m olmocr.pipeline ./localworkspace --markdown --pdfs tests/gnarly_pdfs/*
 
 With the addition of the `--markdown` flag, results will be stored as markdown files inside of `./localworkspace/markdown/`.
 
+### Using External vLLM Server
+
+If you have a vLLM server already running elsewhere (or any inference platform implementing the relevant subset of the OpenAI API), you can point olmOCR to use it instead of spawning a local instance:
+
+```bash
+# Use external vLLM server instead of local one
+python -m olmocr.pipeline ./localworkspace --server http://remote-server:8000 --markdown --pdfs tests/gnarly_pdfs/*.pdf
+```
+
+The served model name should be `olmocr`. An example vLLM launch command would be:
+```bash
+vllm serve allenai/olmOCR-7B-0825-FP8 --served-model-name olmocr --max-model-len 16384
+```
+
 #### Viewing Results
 
 The `./localworkspace/` workspace folder will then have both [Dolma](https://github.com/allenai/dolma) and markdown files (if using `--markdown`).
@@ -271,7 +285,7 @@ python -m olmocr.pipeline ./localworkspace --markdown --pdfs olmocr-sample.pdf
 python -m olmocr.pipeline --help
 usage: pipeline.py [-h] [--pdfs [PDFS ...]] [--model MODEL] [--workspace_profile WORKSPACE_PROFILE] [--pdf_profile PDF_PROFILE] [--pages_per_group PAGES_PER_GROUP]
                    [--max_page_retries MAX_PAGE_RETRIES] [--max_page_error_rate MAX_PAGE_ERROR_RATE] [--workers WORKERS] [--apply_filter] [--stats] [--markdown] [--target_longest_image_dim TARGET_LONGEST_IMAGE_DIM] [--target_anchor_text_len TARGET_ANCHOR_TEXT_LEN] [--guided_decoding] [--gpu-memory-utilization GPU_MEMORY_UTILIZATION] [--max_model_len MAX_MODEL_LEN]
-                   [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--data-parallel-size DATA_PARALLEL_SIZE] [--port PORT] [--beaker] [--beaker_workspace BEAKER_WORKSPACE] [--beaker_cluster BEAKER_CLUSTER] [--beaker_gpus BEAKER_GPUS] [--beaker_priority BEAKER_PRIORITY]
+                   [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--data-parallel-size DATA_PARALLEL_SIZE] [--port PORT] [--server SERVER] [--beaker] [--beaker_workspace BEAKER_WORKSPACE] [--beaker_cluster BEAKER_CLUSTER] [--beaker_gpus BEAKER_GPUS] [--beaker_priority BEAKER_PRIORITY]
                    workspace
 
 Manager for running millions of PDFs through a batch inference pipeline
@@ -303,7 +317,7 @@ options:
                         Maximum amount of anchor text to use (characters), not used for new models
   --guided_decoding     Enable guided decoding for model YAML type outputs
 
-VLLM Forwarded arguments:
+VLLM arguments:
   --gpu-memory-utilization GPU_MEMORY_UTILIZATION
                         Fraction of VRAM vLLM may pre-allocate for KV-cache (passed through to vllm serve).
   --max_model_len MAX_MODEL_LEN
@@ -313,6 +327,9 @@ VLLM Forwarded arguments:
   --data-parallel-size DATA_PARALLEL_SIZE, -dp DATA_PARALLEL_SIZE
                         Data parallel size for vLLM
   --port PORT           Port to use for the VLLM server
+  --server SERVER       URL of external vLLM (or other compatible provider)
+                        server (e.g., http://hostname:port). If provided,
+                        skips spawning local vLLM instance
 
 beaker/cluster execution:
   --beaker              Submit this job to beaker instead of running locally
diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py
index 143664d..78ca51b 100644
--- a/olmocr/pipeline.py
+++ b/olmocr/pipeline.py
@@ -213,8 +213,8 @@ async def apost(url, json_data):
 
 
 async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: str, page_num: int) -> PageResult:
-    if args.external_vllm_url:
-        COMPLETION_URL = f"{args.external_vllm_url.rstrip('/')}/v1/chat/completions"
+    if args.server:
+        COMPLETION_URL = f"{args.server.rstrip('/')}/v1/chat/completions"
     else:
         COMPLETION_URL = f"http://localhost:{BASE_SERVER_PORT}/v1/chat/completions"
     MAX_RETRIES = args.max_page_retries
@@ -736,8 +736,8 @@ async def vllm_server_host(model_name_or_path, args, semaphore, unknown_args=Non
 async def vllm_server_ready(args):
     max_attempts = 300
     delay_sec = 1
-    if args.external_vllm_url:
-        url = f"{args.external_vllm_url.rstrip('/')}/v1/models"
+    if args.server:
+        url = f"{args.server.rstrip('/')}/v1/models"
     else:
         url = f"http://localhost:{BASE_SERVER_PORT}/v1/models"
 
@@ -1076,7 +1076,7 @@ async def main():
     vllm_group.add_argument("--data-parallel-size", "-dp", type=int, default=1, help="Data parallel size for vLLM")
     vllm_group.add_argument("--port", type=int, default=30024, help="Port to use for the VLLM server")
     vllm_group.add_argument(
-        "--external-vllm-url", type=str, help="URL of external vLLM server (e.g., http://hostname:port). If provided, skips spawning local vLLM instance"
+        "--server", type=str, help="URL of external vLLM (or other compatible provider) server (e.g., http://hostname:port). If provided, skips spawning local vLLM instance"
     )
 
     # Beaker/job running stuff
@@ -1216,14 +1216,14 @@ async def main():
 
     # If you get this far, then you are doing inference and need a GPU
     # check_sglang_version()
-    if not args.external_vllm_url:
+    if not args.server:
         check_torch_gpu_available()
 
     logger.info(f"Starting pipeline with PID {os.getpid()}")
 
     # Download the model before you do anything else
-    if args.external_vllm_url:
-        logger.info(f"Using external vLLM server at {args.external_vllm_url}")
+    if args.server:
+        logger.info(f"Using external server at {args.server}")
         model_name_or_path = None
     else:
         model_name_or_path = await download_model(args.model)
@@ -1242,7 +1242,7 @@ async def main():
 
     # Start local vLLM instance if not using external one
     vllm_server = None
-    if not args.external_vllm_url:
+    if not args.server:
         vllm_server = asyncio.create_task(vllm_server_host(model_name_or_path, args, semaphore, unknown_args))
         await vllm_server_ready(args)
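
Before pointing the pipeline at an external endpoint, it can be worth confirming that the server is reachable and advertises a model named `olmocr`. A minimal sketch, assuming the placeholder host `remote-server:8000` from the README example above; the pipeline's own readiness check (`vllm_server_ready`) polls this same `/v1/models` route once `--server` is set:

```bash
# Sketch: confirm the external server is up and lists a model named "olmocr".
# "remote-server:8000" is the placeholder host from the README example; substitute your own.
curl -s http://remote-server:8000/v1/models
```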
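
For a quick end-to-end check of the completions route that `process_page` targets, a text-only request can be sent by hand. This assumes an OpenAI-compatible server and is only a connectivity and model-name check; the real pipeline sends the olmOCR prompt together with a rendered page image:

```bash
# Sketch: hand-rolled request against the same /v1/chat/completions route the
# pipeline uses when --server is provided. Illustrative payload only.
curl -s http://remote-server:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "olmocr", "messages": [{"role": "user", "content": "ping"}], "max_tokens": 8}'
```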