Update README + arg name

allenai/olmocr commit 261c722f56 (parent b34c3611e1)
README.md
@@ -196,6 +196,20 @@ python -m olmocr.pipeline ./localworkspace --markdown --pdfs tests/gnarly_pdfs/*

 With the addition of the `--markdown` flag, results will be stored as markdown files inside of `./localworkspace/markdown/`.

+### Using External vLLM Server
+
+If you have a vLLM server already running elsewhere (or any inference platform implementing the relevant subset of the OpenAI API), you can point olmOCR to use it instead of spawning a local instance:
+
+```bash
+# Use external vLLM server instead of local one
+python -m olmocr.pipeline ./localworkspace --server http://remote-server:8000 --markdown --pdfs tests/gnarly_pdfs/*.pdf
+```
+
+The served model name should be `olmocr`. An example vLLM launch command would be:
+```bash
+vllm serve allenai/olmOCR-7B-0825-FP8 --served-model-name olmocr --max-model-len 16384
+```
+
 #### Viewing Results

 The `./localworkspace/` workspace folder will then have both [Dolma](https://github.com/allenai/dolma) and markdown files (if using `--markdown`).
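Before pointing the pipeline at an external endpoint, it can be worth confirming that the endpoint really serves a model named `olmocr`, since that is the name the pipeline requests. The snippet below is a minimal sketch of such a check, not part of this commit; it assumes the `requests` package and the example URL from the README text above.

```python
# Minimal sanity check for an external OpenAI-compatible endpoint (sketch,
# not part of this commit). Assumes `requests` is installed and the URL
# matches the value you would pass to --server.
import requests

server = "http://remote-server:8000"  # same value you would pass to --server
resp = requests.get(f"{server.rstrip('/')}/v1/models", timeout=10)
resp.raise_for_status()
served = [m["id"] for m in resp.json().get("data", [])]
print("Served models:", served)
assert "olmocr" in served, "pipeline expects a model served under the name 'olmocr'"
```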
@@ -271,7 +285,7 @@ python -m olmocr.pipeline ./localworkspace --markdown --pdfs olmocr-sample.pdf
 python -m olmocr.pipeline --help
 usage: pipeline.py [-h] [--pdfs [PDFS ...]] [--model MODEL] [--workspace_profile WORKSPACE_PROFILE] [--pdf_profile PDF_PROFILE] [--pages_per_group PAGES_PER_GROUP] [--max_page_retries MAX_PAGE_RETRIES] [--max_page_error_rate MAX_PAGE_ERROR_RATE] [--workers WORKERS]
                    [--apply_filter] [--stats] [--markdown] [--target_longest_image_dim TARGET_LONGEST_IMAGE_DIM] [--target_anchor_text_len TARGET_ANCHOR_TEXT_LEN] [--guided_decoding] [--gpu-memory-utilization GPU_MEMORY_UTILIZATION] [--max_model_len MAX_MODEL_LEN]
-                   [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--data-parallel-size DATA_PARALLEL_SIZE] [--port PORT] [--beaker] [--beaker_workspace BEAKER_WORKSPACE] [--beaker_cluster BEAKER_CLUSTER] [--beaker_gpus BEAKER_GPUS] [--beaker_priority BEAKER_PRIORITY]
+                   [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--data-parallel-size DATA_PARALLEL_SIZE] [--port PORT] [--server SERVER] [--beaker] [--beaker_workspace BEAKER_WORKSPACE] [--beaker_cluster BEAKER_CLUSTER] [--beaker_gpus BEAKER_GPUS] [--beaker_priority BEAKER_PRIORITY]
                    workspace

 Manager for running millions of PDFs through a batch inference pipeline
@@ -303,7 +317,7 @@ options:
                         Maximum amount of anchor text to use (characters), not used for new models
   --guided_decoding     Enable guided decoding for model YAML type outputs

-VLLM Forwarded arguments:
+VLLM arguments:
   --gpu-memory-utilization GPU_MEMORY_UTILIZATION
                         Fraction of VRAM vLLM may pre-allocate for KV-cache (passed through to vllm serve).
   --max_model_len MAX_MODEL_LEN
@@ -313,6 +327,9 @@ VLLM Forwarded arguments:
   --data-parallel-size DATA_PARALLEL_SIZE, -dp DATA_PARALLEL_SIZE
                         Data parallel size for vLLM
   --port PORT           Port to use for the VLLM server
+  --server SERVER       URL of external vLLM (or other compatible provider)
+                        server (e.g., http://hostname:port). If provided,
+                        skips spawning local vLLM instance

 beaker/cluster execution:
   --beaker              Submit this job to beaker instead of running locally
olmocr/pipeline.py
@@ -213,8 +213,8 @@ async def apost(url, json_data):


 async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: str, page_num: int) -> PageResult:
-    if args.external_vllm_url:
-        COMPLETION_URL = f"{args.external_vllm_url.rstrip('/')}/v1/chat/completions"
+    if args.server:
+        COMPLETION_URL = f"{args.server.rstrip('/')}/v1/chat/completions"
     else:
         COMPLETION_URL = f"http://localhost:{BASE_SERVER_PORT}/v1/chat/completions"
     MAX_RETRIES = args.max_page_retries
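The renamed branch above only changes which CLI attribute feeds the completion URL. Below is a standalone sketch of the resulting URL selection, assuming `BASE_SERVER_PORT` matches the parser's default `--port` of 30024 (an assumption, not confirmed by this diff):

```python
# Standalone illustration of the URL selection in process_page (sketch only).
from typing import Optional

BASE_SERVER_PORT = 30024  # assumption: mirrors the default --port shown later in the diff

def completion_url(server: Optional[str]) -> str:
    # --server given: strip a trailing slash and target the external endpoint;
    # otherwise fall back to the locally spawned vLLM instance.
    if server:
        return f"{server.rstrip('/')}/v1/chat/completions"
    return f"http://localhost:{BASE_SERVER_PORT}/v1/chat/completions"

print(completion_url("http://remote-server:8000/"))  # -> http://remote-server:8000/v1/chat/completions
print(completion_url(None))                          # -> http://localhost:30024/v1/chat/completions
```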
@@ -736,8 +736,8 @@ async def vllm_server_host(model_name_or_path, args, semaphore, unknown_args=Non
 async def vllm_server_ready(args):
     max_attempts = 300
     delay_sec = 1
-    if args.external_vllm_url:
-        url = f"{args.external_vllm_url.rstrip('/')}/v1/models"
+    if args.server:
+        url = f"{args.server.rstrip('/')}/v1/models"
     else:
         url = f"http://localhost:{BASE_SERVER_PORT}/v1/models"

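The readiness check polls the `/v1/models` route of whichever server is in use, up to 300 times at one-second intervals. A simplified, synchronous sketch of that logic follows (the pipeline's own implementation is async; the `requests` dependency here is an assumption):

```python
# Simplified, synchronous sketch of the readiness poll (assumes `requests`).
import time
from typing import Optional

import requests

def wait_for_server(server: Optional[str], base_port: int = 30024,
                    max_attempts: int = 300, delay_sec: float = 1.0) -> None:
    # Poll the external server if one was given, otherwise the local instance.
    url = f"{server.rstrip('/')}/v1/models" if server else f"http://localhost:{base_port}/v1/models"
    for _ in range(max_attempts):
        try:
            if requests.get(url, timeout=5).status_code == 200:
                return  # server is up and listing models
        except requests.RequestException:
            pass  # not reachable yet; retry after a short delay
        time.sleep(delay_sec)
    raise RuntimeError(f"Server at {url} did not become ready in time")
```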
@@ -1076,7 +1076,7 @@ async def main():
     vllm_group.add_argument("--data-parallel-size", "-dp", type=int, default=1, help="Data parallel size for vLLM")
     vllm_group.add_argument("--port", type=int, default=30024, help="Port to use for the VLLM server")
     vllm_group.add_argument(
-        "--external-vllm-url", type=str, help="URL of external vLLM server (e.g., http://hostname:port). If provided, skips spawning local vLLM instance"
+        "--server", type=str, help="URL of external vLLM (or other compatible provider) server (e.g., http://hostname:port). If provided, skips spawning local vLLM instance"
     )

     # Beaker/job running stuff
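Since the point of the commit is the flag rename, downstream code now reads `args.server` rather than `args.external_vllm_url`. A tiny reproduction of the renamed option (a sketch of the relevant piece, not the pipeline's full parser):

```python
# Sketch of the renamed option; only the pieces relevant to this commit.
import argparse

parser = argparse.ArgumentParser()
vllm_group = parser.add_argument_group("VLLM arguments")
vllm_group.add_argument("--port", type=int, default=30024, help="Port to use for the VLLM server")
vllm_group.add_argument(
    "--server",
    type=str,
    help="URL of external vLLM (or other compatible provider) server (e.g., http://hostname:port). "
    "If provided, skips spawning local vLLM instance",
)

args = parser.parse_args(["--server", "http://remote-server:8000"])
assert args.server == "http://remote-server:8000"  # attribute is now `server`, not `external_vllm_url`
```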
@@ -1216,14 +1216,14 @@ async def main():

     # If you get this far, then you are doing inference and need a GPU
     # check_sglang_version()
-    if not args.external_vllm_url:
+    if not args.server:
         check_torch_gpu_available()

     logger.info(f"Starting pipeline with PID {os.getpid()}")

     # Download the model before you do anything else
-    if args.external_vllm_url:
-        logger.info(f"Using external vLLM server at {args.external_vllm_url}")
+    if args.server:
+        logger.info(f"Using external server at {args.server}")
         model_name_or_path = None
     else:
         model_name_or_path = await download_model(args.model)
@@ -1242,7 +1242,7 @@ async def main():

     # Start local vLLM instance if not using external one
     vllm_server = None
-    if not args.external_vllm_url:
+    if not args.server:
         vllm_server = asyncio.create_task(vllm_server_host(model_name_or_path, args, semaphore, unknown_args))

     await vllm_server_ready(args)