From 7fe3f65de7da70e0bedf42462dc2abadf9750499 Mon Sep 17 00:00:00 2001 From: aman-17 Date: Fri, 26 Sep 2025 11:06:51 -0700 Subject: [PATCH 1/7] added support for deepinfra --- DEEPINFRA_SETUP.md | 64 +++++++++++++++++++++++++++++++++++++ olmocr/pipeline.py | 78 +++++++++++++++++++++++++++++++++++----------- 2 files changed, 124 insertions(+), 18 deletions(-) create mode 100644 DEEPINFRA_SETUP.md diff --git a/DEEPINFRA_SETUP.md b/DEEPINFRA_SETUP.md new file mode 100644 index 0000000..5e18381 --- /dev/null +++ b/DEEPINFRA_SETUP.md @@ -0,0 +1,64 @@ +# Using olmOCR with DeepInfra + +This guide explains how to use olmOCR with DeepInfra's hosted API service for cloud-based inference. + +## Prerequisites + +1. **DeepInfra Account**: Sign up at https://deepinfra.com/ +2. **API Key**: Get your API key from the DeepInfra dashboard +3. **olmOCR**: Ensure you have the modified version with authentication support + +## Setup + +### 1. Get your DeepInfra API Key + +1. Log in to https://deepinfra.com/ +2. Navigate to your dashboard +3. Generate or copy your API key +4. Store it securely (recommended: as an environment variable) + +```bash +export DEEPINFRA_API_KEY="your-api-key-here" +``` + +### 2. 
Usage + +Run olmOCR with the DeepInfra server endpoint: + +```bash +python -m olmocr.pipeline ./localworkspace \ + --server https://api.deepinfra.com/v1/openai \ + --api_key $DEEPINFRA_API_KEY \ + --model allenai/olmOCR-7B-0725-FP8 \ + --markdown \ + --pdfs path/to/your/*.pdf +``` + +### Command Line Arguments + +- `--server`: DeepInfra's OpenAI-compatible endpoint: `https://api.deepinfra.com/v1/openai` +- `--api_key`: Your DeepInfra API key (or use environment variable) +- `--model`: The model identifier on DeepInfra: `allenai/olmOCR-7B-0725-FP8` +- Other arguments work the same as with local inference + +### Example with S3 Storage + +For large-scale processing with S3: + +```bash +python -m olmocr.pipeline s3://your-bucket/workspace \ + --server https://api.deepinfra.com/v1/openai \ + --api_key $DEEPINFRA_API_KEY \ + --model allenai/olmOCR-7B-0725-FP8 \ + --pdfs s3://your-bucket/pdfs/*.pdf \ + --workers 10 \ + --markdown +``` + +## Pricing + +As of 2024, DeepInfra charges for the olmOCR model: +- Input tokens: ~$0.27 per million tokens +- Output tokens: ~$0.81 per million tokens + +Check current pricing at: https://deepinfra.com/pricing \ No newline at end of file diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 04a2170..2b57a94 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -11,6 +11,7 @@ import os import random import re import shutil +import ssl import sys import tempfile import time @@ -104,7 +105,7 @@ class PageResult: is_fallback: bool -async def build_page_query(local_pdf_path: str, page: int, target_longest_image_dim: int, image_rotation: int = 0) -> dict: +async def build_page_query(local_pdf_path: str, page: int, target_longest_image_dim: int, image_rotation: int = 0, model_name: str = "olmocr") -> dict: MAX_TOKENS = 4500 assert image_rotation in [0, 90, 180, 270], "Invalid image rotation provided in build_page_query" @@ -132,7 +133,7 @@ async def build_page_query(local_pdf_path: str, page: int, target_longest_image_ image_base64 
= base64.b64encode(buffered.getvalue()).decode("utf-8") return { - "model": "olmocr", + "model": model_name, "messages": [ { "role": "user", @@ -151,25 +152,44 @@ async def build_page_query(local_pdf_path: str, page: int, target_longest_image_ # It feels strange perhaps, but httpx and aiohttp are very complex beasts # Ex. the sessionpool in httpcore has 4 different locks in it, and I've noticed # that at the scale of 100M+ requests, that they deadlock in different strange ways -async def apost(url, json_data): +async def apost(url, json_data, api_key=None): parsed_url = urlparse(url) host = parsed_url.hostname - port = parsed_url.port or 80 + # Default to 443 for HTTPS, 80 for HTTP + if parsed_url.scheme == 'https': + port = parsed_url.port or 443 + use_ssl = True + else: + port = parsed_url.port or 80 + use_ssl = False path = parsed_url.path or "/" writer = None try: - reader, writer = await asyncio.open_connection(host, port) + if use_ssl: + ssl_context = ssl.create_default_context() + reader, writer = await asyncio.open_connection(host, port, ssl=ssl_context) + else: + reader, writer = await asyncio.open_connection(host, port) json_payload = json.dumps(json_data) - request = ( - f"POST {path} HTTP/1.1\r\n" - f"Host: {host}\r\n" - f"Content-Type: application/json\r\n" - f"Content-Length: {len(json_payload)}\r\n" - f"Connection: close\r\n\r\n" - f"{json_payload}" - ) + + # Build request headers + headers = [ + f"POST {path} HTTP/1.1", + f"Host: {host}", + f"Content-Type: application/json", + f"Content-Length: {len(json_payload)}", + ] + + # Add Authorization header if API key is provided + if api_key: + headers.append(f"Authorization: Bearer {api_key}") + + headers.append("Connection: close") + + # Construct the full request + request = "\r\n".join(headers) + "\r\n\r\n" + json_payload writer.write(request.encode()) await writer.drain() @@ -214,7 +234,13 @@ async def apost(url, json_data): async def process_page(args, worker_id: int, pdf_orig_path: str, 
pdf_local_path: str, page_num: int) -> PageResult: if args.server: - COMPLETION_URL = f"{args.server.rstrip('/')}/v1/chat/completions" + server_url = args.server.rstrip('/') + # Check if the server URL already contains '/v1/openai' (DeepInfra case) + if '/v1/openai' in server_url: + COMPLETION_URL = f"{server_url}/chat/completions" + else: + COMPLETION_URL = f"{server_url}/v1/chat/completions" + logger.debug(f"Using completion URL: {COMPLETION_URL}") else: COMPLETION_URL = f"http://localhost:{BASE_SERVER_PORT}/v1/chat/completions" MAX_RETRIES = args.max_page_retries @@ -227,11 +253,14 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: while attempt < MAX_RETRIES: lookup_attempt = min(attempt, len(TEMPERATURE_BY_ATTEMPT) - 1) + # Use the model name from args if provided, otherwise default to 'olmocr' + model_name = getattr(args, 'model', 'olmocr') if args.server else 'olmocr' query = await build_page_query( pdf_local_path, page_num, args.target_longest_image_dim, image_rotation=cumulative_rotation, + model_name=model_name, ) # Change temperature as number of attempts increases to overcome repetition issues at expense of quality query["temperature"] = TEMPERATURE_BY_ATTEMPT[lookup_attempt] @@ -245,7 +274,9 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: logger.debug(f"Built page query for {pdf_orig_path}-{page_num}") try: - status_code, response_body = await apost(COMPLETION_URL, json_data=query) + # Pass API key if provided + api_key = getattr(args, 'api_key', None) + status_code, response_body = await apost(COMPLETION_URL, json_data=query, api_key=api_key) if status_code == 400: raise ValueError(f"Got BadRequestError from server: {response_body}, skipping this response") @@ -737,14 +768,24 @@ async def vllm_server_ready(args): max_attempts = 300 delay_sec = 1 if args.server: - url = f"{args.server.rstrip('/')}/v1/models" + # Check if the server URL already contains '/v1/openai' (DeepInfra case) + 
server_url = args.server.rstrip('/') + if '/v1/openai' in server_url: + url = f"{server_url}/models" + else: + url = f"{server_url}/v1/models" else: url = f"http://localhost:{BASE_SERVER_PORT}/v1/models" for attempt in range(1, max_attempts + 1): try: + # Add authentication headers if API key is provided + headers = {} + if args.server and hasattr(args, 'api_key') and args.api_key: + headers['Authorization'] = f'Bearer {args.api_key}' + async with httpx.AsyncClient() as session: - response = await session.get(url) + response = await session.get(url, headers=headers) if response.status_code == 200: logger.info("vllm server is ready.") @@ -1064,7 +1105,8 @@ async def main(): parser.add_argument("--target_longest_image_dim", type=int, help="Dimension on longest side to use for rendering the pdf pages", default=1288) parser.add_argument("--target_anchor_text_len", type=int, help="Maximum amount of anchor text to use (characters), not used for new models", default=-1) parser.add_argument("--guided_decoding", action="store_true", help="Enable guided decoding for model YAML type outputs") - + parser.add_argument('--api_key', type=str, default=None, help='API key for authenticated remote servers (e.g., DeepInfra)') + vllm_group = parser.add_argument_group( "VLLM arguments", "These arguments are passed to vLLM. Any unrecognized arguments are also automatically forwarded to vLLM." 
) From 2a5792e5ed9044f14577706f1e4ca942448a5bbf Mon Sep 17 00:00:00 2001 From: aman-17 Date: Fri, 26 Sep 2025 13:29:48 -0700 Subject: [PATCH 2/7] add if else for vllm local usage bug for api argument --- olmocr/pipeline.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 2b57a94..90ffce0 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -253,8 +253,13 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: while attempt < MAX_RETRIES: lookup_attempt = min(attempt, len(TEMPERATURE_BY_ATTEMPT) - 1) - # Use the model name from args if provided, otherwise default to 'olmocr' - model_name = getattr(args, 'model', 'olmocr') if args.server else 'olmocr' + # For external servers (like DeepInfra), use the model name from args + # For local inference, always use 'olmocr' + if args.server and hasattr(args, 'model'): + model_name = args.model + else: + model_name = 'olmocr' + query = await build_page_query( pdf_local_path, page_num, @@ -274,8 +279,11 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: logger.debug(f"Built page query for {pdf_orig_path}-{page_num}") try: - # Pass API key if provided - api_key = getattr(args, 'api_key', None) + # Pass API key only for external servers that need authentication + if args.server and hasattr(args, 'api_key'): + api_key = args.api_key + else: + api_key = None status_code, response_body = await apost(COMPLETION_URL, json_data=query, api_key=api_key) if status_code == 400: From 90589e16de31dfc742bd53959100eaa2bea9824d Mon Sep 17 00:00:00 2001 From: aman-17 Date: Fri, 26 Sep 2025 13:56:34 -0700 Subject: [PATCH 3/7] Added deepinfra usage to readme --- DEEPINFRA_SETUP.md | 64 ---------------------------------------------- README.md | 20 +++++++++++++++ 2 files changed, 20 insertions(+), 64 deletions(-) delete mode 100644 DEEPINFRA_SETUP.md diff --git a/DEEPINFRA_SETUP.md 
b/DEEPINFRA_SETUP.md deleted file mode 100644 index 5e18381..0000000 --- a/DEEPINFRA_SETUP.md +++ /dev/null @@ -1,64 +0,0 @@ -# Using olmOCR with DeepInfra - -This guide explains how to use olmOCR with DeepInfra's hosted API service for cloud-based inference. - -## Prerequisites - -1. **DeepInfra Account**: Sign up at https://deepinfra.com/ -2. **API Key**: Get your API key from the DeepInfra dashboard -3. **olmOCR**: Ensure you have the modified version with authentication support - -## Setup - -### 1. Get your DeepInfra API Key - -1. Log in to https://deepinfra.com/ -2. Navigate to your dashboard -3. Generate or copy your API key -4. Store it securely (recommended: as an environment variable) - -```bash -export DEEPINFRA_API_KEY="your-api-key-here" -``` - -### 2. Usage - -Run olmOCR with the DeepInfra server endpoint: - -```bash -python -m olmocr.pipeline ./localworkspace \ - --server https://api.deepinfra.com/v1/openai \ - --api_key $DEEPINFRA_API_KEY \ - --model allenai/olmOCR-7B-0725-FP8 \ - --markdown \ - --pdfs path/to/your/*.pdf -``` - -### Command Line Arguments - -- `--server`: DeepInfra's OpenAI-compatible endpoint: `https://api.deepinfra.com/v1/openai` -- `--api_key`: Your DeepInfra API key (or use environment variable) -- `--model`: The model identifier on DeepInfra: `allenai/olmOCR-7B-0725-FP8` -- Other arguments work the same as with local inference - -### Example with S3 Storage - -For large-scale processing with S3: - -```bash -python -m olmocr.pipeline s3://your-bucket/workspace \ - --server https://api.deepinfra.com/v1/openai \ - --api_key $DEEPINFRA_API_KEY \ - --model allenai/olmOCR-7B-0725-FP8 \ - --pdfs s3://your-bucket/pdfs/*.pdf \ - --workers 10 \ - --markdown -``` - -## Pricing - -As of 2024, DeepInfra charges for the olmOCR model: -- Input tokens: ~$0.27 per million tokens -- Output tokens: ~$0.81 per million tokens - -Check current pricing at: https://deepinfra.com/pricing \ No newline at end of file diff --git a/README.md b/README.md 
index 024b641..e385e70 100644 --- a/README.md +++ b/README.md @@ -249,6 +249,26 @@ For example: ```bash python -m olmocr.pipeline s3://my_s3_bucket/pdfworkspaces/exampleworkspace --pdfs s3://my_s3_bucket/jakep/gnarly_pdfs/*.pdf --beaker --beaker_gpus 4 ``` +### Using DeepInfra +Sign up at [DeepInfra](https://deepinfra.com/) and get your API key from the DeepInfra dashboard. +Store the API key as an environment variable. +```bash +export DEEPINFRA_API_KEY="your-api-key-here" +``` +#### Run olmOCR with the DeepInfra server endpoint: +```bash +python -m olmocr.pipeline ./localworkspace \ + --server https://api.deepinfra.com/v1/openai \ + --api_key $DEEPINFRA_API_KEY \ + --model allenai/olmOCR-7B-0725-FP8 \ + --markdown \ + --pdfs path/to/your/*.pdf +``` +- `--server`: DeepInfra's OpenAI-compatible endpoint: `https://api.deepinfra.com/v1/openai` +- `--api_key`: Your DeepInfra API key +- `--model`: The model identifier on DeepInfra: `allenai/olmOCR-7B-0725-FP8` +- Other arguments work the same as with local inference + ### Using Docker From e7ae5e6240c34a22bb32ba51fd59743c6be784f5 Mon Sep 17 00:00:00 2001 From: aman-17 Date: Fri, 26 Sep 2025 13:58:34 -0700 Subject: [PATCH 4/7] fixed style --- olmocr/pipeline.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 90ffce0..b64fb60 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -156,7 +156,7 @@ async def apost(url, json_data, api_key=None): parsed_url = urlparse(url) host = parsed_url.hostname # Default to 443 for HTTPS, 80 for HTTP - if parsed_url.scheme == 'https': + if parsed_url.scheme == "https": port = parsed_url.port or 443 use_ssl = True else: @@ -234,9 +234,9 @@ async def apost(url, json_data, api_key=None): async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: str, page_num: int) -> PageResult: if args.server: - server_url = args.server.rstrip('/') + server_url = args.server.rstrip("/") # 
Check if the server URL already contains '/v1/openai' (DeepInfra case) - if '/v1/openai' in server_url: + if "/v1/openai" in server_url: COMPLETION_URL = f"{server_url}/chat/completions" else: COMPLETION_URL = f"{server_url}/v1/chat/completions" @@ -255,10 +255,10 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: lookup_attempt = min(attempt, len(TEMPERATURE_BY_ATTEMPT) - 1) # For external servers (like DeepInfra), use the model name from args # For local inference, always use 'olmocr' - if args.server and hasattr(args, 'model'): + if args.server and hasattr(args, "model"): model_name = args.model else: - model_name = 'olmocr' + model_name = "olmocr" query = await build_page_query( pdf_local_path, @@ -280,7 +280,7 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: try: # Pass API key only for external servers that need authentication - if args.server and hasattr(args, 'api_key'): + if args.server and hasattr(args, "api_key"): api_key = args.api_key else: api_key = None @@ -777,8 +777,8 @@ async def vllm_server_ready(args): delay_sec = 1 if args.server: # Check if the server URL already contains '/v1/openai' (DeepInfra case) - server_url = args.server.rstrip('/') - if '/v1/openai' in server_url: + server_url = args.server.rstrip("/") + if "/v1/openai" in server_url: url = f"{server_url}/models" else: url = f"{server_url}/v1/models" @@ -789,8 +789,8 @@ async def vllm_server_ready(args): try: # Add authentication headers if API key is provided headers = {} - if args.server and hasattr(args, 'api_key') and args.api_key: - headers['Authorization'] = f'Bearer {args.api_key}' + if args.server and hasattr(args, "api_key") and args.api_key: + headers["Authorization"] = f"Bearer {args.api_key}" async with httpx.AsyncClient() as session: response = await session.get(url, headers=headers) @@ -1113,8 +1113,8 @@ async def main(): parser.add_argument("--target_longest_image_dim", type=int, help="Dimension on 
longest side to use for rendering the pdf pages", default=1288) parser.add_argument("--target_anchor_text_len", type=int, help="Maximum amount of anchor text to use (characters), not used for new models", default=-1) parser.add_argument("--guided_decoding", action="store_true", help="Enable guided decoding for model YAML type outputs") - parser.add_argument('--api_key', type=str, default=None, help='API key for authenticated remote servers (e.g., DeepInfra)') - + parser.add_argument("--api_key", type=str, default=None, help="API key for authenticated remote servers (e.g., DeepInfra)") + vllm_group = parser.add_argument_group( "VLLM arguments", "These arguments are passed to vLLM. Any unrecognized arguments are also automatically forwarded to vLLM." ) From 556ff26d585c860194c887c5042eadf5d2878bfb Mon Sep 17 00:00:00 2001 From: aman-17 Date: Fri, 26 Sep 2025 14:08:40 -0700 Subject: [PATCH 5/7] fixed lint, style, ruff --- olmocr/pipeline.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index b64fb60..1c5febb 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -174,7 +174,6 @@ async def apost(url, json_data, api_key=None): json_payload = json.dumps(json_data) - # Build request headers headers = [ f"POST {path} HTTP/1.1", f"Host: {host}", @@ -182,18 +181,15 @@ async def apost(url, json_data, api_key=None): f"Content-Length: {len(json_payload)}", ] - # Add Authorization header if API key is provided if api_key: headers.append(f"Authorization: Bearer {api_key}") headers.append("Connection: close") - # Construct the full request request = "\r\n".join(headers) + "\r\n\r\n" + json_payload writer.write(request.encode()) await writer.drain() - # Read status line status_line = await reader.readline() if not status_line: raise ConnectionError("No response from server") @@ -279,7 +275,7 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: logger.debug(f"Built page query for 
{pdf_orig_path}-{page_num}") try: - # Pass API key only for external servers that need authentication + # Passing API key only for external servers that need authentication if args.server and hasattr(args, "api_key"): api_key = args.api_key else: @@ -787,7 +783,6 @@ async def vllm_server_ready(args): for attempt in range(1, max_attempts + 1): try: - # Add authentication headers if API key is provided headers = {} if args.server and hasattr(args, "api_key") and args.api_key: headers["Authorization"] = f"Bearer {args.api_key}" From 359abef6547dec814c4cbaaa57b1ef4e26641888 Mon Sep 17 00:00:00 2001 From: aman-17 Date: Fri, 26 Sep 2025 14:19:22 -0700 Subject: [PATCH 6/7] updated pytests --- tests/test_pipeline.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 600753d..71a2f3c 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -268,9 +268,9 @@ This is the corrected text from the document.""" build_page_query_calls = [] original_build_page_query = build_page_query - async def mock_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation=0): + async def mock_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation=0, model_name="olmocr"): build_page_query_calls.append(image_rotation) - return await original_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation) + return await original_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation, model_name) with patch("olmocr.pipeline.apost", side_effect=mock_apost): with patch("olmocr.pipeline.tracker", mock_tracker): @@ -376,9 +376,9 @@ Document is now correctly oriented after 180 degree rotation.""" build_page_query_calls = [] original_build_page_query = build_page_query - async def mock_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation=0): + async def mock_build_page_query(local_pdf_path, page, 
target_longest_image_dim, image_rotation=0, model_name="olmocr"): build_page_query_calls.append(image_rotation) - return await original_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation) + return await original_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation, model_name) with patch("olmocr.pipeline.apost", side_effect=mock_apost): with patch("olmocr.pipeline.tracker", mock_tracker): @@ -482,9 +482,9 @@ Document correctly oriented at 90 degrees total rotation.""" build_page_query_calls = [] original_build_page_query = build_page_query - async def mock_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation=0): + async def mock_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation=0, model_name="olmocr"): build_page_query_calls.append(image_rotation) - return await original_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation) + return await original_build_page_query(local_pdf_path, page, target_longest_image_dim, image_rotation, model_name) with patch("olmocr.pipeline.apost", side_effect=mock_apost): with patch("olmocr.pipeline.tracker", mock_tracker): From f3c4073395a967e038b0ab092b1a6d8ed12adcb5 Mon Sep 17 00:00:00 2001 From: aman-17 Date: Fri, 26 Sep 2025 14:25:25 -0700 Subject: [PATCH 7/7] added Api_key argument to pipeline pytests --- tests/test_pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 71a2f3c..1541639 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -209,7 +209,7 @@ class TestRotationCorrection: # Counter to track number of API calls call_count = 0 - async def mock_apost(url, json_data): + async def mock_apost(url, json_data, api_key=None): nonlocal call_count call_count += 1 @@ -311,7 +311,7 @@ This is the corrected text from the document.""" # Counter to track number of API calls call_count = 0 - async def 
mock_apost(url, json_data): + async def mock_apost(url, json_data, api_key=None): nonlocal call_count call_count += 1 @@ -420,7 +420,7 @@ Document is now correctly oriented after 180 degree rotation.""" # Counter to track number of API calls call_count = 0 - async def mock_apost(url, json_data): + async def mock_apost(url, json_data, api_key=None): nonlocal call_count call_count += 1