From 5cb32c3289b8f666880ef549a6323a01f9b366e7 Mon Sep 17 00:00:00 2001
From: Jake Poznanski
Date: Wed, 5 Mar 2025 13:33:39 -0800
Subject: [PATCH] Make convert script work with server backends

---
 olmocr/bench/convert.py             |   4 +-
 olmocr/bench/runners/run_chatgpt.py |   6 +-
 olmocr/bench/runners/run_olmocr.py  |   5 +-
 olmocr/bench/runners/run_server.py  |  74 ++++++++++++++++++
 olmocr/bench/scripts/convert_all.sh | 114 ++++++++++++++++++++++++++--
 5 files changed, 191 insertions(+), 12 deletions(-)
 create mode 100644 olmocr/bench/runners/run_server.py

diff --git a/olmocr/bench/convert.py b/olmocr/bench/convert.py
index ba6174a..a6d05bf 100644
--- a/olmocr/bench/convert.py
+++ b/olmocr/bench/convert.py
@@ -74,7 +74,8 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats, force):
             else:
                 # Run synchronous function
                 markdown = method(pdf_path, page_num=1, **kwargs)
-        except:
+        except Exception as ex:
+            print(f"Exception {str(ex)} occurred while processing {base_name}_{i}")
             markdown = None
 
         if markdown is None:
@@ -106,6 +107,7 @@ if __name__ == "__main__":
         "marker": ("olmocr.bench.runners.run_marker", "run_marker"),
         "mineru": ("olmocr.bench.runners.run_mineru", "run_mineru"),
         "chatgpt": ("olmocr.bench.runners.run_chatgpt", "run_chatgpt"),
+        "server": ("olmocr.bench.runners.run_server", "run_server"),
     }
 
     # Build config by importing only requested methods.
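+    # For example, the "server" entry above resolves to a callable roughly
+    # like this (a sketch for illustration only; the actual loader code is
+    # not part of this patch):
+    #
+    #   module = importlib.import_module("olmocr.bench.runners.run_server")
+    #   method = getattr(module, "run_server")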
diff --git a/olmocr/bench/runners/run_chatgpt.py b/olmocr/bench/runners/run_chatgpt.py
index e1291c3..67f8702 100644
--- a/olmocr/bench/runners/run_chatgpt.py
+++ b/olmocr/bench/runners/run_chatgpt.py
@@ -11,13 +11,11 @@ from olmocr.prompts.prompts import (
     openai_response_format_schema,
 )
 
-
 def run_chatgpt(pdf_path: str, page_num: int = 1, model: str = "gpt-4o-2024-08-06", temperature: float = 0.1) -> str:
     """
-    Convert page of a PDF file to markdown using GOT-OCR.
+    Convert a page of a PDF file to markdown using the commercial OpenAI APIs.
 
-    This function renders the first page of the PDF to an image, runs OCR on that image,
-    and returns the OCR result as a markdown-formatted string.
+    See run_server.py for running against an OpenAI-compatible server.
 
     Args:
         pdf_path (str): The local path to the PDF file.
diff --git a/olmocr/bench/runners/run_olmocr.py b/olmocr/bench/runners/run_olmocr.py
index 0e82cfc..129f369 100644
--- a/olmocr/bench/runners/run_olmocr.py
+++ b/olmocr/bench/runners/run_olmocr.py
@@ -32,7 +32,10 @@ class Args:
 
 async def run_olmocr(pdf_path: str, page_num: int = 1, temperature: float = 0.8) -> str:
     """
-    Process a single page of a PDF using the olmocr pipeline.
+    Process a single page of a PDF using the official olmocr pipeline, as in pipeline.py.
+
+    The idea is that the model is called exactly as it is in the pipeline script, so we can make direct comparisons.
+    Note that this method is slow, because it only makes one request at a time.
 
     Args:
         pdf_path: Path to the PDF file
diff --git a/olmocr/bench/runners/run_server.py b/olmocr/bench/runners/run_server.py
new file mode 100644
index 0000000..c7a2908
--- /dev/null
+++ b/olmocr/bench/runners/run_server.py
@@ -0,0 +1,74 @@
+import json
+from typing import Literal
+
+import httpx
+
+from olmocr.data.renderpdf import render_pdf_to_base64png
+from olmocr.prompts.anchor import get_anchor_text
+from olmocr.prompts.prompts import (
+    PageResponse,
+    build_openai_silver_data_prompt,
+    build_finetuning_prompt,
+)
+
+
+async def run_server(pdf_path: str, page_num: int = 1, server: str = "localhost:30000", model: str = "allenai/olmOCR-7B-0225-preview",
+                     temperature: float = 0.1, target_longest_image_dim: int = 1024,
+                     prompt_template: Literal["full", "finetune"] = "finetune",
+                     response_template: Literal["plain", "json"] = "json") -> str:
+    """
+    Convert a page of a PDF file to markdown by making a single request
+    to an OpenAI-compatible server.
+
+    You can use this to run against vllm or sglang servers,
+    as well as to mix and match different models.
+
+    It makes only one direct request, with no retries or error checking.
+
+    Returns:
+        str: The OCR result in markdown format.
+    """
+    # Convert the requested page of the PDF to a base64-encoded PNG image.
+    image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=target_longest_image_dim)
+    anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
+
+    if prompt_template == "full":
+        prompt = build_openai_silver_data_prompt(anchor_text)
+    else:
+        prompt = build_finetuning_prompt(anchor_text)
+
+    request = {
+        "model": model,
+        "messages": [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prompt},
+                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
+                ],
+            }
+        ],
+        "temperature": temperature,
+        "max_tokens": 3000,
+    }
+
+    # Make the request and get the response using httpx
+    url = f"http://{server}/v1/chat/completions"
+
+    async with httpx.AsyncClient(timeout=300) as client:
+        response = await client.post(url, json=request)
+
+    data = response.json()
+
+    choice = data["choices"][0]
+    assert choice["finish_reason"] == "stop", "Response from server did not finish with finish_reason=stop as expected; this is probably going to lead to bad data"
+
+    if response_template == "json":
+        page_data = json.loads(choice["message"]["content"])
+        page_response = PageResponse(**page_data)
+        return page_response.natural_text
+    elif response_template == "plain":
+        return choice["message"]["content"]
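+
+
+# Example usage (a minimal sketch; assumes a compatible server is already
+# listening on localhost:30000, and "page.pdf" is a hypothetical local file):
+#
+#   import asyncio
+#   markdown = asyncio.run(run_server("page.pdf", page_num=1))
+#   print(markdown)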
diff --git a/olmocr/bench/scripts/convert_all.sh b/olmocr/bench/scripts/convert_all.sh
index bf4476f..bcb294d 100755
--- a/olmocr/bench/scripts/convert_all.sh
+++ b/olmocr/bench/scripts/convert_all.sh
@@ -1,7 +1,33 @@
 #!/bin/bash
 
+# Exit on error, but let the SIGINT trap run its cleanup first
 set -e
 
+# Global variable to track the server PID
+SERVER_PID=""
+
+# Trap function to handle Ctrl+C (SIGINT)
+cleanup() {
+    echo -e "\n[INFO] Received interrupt signal. Cleaning up..."
+
+    # Find and kill any Python processes started by this script
+    echo "[INFO] Stopping any running Python processes"
+    pkill -P $$ python || true
+
+    # Stop the sglang server if it is running
+    if [ -n "$SERVER_PID" ] && kill -0 "$SERVER_PID" 2>/dev/null; then
+        echo "[INFO] Stopping sglang server (PID: $SERVER_PID)"
+        kill -TERM "$SERVER_PID" 2>/dev/null || true
+        wait "$SERVER_PID" 2>/dev/null || true
+    fi
+
+    echo "[INFO] Cleanup complete. Exiting."
+    exit 1
+}
+
+# Set the trap for SIGINT (Ctrl+C)
+trap cleanup SIGINT
+
 # Function to create conda environment if it doesn't exist
 create_conda_env() {
     env_name=$1
@@ -16,10 +42,62 @@ create_conda_env() {
     fi
 }
 
-# # Create and activate olmocr environment
-# create_conda_env "olmocr" "3.11"
-# source $(conda info --base)/etc/profile.d/conda.sh
-# source activate olmocr
+# Function to start an sglang server with an OpenAI-compatible API for a specific model
+start_sglang_server() {
+    model_name=$1
+    echo "Starting sglang server for model: $model_name"
+
+    # Start the server in the background and save the PID
+    python -m sglang.launch_server --model "$model_name" --chat-template qwen2-vl &
+    SERVER_PID=$!
+
+    # Check that the server process is running
+    if ! kill -0 $SERVER_PID 2>/dev/null; then
+        echo "Failed to start server process. Exiting."
+        exit 1
+    fi
+
+    # Wait for the server to be ready by polling the models endpoint
+    echo "Waiting for server to be ready..."
+    max_attempts=300
+    attempt=0
+
+    while [ $attempt -lt $max_attempts ]; do
+        # Try to reach the models endpoint
+        if curl -s "http://localhost:30000/v1/models" \
+            -o /dev/null -w "%{http_code}" | grep -q "200"; then
+            echo "Server is ready!"
+            return 0
+        fi
+
+        attempt=$((attempt + 1))
+        echo "Waiting for server... attempt $attempt/$max_attempts"
+        sleep 2
+    done
+
+    echo "Server failed to become ready after multiple attempts. Exiting."
+    kill $SERVER_PID
+    SERVER_PID=""
+    exit 1
+}
+
+# Function to stop the sglang server
+stop_sglang_server() {
+    echo "Stopping sglang server with PID: $SERVER_PID"
+    if [ -n "$SERVER_PID" ] && kill -0 "$SERVER_PID" 2>/dev/null; then
+        kill $SERVER_PID
+        wait $SERVER_PID 2>/dev/null || true
+        echo "Server stopped."
+    else
+        echo "No server to stop."
+    fi
+    SERVER_PID=""
+}
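+
+# For reference, the readiness probe above polls the same endpoint you can
+# check by hand while a server is up (assuming the default port 30000):
+#
+#   curl -s http://localhost:30000/v1/models -o /dev/null -w "%{http_code}\n"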
+
+# Create and activate olmocr environment
+create_conda_env "olmocr" "3.11"
+source $(conda info --base)/etc/profile.d/conda.sh
+source activate olmocr
 
 # # Run olmocr benchmarks
 # echo "Running olmocr benchmarks..."
@@ -39,9 +117,28 @@ create_conda_env() {
 # echo "Running chatgpt benchmarks..."
 # python -m olmocr.bench.convert chatgpt
 
+# Run raw server benchmarks against an sglang server.
+# For each model: start the server, run the benchmark, then stop the server.
+
+# olmocr base model, at temperature 0.1 and 0.8
+start_sglang_server "allenai/olmOCR-7B-0225-preview"
+python -m olmocr.bench.convert server:name=olmocr_base_temp0_1:model=allenai/olmOCR-7B-0225-preview:temperature=0.1:response_template=json --repeats 5
+python -m olmocr.bench.convert server:name=olmocr_base_temp0_8:model=allenai/olmOCR-7B-0225-preview:temperature=0.8:response_template=json --repeats 5
+stop_sglang_server
+
+# qwen2_vl_7b
+start_sglang_server "Qwen/Qwen2-VL-7B-Instruct"
+python -m olmocr.bench.convert server:name=qwen2_vl_7b:model=Qwen/Qwen2-VL-7B-Instruct:temperature=0.1:response_template=plain --repeats 5
+stop_sglang_server
+
+# qwen25_vl_7b
+start_sglang_server "Qwen/Qwen2.5-VL-7B-Instruct"
+python -m olmocr.bench.convert server:name=qwen25_vl_7b:model=Qwen/Qwen2.5-VL-7B-Instruct:temperature=0.1:response_template=plain --repeats 5
+stop_sglang_server
+
 # Create and activate mineru environment
-create_conda_env "mineru" "3.11"
-source activate mineru
+# create_conda_env "mineru" "3.11"
+# source activate mineru
 
 # Install magic-pdf and run benchmarks
 # TODO: Fix this, I was not able to get it all to install successfully
@@ -53,4 +150,9 @@
 # python download_models_hf.py
 # python -m olmocr.bench.convert mineru
 
+# Final cleanup, in case a server is still running
+if [ -n "$SERVER_PID" ] && kill -0 $SERVER_PID 2>/dev/null; then
+    stop_sglang_server
+fi
+
 echo "All benchmarks completed successfully."
\ No newline at end of file
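
A quick way to exercise the new server runner by hand (a hypothetical smoke
test, not part of the patch; it reuses only commands introduced above, and
the benchmark name "smoke_test" is made up):

    python -m sglang.launch_server --model allenai/olmOCR-7B-0225-preview --chat-template qwen2-vl &
    SERVER_PID=$!
    python -m olmocr.bench.convert server:name=smoke_test:model=allenai/olmOCR-7B-0225-preview:temperature=0.1:response_template=json --repeats 1
    kill $SERVER_PID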