Make the convert script work with server backends

This commit is contained in:
Jake Poznanski 2025-03-05 13:33:39 -08:00
parent 87875b3e2f
commit 5cb32c3289
5 changed files with 191 additions and 12 deletions

View File

@ -74,7 +74,8 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats, force):
else: else:
# Run synchronous function # Run synchronous function
markdown = method(pdf_path, page_num=1, **kwargs) markdown = method(pdf_path, page_num=1, **kwargs)
except: except Exception as ex:
print(f"Exception {str(ex)} occurred while processing {base_name}_{i}")
markdown = None markdown = None
if markdown is None: if markdown is None:
@ -106,6 +107,7 @@ if __name__ == "__main__":
"marker": ("olmocr.bench.runners.run_marker", "run_marker"), "marker": ("olmocr.bench.runners.run_marker", "run_marker"),
"mineru": ("olmocr.bench.runners.run_mineru", "run_mineru"), "mineru": ("olmocr.bench.runners.run_mineru", "run_mineru"),
"chatgpt": ("olmocr.bench.runners.run_chatgpt", "run_chatgpt"), "chatgpt": ("olmocr.bench.runners.run_chatgpt", "run_chatgpt"),
"server": ("olmocr.bench.runners.run_server", "run_server"),
} }
# Build config by importing only requested methods. # Build config by importing only requested methods.

View File

@ -11,13 +11,11 @@ from olmocr.prompts.prompts import (
openai_response_format_schema, openai_response_format_schema,
) )
def run_chatgpt(pdf_path: str, page_num: int = 1, model: str = "gpt-4o-2024-08-06", temperature: float = 0.1) -> str: def run_chatgpt(pdf_path: str, page_num: int = 1, model: str = "gpt-4o-2024-08-06", temperature: float = 0.1) -> str:
""" """
Convert page of a PDF file to markdown using GOT-OCR. Convert page of a PDF file to markdown using the commercial openAI APIs.
This function renders the first page of the PDF to an image, runs OCR on that image, See run_server.py for running against an openai compatible server
and returns the OCR result as a markdown-formatted string.
Args: Args:
pdf_path (str): The local path to the PDF file. pdf_path (str): The local path to the PDF file.

View File

@ -32,7 +32,10 @@ class Args:
async def run_olmocr(pdf_path: str, page_num: int = 1, temperature: float = 0.8) -> str: async def run_olmocr(pdf_path: str, page_num: int = 1, temperature: float = 0.8) -> str:
""" """
Process a single page of a PDF using the olmocr pipeline. Process a single page of a PDF using the official olmocr pipeline, as in pipeline.py
The idea is that this is getting called exactly how it is in the pipeline script, so we can do comparisons.
Though, this method is slow, because it only does one request at a time.
Args: Args:
pdf_path: Path to the PDF file pdf_path: Path to the PDF file

View File

@ -0,0 +1,74 @@
import json
import os
from typing import Literal
import httpx
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts.anchor import get_anchor_text
from olmocr.prompts.prompts import (
PageResponse,
build_openai_silver_data_prompt,
build_finetuning_prompt,
)
async def run_server(pdf_path: str, page_num: int = 1, server: str = "localhost:30000", model: str = "allenai/olmOCR-7B-0225-preview",
                     temperature: float = 0.1, target_longest_image_dim: int = 1024,
                     prompt_template: Literal["full", "finetune"] = "finetune",
                     response_template: Literal["plain", "json"] = "json") -> str:
    """
    Convert one page of a PDF file to markdown by making a single request
    against an OpenAI-compatible chat-completions server.

    You can use this for running against vllm, sglang, etc., as well as
    mixing and matching different models. It makes exactly one direct
    request, with no retries.

    Args:
        pdf_path: Local path to the PDF file.
        page_num: 1-based page number to render and convert.
        server: "host:port" of the OpenAI-compatible server.
        model: Model name forwarded in the request body.
        temperature: Sampling temperature for generation.
        target_longest_image_dim: Longest side, in pixels, of the rendered page image.
        prompt_template: "full" uses the silver-data prompt, "finetune" the finetuning prompt.
        response_template: "json" parses the reply as a PageResponse; "plain" returns raw content.

    Returns:
        str: The OCR result in markdown format.

    Raises:
        httpx.HTTPStatusError: If the server responds with a non-2xx status.
        ValueError: If response_template is not one of "plain"/"json".
    """
    # Render the requested page to a base64-encoded PNG and gather anchor text.
    image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=target_longest_image_dim)
    anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")

    if prompt_template == "full":
        prompt = build_openai_silver_data_prompt(anchor_text)
    else:
        prompt = build_finetuning_prompt(anchor_text)

    request = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                ],
            }
        ],
        "temperature": temperature,
        "max_tokens": 3000,
    }

    # Single request with a generous timeout; no retries by design.
    url = f"http://{server}/v1/chat/completions"
    async with httpx.AsyncClient(timeout=300) as client:
        response = await client.post(url, json=request)
        # Surface HTTP errors explicitly instead of failing on a missing "choices" key.
        response.raise_for_status()

        data = response.json()
        choice = data["choices"][0]
        assert choice["finish_reason"] == "stop", "Response from server did not finish with finish_reason stop as expected, this is probably going to lead to bad data"

        if response_template == "json":
            # Bug fix: parse the actual content string. Previously json.loads was
            # called on the not-yet-assigned page_data variable (NameError).
            page_data = json.loads(choice["message"]["content"])
            page_response = PageResponse(**page_data)
            return page_response.natural_text
        elif response_template == "plain":
            return choice["message"]["content"]
        else:
            raise ValueError(f"Unknown response_template: {response_template}")

View File

@ -1,7 +1,33 @@
#!/bin/bash
# Exit on error but allow the trap to execute
set -e

# Global variable to track server PID
SERVER_PID=""
# SIGINT (Ctrl+C) handler: tear down child processes and the sglang server,
# then exit non-zero so callers know the run was interrupted.
cleanup() {
    echo -e "\n[INFO] Received interrupt signal. Cleaning up..."

    # Kill any Python processes launched directly by this script ($$ = our PID).
    echo "[INFO] Stopping any running Python processes"
    pkill -P $$ python || true

    # Bring down the sglang server if one is still alive.
    if [ -n "$SERVER_PID" ] && kill -0 "$SERVER_PID" 2>/dev/null; then
        echo "[INFO] Stopping sglang server (PID: $SERVER_PID)"
        kill -TERM "$SERVER_PID" 2>/dev/null || true
        wait "$SERVER_PID" 2>/dev/null || true
    fi

    echo "[INFO] Cleanup complete. Exiting."
    exit 1
}
# Set the trap for SIGINT (Ctrl+C)
trap cleanup SIGINT
# Function to create conda environment if it doesn't exist # Function to create conda environment if it doesn't exist
create_conda_env() { create_conda_env() {
env_name=$1 env_name=$1
@ -16,10 +42,62 @@ create_conda_env() {
fi fi
} }
# Start an sglang server (OpenAI-compatible API) for the given model and
# block until it answers on localhost:30000, or exit the script on failure.
start_sglang_server() {
    model_name=$1
    echo "Starting sglang server for model: $model_name"

    # Start the server in the background and save the PID
    python -m sglang.launch_server --model $model_name --chat-template qwen2-vl &
    SERVER_PID=$!

    # Make sure the background process actually launched.
    if ! kill -0 $SERVER_PID 2>/dev/null; then
        echo "Failed to start server process. Exiting."
        exit 1
    fi

    # Wait for the server to be ready by polling the models endpoint
    # (up to 300 attempts x 2s sleep = 10 minutes for model load).
    echo "Waiting for server to be ready..."
    max_attempts=300
    attempt=0
    while [ $attempt -lt $max_attempts ]; do
        # Probe the models endpoint; ready once it returns HTTP 200.
        if curl -s "http://localhost:30000/v1/models" \
            -o /dev/null -w "%{http_code}" | grep -q "200"; then
            echo "Server is ready!"
            return 0
        fi
        attempt=$((attempt + 1))
        echo "Waiting for server... attempt $attempt/$max_attempts"
        sleep 2
    done

    echo "Server failed to become ready after multiple attempts. Exiting."
    kill $SERVER_PID
    SERVER_PID=""
    exit 1
}
# Stop the sglang server tracked in SERVER_PID (safe no-op when none is running);
# always clears SERVER_PID afterwards.
stop_sglang_server() {
    echo "Stopping sglang server with PID: $SERVER_PID"
    if [ -z "$SERVER_PID" ] || ! kill -0 "$SERVER_PID" 2>/dev/null; then
        echo "No server to stop."
    else
        kill $SERVER_PID
        wait $SERVER_PID 2>/dev/null || true
        echo "Server stopped."
    fi
    SERVER_PID=""
}
# Create and activate olmocr environment
create_conda_env "olmocr" "3.11"
source $(conda info --base)/etc/profile.d/conda.sh
source activate olmocr
# # Run olmocr benchmarks # # Run olmocr benchmarks
# echo "Running olmocr benchmarks..." # echo "Running olmocr benchmarks..."
@ -39,9 +117,28 @@ create_conda_env() {
# echo "Running chatgpt benchmarks..." # echo "Running chatgpt benchmarks..."
# python -m olmocr.bench.convert chatgpt # python -m olmocr.bench.convert chatgpt
# Run raw server benchmarks with sglang server
# For each model, start server, run benchmark, then stop server

# olmocr_base: two runs against the same server, at temperature 0.1 and 0.8
start_sglang_server "allenai/olmOCR-7B-0225-preview"
python -m olmocr.bench.convert server:name=olmocr_base_temp0_1:model=allenai/olmOCR-7B-0225-preview:temperature=0.1:response_template=json --repeats 5
python -m olmocr.bench.convert server:name=olmocr_base_temp0_8:model=allenai/olmOCR-7B-0225-preview:temperature=0.8:response_template=json --repeats 5
stop_sglang_server

# qwen2_vl_7b (plain-text responses, no JSON page schema)
start_sglang_server "Qwen/Qwen2-VL-7B-Instruct"
python -m olmocr.bench.convert server:name=qwen2_vl_7b:model=Qwen/Qwen2-VL-7B-Instruct:temperature=0.1:response_template=plain --repeats 5
stop_sglang_server

# qwen25_vl_7b (plain-text responses, no JSON page schema)
start_sglang_server "Qwen/Qwen2.5-VL-7B-Instruct"
python -m olmocr.bench.convert server:name=qwen25_vl_7b:model=Qwen/Qwen2.5-VL-7B-Instruct:temperature=0.1:response_template=plain --repeats 5
stop_sglang_server
# Create and activate mineru environment (disabled for now)
# create_conda_env "mineru" "3.11"
# source activate mineru

# Install magic-pdf and run benchmarks
# TODO: Fix this, I was not able to get it to all install successfully
@ -53,4 +150,9 @@ source activate mineru
# python download_models_hf.py # python download_models_hf.py
# python -m olmocr.bench.convert mineru # python -m olmocr.bench.convert mineru
# Final cleanup: stop any server that is still running before exiting.
if [ -n "$SERVER_PID" ] && kill -0 $SERVER_PID 2>/dev/null; then
    stop_sglang_server
fi

echo "All benchmarks completed successfully."