mirror of https://github.com/allenai/olmocr.git (synced 2025-09-26 17:04:02 +00:00)
Convert script work with server backends
parent 87875b3e2f
commit 5cb32c3289
@@ -74,7 +74,8 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats, force):
             else:
                 # Run synchronous function
                 markdown = method(pdf_path, page_num=1, **kwargs)
-        except:
+        except Exception as ex:
+            print(f"Exception {str(ex)} occurred while processing {base_name}_{i}")
             markdown = None
 
         if markdown is None:
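For context, the except block above sits inside a dispatch that awaits async runners and calls synchronous ones directly. A minimal sketch of that shape (assumed from the hunk, not the repo's exact code):

import inspect

async def call_runner(method, pdf_path, base_name, i, **kwargs):
    # Hypothetical helper; only the else/except lines mirror the diff above.
    markdown = None
    try:
        if inspect.iscoroutinefunction(method):
            # Await async runners such as run_server
            markdown = await method(pdf_path, page_num=1, **kwargs)
        else:
            # Run synchronous function
            markdown = method(pdf_path, page_num=1, **kwargs)
    except Exception as ex:
        # The change in this commit: report the failure instead of silently passing
        print(f"Exception {str(ex)} occurred while processing {base_name}_{i}")
        markdown = None
    return markdown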
@@ -106,6 +107,7 @@ if __name__ == "__main__":
         "marker": ("olmocr.bench.runners.run_marker", "run_marker"),
         "mineru": ("olmocr.bench.runners.run_mineru", "run_mineru"),
         "chatgpt": ("olmocr.bench.runners.run_chatgpt", "run_chatgpt"),
+        "server": ("olmocr.bench.runners.run_server", "run_server"),
     }
 
     # Build config by importing only requested methods.
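The methods table maps each backend name to a (module path, function name) pair so that only the requested runner's module is imported, keeping heavy optional dependencies out of unrelated runs. A minimal sketch of that lazy-loading pattern (an assumption about the mechanism, not the repo's exact code):

import importlib

METHODS = {
    "server": ("olmocr.bench.runners.run_server", "run_server"),
    "chatgpt": ("olmocr.bench.runners.run_chatgpt", "run_chatgpt"),
}

def load_method(name: str):
    # Import the runner module only when that method is requested,
    # so e.g. marker/mineru dependencies are not needed to benchmark "server".
    module_path, function_name = METHODS[name]
    module = importlib.import_module(module_path)
    return getattr(module, function_name)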
@@ -11,13 +11,11 @@ from olmocr.prompts.prompts import (
     openai_response_format_schema,
 )
 
 
 def run_chatgpt(pdf_path: str, page_num: int = 1, model: str = "gpt-4o-2024-08-06", temperature: float = 0.1) -> str:
     """
-    Convert page of a PDF file to markdown using GOT-OCR.
+    Convert a page of a PDF file to markdown using the commercial OpenAI APIs.
 
-    This function renders the first page of the PDF to an image, runs OCR on that image,
-    and returns the OCR result as a markdown-formatted string.
+    See run_server.py for running against an OpenAI-compatible server.
 
     Args:
         pdf_path (str): The local path to the PDF file.
@@ -32,7 +32,10 @@ class Args:
 
 
 async def run_olmocr(pdf_path: str, page_num: int = 1, temperature: float = 0.8) -> str:
     """
-    Process a single page of a PDF using the olmocr pipeline.
+    Process a single page of a PDF using the official olmocr pipeline, as in pipeline.py.
+
+    The idea is that this gets called exactly as it would be in the pipeline script, so we can do comparisons.
+    This method is slow, though, because it only makes one request at a time.
 
     Args:
         pdf_path: Path to the PDF file
olmocr/bench/runners/run_server.py (new file, 74 lines)
@@ -0,0 +1,74 @@
import json
import os
from typing import Literal

import httpx

from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts.anchor import get_anchor_text
from olmocr.prompts.prompts import (
    PageResponse,
    build_openai_silver_data_prompt,
    build_finetuning_prompt,
)


async def run_server(pdf_path: str, page_num: int = 1, server: str = "localhost:30000", model: str = "allenai/olmOCR-7B-0225-preview",
                     temperature: float = 0.1, target_longest_image_dim: int = 1024,
                     prompt_template: Literal["full", "finetune"] = "finetune",
                     response_template: Literal["plain", "json"] = "json") -> str:
    """
    Convert a page of a PDF file to markdown by making a request
    to an OpenAI-compatible server.

    You can use this for running against vllm or sglang servers,
    as well as for mixing and matching different models.

    It will only make one direct request, with no retries or error checking.

    Returns:
        str: The OCR result in markdown format.
    """
    # Convert the requested page of the PDF to a base64-encoded PNG image.
    image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=target_longest_image_dim)
    anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")

    if prompt_template == "full":
        prompt = build_openai_silver_data_prompt(anchor_text)
    else:
        prompt = build_finetuning_prompt(anchor_text)

    request = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                ],
            }
        ],
        "temperature": temperature,
        "max_tokens": 3000,
    }

    # Make the request and get the response using httpx
    url = f"http://{server}/v1/chat/completions"

    async with httpx.AsyncClient(timeout=300) as client:
        response = await client.post(url, json=request)

    # Debug output
    print(response.status_code)
    data = response.json()

    print(data)
    choice = data["choices"][0]
    print(choice)
    assert choice["finish_reason"] == "stop", "Response from server did not finish with finish_reason stop as expected, this is probably going to lead to bad data"

    if response_template == "json":
        content = choice["message"]["content"]
        page_data = json.loads(content)
        page_response = PageResponse(**page_data)
        return page_response.natural_text
    elif response_template == "plain":
        return choice["message"]["content"]
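With an OpenAI-compatible server already listening (for example, the sglang server started in the benchmark script below), a call might look like the following usage sketch; the PDF path is a placeholder:

import asyncio

from olmocr.bench.runners.run_server import run_server

# "sample.pdf" is a placeholder; assumes a server is already up on localhost:30000.
markdown = asyncio.run(
    run_server(
        "sample.pdf",
        page_num=1,
        server="localhost:30000",
        model="allenai/olmOCR-7B-0225-preview",
        temperature=0.1,
        response_template="json",
    )
)
print(markdown)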
@@ -1,7 +1,33 @@
 #!/bin/bash
 
+# Exit on error but allow the trap to execute
 set -e
+
+# Global variable to track server PID
+SERVER_PID=""
+
+# Trap function to handle Ctrl+C (SIGINT)
+cleanup() {
+    echo -e "\n[INFO] Received interrupt signal. Cleaning up..."
+
+    # Find and kill any Python processes started by this script
+    echo "[INFO] Stopping any running Python processes"
+    pkill -P $$ python || true
+
+    # Stop sglang server if running
+    if [ -n "$SERVER_PID" ] && kill -0 "$SERVER_PID" 2>/dev/null; then
+        echo "[INFO] Stopping sglang server (PID: $SERVER_PID)"
+        kill -TERM "$SERVER_PID" 2>/dev/null || true
+        wait "$SERVER_PID" 2>/dev/null || true
+    fi
+
+    echo "[INFO] Cleanup complete. Exiting."
+    exit 1
+}
+
+# Set the trap for SIGINT (Ctrl+C)
+trap cleanup SIGINT
+
 # Function to create conda environment if it doesn't exist
 create_conda_env() {
     env_name=$1
@@ -16,10 +42,62 @@ create_conda_env() {
     fi
 }
 
-# # Create and activate olmocr environment
-# create_conda_env "olmocr" "3.11"
-# source $(conda info --base)/etc/profile.d/conda.sh
-# source activate olmocr
+# Function to start sglang server with OpenAI API for a specific model
+start_sglang_server() {
+    model_name=$1
+    echo "Starting sglang server for model: $model_name"
+
+    # Start the server in the background and save the PID
+    python -m sglang.launch_server --model $model_name --chat-template qwen2-vl &
+    SERVER_PID=$!
+
+    # Check if the server process is running
+    if ! kill -0 $SERVER_PID 2>/dev/null; then
+        echo "Failed to start server process. Exiting."
+        exit 1
+    fi
+
+    # Wait for the server to be ready by checking the models endpoint
+    echo "Waiting for server to be ready..."
+    max_attempts=300
+    attempt=0
+
+    while [ $attempt -lt $max_attempts ]; do
+        # Try to reach the models endpoint
+        if curl -s "http://localhost:30000/v1/models" \
+            -o /dev/null -w "%{http_code}" | grep -q "200"; then
+            echo "Server is ready!"
+            return 0
+        fi
+
+        attempt=$((attempt + 1))
+        echo "Waiting for server... attempt $attempt/$max_attempts"
+        sleep 2
+    done
+
+    echo "Server failed to become ready after multiple attempts. Exiting."
+    kill $SERVER_PID
+    SERVER_PID=""
+    exit 1
+}
+
+# Function to stop the sglang server
+stop_sglang_server() {
+    echo "Stopping sglang server with PID: $SERVER_PID"
+    if [ -n "$SERVER_PID" ] && kill -0 "$SERVER_PID" 2>/dev/null; then
+        kill $SERVER_PID
+        wait $SERVER_PID 2>/dev/null || true
+        echo "Server stopped."
+    else
+        echo "No server to stop."
+    fi
+    SERVER_PID=""
+}
+
+# Create and activate olmocr environment
+create_conda_env "olmocr" "3.11"
+source $(conda info --base)/etc/profile.d/conda.sh
+source activate olmocr
 
 # # Run olmocr benchmarks
 # echo "Running olmocr benchmarks..."
@@ -39,9 +117,28 @@ create_conda_env() {
 # echo "Running chatgpt benchmarks..."
 # python -m olmocr.bench.convert chatgpt
 
+# Run raw server benchmarks with sglang server
+# For each model: start server, run benchmark, then stop server
+
+# olmocr_base_temp0_1 and olmocr_base_temp0_8
+start_sglang_server "allenai/olmOCR-7B-0225-preview"
+python -m olmocr.bench.convert server:name=olmocr_base_temp0_1:model=allenai/olmOCR-7B-0225-preview:temperature=0.1:response_template=json --repeats 5
+python -m olmocr.bench.convert server:name=olmocr_base_temp0_8:model=allenai/olmOCR-7B-0225-preview:temperature=0.8:response_template=json --repeats 5
+stop_sglang_server
+
+# qwen2_vl_7b
+start_sglang_server "Qwen/Qwen2-VL-7B-Instruct"
+python -m olmocr.bench.convert server:name=qwen2_vl_7b:model=Qwen/Qwen2-VL-7B-Instruct:temperature=0.1:response_template=plain --repeats 5
+stop_sglang_server
+
+# qwen25_vl_7b
+start_sglang_server "Qwen/Qwen2.5-VL-7B-Instruct"
+python -m olmocr.bench.convert server:name=qwen25_vl_7b:model=Qwen/Qwen2.5-VL-7B-Instruct:temperature=0.1:response_template=plain --repeats 5
+stop_sglang_server
+
 # Create and activate mineru environment
-create_conda_env "mineru" "3.11"
-source activate mineru
+# create_conda_env "mineru" "3.11"
+# source activate mineru
 
 # Install magic-pdf and run benchmarks
 # TODO: Fix this, I was not able to get it all to install successfully
@@ -53,4 +150,9 @@ source activate mineru
 # python download_models_hf.py
 # python -m olmocr.bench.convert mineru
 
+# Final cleanup
+if [ -n "$SERVER_PID" ] && kill -0 $SERVER_PID 2>/dev/null; then
+    stop_sglang_server
+fi
+
 echo "All benchmarks completed successfully."