Make the convert script work with server backends

This commit is contained in:
Jake Poznanski 2025-03-05 13:33:39 -08:00
parent 87875b3e2f
commit 5cb32c3289
5 changed files with 191 additions and 12 deletions

View File

@ -74,7 +74,8 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats, force):
else: else:
# Run synchronous function # Run synchronous function
markdown = method(pdf_path, page_num=1, **kwargs) markdown = method(pdf_path, page_num=1, **kwargs)
except: except Exception as ex:
print(f"Exception {str(ex)} occurred while processing {base_name}_{i}")
markdown = None markdown = None
if markdown is None: if markdown is None:
@ -106,6 +107,7 @@ if __name__ == "__main__":
"marker": ("olmocr.bench.runners.run_marker", "run_marker"), "marker": ("olmocr.bench.runners.run_marker", "run_marker"),
"mineru": ("olmocr.bench.runners.run_mineru", "run_mineru"), "mineru": ("olmocr.bench.runners.run_mineru", "run_mineru"),
"chatgpt": ("olmocr.bench.runners.run_chatgpt", "run_chatgpt"), "chatgpt": ("olmocr.bench.runners.run_chatgpt", "run_chatgpt"),
"server": ("olmocr.bench.runners.run_server", "run_server"),
} }
# Build config by importing only requested methods. # Build config by importing only requested methods.

View File

@ -11,13 +11,11 @@ from olmocr.prompts.prompts import (
openai_response_format_schema, openai_response_format_schema,
) )
def run_chatgpt(pdf_path: str, page_num: int = 1, model: str = "gpt-4o-2024-08-06", temperature: float = 0.1) -> str: def run_chatgpt(pdf_path: str, page_num: int = 1, model: str = "gpt-4o-2024-08-06", temperature: float = 0.1) -> str:
""" """
Convert page of a PDF file to markdown using GOT-OCR. Convert page of a PDF file to markdown using the commercial openAI APIs.
This function renders the first page of the PDF to an image, runs OCR on that image, See run_server.py for running against an openai compatible server
and returns the OCR result as a markdown-formatted string.
Args: Args:
pdf_path (str): The local path to the PDF file. pdf_path (str): The local path to the PDF file.

View File

@ -32,7 +32,10 @@ class Args:
async def run_olmocr(pdf_path: str, page_num: int = 1, temperature: float = 0.8) -> str: async def run_olmocr(pdf_path: str, page_num: int = 1, temperature: float = 0.8) -> str:
""" """
Process a single page of a PDF using the olmocr pipeline. Process a single page of a PDF using the official olmocr pipeline, as in pipeline.py
The idea is that this is getting called exactly how it is in the pipeline script, so we can do comparisons.
Though, this method is slow, because it only does one request at a time.
Args: Args:
pdf_path: Path to the PDF file pdf_path: Path to the PDF file

View File

@ -0,0 +1,74 @@
import json
import os
from typing import Literal
import httpx
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts.anchor import get_anchor_text
from olmocr.prompts.prompts import (
PageResponse,
build_openai_silver_data_prompt,
build_finetuning_prompt,
)
async def run_server(pdf_path: str, page_num: int = 1, server: str = "localhost:30000", model: str = "allenai/olmOCR-7B-0225-preview",
                     temperature: float = 0.1, target_longest_image_dim: int = 1024,
                     prompt_template: Literal["full", "finetune"] = "finetune",
                     response_template: Literal["plain", "json"] = "json") -> str:
    """
    Convert one page of a PDF file to markdown by making a single request
    against an OpenAI-compatible chat-completions server.

    You can use this for running against vllm, sglang, etc., as well as
    mixing and matching different models. It makes exactly one direct
    request, with no retries.

    Args:
        pdf_path: Local path to the PDF file.
        page_num: 1-based page number to render and convert.
        server: "host:port" of the OpenAI-compatible server.
        model: Model name forwarded in the request body.
        temperature: Sampling temperature for generation.
        target_longest_image_dim: Longest side, in pixels, of the rendered page image.
        prompt_template: "full" uses the silver-data prompt, "finetune" the finetuning prompt.
        response_template: "json" parses the reply as a PageResponse; "plain" returns raw content.

    Returns:
        str: The OCR result in markdown format.

    Raises:
        httpx.HTTPStatusError: If the server responds with a non-2xx status.
        ValueError: If response_template is not one of "plain"/"json".
    """
    # Render the requested page to a base64-encoded PNG and gather anchor text.
    image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=target_longest_image_dim)
    anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")

    if prompt_template == "full":
        prompt = build_openai_silver_data_prompt(anchor_text)
    else:
        prompt = build_finetuning_prompt(anchor_text)

    request = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                ],
            }
        ],
        "temperature": temperature,
        "max_tokens": 3000,
    }

    # Single request with a generous timeout; no retries by design.
    url = f"http://{server}/v1/chat/completions"
    async with httpx.AsyncClient(timeout=300) as client:
        response = await client.post(url, json=request)
        # Surface HTTP errors explicitly instead of failing on a missing "choices" key.
        response.raise_for_status()

        data = response.json()
        choice = data["choices"][0]
        assert choice["finish_reason"] == "stop", "Response from server did not finish with finish_reason stop as expected, this is probably going to lead to bad data"

        if response_template == "json":
            # Bug fix: parse the actual content string. Previously json.loads was
            # called on the not-yet-assigned page_data variable (NameError).
            page_data = json.loads(choice["message"]["content"])
            page_response = PageResponse(**page_data)
            return page_response.natural_text
        elif response_template == "plain":
            return choice["message"]["content"]
        else:
            raise ValueError(f"Unknown response_template: {response_template}")

View File

@ -1,7 +1,33 @@
#!/bin/bash
# Exit on error but allow the trap to execute
set -e

# Global variable to track server PID
SERVER_PID=""
# SIGINT (Ctrl+C) handler: tear down child processes and the sglang server,
# then exit non-zero so callers know the run was interrupted.
cleanup() {
    echo -e "\n[INFO] Received interrupt signal. Cleaning up..."

    # Kill any Python processes launched directly by this script ($$ = our PID).
    echo "[INFO] Stopping any running Python processes"
    pkill -P $$ python || true

    # Bring down the sglang server if one is still alive.
    if [ -n "$SERVER_PID" ] && kill -0 "$SERVER_PID" 2>/dev/null; then
        echo "[INFO] Stopping sglang server (PID: $SERVER_PID)"
        kill -TERM "$SERVER_PID" 2>/dev/null || true
        wait "$SERVER_PID" 2>/dev/null || true
    fi

    echo "[INFO] Cleanup complete. Exiting."
    exit 1
}
# Set the trap for SIGINT (Ctrl+C)
trap cleanup SIGINT
# Function to create conda environment if it doesn't exist # Function to create conda environment if it doesn't exist
create_conda_env() { create_conda_env() {
env_name=$1 env_name=$1
@ -16,10 +42,62 @@ create_conda_env() {
fi fi
} }
# Start an sglang server (OpenAI-compatible API) for the given model and
# block until it answers on localhost:30000, or exit the script on failure.
start_sglang_server() {
    model_name=$1
    echo "Starting sglang server for model: $model_name"

    # Start the server in the background and save the PID
    python -m sglang.launch_server --model $model_name --chat-template qwen2-vl &
    SERVER_PID=$!

    # Make sure the background process actually launched.
    if ! kill -0 $SERVER_PID 2>/dev/null; then
        echo "Failed to start server process. Exiting."
        exit 1
    fi

    # Wait for the server to be ready by polling the models endpoint
    # (up to 300 attempts x 2s sleep = 10 minutes for model load).
    echo "Waiting for server to be ready..."
    max_attempts=300
    attempt=0
    while [ $attempt -lt $max_attempts ]; do
        # Probe the models endpoint; ready once it returns HTTP 200.
        if curl -s "http://localhost:30000/v1/models" \
            -o /dev/null -w "%{http_code}" | grep -q "200"; then
            echo "Server is ready!"
            return 0
        fi
        attempt=$((attempt + 1))
        echo "Waiting for server... attempt $attempt/$max_attempts"
        sleep 2
    done

    echo "Server failed to become ready after multiple attempts. Exiting."
    kill $SERVER_PID
    SERVER_PID=""
    exit 1
}
# Stop the sglang server tracked in SERVER_PID (safe no-op when none is running);
# always clears SERVER_PID afterwards.
stop_sglang_server() {
    echo "Stopping sglang server with PID: $SERVER_PID"
    if [ -z "$SERVER_PID" ] || ! kill -0 "$SERVER_PID" 2>/dev/null; then
        echo "No server to stop."
    else
        kill $SERVER_PID
        wait $SERVER_PID 2>/dev/null || true
        echo "Server stopped."
    fi
    SERVER_PID=""
}
# Create and activate olmocr environment
create_conda_env "olmocr" "3.11"
source $(conda info --base)/etc/profile.d/conda.sh
source activate olmocr
# # Run olmocr benchmarks # # Run olmocr benchmarks
# echo "Running olmocr benchmarks..." # echo "Running olmocr benchmarks..."
@ -39,9 +117,28 @@ create_conda_env() {
# echo "Running chatgpt benchmarks..." # echo "Running chatgpt benchmarks..."
# python -m olmocr.bench.convert chatgpt # python -m olmocr.bench.convert chatgpt
# Run raw server benchmarks with sglang server
# For each model, start server, run benchmark, then stop server

# olmocr_base: two runs against the same server, at temperature 0.1 and 0.8
start_sglang_server "allenai/olmOCR-7B-0225-preview"
python -m olmocr.bench.convert server:name=olmocr_base_temp0_1:model=allenai/olmOCR-7B-0225-preview:temperature=0.1:response_template=json --repeats 5
python -m olmocr.bench.convert server:name=olmocr_base_temp0_8:model=allenai/olmOCR-7B-0225-preview:temperature=0.8:response_template=json --repeats 5
stop_sglang_server

# qwen2_vl_7b (plain-text responses, no JSON page schema)
start_sglang_server "Qwen/Qwen2-VL-7B-Instruct"
python -m olmocr.bench.convert server:name=qwen2_vl_7b:model=Qwen/Qwen2-VL-7B-Instruct:temperature=0.1:response_template=plain --repeats 5
stop_sglang_server

# qwen25_vl_7b (plain-text responses, no JSON page schema)
start_sglang_server "Qwen/Qwen2.5-VL-7B-Instruct"
python -m olmocr.bench.convert server:name=qwen25_vl_7b:model=Qwen/Qwen2.5-VL-7B-Instruct:temperature=0.1:response_template=plain --repeats 5
stop_sglang_server
# Create and activate mineru environment (disabled for now)
# create_conda_env "mineru" "3.11"
# source activate mineru

# Install magic-pdf and run benchmarks
# TODO: Fix this, I was not able to get it to all install successfully
@ -53,4 +150,9 @@ source activate mineru
# python download_models_hf.py # python download_models_hf.py
# python -m olmocr.bench.convert mineru # python -m olmocr.bench.convert mineru
# Final cleanup: stop any server that is still running before exiting.
if [ -n "$SERVER_PID" ] && kill -0 $SERVER_PID 2>/dev/null; then
    stop_sglang_server
fi

echo "All benchmarks completed successfully."