This commit is contained in:
Jake Poznanski 2025-04-07 21:39:57 +00:00
commit 500dedc11c
6 changed files with 97 additions and 44 deletions

View File

@ -229,6 +229,7 @@ if __name__ == "__main__":
"gemini": ("olmocr.bench.runners.run_gemini", "run_gemini"),
"mistral": ("olmocr.bench.runners.run_mistral", "run_mistral"),
"docling": ("olmocr.bench.runners.run_docling", "run_docling"),
"rolmocr": ("olmocr.bench.runners.run_rolmocr", "run_rolmocr"),
"transformers": ("olmocr.bench.runners.run_transformers", "run_transformers"),
"server": ("olmocr.bench.runners.run_server", "run_server"),
}

View File

@ -1,3 +1,7 @@
def build_basic_prompt() -> str:
    """Return the generic OCR prompt asking for a plain markdown transcription.

    Returns:
        str: Prompt text instructing the model to emit markdown, with LaTeX-style
        math delimiters (inline ``\\( \\)`` and display ``\\[ \\]``).
    """
    # Raw string: "\(" / "\[" are not valid Python escape sequences, so the
    # plain literal triggered a SyntaxWarning on modern CPython while producing
    # the same bytes. The raw prefix keeps the output identical and silences it.
    return r"Just return the markdown representation of this document as if you were reading it naturally. Convert equations to markdown using \( \) for inline math, and \[ \] otherwise."
def claude_response_format_schema() -> dict:
return (
{
@ -44,41 +48,3 @@ def claude_response_format_schema() -> dict:
},
},
)
def gemini_response_format_schema() -> dict:
    """Return the Gemini structured-output schema for a single OCR'd page.

    The schema uses Gemini's uppercase type names (``OBJECT``, ``STRING``,
    ``BOOL``, ``INTEGER``) and ``propertyOrdering``, which is Gemini-specific.

    Returns:
        dict: A JSON-schema-like mapping describing the expected response
        fields (language, rotation metadata, content-type flags, and the
        extracted natural text).
    """
    # Bug fix: a stray trailing comma previously made this `return ({...},)` —
    # a 1-tuple wrapping the schema — contradicting the `-> dict` annotation.
    # The dict is now returned directly.
    return {
        "type": "OBJECT",
        "properties": {
            "primary_language": {
                "type": "STRING",
                "description": "The primary language of the text using two-letter codes or null if there is no text at all that you think you should read.",
            },
            "is_rotation_valid": {
                "type": "BOOL",
                "description": "Is this page oriented correctly for reading? Answer only considering the textual content, do not factor in the rotation of any charts, tables, drawings, or figures.",
            },
            "rotation_correction": {
                "type": "INTEGER",
                "enum": [0, 90, 180, 270],
                "description": "Indicates the degree of clockwise rotation needed if the page is not oriented correctly.",
            },
            "is_table": {"type": "BOOL", "description": "Indicates if the majority of the page content is in tabular format."},
            "is_diagram": {"type": "BOOL", "description": "Indicates if the majority of the page content is a visual diagram."},
            "natural_text": {"type": "STRING", "description": "The natural text content extracted from the page."},
        },
        "required": ["primary_language", "is_rotation_valid", "rotation_correction", "is_table", "is_diagram", "natural_text"],
        "propertyOrdering": ["primary_language", "is_rotation_valid", "rotation_correction", "is_table", "is_diagram", "natural_text"],
    }
def build_find_difference_prompt(base_text: str) -> str:
    """Build a prompt asking a judge model to diff two extracted texts against the page image.

    Args:
        base_text: The previously extracted text(s), embedded verbatim between
            RAW_TEXT_START / RAW_TEXT_END markers.

    Returns:
        str: The assembled prompt.
    """
    # Bug fix: the first two f-string fragments had no trailing separator, so
    # sentences ran together ("...different models.Your goal..."). Each sentence
    # now ends with "\n", matching the style of the remaining lines.
    return (
        f"Below is an image of a document page, along with raw textual content previously extracted using different models.\n"
        f"Your goal is to carefully identify the differences between the extracted texts from both models and determine which one is more accurate by comparing them with the image.\n"
        f"Only return the differences and specify which model extracted the text with higher accuracy.\n"
        f"Do not hallucinate.\n"
        f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
    )

View File

@ -0,0 +1,68 @@
import json
from typing import Literal
import httpx
from olmocr.bench.prompts import build_basic_prompt, build_rolmocr_prompt
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts.anchor import get_anchor_text
from olmocr.prompts.prompts import (
PageResponse,
build_finetuning_prompt,
build_openai_silver_data_prompt,
)
async def run_rolmocr(
    pdf_path: str,
    page_num: int = 1,
    server: str = "localhost:30000",
    model: str = "reducto/RolmOCR",
    temperature: float = 0.2,
    target_longest_image_dim: int = 1024,
) -> str:
    """Run RolmOCR on one PDF page via an OpenAI-compatible chat-completions server.

    Bug fix: the function was named ``run_rolmcr`` (typo), but the benchmark
    runner registry resolves ``"rolmocr"`` to the attribute ``run_rolmocr``,
    so the dynamic lookup would fail. Renamed to match the registry.

    Args:
        pdf_path: Path to the PDF to process.
        page_num: 1-based page number to render and OCR.
        server: host:port of the serving endpoint (vLLM/SGLang style).
        model: Model identifier passed through to the server.
        temperature: Sampling temperature for generation.
        target_longest_image_dim: Longest-side pixel target for the rendered page.

    Returns:
        str: The OCR result in plain-text/markdown form.

    Raises:
        httpx.HTTPStatusError: If the server responds with an error status.
        AssertionError: If generation did not finish with ``finish_reason == "stop"``.
    """
    # Render the requested page to a base64-encoded PNG for the image_url part.
    image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=target_longest_image_dim)
    request = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                    },
                    {
                        "type": "text",
                        "text": "Return the plain text representation of this document as if you were reading it naturally.\n",
                    },
                ],
            }
        ],
        "temperature": temperature,
        "max_tokens": 4096,
    }
    # POST to the OpenAI-compatible endpoint; generous timeout for large pages.
    url = f"http://{server}/v1/chat/completions"
    async with httpx.AsyncClient(timeout=300) as client:
        response = await client.post(url, json=request)
        response.raise_for_status()
        data = response.json()
        choice = data["choices"][0]
        # A truncated (length-capped) generation would silently yield bad data.
        assert (
            choice["finish_reason"] == "stop"
        ), "Response from server did not finish with finish_reason stop as expected, this is probably going to lead to bad data"
        return choice["message"]["content"]

View File

@ -3,6 +3,7 @@ from typing import Literal
import httpx
from olmocr.bench.prompts import build_basic_prompt
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts.anchor import get_anchor_text
from olmocr.prompts.prompts import (
@ -43,7 +44,9 @@ async def run_server(
elif prompt_template == "finetune":
prompt = build_finetuning_prompt(anchor_text)
elif prompt_template == "basic":
prompt = "Just return the plain text representation of this document as if you were reading it naturally."
prompt = build_basic_prompt()
elif prompt_template == "rolmocr":
prompt = build_rolmocr_prompt()
else:
raise ValueError("Unknown prompt template")

View File

@ -181,7 +181,7 @@ python -m olmocr.bench.convert gemini:name=gemini_flash2:model=gemini-2.0-flash
echo "Running mistral..."
pip install mistralai
python -m olmocr.bench.convert --dir olmOCR-bench/bench_data mistral
python -m olmocr.bench.convert --dir olmOCR-bench/bench_data --parallel 4 mistral
# Run raw server benchmarks with generic server function
# For each model, start server, run benchmark, then stop server
@ -219,9 +219,15 @@ check_port || exit 1
# stop_server
# qwen2.5 works best with vllm for now, in a fresh environment
# start_server vllm "Qwen/Qwen2.5-VL-7B-Instruct" --max-model-len 8192
# python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=qwen25vl_prompt3:model=Qwen/Qwen2.5-VL-7B-Instruct:temperature=0.1:prompt_template=basic:response_template=plain --parallel 50
# stop_server
source activate vllm
start_server vllm "Qwen/Qwen2.5-VL-7B-Instruct" --max-model-len 8192
python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=qwen25vl_prompt3:model=Qwen/Qwen2.5-VL-7B-Instruct:temperature=0.1:prompt_template=basic:response_template=plain --parallel 50
stop_server
start_server vllm "reducto/RolmOCR" --max-model-len 8192
python -m olmocr.bench.convert --dir olmOCR-bench/bench_data rolmocr --parallel 50
stop_server
# TODO: Fix this, I was not able to get it to all install successfully
# Create and activate mineru environment

View File

@ -1,13 +1,22 @@
import os
from openai import OpenAI
from prompts import build_find_difference_prompt
from runners.run_chatgpt import run_chatgpt
from runners.run_gemini import run_gemini
from olmocr.data.renderpdf import render_pdf_to_base64png
def build_find_difference_prompt(base_text: str) -> str:
    """Build a prompt asking a judge model to diff two extracted texts against the page image.

    Args:
        base_text: The previously extracted text(s), embedded verbatim between
            RAW_TEXT_START / RAW_TEXT_END markers.

    Returns:
        str: The assembled prompt.
    """
    # Bug fix: the first two f-string fragments had no trailing separator, so
    # sentences ran together ("...different models.Your goal..."). Each sentence
    # now ends with "\n", matching the style of the remaining lines.
    return (
        f"Below is an image of a document page, along with raw textual content previously extracted using different models.\n"
        f"Your goal is to carefully identify the differences between the extracted texts from both models and determine which one is more accurate by comparing them with the image.\n"
        f"Only return the differences and specify which model extracted the text with higher accuracy.\n"
        f"Do not hallucinate.\n"
        f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
    )
def combined_output(pdf_path: str) -> str:
chatgpt_output = run_chatgpt(pdf_path)
gemini_output = run_gemini(pdf_path)