mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-02 02:54:53 +00:00
Merge branch 'main' of https://github.com/allenai/olmocr
This commit is contained in:
commit
500dedc11c
@ -229,6 +229,7 @@ if __name__ == "__main__":
|
||||
"gemini": ("olmocr.bench.runners.run_gemini", "run_gemini"),
|
||||
"mistral": ("olmocr.bench.runners.run_mistral", "run_mistral"),
|
||||
"docling": ("olmocr.bench.runners.run_docling", "run_docling"),
|
||||
"rolmocr": ("olmocr.bench.runners.run_rolmocr", "run_rolmocr"),
|
||||
"transformers": ("olmocr.bench.runners.run_transformers", "run_transformers"),
|
||||
"server": ("olmocr.bench.runners.run_server", "run_server"),
|
||||
}
|
||||
|
||||
@ -1,3 +1,7 @@
|
||||
def build_basic_prompt() -> str:
    """Return the generic prompt asking a model to transcribe a page as markdown.

    The prompt also instructs the model to wrap inline math in
    backslash-parentheses and all other math in backslash-brackets.

    Returns:
        The prompt text.
    """
    # Backslashes are escaped explicitly: a bare backslash followed by "(" or
    # "[" is an invalid escape sequence in a normal string literal and triggers
    # a SyntaxWarning on recent Python versions. The resulting text is the same.
    return (
        "Just return the markdown representation of this document as if you were reading it naturally. "
        "Convert equations to markdown using \\( \\) for inline math, and \\[ \\] otherwise."
    )
|
||||
|
||||
|
||||
def claude_response_format_schema() -> dict:
|
||||
return (
|
||||
{
|
||||
@ -44,41 +48,3 @@ def claude_response_format_schema() -> dict:
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def gemini_response_format_schema() -> dict:
    """Build the structured-response schema describing a single page result.

    The schema is an ``OBJECT`` with six properties (language, rotation
    validity, rotation correction, table flag, diagram flag, and the extracted
    text). Every property is listed as required, and ``propertyOrdering``
    repeats the same names in the same order.

    Returns:
        A new schema dictionary on every call.
    """
    # Single source of truth for both "required" and "propertyOrdering".
    field_order = [
        "primary_language",
        "is_rotation_valid",
        "rotation_correction",
        "is_table",
        "is_diagram",
        "natural_text",
    ]

    properties = {
        "primary_language": {
            "type": "STRING",
            "description": "The primary language of the text using two-letter codes or null if there is no text at all that you think you should read.",
        },
        "is_rotation_valid": {
            "type": "BOOL",
            "description": "Is this page oriented correctly for reading? Answer only considering the textual content, do not factor in the rotation of any charts, tables, drawings, or figures.",
        },
        "rotation_correction": {
            "type": "INTEGER",
            "enum": [0, 90, 180, 270],
            "description": "Indicates the degree of clockwise rotation needed if the page is not oriented correctly.",
        },
        "is_table": {
            "type": "BOOL",
            "description": "Indicates if the majority of the page content is in tabular format.",
        },
        "is_diagram": {
            "type": "BOOL",
            "description": "Indicates if the majority of the page content is a visual diagram.",
        },
        "natural_text": {
            "type": "STRING",
            "description": "The natural text content extracted from the page.",
        },
    }

    schema = {
        "type": "OBJECT",
        "properties": properties,
        # Separate list objects, as in a literal schema, so callers mutating
        # one list do not affect the other.
        "required": list(field_order),
        "propertyOrdering": list(field_order),
    }
    return schema
|
||||
|
||||
|
||||
def build_find_difference_prompt(base_text: str) -> str:
    """Build the prompt asking a model to compare extracted texts against a page image.

    Args:
        base_text: Previously extracted raw text. It is embedded verbatim
            between the ``RAW_TEXT_START`` and ``RAW_TEXT_END`` markers.

    Returns:
        The complete prompt string.
    """
    # Adjacent string literals are concatenated with no separator, so every
    # sentence carries its own trailing space or newline; without it the
    # sentences run together (e.g. "...models.Your goal...").
    return (
        "Below is an image of a document page, along with raw textual content previously extracted using different models. "
        "Your goal is to carefully identify the differences between the extracted texts from both models and determine which one is more accurate by comparing them with the image. "
        "Only return the differences and specify which model extracted the text with higher accuracy.\n"
        "Do not hallucinate.\n"
        f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
    )
|
||||
|
||||
68
olmocr/bench/runners/run_rolmocr.py
Normal file
68
olmocr/bench/runners/run_rolmocr.py
Normal file
@ -0,0 +1,68 @@
|
||||
import json
|
||||
from typing import Literal
|
||||
|
||||
import httpx
|
||||
|
||||
from olmocr.bench.prompts import build_basic_prompt, build_rolmocr_prompt
|
||||
from olmocr.data.renderpdf import render_pdf_to_base64png
|
||||
from olmocr.prompts.anchor import get_anchor_text
|
||||
from olmocr.prompts.prompts import (
|
||||
PageResponse,
|
||||
build_finetuning_prompt,
|
||||
build_openai_silver_data_prompt,
|
||||
)
|
||||
|
||||
|
||||
async def run_rolmocr(
    pdf_path: str,
    page_num: int = 1,
    server: str = "localhost:30000",
    model: str = "reducto/RolmOCR",
    temperature: float = 0.2,
    target_longest_image_dim: int = 1024,
) -> str:
    """Render one PDF page and ask a chat-completions server to transcribe it.

    The page is rendered to a base64 PNG, sent together with a fixed text
    instruction to ``http://{server}/v1/chat/completions``, and the content of
    the first returned choice is handed back.

    Args:
        pdf_path: Path of the PDF, passed to ``render_pdf_to_base64png``.
        page_num: Page to render, passed to ``render_pdf_to_base64png``.
        server: ``host:port`` of the server; inserted into the request URL.
        model: Model name placed in the request body.
        temperature: Sampling temperature placed in the request body.
        target_longest_image_dim: Rendering size hint passed to
            ``render_pdf_to_base64png``.

    Returns:
        str: The message content of the first choice in the server response.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status.
        AssertionError: If the first choice did not finish with
            ``finish_reason == "stop"``.
    """
    # Render the requested page of the PDF to a base64-encoded PNG image.
    image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=target_longest_image_dim)

    request = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                    },
                    {
                        "type": "text",
                        "text": "Return the plain text representation of this document as if you were reading it naturally.\n",
                    },
                ],
            }
        ],
        "temperature": temperature,
        "max_tokens": 4096,
    }

    # Make request and get response using httpx
    url = f"http://{server}/v1/chat/completions"

    async with httpx.AsyncClient(timeout=300) as client:
        response = await client.post(url, json=request)

    response.raise_for_status()
    data = response.json()

    choice = data["choices"][0]
    # Raised explicitly instead of via an `assert` statement so the check is
    # not stripped when Python runs with -O; the exception type is unchanged.
    if choice["finish_reason"] != "stop":
        raise AssertionError(
            "Response from server did not finish with finish_reason stop as expected, this is probably going to lead to bad data"
        )

    return choice["message"]["content"]


# The runner registry refers to this function as "run_rolmocr"; the original
# spelling is kept as an alias so existing references keep working.
run_rolmcr = run_rolmocr
|
||||
@ -3,6 +3,7 @@ from typing import Literal
|
||||
|
||||
import httpx
|
||||
|
||||
from olmocr.bench.prompts import build_basic_prompt
|
||||
from olmocr.data.renderpdf import render_pdf_to_base64png
|
||||
from olmocr.prompts.anchor import get_anchor_text
|
||||
from olmocr.prompts.prompts import (
|
||||
@ -43,7 +44,9 @@ async def run_server(
|
||||
elif prompt_template == "finetune":
|
||||
prompt = build_finetuning_prompt(anchor_text)
|
||||
elif prompt_template == "basic":
|
||||
prompt = "Just return the plain text representation of this document as if you were reading it naturally."
|
||||
prompt = build_basic_prompt()
|
||||
elif prompt_template == "rolmocr":
|
||||
prompt = build_rolmocr_prompt()
|
||||
else:
|
||||
raise ValueError("Unknown prompt template")
|
||||
|
||||
|
||||
@ -181,7 +181,7 @@ python -m olmocr.bench.convert gemini:name=gemini_flash2:model=gemini-2.0-flash
|
||||
|
||||
echo "Running mistral..."
|
||||
pip install mistralai
|
||||
python -m olmocr.bench.convert --dir olmOCR-bench/bench_data mistral
|
||||
python -m olmocr.bench.convert --dir olmOCR-bench/bench_data --parallel 4 mistral
|
||||
|
||||
# Run raw server benchmarks with generic server function
|
||||
# For each model, start server, run benchmark, then stop server
|
||||
@ -219,9 +219,15 @@ check_port || exit 1
|
||||
# stop_server
|
||||
|
||||
# qwen2.5 works best with vllm for now, in a fresh environment
|
||||
# start_server vllm "Qwen/Qwen2.5-VL-7B-Instruct" --max-model-len 8192
|
||||
# python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=qwen25vl_prompt3:model=Qwen/Qwen2.5-VL-7B-Instruct:temperature=0.1:prompt_template=basic:response_template=plain --parallel 50
|
||||
# stop_server
|
||||
source activate vllm
|
||||
|
||||
start_server vllm "Qwen/Qwen2.5-VL-7B-Instruct" --max-model-len 8192
|
||||
python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=qwen25vl_prompt3:model=Qwen/Qwen2.5-VL-7B-Instruct:temperature=0.1:prompt_template=basic:response_template=plain --parallel 50
|
||||
stop_server
|
||||
|
||||
start_server vllm "reducto/RolmOCR" --max-model-len 8192
|
||||
python -m olmocr.bench.convert --dir olmOCR-bench/bench_data rolmocr --parallel 50
|
||||
stop_server
|
||||
|
||||
# TODO: Fix this; I was not able to get everything to install successfully
|
||||
# Create and activate mineru environment
|
||||
|
||||
@ -1,13 +1,22 @@
|
||||
import os
|
||||
|
||||
from openai import OpenAI
|
||||
from prompts import build_find_difference_prompt
|
||||
from runners.run_chatgpt import run_chatgpt
|
||||
from runners.run_gemini import run_gemini
|
||||
|
||||
from olmocr.data.renderpdf import render_pdf_to_base64png
|
||||
|
||||
|
||||
def build_find_difference_prompt(base_text: str) -> str:
    """Build the prompt asking a model to compare extracted texts against a page image.

    Args:
        base_text: Previously extracted raw text. It is embedded verbatim
            between the ``RAW_TEXT_START`` and ``RAW_TEXT_END`` markers.

    Returns:
        The complete prompt string.
    """
    # Adjacent string literals are concatenated with no separator, so every
    # sentence carries its own trailing space or newline; without it the
    # sentences run together (e.g. "...models.Your goal...").
    return (
        "Below is an image of a document page, along with raw textual content previously extracted using different models. "
        "Your goal is to carefully identify the differences between the extracted texts from both models and determine which one is more accurate by comparing them with the image. "
        "Only return the differences and specify which model extracted the text with higher accuracy.\n"
        "Do not hallucinate.\n"
        f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
    )
|
||||
|
||||
|
||||
def combined_output(pdf_path: str) -> str:
|
||||
chatgpt_output = run_chatgpt(pdf_path)
|
||||
gemini_output = run_gemini(pdf_path)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user