mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-25 14:52:56 +00:00
Merge branch 'main' of https://github.com/allenai/olmocr
This commit is contained in:
commit
500dedc11c
@ -229,6 +229,7 @@ if __name__ == "__main__":
|
|||||||
"gemini": ("olmocr.bench.runners.run_gemini", "run_gemini"),
|
"gemini": ("olmocr.bench.runners.run_gemini", "run_gemini"),
|
||||||
"mistral": ("olmocr.bench.runners.run_mistral", "run_mistral"),
|
"mistral": ("olmocr.bench.runners.run_mistral", "run_mistral"),
|
||||||
"docling": ("olmocr.bench.runners.run_docling", "run_docling"),
|
"docling": ("olmocr.bench.runners.run_docling", "run_docling"),
|
||||||
|
"rolmocr": ("olmocr.bench.runners.run_rolmocr", "run_rolmocr"),
|
||||||
"transformers": ("olmocr.bench.runners.run_transformers", "run_transformers"),
|
"transformers": ("olmocr.bench.runners.run_transformers", "run_transformers"),
|
||||||
"server": ("olmocr.bench.runners.run_server", "run_server"),
|
"server": ("olmocr.bench.runners.run_server", "run_server"),
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,3 +1,7 @@
|
|||||||
|
def build_basic_prompt() -> str:
    """Return the generic instruction prompt used for plain OCR requests.

    The prompt asks the model for a markdown rendering of the page and for
    equations delimited with backslash-parenthesis (inline) or
    backslash-bracket (display) markers.

    Returns:
        str: The fixed prompt text.
    """
    # The backslashes are escaped explicitly. Writing a bare backslash before
    # "(" or "[" in a normal string literal is an invalid escape sequence,
    # which Python warns about at compile time; the runtime text is unchanged.
    return (
        "Just return the markdown representation of this document as if you were reading it naturally. "
        "Convert equations to markdown using \\( \\) for inline math, and \\[ \\] otherwise."
    )
|
||||||
|
|
||||||
|
|
||||||
def claude_response_format_schema() -> dict:
|
def claude_response_format_schema() -> dict:
|
||||||
return (
|
return (
|
||||||
{
|
{
|
||||||
@ -44,41 +48,3 @@ def claude_response_format_schema() -> dict:
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def gemini_response_format_schema() -> dict:
    """Build the structured-output schema describing a single OCR'd page.

    The schema declares six required fields (language, rotation validity,
    rotation correction, table flag, diagram flag and the extracted text) and
    fixes their ordering.

    NOTE(review): the schema dict is returned wrapped in a one-element tuple,
    not as a bare dict as the annotation suggests. That shape is kept as-is
    here; confirm what callers expect before changing it.
    """
    field_order = [
        "primary_language",
        "is_rotation_valid",
        "rotation_correction",
        "is_table",
        "is_diagram",
        "natural_text",
    ]

    properties = {}
    properties["primary_language"] = {
        "type": "STRING",
        "description": "The primary language of the text using two-letter codes or null if there is no text at all that you think you should read.",
    }
    properties["is_rotation_valid"] = {
        "type": "BOOL",
        "description": "Is this page oriented correctly for reading? Answer only considering the textual content, do not factor in the rotation of any charts, tables, drawings, or figures.",
    }
    properties["rotation_correction"] = {
        "type": "INTEGER",
        "enum": [0, 90, 180, 270],
        "description": "Indicates the degree of clockwise rotation needed if the page is not oriented correctly.",
    }
    properties["is_table"] = {
        "type": "BOOL",
        "description": "Indicates if the majority of the page content is in tabular format.",
    }
    properties["is_diagram"] = {
        "type": "BOOL",
        "description": "Indicates if the majority of the page content is a visual diagram.",
    }
    properties["natural_text"] = {
        "type": "STRING",
        "description": "The natural text content extracted from the page.",
    }

    schema = {
        "type": "OBJECT",
        "properties": properties,
        # Separate list objects, so mutating one does not affect the other.
        "required": list(field_order),
        "propertyOrdering": list(field_order),
    }
    return (schema,)
|
|
||||||
|
|
||||||
|
|
||||||
def build_find_difference_prompt(base_text: str) -> str:
    """Build the prompt asking a model to compare previously extracted texts.

    Args:
        base_text: Raw text to embed between the RAW_TEXT_START and
            RAW_TEXT_END markers.

    Returns:
        str: The instruction text followed by the delimited raw text.
    """
    # Adjacent string literals are concatenated with nothing in between, so
    # the first two sentences carry an explicit trailing space; without it
    # they run together ("models.Your goal", "image.Only return").
    return (
        "Below is an image of a document page, along with raw textual content previously extracted using different models. "
        "Your goal is to carefully identify the differences between the extracted texts from both models and determine which one is more accurate by comparing them with the image. "
        "Only return the differences and specify which model extracted the text with higher accuracy.\n"
        "Do not hallucinate.\n"
        f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
    )
|
|
||||||
|
|||||||
68
olmocr/bench/runners/run_rolmocr.py
Normal file
68
olmocr/bench/runners/run_rolmocr.py
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
import json
|
||||||
|
from typing import Literal
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from olmocr.bench.prompts import build_basic_prompt, build_rolmocr_prompt
|
||||||
|
from olmocr.data.renderpdf import render_pdf_to_base64png
|
||||||
|
from olmocr.prompts.anchor import get_anchor_text
|
||||||
|
from olmocr.prompts.prompts import (
|
||||||
|
PageResponse,
|
||||||
|
build_finetuning_prompt,
|
||||||
|
build_openai_silver_data_prompt,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def run_rolmocr(
    pdf_path: str,
    page_num: int = 1,
    server: str = "localhost:30000",
    model: str = "reducto/RolmOCR",
    temperature: float = 0.2,
    target_longest_image_dim: int = 1024,
) -> str:
    """Run one PDF page through a chat-completions server and return its text.

    The page is rendered to a base64 PNG, sent together with a fixed
    plain-text instruction to ``http://{server}/v1/chat/completions``, and the
    message content of the first returned choice is handed back.

    Args:
        pdf_path: Path of the PDF to render.
        page_num: Page number passed to the renderer.
        server: ``host:port`` of the server to query.
        model: Model name placed in the request.
        temperature: Sampling temperature placed in the request.
        target_longest_image_dim: Longest image dimension passed to the renderer.

    Returns:
        str: The ``message.content`` of the first choice in the response.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status.
        AssertionError: If the first choice did not finish with
            ``finish_reason == "stop"``.
    """
    # Render the requested page of the PDF to a base64-encoded PNG image.
    image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=target_longest_image_dim)

    request = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                    },
                    {
                        "type": "text",
                        "text": "Return the plain text representation of this document as if you were reading it naturally.\n",
                    },
                ],
            }
        ],
        "temperature": temperature,
        "max_tokens": 4096,
    }

    # Make request and get response using httpx
    url = f"http://{server}/v1/chat/completions"

    async with httpx.AsyncClient(timeout=300) as client:
        response = await client.post(url, json=request)

        response.raise_for_status()
        data = response.json()

    choice = data["choices"][0]

    # Raised explicitly instead of using an ``assert`` statement so the check
    # is not stripped when Python runs with -O; the exception type is the same.
    if choice["finish_reason"] != "stop":
        raise AssertionError(
            "Response from server did not finish with finish_reason stop as expected, this is probably going to lead to bad data"
        )

    return choice["message"]["content"]


# Backward-compatible alias: the function was originally defined under this
# misspelled name, while the runner registry refers to ``run_rolmocr``.
run_rolmcr = run_rolmocr
|
||||||
@ -3,6 +3,7 @@ from typing import Literal
|
|||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
|
from olmocr.bench.prompts import build_basic_prompt
|
||||||
from olmocr.data.renderpdf import render_pdf_to_base64png
|
from olmocr.data.renderpdf import render_pdf_to_base64png
|
||||||
from olmocr.prompts.anchor import get_anchor_text
|
from olmocr.prompts.anchor import get_anchor_text
|
||||||
from olmocr.prompts.prompts import (
|
from olmocr.prompts.prompts import (
|
||||||
@ -43,7 +44,9 @@ async def run_server(
|
|||||||
elif prompt_template == "finetune":
|
elif prompt_template == "finetune":
|
||||||
prompt = build_finetuning_prompt(anchor_text)
|
prompt = build_finetuning_prompt(anchor_text)
|
||||||
elif prompt_template == "basic":
|
elif prompt_template == "basic":
|
||||||
prompt = "Just return the plain text representation of this document as if you were reading it naturally."
|
prompt = build_basic_prompt()
|
||||||
|
elif prompt_template == "rolmocr":
|
||||||
|
prompt = build_rolmocr_prompt()
|
||||||
else:
|
else:
|
||||||
raise ValueError("Unknown prompt template")
|
raise ValueError("Unknown prompt template")
|
||||||
|
|
||||||
|
|||||||
@ -181,7 +181,7 @@ python -m olmocr.bench.convert gemini:name=gemini_flash2:model=gemini-2.0-flash
|
|||||||
|
|
||||||
echo "Running mistral..."
|
echo "Running mistral..."
|
||||||
pip install mistralai
|
pip install mistralai
|
||||||
python -m olmocr.bench.convert --dir olmOCR-bench/bench_data mistral
|
python -m olmocr.bench.convert --dir olmOCR-bench/bench_data --parallel 4 mistral
|
||||||
|
|
||||||
# Run raw server benchmarks with generic server function
|
# Run raw server benchmarks with generic server function
|
||||||
# For each model, start server, run benchmark, then stop server
|
# For each model, start server, run benchmark, then stop server
|
||||||
@ -219,9 +219,15 @@ check_port || exit 1
|
|||||||
# stop_server
|
# stop_server
|
||||||
|
|
||||||
# qwen2.5 works best with vllm for now, in a fresh environment
|
# qwen2.5 works best with vllm for now, in a fresh environment
|
||||||
# start_server vllm "Qwen/Qwen2.5-VL-7B-Instruct" --max-model-len 8192
|
source activate vllm
|
||||||
# python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=qwen25vl_prompt3:model=Qwen/Qwen2.5-VL-7B-Instruct:temperature=0.1:prompt_template=basic:response_template=plain --parallel 50
|
|
||||||
# stop_server
|
start_server vllm "Qwen/Qwen2.5-VL-7B-Instruct" --max-model-len 8192
|
||||||
|
python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=qwen25vl_prompt3:model=Qwen/Qwen2.5-VL-7B-Instruct:temperature=0.1:prompt_template=basic:response_template=plain --parallel 50
|
||||||
|
stop_server
|
||||||
|
|
||||||
|
start_server vllm "reducto/RolmOCR" --max-model-len 8192
|
||||||
|
python -m olmocr.bench.convert --dir olmOCR-bench/bench_data rolmocr --parallel 50
|
||||||
|
stop_server
|
||||||
|
|
||||||
# TODO: Fix this, I was not able to get it to all install successfully
|
# TODO: Fix this, I was not able to get it to all install successfully
|
||||||
# Create and activate mineru environment
|
# Create and activate mineru environment
|
||||||
|
|||||||
@ -1,13 +1,22 @@
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
from openai import OpenAI
|
from openai import OpenAI
|
||||||
from prompts import build_find_difference_prompt
|
|
||||||
from runners.run_chatgpt import run_chatgpt
|
from runners.run_chatgpt import run_chatgpt
|
||||||
from runners.run_gemini import run_gemini
|
from runners.run_gemini import run_gemini
|
||||||
|
|
||||||
from olmocr.data.renderpdf import render_pdf_to_base64png
|
from olmocr.data.renderpdf import render_pdf_to_base64png
|
||||||
|
|
||||||
|
|
||||||
|
def build_find_difference_prompt(base_text: str) -> str:
    """Build the prompt asking a model to compare previously extracted texts.

    Args:
        base_text: Raw text to embed between the RAW_TEXT_START and
            RAW_TEXT_END markers.

    Returns:
        str: The instruction text followed by the delimited raw text.
    """
    # Adjacent string literals are concatenated with nothing in between, so
    # the first two sentences carry an explicit trailing space; without it
    # they run together ("models.Your goal", "image.Only return").
    return (
        "Below is an image of a document page, along with raw textual content previously extracted using different models. "
        "Your goal is to carefully identify the differences between the extracted texts from both models and determine which one is more accurate by comparing them with the image. "
        "Only return the differences and specify which model extracted the text with higher accuracy.\n"
        "Do not hallucinate.\n"
        f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
    )
|
||||||
|
|
||||||
|
|
||||||
def combined_output(pdf_path: str) -> str:
|
def combined_output(pdf_path: str) -> str:
|
||||||
chatgpt_output = run_chatgpt(pdf_path)
|
chatgpt_output = run_chatgpt(pdf_path)
|
||||||
gemini_output = run_gemini(pdf_path)
|
gemini_output = run_gemini(pdf_path)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user