Merge branch 'main' of https://github.com/allenai/olmocr

2025-11-25 14:52:56 +00:00 · 2025-04-07 21:39:57 +00:00 · 2025-04-07 21:39:57 +00:00 · 500dedc11c
commit 500dedc11c
parent f0d18e8b80 ae4fda7429
6 changed files with 97 additions and 44 deletions
--- a/olmocr/bench/convert.py
+++ b/olmocr/bench/convert.py
@ -229,6 +229,7 @@ if __name__ == "__main__":
        "gemini": ("olmocr.bench.runners.run_gemini", "run_gemini"),
        "mistral": ("olmocr.bench.runners.run_mistral", "run_mistral"),
        "docling": ("olmocr.bench.runners.run_docling", "run_docling"),
        "rolmocr": ("olmocr.bench.runners.run_rolmocr", "run_rolmocr"),
        "transformers": ("olmocr.bench.runners.run_transformers", "run_transformers"),
        "server": ("olmocr.bench.runners.run_server", "run_server"),
    }
--- a/olmocr/bench/prompts.py
+++ b/olmocr/bench/prompts.py
@ -1,3 +1,7 @@
 def build_basic_prompt() -> str:
    """Return the fixed instruction asking for a markdown rendering of a document.

    The text also tells the model which delimiters to use for inline and
    display equations. The backslashes in those delimiters are written as
    explicit escapes so the literal does not rely on invalid escape
    sequences; the returned text is unchanged.
    """
    return (
        "Just return the markdown representation of this document as if you were reading it naturally. "
        "Convert equations to markdown using \\( \\) for inline math, and \\[ \\] otherwise."
    )
 def claude_response_format_schema() -> dict:
    return (
        {
@ -44,41 +48,3 @@ def claude_response_format_schema() -> dict:
            },
        },
    )
 def gemini_response_format_schema() -> dict:
    """Return the Gemini response-format schema for a single page.

    NOTE(review): despite the ``dict`` annotation, the value returned is a
    one-element tuple wrapping the schema dict. That shape is kept as-is
    here; confirm against callers before changing it.
    """
    # Field definitions, declared in the order they should appear in the output.
    properties = {
        "primary_language": {
            "type": "STRING",
            "description": "The primary language of the text using two-letter codes or null if there is no text at all that you think you should read.",
        },
        "is_rotation_valid": {
            "type": "BOOL",
            "description": "Is this page oriented correctly for reading? Answer only considering the textual content, do not factor in the rotation of any charts, tables, drawings, or figures.",
        },
        "rotation_correction": {
            "type": "INTEGER",
            "enum": [0, 90, 180, 270],
            "description": "Indicates the degree of clockwise rotation needed if the page is not oriented correctly.",
        },
        "is_table": {
            "type": "BOOL",
            "description": "Indicates if the majority of the page content is in tabular format.",
        },
        "is_diagram": {
            "type": "BOOL",
            "description": "Indicates if the majority of the page content is a visual diagram.",
        },
        "natural_text": {
            "type": "STRING",
            "description": "The natural text content extracted from the page.",
        },
    }
    # Every declared field is both required and ordered as declared above.
    field_names = list(properties)
    schema = {
        "type": "OBJECT",
        "properties": properties,
        "required": list(field_names),
        "propertyOrdering": list(field_names),
    }
    return (schema,)
 def build_find_difference_prompt(base_text: str) -> str:
    """Build the prompt asking a model to compare previously extracted texts against a page image.

    Args:
        base_text: The raw extracted text to embed; it is placed verbatim
            between the RAW_TEXT_START and RAW_TEXT_END markers.

    Returns:
        The full prompt string.
    """
    # Adjacent string literals are concatenated with no separator, so each
    # sentence on its own line must end with an explicit space or newline.
    instructions = (
        "Below is an image of a document page, along with raw textual content previously extracted using different models. "
        "Your goal is to carefully identify the differences between the extracted texts from both models and determine which one is more accurate by comparing them with the image. "
        "Only return the differences and specify which model extracted the text with higher accuracy.\n"
        "Do not hallucinate.\n"
    )
    return f"{instructions}RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
--- a/olmocr/bench/runners/run_rolmocr.py
+++ b/olmocr/bench/runners/run_rolmocr.py
@ -0,0 +1,68 @@
 import json
 from typing import Literal
 import httpx
 from olmocr.bench.prompts import build_basic_prompt, build_rolmocr_prompt
 from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.prompts.anchor import get_anchor_text
 from olmocr.prompts.prompts import (
    PageResponse,
    build_finetuning_prompt,
    build_openai_silver_data_prompt,
 )
 async def run_rolmocr(
    pdf_path: str,
    page_num: int = 1,
    server: str = "localhost:30000",
    model: str = "reducto/RolmOCR",
    temperature: float = 0.2,
    target_longest_image_dim: int = 1024,
 ) -> str:
    """
    Render one PDF page to a PNG and ask a chat-completions server to transcribe it.

    Args:
        pdf_path: Path of the PDF to read.
        page_num: Page to render; passed through to render_pdf_to_base64png.
        server: host:port of the server; the request is POSTed to
            http://{server}/v1/chat/completions.
        model: Model name sent in the request body.
        temperature: Sampling temperature sent in the request body.
        target_longest_image_dim: Passed through to render_pdf_to_base64png.

    Returns:
        str: The message content of the first choice in the server response.

    Raises:
        httpx.HTTPStatusError: If the server responds with an error status.
        AssertionError: If the first choice's finish_reason is not "stop".
    """
    # Render the requested page of the PDF to a base64-encoded PNG image.
    image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=target_longest_image_dim)
    request = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                    },
                    {
                        "type": "text",
                        "text": "Return the plain text representation of this document as if you were reading it naturally.\n",
                    },
                ],
            }
        ],
        "temperature": temperature,
        "max_tokens": 4096,
    }
    # Make request and get response using httpx
    url = f"http://{server}/v1/chat/completions"
    async with httpx.AsyncClient(timeout=300) as client:
        response = await client.post(url, json=request)
        response.raise_for_status()
        data = response.json()
        choice = data["choices"][0]
        # Raised explicitly rather than via `assert` so the check is not
        # stripped under `python -O`; the exception type is kept as
        # AssertionError so callers see the same failure as before.
        if choice["finish_reason"] != "stop":
            raise AssertionError(
                "Response from server did not finish with finish_reason stop as expected, this is probably going to lead to bad data"
            )
        return choice["message"]["content"]
 # The runner registry in olmocr/bench/convert.py looks this function up as
 # "run_rolmocr"; the original misspelled name is kept as an alias.
 run_rolmcr = run_rolmocr
--- a/olmocr/bench/runners/run_server.py
+++ b/olmocr/bench/runners/run_server.py
@ -3,6 +3,7 @@ from typing import Literal
 import httpx
 from olmocr.bench.prompts import build_basic_prompt
 from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.prompts.anchor import get_anchor_text
 from olmocr.prompts.prompts import (
@ -43,7 +44,9 @@ async def run_server(
    elif prompt_template == "finetune":
        prompt = build_finetuning_prompt(anchor_text)
    elif prompt_template == "basic":
-        prompt = "Just return the plain text representation of this document as if you were reading it naturally."
+        prompt = build_basic_prompt()
    elif prompt_template == "rolmocr":
        prompt = build_rolmocr_prompt()
    else:
        raise ValueError("Unknown prompt template")
--- a/olmocr/bench/scripts/convert_all.sh
+++ b/olmocr/bench/scripts/convert_all.sh
@ -181,7 +181,7 @@ python -m olmocr.bench.convert gemini:name=gemini_flash2:model=gemini-2.0-flash
 echo "Running mistral..."
 pip install mistralai
-python -m olmocr.bench.convert --dir olmOCR-bench/bench_data mistral 
+python -m olmocr.bench.convert --dir olmOCR-bench/bench_data --parallel 4 mistral 
 # Run raw server benchmarks with generic server function
 # For each model, start server, run benchmark, then stop server
@ -219,9 +219,15 @@ check_port || exit 1
 # stop_server
 # qwen2.5 works best with vllm for now, in a fresh environment
-# start_server vllm "Qwen/Qwen2.5-VL-7B-Instruct" --max-model-len 8192
+source activate vllm
-# python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=qwen25vl_prompt3:model=Qwen/Qwen2.5-VL-7B-Instruct:temperature=0.1:prompt_template=basic:response_template=plain --parallel 50
+
-# stop_server
+start_server vllm "Qwen/Qwen2.5-VL-7B-Instruct" --max-model-len 8192
 python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=qwen25vl_prompt3:model=Qwen/Qwen2.5-VL-7B-Instruct:temperature=0.1:prompt_template=basic:response_template=plain --parallel 50
 stop_server
 start_server vllm "reducto/RolmOCR" --max-model-len 8192
 python -m olmocr.bench.convert --dir olmOCR-bench/bench_data rolmocr --parallel 50
 stop_server
 # TODO: Fix this, I was not able to get it to all install successfully
 # Create and activate mineru environment
--- a/olmocr/bench/scripts/run_difference.py
+++ b/olmocr/bench/scripts/run_difference.py
@ -1,13 +1,22 @@
 import os
 from openai import OpenAI
 from prompts import build_find_difference_prompt
 from runners.run_chatgpt import run_chatgpt
 from runners.run_gemini import run_gemini
 from olmocr.data.renderpdf import render_pdf_to_base64png
 def build_find_difference_prompt(base_text: str) -> str:
    """Build the prompt asking a model to compare previously extracted texts against a page image.

    Args:
        base_text: The raw extracted text to embed; it is placed verbatim
            between the RAW_TEXT_START and RAW_TEXT_END markers.

    Returns:
        The full prompt string.
    """
    # Adjacent string literals are concatenated with no separator, so each
    # sentence on its own line must end with an explicit space or newline.
    instructions = (
        "Below is an image of a document page, along with raw textual content previously extracted using different models. "
        "Your goal is to carefully identify the differences between the extracted texts from both models and determine which one is more accurate by comparing them with the image. "
        "Only return the differences and specify which model extracted the text with higher accuracy.\n"
        "Do not hallucinate.\n"
    )
    return f"{instructions}RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
 def combined_output(pdf_path: str) -> str:
    chatgpt_output = run_chatgpt(pdf_path)
    gemini_output = run_gemini(pdf_path)