This commit is contained in:
Jake Poznanski 2025-04-07 21:39:57 +00:00
commit 500dedc11c
6 changed files with 97 additions and 44 deletions

View File

@ -229,6 +229,7 @@ if __name__ == "__main__":
"gemini": ("olmocr.bench.runners.run_gemini", "run_gemini"),
"mistral": ("olmocr.bench.runners.run_mistral", "run_mistral"),
"docling": ("olmocr.bench.runners.run_docling", "run_docling"),
"rolmocr": ("olmocr.bench.runners.run_rolmocr", "run_rolmocr"),
"transformers": ("olmocr.bench.runners.run_transformers", "run_transformers"),
"server": ("olmocr.bench.runners.run_server", "run_server"),
}

View File

@ -1,3 +1,7 @@
def build_basic_prompt() -> str:
    """Return the generic OCR prompt asking for a plain markdown transcription.

    Returns:
        str: Prompt text instructing the model to emit markdown, with LaTeX-style
        math delimiters (inline ``\\( \\)`` and display ``\\[ \\]``).
    """
    # Raw string: "\(" / "\[" are not valid Python escape sequences, so the
    # plain literal triggered a SyntaxWarning on modern CPython while producing
    # the same bytes. The raw prefix keeps the output identical and silences it.
    return r"Just return the markdown representation of this document as if you were reading it naturally. Convert equations to markdown using \( \) for inline math, and \[ \] otherwise."
def claude_response_format_schema() -> dict:
return (
{
@ -44,41 +48,3 @@ def claude_response_format_schema() -> dict:
},
},
)
def gemini_response_format_schema() -> dict:
    """Return the Gemini structured-output schema for a single OCR'd page.

    The schema uses Gemini's uppercase type names (``OBJECT``, ``STRING``,
    ``BOOL``, ``INTEGER``) and ``propertyOrdering``, which is Gemini-specific.

    Returns:
        dict: A JSON-schema-like mapping describing the expected response
        fields (language, rotation metadata, content-type flags, and the
        extracted natural text).
    """
    # Bug fix: a stray trailing comma previously made this `return ({...},)` —
    # a 1-tuple wrapping the schema — contradicting the `-> dict` annotation.
    # The dict is now returned directly.
    return {
        "type": "OBJECT",
        "properties": {
            "primary_language": {
                "type": "STRING",
                "description": "The primary language of the text using two-letter codes or null if there is no text at all that you think you should read.",
            },
            "is_rotation_valid": {
                "type": "BOOL",
                "description": "Is this page oriented correctly for reading? Answer only considering the textual content, do not factor in the rotation of any charts, tables, drawings, or figures.",
            },
            "rotation_correction": {
                "type": "INTEGER",
                "enum": [0, 90, 180, 270],
                "description": "Indicates the degree of clockwise rotation needed if the page is not oriented correctly.",
            },
            "is_table": {"type": "BOOL", "description": "Indicates if the majority of the page content is in tabular format."},
            "is_diagram": {"type": "BOOL", "description": "Indicates if the majority of the page content is a visual diagram."},
            "natural_text": {"type": "STRING", "description": "The natural text content extracted from the page."},
        },
        "required": ["primary_language", "is_rotation_valid", "rotation_correction", "is_table", "is_diagram", "natural_text"],
        "propertyOrdering": ["primary_language", "is_rotation_valid", "rotation_correction", "is_table", "is_diagram", "natural_text"],
    }
def build_find_difference_prompt(base_text: str) -> str:
    """Build a prompt asking a judge model to diff two extracted texts against the page image.

    Args:
        base_text: The previously extracted text(s), embedded verbatim between
            RAW_TEXT_START / RAW_TEXT_END markers.

    Returns:
        str: The assembled prompt.
    """
    # Bug fix: the first two f-string fragments had no trailing separator, so
    # sentences ran together ("...different models.Your goal..."). Each sentence
    # now ends with "\n", matching the style of the remaining lines.
    return (
        f"Below is an image of a document page, along with raw textual content previously extracted using different models.\n"
        f"Your goal is to carefully identify the differences between the extracted texts from both models and determine which one is more accurate by comparing them with the image.\n"
        f"Only return the differences and specify which model extracted the text with higher accuracy.\n"
        f"Do not hallucinate.\n"
        f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
    )

View File

@ -0,0 +1,68 @@
import json
from typing import Literal
import httpx
from olmocr.bench.prompts import build_basic_prompt, build_rolmocr_prompt
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts.anchor import get_anchor_text
from olmocr.prompts.prompts import (
PageResponse,
build_finetuning_prompt,
build_openai_silver_data_prompt,
)
async def run_rolmocr(
    pdf_path: str,
    page_num: int = 1,
    server: str = "localhost:30000",
    model: str = "reducto/RolmOCR",
    temperature: float = 0.2,
    target_longest_image_dim: int = 1024,
) -> str:
    """Run RolmOCR on one PDF page via an OpenAI-compatible chat-completions server.

    Bug fix: the function was named ``run_rolmcr`` (typo), but the benchmark
    runner registry resolves ``"rolmocr"`` to the attribute ``run_rolmocr``,
    so the dynamic lookup would fail. Renamed to match the registry.

    Args:
        pdf_path: Path to the PDF to process.
        page_num: 1-based page number to render and OCR.
        server: host:port of the serving endpoint (vLLM/SGLang style).
        model: Model identifier passed through to the server.
        temperature: Sampling temperature for generation.
        target_longest_image_dim: Longest-side pixel target for the rendered page.

    Returns:
        str: The OCR result in plain-text/markdown form.

    Raises:
        httpx.HTTPStatusError: If the server responds with an error status.
        AssertionError: If generation did not finish with ``finish_reason == "stop"``.
    """
    # Render the requested page to a base64-encoded PNG for the image_url part.
    image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=target_longest_image_dim)
    request = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                    },
                    {
                        "type": "text",
                        "text": "Return the plain text representation of this document as if you were reading it naturally.\n",
                    },
                ],
            }
        ],
        "temperature": temperature,
        "max_tokens": 4096,
    }
    # POST to the OpenAI-compatible endpoint; generous timeout for large pages.
    url = f"http://{server}/v1/chat/completions"
    async with httpx.AsyncClient(timeout=300) as client:
        response = await client.post(url, json=request)
        response.raise_for_status()
        data = response.json()
        choice = data["choices"][0]
        # A truncated (length-capped) generation would silently yield bad data.
        assert (
            choice["finish_reason"] == "stop"
        ), "Response from server did not finish with finish_reason stop as expected, this is probably going to lead to bad data"
        return choice["message"]["content"]

View File

@ -3,6 +3,7 @@ from typing import Literal
import httpx
from olmocr.bench.prompts import build_basic_prompt
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts.anchor import get_anchor_text
from olmocr.prompts.prompts import (
@ -43,7 +44,9 @@ async def run_server(
elif prompt_template == "finetune":
prompt = build_finetuning_prompt(anchor_text)
elif prompt_template == "basic":
prompt = "Just return the plain text representation of this document as if you were reading it naturally."
prompt = build_basic_prompt()
elif prompt_template == "rolmocr":
prompt = build_rolmocr_prompt()
else:
raise ValueError("Unknown prompt template")

View File

@ -181,7 +181,7 @@ python -m olmocr.bench.convert gemini:name=gemini_flash2:model=gemini-2.0-flash
echo "Running mistral..."
pip install mistralai
python -m olmocr.bench.convert --dir olmOCR-bench/bench_data mistral
python -m olmocr.bench.convert --dir olmOCR-bench/bench_data --parallel 4 mistral
# Run raw server benchmarks with generic server function
# For each model, start server, run benchmark, then stop server
@ -219,9 +219,15 @@ check_port || exit 1
# stop_server
# qwen2.5 works best with vllm for now, in a fresh environment
# start_server vllm "Qwen/Qwen2.5-VL-7B-Instruct" --max-model-len 8192
# python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=qwen25vl_prompt3:model=Qwen/Qwen2.5-VL-7B-Instruct:temperature=0.1:prompt_template=basic:response_template=plain --parallel 50
# stop_server
source activate vllm
start_server vllm "Qwen/Qwen2.5-VL-7B-Instruct" --max-model-len 8192
python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=qwen25vl_prompt3:model=Qwen/Qwen2.5-VL-7B-Instruct:temperature=0.1:prompt_template=basic:response_template=plain --parallel 50
stop_server
start_server vllm "reducto/RolmOCR" --max-model-len 8192
python -m olmocr.bench.convert --dir olmOCR-bench/bench_data rolmocr --parallel 50
stop_server
# TODO: Fix this, I was not able to get it to all install successfully
# Create and activate mineru environment

View File

@ -1,13 +1,22 @@
import os
from openai import OpenAI
from prompts import build_find_difference_prompt
from runners.run_chatgpt import run_chatgpt
from runners.run_gemini import run_gemini
from olmocr.data.renderpdf import render_pdf_to_base64png
def build_find_difference_prompt(base_text: str) -> str:
    """Build a prompt asking a judge model to diff two extracted texts against the page image.

    Args:
        base_text: The previously extracted text(s), embedded verbatim between
            RAW_TEXT_START / RAW_TEXT_END markers.

    Returns:
        str: The assembled prompt.
    """
    # Bug fix: the first two f-string fragments had no trailing separator, so
    # sentences ran together ("...different models.Your goal..."). Each sentence
    # now ends with "\n", matching the style of the remaining lines.
    return (
        f"Below is an image of a document page, along with raw textual content previously extracted using different models.\n"
        f"Your goal is to carefully identify the differences between the extracted texts from both models and determine which one is more accurate by comparing them with the image.\n"
        f"Only return the differences and specify which model extracted the text with higher accuracy.\n"
        f"Do not hallucinate.\n"
        f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
    )
def combined_output(pdf_path: str) -> str:
chatgpt_output = run_chatgpt(pdf_path)
gemini_output = run_gemini(pdf_path)