Mirror of https://github.com/allenai/olmocr.git (synced 2025-11-03 19:45:41 +00:00)

Commit 500dedc11c: Merge branch 'main' of https://github.com/allenai/olmocr
@@ -229,6 +229,7 @@ if __name__ == "__main__":
         "gemini": ("olmocr.bench.runners.run_gemini", "run_gemini"),
         "mistral": ("olmocr.bench.runners.run_mistral", "run_mistral"),
         "docling": ("olmocr.bench.runners.run_docling", "run_docling"),
+        "rolmocr": ("olmocr.bench.runners.run_rolmocr", "run_rolmocr"),
         "transformers": ("olmocr.bench.runners.run_transformers", "run_transformers"),
         "server": ("olmocr.bench.runners.run_server", "run_server"),
     }
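
The registry maps each runner name to a (module path, function name) pair, which lets the CLI import a runner only when it is actually requested. A minimal sketch of that lazy-resolution pattern, using a hypothetical resolve_runner helper (this is not the actual convert entry-point code):

import importlib

def resolve_runner(registry: dict[str, tuple[str, str]], name: str):
    # Look up the (module path, function name) pair and import lazily,
    # so heavy per-runner dependencies load only when that runner is used.
    module_path, func_name = registry[name]
    module = importlib.import_module(module_path)
    return getattr(module, func_name)

RUNNERS = {"rolmocr": ("olmocr.bench.runners.run_rolmocr", "run_rolmocr")}
run_fn = resolve_runner(RUNNERS, "rolmocr")  # the async run_rolmocr coroutine function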
@@ -1,3 +1,7 @@
+def build_basic_prompt() -> str:
+    return "Just return the markdown representation of this document as if you were reading it naturally. Convert equations to markdown using \( \) for inline math, and \[ \] otherwise."
+
+
 def claude_response_format_schema() -> dict:
     return (
         {
@@ -44,41 +48,3 @@ def claude_response_format_schema() -> dict:
             },
         },
     )
-
-
-def gemini_response_format_schema() -> dict:
-    return (
-        {
-            "type": "OBJECT",
-            "properties": {
-                "primary_language": {
-                    "type": "STRING",
-                    "description": "The primary language of the text using two-letter codes or null if there is no text at all that you think you should read.",
-                },
-                "is_rotation_valid": {
-                    "type": "BOOL",
-                    "description": "Is this page oriented correctly for reading? Answer only considering the textual content, do not factor in the rotation of any charts, tables, drawings, or figures.",
-                },
-                "rotation_correction": {
-                    "type": "INTEGER",
-                    "enum": [0, 90, 180, 270],
-                    "description": "Indicates the degree of clockwise rotation needed if the page is not oriented correctly.",
-                },
-                "is_table": {"type": "BOOL", "description": "Indicates if the majority of the page content is in tabular format."},
-                "is_diagram": {"type": "BOOL", "description": "Indicates if the majority of the page content is a visual diagram."},
-                "natural_text": {"type": "STRING", "description": "The natural text content extracted from the page."},
-            },
-            "required": ["primary_language", "is_rotation_valid", "rotation_correction", "is_table", "is_diagram", "natural_text"],
-            "propertyOrdering": ["primary_language", "is_rotation_valid", "rotation_correction", "is_table", "is_diagram", "natural_text"],
-        },
-    )
-
-
-def build_find_difference_prompt(base_text: str) -> str:
-    return (
-        f"Below is an image of a document page, along with raw textual content previously extracted using different models."
-        f"Your goal is to carefully identify the differences between the extracted texts from both models and determine which one is more accurate by comparing them with the image."
-        f"Only return the differences and specify which model extracted the text with higher accuracy.\n"
-        f"Do not hallucinate.\n"
-        f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
-    )
olmocr/bench/runners/run_rolmocr.py (new file, 68 lines)
@@ -0,0 +1,68 @@
+import json
+from typing import Literal
+
+import httpx
+
+from olmocr.bench.prompts import build_basic_prompt, build_rolmocr_prompt
+from olmocr.data.renderpdf import render_pdf_to_base64png
+from olmocr.prompts.anchor import get_anchor_text
+from olmocr.prompts.prompts import (
+    PageResponse,
+    build_finetuning_prompt,
+    build_openai_silver_data_prompt,
+)
+
+
+async def run_rolmocr(
+    pdf_path: str,
+    page_num: int = 1,
+    server: str = "localhost:30000",
+    model: str = "reducto/RolmOCR",
+    temperature: float = 0.2,
+    target_longest_image_dim: int = 1024,
+) -> str:
+    """
+    Run OCR on a single PDF page against a RolmOCR model served behind an
+    OpenAI-compatible chat completions endpoint.
+
+    Returns:
+        str: The OCR result in markdown format.
+    """
+    # Convert the requested page of the PDF to a base64-encoded PNG image.
+    image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=target_longest_image_dim)
+
+    request = {
+        "model": model,
+        "messages": [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/png;base64,{image_base64}"},
+                    },
+                    {
+                        "type": "text",
+                        "text": "Return the plain text representation of this document as if you were reading it naturally.\n",
+                    },
+                ],
+            }
+        ],
+        "temperature": temperature,
+        "max_tokens": 4096,
+    }
+
+    # Make the request and get the response using httpx
+    url = f"http://{server}/v1/chat/completions"
+
+    async with httpx.AsyncClient(timeout=300) as client:
+        response = await client.post(url, json=request)
+        response.raise_for_status()
+        data = response.json()
+
+        choice = data["choices"][0]
+        assert (
+            choice["finish_reason"] == "stop"
+        ), "Response from server did not finish with finish_reason stop as expected, this is probably going to lead to bad data"
+
+        return choice["message"]["content"]
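
A minimal sketch of exercising this runner directly, assuming an OpenAI-compatible server hosting reducto/RolmOCR is already listening on localhost:30000 (the sample.pdf path is a placeholder):

import asyncio

from olmocr.bench.runners.run_rolmocr import run_rolmocr

# Assumes the server is already up on localhost:30000;
# "sample.pdf" is a placeholder input path.
text = asyncio.run(run_rolmocr("sample.pdf", page_num=1))
print(text)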
@@ -3,6 +3,7 @@ from typing import Literal
 
 import httpx
 
+from olmocr.bench.prompts import build_basic_prompt, build_rolmocr_prompt
 from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.prompts.anchor import get_anchor_text
 from olmocr.prompts.prompts import (
@@ -43,7 +44,9 @@ async def run_server(
     elif prompt_template == "finetune":
         prompt = build_finetuning_prompt(anchor_text)
     elif prompt_template == "basic":
-        prompt = "Just return the plain text representation of this document as if you were reading it naturally."
+        prompt = build_basic_prompt()
+    elif prompt_template == "rolmocr":
+        prompt = build_rolmocr_prompt()
     else:
         raise ValueError("Unknown prompt template")
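
With this change every prompt_template value maps onto a prompt builder function. For comparison, the same dispatch could be written as a table from template name to builder; this is purely an illustrative alternative, not the committed run_server code (build_finetuning_prompt takes the page's anchor text, so it is wrapped in a lambda):

# Illustrative alternative to the if/elif chain above.
def select_prompt(prompt_template: str, anchor_text: str) -> str:
    builders = {
        "finetune": lambda: build_finetuning_prompt(anchor_text),
        "basic": build_basic_prompt,
        "rolmocr": build_rolmocr_prompt,
    }
    if prompt_template not in builders:
        raise ValueError("Unknown prompt template")
    return builders[prompt_template]()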
@@ -181,7 +181,7 @@ python -m olmocr.bench.convert gemini:name=gemini_flash2:model=gemini-2.0-flash
 
 echo "Running mistral..."
 pip install mistralai
-python -m olmocr.bench.convert --dir olmOCR-bench/bench_data mistral
+python -m olmocr.bench.convert --dir olmOCR-bench/bench_data --parallel 4 mistral
 
 # Run raw server benchmarks with generic server function
 # For each model, start server, run benchmark, then stop server
@@ -219,9 +219,15 @@ check_port || exit 1
 # stop_server
 
 # qwen2.5 works best with vllm for now, in a fresh environment
-# start_server vllm "Qwen/Qwen2.5-VL-7B-Instruct" --max-model-len 8192
-# python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=qwen25vl_prompt3:model=Qwen/Qwen2.5-VL-7B-Instruct:temperature=0.1:prompt_template=basic:response_template=plain --parallel 50
-# stop_server
+source activate vllm
+
+start_server vllm "Qwen/Qwen2.5-VL-7B-Instruct" --max-model-len 8192
+python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=qwen25vl_prompt3:model=Qwen/Qwen2.5-VL-7B-Instruct:temperature=0.1:prompt_template=basic:response_template=plain --parallel 50
+stop_server
+
+start_server vllm "reducto/RolmOCR" --max-model-len 8192
+python -m olmocr.bench.convert --dir olmOCR-bench/bench_data rolmocr --parallel 50
+stop_server
 
 # TODO: Fix this, I was not able to get it to all install successfully
 # Create and activate mineru environment
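
The convert commands above pack runner options into a single colon-separated spec of key=value pairs after the runner name. A hedged sketch of that convention; parse_runner_spec is a hypothetical helper, and the real olmocr.bench.convert parser may differ:

# Hypothetical helper illustrating the runner-spec convention used above,
# e.g. "server:name=qwen25vl_prompt3:temperature=0.1:prompt_template=basic".
# Not the actual olmocr.bench.convert implementation.
def parse_runner_spec(spec: str) -> tuple[str, dict[str, str]]:
    runner, _, rest = spec.partition(":")
    kwargs: dict[str, str] = {}
    for pair in filter(None, rest.split(":")):
        key, _, value = pair.partition("=")
        kwargs[key] = value
    return runner, kwargs

print(parse_runner_spec("server:name=qwen25vl_prompt3:temperature=0.1"))
# -> ('server', {'name': 'qwen25vl_prompt3', 'temperature': '0.1'})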
@@ -1,13 +1,22 @@
 import os
 
 from openai import OpenAI
-from prompts import build_find_difference_prompt
 from runners.run_chatgpt import run_chatgpt
 from runners.run_gemini import run_gemini
 
 from olmocr.data.renderpdf import render_pdf_to_base64png
 
 
+def build_find_difference_prompt(base_text: str) -> str:
+    return (
+        f"Below is an image of a document page, along with raw textual content previously extracted using different models."
+        f"Your goal is to carefully identify the differences between the extracted texts from both models and determine which one is more accurate by comparing them with the image."
+        f"Only return the differences and specify which model extracted the text with higher accuracy.\n"
+        f"Do not hallucinate.\n"
+        f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
+    )
+
+
 def combined_output(pdf_path: str) -> str:
     chatgpt_output = run_chatgpt(pdf_path)
     gemini_output = run_gemini(pdf_path)
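
A hedged sketch of feeding both runners' outputs into the prompt; how combined_output actually labels and merges the two texts is an assumption here, since its body is not fully shown:

# Hypothetical continuation: label each model's output and hand the pair to
# the difference prompt. The actual merging scheme in combined_output may differ.
base_text = f"CHATGPT_OUTPUT:\n{chatgpt_output}\n\nGEMINI_OUTPUT:\n{gemini_output}"
prompt = build_find_difference_prompt(base_text)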