From 1f50a6b6bd39e33e126616c8e6918f11de3a6de6 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 14 Aug 2025 17:44:56 +0000 Subject: [PATCH] Trying out some new prompts --- olmocr/bench/runners/run_chatgpt.py | 16 ++++++++++++---- olmocr/prompts/prompts.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/olmocr/bench/runners/run_chatgpt.py b/olmocr/bench/runners/run_chatgpt.py index 65b5911..4127cf7 100644 --- a/olmocr/bench/runners/run_chatgpt.py +++ b/olmocr/bench/runners/run_chatgpt.py @@ -8,14 +8,16 @@ from olmocr.bench.prompts import ( build_basic_prompt, build_openai_silver_data_prompt_no_document_anchoring, ) -from olmocr.data.renderpdf import render_pdf_to_base64png +from olmocr.data.renderpdf import render_pdf_to_base64png, get_png_dimensions_from_base64 from olmocr.prompts.anchor import get_anchor_text from olmocr.prompts.prompts import ( PageResponse, build_finetuning_prompt, build_openai_silver_data_prompt, openai_response_format_schema, - build_openai_silver_data_prompt_v2 + build_openai_silver_data_prompt_v2, + build_openai_silver_data_prompt_v2_simple, + build_openai_silver_data_prompt_v3_simple, ) @@ -25,7 +27,7 @@ def run_chatgpt( model: str = "gpt-4o-2024-08-06", temperature: float = 0.1, target_longest_image_dim: int = 2048, - prompt_template: Literal["full", "full_no_document_anchoring", "basic", "finetune", "fullv2"] = "finetune", + prompt_template: Literal["full", "full_no_document_anchoring", "basic", "finetune", "fullv2", "fullv2simple", "fullv3simple"] = "finetune", response_template: Literal["plain", "json"] = "json", ) -> str: """ @@ -58,6 +60,12 @@ def run_chatgpt( prompt = build_basic_prompt() elif prompt_template == "fullv2": prompt = build_openai_silver_data_prompt_v2(anchor_text) + elif prompt_template == "fullv2simple": + width, height = get_png_dimensions_from_base64(image_base64) + prompt = build_openai_silver_data_prompt_v2_simple(width, height) + elif prompt_template == 
"fullv3simple": + width, height = get_png_dimensions_from_base64(image_base64) + prompt = build_openai_silver_data_prompt_v3_simple(width, height) else: raise ValueError("Unknown prompt template") @@ -74,7 +82,7 @@ def run_chatgpt( ], temperature=temperature, max_completion_tokens=20000, - reasoning_effort="high", + #reasoning_effort="high", response_format=openai_response_format_schema() if response_template == "json" else None, safety_identifier="olmocr-bench-runner", ) diff --git a/olmocr/prompts/prompts.py b/olmocr/prompts/prompts.py index c308dc0..b2a366d 100644 --- a/olmocr/prompts/prompts.py +++ b/olmocr/prompts/prompts.py @@ -30,6 +30,35 @@ def build_openai_silver_data_prompt_v2(base_text: str) -> str: f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END" ) +def build_openai_silver_data_prompt_v2_simple(page_width: int, page_height: int) -> str: + return ( + f"Attached is the image of one page of a PDF document." + f"Just return the plain text representation of this document as if you were reading it naturally.\n" + f"Turn equations and math symbols into a LaTeX representation, make sure to use \\( and \\) as a delimiter for inline math, and \\[ and \\] for block math. Always prefer LaTeX syntax instead of using unicode math symbols.\n" + f"Convert tables into HTML format. 
Remove the headers and footers, but keep references and footnotes.\n" + f"Read any natural handwriting.\n" + f"If there are any figures or charts, label them with the following markdown syntax ![Alt text describing the contents of the figure](page_startx_starty_width_height.png)" + f"This is likely one page out of several in the document, so be sure to preserve any sentences that come from the previous page, or continue onto the next page, exactly as they are.\n" + f"If there is no text at all that you think you should read, you can output null.\n" + f"Do not hallucinate.\n" + f"Page width: {page_width}, Page height: {page_height}" + ) + +def build_openai_silver_data_prompt_v3_simple(page_width: int, page_height: int) -> str: + return ( + f"Attached is the image of one page of a PDF document." + f"Just return the plain text representation of this document as if you were reading it naturally.\n" + f"Turn equations and math symbols into a LaTeX representation, make sure to use \\( and \\) as a delimiter for inline math, and \\[ and \\] for block math. Do NOT use ascii or unicode math symbols such as ∈ ∉ ⊂ ⊃ ⊆ ⊇ ∅ ∪ ∩ ∀ ∃ ¬, just use LaTeX syntax, ex \( \in \\notin \subset \supset \subseteq \) etc. \n" + f"Convert tables into HTML format. Keep the syntax simple, but use <th> for header rows, and use rowspan and colspans appropriately. 
\n" + f"Remove the headers and footers, but keep references and footnotes.\n" + f"Read any natural handwriting.\n" + f"If there are any figures or charts, label them with the following markdown syntax ![Alt text describing the contents of the figure](page_startx_starty_width_height.png)" + f"This is likely one page out of several in the document, so be sure to preserve any sentences that come from the previous page, or continue onto the next page, exactly as they are.\n" + f"If there is no text at all that you think you should read, you can output null.\n" + f"Do not hallucinate.\n" + f"Page width: {page_width}, Page height: {page_height}" + ) + @dataclass(frozen=True)