diff --git a/olmocr/bench/benchmark.py b/olmocr/bench/benchmark.py index 126b2f7..eb91a66 100644 --- a/olmocr/bench/benchmark.py +++ b/olmocr/bench/benchmark.py @@ -199,7 +199,7 @@ def main(): parser.add_argument("--test_report", type=str, default=None, help="Generate an HTML report of test results. Provide a filename (e.g., results.html).") args = parser.parse_args() - input_folder = args.dir + input_folder = args.dir if os.path.isdir(args.dir) else os.path.dirname(args.dir) n_bootstrap = args.bootstrap_samples ci_level = args.confidence_level pdf_folder = os.path.join(input_folder, "pdfs") @@ -216,7 +216,11 @@ def main(): pdf_basenames = [os.path.relpath(p, pdf_folder) for p in all_pdf_files] - jsonl_files = glob.glob(os.path.join(input_folder, "*.jsonl")) + if os.path.isfile(args.dir): + jsonl_files = [args.dir] + else: + jsonl_files = glob.glob(os.path.join(input_folder, "*.jsonl")) + if not jsonl_files: print(f"Error: No .jsonl files found in {input_folder}.", file=sys.stderr) sys.exit(1) diff --git a/olmocr/bench/runners/run_chatgpt.py b/olmocr/bench/runners/run_chatgpt.py index edb743e..1420616 100644 --- a/olmocr/bench/runners/run_chatgpt.py +++ b/olmocr/bench/runners/run_chatgpt.py @@ -1,18 +1,29 @@ import json import os +from typing import Literal from openai import OpenAI +from olmocr.bench.prompts import build_basic_prompt from olmocr.data.renderpdf import render_pdf_to_base64png from olmocr.prompts.anchor import get_anchor_text from olmocr.prompts.prompts import ( PageResponse, + build_finetuning_prompt, build_openai_silver_data_prompt, openai_response_format_schema, ) -def run_chatgpt(pdf_path: str, page_num: int = 1, model: str = "gpt-4o-2024-08-06", temperature: float = 0.1) -> str: +def run_chatgpt( + pdf_path: str, + page_num: int = 1, + model: str = "gpt-4o-2024-08-06", + temperature: float = 0.1, + target_longest_image_dim: int = 2048, + prompt_template: Literal["full", "basic", "finetune"] = "finetune", + response_template: Literal["plain", "json"] = "json", +) -> str: """ Convert page of a PDF file to markdown using the commercial openAI APIs. @@ -25,7 +36,7 @@ def run_chatgpt(pdf_path: str, page_num: int = 1, model: str = "gpt-4o-2024-08-0 str: The OCR result in markdown format. """ # Convert the first page of the PDF to a base64-encoded PNG image. - image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048) + image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=target_longest_image_dim) anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport") if not os.getenv("OPENAI_API_KEY"): @@ -33,20 +44,29 @@ def run_chatgpt(pdf_path: str, page_num: int = 1, model: str = "gpt-4o-2024-08-0 client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + if prompt_template == "full": + prompt = build_openai_silver_data_prompt(anchor_text) + elif prompt_template == "finetune": + prompt = build_finetuning_prompt(anchor_text) + elif prompt_template == "basic": + prompt = build_basic_prompt() + else: + raise ValueError("Unknown prompt template") + response = client.chat.completions.create( model=model, messages=[ { "role": "user", "content": [ - {"type": "text", "text": build_openai_silver_data_prompt(anchor_text)}, + {"type": "text", "text": prompt}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}, ], } ], temperature=temperature, max_tokens=3000, - response_format=openai_response_format_schema(), + response_format=openai_response_format_schema() if response_template == "json" else None, ) raw_response = response.choices[0].message.content @@ -55,7 +75,10 @@ def run_chatgpt(pdf_path: str, page_num: int = 1, model: str = "gpt-4o-2024-08-0 assert response.choices[0].message.refusal is None assert response.choices[0].finish_reason == "stop" - data = json.loads(raw_response) - data = PageResponse(**data) + if response_template == "json": + data = json.loads(raw_response) + data = PageResponse(**data) - return data.natural_text + return data.natural_text + else: + return raw_response diff --git a/olmocr/bench/templates/all_done_latex.html b/olmocr/bench/templates/all_done_latex.html new file mode 100644 index 0000000..5784f94 --- /dev/null +++ b/olmocr/bench/templates/all_done_latex.html @@ -0,0 +1,54 @@ + + + +
+ + +You have reviewed all equations in the dataset.
+ +