diff --git a/olmocr/bench/runners/run_chatgpt.py b/olmocr/bench/runners/run_chatgpt.py
index 34277e9..1e6952a 100644
--- a/olmocr/bench/runners/run_chatgpt.py
+++ b/olmocr/bench/runners/run_chatgpt.py
@@ -4,7 +4,7 @@ import json
 
 from openai import OpenAI
 
 from olmocr.prompts.anchor import get_anchor_text
-from olmocr.prompts.prompts import build_openai_silver_data_prompt, openai_response_format_schema, PageResponse
+from olmocr.prompts.prompts import build_silver_data_prompt, response_format_schema, PageResponse
 from olmocr.data.renderpdf import render_pdf_to_base64png
@@ -33,14 +33,14 @@ def run_chatgpt(pdf_path: str, page_num: int = 1, model: str = "gpt-4o-2024-08-0
             {
                 "role": "user",
                 "content": [
-                    {"type": "text", "text": build_openai_silver_data_prompt(anchor_text)},
+                    {"type": "text", "text": build_silver_data_prompt(anchor_text)},
                     {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                 ],
             }
         ],
         temperature=temperature,
         max_tokens=3000,
-        response_format=openai_response_format_schema(),
+        response_format=response_format_schema(),
     )
 
     raw_response = response.choices[0].message.content
diff --git a/olmocr/bench/runners/run_claude.py b/olmocr/bench/runners/run_claude.py
new file mode 100644
index 0000000..1c9d05a
--- /dev/null
+++ b/olmocr/bench/runners/run_claude.py
@@ -0,0 +1,97 @@
+import os
+import json
+
+from anthropic import Anthropic
+
+from olmocr.prompts.anchor import get_anchor_text
+from olmocr.prompts.prompts import build_silver_data_prompt, response_format_schema, PageResponse
+from olmocr.data.renderpdf import render_pdf_to_base64png
+
+
+def run_claude(pdf_path: str, page_num: int = 1, model: str = "claude-3-7-sonnet-20250219", temperature: float = 0.1) -> str:
+    """
+    Convert a page of a PDF file to markdown using Claude OCR.
+
+    This function renders the specified page of the PDF to an image, runs OCR on that image,
+    and returns the OCR result as a markdown-formatted string.
+
+    Args:
+        pdf_path (str): The local path to the PDF file.
+        page_num (int): The page number to process (starting from 1).
+        model (str): The Claude model to use.
+        temperature (float): The temperature parameter for generation.
+
+    Returns:
+        str: The OCR result in markdown format.
+ """ + # Convert the specified page of the PDF to a base64-encoded PNG image + image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048) + + # Get anchor text for the page + anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport") + + # Initialize the Claude client + client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY")) + + # Create the message with the prompt and image + response = client.messages.create( + model=model, + max_tokens=3000, + temperature=temperature, + system=build_silver_data_prompt(anchor_text), + messages=[ + { + "role": "user", + "content": [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/png", + "data": image_base64 + } + }, + {"type": "json_object", "schema": response_format_schema()} + ] + } + ], + + ) + + # Extract and validate the response + raw_response = response.content#[0].text + print(raw_response) + # Parse the JSON response + data = json.loads(raw_response) + data = PageResponse(**data) + + return data.natural_text + +if __name__ == "__main__": + import argparse + + # Set up command-line argument parsing + parser = argparse.ArgumentParser(description="Extract text from a PDF using Claude OCR") + parser.add_argument("pdf_path", help="Path to the PDF file") + parser.add_argument("--page", type=int, default=1, help="Page number to process (default: 1)") + parser.add_argument("--model", default="claude-3-7-sonnet-20250219", help="Claude model to use") + parser.add_argument("--temperature", type=float, default=0.1, help="Temperature for generation") + + # Parse the arguments + args = parser.parse_args() + + # Run the OCR function + result = run_claude( + pdf_path=args.pdf_path, + page_num=args.page, + model=args.model, + temperature=args.temperature + ) + + # Print the result + print(result) \ No newline at end of file diff --git a/olmocr/bench/runners/run_gemini.py b/olmocr/bench/runners/run_gemini.py new file mode 100644 index 0000000..e69de29 diff --git a/olmocr/data/buildsilver.py b/olmocr/data/buildsilver.py index f5879d7..ff2a1e9 100644 --- a/olmocr/data/buildsilver.py +++ b/olmocr/data/buildsilver.py @@ -14,8 +14,8 @@ from tqdm import tqdm from olmocr.data.renderpdf import render_pdf_to_base64png from olmocr.filter import PdfFilter from olmocr.prompts import ( - build_openai_silver_data_prompt, - openai_response_format_schema, + build_silver_data_prompt, + response_format_schema, ) from olmocr.prompts.anchor import get_anchor_text @@ -39,7 +39,7 @@ def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> di # { # "role": "user", # "content": [ - # {"type": "text", "text": build_openai_silver_data_prompt(anchor_text)}, + # {"type": "text", "text": build_silver_data_prompt(anchor_text)}, # {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}} # ], # } @@ -48,7 +48,7 @@ def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> di # max_tokens=3000, # logprobs=True, # top_logprobs=5, - # response_format=openai_response_format_schema() + # response_format=response_format_schema() # ) # print(response) @@ -70,7 +70,7 @@ def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> di { "role": "user", "content": [ - {"type": "text", "text": build_openai_silver_data_prompt(anchor_text)}, + {"type": "text", "text": build_silver_data_prompt(anchor_text)}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}, ], } @@ -79,7 +79,7 @@ def 
             "max_tokens": 6000,
             "logprobs": True,
             "top_logprobs": 5,
-            "response_format": openai_response_format_schema(),
+            "response_format": response_format_schema(),
         },
     }
 
diff --git a/olmocr/data/convertsilver_openai.py b/olmocr/data/convertsilver_openai.py
index 055fec4..a1fbc1d 100644
--- a/olmocr/data/convertsilver_openai.py
+++ b/olmocr/data/convertsilver_openai.py
@@ -65,8 +65,8 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):
                 obj = build_page_query(local_pdf_path, s3_path, page)
 
                 # raw_page_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport")
-                # from olmocr.prompts import build_openai_silver_data_prompt
-                # obj["body"]["messages"][0]["content"][0]["text"] = build_openai_silver_data_prompt(raw_page_text)
+                # from olmocr.prompts import build_silver_data_prompt
+                # obj["body"]["messages"][0]["content"][0]["text"] = build_silver_data_prompt(raw_page_text)
 
             if obj is not None:
                 outfile.write(json.dumps(obj) + "\n")
diff --git a/olmocr/prompts/__init__.py b/olmocr/prompts/__init__.py
index 6feaa65..7ee1ad6 100644
--- a/olmocr/prompts/__init__.py
+++ b/olmocr/prompts/__init__.py
@@ -1,7 +1,7 @@
 from .prompts import (
     PageResponse,
     build_finetuning_prompt,
-    build_openai_silver_data_prompt,
+    build_silver_data_prompt,
     extract_raw_text,
-    openai_response_format_schema,
+    response_format_schema,
 )
diff --git a/olmocr/prompts/prompts.py b/olmocr/prompts/prompts.py
index e41a05b..1d7de5e 100644
--- a/olmocr/prompts/prompts.py
+++ b/olmocr/prompts/prompts.py
@@ -4,7 +4,7 @@ from typing import Optional
 
 
 # This is the prompt we use for getting chat gpt 4o to convert documents into our silver training data
-def build_openai_silver_data_prompt(base_text: str) -> str:
+def build_silver_data_prompt(base_text: str) -> str:
     return (
         f"Below is the image of one page of a PDF document, as well as some raw textual content that was previously extracted for it that includes position information for each image and block of text (The origin [0x0] of the coordinates is in the lower left corner of the image). "
         f"Just return the plain text representation of this document as if you were reading it naturally.\n"
@@ -46,7 +46,7 @@ class PageResponse:
             raise TypeError("natural_text must be of type Optional[str].")
 
 
-def openai_response_format_schema() -> dict:
+def response_format_schema() -> dict:
     return {
         "type": "json_schema",
         "json_schema": {
diff --git a/olmocr/train/inference.py b/olmocr/train/inference.py
index d28bffa..b9ddd50 100644
--- a/olmocr/train/inference.py
+++ b/olmocr/train/inference.py
@@ -8,7 +8,7 @@ from transformers import AutoConfig, AutoProcessor, Qwen2_5_VLForConditionalGene
 
 from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.prompts.anchor import get_anchor_text
-from olmocr.prompts.prompts import build_openai_silver_data_prompt
+from olmocr.prompts.prompts import build_silver_data_prompt
 
 
 @torch.no_grad()
@@ -33,7 +33,7 @@ def run_inference(model_name: str):
         {
             "role": "user",
             "content": [
-                {"type": "text", "text": build_openai_silver_data_prompt(anchor_text)},
+                {"type": "text", "text": build_silver_data_prompt(anchor_text)},
                 {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
             ],
         }
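
Usage note: a minimal sketch of how a downstream caller picks up the renamed helpers (build_silver_data_prompt, response_format_schema); it mirrors the updated run_chatgpt.py above. The PDF path is a placeholder, the model name is assumed from the runner's truncated default, and the OpenAI client is assumed to be configured via OPENAI_API_KEY.

```python
# Sketch only: renamed prompt helpers in use, following the run_chatgpt.py flow.
from openai import OpenAI

from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_silver_data_prompt, response_format_schema
from olmocr.prompts.anchor import get_anchor_text

pdf_path, page_num = "page.pdf", 1  # placeholder input

# Same preprocessing the runners perform: rasterize the page and pull anchor text.
image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")

client = OpenAI()  # reads OPENAI_API_KEY from the environment
response = client.chat.completions.create(
    model="gpt-4o-2024-08-06",  # assumed; the runner default is truncated above
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": build_silver_data_prompt(anchor_text)},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
            ],
        }
    ],
    temperature=0.1,
    max_tokens=3000,
    response_format=response_format_schema(),
)
print(response.choices[0].message.content)
```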