This commit is contained in:
“aman-17” 2025-02-25 14:41:48 -08:00
parent aac0c1503d
commit 3a6df83168
8 changed files with 106 additions and 17 deletions

View File

@ -4,7 +4,7 @@ import json
from openai import OpenAI from openai import OpenAI
from olmocr.prompts.anchor import get_anchor_text from olmocr.prompts.anchor import get_anchor_text
from olmocr.prompts.prompts import build_openai_silver_data_prompt, openai_response_format_schema, PageResponse from olmocr.prompts.prompts import build_silver_data_prompt, response_format_schema, PageResponse
from olmocr.data.renderpdf import render_pdf_to_base64png from olmocr.data.renderpdf import render_pdf_to_base64png
@ -33,14 +33,14 @@ def run_chatgpt(pdf_path: str, page_num: int = 1, model: str = "gpt-4o-2024-08-0
{ {
"role": "user", "role": "user",
"content": [ "content": [
{"type": "text", "text": build_openai_silver_data_prompt(anchor_text)}, {"type": "text", "text": build_silver_data_prompt(anchor_text)},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
], ],
} }
], ],
temperature=temperature, temperature=temperature,
max_tokens=3000, max_tokens=3000,
response_format=openai_response_format_schema(), response_format=response_format_schema(),
) )
raw_response = response.choices[0].message.content raw_response = response.choices[0].message.content

View File

@ -0,0 +1,89 @@
import os
import json
import base64
from anthropic import Anthropic
from olmocr.prompts.anchor import get_anchor_text
from olmocr.prompts.prompts import build_silver_data_prompt, response_format_schema, PageResponse
from olmocr.data.renderpdf import render_pdf_to_base64png
def run_claude(pdf_path: str, page_num: int = 1, model: str = "claude-3-7-sonnet-20250219", temperature: float = 0.1) -> str:
    """
    Convert a page of a PDF file to markdown using Claude OCR.

    Renders the specified page of the PDF to a base64 PNG, sends it to the
    Anthropic Messages API together with the anchor-text prompt and the
    expected JSON schema, then parses the JSON reply into a PageResponse.

    Args:
        pdf_path (str): The local path to the PDF file.
        page_num (int): The page number to process (starting from 1).
        model (str): The Claude model to use.
        temperature (float): The temperature parameter for generation.

    Returns:
        str: The OCR result in markdown format (PageResponse.natural_text).

    Raises:
        json.JSONDecodeError: If the model's reply is not valid JSON.
        TypeError: If the parsed JSON does not match the PageResponse contract.
    """
    # Convert the specified page of the PDF to a base64-encoded PNG image
    image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)

    # Get anchor text for the page
    anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")

    # Initialize the Claude client
    client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

    # The Anthropic Messages API has no OpenAI-style `response_format`
    # parameter, and "json_object" is not a valid content block type —
    # sending it is an API error. Embed the schema in the prompt instead
    # and instruct the model to reply with matching JSON only.
    schema_instruction = (
        "Respond ONLY with a JSON object that conforms to this JSON schema, "
        "with no surrounding prose or markdown fences:\n"
        + json.dumps(response_format_schema())
    )

    # Create the message with the image and the schema instruction
    response = client.messages.create(
        model=model,
        max_tokens=3000,
        temperature=temperature,
        system=build_silver_data_prompt(anchor_text),
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/png",
                            "data": image_base64,
                        },
                    },
                    {"type": "text", "text": schema_instruction},
                ],
            }
        ],
    )

    # response.content is a list of content blocks; the model's text lives
    # in the first block's .text attribute. (Passing the list itself to
    # json.loads, as the original did, raises a TypeError.)
    raw_response = response.content[0].text

    # Be tolerant of the model wrapping its JSON in markdown fences anyway.
    cleaned = raw_response.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned.strip("`").strip()
        if cleaned.startswith("json"):
            cleaned = cleaned[len("json"):].lstrip()

    # Parse the JSON response and validate it against the dataclass contract
    data = PageResponse(**json.loads(cleaned))
    return data.natural_text
if __name__ == "__main__":
    import argparse

    # CLI entry point: run Claude OCR on one page of a PDF and print the text.
    arg_parser = argparse.ArgumentParser(description="Extract text from a PDF using Claude OCR")
    arg_parser.add_argument("pdf_path", help="Path to the PDF file")
    arg_parser.add_argument("--page", type=int, default=1, help="Page number to process (default: 1)")
    arg_parser.add_argument("--model", default="claude-3-7-sonnet-20250219", help="Claude model to use")
    arg_parser.add_argument("--temperature", type=float, default=0.1, help="Temperature for generation")
    cli_args = arg_parser.parse_args()

    # Run the OCR function on the requested page and print the result.
    extracted_text = run_claude(
        pdf_path=cli_args.pdf_path,
        page_num=cli_args.page,
        model=cli_args.model,
        temperature=cli_args.temperature,
    )
    print(extracted_text)

View File

View File

@ -14,8 +14,8 @@ from tqdm import tqdm
from olmocr.data.renderpdf import render_pdf_to_base64png from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.filter import PdfFilter from olmocr.filter import PdfFilter
from olmocr.prompts import ( from olmocr.prompts import (
build_openai_silver_data_prompt, build_silver_data_prompt,
openai_response_format_schema, response_format_schema,
) )
from olmocr.prompts.anchor import get_anchor_text from olmocr.prompts.anchor import get_anchor_text
@ -39,7 +39,7 @@ def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> di
# { # {
# "role": "user", # "role": "user",
# "content": [ # "content": [
# {"type": "text", "text": build_openai_silver_data_prompt(anchor_text)}, # {"type": "text", "text": build_silver_data_prompt(anchor_text)},
# {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}} # {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}
# ], # ],
# } # }
@ -48,7 +48,7 @@ def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> di
# max_tokens=3000, # max_tokens=3000,
# logprobs=True, # logprobs=True,
# top_logprobs=5, # top_logprobs=5,
# response_format=openai_response_format_schema() # response_format=response_format_schema()
# ) # )
# print(response) # print(response)
@ -70,7 +70,7 @@ def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> di
{ {
"role": "user", "role": "user",
"content": [ "content": [
{"type": "text", "text": build_openai_silver_data_prompt(anchor_text)}, {"type": "text", "text": build_silver_data_prompt(anchor_text)},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
], ],
} }
@ -79,7 +79,7 @@ def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> di
"max_tokens": 6000, "max_tokens": 6000,
"logprobs": True, "logprobs": True,
"top_logprobs": 5, "top_logprobs": 5,
"response_format": openai_response_format_schema(), "response_format": response_format_schema(),
}, },
} }

View File

@ -65,8 +65,8 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):
obj = build_page_query(local_pdf_path, s3_path, page) obj = build_page_query(local_pdf_path, s3_path, page)
# raw_page_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport") # raw_page_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport")
# from olmocr.prompts import build_openai_silver_data_prompt # from olmocr.prompts import build_silver_data_prompt
# obj["body"]["messages"][0]["content"][0]["text"] = build_openai_silver_data_prompt(raw_page_text) # obj["body"]["messages"][0]["content"][0]["text"] = build_silver_data_prompt(raw_page_text)
if obj is not None: if obj is not None:
outfile.write(json.dumps(obj) + "\n") outfile.write(json.dumps(obj) + "\n")

View File

@ -1,7 +1,7 @@
from .prompts import ( from .prompts import (
PageResponse, PageResponse,
build_finetuning_prompt, build_finetuning_prompt,
build_openai_silver_data_prompt, build_silver_data_prompt,
extract_raw_text, extract_raw_text,
openai_response_format_schema, response_format_schema,
) )

View File

@ -4,7 +4,7 @@ from typing import Optional
# This is the prompt we use for getting chat gpt 4o to convert documents into our silver training data # This is the prompt we use for getting chat gpt 4o to convert documents into our silver training data
def build_openai_silver_data_prompt(base_text: str) -> str: def build_silver_data_prompt(base_text: str) -> str:
return ( return (
f"Below is the image of one page of a PDF document, as well as some raw textual content that was previously extracted for it that includes position information for each image and block of text (The origin [0x0] of the coordinates is in the lower left corner of the image). " f"Below is the image of one page of a PDF document, as well as some raw textual content that was previously extracted for it that includes position information for each image and block of text (The origin [0x0] of the coordinates is in the lower left corner of the image). "
f"Just return the plain text representation of this document as if you were reading it naturally.\n" f"Just return the plain text representation of this document as if you were reading it naturally.\n"
@ -46,7 +46,7 @@ class PageResponse:
raise TypeError("natural_text must be of type Optional[str].") raise TypeError("natural_text must be of type Optional[str].")
def openai_response_format_schema() -> dict: def response_format_schema() -> dict:
return { return {
"type": "json_schema", "type": "json_schema",
"json_schema": { "json_schema": {

View File

@ -8,7 +8,7 @@ from transformers import AutoConfig, AutoProcessor, Qwen2_5_VLForConditionalGene
from olmocr.data.renderpdf import render_pdf_to_base64png from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts.anchor import get_anchor_text from olmocr.prompts.anchor import get_anchor_text
from olmocr.prompts.prompts import build_openai_silver_data_prompt from olmocr.prompts.prompts import build_silver_data_prompt
@torch.no_grad() @torch.no_grad()
@ -33,7 +33,7 @@ def run_inference(model_name: str):
{ {
"role": "user", "role": "user",
"content": [ "content": [
{"type": "text", "text": build_openai_silver_data_prompt(anchor_text)}, {"type": "text", "text": build_silver_data_prompt(anchor_text)},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
], ],
} }