mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-02 20:09:08 +00:00
89 lines
3.1 KiB
Python
89 lines
3.1 KiB
Python
import os
|
|
import json
|
|
import base64
|
|
from anthropic import Anthropic
|
|
from olmocr.prompts.anchor import get_anchor_text
|
|
from olmocr.prompts.prompts import build_silver_data_prompt, response_format_schema, PageResponse
|
|
from olmocr.data.renderpdf import render_pdf_to_base64png
|
|
|
|
def run_claude(pdf_path: str, page_num: int = 1, model: str = "claude-3-7-sonnet-20250219", temperature: float = 0.1) -> str:
    """
    Convert one page of a PDF file to text using Claude as the OCR engine.

    The page is rendered to a base64-encoded PNG and sent to the Anthropic
    Messages API together with a system prompt built from the page's anchor
    text and a text instruction carrying the expected response schema. The
    JSON reply is parsed and validated through ``PageResponse``.

    Args:
        pdf_path (str): The local path to the PDF file.
        page_num (int): The page number to process (starting from 1).
        model (str): The Claude model to use.
        temperature (float): The temperature parameter for generation.

    Returns:
        str: The ``natural_text`` field of the validated page response.

    Raises:
        ValueError: If the reply contains no JSON object, or the JSON is
            malformed (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    # Convert the specified page of the PDF to a base64-encoded PNG image
    image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)

    # Get anchor text for the page
    anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")

    # Initialize the Claude client
    client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

    # The Messages API has no "json_object" content block, so the schema is
    # passed as a plain text instruction alongside the page image.
    schema_instruction = (
        "Respond with a single JSON object and nothing else. "
        "The object must conform to this schema:\n" + json.dumps(response_format_schema())
    )

    # Create the message with the prompt and image
    response = client.messages.create(
        model=model,
        max_tokens=3000,
        temperature=temperature,
        system=build_silver_data_prompt(anchor_text),
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/png",
                            "data": image_base64,
                        },
                    },
                    {"type": "text", "text": schema_instruction},
                ],
            }
        ],
    )

    # response.content is a list of content blocks, not a string: collect the
    # text of the text blocks before handing anything to the JSON parser.
    raw_response = "".join(block.text for block in response.content if getattr(block, "type", None) == "text")

    # Tolerate prose or code fences around the payload by parsing only the
    # outermost {...} span.
    start = raw_response.find("{")
    end = raw_response.rfind("}")
    if start == -1 or end < start:
        raise ValueError(f"Claude response did not contain a JSON object: {raw_response!r}")

    # Parse the JSON response and validate it against the page schema
    data = json.loads(raw_response[start : end + 1])
    page = PageResponse(**data)

    return page.natural_text
|
|
|
|
if __name__ == "__main__":
    import argparse

    # Command-line interface: one positional PDF path plus optional page,
    # model and temperature overrides.
    cli = argparse.ArgumentParser(description="Extract text from a PDF using Claude OCR")
    cli.add_argument("pdf_path", help="Path to the PDF file")
    cli.add_argument("--page", type=int, default=1, help="Page number to process (default: 1)")
    cli.add_argument("--model", default="claude-3-7-sonnet-20250219", help="Claude model to use")
    cli.add_argument("--temperature", type=float, default=0.1, help="Temperature for generation")
    options = cli.parse_args()

    # Run OCR on the requested page and write the extracted text to stdout.
    print(
        run_claude(
            pdf_path=options.pdf_path,
            page_num=options.page,
            model=options.model,
            temperature=options.temperature,
        )
    )