Mirror of https://github.com/allenai/olmocr.git (synced 2025-09-27 09:27:55 +00:00)

Commit 3a6df83168 ("update"), parent aac0c1503d
@@ -4,7 +4,7 @@ import json
 from openai import OpenAI
 
 from olmocr.prompts.anchor import get_anchor_text
-from olmocr.prompts.prompts import build_openai_silver_data_prompt, openai_response_format_schema, PageResponse
+from olmocr.prompts.prompts import build_silver_data_prompt, response_format_schema, PageResponse
 from olmocr.data.renderpdf import render_pdf_to_base64png
 
 
@@ -33,14 +33,14 @@ def run_chatgpt(pdf_path: str, page_num: int = 1, model: str = "gpt-4o-2024-08-0
             {
                 "role": "user",
                 "content": [
-                    {"type": "text", "text": build_openai_silver_data_prompt(anchor_text)},
+                    {"type": "text", "text": build_silver_data_prompt(anchor_text)},
                     {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                 ],
             }
         ],
         temperature=temperature,
         max_tokens=3000,
-        response_format=openai_response_format_schema(),
+        response_format=response_format_schema(),
     )
 
     raw_response = response.choices[0].message.content
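
For orientation, a minimal sketch of calling this runner after the rename. The mirror dropped the modified file's name, so the import path is assumed from the sibling runners added in this commit, and the PDF path is a placeholder:

    # Hypothetical caller; run_chatgpt's signature is taken from the hunk header above.
    from olmocr.bench.runners.run_chatgpt import run_chatgpt  # assumed module path

    text = run_chatgpt("example.pdf", page_num=1)  # "example.pdf" is a placeholder
    print(text)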
olmocr/bench/runners/run_claude.py (new file, 89 lines)
@@ -0,0 +1,89 @@
+import os
+import json
+import base64
+from anthropic import Anthropic
+from olmocr.prompts.anchor import get_anchor_text
+from olmocr.prompts.prompts import build_silver_data_prompt, response_format_schema, PageResponse
+from olmocr.data.renderpdf import render_pdf_to_base64png
+
+def run_claude(pdf_path: str, page_num: int = 1, model: str = "claude-3-7-sonnet-20250219", temperature: float = 0.1) -> str:
+    """
+    Convert a page of a PDF file to markdown using Claude OCR.
+    This function renders the specified page of the PDF to an image, runs OCR on that image,
+    and returns the OCR result as a markdown-formatted string.
+
+    Args:
+        pdf_path (str): The local path to the PDF file.
+        page_num (int): The page number to process (starting from 1).
+        model (str): The Claude model to use.
+        temperature (float): The temperature parameter for generation.
+
+    Returns:
+        str: The OCR result in markdown format.
+    """
+    # Convert the specified page of the PDF to a base64-encoded PNG image
+    image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
+
+    # Get anchor text for the page
+    anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
+
+    # Initialize the Claude client
+    client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
+
+    # Create the message with the prompt and image
+    response = client.messages.create(
+        model=model,
+        max_tokens=3000,
+        temperature=temperature,
+        system=build_silver_data_prompt(anchor_text),
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "source": {
+                            "type": "base64",
+                            "media_type": "image/png",
+                            "data": image_base64
+                        }
+                    },
+                    {"type": "text", "text": json.dumps(response_format_schema())},  # schema sent as text: the Messages API has no "json_object" content block type
+                ]
+            }
+        ],
+
+    )
+
+    # Extract and validate the response
+    raw_response = response.content[0].text  # first content block carries the model's text reply
+    print(raw_response)
+    # Parse the JSON response
+    data = json.loads(raw_response)
+    data = PageResponse(**data)
+
+    return data.natural_text
+
+if __name__ == "__main__":
+    import argparse
+
+    # Set up command-line argument parsing
+    parser = argparse.ArgumentParser(description="Extract text from a PDF using Claude OCR")
+    parser.add_argument("pdf_path", help="Path to the PDF file")
+    parser.add_argument("--page", type=int, default=1, help="Page number to process (default: 1)")
+    parser.add_argument("--model", default="claude-3-7-sonnet-20250219", help="Claude model to use")
+    parser.add_argument("--temperature", type=float, default=0.1, help="Temperature for generation")
+
+    # Parse the arguments
+    args = parser.parse_args()
+
+    # Run the OCR function
+    result = run_claude(
+        pdf_path=args.pdf_path,
+        page_num=args.page,
+        model=args.model,
+        temperature=args.temperature
+    )
+
+    # Print the result
+    print(result)
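
A quick usage sketch for the new runner, matching the argparse CLI defined above; the PDF path is a placeholder, and ANTHROPIC_API_KEY must be set since the client reads it from the environment:

    # CLI (defaults shown in the parser above):
    #   python olmocr/bench/runners/run_claude.py example.pdf --page 1 --temperature 0.1
    # Or directly from Python:
    from olmocr.bench.runners.run_claude import run_claude

    markdown = run_claude("example.pdf", page_num=1)  # "example.pdf" is a placeholder
    print(markdown)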
olmocr/bench/runners/run_gemini.py (new file, empty)
@@ -14,8 +14,8 @@ from tqdm import tqdm
 from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.filter import PdfFilter
 from olmocr.prompts import (
-    build_openai_silver_data_prompt,
-    openai_response_format_schema,
+    build_silver_data_prompt,
+    response_format_schema,
 )
 from olmocr.prompts.anchor import get_anchor_text
 
@@ -39,7 +39,7 @@ def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> di
 #             {
 #                 "role": "user",
 #                 "content": [
-#                     {"type": "text", "text": build_openai_silver_data_prompt(anchor_text)},
+#                     {"type": "text", "text": build_silver_data_prompt(anchor_text)},
 #                     {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}
 #                 ],
 #             }
@@ -48,7 +48,7 @@ def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> di
 #         max_tokens=3000,
 #         logprobs=True,
 #         top_logprobs=5,
-#         response_format=openai_response_format_schema()
+#         response_format=response_format_schema()
 #     )
 #     print(response)
 
@@ -70,7 +70,7 @@ def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> di
             {
                 "role": "user",
                 "content": [
-                    {"type": "text", "text": build_openai_silver_data_prompt(anchor_text)},
+                    {"type": "text", "text": build_silver_data_prompt(anchor_text)},
                     {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                 ],
             }
@@ -79,7 +79,7 @@ def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> di
         "max_tokens": 6000,
         "logprobs": True,
         "top_logprobs": 5,
-        "response_format": openai_response_format_schema(),
+        "response_format": response_format_schema(),
     },
 }
 
@@ -65,8 +65,8 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool):
             obj = build_page_query(local_pdf_path, s3_path, page)
             # raw_page_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport")
 
-            # from olmocr.prompts import build_openai_silver_data_prompt
-            # obj["body"]["messages"][0]["content"][0]["text"] = build_openai_silver_data_prompt(raw_page_text)
+            # from olmocr.prompts import build_silver_data_prompt
+            # obj["body"]["messages"][0]["content"][0]["text"] = build_silver_data_prompt(raw_page_text)
 
             if obj is not None:
                 outfile.write(json.dumps(obj) + "\n")
@@ -1,7 +1,7 @@
 from .prompts import (
     PageResponse,
     build_finetuning_prompt,
-    build_openai_silver_data_prompt,
+    build_silver_data_prompt,
     extract_raw_text,
-    openai_response_format_schema,
+    response_format_schema,
 )
@@ -4,7 +4,7 @@ from typing import Optional
 
 
 # This is the prompt we use for getting chat gpt 4o to convert documents into our silver training data
-def build_openai_silver_data_prompt(base_text: str) -> str:
+def build_silver_data_prompt(base_text: str) -> str:
     return (
         f"Below is the image of one page of a PDF document, as well as some raw textual content that was previously extracted for it that includes position information for each image and block of text (The origin [0x0] of the coordinates is in the lower left corner of the image). "
         f"Just return the plain text representation of this document as if you were reading it naturally.\n"
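
A small sketch of the renamed builder in use; the anchor-text snippet below is invented for illustration (real anchor text comes from get_anchor_text, as the runners above show):

    from olmocr.prompts.prompts import build_silver_data_prompt

    anchor_text = "[72x708] Example heading"  # made-up anchor text with position info
    prompt = build_silver_data_prompt(anchor_text)  # instruction string sent alongside the page image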
@@ -46,7 +46,7 @@ class PageResponse:
             raise TypeError("natural_text must be of type Optional[str].")
 
 
-def openai_response_format_schema() -> dict:
+def response_format_schema() -> dict:
     return {
         "type": "json_schema",
         "json_schema": {
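
The renamed helper still returns an OpenAI-style json_schema response_format dict, as the body above shows. Below is a sketch of the downstream parsing step the runners in this commit perform; the reply string is a stand-in, and every field other than natural_text is assumed from the olmocr PageResponse dataclass (only natural_text and its type check are visible in this diff):

    import json

    # Stand-in for the model's JSON reply; keys must match the PageResponse fields.
    reply = '{"primary_language": "en", "is_rotation_valid": true, "rotation_correction": 0, "is_table": false, "is_diagram": false, "natural_text": "Hello world"}'

    page = PageResponse(**json.loads(reply))  # raises TypeError if natural_text has the wrong type
    print(page.natural_text)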
@@ -8,7 +8,7 @@ from transformers import AutoConfig, AutoProcessor, Qwen2_5_VLForConditionalGene
 
 from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.prompts.anchor import get_anchor_text
-from olmocr.prompts.prompts import build_openai_silver_data_prompt
+from olmocr.prompts.prompts import build_silver_data_prompt
 
 
 @torch.no_grad()
@@ -33,7 +33,7 @@ def run_inference(model_name: str):
         {
             "role": "user",
             "content": [
-                {"type": "text", "text": build_openai_silver_data_prompt(anchor_text)},
+                {"type": "text", "text": build_silver_data_prompt(anchor_text)},
                 {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
             ],
         }