# olmocr/bench/runners/run_gemini.py
import base64
import json
import os

from google.ai import generativelanguage as glm
from google.api_core import client_options

from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts.anchor import get_anchor_text
from olmocr.prompts.prompts import build_silver_data_prompt, gemini_response_format_schema
def run_gemini(pdf_path: str, page_num: int = 1, model: str = "gemini-1.5-pro", temperature: float = 0.1) -> str:
    """
    Convert a page of a PDF file to markdown using Gemini's vision capabilities.

    Renders the specified page of the PDF to a PNG image, sends that image to
    Gemini together with the anchor-text prompt built from the PDF's text
    layer, and returns the model's response text.

    Args:
        pdf_path (str): The local path to the PDF file.
        page_num (int): The page number to process (starting from 1).
        model (str): The Gemini model to use.
        temperature (float): The temperature parameter for generation.

    Returns:
        str: The OCR result in markdown format.

    Raises:
        RuntimeError: If GEMINI_API_KEY is unset or Gemini returns no content.
    """
    image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
    anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")

    # Fail fast with a clear message instead of an opaque auth error from the
    # client call further down.
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise RuntimeError("GEMINI_API_KEY environment variable is not set")

    client = glm.GenerativeServiceClient(
        client_options=client_options.ClientOptions(api_key=api_key),
    )

    image_part = glm.Part(
        inline_data=glm.Blob(
            mime_type="image/png",
            data=base64.b64decode(image_base64),
        )
    )
    text_part = glm.Part(text=f"""{build_silver_data_prompt(anchor_text)}""")

    generation_config = glm.GenerationConfig(
        temperature=temperature,
        top_p=1.0,
        top_k=32,
        max_output_tokens=4096,
    )

    # NOTE(review): gemini_response_format_schema() used to be computed here
    # but was never wired into the request — the tool/function-calling setup
    # that would have consumed it was commented out. The call below therefore
    # relies on free-form text output; confirm whether structured output is
    # still required before deleting the schema helper from the prompts module.
    request = glm.GenerateContentRequest(
        model=f"models/{model}",
        contents=[glm.Content(parts=[image_part, text_part])],
        generation_config=generation_config,
    )

    response = client.generate_content(request)

    # Guard against an empty generation (safety block, truncation) so callers
    # get a clear error rather than a bare IndexError.
    if not response.candidates or not response.candidates[0].content.parts:
        raise RuntimeError("Gemini returned no content for this page")
    return response.candidates[0].content.parts[0].text