Mirror of https://github.com/allenai/olmocr.git (synced 2025-10-13 09:12:18 +00:00)

Commit b97e90ce3a: Merge branch 'main' of https://github.com/allenai/olmocr
@@ -1,8 +1,8 @@
 import base64
 import os
 
-from google.ai import generativelanguage as glm
-from google.api_core import client_options
+from google import genai
+from google.genai import types
 
 from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.prompts.anchor import get_anchor_text
@@ -30,56 +30,25 @@ def run_gemini(pdf_path: str, page_num: int = 1, model: str = "gemini-2.0-flash"
     image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
     anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
     api_key = os.getenv("GEMINI_API_KEY")
-    client = glm.GenerativeServiceClient(
-        client_options=client_options.ClientOptions(
-            api_key=api_key,
-        ),
-    )
-
-    image_part = glm.Part(inline_data=glm.Blob(mime_type="image/png", data=base64.b64decode(image_base64)))
+    client = genai.Client(api_key=api_key)
+    image_part = types.Part(inline_data=types.Blob(mime_type="image/png", data=base64.b64decode(image_base64)))
 
-    text_part = glm.Part(text=f"""{build_openai_silver_data_prompt(anchor_text)}""")
+    text_part = types.Part(text=f"""{build_openai_silver_data_prompt(anchor_text)}""")
 
-    generation_config = glm.GenerationConfig(
+    generation_config = types.GenerateContentConfig(
         temperature=temperature,
         top_p=1.0,
         top_k=32,
         max_output_tokens=4096,
     )
-    # response_schema = gemini_response_format_schema()
-    request = glm.GenerateContentRequest(
+    response = client.models.generate_content(
         model=f"models/{model}",
-        contents=[glm.Content(parts=[image_part, text_part])],
-        generation_config=generation_config,
+        contents=[types.Content(parts=[image_part, text_part])],
+        config=generation_config,
     )
 
-    # request = glm.GenerateContentRequest(
-    #     model=f"models/{model}",
-    #     contents=[glm.Content(parts=[image_part, text_part])],
-    #     generation_config=generation_config,
-    #     tools=[
-    #         glm.Tool(
-    #             function_declarations=[
-    #                 glm.FunctionDeclaration(
-    #                     name="page_response",
-    #                     parameters=response_schema
-    #                 )
-    #             ]
-    #         )
-    #     ],
-    #     tool_config=glm.ToolConfig(
-    #         function_calling_config=glm.FunctionCallingConfig(
-    #             mode="any",
-    #             allowed_function_names=["page_response"]
-    #         )
-    #     )
-    # )
-
-    response = client.generate_content(request)
     assert len(response.candidates) > 0, "No candidates found"
-    assert (
-        response.candidates[0].finish_reason == glm.Candidate.FinishReason.STOP
-    ), "Finish reason was not STOP, likely a processing error or repetition failure"
+    assert response.candidates[0].finish_reason == types.FinishReason.STOP, "Finish reason was not STOP, likely a processing error or repetition failure"
 
     result = response.candidates[0].content.parts[0].text
     return result
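Note: for readers unfamiliar with the new SDK, the + side of this hunk boils down to the standalone sketch below. It is illustrative only, not part of the commit; it assumes the google-genai package is installed, GEMINI_API_KEY is exported, and a pre-rendered page image on disk stands in for render_pdf_to_base64png.

import os

from google import genai
from google.genai import types

# Sketch of the call pattern adopted in this commit: one Client object,
# request construction folded into client.models.generate_content.
client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

with open("page.png", "rb") as f:  # hypothetical pre-rendered PDF page
    png_bytes = f.read()

image_part = types.Part(inline_data=types.Blob(mime_type="image/png", data=png_bytes))
text_part = types.Part(text="Transcribe this page.")  # placeholder prompt

response = client.models.generate_content(
    model="models/gemini-2.0-flash",
    contents=[types.Content(parts=[image_part, text_part])],
    config=types.GenerateContentConfig(
        temperature=0.1,  # hypothetical value; the script threads its own through
        top_p=1.0,
        top_k=32,
        max_output_tokens=4096,
    ),
)

assert len(response.candidates) > 0, "No candidates found"
assert response.candidates[0].finish_reason == types.FinishReason.STOP
print(response.candidates[0].content.parts[0].text)

The net behavioral change is that the separate GenerativeServiceClient and GenerateContentRequest collapse into client.models.generate_content, with GenerateContentConfig replacing GenerationConfig.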
@@ -158,12 +158,12 @@ source activate olmocr
 
 # Run olmocr benchmarks, exactly as the pipeline.py does it
 echo "Running olmocr benchmarks..."
-python -m olmocr.bench.convert olmocr_pipeline --parralel 50
+python -m olmocr.bench.convert olmocr_pipeline --parallel 50 --dir olmOCR-bench/bench_data/
 
 # Install marker-pdf and run benchmarks
 echo "Installing marker-pdf and running benchmarks..."
 pip install marker-pdf==1.6.1
-python -m olmocr.bench.convert marker
+python -m olmocr.bench.convert marker --dir olmOCR-bench/bench_data/
 
 # Install verovio and run benchmarks
 # echo "Installing verovio and running benchmarks..."
@@ -172,12 +172,12 @@ python -m olmocr.bench.convert marker
 
 # Run chatgpt benchmarks
 echo "Running chatgpt benchmarks..."
-python -m olmocr.bench.convert chatgpt
+python -m olmocr.bench.convert chatgpt --dir olmOCR-bench/bench_data/
 #python -m olmocr.bench.convert chatgpt:name=chatgpt45:model=gpt-4.5-preview-2025-02-27
 
 # Run gemini benchmarks
 echo "Running gemini benchmarks..."
-python -m olmocr.bench.convert gemini:name=gemini_flash2:model=gemini-2.0-flash --parallel 4
+python -m olmocr.bench.convert gemini:name=gemini_flash2:model=gemini-2.0-flash --parallel 4 --dir olmOCR-bench/bench_data/
 
 echo "Running mistral..."
 pip install mistralai
@@ -190,27 +190,27 @@ python -m olmocr.bench.convert --dir olmOCR-bench/bench_data mistral
 check_port || exit 1
 
 # olmocr_base_temp0_1 using sglang server
-start_server sglang "allenai/olmOCR-7B-0225-preview" --chat-template qwen2-vl --mem-fraction-static 0.7
-python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=olmocr_base_temp0_0:model=allenai/olmOCR-7B-0225-preview:temperature=0.0:prompt_template=fine_tune:response_template=json --repeats 1 --parallel 50
-python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=olmocr_base_temp0_1:model=allenai/olmOCR-7B-0225-preview:temperature=0.1:prompt_template=fine_tune:response_template=json --repeats 5 --parallel 50
-python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=olmocr_base_temp0_2:model=allenai/olmOCR-7B-0225-preview:temperature=0.2:prompt_template=fine_tune:response_template=json --repeats 1 --parallel 50
-python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=olmocr_base_temp0_3:model=allenai/olmOCR-7B-0225-preview:temperature=0.3:prompt_template=fine_tune:response_template=json --repeats 1 --parallel 50
-python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=olmocr_base_temp0_4:model=allenai/olmOCR-7B-0225-preview:temperature=0.4:prompt_template=fine_tune:response_template=json --repeats 1 --parallel 50
-python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=olmocr_base_temp0_5:model=allenai/olmOCR-7B-0225-preview:temperature=0.5:prompt_template=fine_tune:response_template=json --repeats 1 --parallel 50
-python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=olmocr_base_temp0_6:model=allenai/olmOCR-7B-0225-preview:temperature=0.6:prompt_template=fine_tune:response_template=json --repeats 1 --parallel 50
-python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=olmocr_base_temp0_7:model=allenai/olmOCR-7B-0225-preview:temperature=0.7:prompt_template=fine_tune:response_template=json --repeats 5 --parallel 50
+# start_server sglang "allenai/olmOCR-7B-0225-preview" --chat-template qwen2-vl --mem-fraction-static 0.7
+# python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=olmocr_base_temp0_0:model=allenai/olmOCR-7B-0225-preview:temperature=0.0:prompt_template=fine_tune:response_template=json --repeats 1 --parallel 50
+# python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=olmocr_base_temp0_1:model=allenai/olmOCR-7B-0225-preview:temperature=0.1:prompt_template=fine_tune:response_template=json --repeats 5 --parallel 50
+# python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=olmocr_base_temp0_2:model=allenai/olmOCR-7B-0225-preview:temperature=0.2:prompt_template=fine_tune:response_template=json --repeats 1 --parallel 50
+# python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=olmocr_base_temp0_3:model=allenai/olmOCR-7B-0225-preview:temperature=0.3:prompt_template=fine_tune:response_template=json --repeats 1 --parallel 50
+# python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=olmocr_base_temp0_4:model=allenai/olmOCR-7B-0225-preview:temperature=0.4:prompt_template=fine_tune:response_template=json --repeats 1 --parallel 50
+# python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=olmocr_base_temp0_5:model=allenai/olmOCR-7B-0225-preview:temperature=0.5:prompt_template=fine_tune:response_template=json --repeats 1 --parallel 50
+# python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=olmocr_base_temp0_6:model=allenai/olmOCR-7B-0225-preview:temperature=0.6:prompt_template=fine_tune:response_template=json --repeats 1 --parallel 50
+# python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=olmocr_base_temp0_7:model=allenai/olmOCR-7B-0225-preview:temperature=0.7:prompt_template=fine_tune:response_template=json --repeats 5 --parallel 50
 
-python -m olmocr.bench.convert server:name=olmocr_base_temp0_1:model=allenai/olmOCR-7B-0225-preview:temperature=0.1:prompt_template=fine_tune:response_template=json --repeats 5 --parallel 50
-python -m olmocr.bench.convert server:name=olmocr_base_temp0_8:model=allenai/olmOCR-7B-0225-preview:temperature=0.8:prompt_template=fine_tune:response_template=json --repeats 5 --parallel 50
-stop_server
+# python -m olmocr.bench.convert server:name=olmocr_base_temp0_1:model=allenai/olmOCR-7B-0225-preview:temperature=0.1:prompt_template=fine_tune:response_template=json --repeats 5 --parallel 50
+# python -m olmocr.bench.convert server:name=olmocr_base_temp0_8:model=allenai/olmOCR-7B-0225-preview:temperature=0.8:prompt_template=fine_tune:response_template=json --repeats 5 --parallel 50
+# stop_server
 
-start_server vllm "allenai/olmOCR-7B-0225-preview"
-python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=olmocr_base_temp_vllm0_1:model=allenai/olmOCR-7B-0225-preview:temperature=0.1:prompt_template=fine_tune:response_template=json --repeats 5 --parallel 50
-python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=olmocr_base_temp_vllm0_7:model=allenai/olmOCR-7B-0225-preview:temperature=0.7:prompt_template=fine_tune:response_template=json --repeats 5 --parallel 50
+# start_server vllm "allenai/olmOCR-7B-0225-preview"
+# python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=olmocr_base_temp_vllm0_1:model=allenai/olmOCR-7B-0225-preview:temperature=0.1:prompt_template=fine_tune:response_template=json --repeats 5 --parallel 50
+# python -m olmocr.bench.convert --dir olmOCR-bench/bench_data server:name=olmocr_base_temp_vllm0_7:model=allenai/olmOCR-7B-0225-preview:temperature=0.7:prompt_template=fine_tune:response_template=json --repeats 5 --parallel 50
 
-python -m olmocr.bench.convert server:name=olmocr_base_vllm_temp0_1:model=allenai/olmOCR-7B-0225-preview:temperature=0.1:prompt_template=fine_tune:response_template=json --repeats 5 --parallel 50
-python -m olmocr.bench.convert server:name=olmocr_base_vllm_temp0_8:model=allenai/olmOCR-7B-0225-preview:temperature=0.8:prompt_template=fine_tune:response_template=json --repeats 5 --parallel 50
-stop_server
+# python -m olmocr.bench.convert server:name=olmocr_base_vllm_temp0_1:model=allenai/olmOCR-7B-0225-preview:temperature=0.1:prompt_template=fine_tune:response_template=json --repeats 5 --parallel 50
+# python -m olmocr.bench.convert server:name=olmocr_base_vllm_temp0_8:model=allenai/olmOCR-7B-0225-preview:temperature=0.8:prompt_template=fine_tune:response_template=json --repeats 5 --parallel 50
+# stop_server
 
 # Feel free to enable if you want
 # qwen2_vl_7b using sglang server
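Note: the convert commands above pass runner options as a single colon-separated argument (runner:key=value:key=value:...). A rough sketch of how such a spec decomposes is below; the helper name and logic are hypothetical illustrations of the format only, not olmocr.bench.convert's actual parser.

def parse_runner_spec(spec: str):
    """Decompose 'runner:key=value:key=value' into (runner, options).

    Hypothetical helper mirroring the argument format used above.
    """
    runner, *rest = spec.split(":")
    options = {}
    for token in rest:
        key, _, value = token.partition("=")
        options[key] = value
    return runner, options


runner, opts = parse_runner_spec(
    "server:name=olmocr_base_temp0_1:model=allenai/olmOCR-7B-0225-preview:temperature=0.1"
)
# runner == "server"
# opts == {"name": "olmocr_base_temp0_1",
#          "model": "allenai/olmOCR-7B-0225-preview",
#          "temperature": "0.1"}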