mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-12 16:52:20 +00:00
added checker for old_scans_math
This commit is contained in:
parent
887efac133
commit
b095be0fed
@ -102,10 +102,10 @@ Several categories of tests have been made so far:
|
||||
## TODO List for release
|
||||
- [ ] Check all tests for duplicates
|
||||
- [ ] Write a script to verify that all baseline tests that actually have weird unicodes have exemptions
|
||||
- [ ] Review math equations in old_scans_math.jsonl using chat gpt script
|
||||
- [X] Review math equations in old_scans_math.jsonl using chat gpt script
|
||||
- [X] Add test category of long_texts which are still ~1 standard printed page, but with dense/small text
|
||||
- [ ] Review multicolumn_tests, make sure they are correct, clean, and don't have order tests between regions
|
||||
- [ ] Remove [] and other special symbols from old_scans
|
||||
- [X] Remove [] and other special symbols from old_scans
|
||||
- [ ] Full review of old_scans, somehow, chatgpt or prolific
|
||||
- [ ] Adjust scoring to weight each test category equally in final score distribution
|
||||
- [ ] Double check marker inline math outputs
|
||||
|
162
olmocr/bench/checker/check_old_scans_math.py
Normal file
162
olmocr/bench/checker/check_old_scans_math.py
Normal file
@ -0,0 +1,162 @@
|
||||
import json
|
||||
import os
|
||||
import argparse
|
||||
from typing import Dict, Any
|
||||
from openai import OpenAI
|
||||
from olmocr.data.renderpdf import render_pdf_to_base64png
|
||||
|
||||
def verify_latex_match(
|
||||
pdf_path: str,
|
||||
page_num: int,
|
||||
latex_expression: str,
|
||||
model: str = "gpt-4o-2024-08-06",
|
||||
temperature: float = 0.1,
|
||||
target_longest_image_dim: int = 2048,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Verify if a LaTeX math expression matches what appears in a PDF page.
|
||||
|
||||
Args:
|
||||
pdf_path (str): Path to the PDF file
|
||||
page_num (int): Page number to check (1-indexed)
|
||||
latex_expression (str): LaTeX expression to verify
|
||||
model (str): OpenAI model to use
|
||||
temperature (float): Temperature for API call
|
||||
target_longest_image_dim (int): Target dimension for the image
|
||||
|
||||
Returns:
|
||||
Dict with verification result
|
||||
"""
|
||||
image_base64 = render_pdf_to_base64png(
|
||||
pdf_path,
|
||||
page_num=page_num,
|
||||
target_longest_image_dim=target_longest_image_dim
|
||||
)
|
||||
|
||||
if not os.getenv("OPENAI_API_KEY"):
|
||||
raise SystemExit("You must specify an OPENAI_API_KEY environment variable")
|
||||
|
||||
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
prompt = f"""
|
||||
This is a mathematical expression verification task.
|
||||
|
||||
I'm showing you a page from a PDF document containing mathematical expressions.
|
||||
|
||||
Please verify if the following LaTeX expression:
|
||||
|
||||
{latex_expression}
|
||||
|
||||
appears correctly in the document.
|
||||
|
||||
Respond with a JSON object containing:
|
||||
1. "status": "correct" or "incorrect"
|
||||
2. "confidence": a value between 0 and 1 representing your confidence in the answer
|
||||
3. "explanation": a brief explanation of why you believe the expression is correct or incorrect
|
||||
|
||||
Focus specifically on checking if this exact mathematical expression appears in the document.
|
||||
"""
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": prompt},
|
||||
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
|
||||
],
|
||||
}
|
||||
],
|
||||
# temperature=temperature,
|
||||
response_format={"type": "json_object"},
|
||||
# max_tokens=1000,
|
||||
)
|
||||
raw_response = response.choices[0].message.content
|
||||
result = json.loads(raw_response)
|
||||
|
||||
return {
|
||||
"pdf": pdf_path,
|
||||
"math": latex_expression,
|
||||
"status": result.get("status", "unknown"),
|
||||
"confidence": result.get("confidence", 0),
|
||||
"explanation": result.get("explanation", "No explanation provided")
|
||||
}
|
||||
|
||||
def process_jsonl_file(
|
||||
input_jsonl_path: str,
|
||||
output_jsonl_path: str,
|
||||
model: str = "gpt-4o-2024-08-06",
|
||||
temperature: float = 0.1
|
||||
) -> None:
|
||||
"""
|
||||
Process a JSONL file containing math expressions to verify.
|
||||
|
||||
Args:
|
||||
input_jsonl_path (str): Path to input JSONL file
|
||||
output_jsonl_path (str): Path to output JSONL file
|
||||
model (str): OpenAI model to use
|
||||
temperature (float): Temperature for API call
|
||||
"""
|
||||
processed_count = 0
|
||||
|
||||
with open(output_jsonl_path, 'w') as out_file:
|
||||
with open(input_jsonl_path, 'r') as in_file:
|
||||
for line_num, line in enumerate(in_file, 1):
|
||||
try:
|
||||
entry = json.loads(line.strip())
|
||||
|
||||
pdf_path = entry.get("pdf")
|
||||
page_num = entry.get("page", 1)
|
||||
math_expr = entry.get("math")
|
||||
|
||||
if not all([pdf_path, math_expr]):
|
||||
print(f"Line {line_num}: Skipping entry due to missing required fields")
|
||||
continue
|
||||
|
||||
print(f"Line {line_num}: Processing: {pdf_path}, page {page_num}")
|
||||
|
||||
try:
|
||||
result = verify_latex_match(
|
||||
pdf_path=pdf_path,
|
||||
page_num=page_num,
|
||||
latex_expression=math_expr,
|
||||
model=model,
|
||||
temperature=temperature
|
||||
)
|
||||
out_file.write(json.dumps(result) + '\n')
|
||||
processed_count += 1
|
||||
except Exception as e:
|
||||
print(f"Line {line_num}: Error processing {pdf_path}: {str(e)}")
|
||||
error_result = {
|
||||
"pdf": pdf_path,
|
||||
"math": math_expr,
|
||||
"status": "error",
|
||||
"explanation": str(e)
|
||||
}
|
||||
out_file.write(json.dumps(error_result) + '\n')
|
||||
processed_count += 1
|
||||
|
||||
except json.JSONDecodeError:
|
||||
print(f"Line {line_num}: Invalid JSON, skipping")
|
||||
|
||||
print(f"Processed {processed_count} entries. Results saved to {output_jsonl_path}")
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Verify LaTeX math expressions in PDFs")
|
||||
parser.add_argument("input_jsonl", help="Path to input JSONL file")
|
||||
parser.add_argument("output_jsonl", help="Path to output JSONL file")
|
||||
parser.add_argument("--model", default="o4-mini-2025-04-16", help="OpenAI model to use")
|
||||
parser.add_argument("--temperature", type=float, default=0.1, help="Temperature for API call")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
process_jsonl_file(
|
||||
input_jsonl_path=args.input_jsonl,
|
||||
output_jsonl_path=args.output_jsonl,
|
||||
model=args.model,
|
||||
temperature=args.temperature
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
Loading…
x
Reference in New Issue
Block a user