added checker for old_scans_math

2025-12-04 03:02:14 +00:00 · 2025-04-23 14:37:42 -07:00 · 2025-04-23 14:37:42 -07:00 · b095be0fed
commit b095be0fed
parent 887efac133
2 changed files with 164 additions and 2 deletions
--- a/olmocr/bench/README.md
+++ b/olmocr/bench/README.md
@ -102,10 +102,10 @@ Several categories of tests have been made so far:
 ## TODO List for release
 - [ ] Check all tests for duplicates
 - [ ] Write a script to verify that all baseline tests that actually have weird unicodes have exemptions
- - [ ] Review math equations in old_scans_math.jsonl using chat gpt script
+ - [X] Review math equations in old_scans_math.jsonl using chat gpt script
 - [X] Add test category of long_texts which are still ~1 standard printed page, but with dense/small text
 - [ ] Review multicolumn_tests, make sure they are correct, clean, and don't have order tests between regions
- - [ ] Remove [] and other special symbols from old_scans
+ - [X] Remove [] and other special symbols from old_scans
 - [ ] Full review of old_scans, somehow, chatgpt or prolific
 - [ ] Adjust scoring to weight each test category equally in final score distribution
 - [ ] Double check marker inline math outputs
--- a/olmocr/bench/checker/check_old_scans_math.py
+++ b/olmocr/bench/checker/check_old_scans_math.py
@ -0,0 +1,162 @@
+import json
+import os
+import argparse
+from typing import Dict, Any
+from openai import OpenAI
+from olmocr.data.renderpdf import render_pdf_to_base64png
+
+def verify_latex_match(
+    pdf_path: str,
+    page_num: int,
+    latex_expression: str,
+    model: str = "gpt-4o-2024-08-06",
+    temperature: float = 0.1,
+    target_longest_image_dim: int = 2048,
+) -> Dict[str, Any]:
+    """
+    Verify if a LaTeX math expression matches what appears in a PDF page.
+    
+    Args:
+        pdf_path (str): Path to the PDF file
+        page_num (int): Page number to check (1-indexed)
+        latex_expression (str): LaTeX expression to verify
+        model (str): OpenAI model to use
+        temperature (float): Temperature for API call
+        target_longest_image_dim (int): Target dimension for the image
+        
+    Returns:
+        Dict with verification result
+    """
+    image_base64 = render_pdf_to_base64png(
+        pdf_path, 
+        page_num=page_num, 
+        target_longest_image_dim=target_longest_image_dim
+    )
+    
+    if not os.getenv("OPENAI_API_KEY"):
+        raise SystemExit("You must specify an OPENAI_API_KEY environment variable")
+    
+    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+    
+    prompt = f"""
+    This is a mathematical expression verification task.
+    
+    I'm showing you a page from a PDF document containing mathematical expressions.
+    
+    Please verify if the following LaTeX expression:
+    
+    {latex_expression}
+    
+    appears correctly in the document.
+    
+    Respond with a JSON object containing:
+    1. "status": "correct" or "incorrect"
+    2. "confidence": a value between 0 and 1 representing your confidence in the answer
+    3. "explanation": a brief explanation of why you believe the expression is correct or incorrect
+    
+    Focus specifically on checking if this exact mathematical expression appears in the document.
+    """
+    
+    response = client.chat.completions.create(
+        model=model,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prompt},
+                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
+                ],
+            }
+        ],
+        # temperature=temperature,
+        response_format={"type": "json_object"},
+        # max_tokens=1000,
+    )
+    raw_response = response.choices[0].message.content
+    result = json.loads(raw_response)
+    
+    return {
+        "pdf": pdf_path,
+        "math": latex_expression,
+        "status": result.get("status", "unknown"),
+        "confidence": result.get("confidence", 0),
+        "explanation": result.get("explanation", "No explanation provided")
+    }
+
+def process_jsonl_file(
+    input_jsonl_path: str,
+    output_jsonl_path: str,
+    model: str = "gpt-4o-2024-08-06", 
+    temperature: float = 0.1
+) -> None:
+    """
+    Process a JSONL file containing math expressions to verify.
+    
+    Args:
+        input_jsonl_path (str): Path to input JSONL file
+        output_jsonl_path (str): Path to output JSONL file
+        model (str): OpenAI model to use
+        temperature (float): Temperature for API call
+    """
+    processed_count = 0
+    
+    with open(output_jsonl_path, 'w') as out_file:
+        with open(input_jsonl_path, 'r') as in_file:
+            for line_num, line in enumerate(in_file, 1):
+                try:
+                    entry = json.loads(line.strip())
+                    
+                    pdf_path = entry.get("pdf")
+                    page_num = entry.get("page", 1)
+                    math_expr = entry.get("math")
+                    
+                    if not all([pdf_path, math_expr]):
+                        print(f"Line {line_num}: Skipping entry due to missing required fields")
+                        continue
+                    
+                    print(f"Line {line_num}: Processing: {pdf_path}, page {page_num}")
+                    
+                    try:
+                        result = verify_latex_match(
+                            pdf_path=pdf_path,
+                            page_num=page_num,
+                            latex_expression=math_expr,
+                            model=model,
+                            temperature=temperature
+                        )
+                        out_file.write(json.dumps(result) + '\n')
+                        processed_count += 1
+                    except Exception as e:
+                        print(f"Line {line_num}: Error processing {pdf_path}: {str(e)}")
+                        error_result = {
+                            "pdf": pdf_path,
+                            "math": math_expr,
+                            "status": "error",
+                            "explanation": str(e)
+                        }
+                        out_file.write(json.dumps(error_result) + '\n')
+                        processed_count += 1
+                        
+                except json.JSONDecodeError:
+                    print(f"Line {line_num}: Invalid JSON, skipping")
+    
+    print(f"Processed {processed_count} entries. Results saved to {output_jsonl_path}")
+
+def main():
+    parser = argparse.ArgumentParser(description="Verify LaTeX math expressions in PDFs")
+    parser.add_argument("input_jsonl", help="Path to input JSONL file")
+    parser.add_argument("output_jsonl", help="Path to output JSONL file")
+    parser.add_argument("--model", default="o4-mini-2025-04-16", help="OpenAI model to use")
+    parser.add_argument("--temperature", type=float, default=0.1, help="Temperature for API call")
+    
+    args = parser.parse_args()
+    
+    process_jsonl_file(
+        input_jsonl_path=args.input_jsonl,
+        output_jsonl_path=args.output_jsonl,
+        model=args.model,
+        temperature=args.temperature
+    )
+
+if __name__ == "__main__":
+    main()