mirror of
https://github.com/allenai/olmocr.git
synced 2025-06-27 04:00:02 +00:00
Flask based review app first attempt
This commit is contained in:
parent
93450c326d
commit
4939e41154
@ -16,7 +16,6 @@ Usage:
|
||||
|
||||
import argparse
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
@ -139,10 +138,11 @@ def detect_tables(pdf_path: str, page_num: int, api_key: str) -> Optional[Tuple[
|
||||
parts=[
|
||||
image_part,
|
||||
types.Part.from_text(
|
||||
text=(
|
||||
"Analyze the document attached and output it in plain text. "
|
||||
"Please output the tables in valid HTML format that preserves the structure and content exactly. "
|
||||
"Include the complete table with all rows and columns. Make each table cell be sensible and semantically correct with the original intent of the table."
|
||||
text=(
|
||||
"Analyze the document attached and output it in markdown format. "
|
||||
"Output equations as Latex escaped with $$. "
|
||||
"Output tables in valid HTML format that preserves the structure and content exactly. "
|
||||
"Output figures with just a simple markdown image placeholder."
|
||||
)
|
||||
),
|
||||
],
|
||||
@ -415,4 +415,4 @@ def main():
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
435
olmocr/bench/miners/mine_tables_gpt.py
Normal file
435
olmocr/bench/miners/mine_tables_gpt.py
Normal file
@ -0,0 +1,435 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
mine_tables.py - Extract tables from PDF documents and create table tests.
|
||||
|
||||
This script:
|
||||
1. Takes a file containing S3 paths to PDF documents as input
|
||||
2. For each PDF, extracts a random page and renders it to an image
|
||||
3. Uses GPT-4o to identify tables in the rendered image
|
||||
4. Extracts table content and creates table relationship tests by making a second GPT-4o request
|
||||
that now includes the page image alongside the prompt (e.g., "Given cell with {cell_value}, which cell is directly to the left of it?")
|
||||
5. Extracts the page from the PDF and saves it to an output folder
|
||||
|
||||
Usage:
|
||||
python mine_tables.py --input_list path/to/s3_paths.txt --output_dir path/to/output --api_key your_openai_api_key
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import boto3
|
||||
import numpy as np
|
||||
import pypdf
|
||||
from bs4 import BeautifulSoup
|
||||
from openai import OpenAI
|
||||
from tqdm import tqdm
|
||||
|
||||
from olmocr.bench.tests import TableTest, save_tests
|
||||
from olmocr.data.renderpdf import render_pdf_to_base64png
|
||||
from olmocr.filter import PdfFilter
|
||||
|
||||
|
||||
def download_pdf_from_s3(s3_path: str, local_path: str) -> bool:
    """
    Download a PDF file from S3.

    Args:
        s3_path: The S3 path (s3://bucket/path/to/file.pdf)
        local_path: The local path to save the file

    Returns:
        bool: True if download was successful, False otherwise
    """
    try:
        # "s3://bucket/key/..." -> ("bucket", "key/...")
        without_scheme = s3_path.replace("s3://", "")
        parts = without_scheme.split("/", 1)
        bucket, key = parts[0], parts[1]

        client = boto3.client("s3")

        # The destination directory may not exist yet.
        os.makedirs(os.path.dirname(local_path), exist_ok=True)

        client.download_file(bucket, key, local_path)
        return True
    except Exception as e:
        print(f"Error downloading {s3_path}: {str(e)}")
        return False
||||
|
||||
|
||||
def extract_page_from_pdf(input_path: str, output_path: str, page_num: int) -> bool:
    """
    Extract a specific page from a PDF and save it as a new PDF.

    Args:
        input_path: Path to the input PDF
        output_path: Path to save the extracted page
        page_num: The page number to extract (0-indexed)

    Returns:
        bool: True if extraction was successful, False otherwise
    """
    try:
        # Ensure output directory exists
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Read the input PDF
        reader = pypdf.PdfReader(input_path)

        # Check if page number is valid
        if page_num >= len(reader.pages):
            print(f"Page number {page_num} out of range for {input_path} with {len(reader.pages)} pages")
            return False

        # Create a new PDF with just the selected page
        writer = pypdf.PdfWriter()
        writer.add_page(reader.pages[page_num])

        # Write the output PDF
        with open(output_path, "wb") as output_file:
            writer.write(output_file)

        return True
    except Exception as e:
        # Bug fix: this branch used to re-raise, contradicting the documented
        # "False otherwise" contract and the error handling of its sibling
        # download_pdf_from_s3. Report the error and return False instead.
        print(f"Error extracting page {page_num} from {input_path}: {str(e)}")
        return False
||||
|
||||
|
||||
def detect_tables(pdf_path: str, page_num: int, api_key: str) -> Optional[Tuple[List[np.ndarray], str]]:
    """
    Use GPT-4o to detect tables in a rendered PDF page.

    Args:
        pdf_path: Path to the PDF file
        page_num: The page number to analyze (0-indexed)
        api_key: OpenAI API key

    Returns:
        Optional[Tuple[List[np.ndarray], str]]:
            A tuple with a list of detected tables (as numpy arrays) and the base64 string
            of the rendered page image. Returns None if rendering fails, the model returns
            no/empty content, or no tables are found.
    """
    # Initialize OpenAI client
    client = OpenAI(api_key=api_key)
    model = "gpt-4o"

    # Render the PDF page as an image (render_pdf_to_base64png is 1-indexed)
    try:
        image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num + 1, target_longest_image_dim=2048)
    except Exception as e:
        print(f"Error rendering PDF page: {str(e)}")
        return None

    # Ask GPT-4o to transcribe the page; tables come back as HTML we can parse.
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{image_base64}",
                                "detail": "high"
                            }
                        },
                        {
                            "type": "text",
                            "text": (
                                "Analyze the document attached and output it in markdown format. "
                                "Output equations as Latex escaped with $$. "
                                "Output tables in valid HTML format that preserves the structure and content exactly. "
                                "Output figures with just a simple markdown image placeholder."
                            )
                        }
                    ]
                }
            ],
            temperature=0.2,
        )

        if not response.choices or len(response.choices) == 0:
            print(f"No response generated for {pdf_path} page {page_num}")
            return None

        # Parse the response. Bug fixes: message.content can be None (e.g. on a
        # refusal), so guard before parsing; the debug print of the full model
        # output has been removed.
        response_text = response.choices[0].message.content
        if not response_text:
            print(f"No response generated for {pdf_path} page {page_num}")
            return None

        # Parse every <table> element out of the HTML and normalize each to a
        # rectangular 2-D numpy array of stripped cell strings.
        parsed_tables = []
        soup = BeautifulSoup(response_text, "html.parser")
        tables = soup.find_all("table")

        for table in tables:
            rows = table.find_all("tr")
            table_data = []
            for row in rows:
                cells = row.find_all(["th", "td"])
                row_data = [cell.get_text().strip() for cell in cells]
                table_data.append(row_data)
            # Ensure all rows have the same number of columns
            if table_data:
                max_cols = max(len(row) for row in table_data)
                padded_data = [row + [""] * (max_cols - len(row)) for row in table_data]
                table_array = np.array(padded_data)
                parsed_tables.append(table_array)

        # Return both the parsed tables and the rendered image (base64 string)
        return (parsed_tables, image_base64) if parsed_tables else None

    except Exception as e:
        print(f"Error detecting tables in {pdf_path} page {page_num}: {str(e)}")
        return None
||||
|
||||
|
||||
def generate_table_tests(tables: List[np.ndarray], pdf_image: str, api_key: str, max_tests_per_table: int = 3) -> List[Dict]:
    """
    Generate table tests from the detected tables by making a second GPT-4o request for each candidate cell.

    For each candidate cell in a table, the function selects one valid relationship (e.g., "left", "up",
    "top_heading", etc.) and sends a prompt to GPT-4o including the page image. For example:
    "Given a cell in a table with value 'XYZ', please answer: which cell is directly to the left of it?
    Provide only the cell's text."

    Args:
        tables: List of tables as numpy arrays
        pdf_image: Base64 string of the rendered page image
        api_key: OpenAI API key to use for generating relationship tests
        max_tests_per_table: Maximum number of tests to generate per table

    Returns:
        List of table test dictionaries
    """
    tests = []
    # Initialize OpenAI client for test queries
    client = OpenAI(api_key=api_key)
    model = "gpt-4o"

    # Mapping for relationship prompts
    prompt_map = {
        "up": "which cell is directly above it?",
        "down": "which cell is directly below it?",
        "left": "which cell is directly to the left of it?",
        "right": "which cell is directly to the right of it?",
        "top_heading": "what is the top heading for this cell?",
        "left_heading": "what is the left heading for this cell?",
    }

    for table in tables:
        rows, cols = table.shape
        if table.size == 0 or rows < 2 or cols < 2:
            continue  # Skip tables that are too small

        # Sample up to 3x max_tests_per_table candidate cells. Bug fix: sampling
        # is with replacement, so deduplicate positions — previously the same
        # cell could be queried (and billed) multiple times, producing duplicate
        # tests for one table.
        seen = set()
        candidate_positions = []
        for _ in range(max_tests_per_table * 3):
            row = random.randint(0, rows - 1)
            col = random.randint(0, cols - 1)
            if not table[row, col].strip() or (row, col) in seen:
                continue
            seen.add((row, col))
            candidate_positions.append((row, col))

        random.shuffle(candidate_positions)
        tests_for_this_table = 0

        for row, col in candidate_positions:
            if tests_for_this_table >= max_tests_per_table:
                break

            cell_value = table[row, col].strip()
            # Determine valid relationship types based on candidate's position
            valid_relationships = []
            if row > 0:
                valid_relationships.append("up")
            if row < rows - 1:
                valid_relationships.append("down")
            if col > 0:
                valid_relationships.append("left")
            if col < cols - 1:
                valid_relationships.append("right")
            if row > 0:
                valid_relationships.append("top_heading")
            if col > 0:
                valid_relationships.append("left_heading")
            if not valid_relationships:
                continue

            relationship = random.choice(valid_relationships)
            prompt = (
                f"Given a cell in a table with value '{cell_value}', please answer: "
                f"{prompt_map[relationship]} Provide only the cell's text or output 'null' if there is not a matching cell."
            )

            try:
                response = client.chat.completions.create(
                    model=model,
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "image_url",
                                    "image_url": {
                                        "url": f"data:image/png;base64,{pdf_image}",
                                        "detail": "high"
                                    }
                                },
                                {
                                    "type": "text",
                                    "text": prompt
                                }
                            ]
                        }
                    ],
                    temperature=0.2,
                )

                if not response.choices or len(response.choices) == 0:
                    continue

                answer_text = response.choices[0].message.content.strip()
                # Keep only confident, non-null answers.
                if answer_text and "null" not in answer_text:
                    test_data = {"cell": cell_value, relationship: answer_text}
                    tests.append(test_data)
                    tests_for_this_table += 1
            except Exception as e:
                print(f"Error querying GPT-4o for cell '{cell_value}' and relationship '{relationship}': {str(e)}")

    return tests
||||
|
||||
|
||||
def process_pdf(s3_path: str, temp_dir: str, output_dir: str, api_key: str, tests: List[TableTest]) -> None:
    """
    Process a single PDF from S3: download it, walk its pages in random order,
    and for the first page that yields tables, create TableTest entries and
    save that page as a standalone one-page PDF under output_dir/pdfs.

    Args:
        s3_path: S3 path to the PDF
        temp_dir: Directory for temporary files
        output_dir: Directory for output files
        api_key: OpenAI API key
        tests: List to append tests to (mutated in place)
    """
    pdf_filename = os.path.basename(s3_path)
    local_pdf_path = os.path.join(temp_dir, pdf_filename)

    # Fetch the document; nothing to do if the download fails.
    if not download_pdf_from_s3(s3_path, local_pdf_path):
        return

    pdf_filter = PdfFilter()
    if pdf_filter.filter_out_pdf(local_pdf_path):
        print(f"Filtering out {pdf_filename}")
        return

    try:
        num_pages = len(pypdf.PdfReader(local_pdf_path).pages)
        if num_pages == 0:
            print(f"PDF {pdf_filename} has no pages")
            return

        # Visit pages in random order; stop after the first page that produces tests.
        page_order = list(range(num_pages))
        random.shuffle(page_order)

        for page_num in page_order:
            # Detect tables and obtain the rendered image for this page.
            result = detect_tables(local_pdf_path, page_num, api_key)
            if not result:
                print(f"No tables detected in {pdf_filename} page {page_num+1}")
                continue

            tables, image_base64 = result

            # Build relationship tests via the second GPT-4o pass with the page image.
            table_tests_data = generate_table_tests(tables, image_base64, api_key, max_tests_per_table=5)
            if not table_tests_data:
                print(f"Could not generate valid tests for tables in {pdf_filename} page {page_num+1}")
                continue

            # Save the matched page as its own single-page PDF.
            pdf_basename = os.path.splitext(pdf_filename)[0]
            output_pdf_path = os.path.join(output_dir, "pdfs", f"{pdf_basename}_pg{page_num+1}.pdf")
            extract_page_from_pdf(local_pdf_path, output_pdf_path, page_num)

            # Materialize one TableTest per generated relationship test.
            for i, test_data in enumerate(table_tests_data):
                tests.append(
                    TableTest(
                        id=f"{pdf_basename}_pg{page_num+1}_table_{i:02d}",
                        pdf=f"{pdf_basename}_pg{page_num+1}.pdf",
                        page=1,  # The extracted PDF has only one page
                        type="table",
                        cell=test_data["cell"],
                        up=test_data.get("up", None),
                        down=test_data.get("down", None),
                        left=test_data.get("left", None),
                        right=test_data.get("right", None),
                        top_heading=test_data.get("top_heading", None),
                        left_heading=test_data.get("left_heading", None),
                    )
                )

            print(f"Processed {pdf_filename} page {page_num+1}, found {len(tables)} tables, created {len(table_tests_data)} tests")
            return  # Process only one page per PDF

    except Exception as e:
        print(f"Error processing {pdf_filename}: {str(e)}")
    finally:
        # Always clean up the downloaded copy.
        if os.path.exists(local_pdf_path):
            os.remove(local_pdf_path)
||||
|
||||
|
||||
def main():
    """CLI entry point: mine table tests from a list of S3-hosted PDFs."""
    parser = argparse.ArgumentParser(description="Extract tables from PDF documents and create table tests")
    parser.add_argument("--input_list", required=True, help="Path to a file containing S3 paths to PDFs")
    parser.add_argument("--output_dir", required=True, help="Directory to store extracted pages and tests")
    parser.add_argument("--api_key", help="OpenAI API key (if not provided, will use OPENAI_API_KEY environment variable)")
    parser.add_argument("--temp_dir", default="/tmp/mine_tables", help="Directory for temporary files")
    parser.add_argument("--max_tests", type=int, default=100, help="Maximum number of tests to generate")
    args = parser.parse_args()

    # CLI flag wins; otherwise fall back to the environment.
    api_key = args.api_key or os.environ.get("OPENAI_API_KEY")
    if not api_key:
        print("Error: OpenAI API key not provided. Use --api_key or set OPENAI_API_KEY environment variable.")
        return

    os.makedirs(args.temp_dir, exist_ok=True)
    os.makedirs(os.path.join(args.output_dir, "pdfs"), exist_ok=True)

    # One S3 path per non-blank line.
    with open(args.input_list, "r") as f:
        s3_paths = [stripped for stripped in (line.strip() for line in f) if stripped]

    print(f"Found {len(s3_paths)} PDF paths in input list")
    tests = []
    for s3_path in tqdm(s3_paths, desc="Processing PDFs"):
        process_pdf(s3_path, args.temp_dir, args.output_dir, api_key, tests)

        # Checkpoint after every PDF so partial progress survives interruption.
        if tests:
            save_tests(tests, os.path.join(args.output_dir, "table_tests.jsonl"))

        if len(tests) >= args.max_tests:
            print(f"Reached maximum number of tests ({args.max_tests}), stopping")
            break

    print(f"Saved {len(tests)} table tests to {os.path.join(args.output_dir, 'table_tests.jsonl')}")


if __name__ == "__main__":
    main()
|
581
olmocr/bench/review_app.py
Normal file
581
olmocr/bench/review_app.py
Normal file
@ -0,0 +1,581 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
from collections import defaultdict
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import flask
|
||||
from flask import Flask, render_template, request, jsonify, redirect, url_for
|
||||
from werkzeug.utils import secure_filename
|
||||
|
||||
from olmocr.data.renderpdf import render_pdf_to_base64png
|
||||
from . import tests
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
# Global state
|
||||
DATASET_DIR = ""
|
||||
CURRENT_PDF = None
|
||||
PDF_TESTS = {}
|
||||
ALL_PDFS = []
|
||||
|
||||
|
||||
def find_next_unchecked_pdf() -> Optional[str]:
    """Return the first PDF (in ALL_PDFS order) that still has a test whose
    'checked' field is unset, or None when every test has been reviewed."""
    global PDF_TESTS, ALL_PDFS

    return next(
        (name for name in ALL_PDFS if any(t.get("checked") is None for t in PDF_TESTS[name])),
        None,
    )
||||
|
||||
|
||||
def save_dataset(jsonl_file: str) -> None:
    """Persist every test to *jsonl_file* (one JSON object per line), atomically.

    Writes to a named temporary file first and then moves it over the target,
    so a crash mid-write cannot leave a truncated dataset behind.
    """
    global PDF_TESTS

    # Flatten the per-PDF lists into a single sequence of test dicts.
    all_tests = [t for tests_for_pdf in PDF_TESTS.values() for t in tests_for_pdf]

    # Create temp file and write updated content
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file:
        temp_file.writelines(json.dumps(t) + "\n" for t in all_tests)

    # Atomic replace
    shutil.move(temp_file.name, jsonl_file)
||||
|
||||
|
||||
@app.route('/')
def index():
    """Main page displaying the current PDF and its tests."""
    global CURRENT_PDF, PDF_TESTS, DATASET_DIR

    # If no current PDF is set, find the next one with unchecked tests
    if CURRENT_PDF is None:
        CURRENT_PDF = find_next_unchecked_pdf()

    # If still no PDF, all tests have been checked
    if CURRENT_PDF is None:
        return render_template('all_done.html')

    # Get the tests for the current PDF
    current_tests = PDF_TESTS.get(CURRENT_PDF, [])

    # Render the PDF's first page. Bug fix: render_pdf_to_base64png is
    # 1-indexed (the mining scripts pass page_num + 1), so the first page is
    # page 1, not 0. Every reviewed PDF is a single extracted page.
    pdf_path = os.path.join(DATASET_DIR, "pdfs", CURRENT_PDF)
    base64_img = render_pdf_to_base64png(pdf_path, 1)

    return render_template(
        'review.html',
        pdf_name=CURRENT_PDF,
        tests=current_tests,
        pdf_img=base64_img,
        pdf_index=ALL_PDFS.index(CURRENT_PDF) if CURRENT_PDF in ALL_PDFS else 0,
        total_pdfs=len(ALL_PDFS)
    )
||||
|
||||
|
||||
@app.route('/update_test', methods=['POST'])
def update_test():
    """API endpoint: set one field of one test and persist the whole dataset."""
    global PDF_TESTS, DATASET_DIR

    payload = request.json
    pdf_name = payload.get('pdf')
    test_id = payload.get('id')
    field = payload.get('field')
    value = payload.get('value')

    # Locate the matching test record (if any) and apply the new field value.
    target = next((t for t in PDF_TESTS.get(pdf_name, []) if t.get('id') == test_id), None)
    if target is not None:
        target[field] = value

    # Persist after every edit so no review work is lost.
    save_dataset(os.path.join(DATASET_DIR, "table_tests.jsonl"))

    return jsonify({"status": "success"})
||||
|
||||
|
||||
@app.route('/next_pdf', methods=['POST'])
def next_pdf():
    """Move to the next PDF in the list."""
    global CURRENT_PDF, ALL_PDFS

    if CURRENT_PDF not in ALL_PDFS:
        # Unknown current PDF: fall back to the next one with unchecked tests.
        CURRENT_PDF = find_next_unchecked_pdf()
    else:
        position = ALL_PDFS.index(CURRENT_PDF)
        if position + 1 < len(ALL_PDFS):
            CURRENT_PDF = ALL_PDFS[position + 1]
        else:
            # Already at the end of the list; wrap to the next unchecked PDF.
            CURRENT_PDF = find_next_unchecked_pdf()

    return redirect(url_for('index'))
||||
|
||||
|
||||
@app.route('/prev_pdf', methods=['POST'])
def prev_pdf():
    """Move to the previous PDF in the list (no-op at the first PDF or if the
    current PDF is unknown)."""
    global CURRENT_PDF, ALL_PDFS

    if CURRENT_PDF in ALL_PDFS:
        position = ALL_PDFS.index(CURRENT_PDF)
        if position > 0:
            CURRENT_PDF = ALL_PDFS[position - 1]

    return redirect(url_for('index'))
||||
|
||||
|
||||
@app.route('/goto_pdf/<int:index>', methods=['POST'])
def goto_pdf(index):
    """Jump directly to the PDF at the given position (out-of-range indices are ignored)."""
    global CURRENT_PDF, ALL_PDFS

    in_range = 0 <= index < len(ALL_PDFS)
    if in_range:
        CURRENT_PDF = ALL_PDFS[index]

    return redirect(url_for('index'))
||||
|
||||
|
||||
def load_dataset(dataset_dir: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
    """Load tests from table_tests.jsonl in *dataset_dir*, grouped by PDF name.

    Returns:
        (pdf_tests, all_pdfs): mapping from PDF filename to its list of test
        dicts, plus the PDF names in first-seen order.

    Raises:
        FileNotFoundError: if table_tests.jsonl does not exist.
    """
    dataset_file = os.path.join(dataset_dir, "table_tests.jsonl")

    if not os.path.exists(dataset_file):
        raise FileNotFoundError(f"Dataset file not found: {dataset_file}")

    pdf_tests = defaultdict(list)

    with open(dataset_file, "r") as f:
        for raw_line in f:
            record = raw_line.strip()
            # Skip blank lines entirely.
            if not record:
                continue

            try:
                parsed = json.loads(record)
            except json.JSONDecodeError:
                print(f"Warning: Could not parse line as JSON: {record}")
                continue

            # Tests without a 'pdf' key cannot be grouped; drop them silently.
            pdf_name = parsed.get('pdf')
            if pdf_name:
                pdf_tests[pdf_name].append(parsed)

    return pdf_tests, list(pdf_tests.keys())
||||
|
||||
|
||||
def create_templates_directory():
    """Create templates directory for Flask if it doesn't exist.

    Writes two Jinja2 templates next to this module, overwriting any existing
    copies on every startup:
      - review.html: the main review UI — rendered page image on the left,
        per-test approve/reject/edit panel on the right, with inline JS that
        POSTs edits to /update_test.
      - all_done.html: shown when every test has been reviewed.
    """
    templates_dir = os.path.join(os.path.dirname(__file__), 'templates')
    os.makedirs(templates_dir, exist_ok=True)

    # Create review template
    review_template = os.path.join(templates_dir, 'review.html')
    with open(review_template, 'w') as f:
        f.write("""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>PDF Test Review</title>
<style>
body {
font-family: Arial, sans-serif;
margin: 0;
padding: 20px;
background-color: #f5f5f5;
}

.container {
max-width: 1920px;
margin: 0 auto;
display: flex;
flex-direction: row;
}

h1 {
color: #333;
margin-bottom: 20px;
}

.navigation {
display: flex;
justify-content: space-between;
margin-bottom: 20px;
}

.pdf-viewer {
flex: 1;
padding: 20px;
background-color: white;
border-radius: 8px;
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
margin-right: 20px;
overflow: auto;
max-height: calc(100vh - 100px);
}

.pdf-image {
max-width: 100%;
}

.tests-panel {
flex: 1;
padding: 20px;
background-color: white;
border-radius: 8px;
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
overflow-y: auto;
max-height: calc(100vh - 100px);
}

.test-item {
margin-bottom: 20px;
padding: 15px;
border: 1px solid #e0e0e0;
border-radius: 4px;
}

.test-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 10px;
}

.test-type {
display: inline-block;
padding: 5px 10px;
border-radius: 4px;
color: white;
font-weight: bold;
}

.present {
background-color: #28a745;
}

.absent {
background-color: #dc3545;
}

.order {
background-color: #fd7e14;
}

.table {
background-color: #17a2b8;
}

.math {
background-color: #6f42c1;
}

.baseline {
background-color: #4a6fa5;
}

.unknown {
background-color: #6c757d;
}

.test-buttons {
display: flex;
gap: 10px;
}

.test-content {
margin-bottom: 10px;
}

button {
padding: 8px 16px;
border: none;
border-radius: 4px;
cursor: pointer;
font-weight: bold;
}

.approve-btn {
background-color: #28a745;
color: white;
}

.reject-btn {
background-color: #dc3545;
color: white;
}

.edit-btn {
background-color: #17a2b8;
color: white;
}

.next-btn, .prev-btn {
background-color: #4a6fa5;
color: white;
}

textarea {
width: 100%;
padding: 8px;
border: 1px solid #ccc;
border-radius: 4px;
resize: vertical;
}

.editable {
border: 1px dashed #ccc;
padding: 5px;
margin-bottom: 5px;
}

.status-approved {
border-left: 5px solid #28a745;
}

.status-rejected {
border-left: 5px solid #dc3545;
}
</style>
</head>
<body>
<h1>PDF Test Review: {{ pdf_name }} ({{ pdf_index + 1 }}/{{ total_pdfs }})</h1>

<div class="navigation">
<form action="/prev_pdf" method="post">
<button type="submit" class="prev-btn">Previous PDF</button>
</form>
<form action="/next_pdf" method="post">
<button type="submit" class="next-btn">Next PDF</button>
</form>
</div>

<div class="container">
<div class="pdf-viewer">
<img class="pdf-image" src="data:image/png;base64,{{ pdf_img }}" alt="{{ pdf_name }}">
</div>

<div class="tests-panel">
<h2>Tests ({{ tests|length }})</h2>

{% for test in tests %}
<div class="test-item {% if test.checked == 'verified' %}status-approved{% elif test.checked == 'rejected' %}status-rejected{% endif %}" data-id="{{ test.id }}">
<div class="test-header">
<span class="test-type {{ test.type }}">{{ test.type|upper }}</span>
<div class="test-buttons">
<button class="approve-btn" onclick="updateTestStatus('{{ test.pdf }}', '{{ test.id }}', 'checked', 'verified')">Approve</button>
<button class="reject-btn" onclick="updateTestStatus('{{ test.pdf }}', '{{ test.id }}', 'checked', 'rejected')">Reject</button>
<button class="edit-btn" onclick="toggleEditMode('{{ test.id }}')">Edit</button>
</div>
</div>

<div class="test-content">
{% if test.type == 'present' or test.type == 'absent' %}
<div><strong>Text:</strong> <span class="editable" data-field="text" data-id="{{ test.id }}">{{ test.text }}</span></div>
<div><strong>Case Sensitive:</strong> {{ test.case_sensitive }}</div>
{% if test.first_n %}<div><strong>First N:</strong> {{ test.first_n }}</div>{% endif %}
{% if test.last_n %}<div><strong>Last N:</strong> {{ test.last_n }}</div>{% endif %}
{% elif test.type == 'order' %}
<div><strong>Before:</strong> <span class="editable" data-field="before" data-id="{{ test.id }}">{{ test.before }}</span></div>
<div><strong>After:</strong> <span class="editable" data-field="after" data-id="{{ test.id }}">{{ test.after }}</span></div>
{% elif test.type == 'table' %}
<div><strong>Cell:</strong> <span class="editable" data-field="cell" data-id="{{ test.id }}">{{ test.cell }}</span></div>
{% if test.up %}<div><strong>Up:</strong> <span class="editable" data-field="up" data-id="{{ test.id }}">{{ test.up }}</span></div>{% endif %}
{% if test.down %}<div><strong>Down:</strong> <span class="editable" data-field="down" data-id="{{ test.id }}">{{ test.down }}</span></div>{% endif %}
{% if test.left %}<div><strong>Left:</strong> <span class="editable" data-field="left" data-id="{{ test.id }}">{{ test.left }}</span></div>{% endif %}
{% if test.right %}<div><strong>Right:</strong> <span class="editable" data-field="right" data-id="{{ test.id }}">{{ test.right }}</span></div>{% endif %}
{% if test.top_heading %}<div><strong>Top Heading:</strong> <span class="editable" data-field="top_heading" data-id="{{ test.id }}">{{ test.top_heading }}</span></div>{% endif %}
{% if test.left_heading %}<div><strong>Left Heading:</strong> <span class="editable" data-field="left_heading" data-id="{{ test.id }}">{{ test.left_heading }}</span></div>{% endif %}
{% elif test.type == 'math' %}
<div><strong>Math:</strong> <span class="editable" data-field="math" data-id="{{ test.id }}">{{ test.math }}</span></div>
{% endif %}
<div><strong>Max Diffs:</strong> {{ test.max_diffs }}</div>
<div><strong>Status:</strong> {{ test.checked or 'Not checked' }}</div>
</div>
</div>
{% endfor %}
</div>
</div>

<script>
// Function to update test status (approve/reject)
function updateTestStatus(pdfName, testId, field, value) {
fetch('/update_test', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
pdf: pdfName,
id: testId,
field: field,
value: value
}),
})
.then(response => response.json())
.then(data => {
// Update UI to reflect change
const testItem = document.querySelector(`.test-item[data-id="${testId}"]`);
testItem.classList.remove('status-approved', 'status-rejected');

if (value === 'verified') {
testItem.classList.add('status-approved');
} else if (value === 'rejected') {
testItem.classList.add('status-rejected');
}
})
.catch(error => {
console.error('Error updating test:', error);
});
}

// Toggle edit mode for a field
function toggleEditMode(testId) {
const editables = document.querySelectorAll(`.editable[data-id="${testId}"]`);

editables.forEach(editable => {
const field = editable.dataset.field;
const currentValue = editable.innerText;

// Create textarea
const textarea = document.createElement('textarea');
textarea.value = currentValue;
textarea.dataset.field = field;
textarea.dataset.originalValue = currentValue;

// Replace the span with textarea
editable.parentNode.replaceChild(textarea, editable);

// Focus the textarea
textarea.focus();

// Add blur event to save changes
textarea.addEventListener('blur', function() {
const newValue = this.value;
const pdfName = '{{ pdf_name }}';

// If value changed, save it
if (newValue !== this.dataset.originalValue) {
updateTestStatus(pdfName, testId, field, newValue);
}

// Create span again
const span = document.createElement('span');
span.className = 'editable';
span.dataset.field = field;
span.dataset.id = testId;
span.innerText = newValue;

// Replace textarea with span
this.parentNode.replaceChild(span, this);
});
});
}
</script>
</body>
</html>""")

    # Create all done template
    all_done_template = os.path.join(templates_dir, 'all_done.html')
    with open(all_done_template, 'w') as f:
        f.write("""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>All Tests Reviewed</title>
<style>
body {
font-family: Arial, sans-serif;
margin: 0;
padding: 20px;
background-color: #f5f5f5;
display: flex;
justify-content: center;
align-items: center;
height: 100vh;
text-align: center;
}

.message {
background-color: white;
padding: 40px;
border-radius: 8px;
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
}

h1 {
color: #28a745;
}
</style>
</head>
<body>
<div class="message">
<h1>All Tests Reviewed!</h1>
<p>You have completed reviewing all tests in the dataset.</p>
</div>
</body>
</html>""")
||||
|
||||
|
||||
def main():
    """Main entry point with command-line arguments."""
    # Bug fix: CURRENT_PDF was missing from this global declaration, so the
    # assignment near the end of the function created a dead local instead of
    # initializing the module-level CURRENT_PDF used by the routes.
    global DATASET_DIR, PDF_TESTS, ALL_PDFS, CURRENT_PDF

    parser = argparse.ArgumentParser(description="Interactive Test Review App")
    parser.add_argument("dataset_dir", help="Path to the dataset directory containing table_tests.jsonl and pdfs/ folder")
    parser.add_argument("--port", type=int, default=5000, help="Port for the Flask app")
    parser.add_argument("--host", default="127.0.0.1", help="Host for the Flask app")
    parser.add_argument("--debug", action="store_true", help="Run Flask in debug mode")

    args = parser.parse_args()

    # Validate dataset directory
    if not os.path.isdir(args.dataset_dir):
        print(f"Error: Dataset directory not found: {args.dataset_dir}")
        return 1

    pdf_dir = os.path.join(args.dataset_dir, "pdfs")
    if not os.path.isdir(pdf_dir):
        print(f"Error: PDF directory not found: {pdf_dir}")
        return 1

    # Store dataset directory globally
    DATASET_DIR = args.dataset_dir

    # Load dataset
    try:
        PDF_TESTS, ALL_PDFS = load_dataset(args.dataset_dir)
    except Exception as e:
        print(f"Error loading dataset: {str(e)}")
        return 1

    # Create templates directory
    create_templates_directory()

    # Find first PDF with unchecked tests
    CURRENT_PDF = find_next_unchecked_pdf()

    # Start Flask app
    print(f"Starting server at http://{args.host}:{args.port}")
    app.run(host=args.host, port=args.port, debug=args.debug)

    return 0


if __name__ == "__main__":
    sys.exit(main())
|
@ -86,10 +86,10 @@ bench = [
|
||||
"sequence_align",
|
||||
"syntok",
|
||||
"google-genai",
|
||||
"google-generativeai",
|
||||
"playwright",
|
||||
"mistralai",
|
||||
"lxml",
|
||||
"flask",
|
||||
]
|
||||
|
||||
train = [
|
||||
|
Loading…
x
Reference in New Issue
Block a user