Flask-based review app, first attempt

This commit is contained in:
Jake Poznanski 2025-03-18 16:53:36 +00:00
parent 93450c326d
commit 4939e41154
4 changed files with 1023 additions and 7 deletions

View File

@ -16,7 +16,6 @@ Usage:
import argparse
import base64
import json
import os
import random
from typing import Dict, List, Optional, Tuple
@ -139,10 +138,11 @@ def detect_tables(pdf_path: str, page_num: int, api_key: str) -> Optional[Tuple[
parts=[
image_part,
types.Part.from_text(
text=(
"Analyze the document attached and output it in plain text. "
"Please output the tables in valid HTML format that preserves the structure and content exactly. "
"Include the complete table with all rows and columns. Make each table cell be sensible and semantically correct with the original intent of the table."
text=(
"Analyze the document attached and output it in markdown format. "
"Output equations as Latex escaped with $$. "
"Output tables in valid HTML format that preserves the structure and content exactly. "
"Output figures with just a simple markdown image placeholder."
)
),
],
@ -415,4 +415,4 @@ def main():
if __name__ == "__main__":
main()
main()

View File

@ -0,0 +1,435 @@
#!/usr/bin/env python3
"""
mine_tables.py - Extract tables from PDF documents and create table tests.
This script:
1. Takes a file containing S3 paths to PDF documents as input
2. For each PDF, extracts a random page and renders it to an image
3. Uses GPT-4o to identify tables in the rendered image
4. Extracts table content and creates table relationship tests by making a second GPT-4o request
that now includes the page image alongside the prompt (e.g., "Given cell with {cell_value}, which cell is directly to the left of it?")
5. Extracts the page from the PDF and saves it to an output folder
Usage:
python mine_tables.py --input_list path/to/s3_paths.txt --output_dir path/to/output --api_key your_openai_api_key
"""
import argparse
import base64
import json
import os
import random
from typing import Dict, List, Optional, Tuple
import boto3
import numpy as np
import pypdf
from bs4 import BeautifulSoup
from openai import OpenAI
from tqdm import tqdm
from olmocr.bench.tests import TableTest, save_tests
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.filter import PdfFilter
def download_pdf_from_s3(s3_path: str, local_path: str) -> bool:
    """
    Fetch a single object from S3 onto the local filesystem.

    Args:
        s3_path: The S3 path (s3://bucket/path/to/file.pdf)
        local_path: The local path to save the file

    Returns:
        bool: True if the object was downloaded, False on any failure
        (the error is printed, never raised).
    """
    try:
        # Split "s3://bucket/key/with/slashes" into its two components.
        bucket, key = s3_path.replace("s3://", "").split("/", 1)

        client = boto3.client("s3")

        # Make sure the destination directory exists before writing.
        os.makedirs(os.path.dirname(local_path), exist_ok=True)

        client.download_file(bucket, key, local_path)
        return True
    except Exception as e:
        print(f"Error downloading {s3_path}: {str(e)}")
        return False
def extract_page_from_pdf(input_path: str, output_path: str, page_num: int) -> bool:
    """
    Extract a specific page from a PDF and save it as a new PDF.

    Args:
        input_path: Path to the input PDF
        output_path: Path to save the extracted page
        page_num: The page number to extract (0-indexed)

    Returns:
        bool: True if extraction was successful, False otherwise
    """
    try:
        # Ensure output directory exists
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        # Read the input PDF
        reader = pypdf.PdfReader(input_path)
        # Check if page number is valid
        if page_num >= len(reader.pages):
            print(f"Page number {page_num} out of range for {input_path} with {len(reader.pages)} pages")
            return False
        # Create a new PDF with just the selected page
        writer = pypdf.PdfWriter()
        writer.add_page(reader.pages[page_num])
        # Write the output PDF
        with open(output_path, "wb") as output_file:
            writer.write(output_file)
        return True
    except Exception as e:
        # Honor the documented bool contract (and match download_pdf_from_s3):
        # report the failure and return False instead of re-raising, which
        # previously aborted the whole PDF in process_pdf().
        print(f"Error extracting page {page_num} from {input_path}: {str(e)}")
        return False
def detect_tables(pdf_path: str, page_num: int, api_key: str) -> Optional[Tuple[List[np.ndarray], str]]:
    """
    Use GPT-4o to detect tables in a rendered PDF page.

    The page is rendered to a PNG, sent to GPT-4o with a prompt asking for a
    markdown transcription with tables as HTML, and any <table> elements in
    the response are parsed into 2D numpy string arrays.

    Args:
        pdf_path: Path to the PDF file
        page_num: The page number to analyze (0-indexed)
        api_key: OpenAI API key

    Returns:
        Optional[Tuple[List[np.ndarray], str]]:
            A tuple with a list of detected tables (as numpy arrays) and the
            base64 string of the rendered page image.
            Returns None if rendering fails, the API call fails, or the
            response contains no tables.
    """
    # Initialize OpenAI client
    client = OpenAI(api_key=api_key)
    model = "gpt-4o"
    # Render the PDF page as an image (render_pdf_to_base64png is 1-indexed)
    try:
        image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num + 1, target_longest_image_dim=2048)
    except Exception as e:
        print(f"Error rendering PDF page: {str(e)}")
        return None
    # Prepare prompt for GPT-4o to extract tables
    try:
        # Call OpenAI API with the page image plus the transcription prompt.
        response = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{image_base64}",
                                "detail": "high"
                            }
                        },
                        {
                            "type": "text",
                            "text": (
                                "Analyze the document attached and output it in markdown format. "
                                "Output equations as Latex escaped with $$. "
                                "Output tables in valid HTML format that preserves the structure and content exactly. "
                                "Output figures with just a simple markdown image placeholder."
                            )
                        }
                    ]
                }
            ],
            temperature=0.2,
        )
        if not response.choices or len(response.choices) == 0:
            print(f"No response generated for {pdf_path} page {page_num}")
            return None
        # Parse the response
        response_text = response.choices[0].message.content
        # NOTE(review): debug print of the full model output — consider
        # removing or putting behind a verbosity flag.
        print(response_text)
        # Parse tables from HTML embedded in the markdown response.
        parsed_tables = []
        soup = BeautifulSoup(response_text, "html.parser")
        tables = soup.find_all("table")
        for table in tables:
            rows = table.find_all("tr")
            table_data = []
            for row in rows:
                # Treat header and data cells uniformly.
                cells = row.find_all(["th", "td"])
                row_data = [cell.get_text().strip() for cell in cells]
                table_data.append(row_data)
            # Ensure all rows have the same number of columns by right-padding
            # short rows with empty strings so np.array yields a 2D grid.
            if table_data:
                max_cols = max(len(row) for row in table_data)
                padded_data = [row + [""] * (max_cols - len(row)) for row in table_data]
                table_array = np.array(padded_data)
                parsed_tables.append(table_array)
        # Return both the parsed tables and the rendered image (base64 string)
        return (parsed_tables, image_base64) if parsed_tables else None
    except Exception as e:
        print(f"Error detecting tables in {pdf_path} page {page_num}: {str(e)}")
        return None
def generate_table_tests(tables: List[np.ndarray], pdf_image: str, api_key: str, max_tests_per_table: int = 3) -> List[Dict]:
    """
    Generate table tests from the detected tables by making a second GPT-4o
    request for each candidate cell.

    For each candidate cell in a table, the function selects one valid
    relationship (e.g., "left", "up", "top_heading", etc.) and sends a prompt
    to GPT-4o including the page image. For example:
    "Given a cell in a table with value 'XYZ', please answer: which cell is
    directly to the left of it? Provide only the cell's text."

    Args:
        tables: List of tables as numpy arrays of strings (rows x cols)
        pdf_image: Base64 string of the rendered page image
        api_key: OpenAI API key to use for generating relationship tests
        max_tests_per_table: Maximum number of tests to generate per table

    Returns:
        List of dicts of the form {"cell": <value>, <relationship>: <answer>},
        at most max_tests_per_table per table. Results are nondeterministic
        (random cell sampling + model answers).
    """
    tests = []
    # Initialize OpenAI client for test queries
    client = OpenAI(api_key=api_key)
    model = "gpt-4o"
    # Mapping for relationship prompts: relationship key -> question fragment.
    prompt_map = {
        "up": "which cell is directly above it?",
        "down": "which cell is directly below it?",
        "left": "which cell is directly to the left of it?",
        "right": "which cell is directly to the right of it?",
        "top_heading": "what is the top heading for this cell?",
        "left_heading": "what is the left heading for this cell?",
    }
    for table in tables:
        rows, cols = table.shape
        if table.size == 0 or rows < 2 or cols < 2:
            continue  # Skip tables that are too small
        # Try up to 3x max_tests_per_table candidate cells.
        # NOTE(review): sampling with replacement, so the same cell may be
        # drawn (and queried) more than once.
        candidate_positions = []
        for _ in range(max_tests_per_table * 3):
            row = random.randint(0, rows - 1)
            col = random.randint(0, cols - 1)
            # Skip empty cells — a blank value makes a useless prompt.
            if not table[row, col].strip():
                continue
            candidate_positions.append((row, col))
        random.shuffle(candidate_positions)
        tests_for_this_table = 0
        for row, col in candidate_positions:
            if tests_for_this_table >= max_tests_per_table:
                break
            cell_value = table[row, col].strip()
            # Determine valid relationship types based on candidate's position
            # (e.g. a cell in the first row has nothing "up" or above it).
            valid_relationships = []
            if row > 0:
                valid_relationships.append("up")
            if row < rows - 1:
                valid_relationships.append("down")
            if col > 0:
                valid_relationships.append("left")
            if col < cols - 1:
                valid_relationships.append("right")
            if row > 0:
                valid_relationships.append("top_heading")
            if col > 0:
                valid_relationships.append("left_heading")
            if not valid_relationships:
                continue
            relationship = random.choice(valid_relationships)
            prompt = (
                f"Given a cell in a table with value '{cell_value}', please answer: "
                f"{prompt_map[relationship]} Provide only the cell's text or output 'null' if there is not a matching cell."
            )
            try:
                # One API call per candidate cell, with the page image attached
                # so the model can read the table layout.
                response = client.chat.completions.create(
                    model=model,
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "image_url",
                                    "image_url": {
                                        "url": f"data:image/png;base64,{pdf_image}",
                                        "detail": "high"
                                    }
                                },
                                {
                                    "type": "text",
                                    "text": prompt
                                }
                            ]
                        }
                    ],
                    temperature=0.2,
                )
                if not response.choices or len(response.choices) == 0:
                    continue
                answer_text = response.choices[0].message.content.strip()
                # Only keep definite answers; "null" means no matching cell.
                if answer_text and "null" not in answer_text:
                    test_data = {"cell": cell_value, relationship: answer_text}
                    tests.append(test_data)
                    tests_for_this_table += 1
            except Exception as e:
                print(f"Error querying GPT-4o for cell '{cell_value}' and relationship '{relationship}': {str(e)}")
    return tests
def process_pdf(s3_path: str, temp_dir: str, output_dir: str, api_key: str, tests: List[TableTest]) -> None:
    """
    Process a single PDF from S3.

    Downloads the PDF, skips it if PdfFilter rejects it, then visits its pages
    in random order until one page yields tables and generated tests. That
    page is extracted into output_dir/pdfs/ and the resulting TableTest
    objects are appended to *tests* (mutated in place). At most one page per
    PDF is used; the downloaded temp file is always removed.

    Args:
        s3_path: S3 path to the PDF
        temp_dir: Directory for temporary files
        output_dir: Directory for output files
        api_key: OpenAI API key
        tests: List to append tests to (mutated in place)
    """
    # Extract filename from S3 path
    pdf_filename = os.path.basename(s3_path)
    local_pdf_path = os.path.join(temp_dir, pdf_filename)
    # Download PDF from S3
    if not download_pdf_from_s3(s3_path, local_pdf_path):
        return
    # Skip documents rejected by PdfFilter (criteria defined in olmocr.filter).
    pdf_filter = PdfFilter()
    if pdf_filter.filter_out_pdf(local_pdf_path):
        print(f"Filtering out {pdf_filename}")
        return
    try:
        # Read the PDF to get the number of pages
        reader = pypdf.PdfReader(local_pdf_path)
        num_pages = len(reader.pages)
        if num_pages == 0:
            print(f"PDF {pdf_filename} has no pages")
            return
        # Visit pages in random order so the sampled page varies between PDFs.
        all_pages = list(range(len(reader.pages)))
        random.shuffle(all_pages)
        for page_num in all_pages:
            # Detect tables and obtain the rendered image for this page
            result = detect_tables(local_pdf_path, page_num, api_key)
            if not result:
                print(f"No tables detected in {pdf_filename} page {page_num+1}")
                continue
            tables, image_base64 = result
            # Generate table tests using the new GPT-4o query approach with the page image
            table_tests_data = generate_table_tests(tables, image_base64, api_key, max_tests_per_table=5)
            if not table_tests_data:
                print(f"Could not generate valid tests for tables in {pdf_filename} page {page_num+1}")
                continue
            # Extract the page and save to output dir (filename encodes the
            # 1-indexed page so tests can reference the single-page PDF).
            pdf_basename = os.path.splitext(pdf_filename)[0]
            output_pdf_path = os.path.join(output_dir, "pdfs", f"{pdf_basename}_pg{page_num+1}.pdf")
            extract_page_from_pdf(local_pdf_path, output_pdf_path, page_num)
            # Create table tests
            for i, test_data in enumerate(table_tests_data):
                test_id = f"{pdf_basename}_pg{page_num+1}_table_{i:02d}"
                test = TableTest(
                    id=test_id,
                    pdf=f"{pdf_basename}_pg{page_num+1}.pdf",
                    page=1,  # The extracted PDF has only one page
                    type="table",
                    cell=test_data["cell"],
                    up=test_data.get("up", None),
                    down=test_data.get("down", None),
                    left=test_data.get("left", None),
                    right=test_data.get("right", None),
                    top_heading=test_data.get("top_heading", None),
                    left_heading=test_data.get("left_heading", None),
                )
                tests.append(test)
            print(f"Processed {pdf_filename} page {page_num+1}, found {len(tables)} tables, created {len(table_tests_data)} tests")
            return  # Process only one page per PDF
    except Exception as e:
        print(f"Error processing {pdf_filename}: {str(e)}")
    finally:
        # Always remove the downloaded temp copy, even on error or early return.
        if os.path.exists(local_pdf_path):
            os.remove(local_pdf_path)
def main():
    """CLI driver: read S3 paths, mine table tests from each PDF, and persist results."""
    arg_parser = argparse.ArgumentParser(description="Extract tables from PDF documents and create table tests")
    arg_parser.add_argument("--input_list", required=True, help="Path to a file containing S3 paths to PDFs")
    arg_parser.add_argument("--output_dir", required=True, help="Directory to store extracted pages and tests")
    arg_parser.add_argument("--api_key", help="OpenAI API key (if not provided, will use OPENAI_API_KEY environment variable)")
    arg_parser.add_argument("--temp_dir", default="/tmp/mine_tables", help="Directory for temporary files")
    arg_parser.add_argument("--max_tests", type=int, default=100, help="Maximum number of tests to generate")
    opts = arg_parser.parse_args()

    # Resolve the API key: CLI flag wins, environment variable is the fallback.
    api_key = opts.api_key or os.environ.get("OPENAI_API_KEY")
    if not api_key:
        print("Error: OpenAI API key not provided. Use --api_key or set OPENAI_API_KEY environment variable.")
        return

    os.makedirs(opts.temp_dir, exist_ok=True)
    os.makedirs(os.path.join(opts.output_dir, "pdfs"), exist_ok=True)

    # Read the work list, dropping blank lines.
    with open(opts.input_list, "r") as fh:
        s3_paths = [stripped for stripped in (raw.strip() for raw in fh) if stripped]
    print(f"Found {len(s3_paths)} PDF paths in input list")

    tests = []
    output_jsonl = os.path.join(opts.output_dir, "table_tests.jsonl")
    for s3_path in tqdm(s3_paths, desc="Processing PDFs"):
        process_pdf(s3_path, opts.temp_dir, opts.output_dir, api_key, tests)
        # Checkpoint after every PDF so partial progress survives interruption.
        if tests:
            save_tests(tests, output_jsonl)
        if len(tests) >= opts.max_tests:
            print(f"Reached maximum number of tests ({opts.max_tests}), stopping")
            break
    print(f"Saved {len(tests)} table tests to {os.path.join(opts.output_dir, 'table_tests.jsonl')}")
# Script entry point.
if __name__ == "__main__":
    main()

581
olmocr/bench/review_app.py Normal file
View File

@ -0,0 +1,581 @@
#!/usr/bin/env python3
import argparse
import json
import os
import shutil
import sys
import tempfile
from collections import defaultdict
from typing import Dict, List, Optional, Tuple
import flask
from flask import Flask, render_template, request, jsonify, redirect, url_for
from werkzeug.utils import secure_filename
from olmocr.data.renderpdf import render_pdf_to_base64png
from . import tests
app = Flask(__name__)

# Global state (single-user review tool, so module-level mutable state is used)
DATASET_DIR = ""    # Root of the dataset: contains table_tests.jsonl and a pdfs/ folder
CURRENT_PDF = None  # Filename of the PDF being reviewed; None means "pick next unchecked"
PDF_TESTS = {}      # Mapping of pdf filename -> list of test dicts from the JSONL file
ALL_PDFS = []       # Ordered list of pdf filenames used for prev/next/goto navigation
def find_next_unchecked_pdf() -> Optional[str]:
    """Return the first PDF (in display order) that still has an unreviewed test, or None."""
    global PDF_TESTS, ALL_PDFS
    for name in ALL_PDFS:
        # A test is unreviewed when its "checked" field is absent/None.
        if any(t.get("checked") is None for t in PDF_TESTS[name]):
            return name
    return None
def save_dataset(jsonl_file: str) -> None:
    """Save the tests to a JSONL file, using a temp file for an atomic write.

    Flattens PDF_TESTS into one list and writes every test as a JSON line to a
    temp file, then atomically renames it over *jsonl_file*.

    Args:
        jsonl_file: Destination path of the JSONL dataset file.
    """
    global PDF_TESTS
    # Flatten all tests
    all_tests = []
    for pdf_tests in PDF_TESTS.values():
        all_tests.extend(pdf_tests)
    # Create the temp file in the destination directory: a rename is only
    # atomic on the same filesystem, and the default temp dir may be on a
    # different one (previously this could degrade to a non-atomic copy).
    target_dir = os.path.dirname(os.path.abspath(jsonl_file))
    with tempfile.NamedTemporaryFile(mode='w', delete=False, dir=target_dir) as temp_file:
        for test in all_tests:
            temp_file.write(json.dumps(test) + "\n")
    # Atomic replace
    os.replace(temp_file.name, jsonl_file)
@app.route('/')
def index():
    """Main page displaying the current PDF and its tests.

    Picks the next PDF with unchecked tests if none is selected, renders its
    single page to a base64 PNG, and serves the review template.
    """
    global CURRENT_PDF, PDF_TESTS, DATASET_DIR
    # If no current PDF is set, find the next one with unchecked tests
    if CURRENT_PDF is None:
        CURRENT_PDF = find_next_unchecked_pdf()
    # If still no PDF, all tests have been checked
    if CURRENT_PDF is None:
        return render_template('all_done.html')
    # Get the tests for the current PDF
    current_tests = PDF_TESTS.get(CURRENT_PDF, [])
    # Render the PDF. render_pdf_to_base64png is 1-indexed (see the mining
    # script, which calls it with page_num=page_num + 1), and every dataset
    # PDF is a single extracted page — so render page 1, not 0.
    pdf_path = os.path.join(DATASET_DIR, "pdfs", CURRENT_PDF)
    base64_img = render_pdf_to_base64png(pdf_path, page_num=1)
    return render_template(
        'review.html',
        pdf_name=CURRENT_PDF,
        tests=current_tests,
        pdf_img=base64_img,
        pdf_index=ALL_PDFS.index(CURRENT_PDF) if CURRENT_PDF in ALL_PDFS else 0,
        total_pdfs=len(ALL_PDFS)
    )
@app.route('/update_test', methods=['POST'])
def update_test():
    """API endpoint: set one field on one test, then persist the dataset."""
    global PDF_TESTS, DATASET_DIR
    payload = request.json
    pdf_name = payload.get('pdf')
    test_id = payload.get('id')
    field = payload.get('field')
    value = payload.get('value')
    # Locate the first test with a matching id for this PDF and apply the edit.
    target = next((t for t in PDF_TESTS.get(pdf_name, []) if t.get('id') == test_id), None)
    if target is not None:
        target[field] = value
    # Persist the full dataset after every change.
    save_dataset(os.path.join(DATASET_DIR, "table_tests.jsonl"))
    return jsonify({"status": "success"})
@app.route('/next_pdf', methods=['POST'])
def next_pdf():
    """Advance to the PDF after the current one; fall back to the next unchecked PDF."""
    global CURRENT_PDF, ALL_PDFS
    try:
        pos = ALL_PDFS.index(CURRENT_PDF)
    except ValueError:
        pos = None
    if pos is not None and pos + 1 < len(ALL_PDFS):
        # Simple sequential advance within the list.
        CURRENT_PDF = ALL_PDFS[pos + 1]
    else:
        # At the end of the list (or current PDF unknown): jump to unreviewed work.
        CURRENT_PDF = find_next_unchecked_pdf()
    return redirect(url_for('index'))
@app.route('/prev_pdf', methods=['POST'])
def prev_pdf():
    """Step back to the previous PDF in the list; no-op at the start or if unknown."""
    global CURRENT_PDF, ALL_PDFS
    if CURRENT_PDF in ALL_PDFS:
        pos = ALL_PDFS.index(CURRENT_PDF)
        if pos:
            CURRENT_PDF = ALL_PDFS[pos - 1]
    return redirect(url_for('index'))
@app.route('/goto_pdf/<int:index>', methods=['POST'])
def goto_pdf(index):
    """Jump straight to the PDF at the given list position (ignored if out of range)."""
    global CURRENT_PDF, ALL_PDFS
    in_range = 0 <= index < len(ALL_PDFS)
    if in_range:
        CURRENT_PDF = ALL_PDFS[index]
    return redirect(url_for('index'))
def load_dataset(dataset_dir: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
    """Read table_tests.jsonl under *dataset_dir* and group its tests by PDF name.

    Blank lines are skipped; unparseable lines are reported and skipped;
    records without a 'pdf' field are dropped.

    Returns:
        (tests grouped by pdf filename, list of pdf filenames in first-seen order)

    Raises:
        FileNotFoundError: if table_tests.jsonl does not exist.
    """
    dataset_file = os.path.join(dataset_dir, "table_tests.jsonl")
    if not os.path.exists(dataset_file):
        raise FileNotFoundError(f"Dataset file not found: {dataset_file}")
    grouped = defaultdict(list)
    with open(dataset_file, "r") as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                print(f"Warning: Could not parse line as JSON: {line}")
                continue
            name = record.get('pdf')
            if name:
                grouped[name].append(record)
    # Insertion order of the defaultdict doubles as the navigation order.
    return grouped, list(grouped)
def create_templates_directory():
    """Create templates directory for Flask if it doesn't exist.

    Writes two templates into a templates/ folder next to this module:
      * review.html   - the side-by-side PDF image / test list review page
      * all_done.html - shown once every test has been checked

    NOTE(review): both files are rewritten on every startup, so manual edits
    to the generated templates will be lost.
    """
    templates_dir = os.path.join(os.path.dirname(__file__), 'templates')
    os.makedirs(templates_dir, exist_ok=True)
    # Create review template
    review_template = os.path.join(templates_dir, 'review.html')
    with open(review_template, 'w') as f:
        f.write("""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>PDF Test Review</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            margin: 0;
            padding: 20px;
            background-color: #f5f5f5;
        }
        .container {
            max-width: 1920px;
            margin: 0 auto;
            display: flex;
            flex-direction: row;
        }
        h1 {
            color: #333;
            margin-bottom: 20px;
        }
        .navigation {
            display: flex;
            justify-content: space-between;
            margin-bottom: 20px;
        }
        .pdf-viewer {
            flex: 1;
            padding: 20px;
            background-color: white;
            border-radius: 8px;
            box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
            margin-right: 20px;
            overflow: auto;
            max-height: calc(100vh - 100px);
        }
        .pdf-image {
            max-width: 100%;
        }
        .tests-panel {
            flex: 1;
            padding: 20px;
            background-color: white;
            border-radius: 8px;
            box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
            overflow-y: auto;
            max-height: calc(100vh - 100px);
        }
        .test-item {
            margin-bottom: 20px;
            padding: 15px;
            border: 1px solid #e0e0e0;
            border-radius: 4px;
        }
        .test-header {
            display: flex;
            justify-content: space-between;
            align-items: center;
            margin-bottom: 10px;
        }
        .test-type {
            display: inline-block;
            padding: 5px 10px;
            border-radius: 4px;
            color: white;
            font-weight: bold;
        }
        .present {
            background-color: #28a745;
        }
        .absent {
            background-color: #dc3545;
        }
        .order {
            background-color: #fd7e14;
        }
        .table {
            background-color: #17a2b8;
        }
        .math {
            background-color: #6f42c1;
        }
        .baseline {
            background-color: #4a6fa5;
        }
        .unknown {
            background-color: #6c757d;
        }
        .test-buttons {
            display: flex;
            gap: 10px;
        }
        .test-content {
            margin-bottom: 10px;
        }
        button {
            padding: 8px 16px;
            border: none;
            border-radius: 4px;
            cursor: pointer;
            font-weight: bold;
        }
        .approve-btn {
            background-color: #28a745;
            color: white;
        }
        .reject-btn {
            background-color: #dc3545;
            color: white;
        }
        .edit-btn {
            background-color: #17a2b8;
            color: white;
        }
        .next-btn, .prev-btn {
            background-color: #4a6fa5;
            color: white;
        }
        textarea {
            width: 100%;
            padding: 8px;
            border: 1px solid #ccc;
            border-radius: 4px;
            resize: vertical;
        }
        .editable {
            border: 1px dashed #ccc;
            padding: 5px;
            margin-bottom: 5px;
        }
        .status-approved {
            border-left: 5px solid #28a745;
        }
        .status-rejected {
            border-left: 5px solid #dc3545;
        }
    </style>
</head>
<body>
    <h1>PDF Test Review: {{ pdf_name }} ({{ pdf_index + 1 }}/{{ total_pdfs }})</h1>
    <div class="navigation">
        <form action="/prev_pdf" method="post">
            <button type="submit" class="prev-btn">Previous PDF</button>
        </form>
        <form action="/next_pdf" method="post">
            <button type="submit" class="next-btn">Next PDF</button>
        </form>
    </div>
    <div class="container">
        <div class="pdf-viewer">
            <img class="pdf-image" src="data:image/png;base64,{{ pdf_img }}" alt="{{ pdf_name }}">
        </div>
        <div class="tests-panel">
            <h2>Tests ({{ tests|length }})</h2>
            {% for test in tests %}
            <div class="test-item {% if test.checked == 'verified' %}status-approved{% elif test.checked == 'rejected' %}status-rejected{% endif %}" data-id="{{ test.id }}">
                <div class="test-header">
                    <span class="test-type {{ test.type }}">{{ test.type|upper }}</span>
                    <div class="test-buttons">
                        <button class="approve-btn" onclick="updateTestStatus('{{ test.pdf }}', '{{ test.id }}', 'checked', 'verified')">Approve</button>
                        <button class="reject-btn" onclick="updateTestStatus('{{ test.pdf }}', '{{ test.id }}', 'checked', 'rejected')">Reject</button>
                        <button class="edit-btn" onclick="toggleEditMode('{{ test.id }}')">Edit</button>
                    </div>
                </div>
                <div class="test-content">
                    {% if test.type == 'present' or test.type == 'absent' %}
                    <div><strong>Text:</strong> <span class="editable" data-field="text" data-id="{{ test.id }}">{{ test.text }}</span></div>
                    <div><strong>Case Sensitive:</strong> {{ test.case_sensitive }}</div>
                    {% if test.first_n %}<div><strong>First N:</strong> {{ test.first_n }}</div>{% endif %}
                    {% if test.last_n %}<div><strong>Last N:</strong> {{ test.last_n }}</div>{% endif %}
                    {% elif test.type == 'order' %}
                    <div><strong>Before:</strong> <span class="editable" data-field="before" data-id="{{ test.id }}">{{ test.before }}</span></div>
                    <div><strong>After:</strong> <span class="editable" data-field="after" data-id="{{ test.id }}">{{ test.after }}</span></div>
                    {% elif test.type == 'table' %}
                    <div><strong>Cell:</strong> <span class="editable" data-field="cell" data-id="{{ test.id }}">{{ test.cell }}</span></div>
                    {% if test.up %}<div><strong>Up:</strong> <span class="editable" data-field="up" data-id="{{ test.id }}">{{ test.up }}</span></div>{% endif %}
                    {% if test.down %}<div><strong>Down:</strong> <span class="editable" data-field="down" data-id="{{ test.id }}">{{ test.down }}</span></div>{% endif %}
                    {% if test.left %}<div><strong>Left:</strong> <span class="editable" data-field="left" data-id="{{ test.id }}">{{ test.left }}</span></div>{% endif %}
                    {% if test.right %}<div><strong>Right:</strong> <span class="editable" data-field="right" data-id="{{ test.id }}">{{ test.right }}</span></div>{% endif %}
                    {% if test.top_heading %}<div><strong>Top Heading:</strong> <span class="editable" data-field="top_heading" data-id="{{ test.id }}">{{ test.top_heading }}</span></div>{% endif %}
                    {% if test.left_heading %}<div><strong>Left Heading:</strong> <span class="editable" data-field="left_heading" data-id="{{ test.id }}">{{ test.left_heading }}</span></div>{% endif %}
                    {% elif test.type == 'math' %}
                    <div><strong>Math:</strong> <span class="editable" data-field="math" data-id="{{ test.id }}">{{ test.math }}</span></div>
                    {% endif %}
                    <div><strong>Max Diffs:</strong> {{ test.max_diffs }}</div>
                    <div><strong>Status:</strong> {{ test.checked or 'Not checked' }}</div>
                </div>
            </div>
            {% endfor %}
        </div>
    </div>
    <script>
        // Function to update test status (approve/reject)
        function updateTestStatus(pdfName, testId, field, value) {
            fetch('/update_test', {
                method: 'POST',
                headers: {
                    'Content-Type': 'application/json',
                },
                body: JSON.stringify({
                    pdf: pdfName,
                    id: testId,
                    field: field,
                    value: value
                }),
            })
            .then(response => response.json())
            .then(data => {
                // Update UI to reflect change
                const testItem = document.querySelector(`.test-item[data-id="${testId}"]`);
                testItem.classList.remove('status-approved', 'status-rejected');
                if (value === 'verified') {
                    testItem.classList.add('status-approved');
                } else if (value === 'rejected') {
                    testItem.classList.add('status-rejected');
                }
            })
            .catch(error => {
                console.error('Error updating test:', error);
            });
        }
        // Toggle edit mode for a field
        function toggleEditMode(testId) {
            const editables = document.querySelectorAll(`.editable[data-id="${testId}"]`);
            editables.forEach(editable => {
                const field = editable.dataset.field;
                const currentValue = editable.innerText;
                // Create textarea
                const textarea = document.createElement('textarea');
                textarea.value = currentValue;
                textarea.dataset.field = field;
                textarea.dataset.originalValue = currentValue;
                // Replace the span with textarea
                editable.parentNode.replaceChild(textarea, editable);
                // Focus the textarea
                textarea.focus();
                // Add blur event to save changes
                textarea.addEventListener('blur', function() {
                    const newValue = this.value;
                    const pdfName = '{{ pdf_name }}';
                    // If value changed, save it
                    if (newValue !== this.dataset.originalValue) {
                        updateTestStatus(pdfName, testId, field, newValue);
                    }
                    // Create span again
                    const span = document.createElement('span');
                    span.className = 'editable';
                    span.dataset.field = field;
                    span.dataset.id = testId;
                    span.innerText = newValue;
                    // Replace textarea with span
                    this.parentNode.replaceChild(span, this);
                });
            });
        }
    </script>
</body>
</html>""")
    # Create all done template
    all_done_template = os.path.join(templates_dir, 'all_done.html')
    with open(all_done_template, 'w') as f:
        f.write("""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>All Tests Reviewed</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            margin: 0;
            padding: 20px;
            background-color: #f5f5f5;
            display: flex;
            justify-content: center;
            align-items: center;
            height: 100vh;
            text-align: center;
        }
        .message {
            background-color: white;
            padding: 40px;
            border-radius: 8px;
            box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
        }
        h1 {
            color: #28a745;
        }
    </style>
</head>
<body>
    <div class="message">
        <h1>All Tests Reviewed!</h1>
        <p>You have completed reviewing all tests in the dataset.</p>
    </div>
</body>
</html>""")
def main():
    """Main entry point with command-line arguments.

    Validates the dataset layout, loads the tests, generates the Flask
    templates, and starts the server.

    Returns:
        int: process exit status (0 on success, 1 on setup failure).
    """
    # CURRENT_PDF must be declared global here too — without it, the
    # assignment below created a dead local and the module-level value
    # silently stayed None.
    global DATASET_DIR, PDF_TESTS, ALL_PDFS, CURRENT_PDF
    parser = argparse.ArgumentParser(description="Interactive Test Review App")
    parser.add_argument("dataset_dir", help="Path to the dataset directory containing table_tests.jsonl and pdfs/ folder")
    parser.add_argument("--port", type=int, default=5000, help="Port for the Flask app")
    parser.add_argument("--host", default="127.0.0.1", help="Host for the Flask app")
    parser.add_argument("--debug", action="store_true", help="Run Flask in debug mode")
    args = parser.parse_args()
    # Validate dataset directory
    if not os.path.isdir(args.dataset_dir):
        print(f"Error: Dataset directory not found: {args.dataset_dir}")
        return 1
    pdf_dir = os.path.join(args.dataset_dir, "pdfs")
    if not os.path.isdir(pdf_dir):
        print(f"Error: PDF directory not found: {pdf_dir}")
        return 1
    # Store dataset directory globally
    DATASET_DIR = args.dataset_dir
    # Load dataset
    try:
        PDF_TESTS, ALL_PDFS = load_dataset(args.dataset_dir)
    except Exception as e:
        print(f"Error loading dataset: {str(e)}")
        return 1
    # Create templates directory
    create_templates_directory()
    # Find first PDF with unchecked tests
    CURRENT_PDF = find_next_unchecked_pdf()
    # Start Flask app
    print(f"Starting server at http://{args.host}:{args.port}")
    app.run(host=args.host, port=args.port, debug=args.debug)
    return 0
# Run the CLI entry point and propagate its exit status to the shell.
if __name__ == "__main__":
    sys.exit(main())

View File

@ -86,10 +86,10 @@ bench = [
"sequence_align",
"syntok",
"google-genai",
"google-generativeai",
"playwright",
"mistralai",
"lxml",
"flask",
]
train = [