mirror of
https://github.com/allenai/olmocr.git
synced 2025-06-27 04:00:02 +00:00
Flask based review app first attempt
This commit is contained in:
parent
93450c326d
commit
4939e41154
@ -16,7 +16,6 @@ Usage:
|
||||
|
||||
import argparse
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
@ -139,10 +138,11 @@ def detect_tables(pdf_path: str, page_num: int, api_key: str) -> Optional[Tuple[
|
||||
parts=[
|
||||
image_part,
|
||||
types.Part.from_text(
|
||||
text=(
|
||||
"Analyze the document attached and output it in plain text. "
|
||||
"Please output the tables in valid HTML format that preserves the structure and content exactly. "
|
||||
"Include the complete table with all rows and columns. Make each table cell be sensible and semantically correct with the original intent of the table."
|
||||
text=(
|
||||
"Analyze the document attached and output it in markdown format. "
|
||||
"Output equations as Latex escaped with $$. "
|
||||
"Output tables in valid HTML format that preserves the structure and content exactly. "
|
||||
"Output figures with just a simple markdown image placeholder."
|
||||
)
|
||||
),
|
||||
],
|
||||
@ -415,4 +415,4 @@ def main():
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
435
olmocr/bench/miners/mine_tables_gpt.py
Normal file
435
olmocr/bench/miners/mine_tables_gpt.py
Normal file
@ -0,0 +1,435 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
mine_tables.py - Extract tables from PDF documents and create table tests.
|
||||
|
||||
This script:
|
||||
1. Takes a file containing S3 paths to PDF documents as input
|
||||
2. For each PDF, extracts a random page and renders it to an image
|
||||
3. Uses GPT-4o to identify tables in the rendered image
|
||||
4. Extracts table content and creates table relationship tests by making a second GPT-4o request
|
||||
that now includes the page image alongside the prompt (e.g., "Given cell with {cell_value}, which cell is directly to the left of it?")
|
||||
5. Extracts the page from the PDF and saves it to an output folder
|
||||
|
||||
Usage:
|
||||
python mine_tables.py --input_list path/to/s3_paths.txt --output_dir path/to/output --api_key your_openai_api_key
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import boto3
|
||||
import numpy as np
|
||||
import pypdf
|
||||
from bs4 import BeautifulSoup
|
||||
from openai import OpenAI
|
||||
from tqdm import tqdm
|
||||
|
||||
from olmocr.bench.tests import TableTest, save_tests
|
||||
from olmocr.data.renderpdf import render_pdf_to_base64png
|
||||
from olmocr.filter import PdfFilter
|
||||
|
||||
|
||||
def download_pdf_from_s3(s3_path: str, local_path: str) -> bool:
    """
    Download a PDF file from S3.

    Args:
        s3_path: The S3 path (s3://bucket/path/to/file.pdf)
        local_path: The local path to save the file

    Returns:
        bool: True if download was successful, False otherwise
    """
    try:
        # "s3://bucket/key/..." -> ("bucket", "key/...")
        without_scheme = s3_path.replace("s3://", "")
        parts = without_scheme.split("/", 1)
        bucket, key = parts[0], parts[1]

        client = boto3.client("s3")

        # The destination directory may not exist yet.
        os.makedirs(os.path.dirname(local_path), exist_ok=True)

        client.download_file(bucket, key, local_path)
        return True
    except Exception as e:
        print(f"Error downloading {s3_path}: {str(e)}")
        return False
||||
|
||||
|
||||
def extract_page_from_pdf(input_path: str, output_path: str, page_num: int) -> bool:
    """
    Extract a specific page from a PDF and save it as a new PDF.

    Args:
        input_path: Path to the input PDF
        output_path: Path to save the extracted page
        page_num: The page number to extract (0-indexed)

    Returns:
        bool: True if extraction was successful, False otherwise
    """
    try:
        # Ensure output directory exists
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Read the input PDF
        reader = pypdf.PdfReader(input_path)

        # Check if page number is valid
        if page_num >= len(reader.pages):
            print(f"Page number {page_num} out of range for {input_path} with {len(reader.pages)} pages")
            return False

        # Create a new PDF with just the selected page
        writer = pypdf.PdfWriter()
        writer.add_page(reader.pages[page_num])

        # Write the output PDF
        with open(output_path, "wb") as output_file:
            writer.write(output_file)

        return True
    except Exception as e:
        # Bug fix: this branch used to re-raise, contradicting the documented
        # "False otherwise" contract and the error handling of its sibling
        # download_pdf_from_s3. Report the error and return False instead.
        print(f"Error extracting page {page_num} from {input_path}: {str(e)}")
        return False
||||
|
||||
|
||||
def detect_tables(pdf_path: str, page_num: int, api_key: str) -> Optional[Tuple[List[np.ndarray], str]]:
    """
    Use GPT-4o to detect tables in a rendered PDF page.

    Args:
        pdf_path: Path to the PDF file
        page_num: The page number to analyze (0-indexed)
        api_key: OpenAI API key

    Returns:
        Optional[Tuple[List[np.ndarray], str]]:
            A tuple with a list of detected tables (as numpy arrays) and the base64 string
            of the rendered page image. Returns None if rendering fails, the model returns
            no/empty content, or no tables are found.
    """
    # Initialize OpenAI client
    client = OpenAI(api_key=api_key)
    model = "gpt-4o"

    # Render the PDF page as an image (render_pdf_to_base64png is 1-indexed)
    try:
        image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num + 1, target_longest_image_dim=2048)
    except Exception as e:
        print(f"Error rendering PDF page: {str(e)}")
        return None

    # Ask GPT-4o to transcribe the page; tables come back as HTML we can parse.
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{image_base64}",
                                "detail": "high"
                            }
                        },
                        {
                            "type": "text",
                            "text": (
                                "Analyze the document attached and output it in markdown format. "
                                "Output equations as Latex escaped with $$. "
                                "Output tables in valid HTML format that preserves the structure and content exactly. "
                                "Output figures with just a simple markdown image placeholder."
                            )
                        }
                    ]
                }
            ],
            temperature=0.2,
        )

        if not response.choices or len(response.choices) == 0:
            print(f"No response generated for {pdf_path} page {page_num}")
            return None

        # Parse the response. Bug fixes: message.content can be None (e.g. on a
        # refusal), so guard before parsing; the debug print of the full model
        # output has been removed.
        response_text = response.choices[0].message.content
        if not response_text:
            print(f"No response generated for {pdf_path} page {page_num}")
            return None

        # Parse every <table> element out of the HTML and normalize each to a
        # rectangular 2-D numpy array of stripped cell strings.
        parsed_tables = []
        soup = BeautifulSoup(response_text, "html.parser")
        tables = soup.find_all("table")

        for table in tables:
            rows = table.find_all("tr")
            table_data = []
            for row in rows:
                cells = row.find_all(["th", "td"])
                row_data = [cell.get_text().strip() for cell in cells]
                table_data.append(row_data)
            # Ensure all rows have the same number of columns
            if table_data:
                max_cols = max(len(row) for row in table_data)
                padded_data = [row + [""] * (max_cols - len(row)) for row in table_data]
                table_array = np.array(padded_data)
                parsed_tables.append(table_array)

        # Return both the parsed tables and the rendered image (base64 string)
        return (parsed_tables, image_base64) if parsed_tables else None

    except Exception as e:
        print(f"Error detecting tables in {pdf_path} page {page_num}: {str(e)}")
        return None
||||
|
||||
|
||||
def generate_table_tests(tables: List[np.ndarray], pdf_image: str, api_key: str, max_tests_per_table: int = 3) -> List[Dict]:
    """
    Generate table tests from the detected tables by making a second GPT-4o request for each candidate cell.

    For each candidate cell in a table, the function selects one valid relationship (e.g., "left", "up",
    "top_heading", etc.) and sends a prompt to GPT-4o including the page image. For example:
    "Given a cell in a table with value 'XYZ', please answer: which cell is directly to the left of it?
    Provide only the cell's text."

    Args:
        tables: List of tables as numpy arrays
        pdf_image: Base64 string of the rendered page image
        api_key: OpenAI API key to use for generating relationship tests
        max_tests_per_table: Maximum number of tests to generate per table

    Returns:
        List of table test dictionaries
    """
    tests = []
    # Initialize OpenAI client for test queries
    client = OpenAI(api_key=api_key)
    model = "gpt-4o"

    # Mapping for relationship prompts
    prompt_map = {
        "up": "which cell is directly above it?",
        "down": "which cell is directly below it?",
        "left": "which cell is directly to the left of it?",
        "right": "which cell is directly to the right of it?",
        "top_heading": "what is the top heading for this cell?",
        "left_heading": "what is the left heading for this cell?",
    }

    for table in tables:
        rows, cols = table.shape
        if table.size == 0 or rows < 2 or cols < 2:
            continue  # Skip tables that are too small

        # Sample up to 3x max_tests_per_table candidate cells. Bug fix: sampling
        # is with replacement, so deduplicate positions — previously the same
        # cell could be queried (and billed) multiple times, producing duplicate
        # tests for one table.
        seen = set()
        candidate_positions = []
        for _ in range(max_tests_per_table * 3):
            row = random.randint(0, rows - 1)
            col = random.randint(0, cols - 1)
            if not table[row, col].strip() or (row, col) in seen:
                continue
            seen.add((row, col))
            candidate_positions.append((row, col))

        random.shuffle(candidate_positions)
        tests_for_this_table = 0

        for row, col in candidate_positions:
            if tests_for_this_table >= max_tests_per_table:
                break

            cell_value = table[row, col].strip()
            # Determine valid relationship types based on candidate's position
            valid_relationships = []
            if row > 0:
                valid_relationships.append("up")
            if row < rows - 1:
                valid_relationships.append("down")
            if col > 0:
                valid_relationships.append("left")
            if col < cols - 1:
                valid_relationships.append("right")
            if row > 0:
                valid_relationships.append("top_heading")
            if col > 0:
                valid_relationships.append("left_heading")
            if not valid_relationships:
                continue

            relationship = random.choice(valid_relationships)
            prompt = (
                f"Given a cell in a table with value '{cell_value}', please answer: "
                f"{prompt_map[relationship]} Provide only the cell's text or output 'null' if there is not a matching cell."
            )

            try:
                response = client.chat.completions.create(
                    model=model,
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "image_url",
                                    "image_url": {
                                        "url": f"data:image/png;base64,{pdf_image}",
                                        "detail": "high"
                                    }
                                },
                                {
                                    "type": "text",
                                    "text": prompt
                                }
                            ]
                        }
                    ],
                    temperature=0.2,
                )

                if not response.choices or len(response.choices) == 0:
                    continue

                answer_text = response.choices[0].message.content.strip()
                # Keep only confident, non-null answers.
                if answer_text and "null" not in answer_text:
                    test_data = {"cell": cell_value, relationship: answer_text}
                    tests.append(test_data)
                    tests_for_this_table += 1
            except Exception as e:
                print(f"Error querying GPT-4o for cell '{cell_value}' and relationship '{relationship}': {str(e)}")

    return tests
||||
|
||||
|
||||
def process_pdf(s3_path: str, temp_dir: str, output_dir: str, api_key: str, tests: List[TableTest]) -> None:
    """
    Process a single PDF from S3: download it, walk its pages in random order,
    and for the first page that yields tables, create TableTest entries and
    save that page as a standalone one-page PDF under output_dir/pdfs.

    Args:
        s3_path: S3 path to the PDF
        temp_dir: Directory for temporary files
        output_dir: Directory for output files
        api_key: OpenAI API key
        tests: List to append tests to (mutated in place)
    """
    pdf_filename = os.path.basename(s3_path)
    local_pdf_path = os.path.join(temp_dir, pdf_filename)

    # Fetch the document; nothing to do if the download fails.
    if not download_pdf_from_s3(s3_path, local_pdf_path):
        return

    pdf_filter = PdfFilter()
    if pdf_filter.filter_out_pdf(local_pdf_path):
        print(f"Filtering out {pdf_filename}")
        return

    try:
        num_pages = len(pypdf.PdfReader(local_pdf_path).pages)
        if num_pages == 0:
            print(f"PDF {pdf_filename} has no pages")
            return

        # Visit pages in random order; stop after the first page that produces tests.
        page_order = list(range(num_pages))
        random.shuffle(page_order)

        for page_num in page_order:
            # Detect tables and obtain the rendered image for this page.
            result = detect_tables(local_pdf_path, page_num, api_key)
            if not result:
                print(f"No tables detected in {pdf_filename} page {page_num+1}")
                continue

            tables, image_base64 = result

            # Build relationship tests via the second GPT-4o pass with the page image.
            table_tests_data = generate_table_tests(tables, image_base64, api_key, max_tests_per_table=5)
            if not table_tests_data:
                print(f"Could not generate valid tests for tables in {pdf_filename} page {page_num+1}")
                continue

            # Save the matched page as its own single-page PDF.
            pdf_basename = os.path.splitext(pdf_filename)[0]
            output_pdf_path = os.path.join(output_dir, "pdfs", f"{pdf_basename}_pg{page_num+1}.pdf")
            extract_page_from_pdf(local_pdf_path, output_pdf_path, page_num)

            # Materialize one TableTest per generated relationship test.
            for i, test_data in enumerate(table_tests_data):
                tests.append(
                    TableTest(
                        id=f"{pdf_basename}_pg{page_num+1}_table_{i:02d}",
                        pdf=f"{pdf_basename}_pg{page_num+1}.pdf",
                        page=1,  # The extracted PDF has only one page
                        type="table",
                        cell=test_data["cell"],
                        up=test_data.get("up", None),
                        down=test_data.get("down", None),
                        left=test_data.get("left", None),
                        right=test_data.get("right", None),
                        top_heading=test_data.get("top_heading", None),
                        left_heading=test_data.get("left_heading", None),
                    )
                )

            print(f"Processed {pdf_filename} page {page_num+1}, found {len(tables)} tables, created {len(table_tests_data)} tests")
            return  # Process only one page per PDF

    except Exception as e:
        print(f"Error processing {pdf_filename}: {str(e)}")
    finally:
        # Always clean up the downloaded copy.
        if os.path.exists(local_pdf_path):
            os.remove(local_pdf_path)
||||
|
||||
|
||||
def main():
    """CLI entry point: mine table tests from a list of S3-hosted PDFs."""
    parser = argparse.ArgumentParser(description="Extract tables from PDF documents and create table tests")
    parser.add_argument("--input_list", required=True, help="Path to a file containing S3 paths to PDFs")
    parser.add_argument("--output_dir", required=True, help="Directory to store extracted pages and tests")
    parser.add_argument("--api_key", help="OpenAI API key (if not provided, will use OPENAI_API_KEY environment variable)")
    parser.add_argument("--temp_dir", default="/tmp/mine_tables", help="Directory for temporary files")
    parser.add_argument("--max_tests", type=int, default=100, help="Maximum number of tests to generate")
    args = parser.parse_args()

    # CLI flag wins; otherwise fall back to the environment.
    api_key = args.api_key or os.environ.get("OPENAI_API_KEY")
    if not api_key:
        print("Error: OpenAI API key not provided. Use --api_key or set OPENAI_API_KEY environment variable.")
        return

    os.makedirs(args.temp_dir, exist_ok=True)
    os.makedirs(os.path.join(args.output_dir, "pdfs"), exist_ok=True)

    # One S3 path per non-blank line.
    with open(args.input_list, "r") as f:
        s3_paths = [stripped for stripped in (line.strip() for line in f) if stripped]

    print(f"Found {len(s3_paths)} PDF paths in input list")
    tests = []
    for s3_path in tqdm(s3_paths, desc="Processing PDFs"):
        process_pdf(s3_path, args.temp_dir, args.output_dir, api_key, tests)

        # Checkpoint after every PDF so partial progress survives interruption.
        if tests:
            save_tests(tests, os.path.join(args.output_dir, "table_tests.jsonl"))

        if len(tests) >= args.max_tests:
            print(f"Reached maximum number of tests ({args.max_tests}), stopping")
            break

    print(f"Saved {len(tests)} table tests to {os.path.join(args.output_dir, 'table_tests.jsonl')}")


if __name__ == "__main__":
    main()
|
581
olmocr/bench/review_app.py
Normal file
581
olmocr/bench/review_app.py
Normal file
@ -0,0 +1,581 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
from collections import defaultdict
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import flask
|
||||
from flask import Flask, render_template, request, jsonify, redirect, url_for
|
||||
from werkzeug.utils import secure_filename
|
||||
|
||||
from olmocr.data.renderpdf import render_pdf_to_base64png
|
||||
from . import tests
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
# Global state
|
||||
DATASET_DIR = ""
|
||||
CURRENT_PDF = None
|
||||
PDF_TESTS = {}
|
||||
ALL_PDFS = []
|
||||
|
||||
|
||||
def find_next_unchecked_pdf() -> Optional[str]:
    """Return the first PDF (in ALL_PDFS order) that still has a test whose
    'checked' field is unset, or None when every test has been reviewed."""
    global PDF_TESTS, ALL_PDFS

    return next(
        (name for name in ALL_PDFS if any(t.get("checked") is None for t in PDF_TESTS[name])),
        None,
    )
||||
|
||||
|
||||
def save_dataset(jsonl_file: str) -> None:
    """Persist every test to *jsonl_file* (one JSON object per line), atomically.

    Writes to a named temporary file first and then moves it over the target,
    so a crash mid-write cannot leave a truncated dataset behind.
    """
    global PDF_TESTS

    # Flatten the per-PDF lists into a single sequence of test dicts.
    all_tests = [t for tests_for_pdf in PDF_TESTS.values() for t in tests_for_pdf]

    # Create temp file and write updated content
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file:
        temp_file.writelines(json.dumps(t) + "\n" for t in all_tests)

    # Atomic replace
    shutil.move(temp_file.name, jsonl_file)
||||
|
||||
|
||||
@app.route('/')
def index():
    """Main page displaying the current PDF and its tests."""
    global CURRENT_PDF, PDF_TESTS, DATASET_DIR

    # If no current PDF is set, find the next one with unchecked tests
    if CURRENT_PDF is None:
        CURRENT_PDF = find_next_unchecked_pdf()

    # If still no PDF, all tests have been checked
    if CURRENT_PDF is None:
        return render_template('all_done.html')

    # Get the tests for the current PDF
    current_tests = PDF_TESTS.get(CURRENT_PDF, [])

    # Render the PDF's first page. Bug fix: render_pdf_to_base64png is
    # 1-indexed (the mining scripts pass page_num + 1), so the first page is
    # page 1, not 0. Every reviewed PDF is a single extracted page.
    pdf_path = os.path.join(DATASET_DIR, "pdfs", CURRENT_PDF)
    base64_img = render_pdf_to_base64png(pdf_path, 1)

    return render_template(
        'review.html',
        pdf_name=CURRENT_PDF,
        tests=current_tests,
        pdf_img=base64_img,
        pdf_index=ALL_PDFS.index(CURRENT_PDF) if CURRENT_PDF in ALL_PDFS else 0,
        total_pdfs=len(ALL_PDFS)
    )
||||
|
||||
|
||||
@app.route('/update_test', methods=['POST'])
def update_test():
    """API endpoint: set one field of one test and persist the whole dataset."""
    global PDF_TESTS, DATASET_DIR

    payload = request.json
    pdf_name = payload.get('pdf')
    test_id = payload.get('id')
    field = payload.get('field')
    value = payload.get('value')

    # Locate the matching test record (if any) and apply the new field value.
    target = next((t for t in PDF_TESTS.get(pdf_name, []) if t.get('id') == test_id), None)
    if target is not None:
        target[field] = value

    # Persist after every edit so no review work is lost.
    save_dataset(os.path.join(DATASET_DIR, "table_tests.jsonl"))

    return jsonify({"status": "success"})
||||
|
||||
|
||||
@app.route('/next_pdf', methods=['POST'])
def next_pdf():
    """Move to the next PDF in the list."""
    global CURRENT_PDF, ALL_PDFS

    if CURRENT_PDF not in ALL_PDFS:
        # Unknown current PDF: fall back to the next one with unchecked tests.
        CURRENT_PDF = find_next_unchecked_pdf()
    else:
        position = ALL_PDFS.index(CURRENT_PDF)
        if position + 1 < len(ALL_PDFS):
            CURRENT_PDF = ALL_PDFS[position + 1]
        else:
            # Already at the end of the list; wrap to the next unchecked PDF.
            CURRENT_PDF = find_next_unchecked_pdf()

    return redirect(url_for('index'))
||||
|
||||
|
||||
@app.route('/prev_pdf', methods=['POST'])
def prev_pdf():
    """Move to the previous PDF in the list (no-op at the first PDF or if the
    current PDF is unknown)."""
    global CURRENT_PDF, ALL_PDFS

    if CURRENT_PDF in ALL_PDFS:
        position = ALL_PDFS.index(CURRENT_PDF)
        if position > 0:
            CURRENT_PDF = ALL_PDFS[position - 1]

    return redirect(url_for('index'))
||||
|
||||
|
||||
@app.route('/goto_pdf/<int:index>', methods=['POST'])
def goto_pdf(index):
    """Jump directly to the PDF at the given position (out-of-range indices are ignored)."""
    global CURRENT_PDF, ALL_PDFS

    in_range = 0 <= index < len(ALL_PDFS)
    if in_range:
        CURRENT_PDF = ALL_PDFS[index]

    return redirect(url_for('index'))
||||
|
||||
|
||||
def load_dataset(dataset_dir: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
    """Load tests from table_tests.jsonl in *dataset_dir*, grouped by PDF name.

    Returns:
        (pdf_tests, all_pdfs): mapping from PDF filename to its list of test
        dicts, plus the PDF names in first-seen order.

    Raises:
        FileNotFoundError: if table_tests.jsonl does not exist.
    """
    dataset_file = os.path.join(dataset_dir, "table_tests.jsonl")

    if not os.path.exists(dataset_file):
        raise FileNotFoundError(f"Dataset file not found: {dataset_file}")

    pdf_tests = defaultdict(list)

    with open(dataset_file, "r") as f:
        for raw_line in f:
            record = raw_line.strip()
            # Skip blank lines entirely.
            if not record:
                continue

            try:
                parsed = json.loads(record)
            except json.JSONDecodeError:
                print(f"Warning: Could not parse line as JSON: {record}")
                continue

            # Tests without a 'pdf' key cannot be grouped; drop them silently.
            pdf_name = parsed.get('pdf')
            if pdf_name:
                pdf_tests[pdf_name].append(parsed)

    return pdf_tests, list(pdf_tests.keys())
||||
|
||||
|
||||
def create_templates_directory():
    """Create templates directory for Flask if it doesn't exist.

    Writes two Jinja2 templates next to this module, overwriting any existing
    copies on every startup:
      - review.html: the main review UI — rendered page image on the left,
        per-test approve/reject/edit panel on the right, with inline JS that
        POSTs edits to /update_test.
      - all_done.html: shown when every test has been reviewed.
    """
    templates_dir = os.path.join(os.path.dirname(__file__), 'templates')
    os.makedirs(templates_dir, exist_ok=True)

    # Create review template
    review_template = os.path.join(templates_dir, 'review.html')
    with open(review_template, 'w') as f:
        f.write("""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>PDF Test Review</title>
<style>
body {
font-family: Arial, sans-serif;
margin: 0;
padding: 20px;
background-color: #f5f5f5;
}

.container {
max-width: 1920px;
margin: 0 auto;
display: flex;
flex-direction: row;
}

h1 {
color: #333;
margin-bottom: 20px;
}

.navigation {
display: flex;
justify-content: space-between;
margin-bottom: 20px;
}

.pdf-viewer {
flex: 1;
padding: 20px;
background-color: white;
border-radius: 8px;
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
margin-right: 20px;
overflow: auto;
max-height: calc(100vh - 100px);
}

.pdf-image {
max-width: 100%;
}

.tests-panel {
flex: 1;
padding: 20px;
background-color: white;
border-radius: 8px;
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
overflow-y: auto;
max-height: calc(100vh - 100px);
}

.test-item {
margin-bottom: 20px;
padding: 15px;
border: 1px solid #e0e0e0;
border-radius: 4px;
}

.test-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 10px;
}

.test-type {
display: inline-block;
padding: 5px 10px;
border-radius: 4px;
color: white;
font-weight: bold;
}

.present {
background-color: #28a745;
}

.absent {
background-color: #dc3545;
}

.order {
background-color: #fd7e14;
}

.table {
background-color: #17a2b8;
}

.math {
background-color: #6f42c1;
}

.baseline {
background-color: #4a6fa5;
}

.unknown {
background-color: #6c757d;
}

.test-buttons {
display: flex;
gap: 10px;
}

.test-content {
margin-bottom: 10px;
}

button {
padding: 8px 16px;
border: none;
border-radius: 4px;
cursor: pointer;
font-weight: bold;
}

.approve-btn {
background-color: #28a745;
color: white;
}

.reject-btn {
background-color: #dc3545;
color: white;
}

.edit-btn {
background-color: #17a2b8;
color: white;
}

.next-btn, .prev-btn {
background-color: #4a6fa5;
color: white;
}

textarea {
width: 100%;
padding: 8px;
border: 1px solid #ccc;
border-radius: 4px;
resize: vertical;
}

.editable {
border: 1px dashed #ccc;
padding: 5px;
margin-bottom: 5px;
}

.status-approved {
border-left: 5px solid #28a745;
}

.status-rejected {
border-left: 5px solid #dc3545;
}
</style>
</head>
<body>
<h1>PDF Test Review: {{ pdf_name }} ({{ pdf_index + 1 }}/{{ total_pdfs }})</h1>

<div class="navigation">
<form action="/prev_pdf" method="post">
<button type="submit" class="prev-btn">Previous PDF</button>
</form>
<form action="/next_pdf" method="post">
<button type="submit" class="next-btn">Next PDF</button>
</form>
</div>

<div class="container">
<div class="pdf-viewer">
<img class="pdf-image" src="data:image/png;base64,{{ pdf_img }}" alt="{{ pdf_name }}">
</div>

<div class="tests-panel">
<h2>Tests ({{ tests|length }})</h2>

{% for test in tests %}
<div class="test-item {% if test.checked == 'verified' %}status-approved{% elif test.checked == 'rejected' %}status-rejected{% endif %}" data-id="{{ test.id }}">
<div class="test-header">
<span class="test-type {{ test.type }}">{{ test.type|upper }}</span>
<div class="test-buttons">
<button class="approve-btn" onclick="updateTestStatus('{{ test.pdf }}', '{{ test.id }}', 'checked', 'verified')">Approve</button>
<button class="reject-btn" onclick="updateTestStatus('{{ test.pdf }}', '{{ test.id }}', 'checked', 'rejected')">Reject</button>
<button class="edit-btn" onclick="toggleEditMode('{{ test.id }}')">Edit</button>
</div>
</div>

<div class="test-content">
{% if test.type == 'present' or test.type == 'absent' %}
<div><strong>Text:</strong> <span class="editable" data-field="text" data-id="{{ test.id }}">{{ test.text }}</span></div>
<div><strong>Case Sensitive:</strong> {{ test.case_sensitive }}</div>
{% if test.first_n %}<div><strong>First N:</strong> {{ test.first_n }}</div>{% endif %}
{% if test.last_n %}<div><strong>Last N:</strong> {{ test.last_n }}</div>{% endif %}
{% elif test.type == 'order' %}
<div><strong>Before:</strong> <span class="editable" data-field="before" data-id="{{ test.id }}">{{ test.before }}</span></div>
<div><strong>After:</strong> <span class="editable" data-field="after" data-id="{{ test.id }}">{{ test.after }}</span></div>
{% elif test.type == 'table' %}
<div><strong>Cell:</strong> <span class="editable" data-field="cell" data-id="{{ test.id }}">{{ test.cell }}</span></div>
{% if test.up %}<div><strong>Up:</strong> <span class="editable" data-field="up" data-id="{{ test.id }}">{{ test.up }}</span></div>{% endif %}
{% if test.down %}<div><strong>Down:</strong> <span class="editable" data-field="down" data-id="{{ test.id }}">{{ test.down }}</span></div>{% endif %}
{% if test.left %}<div><strong>Left:</strong> <span class="editable" data-field="left" data-id="{{ test.id }}">{{ test.left }}</span></div>{% endif %}
{% if test.right %}<div><strong>Right:</strong> <span class="editable" data-field="right" data-id="{{ test.id }}">{{ test.right }}</span></div>{% endif %}
{% if test.top_heading %}<div><strong>Top Heading:</strong> <span class="editable" data-field="top_heading" data-id="{{ test.id }}">{{ test.top_heading }}</span></div>{% endif %}
{% if test.left_heading %}<div><strong>Left Heading:</strong> <span class="editable" data-field="left_heading" data-id="{{ test.id }}">{{ test.left_heading }}</span></div>{% endif %}
{% elif test.type == 'math' %}
<div><strong>Math:</strong> <span class="editable" data-field="math" data-id="{{ test.id }}">{{ test.math }}</span></div>
{% endif %}
<div><strong>Max Diffs:</strong> {{ test.max_diffs }}</div>
<div><strong>Status:</strong> {{ test.checked or 'Not checked' }}</div>
</div>
</div>
{% endfor %}
</div>
</div>

<script>
// Function to update test status (approve/reject)
function updateTestStatus(pdfName, testId, field, value) {
fetch('/update_test', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
pdf: pdfName,
id: testId,
field: field,
value: value
}),
})
.then(response => response.json())
.then(data => {
// Update UI to reflect change
const testItem = document.querySelector(`.test-item[data-id="${testId}"]`);
testItem.classList.remove('status-approved', 'status-rejected');

if (value === 'verified') {
testItem.classList.add('status-approved');
} else if (value === 'rejected') {
testItem.classList.add('status-rejected');
}
})
.catch(error => {
console.error('Error updating test:', error);
});
}

// Toggle edit mode for a field
function toggleEditMode(testId) {
const editables = document.querySelectorAll(`.editable[data-id="${testId}"]`);

editables.forEach(editable => {
const field = editable.dataset.field;
const currentValue = editable.innerText;

// Create textarea
const textarea = document.createElement('textarea');
textarea.value = currentValue;
textarea.dataset.field = field;
textarea.dataset.originalValue = currentValue;

// Replace the span with textarea
editable.parentNode.replaceChild(textarea, editable);

// Focus the textarea
textarea.focus();

// Add blur event to save changes
textarea.addEventListener('blur', function() {
const newValue = this.value;
const pdfName = '{{ pdf_name }}';

// If value changed, save it
if (newValue !== this.dataset.originalValue) {
updateTestStatus(pdfName, testId, field, newValue);
}

// Create span again
const span = document.createElement('span');
span.className = 'editable';
span.dataset.field = field;
span.dataset.id = testId;
span.innerText = newValue;

// Replace textarea with span
this.parentNode.replaceChild(span, this);
});
});
}
</script>
</body>
</html>""")

    # Create all done template
    all_done_template = os.path.join(templates_dir, 'all_done.html')
    with open(all_done_template, 'w') as f:
        f.write("""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>All Tests Reviewed</title>
<style>
body {
font-family: Arial, sans-serif;
margin: 0;
padding: 20px;
background-color: #f5f5f5;
display: flex;
justify-content: center;
align-items: center;
height: 100vh;
text-align: center;
}

.message {
background-color: white;
padding: 40px;
border-radius: 8px;
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
}

h1 {
color: #28a745;
}
</style>
</head>
<body>
<div class="message">
<h1>All Tests Reviewed!</h1>
<p>You have completed reviewing all tests in the dataset.</p>
</div>
</body>
</html>""")
||||
|
||||
|
||||
def main():
    """Main entry point with command-line arguments."""
    # Bug fix: CURRENT_PDF was missing from this global declaration, so the
    # assignment near the end of the function created a dead local instead of
    # initializing the module-level CURRENT_PDF used by the routes.
    global DATASET_DIR, PDF_TESTS, ALL_PDFS, CURRENT_PDF

    parser = argparse.ArgumentParser(description="Interactive Test Review App")
    parser.add_argument("dataset_dir", help="Path to the dataset directory containing table_tests.jsonl and pdfs/ folder")
    parser.add_argument("--port", type=int, default=5000, help="Port for the Flask app")
    parser.add_argument("--host", default="127.0.0.1", help="Host for the Flask app")
    parser.add_argument("--debug", action="store_true", help="Run Flask in debug mode")

    args = parser.parse_args()

    # Validate dataset directory
    if not os.path.isdir(args.dataset_dir):
        print(f"Error: Dataset directory not found: {args.dataset_dir}")
        return 1

    pdf_dir = os.path.join(args.dataset_dir, "pdfs")
    if not os.path.isdir(pdf_dir):
        print(f"Error: PDF directory not found: {pdf_dir}")
        return 1

    # Store dataset directory globally
    DATASET_DIR = args.dataset_dir

    # Load dataset
    try:
        PDF_TESTS, ALL_PDFS = load_dataset(args.dataset_dir)
    except Exception as e:
        print(f"Error loading dataset: {str(e)}")
        return 1

    # Create templates directory
    create_templates_directory()

    # Find first PDF with unchecked tests
    CURRENT_PDF = find_next_unchecked_pdf()

    # Start Flask app
    print(f"Starting server at http://{args.host}:{args.port}")
    app.run(host=args.host, port=args.port, debug=args.debug)

    return 0


if __name__ == "__main__":
    sys.exit(main())
|
@ -86,10 +86,10 @@ bench = [
|
||||
"sequence_align",
|
||||
"syntok",
|
||||
"google-genai",
|
||||
"google-generativeai",
|
||||
"playwright",
|
||||
"mistralai",
|
||||
"lxml",
|
||||
"flask",
|
||||
]
|
||||
|
||||
train = [
|
||||
|
Loading…
x
Reference in New Issue
Block a user