diff --git a/olmocr/bench/viewer.py b/olmocr/bench/viewer.py
index fb4aa7d..a5d472c 100644
--- a/olmocr/bench/viewer.py
+++ b/olmocr/bench/viewer.py
@@ -1,86 +1,146 @@
#!/usr/bin/env python3
-import argparse
import json
-import os
import sys
+import os
+import argparse
from collections import defaultdict
-
from olmocr.data.renderpdf import render_pdf_to_base64png
-
def parse_rules_file(file_path):
    """Parse a JSON-lines rules file and group its rules by PDF.

    Each non-blank line is decoded as JSON. Lines that are not valid JSON,
    or that decode to something other than a JSON object, are reported with
    a warning on stdout and skipped so one bad line cannot abort the parse.

    Args:
        file_path: Path to the rules file (one JSON object per line).

    Returns:
        A ``defaultdict(list)`` mapping each rule's ``"pdf"`` value to the
        list of rule dicts that reference it, in file order. Every returned
        rule has a ``"checked"`` key (``None`` when the file did not provide
        one). Rules without a ``"pdf"`` key are not included.
    """
    pdf_rules = defaultdict(list)

    # JSON text is exchanged as UTF-8; do not depend on the platform default.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            # Keep the try body to the one call that can raise the error.
            try:
                rule = json.loads(line)
            except json.JSONDecodeError:
                print(f"Warning: Could not parse line as JSON: {line}")
                continue

            # Valid JSON that is not an object (number, list, string, null)
            # cannot carry rule fields; skip it instead of raising TypeError.
            if not isinstance(rule, dict):
                print(f"Warning: Skipping line that is not a JSON object: {line}")
                continue

            # Add checked field if it doesn't exist
            rule.setdefault("checked", None)

            if "pdf" in rule:
                pdf_rules[rule["pdf"]].append(rule)

    return pdf_rules
-
-def get_rule_html(rule):
- """Generate HTML representation for a rule."""
- rule_type = rule.get("type", "unknown")
-
- if rule_type == "present":
+def get_rule_html(rule, rule_index):
+ """Generate HTML representation for a rule with interactive elements."""
+ rule_type = rule.get('type', 'unknown')
+ rule_id = f"rule-{rule_index}"
+
+ # Determine status button class based on 'checked' value
+ checked_status = rule.get('checked')
+ if checked_status == "verified":
+ status_class = "status-verified"
+ elif checked_status == "rejected":
+ status_class = "status-rejected"
+ else:
+ status_class = "status-unchecked"
+
+ # Create thumbs up/down buttons
+ status_button = f"""
+
+
+
+
+ """
+
+ # Create HTML based on rule type
+ if rule_type == 'present':
return f"""
-
"""
-
def generate_html(pdf_rules, rules_file_path):
- """Generate the HTML page with PDF renderings and rules."""
+ """Generate the HTML page with PDF renderings and interactive rules."""
# Limit to 10 unique PDFs
pdf_names = list(pdf_rules.keys())[:10]
-
+
+ # Prepare rules data for JavaScript
+ all_rules = []
+ for pdf_name in pdf_names:
+ all_rules.extend(pdf_rules[pdf_name])
+
+ rules_json = json.dumps(all_rules)
+
html = """
- PDF Rules Visualizer
+ Interactive PDF Rules Visualizer
-
PDF Rules Visualizer
+
Interactive PDF Rules Visualizer
"""
-
+
+ # Global rule index for unique IDs
+ rule_index = 0
+
for pdf_name in pdf_names:
rules = pdf_rules[pdf_name]
-
+
# Render the PDF (first page only) from the /pdfs folder
try:
- pdf_path = os.path.join(os.path.dirname(rules_file_path), "pdfs", pdf_name)
+ pdf_path = os.path.join(os.path.dirname(rules_file_path), 'pdfs', pdf_name)
base64_img = render_pdf_to_base64png(pdf_path, 0)
img_html = f''
except Exception as e:
img_html = f'
"""
-
- html += """
+
+ # Add JavaScript to manage interactivity
+ html += f"""
+
+
"""
-
+
return html
-
def main():
    """Command-line entry point for the interactive rules viewer.

    Reads the rules file named on the command line, builds the HTML page
    with ``generate_html`` and writes it to the ``-o/--output`` path. Prints
    an error and exits with status 1 when the rules file does not exist.
    """
    cli = argparse.ArgumentParser(
        description="Generate an interactive HTML visualization of PDF rules.",
    )
    cli.add_argument("rules_file", help="Path to the rules file (JSON lines format)")
    cli.add_argument(
        "-o",
        "--output",
        default="interactive_pdf_rules.html",
        help="Output HTML file path",
    )
    options = cli.parse_args()
    rules_path = options.rules_file
    output_path = options.output

    # Bail out before any parsing or rendering if the input is missing.
    if not os.path.exists(rules_path):
        print(f"Error: Rules file not found: {rules_path}")
        sys.exit(1)

    # The rules path is passed on to generate_html as well as the parser.
    page = generate_html(parse_rules_file(rules_path), rules_path)

    with open(output_path, "w") as out:
        out.write(page)

    print(f"Interactive HTML visualization created: {output_path}")


if __name__ == "__main__":
    main()
\ No newline at end of file