diff --git a/olmocr/bench/viewer.py b/olmocr/bench/viewer.py index fb4aa7d..a5d472c 100644 --- a/olmocr/bench/viewer.py +++ b/olmocr/bench/viewer.py @@ -1,86 +1,146 @@ #!/usr/bin/env python3 -import argparse import json -import os import sys +import os +import argparse from collections import defaultdict - from olmocr.data.renderpdf import render_pdf_to_base64png - def parse_rules_file(file_path): """Parse the rules file and organize rules by PDF.""" pdf_rules = defaultdict(list) - - with open(file_path, "r") as f: + + with open(file_path, 'r') as f: for line in f: line = line.strip() if not line: continue - + try: rule = json.loads(line) - if "pdf" in rule: - pdf_rules[rule["pdf"]].append(rule) + # Add checked field if it doesn't exist + if 'checked' not in rule: + rule['checked'] = None + + if 'pdf' in rule: + pdf_rules[rule['pdf']].append(rule) except json.JSONDecodeError: print(f"Warning: Could not parse line as JSON: {line}") - + return pdf_rules - -def get_rule_html(rule): - """Generate HTML representation for a rule.""" - rule_type = rule.get("type", "unknown") - - if rule_type == "present": +def get_rule_html(rule, rule_index): + """Generate HTML representation for a rule with interactive elements.""" + rule_type = rule.get('type', 'unknown') + rule_id = f"rule-{rule_index}" + + # Determine status button class based on 'checked' value + checked_status = rule.get('checked') + if checked_status == "verified": + status_class = "status-verified" + elif checked_status == "rejected": + status_class = "status-rejected" + else: + status_class = "status-unchecked" + + # Create thumbs up/down buttons + status_button = f""" +
+ + +
+ """ + + # Create HTML based on rule type + if rule_type == 'present': return f""" - + + {status_button} PRESENT - "{rule.get('text', '')}" + +
{rule.get('text', '')}
+ Threshold: {rule.get('threshold', 'N/A')} """ - elif rule_type == "absent": + elif rule_type == 'absent': return f""" - + + {status_button} ABSENT - "{rule.get('text', '')}" + +
{rule.get('text', '')}
+ Threshold: {rule.get('threshold', 'N/A')} """ - elif rule_type == "order": + elif rule_type == 'order': return f""" - + + {status_button} ORDER -

Before: "{rule.get('before', '')}"

-

After: "{rule.get('after', '')}"

+

Before: + {rule.get('before', '')} +

+

After: + {rule.get('after', '')} +

Threshold: {rule.get('threshold', 'N/A')} """ else: return f""" - + + {status_button} UNKNOWN Unknown rule type: {rule_type} """ - def generate_html(pdf_rules, rules_file_path): - """Generate the HTML page with PDF renderings and rules.""" + """Generate the HTML page with PDF renderings and interactive rules.""" # Limit to 10 unique PDFs pdf_names = list(pdf_rules.keys())[:10] - + + # Prepare rules data for JavaScript + all_rules = [] + for pdf_name in pdf_names: + all_rules.extend(pdf_rules[pdf_name]) + + rules_json = json.dumps(all_rules) + html = """ - PDF Rules Visualizer + Interactive PDF Rules Visualizer
-

PDF Rules Visualizer

+

Interactive PDF Rules Visualizer

""" - + + # Global rule index for unique IDs + rule_index = 0 + for pdf_name in pdf_names: rules = pdf_rules[pdf_name] - + # Render the PDF (first page only) from the /pdfs folder try: - pdf_path = os.path.join(os.path.dirname(rules_file_path), "pdfs", pdf_name) + pdf_path = os.path.join(os.path.dirname(rules_file_path), 'pdfs', pdf_name) base64_img = render_pdf_to_base64png(pdf_path, 0) img_html = f'{pdf_name}' except Exception as e: img_html = f'
Error rendering PDF: {str(e)}
' - + html += f"""
{pdf_name}
@@ -220,6 +360,7 @@ def generate_html(pdf_rules, rules_file_path): + @@ -227,10 +368,11 @@ def generate_html(pdf_rules, rules_file_path): """ - + for rule in rules: - html += get_rule_html(rule) - + html += get_rule_html(rule, rule_index) + rule_index += 1 + html += """
Status Type Content Parameters
@@ -238,35 +380,96 @@ def generate_html(pdf_rules, rules_file_path):
""" - - html += """ + + # Add JavaScript to manage interactivity + html += f""" + + """ - + return html - def main(): - parser = argparse.ArgumentParser(description="Generate an HTML visualization of PDF rules.") - parser.add_argument("rules_file", help="Path to the rules file (JSON lines format)") - parser.add_argument("-o", "--output", help="Output HTML file path", default="pdf_rules_visualization.html") - + parser = argparse.ArgumentParser(description='Generate an interactive HTML visualization of PDF rules.') + parser.add_argument('rules_file', help='Path to the rules file (JSON lines format)') + parser.add_argument('-o', '--output', help='Output HTML file path', default='interactive_pdf_rules.html') + args = parser.parse_args() - + if not os.path.exists(args.rules_file): print(f"Error: Rules file not found: {args.rules_file}") sys.exit(1) - + pdf_rules = parse_rules_file(args.rules_file) html = generate_html(pdf_rules, args.rules_file) - - with open(args.output, "w") as f: + + with open(args.output, 'w') as f: f.write(html) - - print(f"HTML visualization created: {args.output}") - + + print(f"Interactive HTML visualization created: {args.output}") if __name__ == "__main__": - main() + main() \ No newline at end of file