diff --git a/olmocr/bench/viewer.py b/olmocr/bench/viewer.py index d094aeb..a5d472c 100644 --- a/olmocr/bench/viewer.py +++ b/olmocr/bench/viewer.py @@ -18,6 +18,10 @@ def parse_rules_file(file_path): try: rule = json.loads(line) + # Add checked field if it doesn't exist + if 'checked' not in rule: + rule['checked'] = None + if 'pdf' in rule: pdf_rules[rule['pdf']].append(rule) except json.JSONDecodeError: @@ -25,40 +29,93 @@ def parse_rules_file(file_path): return pdf_rules -def get_rule_html(rule): - """Generate HTML representation for a rule.""" +def get_rule_html(rule, rule_index): + """Generate HTML representation for a rule with interactive elements.""" rule_type = rule.get('type', 'unknown') + rule_id = f"rule-{rule_index}" + # Determine status button class based on 'checked' value + checked_status = rule.get('checked') + if checked_status == "verified": + status_class = "status-verified" + elif checked_status == "rejected": + status_class = "status-rejected" + else: + status_class = "status-unchecked" + + # Create thumbs up/down buttons + status_button = f""" +
+ + +
+ """ + + # Create HTML based on rule type if rule_type == 'present': return f""" - + + {status_button} PRESENT - "{rule.get('text', '')}" + +
{rule.get('text', '')}
+ Threshold: {rule.get('threshold', 'N/A')} """ elif rule_type == 'absent': return f""" - + + {status_button} ABSENT - "{rule.get('text', '')}" + +
{rule.get('text', '')}
+ Threshold: {rule.get('threshold', 'N/A')} """ elif rule_type == 'order': return f""" - + + {status_button} ORDER -

Before: "{rule.get('before', '')}"

-

After: "{rule.get('after', '')}"

+

Before: + {rule.get('before', '')} +

+

After: + {rule.get('after', '')} +

Threshold: {rule.get('threshold', 'N/A')} """ else: return f""" - + + {status_button} UNKNOWN Unknown rule type: {rule_type} @@ -66,17 +123,24 @@ def get_rule_html(rule): """ def generate_html(pdf_rules, rules_file_path): - """Generate the HTML page with PDF renderings and rules.""" + """Generate the HTML page with PDF renderings and interactive rules.""" # Limit to 10 unique PDFs pdf_names = list(pdf_rules.keys())[:10] + # Prepare rules data for JavaScript + all_rules = [] + for pdf_name in pdf_names: + all_rules.extend(pdf_rules[pdf_name]) + + rules_json = json.dumps(all_rules) + html = """ - PDF Rules Visualizer + Interactive PDF Rules Visualizer
-

PDF Rules Visualizer

+

Interactive PDF Rules Visualizer

""" + # Global rule index for unique IDs + rule_index = 0 + for pdf_name in pdf_names: rules = pdf_rules[pdf_name] @@ -216,6 +360,7 @@ def generate_html(pdf_rules, rules_file_path): + @@ -225,7 +370,8 @@ def generate_html(pdf_rules, rules_file_path): """ for rule in rules: - html += get_rule_html(rule) + html += get_rule_html(rule, rule_index) + rule_index += 1 html += """ @@ -235,8 +381,71 @@ def generate_html(pdf_rules, rules_file_path): """ - html += """ + # Add JavaScript to manage interactivity + html += f""" + + """ @@ -244,9 +453,9 @@ def generate_html(pdf_rules, rules_file_path): return html def main(): - parser = argparse.ArgumentParser(description='Generate an HTML visualization of PDF rules.') + parser = argparse.ArgumentParser(description='Generate an interactive HTML visualization of PDF rules.') parser.add_argument('rules_file', help='Path to the rules file (JSON lines format)') - parser.add_argument('-o', '--output', help='Output HTML file path', default='pdf_rules_visualization.html') + parser.add_argument('-o', '--output', help='Output HTML file path', default='interactive_pdf_rules.html') args = parser.parse_args() @@ -260,7 +469,7 @@ def main(): with open(args.output, 'w') as f: f.write(html) - print(f"HTML visualization created: {args.output}") + print(f"Interactive HTML visualization created: {args.output}") if __name__ == "__main__": main() \ No newline at end of file
Status Type Content Parameters