diff --git a/pdelfin/eval/evalhtml.py b/pdelfin/eval/evalhtml.py index 17cec53..6dcc0c3 100644 --- a/pdelfin/eval/evalhtml.py +++ b/pdelfin/eval/evalhtml.py @@ -1,70 +1,88 @@ -from concurrent.futures import ThreadPoolExecutor, as_completed -from jinja2 import Template -import random import os -import subprocess +import random import tempfile import boto3 -import base64 -import io - +from concurrent.futures import ThreadPoolExecutor +from jinja2 import Template from urllib.parse import urlparse -from PIL import Image +from difflib import SequenceMatcher from tqdm import tqdm - from pdelfin.silver_data.renderpdf import render_pdf_to_base64png session = boto3.Session(profile_name='s2') s3_client = session.client('s3') +def generate_diff_html(a, b): + """ + Generates HTML with differences between strings a and b. + Additions in 'b' are highlighted in green, deletions from 'a' are highlighted in red. + """ + seq_matcher = SequenceMatcher(None, a, b) + output_html = "" + for opcode, a0, a1, b0, b1 in seq_matcher.get_opcodes(): + if opcode == 'equal': + output_html += a[a0:a1] + elif opcode == 'insert': + output_html += f"{b[b0:b1]}" + elif opcode == 'delete': + output_html += f"{a[a0:a1]}" + elif opcode == 'replace': + output_html += f"{a[a0:a1]}{b[b0:b1]}" + return output_html def process_entry(i, entry): # Randomly decide whether to display gold on the left or right if random.choice([True, False]): left_text, right_text = entry["gold_text"], entry["eval_text"] - left_alignment, right_alignment = entry["alignment"], entry["alignment"] left_class, right_class = "gold", "eval" else: left_text, right_text = entry["eval_text"], entry["gold_text"] - left_alignment, right_alignment = entry["alignment"], entry["alignment"] left_class, right_class = "eval", "gold" - # Convert newlines to

tags for proper formatting + # Generate diff for right_text compared to left_text + diff_html = generate_diff_html(left_text, right_text) + left_text = "

" + left_text.replace("\n", "

") + "

" right_text = "

" + right_text.replace("\n", "

") + "

" + diff_html = "

" + diff_html.replace("\n", "

") + "

" parsed_url = urlparse(entry["s3_path"]) bucket = parsed_url.netloc s3_key = parsed_url.path.lstrip('/') - signed_pdf_link = s3_client.generate_presigned_url("get_object", Params={"Bucket": bucket, "Key": s3_key}, ExpiresIn=604800) + signed_pdf_link = s3_client.generate_presigned_url( + "get_object", + Params={"Bucket": bucket, "Key": s3_key}, + ExpiresIn=604800 + ) with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf: pdf_path = tmp_pdf.name bucket, key = entry["s3_path"].replace("s3://", "").split('/', 1) s3_client.download_file(bucket, key, pdf_path) - - page_image_base64 = render_pdf_to_base64png(tmp_pdf.name, entry["page"], target_longest_image_dim=1024) + page_image_base64 = render_pdf_to_base64png( + tmp_pdf.name, entry["page"], target_longest_image_dim=1024 + ) return { "entry_id": i, "page_image": page_image_base64, "s3_path": entry["s3_path"], "page": entry["page"], + "alignment": entry["alignment"], "signed_pdf_link": signed_pdf_link, "left_text": left_text, "right_text": right_text, - "left_alignment": left_alignment, - "right_alignment": right_alignment, + "diff_text": diff_html, "left_class": left_class, "right_class": right_class, "gold_class": "gold" if left_class == "gold" else "eval", "eval_class": "eval" if right_class == "eval" else "gold" } - def create_review_html(data, filename="review_page.html"): # Load the Jinja2 template from the file - with open(os.path.join(os.path.dirname(__file__), "evalhtml_template.html"), "r") as f: + template_path = os.path.join(os.path.dirname(__file__), "evalhtml_template.html") + with open(template_path, "r") as f: template = Template(f.read()) entries = [] diff --git a/pdelfin/eval/evalhtml_template.html b/pdelfin/eval/evalhtml_template.html index 9b0e319..9c79039 100644 --- a/pdelfin/eval/evalhtml_template.html +++ b/pdelfin/eval/evalhtml_template.html @@ -41,6 +41,7 @@ flex-direction: column; justify-content: space-between; cursor: pointer; + position: relative; } .text-block:hover { background-color: #e0e0e0; @@ -159,6 +160,25 @@ .voting-buttons button.selected { border: 3px solid #000; } + /* for diffs */ + .added { + background-color: #d4fcdc; + } + .removed { + background-color: #fcd4d4; + text-decoration: line-through; + } + + /* Diff Toggle Styles */ + body.diffed .right-text { + display: none; + } + body.diffed .diff-text { + display: block; + } + .diff-text { + display: none; + } @@ -166,8 +186,15 @@
- - +
+ + +
+ +
+ + +
Votes
@@ -177,7 +204,7 @@
Render -
Alignment: {{ entry.left_alignment }}
+
Alignment: {{ entry.alignment }}
{{ entry.s3_path }} (Page {{ entry.page }}) @@ -190,8 +217,10 @@
{{ entry.left_text|safe }}
+
-
{{ entry.right_text|safe }}
+
{{ entry.right_text|safe }}
+
{{ entry.diff_text|safe }}
{% endfor %} @@ -221,11 +250,18 @@ overlay.classList.remove('active'); }); + // Handle Reveal Gold/Eval Toggle document.getElementById('reveal-toggle').addEventListener('change', (e) => { document.body.classList.toggle('revealed', e.target.checked); updateReveal(); }); + // Handle Diff Toggle + document.getElementById('diff-toggle').addEventListener('change', (e) => { + document.body.classList.toggle('diffed', e.target.checked); + toggleDiff(e.target.checked); + }); + // Handle text-block selections document.querySelectorAll('.text-block').forEach(block => { block.addEventListener('click', () => selectChoice(block)); @@ -263,7 +299,9 @@ } }); - updateVoteInfo(datastore); + // Ensure diff state is consistent on load + const diffToggle = document.getElementById('diff-toggle'); + toggleDiff(diffToggle.checked); } async function selectChoice(block, save = true) { @@ -392,6 +430,15 @@ }); } + // Function to toggle diff text + function toggleDiff(isDiffed) { + if (isDiffed) { + document.body.classList.add('diffed'); + } else { + document.body.classList.remove('diffed'); + } + } + diff --git a/pdelfin/eval/runeval.py b/pdelfin/eval/runeval.py index d686674..8bcbb3d 100644 --- a/pdelfin/eval/runeval.py +++ b/pdelfin/eval/runeval.py @@ -208,9 +208,11 @@ def process_jsonl_file(jsonl_file, gold_data, comparer): if data.error is not None: total_errors += 1 + eval_text = f"[Error processing this page: {data.error}]" if data.error is None and data.finish_reason != "stop": total_overruns += 1 + eval_text += f"\n[Error processing this page: overrun {data.finish_reason}]" if len(gold_text.strip()) < 3 and len(eval_text.strip()) < 3: alignment = 1.0 diff --git a/pyproject.toml b/pyproject.toml index 23ffcc4..c1b86a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,8 @@ dependencies = [ "pypdfium2", "lingua-language-detector", "Pillow", - "ftfy" + "ftfy", + "bleach" ] license = {file = "LICENSE"}