def generate_diff_html(a, b):
    """Return an HTML fragment marking up the differences between ``a`` and ``b``.

    The two strings are compared with ``difflib.SequenceMatcher``. Text that
    is only in ``b`` is wrapped in ``<span class="added">`` and text that is
    only in ``a`` is wrapped in ``<span class="removed">``; the review
    template's stylesheet renders those classes with green and red
    backgrounds respectively. A replaced region emits the removed text
    followed by the added text.

    All text taken from ``a`` and ``b`` is HTML-escaped, so characters such as
    ``<`` and ``&`` in the compared strings cannot break the generated markup.

    Args:
        a: The reference string.
        b: The string compared against ``a``.

    Returns:
        The diff as a single HTML string (empty when both inputs are empty).
    """
    # Function-scope import so this block does not depend on a new
    # module-level import.
    from html import escape

    pieces = []
    for opcode, a0, a1, b0, b1 in SequenceMatcher(None, a, b).get_opcodes():
        removed = escape(a[a0:a1], quote=False)
        added = escape(b[b0:b1], quote=False)
        if opcode == "equal":
            pieces.append(removed)
            continue
        # 'replace' is rendered as a deletion immediately followed by an
        # insertion, so it goes through both branches below.
        if opcode in ("delete", "replace"):
            pieces.append(f'<span class="removed">{removed}</span>')
        if opcode in ("insert", "replace"):
            pieces.append(f'<span class="added">{added}</span>')
    return "".join(pieces)


def process_entry(i, entry):
    """Build the template context for one review entry.

    Reads ``gold_text``, ``eval_text``, ``s3_path``, ``page`` and
    ``alignment`` from ``entry``. The gold and eval texts are assigned to the
    left/right columns at random, a diff of right against left is generated,
    the PDF at ``s3_path`` is downloaded to a temporary file and the requested
    page is rendered to a base64 PNG.

    Args:
        i: Identifier stored as ``entry_id`` in the result.
        entry: Mapping with the keys listed above.

    Returns:
        A dict with the keys ``entry_id``, ``page_image``, ``s3_path``,
        ``page``, ``alignment``, ``signed_pdf_link``, ``left_text``,
        ``right_text``, ``diff_text``, ``left_class``, ``right_class``,
        ``gold_class`` and ``eval_class``.
    """
    from html import escape

    # Randomly decide whether to display gold on the left or right.
    if random.choice([True, False]):
        left_text, right_text = entry["gold_text"], entry["eval_text"]
        left_class, right_class = "gold", "eval"
    else:
        left_text, right_text = entry["eval_text"], entry["gold_text"]
        left_class, right_class = "eval", "gold"

    # Diff the raw texts; generate_diff_html escapes its own output.
    diff_html = generate_diff_html(left_text, right_text)

    # Convert newlines to line-break tags for proper formatting.
    # NOTE(review): the tag literals were stripped from the text this block
    # was reconstructed from; "<p>", "<br>" and "</p>" are the presumed
    # originals -- confirm against the repository.
    # The text is escaped first so markup characters in the source text are
    # displayed literally instead of being interpreted as HTML.
    left_text = "<p>" + escape(left_text, quote=False).replace("\n", "<br>") + "</p>"
    right_text = "<p>" + escape(right_text, quote=False).replace("\n", "<br>") + "</p>"
    diff_html = "<p>" + diff_html.replace("\n", "<br>") + "</p>"

    parsed_url = urlparse(entry["s3_path"])
    bucket = parsed_url.netloc
    s3_key = parsed_url.path.lstrip('/')
    signed_pdf_link = s3_client.generate_presigned_url(
        "get_object",
        Params={"Bucket": bucket, "Key": s3_key},
        ExpiresIn=604800,  # 7 days, in seconds
    )

    # Reserve a temp path, then close the handle before anything writes to
    # it. delete=False means the file must be removed explicitly; the
    # finally block does so even if the download or render fails.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
        pdf_path = tmp_pdf.name
    try:
        dl_bucket, dl_key = entry["s3_path"].replace("s3://", "").split('/', 1)
        s3_client.download_file(dl_bucket, dl_key, pdf_path)
        page_image_base64 = render_pdf_to_base64png(
            pdf_path, entry["page"], target_longest_image_dim=1024
        )
    finally:
        os.unlink(pdf_path)

    return {
        "entry_id": i,
        "page_image": page_image_base64,
        "s3_path": entry["s3_path"],
        "page": entry["page"],
        "alignment": entry["alignment"],
        "signed_pdf_link": signed_pdf_link,
        "left_text": left_text,
        "right_text": right_text,
        "diff_text": diff_html,
        "left_class": left_class,
        "right_class": right_class,
        # These two expressions always evaluate to left_class and
        # right_class respectively; kept as written.
        "gold_class": "gold" if left_class == "gold" else "eval",
        "eval_class": "eval" if right_class == "eval" else "gold"
    }
background-color: #e0e0e0; @@ -159,6 +160,25 @@ .voting-buttons button.selected { border: 3px solid #000; } + /* for diffs */ + .added { + background-color: #d4fcdc; + } + .removed { + background-color: #fcd4d4; + text-decoration: line-through; + } + + /* Diff Toggle Styles */ + body.diffed .right-text { + display: none; + } + body.diffed .diff-text { + display: block; + } + .diff-text { + display: none; + } @@ -166,8 +186,15 @@