import os
import random
import tempfile
import boto3

from concurrent.futures import ThreadPoolExecutor
from jinja2 import Template
from urllib.parse import urlparse  # no longer used below; kept so the module's import surface is unchanged
from difflib import SequenceMatcher
from tqdm import tqdm

from pdelfin.data.renderpdf import render_pdf_to_base64png

session = boto3.Session(profile_name='s2')
s3_client = session.client('s3')


def generate_diff_html(a, b):
    """
    Generates HTML with differences between strings a and b.

    Text present only in 'b' (additions) is wrapped in a span with class
    'added'; text present only in 'a' (deletions) is wrapped in a span with
    class 'removed'. A replaced run is emitted as the removed text followed
    by the added text. Unchanged text is copied through as-is.

    The text is inserted without HTML escaping.

    NOTE(review): the span markup was missing from the copy of this file that
    was reviewed (it looked stripped, leaving bare f"{...}" pieces). The class
    names used here are a reconstruction -- confirm that they match the
    stylesheet in evalhtml_template.html, which is expected to supply the
    green/red colouring.
    """
    pieces = []
    for opcode, a0, a1, b0, b1 in SequenceMatcher(None, a, b).get_opcodes():
        if opcode == 'equal':
            pieces.append(a[a0:a1])
        elif opcode == 'insert':
            pieces.append(f"<span class='added'>{b[b0:b1]}</span>")
        elif opcode == 'delete':
            pieces.append(f"<span class='removed'>{a[a0:a1]}</span>")
        elif opcode == 'replace':
            pieces.append(
                f"<span class='removed'>{a[a0:a1]}</span>"
                f"<span class='added'>{b[b0:b1]}</span>"
            )
    # Join once instead of growing a string with += inside the loop.
    return "".join(pieces)


def _wrap_paragraphs(text):
    """Turn newline-separated text into a run of HTML paragraphs.

    NOTE(review): in the reviewed copy these literals were broken across
    lines with the tags missing; <p>...</p> is a reconstruction -- confirm
    against the template.
    """
    return "<p>" + text.replace("\n", "</p><p>") + "</p>"


def _split_s3_path(s3_path):
    """Split an 's3://bucket/key' path into (bucket, key).

    Raises ValueError when there is no '/' separating bucket and key.
    """
    bucket, key = s3_path.replace("s3://", "").split('/', 1)
    return bucket, key


def process_entry(i, entry):
    """
    Builds the template context for one review entry.

    Reads "gold_text", "eval_text", "s3_path", "page" and "alignment" from
    `entry`, downloads the PDF from S3, renders the requested page to a
    base64 PNG and returns a dict of values for the HTML template. `i` is
    stored as "entry_id".
    """
    # Randomly decide whether to display gold on the left or right
    if random.choice([True, False]):
        left_text, right_text = entry["gold_text"], entry["eval_text"]
        left_class, right_class = "gold", "eval"
    else:
        left_text, right_text = entry["eval_text"], entry["gold_text"]
        left_class, right_class = "eval", "gold"

    # Generate diff for right_text compared to left_text (computed on the raw
    # text, before any paragraph markup is added).
    diff_html = generate_diff_html(left_text, right_text)

    left_text = _wrap_paragraphs(left_text)
    right_text = _wrap_paragraphs(right_text)
    diff_html = _wrap_paragraphs(diff_html)

    # Parse the path once so the signed link and the downloaded file always
    # refer to the same object (previously two different parsers were used,
    # which disagree for keys containing '?' or '#').
    bucket, s3_key = _split_s3_path(entry["s3_path"])

    signed_pdf_link = s3_client.generate_presigned_url(
        "get_object",
        Params={"Bucket": bucket, "Key": s3_key},
        ExpiresIn=604800,  # 7 days, in seconds
    )

    # Download into a throwaway directory so the PDF is removed once the page
    # has been rendered; the previous NamedTemporaryFile(delete=False) left
    # one file behind per entry.
    with tempfile.TemporaryDirectory() as tmp_dir:
        pdf_path = os.path.join(tmp_dir, "document.pdf")
        s3_client.download_file(bucket, s3_key, pdf_path)
        page_image_base64 = render_pdf_to_base64png(
            pdf_path, entry["page"], target_longest_image_dim=1024
        )

    return {
        "entry_id": i,
        "page_image": page_image_base64,
        "s3_path": entry["s3_path"],
        "page": entry["page"],
        "alignment": entry["alignment"],
        "signed_pdf_link": signed_pdf_link,
        "left_text": left_text,
        "right_text": right_text,
        "diff_text": diff_html,
        "left_class": left_class,
        "right_class": right_class,
        # NOTE(review): as written these two always equal left_class and
        # right_class respectively; kept unchanged -- confirm this is what
        # the template expects.
        "gold_class": "gold" if left_class == "gold" else "eval",
        "eval_class": "eval" if right_class == "eval" else "gold"
    }


def create_review_html(data, filename="review_page.html"):
    """
    Renders a review page for `data` and writes it to `filename`.

    `data` is an iterable of entry dicts (see process_entry). Entries are
    processed on a thread pool and appear in the page in their input order.
    An exception raised while processing any entry propagates to the caller.
    """
    # Load the Jinja2 template from the file
    template_path = os.path.join(os.path.dirname(__file__), "evalhtml_template.html")
    with open(template_path, "r", encoding="utf-8") as f:
        template = Template(f.read())

    with ThreadPoolExecutor() as executor:
        # Submit tasks to the executor
        futures = [executor.submit(process_entry, i, entry) for i, entry in enumerate(data)]

        # Collect results in submission order (not completion order), so the
        # page keeps the ordering of `data`.
        entries = [future.result() for future in tqdm(futures)]

    # Render the template with the entries
    final_html = template.render(entries=entries)

    # Write the HTML content to the specified file. The encoding is explicit
    # because the page contains arbitrary extracted text and the platform
    # default may not be able to represent it.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(final_html)

    print(f"HTML file '{filename}' created successfully!")