import os import random import tempfile import boto3 from concurrent.futures import ThreadPoolExecutor from jinja2 import Template from urllib.parse import urlparse from difflib import SequenceMatcher from tqdm import tqdm from pdelfin.silver_data.renderpdf import render_pdf_to_base64png session = boto3.Session(profile_name='s2') s3_client = session.client('s3') def generate_diff_html(a, b): """ Generates HTML with differences between strings a and b. Additions in 'b' are highlighted in green, deletions from 'a' are highlighted in red. """ seq_matcher = SequenceMatcher(None, a, b) output_html = "" for opcode, a0, a1, b0, b1 in seq_matcher.get_opcodes(): if opcode == 'equal': output_html += a[a0:a1] elif opcode == 'insert': output_html += f"{b[b0:b1]}" elif opcode == 'delete': output_html += f"{a[a0:a1]}" elif opcode == 'replace': output_html += f"{a[a0:a1]}{b[b0:b1]}" return output_html def process_entry(i, entry): # Randomly decide whether to display gold on the left or right if random.choice([True, False]): left_text, right_text = entry["gold_text"], entry["eval_text"] left_class, right_class = "gold", "eval" else: left_text, right_text = entry["eval_text"], entry["gold_text"] left_class, right_class = "eval", "gold" # Generate diff for right_text compared to left_text diff_html = generate_diff_html(left_text, right_text) left_text = "
" + left_text.replace("\n", "
") + "
" right_text = "" + right_text.replace("\n", "
") + "
" diff_html = "" + diff_html.replace("\n", "
") + "
" parsed_url = urlparse(entry["s3_path"]) bucket = parsed_url.netloc s3_key = parsed_url.path.lstrip('/') signed_pdf_link = s3_client.generate_presigned_url( "get_object", Params={"Bucket": bucket, "Key": s3_key}, ExpiresIn=604800 ) with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf: pdf_path = tmp_pdf.name bucket, key = entry["s3_path"].replace("s3://", "").split('/', 1) s3_client.download_file(bucket, key, pdf_path) page_image_base64 = render_pdf_to_base64png( tmp_pdf.name, entry["page"], target_longest_image_dim=1024 ) return { "entry_id": i, "page_image": page_image_base64, "s3_path": entry["s3_path"], "page": entry["page"], "alignment": entry["alignment"], "signed_pdf_link": signed_pdf_link, "left_text": left_text, "right_text": right_text, "diff_text": diff_html, "left_class": left_class, "right_class": right_class, "gold_class": "gold" if left_class == "gold" else "eval", "eval_class": "eval" if right_class == "eval" else "gold" } def create_review_html(data, filename="review_page.html"): # Load the Jinja2 template from the file template_path = os.path.join(os.path.dirname(__file__), "evalhtml_template.html") with open(template_path, "r") as f: template = Template(f.read()) entries = [] with ThreadPoolExecutor() as executor: # Submit tasks to the executor futures = [executor.submit(process_entry, i, entry) for i, entry in enumerate(data)] # Process the results as they are completed for future in tqdm(futures): entries.append(future.result()) # Render the template with the entries final_html = template.render(entries=entries) # Write the HTML content to the specified file with open(filename, "w") as f: f.write(final_html) print(f"HTML file '{filename}' created successfully!")