from concurrent.futures import ThreadPoolExecutor, as_completed from jinja2 import Template import random import os import subprocess import tempfile import boto3 import base64 import io from urllib.parse import urlparse from PIL import Image from tqdm import tqdm session = boto3.Session(profile_name='s2') s3_client = session.client('s3') def render_pdf_to_base64png(s3_path, page): with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf: pdf_path = tmp_pdf.name bucket, key = s3_path.replace("s3://", "").split('/', 1) s3_client.download_file(bucket, key, pdf_path) # Render the PDF to an image, and display it in the first position pdftoppm_result = subprocess.run( ["pdftoppm", "-png", "-f", str(page), "-l", str(page), pdf_path], timeout=120, stdout=subprocess.PIPE, stderr=subprocess.PIPE) assert pdftoppm_result.returncode == 0, pdftoppm_result.stderr png_image = Image.open(io.BytesIO(pdftoppm_result.stdout)) webp_output = io.BytesIO() png_image.save(webp_output, format="WEBP") image_base64 = base64.b64encode(webp_output.getvalue()).decode("utf-8") return image_base64 def process_entry(i, entry): # Randomly decide whether to display gold on the left or right if random.choice([True, False]): left_text, right_text = entry["gold_text"], entry["eval_text"] left_alignment, right_alignment = entry["alignment"], entry["alignment"] left_class, right_class = "gold", "eval" else: left_text, right_text = entry["eval_text"], entry["gold_text"] left_alignment, right_alignment = entry["alignment"], entry["alignment"] left_class, right_class = "eval", "gold" # Convert newlines to

tags for proper formatting left_text = "

" + left_text.replace("\n", "

") + "

" right_text = "

" + right_text.replace("\n", "

") + "

" parsed_url = urlparse(entry["s3_path"]) bucket = parsed_url.netloc s3_key = parsed_url.path.lstrip('/') signed_pdf_link = s3_client.generate_presigned_url("get_object", Params={"Bucket": bucket, "Key": s3_key}, ExpiresIn=604800) return { "entry_id": i, "page_image": render_pdf_to_base64png(entry["s3_path"], entry["page"]), "s3_path": entry["s3_path"], "page": entry["page"], "signed_pdf_link": signed_pdf_link, "left_text": left_text, "right_text": right_text, "left_alignment": left_alignment, "right_alignment": right_alignment, "left_class": left_class, "right_class": right_class, "gold_class": "gold" if left_class == "gold" else "eval", "eval_class": "eval" if right_class == "eval" else "gold" } def create_review_html(data, filename="review_page.html"): # Load the Jinja2 template from the file with open(os.path.join(os.path.dirname(__file__), "evalhtml_template.html"), "r") as f: template = Template(f.read()) entries = [] with ThreadPoolExecutor() as executor: # Submit tasks to the executor futures = [executor.submit(process_entry, i, entry) for i, entry in enumerate(data)] # Process the results as they are completed for future in tqdm(futures): entries.append(future.result()) # Render the template with the entries final_html = template.render(entries=entries) # Write the HTML content to the specified file with open(filename, "w") as f: f.write(final_html) print(f"HTML file '{filename}' created successfully!")