2024-09-24 21:57:51 +00:00
|
|
|
import os
|
2024-10-09 17:53:26 +00:00
|
|
|
import random
|
2024-09-24 21:57:51 +00:00
|
|
|
import tempfile
|
|
|
|
import boto3
|
2024-10-09 17:53:26 +00:00
|
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
from jinja2 import Template
|
2024-09-24 21:57:51 +00:00
|
|
|
from urllib.parse import urlparse
|
2024-10-09 17:53:26 +00:00
|
|
|
from difflib import SequenceMatcher
|
2024-09-24 21:57:51 +00:00
|
|
|
from tqdm import tqdm
|
2024-10-09 20:20:06 +00:00
|
|
|
from pdelfin.data.renderpdf import render_pdf_to_base64png
|
2024-10-09 16:57:13 +00:00
|
|
|
|
2024-09-24 21:57:51 +00:00
|
|
|
# Module-level AWS handles: a boto3 session bound to the "s2" profile and the
# S3 client created from it (process_entry uses s3_client for presigning and
# downloading).
session = boto3.Session(profile_name="s2")
s3_client = session.client("s3")
|
|
|
|
|
2024-10-09 17:53:26 +00:00
|
|
|
def generate_diff_html(a, b):
    """
    Generates HTML with differences between strings a and b.

    Text present only in 'b' is wrapped in <span class='added'>, text present
    only in 'a' is wrapped in <span class='removed'>, and unchanged text is
    emitted without a wrapper. Every piece of text taken from a or b is
    HTML-escaped, so characters such as '<', '>' and '&' are shown literally
    instead of being interpreted as markup.

    Args:
        a: The baseline string.
        b: The string compared against a.

    Returns:
        A str containing the HTML fragment for the character-level diff.
    """
    # Function-scope import so this block does not depend on a file-level import.
    from html import escape

    pieces = []
    for opcode, a0, a1, b0, b1 in SequenceMatcher(None, a, b).get_opcodes():
        if opcode == 'equal':
            pieces.append(escape(a[a0:a1]))
            continue
        # 'replace' is rendered as a removal immediately followed by an addition.
        if opcode in ('delete', 'replace'):
            pieces.append(f"<span class='removed'>{escape(a[a0:a1])}</span>")
        if opcode in ('insert', 'replace'):
            pieces.append(f"<span class='added'>{escape(b[b0:b1])}</span>")

    # Join once instead of growing a string with += inside the loop.
    return "".join(pieces)
|
2024-09-24 21:57:51 +00:00
|
|
|
|
2024-10-01 16:46:35 +00:00
|
|
|
def _split_s3_path(s3_path):
    """Split an 's3://bucket/key' path into a (bucket, key) tuple."""
    prefix = "s3://"
    remainder = s3_path[len(prefix):] if s3_path.startswith(prefix) else s3_path
    bucket, separator, key = remainder.partition("/")
    if not separator:
        raise ValueError(f"Expected an S3 path of the form s3://bucket/key, got {s3_path!r}")
    return bucket, key


def _as_paragraphs(text):
    """Wrap text in <p> tags, starting a new paragraph at every newline."""
    return "<p>" + text.replace("\n", "</p><p>") + "</p>"


def process_entry(i, entry):
    """
    Build the template context for a single review entry.

    Reads entry["gold_text"], entry["eval_text"], entry["s3_path"],
    entry["page"] and entry["alignment"]. The gold and eval texts are assigned
    to the left/right columns at random, a diff of right against left is
    generated, the PDF at s3_path is presigned and downloaded, and the
    requested page is rendered to a base64 PNG.

    Args:
        i: Index of the entry; returned as "entry_id".
        entry: Mapping with the keys listed above.

    Returns:
        A dict with the keys "entry_id", "page_image", "s3_path", "page",
        "alignment", "signed_pdf_link", "left_text", "right_text",
        "diff_text", "left_class", "right_class", "gold_class", "eval_class".

    Raises:
        ValueError: If entry["s3_path"] has no '/' separating bucket and key.
    """
    # Randomly decide whether to display gold on the left or right
    if random.choice([True, False]):
        left_text, right_text = entry["gold_text"], entry["eval_text"]
        left_class, right_class = "gold", "eval"
    else:
        left_text, right_text = entry["eval_text"], entry["gold_text"]
        left_class, right_class = "eval", "gold"

    # Generate diff for right_text compared to left_text (on the raw text,
    # before any paragraph markup is added)
    diff_html = generate_diff_html(left_text, right_text)

    left_text = _as_paragraphs(left_text)
    right_text = _as_paragraphs(right_text)
    diff_html = _as_paragraphs(diff_html)

    # Parse the S3 path once and use the same bucket/key for both the signed
    # link and the download. A plain split is used rather than urlparse,
    # because urlparse cuts the key at '?' or '#'.
    bucket, s3_key = _split_s3_path(entry["s3_path"])

    signed_pdf_link = s3_client.generate_presigned_url(
        "get_object",
        Params={"Bucket": bucket, "Key": s3_key},
        ExpiresIn=604800,  # 7 days, in seconds
    )

    # Download into a temporary directory that is removed when the block
    # exits, so no PDF is left behind on disk after rendering.
    with tempfile.TemporaryDirectory() as tmp_dir:
        pdf_path = os.path.join(tmp_dir, "document.pdf")
        s3_client.download_file(bucket, s3_key, pdf_path)
        page_image_base64 = render_pdf_to_base64png(
            pdf_path, entry["page"], target_longest_image_dim=1024
        )

    return {
        "entry_id": i,
        "page_image": page_image_base64,
        "s3_path": entry["s3_path"],
        "page": entry["page"],
        "alignment": entry["alignment"],
        "signed_pdf_link": signed_pdf_link,
        "left_text": left_text,
        "right_text": right_text,
        "diff_text": diff_html,
        "left_class": left_class,
        "right_class": right_class,
        # NOTE(review): these two were written as conditionals that always
        # evaluate to left_class / right_class; the values are kept as-is.
        # Confirm against the template whether that is the intended meaning.
        "gold_class": left_class,
        "eval_class": right_class,
    }
|
|
|
|
|
2024-09-24 21:57:51 +00:00
|
|
|
def create_review_html(data, filename="review_page.html"):
    """
    Render the review page for the given entries and write it to disk.

    Loads "evalhtml_template.html" from the directory containing this module,
    runs process_entry on every item of data in a thread pool, renders the
    template with the results as `entries`, and writes the HTML to filename.

    Args:
        data: Iterable of entry mappings; each one is passed to process_entry
            together with its index.
        filename: Path of the HTML file to write.
    """
    # Load the Jinja2 template from the file. The encoding is explicit so the
    # result does not depend on the platform's default locale.
    template_path = os.path.join(os.path.dirname(__file__), "evalhtml_template.html")
    with open(template_path, "r", encoding="utf-8") as f:
        template = Template(f.read())

    with ThreadPoolExecutor() as executor:
        # Submit tasks to the executor
        futures = [executor.submit(process_entry, i, entry) for i, entry in enumerate(data)]

        # Collect results in submission order, so entries follow the order of
        # data; an exception raised by a task is re-raised by result().
        entries = [future.result() for future in tqdm(futures)]

    # Render the template with the entries
    final_html = template.render(entries=entries)

    # Write the HTML content to the specified file
    with open(filename, "w", encoding="utf-8") as f:
        f.write(final_html)

    print(f"HTML file '{filename}' created successfully!")
|