olmocr/pdelfin/eval/evalhtml.py

import html
import os
import random
import tempfile
from concurrent.futures import ThreadPoolExecutor
from difflib import SequenceMatcher
from urllib.parse import urlparse

import boto3
from jinja2 import Template
from tqdm import tqdm

from pdelfin.data.renderpdf import render_pdf_to_base64png

# Module-level S3 access, created once at import time from the named profile 's2'.
# process_entry uses s3_client both to presign links and to download PDFs.
session = boto3.Session(profile_name='s2')
s3_client = session.client('s3')

def generate_diff_html(a, b):
    """
    Generates HTML with differences between strings a and b.

    Text found only in 'b' is wrapped in <span class='added'>, text found only
    in 'a' is wrapped in <span class='removed'>, and unchanged text is emitted
    without a wrapper. How those classes are coloured is decided by the page's
    stylesheet, not by this function.

    All text taken from 'a' and 'b' is HTML-escaped ('&', '<', '>') so that
    characters in the compared strings cannot be interpreted as markup and
    break the generated spans.

    Args:
        a: The baseline string.
        b: The string compared against the baseline.

    Returns:
        A string of HTML describing how to turn 'a' into 'b'.
    """
    def esc(text):
        # quote=False: the text only ever lands in element content, never in
        # an attribute value, so quotes can stay readable.
        return html.escape(text, quote=False)

    pieces = []
    for opcode, a0, a1, b0, b1 in SequenceMatcher(None, a, b).get_opcodes():
        if opcode == 'equal':
            pieces.append(esc(a[a0:a1]))
        elif opcode == 'insert':
            pieces.append(f"<span class='added'>{esc(b[b0:b1])}</span>")
        elif opcode == 'delete':
            pieces.append(f"<span class='removed'>{esc(a[a0:a1])}</span>")
        elif opcode == 'replace':
            pieces.append(
                f"<span class='removed'>{esc(a[a0:a1])}</span>"
                f"<span class='added'>{esc(b[b0:b1])}</span>"
            )
    # Join once at the end instead of growing a string with repeated +=.
    return "".join(pieces)

def process_entry(i, entry):
    """
    Builds the template context for a single review entry.

    The gold and eval texts are randomly assigned to the left and right
    columns, a diff of right against left is generated, a presigned link to
    the source PDF is created, and the referenced page is rendered to a
    base64 PNG.

    Args:
        i: Index of the entry; returned unchanged as "entry_id".
        entry: Mapping with the keys "gold_text", "eval_text", "s3_path",
            "page" and "alignment".

    Returns:
        A dict with the values the HTML template receives for this entry.

    Raises:
        ValueError: If entry["s3_path"] has no '/' separating bucket and key.
    """
    # Randomly decide whether to display gold on the left or right
    if random.choice([True, False]):
        left_text, right_text = entry["gold_text"], entry["eval_text"]
        left_class, right_class = "gold", "eval"
    else:
        left_text, right_text = entry["eval_text"], entry["gold_text"]
        left_class, right_class = "eval", "gold"

    # Generate diff for right_text compared to left_text (on the raw text)
    diff_html = generate_diff_html(left_text, right_text)

    # Escape the plain texts before wrapping them in <p> tags, so that '<',
    # '>' and '&' in the source text are shown literally instead of being
    # parsed as markup. diff_html is already HTML and is not escaped here.
    left_text = "<p>" + html.escape(left_text, quote=False).replace("\n", "</p><p>") + "</p>"
    right_text = "<p>" + html.escape(right_text, quote=False).replace("\n", "</p><p>") + "</p>"
    diff_html = "<p>" + diff_html.replace("\n", "</p><p>") + "</p>"

    # Derive bucket and key once and use them for both the presigned link and
    # the download, so both always refer to the same object. A plain split is
    # used rather than urlparse, which would cut the key at '?' or '#'.
    bucket, s3_key = entry["s3_path"].replace("s3://", "").split('/', 1)

    signed_pdf_link = s3_client.generate_presigned_url(
        "get_object",
        Params={"Bucket": bucket, "Key": s3_key},
        ExpiresIn=604800  # seconds, i.e. 7 days
    )

    # Reserve a temporary path, then close the handle before writing to it.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
        pdf_path = tmp_pdf.name
    try:
        s3_client.download_file(bucket, s3_key, pdf_path)
        page_image_base64 = render_pdf_to_base64png(
            pdf_path, entry["page"], target_longest_image_dim=1024
        )
    finally:
        # delete=False means nothing else cleans this file up; remove it so
        # one PDF per entry does not accumulate in the temp directory.
        os.remove(pdf_path)

    return {
        "entry_id": i,
        "page_image": page_image_base64,
        "s3_path": entry["s3_path"],
        "page": entry["page"],
        "alignment": entry["alignment"],
        "signed_pdf_link": signed_pdf_link,
        "left_text": left_text,
        "right_text": right_text,
        "diff_text": diff_html,
        "left_class": left_class,
        "right_class": right_class,
        # NOTE(review): as written, these two always equal left_class and
        # right_class respectively; kept unchanged for the template.
        "gold_class": "gold" if left_class == "gold" else "eval",
        "eval_class": "eval" if right_class == "eval" else "gold"
    }

def create_review_html(data, filename="review_page.html"):
    """
    Renders a review page for the given entries and writes it to disk.

    The Jinja2 template "evalhtml_template.html" is loaded from the directory
    of this module, every item of 'data' is turned into a template entry by
    process_entry on a thread pool, and the rendered HTML is written to
    'filename'.

    Args:
        data: Iterable of entries accepted by process_entry.
        filename: Path of the HTML file to write.
    """
    # Load the Jinja2 template from the file. The encoding is given explicitly
    # so the result does not depend on the platform's default encoding.
    template_path = os.path.join(os.path.dirname(__file__), "evalhtml_template.html")
    with open(template_path, "r", encoding="utf-8") as f:
        template = Template(f.read())

    with ThreadPoolExecutor() as executor:
        # Submit tasks to the executor
        futures = [executor.submit(process_entry, i, entry) for i, entry in enumerate(data)]

        # Collect results in submission order (not completion order), so the
        # entries appear on the page in the same order as in 'data'.
        entries = [future.result() for future in tqdm(futures)]

    # Render the template with the entries
    final_html = template.render(entries=entries)

    # Write the HTML content to the specified file; UTF-8 so that non-ASCII
    # characters in the texts cannot fail to encode.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(final_html)

    print(f"HTML file '{filename}' created successfully!")
Adding in eval scripts from oe-data-internal now all in one place 2024-09-24 21:57:51 +00:00			`import os`
Adding diff to tinyhost 2024-10-09 17:53:26 +00:00			`import random`
Adding in eval scripts from oe-data-internal now all in one place 2024-09-24 21:57:51 +00:00			`import tempfile`
			`import boto3`
Adding diff to tinyhost 2024-10-09 17:53:26 +00:00			`from concurrent.futures import ThreadPoolExecutor`
			`from jinja2 import Template`
Adding in eval scripts from oe-data-internal now all in one place 2024-09-24 21:57:51 +00:00			`from urllib.parse import urlparse`
Adding diff to tinyhost 2024-10-09 17:53:26 +00:00			`from difflib import SequenceMatcher`
Adding in eval scripts from oe-data-internal now all in one place 2024-09-24 21:57:51 +00:00			`from tqdm import tqdm`
bugfixes 2024-10-09 20:20:06 +00:00			`from pdelfin.data.renderpdf import render_pdf_to_base64png`
Unifying some of the pdf rendering stuff 2024-10-09 16:57:13 +00:00
Adding in eval scripts from oe-data-internal now all in one place 2024-09-24 21:57:51 +00:00			`session = boto3.Session(profile_name='s2')`
			`s3_client = session.client('s3')`

Adding diff to tinyhost 2024-10-09 17:53:26 +00:00			`def generate_diff_html(a, b):`
			`"""`
			`Generates HTML with differences between strings a and b.`
			`Additions in 'b' are highlighted in green, deletions from 'a' are highlighted in red.`
			`"""`
			`seq_matcher = SequenceMatcher(None, a, b)`
			`output_html = ""`
			`for opcode, a0, a1, b0, b1 in seq_matcher.get_opcodes():`
			`if opcode == 'equal':`
			`output_html += a[a0:a1]`
			`elif opcode == 'insert':`
			`output_html += f"<span class='added'>{b[b0:b1]}</span>"`
			`elif opcode == 'delete':`
			`output_html += f"<span class='removed'>{a[a0:a1]}</span>"`
			`elif opcode == 'replace':`
			`output_html += f"<span class='removed'>{a[a0:a1]}</span><span class='added'>{b[b0:b1]}</span>"`
			`return output_html`
Adding in eval scripts from oe-data-internal now all in one place 2024-09-24 21:57:51 +00:00
Runeval is much improved now 2024-10-01 16:46:35 +00:00			`def process_entry(i, entry):`
			`# Randomly decide whether to display gold on the left or right`
			`if random.choice([True, False]):`
			`left_text, right_text = entry["gold_text"], entry["eval_text"]`
			`left_class, right_class = "gold", "eval"`
			`else:`
			`left_text, right_text = entry["eval_text"], entry["gold_text"]`
			`left_class, right_class = "eval", "gold"`

Adding diff to tinyhost 2024-10-09 17:53:26 +00:00			`# Generate diff for right_text compared to left_text`
			`diff_html = generate_diff_html(left_text, right_text)`

Runeval is much improved now 2024-10-01 16:46:35 +00:00			`left_text = "<p>" + left_text.replace("\n", "</p><p>") + "</p>"`
			`right_text = "<p>" + right_text.replace("\n", "</p><p>") + "</p>"`
Adding diff to tinyhost 2024-10-09 17:53:26 +00:00			`diff_html = "<p>" + diff_html.replace("\n", "</p><p>") + "</p>"`
Runeval is much improved now 2024-10-01 16:46:35 +00:00
			`parsed_url = urlparse(entry["s3_path"])`
			`bucket = parsed_url.netloc`
			`s3_key = parsed_url.path.lstrip('/')`
Adding diff to tinyhost 2024-10-09 17:53:26 +00:00			`signed_pdf_link = s3_client.generate_presigned_url(`
			`"get_object",`
			`Params={"Bucket": bucket, "Key": s3_key},`
			`ExpiresIn=604800`
			`)`
Runeval is much improved now 2024-10-01 16:46:35 +00:00
Unifying some of the pdf rendering stuff 2024-10-09 16:57:13 +00:00			`with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:`
			`pdf_path = tmp_pdf.name`
			`bucket, key = entry["s3_path"].replace("s3://", "").split('/', 1)`
			`s3_client.download_file(bucket, key, pdf_path)`
Adding diff to tinyhost 2024-10-09 17:53:26 +00:00			`page_image_base64 = render_pdf_to_base64png(`
			`tmp_pdf.name, entry["page"], target_longest_image_dim=1024`
			`)`
Unifying some of the pdf rendering stuff 2024-10-09 16:57:13 +00:00
Runeval is much improved now 2024-10-01 16:46:35 +00:00			`return {`
			`"entry_id": i,`
Unifying some of the pdf rendering stuff 2024-10-09 16:57:13 +00:00			`"page_image": page_image_base64,`
Runeval is much improved now 2024-10-01 16:46:35 +00:00			`"s3_path": entry["s3_path"],`
			`"page": entry["page"],`
Adding diff to tinyhost 2024-10-09 17:53:26 +00:00			`"alignment": entry["alignment"],`
Runeval is much improved now 2024-10-01 16:46:35 +00:00			`"signed_pdf_link": signed_pdf_link,`
			`"left_text": left_text,`
			`"right_text": right_text,`
Adding diff to tinyhost 2024-10-09 17:53:26 +00:00			`"diff_text": diff_html,`
Runeval is much improved now 2024-10-01 16:46:35 +00:00			`"left_class": left_class,`
			`"right_class": right_class,`
			`"gold_class": "gold" if left_class == "gold" else "eval",`
			`"eval_class": "eval" if right_class == "eval" else "gold"`
			`}`

Adding in eval scripts from oe-data-internal now all in one place 2024-09-24 21:57:51 +00:00			`def create_review_html(data, filename="review_page.html"):`
			`# Load the Jinja2 template from the file`
Adding diff to tinyhost 2024-10-09 17:53:26 +00:00			`template_path = os.path.join(os.path.dirname(__file__), "evalhtml_template.html")`
			`with open(template_path, "r") as f:`
Adding in eval scripts from oe-data-internal now all in one place 2024-09-24 21:57:51 +00:00			`template = Template(f.read())`
Runeval is much improved now 2024-10-01 16:46:35 +00:00
Adding in eval scripts from oe-data-internal now all in one place 2024-09-24 21:57:51 +00:00			`entries = []`
Runeval is much improved now 2024-10-01 16:46:35 +00:00			`with ThreadPoolExecutor() as executor:`
			`# Submit tasks to the executor`
			`futures = [executor.submit(process_entry, i, entry) for i, entry in enumerate(data)]`

			`# Process the results as they are completed`
			`for future in tqdm(futures):`
			`entries.append(future.result())`
Adding in eval scripts from oe-data-internal now all in one place 2024-09-24 21:57:51 +00:00
			`# Render the template with the entries`
			`final_html = template.render(entries=entries)`

			`# Write the HTML content to the specified file`
			`with open(filename, "w") as f:`
			`f.write(final_html)`

			`print(f"HTML file '{filename}' created successfully!")`