Adding diff to tinyhost

This commit is contained in:
Jake Poznanski 2024-10-09 17:53:26 +00:00
parent 400e92180b
commit 0c56dec704
4 changed files with 93 additions and 25 deletions

View File

@ -1,70 +1,88 @@
from concurrent.futures import ThreadPoolExecutor, as_completed
from jinja2 import Template
import random
import os
import subprocess
import random
import tempfile
import boto3
import base64
import io
from concurrent.futures import ThreadPoolExecutor
from jinja2 import Template
from urllib.parse import urlparse
from PIL import Image
from difflib import SequenceMatcher
from tqdm import tqdm
from pdelfin.silver_data.renderpdf import render_pdf_to_base64png
session = boto3.Session(profile_name='s2')
s3_client = session.client('s3')
def _wrap_diff_lines(text, css_class):
    """
    Wraps every non-empty line of `text` in its own <span class='css_class'>.

    Newlines are kept outside the spans so that a caller which later turns
    "\n" into "</p><p>" never closes a paragraph in the middle of an open
    span (which would drop the highlight after the first paragraph).
    """
    return "\n".join(
        f"<span class='{css_class}'>{line}</span>" if line else line
        for line in text.split("\n")
    )


def generate_diff_html(a, b):
    """
    Generates HTML with differences between strings a and b.
    Additions in 'b' are highlighted in green, deletions from 'a' are highlighted in red.

    The comparison is character-level (difflib.SequenceMatcher over the two
    strings). Unchanged text is emitted as-is; inserted text is wrapped in
    <span class='added'>, deleted text in <span class='removed'>, and a
    replacement emits the removed span(s) followed by the added span(s).
    Changed segments that contain newlines are split so each line gets its
    own span and the newline characters stay outside any span.

    NOTE(review): the input text is not HTML-escaped here; markup present in
    `a` or `b` is passed through verbatim.

    Returns the resulting HTML fragment as a single string.
    """
    seq_matcher = SequenceMatcher(None, a, b)
    # Collect fragments and join once instead of growing a string with +=.
    pieces = []
    for opcode, a0, a1, b0, b1 in seq_matcher.get_opcodes():
        if opcode == 'equal':
            pieces.append(a[a0:a1])
        elif opcode == 'insert':
            pieces.append(_wrap_diff_lines(b[b0:b1], 'added'))
        elif opcode == 'delete':
            pieces.append(_wrap_diff_lines(a[a0:a1], 'removed'))
        elif opcode == 'replace':
            pieces.append(_wrap_diff_lines(a[a0:a1], 'removed'))
            pieces.append(_wrap_diff_lines(b[b0:b1], 'added'))
    return "".join(pieces)
def process_entry(i, entry):
    """
    Builds the template context dict for a single gold/eval comparison entry.

    `entry` is read as a mapping with the keys "gold_text", "eval_text",
    "alignment", "s3_path" and "page". The gold and eval texts are randomly
    assigned to the left/right columns, a diff of right vs. left is generated,
    a presigned link to the source PDF is created, and the requested page is
    rendered to a base64 PNG.

    `i` is passed through unchanged as "entry_id".

    Returns a dict with the keys consumed by the review template
    (entry_id, page_image, s3_path, page, alignment, signed_pdf_link,
    left_text, right_text, left_alignment, right_alignment, diff_text,
    left_class, right_class, gold_class, eval_class).
    """
    # Randomly decide whether to display gold on the left or right
    if random.choice([True, False]):
        left_text, right_text = entry["gold_text"], entry["eval_text"]
        left_class, right_class = "gold", "eval"
    else:
        left_text, right_text = entry["eval_text"], entry["gold_text"]
        left_class, right_class = "eval", "gold"

    # Both columns carry the same entry-level alignment value.
    left_alignment = right_alignment = entry["alignment"]

    # Generate diff for right_text compared to left_text.
    # This must run on the raw text, before the <p> wrapping below.
    diff_html = generate_diff_html(left_text, right_text)

    # Convert newlines to <p> tags for proper formatting
    left_text = "<p>" + left_text.replace("\n", "</p><p>") + "</p>"
    right_text = "<p>" + right_text.replace("\n", "</p><p>") + "</p>"
    diff_html = "<p>" + diff_html.replace("\n", "</p><p>") + "</p>"

    parsed_url = urlparse(entry["s3_path"])
    bucket = parsed_url.netloc
    s3_key = parsed_url.path.lstrip('/')

    signed_pdf_link = s3_client.generate_presigned_url(
        "get_object",
        Params={"Bucket": bucket, "Key": s3_key},
        ExpiresIn=604800  # 7 days, in seconds
    )

    # Reserve a temp path only; the handle is closed before the download so
    # the file can be written by path, and the file is always removed after
    # rendering (delete=False would otherwise leak one PDF per entry).
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
        pdf_path = tmp_pdf.name

    try:
        bucket, key = entry["s3_path"].replace("s3://", "").split('/', 1)
        s3_client.download_file(bucket, key, pdf_path)

        page_image_base64 = render_pdf_to_base64png(
            pdf_path, entry["page"], target_longest_image_dim=1024
        )
    finally:
        os.remove(pdf_path)

    return {
        "entry_id": i,
        "page_image": page_image_base64,
        "s3_path": entry["s3_path"],
        "page": entry["page"],
        "alignment": entry["alignment"],
        "signed_pdf_link": signed_pdf_link,
        "left_text": left_text,
        "right_text": right_text,
        "left_alignment": left_alignment,
        "right_alignment": right_alignment,
        "diff_text": diff_html,
        "left_class": left_class,
        "right_class": right_class,
        # NOTE(review): these two always evaluate to left_class and
        # right_class respectively; kept as-is in case the template relies
        # on them - confirm intent.
        "gold_class": "gold" if left_class == "gold" else "eval",
        "eval_class": "eval" if right_class == "eval" else "gold"
    }
def create_review_html(data, filename="review_page.html"):
# Load the Jinja2 template from the file
with open(os.path.join(os.path.dirname(__file__), "evalhtml_template.html"), "r") as f:
template_path = os.path.join(os.path.dirname(__file__), "evalhtml_template.html")
with open(template_path, "r") as f:
template = Template(f.read())
entries = []

View File

@ -41,6 +41,7 @@
flex-direction: column;
justify-content: space-between;
cursor: pointer;
position: relative;
}
.text-block:hover {
background-color: #e0e0e0;
@ -159,6 +160,25 @@
.voting-buttons button.selected {
border: 3px solid #000;
}
/* for diffs */
/* Text present only in the right-hand string: green background. */
.added {
background-color: #d4fcdc;
}
/* Text present only in the left-hand string: red background, struck through. */
.removed {
background-color: #fcd4d4;
text-decoration: line-through;
}
/* Diff Toggle Styles */
/* When <body> has the "diffed" class, the plain right-hand text is hidden
   and the diff rendering is shown in its place. */
body.diffed .right-text {
display: none;
}
body.diffed .diff-text {
display: block;
}
/* Default state: diff hidden. The body.diffed rule above has higher
   specificity, so it wins even though this rule comes later. */
.diff-text {
display: none;
}
</style>
</head>
<body>
@ -166,8 +186,15 @@
<!-- Floating Reveal Box -->
<div class="reveal-box">
<input type="checkbox" id="reveal-toggle" />
<label for="reveal-toggle">Reveal Gold/Eval</label>
<div>
<input type="checkbox" id="diff-toggle" />
<label for="diff-toggle">Toggle diff</label>
</div>
<div>
<input type="checkbox" id="reveal-toggle" />
<label for="reveal-toggle">Reveal Gold/Eval</label>
</div>
<div class="reveal-info" id="vote-info">Votes</div>
</div>
@ -177,7 +204,7 @@
<div class="image-container">
<img src="data:image/png;base64,{{ entry.page_image }}" alt="Render">
<div class="alignment">Alignment: {{ entry.left_alignment }}</div>
<div class="alignment">Alignment: {{ entry.alignment }}</div>
<a href="{{entry.signed_pdf_link}}#page={{ entry.page }}" target="_blank">{{ entry.s3_path }} (Page {{ entry.page }})</a>
<!-- Voting Buttons -->
@ -190,8 +217,10 @@
<div class="text-block {{ entry.left_class }}" data-choice="left">
<div>{{ entry.left_text|safe }}</div>
</div>
<!-- Updated Right Text-Block with separate divs for right_text and diff_text -->
<div class="text-block {{ entry.right_class }}" data-choice="right">
<div>{{ entry.right_text|safe }}</div>
<div class="right-text">{{ entry.right_text|safe }}</div>
<div class="diff-text">{{ entry.diff_text|safe }}</div>
</div>
</div>
{% endfor %}
@ -221,11 +250,18 @@
overlay.classList.remove('active');
});
// Handle Reveal Gold/Eval Toggle
document.getElementById('reveal-toggle').addEventListener('change', (e) => {
document.body.classList.toggle('revealed', e.target.checked);
updateReveal();
});
// Handle Diff Toggle
document.getElementById('diff-toggle').addEventListener('change', (e) => {
document.body.classList.toggle('diffed', e.target.checked);
toggleDiff(e.target.checked);
});
// Handle text-block selections
document.querySelectorAll('.text-block').forEach(block => {
block.addEventListener('click', () => selectChoice(block));
@ -263,7 +299,9 @@
}
});
updateVoteInfo(datastore);
// Ensure diff state is consistent on load
const diffToggle = document.getElementById('diff-toggle');
toggleDiff(diffToggle.checked);
}
async function selectChoice(block, save = true) {
@ -392,6 +430,15 @@
});
}
// Apply or clear the page-wide "diffed" class on <body>, which the
// stylesheet uses to swap the right-hand text for its diff rendering.
function toggleDiff(isDiffed) {
    // Coerce explicitly: passing `undefined` as the force argument would
    // flip the class instead of removing it.
    document.body.classList.toggle('diffed', Boolean(isDiffed));
}
</script>
</body>
</html>

View File

@ -208,9 +208,11 @@ def process_jsonl_file(jsonl_file, gold_data, comparer):
if data.error is not None:
total_errors += 1
eval_text = f"[Error processing this page: {data.error}]"
if data.error is None and data.finish_reason != "stop":
total_overruns += 1
eval_text += f"\n[Error processing this page: overrun {data.finish_reason}]"
if len(gold_text.strip()) < 3 and len(eval_text.strip()) < 3:
alignment = 1.0

View File

@ -26,7 +26,8 @@ dependencies = [
"pypdfium2",
"lingua-language-detector",
"Pillow",
"ftfy"
"ftfy",
"bleach"
]
license = {file = "LICENSE"}