mirror of
https://github.com/allenai/olmocr.git
synced 2025-12-02 02:01:09 +00:00
Adding diff to tinyhost
This commit is contained in:
parent
400e92180b
commit
0c56dec704
@ -1,70 +1,88 @@
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from jinja2 import Template
|
||||
import random
|
||||
import os
|
||||
import subprocess
|
||||
import random
|
||||
import tempfile
|
||||
import boto3
|
||||
import base64
|
||||
import io
|
||||
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from jinja2 import Template
|
||||
from urllib.parse import urlparse
|
||||
from PIL import Image
|
||||
from difflib import SequenceMatcher
|
||||
from tqdm import tqdm
|
||||
|
||||
from pdelfin.silver_data.renderpdf import render_pdf_to_base64png
|
||||
|
||||
session = boto3.Session(profile_name='s2')
|
||||
s3_client = session.client('s3')
|
||||
|
||||
def generate_diff_html(a, b):
    """
    Render the differences between strings ``a`` and ``b`` as an HTML fragment.

    Text common to both strings is emitted unchanged. Text present only in
    ``a`` is wrapped in ``<span class='removed'>`` and text present only in
    ``b`` in ``<span class='added'>``; a replacement emits the removed span
    immediately followed by the added span.

    NOTE(review): fragments are inserted verbatim, without HTML escaping, so
    any markup contained in ``a`` or ``b`` ends up in the returned string.
    """

    def _span(css_class, fragment):
        # Single place that defines the markup used for highlighted runs.
        return f"<span class='{css_class}'>{fragment}</span>"

    pieces = []
    opcodes = SequenceMatcher(None, a, b).get_opcodes()
    for tag, a_start, a_end, b_start, b_end in opcodes:
        old_part = a[a_start:a_end]
        new_part = b[b_start:b_end]

        if tag == 'equal':
            pieces.append(old_part)
            continue

        # 'replace' contributes to both lists below: removed first, then added.
        if tag in ('delete', 'replace'):
            pieces.append(_span('removed', old_part))
        if tag in ('insert', 'replace'):
            pieces.append(_span('added', new_part))

    return "".join(pieces)
|
||||
|
||||
def process_entry(i, entry):
    """
    Build the template context dictionary for a single review entry.

    Args:
        i: Identifier stored as ``entry_id`` in the result.
        entry: Mapping read with the keys ``gold_text``, ``eval_text``,
            ``alignment``, ``s3_path`` and ``page``.

    Returns:
        A dict holding the rendered page image (base64 PNG as returned by
        ``render_pdf_to_base64png``), a presigned link to the PDF, the
        left/right texts wrapped in ``<p>`` tags, the diff HTML of right vs.
        left, and the CSS class names for each side.

    The PDF is downloaded to a temporary file that is removed before
    returning, whether or not the download or rendering succeeds.
    """
    # Randomly decide whether to display gold on the left or right
    if random.choice([True, False]):
        left_text, right_text = entry["gold_text"], entry["eval_text"]
        left_class, right_class = "gold", "eval"
    else:
        left_text, right_text = entry["eval_text"], entry["gold_text"]
        left_class, right_class = "eval", "gold"

    # Both sides carry the same alignment value regardless of ordering.
    left_alignment, right_alignment = entry["alignment"], entry["alignment"]

    # Generate diff for right_text compared to left_text. This must happen
    # on the raw text, before the <p> wrapping below adds markup.
    diff_html = generate_diff_html(left_text, right_text)

    # Convert newlines to <p> tags for proper formatting
    left_text = "<p>" + left_text.replace("\n", "</p><p>") + "</p>"
    right_text = "<p>" + right_text.replace("\n", "</p><p>") + "</p>"
    diff_html = "<p>" + diff_html.replace("\n", "</p><p>") + "</p>"

    parsed_url = urlparse(entry["s3_path"])
    bucket = parsed_url.netloc
    s3_key = parsed_url.path.lstrip('/')
    signed_pdf_link = s3_client.generate_presigned_url(
        "get_object",
        Params={"Bucket": bucket, "Key": s3_key},
        ExpiresIn=604800  # seconds, i.e. 7 days
    )

    # Reserve a temp path only; close the handle before writing to the path.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
        pdf_path = tmp_pdf.name

    try:
        bucket, key = entry["s3_path"].replace("s3://", "").split('/', 1)
        s3_client.download_file(bucket, key, pdf_path)

        page_image_base64 = render_pdf_to_base64png(
            pdf_path, entry["page"], target_longest_image_dim=1024
        )
    finally:
        # delete=False means nothing else cleans this up; without the
        # explicit removal every processed entry leaves a PDF behind.
        os.remove(pdf_path)

    return {
        "entry_id": i,
        "page_image": page_image_base64,
        "s3_path": entry["s3_path"],
        "page": entry["page"],
        "alignment": entry["alignment"],
        "signed_pdf_link": signed_pdf_link,
        "left_text": left_text,
        "right_text": right_text,
        "left_alignment": left_alignment,
        "right_alignment": right_alignment,
        "diff_text": diff_html,
        "left_class": left_class,
        "right_class": right_class,
        "gold_class": "gold" if left_class == "gold" else "eval",
        "eval_class": "eval" if right_class == "eval" else "gold"
    }
|
||||
|
||||
|
||||
def create_review_html(data, filename="review_page.html"):
|
||||
# Load the Jinja2 template from the file
|
||||
with open(os.path.join(os.path.dirname(__file__), "evalhtml_template.html"), "r") as f:
|
||||
template_path = os.path.join(os.path.dirname(__file__), "evalhtml_template.html")
|
||||
with open(template_path, "r") as f:
|
||||
template = Template(f.read())
|
||||
|
||||
entries = []
|
||||
|
||||
@ -41,6 +41,7 @@
|
||||
flex-direction: column;
|
||||
justify-content: space-between;
|
||||
cursor: pointer;
|
||||
position: relative;
|
||||
}
|
||||
.text-block:hover {
|
||||
background-color: #e0e0e0;
|
||||
@ -159,6 +160,25 @@
|
||||
.voting-buttons button.selected {
|
||||
border: 3px solid #000;
|
||||
}
|
||||
/* for diffs */
|
||||
.added {
|
||||
background-color: #d4fcdc;
|
||||
}
|
||||
.removed {
|
||||
background-color: #fcd4d4;
|
||||
text-decoration: line-through;
|
||||
}
|
||||
|
||||
/* Diff Toggle Styles */
|
||||
body.diffed .right-text {
|
||||
display: none;
|
||||
}
|
||||
body.diffed .diff-text {
|
||||
display: block;
|
||||
}
|
||||
.diff-text {
|
||||
display: none;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
@ -166,8 +186,15 @@
|
||||
|
||||
<!-- Floating Reveal Box -->
|
||||
<div class="reveal-box">
|
||||
<input type="checkbox" id="reveal-toggle" />
|
||||
<label for="reveal-toggle">Reveal Gold/Eval</label>
|
||||
<div>
|
||||
<input type="checkbox" id="diff-toggle" />
|
||||
<label for="diff-toggle">Toggle diff</label>
|
||||
</div>
|
||||
|
||||
<div>
|
||||
<input type="checkbox" id="reveal-toggle" />
|
||||
<label for="reveal-toggle">Reveal Gold/Eval</label>
|
||||
</div>
|
||||
<div class="reveal-info" id="vote-info">Votes</div>
|
||||
</div>
|
||||
|
||||
@ -177,7 +204,7 @@
|
||||
<div class="image-container">
|
||||
<img src="data:image/png;base64,{{ entry.page_image }}" alt="Render">
|
||||
|
||||
<div class="alignment">Alignment: {{ entry.left_alignment }}</div>
|
||||
<div class="alignment">Alignment: {{ entry.alignment }}</div>
|
||||
<a href="{{entry.signed_pdf_link}}#page={{ entry.page }}" target="_blank">{{ entry.s3_path }} (Page {{ entry.page }})</a>
|
||||
|
||||
<!-- Voting Buttons -->
|
||||
@ -190,8 +217,10 @@
|
||||
<div class="text-block {{ entry.left_class }}" data-choice="left">
|
||||
<div>{{ entry.left_text|safe }}</div>
|
||||
</div>
|
||||
<!-- Updated Right Text-Block with separate divs for right_text and diff_text -->
|
||||
<div class="text-block {{ entry.right_class }}" data-choice="right">
|
||||
<div>{{ entry.right_text|safe }}</div>
|
||||
<div class="right-text">{{ entry.right_text|safe }}</div>
|
||||
<div class="diff-text">{{ entry.diff_text|safe }}</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endfor %}
|
||||
@ -221,11 +250,18 @@
|
||||
overlay.classList.remove('active');
|
||||
});
|
||||
|
||||
// Handle Reveal Gold/Eval Toggle
|
||||
document.getElementById('reveal-toggle').addEventListener('change', (e) => {
|
||||
document.body.classList.toggle('revealed', e.target.checked);
|
||||
updateReveal();
|
||||
});
|
||||
|
||||
// Handle Diff Toggle
|
||||
document.getElementById('diff-toggle').addEventListener('change', (e) => {
|
||||
document.body.classList.toggle('diffed', e.target.checked);
|
||||
toggleDiff(e.target.checked);
|
||||
});
|
||||
|
||||
// Handle text-block selections
|
||||
document.querySelectorAll('.text-block').forEach(block => {
|
||||
block.addEventListener('click', () => selectChoice(block));
|
||||
@ -263,7 +299,9 @@
|
||||
}
|
||||
});
|
||||
|
||||
updateVoteInfo(datastore);
|
||||
// Ensure diff state is consistent on load
|
||||
const diffToggle = document.getElementById('diff-toggle');
|
||||
toggleDiff(diffToggle.checked);
|
||||
}
|
||||
|
||||
async function selectChoice(block, save = true) {
|
||||
@ -392,6 +430,15 @@
|
||||
});
|
||||
}
|
||||
|
||||
// Switch the page between the plain right-hand text and the diff view.
// A truthy isDiffed puts the 'diffed' class on <body>, a falsy one removes it.
function toggleDiff(isDiffed) {
    // Coerce explicitly: an undefined "force" argument would make toggle()
    // flip the class instead of removing it.
    document.body.classList.toggle('diffed', Boolean(isDiffed));
}
|
||||
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
@ -208,9 +208,11 @@ def process_jsonl_file(jsonl_file, gold_data, comparer):
|
||||
|
||||
if data.error is not None:
|
||||
total_errors += 1
|
||||
eval_text = f"[Error processing this page: {data.error}]"
|
||||
|
||||
if data.error is None and data.finish_reason != "stop":
|
||||
total_overruns += 1
|
||||
eval_text += f"\n[Error processing this page: overrun {data.finish_reason}]"
|
||||
|
||||
if len(gold_text.strip()) < 3 and len(eval_text.strip()) < 3:
|
||||
alignment = 1.0
|
||||
|
||||
@ -26,7 +26,8 @@ dependencies = [
|
||||
"pypdfium2",
|
||||
"lingua-language-detector",
|
||||
"Pillow",
|
||||
"ftfy"
|
||||
"ftfy",
|
||||
"bleach"
|
||||
]
|
||||
license = {file = "LICENSE"}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user