# olmocr/pdelfin/eval/scoreelo.py

import requests
import re
from urllib.parse import urlsplit, urlunsplit, parse_qs, urlencode
import json
from collections import defaultdict
def fetch_review_page_html(url):
"""
Fetch the HTML from the Tinyhost URL.
"""
resp = requests.get(url)
resp.raise_for_status()
return resp.text
def extract_presigned_url(html):
"""
Given the HTML of the page, extract the `presignedGetUrl`.
Returns None if not found.
"""
match = re.search(r'const presignedGetUrl = \"(.*?)\";', html)
if not match:
return None
return match.group(1)
def fetch_presigned_datastore(presigned_url):
"""
Fetch the JSON datastore from the presigned URL.
Returns a dict. If any error or no content, returns {}.
"""
try:
# Clean up the presigned URL (sometimes the signature may need re-encoding)
url_parts = urlsplit(presigned_url)
query_params = parse_qs(url_parts.query)
encoded_query = urlencode(query_params, doseq=True)
cleaned_url = urlunsplit((
url_parts.scheme,
url_parts.netloc,
url_parts.path,
encoded_query,
url_parts.fragment
))
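        # parse_qs + urlencode(doseq=True) decodes and re-encodes every query parameter
        # (e.g. the percent-encoded Signature value), normalizing any odd encoding in the URL.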
resp = requests.get(cleaned_url, headers={
"Host": url_parts.netloc,
"User-Agent": "Mozilla/5.0"
})
resp.raise_for_status()
return resp.json()
except Exception as e:
print(f"Error fetching datastore from {presigned_url}: {e}")
return {}
def sanitize_key(key):
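    # Keys are sanitized to match the JS datastore's key format; e.g. for a hypothetical key,
    # sanitize_key("s2orc-1234.pdf-page_3") -> "s2orc-1234_pdf-page_3".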
return re.sub(r'[^a-zA-Z0-9-_]', '_', key)
def parse_entry_metadata(html):
"""
Parse each .entry block from the HTML to figure out:
- data-entry-id
- left-metadata
- right-metadata
- classes that might indicate 'gold', 'eval', etc.
    Returns a dict keyed by the sanitized entry id:
        {
            entry_id_1: {
                'class_str': str,        # raw class string from the entry div, e.g. " gold eval"
                'left_metadata': str,
                'right_metadata': str,
            },
            ...
        }
Note: This uses a regex that looks for something like:
<div class="entry gold eval" data-entry-id="..."
data-left-metadata="..." data-right-metadata="...">
Adjust if your HTML changes structure or you need more nuance.
"""
# This regex attempts to capture:
# 1) The entire `class="entry ..."` portion (to detect "gold", "eval", etc.)
# 2) data-entry-id="some_string"
# 3) data-left-metadata="some_string"
# 4) data-right-metadata="some_string"
#
    # We compile with DOTALL and MULTILINE; attribute values are captured with negated
    # character classes ([^"]*) so each capture stops at its closing quote.
pattern = re.compile(
r'<div\s+class="entry([^"]*)"\s+data-entry-id="([^"]+)"\s+data-left-metadata="([^"]+)"\s+data-right-metadata="([^"]+)"',
re.DOTALL | re.MULTILINE
)
entries = {}
for m in pattern.finditer(html):
class_str = m.group(1).strip() # e.g. " gold eval"
entry_id = m.group(2).strip()
left_md = m.group(3).strip()
right_md = m.group(4).strip()
# Transform the HTML's data-entry-id to match the JS datastore keys:
entry_id = sanitize_key(entry_id)
# Basic flags
is_left_gold = " gold" in class_str or class_str.startswith("gold")
is_right_gold = " gold" in class_str or class_str.startswith("gold")
is_left_eval = " eval" in class_str or class_str.startswith("eval")
is_right_eval = " eval" in class_str or class_str.startswith("eval")
# The above checks are naive because the template often has:
# class="entry gold eval"
# for the entire entry, meaning it might not tell you *which side* is gold/eval.
#
# If your template uses "left_class=gold" or "right_class=eval" etc.
# you might need to refine your approach:
# For instance, we see the code uses separate classes on the text blocks themselves:
# <div class="text-block {{ entry.left_class }}" data-choice="left">
    # So you may want to parse those sub-blocks if you need the actual side
    # (a rough sketch of that idea follows this function).
#
# For demonstration, we'll just store the raw class string plus the metadata:
#
# If the real logic is "gold" is always left, "eval" is always right, you can adapt here.
entries[entry_id] = {
"class_str": class_str,
"left_metadata": left_md,
"right_metadata": right_md,
}
return entries
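# A minimal sketch of the side-level parsing suggested in the comment above, assuming the
# template renders each side as <div class="text-block {{ entry.left_class }}" data-choice="left">
# (and likewise for the right). The class names and structure are assumptions taken from that
# comment, not confirmed elsewhere; this helper is illustrative and is not used by the report.
def parse_side_classes(html):
    side_pattern = re.compile(
        r'<div\s+class="text-block\s*([^"]*)"\s+data-choice="(left|right)"'
    )
    # Returns e.g. [("left", "gold"), ("right", "eval"), ...] in document order.
    return [(m.group(2), m.group(1).strip()) for m in side_pattern.finditer(html)]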
def build_comparison_report(entries_dict, datastore):
"""
Build a comparison report showing how often each type of method
beats each other type of method, based on user votes in the datastore.
We assume:
- If user vote is 'left', then left_metadata's method "wins".
- If user vote is 'right', then right_metadata's method "wins".
- If user vote is 'both_good', 'both_bad', or 'invalid_pdf', we do not count it as a direct matchup.
    Returns a list of rows, one per (methodA, methodB) pair, with win counts and win rates.
"""
print(entries_dict)
print("")
print(datastore)
print("")
# comparisons[ (methodA, methodB) ] = [A_wins, B_wins]
comparisons = defaultdict(lambda: [0, 0])
for entry_id, vote in datastore.items():
if entry_id not in entries_dict:
# No matching <div> found for this key in the HTML
continue
left_method = entries_dict[entry_id]["left_metadata"]
right_method = entries_dict[entry_id]["right_metadata"]
if vote == "left":
# left_method beats right_method
if left_method != right_method:
# Use a sorted tuple so the pair is always in a consistent order
pair = tuple(sorted([left_method, right_method]))
if pair[0] == left_method:
comparisons[pair][0] += 1 # left_method is pair[0]
else:
comparisons[pair][1] += 1 # left_method is pair[1]
elif vote == "right":
# right_method beats left_method
if left_method != right_method:
pair = tuple(sorted([left_method, right_method]))
if pair[0] == right_method:
comparisons[pair][0] += 1
else:
comparisons[pair][1] += 1
else:
# "both_good", "both_bad", "invalid_pdf", etc.
# Not counted as a direct head-to-head winner.
pass
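        # Worked example (method names illustrative): left_method="pdelfin", right_method="gold",
        # vote="left" -> pair = ("gold", "pdelfin"), so comparisons[pair][1] += 1, crediting the
        # win to "pdelfin" in the second slot.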
# Build a more readable summary
# comparisons[(A, B)] = [A_wins, B_wins], where A < B lexicographically in that tuple
results = []
for (A, B), (A_wins, B_wins) in comparisons.items():
total = A_wins + B_wins
A_rate = (A_wins / total) if total else 0
B_rate = (B_wins / total) if total else 0
results.append({
"methodA": A,
"methodB": B,
"A_wins": A_wins,
"B_wins": B_wins,
"A_win_rate": f"{100 * A_rate:.1f}%",
"B_win_rate": f"{100 * B_rate:.1f}%",
})
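    # Example row: {"methodA": "A", "methodB": "B", "A_wins": 12, "B_wins": 8,
    #               "A_win_rate": "60.0%", "B_win_rate": "40.0%"}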
return results
def make_report(urls):
"""
Main function that:
- Fetches each HTML page
- Extracts presignedGetUrl
- Fetches the JSON datastore
- Parses .entry blocks for metadata
- Produces an overall "win rate" report for each method vs method
"""
# Aggregate all entries from all URLs into a single dict
# so each entry_id is unique across all pages (they usually are).
master_entries_dict = {}
master_datastore = {}
for url in urls:
try:
html = fetch_review_page_html(url)
except Exception as e:
print(f"Error fetching HTML from {url}: {e}")
continue
# Extract the presignedGetUrl
presigned_url = extract_presigned_url(html)
if not presigned_url:
print(f"Warning: Could not find presignedGetUrl in {url}")
continue
# Fetch the datastore
datastore = fetch_presigned_datastore(presigned_url)
# Parse the HTML for entry metadata
entries_dict = parse_entry_metadata(html)
# Merge into master
for k, v in entries_dict.items():
master_entries_dict[k] = v
for k, v in datastore.items():
master_datastore[k] = v
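        # Note: plain dict assignment means a key that appears on more than one page keeps
        # only the last page's entry and vote.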
# Now build the comparison report
report = build_comparison_report(master_entries_dict, master_datastore)
print("=== Comparison Report (Win Rates) ===")
if not report:
print("No head-to-head comparisons found (did not find left/right votes).")
return
# Print out each matchup
for row in report:
A = row["methodA"]
B = row["methodB"]
A_wins = row["A_wins"]
B_wins = row["B_wins"]
A_win_rate = row["A_win_rate"]
B_win_rate = row["B_win_rate"]
print(
f"{A} vs {B}: "
f"{A} wins={A_wins} ({A_win_rate}), "
f"{B} wins={B_wins} ({B_win_rate})"
)
if __name__ == "__main__":
# Example usage
urls = [
"https://jakep-tinyhost.s3.amazonaws.com/review_page-681aae527593.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=BR1nqCUKQLBlh3HIsHjeyRVQumI%3D&Expires=1737500018",
# Add more URLs here...
]
make_report(urls)