Mirror of https://github.com/allenai/olmocr.git
Synced 2025-12-11 23:32:10 +00:00
More elo scoring stuff

This commit is contained in:
parent 834e91c8d5
commit 00f2a67ac4
.gitignore (vendored), 1 line changed
@@ -9,6 +9,7 @@ s2orc_previews/*
 s2orc_previews_3200/*
 sample200_vllm/*
 sample200_sglang/*
+pdelfin_testset/*
 /*.html
 debug.log
 birrpipeline-debug.log
@@ -2,13 +2,17 @@ import argparse
 import boto3
 import dataclasses
+import random
 import re
 
+from tqdm import tqdm
 from itertools import combinations
 from pdelfin.s3_utils import parse_s3_path, expand_s3_glob, get_s3_bytes
 from dolma_refine.evaluate.metrics import DocumentEditSimilarity
 from dolma_refine.evaluate.segmenters import SpacySegmenter
 from dolma_refine.evaluate.aligners import HirschbergAligner
 
+from pdelfin.eval.evalhtml import create_review_html
+
 s3_client = boto3.client('s3')
 
 @dataclasses.dataclass
@@ -18,10 +22,34 @@ class Comparison:
     comparison_a_path: str
     comparison_b_path: str
 
     comparison_a_str: str
     comparison_b_str: str
 
     alignment: float
 
-def build_review_page(args):
-    pass
+    @property
+    def comparison_a_method(self):
+        return re.search(r'page[0-9]+_(\w+)\.md$', self.comparison_a_path).group(1)
+
+    @property
+    def comparison_b_method(self):
+        return re.search(r'page[0-9]+_(\w+)\.md$', self.comparison_b_path).group(1)
+
+def build_review_page(args, comparisons):
+    page_data = []
+
+    for comp in comparisons:
+        page_data.append({
+            "s3_path": comp.pdf_path,
+            "page": 1,
+            "gold_text": comp.comparison_a_str,
+            "gold_metadata": comp.comparison_a_method,
+            "eval_text": comp.comparison_b_str,
+            "eval_metadata": comp.comparison_b_method,
+            "alignment": comp.alignment
+        })
+
+    create_review_html(page_data, args.name + ".html")
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
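The two new properties pull the method name out of the comparison filename, relying on the `_page<N>_<method>.md` naming convention used for the markdown outputs. A quick illustration of what the regex captures, using a hypothetical path:

import re

# Hypothetical path; real paths come from expand_s3_glob below
path = "s3://bucket/eval_set/doc1_page1_gpt4o.md"
method = re.search(r'page[0-9]+_(\w+)\.md$', path).group(1)
print(method)  # -> gpt4o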
@@ -61,7 +89,7 @@ if __name__ == "__main__":
     all_pdfs = set(expand_s3_glob(s3_client, args.s3_path + "/*.pdf"))
     all_mds = set(expand_s3_glob(s3_client, args.s3_path + "/*.md"))
 
-    for pdf_path in all_pdfs:
+    for pdf_path in tqdm(all_pdfs):
         pdf_comps = []
         for comp in args.comparisons:
             comp_path = pdf_path.replace(".pdf", f"_{comp}.md")
@@ -79,10 +107,15 @@ if __name__ == "__main__":
                 Comparison(pdf_path=pdf_path,
                            comparison_a_path=compa,
                            comparison_b_path=compb,
                            comparison_a_str=text_a,
                            comparison_b_str=text_b,
                            alignment=comparer.compute(text_a, text_b)
                            )
             )
 
-    print(all_comps[-1])
-
-    result = build_review_page(args)
+
+        # DEBUG CODE, remove
+        if len(all_comps) > 10:
+            break
+
+    all_comps.sort(key=lambda c: c.alignment)
+
+    result = build_review_page(args, all_comps[0:args.review_size])
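One thing this hunk does not show is how `comparer` is constructed. Given the dolma_refine imports added at the top of the file, it is presumably a DocumentEditSimilarity built from a spaCy segmenter and a Hirschberg aligner; a minimal sketch, with the alignment scores as assumptions rather than values taken from this commit:

from dolma_refine.evaluate.metrics import DocumentEditSimilarity
from dolma_refine.evaluate.segmenters import SpacySegmenter
from dolma_refine.evaluate.aligners import HirschbergAligner

# Assumed construction; the match/mismatch/indel scores are guesses,
# not taken from this diff.
segmenter = SpacySegmenter("spacy")
aligner = HirschbergAligner(match_score=1,
                            mismatch_score=-1,
                            indel_score=-1)
comparer = DocumentEditSimilarity(segmenter=segmenter, aligner=aligner)

# comparer.compute(text_a, text_b) returns a similarity score, so sorting
# all_comps ascending puts the most divergent pairs first for review.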
@@ -35,9 +35,11 @@ def process_entry(i, entry):
     if random.choice([True, False]):
         left_text, right_text = entry["gold_text"], entry["eval_text"]
         left_class, right_class = "gold", "eval"
+        left_metadata, right_metadata = entry.get("gold_metadata", ""), entry.get("eval_metadata", "")
     else:
         left_text, right_text = entry["eval_text"], entry["gold_text"]
         left_class, right_class = "eval", "gold"
+        left_metadata, right_metadata = entry.get("eval_metadata", ""), entry.get("gold_metadata", "")
 
     # Generate diff for right_text compared to left_text
     diff_html = generate_diff_html(left_text, right_text)
@@ -70,6 +72,8 @@ def process_entry(i, entry):
         "page": entry["page"],
         "alignment": entry["alignment"],
         "signed_pdf_link": signed_pdf_link,
+        "left_metadata": left_metadata,
+        "right_metadata": right_metadata,
         "left_text": left_text,
         "right_text": right_text,
         "diff_text": diff_html,
@@ -200,7 +200,7 @@
 
 <div class="container">
     {% for entry in entries %}
-    <div class="entry {{ entry.gold_class }} {{ entry.eval_class }}" data-entry-id="{{ entry.s3_path | replace('/', '_') }}_{{ entry.page }}">
+    <div class="entry {{ entry.gold_class }} {{ entry.eval_class }}" data-entry-id="{{ entry.s3_path | replace('/', '_') }}_{{ entry.page }}" data-left-metadata="{{ entry.left_metadata }}" data-right-metadata="{{ entry.right_metadata }}">
         <div class="image-container">
             <img src="data:image/png;base64,{{ entry.page_image }}" alt="Render">
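Threading left_metadata/right_metadata through process_entry and into data attributes on each entry div is what lets a later scoring pass map a left/right vote back to the underlying method, since the gold/eval sides are shuffled per entry. A hypothetical sketch of that reverse mapping (the vote record format is assumed, not defined in this commit):

# Hypothetical vote record scraped from a review page; the format is
# an assumption, not something defined in this diff.
vote = {"entry_id": "s3_bucket_doc1.pdf_1", "choice": "left"}
entry = {"left_metadata": "gpt4o", "right_metadata": "pdelfin"}

winner = entry["left_metadata"] if vote["choice"] == "left" else entry["right_metadata"]
loser = entry["right_metadata"] if vote["choice"] == "left" else entry["left_metadata"]
print(winner, ">", loser)  # -> gpt4o > pdelfin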
pdelfin/eval/scoreelo.py (new file, 85 lines)
@@ -0,0 +1,85 @@
+# TODO: Takes in a list of tinyhost urls as arguments
+# ex https://jakep-tinyhost.s3.amazonaws.com/review_page-a1617c2734b2.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=NEsAN69b98Z%2BqDR23zmQKu%2B5IHM%3D&Expires=1737496145
+
+# Extracts out the presignedGetUrl from the source code,
+# const presignedGetUrl = "https://jakep-tinyhost.s3.amazonaws.com//etSe2zObhx1hpcO7TcS7.json?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=bl0wav%2BDqXL5%2FCo12Mmu2Sm0gGQ%3D&Expires=1737496145";
+# and gets the contents of this page
+
+# Next, gets all the votes and figures out what they match to
+
+# Given all the votes, calculates the ELO score
+
+import requests
+import re
+from urllib.parse import urlsplit, urlunsplit, parse_qs, urlencode
+import logging
+
+logging.basicConfig(level=logging.DEBUG)
+
+def fetch_presigned_content(urls):
+    """
+    Extracts the `presignedGetUrl` from the source code of each given URL
+    and fetches the content behind it.
+
+    Args:
+        urls (list): List of tinyhost URLs.
+
+    Returns:
+        dict: A dictionary mapping each original URL to the content of its
+        `presignedGetUrl`, or None if the fetch failed.
+    """
+    results = {}
+
+    for url in urls:
+        try:
+            # Fetch the source code of the page
+            response = requests.get(url)
+            response.raise_for_status()
+            source_code = response.text
+
+            # Extract the presignedGetUrl using a regular expression
+            match = re.search(r'const presignedGetUrl = \"(.*?)\";', source_code)
+            if not match:
+                print(f"No presignedGetUrl found in {url}")
+                results[url] = None
+                continue
+
+            presigned_url = match.group(1)
+
+            # Fetch the content of the presigned URL
+            print(presigned_url)
+            # Step 1: Split the URL into components
+            url_parts = urlsplit(presigned_url)
+
+            # Step 2: Parse query parameters
+            query_params = parse_qs(url_parts.query)
+            print(query_params)
+
+            # Step 3: Re-encode the query parameters properly
+            encoded_query = urlencode(query_params, doseq=True)
+
+            # Step 4: Rebuild the URL with the cleaned query string
+            cleaned_url = urlunsplit((url_parts.scheme, url_parts.netloc, url_parts.path, encoded_query, url_parts.fragment))
+            print("Cleaned URL:", cleaned_url)
+
+            presigned_response = requests.get(presigned_url,
+                                              headers={"Host": "jakep-tinyhost.s3.amazonaws.com",
+                                                       "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"})
+            presigned_response.raise_for_status()
+
+            # Store the content in the results dictionary
+            results[url] = presigned_response.text
+        except requests.RequestException as e:
+            print(f"Error fetching data from {url} or its presigned URL: {e}")
+            results[url] = None
+
+    return results
+
+# Example usage
+urls = [
+    "https://jakep-tinyhost.s3.amazonaws.com/review_page-59c2f52d9bf3.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=UPIEQMLEXWG%2BpAkvm7YJrrEIgnI%3D&Expires=1737499054"
+]
+
+content_map = fetch_presigned_content(urls)
+
+for original_url, content in content_map.items():
+    print(f"Content fetched from presigned URL in {original_url}:")
+    print(content)
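As the TODO comments note, scoreelo.py currently stops at fetching the review pages; extracting votes and computing Elo are not implemented yet. For reference, a minimal sketch of the standard Elo update the final step could use, assuming votes reduce to (winner, loser) method pairs; the K-factor and initial rating are conventional defaults, not project settings:

from collections import defaultdict

def compute_elo(votes, k=32, initial=1000.0):
    """Compute Elo ratings from an ordered list of (winner, loser) pairs."""
    ratings = defaultdict(lambda: initial)
    for winner, loser in votes:
        # Expected score of the winner under the logistic Elo model
        expected = 1.0 / (1.0 + 10 ** ((ratings[loser] - ratings[winner]) / 400.0))
        # Winner gains what the loser gives up, keeping the update zero-sum
        ratings[winner] += k * (1.0 - expected)
        ratings[loser] -= k * (1.0 - expected)
    return dict(ratings)

# Example: three head-to-head votes between two hypothetical methods
print(compute_elo([("gpt4o", "pdelfin"), ("pdelfin", "gpt4o"), ("gpt4o", "pdelfin")]))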