diff --git a/.gitignore b/.gitignore index d3a4573..e0ffe86 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ s2orc_previews/* s2orc_previews_3200/* sample200_vllm/* sample200_sglang/* +pdelfin_testset/* /*.html debug.log birrpipeline-debug.log diff --git a/pdelfin/eval/runelo.py b/pdelfin/eval/buildelo.py similarity index 67% rename from pdelfin/eval/runelo.py rename to pdelfin/eval/buildelo.py index f707172..1308d43 100644 --- a/pdelfin/eval/runelo.py +++ b/pdelfin/eval/buildelo.py @@ -2,13 +2,17 @@ import argparse import boto3 import dataclasses import random +import re +from tqdm import tqdm from itertools import combinations from pdelfin.s3_utils import parse_s3_path, expand_s3_glob, get_s3_bytes from dolma_refine.evaluate.metrics import DocumentEditSimilarity from dolma_refine.evaluate.segmenters import SpacySegmenter from dolma_refine.evaluate.aligners import HirschbergAligner +from pdelfin.eval.evalhtml import create_review_html + s3_client = boto3.client('s3') @dataclasses.dataclass @@ -18,10 +22,34 @@ class Comparison: comparison_a_path: str comparison_b_path: str + comparison_a_str: str + comparison_b_str: str + alignment: float -def build_review_page(args): - pass + @property + def comparison_a_method(self): + return re.search(r'page[0-9]+_(\w+)\.md$', self.comparison_a_path).group(1) + + @property + def comparison_b_method(self): + return re.search(r'page[0-9]+_(\w+)\.md$', self.comparison_b_path).group(1) + +def build_review_page(args, comparisons): + page_data = [] + + for comp in comparisons: + page_data.append({ + "s3_path": comp.pdf_path, + "page": 1, + "gold_text": comp.comparison_a_str, + "gold_metadata": comp.comparison_a_method, + "eval_text": comp.comparison_b_str, + "eval_metadata": comp.comparison_b_method, + "alignment": comp.alignment + }) + + create_review_html(page_data, args.name + ".html") if __name__ == "__main__": parser = argparse.ArgumentParser( @@ -61,7 +89,7 @@ if __name__ == "__main__": all_pdfs = 
set(expand_s3_glob(s3_client, args.s3_path + "/*.pdf")) all_mds = set(expand_s3_glob(s3_client, args.s3_path + "/*.md")) - for pdf_path in all_pdfs: + for pdf_path in tqdm(all_pdfs): pdf_comps = [] for comp in args.comparisons: comp_path = pdf_path.replace(".pdf", f"_{comp}.md") @@ -79,10 +107,15 @@ if __name__ == "__main__": Comparison(pdf_path=pdf_path, comparison_a_path=compa, comparison_b_path=compb, + comparison_a_str=text_a, + comparison_b_str=text_b, alignment=comparer.compute(text_a, text_b) ) ) - print(all_comps[-1]) - - result = build_review_page(args) \ No newline at end of file + # DEBUG CODE, remove + if len(all_comps) > 10: + break + + all_comps.sort(key=lambda c: c.alignment) + result = build_review_page(args, all_comps[0:args.review_size]) \ No newline at end of file diff --git a/pdelfin/eval/evalhtml.py b/pdelfin/eval/evalhtml.py index 8ab4b35..390b396 100644 --- a/pdelfin/eval/evalhtml.py +++ b/pdelfin/eval/evalhtml.py @@ -35,9 +35,11 @@ def process_entry(i, entry): if random.choice([True, False]): left_text, right_text = entry["gold_text"], entry["eval_text"] left_class, right_class = "gold", "eval" + left_metadata, right_metadata = entry.get("gold_metadata", ""), entry.get("eval_metadata", "") else: left_text, right_text = entry["eval_text"], entry["gold_text"] left_class, right_class = "eval", "gold" + left_metadata, right_metadata = entry.get("eval_metadata", ""), entry.get("gold_metadata", "") # Generate diff for right_text compared to left_text diff_html = generate_diff_html(left_text, right_text) @@ -70,6 +72,8 @@ def process_entry(i, entry): "page": entry["page"], "alignment": entry["alignment"], "signed_pdf_link": signed_pdf_link, + "left_metadata": left_metadata, + "right_metadata": right_metadata, "left_text": left_text, "right_text": right_text, "diff_text": diff_html, diff --git a/pdelfin/eval/evalhtml_template.html b/pdelfin/eval/evalhtml_template.html index 9c79039..84b1bdd 100644 --- a/pdelfin/eval/evalhtml_template.html +++ 
b/pdelfin/eval/evalhtml_template.html @@ -200,7 +200,7 @@
{% for entry in entries %} -
+
Render diff --git a/pdelfin/eval/scoreelo.py b/pdelfin/eval/scoreelo.py new file mode 100644 index 0000000..87efb1f --- /dev/null +++ b/pdelfin/eval/scoreelo.py @@ -0,0 +1,85 @@ +# TODO Takes in a list of tinyhost urls as arguments +# ex https://jakep-tinyhost.s3.amazonaws.com/review_page-a1617c2734b2.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=NEsAN69b98Z%2BqDR23zmQKu%2B5IHM%3D&Expires=1737496145 + +# Extracts out the presignedGetUrl from the source code, +# const presignedGetUrl = "https://jakep-tinyhost.s3.amazonaws.com//etSe2zObhx1hpcO7TcS7.json?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=bl0wav%2BDqXL5%2FCo12Mmu2Sm0gGQ%3D&Expires=1737496145"; +# And gets the contents of this page + +# Next, gets all the votes, figures out what they match to + +# Given all the votes, calculates the ELO score + +import requests +import re +from urllib.parse import urlsplit, urlunsplit, parse_qs, urlencode +import logging + +logging.basicConfig(level=logging.DEBUG) + +def fetch_presigned_content(urls): + """ + Extracts the `presignedGetUrl` from the source code of the given URLs and fetches the content of the URL. + + Args: + urls (list): List of tinyhost URLs. + + Returns: + dict: A dictionary mapping the original URL to the content of the `presignedGetUrl`. 
+ """ + results = {} + + for url in urls: + try: + # Fetch the source code of the page + response = requests.get(url) + response.raise_for_status() + source_code = response.text + + # Extract the presignedGetUrl using a regular expression + match = re.search(r'const presignedGetUrl = \"(.*?)\";', source_code) + if not match: + print(f"No presignedGetUrl found in {url}") + results[url] = None + continue + + presigned_url = match.group(1) + + # Fetch the content of the presigned URL + print(presigned_url) + # Step 1: Split the URL into components + url_parts = urlsplit(presigned_url) + + # Step 2: Parse query parameters + query_params = parse_qs(url_parts.query) + + print(query_params) + # Step 3: Re-encode the query parameters properly + encoded_query = urlencode(query_params, doseq=True) + + # Step 4: Rebuild the URL with the cleaned query string + cleaned_url = urlunsplit((url_parts.scheme, url_parts.netloc, url_parts.path, encoded_query, url_parts.fragment)) + + print("Cleaned URL:", cleaned_url) + + presigned_response = requests.get(presigned_url, headers={"Host": "jakep-tinyhost.s3.amazonaws.com", + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"}) + presigned_response.raise_for_status() + + # Store the content in the results dictionary + results[url] = presigned_response.text + except requests.RequestException as e: + print(f"Error fetching data from {url} or its presigned URL: {e}") + results[url] = None + + return results + +# Example usage +urls = [ + "https://jakep-tinyhost.s3.amazonaws.com/review_page-59c2f52d9bf3.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=UPIEQMLEXWG%2BpAkvm7YJrrEIgnI%3D&Expires=1737499054" +] + +content_map = fetch_presigned_content(urls) + +for original_url, content in content_map.items(): + print(f"Content fetched from presigned URL in {original_url}:") + print(content)