mirror of https://github.com/allenai/olmocr.git
synced 2025-12-14 16:53:18 +00:00

More elo scoring stuff

commit 00f2a67ac4 (parent 834e91c8d5)

.gitignore (vendored) | 1 +
@@ -9,6 +9,7 @@ s2orc_previews/*
 s2orc_previews_3200/*
 sample200_vllm/*
 sample200_sglang/*
+pdelfin_testset/*
 /*.html
 debug.log
 birrpipeline-debug.log

@@ -2,13 +2,17 @@ import argparse
 import boto3
 import dataclasses
 import random
+import re
+
+from tqdm import tqdm
 from itertools import combinations
 from pdelfin.s3_utils import parse_s3_path, expand_s3_glob, get_s3_bytes
 from dolma_refine.evaluate.metrics import DocumentEditSimilarity
 from dolma_refine.evaluate.segmenters import SpacySegmenter
 from dolma_refine.evaluate.aligners import HirschbergAligner
 
+from pdelfin.eval.evalhtml import create_review_html
 
 s3_client = boto3.client('s3')
 
 @dataclasses.dataclass
@@ -18,10 +22,34 @@ class Comparison:
     comparison_a_path: str
     comparison_b_path: str
 
+    comparison_a_str: str
+    comparison_b_str: str
+
     alignment: float
 
-def build_review_page(args):
-    pass
+    @property
+    def comparison_a_method(self):
+        return re.search(r'page[0-9]+_(\w+)\.md$', self.comparison_a_path).group(1)
+
+    @property
+    def comparison_b_method(self):
+        return re.search(r'page[0-9]+_(\w+)\.md$', self.comparison_b_path).group(1)
+
+
+def build_review_page(args, comparisons):
+    page_data = []
+
+    for comp in comparisons:
+        page_data.append({
+            "s3_path": comp.pdf_path,
+            "page": 1,
+            "gold_text": comp.comparison_a_str,
+            "gold_metadata": comp.comparison_a_method,
+            "eval_text": comp.comparison_b_str,
+            "eval_metadata": comp.comparison_b_method,
+            "alignment": comp.alignment
+        })
+
+    create_review_html(page_data, args.name + ".html")
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
@@ -61,7 +89,7 @@ if __name__ == "__main__":
     all_pdfs = set(expand_s3_glob(s3_client, args.s3_path + "/*.pdf"))
     all_mds = set(expand_s3_glob(s3_client, args.s3_path + "/*.md"))
 
-    for pdf_path in all_pdfs:
+    for pdf_path in tqdm(all_pdfs):
         pdf_comps = []
         for comp in args.comparisons:
             comp_path = pdf_path.replace(".pdf", f"_{comp}.md")
@@ -79,10 +107,15 @@ if __name__ == "__main__":
                 Comparison(pdf_path=pdf_path,
                            comparison_a_path=compa,
                            comparison_b_path=compb,
+                           comparison_a_str=text_a,
+                           comparison_b_str=text_b,
                            alignment=comparer.compute(text_a, text_b)
                            )
             )
 
-        print(all_comps[-1])
+        # DEBUG CODE, remove
+        if len(all_comps) > 10:
+            break
 
-    result = build_review_page(args)
+    all_comps.sort(key=lambda c: c.alignment)
+    result = build_review_page(args, all_comps[0:args.review_size])
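
The new comparison_a_method / comparison_b_method properties recover a method name from the comparison file's suffix. A minimal sketch of that regex in use; the sample path is hypothetical, not from this commit:

    import re

    # Hypothetical path following the page<N>_<method>.md naming used above
    path = "s3://bucket/docs/report_page1_gpt4o.md"

    # Same pattern as the new properties: capture the method suffix
    match = re.search(r'page[0-9]+_(\w+)\.md$', path)
    print(match.group(1))  # prints: gpt4o
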
@@ -35,9 +35,11 @@ def process_entry(i, entry):
     if random.choice([True, False]):
         left_text, right_text = entry["gold_text"], entry["eval_text"]
         left_class, right_class = "gold", "eval"
+        left_metadata, right_metadata = entry.get("gold_metadata", ""), entry.get("eval_metadata", "")
     else:
         left_text, right_text = entry["eval_text"], entry["gold_text"]
         left_class, right_class = "eval", "gold"
+        left_metadata, right_metadata = entry.get("eval_metadata", ""), entry.get("gold_metadata", "")
 
     # Generate diff for right_text compared to left_text
     diff_html = generate_diff_html(left_text, right_text)
@@ -70,6 +72,8 @@ def process_entry(i, entry):
         "page": entry["page"],
         "alignment": entry["alignment"],
         "signed_pdf_link": signed_pdf_link,
+        "left_metadata": left_metadata,
+        "right_metadata": right_metadata,
         "left_text": left_text,
         "right_text": right_text,
         "diff_text": diff_html,
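
The random left/right assignment keeps the review blind, and the new metadata fields are what would let a reviewer's vote be traced back to a method afterwards. A small illustration of that mapping; the entry values and vote format are assumptions:

    import random

    # Hypothetical entry; field names mirror the page_data built in the diff above
    entry = {"gold_metadata": "gold_method", "eval_metadata": "eval_method"}

    # Same blind shuffle as process_entry: gold randomly lands on the left or right
    if random.choice([True, False]):
        left_metadata, right_metadata = entry["gold_metadata"], entry["eval_metadata"]
    else:
        left_metadata, right_metadata = entry["eval_metadata"], entry["gold_metadata"]

    # A vote for one side can now be attributed to the method that produced it
    vote = "left"  # assumed vote format
    winning_method = left_metadata if vote == "left" else right_metadata
    print(winning_method)
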
@@ -200,7 +200,7 @@
 
     <div class="container">
         {% for entry in entries %}
-        <div class="entry {{ entry.gold_class }} {{ entry.eval_class }}" data-entry-id="{{ entry.s3_path | replace('/', '_') }}_{{ entry.page }}">
+        <div class="entry {{ entry.gold_class }} {{ entry.eval_class }}" data-entry-id="{{ entry.s3_path | replace('/', '_') }}_{{ entry.page }}" data-left-metadata="{{ entry.left_metadata }}" data-right-metadata="{{ entry.right_metadata }}">
             <div class="image-container">
                 <img src="data:image/png;base64,{{ entry.page_image }}" alt="Render">
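
The data-left-metadata / data-right-metadata attributes embed the method names in each rendered entry, presumably so a downstream scorer can attribute votes. A rough sketch of reading them back out of a saved review page with the stdlib HTMLParser; the sample markup is illustrative, not from this commit:

    from html.parser import HTMLParser

    class EntryScraper(HTMLParser):
        """Collects (entry-id, left, right) metadata from a rendered review page."""
        def __init__(self):
            super().__init__()
            self.entries = []

        def handle_starttag(self, tag, attrs):
            a = dict(attrs)
            if tag == "div" and "data-entry-id" in a:
                self.entries.append((a["data-entry-id"],
                                     a.get("data-left-metadata"),
                                     a.get("data-right-metadata")))

    scraper = EntryScraper()
    scraper.feed('<div class="entry" data-entry-id="doc_1" '
                 'data-left-metadata="gold_a" data-right-metadata="eval_b"></div>')
    print(scraper.entries)  # [('doc_1', 'gold_a', 'eval_b')]
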

pdelfin/eval/scoreelo.py (new file) | 85 +
@@ -0,0 +1,85 @@
+# TODO Takes in a list of tinyhost urls as arguments
+# ex https://jakep-tinyhost.s3.amazonaws.com/review_page-a1617c2734b2.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=NEsAN69b98Z%2BqDR23zmQKu%2B5IHM%3D&Expires=1737496145
+
+# Extracts out the presignedGetUrl from the source code,
+# const presignedGetUrl = "https://jakep-tinyhost.s3.amazonaws.com//etSe2zObhx1hpcO7TcS7.json?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=bl0wav%2BDqXL5%2FCo12Mmu2Sm0gGQ%3D&Expires=1737496145";
+# And gets the contents of this page
+
+# Next, gets all the votes, figures out what they match to
+
+# Given all the votes, calculates the ELO score
+
+import requests
+import re
+from urllib.parse import urlsplit, urlunsplit, parse_qs, urlencode
+import logging
+
+logging.basicConfig(level=logging.DEBUG)
+
+def fetch_presigned_content(urls):
+    """
+    Extracts the `presignedGetUrl` from the source code of the given URLs and fetches the content of that URL.
+
+    Args:
+        urls (list): List of tinyhost URLs.
+
+    Returns:
+        dict: A dictionary mapping the original URL to the content of the `presignedGetUrl`.
+    """
+    results = {}
+
+    for url in urls:
+        try:
+            # Fetch the source code of the page
+            response = requests.get(url)
+            response.raise_for_status()
+            source_code = response.text
+
+            # Extract the presignedGetUrl using a regular expression
+            match = re.search(r'const presignedGetUrl = \"(.*?)\";', source_code)
+            if not match:
+                print(f"No presignedGetUrl found in {url}")
+                results[url] = None
+                continue
+
+            presigned_url = match.group(1)
+
+            # Fetch the content of the presigned URL
+            print(presigned_url)
+            # Step 1: Split the URL into components
+            url_parts = urlsplit(presigned_url)
+
+            # Step 2: Parse query parameters
+            query_params = parse_qs(url_parts.query)
+
+            print(query_params)
+            # Step 3: Re-encode the query parameters properly
+            encoded_query = urlencode(query_params, doseq=True)
+
+            # Step 4: Rebuild the URL with the cleaned query string
+            cleaned_url = urlunsplit((url_parts.scheme, url_parts.netloc, url_parts.path, encoded_query, url_parts.fragment))
+
+            print("Cleaned URL:", cleaned_url)
+
+            presigned_response = requests.get(presigned_url, headers={"Host": "jakep-tinyhost.s3.amazonaws.com",
+                                                                      "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"})
+            presigned_response.raise_for_status()
+
+            # Store the content in the results dictionary
+            results[url] = presigned_response.text
+        except requests.RequestException as e:
+            print(f"Error fetching data from {url} or its presigned URL: {e}")
+            results[url] = None
+
+    return results
+
+# Example usage
+urls = [
+    "https://jakep-tinyhost.s3.amazonaws.com/review_page-59c2f52d9bf3.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=UPIEQMLEXWG%2BpAkvm7YJrrEIgnI%3D&Expires=1737499054"
+]
+
+content_map = fetch_presigned_content(urls)
+
+for original_url, content in content_map.items():
+    print(f"Content fetched from presigned URL in {original_url}:")
+    print(content)
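
scoreelo.py stops at fetching the vote pages; the ELO computation promised in its header comments is not implemented yet. For reference, a minimal sketch of a standard Elo update over pairwise votes; the K-factor, initial rating, and sample votes are assumptions, not values from this commit:

    # Minimal Elo update: each vote is a (winner, loser) pair of method names.
    def update_elo(ratings, winner, loser, k=32.0, initial=1000.0):
        r_w = ratings.get(winner, initial)
        r_l = ratings.get(loser, initial)
        # Expected score of the winner under the Elo logistic model
        expected_w = 1.0 / (1.0 + 10 ** ((r_l - r_w) / 400.0))
        ratings[winner] = r_w + k * (1.0 - expected_w)
        ratings[loser] = r_l - k * (1.0 - expected_w)

    ratings = {}
    votes = [("gpt4o", "sglang"), ("sglang", "gpt4o"), ("gpt4o", "sglang")]  # assumed
    for winner, loser in votes:
        update_elo(ratings, winner, loser)
    print(ratings)
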