mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-11 16:09:00 +00:00
Runeval is much improved now
This commit is contained in:
parent
8a66ecee25
commit
9d6e2faf95
@ -1,3 +1,4 @@
|
|||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from jinja2 import Template
|
from jinja2 import Template
|
||||||
import random
|
import random
|
||||||
import os
|
import os
|
||||||
@ -41,13 +42,7 @@ def render_pdf_to_base64png(s3_path, page):
|
|||||||
return image_base64
|
return image_base64
|
||||||
|
|
||||||
|
|
||||||
def create_review_html(data, filename="review_page.html"):
|
def process_entry(i, entry):
|
||||||
# Load the Jinja2 template from the file
|
|
||||||
with open(os.path.join(os.path.dirname(__file__), "evalhtml_template.html"), "r") as f:
|
|
||||||
template = Template(f.read())
|
|
||||||
|
|
||||||
entries = []
|
|
||||||
for i, entry in tqdm(enumerate(data)):
|
|
||||||
# Randomly decide whether to display gold on the left or right
|
# Randomly decide whether to display gold on the left or right
|
||||||
if random.choice([True, False]):
|
if random.choice([True, False]):
|
||||||
left_text, right_text = entry["gold_text"], entry["eval_text"]
|
left_text, right_text = entry["gold_text"], entry["eval_text"]
|
||||||
@ -67,8 +62,7 @@ def create_review_html(data, filename="review_page.html"):
|
|||||||
s3_key = parsed_url.path.lstrip('/')
|
s3_key = parsed_url.path.lstrip('/')
|
||||||
signed_pdf_link = s3_client.generate_presigned_url("get_object", Params={"Bucket": bucket, "Key": s3_key}, ExpiresIn=604800)
|
signed_pdf_link = s3_client.generate_presigned_url("get_object", Params={"Bucket": bucket, "Key": s3_key}, ExpiresIn=604800)
|
||||||
|
|
||||||
# Create a dictionary for each entry
|
return {
|
||||||
entries.append({
|
|
||||||
"entry_id": i,
|
"entry_id": i,
|
||||||
"page_image": render_pdf_to_base64png(entry["s3_path"], entry["page"]),
|
"page_image": render_pdf_to_base64png(entry["s3_path"], entry["page"]),
|
||||||
"s3_path": entry["s3_path"],
|
"s3_path": entry["s3_path"],
|
||||||
@ -82,7 +76,22 @@ def create_review_html(data, filename="review_page.html"):
|
|||||||
"right_class": right_class,
|
"right_class": right_class,
|
||||||
"gold_class": "gold" if left_class == "gold" else "eval",
|
"gold_class": "gold" if left_class == "gold" else "eval",
|
||||||
"eval_class": "eval" if right_class == "eval" else "gold"
|
"eval_class": "eval" if right_class == "eval" else "gold"
|
||||||
})
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def create_review_html(data, filename="review_page.html"):
|
||||||
|
# Load the Jinja2 template from the file
|
||||||
|
with open(os.path.join(os.path.dirname(__file__), "evalhtml_template.html"), "r") as f:
|
||||||
|
template = Template(f.read())
|
||||||
|
|
||||||
|
entries = []
|
||||||
|
with ThreadPoolExecutor() as executor:
|
||||||
|
# Submit tasks to the executor
|
||||||
|
futures = [executor.submit(process_entry, i, entry) for i, entry in enumerate(data)]
|
||||||
|
|
||||||
|
# Process the results as they are completed
|
||||||
|
for future in tqdm(futures):
|
||||||
|
entries.append(future.result())
|
||||||
|
|
||||||
# Render the template with the entries
|
# Render the template with the entries
|
||||||
final_html = template.render(entries=entries)
|
final_html = template.render(entries=entries)
|
||||||
|
|||||||
@ -10,6 +10,7 @@ import hashlib
|
|||||||
import random
|
import random
|
||||||
import zstandard
|
import zstandard
|
||||||
import sys
|
import sys
|
||||||
|
import argparse
|
||||||
|
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
|
||||||
@ -58,29 +59,11 @@ def load_gold_data(gold_data_path: str) -> dict:
|
|||||||
|
|
||||||
gold_data = {}
|
gold_data = {}
|
||||||
|
|
||||||
# List the contents of the S3 bucket
|
gold_jsonl_files = list_jsonl_files(gold_data_path)
|
||||||
bucket_name, prefix = gold_data_path.replace("s3://", "").split("/", 1)
|
|
||||||
paginator = s3_client.get_paginator('list_objects_v2')
|
|
||||||
pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix)
|
|
||||||
|
|
||||||
for page in pages:
|
|
||||||
for obj in page.get('Contents', []):
|
|
||||||
s3_key = obj['Key']
|
|
||||||
if s3_key.endswith('.json'):
|
|
||||||
local_file_path = os.path.join(CACHE_DIR, os.path.basename(s3_key))
|
|
||||||
etag = obj['ETag'].strip('"') # ETag is the checksum
|
|
||||||
|
|
||||||
# Check if the file is already cached and verify its checksum
|
|
||||||
if os.path.exists(local_file_path):
|
|
||||||
cached_file_hash = compute_file_hash(local_file_path)
|
|
||||||
if cached_file_hash != etag:
|
|
||||||
raise ValueError(f"File {local_file_path} has changed on S3. Clear the cache in {CACHE_DIR} and reload.")
|
|
||||||
else:
|
|
||||||
# Download the file from S3 if not cached
|
|
||||||
download_from_s3(f"s3://{bucket_name}/{s3_key}", local_file_path)
|
|
||||||
|
|
||||||
|
for path in gold_jsonl_files:
|
||||||
# Load the JSON file
|
# Load the JSON file
|
||||||
with smart_open(local_file_path, 'r') as f:
|
with smart_open(path, 'r') as f:
|
||||||
for line in f:
|
for line in f:
|
||||||
data = json.loads(line)
|
data = json.loads(line)
|
||||||
|
|
||||||
@ -197,7 +180,7 @@ def process_jsonl_file(jsonl_file, gold_data, comparer):
|
|||||||
|
|
||||||
return total_alignment_score, char_weighted_alignment_score, total_chars, total_pages, page_data
|
return total_alignment_score, char_weighted_alignment_score, total_chars, total_pages, page_data
|
||||||
|
|
||||||
def do_eval(gold_data_path: str, eval_data_path: str, ) -> tuple[float, list[dict]]:
|
def do_eval(gold_data_path: str, eval_data_path: str, review_page_name: str) -> tuple[float, list[dict]]:
|
||||||
gold_data = load_gold_data(gold_data_path)
|
gold_data = load_gold_data(gold_data_path)
|
||||||
|
|
||||||
total_alignment_score = 0
|
total_alignment_score = 0
|
||||||
@ -238,28 +221,52 @@ def do_eval(gold_data_path: str, eval_data_path: str, ) -> tuple[float, list[dic
|
|||||||
# if pd["alignment"] > 0.97:
|
# if pd["alignment"] > 0.97:
|
||||||
# continue
|
# continue
|
||||||
|
|
||||||
if len(pd["gold_text"]) < 200 and len(pd["eval_text"]) < 200:
|
# if len(pd["gold_text"]) < 200 and len(pd["eval_text"]) < 200:
|
||||||
continue
|
# continue
|
||||||
|
|
||||||
page_eval_data.append(pd)
|
page_eval_data.append(pd)
|
||||||
|
|
||||||
# Select random entries to return in the page_eval_data
|
|
||||||
page_eval_data = random.sample(page_eval_data, 20)
|
|
||||||
|
|
||||||
# Select the top 20 lowest alignments
|
|
||||||
# page_eval_data.sort(key=lambda x: x["alignment"])
|
|
||||||
# page_eval_data = page_eval_data[:20]
|
|
||||||
|
|
||||||
# Uncomment this to generate a nice review page to use with tinyhost
|
|
||||||
create_review_html(page_eval_data, filename="review_page.html")
|
|
||||||
|
|
||||||
print(f"Compared {len(total_pages_compared):,} pages")
|
print(f"Compared {len(total_pages_compared):,} pages")
|
||||||
print(f"Total corpus alignment: {total_alignment_score:.2f}")
|
print(f"Total corpus alignment: {total_alignment_score:.2f}")
|
||||||
print(f"Mean alignment: {total_alignment_score / total_weight:.3f}")
|
print(f"Mean alignment: {total_alignment_score / total_weight:.3f}")
|
||||||
|
|
||||||
|
print("...creating review page")
|
||||||
|
|
||||||
|
# Select random entries to return in the page_eval_data
|
||||||
|
page_eval_data = random.sample(page_eval_data, 20)
|
||||||
|
create_review_html(page_eval_data, filename=review_page_name + "_sample.html")
|
||||||
|
|
||||||
|
# Select the top 20 lowest alignments
|
||||||
|
page_eval_data.sort(key=lambda x: x["alignment"])
|
||||||
|
page_eval_data = page_eval_data[:20]
|
||||||
|
|
||||||
|
# Uncomment this to generate a nice review page to use with tinyhost
|
||||||
|
create_review_html(page_eval_data, filename=review_page_name + "_worst.html")
|
||||||
|
|
||||||
|
|
||||||
return total_alignment_score / total_weight, page_eval_data
|
return total_alignment_score / total_weight, page_eval_data
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
result = do_eval(gold_data_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v3_eval/",
|
parser = argparse.ArgumentParser(
|
||||||
eval_data_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v3_eval/")
|
description="Transform JSONL files by extracting and renaming specific fields."
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--name',
|
||||||
|
default="review_page",
|
||||||
|
help="What name to give to this evaluation/comparison"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'gold_data_path',
|
||||||
|
type=str,
|
||||||
|
help='Path to the gold data directory containing JSONL files. Can be a local path or S3 URL. Can be openai "done" data, or birr "done" data'
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'eval_data_path',
|
||||||
|
type=str,
|
||||||
|
help='Path to the eval data directory containing JSONL files. Can be a local path or S3 URL. Can be openai "done" data, or birr "done" data'
|
||||||
|
)
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
result = do_eval(gold_data_path=args.gold_data_path, eval_data_path=args.eval_data_path, review_page_name=args.name)
|
||||||
Loading…
x
Reference in New Issue
Block a user