From a50ffe27c9035c072a27c37fbf50d754c8d6300e Mon Sep 17 00:00:00 2001
From: Jake Poznanski
Date: Tue, 24 Sep 2024 21:57:51 +0000
Subject: [PATCH] Adding in eval scripts from oe-data-internal now all in one place

---
 pdelfin/eval/__init__.py            |   0
 pdelfin/eval/evalhtml.py            |  94 +++++++
 pdelfin/eval/evalhtml_template.html | 397 ++++++++++++++++++++++++++++
 pdelfin/eval/runeval.py             | 258 ++++++++++++++++++
 4 files changed, 749 insertions(+)
 create mode 100644 pdelfin/eval/__init__.py
 create mode 100644 pdelfin/eval/evalhtml.py
 create mode 100644 pdelfin/eval/evalhtml_template.html
 create mode 100644 pdelfin/eval/runeval.py

diff --git a/pdelfin/eval/__init__.py b/pdelfin/eval/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/pdelfin/eval/evalhtml.py b/pdelfin/eval/evalhtml.py
new file mode 100644
index 0000000..f9bd208
--- /dev/null
+++ b/pdelfin/eval/evalhtml.py
@@ -0,0 +1,94 @@
+from jinja2 import Template
+import random
+import os
+import subprocess
+import tempfile
+import boto3
+import base64
+import io
+
+from urllib.parse import urlparse
+from PIL import Image
+from tqdm import tqdm
+
+session = boto3.Session(profile_name='s2')
+s3_client = session.client('s3')
+
+
+def render_pdf_to_base64png(s3_path, page):
+    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
+        pdf_path = tmp_pdf.name
+        bucket, key = s3_path.replace("s3://", "").split('/', 1)
+        s3_client.download_file(bucket, key, pdf_path)
+
+        # Render the PDF to an image, and display it in the first position
+        pdftoppm_result = subprocess.run(
+            ["pdftoppm",
+             "-png",
+             "-f", str(page),
+             "-l", str(page),
+             pdf_path],
+            timeout=120,
+            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        assert pdftoppm_result.returncode == 0, pdftoppm_result.stderr
+
+        png_image = Image.open(io.BytesIO(pdftoppm_result.stdout))
+        webp_output = io.BytesIO()
+        png_image.save(webp_output, format="WEBP")
+
+        image_base64 = base64.b64encode(webp_output.getvalue()).decode("utf-8")
+
+    return image_base64
+
+
+def create_review_html(data, filename="review_page.html"):
+    # Load the Jinja2 template from the file
+    with open(os.path.join(os.path.dirname(__file__), "evalhtml_template.html"), "r") as f:
+        template = Template(f.read())
+
+    entries = []
+    for i, entry in tqdm(enumerate(data)):
+        # Randomly decide whether to display gold on the left or right
+        if random.choice([True, False]):
+            left_text, right_text = entry["gold_text"], entry["eval_text"]
+            left_alignment, right_alignment = entry["alignment"], entry["alignment"]
+            left_class, right_class = "gold", "eval"
+        else:
+            left_text, right_text = entry["eval_text"], entry["gold_text"]
+            left_alignment, right_alignment = entry["alignment"], entry["alignment"]
+            left_class, right_class = "eval", "gold"
+
+        # Convert newlines to <br> tags for proper formatting
+        left_text = "<p>" + left_text.replace("\n", "<br>") + "</p>"
+        right_text = "<p>" + right_text.replace("\n", "<br>") + "</p>"
+
+        parsed_url = urlparse(entry["s3_path"])
+        bucket = parsed_url.netloc
+        s3_key = parsed_url.path.lstrip('/')
+        signed_pdf_link = s3_client.generate_presigned_url("get_object", Params={"Bucket": bucket, "Key": s3_key}, ExpiresIn=604800)
+
+        # Create a dictionary for each entry
+        entries.append({
+            "entry_id": i,
+            "page_image": render_pdf_to_base64png(entry["s3_path"], entry["page"]),
+            "s3_path": entry["s3_path"],
+            "page": entry["page"],
+            "signed_pdf_link": signed_pdf_link,
+            "left_text": left_text,
+            "right_text": right_text,
+            "left_alignment": left_alignment,
+            "right_alignment": right_alignment,
+            "left_class": left_class,
+            "right_class": right_class,
+            "gold_class": "gold" if left_class == "gold" else "eval",
+            "eval_class": "eval" if right_class == "eval" else "gold"
+        })
+
+    # Render the template with the entries
+    final_html = template.render(entries=entries)
+
+    # Write the HTML content to the specified file
+    with open(filename, "w") as f:
+        f.write(final_html)
+
+    print(f"HTML file '{filename}' created successfully!")
diff --git a/pdelfin/eval/evalhtml_template.html b/pdelfin/eval/evalhtml_template.html
new file mode 100644
index 0000000..11e803b
--- /dev/null
+++ b/pdelfin/eval/evalhtml_template.html
@@ -0,0 +1,397 @@
[The 397-line Jinja2/HTML template body did not survive in this copy of the patch and is summarized here instead. It renders the "Text Evaluation Review" page: a fixed header with a running "Votes" counter, then one block per entry showing the rendered page image, the alignment score ({{ entry.left_alignment }}), the {{ entry.s3_path }} and page number linked through the presigned PDF URL, and the left/right text panels ({{ entry.left_text|safe }} and {{ entry.right_text|safe }}) with voting controls for each side, plus the supporting CSS and JavaScript.]
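For reference, create_review_html only needs a list of dicts carrying the keys it reads above. A minimal usage sketch, with placeholder values that are not from the patch; it assumes the package is importable, pdftoppm is on PATH, and the 's2' AWS profile can reach the referenced PDF:

    from pdelfin.eval.evalhtml import create_review_html

    entries = [{
        "s3_path": "s3://example-bucket/example.pdf",  # hypothetical PDF location
        "page": 1,
        "gold_text": "Gold transcription of the page...",
        "eval_text": "Model transcription of the page...",
        "alignment": 0.82,
    }]

    create_review_html(entries, filename="review_page.html")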
diff --git a/pdelfin/eval/runeval.py b/pdelfin/eval/runeval.py
new file mode 100644
index 0000000..5fc0348
--- /dev/null
+++ b/pdelfin/eval/runeval.py
@@ -0,0 +1,258 @@
+# This script will build a set of scores for the accuracy of a given pdf conversion tactic against a gold dataset
+#
+# You might need to pip install git+https://github.com/allenai/refine.git@soldni/eval-m
+# in order to use some of the existing aligner scoring that was developed as part
+# of the refiner pipeline
+import boto3
+import os
+import json
+import hashlib
+import random
+import zstandard
+import sys
+
+from tqdm import tqdm
+from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
+from pathlib import Path
+from smart_open import smart_open, register_compressor
+from dolma_refine.evaluate.metrics import DocumentEditSimilarity
+from dolma_refine.evaluate.segmenters import SpacySegmenter
+from dolma_refine.evaluate.aligners import HirschbergAligner
+
+from .evalhtml import create_review_html
+
+
+CACHE_DIR = os.path.join(Path.home(), ".cache", "pdf_gold_data_cache")
+
+s3_client = boto3.client('s3')
+
+
+def _handle_zst(file_obj, mode):
+    return zstandard.open(file_obj, mode)
+
+register_compressor(".zstd", _handle_zst)
+register_compressor(".zst", _handle_zst)
+
+# Helper function to download files from S3
+def download_from_s3(s3_path: str, local_path: str):
+    bucket_name, key = s3_path.replace("s3://", "").split("/", 1)
+    s3_client.download_file(bucket_name, key, local_path)
+
+def is_debugging():
+    return sys.gettrace() is not None
+
+# Create a hash to store file contents and check for changes
+def compute_file_hash(file_path: str) -> str:
+    hash_md5 = hashlib.md5()
+    with open(file_path, "rb") as f:
+        for chunk in iter(lambda: f.read(4096), b""):
+            hash_md5.update(chunk)
+    return hash_md5.hexdigest()
+
+# Load every .json file from GOLD_DATA_S3_PATH (and saves it to some temp folder for quick loading next time)
+# returns map from "custom_id" ex. "s3://ai2-s2-pdfs/39ce/3db4516cd6e7d7f8e580a494c7a665a6a16a.pdf-4" (where the -4 means page 4)
+# to the gold standard text
+def load_gold_data(gold_data_path: str) -> dict:
+    if not os.path.exists(CACHE_DIR):
+        os.makedirs(CACHE_DIR)
+
+    gold_data = {}
+
+    # List the contents of the S3 bucket
+    bucket_name, prefix = gold_data_path.replace("s3://", "").split("/", 1)
+    paginator = s3_client.get_paginator('list_objects_v2')
+    pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix)
+
+    for page in pages:
+        for obj in page.get('Contents', []):
+            s3_key = obj['Key']
+            if s3_key.endswith('.json'):
+                local_file_path = os.path.join(CACHE_DIR, os.path.basename(s3_key))
+                etag = obj['ETag'].strip('"')  # ETag is the checksum
+
+                # Check if the file is already cached and verify its checksum
+                if os.path.exists(local_file_path):
+                    cached_file_hash = compute_file_hash(local_file_path)
+                    if cached_file_hash != etag:
+                        raise ValueError(f"File {local_file_path} has changed on S3. Clear the cache in {CACHE_DIR} and reload.")
+                else:
+                    # Download the file from S3 if not cached
+                    download_from_s3(f"s3://{bucket_name}/{s3_key}", local_file_path)
+
+                # Load the JSON file
+                with smart_open(local_file_path, 'r') as f:
+                    for line in f:
+                        data = json.loads(line)
+
+                        if "custom_id" in data:
+                            # This is for loading gold data that came out of openai's batch API directly
+                            custom_id = data["custom_id"]
+                            text = data["response"]["body"]["choices"][0]["message"]["content"]
+                        else:
+                            # This is for loading gold data that went through the mise pdf refine pipeline
+                            custom_id = data["s3_path"] + "-" + str(data["page"])
+                            text = data["outputs"][0]["text"]
+
+                        gold_data[custom_id] = text
+
+    print(f"Loaded {len(gold_data):,} gold data entries for comparison")
+
+    return gold_data
+
+# Helper function to list all .jsonl files from a directory or an S3 bucket
+def list_jsonl_files(path: str) -> list:
+    valid_endings = [".json", ".jsonl", ".json.zstd", ".jsonl.zstd"]
+    jsonl_files = []
+
+    if path.startswith("s3://"):
+        bucket_name, prefix = path.replace("s3://", "").split("/", 1)
+        paginator = s3_client.get_paginator('list_objects_v2')
+        pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix)
+
+        for page in pages:
+            for obj in page.get('Contents', []):
+                if any(obj['Key'].endswith(ending) for ending in valid_endings):
+                    jsonl_files.append(f"s3://{bucket_name}/{obj['Key']}")
+
+    else:
+        # If it's a local directory, list all .jsonl files
+        for root, _, files in os.walk(path):
+            for file in files:
+                if any(file.endswith(ending) for ending in valid_endings):
+                    jsonl_files.append(os.path.join(root, file))
+
+    return jsonl_files
+
+# Takes in a path to a local directory or s3://[bucket]/[prefix path] where your jsonl files are stored
+# This is most likely the output location of the refiner
+# Expecting each jsonl line to include {s3_path: [path to original pdf], page: [pagenum], text: [proper page text]}
+# Returns the average Levenshtein distance match between the data
+def process_jsonl_file(jsonl_file, gold_data, comparer):
+    page_data = {}
+    total_alignment_score = 0
+    char_weighted_alignment_score = 0
+    total_pages = 0
+    total_chars = 0
+
+    with smart_open(jsonl_file, 'r') as f:
+        for line in f:
+            data = json.loads(line)
+
+            if "custom_id" in data:
+                goldkey = data["custom_id"]
+                data["s3_path"] = goldkey[:goldkey.rindex("-")]
+                data["page"] = int(goldkey[goldkey.rindex("-") + 1:])
+            else:
+                goldkey = data["s3_path"] + "-" + str(data["page"])
+
+            if goldkey not in gold_data:
+                continue
+
+            gold_text = gold_data[goldkey]
+
+            # You need to consider the case when no input is provided to the refiner, it will hallucinate
+            # So in that case we say there is no eval text
+            if len(data["text"].strip()) == 0:
+                eval_text = ""
+            else:
+                eval_text = data["outputs"][0]["text"][0]
+
+            # If the eval text or gold text is empty, we skip this page and don't use it for comparison
+            # It means that something was an OCR page, and the text-based pipeline just won't be able to handle that
+            if len(eval_text.strip()) < 10 or len(gold_text.strip()) < 10:
+                continue
+
+            #eval_text = data["text"]  # Uncomment to measure the raw input text to the refiner, without any refining happening
+
+            alignment = comparer.compute(gold_text, eval_text)
+
+            # print("GOLD_______________________________________")
+            # print(gold_text)
+            # print("EVAL________________________________________")
+            # print(eval_text)
+            # print("")
+            # print(f"Alignment: {alignment:.3f}")
+            # print("")
+            # input()
+
+            page_data[goldkey] = {