From c4a0fb9af5232e65283b3d16f1834717c714691d Mon Sep 17 00:00:00 2001
From: Jake Poznanski
Date: Thu, 15 May 2025 22:50:29 +0000
Subject: [PATCH] Adding back in proper CI estimation

---
 olmocr/bench/benchmark.py | 159 +++++++++++++++++++++++++++++++++---
 olmocr/bench/utils.py     | 167 +++++++++++++++++++++++++++++++-------
 2 files changed, 288 insertions(+), 38 deletions(-)

diff --git a/olmocr/bench/benchmark.py b/olmocr/bench/benchmark.py
index 505c3b6..4f7cb21 100644
--- a/olmocr/bench/benchmark.py
+++ b/olmocr/bench/benchmark.py
@@ -296,8 +296,43 @@ def main():
         # Always store test results for displaying jsonl file groupings
         test_results_by_candidate[candidate_name] = test_results
 
-        if all_test_scores:
-            ci = calculate_bootstrap_ci(all_test_scores, n_bootstrap=n_bootstrap, ci_level=ci_level)
+        # Group results by jsonl file for more accurate CI calculation
+        jsonl_results = {}
+        jsonl_scores = []  # List to store scores by jsonl file for CI calculation
+        jsonl_file_sizes = []  # List to store the number of tests per jsonl file
+
+        for test in all_tests:
+            # Get the jsonl file this test came from
+            jsonl_file = test_to_jsonl.get(test.id, "unknown")
+
+            if jsonl_file not in jsonl_results:
+                jsonl_results[jsonl_file] = {"total": 0, "passed": 0, "scores": []}
+
+            jsonl_results[jsonl_file]["total"] += 1
+
+            # Get the test result for this candidate if it exists
+            if not candidate_errors and hasattr(test, "pdf") and hasattr(test, "page"):
+                pdf_name = test.pdf
+                page = test.page
+                if pdf_name in test_results and page in test_results.get(pdf_name, {}):
+                    for t, passed, _ in test_results[pdf_name][page]:
+                        if t.id == test.id:
+                            # Store the test score in its jsonl group
+                            result_score = 1.0 if passed else 0.0
+                            jsonl_results[jsonl_file]["scores"].append(result_score)
+                            if passed:
+                                jsonl_results[jsonl_file]["passed"] += 1
+                            break
+
+        # Gather all the scores by jsonl file for CI calculation
+        for jsonl_file, results in jsonl_results.items():
+            if results["scores"]:
+                jsonl_file_sizes.append(len(results["scores"]))
+                jsonl_scores.extend(results["scores"])
+
+        # Calculate CI using the updated function with splits
+        if jsonl_scores:
+            ci = calculate_bootstrap_ci(jsonl_scores, n_bootstrap=n_bootstrap, ci_level=ci_level, splits=jsonl_file_sizes)
         else:
             ci = (0.0, 0.0)
         summary.append((candidate_name, overall_score, total_tests, candidate_errors, test_failures, test_type_breakdown, ci, all_test_scores))
@@ -309,9 +344,15 @@ def main():
         if test_failures:
             for fail in test_failures:
                 print(f"  [FAIL] {fail}")
-        # Note: This score is still the average over all tests and will be updated to
-        # the average of per-JSONL file scores in the final summary
-        print(f"  Average Score: {overall_score * 100:.1f}% (95% CI: [{ci[0] * 100:.1f}%, {ci[1] * 100:.1f}%]) over {total_tests} tests.")
+        # Calculate and show the per-category average score
+        jsonl_pass_rates = []
+        for _, results in jsonl_results.items():
+            if results["total"] > 0:
+                pass_rate = results["passed"] / results["total"]
+                jsonl_pass_rates.append(pass_rate)
+
+        per_category_score = sum(jsonl_pass_rates) / len(jsonl_pass_rates) if jsonl_pass_rates else 0.0
+        print(f"  Average Score: {per_category_score * 100:.1f}% (95% CI: [{ci[0] * 100:.1f}%, {ci[1] * 100:.1f}%]) over {total_tests} tests.")
 
     print("\n" + "=" * 60)
     print("Final Summary with 95% Confidence Intervals:")
@@ -359,8 +400,7 @@ def main():
             ciw_str = ""
         else:
             status = f"{new_overall_score * 100:0.1f}%"
-            # Note: CI calculation would need to be updated too for full accuracy,
-            # but keeping as-is for now as it would require deeper changes
+            # Use the CI that was calculated with proper category-based bootstrap
            half_width = ((ci[1] - ci[0]) / 2) * 100
            ciw_str = f"± {half_width:0.1f}%"
         print(f"{candidate_name:20s} : Average Score: {status} {ciw_str} (average of per-JSONL scores)")
@@ -392,7 +432,40 @@ def main():
     if top_olmocr and top_non_olmocr:
         olmocr_name, olmocr_score = top_olmocr[0], top_olmocr[1]
         non_olmocr_name, non_olmocr_score = top_non_olmocr[0], top_non_olmocr[1]
-        diff, p_value = perform_permutation_test(top_olmocr[7], top_non_olmocr[7])
+        # Extract file sizes and scores for both candidates
+        olmocr_jsonl_sizes = []
+        non_olmocr_jsonl_sizes = []
+
+        # Extract jsonl file sizes for each candidate
+        for test in all_tests:
+            jsonl_file = test_to_jsonl.get(test.id, "unknown")
+            # Process for top_olmocr
+            if not top_olmocr[3] and hasattr(test, "pdf") and hasattr(test, "page"):
+                pdf_name = test.pdf
+                page = test.page
+                if pdf_name in test_results_by_candidate.get(top_olmocr[0], {}) and page in test_results_by_candidate[top_olmocr[0]].get(pdf_name, {}):
+                    for t, _, _ in test_results_by_candidate[top_olmocr[0]][pdf_name][page]:
+                        if t.id == test.id:
+                            if jsonl_file not in olmocr_jsonl_sizes:
+                                olmocr_jsonl_sizes.append(len([t for t in all_tests if test_to_jsonl.get(t.id, "") == jsonl_file]))
+                            break
+
+            # Process for top_non_olmocr
+            if not top_non_olmocr[3] and hasattr(test, "pdf") and hasattr(test, "page"):
+                pdf_name = test.pdf
+                page = test.page
+                if pdf_name in test_results_by_candidate.get(top_non_olmocr[0], {}) and page in test_results_by_candidate[top_non_olmocr[0]].get(pdf_name, {}):
+                    for t, _, _ in test_results_by_candidate[top_non_olmocr[0]][pdf_name][page]:
+                        if t.id == test.id:
+                            if jsonl_file not in non_olmocr_jsonl_sizes:
+                                non_olmocr_jsonl_sizes.append(len([t for t in all_tests if test_to_jsonl.get(t.id, "") == jsonl_file]))
+                            break
+
+        diff, p_value = perform_permutation_test(
+            top_olmocr[7], top_non_olmocr[7],
+            splits_a=olmocr_jsonl_sizes if olmocr_jsonl_sizes else None,
+            splits_b=non_olmocr_jsonl_sizes if non_olmocr_jsonl_sizes else None
+        )
         print("\nComparison 1: Top olmocr vs Top non-olmocr candidate")
         print(f"  {olmocr_name} ({olmocr_score*100:.1f}%) vs {non_olmocr_name} ({non_olmocr_score*100:.1f}%)")
         print(f"  Difference: {diff*100:.2f}% (positive means {olmocr_name} is better)")
@@ -405,7 +478,40 @@ def main():
         print("\nCannot perform olmocr vs non-olmocr comparison: Missing candidates")
 
     if len(top_two_olmocr) >= 2:
-        diff, p_value = perform_permutation_test(top_two_olmocr[0][7], top_two_olmocr[1][7])
+        # Extract file sizes for each candidate
+        olmocr1_jsonl_sizes = []
+        olmocr2_jsonl_sizes = []
+
+        # Extract jsonl file sizes for each candidate
+        for test in all_tests:
+            jsonl_file = test_to_jsonl.get(test.id, "unknown")
+            # Process for first olmocr candidate
+            if not top_two_olmocr[0][3] and hasattr(test, "pdf") and hasattr(test, "page"):
+                pdf_name = test.pdf
+                page = test.page
+                if pdf_name in test_results_by_candidate.get(top_two_olmocr[0][0], {}) and page in test_results_by_candidate[top_two_olmocr[0][0]].get(pdf_name, {}):
+                    for t, _, _ in test_results_by_candidate[top_two_olmocr[0][0]][pdf_name][page]:
+                        if t.id == test.id:
+                            if jsonl_file not in olmocr1_jsonl_sizes:
+                                olmocr1_jsonl_sizes.append(len([t for t in all_tests if test_to_jsonl.get(t.id, "") == jsonl_file]))
+                            break
+
+            # Process for second olmocr candidate
+            if not top_two_olmocr[1][3] and hasattr(test, "pdf") and hasattr(test, "page"):
+                pdf_name = test.pdf
+                page = test.page
+                if pdf_name in test_results_by_candidate.get(top_two_olmocr[1][0], {}) and page in test_results_by_candidate[top_two_olmocr[1][0]].get(pdf_name, {}):
+                    for t, _, _ in test_results_by_candidate[top_two_olmocr[1][0]][pdf_name][page]:
+                        if t.id == test.id:
+                            if jsonl_file not in olmocr2_jsonl_sizes:
+                                olmocr2_jsonl_sizes.append(len([t for t in all_tests if test_to_jsonl.get(t.id, "") == jsonl_file]))
+                            break
+
+        diff, p_value = perform_permutation_test(
+            top_two_olmocr[0][7], top_two_olmocr[1][7],
+            splits_a=olmocr1_jsonl_sizes if olmocr1_jsonl_sizes else None,
+            splits_b=olmocr2_jsonl_sizes if olmocr2_jsonl_sizes else None
+        )
         print("\nComparison 2: Top two olmocr candidates")
         print(f"  {top_two_olmocr[0][0]} ({top_two_olmocr[0][1]*100:.1f}%) vs {top_two_olmocr[1][0]} ({top_two_olmocr[1][1]*100:.1f}%)")
         print(f"  Difference: {diff*100:.2f}% (positive means {top_two_olmocr[0][0]} is better)")
@@ -423,7 +529,40 @@ def main():
             print("\nNot enough valid candidates among the selected for permutation tests.")
         else:
             for cand1, cand2 in combinations(selected_candidates, 2):
-                diff, p_value = perform_permutation_test(cand1[7], cand2[7])
+                # Extract file sizes for each candidate
+                cand1_jsonl_sizes = []
+                cand2_jsonl_sizes = []
+
+                # Extract jsonl file sizes for each candidate
+                for test in all_tests:
+                    jsonl_file = test_to_jsonl.get(test.id, "unknown")
+                    # Process for first candidate
+                    if not cand1[3] and hasattr(test, "pdf") and hasattr(test, "page"):
+                        pdf_name = test.pdf
+                        page = test.page
+                        if pdf_name in test_results_by_candidate.get(cand1[0], {}) and page in test_results_by_candidate[cand1[0]].get(pdf_name, {}):
+                            for t, _, _ in test_results_by_candidate[cand1[0]][pdf_name][page]:
+                                if t.id == test.id:
+                                    if jsonl_file not in cand1_jsonl_sizes:
+                                        cand1_jsonl_sizes.append(len([t for t in all_tests if test_to_jsonl.get(t.id, "") == jsonl_file]))
+                                    break
+
+                    # Process for second candidate
+                    if not cand2[3] and hasattr(test, "pdf") and hasattr(test, "page"):
+                        pdf_name = test.pdf
+                        page = test.page
+                        if pdf_name in test_results_by_candidate.get(cand2[0], {}) and page in test_results_by_candidate[cand2[0]].get(pdf_name, {}):
+                            for t, _, _ in test_results_by_candidate[cand2[0]][pdf_name][page]:
+                                if t.id == test.id:
+                                    if jsonl_file not in cand2_jsonl_sizes:
+                                        cand2_jsonl_sizes.append(len([t for t in all_tests if test_to_jsonl.get(t.id, "") == jsonl_file]))
+                                    break
+
+                diff, p_value = perform_permutation_test(
+                    cand1[7], cand2[7],
+                    splits_a=cand1_jsonl_sizes if cand1_jsonl_sizes else None,
+                    splits_b=cand2_jsonl_sizes if cand2_jsonl_sizes else None
+                )
                 print(f"\nComparison: {cand1[0]} vs {cand2[0]}")
                 print(f"  {cand1[0]} ({cand1[1]*100:.1f}%) vs {cand2[0]} ({cand2[1]*100:.1f}%)")
                 print(f"  Difference: {diff*100:.2f}% (positive means {cand1[0]} is better)")
diff --git a/olmocr/bench/utils.py b/olmocr/bench/utils.py
index aa930b1..af69857 100644
--- a/olmocr/bench/utils.py
+++ b/olmocr/bench/utils.py
@@ -3,14 +3,17 @@ from typing import List, Tuple
 import numpy as np
 
 
-def calculate_bootstrap_ci(test_scores: List[float], n_bootstrap: int = 1000, ci_level: float = 0.95) -> Tuple[float, float]:
+def calculate_bootstrap_ci(test_scores: List[float], n_bootstrap: int = 1000, ci_level: float = 0.95, splits: List[int] = None) -> Tuple[float, float]:
     """
-    Calculate bootstrap confidence interval for test scores.
+    Calculate bootstrap confidence interval for test scores, respecting category splits.
 
     Args:
         test_scores: List of test scores (0.0 to 1.0 for each test)
         n_bootstrap: Number of bootstrap samples to generate
         ci_level: Confidence interval level (default: 0.95 for 95% CI)
+        splits: List of sizes for each category. If provided, resampling will be done
+                within each category independently, and the overall score will be the
+                average of per-category scores. If None, resampling is done across all tests.
 
     Returns:
         Tuple of (lower_bound, upper_bound) representing the confidence interval
@@ -20,13 +23,41 @@ def calculate_bootstrap_ci(test_scores: List[float], n_bootstrap: int = 1000, ci
 
     # Convert to numpy array for efficiency
     scores = np.array(test_scores)
-
-    # Generate bootstrap samples
-    bootstrap_means = []
-    for _ in range(n_bootstrap):
-        # Sample with replacement
-        sample = np.random.choice(scores, size=len(scores), replace=True)
-        bootstrap_means.append(np.mean(sample))
+
+    # Simple case - no splits provided, use traditional bootstrap
+    if splits is None:
+        # Generate bootstrap samples
+        bootstrap_means = []
+        for _ in range(n_bootstrap):
+            # Sample with replacement
+            sample = np.random.choice(scores, size=len(scores), replace=True)
+            bootstrap_means.append(np.mean(sample))
+    else:
+        # Validate splits
+        if sum(splits) != len(scores):
+            raise ValueError(f"Sum of splits ({sum(splits)}) must equal length of test_scores ({len(scores)})")
+
+        # Convert flat scores list to a list of category scores
+        category_scores = []
+        start_idx = 0
+        for split_size in splits:
+            category_scores.append(scores[start_idx:start_idx + split_size])
+            start_idx += split_size
+
+        # Generate bootstrap samples respecting category structure
+        bootstrap_means = []
+        for _ in range(n_bootstrap):
+            # Sample within each category independently
+            category_means = []
+            for cat_scores in category_scores:
+                if len(cat_scores) > 0:
+                    # Sample with replacement within this category
+                    cat_sample = np.random.choice(cat_scores, size=len(cat_scores), replace=True)
+                    category_means.append(np.mean(cat_sample))
+
+            # Overall score is average of category means (if any categories have scores)
+            if category_means:
+                bootstrap_means.append(np.mean(category_means))
 
     # Calculate confidence interval
     alpha = (1 - ci_level) / 2
@@ -36,7 +67,8 @@ def calculate_bootstrap_ci(test_scores: List[float], n_bootstrap: int = 1000, ci
     return (lower_bound, upper_bound)
 
 
-def perform_permutation_test(scores_a: List[float], scores_b: List[float], n_permutations: int = 10000) -> Tuple[float, float]:
+def perform_permutation_test(scores_a: List[float], scores_b: List[float], n_permutations: int = 10000,
+                             splits_a: List[int] = None, splits_b: List[int] = None) -> Tuple[float, float]:
     """
     Perform a permutation test to determine if there's a significant difference
     between two sets of test scores.
@@ -45,6 +77,8 @@ def perform_permutation_test(scores_a: List[float], scores_b: List[float], n_per
         scores_a: List of test scores for candidate A
         scores_b: List of test scores for candidate B
         n_permutations: Number of permutations to perform
+        splits_a: List of sizes for each category in scores_a
+        splits_b: List of sizes for each category in scores_b
 
     Returns:
         Tuple of (observed_difference, p_value)
@@ -52,29 +86,106 @@ def perform_permutation_test(scores_a: List[float], scores_b: List[float], n_per
     if not scores_a or not scores_b:
         return (0.0, 1.0)
 
-    # Calculate observed difference in means
-    observed_diff = np.mean(scores_a) - np.mean(scores_b)
+    # Function to calculate mean of means with optional category splits
+    def mean_of_category_means(scores, splits=None):
+        if splits is None:
+            return np.mean(scores)
+
+        category_means = []
+        start_idx = 0
+        for split_size in splits:
+            if split_size > 0:
+                category_scores = scores[start_idx:start_idx + split_size]
+                category_means.append(np.mean(category_scores))
+            start_idx += split_size
+
+        return np.mean(category_means) if category_means else 0.0
 
-    # Combine all scores
-    combined = np.concatenate([scores_a, scores_b])
-    n_a = len(scores_a)
+    # Calculate observed difference in means using category structure if provided
+    mean_a = mean_of_category_means(scores_a, splits_a)
+    mean_b = mean_of_category_means(scores_b, splits_b)
+    observed_diff = mean_a - mean_b
 
-    # Perform permutation test
-    count_greater_or_equal = 0
-    for _ in range(n_permutations):
-        # Shuffle the combined array
-        np.random.shuffle(combined)
+    # If no splits are provided, fall back to traditional permutation test
+    if splits_a is None and splits_b is None:
+        # Combine all scores
+        combined = np.concatenate([scores_a, scores_b])
+        n_a = len(scores_a)
 
-        # Split into two groups of original sizes
-        perm_a = combined[:n_a]
-        perm_b = combined[n_a:]
+        # Perform permutation test
+        count_greater_or_equal = 0
+        for _ in range(n_permutations):
+            # Shuffle the combined array
+            np.random.shuffle(combined)
 
-        # Calculate difference in means
-        perm_diff = np.mean(perm_a) - np.mean(perm_b)
+            # Split into two groups of original sizes
+            perm_a = combined[:n_a]
+            perm_b = combined[n_a:]
 
-        # Count how many permuted differences are >= to observed difference in absolute value
-        if abs(perm_diff) >= abs(observed_diff):
-            count_greater_or_equal += 1
+            # Calculate difference in means
+            perm_diff = np.mean(perm_a) - np.mean(perm_b)
+
+            # Count how many permuted differences are >= to observed difference in absolute value
+            if abs(perm_diff) >= abs(observed_diff):
+                count_greater_or_equal += 1
+    else:
+        # For category-based permutation test, we need to maintain category structure
+        # Validate that the splits match the score lengths
+        if splits_a is not None and sum(splits_a) != len(scores_a):
+            raise ValueError(f"Sum of splits_a ({sum(splits_a)}) must equal length of scores_a ({len(scores_a)})")
+        if splits_b is not None and sum(splits_b) != len(scores_b):
+            raise ValueError(f"Sum of splits_b ({sum(splits_b)}) must equal length of scores_b ({len(scores_b)})")
+
+        # Create category structures
+        categories_a = []
+        categories_b = []
+
+        if splits_a is not None:
+            start_idx = 0
+            for split_size in splits_a:
+                categories_a.append(scores_a[start_idx:start_idx + split_size])
+                start_idx += split_size
+        else:
+            # If no splits for A, treat all scores as one category
+            categories_a = [scores_a]
+
+        if splits_b is not None:
+            start_idx = 0
+            for split_size in splits_b:
+                categories_b.append(scores_b[start_idx:start_idx + split_size])
+                start_idx += split_size
+        else:
+            # If no splits for B, treat all scores as one category
+            categories_b = [scores_b]
+
+        # Perform permutation test maintaining category structure
+        count_greater_or_equal = 0
+        for _ in range(n_permutations):
+            # For each category pair, shuffle and redistribute
+            perm_categories_a = []
+            perm_categories_b = []
+
+            for cat_a, cat_b in zip(categories_a, categories_b):
+                # Combine and shuffle
+                combined = np.concatenate([cat_a, cat_b])
+                np.random.shuffle(combined)
+
+                # Redistribute maintaining original sizes
+                perm_categories_a.append(combined[:len(cat_a)])
+                perm_categories_b.append(combined[len(cat_a):])
+
+            # Flatten permuted categories
+            perm_a = np.concatenate(perm_categories_a)
+            perm_b = np.concatenate(perm_categories_b)
+
+            # Calculate difference in means respecting category structure
+            perm_mean_a = mean_of_category_means(perm_a, splits_a)
+            perm_mean_b = mean_of_category_means(perm_b, splits_b)
+            perm_diff = perm_mean_a - perm_mean_b
+
+            # Count how many permuted differences are >= to observed difference in absolute value
+            if abs(perm_diff) >= abs(observed_diff):
+                count_greater_or_equal += 1
 
     # Calculate p-value
     p_value = count_greater_or_equal / n_permutations
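
Usage note (not part of the patch): a minimal sketch, assuming the patched olmocr/bench/utils.py is importable as olmocr.bench.utils, of how the new splits-aware helpers are meant to be called. The scores and per-file group sizes below are made-up example values; only the function names and keyword arguments come from the diff above.

    # Hypothetical example: 12 per-test scores per candidate, flattened in .jsonl-file order.
    from olmocr.bench.utils import calculate_bootstrap_ci, perform_permutation_test

    scores_a = [1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0]
    scores_b = [1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0]

    # Number of tests contributed by each .jsonl file; must sum to len(scores).
    splits = [5, 4, 3]

    # Bootstrap CI that resamples within each .jsonl file and averages the per-file means.
    lower, upper = calculate_bootstrap_ci(scores_a, n_bootstrap=1000, ci_level=0.95, splits=splits)
    print(f"95% CI: [{lower * 100:.1f}%, {upper * 100:.1f}%]")

    # Permutation test that shuffles scores within matching .jsonl-file groups.
    diff, p_value = perform_permutation_test(scores_a, scores_b, n_permutations=10000, splits_a=splits, splits_b=splits)
    print(f"diff={diff * 100:.2f}%  p-value={p_value:.4f}")

Note that the category-based permutation path pairs the i-th group of candidate A with the i-th group of candidate B via zip, so both candidates should be split over the same set of .jsonl files in the same order, as they are in the benchmark script above.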