Mirror of https://github.com/allenai/olmocr.git (synced 2025-06-27 04:00:02 +00:00)

Commit c4a0fb9af5: Adding back in proper CI estimation
Parent: d17210f40d
@@ -296,8 +296,43 @@ def main():
         # Always store test results for displaying jsonl file groupings
         test_results_by_candidate[candidate_name] = test_results

         if all_test_scores:
-            ci = calculate_bootstrap_ci(all_test_scores, n_bootstrap=n_bootstrap, ci_level=ci_level)
+            # Group results by jsonl file for more accurate CI calculation
+            jsonl_results = {}
+            jsonl_scores = []  # List to store scores by jsonl file for CI calculation
+            jsonl_file_sizes = []  # List to store the number of tests per jsonl file
+
+            for test in all_tests:
+                # Get the jsonl file this test came from
+                jsonl_file = test_to_jsonl.get(test.id, "unknown")
+
+                if jsonl_file not in jsonl_results:
+                    jsonl_results[jsonl_file] = {"total": 0, "passed": 0, "scores": []}
+
+                jsonl_results[jsonl_file]["total"] += 1
+
+                # Get the test result for this candidate if it exists
+                if not candidate_errors and hasattr(test, "pdf") and hasattr(test, "page"):
+                    pdf_name = test.pdf
+                    page = test.page
+                    if pdf_name in test_results and page in test_results.get(pdf_name, {}):
+                        for t, passed, _ in test_results[pdf_name][page]:
+                            if t.id == test.id:
+                                # Store the test score in its jsonl group
+                                result_score = 1.0 if passed else 0.0
+                                jsonl_results[jsonl_file]["scores"].append(result_score)
+                                if passed:
+                                    jsonl_results[jsonl_file]["passed"] += 1
+                                break
+
+            # Gather all the scores by jsonl file for CI calculation
+            for jsonl_file, results in jsonl_results.items():
+                if results["scores"]:
+                    jsonl_file_sizes.append(len(results["scores"]))
+                    jsonl_scores.extend(results["scores"])
+
+            # Calculate CI using the updated function with splits
+            if jsonl_scores:
+                ci = calculate_bootstrap_ci(jsonl_scores, n_bootstrap=n_bootstrap, ci_level=ci_level, splits=jsonl_file_sizes)
+            else:
+                ci = (0.0, 0.0)
         summary.append((candidate_name, overall_score, total_tests, candidate_errors, test_failures, test_type_breakdown, ci, all_test_scores))
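For orientation, here is a minimal sketch of how the grouping above turns into the `splits` argument. The file names and scores are invented for illustration; only the variable names (`jsonl_results`, `jsonl_scores`, `jsonl_file_sizes`) come from the diff. Scores are flattened file by file, and the size list records the category boundaries so `calculate_bootstrap_ci` can resample within each file independently.

# Illustrative only: two hypothetical .jsonl groups with per-test pass/fail scores.
jsonl_results = {
    "arxiv_math.jsonl": {"total": 4, "passed": 3, "scores": [1.0, 1.0, 0.0, 1.0]},
    "old_scans.jsonl": {"total": 2, "passed": 1, "scores": [1.0, 0.0]},
}

jsonl_scores, jsonl_file_sizes = [], []
for results in jsonl_results.values():
    if results["scores"]:
        jsonl_file_sizes.append(len(results["scores"]))
        jsonl_scores.extend(results["scores"])

# jsonl_scores     -> [1.0, 1.0, 0.0, 1.0, 1.0, 0.0]
# jsonl_file_sizes -> [4, 2], i.e. the category boundaries passed as splits=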
@@ -309,9 +344,15 @@ def main():
         if test_failures:
             for fail in test_failures:
                 print(f" [FAIL] {fail}")
-        # Note: This score is still the average over all tests and will be updated to
-        # the average of per-JSONL file scores in the final summary
-        print(f" Average Score: {overall_score * 100:.1f}% (95% CI: [{ci[0] * 100:.1f}%, {ci[1] * 100:.1f}%]) over {total_tests} tests.")
+        # Calculate and show the per-category average score
+        jsonl_pass_rates = []
+        for _, results in jsonl_results.items():
+            if results["total"] > 0:
+                pass_rate = results["passed"] / results["total"]
+                jsonl_pass_rates.append(pass_rate)
+
+        per_category_score = sum(jsonl_pass_rates) / len(jsonl_pass_rates) if jsonl_pass_rates else 0.0
+        print(f" Average Score: {per_category_score * 100:.1f}% (95% CI: [{ci[0] * 100:.1f}%, {ci[1] * 100:.1f}%]) over {total_tests} tests.")

     print("\n" + "=" * 60)
     print("Final Summary with 95% Confidence Intervals:")
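The per-category score printed above is the unweighted mean of per-file pass rates rather than the pooled pass rate over all tests, so a small file counts as much as a large one. A toy illustration with invented numbers:

# 40/50 tests passed in one jsonl file, 6/10 in another.
pooled = (40 + 6) / (50 + 10)          # ~0.767, the old "average over all tests"
per_category = (40 / 50 + 6 / 10) / 2  # 0.700, the average of per-JSONL scores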
@@ -359,8 +400,7 @@ def main():
             ciw_str = ""
         else:
             status = f"{new_overall_score * 100:0.1f}%"
-            # Note: CI calculation would need to be updated too for full accuracy,
-            # but keeping as-is for now as it would require deeper changes
+            # Use the CI that was calculated with proper category-based bootstrap
             half_width = ((ci[1] - ci[0]) / 2) * 100
             ciw_str = f"± {half_width:0.1f}%"
         print(f"{candidate_name:20s} : Average Score: {status} {ciw_str} (average of per-JSONL scores)")
@@ -392,7 +432,40 @@ def main():
     if top_olmocr and top_non_olmocr:
         olmocr_name, olmocr_score = top_olmocr[0], top_olmocr[1]
         non_olmocr_name, non_olmocr_score = top_non_olmocr[0], top_non_olmocr[1]
-        diff, p_value = perform_permutation_test(top_olmocr[7], top_non_olmocr[7])
+        # Extract file sizes and scores for both candidates
+        olmocr_jsonl_sizes = []
+        non_olmocr_jsonl_sizes = []
+
+        # Extract jsonl file sizes for each candidate
+        for test in all_tests:
+            jsonl_file = test_to_jsonl.get(test.id, "unknown")
+            # Process for top_olmocr
+            if not top_olmocr[3] and hasattr(test, "pdf") and hasattr(test, "page"):
+                pdf_name = test.pdf
+                page = test.page
+                if pdf_name in test_results_by_candidate.get(top_olmocr[0], {}) and page in test_results_by_candidate[top_olmocr[0]].get(pdf_name, {}):
+                    for t, _, _ in test_results_by_candidate[top_olmocr[0]][pdf_name][page]:
+                        if t.id == test.id:
+                            if jsonl_file not in olmocr_jsonl_sizes:
+                                olmocr_jsonl_sizes.append(len([t for t in all_tests if test_to_jsonl.get(t.id, "") == jsonl_file]))
+                            break
+
+            # Process for top_non_olmocr
+            if not top_non_olmocr[3] and hasattr(test, "pdf") and hasattr(test, "page"):
+                pdf_name = test.pdf
+                page = test.page
+                if pdf_name in test_results_by_candidate.get(top_non_olmocr[0], {}) and page in test_results_by_candidate[top_non_olmocr[0]].get(pdf_name, {}):
+                    for t, _, _ in test_results_by_candidate[top_non_olmocr[0]][pdf_name][page]:
+                        if t.id == test.id:
+                            if jsonl_file not in non_olmocr_jsonl_sizes:
+                                non_olmocr_jsonl_sizes.append(len([t for t in all_tests if test_to_jsonl.get(t.id, "") == jsonl_file]))
+                            break
+
+        diff, p_value = perform_permutation_test(
+            top_olmocr[7], top_non_olmocr[7],
+            splits_a=olmocr_jsonl_sizes if olmocr_jsonl_sizes else None,
+            splits_b=non_olmocr_jsonl_sizes if non_olmocr_jsonl_sizes else None
+        )
         print("\nComparison 1: Top olmocr vs Top non-olmocr candidate")
         print(f" {olmocr_name} ({olmocr_score*100:.1f}%) vs {non_olmocr_name} ({non_olmocr_score*100:.1f}%)")
         print(f" Difference: {diff*100:.2f}% (positive means {olmocr_name} is better)")
@@ -405,7 +478,40 @@ def main():
         print("\nCannot perform olmocr vs non-olmocr comparison: Missing candidates")

     if len(top_two_olmocr) >= 2:
-        diff, p_value = perform_permutation_test(top_two_olmocr[0][7], top_two_olmocr[1][7])
+        # Extract file sizes for each candidate
+        olmocr1_jsonl_sizes = []
+        olmocr2_jsonl_sizes = []
+
+        # Extract jsonl file sizes for each candidate
+        for test in all_tests:
+            jsonl_file = test_to_jsonl.get(test.id, "unknown")
+            # Process for first olmocr candidate
+            if not top_two_olmocr[0][3] and hasattr(test, "pdf") and hasattr(test, "page"):
+                pdf_name = test.pdf
+                page = test.page
+                if pdf_name in test_results_by_candidate.get(top_two_olmocr[0][0], {}) and page in test_results_by_candidate[top_two_olmocr[0][0]].get(pdf_name, {}):
+                    for t, _, _ in test_results_by_candidate[top_two_olmocr[0][0]][pdf_name][page]:
+                        if t.id == test.id:
+                            if jsonl_file not in olmocr1_jsonl_sizes:
+                                olmocr1_jsonl_sizes.append(len([t for t in all_tests if test_to_jsonl.get(t.id, "") == jsonl_file]))
+                            break
+
+            # Process for second olmocr candidate
+            if not top_two_olmocr[1][3] and hasattr(test, "pdf") and hasattr(test, "page"):
+                pdf_name = test.pdf
+                page = test.page
+                if pdf_name in test_results_by_candidate.get(top_two_olmocr[1][0], {}) and page in test_results_by_candidate[top_two_olmocr[1][0]].get(pdf_name, {}):
+                    for t, _, _ in test_results_by_candidate[top_two_olmocr[1][0]][pdf_name][page]:
+                        if t.id == test.id:
+                            if jsonl_file not in olmocr2_jsonl_sizes:
+                                olmocr2_jsonl_sizes.append(len([t for t in all_tests if test_to_jsonl.get(t.id, "") == jsonl_file]))
+                            break
+
+        diff, p_value = perform_permutation_test(
+            top_two_olmocr[0][7], top_two_olmocr[1][7],
+            splits_a=olmocr1_jsonl_sizes if olmocr1_jsonl_sizes else None,
+            splits_b=olmocr2_jsonl_sizes if olmocr2_jsonl_sizes else None
+        )
         print("\nComparison 2: Top two olmocr candidates")
         print(f" {top_two_olmocr[0][0]} ({top_two_olmocr[0][1]*100:.1f}%) vs {top_two_olmocr[1][0]} ({top_two_olmocr[1][1]*100:.1f}%)")
         print(f" Difference: {diff*100:.2f}% (positive means {top_two_olmocr[0][0]} is better)")
@@ -423,7 +529,40 @@ def main():
         print("\nNot enough valid candidates among the selected for permutation tests.")
     else:
         for cand1, cand2 in combinations(selected_candidates, 2):
-            diff, p_value = perform_permutation_test(cand1[7], cand2[7])
+            # Extract file sizes for each candidate
+            cand1_jsonl_sizes = []
+            cand2_jsonl_sizes = []
+
+            # Extract jsonl file sizes for each candidate
+            for test in all_tests:
+                jsonl_file = test_to_jsonl.get(test.id, "unknown")
+                # Process for first candidate
+                if not cand1[3] and hasattr(test, "pdf") and hasattr(test, "page"):
+                    pdf_name = test.pdf
+                    page = test.page
+                    if pdf_name in test_results_by_candidate.get(cand1[0], {}) and page in test_results_by_candidate[cand1[0]].get(pdf_name, {}):
+                        for t, _, _ in test_results_by_candidate[cand1[0]][pdf_name][page]:
+                            if t.id == test.id:
+                                if jsonl_file not in cand1_jsonl_sizes:
+                                    cand1_jsonl_sizes.append(len([t for t in all_tests if test_to_jsonl.get(t.id, "") == jsonl_file]))
+                                break
+
+                # Process for second candidate
+                if not cand2[3] and hasattr(test, "pdf") and hasattr(test, "page"):
+                    pdf_name = test.pdf
+                    page = test.page
+                    if pdf_name in test_results_by_candidate.get(cand2[0], {}) and page in test_results_by_candidate[cand2[0]].get(pdf_name, {}):
+                        for t, _, _ in test_results_by_candidate[cand2[0]][pdf_name][page]:
+                            if t.id == test.id:
+                                if jsonl_file not in cand2_jsonl_sizes:
+                                    cand2_jsonl_sizes.append(len([t for t in all_tests if test_to_jsonl.get(t.id, "") == jsonl_file]))
+                                break
+
+            diff, p_value = perform_permutation_test(
+                cand1[7], cand2[7],
+                splits_a=cand1_jsonl_sizes if cand1_jsonl_sizes else None,
+                splits_b=cand2_jsonl_sizes if cand2_jsonl_sizes else None
+            )
             print(f"\nComparison: {cand1[0]} vs {cand2[0]}")
             print(f" {cand1[0]} ({cand1[1]*100:.1f}%) vs {cand2[0]} ({cand2[1]*100:.1f}%)")
             print(f" Difference: {diff*100:.2f}% (positive means {cand1[0]} is better)")
@@ -3,14 +3,17 @@ from typing import List, Tuple
 import numpy as np


-def calculate_bootstrap_ci(test_scores: List[float], n_bootstrap: int = 1000, ci_level: float = 0.95) -> Tuple[float, float]:
+def calculate_bootstrap_ci(test_scores: List[float], n_bootstrap: int = 1000, ci_level: float = 0.95, splits: List[int] = None) -> Tuple[float, float]:
     """
-    Calculate bootstrap confidence interval for test scores.
+    Calculate bootstrap confidence interval for test scores, respecting category splits.

     Args:
         test_scores: List of test scores (0.0 to 1.0 for each test)
         n_bootstrap: Number of bootstrap samples to generate
         ci_level: Confidence interval level (default: 0.95 for 95% CI)
+        splits: List of sizes for each category. If provided, resampling will be done
+                within each category independently, and the overall score will be the
+                average of per-category scores. If None, resampling is done across all tests.

     Returns:
         Tuple of (lower_bound, upper_bound) representing the confidence interval
@@ -20,13 +23,41 @@ def calculate_bootstrap_ci(test_scores: List[float], n_bootstrap: int = 1000, ci

     # Convert to numpy array for efficiency
     scores = np.array(test_scores)

-    # Generate bootstrap samples
-    bootstrap_means = []
-    for _ in range(n_bootstrap):
-        # Sample with replacement
-        sample = np.random.choice(scores, size=len(scores), replace=True)
-        bootstrap_means.append(np.mean(sample))
+    # Simple case - no splits provided, use traditional bootstrap
+    if splits is None:
+        # Generate bootstrap samples
+        bootstrap_means = []
+        for _ in range(n_bootstrap):
+            # Sample with replacement
+            sample = np.random.choice(scores, size=len(scores), replace=True)
+            bootstrap_means.append(np.mean(sample))
+    else:
+        # Validate splits
+        if sum(splits) != len(scores):
+            raise ValueError(f"Sum of splits ({sum(splits)}) must equal length of test_scores ({len(scores)})")
+
+        # Convert flat scores list to a list of category scores
+        category_scores = []
+        start_idx = 0
+        for split_size in splits:
+            category_scores.append(scores[start_idx:start_idx + split_size])
+            start_idx += split_size
+
+        # Generate bootstrap samples respecting category structure
+        bootstrap_means = []
+        for _ in range(n_bootstrap):
+            # Sample within each category independently
+            category_means = []
+            for cat_scores in category_scores:
+                if len(cat_scores) > 0:
+                    # Sample with replacement within this category
+                    cat_sample = np.random.choice(cat_scores, size=len(cat_scores), replace=True)
+                    category_means.append(np.mean(cat_sample))
+
+            # Overall score is average of category means (if any categories have scores)
+            if category_means:
+                bootstrap_means.append(np.mean(category_means))

     # Calculate confidence interval
     alpha = (1 - ci_level) / 2
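A hedged usage sketch of the updated function, assuming calculate_bootstrap_ci is in scope and using invented scores: with splits, each bootstrap replicate resamples within each category and then averages the per-category means, so the interval matches the per-JSONL scoring used in the benchmark summary.

import numpy as np

np.random.seed(0)  # only to make the sketch reproducible
scores = [1.0, 1.0, 0.0, 1.0,   # category 1 (4 tests)
          1.0, 0.0]             # category 2 (2 tests)

lo, hi = calculate_bootstrap_ci(scores, n_bootstrap=1000, ci_level=0.95, splits=[4, 2])
print(f"95% CI: [{lo:.2f}, {hi:.2f}]")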
@@ -36,7 +67,8 @@ def calculate_bootstrap_ci(test_scores: List[float], n_bootstrap: int = 1000, ci
     return (lower_bound, upper_bound)


-def perform_permutation_test(scores_a: List[float], scores_b: List[float], n_permutations: int = 10000) -> Tuple[float, float]:
+def perform_permutation_test(scores_a: List[float], scores_b: List[float], n_permutations: int = 10000,
+                             splits_a: List[int] = None, splits_b: List[int] = None) -> Tuple[float, float]:
     """
     Perform a permutation test to determine if there's a significant difference
     between two sets of test scores.
@@ -45,6 +77,8 @@ def perform_permutation_test(scores_a: List[float], scores_b: List[float], n_per
         scores_a: List of test scores for candidate A
         scores_b: List of test scores for candidate B
         n_permutations: Number of permutations to perform
+        splits_a: List of sizes for each category in scores_a
+        splits_b: List of sizes for each category in scores_b

     Returns:
         Tuple of (observed_difference, p_value)
@@ -52,29 +86,106 @@ def perform_permutation_test(scores_a: List[float], scores_b: List[float], n_per
     if not scores_a or not scores_b:
         return (0.0, 1.0)

-    # Calculate observed difference in means
-    observed_diff = np.mean(scores_a) - np.mean(scores_b)
+    # Function to calculate mean of means with optional category splits
+    def mean_of_category_means(scores, splits=None):
+        if splits is None:
+            return np.mean(scores)
+
+        category_means = []
+        start_idx = 0
+        for split_size in splits:
+            if split_size > 0:
+                category_scores = scores[start_idx:start_idx + split_size]
+                category_means.append(np.mean(category_scores))
+            start_idx += split_size
+
+        return np.mean(category_means) if category_means else 0.0

-    # Combine all scores
-    combined = np.concatenate([scores_a, scores_b])
-    n_a = len(scores_a)
+    # Calculate observed difference in means using category structure if provided
+    mean_a = mean_of_category_means(scores_a, splits_a)
+    mean_b = mean_of_category_means(scores_b, splits_b)
+    observed_diff = mean_a - mean_b

-    # Perform permutation test
-    count_greater_or_equal = 0
-    for _ in range(n_permutations):
-        # Shuffle the combined array
-        np.random.shuffle(combined)
+    # If no splits are provided, fall back to traditional permutation test
+    if splits_a is None and splits_b is None:
+        # Combine all scores
+        combined = np.concatenate([scores_a, scores_b])
+        n_a = len(scores_a)

-        # Split into two groups of original sizes
-        perm_a = combined[:n_a]
-        perm_b = combined[n_a:]
+        # Perform permutation test
+        count_greater_or_equal = 0
+        for _ in range(n_permutations):
+            # Shuffle the combined array
+            np.random.shuffle(combined)

-        # Calculate difference in means
-        perm_diff = np.mean(perm_a) - np.mean(perm_b)
+            # Split into two groups of original sizes
+            perm_a = combined[:n_a]
+            perm_b = combined[n_a:]

-        # Count how many permuted differences are >= to observed difference in absolute value
-        if abs(perm_diff) >= abs(observed_diff):
-            count_greater_or_equal += 1
+            # Calculate difference in means
+            perm_diff = np.mean(perm_a) - np.mean(perm_b)
+
+            # Count how many permuted differences are >= to observed difference in absolute value
+            if abs(perm_diff) >= abs(observed_diff):
+                count_greater_or_equal += 1
+    else:
+        # For category-based permutation test, we need to maintain category structure
+        # Validate that the splits match the score lengths
+        if splits_a is not None and sum(splits_a) != len(scores_a):
+            raise ValueError(f"Sum of splits_a ({sum(splits_a)}) must equal length of scores_a ({len(scores_a)})")
+        if splits_b is not None and sum(splits_b) != len(scores_b):
+            raise ValueError(f"Sum of splits_b ({sum(splits_b)}) must equal length of scores_b ({len(scores_b)})")
+
+        # Create category structures
+        categories_a = []
+        categories_b = []
+
+        if splits_a is not None:
+            start_idx = 0
+            for split_size in splits_a:
+                categories_a.append(scores_a[start_idx:start_idx + split_size])
+                start_idx += split_size
+        else:
+            # If no splits for A, treat all scores as one category
+            categories_a = [scores_a]
+
+        if splits_b is not None:
+            start_idx = 0
+            for split_size in splits_b:
+                categories_b.append(scores_b[start_idx:start_idx + split_size])
+                start_idx += split_size
+        else:
+            # If no splits for B, treat all scores as one category
+            categories_b = [scores_b]
+
+        # Perform permutation test maintaining category structure
+        count_greater_or_equal = 0
+        for _ in range(n_permutations):
+            # For each category pair, shuffle and redistribute
+            perm_categories_a = []
+            perm_categories_b = []
+
+            for cat_a, cat_b in zip(categories_a, categories_b):
+                # Combine and shuffle
+                combined = np.concatenate([cat_a, cat_b])
+                np.random.shuffle(combined)
+
+                # Redistribute maintaining original sizes
+                perm_categories_a.append(combined[:len(cat_a)])
+                perm_categories_b.append(combined[len(cat_a):])
+
+            # Flatten permuted categories
+            perm_a = np.concatenate(perm_categories_a)
+            perm_b = np.concatenate(perm_categories_b)
+
+            # Calculate difference in means respecting category structure
+            perm_mean_a = mean_of_category_means(perm_a, splits_a)
+            perm_mean_b = mean_of_category_means(perm_b, splits_b)
+            perm_diff = perm_mean_a - perm_mean_b
+
+            # Count how many permuted differences are >= to observed difference in absolute value
+            if abs(perm_diff) >= abs(observed_diff):
+                count_greater_or_equal += 1

     # Calculate p-value
     p_value = count_greater_or_equal / n_permutations
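Similarly, a minimal sketch of calling the split-aware permutation test, with invented scores and splits and assuming perform_permutation_test is in scope. Note that the category-respecting branch above pairs categories positionally via zip, so splits_a and splits_b are expected to describe the same categories in the same order.

scores_a = [1.0, 1.0, 0.0, 1.0, 1.0, 0.0]  # candidate A, categories of size 4 and 2
scores_b = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]  # candidate B, same category layout

diff, p_value = perform_permutation_test(
    scores_a, scores_b,
    n_permutations=10000,
    splits_a=[4, 2],
    splits_b=[4, 2],
)
print(f"diff={diff:.3f}, p={p_value:.3f}")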