Mirror of https://github.com/allenai/olmocr.git, synced 2025-08-15 20:32:45 +00:00

Commit 88c18b3afa (parent 8297955290): human eval data; elo ratings script; dependencies
@@ -88,6 +88,12 @@ train = [
     "transformers>=4.45.1"
 ]
 
+elo = [
+    "numpy",
+    "scipy",
+    "pandas"
+]
+
 [tool.setuptools.packages.find]
 exclude = [
     "*.tests",
scripts/elo/calculate_elo_ratings.py (new file, 153 lines)
@@ -0,0 +1,153 @@
"""
Elo ratings for olmOCR vs baselines.

See data at scripts/elo/ratings.csv

MethodA,MethodB,A_wins,B_wins,A_rate(%),B_rate(%)
marker,mineru,53,26,67.1,32.9
mineru,pdelf,22,55,28.6,71.4
gotocr_format,marker,26,45,36.6,63.4
marker,pdelf,31,49,38.8,61.3
gotocr_format,pdelf,29,41,41.4,58.6
gotocr_format,mineru,38,37,50.7,49.3

Invoke via
python calculate_elo_ratings.py ratings.csv --num-bootstrap 1000 --num-elo-sims 10 --confidence-level 95 --seed 123

@kylel
"""

import random
from itertools import combinations

import click
import numpy as np
import pandas as pd
from scipy import stats
from tqdm import tqdm


def calculate_elo(matches_data, all_methods, k_factor=32, initial_rating=1500, n_replications=10, random_state=None):
    """Calculate Elo ratings with multiple replications per dataset"""
    all_ratings = {method: [] for method in all_methods}

    for _ in range(n_replications):
        matches = matches_data.sample(frac=1, replace=False, random_state=random_state).reset_index(drop=True)
        ratings = {method: initial_rating for method in all_methods}

        for _, row in matches.iterrows():
            method_a, method_b = row["MethodA"], row["MethodB"]
            a_wins, b_wins = row["A_wins"], row["B_wins"]

            for _ in range(int(a_wins)):
                ra, rb = update_single_match(ratings[method_a], ratings[method_b], 1, k_factor)
                ratings[method_a], ratings[method_b] = ra, rb

            for _ in range(int(b_wins)):
                ra, rb = update_single_match(ratings[method_a], ratings[method_b], 0, k_factor)
                ratings[method_a], ratings[method_b] = ra, rb

        for method in all_methods:
            all_ratings[method].append(ratings[method])

    return {method: np.mean(ratings) for method, ratings in all_ratings.items()}


def update_single_match(rating_a, rating_b, actual_score, k_factor):
    """Update ratings for a single match"""
    expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
    new_rating_a = rating_a + k_factor * (actual_score - expected_a)
    new_rating_b = rating_b + k_factor * ((1 - actual_score) - (1 - expected_a))
    return new_rating_a, new_rating_b


def bootstrap_elo_and_tests(df, num_bootstrap=1000, num_elo_sims=10, confidence_level=95, k_factor=32, initial_rating=1500, random_state=None):
    """Calculate bootstrapped Elo ratings with confidence intervals and perform pairwise significance tests"""

    ci_lower = (100 - confidence_level) / 2
    ci_upper = 100 - ci_lower

    all_methods = set(df["MethodA"].unique()) | set(df["MethodB"].unique())
    bootstrap_ratings = {method: [] for method in all_methods}

    for _ in tqdm(range(num_bootstrap)):
        bootstrap_sample = df.sample(n=len(df), replace=True, random_state=random_state)
        ratings = calculate_elo(bootstrap_sample, all_methods, k_factor, initial_rating, num_elo_sims)

        for method in all_methods:
            bootstrap_ratings[method].append(ratings[method])

    # Calculate statistics and perform significance tests
    results = {}

    # Basic statistics
    for method in all_methods:
        ratings_array = np.array(bootstrap_ratings[method])
        results[method] = {
            "mean": np.mean(ratings_array),
            "std": np.std(ratings_array),
            "ci_lower": np.percentile(ratings_array, ci_lower),
            "ci_upper": np.percentile(ratings_array, ci_upper),
            "bootstrap_samples": ratings_array,  # Store for significance testing
        }

    # Pairwise significance tests
    significance_tests = {}
    for method1, method2 in combinations(all_methods, 2):
        # Calculate difference distribution
        diff_distribution = results[method1]["bootstrap_samples"] - results[method2]["bootstrap_samples"]

        # Calculate p-value (two-tailed test)
        p_value = 2 * min(np.mean(diff_distribution >= 0), np.mean(diff_distribution <= 0))

        # Store results
        significance_tests[(method1, method2)] = {
            "diff_mean": np.mean(diff_distribution),
            "diff_ci_lower": np.percentile(diff_distribution, ci_lower),
            "diff_ci_upper": np.percentile(diff_distribution, ci_upper),
            "p_value": p_value,
        }

    return results, significance_tests


@click.command()
@click.argument("ratings_file", type=click.Path(exists=True))
@click.option("--num-bootstrap", default=1000, help="Number of bootstrap iterations")
@click.option("--num-elo-sims", default=10, help="Number of ELO simulations per bootstrap")
@click.option("--confidence-level", default=95, help="Confidence level for intervals (in percent)")
@click.option("--seed", default=42, help="Random seed for reproducibility")
def main(ratings_file, num_bootstrap, num_elo_sims, confidence_level, seed):
    # Set random seed
    random.seed(seed)
    np.random.seed(seed)

    # Load data
    df = pd.read_csv(ratings_file)

    # Calculate bootstrapped Elo ratings
    results, significance_tests = bootstrap_elo_and_tests(df, num_bootstrap=num_bootstrap, num_elo_sims=num_elo_sims)

    # Sort and display results
    print(f"\nBootstrapped Elo Ratings ({confidence_level}% CI):")
    print("-" * 50)
    sorted_results = dict(sorted(results.items(), key=lambda x: x[1]["mean"], reverse=True))
    for method, stats in sorted_results.items():
        print(f"{method:12} {stats['mean']:6.1f} ± {stats['std']:4.1f} [{stats['ci_lower']:6.1f}, {stats['ci_upper']:6.1f}]")

    # Display pairwise significance tests
    print("\nPairwise Significance Tests:")
    print("-" * 50)
    for (method1, method2), stats in significance_tests.items():
        sig_marker = "*" if stats["p_value"] < (1 - confidence_level / 100) else " "
        print(
            f"{method1:12} vs {method2:12} Δ = {stats['diff_mean']:6.1f} "
            + f"[{stats['diff_ci_lower']:6.1f}, {stats['diff_ci_upper']:6.1f}] "
            + f"p = {stats['p_value']:.3f}{sig_marker}"
        )


if __name__ == "__main__":
    main()
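Not part of the commit: as a quick sanity check on the update rule, the minimal sketch below traces a single Elo update for two methods that both start at the script's default rating of 1500 with k_factor=32, using the same logistic expected-score formula as update_single_match.

# Minimal sketch (not part of the commit): one Elo update between two
# methods that both start at the script's default rating of 1500.
rating_a, rating_b, k_factor = 1500.0, 1500.0, 32

# Expected score for A under the logistic model used in update_single_match.
expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))   # 0.5 for equal ratings

# If A wins the comparison (actual_score = 1):
new_a = rating_a + k_factor * (1 - expected_a)               # 1500 + 32 * 0.5 = 1516.0
new_b = rating_b + k_factor * ((1 - 1) - (1 - expected_a))   # 1500 - 16 = 1484.0
print(new_a, new_b)                                          # 1516.0 1484.0

A win moves the winner up by k_factor * (1 - expected score) and the loser down by the same amount, so each match is zero-sum across the two ratings.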
scripts/elo/ratings.csv (new file, 7 lines)
@@ -0,0 +1,7 @@
MethodA,MethodB,A_wins,B_wins,A_rate(%),B_rate(%)
marker,mineru,53,26,67.1,32.9
mineru,pdelf,22,55,28.6,71.4
gotocr_format,marker,26,45,36.6,63.4
marker,pdelf,31,49,38.8,61.3
gotocr_format,pdelf,29,41,41.4,58.6
gotocr_format,mineru,38,37,50.7,49.3
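Also not part of the commit: for intuition about the table above, each pairwise win rate implies an Elo gap through the inverse of the same logistic model, e.g. a 67.1% win rate corresponds to roughly 400 * log10(0.671 / 0.329) ≈ 124 points. A minimal sketch, assuming the CSV above is saved locally as ratings.csv:

# Minimal sketch (not part of the commit): convert each pairwise win rate in
# ratings.csv into the Elo gap implied by the logistic model.
import numpy as np
import pandas as pd

df = pd.read_csv("ratings.csv")  # assumes the CSV above is saved locally
p = df["A_wins"] / (df["A_wins"] + df["B_wins"])
df["implied_elo_gap"] = 400 * np.log10(p / (1 - p))
print(df[["MethodA", "MethodB", "implied_elo_gap"]])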