Mirror of https://github.com/allenai/olmocr.git (synced 2025-11-28 00:06:41 +00:00)

Code cleanup, version bump, remove unused permutation test
Commit 63aee2c1e5 (parent 5de52e7d13)
@@ -28,7 +28,7 @@ from tqdm import tqdm

 from .report import generate_html_report
 from .tests import BaselineTest, BasePDFTest, load_tests, save_tests
-from .utils import calculate_bootstrap_ci, perform_permutation_test
+from .utils import calculate_bootstrap_ci


 def evaluate_candidate(
@@ -186,16 +186,6 @@ def main():
         default=0.95,
         help="Confidence level for interval calculation (default: 0.95 for 95% CI).",
     )
-    parser.add_argument(
-        "--permutation_tests",
-        nargs="?",
-        const="default",
-        help=(
-            "Run permutation testing. If provided without candidate names, run default tests. "
-            "If provided with a comma-separated list of candidate names (e.g. --permutation_tests asdf,qwe,ert), "
-            "run permutation tests on all pairs of the specified candidates."
-        ),
-    )
     # New arguments
     parser.add_argument("--sample", type=int, default=None, help="Randomly sample N tests to run instead of all tests.")
     parser.add_argument("--test_report", type=str, default=None, help="Generate an HTML report of test results. Provide a filename (e.g., results.html).")
@@ -300,16 +290,16 @@ def main():
         jsonl_results = {}
         jsonl_scores = []  # List to store scores by jsonl file for CI calculation
         jsonl_file_sizes = []  # List to store the number of tests per jsonl file

         for test in all_tests:
             # Get the jsonl file this test came from
             jsonl_file = test_to_jsonl.get(test.id, "unknown")

             if jsonl_file not in jsonl_results:
                 jsonl_results[jsonl_file] = {"total": 0, "passed": 0, "scores": []}

             jsonl_results[jsonl_file]["total"] += 1

             # Get the test result for this candidate if it exists
             if not candidate_errors and hasattr(test, "pdf") and hasattr(test, "page"):
                 pdf_name = test.pdf
@@ -323,13 +313,13 @@ def main():
                         if passed:
                             jsonl_results[jsonl_file]["passed"] += 1
                         break

         # Gather all the scores by jsonl file for CI calculation
         for jsonl_file, results in jsonl_results.items():
             if results["scores"]:
                 jsonl_file_sizes.append(len(results["scores"]))
                 jsonl_scores.extend(results["scores"])

         # Calculate CI using the updated function with splits
         if jsonl_scores:
             ci = calculate_bootstrap_ci(jsonl_scores, n_bootstrap=n_bootstrap, ci_level=ci_level, splits=jsonl_file_sizes)
@@ -350,7 +340,7 @@ def main():
             if results["total"] > 0:
                 pass_rate = results["passed"] / results["total"]
                 jsonl_pass_rates.append(pass_rate)

         per_category_score = sum(jsonl_pass_rates) / len(jsonl_pass_rates) if jsonl_pass_rates else 0.0
         print(f"  Average Score: {per_category_score * 100:.1f}% (95% CI: [{ci[0] * 100:.1f}%, {ci[1] * 100:.1f}%]) over {total_tests} tests.")

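
For context on the CI call above: each test's score is bucketed by the .jsonl file it came from, the flat score list plus the per-file sizes are handed to calculate_bootstrap_ci, and the headline number averages the per-file pass rates. A minimal sketch of that call pattern (toy scores and file names; the absolute import path olmocr.bench.utils is inferred from the relative `from .utils import calculate_bootstrap_ci` shown earlier):

    # Sketch: stratified CI over per-file test scores (toy data, hypothetical file names).
    from olmocr.bench.utils import calculate_bootstrap_ci  # assumed absolute path for `.utils`

    scores_by_jsonl = {
        "old_scans.jsonl": [1.0, 1.0, 0.0, 1.0],
        "tables.jsonl": [1.0, 0.0, 0.0],
    }

    jsonl_scores, jsonl_file_sizes = [], []
    for results in scores_by_jsonl.values():
        jsonl_file_sizes.append(len(results))  # split sizes, one per category
        jsonl_scores.extend(results)           # flat list, category-contiguous

    lower, upper = calculate_bootstrap_ci(jsonl_scores, n_bootstrap=1000, ci_level=0.95, splits=jsonl_file_sizes)
    print(f"95% CI: [{lower * 100:.1f}%, {upper * 100:.1f}%]")
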
@@ -418,161 +408,6 @@ def main():
         print(f"  {jsonl_file:30s}: {pass_rate:0.1f}% ({results['passed']}/{results['total']} tests)")
     print("")

-    if args.permutation_tests is not None:
-        print("\n" + "=" * 60)
-        print("Pairwise Permutation Tests:")
-        valid_candidates = [c for c in summary if not c[3]]
-        if args.permutation_tests == "default":
-            olmocr_candidates = sorted([c for c in valid_candidates if "olmocr" in c[0].lower()], key=lambda x: x[1], reverse=True)
-            non_olmocr_candidates = sorted([c for c in valid_candidates if "olmocr" not in c[0].lower()], key=lambda x: x[1], reverse=True)
-            top_olmocr = olmocr_candidates[0] if olmocr_candidates else None
-            top_non_olmocr = non_olmocr_candidates[0] if non_olmocr_candidates else None
-            top_two_olmocr = olmocr_candidates[:2]
-
-            if top_olmocr and top_non_olmocr:
-                olmocr_name, olmocr_score = top_olmocr[0], top_olmocr[1]
-                non_olmocr_name, non_olmocr_score = top_non_olmocr[0], top_non_olmocr[1]
-                # Extract file sizes and scores for both candidates
-                olmocr_jsonl_sizes = []
-                non_olmocr_jsonl_sizes = []
-
-                # Extract jsonl file sizes for each candidate
-                for test in all_tests:
-                    jsonl_file = test_to_jsonl.get(test.id, "unknown")
-                    # Process for top_olmocr
-                    if not top_olmocr[3] and hasattr(test, "pdf") and hasattr(test, "page"):
-                        pdf_name = test.pdf
-                        page = test.page
-                        if pdf_name in test_results_by_candidate.get(top_olmocr[0], {}) and page in test_results_by_candidate[top_olmocr[0]].get(pdf_name, {}):
-                            for t, _, _ in test_results_by_candidate[top_olmocr[0]][pdf_name][page]:
-                                if t.id == test.id:
-                                    if jsonl_file not in olmocr_jsonl_sizes:
-                                        olmocr_jsonl_sizes.append(len([t for t in all_tests if test_to_jsonl.get(t.id, "") == jsonl_file]))
-                                    break
-
-                    # Process for top_non_olmocr
-                    if not top_non_olmocr[3] and hasattr(test, "pdf") and hasattr(test, "page"):
-                        pdf_name = test.pdf
-                        page = test.page
-                        if pdf_name in test_results_by_candidate.get(top_non_olmocr[0], {}) and page in test_results_by_candidate[top_non_olmocr[0]].get(pdf_name, {}):
-                            for t, _, _ in test_results_by_candidate[top_non_olmocr[0]][pdf_name][page]:
-                                if t.id == test.id:
-                                    if jsonl_file not in non_olmocr_jsonl_sizes:
-                                        non_olmocr_jsonl_sizes.append(len([t for t in all_tests if test_to_jsonl.get(t.id, "") == jsonl_file]))
-                                    break
-
-                diff, p_value = perform_permutation_test(
-                    top_olmocr[7], top_non_olmocr[7],
-                    splits_a=olmocr_jsonl_sizes if olmocr_jsonl_sizes else None,
-                    splits_b=non_olmocr_jsonl_sizes if non_olmocr_jsonl_sizes else None
-                )
-                print("\nComparison 1: Top olmocr vs Top non-olmocr candidate")
-                print(f"  {olmocr_name} ({olmocr_score*100:.1f}%) vs {non_olmocr_name} ({non_olmocr_score*100:.1f}%)")
-                print(f"  Difference: {diff*100:.2f}% (positive means {olmocr_name} is better)")
-                print(f"  p-value: {p_value:.4f}")
-                if p_value < 0.05:
-                    print("  Result: Statistically significant difference (p < 0.05)")
-                else:
-                    print("  Result: No statistically significant difference (p ≥ 0.05)")
-            else:
-                print("\nCannot perform olmocr vs non-olmocr comparison: Missing candidates")
-
-            if len(top_two_olmocr) >= 2:
-                # Extract file sizes for each candidate
-                olmocr1_jsonl_sizes = []
-                olmocr2_jsonl_sizes = []
-
-                # Extract jsonl file sizes for each candidate
-                for test in all_tests:
-                    jsonl_file = test_to_jsonl.get(test.id, "unknown")
-                    # Process for first olmocr candidate
-                    if not top_two_olmocr[0][3] and hasattr(test, "pdf") and hasattr(test, "page"):
-                        pdf_name = test.pdf
-                        page = test.page
-                        if pdf_name in test_results_by_candidate.get(top_two_olmocr[0][0], {}) and page in test_results_by_candidate[top_two_olmocr[0][0]].get(pdf_name, {}):
-                            for t, _, _ in test_results_by_candidate[top_two_olmocr[0][0]][pdf_name][page]:
-                                if t.id == test.id:
-                                    if jsonl_file not in olmocr1_jsonl_sizes:
-                                        olmocr1_jsonl_sizes.append(len([t for t in all_tests if test_to_jsonl.get(t.id, "") == jsonl_file]))
-                                    break
-
-                    # Process for second olmocr candidate
-                    if not top_two_olmocr[1][3] and hasattr(test, "pdf") and hasattr(test, "page"):
-                        pdf_name = test.pdf
-                        page = test.page
-                        if pdf_name in test_results_by_candidate.get(top_two_olmocr[1][0], {}) and page in test_results_by_candidate[top_two_olmocr[1][0]].get(pdf_name, {}):
-                            for t, _, _ in test_results_by_candidate[top_two_olmocr[1][0]][pdf_name][page]:
-                                if t.id == test.id:
-                                    if jsonl_file not in olmocr2_jsonl_sizes:
-                                        olmocr2_jsonl_sizes.append(len([t for t in all_tests if test_to_jsonl.get(t.id, "") == jsonl_file]))
-                                    break
-
-                diff, p_value = perform_permutation_test(
-                    top_two_olmocr[0][7], top_two_olmocr[1][7],
-                    splits_a=olmocr1_jsonl_sizes if olmocr1_jsonl_sizes else None,
-                    splits_b=olmocr2_jsonl_sizes if olmocr2_jsonl_sizes else None
-                )
-                print("\nComparison 2: Top two olmocr candidates")
-                print(f"  {top_two_olmocr[0][0]} ({top_two_olmocr[0][1]*100:.1f}%) vs {top_two_olmocr[1][0]} ({top_two_olmocr[1][1]*100:.1f}%)")
-                print(f"  Difference: {diff*100:.2f}% (positive means {top_two_olmocr[0][0]} is better)")
-                print(f"  p-value: {p_value:.4f}")
-                if p_value < 0.05:
-                    print("  Result: Statistically significant difference (p < 0.05)")
-                else:
-                    print("  Result: No statistically significant difference (p ≥ 0.05)")
-            else:
-                print("\nCannot perform top two olmocr comparison: Not enough olmocr candidates")
-        else:
-            candidate_names = [name.strip() for name in args.permutation_tests.split(",")]
-            selected_candidates = [c for c in valid_candidates if c[0] in candidate_names]
-            if len(selected_candidates) < 2:
-                print("\nNot enough valid candidates among the selected for permutation tests.")
-            else:
-                for cand1, cand2 in combinations(selected_candidates, 2):
-                    # Extract file sizes for each candidate
-                    cand1_jsonl_sizes = []
-                    cand2_jsonl_sizes = []
-
-                    # Extract jsonl file sizes for each candidate
-                    for test in all_tests:
-                        jsonl_file = test_to_jsonl.get(test.id, "unknown")
-                        # Process for first candidate
-                        if not cand1[3] and hasattr(test, "pdf") and hasattr(test, "page"):
-                            pdf_name = test.pdf
-                            page = test.page
-                            if pdf_name in test_results_by_candidate.get(cand1[0], {}) and page in test_results_by_candidate[cand1[0]].get(pdf_name, {}):
-                                for t, _, _ in test_results_by_candidate[cand1[0]][pdf_name][page]:
-                                    if t.id == test.id:
-                                        if jsonl_file not in cand1_jsonl_sizes:
-                                            cand1_jsonl_sizes.append(len([t for t in all_tests if test_to_jsonl.get(t.id, "") == jsonl_file]))
-                                        break
-
-                        # Process for second candidate
-                        if not cand2[3] and hasattr(test, "pdf") and hasattr(test, "page"):
-                            pdf_name = test.pdf
-                            page = test.page
-                            if pdf_name in test_results_by_candidate.get(cand2[0], {}) and page in test_results_by_candidate[cand2[0]].get(pdf_name, {}):
-                                for t, _, _ in test_results_by_candidate[cand2[0]][pdf_name][page]:
-                                    if t.id == test.id:
-                                        if jsonl_file not in cand2_jsonl_sizes:
-                                            cand2_jsonl_sizes.append(len([t for t in all_tests if test_to_jsonl.get(t.id, "") == jsonl_file]))
-                                        break
-
-                    diff, p_value = perform_permutation_test(
-                        cand1[7], cand2[7],
-                        splits_a=cand1_jsonl_sizes if cand1_jsonl_sizes else None,
-                        splits_b=cand2_jsonl_sizes if cand2_jsonl_sizes else None
-                    )
-                    print(f"\nComparison: {cand1[0]} vs {cand2[0]}")
-                    print(f"  {cand1[0]} ({cand1[1]*100:.1f}%) vs {cand2[0]} ({cand2[1]*100:.1f}%)")
-                    print(f"  Difference: {diff*100:.2f}% (positive means {cand1[0]} is better)")
-                    print(f"  p-value: {p_value:.4f}")
-                    if p_value < 0.05:
-                        print("  Result: Statistically significant difference (p < 0.05)")
-                    else:
-                        print("  Result: No statistically significant difference (p ≥ 0.05)")
-        print("=" * 60)
-
     # Generate HTML report if requested
     if args.test_report:
         generate_html_report(test_results_by_candidate, pdf_folder, args.test_report)
@@ -2,10 +2,7 @@ import json
 import os

 from anthropic import Anthropic
-from prompts import (
-    build_openai_silver_data_prompt,
-    claude_response_format_schema,
-)
+from prompts import build_openai_silver_data_prompt, claude_response_format_schema

 from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.prompts.anchor import get_anchor_text

@@ -17,11 +17,7 @@ from playwright.async_api import async_playwright
 from syntok.segmenter import process
 from tqdm import tqdm

-from olmocr.bench.tests import (
-    TableTest,
-    TestType,
-    parse_html_tables,
-)
+from olmocr.bench.tests import TableTest, TestType, parse_html_tables
 from olmocr.data.renderpdf import (
     get_png_dimensions_from_base64,
     render_pdf_to_base64png,
@@ -23,7 +23,7 @@ def calculate_bootstrap_ci(test_scores: List[float], n_bootstrap: int = 1000, ci

     # Convert to numpy array for efficiency
     scores = np.array(test_scores)

     # Simple case - no splits provided, use traditional bootstrap
     if splits is None:
         # Generate bootstrap samples
@@ -36,14 +36,14 @@ def calculate_bootstrap_ci(test_scores: List[float], n_bootstrap: int = 1000, ci
     # Validate splits
     if sum(splits) != len(scores):
         raise ValueError(f"Sum of splits ({sum(splits)}) must equal length of test_scores ({len(scores)})")

     # Convert flat scores list to a list of category scores
     category_scores = []
     start_idx = 0
     for split_size in splits:
-        category_scores.append(scores[start_idx:start_idx + split_size])
+        category_scores.append(scores[start_idx : start_idx + split_size])
         start_idx += split_size

     # Generate bootstrap samples respecting category structure
     bootstrap_means = []
     for _ in range(n_bootstrap):
@@ -54,7 +54,7 @@ def calculate_bootstrap_ci(test_scores: List[float], n_bootstrap: int = 1000, ci
             # Sample with replacement within this category
             cat_sample = np.random.choice(cat_scores, size=len(cat_scores), replace=True)
             category_means.append(np.mean(cat_sample))

         # Overall score is average of category means (if any categories have scores)
         if category_means:
             bootstrap_means.append(np.mean(category_means))
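
The loop above resamples within each category (one per source .jsonl file) and then averages the category means, so small files keep the same weight as large ones in every bootstrap replicate. A self-contained numpy sketch of that resampling step (toy scores, illustrative variable names):

    import numpy as np

    # Toy per-category score arrays (hypothetical values).
    category_scores = [np.array([1.0, 1.0, 0.0, 1.0]), np.array([1.0, 0.0, 0.0])]

    bootstrap_means = []
    for _ in range(1000):
        category_means = []
        for cat_scores in category_scores:
            # Sample with replacement within this category only.
            cat_sample = np.random.choice(cat_scores, size=len(cat_scores), replace=True)
            category_means.append(np.mean(cat_sample))
        # Each replicate averages the category means, not the pooled scores.
        bootstrap_means.append(np.mean(category_means))

    lower, upper = np.percentile(bootstrap_means, [2.5, 97.5])
    print(f"95% CI of the macro-average: [{lower:.3f}, {upper:.3f}]")
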
@@ -67,8 +67,9 @@ def calculate_bootstrap_ci(test_scores: List[float], n_bootstrap: int = 1000, ci
     return (lower_bound, upper_bound)


-def perform_permutation_test(scores_a: List[float], scores_b: List[float], n_permutations: int = 10000,
-                             splits_a: List[int] = None, splits_b: List[int] = None) -> Tuple[float, float]:
+def perform_permutation_test(
+    scores_a: List[float], scores_b: List[float], n_permutations: int = 10000, splits_a: List[int] = None, splits_b: List[int] = None
+) -> Tuple[float, float]:
     """
     Perform a permutation test to determine if there's a significant difference
     between two sets of test scores.
@@ -90,15 +91,15 @@ def perform_permutation_test(scores_a: List[float], scores_b: List[float], n_per
     def mean_of_category_means(scores, splits=None):
         if splits is None:
             return np.mean(scores)

         category_means = []
         start_idx = 0
         for split_size in splits:
             if split_size > 0:
-                category_scores = scores[start_idx:start_idx + split_size]
+                category_scores = scores[start_idx : start_idx + split_size]
                 category_means.append(np.mean(category_scores))
             start_idx += split_size

         return np.mean(category_means) if category_means else 0.0

     # Calculate observed difference in means using category structure if provided
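
To see why this helper differs from a plain mean over all scores: with unequal category sizes, the macro average weights each category equally rather than each test. A small worked example (numbers are made up):

    import numpy as np

    scores = np.array([1.0, 1.0, 0.0, 0.0, 0.0])  # category A: first 3 tests, category B: last 2
    splits = [3, 2]

    flat_mean = np.mean(scores)                                    # 0.4, every test weighted equally
    macro = np.mean([np.mean(scores[:3]), np.mean(scores[3:])])    # (0.667 + 0.0) / 2 ~= 0.333
    print(flat_mean, macro)
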
@@ -135,54 +136,54 @@ def perform_permutation_test(scores_a: List[float], scores_b: List[float], n_per
         raise ValueError(f"Sum of splits_a ({sum(splits_a)}) must equal length of scores_a ({len(scores_a)})")
     if splits_b is not None and sum(splits_b) != len(scores_b):
         raise ValueError(f"Sum of splits_b ({sum(splits_b)}) must equal length of scores_b ({len(scores_b)})")

     # Create category structures
     categories_a = []
     categories_b = []

     if splits_a is not None:
         start_idx = 0
         for split_size in splits_a:
-            categories_a.append(scores_a[start_idx:start_idx + split_size])
+            categories_a.append(scores_a[start_idx : start_idx + split_size])
             start_idx += split_size
     else:
         # If no splits for A, treat all scores as one category
         categories_a = [scores_a]

     if splits_b is not None:
         start_idx = 0
         for split_size in splits_b:
-            categories_b.append(scores_b[start_idx:start_idx + split_size])
+            categories_b.append(scores_b[start_idx : start_idx + split_size])
             start_idx += split_size
     else:
         # If no splits for B, treat all scores as one category
         categories_b = [scores_b]

     # Perform permutation test maintaining category structure
     count_greater_or_equal = 0
     for _ in range(n_permutations):
         # For each category pair, shuffle and redistribute
         perm_categories_a = []
         perm_categories_b = []

         for cat_a, cat_b in zip(categories_a, categories_b):
             # Combine and shuffle
             combined = np.concatenate([cat_a, cat_b])
             np.random.shuffle(combined)

             # Redistribute maintaining original sizes
-            perm_categories_a.append(combined[:len(cat_a)])
-            perm_categories_b.append(combined[len(cat_a):])
+            perm_categories_a.append(combined[: len(cat_a)])
+            perm_categories_b.append(combined[len(cat_a) :])

         # Flatten permuted categories
         perm_a = np.concatenate(perm_categories_a)
         perm_b = np.concatenate(perm_categories_b)

         # Calculate difference in means respecting category structure
         perm_mean_a = mean_of_category_means(perm_a, splits_a)
         perm_mean_b = mean_of_category_means(perm_b, splits_b)
         perm_diff = perm_mean_a - perm_mean_b

         # Count how many permuted differences are >= to observed difference in absolute value
         if abs(perm_diff) >= abs(observed_diff):
             count_greater_or_equal += 1
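
perform_permutation_test itself stays in place; only its callers in the benchmark CLI were removed. A sketch of invoking it directly, mirroring the p < 0.05 check the removed reporting block used (toy scores; the absolute import path olmocr.bench.utils is an assumption based on the relative import shown earlier):

    from olmocr.bench.utils import perform_permutation_test  # assumed absolute path

    scores_a = [1.0, 1.0, 0.0, 1.0, 1.0, 0.0]  # candidate A, toy per-test scores
    scores_b = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]  # candidate B
    splits = [3, 3]  # both candidates scored on the same two 3-test categories

    diff, p_value = perform_permutation_test(scores_a, scores_b, n_permutations=10000, splits_a=splits, splits_b=splits)
    print(f"Difference: {diff * 100:.2f}%  p-value: {p_value:.4f}")
    if p_value < 0.05:
        print("Statistically significant difference (p < 0.05)")
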
@@ -208,7 +208,6 @@ class ParagraphEditSimilarity(DocumentEditSimilarity):
         self.sent_window = sent_window

     def segment(self, seq_a_tokens: list[str], seq_b_tokens: list[str]) -> list[tuple[list[str], list[str]]]:
-
         all_spans = []

         for seq_tokens in (seq_a_tokens, seq_b_tokens):
@@ -91,15 +91,18 @@ class BaseRegistry(Generic[T]):

     @overload
     @classmethod
-    def get(cls, name: str) -> T: ...
+    def get(cls, name: str) -> T:
+        ...

     @overload
     @classmethod
-    def get(cls, name: str, raise_on_missing: Literal[True]) -> T: ...
+    def get(cls, name: str, raise_on_missing: Literal[True]) -> T:
+        ...

     @overload
     @classmethod
-    def get(cls, name: str, raise_on_missing: Literal[False]) -> Optional[T]: ...
+    def get(cls, name: str, raise_on_missing: Literal[False]) -> Optional[T]:
+        ...

     @classmethod
     def get(cls, name: str, raise_on_missing: bool = True) -> Optional[T]:
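
The hunk above only moves each overload's `...` body onto its own line; the typing semantics are unchanged. For reference, a minimal standalone sketch of the same typing.overload pattern (a toy module-level registry, not BaseRegistry itself):

    from typing import Literal, Optional, overload

    _REGISTRY = {"tagger": 1}

    @overload
    def get(name: str) -> int:
        ...

    @overload
    def get(name: str, raise_on_missing: Literal[True]) -> int:
        ...

    @overload
    def get(name: str, raise_on_missing: Literal[False]) -> Optional[int]:
        ...

    def get(name: str, raise_on_missing: bool = True) -> Optional[int]:
        # Single runtime implementation; the overloads only guide the type checker.
        if name not in _REGISTRY:
            if raise_on_missing:
                raise KeyError(name)
            return None
        return _REGISTRY[name]

    print(get("tagger"))                           # 1
    print(get("missing", raise_on_missing=False))  # None
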
@@ -1 +1,2 @@
-class DolmaRefineError(RuntimeError): ...
+class DolmaRefineError(RuntimeError):
+    ...
@@ -2,7 +2,7 @@ _MAJOR = "0"
 _MINOR = "1"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "67"
+_PATCH = "68"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""
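
For context, these components typically get joined into the package's version string; a rough sketch of how such pieces are commonly combined (the exact expression in olmocr's version.py is not part of this diff, so treat this as illustrative only):

    _MAJOR = "0"
    _MINOR = "1"
    _PATCH = "68"
    _SUFFIX = ""  # e.g. ".dev20250101" for nightly builds

    VERSION_SHORT = f"{_MAJOR}.{_MINOR}"            # hypothetical helper
    VERSION = f"{_MAJOR}.{_MINOR}.{_PATCH}{_SUFFIX}"  # -> "0.1.68"
    print(VERSION)
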
@@ -46,7 +46,6 @@ def jsonl_to_markdown(input_file, output_dir):
 # It takes two arguments: the input JSONL file and the output directory.
 # The script will create the output directory if it does not exist.
 if __name__ == "__main__":
-
     if len(sys.argv) != 3:
         print("Usage: python jsonl_to_markdown.py <input_file> <output_dir>")
         sys.exit(1)
@@ -60,7 +60,7 @@ GREEN = "#0fcb8c"
 data = {
     MODEL_COLUMN_NAME: [
         "GPT-4o",
         "GPT-4o (Batch)",
         "Mistral OCR",
         "MinerU",
         "Gemini Flash 2",
@@ -71,7 +71,7 @@ data = {
         "Qwen 2 VL (A100)",
         "Qwen 2 VL (H100,L40S)",
         "Qwen 2.5 VL (A100)",
-        "Qwen 2.5 VL (H100,L40S)"
+        "Qwen 2.5 VL (H100,L40S)",
     ],
     COST_COLUMN_NAME: [
         12480,
@@ -86,7 +86,7 @@ data = {
         270,  # Same cost as Ours
         190,  # Same cost as Ours
         270,  # Same cost as Ours
-        190  # Same cost as Ours
+        190,  # Same cost as Ours
     ],
     PERF_COLUMN_NAME: [
         69.9,  # GPT-4o (Anchored)
@@ -101,8 +101,8 @@ data = {
         31.5,  # Qwen2VL
         31.5,  # Qwen2VL
         65.5,  # Qwen2.5VL
-        65.5  # Qwen2.5VL
-    ]
+        65.5,  # Qwen2.5VL
+    ],
 }

 df = pd.DataFrame(data)
@@ -121,41 +121,23 @@ model_categories = {
     "Qwen 2 VL (A100)": "Open VLM",
     "Qwen 2 VL (H100,L40S)": "Open VLM",
     "Qwen 2.5 VL (A100)": "Open VLM",
-    "Qwen 2.5 VL (H100,L40S)": "Open VLM"
+    "Qwen 2.5 VL (H100,L40S)": "Open VLM",
 }

 df[CATEGORY_COLUMN_NAME] = df[MODEL_COLUMN_NAME].map(model_categories)

 # Category colors
-category_colors = {
-    "Commercial API Tool": DARK_BLUE,
-    "Commercial VLM": DARK_GREEN,
-    "Open Source Tool": LIGHT_GREEN,
-    "Ours": DARK_PINK,
-    "Open VLM": PURPLE
-}
+category_colors = {"Commercial API Tool": DARK_BLUE, "Commercial VLM": DARK_GREEN, "Open Source Tool": LIGHT_GREEN, "Ours": DARK_PINK, "Open VLM": PURPLE}

 df[COLOR_COLUMN_NAME] = df[CATEGORY_COLUMN_NAME].map(category_colors)

 # Define marker types
-category_markers = {
-    "Commercial API Tool": "o",
-    "Commercial VLM": "D",
-    "Open Source Tool": "s",
-    "Ours": "*",
-    "Open VLM": "^"
-}
+category_markers = {"Commercial API Tool": "o", "Commercial VLM": "D", "Open Source Tool": "s", "Ours": "*", "Open VLM": "^"}

 df[MARKER_COLUMN_NAME] = df[CATEGORY_COLUMN_NAME].map(category_markers)

 # Define marker sizes - increased sizes
-category_marker_sizes = {
-    "Commercial API Tool": 120,
-    "Commercial VLM": 120,
-    "Open Source Tool": 140,
-    "Ours": 300,
-    "Open VLM": 140
-}
+category_marker_sizes = {"Commercial API Tool": 120, "Commercial VLM": 120, "Open Source Tool": 140, "Ours": 300, "Open VLM": 140}

 # Define text colors
 category_text_colors = {
@@ -163,7 +145,7 @@ category_text_colors = {
     "Commercial VLM": DARK_GREEN,
     "Open Source Tool": DARK_TEAL,
     "Ours": "#a51c5c",  # darker pink
-    "Open VLM": "#6f1188"  # darker purple
+    "Open VLM": "#6f1188",  # darker purple
 }

 # Label offsets for better readability
@@ -180,7 +162,7 @@ model_label_offsets = {
     "Qwen 2 VL (A100)": [-20, 10],
     "Qwen 2 VL (H100,L40S)": [-60, 25],
     "Qwen 2.5 VL (A100)": [-20, 10],
-    "Qwen 2.5 VL (H100,L40S)": [-60, 25]
+    "Qwen 2.5 VL (H100,L40S)": [-60, 25],
 }

 df[OFFSET_COLUMN_NAME] = df[MODEL_COLUMN_NAME].map(model_label_offsets)
@@ -218,18 +200,24 @@ for idx, row in df.iterrows():
     )

 # Set up axes
 plt.ylim(25, 85)  # Set y-axis limits from 25 to 85 to include Qwen2VL
 plt.xlim(100, 15000)
-plt.xscale('log')  # Use log scale for cost
+plt.xscale("log")  # Use log scale for cost
 plt.grid(True, which="both", ls=":", color=TEAL, alpha=0.2)

+
 # Format y-axis to show percentages without scientific notation
 def percent_formatter(y, pos):
-    return f'{y:.1f}%'
+    return f"{y:.1f}%"
+
+
 plt.gca().yaxis.set_major_formatter(ticker.FuncFormatter(percent_formatter))

+
 # Format x-axis to show dollar amounts
 def dollar_formatter(x, pos):
-    return f'${x:,.0f}'
+    return f"${x:,.0f}"
+
+
 # Set specific x-axis ticks with increased font size
 plt.gca().xaxis.set_major_formatter(ticker.FuncFormatter(dollar_formatter))
@@ -243,8 +231,8 @@ plt.ylabel("Overall Performance (Pass Rate %)", fontsize=16, weight="medium")
 # plt.title("OCR Engines: Performance vs. Cost", fontsize=12, weight="medium")

 # Remove spines
-plt.gca().spines['top'].set_visible(False)
-plt.gca().spines['right'].set_visible(False)
+plt.gca().spines["top"].set_visible(False)
+plt.gca().spines["right"].set_visible(False)

 # Add the legend with custom ordering and increased font size
 handles, labels = plt.gca().get_legend_handles_labels()
@@ -254,14 +242,7 @@ ordered_handles = [label_to_handle[label] for label in desired_order if label in
 ordered_labels = [label for label in desired_order if label in labels]

 plt.legend(
-    ordered_handles,
-    ordered_labels,
-    loc="lower right",
-    fontsize=12,  # Increased from 10
-    frameon=True,
-    framealpha=0.9,
-    edgecolor=TEAL,
-    facecolor="white"
+    ordered_handles, ordered_labels, loc="lower right", fontsize=12, frameon=True, framealpha=0.9, edgecolor=TEAL, facecolor="white"  # Increased from 10
 )

 # Adjust layout
@@ -271,4 +252,4 @@ plt.tight_layout()
 for output_path in OUTPUT_PATHS:
     plt.savefig(output_path, dpi=300, bbox_inches="tight")

 print(f"Plot saved to {', '.join(OUTPUT_PATHS)}")
@@ -26,9 +26,7 @@ import zstandard as zstd
 from huggingface_hub import snapshot_download
 from pydantic import BaseModel, Field, ValidationError

-from olmocr.check import (
-    check_torch_gpu_available,
-)
+from olmocr.check import check_torch_gpu_available
 from olmocr.metrics import MetricsKeeper
 from olmocr.s3_utils import (
     download_directory,
@@ -27,9 +27,7 @@ import zstandard as zstd
 from huggingface_hub import snapshot_download
 from pydantic import BaseModel, Field, ValidationError

-from olmocr.check import (
-    check_torch_gpu_available,
-)
+from olmocr.check import check_torch_gpu_available
 from olmocr.metrics import MetricsKeeper
 from olmocr.s3_utils import (
     download_directory,
@@ -26,9 +26,7 @@ import zstandard as zstd
 from huggingface_hub import snapshot_download
 from pydantic import BaseModel, Field, ValidationError

-from olmocr.check import (
-    check_torch_gpu_available,
-)
+from olmocr.check import check_torch_gpu_available
 from olmocr.metrics import MetricsKeeper
 from olmocr.s3_utils import (
     download_directory,
@@ -19,7 +19,7 @@ class TestPipelineIntegration(unittest.TestCase):
         print(self.data[-1])

     def test_edgar(self) -> None:
-        self.assertTrue(any("King of England" in line["text"] for line in self.data))
+        self.assertTrue(any("King of the English" in line["text"] for line in self.data))

     def test_ambig(self) -> None:
         self.assertTrue(any("Apples and Bananas" in line["text"] for line in self.data))