From 63aee2c1e59e74c43ad0800880ec38c5cc3a17c4 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Fri, 16 May 2025 21:25:32 +0000 Subject: [PATCH] Code cleanup, version bump, remove unused permutation test --- olmocr/bench/benchmark.py | 181 +--------------------- olmocr/bench/runners/run_claude.py | 5 +- olmocr/bench/synth/mine_html_templates.py | 6 +- olmocr/bench/utils.py | 47 +++--- olmocr/eval/dolma_refine/metrics.py | 1 - olmocr/eval/dolma_refine/registry.py | 9 +- olmocr/train/core/errors.py | 3 +- olmocr/version.py | 2 +- scripts/jsonl_to_markdown.py | 1 - scripts/pareto_plot.py | 69 +++------ scripts/rich_tagging_pipeline.py | 4 +- scripts/tagging_pipeline.py | 4 +- scripts/tagging_pipeline_v2.py | 4 +- tests/test_integration.py | 2 +- 14 files changed, 72 insertions(+), 266 deletions(-) diff --git a/olmocr/bench/benchmark.py b/olmocr/bench/benchmark.py index 4f7cb21..95d3b76 100644 --- a/olmocr/bench/benchmark.py +++ b/olmocr/bench/benchmark.py @@ -28,7 +28,7 @@ from tqdm import tqdm from .report import generate_html_report from .tests import BaselineTest, BasePDFTest, load_tests, save_tests -from .utils import calculate_bootstrap_ci, perform_permutation_test +from .utils import calculate_bootstrap_ci def evaluate_candidate( @@ -186,16 +186,6 @@ def main(): default=0.95, help="Confidence level for interval calculation (default: 0.95 for 95% CI).", ) - parser.add_argument( - "--permutation_tests", - nargs="?", - const="default", - help=( - "Run permutation testing. If provided without candidate names, run default tests. " - "If provided with a comma-separated list of candidate names (e.g. --permutation_tests asdf,qwe,ert), " - "run permutation tests on all pairs of the specified candidates." - ), - ) # New arguments parser.add_argument("--sample", type=int, default=None, help="Randomly sample N tests to run instead of all tests.") parser.add_argument("--test_report", type=str, default=None, help="Generate an HTML report of test results. 
Provide a filename (e.g., results.html).") @@ -300,16 +290,16 @@ def main(): jsonl_results = {} jsonl_scores = [] # List to store scores by jsonl file for CI calculation jsonl_file_sizes = [] # List to store the number of tests per jsonl file - + for test in all_tests: # Get the jsonl file this test came from jsonl_file = test_to_jsonl.get(test.id, "unknown") - + if jsonl_file not in jsonl_results: jsonl_results[jsonl_file] = {"total": 0, "passed": 0, "scores": []} - + jsonl_results[jsonl_file]["total"] += 1 - + # Get the test result for this candidate if it exists if not candidate_errors and hasattr(test, "pdf") and hasattr(test, "page"): pdf_name = test.pdf @@ -323,13 +313,13 @@ def main(): if passed: jsonl_results[jsonl_file]["passed"] += 1 break - + # Gather all the scores by jsonl file for CI calculation for jsonl_file, results in jsonl_results.items(): if results["scores"]: jsonl_file_sizes.append(len(results["scores"])) jsonl_scores.extend(results["scores"]) - + # Calculate CI using the updated function with splits if jsonl_scores: ci = calculate_bootstrap_ci(jsonl_scores, n_bootstrap=n_bootstrap, ci_level=ci_level, splits=jsonl_file_sizes) @@ -350,7 +340,7 @@ def main(): if results["total"] > 0: pass_rate = results["passed"] / results["total"] jsonl_pass_rates.append(pass_rate) - + per_category_score = sum(jsonl_pass_rates) / len(jsonl_pass_rates) if jsonl_pass_rates else 0.0 print(f" Average Score: {per_category_score * 100:.1f}% (95% CI: [{ci[0] * 100:.1f}%, {ci[1] * 100:.1f}%]) over {total_tests} tests.") @@ -418,161 +408,6 @@ def main(): print(f" {jsonl_file:30s}: {pass_rate:0.1f}% ({results['passed']}/{results['total']} tests)") print("") - if args.permutation_tests is not None: - print("\n" + "=" * 60) - print("Pairwise Permutation Tests:") - valid_candidates = [c for c in summary if not c[3]] - if args.permutation_tests == "default": - olmocr_candidates = sorted([c for c in valid_candidates if "olmocr" in c[0].lower()], key=lambda x: x[1], reverse=True) - non_olmocr_candidates = sorted([c for c in valid_candidates if "olmocr" not in c[0].lower()], key=lambda x: x[1], reverse=True) - top_olmocr = olmocr_candidates[0] if olmocr_candidates else None - top_non_olmocr = non_olmocr_candidates[0] if non_olmocr_candidates else None - top_two_olmocr = olmocr_candidates[:2] - - if top_olmocr and top_non_olmocr: - olmocr_name, olmocr_score = top_olmocr[0], top_olmocr[1] - non_olmocr_name, non_olmocr_score = top_non_olmocr[0], top_non_olmocr[1] - # Extract file sizes and scores for both candidates - olmocr_jsonl_sizes = [] - non_olmocr_jsonl_sizes = [] - - # Extract jsonl file sizes for each candidate - for test in all_tests: - jsonl_file = test_to_jsonl.get(test.id, "unknown") - # Process for top_olmocr - if not top_olmocr[3] and hasattr(test, "pdf") and hasattr(test, "page"): - pdf_name = test.pdf - page = test.page - if pdf_name in test_results_by_candidate.get(top_olmocr[0], {}) and page in test_results_by_candidate[top_olmocr[0]].get(pdf_name, {}): - for t, _, _ in test_results_by_candidate[top_olmocr[0]][pdf_name][page]: - if t.id == test.id: - if jsonl_file not in olmocr_jsonl_sizes: - olmocr_jsonl_sizes.append(len([t for t in all_tests if test_to_jsonl.get(t.id, "") == jsonl_file])) - break - - # Process for top_non_olmocr - if not top_non_olmocr[3] and hasattr(test, "pdf") and hasattr(test, "page"): - pdf_name = test.pdf - page = test.page - if pdf_name in test_results_by_candidate.get(top_non_olmocr[0], {}) and page in test_results_by_candidate[top_non_olmocr[0]].get(pdf_name, 
{}): - for t, _, _ in test_results_by_candidate[top_non_olmocr[0]][pdf_name][page]: - if t.id == test.id: - if jsonl_file not in non_olmocr_jsonl_sizes: - non_olmocr_jsonl_sizes.append(len([t for t in all_tests if test_to_jsonl.get(t.id, "") == jsonl_file])) - break - - diff, p_value = perform_permutation_test( - top_olmocr[7], top_non_olmocr[7], - splits_a=olmocr_jsonl_sizes if olmocr_jsonl_sizes else None, - splits_b=non_olmocr_jsonl_sizes if non_olmocr_jsonl_sizes else None - ) - print("\nComparison 1: Top olmocr vs Top non-olmocr candidate") - print(f" {olmocr_name} ({olmocr_score*100:.1f}%) vs {non_olmocr_name} ({non_olmocr_score*100:.1f}%)") - print(f" Difference: {diff*100:.2f}% (positive means {olmocr_name} is better)") - print(f" p-value: {p_value:.4f}") - if p_value < 0.05: - print(" Result: Statistically significant difference (p < 0.05)") - else: - print(" Result: No statistically significant difference (p ≥ 0.05)") - else: - print("\nCannot perform olmocr vs non-olmocr comparison: Missing candidates") - - if len(top_two_olmocr) >= 2: - # Extract file sizes for each candidate - olmocr1_jsonl_sizes = [] - olmocr2_jsonl_sizes = [] - - # Extract jsonl file sizes for each candidate - for test in all_tests: - jsonl_file = test_to_jsonl.get(test.id, "unknown") - # Process for first olmocr candidate - if not top_two_olmocr[0][3] and hasattr(test, "pdf") and hasattr(test, "page"): - pdf_name = test.pdf - page = test.page - if pdf_name in test_results_by_candidate.get(top_two_olmocr[0][0], {}) and page in test_results_by_candidate[top_two_olmocr[0][0]].get(pdf_name, {}): - for t, _, _ in test_results_by_candidate[top_two_olmocr[0][0]][pdf_name][page]: - if t.id == test.id: - if jsonl_file not in olmocr1_jsonl_sizes: - olmocr1_jsonl_sizes.append(len([t for t in all_tests if test_to_jsonl.get(t.id, "") == jsonl_file])) - break - - # Process for second olmocr candidate - if not top_two_olmocr[1][3] and hasattr(test, "pdf") and hasattr(test, "page"): - pdf_name = test.pdf - page = test.page - if pdf_name in test_results_by_candidate.get(top_two_olmocr[1][0], {}) and page in test_results_by_candidate[top_two_olmocr[1][0]].get(pdf_name, {}): - for t, _, _ in test_results_by_candidate[top_two_olmocr[1][0]][pdf_name][page]: - if t.id == test.id: - if jsonl_file not in olmocr2_jsonl_sizes: - olmocr2_jsonl_sizes.append(len([t for t in all_tests if test_to_jsonl.get(t.id, "") == jsonl_file])) - break - - diff, p_value = perform_permutation_test( - top_two_olmocr[0][7], top_two_olmocr[1][7], - splits_a=olmocr1_jsonl_sizes if olmocr1_jsonl_sizes else None, - splits_b=olmocr2_jsonl_sizes if olmocr2_jsonl_sizes else None - ) - print("\nComparison 2: Top two olmocr candidates") - print(f" {top_two_olmocr[0][0]} ({top_two_olmocr[0][1]*100:.1f}%) vs {top_two_olmocr[1][0]} ({top_two_olmocr[1][1]*100:.1f}%)") - print(f" Difference: {diff*100:.2f}% (positive means {top_two_olmocr[0][0]} is better)") - print(f" p-value: {p_value:.4f}") - if p_value < 0.05: - print(" Result: Statistically significant difference (p < 0.05)") - else: - print(" Result: No statistically significant difference (p ≥ 0.05)") - else: - print("\nCannot perform top two olmocr comparison: Not enough olmocr candidates") - else: - candidate_names = [name.strip() for name in args.permutation_tests.split(",")] - selected_candidates = [c for c in valid_candidates if c[0] in candidate_names] - if len(selected_candidates) < 2: - print("\nNot enough valid candidates among the selected for permutation tests.") - else: - for cand1, cand2 in 
combinations(selected_candidates, 2): - # Extract file sizes for each candidate - cand1_jsonl_sizes = [] - cand2_jsonl_sizes = [] - - # Extract jsonl file sizes for each candidate - for test in all_tests: - jsonl_file = test_to_jsonl.get(test.id, "unknown") - # Process for first candidate - if not cand1[3] and hasattr(test, "pdf") and hasattr(test, "page"): - pdf_name = test.pdf - page = test.page - if pdf_name in test_results_by_candidate.get(cand1[0], {}) and page in test_results_by_candidate[cand1[0]].get(pdf_name, {}): - for t, _, _ in test_results_by_candidate[cand1[0]][pdf_name][page]: - if t.id == test.id: - if jsonl_file not in cand1_jsonl_sizes: - cand1_jsonl_sizes.append(len([t for t in all_tests if test_to_jsonl.get(t.id, "") == jsonl_file])) - break - - # Process for second candidate - if not cand2[3] and hasattr(test, "pdf") and hasattr(test, "page"): - pdf_name = test.pdf - page = test.page - if pdf_name in test_results_by_candidate.get(cand2[0], {}) and page in test_results_by_candidate[cand2[0]].get(pdf_name, {}): - for t, _, _ in test_results_by_candidate[cand2[0]][pdf_name][page]: - if t.id == test.id: - if jsonl_file not in cand2_jsonl_sizes: - cand2_jsonl_sizes.append(len([t for t in all_tests if test_to_jsonl.get(t.id, "") == jsonl_file])) - break - - diff, p_value = perform_permutation_test( - cand1[7], cand2[7], - splits_a=cand1_jsonl_sizes if cand1_jsonl_sizes else None, - splits_b=cand2_jsonl_sizes if cand2_jsonl_sizes else None - ) - print(f"\nComparison: {cand1[0]} vs {cand2[0]}") - print(f" {cand1[0]} ({cand1[1]*100:.1f}%) vs {cand2[0]} ({cand2[1]*100:.1f}%)") - print(f" Difference: {diff*100:.2f}% (positive means {cand1[0]} is better)") - print(f" p-value: {p_value:.4f}") - if p_value < 0.05: - print(" Result: Statistically significant difference (p < 0.05)") - else: - print(" Result: No statistically significant difference (p ≥ 0.05)") - print("=" * 60) - # Generate HTML report if requested if args.test_report: generate_html_report(test_results_by_candidate, pdf_folder, args.test_report) diff --git a/olmocr/bench/runners/run_claude.py b/olmocr/bench/runners/run_claude.py index d71f7c6..5ce3f1d 100644 --- a/olmocr/bench/runners/run_claude.py +++ b/olmocr/bench/runners/run_claude.py @@ -2,10 +2,7 @@ import json import os from anthropic import Anthropic -from prompts import ( - build_openai_silver_data_prompt, - claude_response_format_schema, -) +from prompts import build_openai_silver_data_prompt, claude_response_format_schema from olmocr.data.renderpdf import render_pdf_to_base64png from olmocr.prompts.anchor import get_anchor_text diff --git a/olmocr/bench/synth/mine_html_templates.py b/olmocr/bench/synth/mine_html_templates.py index 5c2a73c..5c464cc 100644 --- a/olmocr/bench/synth/mine_html_templates.py +++ b/olmocr/bench/synth/mine_html_templates.py @@ -17,11 +17,7 @@ from playwright.async_api import async_playwright from syntok.segmenter import process from tqdm import tqdm -from olmocr.bench.tests import ( - TableTest, - TestType, - parse_html_tables, -) +from olmocr.bench.tests import TableTest, TestType, parse_html_tables from olmocr.data.renderpdf import ( get_png_dimensions_from_base64, render_pdf_to_base64png, diff --git a/olmocr/bench/utils.py b/olmocr/bench/utils.py index af69857..5fe5f38 100644 --- a/olmocr/bench/utils.py +++ b/olmocr/bench/utils.py @@ -23,7 +23,7 @@ def calculate_bootstrap_ci(test_scores: List[float], n_bootstrap: int = 1000, ci # Convert to numpy array for efficiency scores = np.array(test_scores) - + # Simple case - no splits 
provided, use traditional bootstrap if splits is None: # Generate bootstrap samples @@ -36,14 +36,14 @@ def calculate_bootstrap_ci(test_scores: List[float], n_bootstrap: int = 1000, ci # Validate splits if sum(splits) != len(scores): raise ValueError(f"Sum of splits ({sum(splits)}) must equal length of test_scores ({len(scores)})") - + # Convert flat scores list to a list of category scores category_scores = [] start_idx = 0 for split_size in splits: - category_scores.append(scores[start_idx:start_idx + split_size]) + category_scores.append(scores[start_idx : start_idx + split_size]) start_idx += split_size - + # Generate bootstrap samples respecting category structure bootstrap_means = [] for _ in range(n_bootstrap): @@ -54,7 +54,7 @@ def calculate_bootstrap_ci(test_scores: List[float], n_bootstrap: int = 1000, ci # Sample with replacement within this category cat_sample = np.random.choice(cat_scores, size=len(cat_scores), replace=True) category_means.append(np.mean(cat_sample)) - + # Overall score is average of category means (if any categories have scores) if category_means: bootstrap_means.append(np.mean(category_means)) @@ -67,8 +67,9 @@ def calculate_bootstrap_ci(test_scores: List[float], n_bootstrap: int = 1000, ci return (lower_bound, upper_bound) -def perform_permutation_test(scores_a: List[float], scores_b: List[float], n_permutations: int = 10000, - splits_a: List[int] = None, splits_b: List[int] = None) -> Tuple[float, float]: +def perform_permutation_test( + scores_a: List[float], scores_b: List[float], n_permutations: int = 10000, splits_a: List[int] = None, splits_b: List[int] = None +) -> Tuple[float, float]: """ Perform a permutation test to determine if there's a significant difference between two sets of test scores. @@ -90,15 +91,15 @@ def perform_permutation_test(scores_a: List[float], scores_b: List[float], n_per def mean_of_category_means(scores, splits=None): if splits is None: return np.mean(scores) - + category_means = [] start_idx = 0 for split_size in splits: if split_size > 0: - category_scores = scores[start_idx:start_idx + split_size] + category_scores = scores[start_idx : start_idx + split_size] category_means.append(np.mean(category_scores)) start_idx += split_size - + return np.mean(category_means) if category_means else 0.0 # Calculate observed difference in means using category structure if provided @@ -135,54 +136,54 @@ def perform_permutation_test(scores_a: List[float], scores_b: List[float], n_per raise ValueError(f"Sum of splits_a ({sum(splits_a)}) must equal length of scores_a ({len(scores_a)})") if splits_b is not None and sum(splits_b) != len(scores_b): raise ValueError(f"Sum of splits_b ({sum(splits_b)}) must equal length of scores_b ({len(scores_b)})") - + # Create category structures categories_a = [] categories_b = [] - + if splits_a is not None: start_idx = 0 for split_size in splits_a: - categories_a.append(scores_a[start_idx:start_idx + split_size]) + categories_a.append(scores_a[start_idx : start_idx + split_size]) start_idx += split_size else: # If no splits for A, treat all scores as one category categories_a = [scores_a] - + if splits_b is not None: start_idx = 0 for split_size in splits_b: - categories_b.append(scores_b[start_idx:start_idx + split_size]) + categories_b.append(scores_b[start_idx : start_idx + split_size]) start_idx += split_size else: # If no splits for B, treat all scores as one category categories_b = [scores_b] - + # Perform permutation test maintaining category structure count_greater_or_equal = 0 for _ in 
range(n_permutations): # For each category pair, shuffle and redistribute perm_categories_a = [] perm_categories_b = [] - + for cat_a, cat_b in zip(categories_a, categories_b): # Combine and shuffle combined = np.concatenate([cat_a, cat_b]) np.random.shuffle(combined) - + # Redistribute maintaining original sizes - perm_categories_a.append(combined[:len(cat_a)]) - perm_categories_b.append(combined[len(cat_a):]) - + perm_categories_a.append(combined[: len(cat_a)]) + perm_categories_b.append(combined[len(cat_a) :]) + # Flatten permuted categories perm_a = np.concatenate(perm_categories_a) perm_b = np.concatenate(perm_categories_b) - + # Calculate difference in means respecting category structure perm_mean_a = mean_of_category_means(perm_a, splits_a) perm_mean_b = mean_of_category_means(perm_b, splits_b) perm_diff = perm_mean_a - perm_mean_b - + # Count how many permuted differences are >= to observed difference in absolute value if abs(perm_diff) >= abs(observed_diff): count_greater_or_equal += 1 diff --git a/olmocr/eval/dolma_refine/metrics.py b/olmocr/eval/dolma_refine/metrics.py index f35036b..e5a6cd0 100644 --- a/olmocr/eval/dolma_refine/metrics.py +++ b/olmocr/eval/dolma_refine/metrics.py @@ -208,7 +208,6 @@ class ParagraphEditSimilarity(DocumentEditSimilarity): self.sent_window = sent_window def segment(self, seq_a_tokens: list[str], seq_b_tokens: list[str]) -> list[tuple[list[str], list[str]]]: - all_spans = [] for seq_tokens in (seq_a_tokens, seq_b_tokens): diff --git a/olmocr/eval/dolma_refine/registry.py b/olmocr/eval/dolma_refine/registry.py index b8a87a7..a08903d 100644 --- a/olmocr/eval/dolma_refine/registry.py +++ b/olmocr/eval/dolma_refine/registry.py @@ -91,15 +91,18 @@ class BaseRegistry(Generic[T]): @overload @classmethod - def get(cls, name: str) -> T: ... + def get(cls, name: str) -> T: + ... @overload @classmethod - def get(cls, name: str, raise_on_missing: Literal[True]) -> T: ... + def get(cls, name: str, raise_on_missing: Literal[True]) -> T: + ... @overload @classmethod - def get(cls, name: str, raise_on_missing: Literal[False]) -> Optional[T]: ... + def get(cls, name: str, raise_on_missing: Literal[False]) -> Optional[T]: + ... @classmethod def get(cls, name: str, raise_on_missing: bool = True) -> Optional[T]: diff --git a/olmocr/train/core/errors.py b/olmocr/train/core/errors.py index a24dbe0..afe3e4c 100644 --- a/olmocr/train/core/errors.py +++ b/olmocr/train/core/errors.py @@ -1 +1,2 @@ -class DolmaRefineError(RuntimeError): ... +class DolmaRefineError(RuntimeError): + ... diff --git a/olmocr/version.py b/olmocr/version.py index b48bbb3..962f10d 100644 --- a/olmocr/version.py +++ b/olmocr/version.py @@ -2,7 +2,7 @@ _MAJOR = "0" _MINOR = "1" # On main and in a nightly release the patch should be one ahead of the last # released build. -_PATCH = "67" +_PATCH = "68" # This is mainly for nightly builds which have the suffix ".dev$DATE". See # https://semver.org/#is-v123-a-semantic-version for the semantics. _SUFFIX = "" diff --git a/scripts/jsonl_to_markdown.py b/scripts/jsonl_to_markdown.py index d38f22c..ca488f1 100644 --- a/scripts/jsonl_to_markdown.py +++ b/scripts/jsonl_to_markdown.py @@ -46,7 +46,6 @@ def jsonl_to_markdown(input_file, output_dir): # It takes two arguments: the input JSONL file and the output directory. # The script will create the output directory if it does not exist. 
if __name__ == "__main__": - if len(sys.argv) != 3: print("Usage: python jsonl_to_markdown.py ") sys.exit(1) diff --git a/scripts/pareto_plot.py b/scripts/pareto_plot.py index c702b1b..6757128 100644 --- a/scripts/pareto_plot.py +++ b/scripts/pareto_plot.py @@ -60,7 +60,7 @@ GREEN = "#0fcb8c" data = { MODEL_COLUMN_NAME: [ "GPT-4o", - "GPT-4o (Batch)", + "GPT-4o (Batch)", "Mistral OCR", "MinerU", "Gemini Flash 2", @@ -71,7 +71,7 @@ data = { "Qwen 2 VL (A100)", "Qwen 2 VL (H100,L40S)", "Qwen 2.5 VL (A100)", - "Qwen 2.5 VL (H100,L40S)" + "Qwen 2.5 VL (H100,L40S)", ], COST_COLUMN_NAME: [ 12480, @@ -86,7 +86,7 @@ data = { 270, # Same cost as Ours 190, # Same cost as Ours 270, # Same cost as Ours - 190 # Same cost as Ours + 190, # Same cost as Ours ], PERF_COLUMN_NAME: [ 69.9, # GPT-4o (Anchored) @@ -101,8 +101,8 @@ data = { 31.5, # Qwen2VL 31.5, # Qwen2VL 65.5, # Qwen2.5VL - 65.5 # Qwen2.5VL - ] + 65.5, # Qwen2.5VL + ], } df = pd.DataFrame(data) @@ -121,41 +121,23 @@ model_categories = { "Qwen 2 VL (A100)": "Open VLM", "Qwen 2 VL (H100,L40S)": "Open VLM", "Qwen 2.5 VL (A100)": "Open VLM", - "Qwen 2.5 VL (H100,L40S)": "Open VLM" + "Qwen 2.5 VL (H100,L40S)": "Open VLM", } df[CATEGORY_COLUMN_NAME] = df[MODEL_COLUMN_NAME].map(model_categories) # Category colors -category_colors = { - "Commercial API Tool": DARK_BLUE, - "Commercial VLM": DARK_GREEN, - "Open Source Tool": LIGHT_GREEN, - "Ours": DARK_PINK, - "Open VLM": PURPLE -} +category_colors = {"Commercial API Tool": DARK_BLUE, "Commercial VLM": DARK_GREEN, "Open Source Tool": LIGHT_GREEN, "Ours": DARK_PINK, "Open VLM": PURPLE} df[COLOR_COLUMN_NAME] = df[CATEGORY_COLUMN_NAME].map(category_colors) # Define marker types -category_markers = { - "Commercial API Tool": "o", - "Commercial VLM": "D", - "Open Source Tool": "s", - "Ours": "*", - "Open VLM": "^" -} +category_markers = {"Commercial API Tool": "o", "Commercial VLM": "D", "Open Source Tool": "s", "Ours": "*", "Open VLM": "^"} df[MARKER_COLUMN_NAME] = df[CATEGORY_COLUMN_NAME].map(category_markers) # Define marker sizes - increased sizes -category_marker_sizes = { - "Commercial API Tool": 120, - "Commercial VLM": 120, - "Open Source Tool": 140, - "Ours": 300, - "Open VLM": 140 -} +category_marker_sizes = {"Commercial API Tool": 120, "Commercial VLM": 120, "Open Source Tool": 140, "Ours": 300, "Open VLM": 140} # Define text colors category_text_colors = { @@ -163,7 +145,7 @@ category_text_colors = { "Commercial VLM": DARK_GREEN, "Open Source Tool": DARK_TEAL, "Ours": "#a51c5c", # darker pink - "Open VLM": "#6f1188" # darker purple + "Open VLM": "#6f1188", # darker purple } # Label offsets for better readability @@ -180,7 +162,7 @@ model_label_offsets = { "Qwen 2 VL (A100)": [-20, 10], "Qwen 2 VL (H100,L40S)": [-60, 25], "Qwen 2.5 VL (A100)": [-20, 10], - "Qwen 2.5 VL (H100,L40S)": [-60, 25] + "Qwen 2.5 VL (H100,L40S)": [-60, 25], } df[OFFSET_COLUMN_NAME] = df[MODEL_COLUMN_NAME].map(model_label_offsets) @@ -218,18 +200,24 @@ for idx, row in df.iterrows(): ) # Set up axes -plt.ylim(25, 85) # Set y-axis limits from 25 to 85 to include Qwen2VL +plt.ylim(25, 85) # Set y-axis limits from 25 to 85 to include Qwen2VL plt.xlim(100, 15000) -plt.xscale('log') # Use log scale for cost +plt.xscale("log") # Use log scale for cost plt.grid(True, which="both", ls=":", color=TEAL, alpha=0.2) + # Format y-axis to show percentages without scientific notation def percent_formatter(y, pos): - return f'{y:.1f}%' + return f"{y:.1f}%" + + plt.gca().yaxis.set_major_formatter(ticker.FuncFormatter(percent_formatter)) + + 
# Format x-axis to show dollar amounts def dollar_formatter(x, pos): - return f'${x:,.0f}' + return f"${x:,.0f}" + # Set specific x-axis ticks with increased font size plt.gca().xaxis.set_major_formatter(ticker.FuncFormatter(dollar_formatter)) @@ -243,8 +231,8 @@ plt.ylabel("Overall Performance (Pass Rate %)", fontsize=16, weight="medium") # plt.title("OCR Engines: Performance vs. Cost", fontsize=12, weight="medium") # Remove spines -plt.gca().spines['top'].set_visible(False) -plt.gca().spines['right'].set_visible(False) +plt.gca().spines["top"].set_visible(False) +plt.gca().spines["right"].set_visible(False) # Add the legend with custom ordering and increased font size handles, labels = plt.gca().get_legend_handles_labels() @@ -254,14 +242,7 @@ ordered_handles = [label_to_handle[label] for label in desired_order if label in ordered_labels = [label for label in desired_order if label in labels] plt.legend( - ordered_handles, - ordered_labels, - loc="lower right", - fontsize=12, # Increased from 10 - frameon=True, - framealpha=0.9, - edgecolor=TEAL, - facecolor="white" + ordered_handles, ordered_labels, loc="lower right", fontsize=12, frameon=True, framealpha=0.9, edgecolor=TEAL, facecolor="white" # Increased from 10 ) # Adjust layout @@ -271,4 +252,4 @@ plt.tight_layout() for output_path in OUTPUT_PATHS: plt.savefig(output_path, dpi=300, bbox_inches="tight") -print(f"Plot saved to {', '.join(OUTPUT_PATHS)}") \ No newline at end of file +print(f"Plot saved to {', '.join(OUTPUT_PATHS)}") diff --git a/scripts/rich_tagging_pipeline.py b/scripts/rich_tagging_pipeline.py index f31866e..e44724f 100644 --- a/scripts/rich_tagging_pipeline.py +++ b/scripts/rich_tagging_pipeline.py @@ -26,9 +26,7 @@ import zstandard as zstd from huggingface_hub import snapshot_download from pydantic import BaseModel, Field, ValidationError -from olmocr.check import ( - check_torch_gpu_available, -) +from olmocr.check import check_torch_gpu_available from olmocr.metrics import MetricsKeeper from olmocr.s3_utils import ( download_directory, diff --git a/scripts/tagging_pipeline.py b/scripts/tagging_pipeline.py index 7e09abd..de9e02c 100644 --- a/scripts/tagging_pipeline.py +++ b/scripts/tagging_pipeline.py @@ -27,9 +27,7 @@ import zstandard as zstd from huggingface_hub import snapshot_download from pydantic import BaseModel, Field, ValidationError -from olmocr.check import ( - check_torch_gpu_available, -) +from olmocr.check import check_torch_gpu_available from olmocr.metrics import MetricsKeeper from olmocr.s3_utils import ( download_directory, diff --git a/scripts/tagging_pipeline_v2.py b/scripts/tagging_pipeline_v2.py index e46bf93..ffcd2e2 100644 --- a/scripts/tagging_pipeline_v2.py +++ b/scripts/tagging_pipeline_v2.py @@ -26,9 +26,7 @@ import zstandard as zstd from huggingface_hub import snapshot_download from pydantic import BaseModel, Field, ValidationError -from olmocr.check import ( - check_torch_gpu_available, -) +from olmocr.check import check_torch_gpu_available from olmocr.metrics import MetricsKeeper from olmocr.s3_utils import ( download_directory, diff --git a/tests/test_integration.py b/tests/test_integration.py index cb5cddb..520429f 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -19,7 +19,7 @@ class TestPipelineIntegration(unittest.TestCase): print(self.data[-1]) def test_edgar(self) -> None: - self.assertTrue(any("King of England" in line["text"] for line in self.data)) + self.assertTrue(any("King of the English" in line["text"] for line in self.data)) def 
test_ambig(self) -> None:
         self.assertTrue(any("Apples and Bananas" in line["text"] for line in self.data))
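
The confidence intervals that remain after this change come from calculate_bootstrap_ci in olmocr/bench/utils.py: when given splits, it resamples scores within each source JSONL file and takes the mean of per-category means, which matches how benchmark.py reports its average score and why it passes jsonl_file_sizes as splits. The standalone Python sketch below mirrors that logic; the function name stratified_bootstrap_ci and the sample scores/splits are illustrative only and are not part of the repository.

import numpy as np

def stratified_bootstrap_ci(scores, splits, n_bootstrap=1000, ci_level=0.95):
    """Bootstrap CI where the statistic is the mean of per-category means.

    splits gives the number of consecutive scores belonging to each category
    (here, each source JSONL file); the sizes must sum to len(scores).
    """
    scores = np.asarray(scores, dtype=float)
    assert sum(splits) == len(scores)

    # Slice the flat score list into per-category arrays.
    categories, start = [], 0
    for size in splits:
        categories.append(scores[start : start + size])
        start += size

    bootstrap_means = []
    for _ in range(n_bootstrap):
        # Resample with replacement within each category, then average the
        # category means so every category is weighted equally.
        cat_means = [np.mean(np.random.choice(c, size=len(c), replace=True)) for c in categories if len(c) > 0]
        bootstrap_means.append(np.mean(cat_means))

    alpha = (1 - ci_level) / 2
    return (np.percentile(bootstrap_means, alpha * 100), np.percentile(bootstrap_means, (1 - alpha) * 100))

# Illustrative call: three JSONL files contributing 2, 3, and 2 test scores.
scores = [1.0, 0.0, 1.0, 1.0, 0.0, 0.5, 1.0]
print(stratified_bootstrap_ci(scores, splits=[2, 3, 2]))

Because the statistic averages per-file means rather than pooling all scores, a file with few tests influences the interval as much as a file with many, which is the behavior the per-category pass rates in the benchmark output are meant to reflect.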