olmocr/scripts/elo/draw_boxplots.py

"""

Boxplots of Elo ratings with 95% confidence intervals for each method.

Invocation:
    python draw_boxplots.py results.txt boxplots.png

@kylel

"""

import hashlib
import re
from pathlib import Path

import click
import matplotlib.font_manager as font_manager
import matplotlib.pyplot as plt
import numpy as np
import requests

# AI2 Colors
AI2_PINK = "#f0529c"
AI2_DARK_TEAL = "#0a3235"
AI2_TEAL = "#105257"

# Name mappings
NAME_DISPLAY_MAP = {"pdelf": "olmOCR", "mineru": "MinerU", "marker": "Marker", "gotocr_format": "GOTOCR"}


def download_and_cache_file(url, cache_dir=None):
    """Download a file and cache it locally."""
    if cache_dir is None:
        cache_dir = Path.home() / ".cache" / "elo_plot"

    cache_dir = Path(cache_dir)
    cache_dir.mkdir(parents=True, exist_ok=True)

    # Create filename from URL hash
    url_hash = hashlib.sha256(url.encode()).hexdigest()[:12]
    file_name = url.split("/")[-1]
    cached_path = cache_dir / f"{url_hash}_{file_name}"

    if not cached_path.exists():
        response = requests.get(url, stream=True)
        response.raise_for_status()

        with open(cached_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

    return str(cached_path)


def parse_elo_data(file_path):
    """Parse Elo ratings data from a text file."""
    with open(file_path, "r") as f:
        content = f.read()

    # Regular expression to match the data lines
    pattern = r"(\w+)\s+(\d+\.\d+)\s*±\s*(\d+\.\d+)\s*\[(\d+\.\d+),\s*(\d+\.\d+)\]"
    matches = re.finditer(pattern, content)

    # Initialize lists to store data
    names = []
    medians = []
    errors = []
    ci_low = []
    ci_high = []

    for match in matches:
        names.append(match.group(1))
        medians.append(float(match.group(2)))
        errors.append(float(match.group(3)))
        ci_low.append(float(match.group(4)))
        ci_high.append(float(match.group(5)))

    return names, medians, errors, ci_low, ci_high


def create_boxplot(names, medians, errors, ci_low, ci_high, output_path, font_path):
    """Create and save a boxplot of Elo ratings."""
    # Set up Manrope font
    font_manager.fontManager.addfont(font_path)
    plt.rcParams["font.family"] = "Manrope"
    plt.rcParams["font.weight"] = "medium"

    # Define colors - pdelf in pink, others in shades of teal/grey based on performance
    max_median = max(medians)
    colors = []
    for i, median in enumerate(medians):
        if names[i] == "pdelf":
            colors.append(AI2_PINK)
        else:
            # Calculate a shade between dark teal and grey based on performance
            performance_ratio = (median - min(medians)) / (max_median - min(medians))
            base_color = np.array(tuple(int(AI2_DARK_TEAL[i : i + 2], 16) for i in (1, 3, 5))) / 255.0
            grey = np.array([0.7, 0.7, 0.7])  # Light grey
            color = tuple(np.clip(base_color * performance_ratio + grey * (1 - performance_ratio), 0, 1))
            colors.append(color)

    # Create box plot data
    box_data = []
    for i in range(len(names)):
        q1 = medians[i] - errors[i]
        q3 = medians[i] + errors[i]
        box_data.append([ci_low[i], q1, medians[i], q3, ci_high[i]])

    # Create box plot with smaller width and spacing
    plt.figure(figsize=(4, 4))
    bp = plt.boxplot(
        box_data,
        labels=[NAME_DISPLAY_MAP[name] for name in names],
        whis=1.5,
        patch_artist=True,
        widths=0.15,  # Make boxes much narrower
        medianprops=dict(color="black"),  # Make median line black
        positions=np.arange(len(names)) * 0.25,
    )  # Reduce spacing between boxes significantly

    # Color each box
    for patch, color in zip(bp["boxes"], colors):
        patch.set_facecolor(color)
        patch.set_alpha(0.8)

    # Style the plot
    # plt.ylabel("Elo Rating", fontsize=12, color=AI2_DARK_TEAL)
    plt.xticks(
        np.arange(len(names)) * 0.25,  # Match positions from boxplot
        [NAME_DISPLAY_MAP[name] for name in names],
        rotation=45,
        ha="right",
        color=AI2_DARK_TEAL,
    )
    plt.yticks(color=AI2_DARK_TEAL)

    # Set x-axis limits to maintain proper spacing
    plt.xlim(-0.1, (len(names) - 1) * 0.25 + 0.1)

    # Remove the title and adjust the layout
    plt.tight_layout()

    # Remove spines
    for spine in plt.gca().spines.values():
        spine.set_visible(False)

    # Add left spine only
    plt.gca().spines["left"].set_visible(True)
    plt.gca().spines["left"].set_color(AI2_DARK_TEAL)
    plt.gca().spines["left"].set_linewidth(0.5)

    # Add bottom spine only
    plt.gca().spines["bottom"].set_visible(True)
    plt.gca().spines["bottom"].set_color(AI2_DARK_TEAL)
    plt.gca().spines["bottom"].set_linewidth(0.5)

    plt.savefig(output_path, dpi=300, bbox_inches="tight", transparent=True)
    plt.close()


@click.command()
@click.argument("input_file", type=click.Path(exists=True))
@click.argument("output_file", type=click.Path())
@click.option(
    "--manrope-medium-font-path",
    type=str,
    default="https://dolma-artifacts.org/Manrope-Medium.ttf",
    help="Path to the Manrope Medium font file (local path or URL)",
)
def main(input_file, output_file, manrope_medium_font_path):
    """Generate a boxplot from Elo ratings data.

    INPUT_FILE: Path to the text file containing Elo ratings data
    OUTPUT_FILE: Path where the plot should be saved
    """
    try:
        # Handle font path - download and cache if it's a URL
        if manrope_medium_font_path.startswith(("http://", "https://")):
            font_path = download_and_cache_file(manrope_medium_font_path)
        else:
            font_path = manrope_medium_font_path

        # Parse the data
        names, medians, errors, ci_low, ci_high = parse_elo_data(input_file)

        # Create and save the plot
        create_boxplot(names, medians, errors, ci_low, ci_high, output_file, font_path)
        click.echo(f"Plot successfully saved to {output_file}")

    except Exception as e:
        click.echo(f"Error: {str(e)}", err=True)
        raise click.Abort()


if __name__ == "__main__":
    main()