mirror of
				https://github.com/allenai/olmocr.git
				synced 2025-10-30 17:39:33 +00:00 
			
		
		
		
	
		
			
	
	
		
			197 lines
		
	
	
		
			6.0 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
		
		
			
		
	
	
			197 lines
		
	
	
		
			6.0 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
|   | """
 | ||
|  | 
 | ||
|  | Boxplots of Elo ratings with 95% confidence intervals for each method. | ||
|  | 
 | ||
|  | Invocation: | ||
|  |     python draw_boxplots.py results.txt boxplots.png | ||
|  | 
 | ||
|  | @kylel | ||
|  | 
 | ||
|  | """
 | ||
|  | 
 | ||
|  | import hashlib | ||
|  | import re | ||
|  | from pathlib import Path | ||
|  | 
 | ||
|  | import click | ||
|  | import matplotlib.font_manager as font_manager | ||
|  | import matplotlib.pyplot as plt | ||
|  | import numpy as np | ||
|  | import requests | ||
|  | 
 | ||
|  | # AI2 Colors | ||
|  | AI2_PINK = "#f0529c" | ||
|  | AI2_DARK_TEAL = "#0a3235" | ||
|  | AI2_TEAL = "#105257" | ||
|  | 
 | ||
|  | # Name mappings | ||
|  | NAME_DISPLAY_MAP = {"pdelf": "olmOCR", "mineru": "MinerU", "marker": "Marker", "gotocr_format": "GOTOCR"} | ||
|  | 
 | ||
|  | 
 | ||
|  | def download_and_cache_file(url, cache_dir=None): | ||
|  |     """Download a file and cache it locally.""" | ||
|  |     if cache_dir is None: | ||
|  |         cache_dir = Path.home() / ".cache" / "elo_plot" | ||
|  | 
 | ||
|  |     cache_dir = Path(cache_dir) | ||
|  |     cache_dir.mkdir(parents=True, exist_ok=True) | ||
|  | 
 | ||
|  |     # Create filename from URL hash | ||
|  |     url_hash = hashlib.sha256(url.encode()).hexdigest()[:12] | ||
|  |     file_name = url.split("/")[-1] | ||
|  |     cached_path = cache_dir / f"{url_hash}_{file_name}" | ||
|  | 
 | ||
|  |     if not cached_path.exists(): | ||
|  |         response = requests.get(url, stream=True) | ||
|  |         response.raise_for_status() | ||
|  | 
 | ||
|  |         with open(cached_path, "wb") as f: | ||
|  |             for chunk in response.iter_content(chunk_size=8192): | ||
|  |                 f.write(chunk) | ||
|  | 
 | ||
|  |     return str(cached_path) | ||
|  | 
 | ||
|  | 
 | ||
|  | def parse_elo_data(file_path): | ||
|  |     """Parse Elo ratings data from a text file.""" | ||
|  |     with open(file_path, "r") as f: | ||
|  |         content = f.read() | ||
|  | 
 | ||
|  |     # Regular expression to match the data lines | ||
|  |     pattern = r"(\w+)\s+(\d+\.\d+)\s*±\s*(\d+\.\d+)\s*\[(\d+\.\d+),\s*(\d+\.\d+)\]" | ||
|  |     matches = re.finditer(pattern, content) | ||
|  | 
 | ||
|  |     # Initialize lists to store data | ||
|  |     names = [] | ||
|  |     medians = [] | ||
|  |     errors = [] | ||
|  |     ci_low = [] | ||
|  |     ci_high = [] | ||
|  | 
 | ||
|  |     for match in matches: | ||
|  |         names.append(match.group(1)) | ||
|  |         medians.append(float(match.group(2))) | ||
|  |         errors.append(float(match.group(3))) | ||
|  |         ci_low.append(float(match.group(4))) | ||
|  |         ci_high.append(float(match.group(5))) | ||
|  | 
 | ||
|  |     return names, medians, errors, ci_low, ci_high | ||
|  | 
 | ||
|  | 
 | ||
|  | def create_boxplot(names, medians, errors, ci_low, ci_high, output_path, font_path): | ||
|  |     """Create and save a boxplot of Elo ratings.""" | ||
|  |     # Set up Manrope font | ||
|  |     font_manager.fontManager.addfont(font_path) | ||
|  |     plt.rcParams["font.family"] = "Manrope" | ||
|  |     plt.rcParams["font.weight"] = "medium" | ||
|  | 
 | ||
|  |     # Define colors - pdelf in pink, others in shades of teal/grey based on performance | ||
|  |     max_median = max(medians) | ||
|  |     colors = [] | ||
|  |     for i, median in enumerate(medians): | ||
|  |         if names[i] == "pdelf": | ||
|  |             colors.append(AI2_PINK) | ||
|  |         else: | ||
|  |             # Calculate a shade between dark teal and grey based on performance | ||
|  |             performance_ratio = (median - min(medians)) / (max_median - min(medians)) | ||
|  |             base_color = np.array(tuple(int(AI2_DARK_TEAL[i : i + 2], 16) for i in (1, 3, 5))) / 255.0 | ||
|  |             grey = np.array([0.7, 0.7, 0.7])  # Light grey | ||
|  |             color = tuple(np.clip(base_color * performance_ratio + grey * (1 - performance_ratio), 0, 1)) | ||
|  |             colors.append(color) | ||
|  | 
 | ||
|  |     # Create box plot data | ||
|  |     box_data = [] | ||
|  |     for i in range(len(names)): | ||
|  |         q1 = medians[i] - errors[i] | ||
|  |         q3 = medians[i] + errors[i] | ||
|  |         box_data.append([ci_low[i], q1, medians[i], q3, ci_high[i]]) | ||
|  | 
 | ||
|  |     # Create box plot with smaller width and spacing | ||
|  |     plt.figure(figsize=(4, 4)) | ||
|  |     bp = plt.boxplot( | ||
|  |         box_data, | ||
|  |         labels=[NAME_DISPLAY_MAP[name] for name in names], | ||
|  |         whis=1.5, | ||
|  |         patch_artist=True, | ||
|  |         widths=0.15,  # Make boxes much narrower | ||
|  |         medianprops=dict(color="black"),  # Make median line black | ||
|  |         positions=np.arange(len(names)) * 0.25, | ||
|  |     )  # Reduce spacing between boxes significantly | ||
|  | 
 | ||
|  |     # Color each box | ||
|  |     for patch, color in zip(bp["boxes"], colors): | ||
|  |         patch.set_facecolor(color) | ||
|  |         patch.set_alpha(0.8) | ||
|  | 
 | ||
|  |     # Style the plot | ||
|  |     # plt.ylabel("Elo Rating", fontsize=12, color=AI2_DARK_TEAL) | ||
|  |     plt.xticks( | ||
|  |         np.arange(len(names)) * 0.25,  # Match positions from boxplot | ||
|  |         [NAME_DISPLAY_MAP[name] for name in names], | ||
|  |         rotation=45, | ||
|  |         ha="right", | ||
|  |         color=AI2_DARK_TEAL, | ||
|  |     ) | ||
|  |     plt.yticks(color=AI2_DARK_TEAL) | ||
|  | 
 | ||
|  |     # Set x-axis limits to maintain proper spacing | ||
|  |     plt.xlim(-0.1, (len(names) - 1) * 0.25 + 0.1) | ||
|  | 
 | ||
|  |     # Remove the title and adjust the layout | ||
|  |     plt.tight_layout() | ||
|  | 
 | ||
|  |     # Remove spines | ||
|  |     for spine in plt.gca().spines.values(): | ||
|  |         spine.set_visible(False) | ||
|  | 
 | ||
|  |     # Add left spine only | ||
|  |     plt.gca().spines["left"].set_visible(True) | ||
|  |     plt.gca().spines["left"].set_color(AI2_DARK_TEAL) | ||
|  |     plt.gca().spines["left"].set_linewidth(0.5) | ||
|  | 
 | ||
|  |     # Add bottom spine only | ||
|  |     plt.gca().spines["bottom"].set_visible(True) | ||
|  |     plt.gca().spines["bottom"].set_color(AI2_DARK_TEAL) | ||
|  |     plt.gca().spines["bottom"].set_linewidth(0.5) | ||
|  | 
 | ||
|  |     plt.savefig(output_path, dpi=300, bbox_inches="tight", transparent=True) | ||
|  |     plt.close() | ||
|  | 
 | ||
|  | 
 | ||
|  | @click.command() | ||
|  | @click.argument("input_file", type=click.Path(exists=True)) | ||
|  | @click.argument("output_file", type=click.Path()) | ||
|  | @click.option( | ||
|  |     "--manrope-medium-font-path", | ||
|  |     type=str, | ||
|  |     default="https://dolma-artifacts.org/Manrope-Medium.ttf", | ||
|  |     help="Path to the Manrope Medium font file (local path or URL)", | ||
|  | ) | ||
|  | def main(input_file, output_file, manrope_medium_font_path): | ||
|  |     """Generate a boxplot from Elo ratings data.
 | ||
|  | 
 | ||
|  |     INPUT_FILE: Path to the text file containing Elo ratings data | ||
|  |     OUTPUT_FILE: Path where the plot should be saved | ||
|  |     """
 | ||
|  |     try: | ||
|  |         # Handle font path - download and cache if it's a URL | ||
|  |         if manrope_medium_font_path.startswith(("http://", "https://")): | ||
|  |             font_path = download_and_cache_file(manrope_medium_font_path) | ||
|  |         else: | ||
|  |             font_path = manrope_medium_font_path | ||
|  | 
 | ||
|  |         # Parse the data | ||
|  |         names, medians, errors, ci_low, ci_high = parse_elo_data(input_file) | ||
|  | 
 | ||
|  |         # Create and save the plot | ||
|  |         create_boxplot(names, medians, errors, ci_low, ci_high, output_file, font_path) | ||
|  |         click.echo(f"Plot successfully saved to {output_file}") | ||
|  | 
 | ||
|  |     except Exception as e: | ||
|  |         click.echo(f"Error: {str(e)}", err=True) | ||
|  |         raise click.Abort() | ||
|  | 
 | ||
|  | 
 | ||
|  | if __name__ == "__main__": | ||
|  |     main() |