diff --git a/olmocr/bench/miners/pick_mediod.py b/olmocr/bench/miners/pick_mediod.py
new file mode 100755
index 0000000..a3e8770
--- /dev/null
+++ b/olmocr/bench/miners/pick_mediod.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python3
+"""
+pick_mediod.py - Identify representative examples from repeated OCR outputs
+
+This code will take as arguments two directories:
+--input and --output
+Each of those is going to be a directory that was generated by convert.py and is a candidate to be evaluated as part of benchmark.py
+What it will do is find and group all of the .md files into their repeats
+ex. input_dir/tables/buildingnotes_pg1_repeat1.md, input_dir/tables/buildingnotes_pg1_repeat2.md, etc.
+Then, for each repeat, it will use string similarity metrics to calculate the edit distance to every other repeat
+The repeat with the lowest mean edit distance will then get output as ..._repeat1.md in the output folder
+"""
+
+import argparse
+import glob
+import os
+import re
+import shutil
+from typing import Dict, List
+
+from rapidfuzz import distance as fuzz_distance
+from tqdm import tqdm
+
+
+def compute_distance(text1: str, text2: str) -> float:
+    """
+    Compute the edit distance between two text strings using rapidfuzz.
+
+    Returns a normalized Levenshtein distance between 0.0 (identical)
+    and 1.0 (completely different).
+    """
+    return fuzz_distance.Levenshtein.normalized_distance(text1, text2)
+
+
+def find_mediod(texts: List[str]) -> int:
+    """
+    Find the index of the mediod (medoid) from a list of texts.
+
+    The mediod is the text with the minimum average distance to all other
+    texts. Returns -1 for an empty list and 0 for a single-element list.
+    """
+    if not texts:
+        return -1
+    if len(texts) == 1:
+        return 0
+
+    # Pairwise distances are symmetric, so only the upper triangle is
+    # computed and then mirrored.
+    n = len(texts)
+    distances = [[0.0] * n for _ in range(n)]
+    for i in range(n):
+        for j in range(i + 1, n):
+            dist = compute_distance(texts[i], texts[j])
+            distances[i][j] = dist
+            distances[j][i] = dist
+
+    # Average distance of each text to the others; the self-distance is 0,
+    # so dividing by n - 1 excludes it from the mean.
+    avg_distances = [sum(row) / (n - 1) for row in distances]
+    return avg_distances.index(min(avg_distances))
+
+
+def group_repeats(md_files: List[str]) -> Dict[str, List[str]]:
+    """
+    Group MD files by their base name (without the _repeatN suffix).
+
+    Returns a dictionary mapping base names to lists of file paths.
+    """
+    grouped: Dict[str, List[str]] = {}
+    for md_path in md_files:
+        base_name = re.sub(r"_repeat\d+\.md$", "", os.path.basename(md_path))
+        grouped.setdefault(base_name, []).append(md_path)
+    return grouped
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Find mediod (most representative) examples from repeated OCR outputs.")
+    parser.add_argument(
+        "--input", type=str, required=True, help="Path to the directory containing repeated OCR outputs (e.g., *_repeat1.md, *_repeat2.md, etc.)"
+    )
+    parser.add_argument("--output", type=str, required=True, help="Path to the directory where mediod examples will be copied")
+    parser.add_argument("--min_repeats", type=int, default=3, help="Minimum number of repeats required to compute a mediod (default: 3)")
+    args = parser.parse_args()
+
+    input_dir = args.input
+    output_dir = args.output
+    min_repeats = args.min_repeats
+
+    # Create output directory if it doesn't exist
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Find all markdown files in the input directory (recursive)
+    md_files = glob.glob(os.path.join(input_dir, "**/*.md"), recursive=True)
+    if not md_files:
+        print(f"No markdown files found in {input_dir}")
+        return
+
+    # Group files by their base name
+    grouped_files = group_repeats(md_files)
+
+    successful = 0
+    skipped = 0
+
+    print(f"Found {len(grouped_files)} unique test cases with repeats")
+
+    for base_name, file_paths in tqdm(grouped_files.items(), desc="Processing test cases"):
+        # Skip if there aren't enough repeats
+        if len(file_paths) < min_repeats:
+            print(f"Skipping {base_name}: only {len(file_paths)} repeats (minimum {min_repeats} required)")
+            skipped += 1
+            continue
+
+        # Read all text content. A path that fails to read must be dropped
+        # from BOTH lists: otherwise the index returned by find_mediod over
+        # `texts` would be applied to the longer `file_paths` list and could
+        # select the wrong file.
+        texts = []
+        readable_paths = []
+        for path in file_paths:
+            try:
+                with open(path, "r", encoding="utf-8") as f:
+                    texts.append(f.read())
+                readable_paths.append(path)
+            except Exception as e:
+                print(f"Error reading {path}: {e}")
+
+        # Re-check the threshold after dropping unreadable files.
+        if len(texts) < min_repeats:
+            print(f"Skipping {base_name}: only {len(texts)} readable repeats (minimum {min_repeats} required)")
+            skipped += 1
+            continue
+
+        # Find the mediod
+        mediod_idx = find_mediod(texts)
+        if mediod_idx == -1:
+            print(f"Failed to find mediod for {base_name}")
+            skipped += 1
+            continue
+
+        mediod_path = readable_paths[mediod_idx]
+
+        # Create the output path, preserving the directory structure relative
+        # to input_dir. os.path.relpath handles relative paths as well as
+        # absolute ones, so the structure is preserved regardless of how
+        # --input was spelled (the previous absolute-path special case
+        # silently flattened output for relative inputs).
+        rel_path = os.path.relpath(mediod_path, input_dir)
+        output_filename = re.sub(r"_repeat\d+\.md$", "_repeat1.md", os.path.basename(rel_path))
+        output_subdir = os.path.dirname(rel_path)
+        output_path = os.path.join(output_dir, output_subdir, output_filename)
+        os.makedirs(os.path.dirname(output_path), exist_ok=True)
+
+        # Copy the mediod file
+        try:
+            shutil.copy2(mediod_path, output_path)
+            successful += 1
+        except Exception as e:
+            print(f"Error copying {mediod_path} to {output_path}: {e}")
+
+    print(f"Processing complete: {successful} mediods copied, {skipped} cases skipped")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/olmocr/bench/synth/mine_html_templates.py b/olmocr/bench/synth/mine_html_templates.py
new file mode 100644
index 0000000..9d6cd07
--- /dev/null
+++ b/olmocr/bench/synth/mine_html_templates.py
@@ -0,0 +1,237 @@
+import argparse
+import concurrent.futures
+import os
+import random
+import subprocess
+from concurrent.futures import ThreadPoolExecutor
+
+import pypdf
+from anthropic import Anthropic
+from tqdm import tqdm
+
+from olmocr.data.renderpdf import render_pdf_to_base64png
+
+
+def download_s3_pdf(s3_path, local_path):
+    """Download a PDF from S3 to a local path."""
+    os.makedirs(os.path.dirname(local_path), exist_ok=True)
+    result = subprocess.run(["aws", "s3", "cp", s3_path, local_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+    return result.returncode == 0
+
+
+def generate_html_from_image(client, image_base64):
+    """Call Claude API to generate HTML from an image."""
+    try:
+        response = client.messages.create(
+            model="claude-3-7-sonnet-20250219",
+            max_tokens=4000,
+            temperature=0.2,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": image_base64}},
+                        {
+                            "type": "text",
+                            "text": "Render this document as clean, semantic HTML. Use appropriate HTML tags for elements like headings, paragraphs, lists, tables, etc. "
+                            "Use the and