#!/usr/bin/env python3
# Provenance: olmocr/bench/synth/rotate_html_templates.py, introduced by patch
# ba8b3824 ("Adding some rotation augmentation to the post training step").
"""
Rotate HTML templates for data augmentation.

This script takes a synthetic data folder produced by mine_html_templates.py,
copies files to a new location, and applies rotation augmentation to a percentage
of PDFs for unit testing (not training data).

The script:
1. Copies all files from source to destination
2. Rotates a specified percentage of PDFs in bench_data/pdfs (90, 180, or 270 degrees)
3. Updates FrontMatter in corresponding claude_original markdown files
"""

import argparse
import json
import os
import random
import re
import shutil
import subprocess
from pathlib import Path
from typing import Dict, List, Optional, Tuple

try:
    import pypdf
    from pypdf import PageObject, Transformation  # noqa: F401 - unused here; retained from the original import list
except ImportError:
    # pypdf is only needed by rotate_pdf(). Keeping the import optional lets the
    # text-only helpers be imported without it; main() refuses to do a real run
    # when it is missing.
    pypdf = None

try:
    from tqdm import tqdm
except ImportError:

    def tqdm(iterable, **_kwargs):
        """Fallback when tqdm is not installed: iterate without a progress bar."""
        return iterable


# Counter-clockwise angles (in degrees) that main() picks from for each PDF.
ROTATION_ANGLES = (90, 180, 270)


def _replace_with_symlink(link_target: str, dst_path: str) -> None:
    """Create a symlink at dst_path pointing at link_target, replacing whatever is there."""
    if os.path.islink(dst_path) or os.path.isfile(dst_path):
        os.remove(dst_path)
    elif os.path.isdir(dst_path):
        shutil.rmtree(dst_path)
    os.symlink(link_target, dst_path)


def copy_directory_structure(src_dir: str, dst_dir: str, exclude_dirs: Optional[List[str]] = None) -> None:
    """
    Copy entire directory structure from source to destination.

    Regular files are copied with shutil.copy2. Symlinks (to files and to
    directories) are recreated as symlinks with the same link target instead of
    being followed.

    Args:
        src_dir: Source directory path
        dst_dir: Destination directory path
        exclude_dirs: List of directory names to exclude from copying
    """
    excluded = set(exclude_dirs or [])
    dst_real = os.path.realpath(dst_dir)

    for root, dirs, files in os.walk(src_dir):
        rel_path = os.path.relpath(root, src_dir)
        dst_root = dst_dir if rel_path == os.curdir else os.path.join(dst_dir, rel_path)
        os.makedirs(dst_root, exist_ok=True)

        walkable = []
        for name in dirs:
            if name in excluded:
                continue
            src_sub = os.path.join(root, name)
            if os.path.islink(src_sub):
                # os.walk lists symlinked directories but does not descend into
                # them, so they must be recreated here or they are lost.
                _replace_with_symlink(os.readlink(src_sub), os.path.join(dst_root, name))
                continue
            if os.path.realpath(src_sub) == dst_real:
                # Destination lives inside the source tree: never copy it into itself.
                continue
            walkable.append(name)
        # Prune in place so os.walk skips excluded / already-handled directories.
        dirs[:] = walkable

        for name in files:
            src_file = os.path.join(root, name)
            dst_file = os.path.join(dst_root, name)
            if os.path.islink(src_file):
                _replace_with_symlink(os.readlink(src_file), dst_file)
            else:
                shutil.copy2(src_file, dst_file)

    print(f"Copied directory structure from {src_dir} to {dst_dir}")


def rotate_pdf(input_path: str, output_path: str, angle: int) -> bool:
    """
    Rotate a PDF by the specified angle (counter-clockwise).

    Failures are reported on stdout and signalled through the return value; a
    partially written output file is removed.

    Args:
        input_path: Path to input PDF
        output_path: Path to save rotated PDF
        angle: Rotation angle in counter-clockwise direction (90, 180, or 270 degrees)

    Returns:
        True if successful, False otherwise
    """
    output_opened = False
    try:
        if angle % 90 != 0:
            raise ValueError(f"angle must be a multiple of 90 degrees, got {angle}")
        if pypdf is None:
            raise RuntimeError("pypdf is not installed")

        # pypdf rotates clockwise, so convert:
        #   counter-clockwise 90  -> clockwise 270
        #   counter-clockwise 180 -> clockwise 180
        #   counter-clockwise 270 -> clockwise 90
        clockwise_angle = (360 - angle) % 360

        reader = pypdf.PdfReader(input_path)
        writer = pypdf.PdfWriter()
        for page in reader.pages:
            page.rotate(clockwise_angle)
            writer.add_page(page)

        with open(output_path, "wb") as output_file:
            output_opened = True
            writer.write(output_file)

        return True
    except Exception as e:  # deliberate best-effort: one bad PDF must not abort the batch
        print(f"Error rotating PDF {input_path}: {e}")
        if output_opened and os.path.abspath(output_path) != os.path.abspath(input_path):
            try:
                os.remove(output_path)
            except OSError:
                pass
        return False


def update_frontmatter_rotation(markdown_path: str, rotation_angle: int) -> bool:
    """
    Update the FrontMatter in a markdown file to reflect rotation.

    Sets ``is_rotation_valid: false`` and ``rotation_correction`` to the inverse
    of ``rotation_angle``, adding either field if it is missing. The file is
    rewritten through a temporary file and os.replace, so if ``markdown_path``
    is a symlink it is replaced by a regular file and the link target is left
    untouched.

    Args:
        markdown_path: Path to the markdown file
        rotation_angle: The angle the PDF was rotated (90, 180, or 270)

    Returns:
        True if successful, False otherwise
    """
    try:
        with open(markdown_path, "r", encoding="utf-8") as f:
            content = f.read()

        lines = content.split("\n")

        # The file must start with a FrontMatter delimiter.
        if lines[0].rstrip() != "---":
            print(f"No FrontMatter found in {markdown_path}")
            return False

        # Find the closing --- of FrontMatter.
        end_idx = next((i for i, line in enumerate(lines[1:], 1) if line.rstrip() == "---"), -1)
        if end_idx == -1:
            print(f"Invalid FrontMatter in {markdown_path}")
            return False

        # Calculate the correction angle (inverse rotation).
        correction_angle = (360 - rotation_angle) % 360

        # Field name -> full replacement line. Insertion order is the order in
        # which missing fields get appended.
        replacements = {
            "is_rotation_valid": "is_rotation_valid: false",
            "rotation_correction": f"rotation_correction: {correction_angle}",
        }

        seen = set()
        updated_lines = []
        for line in lines[1:end_idx]:
            key, sep, _ = line.partition(":")
            if sep and key in replacements:
                updated_lines.append(replacements[key])
                seen.add(key)
            else:
                updated_lines.append(line)

        # Add missing fields if they weren't present.
        updated_lines.extend(text for key, text in replacements.items() if key not in seen)

        new_content = "\n".join(["---"] + updated_lines + ["---"] + lines[end_idx + 1:])

        tmp_path = f"{markdown_path}.tmp"
        try:
            with open(tmp_path, "w", encoding="utf-8") as f:
                f.write(new_content)
            shutil.copymode(markdown_path, tmp_path)
            os.replace(tmp_path, markdown_path)
        finally:
            # Only still present when something above failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

        return True

    except (OSError, ValueError, TypeError) as e:
        print(f"Error updating FrontMatter in {markdown_path}: {e}")
        return False


def find_corresponding_markdown(pdf_filename: str, claude_original_dir: str) -> Optional[str]:
    """
    Find the corresponding markdown file in claude_original directory.

    A file named ``<base>_pg1_repeat1.md`` anywhere below the directory wins;
    otherwise the first ``<base>.md`` found is returned.

    Args:
        pdf_filename: Name of the PDF file (e.g., "pdf_00001_page1.pdf")
        claude_original_dir: Path to claude_original directory

    Returns:
        Path to the corresponding markdown file, or None if not found
    """
    base_name = os.path.splitext(pdf_filename)[0]
    preferred_name = f"{base_name}_pg1_repeat1.md"
    fallback_name = f"{base_name}.md"

    fallback_path = None
    for root, dirs, files in os.walk(claude_original_dir):
        dirs.sort()  # deterministic traversal regardless of filesystem order
        if preferred_name in files:
            return os.path.join(root, preferred_name)
        if fallback_path is None and fallback_name in files:
            fallback_path = os.path.join(root, fallback_name)

    return fallback_path


def _collect_pdf_paths(pdfs_root: str) -> List[str]:
    """Return the sorted paths, relative to pdfs_root, of every .pdf file below it."""
    found = []
    for root, _dirs, files in os.walk(pdfs_root):
        for name in files:
            if name.endswith(".pdf"):
                found.append(os.path.relpath(os.path.join(root, name), pdfs_root))
    return sorted(found)


def _locate_markdown(rel_pdf_path: str, claude_root: str) -> Optional[str]:
    """Find the markdown for a PDF, preferring the sub-directory that mirrors the PDF's own."""
    pdf_filename = os.path.basename(rel_pdf_path)
    pdf_subdir = os.path.dirname(rel_pdf_path)
    if pdf_subdir:
        mirrored = find_corresponding_markdown(pdf_filename, os.path.join(claude_root, pdf_subdir))
        if mirrored:
            return mirrored
    return find_corresponding_markdown(pdf_filename, claude_root)


def main(argv: Optional[List[str]] = None) -> int:
    """
    Command-line entry point.

    Args:
        argv: Argument list to parse; defaults to sys.argv[1:] when None.

    Returns:
        Process exit code: 0 on success (including "nothing to do"), 1 on invalid input.
    """
    parser = argparse.ArgumentParser(
        description="Apply rotation augmentation to synthetic data from mine_html_templates.py"
    )
    parser.add_argument(
        "--input_dir",
        required=True,
        help="Input directory containing synthetic data from mine_html_templates.py",
    )
    parser.add_argument(
        "--output_dir",
        required=True,
        help="Output directory for augmented data",
    )
    parser.add_argument(
        "--rotation_percentage",
        type=float,
        default=5.0,
        help="Percentage of PDFs to rotate (default: 5%%)",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed for reproducibility",
    )
    parser.add_argument(
        "--dry_run",
        action="store_true",
        help="Print what would be done without actually doing it",
    )
    args = parser.parse_args(argv)

    # Also rejects NaN, for which both comparisons are false.
    if not 0.0 <= args.rotation_percentage <= 100.0:
        print(f"Error: --rotation_percentage must be between 0 and 100, got {args.rotation_percentage}")
        return 1

    random.seed(args.seed)

    if not os.path.exists(args.input_dir):
        print(f"Error: Input directory does not exist: {args.input_dir}")
        return 1

    bench_data_dir = os.path.join(args.input_dir, "bench_data")
    if not os.path.exists(bench_data_dir):
        print(f"Error: bench_data directory not found in {args.input_dir}")
        return 1

    if os.path.realpath(args.input_dir) == os.path.realpath(args.output_dir):
        print("Error: --output_dir must be different from --input_dir")
        return 1

    if not args.dry_run and pypdf is None:
        print("Error: pypdf is required to rotate PDFs but is not installed")
        return 1

    pdfs_dir = os.path.join(bench_data_dir, "pdfs")
    claude_original_dir = os.path.join(bench_data_dir, "claude_original")

    if not os.path.exists(pdfs_dir):
        print(f"Warning: pdfs directory not found in {bench_data_dir}")

    if not os.path.exists(claude_original_dir):
        print(f"Warning: claude_original directory not found in {bench_data_dir}")

    # Step 1: Copy entire directory structure
    if not args.dry_run:
        print("Copying directory structure...")
        copy_directory_structure(args.input_dir, args.output_dir)
    else:
        print(f"[DRY RUN] Would copy {args.input_dir} to {args.output_dir}")

    # Step 2: Find all PDFs. A dry run never creates the destination, so it
    # inspects the source tree, which has the same layout.
    dst_pdfs_dir = os.path.join(args.output_dir, "bench_data", "pdfs")
    dst_claude_dir = os.path.join(args.output_dir, "bench_data", "claude_original")
    scan_pdfs_dir = pdfs_dir if args.dry_run else dst_pdfs_dir

    if not os.path.exists(scan_pdfs_dir):
        print(f"No PDFs directory found at {scan_pdfs_dir}")
        return 0

    # Sorted relative paths: the same seed selects the same files on every
    # filesystem, and in dry-run and real mode alike.
    pdf_files = _collect_pdf_paths(scan_pdfs_dir)

    if not pdf_files:
        print("No PDF files found to rotate")
        return 0

    print(f"Found {len(pdf_files)} PDF files")

    # Step 3: Select PDFs to rotate based on percentage
    num_to_rotate = int(len(pdf_files) * args.rotation_percentage / 100.0)
    if num_to_rotate == 0 and args.rotation_percentage > 0:
        num_to_rotate = 1  # Rotate at least one if percentage > 0

    pdfs_to_rotate = random.sample(pdf_files, min(num_to_rotate, len(pdf_files)))

    print(f"Selected {len(pdfs_to_rotate)} PDFs to rotate ({args.rotation_percentage}%)")

    # Step 4: Rotate selected PDFs and update corresponding markdown files
    rotated_count = 0
    markdown_updated_count = 0

    for rel_pdf_path in tqdm(pdfs_to_rotate, desc="Rotating PDFs"):
        angle = random.choice(ROTATION_ANGLES)
        pdf_path = os.path.join(dst_pdfs_dir, rel_pdf_path)

        if args.dry_run:
            print(f"[DRY RUN] Would rotate {pdf_path} by {angle} degrees")
            continue

        # Rotate into a sibling temp file, then swap it in. os.replace swaps
        # the directory entry, so a symlinked PDF becomes a regular file and
        # the link target is not modified.
        temp_path = pdf_path + ".rotated"
        if not rotate_pdf(pdf_path, temp_path, angle):
            print(f"Failed to rotate {pdf_path}")
            continue

        os.replace(temp_path, pdf_path)
        rotated_count += 1

        markdown_path = _locate_markdown(rel_pdf_path, dst_claude_dir)
        if markdown_path and update_frontmatter_rotation(markdown_path, angle):
            markdown_updated_count += 1

    # Print summary
    print("\nRotation augmentation complete!")
    print(f"  - Rotated {rotated_count}/{len(pdfs_to_rotate)} PDFs")
    print(f"  - Updated {markdown_updated_count}/{len(pdfs_to_rotate)} markdown files")

    if args.dry_run:
        print("\n[DRY RUN] No actual changes were made")

    return 0


if __name__ == "__main__":
    raise SystemExit(main())