From 33ecc2316cfe91e478deb644fc0c4fc60fe0a96c Mon Sep 17 00:00:00 2001
From: Jake Poznanski
Date: Mon, 25 Aug 2025 16:54:23 +0000
Subject: [PATCH] Starting miner for blank pages; we want to make sure we
 don't hallucinate

---
 olmocr/bench/miners/mine_blank_pages_gpt.py | 296 ++++++++++++++++++++
 1 file changed, 296 insertions(+)
 create mode 100644 olmocr/bench/miners/mine_blank_pages_gpt.py

diff --git a/olmocr/bench/miners/mine_blank_pages_gpt.py b/olmocr/bench/miners/mine_blank_pages_gpt.py
new file mode 100644
index 0000000..8665425
--- /dev/null
+++ b/olmocr/bench/miners/mine_blank_pages_gpt.py
@@ -0,0 +1,296 @@
+#!/usr/bin/env python3
+"""
+mine_blank_pages_gpt.py - Identify PDF documents with blank pages and extract those pages.
+
+This script:
+1. Takes a file containing S3 paths to PDF documents as input
+2. For each PDF, renders a random page and queries GPT-4o with the exact same prompt as buildsilver.py
+3. Identifies pages where the structured output has null natural_text
+4. Extracts each such page into a single-page PDF in the output folder
+
+Usage:
+    python mine_blank_pages_gpt.py --input_list path/to/s3_paths.txt --output_dir path/to/output --api_key your_openai_api_key
+"""
+
+import argparse
+import json
+import os
+import random
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Optional
+
+import boto3
+import pypdf
+from openai import OpenAI
+from tqdm import tqdm
+
+from olmocr.data.renderpdf import render_pdf_to_base64png
+from olmocr.filter import PdfFilter
+from olmocr.prompts import build_openai_silver_data_prompt, openai_response_format_schema
+from olmocr.prompts.anchor import get_anchor_text
+
+TARGET_IMAGE_DIM = 2048
+
+
+def download_pdf_from_s3(s3_path: str, local_path: str) -> bool:
+    """
+    Download a PDF file from S3.
+
+    Args:
+        s3_path: The S3 path (s3://bucket/path/to/file.pdf)
+        local_path: The local path to save the file
+
+    Returns:
+        bool: True if the download was successful, False otherwise
+    """
+    try:
+        # Parse S3 path into bucket and key
+        parts = s3_path.replace("s3://", "").split("/", 1)
+        bucket = parts[0]
+        key = parts[1]
+
+        # Create S3 client
+        s3 = boto3.client("s3")
+
+        # Create directory if it doesn't exist
+        os.makedirs(os.path.dirname(local_path), exist_ok=True)
+
+        # Download file
+        s3.download_file(bucket, key, local_path)
+        return True
+    except Exception as e:
+        print(f"Error downloading {s3_path}: {str(e)}")
+        return False
+
+
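+# NOTE: page_num is 0-indexed throughout this script, while render_pdf_to_base64png
+# and get_anchor_text take 1-based page numbers, hence the page_num + 1 conversions below.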
+def check_blank_page(pdf_path: str, page_num: int, api_key: str) -> Optional[bool]:
+    """
+    Use GPT-4o with the exact same query as buildsilver.py to check if a page has null natural_text.
+
+    Args:
+        pdf_path: Path to the PDF file
+        page_num: The page number to analyze (0-indexed)
+        api_key: OpenAI API key
+
+    Returns:
+        Optional[bool]: True if natural_text is null, False otherwise, None if detection fails
+    """
+    # Initialize OpenAI client
+    client = OpenAI(api_key=api_key)
+
+    try:
+        # Render the PDF page as an image (render_pdf_to_base64png is 1-indexed)
+        image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num + 1, target_longest_image_dim=TARGET_IMAGE_DIM)
+
+        # Get anchor text
+        anchor_text = get_anchor_text(pdf_path, page_num + 1, pdf_engine="pdfreport")
+
+        # Build the exact same prompt as buildsilver.py
+        response = client.chat.completions.create(
+            model="gpt-4o-2024-08-06",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": build_openai_silver_data_prompt(anchor_text)},
+                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
+                    ],
+                }
+            ],
+            temperature=0.1,
+            max_tokens=3000,
+            logprobs=True,
+            top_logprobs=5,
+            response_format=openai_response_format_schema(),
+        )
+
+        if not response.choices or len(response.choices) == 0:
+            print(f"No response generated for {pdf_path} page {page_num + 1}")
+            return None
+
+        # Parse the JSON response
+        response_text = response.choices[0].message.content
+        response_data = json.loads(response_text)
+
+        # Check if natural_text is null
+        is_blank = response_data.get("natural_text") is None
+
+        if is_blank:
+            print(f"Found blank page in {pdf_path} page {page_num + 1}")
+
+        return is_blank
+
+    except Exception as e:
+        print(f"Error checking {pdf_path} page {page_num + 1}: {str(e)}")
+        return None
+
+
+def process_pdf(s3_path: str, temp_dir: str, output_dir: str, api_key: str) -> bool:
+    """
+    Process a single PDF from S3.
+
+    Args:
+        s3_path: S3 path to the PDF
+        temp_dir: Directory for temporary files
+        output_dir: Directory for output files
+        api_key: OpenAI API key
+
+    Returns:
+        bool: True if the PDF has a blank page and a single-page PDF was written, False otherwise
+    """
+    # Extract filename from S3 path
+    pdf_filename = os.path.basename(s3_path)
+    local_pdf_path = os.path.join(temp_dir, pdf_filename)
+
+    # Download PDF from S3
+    if not download_pdf_from_s3(s3_path, local_pdf_path):
+        return False
+
+    pdf_filter = PdfFilter()
+
+    if pdf_filter.filter_out_pdf(local_pdf_path):
+        print(f"Filtering out {pdf_filename}")
+        return False
+
+    try:
+        # Read the PDF to get the number of pages
+        reader = pypdf.PdfReader(local_pdf_path)
+        num_pages = len(reader.pages)
+
+        if num_pages == 0:
+            print(f"PDF {pdf_filename} has no pages")
+            return False
+
+        # Select a random page to check
+        page_num = random.randint(0, num_pages - 1)
+        page_num = random.choice([page_num, 0])  # Bias toward the first page 50% of the time
+
+        # Check if the page has null natural_text
+        is_blank = check_blank_page(local_pdf_path, page_num, api_key)
+
+        if is_blank:
+            # Extract just the blank page and save it as a new PDF
+            os.makedirs(output_dir, exist_ok=True)
+
+            # Create output filename in basename_pgnum.pdf format
+            pdf_basename = os.path.splitext(pdf_filename)[0]
+            output_pdf_path = os.path.join(output_dir, f"{pdf_basename}_pg{page_num + 1}.pdf")
+
+            # Extract the single page
+            writer = pypdf.PdfWriter()
+            writer.add_page(reader.pages[page_num])
+
+            # Write the output PDF
+            with open(output_pdf_path, "wb") as output_file:
+                writer.write(output_file)
+
+            print(f"Extracted blank page {page_num + 1} from {pdf_filename} to {os.path.basename(output_pdf_path)}")
+            return True
+
+        return False
+
+    except Exception as e:
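+        # Log and skip this PDF; the downloaded temp file is removed in the finally block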
print(f"Error processing {pdf_filename}: {str(e)}") + return False + finally: + if os.path.exists(local_pdf_path): + os.remove(local_pdf_path) + + +def main(): + parser = argparse.ArgumentParser(description="Identify and copy PDFs with blank pages") + parser.add_argument("--input_list", required=True, help="Path to a file containing S3 paths to PDFs") + parser.add_argument("--output_dir", required=True, help="Directory to copy PDFs with blank pages") + parser.add_argument("--api_key", help="OpenAI API key (if not provided, will use OPENAI_API_KEY environment variable)") + parser.add_argument("--temp_dir", default="/tmp/mine_blank_pages", help="Directory for temporary files") + parser.add_argument("--max_pdfs", type=int, default=100, help="Maximum number of blank PDFs to find") + parser.add_argument("--parallel", type=int, default=1, help="Number of parallel workers (default: 1 for sequential)") + parser.add_argument("--reservoir_multiplier", type=int, default=100, help="Multiplier for reservoir sampling (default: 100x max_pdfs)") + args = parser.parse_args() + + # Get API key + api_key = args.api_key or os.environ.get("OPENAI_API_KEY") + if not api_key: + print("Error: OpenAI API key not provided. Use --api_key or set OPENAI_API_KEY environment variable.") + return + + os.makedirs(args.temp_dir, exist_ok=True) + os.makedirs(args.output_dir, exist_ok=True) + + # Reservoir sampling to get random subset of PDFs + reservoir_size = args.max_pdfs * args.reservoir_multiplier + pdf_paths = [] + n = 0 # Total number of items seen + + print(f"Using reservoir sampling with size {reservoir_size}") + + with open(args.input_list, "r") as f: + for line in f: + n += 1 + path = line.strip() + if not path: + continue + + if len(pdf_paths) < reservoir_size: + pdf_paths.append(path) + else: + # Randomly decide whether to include this item + s = random.randint(1, n) + if s <= reservoir_size: + pdf_paths[s - 1] = path + + # Shuffle the reservoir + random.shuffle(pdf_paths) + + print(f"Sampled {len(pdf_paths)} PDF paths from {n} total paths") + + blank_pdfs_found = 0 + + if args.parallel > 1: + # Parallel processing + print(f"Processing PDFs with {args.parallel} parallel workers") + + with ThreadPoolExecutor(max_workers=args.parallel) as executor: + futures = [] + + # Submit all tasks + for s3_path in pdf_paths: + if blank_pdfs_found >= args.max_pdfs: + break + future = executor.submit(process_pdf, s3_path, args.temp_dir, args.output_dir, api_key) + futures.append(future) + + # Process results as they complete + with tqdm(total=min(len(pdf_paths), args.max_pdfs), desc="Processing PDFs") as pbar: + for future in as_completed(futures): + try: + result = future.result() + if result: + blank_pdfs_found += 1 + pbar.update(1) + + if blank_pdfs_found >= args.max_pdfs: + print(f"Reached maximum number of blank PDFs ({args.max_pdfs}), stopping") + # Cancel remaining futures + for f in futures: + f.cancel() + break + except Exception as e: + print(f"Error in parallel processing: {str(e)}") + else: + # Sequential processing + for s3_path in tqdm(pdf_paths, desc="Processing PDFs"): + if process_pdf(s3_path, args.temp_dir, args.output_dir, api_key): + blank_pdfs_found += 1 + + if blank_pdfs_found >= args.max_pdfs: + print(f"Reached maximum number of blank PDFs ({args.max_pdfs}), stopping") + break + + print(f"Found and copied {blank_pdfs_found} PDFs with blank pages to {args.output_dir}") + + +if __name__ == "__main__": + main() \ No newline at end of file