mirror of https://github.com/allenai/olmocr.git
synced 2025-08-17 21:32:05 +00:00

Cleaning up some unused code

commit c6062677aa (parent d8c13d05f6)
@@ -1,862 +0,0 @@
import os
import hashlib
import boto3
import sqlite3
import orjson
import argparse
import base64
import tempfile
import datetime
import posixpath
import threading
import logging
import psutil
import boto3.session
import urllib3.exceptions

from dataclasses import dataclass
from pypdf import PdfReader
from io import BytesIO
from PIL import Image
from tqdm import tqdm
from functools import partial
from typing import Optional, List, Tuple, Dict, Callable, Any
from urllib.parse import urlparse
import concurrent.futures
from concurrent.futures import ProcessPoolExecutor, as_completed

from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_finetuning_prompt, PageResponse
from olmocr.prompts.anchor import get_anchor_text
from olmocr.s3_utils import parse_custom_id, expand_s3_glob, get_s3_bytes, parse_s3_path
from olmocr.check import check_poppler_version

# Initialize logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# File handler for DEBUG level and above with line-by-line flushing
class FlushFileHandler(logging.FileHandler):
    def emit(self, record):
        super().emit(record)
        self.flush()  # Explicitly flush after every log entry

file_handler = FlushFileHandler('birrpipeline-debug.log', mode='a')
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))

# Add handlers to the logger
logger.addHandler(file_handler)

# Global s3 client for the whole script, feel free to adjust params if you need it
workspace_s3 = boto3.client('s3')
pdf_s3 = boto3.client('s3')

# Quiet logs from pypdf
logging.getLogger("pypdf").setLevel(logging.ERROR)


class DatabaseManager:
    @dataclass(frozen=True)
    class BatchInferenceRecord:
        inference_s3_path: str
        pdf_s3_path: str
        page_num: int  # 1 indexed!
        round: int
        start_index: int
        length: int
        finish_reason: str
        error: Optional[str]

        def is_usable(self):
            return self.error is None and self.finish_reason == "stop"

    @dataclass(frozen=True)
    class PDFRecord:
        s3_path: str
        num_pages: int
        status: str

    def __init__(self, s3_workspace: str, skip_init: bool=False):
        cache_key = hashlib.sha256(s3_workspace.strip().lower().encode('utf-8')).hexdigest()
        home_cache_dir = os.path.join(os.path.expanduser('~'), '.cache', 'olmocr', cache_key)
        os.makedirs(home_cache_dir, exist_ok=True)
        self.db_path = os.path.join(home_cache_dir, 'index.db')

        self.conn = sqlite3.connect(self.db_path)
        # Enable WAL mode so you can read and write concurrently
        self.cursor = self.conn.cursor()
        self.cursor.execute("PRAGMA journal_mode=WAL;")

        if not skip_init:
            self._initialize_tables()

    def _initialize_tables(self):
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS page_results (
                inference_s3_path TEXT,
                pdf_s3_path TEXT,
                page_num INTEGER,
                round INTEGER,
                start_index BIGINT,
                length BIGINT,
                finish_reason TEXT,
                error TEXT
            )
        """)
        self.cursor.execute("""
            CREATE INDEX IF NOT EXISTS idx_path ON page_results(pdf_s3_path)
        """)
        self.cursor.execute("""
            CREATE INDEX IF NOT EXISTS idx_inf_path ON page_results(inference_s3_path)
        """)
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS pdfs (
                s3_path TEXT PRIMARY KEY,
                num_pages INTEGER,
                status TEXT DEFAULT 'pending'
            )
        """)
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS processed_files (
                s3_path TEXT PRIMARY KEY,
                etag TEXT
            )
        """)
        # Generic metadata such as current round
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS metadata (
                key TEXT PRIMARY KEY,
                value TEXT
            )
        """)

        self.conn.commit()

    def get_metadata(self, key: str) -> Optional[str]:
        self.cursor.execute("SELECT value FROM metadata WHERE key=?", (key,))
        result = self.cursor.fetchone()
        return result[0] if result else None

    def set_metadata(self, key: str, value: str) -> None:
        self.cursor.execute("""
            INSERT INTO metadata (key, value)
            VALUES (?, ?)
            ON CONFLICT(key) DO UPDATE SET value=excluded.value
        """, (key, value))
        self.conn.commit()

    def is_file_processed(self, s3_path, etag):
        self.cursor.execute("SELECT etag FROM processed_files WHERE s3_path = ?", (s3_path,))
        result = self.cursor.fetchone()
        return result is not None and result[0] == etag

    def update_processed_file(self, s3_path, etag):
        self.cursor.execute("""
            INSERT INTO processed_files (s3_path, etag)
            VALUES (?, ?)
            ON CONFLICT(s3_path) DO UPDATE SET etag=excluded.etag
        """, (s3_path, etag))
        self.conn.commit()

    def clear_index(self):
        self.cursor.execute("""
            DELETE FROM processed_files;
        """)
        self.cursor.execute("""
            DELETE FROM page_results;
        """)
        self.conn.commit()

    def add_index_entries(self, index_entries: List['BatchInferenceRecord']):
        if index_entries:
            self.cursor.executemany("""
                INSERT INTO page_results (inference_s3_path, pdf_s3_path, page_num, round, start_index, length, finish_reason, error)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """, [(entry.inference_s3_path, entry.pdf_s3_path, entry.page_num, entry.round, entry.start_index, entry.length, entry.finish_reason, entry.error) for entry in index_entries])
            self.conn.commit()

    def get_index_entries(self, pdf_s3_path: str) -> List['BatchInferenceRecord']:
        self.cursor.execute("""
            SELECT inference_s3_path, pdf_s3_path, page_num, round, start_index, length, finish_reason, error
            FROM page_results
            WHERE pdf_s3_path = ?
            ORDER BY inference_s3_path DESC, start_index ASC, page_num ASC
        """, (pdf_s3_path,))

        rows = self.cursor.fetchall()

        return [
            self.BatchInferenceRecord(
                inference_s3_path=row[0],
                pdf_s3_path=row[1],
                page_num=row[2],
                round=row[3],
                start_index=row[4],
                length=row[5],
                finish_reason=row[6],
                error=row[7]
            )
            for row in rows
        ]

    def delete_index_entries_by_inference_s3_path(self, inference_s3_path: str):
        self.cursor.execute("DELETE FROM page_results WHERE inference_s3_path = ?", (inference_s3_path,))
        self.conn.commit()

    def get_last_indexed_round(self) -> int:
        self.cursor.execute("""
            SELECT MAX(round)
            FROM page_results
        """)

        result = self.cursor.fetchone()
        return -1 if result[0] is None else result[0]

    def pdf_exists(self, s3_path: str) -> bool:
        self.cursor.execute("SELECT 1 FROM pdfs WHERE s3_path = ?", (s3_path,))
        return self.cursor.fetchone() is not None

    def add_pdf(self, s3_path: str, num_pages: int, status: str = 'pending') -> None:
        try:
            self.cursor.execute("""
                INSERT INTO pdfs (s3_path, num_pages, status)
                VALUES (?, ?, ?)
            """, (s3_path, num_pages, status))
            self.conn.commit()
        except sqlite3.IntegrityError:
            logger.warning(f"PDF with s3_path '{s3_path}' already exists.")

    def update_pdf_statuses(self, status_updates: Dict[str, str]) -> None:
        """
        Update the status of multiple PDFs in the database.

        :param status_updates: A dictionary where each key is an s3_path (str) and
                               each value is the new status (str) for that PDF.
        """
        self.cursor.executemany("""
            UPDATE pdfs
            SET status = ?
            WHERE s3_path = ?
        """, [(new_status, s3_path) for s3_path, new_status in status_updates.items()])
        self.conn.commit()

    def get_pdf(self, s3_path: str) -> Optional['PDFRecord']:
        self.cursor.execute("""
            SELECT s3_path, num_pages, status
            FROM pdfs
            WHERE s3_path = ?
        """, (s3_path,))

        row = self.cursor.fetchone()

        if row:
            return self.PDFRecord(
                s3_path=row[0],
                num_pages=row[1],
                status=row[2]
            )
        return None

    def get_pdfs_by_status(self, status: str) -> List['PDFRecord']:
        self.cursor.execute("""
            SELECT s3_path, num_pages, status
            FROM pdfs
            WHERE status == ?
            ORDER BY s3_path DESC, num_pages DESC
        """, (status, ))

        rows = self.cursor.fetchall()

        return [
            self.PDFRecord(
                s3_path=row[0],
                num_pages=row[1],
                status=row[2]
            )
            for row in rows
        ]

    def close(self):
        self.conn.close()

class BatchWriter:
    def __init__(
        self,
        output_prefix: str,
        max_size_mb: int = 250,
        after_flush: Optional[Callable[[List[Any]], Any]] = None,
    ):
        self.output_prefix = output_prefix
        self.max_size = max_size_mb * 1024 * 1024  # Convert MB to bytes
        self.batch_objects = []
        self.batch_size = 0
        self.after_flush = after_flush
        self.threads = []
        self.temp_file = None  # The temporary file object
        self.temp_file_path = None  # Path to the temporary file

        parsed = urlparse(output_prefix)
        self.is_s3 = parsed.scheme in ("s3", "s3a", "s3n")

        if not self.is_s3:
            os.makedirs(output_prefix, exist_ok=True)

    def write_line(self, obj: Optional[Any]):
        if obj is None:
            return

        line_bytes = orjson.dumps(obj)
        line_size = len(line_bytes) + 1  # +1 for newline

        if self.batch_size + line_size > self.max_size:
            self._write_batch()

        if self.batch_size == 0:
            # Open a new temporary file
            self.temp_file = tempfile.NamedTemporaryFile(mode="wb+", delete=False)
            self.temp_file_path = self.temp_file.name

        self.temp_file.write(line_bytes + b"\n")
        self.batch_objects.append(obj)
        self.batch_size += line_size

    def _write_batch(self):
        if self.batch_size == 0:
            return

        # Close the temp file
        self.temp_file.flush()
        self.temp_file.close()

        # Start a new thread to upload the temp file
        thread = threading.Thread(
            target=self._write_batch_to_file, args=(self.temp_file_path, self.batch_objects)
        )
        thread.start()
        self.threads.append(thread)

        # Reset batch_objects and batch_size
        self.batch_objects = []
        self.batch_size = 0
        self.temp_file = None
        self.temp_file_path = None

    def _write_batch_to_file(self, temp_file_path: str, batch_objects: List[Any]):
        # Compute hash based on file content
        hash_str = self._compute_hash(temp_file_path)
        output_path = self._get_output_path(hash_str)

        if self.is_s3:
            bucket, key = parse_s3_path(output_path)

            # Use the s3 client directly
            try:
                workspace_s3.upload_file(temp_file_path, bucket, key)
            except Exception as e:
                logger.error(f"Failed to upload {temp_file_path} to {output_path}: {e}", exc_info=True)
        else:
            # Move the temp file to the output path
            os.rename(temp_file_path, output_path)

        # After writing, call the after_flush callback if it is set
        if self.after_flush:
            self.after_flush(batch_objects)

        os.remove(temp_file_path)

    def _compute_hash(self, temp_file_path: str) -> str:
        """Compute a 20-character SHA1 hash of the file content."""
        sha1 = hashlib.sha1()
        with open(temp_file_path, "rb") as f:
            while True:
                data = f.read(1024*1024)
                if not data:
                    break
                sha1.update(data)
        return sha1.hexdigest()[:20]

    def _get_output_path(self, hash_str: str) -> str:
        """Generate the full output path with hash in the filename."""
        parsed = urlparse(self.output_prefix)
        if self.is_s3:
            bucket = parsed.netloc
            key = parsed.path.lstrip("/")
            if key and not key.endswith("/"):
                key += "/"
            full_key = posixpath.join(key, f"output_{hash_str}.jsonl")
            return f"s3://{bucket}/{full_key}"
        else:
            filename = f"output_{hash_str}.jsonl"
            return os.path.join(self.output_prefix, filename)

    def close(self):
        self._write_batch()
        # Wait for all threads to finish
        for thread in self.threads:
            thread.join()

def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int, target_longest_image_dim: int, target_anchor_text_len: int, image_rotation: int=0) -> dict:
    assert image_rotation in [0, 90, 180, 270], "Invalid image rotation provided in build_page_query"
    image_base64 = render_pdf_to_base64png(local_pdf_path, page, target_longest_image_dim=target_longest_image_dim)

    if image_rotation != 0:
        image_bytes = base64.b64decode(image_base64)
        with Image.open(BytesIO(image_bytes)) as img:
            rotated_img = img.rotate(-image_rotation, expand=True)

            # Save the rotated image to a bytes buffer
            buffered = BytesIO()
            rotated_img.save(buffered, format="PNG")

            # Encode the rotated image back to base64
            image_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')

    anchor_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport", target_length=target_anchor_text_len)

    return {
        "custom_id": f"{pretty_pdf_path}-{page}",
        "chat_messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": build_finetuning_prompt(anchor_text)},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}
                ],
            }
        ],
    }


def process_jsonl_content(inference_s3_path: str) -> List[DatabaseManager.BatchInferenceRecord]:
    content_bytes = get_s3_bytes(workspace_s3, inference_s3_path)

    start_index = 0
    index_entries = []
    lines = content_bytes.splitlines(keepends=True)  # Split content into lines as bytes
    for line in lines:
        line_length = len(line)  # Length in bytes

        try:
            # Parse the line directly as JSON
            data = orjson.loads(line)
            pdf_s3_path, page_num = parse_custom_id(data["custom_id"])

            if data.get("completion_error", None) is not None:
                index_entries.append(DatabaseManager.BatchInferenceRecord(
                    inference_s3_path=inference_s3_path,
                    pdf_s3_path=pdf_s3_path,
                    page_num=page_num,
                    round=data["round"],
                    start_index=start_index,  # Byte offset in the original file
                    length=line_length,  # Length in bytes
                    finish_reason="completion_error",
                    error=data.get("completion_error", None)
                ))
            else:
                # Try to parse the actual model response JSON
                assert "outputs" in data and len(data["outputs"]) > 0, "No outputs from model detected"

                try:
                    model_response_json = orjson.loads(data["outputs"][0]["text"])
                    page_response = PageResponse(**model_response_json)

                    last_error = data.get("completion_error", None)

                    if not page_response.is_rotation_valid:
                        last_error = "rotation_invalid"

                    index_entries.append(DatabaseManager.BatchInferenceRecord(
                        inference_s3_path=inference_s3_path,
                        pdf_s3_path=pdf_s3_path,
                        page_num=page_num,
                        round=data["round"],
                        start_index=start_index,  # Byte offset in the original file
                        length=line_length,  # Length in bytes
                        finish_reason=data["outputs"][0]["finish_reason"],
                        error=last_error,
                    ))
                except Exception as e:
                    error_type = type(e).__name__
                    index_entries.append(DatabaseManager.BatchInferenceRecord(
                        inference_s3_path=inference_s3_path,
                        pdf_s3_path=pdf_s3_path,
                        page_num=page_num,
                        round=data["round"],
                        start_index=start_index,  # Byte offset in the original file
                        length=line_length,  # Length in bytes
                        finish_reason=data["outputs"][0]["finish_reason"],
                        error=error_type,
                    ))

        except Exception as e:
            logger.exception(f"Error processing line in {inference_s3_path}: {e}")
            # Optionally, you might want to add an index entry indicating an error here

        start_index += line_length  # Increment by the number of bytes

    return index_entries


def get_pdf_num_pages(s3_path: str) -> Optional[int]:
    logger.debug(f"Startng to get_pdf_num_pages for {s3_path}")
    try:
        with tempfile.NamedTemporaryFile("wb+", suffix=".pdf") as tf:
            tf.write(get_s3_bytes(pdf_s3, s3_path))
            tf.flush()

            reader = PdfReader(tf.name)
            logger.debug(f"Built reader for {s3_path}")
            return reader.get_num_pages()
    except Exception as ex:
        logger.warning(f"Warning, could not add {s3_path} due to {ex}")

    return None


def _get_page_data(page_index_entries: List[DatabaseManager.BatchInferenceRecord]) -> List[PageResponse]:
    usable_page_data = [get_s3_bytes(workspace_s3, page.inference_s3_path,
                                     start_index=page.start_index,
                                     end_index=page.start_index + page.length - 1) for page in page_index_entries]

    usable_page_final_results = []
    for page_data in usable_page_data:
        data = orjson.loads(page_data)
        model_response_json = orjson.loads(data["outputs"][0]["text"])
        page_response = PageResponse(**model_response_json)
        usable_page_final_results.append(page_response)

    return usable_page_final_results


def build_pdf_queries(s3_workspace: str, pdf: DatabaseManager.PDFRecord, cur_round: int, target_longest_image_dim: int, target_anchor_text_len: int) -> list[dict]:
    db = DatabaseManager(s3_workspace, skip_init=True)

    existing_pages = db.get_index_entries(pdf.s3_path)
    new_queries = []

    # Shortcut out of downloading the actual PDF
    if set(page.page_num for page in existing_pages if page.is_usable()) == set(range(1, pdf.num_pages + 1)):
        db.close()
        return []

    try:
        with tempfile.NamedTemporaryFile("wb+", suffix=".pdf") as tf:
            tf.write(get_s3_bytes(pdf_s3, pdf.s3_path))
            tf.flush()

            for target_page_num in range(1, pdf.num_pages + 1):
                # Is there an existing page that has no error
                if any(page.is_usable() and page.page_num == target_page_num for page in existing_pages):
                    continue

                has_errored_previously = sum(page.page_num == target_page_num for page in existing_pages)

                if has_errored_previously:
                    # Retry the page at least one more time regularly
                    new_queries.append({**build_page_query(tf.name, pdf.s3_path, target_page_num, target_longest_image_dim, target_anchor_text_len), "round": cur_round})

                    # If the rotation was previously invalid, then apply a rotation
                    rotated_page_data = _get_page_data([page for page in existing_pages if page.page_num == target_page_num and page.error == "rotation_invalid"])
                    rotation_corrections = set(page_data.rotation_correction for page_data in rotated_page_data)
                    for correction in rotation_corrections:
                        logger.debug(f"Adding {correction}-degree rotation query for {pdf.s3_path}-{target_page_num}")
                        new_queries.append({**build_page_query(tf.name, pdf.s3_path, target_page_num, target_longest_image_dim, target_anchor_text_len, image_rotation=correction), "round": cur_round})

                    # TODO: Try to provide a smaller prompt hint if that was the error
                else:
                    new_queries.append({**build_page_query(tf.name, pdf.s3_path, target_page_num, target_longest_image_dim, target_anchor_text_len), "round": cur_round})
    except Exception as ex:
        logger.warning(f"Warning, could not get batch inferences lines for {pdf.s3_path} due to {ex}")

    db.close()
    return new_queries


def build_dolma_doc(s3_workspace: str, pdf: DatabaseManager.PDFRecord) -> Optional[dict]:
    db = DatabaseManager(s3_workspace, skip_init=True)
    existing_pages = db.get_index_entries(pdf.s3_path)
    document_text = ""
    last_page_start_index = 0
    pdf_page_spans = []

    # Error out quickly if this document cannot be assembled
    for target_page_num in range(1, pdf.num_pages + 1):
        usable_pages = [page for page in existing_pages if page.is_usable() and page.page_num == target_page_num]

        if len(usable_pages) == 0:
            db.close()
            return None

    for target_page_num in range(1, pdf.num_pages + 1):
        usable_pages = [page for page in existing_pages if page.is_usable() and page.page_num == target_page_num]
        usable_page_final_results = _get_page_data(usable_pages)

        # Sort the pages:
        # 1. Prefer pages with `is_rotation_valid` set to True.
        # 2. Within those, sort by the length of the `natural_text` in descending order.
        usable_page_final_results.sort(
            key=lambda page: (not page.is_rotation_valid, -len(page.natural_text or ""))
        )

        target_page_final_result = usable_page_final_results[0]

        if target_page_final_result.natural_text is not None:
            document_text += target_page_final_result.natural_text + "\n"

        pdf_page_spans.append([last_page_start_index, len(document_text), target_page_num])
        last_page_start_index = len(document_text)

    metadata = {
        "Source-File": pdf.s3_path,
        "pdf-total-pages": pdf.num_pages,
    }
    id_ = hashlib.sha1(document_text.encode()).hexdigest()

    dolma_doc = {
        "id": id_,
        "text": document_text,
        "source": "olmocr",
        "added": datetime.datetime.now().strftime("%Y-%m-%d"),
        "created": datetime.datetime.now().strftime("%Y-%m-%d"),
        "metadata": metadata,
        "attributes": {
            "pdf_page_numbers": pdf_page_spans
        }
    }

    db.close()
    return dolma_doc


def mark_pdfs_done(s3_workspace: str, dolma_docs: list[dict]):
    db = DatabaseManager(s3_workspace, skip_init=True)
    db.update_pdf_statuses({doc["metadata"]["Source-File"]: "completed" for doc in dolma_docs})
    db.close()


def get_current_round(s3_workspace: str) -> int:
    path = s3_workspace[5:]
    bucket, _, prefix = path.partition('/')

    inference_inputs_prefix = posixpath.join(prefix, 'inference_inputs/')
    paginator = workspace_s3.get_paginator('list_objects_v2')
    page_iterator = paginator.paginate(Bucket=bucket, Prefix=inference_inputs_prefix, Delimiter='/')

    round_numbers = []
    for page in page_iterator:
        for common_prefix in page.get('CommonPrefixes', []):
            round_prefix = common_prefix.get('Prefix')
            # Extract 'round_X' from the prefix
            round_dir = posixpath.basename(posixpath.dirname(round_prefix))
            if round_dir.startswith('round_'):
                try:
                    round_num = int(round_dir[len('round_'):])
                    round_numbers.append(round_num)
                except ValueError:
                    pass
    if round_numbers:
        current_round = max(round_numbers) + 1
    else:
        current_round = 0
    return current_round

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Manager for running millions of PDFs through a batch inference pipeline')
    parser.add_argument('workspace', help='The S3 path where work will be done e.g., s3://bucket/prefix/)')
    parser.add_argument('--add_pdfs', help='Path to add pdfs stored in s3 to the workspace, can be a glob path s3://bucket/prefix/*.pdf or path to file containing list of pdf paths', default=None)
    parser.add_argument('--target_longest_image_dim', type=int, help='Dimension on longest side to use for rendering the pdf pages', default=1024)
    parser.add_argument('--target_anchor_text_len', type=int, help='Maximum amount of anchor text to use (characters)', default=6000)
    parser.add_argument('--workspace_profile', help='S3 configuration profile for accessing the workspace', default=None)
    parser.add_argument('--pdf_profile', help='S3 configuration profile for accessing the raw pdf documents', default=None)
    parser.add_argument('--max_size_mb', type=int, default=250, help='Max file size in MB')
    parser.add_argument('--workers', type=int, help='Number of workers to run in the processpool')
    parser.add_argument('--reindex', action='store_true', default=False, help='Reindex all of the page_results')
    parser.add_argument('--skip_build_queries', action='store_true', default=False, help='Skip generation of new pdf page queries for batch inferencing')
    args = parser.parse_args()

    if args.workspace_profile:
        workspace_session = boto3.Session(profile_name=args.workspace_profile)
        workspace_s3 = workspace_session.client("s3")

    if args.pdf_profile:
        pdf_session = boto3.Session(profile_name=args.pdf_profile)
        pdf_s3 = pdf_session.client("s3")

    db = DatabaseManager(args.workspace)
    logger.info(f"Loaded db at {db.db_path}")

    if args.reindex:
        db.clear_index()
        logger.info("Cleared existing index.")

    current_round = get_current_round(args.workspace)
    logger.info(f"Current round is {current_round}")

    check_poppler_version()

    # One shared executor to rule them all
    executor = ProcessPoolExecutor(max_workers=args.workers)

    # If you have new PDFs, step one is to add them to the list
    if args.add_pdfs:
        if args.add_pdfs.startswith("s3://"):
            logger.info(f"Querying all PDFs at {args.add_pdfs}")

            all_pdfs = expand_s3_glob(pdf_s3, args.add_pdfs)
            logger.info(f"Found {len(all_pdfs):,} total pdf paths")
        elif os.path.exists(args.add_pdfs):
            with open(args.add_pdfs, "r") as f:
                all_pdfs = [line.strip() for line in f.readlines() if len(line.strip()) > 0]
        else:
            raise ValueError("add_pdfs argument needs to be either an s3 glob search path, or a local file contains pdf paths (one per line)")

        all_pdfs = [pdf for pdf in all_pdfs if not db.pdf_exists(pdf)]
        logger.info(f"Need to import {len(all_pdfs):,} total new pdf paths")

        future_to_path = {executor.submit(get_pdf_num_pages, s3_path): s3_path for s3_path in all_pdfs}
        for future in tqdm(as_completed(future_to_path), total=len(future_to_path), desc="Adding PDFs"):
            s3_path = future_to_path[future]
            num_pages = future.result()
            logger.debug(f"Got {num_pages} pages back for {s3_path}")
            if num_pages and not db.pdf_exists(s3_path):
                db.add_pdf(s3_path, num_pages, "pending")

        logger.info("Completed adding new PDFs.")

    # Now build an index of all the pages that were processed within the workspace so far
    logger.info("Indexing all batch inference sent to this workspace")
    inference_output_paths = expand_s3_glob(workspace_s3, f"{args.workspace}/inference_outputs/*.jsonl")

    inference_output_paths = {
        s3_path: etag for s3_path, etag in inference_output_paths.items()
        if not db.is_file_processed(s3_path, etag)
    }

    logger.info(f"Found {len(inference_output_paths):,} new batch inference results to index")
    future_to_path = {executor.submit(process_jsonl_content, s3_path): (s3_path, etag) for s3_path, etag in inference_output_paths.items()}

    for future in tqdm(as_completed(future_to_path), total=len(future_to_path), desc="Indexing Inference Results"):
        s3_path, etag = future_to_path.pop(future)
        try:
            inference_records = future.result()

            db.delete_index_entries_by_inference_s3_path(s3_path)
            db.add_index_entries(inference_records)
            db.update_processed_file(s3_path, etag=etag)
        except urllib3.exceptions.SSLError:
            logger.warning(f"Cannot load inference file {s3_path} due to SSL error, will retry another time")
        except Exception as e:
            logger.exception(f"Failed to index inference file {s3_path}: {e}")

    # Now query each pdf, if you have all of the pages needed (all pages present, error is null and finish_reason is stop), then you assemble it into a dolma document and output it
    # If you don't have every page, or if you have pages with errors, then you output a new batch of inference items to use
    if db.get_last_indexed_round() < current_round - 1:
        logger.warning(f"WARNING: No new batch inference results found, you need to run batch inference on {args.workspace}/inference_inputs/round_{current_round - 1}")
        potentially_done_pdfs = db.get_pdfs_by_status("pending")
    elif args.skip_build_queries:
        logger.info(f"Skipping generating new batch inference files")
        potentially_done_pdfs = db.get_pdfs_by_status("pending")
    else:
        logger.info("Creating batch inference files for new PDFs")
        pdf_list = list(db.get_pdfs_by_status("pending"))
        pdf_iter = iter(pdf_list)
        pending_futures = {}
        potentially_done_pdfs = []
        lines_written = 0
        new_inference_writer = BatchWriter(f"{args.workspace}/inference_inputs/round_{current_round}", args.max_size_mb)
        total_pdfs = len(pdf_list)
        max_pending = 300

        with tqdm(total=total_pdfs, desc="Building Batch Queries") as pbar:
            # Submit initial batch of futures
            for _ in range(min(max_pending, total_pdfs)):
                pdf = next(pdf_iter)
                future = executor.submit(
                    build_pdf_queries, args.workspace, pdf, current_round, args.target_longest_image_dim,args.target_anchor_text_len,
                )
                pending_futures[future] = pdf

            while pending_futures:
                # Wait for the next future to complete
                done, _ = concurrent.futures.wait(
                    pending_futures.keys(),
                    return_when=concurrent.futures.FIRST_COMPLETED,
                )

                for future in done:
                    pdf = pending_futures.pop(future)
                    inference_lines = future.result()

                    if len(inference_lines) == 0:
                        potentially_done_pdfs.append(pdf)

                    for line in inference_lines:
                        lines_written += 1

                        if line is not None:
                            new_inference_writer.write_line(line)

                    pbar.update(1)

                    # Submit a new future if there are more PDFs
                    try:
                        pdf = next(pdf_iter)
                        future = executor.submit(
                            build_pdf_queries, args.workspace, pdf, current_round, args.target_longest_image_dim,args.target_anchor_text_len,
                        )
                        pending_futures[future] = pdf
                    except StopIteration:
                        pass  # No more PDFs to process

        new_inference_writer.close()

        if lines_written > 0:
            logger.info(f"Added {lines_written:,} new batch inference requests")

    # Now, finally, assemble any potentially done docs into dolma documents
    logger.info(f"Assembling potentially finished PDFs into Dolma documents at {args.workspace}/output")
    future_to_path = {executor.submit(build_dolma_doc, args.workspace, pdf): pdf for pdf in potentially_done_pdfs}
    new_output_writer = BatchWriter(f"{args.workspace}/output", args.max_size_mb, after_flush=partial(mark_pdfs_done, args.workspace))

    for future in tqdm(as_completed(future_to_path), total=len(future_to_path), desc="Assembling Dolma Docs"):
        pdf = future_to_path.pop(future)
        dolma_doc = future.result()

        if dolma_doc is not None:
            new_output_writer.write_line(dolma_doc)

    new_output_writer.close()

    logger.info("Final statistics:")

    # Output the number of PDFs in each status "pending" and "completed"
    pending_pdfs = db.get_pdfs_by_status("pending")
    completed_pdfs = db.get_pdfs_by_status("completed")

    logger.info(f"Pending PDFs: {len(pending_pdfs):,} ({sum(doc.num_pages for doc in pending_pdfs):,} pages)")
    logger.info(f"Completed PDFs: {len(completed_pdfs):,} ({sum(doc.num_pages for doc in completed_pdfs):,} pages)")

    # For each round, outputs a report of how many pages were processed, how many had errors, and a breakdown by (error, finish_reason)
    total_rounds = db.get_last_indexed_round() + 1
    for round_num in range(total_rounds):
        db.cursor.execute("""
            SELECT COUNT(*), error, finish_reason
            FROM page_results
            WHERE round = ?
            GROUP BY error, finish_reason
        """, (round_num,))

        results = db.cursor.fetchall()

        total_pages = sum(count for count, _, _ in results)
        logger.info(f"Inference Round {round_num} - {total_pages:,} pages processed:")

        for count, error, finish_reason in results:
            error_str = error if error is not None else "None"
            logger.info(f" (error: {error_str}, finish_reason: {finish_reason}) -> {count:,} pages")

    logger.info("Work finished, waiting for all workers to finish cleaning up")
    executor.shutdown(wait=True)
    db.close()
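For readers skimming this diff, here is a minimal usage sketch of the BatchWriter class removed above. The workspace prefix, record fields, and page range below are illustrative placeholders, not values from the repository: the writer buffers JSON-serializable records into a temporary file, rolls the batch over into a content-hashed output_<sha1>.jsonl part once the size cap is exceeded, and uploads or moves each finished part on a background thread.

# Hypothetical sketch only; assumes the BatchWriter defined in the removed file above.
writer = BatchWriter("s3://example-bucket/workspace/inference_inputs/round_0", max_size_mb=250)
for query in ({"custom_id": f"s3://example-bucket/doc.pdf-{page}", "round": 0} for page in range(1, 4)):
    writer.write_line(query)   # buffered; a full batch is flushed and uploaded on a background thread
writer.close()                 # flushes the final partial batch and joins all upload threads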
@@ -1,116 +0,0 @@
import concurrent.futures
import threading
import queue

class CappedFuture(concurrent.futures.Future):
    def __init__(self, semaphore):
        super().__init__()
        self._semaphore = semaphore
        self._result_retrieved = False
        self._underlying_future = None
        self._condition = threading.Condition()

    def set_underlying_future(self, underlying_future):
        with self._condition:
            self._underlying_future = underlying_future
            # Transfer the result when the underlying future completes
            underlying_future.add_done_callback(self._transfer_result)

    def _transfer_result(self, underlying_future):
        if underlying_future.cancelled():
            self.set_cancelled()
        elif underlying_future.exception() is not None:
            self.set_exception(underlying_future.exception())
        else:
            try:
                result = underlying_future.result()
                self.set_result(result)
            except Exception as e:
                self.set_exception(e)

    def result(self, timeout=None):
        res = super().result(timeout)
        self._release_semaphore()
        return res

    def exception(self, timeout=None):
        exc = super().exception(timeout)
        self._release_semaphore()
        return exc

    def _release_semaphore(self):
        if not self._result_retrieved:
            self._result_retrieved = True
            self._semaphore.release()

    def cancel(self):
        with self._condition:
            if self._underlying_future is not None:
                cancelled = self._underlying_future.cancel()
                if cancelled:
                    super().cancel()
                return cancelled
            else:
                # Task has not been submitted yet; cancel directly
                return super().cancel()

    def cancelled(self):
        return super().cancelled()

    def running(self):
        with self._condition:
            if self._underlying_future is not None:
                return self._underlying_future.running()
            else:
                return False

    def done(self):
        return super().done()

class CappedProcessPoolExecutor(concurrent.futures.Executor):
    def __init__(self, max_unprocessed=100, max_workers=None):
        self._max_unprocessed = max_unprocessed
        self._semaphore = threading.BoundedSemaphore(max_unprocessed)
        self._task_queue = queue.Queue()
        self._shutdown = threading.Event()
        self._shutdown_lock = threading.Lock()
        self._executor = concurrent.futures.ProcessPoolExecutor(max_workers=max_workers)
        self._worker_thread = threading.Thread(target=self._worker)
        self._worker_thread.daemon = True
        self._worker_thread.start()

    def submit(self, fn, *args, **kwargs):
        if self._shutdown.is_set():
            raise RuntimeError('Cannot submit new tasks after shutdown')
        # Create a CappedFuture to return to the user
        user_future = CappedFuture(self._semaphore)
        # Put the task in the queue
        self._task_queue.put((user_future, fn, args, kwargs))
        return user_future

    def _worker(self):
        while True:
            if self._shutdown.is_set() and self._task_queue.empty():
                break
            try:
                user_future, fn, args, kwargs = self._task_queue.get(timeout=0.1)
            except queue.Empty:
                continue
            self._semaphore.acquire()
            if user_future.cancelled():
                self._semaphore.release()
                continue
            # Submit the task to the underlying executor
            try:
                underlying_future = self._executor.submit(fn, *args, **kwargs)
                user_future.set_underlying_future(underlying_future)
            except Exception as e:
                user_future.set_exception(e)
                self._semaphore.release()
                continue

    def shutdown(self, wait=True):
        with self._shutdown_lock:
            self._shutdown.set()
        self._worker_thread.join()
        self._executor.shutdown(wait=wait)
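A minimal sketch of how the CappedProcessPoolExecutor removed above is meant to be driven; the square function and the sizes below are illustrative assumptions, not code from the repository. submit() returns immediately, while the internal bounded semaphore lets at most max_unprocessed tasks sit in the underlying process pool with unretrieved results; retrieving a result frees a slot so the feeder thread can admit the next queued task.

# Hypothetical sketch only; `square` stands in for any picklable top-level function.
def square(x):
    return x * x

executor = CappedProcessPoolExecutor(max_unprocessed=100, max_workers=8)
futures = [executor.submit(square, i) for i in range(1000)]
for fut in futures:
    _ = fut.result()           # consuming a result releases a semaphore slot, admitting another task
executor.shutdown(wait=True)   # drains the internal queue, then shuts down the process pool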
@@ -15,7 +15,7 @@ from olmocr.s3_utils import (
    upload_zstd_csv,
    parse_s3_path
)
from pypdf import PdfReader


logger = logging.getLogger(__name__)
@@ -1,99 +0,0 @@
import unittest
import time
import concurrent.futures
from concurrent.futures import TimeoutError

# Assuming the CappedProcessPoolExecutor code is in a module named 'capped_executor'
from olmocr.cappedpool import CappedProcessPoolExecutor

# Define functions at the top level to ensure they are picklable by multiprocessing

def square(x):
    return x * x

def raise_exception():
    raise ValueError("Test exception")

def sleep_and_return(x, sleep_time):
    time.sleep(sleep_time)
    return x

def task(counter, max_counter, counter_lock):
    with counter_lock:
        counter.value += 1
        print(f"Task incrementing counter to {counter.value}")
        if counter.value > max_counter.value:
            max_counter.value = counter.value
    time.sleep(0.5)
    with counter_lock:
        counter.value -= 1
    return True

class TestCappedProcessPoolExecutor(unittest.TestCase):

    def test_basic_functionality(self):
        """Test that tasks are executed and results are correct."""
        with CappedProcessPoolExecutor(max_unprocessed=10, max_workers=4) as executor:
            futures = [executor.submit(square, i) for i in range(10)]
            results = [f.result() for f in futures]
            expected = [i * i for i in range(10)]
            self.assertEqual(results, expected)

    def test_exception_handling(self):
        """Test that exceptions in tasks are properly raised."""
        with CappedProcessPoolExecutor(max_unprocessed=10, max_workers=4) as executor:
            future = executor.submit(raise_exception)
            with self.assertRaises(ValueError):
                future.result()

    def test_cancellation(self):
        """Test that tasks can be cancelled before execution."""
        with CappedProcessPoolExecutor(max_unprocessed=10, max_workers=4) as executor:
            future = executor.submit(time.sleep, 5)
            # Try to cancel immediately
            cancelled = future.cancel()
            self.assertTrue(cancelled)
            self.assertTrue(future.cancelled())
            # Attempt to get result; should raise CancelledError
            with self.assertRaises(concurrent.futures.CancelledError):
                future.result()

    def test_shutdown(self):
        """Test that the executor shuts down properly and does not accept new tasks."""
        executor = CappedProcessPoolExecutor(max_unprocessed=10, max_workers=4)
        future = executor.submit(time.sleep, 1)
        executor.shutdown(wait=True)
        with self.assertRaises(RuntimeError):
            executor.submit(time.sleep, 1)

    def test_capping_behavior(self):
        """Test that the number of concurrent tasks does not exceed max_unprocessed."""
        max_unprocessed = 3
        with CappedProcessPoolExecutor(max_unprocessed=max_unprocessed, max_workers=10) as executor:
            from multiprocessing import Manager

            manager = Manager()
            counter = manager.Value('i', 0)
            max_counter = manager.Value('i', 0)
            counter_lock = manager.Lock()

            futures = [executor.submit(task, counter, max_counter, counter_lock) for _ in range(10)]

            for index, f in enumerate(futures):
                print(f"Future {index} returned {f.result()}")

            time.sleep(1)

            print(max_counter.value)
            self.assertLessEqual(max_counter.value, max_unprocessed)

    def test_submit_after_shutdown(self):
        """Test that submitting tasks after shutdown raises an error."""
        executor = CappedProcessPoolExecutor(max_unprocessed=10, max_workers=4)
        executor.shutdown(wait=True)
        with self.assertRaises(RuntimeError):
            executor.submit(square, 2)


if __name__ == '__main__':
    unittest.main()
@@ -17,7 +17,7 @@ from io import BytesIO
 from PIL import Image
 from transformers import AutoProcessor, AutoTokenizer, Qwen2VLForConditionalGeneration
 from pathlib import Path
-from olmocr.beakerpipeline import sglang_server_task, sglang_server_ready, build_page_query, SGLANG_SERVER_PORT, render_pdf_to_base64png, get_anchor_text, download_directory
+from olmocr.pipeline import sglang_server_task, sglang_server_ready, build_page_query, SGLANG_SERVER_PORT, render_pdf_to_base64png, get_anchor_text, download_directory
 from olmocr.prompts import PageResponse
 from httpx import AsyncClient
 import torch.nn.functional as F