Cleaning up dependencies

2025-10-19 03:59:09 +00:00 · 2025-10-15 19:40:58 +00:00 · 2025-10-15 19:40:58 +00:00 · e2a5d9f8f3
commit e2a5d9f8f3
parent 569311c461
3 changed files with 6 additions and 9 deletions
--- a/olmocr/data/prepare_loc_transcripts.py
+++ b/olmocr/data/prepare_loc_transcripts.py
@ -4,7 +4,6 @@
 # This script will go through each CSV file, convert each page to PDF format, clean up the transcription using a grounded prompt in chatgpt-4o
 # and then output data in olmocr-format, where you have a .md file and a .pdf file named with the ItemID in a folder structure for
 # each initial CSV
-# We use https://pypi.org/project/img2pdf/ to convert the images to PDFs losslessly.

 import argparse
 import csv
@ -14,8 +13,8 @@ import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
 from typing import Dict, Optional, Set, Tuple
+from olmocr.image_utils import convert_image_to_pdf_bytes

-import img2pdf
 import requests
 from tqdm import tqdm

@ -56,10 +55,10 @@ def download_image(url: str, output_path: Path, max_retries: int = 3) -> bool:


 def convert_image_to_pdf(image_path: Path, pdf_path: Path) -> bool:
-    """Convert image to PDF using img2pdf."""
+    """Convert image to PDF."""
    try:
        with open(pdf_path, "wb") as f:
-            f.write(img2pdf.convert(str(image_path)))
+            f.write(convert_image_to_pdf_bytes(str(image_path)))
        return True
    except Exception as e:
        print(f"Failed to convert {image_path} to PDF: {e}")
--- a/olmocr/data/prepare_national_archive_transcripts.py
+++ b/olmocr/data/prepare_national_archive_transcripts.py
@ -82,7 +82,6 @@
 # Then, for each image, which is typically a scanned document page, we create a dataset in olmocr-format, where you have a .md file and a .pdf file named with the ItemID in a folder structure for
 # each initial jsonl file. Ex if you had rg_341/rg_341-53.jsonl, then you'd make rg_341/object_id.md and rg_341/object_id.pdf
 # If you have a TIFF file, you can compress it to jpg at 98% quality, targeting around 1-2MB in size.
-# Then use https://pypi.org/project/img2pdf/ to convert the images to PDFs losslessly.
 # Output files are named as naId-objectId-page-pageNum.{md,pdf} based on the target object from transcriptions.
 # Each JSONL file gets its own subfolder for organization.

@ -94,11 +93,11 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
 from typing import Dict, Optional, Set, Tuple

-import img2pdf
 import requests
 from PIL import Image
 from tqdm import tqdm

+from olmocr.image_utils import convert_image_to_pdf_bytes

 def download_image(url: str, output_path: Path, max_retries: int = 5) -> bool:
    """Download image from URL with exponential backoff retry logic."""
@ -152,7 +151,7 @@ def process_image_file(image_path: Path, output_path: Path, target_size_mb: floa

            # Convert JPEG to PDF
            with open(output_path, "wb") as f:
-                f.write(img2pdf.convert(str(temp_jpg)))
+                f.write(convert_image_to_pdf_bytes(str(temp_jpg)))

            # Clean up temp file
            temp_jpg.unlink(missing_ok=True)
@ -160,7 +159,7 @@ def process_image_file(image_path: Path, output_path: Path, target_size_mb: floa
        else:
            # For other formats, convert directly to PDF
            with open(output_path, "wb") as f:
-                f.write(img2pdf.convert(str(image_path)))
+                f.write(convert_image_to_pdf_bytes(str(image_path)))

        return True
    except Exception as e:
--- a/pyproject.toml
+++ b/pyproject.toml
@ -38,7 +38,6 @@ dependencies = [
  "httpx",
  "torch>=2.7.0",
  "transformers==4.55.2",
-  "img2pdf",
  "beaker-py",
 ]
 license = {file = "LICENSE"}