diff --git a/olmocr/data/prepare_loc_transcripts.py b/olmocr/data/prepare_loc_transcripts.py index b26d6e1..366c2b5 100644 --- a/olmocr/data/prepare_loc_transcripts.py +++ b/olmocr/data/prepare_loc_transcripts.py @@ -4,7 +4,6 @@ # This script will go through each CSV file, convert each page to PDF format, clean up the transcription using a grounded prompt in chatgpt-4o # and then output data in olmocr-format, where you have a .md file and a .pdf file named with the ItemID in a folder structure for # each initial CSV -# We use https://pypi.org/project/img2pdf/ to convert the images to PDFs losslessly. import argparse import csv @@ -14,8 +13,8 @@ import time from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path from typing import Dict, Optional, Set, Tuple +from olmocr.image_utils import convert_image_to_pdf_bytes -import img2pdf import requests from tqdm import tqdm @@ -56,10 +55,10 @@ def download_image(url: str, output_path: Path, max_retries: int = 3) -> bool: def convert_image_to_pdf(image_path: Path, pdf_path: Path) -> bool: - """Convert image to PDF using img2pdf.""" + """Convert image to PDF.""" try: with open(pdf_path, "wb") as f: - f.write(img2pdf.convert(str(image_path))) + f.write(convert_image_to_pdf_bytes(str(image_path))) return True except Exception as e: print(f"Failed to convert {image_path} to PDF: {e}") diff --git a/olmocr/data/prepare_national_archive_transcripts.py b/olmocr/data/prepare_national_archive_transcripts.py index bd4905f..40986df 100644 --- a/olmocr/data/prepare_national_archive_transcripts.py +++ b/olmocr/data/prepare_national_archive_transcripts.py @@ -82,7 +82,6 @@ # Then, for each image, which is typically a scanned document page, we create a dataset in olmocr-format, where you have a .md file and a .pdf file named with the ItemID in a folder structure for # each initial jsonl file. Ex if you had rg_341/rg_341-53.jsonl, then you'd make rg_341/object_id.md and rg_341/object_id.pdf # If you have a TIFF file, you can compress it to jpg at 98% quality, targetting around 1-2MB in size. -# Then use https://pypi.org/project/img2pdf/ to convert the images to PDFs losslessly. # Output files are named as naId-objectId-page-pageNum.{md,pdf} based on the target object from transcriptions. # Each JSONL file gets its own subfolder for organization. @@ -94,11 +93,11 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path from typing import Dict, Optional, Set, Tuple -import img2pdf import requests from PIL import Image from tqdm import tqdm +from olmocr.image_utils import convert_image_to_pdf_bytes def download_image(url: str, output_path: Path, max_retries: int = 5) -> bool: """Download image from URL with exponential backoff retry logic.""" @@ -152,7 +151,7 @@ def process_image_file(image_path: Path, output_path: Path, target_size_mb: floa # Convert JPEG to PDF with open(output_path, "wb") as f: - f.write(img2pdf.convert(str(temp_jpg))) + f.write(convert_image_to_pdf_bytes(str(temp_jpg))) # Clean up temp file temp_jpg.unlink(missing_ok=True) @@ -160,7 +159,7 @@ def process_image_file(image_path: Path, output_path: Path, target_size_mb: floa else: # For other formats, convert directly to PDF with open(output_path, "wb") as f: - f.write(img2pdf.convert(str(image_path))) + f.write(convert_image_to_pdf_bytes(str(image_path))) return True except Exception as e: diff --git a/pyproject.toml b/pyproject.toml index 64a9bf7..ecd5471 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,6 @@ dependencies = [ "httpx", "torch>=2.7.0", "transformers==4.55.2", - "img2pdf", "beaker-py", ] license = {file = "LICENSE"}