Cleaning up dependencies

This commit is contained in:
Jake Poznanski 2025-10-15 19:40:58 +00:00
parent 569311c461
commit e2a5d9f8f3
3 changed files with 6 additions and 9 deletions

View File

@ -4,7 +4,6 @@
# This script will go through each CSV file, convert each page to PDF format, clean up the transcription using a grounded prompt in chatgpt-4o # This script will go through each CSV file, convert each page to PDF format, clean up the transcription using a grounded prompt in chatgpt-4o
# and then output data in olmocr-format, where you have a .md file and a .pdf file named with the ItemID in a folder structure for # and then output data in olmocr-format, where you have a .md file and a .pdf file named with the ItemID in a folder structure for
# each initial CSV # each initial CSV
# We use https://pypi.org/project/img2pdf/ to convert the images to PDFs losslessly.
import argparse import argparse
import csv import csv
@ -14,8 +13,8 @@ import time
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path from pathlib import Path
from typing import Dict, Optional, Set, Tuple from typing import Dict, Optional, Set, Tuple
from olmocr.image_utils import convert_image_to_pdf_bytes
import img2pdf
import requests import requests
from tqdm import tqdm from tqdm import tqdm
@ -56,10 +55,10 @@ def download_image(url: str, output_path: Path, max_retries: int = 3) -> bool:
def convert_image_to_pdf(image_path: Path, pdf_path: Path) -> bool: def convert_image_to_pdf(image_path: Path, pdf_path: Path) -> bool:
"""Convert image to PDF using img2pdf.""" """Convert image to PDF."""
try: try:
with open(pdf_path, "wb") as f: with open(pdf_path, "wb") as f:
f.write(img2pdf.convert(str(image_path))) f.write(convert_image_to_pdf_bytes(str(image_path)))
return True return True
except Exception as e: except Exception as e:
print(f"Failed to convert {image_path} to PDF: {e}") print(f"Failed to convert {image_path} to PDF: {e}")

View File

@ -82,7 +82,6 @@
# Then, for each image, which is typically a scanned document page, we create a dataset in olmocr-format, where you have a .md file and a .pdf file named with the ItemID in a folder structure for # Then, for each image, which is typically a scanned document page, we create a dataset in olmocr-format, where you have a .md file and a .pdf file named with the ItemID in a folder structure for
# each initial jsonl file. Ex if you had rg_341/rg_341-53.jsonl, then you'd make rg_341/object_id.md and rg_341/object_id.pdf # each initial jsonl file. Ex if you had rg_341/rg_341-53.jsonl, then you'd make rg_341/object_id.md and rg_341/object_id.pdf
# If you have a TIFF file, you can compress it to jpg at 98% quality, targeting around 1-2MB in size. # If you have a TIFF file, you can compress it to jpg at 98% quality, targeting around 1-2MB in size.
# Then use https://pypi.org/project/img2pdf/ to convert the images to PDFs losslessly.
# Output files are named as naId-objectId-page-pageNum.{md,pdf} based on the target object from transcriptions. # Output files are named as naId-objectId-page-pageNum.{md,pdf} based on the target object from transcriptions.
# Each JSONL file gets its own subfolder for organization. # Each JSONL file gets its own subfolder for organization.
@ -94,11 +93,11 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path from pathlib import Path
from typing import Dict, Optional, Set, Tuple from typing import Dict, Optional, Set, Tuple
import img2pdf
import requests import requests
from PIL import Image from PIL import Image
from tqdm import tqdm from tqdm import tqdm
from olmocr.image_utils import convert_image_to_pdf_bytes
def download_image(url: str, output_path: Path, max_retries: int = 5) -> bool: def download_image(url: str, output_path: Path, max_retries: int = 5) -> bool:
"""Download image from URL with exponential backoff retry logic.""" """Download image from URL with exponential backoff retry logic."""
@ -152,7 +151,7 @@ def process_image_file(image_path: Path, output_path: Path, target_size_mb: floa
# Convert JPEG to PDF # Convert JPEG to PDF
with open(output_path, "wb") as f: with open(output_path, "wb") as f:
f.write(img2pdf.convert(str(temp_jpg))) f.write(convert_image_to_pdf_bytes(str(temp_jpg)))
# Clean up temp file # Clean up temp file
temp_jpg.unlink(missing_ok=True) temp_jpg.unlink(missing_ok=True)
@ -160,7 +159,7 @@ def process_image_file(image_path: Path, output_path: Path, target_size_mb: floa
else: else:
# For other formats, convert directly to PDF # For other formats, convert directly to PDF
with open(output_path, "wb") as f: with open(output_path, "wb") as f:
f.write(img2pdf.convert(str(image_path))) f.write(convert_image_to_pdf_bytes(str(image_path)))
return True return True
except Exception as e: except Exception as e:

View File

@ -38,7 +38,6 @@ dependencies = [
"httpx", "httpx",
"torch>=2.7.0", "torch>=2.7.0",
"transformers==4.55.2", "transformers==4.55.2",
"img2pdf",
"beaker-py", "beaker-py",
] ]
license = {file = "LICENSE"} license = {file = "LICENSE"}