From b64fd19db3cf2d1abcd1164d6f69a2daef8a1c8a Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 31 Mar 2025 13:28:30 -0700 Subject: [PATCH] Cleaning up code for image to pdf conversion --- olmocr/bench/convert.py | 42 ++++++++++++++++++++++++++-------- olmocr/image_utils.py | 50 +++++++++++++++++++++++++++++------------ olmocr/pipeline.py | 12 +++++++--- 3 files changed, 78 insertions(+), 26 deletions(-) diff --git a/olmocr/bench/convert.py b/olmocr/bench/convert.py index f5a65f8..46c2941 100644 --- a/olmocr/bench/convert.py +++ b/olmocr/bench/convert.py @@ -7,11 +7,11 @@ import os import tempfile from functools import partial -import img2pdf from pypdf import PdfReader from tqdm import tqdm from olmocr.data.renderpdf import render_pdf_to_base64png +from olmocr.image_utils import convert_image_to_pdf_bytes def parse_method_arg(method_arg): @@ -116,15 +116,39 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats, remove_te pdf_relative_dir = os.path.dirname(relative_pdf_path) if remove_text: - page_images = [] - for page_num in range(1, num_pages + 1): - page_images.append(render_pdf_to_base64png(pdf_path, page_num, target_longest_image_dim=2048)) - print(f"Converting {pdf_path} into images to remove text-content...") - temp_pdf = tempfile.NamedTemporaryFile("wb", suffix=".pdf", delete=False) - temp_pdf.write(img2pdf.convert([base64.b64decode(x) for x in page_images])) - temp_pdf.flush() - pdf_path = temp_pdf.name + + # Generate image files from each page + temp_image_files = [] + try: + for page_num in range(1, num_pages + 1): + # Get base64 PNG data for the current page + base64_png = render_pdf_to_base64png(pdf_path, page_num, target_longest_image_dim=2048) + + # Decode base64 and save to temporary file + temp_img = tempfile.NamedTemporaryFile("wb", suffix=".png", delete=False) + temp_img.write(base64.b64decode(base64_png)) + temp_img.close() + temp_image_files.append(temp_img.name) + + # Convert all images to a single PDF using our enhanced function + pdf_bytes = convert_image_to_pdf_bytes(temp_image_files) + + # Write the PDF bytes to a temporary file + temp_pdf = tempfile.NamedTemporaryFile("wb", suffix=".pdf", delete=False) + temp_pdf.write(pdf_bytes) + temp_pdf.close() + + # Update pdf_path to the new file + pdf_path = temp_pdf.name + + finally: + # Clean up temporary image files + for temp_file in temp_image_files: + try: + os.remove(temp_file) + except Exception as e: + print(f"Warning: Failed to remove temporary file {temp_file}: {e}") for repeat in range(1, repeats + 1): for page_num in range(1, num_pages + 1): diff --git a/olmocr/image_utils.py b/olmocr/image_utils.py index 960b505..f69df96 100644 --- a/olmocr/image_utils.py +++ b/olmocr/image_utils.py @@ -1,23 +1,45 @@ import os -import tempfile import subprocess +import tempfile +from typing import List, Union -def convert_image_to_pdf_bytes(image_file: str) -> bytes: +def convert_image_to_pdf_bytes(image_files: Union[str, List[str]]) -> bytes: + """ + Convert one or multiple image files to PDF bytes. + + Args: + image_files: A single image file path (str) or a list of image file paths + + Returns: + bytes: The PDF content as bytes + + Raises: + RuntimeError: If the conversion fails + ValueError: If invalid input is provided + """ + # Handle different input types + if isinstance(image_files, str): + # Single image case + image_files = [image_files] + elif not isinstance(image_files, list) or not image_files: + raise ValueError("image_files must be a non-empty string or list of strings") + + # Validate files exist and are valid image formats + for image_file in image_files: + if not os.path.exists(image_file): + raise ValueError(f"File does not exist: {image_file}") + try: - # Run img2pdf and capture its stdout directly as bytes - result = subprocess.run( - ["img2pdf", image_file], - check=True, - capture_output=True - ) - + # Run img2pdf with all images as arguments + result = subprocess.run(["img2pdf"] + image_files, check=True, capture_output=True) + # Return the stdout content which contains the PDF data return result.stdout - + except subprocess.CalledProcessError as e: # Raise error with stderr information if the conversion fails - raise RuntimeError(f"Error converting image to PDF: {e.stderr.decode('utf-8')}") + raise RuntimeError(f"Error converting image(s) to PDF: {e.stderr.decode('utf-8')}") def is_png(file_path): @@ -32,9 +54,9 @@ def is_png(file_path): def is_jpeg(file_path): try: - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: header = f.read(2) - return header == b'\xff\xd8' + return header == b"\xff\xd8" except Exception as e: print(f"Error: {e}") - return False \ No newline at end of file + return False diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 5bf30c3..daa2593 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -37,10 +37,10 @@ from olmocr.check import ( ) from olmocr.data.renderpdf import render_pdf_to_base64png from olmocr.filter.filter import Language, PdfFilter +from olmocr.image_utils import convert_image_to_pdf_bytes, is_jpeg, is_png from olmocr.metrics import MetricsKeeper, WorkerTracker from olmocr.prompts import PageResponse, build_finetuning_prompt from olmocr.prompts.anchor import get_anchor_text -from olmocr.image_utils import is_png, is_jpeg, convert_image_to_pdf_bytes from olmocr.s3_utils import ( download_zstd_csv, expand_s3_glob, @@ -89,7 +89,8 @@ process_pool = ProcessPoolExecutor(max_workers=min(multiprocessing.cpu_count() / # Filter object, cached so it will only get loaded when/if you need it get_pdf_filter = cache(lambda: PdfFilter(languages_to_keep={Language.ENGLISH, None}, apply_download_spam_check=True, apply_form_check=True)) -SGLANG_SERVER_PORT = None +# Specify a default port, but it can be overridden by args +SGLANG_SERVER_PORT = 30024 @dataclass(frozen=True) @@ -995,7 +996,12 @@ async def main(): logger.info(f"Expanding s3 glob at {pdf_path}") pdf_work_paths |= set(expand_s3_glob(pdf_s3, pdf_path)) elif os.path.exists(pdf_path): - if pdf_path.lower().endswith(".pdf") or pdf_path.lower().endswith(".png") or pdf_path.lower().endswith(".jpg") or pdf_path.lower().endswith(".jpeg"): + if ( + pdf_path.lower().endswith(".pdf") + or pdf_path.lower().endswith(".png") + or pdf_path.lower().endswith(".jpg") + or pdf_path.lower().endswith(".jpeg") + ): if open(pdf_path, "rb").read(4) == b"%PDF": logger.info(f"Loading file at {pdf_path} as PDF document") pdf_work_paths.add(pdf_path)