diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 4f7d8f5..7ad2b3d 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -329,7 +329,7 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path: async def process_pdf(args, worker_id: int, pdf_orig_path: str): - with tempfile.NamedTemporaryFile("wb+", suffix=".pdf") as tf: + with tempfile.NamedTemporaryFile("wb+", suffix=".pdf", delete=False) as tf: try: data = await asyncio.to_thread(lambda: get_s3_bytes_with_backoff(pdf_s3, pdf_orig_path)) tf.write(data) @@ -347,6 +347,7 @@ async def process_pdf(args, worker_id: int, pdf_orig_path: str): tf.write(convert_image_to_pdf_bytes(tf.name)) tf.flush() + try: try: reader = PdfReader(tf.name) num_pages = reader.get_num_pages() @@ -398,7 +399,9 @@ async def process_pdf(args, worker_id: int, pdf_orig_path: str): # You can't build a dolma doc with even 1 failed page, so just get out of here # However, you don't want to propagate an exception higher up and cancel the entire work_group return None - + finally: + if os.path.exists(tf.name): + os.unlink(tf.name) def build_dolma_document(pdf_orig_path, page_results): # Build the document text and page spans