mirror of
https://github.com/allenai/olmocr.git
synced 2025-06-27 04:00:02 +00:00
Fixing #240
This commit is contained in:
parent
0257444720
commit
cbc4580b72
@ -329,7 +329,7 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path:
|
||||
|
||||
|
||||
async def process_pdf(args, worker_id: int, pdf_orig_path: str):
|
||||
with tempfile.NamedTemporaryFile("wb+", suffix=".pdf") as tf:
|
||||
with tempfile.NamedTemporaryFile("wb+", suffix=".pdf", delete=False) as tf:
|
||||
try:
|
||||
data = await asyncio.to_thread(lambda: get_s3_bytes_with_backoff(pdf_s3, pdf_orig_path))
|
||||
tf.write(data)
|
||||
@ -347,6 +347,7 @@ async def process_pdf(args, worker_id: int, pdf_orig_path: str):
|
||||
tf.write(convert_image_to_pdf_bytes(tf.name))
|
||||
tf.flush()
|
||||
|
||||
try:
|
||||
try:
|
||||
reader = PdfReader(tf.name)
|
||||
num_pages = reader.get_num_pages()
|
||||
@ -398,7 +399,9 @@ async def process_pdf(args, worker_id: int, pdf_orig_path: str):
|
||||
# You can't build a dolma doc with even 1 failed page, so just get out of here
|
||||
# However, you don't want to propagate an exception higher up and cancel the entire work_group
|
||||
return None
|
||||
|
||||
finally:
|
||||
if os.path.exists(tf.name):
|
||||
os.unlink(tf.name)
|
||||
|
||||
def build_dolma_document(pdf_orig_path, page_results):
|
||||
# Build the document text and page spans
|
||||
|
Loading…
x
Reference in New Issue
Block a user