mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-16 18:39:29 +00:00
I think I have error handling better now
This commit is contained in:
parent
8217e49153
commit
2c7686f8ff
@ -366,11 +366,12 @@ async def process_pdf(args, session: aiohttp.ClientSession, worker_id: int, pdf_
|
|||||||
|
|
||||||
# Collect the results from the entire task group, assuming no exceptions
|
# Collect the results from the entire task group, assuming no exceptions
|
||||||
page_results = [task.result() for task in page_tasks]
|
page_results = [task.result() for task in page_tasks]
|
||||||
|
return build_dolma_document(pdf_s3_path, page_results)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.exception(f"Exception in process_pdf for {pdf_s3_path}: {e}")
|
logger.exception(f"Exception in process_pdf for {pdf_s3_path}: {e}")
|
||||||
raise
|
# You can't build a dolma doc with even 1 failed page, so just get out of here
|
||||||
|
# However, you don't want to propagate an exception higher up and cancel the entire work_group
|
||||||
return build_dolma_document(pdf_s3_path, page_results)
|
return None
|
||||||
|
|
||||||
|
|
||||||
def build_dolma_document(pdf_s3_path, page_results):
|
def build_dolma_document(pdf_s3_path, page_results):
|
||||||
@ -435,7 +436,16 @@ async def worker(args, queue, semaphore, worker_id):
|
|||||||
async with asyncio.TaskGroup() as tg:
|
async with asyncio.TaskGroup() as tg:
|
||||||
dolma_tasks = [tg.create_task(process_pdf(args, session, worker_id, pdf)) for pdf in pdfs]
|
dolma_tasks = [tg.create_task(process_pdf(args, session, worker_id, pdf)) for pdf in pdfs]
|
||||||
|
|
||||||
dolma_docs = [task.result() for task in dolma_tasks if task.result() is not None]
|
dolma_docs = []
|
||||||
|
for task in dolma_tasks:
|
||||||
|
try:
|
||||||
|
result = task.result()
|
||||||
|
except:
|
||||||
|
# some dolma doc creations may have failed
|
||||||
|
pass
|
||||||
|
|
||||||
|
if result is not None:
|
||||||
|
dolma_docs.append(result)
|
||||||
|
|
||||||
# Write the Dolma documents to a local temporary file in JSONL format
|
# Write the Dolma documents to a local temporary file in JSONL format
|
||||||
with tempfile.NamedTemporaryFile(mode='w+', delete=False) as tf:
|
with tempfile.NamedTemporaryFile(mode='w+', delete=False) as tf:
|
||||||
|
|||||||
@ -2,7 +2,7 @@ _MAJOR = "0"
|
|||||||
_MINOR = "1"
|
_MINOR = "1"
|
||||||
# On main and in a nightly release the patch should be one ahead of the last
|
# On main and in a nightly release the patch should be one ahead of the last
|
||||||
# released build.
|
# released build.
|
||||||
_PATCH = "12"
|
_PATCH = "13"
|
||||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||||
_SUFFIX = ""
|
_SUFFIX = ""
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user