mirror of
https://github.com/allenai/olmocr.git
synced 2025-08-19 14:22:26 +00:00
More realistic results
This commit is contained in:
parent
770da2b7ae
commit
ae1e4bc07e
@ -244,8 +244,8 @@ async def load_pdf_work_queue(args) -> asyncio.Queue:
|
||||
}
|
||||
|
||||
# Determine remaining work
|
||||
remaining_work_hashes = set(work_queue) - done_work_hashes
|
||||
#remaining_work_hashes = set(["0e779f21fbb75d38ed4242c7e5fe57fa9a636bac"]) # If you want to debug with a specific work hash
|
||||
#remaining_work_hashes = set(work_queue) - done_work_hashes
|
||||
remaining_work_hashes = set(["0e779f21fbb75d38ed4242c7e5fe57fa9a636bac"]) # If you want to debug with a specific work hash
|
||||
remaining_work_queue = {
|
||||
hash_: work_queue[hash_]
|
||||
for hash_ in remaining_work_hashes
|
||||
@ -443,7 +443,7 @@ async def worker(args, queue, semaphore, worker_id):
|
||||
else:
|
||||
logger.info(f"Proceeding with {work_hash} on worker {worker_id}")
|
||||
|
||||
async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=60),
|
||||
async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=600),
|
||||
connector=aiohttp.TCPConnector(limit=1000)) as session:
|
||||
async with asyncio.TaskGroup() as tg:
|
||||
dolma_tasks = [tg.create_task(process_pdf(args, session, worker_id, pdf)) for pdf in pdfs]
|
||||
|
@ -2,7 +2,7 @@ _MAJOR = "0"
|
||||
_MINOR = "1"
|
||||
# On main and in a nightly release the patch should be one ahead of the last
|
||||
# released build.
|
||||
_PATCH = "16"
|
||||
_PATCH = "17"
|
||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||
_SUFFIX = ""
|
||||
|
Loading…
x
Reference in New Issue
Block a user