mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-01 10:33:57 +00:00
Fix
This commit is contained in:
parent
f25cb6c261
commit
db36608b42
@ -100,7 +100,7 @@ def get_state(folder_path: str) -> dict:
|
||||
} for f in jsonl_files}
|
||||
|
||||
with open(state_file, "w") as f:
|
||||
return json.dump(state, f)
|
||||
json.dump(state, f)
|
||||
|
||||
return state
|
||||
|
||||
@ -129,7 +129,15 @@ def get_next_work_item(folder_path):
|
||||
|
||||
return all_states[0] if len(all_states) > 0 else None
|
||||
|
||||
def get_done_total(folder_path):
    """Count finished and overall work items for a folder.

    Looks up the per-file state records via ``get_state(folder_path)`` and
    returns a ``(done, total)`` tuple, where ``done`` is the number of
    records whose ``"state"`` value is in ``FINISHED_STATES`` and ``total``
    is the number of records overall.
    """
    records = get_state(folder_path).values()
    # Tally only the records that have reached one of the finished states.
    finished = sum(1 for record in records if record["state"] in FINISHED_STATES)
    return finished, len(records)
|
||||
|
||||
# Main function to process all .jsonl files in a folder
|
||||
def process_folder(folder_path: str, max_gb: int):
|
||||
@ -142,6 +150,9 @@ def process_folder(folder_path: str, max_gb: int):
|
||||
raise ValueError(f"Insufficient free space in OpenAI's file storage: Only {starting_free_space} GB left, but 2x{max_gb} GB are required (1x for your uploads, 1x for your results).")
|
||||
|
||||
while not all(state["state"] in FINISHED_STATES for state in get_state(folder_path).values()):
|
||||
done, total = get_done_total(folder_path)
|
||||
print(f"Total items {total}, done {done}, {done/total*100:.1f}%")
|
||||
|
||||
work_item = get_next_work_item(folder_path)
|
||||
print(f"Processing {os.path.basename(work_item['filename'])}, cur status = {work_item['state']}")
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user