From b67d8e75552a7f4a8dc6cd97f33d84b1ea5857a8 Mon Sep 17 00:00:00 2001
From: Jake Poznanski
Date: Thu, 14 Nov 2024 12:48:46 -0800
Subject: [PATCH] Fixing work queue population

---
 pdelfin/beakerpipeline.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pdelfin/beakerpipeline.py b/pdelfin/beakerpipeline.py
index affba22..aa703a5 100644
--- a/pdelfin/beakerpipeline.py
+++ b/pdelfin/beakerpipeline.py
@@ -194,7 +194,7 @@ async def populate_pdf_work_queue(args):
 
 async def load_pdf_work_queue(args) -> asyncio.Queue:
     index_file_s3_path = os.path.join(args.workspace, "pdf_index_list.csv.zstd")
-    output_glob = f"{args.workspace}/dolma_documents/output_*.jsonl"
+    output_glob = os.path.join(args.workspace, "dolma_documents", "*.jsonl")
 
     # Define the two blocking I/O operations
     download_task = asyncio.to_thread(download_zstd_csv, workspace_s3, index_file_s3_path)