Fixing work queue population

This commit is contained in:
Jake Poznanski 2024-11-14 12:48:46 -08:00
parent 827b77e8df
commit b67d8e7555

View File

@@ -194,7 +194,7 @@ async def populate_pdf_work_queue(args):
async def load_pdf_work_queue(args) -> asyncio.Queue:
index_file_s3_path = os.path.join(args.workspace, "pdf_index_list.csv.zstd")
-output_glob = f"{args.workspace}/dolma_documents/output_*.jsonl"
+output_glob = os.path.join(args.workspace, "dolma_documents", "*.jsonl")
# Define the two blocking I/O operations
download_task = asyncio.to_thread(download_zstd_csv, workspace_s3, index_file_s3_path)