Hopefully can use weka for the train datasets now

This commit is contained in:
Jake Poznanski 2024-10-07 16:14:28 +00:00
parent d8e459c9f3
commit 44bcdc771b
2 changed files with 6 additions and 5 deletions

View File

@@ -29,11 +29,11 @@ train_data:
seed: 1337
sources:
- name: openai_batch_data_v5_1_train
query_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_train/*.jsonl
response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_train_done/*.json
query_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_train/*.jsonl
response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_train_done/*.json
- name: openai_batch_data_v5_1_train
query_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_train/*.jsonl
response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_train_done/*.json
query_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_train/*.jsonl
response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_train_done/*.json
valid_data:
metric_for_best_model: openai_batch_data_v5_1_eval_loss

View File

@@ -19,7 +19,7 @@ run_name=$(basename "$0" .sh)
# --cluster 'ai2/allennlp-cirrascale' \
# --priority high \
CLUSTER='pluto'
CLUSTER='jupiter'
gantry run \
--description "${run_name}"\
@@ -35,6 +35,7 @@ gantry run \
--preemptible \
--cluster "ai2/${CLUSTER}*" \
--budget ai2/oe-data \
--weka "oe-data-default:/data" \
--env LOG_FILTER_TYPE=local_rank0_only \
--env OMP_NUM_THREADS=8 \
--env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \