mirror of
https://github.com/allenai/olmocr.git
synced 2025-12-06 04:01:29 +00:00
Prepping for more training
This commit is contained in:
parent
063be21287
commit
5c36c22bf7
@ -7,43 +7,35 @@ wandb:
|
|||||||
project: pdelfin
|
project: pdelfin
|
||||||
entity: ai2-llm
|
entity: ai2-llm
|
||||||
|
|
||||||
# TODO This is not used
|
|
||||||
format:
|
|
||||||
instruction_template: "Original:"
|
|
||||||
response_template: "Rewritten:"
|
|
||||||
# Template from here: https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.py#L30
|
|
||||||
chat_template: |
|
|
||||||
{% for message in messages %}
|
|
||||||
{{'<|im_start|>' + message['role'] + '\n' + message['content']}}
|
|
||||||
{% if loop.last %}
|
|
||||||
{{ '<|im_end|>'}}
|
|
||||||
{% else %}
|
|
||||||
{{ '<|im_end|>\n' }}
|
|
||||||
{% endif %}
|
|
||||||
{% endfor %}
|
|
||||||
|
|
||||||
generate:
|
generate:
|
||||||
max_length: 4096
|
max_length: 8192
|
||||||
|
|
||||||
train_data:
|
train_data:
|
||||||
seed: 1337
|
seed: 1337
|
||||||
|
cache_location: /data/jakep/pdfdata/pdelfin_cache
|
||||||
sources:
|
sources:
|
||||||
# These tend to be really big, so it's only practical to host them as parquets on weka, otherwise you may OOM or just never finish dataloading
|
|
||||||
- name: openai_batch_data_v5_1_train
|
- name: openai_batch_data_v5_1_train
|
||||||
parquet_path: /data/jakep/pdfdata/openai_batch_data_v5_1_parquet/*.parquet
|
response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_train_done/*.json
|
||||||
- name: openai_batch_data_v5_1_train
|
target_longest_image_dim: 1024
|
||||||
parquet_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_parquet/*.parquet
|
target_anchor_text_len: 6000
|
||||||
|
- name: openai_batch_data_v5_1_iabooks_train
|
||||||
|
response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_train_done/*.json
|
||||||
|
target_longest_image_dim: 1024
|
||||||
|
target_anchor_text_len: 6000
|
||||||
|
|
||||||
valid_data:
|
valid_data:
|
||||||
|
cache_location: /data/jakep/pdfdata/pdelfin_cache
|
||||||
metric_for_best_model: openai_batch_data_v5_1_eval_loss
|
metric_for_best_model: openai_batch_data_v5_1_eval_loss
|
||||||
sources:
|
sources:
|
||||||
# These tend to be small, so you can load from s3 it's no big deal
|
# These tend to be small, so you can load from s3 it's no big deal
|
||||||
- name: openai_batch_data_v5_1_eval
|
- name: openai_batch_data_v5_1_eval
|
||||||
query_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_eval/*.jsonl
|
|
||||||
response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json
|
response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json
|
||||||
|
target_longest_image_dim: 1024
|
||||||
|
target_anchor_text_len: 6000
|
||||||
- name: openai_batch_data_v5_1_iabooks_eval
|
- name: openai_batch_data_v5_1_iabooks_eval
|
||||||
query_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_eval/*.jsonl
|
|
||||||
response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_iabooks_eval/*.json
|
response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_iabooks_eval/*.json
|
||||||
|
target_longest_image_dim: 1024
|
||||||
|
target_anchor_text_len: 6000
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -55,7 +47,7 @@ hparams:
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
clip_grad_norm: 1.0
|
clip_grad_norm: 1.0
|
||||||
learning_rate: 1e-5
|
learning_rate: 1e-5
|
||||||
max_steps: 9000
|
max_steps: 10000
|
||||||
pad_multiple_of: 16
|
pad_multiple_of: 16
|
||||||
log_every_steps: 10
|
log_every_steps: 10
|
||||||
eval_every_steps: 100
|
eval_every_steps: 100
|
||||||
|
|||||||
@ -116,7 +116,8 @@ def _cache_s3_file(s3_path: str, local_cache_dir: str):
|
|||||||
)
|
)
|
||||||
s3_client.download_file(bucket, key, local_file_path)
|
s3_client.download_file(bucket, key, local_file_path)
|
||||||
else:
|
else:
|
||||||
logger.info(f"File {local_file_path} already exists, skipping download.")
|
pass
|
||||||
|
#logger.info(f"File {local_file_path} already exists, skipping download.")
|
||||||
|
|
||||||
return local_file_path
|
return local_file_path
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user