mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-01 18:43:45 +00:00
Prepping for more training
This commit is contained in:
parent
063be21287
commit
5c36c22bf7
@ -7,43 +7,35 @@ wandb:
|
||||
project: pdelfin
|
||||
entity: ai2-llm
|
||||
|
||||
# TODO This is not used
|
||||
format:
|
||||
instruction_template: "Original:"
|
||||
response_template: "Rewritten:"
|
||||
# Template from here: https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.py#L30
|
||||
chat_template: |
|
||||
{% for message in messages %}
|
||||
{{'<|im_start|>' + message['role'] + '\n' + message['content']}}
|
||||
{% if loop.last %}
|
||||
{{ '<|im_end|>'}}
|
||||
{% else %}
|
||||
{{ '<|im_end|>\n' }}
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
|
||||
generate:
|
||||
max_length: 4096
|
||||
max_length: 8192
|
||||
|
||||
train_data:
|
||||
seed: 1337
|
||||
cache_location: /data/jakep/pdfdata/pdelfin_cache
|
||||
sources:
|
||||
# These tend to be really big, so it's only practical to host them as parquets on weka; otherwise you may OOM or just never finish dataloading
|
||||
- name: openai_batch_data_v5_1_train
|
||||
parquet_path: /data/jakep/pdfdata/openai_batch_data_v5_1_parquet/*.parquet
|
||||
- name: openai_batch_data_v5_1_train
|
||||
parquet_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_parquet/*.parquet
|
||||
response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_train_done/*.json
|
||||
target_longest_image_dim: 1024
|
||||
target_anchor_text_len: 6000
|
||||
- name: openai_batch_data_v5_1_iabooks_train
|
||||
response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_train_done/*.json
|
||||
target_longest_image_dim: 1024
|
||||
target_anchor_text_len: 6000
|
||||
|
||||
valid_data:
|
||||
cache_location: /data/jakep/pdfdata/pdelfin_cache
|
||||
metric_for_best_model: openai_batch_data_v5_1_eval_loss
|
||||
sources:
|
||||
# These tend to be small, so you can load them from s3; it's no big deal
|
||||
- name: openai_batch_data_v5_1_eval
|
||||
query_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_eval/*.jsonl
|
||||
response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json
|
||||
target_longest_image_dim: 1024
|
||||
target_anchor_text_len: 6000
|
||||
- name: openai_batch_data_v5_1_iabooks_eval
|
||||
query_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_eval/*.jsonl
|
||||
response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_iabooks_eval/*.json
|
||||
target_longest_image_dim: 1024
|
||||
target_anchor_text_len: 6000
|
||||
|
||||
|
||||
|
||||
@ -55,7 +47,7 @@ hparams:
|
||||
gradient_checkpointing: true
|
||||
clip_grad_norm: 1.0
|
||||
learning_rate: 1e-5
|
||||
max_steps: 9000
|
||||
max_steps: 10000
|
||||
pad_multiple_of: 16
|
||||
log_every_steps: 10
|
||||
eval_every_steps: 100
|
||||
|
||||
@ -116,7 +116,8 @@ def _cache_s3_file(s3_path: str, local_cache_dir: str):
|
||||
)
|
||||
s3_client.download_file(bucket, key, local_file_path)
|
||||
else:
|
||||
logger.info(f"File {local_file_path} already exists, skipping download.")
|
||||
pass
|
||||
#logger.info(f"File {local_file_path} already exists, skipping download.")
|
||||
|
||||
return local_file_path
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user