Mirror of https://github.com/allenai/olmocr.git, synced 2025-11-12 16:39:40 +00:00

Commit d4f64ed82a ("Config work"), parent 3c1b7de293.

The diff touches two files: the training YAML config and the Python config dataclasses. It deletes the unused format:/FormatConfig machinery (hand-rolled instruction, response, and chat templates), switches the training sources from preprocessed parquet files on weka to globs of OpenAI batch-API response JSONs, adds per-source target_longest_image_dim and target_anchor_text_len settings, and raises generate.max_length from 4096 to 8192.
@@ -7,43 +7,33 @@ wandb:
   project: pdelfin
   entity: ai2-llm
 
-# TODO This is not used
-format:
-  instruction_template: "Original:"
-  response_template: "Rewritten:"
-  # Template from here: https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.py#L30
-  chat_template: |
-    {% for message in messages %}
-    {{'<|im_start|>' + message['role'] + '\n' + message['content']}}
-    {% if loop.last %}
-    {{ '<|im_end|>'}}
-    {% else %}
-    {{ '<|im_end|>\n' }}
-    {% endif %}
-    {% endfor %}
-
 generate:
-  max_length: 4096
+  max_length: 8192
 
 train_data:
   seed: 1337
   sources:
-    # These tend to be really big, so it's only practical to host them as parquets on weka, otherwise you may OOM or just never finish dataloading
     - name: openai_batch_data_v5_1_train
-      parquet_path: /data/jakep/pdfdata/openai_batch_data_v5_1_parquet/*.parquet
-    - name: openai_batch_data_v5_1_train
-      parquet_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_parquet/*.parquet
+      response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_train_done/*.json
+      target_longest_image_dim: 1024
+      target_anchor_text_len: 6000
+    - name: openai_batch_data_v5_1_iabooks_train
+      response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_train_done/*.json
+      target_longest_image_dim: 1024
+      target_anchor_text_len: 6000
 
 valid_data:
   metric_for_best_model: openai_batch_data_v5_1_eval_loss
   sources:
     # These tend to be small, so you can load from s3 it's no big deal
     - name: openai_batch_data_v5_1_eval
-      query_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_eval/*.jsonl
       response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json
+      target_longest_image_dim: 1024
+      target_anchor_text_len: 6000
     - name: openai_batch_data_v5_1_iabooks_eval
-      query_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_eval/*.jsonl
       response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_iabooks_eval/*.json
+      target_longest_image_dim: 1024
+      target_anchor_text_len: 6000
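Under the new schema each source is just a name, a glob of batch-API response JSONs, and two preprocessing limits. A minimal sketch of how such an entry might be consumed, for local paths only; the `Source` dataclass mirrors the YAML fields, but `load_source` and the one-JSON-document-per-file assumption are illustrative, not the repo's actual loader (the s3 globs in valid_data would need fsspec/boto3 rather than glob):

```python
# Sketch: expand a source's response_glob_path and read the response files.
import glob
import json
from dataclasses import dataclass

@dataclass
class Source:
    name: str
    response_glob_path: str        # glob of OpenAI batch-API response files
    target_longest_image_dim: int  # longest side, in px, when rendering the PDF page
    target_anchor_text_len: int    # character cap on anchor text (the "prompt hint")

def load_source(src: Source) -> list[dict]:
    """Read every response file matched by the glob (local paths only)."""
    records = []
    for path in sorted(glob.glob(src.response_glob_path)):
        with open(path) as f:
            records.append(json.load(f))
    return records

train = Source(
    name="openai_batch_data_v5_1_train",
    response_glob_path="/data/jakep/pdfdata/openai_batch_data_v5_1_train_done/*.json",
    target_longest_image_dim=1024,
    target_anchor_text_len=6000,
)
```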
@@ -22,28 +22,6 @@ class ModelConfig:
     model_revision: Optional[str] = field(help="The model revision to use for the model.", default=None)
 
 
-@dataclass
-class FormatConfig:
-    """Configuration for formatting the text that is input to the model."""
-
-    new_line_symbol: str = field(
-        help="The symbol to use for new lines in the text; default is '\\n'.",
-        default="\n",
-    )
-    system_message: Optional[str] = field(
-        help="The system message to use for formatting the text; default is no system message.",
-        default=None,
-    )
-    instruction_template: str = field(
-        help="The template to use for formatting the input text", default="Original:"
-    )
-    response_template: str = field(help="The template to use for formatting the output text", default="Rewrite:")
-    chat_template: Optional[str] = field(
-        help="The template to use for formatting the chat text. If None, the default chat template will be used.",
-        default=None,
-    )
-
-
 @dataclass
 class GenerateConfig:
     max_length: int = field(help="The maximum length of the generated text", default=4096)
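Dropping FormatConfig removes the hand-written Qwen-style Jinja template from the config; presumably formatting falls back to the chat template already bundled with the tokenizer. A sketch of that pattern with Hugging Face transformers; the model id is illustrative and this call site is not part of the diff:

```python
# Sketch: use the tokenizer's bundled chat template in place of the deleted
# FormatConfig.chat_template. Model id is an assumption, not from this commit.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")

messages = [{"role": "user", "content": "Original: some page text to rewrite"}]

# Produces the same <|im_start|>role\ncontent<|im_end|> framing that the
# removed Jinja template spelled out by hand.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
```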
@@ -75,9 +53,9 @@ class AwsConfig:
 @dataclass
 class SourceConfig:
     name: str = field(help="The name of the source")
-    parquet_path: Optional[str] = field(help="The s3/glob path to a bunch of parquet files for a preprocessed dataset.", default=None)
-    query_glob_path: Optional[str] = field(help="The s3 bucket pointing to the inputs sent to OpenAI to generate the silver data", default=None)
-    response_glob_path: Optional[str] = field(help="The s3 bucket pointing to the batch api response json's sent back from open ai", default=None)
+    response_glob_path: str = field(help="The s3 bucket pointing to the batch api response json's sent back from open ai")
+    target_longest_image_dim: int = field(help="Dimensions to render the pdf page image to")
+    target_anchor_text_len: int = field(help="Maximum amount of anchor text (aka prompt hint)")
 
 
 @dataclass
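Note that the `field(help=..., default=...)` seen here is a repo-specific wrapper, not `dataclasses.field`. As a rough stand-in, a stdlib-plus-PyYAML sketch of hydrating the YAML `sources:` list into SourceConfig-shaped objects; the filtering helper is an assumption, not the repo's loader:

```python
# Sketch: hydrate YAML `sources:` entries into SourceConfig-like objects.
from dataclasses import dataclass, fields
import yaml

@dataclass
class SourceConfig:
    name: str
    response_glob_path: str
    target_longest_image_dim: int
    target_anchor_text_len: int

CONFIG = """
train_data:
  seed: 1337
  sources:
    - name: openai_batch_data_v5_1_train
      response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_train_done/*.json
      target_longest_image_dim: 1024
      target_anchor_text_len: 6000
"""

raw = yaml.safe_load(CONFIG)
allowed = {f.name for f in fields(SourceConfig)}
# Keep only keys the dataclass declares, then construct it.
sources = [
    SourceConfig(**{k: v for k, v in entry.items() if k in allowed})
    for entry in raw["train_data"]["sources"]
]
print(sources[0].response_glob_path)
```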
@@ -141,7 +119,6 @@ class TrainConfig:
     lora: Optional[LoraConfig] = field(default=None, help="The LoRA configuration")
     aws: AwsConfig = field(default=AwsConfig(), help="Configuration for AWS S3")
     wandb: WandbConfig = field(default=WandbConfig(), help="Configuration for Weights and Biases")
-    format: FormatConfig = field(default=FormatConfig(), help="Configuration for formatting the input/output text")
     train_data: DataConfig = field(default=DataConfig(), help="Configuration for the training data")
     valid_data: DataConfig = field(default=DataConfig(), help="Configuration for the validation data")
     generate: GenerateConfig = field(default=GenerateConfig(), help="Configuration for text generation")
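One detail worth flagging: with stdlib dataclasses, `field(default=AwsConfig())` is rejected as a mutable default on Python 3.11+ (and silently shares one instance across objects on older versions), so the repo's custom `field` helper must handle instance defaults itself. The plain-dataclass idiom is `default_factory`; a minimal sketch, with an invented `profile` field for demonstration:

```python
# Sketch: stdlib-dataclass equivalent of `field(default=AwsConfig(), ...)`.
from dataclasses import dataclass, field

@dataclass
class AwsConfig:
    profile: str = "default"  # illustrative field; the real AwsConfig is not shown here

@dataclass
class TrainConfig:
    # One fresh AwsConfig per TrainConfig, instead of a single shared instance.
    aws: AwsConfig = field(default_factory=AwsConfig)

cfg = TrainConfig()
print(cfg.aws.profile)  # -> "default"
```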
@@ -158,5 +135,4 @@ class DemoConfig:
     share: bool = field(default=False, help="Share the demo publicly.")
 
     model: ModelConfig = field(default=ModelConfig())
-    format: FormatConfig = field(default=FormatConfig())
     generate: GenerateConfig = field(default=GenerateConfig())