diff --git a/pdelfin/train/config/qwen2vl-7b-lora.yaml b/pdelfin/train/config/qwen2vl-7b-lora.yaml
index 87a24b7..38d3a09 100644
--- a/pdelfin/train/config/qwen2vl-7b-lora.yaml
+++ b/pdelfin/train/config/qwen2vl-7b-lora.yaml
@@ -7,43 +7,33 @@ wandb:
   project: pdelfin
   entity: ai2-llm
 
-# TODO This is not used
-format:
-  instruction_template: "Original:"
-  response_template: "Rewritten:"
-  # Template from here: https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.py#L30
-  chat_template: |
-    {% for message in messages %}
-      {{'<|im_start|>' + message['role'] + '\n' + message['content']}}
-      {% if loop.last %}
-        {{ '<|im_end|>'}}
-      {% else %}
-        {{ '<|im_end|>\n' }}
-      {% endif %}
-    {% endfor %}
-
 generate:
-  max_length: 4096
+  max_length: 8192
 
 train_data:
   seed: 1337
   sources:
-    # These tend to be really big, so it's only practical to host them as parquets on weka, otherwise you may OOM or just never finish dataloading
     - name: openai_batch_data_v5_1_train
-      parquet_path: /data/jakep/pdfdata/openai_batch_data_v5_1_parquet/*.parquet
-    - name: openai_batch_data_v5_1_train
-      parquet_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_parquet/*.parquet
+      response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_train_done/*.json
+      target_longest_image_dim: 1024
+      target_anchor_text_len: 6000
+    - name: openai_batch_data_v5_1_iabooks_train
+      response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_train_done/*.json
+      target_longest_image_dim: 1024
+      target_anchor_text_len: 6000
 
 valid_data:
   metric_for_best_model: openai_batch_data_v5_1_eval_loss
   sources:
     # These tend to be small, so you can load from s3 it's no big deal
     - name: openai_batch_data_v5_1_eval
-      query_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_eval/*.jsonl
       response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json
+      target_longest_image_dim: 1024
+      target_anchor_text_len: 6000
     - name: openai_batch_data_v5_1_iabooks_eval
-      query_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_eval/*.jsonl
       response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_iabooks_eval/*.json
+      target_longest_image_dim: 1024
+      target_anchor_text_len: 6000
diff --git a/pdelfin/train/core/config.py b/pdelfin/train/core/config.py
index 6e677fc..c3389ae 100644
--- a/pdelfin/train/core/config.py
+++ b/pdelfin/train/core/config.py
@@ -22,28 +22,6 @@ class ModelConfig:
     model_revision: Optional[str] = field(help="The model revision to use for the model.", default=None)
 
 
-@dataclass
-class FormatConfig:
-    """Configuration for formatting the text that is input to the model."""
-
-    new_line_symbol: str = field(
-        help="The symbol to use for new lines in the text; default is '\\n'.",
-        default="\n",
-    )
-    system_message: Optional[str] = field(
-        help="The system message to use for formatting the text; default is no system message.",
-        default=None,
-    )
-    instruction_template: str = field(
-        help="The template to use for formatting the input text", default="Original:"
-    )
-    response_template: str = field(help="The template to use for formatting the output text", default="Rewrite:")
-    chat_template: Optional[str] = field(
-        help="The template to use for formatting the chat text. If None, the default chat template will be used.",
-        default=None,
-    )
-
-
 @dataclass
 class GenerateConfig:
     max_length: int = field(help="The maximum length of the generated text", default=4096)
@@ -75,9 +53,9 @@ class AwsConfig:
 @dataclass
 class SourceConfig:
     name: str = field(help="The name of the source")
-    parquet_path: Optional[str] = field(help="The s3/glob path to a bunch of parquet files for a preprocessed dataset.", default=None)
-    query_glob_path: Optional[str] = field(help="The s3 bucket pointing to the inputs sent to OpenAI to generate the silver data", default=None)
-    response_glob_path: Optional[str] = field(help="The s3 bucket pointing to the batch api response json's sent back from open ai", default=None)
+    response_glob_path: str = field(help="The s3 bucket pointing to the batch api response json's sent back from open ai")
+    target_longest_image_dim: int = field(help="Dimensions to render the pdf page image to")
+    target_anchor_text_len: int = field(help="Maximum amount of anchor text (aka prompt hint)")
 
 
 @dataclass
@@ -141,7 +119,6 @@ class TrainConfig:
     lora: Optional[LoraConfig] = field(default=None, help="The LoRA configuration")
     aws: AwsConfig = field(default=AwsConfig(), help="Configuration for AWS S3")
     wandb: WandbConfig = field(default=WandbConfig(), help="Configuration for Weights and Biases")
-    format: FormatConfig = field(default=FormatConfig(), help="Configuration for formatting the input/output text")
     train_data: DataConfig = field(default=DataConfig(), help="Configuration for the training data")
     valid_data: DataConfig = field(default=DataConfig(), help="Configuration for the validation data")
     generate: GenerateConfig = field(default=GenerateConfig(), help="Configuration for text generation")
@@ -158,5 +135,4 @@ class DemoConfig:
     share: bool = field(default=False, help="Share the demo publicly.")
 
     model: ModelConfig = field(default=ModelConfig())
-    format: FormatConfig = field(default=FormatConfig())
     generate: GenerateConfig = field(default=GenerateConfig())
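
For context, here is a minimal sketch of how one of the new `train_data` sources maps onto the reworked `SourceConfig`. It is purely illustrative and not part of the change; it assumes `SourceConfig` is instantiated like an ordinary dataclass when the YAML above is parsed.

```python
# Hypothetical illustration only -- mirrors one entry of train_data.sources above.
from pdelfin.train.core.config import SourceConfig

source = SourceConfig(
    name="openai_batch_data_v5_1_train",
    # Glob over the OpenAI batch-API response JSONs (the silver labels).
    response_glob_path="/data/jakep/pdfdata/openai_batch_data_v5_1_train_done/*.json",
    # Longest side, in pixels, at which to render each PDF page image.
    target_longest_image_dim=1024,
    # Maximum length of the anchor text (the prompt hint) per page.
    target_anchor_text_len=6000,
)
```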