# Summary (preamble text; ignored by patch tools, which start at the first "diff --git" header):
#   * pdelfin/train/config/qwen2vl-7b.yaml: both train sources change
#     target_anchor_text_len from the scalar 6000 to a 10-element list
#     whose first entry is 0 and whose remaining nine entries are 6000.
#   * pdelfin/train/core/config.py: imports Union and widens
#     SourceConfig.target_longest_image_dim and
#     SourceConfig.target_anchor_text_len from int to Union[int, list[int]].
# NOTE(review): presumably the consumer picks one list entry per example, so
#   roughly 1 in 10 examples would get no anchor text -- the consuming code is
#   not part of this patch; confirm.
# NOTE(review): the new annotations use builtin list[int] while this module
#   imports typing.List; the help= strings still describe a single value.
# NOTE(review): line breaks, blank context lines and indentation below were
#   reconstructed from the hunk counts; verify against the repository before
#   applying.
diff --git a/pdelfin/train/config/qwen2vl-7b.yaml b/pdelfin/train/config/qwen2vl-7b.yaml
index f4f896d..6ffe969 100644
--- a/pdelfin/train/config/qwen2vl-7b.yaml
+++ b/pdelfin/train/config/qwen2vl-7b.yaml
@@ -17,11 +17,11 @@ train_data:
     - name: openai_batch_data_v5_1_train
       response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_train_done/*.json
       target_longest_image_dim: 1024
-      target_anchor_text_len: 6000
+      target_anchor_text_len: [0, 6000, 6000, 6000, 6000, 6000, 6000, 6000, 6000, 6000]
     - name: openai_batch_data_v5_1_iabooks_train
       response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_train_done/*.json
       target_longest_image_dim: 1024
-      target_anchor_text_len: 6000
+      target_anchor_text_len: [0, 6000, 6000, 6000, 6000, 6000, 6000, 6000, 6000, 6000]
 
 valid_data:
   cache_location: /data/jakep/pdfdata/pdelfin_cache
diff --git a/pdelfin/train/core/config.py b/pdelfin/train/core/config.py
index f6b8896..b854bbf 100644
--- a/pdelfin/train/core/config.py
+++ b/pdelfin/train/core/config.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Optional
+from typing import List, Optional, Union
 
 from peft import TaskType  # pyright: ignore
 
@@ -54,8 +54,8 @@ class AwsConfig:
 class SourceConfig:
     name: str = field(help="The name of the source")
     response_glob_path: str = field(help="The s3 bucket pointing to the batch api response json's sent back from open ai")
-    target_longest_image_dim: int = field(help="Dimensions to render the pdf page image to")
-    target_anchor_text_len: int = field(help="Maximum amount of anchor text (aka prompt hint)")
+    target_longest_image_dim: Union[int, list[int]] = field(help="Dimensions to render the pdf page image to")
+    target_anchor_text_len: Union[int, list[int]] = field(help="Maximum amount of anchor text (aka prompt hint)")
 
 
 @dataclass