Allow for sampling anchor and other params

This commit is contained in:
Jake Poznanski 2024-10-23 22:26:12 +00:00
parent 999f64dd46
commit 6a22900b8a
2 changed files with 5 additions and 5 deletions

View File

@@ -17,11 +17,11 @@ train_data:
- name: openai_batch_data_v5_1_train
response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_train_done/*.json
target_longest_image_dim: 1024
target_anchor_text_len: 6000
target_anchor_text_len: [0, 6000, 6000, 6000, 6000, 6000, 6000, 6000, 6000, 6000]
- name: openai_batch_data_v5_1_iabooks_train
response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_train_done/*.json
target_longest_image_dim: 1024
target_anchor_text_len: 6000
target_anchor_text_len: [0, 6000, 6000, 6000, 6000, 6000, 6000, 6000, 6000, 6000]
valid_data:
cache_location: /data/jakep/pdfdata/pdelfin_cache

View File

@@ -1,5 +1,5 @@
from dataclasses import dataclass
from typing import List, Optional
from typing import List, Optional, Union
from peft import TaskType # pyright: ignore
@@ -54,8 +54,8 @@ class AwsConfig:
class SourceConfig:
name: str = field(help="The name of the source")
response_glob_path: str = field(help="The s3 bucket pointing to the batch api response json's sent back from open ai")
target_longest_image_dim: int = field(help="Dimensions to render the pdf page image to")
target_anchor_text_len: int = field(help="Maximum amount of anchor text (aka prompt hint)")
target_longest_image_dim: Union[int, list[int]] = field(help="Dimensions to render the pdf page image to")
target_anchor_text_len: Union[int, list[int]] = field(help="Maximum amount of anchor text (aka prompt hint)")
@dataclass