mirror of
https://github.com/allenai/olmocr.git
synced 2025-12-12 15:51:26 +00:00
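Allow sampling of the anchor text length and other source params (target_anchor_text_len and target_longest_image_dim may now be a list of values instead of a single int)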
This commit is contained in:
parent
999f64dd46
commit
6a22900b8a
@ -17,11 +17,11 @@ train_data:
|
||||
- name: openai_batch_data_v5_1_train
|
||||
response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_train_done/*.json
|
||||
target_longest_image_dim: 1024
|
||||
target_anchor_text_len: 6000
|
||||
target_anchor_text_len: [0, 6000, 6000, 6000, 6000, 6000, 6000, 6000, 6000, 6000]
|
||||
- name: openai_batch_data_v5_1_iabooks_train
|
||||
response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_train_done/*.json
|
||||
target_longest_image_dim: 1024
|
||||
target_anchor_text_len: 6000
|
||||
target_anchor_text_len: [0, 6000, 6000, 6000, 6000, 6000, 6000, 6000, 6000, 6000]
|
||||
|
||||
valid_data:
|
||||
cache_location: /data/jakep/pdfdata/pdelfin_cache
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from peft import TaskType # pyright: ignore
|
||||
|
||||
@ -54,8 +54,8 @@ class AwsConfig:
|
||||
class SourceConfig:
|
||||
name: str = field(help="The name of the source")
|
||||
response_glob_path: str = field(help="The s3 bucket pointing to the batch api response json's sent back from open ai")
|
||||
target_longest_image_dim: int = field(help="Dimensions to render the pdf page image to")
|
||||
target_anchor_text_len: int = field(help="Maximum amount of anchor text (aka prompt hint)")
|
||||
target_longest_image_dim: Union[int, list[int]] = field(help="Dimensions to render the pdf page image to")
|
||||
target_anchor_text_len: Union[int, list[int]] = field(help="Maximum amount of anchor text (aka prompt hint)")
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user