mirror of
https://github.com/allenai/olmocr.git
synced 2025-12-13 08:11:22 +00:00
Allow for sampling anchor and other params
This commit is contained in:
parent
999f64dd46
commit
6a22900b8a
@ -17,11 +17,11 @@ train_data:
|
|||||||
- name: openai_batch_data_v5_1_train
|
- name: openai_batch_data_v5_1_train
|
||||||
response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_train_done/*.json
|
response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_train_done/*.json
|
||||||
target_longest_image_dim: 1024
|
target_longest_image_dim: 1024
|
||||||
target_anchor_text_len: 6000
|
target_anchor_text_len: [0, 6000, 6000, 6000, 6000, 6000, 6000, 6000, 6000, 6000]
|
||||||
- name: openai_batch_data_v5_1_iabooks_train
|
- name: openai_batch_data_v5_1_iabooks_train
|
||||||
response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_train_done/*.json
|
response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_train_done/*.json
|
||||||
target_longest_image_dim: 1024
|
target_longest_image_dim: 1024
|
||||||
target_anchor_text_len: 6000
|
target_anchor_text_len: [0, 6000, 6000, 6000, 6000, 6000, 6000, 6000, 6000, 6000]
|
||||||
|
|
||||||
valid_data:
|
valid_data:
|
||||||
cache_location: /data/jakep/pdfdata/pdelfin_cache
|
cache_location: /data/jakep/pdfdata/pdelfin_cache
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import List, Optional
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
from peft import TaskType # pyright: ignore
|
from peft import TaskType # pyright: ignore
|
||||||
|
|
||||||
@ -54,8 +54,8 @@ class AwsConfig:
|
|||||||
class SourceConfig:
|
class SourceConfig:
|
||||||
name: str = field(help="The name of the source")
|
name: str = field(help="The name of the source")
|
||||||
response_glob_path: str = field(help="The s3 bucket pointing to the batch api response json's sent back from open ai")
|
response_glob_path: str = field(help="The s3 bucket pointing to the batch api response json's sent back from open ai")
|
||||||
target_longest_image_dim: int = field(help="Dimensions to render the pdf page image to")
|
target_longest_image_dim: Union[int, list[int]] = field(help="Dimensions to render the pdf page image to")
|
||||||
target_anchor_text_len: int = field(help="Maximum amount of anchor text (aka prompt hint)")
|
target_anchor_text_len: Union[int, list[int]] = field(help="Maximum amount of anchor text (aka prompt hint)")
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user