mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-12 08:43:32 +00:00
Lowering the size of the default data load for testing
This commit is contained in:
parent
c36b5df2af
commit
12b5cc3101
@ -27,7 +27,7 @@ model:
|
|||||||
dataset:
|
dataset:
|
||||||
|
|
||||||
train:
|
train:
|
||||||
- root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
|
- root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_train_iabooks/
|
||||||
pipeline: &basic_pipeline
|
pipeline: &basic_pipeline
|
||||||
- name: FrontMatterParser
|
- name: FrontMatterParser
|
||||||
front_matter_class: PageResponse
|
front_matter_class: PageResponse
|
||||||
@ -41,8 +41,9 @@ dataset:
|
|||||||
- name: Tokenizer
|
- name: Tokenizer
|
||||||
masking_index: -100
|
masking_index: -100
|
||||||
end_of_message_token: "<|im_end|>"
|
end_of_message_token: "<|im_end|>"
|
||||||
- root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_train_iabooks/
|
# Not putting in big bulk of data to speed up loading for debugging for now
|
||||||
pipeline: *basic_pipeline
|
# - root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
|
||||||
|
# pipeline: *basic_pipeline
|
||||||
|
|
||||||
eval:
|
eval:
|
||||||
- root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_eval_s2pdf/
|
- root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_eval_s2pdf/
|
||||||
|
Loading…
x
Reference in New Issue
Block a user