Lowering size of default data load for testing

This commit is contained in:
Jake Poznanski 2025-06-28 23:09:44 +00:00
parent c36b5df2af
commit 12b5cc3101

View File

@ -27,7 +27,7 @@ model:
dataset: dataset:
train: train:
- root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/ - root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_train_iabooks/
pipeline: &basic_pipeline pipeline: &basic_pipeline
- name: FrontMatterParser - name: FrontMatterParser
front_matter_class: PageResponse front_matter_class: PageResponse
@ -41,8 +41,9 @@ dataset:
- name: Tokenizer - name: Tokenizer
masking_index: -100 masking_index: -100
end_of_message_token: "<|im_end|>" end_of_message_token: "<|im_end|>"
- root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_train_iabooks/ # Not putting in big bulk of data to speed up loading for debugging for now
pipeline: *basic_pipeline # - root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
# pipeline: *basic_pipeline
eval: eval:
- root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_eval_s2pdf/ - root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_eval_s2pdf/