mirror of
https://github.com/allenai/olmocr.git
synced 2025-12-28 23:59:08 +00:00
Silver dataset adjustments
This commit is contained in:
parent
5535e3ab2e
commit
9ae26472d3
@@ -140,7 +140,7 @@ def main():
|
||||
parser.add_argument("--glob_path", type=str, help="Local or S3 path glob (e.g., *.pdf or s3://bucket/pdfs/*.pdf).")
|
||||
parser.add_argument("--path_list", type=str, help="Path to a file containing paths to PDFs, one per line.")
|
||||
parser.add_argument("--num_sample_docs", type=int, default=5000, help="Number of PDF documents to sample.")
|
||||
-parser.add_argument("--first_n_pages", type=int, default=5, help="Always sample the first N pages of each PDF.")
|
||||
+parser.add_argument("--first_n_pages", type=int, default=0, help="Always sample the first N pages of each PDF.")
|
||||
parser.add_argument("--max_sample_pages", type=int, default=15, help="Max number of pages to sample per PDF.")
|
||||
parser.add_argument("--output", type=str, default="openai_batch_data", help="Output destination")
|
||||
args = parser.parse_args()
|
||||
Loading…
x
Reference in New Issue
Block a user