mirror of
https://github.com/allenai/olmocr.git
synced 2025-12-28 07:34:13 +00:00
More docs
This commit is contained in:
parent
73bd961135
commit
93d70683d4
@ -46,6 +46,15 @@ You should expect somewhere between 1,400 to 1,800 tokens per second per H100 GP
|
||||
python -m pdelfin.birrpipeline [s3_workspace_path] --add_pdfs [s3_glob_path or path to file with s3 paths (one per line)]
|
||||
```
|
||||
|
||||
For example:
|
||||
```bash
|
||||
python -m pdelfin.birrpipeline s3://ai2-oe-data/[your username]/pdfworkspaces/[workspacename] --pdf_profile s2 --add_pdfs s3://ai2-oe-data/jakep/gnarly_pdfs/*.pdf
|
||||
```
|
||||
|
||||
After this runs for the first time, you should have a set of JSON files generated in `s3://ai2-oe-data/[your username]/pdfworkspaces/[workspacename]/round_0/`.
|
||||
|
||||
Now you need to run them using birr.
|
||||
|
||||
|
||||
|
||||
### TODOs for future versions
|
||||
|
||||
41
scripts/birr/config/qwen2-vl-7b-pdf-weka.yaml
Normal file
41
scripts/birr/config/qwen2-vl-7b-pdf-weka.yaml
Normal file
@ -0,0 +1,41 @@
|
||||
model:
|
||||
# full fine tune
|
||||
name_or_path: weka://oe-data-default/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/best_bf16/
|
||||
#name_or_path: s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/checkpoint-9500/bf16/
|
||||
vlm: true
|
||||
|
||||
# necessary to prevent random crashes, until vllm fixes some bugs
|
||||
num_scheduler_steps: 1
|
||||
|
||||
format:
|
||||
add_generation_prompt: true
|
||||
|
||||
generate:
|
||||
# The model's max context length is 8096, but around 1500 tokens are reserved for the image itself
|
||||
max_context_length: 6500
|
||||
temperature: 0.8
|
||||
top_p: 1.0
|
||||
drop_long_outputs: false
|
||||
|
||||
|
||||
pipeline:
|
||||
sqs_queue_name: jake-pdf
|
||||
num_workers: 3
|
||||
generation_batch_size: 256
|
||||
tokenization_batch_size: 64
|
||||
output_serializer: default
|
||||
target_bucket: ai2-oe-data
|
||||
target_object_prefix: '[your username]/pdfworkspaces/s2orc_3200k_v2/inference_outputs'
|
||||
allowed_restarts_per_predictor: 10
|
||||
|
||||
task:
|
||||
budget: ai2/oe-data
|
||||
workspace: ai2/oe-data-model-based-cleanup
|
||||
name: qwen2vl-schedsteps-bg
|
||||
replicas: 128
|
||||
priority: LOW
|
||||
gpu_count: 1
|
||||
cluster:
|
||||
- ai2/jupiter-cirrascale-2
|
||||
- ai2/saturn-cirrascale
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user