More docs

Jake Poznanski 2024-11-04 17:28:09 +00:00
parent 73bd961135
commit 93d70683d4
2 changed files with 50 additions and 0 deletions

@@ -46,6 +46,15 @@ You should expect somewhere between 1,400 and 1,800 tokens per second per H100 GPU
```bash
python -m pdelfin.birrpipeline [s3_workspace_path] --add_pdfs [s3_glob_path or path to file with s3 paths (one per line)]
```
For example:
```bash
python -m pdelfin.birrpipeline s3://ai2-oe-data/[your username]/pdfworkspaces/[workspacename] --pdf_profile s2 --add_pdfs s3://ai2-oe-data/jakep/gnarly_pdfs/*.pdf
```
After this runs for the first time, you should have a set of JSON work files generated in `s3://ai2-oe-data/[your username]/pdfworkspaces/[workspacename]/round_0/`
Next, you need to run inference over them using birr.
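To sanity-check that the work files landed in the workspace before kicking off birr, you can list the round directory with the standard AWS CLI (path follows the example workspace above):
```bash
aws s3 ls s3://ai2-oe-data/[your username]/pdfworkspaces/[workspacename]/round_0/
```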
### TODOs for future versions

@@ -0,0 +1,41 @@
model:
  # Full fine-tune checkpoint
  name_or_path: weka://oe-data-default/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/best_bf16/
  #name_or_path: s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/checkpoint-9500/bf16/
  vlm: true
  # Necessary to prevent random crashes until vLLM fixes some scheduler bugs
  num_scheduler_steps: 1

format:
  add_generation_prompt: true

generate:
  # The model's max context length is 8096, but around 1,500 of those tokens
  # are reserved for the image itself (8096 - ~1500 ≈ 6500 left for text)
  max_context_length: 6500
  temperature: 0.8
  top_p: 1.0
  drop_long_outputs: false

pipeline:
  sqs_queue_name: jake-pdf
  num_workers: 3
  generation_batch_size: 256
  tokenization_batch_size: 64
  output_serializer: default
  # Inference outputs are written under s3://<target_bucket>/<target_object_prefix>/
  target_bucket: ai2-oe-data
  target_object_prefix: [your username]/pdfworkspaces/s2orc_3200k_v2/inference_outputs
  allowed_restarts_per_predictor: 10

task:
  budget: ai2/oe-data
  workspace: ai2/oe-data-model-based-cleanup
  name: qwen2vl-schedsteps-bg
  replicas: 128
  priority: LOW
  gpu_count: 1
  cluster:
    - ai2/jupiter-cirrascale-2
    - ai2/saturn-cirrascale
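Once the birr job completes, the inference outputs should land under the bucket and prefix configured above; one way to confirm, assuming the standard AWS CLI is available (the path is assembled from `target_bucket` and `target_object_prefix` in this config):
```bash
aws s3 ls s3://ai2-oe-data/[your username]/pdfworkspaces/s2orc_3200k_v2/inference_outputs/ --recursive --summarize
```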