Giving more memory buffer

Jake Poznanski 2025-09-03 19:56:53 +00:00
parent 7346d12322
commit b689a8e5f8
3 changed files with 14 additions and 8 deletions

View File

@@ -9,7 +9,7 @@ import os
import random
import sys
from pathlib import Path
from typing import List, Tuple, Any, Dict
from typing import List, Tuple, Any, Dict, Optional
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -25,6 +25,7 @@ class CleanedDocument(BaseModel):
cleaned_text: str = Field(description="The cleaned and corrected version of the OCR transcription")
confidence_score: float = Field(description="Confidence score from 0 to 1 indicating how confident the model is in the cleaning", ge=0.0, le=1.0)
corrections_made: List[str] = Field(description="List of major corrections or improvements made to the text")
is_page_all_blank: bool = Field(description="Document consists entirely of a blank page, or contains only headers/footers that would otherwise be removed")
@dataclass
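
The new is_page_all_blank flag lets the model report empty pages explicitly instead of forcing cleaned_text to carry that signal. As a point of reference only, here is a minimal sketch of how such a Pydantic schema can be enforced with OpenAI structured outputs; the client setup, model name, and prompt are illustrative assumptions, not the repository's actual call.

from typing import List
from openai import OpenAI
from pydantic import BaseModel, Field

class CleanedDocument(BaseModel):
    # Condensed copy of the schema above, kept here so the example is self-contained.
    cleaned_text: str
    confidence_score: float = Field(ge=0.0, le=1.0)
    corrections_made: List[str] = Field(default_factory=list)
    is_page_all_blank: bool = False

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment
completion = client.beta.chat.completions.parse(
    model="gpt-4o-2024-08-06",  # placeholder model name
    messages=[{"role": "user", "content": "Clean this OCR transcription: ..."}],
    response_format=CleanedDocument,  # the API validates the reply against this schema
)
result = completion.choices[0].message.parsed
if result.is_page_all_blank:
    print("Model flagged a blank page; an empty output file will be written.")
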
@@ -156,8 +157,10 @@ def clean_document_with_chatgpt(
"3. Remove any original transcriber's marks and notes, usually indicated by [ and ] symbols.\n"
"4. Fix word breaks and line breaks\n"
"5. Ensure mathematical formulas and special characters are correct\n"
"6. Maintain the semantic structure of the document\n"
"7. Remove any headers or footers that are not semantically relevant to the main document contents, ex page numbers\n"
"6. If there are any figures or charts, label them with the following markdown syntax ![Alt text describing the contents of the figure](page_startx_starty_width_height.png)\n"
"7. Maintain the semantic structure of the document\n"
"8. Remove any headers or footers that are not semantically relevant to the main document contents, ex page numbers, document classifications, etc.\n"
"9. If the page is blank, you are allowed to return 'null' for the text.\n"
"Return a cleaned version that accurately represents the original document."
)
}
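
Instruction 6 asks the model to reference figures with a filename that encodes the crop box (page_startx_starty_width_height.png). A quick way to check that cleaned text follows this convention is a regex pass; the sketch below assumes the four fields are integer pixel values and that the leading "page" token is literal, neither of which is confirmed by this diff.

import re

# Hypothetical validator for references like ![Yield chart](page_120_340_800_600.png).
FIGURE_REF = re.compile(r"!\[[^\]]*\]\(page_\d+_\d+_\d+_\d+\.png\)")

def find_figure_refs(cleaned_text: str) -> list[str]:
    """Return every figure reference embedded in the cleaned markdown."""
    return FIGURE_REF.findall(cleaned_text)

print(find_figure_refs("Intro text. ![Yield chart](page_120_340_800_600.png) More text."))
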
@@ -236,7 +239,10 @@ def process_document(
output_path.parent.mkdir(parents=True, exist_ok=True)
# Write cleaned text
output_path.write_text(cleaned_result.cleaned_text, encoding='utf-8')
if cleaned_result.is_page_all_blank:
output_path.write_text("", encoding='utf-8')
else:
output_path.write_text(cleaned_result.cleaned_text, encoding='utf-8')
# Create soft link for the original MD file as .md.orig
orig_md_link_path = output_path.with_suffix('.md.orig')
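
The new branch writes an empty file when the model flags a blank page, so downstream steps still find an output for every input. The .md.orig soft link mentioned in the comment is created outside this hunk; a hedged sketch of the whole output step, with the helper name and symlink handling as assumptions, could look like this:

from pathlib import Path

def write_cleaned_output(output_path: Path, cleaned_text: str,
                         is_page_all_blank: bool, original_md_path: Path) -> None:
    # Hypothetical helper mirroring the diff above; the symlink details are
    # assumptions, since that code falls outside this hunk.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Blank pages still produce a (zero-byte) .md so every input has an output.
    output_path.write_text("" if is_page_all_blank else cleaned_text, encoding="utf-8")
    # Keep the original OCR markdown reachable next to the cleaned file.
    orig_md_link_path = output_path.with_suffix(".md.orig")
    orig_md_link_path.unlink(missing_ok=True)
    orig_md_link_path.symlink_to(original_md_path)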

View File

@@ -275,9 +275,9 @@ set -e
# Create output directory
mkdir -p {local_output_dir if local_output_dir else "/tmp/checkpoints"}
# Start VLLM server in background
# Start VLLM server in background (output goes to console)
echo 'Starting VLLM server on GPU {vllm_gpu} as background process...'
CUDA_VISIBLE_DEVICES={vllm_gpu} nohup trl vllm-serve --model {vllm_model_arg} --port 8000 --gpu-memory-utilization 0.9 > /tmp/vllm_server.log 2>&1 &
CUDA_VISIBLE_DEVICES={vllm_gpu} trl vllm-serve --model {vllm_model_arg} --port 8000 --gpu-memory-utilization 0.5 &
VLLM_PID=$!
echo "VLLM server started with PID: $VLLM_PID"

View File

@@ -238,9 +238,9 @@ set -e
# Setup commands
{" && ".join(setup_commands)}
# Start VLLM server in background
# Start VLLM server in background (output goes to console)
echo 'Starting VLLM server on GPU {vllm_gpu} as background process...'
CUDA_VISIBLE_DEVICES={vllm_gpu} nohup trl vllm-serve --model {vllm_model_arg} --port 8000 --gpu-memory-utilization 0.9 > /tmp/vllm_server.log 2>&1 &
CUDA_VISIBLE_DEVICES={vllm_gpu} trl vllm-serve --model {vllm_model_arg} --port 8000 --gpu-memory-utilization 0.5 &
VLLM_PID=$!
echo "VLLM server started with PID: $VLLM_PID"