Giving more memory buffer

This commit is contained in:
Jake Poznanski 2025-09-03 19:56:53 +00:00
parent 7346d12322
commit b689a8e5f8
3 changed files with 14 additions and 8 deletions

View File

@ -9,7 +9,7 @@ import os
import random import random
import sys import sys
from pathlib import Path from pathlib import Path
from typing import List, Tuple, Any, Dict from typing import List, Tuple, Any, Dict, Optional
from dataclasses import dataclass from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
@ -25,6 +25,7 @@ class CleanedDocument(BaseModel):
cleaned_text: str = Field(description="The cleaned and corrected version of the OCR transcription") cleaned_text: str = Field(description="The cleaned and corrected version of the OCR transcription")
confidence_score: float = Field(description="Confidence score from 0 to 1 indicating how confident the model is in the cleaning", ge=0.0, le=1.0) confidence_score: float = Field(description="Confidence score from 0 to 1 indicating how confident the model is in the cleaning", ge=0.0, le=1.0)
corrections_made: List[str] = Field(description="List of major corrections or improvements made to the text") corrections_made: List[str] = Field(description="List of major corrections or improvements made to the text")
# Flag set by the model for blank pages; when true, process_document writes an empty output file instead of cleaned_text.
is_page_all_blank: bool = Field(description="True if the document consists entirely of a blank page, or contains only headers/footers that would otherwise be removed")
@dataclass @dataclass
@ -156,8 +157,10 @@ def clean_document_with_chatgpt(
"3. Remove any original transcriber's marks and notes, usually indicated by [ and ] symbols.\n" "3. Remove any original transcriber's marks and notes, usually indicated by [ and ] symbols.\n"
"4. Fix word breaks and line breaks\n" "4. Fix word breaks and line breaks\n"
"5. Ensure mathematical formulas and special characters are correct\n" "5. Ensure mathematical formulas and special characters are correct\n"
"6. Maintain the semantic structure of the document\n" "6. If there are any figures or charts, label them with the following markdown syntax ![Alt text describing the contents of the figure](page_startx_starty_width_height.png)\n"
"7. Remove any headers or footers that are not semantically relevant to the main document contents, ex page numbers\n" "7. Maintain the semantic structure of the document\n"
"8. Remove any headers or footers that are not semantically relevant to the main document contents, ex page numbers, document classifications, etc.\n"
"9. If the page is blank, you are allowed to return 'null' for the text.\n"
"Return a cleaned version that accurately represents the original document." "Return a cleaned version that accurately represents the original document."
) )
} }
@ -236,6 +239,9 @@ def process_document(
output_path.parent.mkdir(parents=True, exist_ok=True) output_path.parent.mkdir(parents=True, exist_ok=True)
# Write cleaned text # Write cleaned text
if cleaned_result.is_page_all_blank:
output_path.write_text("", encoding='utf-8')
else:
output_path.write_text(cleaned_result.cleaned_text, encoding='utf-8') output_path.write_text(cleaned_result.cleaned_text, encoding='utf-8')
# Create soft link for the original MD file as .md.orig # Create soft link for the original MD file as .md.orig

View File

@ -275,9 +275,9 @@ set -e
# Create output directory # Create output directory
mkdir -p {local_output_dir if local_output_dir else "/tmp/checkpoints"} mkdir -p {local_output_dir if local_output_dir else "/tmp/checkpoints"}
# Start VLLM server in background # Start VLLM server in background (output goes to console)
echo 'Starting VLLM server on GPU {vllm_gpu} as background process...' echo 'Starting VLLM server on GPU {vllm_gpu} as background process...'
CUDA_VISIBLE_DEVICES={vllm_gpu} nohup trl vllm-serve --model {vllm_model_arg} --port 8000 --gpu-memory-utilization 0.9 > /tmp/vllm_server.log 2>&1 & CUDA_VISIBLE_DEVICES={vllm_gpu} trl vllm-serve --model {vllm_model_arg} --port 8000 --gpu-memory-utilization 0.5 &
VLLM_PID=$! VLLM_PID=$!
echo "VLLM server started with PID: $VLLM_PID" echo "VLLM server started with PID: $VLLM_PID"

View File

@ -238,9 +238,9 @@ set -e
# Setup commands # Setup commands
{" && ".join(setup_commands)} {" && ".join(setup_commands)}
# Start VLLM server in background # Start VLLM server in background (output goes to console)
echo 'Starting VLLM server on GPU {vllm_gpu} as background process...' echo 'Starting VLLM server on GPU {vllm_gpu} as background process...'
CUDA_VISIBLE_DEVICES={vllm_gpu} nohup trl vllm-serve --model {vllm_model_arg} --port 8000 --gpu-memory-utilization 0.9 > /tmp/vllm_server.log 2>&1 & CUDA_VISIBLE_DEVICES={vllm_gpu} trl vllm-serve --model {vllm_model_arg} --port 8000 --gpu-memory-utilization 0.5 &
VLLM_PID=$! VLLM_PID=$!
echo "VLLM server started with PID: $VLLM_PID" echo "VLLM server started with PID: $VLLM_PID"