Giving more memory buffer

Jake Poznanski 2025-09-03 19:56:53 +00:00
parent 7346d12322
commit b689a8e5f8
3 changed files with 14 additions and 8 deletions

View File

@@ -9,7 +9,7 @@ import os
import random
import sys
from pathlib import Path
from typing import List, Tuple, Any, Dict
from typing import List, Tuple, Any, Dict, Optional
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -25,6 +25,7 @@ class CleanedDocument(BaseModel):
cleaned_text: str = Field(description="The cleaned and corrected version of the OCR transcription")
confidence_score: float = Field(description="Confidence score from 0 to 1 indicating how confident the model is in the cleaning", ge=0.0, le=1.0)
corrections_made: List[str] = Field(description="List of major corrections or improvements made to the text")
is_page_all_blank: bool = Field(description="Document consists entirely of a blank page, or contains only headers/footers that would otherwise be removed")
@dataclass
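
The new is_page_all_blank flag lets the model report empty pages explicitly instead of forcing cleaned_text to carry that signal. As a point of reference only, here is a minimal sketch of how such a Pydantic schema can be enforced with OpenAI structured outputs; the client setup, model name, and prompt are illustrative assumptions, not the repository's actual call.

from typing import List
from openai import OpenAI
from pydantic import BaseModel, Field

class CleanedDocument(BaseModel):
    # Condensed copy of the schema above, kept here so the example is self-contained.
    cleaned_text: str
    confidence_score: float = Field(ge=0.0, le=1.0)
    corrections_made: List[str] = Field(default_factory=list)
    is_page_all_blank: bool = False

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment
completion = client.beta.chat.completions.parse(
    model="gpt-4o-2024-08-06",  # placeholder model name
    messages=[{"role": "user", "content": "Clean this OCR transcription: ..."}],
    response_format=CleanedDocument,  # the API validates the reply against this schema
)
result = completion.choices[0].message.parsed
if result.is_page_all_blank:
    print("Model flagged a blank page; an empty output file will be written.")
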
@@ -156,8 +157,10 @@ def clean_document_with_chatgpt(
"3. Remove any original transcriber's marks and notes, usually indicated by [ and ] symbols.\n"
"4. Fix word breaks and line breaks\n"
"5. Ensure mathematical formulas and special characters are correct\n"
"6. Maintain the semantic structure of the document\n"
"7. Remove any headers or footers that are not semantically relevant to the main document contents, ex page numbers\n"
"6. If there are any figures or charts, label them with the following markdown syntax ![Alt text describing the contents of the figure](page_startx_starty_width_height.png)\n"
"7. Maintain the semantic structure of the document\n"
"8. Remove any headers or footers that are not semantically relevant to the main document contents, ex page numbers, document classifications, etc.\n"
"9. If the page is blank, you are allowed to return 'null' for the text.\n"
"Return a cleaned version that accurately represents the original document."
)
}
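
Instruction 6 asks the model to reference figures with a filename that encodes the crop box (page_startx_starty_width_height.png). A quick way to check that cleaned text follows this convention is a regex pass; the sketch below assumes the four fields are integer pixel values and that the leading "page" token is literal, neither of which is confirmed by this diff.

import re

# Hypothetical validator for references like ![Yield chart](page_120_340_800_600.png).
FIGURE_REF = re.compile(r"!\[[^\]]*\]\(page_\d+_\d+_\d+_\d+\.png\)")

def find_figure_refs(cleaned_text: str) -> list[str]:
    """Return every figure reference embedded in the cleaned markdown."""
    return FIGURE_REF.findall(cleaned_text)

print(find_figure_refs("Intro text. ![Yield chart](page_120_340_800_600.png) More text."))
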
@@ -236,7 +239,10 @@ def process_document(
output_path.parent.mkdir(parents=True, exist_ok=True)
# Write cleaned text
output_path.write_text(cleaned_result.cleaned_text, encoding='utf-8')
if cleaned_result.is_page_all_blank:
output_path.write_text("", encoding='utf-8')
else:
output_path.write_text(cleaned_result.cleaned_text, encoding='utf-8')
# Create soft link for the original MD file as .md.orig
orig_md_link_path = output_path.with_suffix('.md.orig')
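
The new branch writes an empty file when the model flags a blank page, so downstream steps still find an output for every input. The .md.orig soft link mentioned in the comment is created outside this hunk; a hedged sketch of the whole output step, with the helper name and symlink handling as assumptions, could look like this:

from pathlib import Path

def write_cleaned_output(output_path: Path, cleaned_text: str,
                         is_page_all_blank: bool, original_md_path: Path) -> None:
    # Hypothetical helper mirroring the diff above; the symlink details are
    # assumptions, since that code falls outside this hunk.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Blank pages still produce a (zero-byte) .md so every input has an output.
    output_path.write_text("" if is_page_all_blank else cleaned_text, encoding="utf-8")
    # Keep the original OCR markdown reachable next to the cleaned file.
    orig_md_link_path = output_path.with_suffix(".md.orig")
    orig_md_link_path.unlink(missing_ok=True)
    orig_md_link_path.symlink_to(original_md_path)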

View File

@@ -275,9 +275,9 @@ set -e
# Create output directory
mkdir -p {local_output_dir if local_output_dir else "/tmp/checkpoints"}
# Start VLLM server in background
# Start VLLM server in background (output goes to console)
echo 'Starting VLLM server on GPU {vllm_gpu} as background process...'
CUDA_VISIBLE_DEVICES={vllm_gpu} nohup trl vllm-serve --model {vllm_model_arg} --port 8000 --gpu-memory-utilization 0.9 > /tmp/vllm_server.log 2>&1 &
CUDA_VISIBLE_DEVICES={vllm_gpu} trl vllm-serve --model {vllm_model_arg} --port 8000 --gpu-memory-utilization 0.5 &
VLLM_PID=$!
echo "VLLM server started with PID: $VLLM_PID"

View File

@@ -238,9 +238,9 @@ set -e
# Setup commands
{" && ".join(setup_commands)}
# Start VLLM server in background
# Start VLLM server in background (output goes to console)
echo 'Starting VLLM server on GPU {vllm_gpu} as background process...'
CUDA_VISIBLE_DEVICES={vllm_gpu} nohup trl vllm-serve --model {vllm_model_arg} --port 8000 --gpu-memory-utilization 0.9 > /tmp/vllm_server.log 2>&1 &
CUDA_VISIBLE_DEVICES={vllm_gpu} trl vllm-serve --model {vllm_model_arg} --port 8000 --gpu-memory-utilization 0.5 &
VLLM_PID=$!
echo "VLLM server started with PID: $VLLM_PID"