mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-12 16:39:40 +00:00
Giving more memory buffer
This commit is contained in:
parent
7346d12322
commit
b689a8e5f8
@ -9,7 +9,7 @@ import os
|
|||||||
import random
|
import random
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Tuple, Any, Dict
|
from typing import List, Tuple, Any, Dict, Optional
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
|
||||||
@ -25,6 +25,7 @@ class CleanedDocument(BaseModel):
|
|||||||
cleaned_text: str = Field(description="The cleaned and corrected version of the OCR transcription")
|
cleaned_text: str = Field(description="The cleaned and corrected version of the OCR transcription")
|
||||||
confidence_score: float = Field(description="Confidence score from 0 to 1 indicating how confident the model is in the cleaning", ge=0.0, le=1.0)
|
confidence_score: float = Field(description="Confidence score from 0 to 1 indicating how confident the model is in the cleaning", ge=0.0, le=1.0)
|
||||||
corrections_made: List[str] = Field(description="List of major corrections or improvements made to the text")
|
corrections_made: List[str] = Field(description="List of major corrections or improvements made to the text")
|
||||||
|
is_page_all_blank: bool = Field(description="Document consistents entire of blank page, or only headers/footers that would otherwise be removed")
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@ -156,8 +157,10 @@ def clean_document_with_chatgpt(
|
|||||||
"3. Remove any original transcriber's marks and notes, usually indicated by [ and ] symbols.\n"
|
"3. Remove any original transcriber's marks and notes, usually indicated by [ and ] symbols.\n"
|
||||||
"4. Fix word breaks and line breaks\n"
|
"4. Fix word breaks and line breaks\n"
|
||||||
"5. Ensure mathematical formulas and special characters are correct\n"
|
"5. Ensure mathematical formulas and special characters are correct\n"
|
||||||
"6. Maintain the semantic structure of the document\n"
|
"6. If there are any figures or charts, label them with the following markdown syntax \n"
|
||||||
"7. Remove any headers or footers that are not semantically relevant to the main document contents, ex page numbers\n"
|
"7. Maintain the semantic structure of the document\n"
|
||||||
|
"8. Remove any headers or footers that are not semantically relevant to the main document contents, ex page numbers, document classifications, etc.\n"
|
||||||
|
"9. If the page is blank, you are allowed to return 'null' for the text.\n"
|
||||||
"Return a cleaned version that accurately represents the original document."
|
"Return a cleaned version that accurately represents the original document."
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@ -236,6 +239,9 @@ def process_document(
|
|||||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
# Write cleaned text
|
# Write cleaned text
|
||||||
|
if cleaned_result.is_page_all_blank:
|
||||||
|
output_path.write_text("", encoding='utf-8')
|
||||||
|
else:
|
||||||
output_path.write_text(cleaned_result.cleaned_text, encoding='utf-8')
|
output_path.write_text(cleaned_result.cleaned_text, encoding='utf-8')
|
||||||
|
|
||||||
# Create soft link for the original MD file as .md.orig
|
# Create soft link for the original MD file as .md.orig
|
||||||
|
|||||||
@ -275,9 +275,9 @@ set -e
|
|||||||
# Create output directory
|
# Create output directory
|
||||||
mkdir -p {local_output_dir if local_output_dir else "/tmp/checkpoints"}
|
mkdir -p {local_output_dir if local_output_dir else "/tmp/checkpoints"}
|
||||||
|
|
||||||
# Start VLLM server in background
|
# Start VLLM server in background (output goes to console)
|
||||||
echo 'Starting VLLM server on GPU {vllm_gpu} as background process...'
|
echo 'Starting VLLM server on GPU {vllm_gpu} as background process...'
|
||||||
CUDA_VISIBLE_DEVICES={vllm_gpu} nohup trl vllm-serve --model {vllm_model_arg} --port 8000 --gpu-memory-utilization 0.9 > /tmp/vllm_server.log 2>&1 &
|
CUDA_VISIBLE_DEVICES={vllm_gpu} trl vllm-serve --model {vllm_model_arg} --port 8000 --gpu-memory-utilization 0.5 &
|
||||||
VLLM_PID=$!
|
VLLM_PID=$!
|
||||||
echo "VLLM server started with PID: $VLLM_PID"
|
echo "VLLM server started with PID: $VLLM_PID"
|
||||||
|
|
||||||
|
|||||||
@ -238,9 +238,9 @@ set -e
|
|||||||
# Setup commands
|
# Setup commands
|
||||||
{" && ".join(setup_commands)}
|
{" && ".join(setup_commands)}
|
||||||
|
|
||||||
# Start VLLM server in background
|
# Start VLLM server in background (output goes to console)
|
||||||
echo 'Starting VLLM server on GPU {vllm_gpu} as background process...'
|
echo 'Starting VLLM server on GPU {vllm_gpu} as background process...'
|
||||||
CUDA_VISIBLE_DEVICES={vllm_gpu} nohup trl vllm-serve --model {vllm_model_arg} --port 8000 --gpu-memory-utilization 0.9 > /tmp/vllm_server.log 2>&1 &
|
CUDA_VISIBLE_DEVICES={vllm_gpu} trl vllm-serve --model {vllm_model_arg} --port 8000 --gpu-memory-utilization 0.5 &
|
||||||
VLLM_PID=$!
|
VLLM_PID=$!
|
||||||
echo "VLLM server started with PID: $VLLM_PID"
|
echo "VLLM server started with PID: $VLLM_PID"
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user