Mirror of https://github.com/allenai/olmocr.git, synced 2025-10-27 16:12:13 +00:00
Performance fixes with vllm backend
commit 2f1957b401
parent d71703317d
@@ -63,6 +63,9 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
 RUN uv pip install --system --no-cache -e .
 RUN uv pip install --system --no-cache ".[gpu]" --extra-index-url https://download.pytorch.org/whl/cu128
 
+# TODO Try this and measure performance on it
+#RUN uv pip install --system https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.5%2Bcu128torch2.7-cp38-abi3-linux_x86_64.whl
+
 RUN uv pip install --system --no-cache ".[bench]"
 RUN playwright install-deps
 RUN playwright install chromium
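The commented-out wheel above is FlashInfer, an alternative attention backend for vLLM. As a minimal sketch of how it could be exercised once installed (not part of this commit; the model name and port are placeholders, and it assumes a vLLM build that honors the VLLM_ATTENTION_BACKEND environment variable):

import os
import subprocess

# Ask vLLM to use the FlashInfer attention backend (assumption: the installed
# vLLM build supports VLLM_ATTENTION_BACKEND); model and port are placeholders.
env = dict(os.environ, VLLM_ATTENTION_BACKEND="FLASHINFER")
subprocess.run(
    ["vllm", "serve", "allenai/olmOCR-7B-0225-preview", "--port", "30024"],
    env=env,
    check=True,
)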
@@ -72,6 +72,38 @@ class MetricsKeeper:
         return "\n".join(lines)
 
+    def get_total_metrics(self):
+        """
+        Returns the total cumulative metrics since the MetricsKeeper was created.
+
+        Returns:
+            dict: Dictionary of metric names to their total values.
+        """
+        return dict(self.total_metrics)
+
+    def get_metrics_summary(self):
+        """
+        Returns a summary of metrics including totals and rates.
+
+        Returns:
+            dict: Dictionary containing total metrics and overall rates.
+        """
+        current_time = time.time()
+        elapsed_time = current_time - self.start_time
+
+        summary = {
+            "elapsed_time_seconds": elapsed_time,
+            "total_metrics": dict(self.total_metrics),
+            "rates": {}
+        }
+
+        # Calculate rates for each metric
+        if elapsed_time > 0:
+            for key, value in self.total_metrics.items():
+                summary["rates"][f"{key}_per_sec"] = value / elapsed_time
+
+        return summary
+
 
 class WorkerTracker:
     def __init__(self):
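A hypothetical usage sketch of the two new helpers. It assumes MetricsKeeper exposes an add_metrics(**kwargs)-style updater feeding self.total_metrics, which this hunk implies but does not show:

import time

keeper = MetricsKeeper()
keeper.add_metrics(server_input_tokens=1200, server_output_tokens=340)  # assumed updater API
time.sleep(1)  # let some wall-clock time elapse so rates are finite

summary = keeper.get_metrics_summary()
print(summary["total_metrics"])  # {'server_input_tokens': 1200, 'server_output_tokens': 340}
print(summary["rates"])          # per-second rates over the keeper's lifetime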
@@ -607,7 +607,7 @@ async def vllm_server_task(model_name_or_path, args, semaphore):
             logger.error("Cannot continue, sampling errors detected, model is probably corrupt")
             sys.exit(1)
 
-        if not server_printed_ready_message and "The server is fired up and ready to roll!" in line:
+        if not server_printed_ready_message and ("The server is fired up and ready to roll!" in line or "vllm server is ready" in line):
             server_printed_ready_message = True
             last_semaphore_release = time.time()
 
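The change above accepts either the legacy "fired up and ready to roll" banner or the newer vLLM readiness message. The check reduces to a small predicate, sketched standalone here:

# Both markers are taken verbatim from the diff above; matching either one
# flips server_printed_ready_message and releases the semaphore.
READY_MARKERS = (
    "The server is fired up and ready to roll!",
    "vllm server is ready",
)

def is_ready_line(line: str) -> bool:
    return any(marker in line for marker in READY_MARKERS)

assert is_ready_line("INFO: vllm server is ready to accept requests")
assert not is_ready_line("INFO: loading model weights")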
@@ -740,7 +740,8 @@ def submit_beaker_job(args):
     b = Beaker.from_env(default_workspace=args.beaker_workspace)
     account = b.account.whoami()
     owner = account.name
-    beaker_image = f"jakep/olmocr-inference-{VERSION}"
+    #beaker_image = f"jakep/olmocr-inference-{VERSION}"
+    beaker_image = "jakep/olmocr-benchmark-0.1.71-d71703317d"
 
     task_name = f"olmocr-{os.path.basename(args.workspace.rstrip('/'))}"
 
@@ -1163,6 +1164,37 @@ async def main():
 
     vllm_server.cancel()
     metrics_task.cancel()
+
+    # Output final metrics summary
+    metrics_summary = metrics.get_metrics_summary()
+    logger.info("=" * 80)
+    logger.info("FINAL METRICS SUMMARY")
+    logger.info("=" * 80)
+    logger.info(f"Total elapsed time: {metrics_summary['elapsed_time_seconds']:.2f} seconds")
+
+    # Output token counts and rates
+    total_metrics = metrics_summary['total_metrics']
+    rates = metrics_summary['rates']
+
+    # Calculate total tokens (input + output)
+    total_tokens = total_metrics.get('server_input_tokens', 0) + total_metrics.get('server_output_tokens', 0)
+    total_finished_tokens = total_metrics.get('finished_input_tokens', 0) + total_metrics.get('finished_output_tokens', 0)
+
+    logger.info(f"Total tokens processed: {total_tokens:,}")
+    logger.info(f"  - Input tokens: {total_metrics.get('server_input_tokens', 0):,}")
+    logger.info(f"  - Output tokens: {total_metrics.get('server_output_tokens', 0):,}")
+
+    logger.info(f"Total tokens in finished documents: {total_finished_tokens:,}")
+    logger.info(f"  - Finished input tokens: {total_metrics.get('finished_input_tokens', 0):,}")
+    logger.info(f"  - Finished output tokens: {total_metrics.get('finished_output_tokens', 0):,}")
+
+    # Output rates
+    if 'server_output_tokens_per_sec' in rates:
+        logger.info(f"Output tokens/sec rate: {rates['server_output_tokens_per_sec']:.2f}")
+    if 'server_input_tokens_per_sec' in rates:
+        logger.info(f"Input tokens/sec rate: {rates['server_input_tokens_per_sec']:.2f}")
+
+    logger.info("=" * 80)
     logger.info("Work done")
 
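For reference, a standalone sketch of the rate arithmetic and formatting used in the summary above, with placeholder numbers rather than measured results:

elapsed_time = 500.0              # placeholder: seconds since MetricsKeeper start
server_output_tokens = 1_000_000  # placeholder token count

rate = server_output_tokens / elapsed_time  # same value / elapsed math as get_metrics_summary
print(f"Total tokens processed: {server_output_tokens:,}")  # Total tokens processed: 1,000,000
print(f"Output tokens/sec rate: {rate:.2f}")                # Output tokens/sec rate: 2000.00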