update reference numbers

This commit is contained in:
rasbt 2024-10-11 12:12:05 -05:00
parent b66d846cf6
commit c36f623472

View File

@ -2,7 +2,9 @@
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"id": "FtQYMbLvgzO-"
},
"source": [
"<table style=\"width:100%\">\n",
"<tr>\n",
@ -21,14 +23,18 @@
},
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"id": "EbrESHKtgzPA"
},
"source": [
"## FLOPS Analysis"
"# FLOPS Analysis"
]
},
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"id": "xS2WjniMgzPB"
},
"source": [
"- FLOPs (Floating Point Operations) measure the computational complexity of neural network models by counting the number of floating-point operations executed\n",
"- High FLOPs indicate more intensive computation and energy consumption"
@ -36,8 +42,10 @@
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"execution_count": null,
"metadata": {
"id": "L01-NzkggzPB"
},
"outputs": [],
"source": [
"# pip install -r requirements-extra.txt"
@ -45,15 +53,21 @@
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ObzfVatqgzPC",
"outputId": "3ead6a41-ac38-4db1-9fc3-012fb3ad18cd"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"thop version: 0.1.1-2209072238\n",
"torch version: 2.2.1+cu121\n"
"torch version: 2.4.1+cu121\n"
]
}
],
@ -70,21 +84,32 @@
},
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"id": "74UpjSLjgzPC"
},
"source": [
"&nbsp;\n",
"# Simple benchmark with fixed batch size"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "90pnCK39gzPD"
},
"source": [
"- forward pass only"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "GerIdRMXd6g9",
"outputId": "ccdd5c71-d221-4a84-f9bc-09557e77162d"
"outputId": "177c6d00-a817-40fe-badd-95cfa8ac9b51"
},
"outputs": [
{
@ -141,16 +166,33 @@
},
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"id": "_S6V05QmgzPD"
},
"source": [
"&nbsp;\n",
"# Simple benchmark with automatic batch size finding"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "amw4E983gzPD"
},
"source": [
"- forward pass only"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "h08VOiqpgzPE",
"outputId": "a6a90ef8-28fb-4b55-9268-6915b0c84c51"
},
"outputs": [
{
"name": "stdout",
@ -158,25 +200,26 @@
"text": [
"\n",
"Processing gpt-small (124M)\n",
" Batch size 128: 3.2e+13 FLOPS\n",
" Batch size 160: 4.0e+13 FLOPS\n",
" Batch size 176: 4.5e+13 FLOPS\n",
" Batch size 184: 4.7e+13 FLOPS\n",
" Batch size 186: 4.7e+13 FLOPS\n",
" Batch size 256: 6.5e+13 FLOPS\n",
" Batch size 384: 9.7e+13 FLOPS\n",
" Batch size 388: 9.8e+13 FLOPS\n",
" Batch size 389: 9.8e+13 FLOPS\n",
"\n",
"Processing gpt-medium (355M)\n",
" Batch size 128: 9.3e+13 FLOPS\n",
" Batch size 136: 9.8e+13 FLOPS\n",
" Batch size 140: 1.0e+14 FLOPS\n",
" Batch size 142: 1.0e+14 FLOPS\n",
" Batch size 143: 1.0e+14 FLOPS\n",
" Batch size 256: 1.9e+14 FLOPS\n",
" Batch size 260: 1.9e+14 FLOPS\n",
" Batch size 262: 1.9e+14 FLOPS\n",
" Batch size 263: 1.9e+14 FLOPS\n",
"\n",
"Processing gpt-large (774M)\n",
" Batch size 128: 2.0e+14 FLOPS\n",
" Batch size 256: 4.0e+14 FLOPS\n",
"\n",
"Processing gpt-xl (1558M)\n",
" Batch size 64: 2.0e+14 FLOPS\n",
" Batch size 96: 3.1e+14 FLOPS\n"
" Batch size 128: 4.1e+14 FLOPS\n",
" Batch size 136: 4.3e+14 FLOPS\n",
" Batch size 140: 4.5e+14 FLOPS\n",
" Batch size 142: 4.5e+14 FLOPS\n",
" Batch size 143: 4.6e+14 FLOPS\n"
]
}
],
@ -232,7 +275,9 @@
},
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"id": "V4lD7tfcgzPE"
},
"source": [
"&nbsp;\n",
"# Benchmark with automatic batch size finding and Model FLOPs Utilization (MFU)"
@ -240,7 +285,9 @@
},
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"id": "70Y2mblVgzPE"
},
"source": [
"- Model FLOPs Utilization (MFU) explanation from the [PaLM paper](https://arxiv.org/abs/2204.02311)\n",
"\n",
@ -258,102 +305,103 @@
"$$\\text{Tokens per Second} = \\frac{\\text{Batch Size} \\times \\text{Sequence Length}}{\\text{Total Time}}$$"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "TKttjC8xgzPF"
},
"source": [
"- forward and backward pass"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"execution_count": null,
"metadata": {
"id": "6aO4rjtNgzPF"
},
"outputs": [],
"source": [
"# Max flops per second provided by the GPU manufacturer\n",
"# Theoretical max flops per second provided by the GPU manufacturer\n",
"\n",
"flops_per_second = {\n",
" # https://www.techpowerup.com/gpu-specs/h100-pcie-80-gb.c3899\n",
" \"H100\": {\n",
" torch.float32: 60e12, # 60 TFLOPs for FP32 on NVIDIA H100\n",
" torch.float16: 1.979e15, # 1979 TFLOPs for FP16 on NVIDIA H100\n",
" torch.bfloat16: 1.979e15\n",
" torch.float32: 51.22e12, # 51.22 TFLOPs for FP32 on NVIDIA H100\n",
" torch.float16: 204.9e12, # 204.9 TFLOPs for FP16 on NVIDIA H100\n",
" torch.bfloat16: 204.9e12\n",
" },\n",
" # https://www.techpowerup.com/gpu-specs/l4.c4091\n",
" \"L4\": {\n",
" torch.float32: 15e12, # 15 TFLOPs for FP32 on NVIDIA L4\n",
" torch.float16: 30e12, # 30 TFLOPs for FP16 on NVIDIA L4\n",
" torch.bfloat16: 30e12 \n",
" torch.float32: 30.29e12, # 30.29 TFLOPs for FP32 on NVIDIA L4\n",
" torch.float16: 30.29e12, # 30.29 TFLOPs for FP16 on NVIDIA L4\n",
" torch.bfloat16: 30.29e12\n",
" },\n",
" # https://www.techpowerup.com/gpu-specs/tesla-t4.c3316\n",
" \"T4\": {\n",
" torch.float32: 8.1e12, # 8.1 TFLOPs for FP32 on NVIDIA T4\n",
" torch.float16: 130e12, # 130 TFLOPs for FP16 on NVIDIA T4\n",
" torch.bfloat16: 130e12\n",
" torch.float16: 65.13e12, # 65.13 TFLOPs for FP16 on NVIDIA T4\n",
" torch.bfloat16: 65.13e12\n",
" },\n",
" # https://www.techpowerup.com/gpu-specs/a10g.c3798\n",
" \"A10G\": {\n",
" torch.float32: 15.6e12, # 15.6 TFLOPs for FP32 on NVIDIA A10G\n",
" torch.float16: 78e12, # 78 TFLOPs for FP16 on NVIDIA A10G\n",
" torch.bfloat16: 78e12\n",
" torch.float32: 31.52e12, # 31.52 TFLOPs for FP32 on NVIDIA A10G\n",
" torch.float16: 31.52e12, # 31.52 TFLOPs for FP16 on NVIDIA A10G\n",
" torch.bfloat16: 31.52e12\n",
" },\n",
" # https://www.techpowerup.com/gpu-specs/a100-pcie-40-gb.c3623\n",
" \"A100\": {\n",
" torch.float32: 19.5e12, # 19.5 TFLOPs for FP32 on NVIDIA A100\n",
" torch.float16: 1.248e15, # 1248 TFLOPs for FP16 on NVIDIA A100\n",
" torch.bfloat16: 1.248e15\n",
" },\n",
" \"H200\": {\n",
" torch.float32: 70e12, # 70 TFLOPs for FP32 on NVIDIA H200\n",
" torch.float16: 1.2e15, # Assuming 1200 TFLOPs for FP16 on NVIDIA H200\n",
" torch.bfloat16: 1.2e15\n",
" torch.float32: 19.49e12, # 19.49 TFLOPs for FP32 on NVIDIA A100\n",
" torch.float16: 77.97e12, # 77.97 TFLOPs for FP16 on NVIDIA A100\n",
" torch.bfloat16: 77.97e12\n",
" },\n",
" # https://www.techpowerup.com/gpu-specs/geforce-rtx-3080.c3621\n",
" \"RTX_3080\": {\n",
" torch.float32: 29.8e12, # 29.8 TFLOPs for FP32 on NVIDIA RTX 3080\n",
" torch.float16: 59.6e12, # 59.6 TFLOPs for FP16 on NVIDIA RTX 3080\n",
" torch.bfloat16: 59.6e12\n",
" torch.float32: 29.77e12, # 29.77 TFLOPs for FP32 on NVIDIA RTX 3080\n",
" torch.float16: 29.77e12, # 29.77 TFLOPs for FP16 on NVIDIA RTX 3080\n",
" torch.bfloat16: 29.77e12\n",
" },\n",
" # https://www.techpowerup.com/gpu-specs/geforce-rtx-3090.c3622\n",
" \"RTX_3090\": {\n",
" torch.float32: 35.6e12, # 35.6 TFLOPs for FP32 on NVIDIA RTX 3090\n",
" torch.float16: 71.2e12, # 71.2 TFLOPs for FP16 on NVIDIA RTX 3090\n",
" torch.bfloat16: 71.2e12\n",
" },\n",
" \"GTX_1080\": {\n",
" torch.float32: 8.9e12, # 8.9 TFLOPs for FP32 on NVIDIA GTX 1080\n",
" torch.float16: 8.9e12, # No dedicated FP16 performance; using FP32 value\n",
" torch.bfloat16: 8.9e12\n",
" },\n",
" \"GTX_1080Ti\": {\n",
" torch.float32: 11.3e12, # 11.3 TFLOPs for FP32 on NVIDIA GTX 1080Ti\n",
" torch.float16: 11.3e12, # No dedicated FP16 performance; using FP32 value\n",
" torch.bfloat16: 11.3e12\n",
" },\n",
" \"GTX_1660\": {\n",
" torch.float32: 5e12, # 5 TFLOPs for FP32 on NVIDIA GTX 1660\n",
" torch.float16: 5e12, # No dedicated FP16 performance; using FP32 value\n",
" torch.bfloat16: 5e12\n",
" },\n",
" \"GTX_1660Ti\": {\n",
" torch.float32: 5.5e12, # 5.5 TFLOPs for FP32 on NVIDIA GTX 1660Ti\n",
" torch.float16: 5.5e12, # No dedicated FP16 performance; using FP32 value\n",
" torch.bfloat16: 5.5e12\n",
" torch.float32: 35.58e12, # 35.58 TFLOPs for FP32 on NVIDIA RTX 3090\n",
" torch.float16: 35.58e12, # 35.58 TFLOPs for FP16 on NVIDIA RTX 3090\n",
" torch.bfloat16: 35.58e12\n",
" }\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"execution_count": null,
"metadata": {
"colab": {
"background_save": true,
"base_uri": "https://localhost:8080/"
},
"id": "HW5qWfE7gzPF",
"outputId": "bb1663bc-ee66-44f1-f54d-0bb66ee0d0c2"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"GPU Model: L4\n",
"GPU Model: A100\n",
"\n",
"Processing gpt-small (124M)\n",
" Batch size 8: Tokens/sec: 14488.21, MFU: 0.3580\n",
" Batch size 12: Tokens/sec: 15378.16, MFU: 0.3799\n",
" Batch size 16: Tokens/sec: 34248.82, MFU: 0.3256\n",
" Batch size 24: Tokens/sec: 62568.34, MFU: 0.5948\n",
"\n",
"Processing gpt-medium (355M)\n",
" Batch size 2: Tokens/sec: 6493.81, MFU: 0.4591\n",
" Batch size 3: Tokens/sec: 6328.82, MFU: 0.4474\n",
" Batch size 4: Tokens/sec: 20159.93, MFU: 0.5483\n",
" Batch size 6: Tokens/sec: 21717.66, MFU: 0.5907\n",
" Batch size 7: Tokens/sec: 22536.25, MFU: 0.6130\n",
"\n",
"Processing gpt-large (774M)\n",
" Batch size 4: Tokens/sec: 3130.38, MFU: 0.4834\n",
" Batch size 8: Tokens/sec: 12465.21, MFU: 0.7406\n",
"\n",
"Processing gpt-xl (1558M)\n",
" Batch size 2: Tokens/sec: 1896.17, MFU: 0.5897\n"
" Batch size 4: Tokens/sec: 6779.92, MFU: 0.8113\n"
]
}
],
@ -471,8 +519,11 @@
},
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"id": "LovmswRigzPG"
},
"source": [
"- a value of 1.0 is best (equal to 100%)\n",
"- Note that the batch sizes are smaller than before because we also carry out the backward pass here, which is more memory-intensive"
]
}
@ -503,5 +554,5 @@
}
},
"nbformat": 4,
"nbformat_minor": 4
"nbformat_minor": 0
}