|
import gradio as gr |
|
import pandas as pd |
|
|
|
|
|
|
|
|
|
# Column headers of the record table; each computed configuration is stored
# as one row whose values follow exactly this order.
col=['L', 'H', 'FFN', 'S', 'A', 'G',
     'DP', 'TP', 'PP', 'CP', 'GPUs', 'B', 'FP8', 'Model parameters (B)', 'Model states (GB)', 'Activation (GB)', 'Total (GB)']

# Markdown legend that expands the single-letter model symbols used in `col`.
# NOTE(review): not referenced anywhere else in the visible part of this file.
abbr = """
<div align="center">

> **Abbreviations of symbols:**
|Abbr|Full name|Abbr|Full name|Abbr|Full name|Abbr|Full name|Abbr|Full name|Abbr|Full name|
|---|---|---|---|---|---|---|---|---|---|---|---|
|L|Layer number|H|Hidden size|FFN|FFN Hidden size|S|Sequence length|A|Head number|G|Group number|

</div>
"""
|
|
|
def Get_GigaByte(memory):
    """Convert a size in bytes to binary gigabytes (1 GB = 1024**3 bytes)."""
    bytes_per_gigabyte = 1024**3
    return memory / bytes_per_gigabyte
|
|
|
def Get_BillionParameter(parameter):
    """Express a raw parameter count in billions (1 B = 1000**3 parameters)."""
    parameters_per_billion = 1000**3
    return parameter / parameters_per_billion
|
|
|
|
|
def Compute_Parameters_input(seq_length, hidden_size, vocab_size, act_func, tp):
    """Count the input-embedding parameters held by one tensor-parallel rank.

    Args:
        seq_length: Sequence length; sizes the position embedding.
        hidden_size: Hidden size.
        vocab_size: Vocabulary size; sizes the word embedding.
        act_func: Model type. ``"LLaMA"`` contributes no position-embedding
            parameters; any other value adds ``seq_length * hidden_size / tp``.
        tp: Tensor-parallel size both embeddings are divided by.

    Returns:
        Word-embedding plus position-embedding parameter count.
    """
    word_embedding = hidden_size * vocab_size / tp
    position_embedding = 0 if act_func == "LLaMA" else seq_length * hidden_size / tp
    return word_embedding + position_embedding
|
|
|
def Compute_Parameters_output(hidden_size, vocab_size, is_tie_word_embedding, act_func, tp):
    """Count the parameters of the final norm and the output embedding.

    Args:
        hidden_size: Hidden size.
        vocab_size: Vocabulary size.
        is_tie_word_embedding: ``"True"`` means the output embedding adds no
            parameters; otherwise it adds ``hidden_size * vocab_size / tp``.
        act_func: Model type. ``"LLaMA"`` counts ``hidden_size`` norm
            parameters, anything else counts ``2 * hidden_size``.
        tp: Tensor-parallel size the output embedding is divided by.

    Returns:
        Norm plus output-embedding parameter count.
    """
    layernorm = hidden_size if act_func == "LLaMA" else 2 * hidden_size
    if is_tie_word_embedding == "True":
        output_embedding = 0
    else:
        output_embedding = hidden_size * vocab_size / tp
    return layernorm + output_embedding
|
|
|
def Compute_Parameters_attention(hidden_size, kv_hidden_size, is_bias, act_func, tp): |
|
|
|
|
|
if act_func == "LLaMA": |
|
num_parameters_attention = hidden_size |
|
else: |
|
num_parameters_attention = 2 * hidden_size |
|
|
|
|
|
num_parameters_attention_Q_weight = hidden_size * hidden_size / tp |
|
num_parameters_attention_KV_weight = 2 * kv_hidden_size * hidden_size / tp |
|
num_parameters_attention_Linear_weight = hidden_size * hidden_size / tp |
|
|
|
num_parameters_attention += num_parameters_attention_Q_weight + num_parameters_attention_KV_weight + num_parameters_attention_Linear_weight |
|
if is_bias == "True": |
|
num_parameters_attention += (hidden_size + 2 * kv_hidden_size) / tp + hidden_size |
|
|
|
return num_parameters_attention |
|
|
|
def Compute_Parameters_mlp(hidden_size, ffn_size, is_bias, act_func, tp):
    """Count the parameters of one MLP block on one tensor-parallel rank.

    Args:
        hidden_size: Hidden size.
        ffn_size: FFN hidden size.
        is_bias: ``"True"`` adds the bias terms.
        act_func: Model type. ``"LLaMA"`` counts three ``hidden x ffn`` weight
            matrices and ``hidden_size`` norm parameters; anything else counts
            two matrices and ``2 * hidden_size`` norm parameters.
        tp: Tensor-parallel size the weights are divided by.

    Returns:
        Norm + weight (+ bias) parameter count.
    """
    with_bias = is_bias == "True"

    if act_func == "LLaMA":
        total = hidden_size
        total += hidden_size * ffn_size * 3 / tp
        if with_bias:
            total += ffn_size * 2 / tp + hidden_size
        return total

    total = 2 * hidden_size
    total += hidden_size * ffn_size * 2 / tp
    if with_bias:
        total += ffn_size / tp + hidden_size
    return total
|
|
|
def Compute_Parameters(seq_length, vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func, head_num, tp, pp):
    """Count the model parameters for the given tensor/pipeline split.

    The transformer layers are divided evenly by ``pp``. The output layer
    (final norm + output embedding) is only added when ``pp == 1``; for any
    other ``pp`` the total is input embedding plus the layer share.

    Args:
        seq_length, vocab_size, layer_num, hidden_size, ffn_size, head_num:
            Model dimensions.
        is_group_query: ``"False"`` makes the number of query groups equal to
            ``head_num``.
        group_query_num: Number of query groups (used unless overridden above).
        is_bias, is_tie_word_embedding, act_func: Forwarded to the per-part
            counters.
        tp: Tensor-parallel size.
        pp: Pipeline-parallel size.

    Returns:
        Parameter count as computed above.
    """
    if is_group_query == "False":
        group_query_num = head_num
    kv_hidden_size = hidden_size / head_num * group_query_num

    # Input embedding.
    embedding_params = Compute_Parameters_input(seq_length, hidden_size, vocab_size, act_func, tp)

    # Transformer layers, shared out over the pipeline stages.
    attention_params = Compute_Parameters_attention(hidden_size, kv_hidden_size, is_bias, act_func, tp)
    mlp_params = Compute_Parameters_mlp(hidden_size, ffn_size, is_bias, act_func, tp)
    per_layer_params = attention_params + mlp_params
    layer_params = per_layer_params * layer_num / pp

    # Output layer.
    output_params = Compute_Parameters_output(hidden_size, vocab_size, is_tie_word_embedding, act_func, tp)

    total = embedding_params + layer_params
    if pp == 1:
        total = total + output_params
    return total
|
|
|
def Compute_Weight(numParametersTotal, precision, is_fp8, is_fp8_init):
    """Return the bytes needed to store the model weights.

    Args:
        numParametersTotal: Number of parameters.
        precision: ``"FP32"`` uses 4 bytes per parameter, anything else 2.
        is_fp8: ``"True"`` when FP8 training is enabled.
        is_fp8_init: ``"False"`` together with ``is_fp8 == "True"`` adds a
            further 2 bytes per parameter.

    Returns:
        Weight memory in bytes.
    """
    bytes_per_weight = 4 if precision == "FP32" else 2
    weight_memory = bytes_per_weight * numParametersTotal

    needs_extra_copy = is_fp8 == "True" and is_fp8_init == "False"
    if needs_extra_copy:
        weight_memory += 2 * numParametersTotal
    return weight_memory
|
|
|
def Compute_Gradient(numParametersTotal, g_ty):
    """Return the bytes needed to store the gradients.

    Args:
        numParametersTotal: Number of parameters.
        g_ty: Gradient dtype name: ``"FP32"`` (4 bytes per parameter) or
            ``"BF16"`` (2 bytes per parameter).

    Returns:
        Gradient memory in bytes.

    Raises:
        ValueError: If ``g_ty`` is not a supported dtype name. (Previously an
            unknown name fell through both branches and failed with an
            unrelated UnboundLocalError.)
    """
    bytes_per_gradient = {"FP32": 4, "BF16": 2}
    if g_ty not in bytes_per_gradient:
        raise ValueError(
            f"Unsupported gradient dtype {g_ty!r}; expected one of {sorted(bytes_per_gradient)}"
        )
    return bytes_per_gradient[g_ty] * numParametersTotal
|
|
|
def Compute_Optimizer_states(numParametersTotal, opt_func, o_ty, is_dist_opt, dp, cp):
    """Return the bytes needed to store the optimizer states.

    Two states per parameter are counted, each ``o_ty`` wide. With the
    distributed optimizer the total is divided by ``dp * cp``. ``"SGD"`` is
    counted as keeping no optimizer state at all.

    Args:
        numParametersTotal: Number of parameters.
        opt_func: Optimizer name; ``"SGD"`` yields 0.
        o_ty: Optimizer-state dtype name: ``"FP32"`` (4 bytes) or ``"BF16"``
            (2 bytes). Ignored for ``"SGD"``.
        is_dist_opt: ``"True"`` divides the states by ``dp * cp``.
        dp: Data-parallel size.
        cp: Context-parallel size.

    Returns:
        Optimizer-state memory in bytes.

    Raises:
        ValueError: If ``o_ty`` is not a supported dtype name and the
            optimizer is not ``"SGD"``.
    """
    # SGD keeps no state, so the dtype and sharding settings are irrelevant;
    # deciding this first avoids failing on an unused/unknown o_ty.
    if opt_func == "SGD":
        return 0

    bytes_per_state = {"FP32": 4, "BF16": 2}
    if o_ty not in bytes_per_state:
        raise ValueError(
            f"Unsupported optimizer state dtype {o_ty!r}; expected one of {sorted(bytes_per_state)}"
        )

    optimizer_memory = bytes_per_state[o_ty] * 2 * numParametersTotal
    if is_dist_opt == "True":
        optimizer_memory = optimizer_memory / (dp * cp)
    return optimizer_memory
|
|
|
def Compute_Master_weight(numParametersTotal, precision, is_dist_opt, dp, cp):
    """Return the bytes needed for the master copy of the weights.

    Args:
        numParametersTotal: Number of parameters.
        precision: ``"BF16"`` counts 4 bytes per parameter; any other value
            counts nothing.
        is_dist_opt: ``"True"`` divides the result by ``dp * cp``.
        dp: Data-parallel size.
        cp: Context-parallel size.

    Returns:
        Master-weight memory in bytes.
    """
    master_weight_memory = 4 * numParametersTotal if precision == "BF16" else 0
    if is_dist_opt != "True":
        return master_weight_memory
    return master_weight_memory / (dp * cp)
|
|
|
def Compute_Model_states(seq_length, vocab_size, layer_num, hidden_size, ffn_size, head_num, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func,
                         dp, tp, pp, cp, is_dist_opt, precision, is_fp8, is_fp8_init, g_ty, opt_func, o_ty):
    """Compute the parameter count and the memory of every model-state part.

    Returns:
        A 6-tuple ``(parameter count, weight bytes, gradient bytes,
        optimizer-state bytes, master-weight bytes, sum of the four byte
        figures)``, where the parameter count comes from
        ``Compute_Parameters`` with the given ``tp`` and ``pp``.
    """
    param_count = Compute_Parameters(seq_length, vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func, head_num, tp, pp)

    weights = Compute_Weight(param_count, precision, is_fp8, is_fp8_init)
    gradients = Compute_Gradient(param_count, g_ty)
    optimizer_states = Compute_Optimizer_states(param_count, opt_func, o_ty, is_dist_opt, dp, cp)
    master_weights = Compute_Master_weight(param_count, precision, is_dist_opt, dp, cp)

    model_states = weights + gradients + optimizer_states + master_weights
    return param_count, weights, gradients, optimizer_states, master_weights, model_states
|
|
|
|
|
def compute_activation_memory_attention(training_dtype, gemm_dtype, seq_length, b, hidden_size, kv_hidden_size, is_sp, tp): |
|
|
|
activation_mem_attn_ln = seq_length * b * hidden_size * training_dtype |
|
if is_sp == "False": |
|
activation_mem_attn_ln *= tp |
|
|
|
activation_mem_attn_qkv = seq_length * b * hidden_size * gemm_dtype |
|
if is_sp == "False": |
|
activation_mem_attn_qkv *= tp |
|
|
|
activation_mem_attn_q = seq_length * b * hidden_size * training_dtype |
|
|
|
activation_mem_attn_kv = seq_length * b * kv_hidden_size * training_dtype * 2 |
|
|
|
activation_mem_attn_proj = seq_length * b * hidden_size * gemm_dtype |
|
|
|
activation_mem_attn_dropout = seq_length * b * hidden_size |
|
if is_sp == "False": |
|
activation_mem_attn_dropout *= tp |
|
|
|
|
|
activation_memory_attn = ( |
|
activation_mem_attn_ln |
|
+ activation_mem_attn_qkv |
|
+ activation_mem_attn_q |
|
+ activation_mem_attn_kv |
|
+ activation_mem_attn_proj |
|
+ activation_mem_attn_dropout |
|
) |
|
return activation_memory_attn |
|
|
|
def compute_activation_memory_mlp(training_dtype, gemm_dtype, seq_length, b, hidden_size, ffn_size, act_func, is_sp, tp):
    """Sum the activation bytes counted for one MLP block.

    Args:
        training_dtype: Bytes per element of the training dtype.
        gemm_dtype: Bytes per element of the GEMM input dtype.
        seq_length: Sequence length.
        b: Micro batch size.
        hidden_size: Hidden size.
        ffn_size: FFN hidden size.
        act_func: Model type; ``"LLaMA"`` doubles the activation-function term.
        is_sp: ``"False"`` multiplies the norm input, FC1 input and dropout
            terms by ``tp``.
        tp: Tensor-parallel size.

    Returns:
        Norm input + FC1 input + activation + FC2 input + dropout terms.
    """
    without_sp = is_sp == "False"

    ln_input = seq_length * b * hidden_size * training_dtype
    fc1_input = seq_length * b * hidden_size * gemm_dtype
    dropout = seq_length * b * hidden_size
    if without_sp:
        ln_input *= tp
        fc1_input *= tp
        dropout *= tp

    act_input = seq_length * b * ffn_size * training_dtype
    if act_func == "LLaMA":
        act_input = seq_length * b * ffn_size * training_dtype * 2

    fc2_input = seq_length * b * ffn_size * gemm_dtype

    return ln_input + fc1_input + act_input + fc2_input + dropout
|
|
|
def compute_activation_memory_input(seq_length, b, hidden_size, pp):
    """Return the activation bytes counted for the input (embedding) part.

    Computed as ``8 * S * B * pp + S * B * H * pp``.
    """
    per_token_term = 8 * seq_length * b * pp
    per_hidden_term = seq_length * b * hidden_size * pp
    return per_token_term + per_hidden_term
|
|
|
def compute_activation_memory_output(seq_length, b, hidden_size, vocab_size):
    """Return the activation bytes counted for the output part.

    Computed as ``2 * S * B * H + (2 + 4 + 4) * S * B * V``.
    """
    hidden_term = 2 * seq_length * b * hidden_size
    vocab_term = (2 + 4 + 4) * seq_length * b * vocab_size
    return hidden_term + vocab_term
|
|
|
def compute_activation_memory_pp(activation_memory, vp, pp, num_microbatches):
    """Scale the activation memory according to the pipeline schedule.

    Args:
        activation_memory: Activation bytes before the pipeline adjustment.
        vp: Virtual pipeline size; a value above 0 applies the interleaved
            factor ``1 + (pp - 1) / (pp * vp)``.
        pp: Pipeline-parallel size.
        num_microbatches: Number of micro batches; with ``vp == 0``,
            ``pp > 1`` and more than one micro batch, the memory is scaled by
            ``min(1, num_microbatches / pp)``.

    Returns:
        The adjusted activation bytes.
    """
    if vp > 0:
        activation_memory *= 1 + (pp - 1) / (pp * vp)

    no_interleaving = vp == 0 and pp > 1
    if no_interleaving and num_microbatches > 1:
        in_flight_fraction = min(1, num_microbatches / pp)
        activation_memory *= in_flight_fraction

    return activation_memory
|
|
|
def compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, head_num, hidden_size, ffn_size, act_func, precision, is_fp8, is_sp, is_group_query, group_query_num, tp, pp, dp, cp, vp):
    """Compute the activation memory in bytes.

    Steps: per-layer attention + MLP activations times ``layer_num``, plus the
    input part, adjusted for the pipeline schedule, plus an output term, and
    finally divided by ``tp`` and ``cp``.

    Args:
        precision: ``"FP32"`` uses 4-byte training and GEMM dtypes; otherwise
            the training dtype is 2 bytes.
        is_fp8: For non-FP32 precision, ``"False"`` gives a 2-byte GEMM dtype
            and any other value a 1-byte one.
        is_group_query: ``"False"`` makes the number of query groups equal to
            ``head_num``.
        b: Micro batch size.
        b_global: Global batch size; ``b_global / b / dp / cp`` is used as the
            number of micro batches.
        Remaining arguments are model dimensions and parallel sizes forwarded
        to the helper functions.

    Returns:
        Activation memory in bytes.
    """
    # Element widths (bytes) of the training dtype and the GEMM input dtype.
    full_precision = precision == "FP32"
    training_dtype = 4 if full_precision else 2
    if full_precision:
        gemm_dtype = 4
    else:
        gemm_dtype = 2 if is_fp8 == "False" else 1

    # Key/value projection width.
    if is_group_query == "False":
        group_query_num = head_num
    kv_hidden_size = hidden_size / head_num * group_query_num

    attention_bytes = compute_activation_memory_attention(training_dtype, gemm_dtype, seq_length, b, hidden_size, kv_hidden_size, is_sp, tp)
    mlp_bytes = compute_activation_memory_mlp(training_dtype, gemm_dtype, seq_length, b, hidden_size, ffn_size, act_func, is_sp, tp)

    activation_memory = attention_bytes + mlp_bytes
    activation_memory *= layer_num
    activation_memory += compute_activation_memory_input(seq_length, b, hidden_size, pp)

    num_microbatches = b_global / b / dp / cp
    activation_memory = compute_activation_memory_pp(activation_memory, vp, pp, num_microbatches)

    if pp == 1:
        activation_memory += compute_activation_memory_output(seq_length, b, hidden_size, vocab_size)
    elif pp > 1:
        activation_memory += seq_length * b * hidden_size * 2

    return activation_memory / tp / cp
|
|
|
|
|
def Compute_ALL_Model_memory(vocab_size, layer_num, hidden_size, ffn_size, seq_length, head_num, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func,
                             dp, tp, pp, cp, is_sp, vp, is_dist_opt, b, b_global, precision, is_fp8, is_fp8_init, g_ty, opt_func, o_ty, record_df, count):
    """Validate the inputs, estimate memory, and append the result to the record table.

    Args:
        vocab_size ... o_ty: Model, parallelism and training settings passed
            in by the "Compute" button (see the UI definition).
        record_df: Table of previous results.
        count: Row counter; the value 1 means the table is still empty and is
            replaced instead of appended to.

    Returns:
        ``(text, record_df, count)``. On invalid input ``text`` is the
        accumulated error message and the table and counter are returned
        unchanged. Otherwise ``text`` is the formatted summary, the table
        gains one row and the counter is incremented.
    """
    # NOTE(review): an empty "Virtual Pipeline Size" field is expected to
    # arrive as None from gr.Number; treat it as 0 so `vp > 0` below cannot
    # raise TypeError. Confirm against the Gradio version in use.
    if vp is None:
        vp = 0

    if is_group_query == "True":
        group_query_num = int(group_query_num)
    else:
        group_query_num = head_num

    # Reject inconsistent settings before doing any arithmetic.
    result, Error_message = check_input(dp, tp, pp, cp, hidden_size, head_num, layer_num, seq_length, vp, b, b_global)
    if not result:
        return Error_message, record_df, count

    # Per-device parameter count and model states.
    numParameters, weight_memory, gradient_memory, optimizer_memory, master_weight_memory, model_states_memory = Compute_Model_states(seq_length, vocab_size, layer_num, hidden_size,
        ffn_size, head_num, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func, dp, tp, pp, cp, is_dist_opt, precision, is_fp8, is_fp8_init, g_ty, opt_func, o_ty)

    # Activation memory.
    activation_memory = compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, head_num, hidden_size, ffn_size, act_func, precision, is_fp8, is_sp, is_group_query, group_query_num, tp, pp, dp, cp, vp)

    # Whole-model parameter count: same formula with tp = pp = 1.
    numParametersTotal = Compute_Parameters(seq_length, vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func, head_num, 1, 1)

    gpu_num = dp * tp * pp * cp

    # Convert to display units.
    numParametersTotal = round(Get_BillionParameter(numParametersTotal), 3)
    numParameters = round(Get_BillionParameter(numParameters), 3)
    model_states_memory = round(Get_GigaByte(model_states_memory), 3)
    activation_memory = round(Get_GigaByte(activation_memory), 3)
    other_memory = 5  # fixed allowance in GB, also hard-coded in the text below
    Total = round(model_states_memory + activation_memory + other_memory, 3)

    # Record the configuration and its result as one table row.
    new_row = pd.DataFrame([[layer_num, hidden_size, ffn_size, seq_length, head_num, group_query_num, dp, tp, pp, cp, gpu_num, b, is_fp8,
                             numParametersTotal, model_states_memory, activation_memory, Total]],
                           columns=col)
    if count == 1:
        record_df = new_row
    else:
        # pd.concat is the public replacement for the private DataFrame._append.
        record_df = pd.concat([record_df, new_row], ignore_index=True)
    count = count + 1

    return f"""
    GPU numbers = {str(gpu_num)}, \n
    Model parameters = {str(numParametersTotal)} B, \n
    Model parameters on each device = {str(numParameters)} B, \n
    Model_states = Weight + Gradient + Optimizer = {str(model_states_memory)} GB, \n
    Activation = {str(activation_memory)} GB, \n
    Other memory = 5 GB, \n
    Total memory consumption = {str(Total)} GB \n
    """, record_df, count
|
|
|
def generate_csv(record_df):
    """Write the record table to ``data.csv`` (no index column) and return that file name."""
    target = "data.csv"
    record_df.to_csv(target, index=False)
    return target
|
|
|
|
|
# Markdown/LaTeX text shown in the "Formula" tab. It is a raw string, so the
# backslashes below reach the Markdown renderer unchanged. The images are
# referenced through relative `file/...` URLs.
formula = r"""
> **Note**🔑: In this formula, we assume LLM training with FP8 training.
> 1. LlaMA-family Model.
> 2. Interleaved pipeline.
> 3. bias = False.
> 4. SP = True.

<div align="center">
<img src=file/T1.jpg width=50%/>
</div>

$$
{Total\ Model\ parameters} =
HV + (4H^2 + 3H \times FFN + 2H) \times L + H
$$

***

<div align="center">
<img src=file/ms.png width=40%/>
</div>

$$
{Model\ states} =
(6 + \frac{12}{dp \times cp}) \times
(\frac{(\frac{4H^2 + 3H \times FFN}{tp} + 2H) \times L}{pp} + \frac{HV}{tp})
$$

$$
{Activation} =
(1 + \frac{pp-1}{pp \times vp}) \times
\frac{(8BS + BSH) \times pp + (15BSH + 5BS \times FFN) \times L}{tp \times cp}
$$

***

$$
\\begin{gather}
{GPU\ numbers} = tp \times pp \times dp \times cp\\\\
{Total\ memory\ consumption} = {Model\ states} + Activation
\\end{gather}
$$
"""
|
|
|
def check_tp(tp, head_num):
    """Return True when ``head_num`` is divisible by the tensor-parallel size ``tp``."""
    return head_num % tp == 0
|
|
|
def check_pp(pp, layer_num):
    """Return True when ``layer_num`` is divisible by the pipeline-parallel size ``pp``."""
    return layer_num % pp == 0
|
|
|
def check_cp(cp, seq_length):
    """Return True when ``seq_length`` is divisible by the context-parallel size ``cp``."""
    return seq_length % cp == 0
|
|
|
def check_hidden(hidden_size, head_num):
    """Return True when ``hidden_size`` is divisible by ``head_num``."""
    return hidden_size % head_num == 0
|
|
|
def check_b_global(b_global, b, dp, cp):
    """Return True when ``b_global`` is divisible by ``b * dp * cp``."""
    samples_per_step = b * dp * cp
    return b_global % samples_per_step == 0
|
|
|
def check_num_microbatch(layer_num, vp, pp, num_microbatches):
    """Check the layer / micro-batch divisibility required by the pipeline settings.

    * ``vp > 0``: ``layer_num`` must be divisible by ``pp * vp``.
    * ``vp == 0``, ``pp > 1`` and more than one micro batch:
      ``num_microbatches`` must be divisible by ``pp``.
    * Every other combination passes.
    """
    if vp > 0:
        return layer_num % (pp * vp) == 0

    if vp == 0 and pp > 1 and num_microbatches > 1:
        return num_microbatches % pp == 0

    return True
|
|
|
|
|
def check_input(dp, tp, pp, cp, hidden_size, head_num, layer_num, seq_length, vp, b, b_global):
    """Run every consistency check and collect the messages of those that fail.

    Returns:
        ``(result, Error_message)``: ``result`` is True only when all checks
        pass; ``Error_message`` is the concatenation of one line per failed
        check (empty when everything passes).
    """
    problems = []

    if not check_tp(tp, head_num):
        problems.append("Error message: Please reset Tensor parallelism or head_num, make head_num % tp = 0. \n")
    if not check_pp(pp, layer_num):
        problems.append("Error message: Please reset Pipeline parallelism or layer_num, make layer_num % pp = 0. \n")
    if not check_cp(cp, seq_length):
        problems.append("Error message: Please reset Context parallelism or seq_length, make seq_length % cp = 0. \n")
    if not check_hidden(hidden_size, head_num):
        problems.append("Error message: Please reset hidden_size or head_num, make hidden_size % head_num = 0. \n")
    if not check_b_global(b_global, b, dp, cp):
        problems.append("Error message: Please reset b_global or batch_size, make b_global % (batch_size * dp * cp) = 0. \n")
    if not check_num_microbatch(layer_num, vp, pp, b_global / b / dp / cp):
        problems.append("Error message: Please reset b_global or batch_size or layer_num or Virtual Pipeline Size, make layer_num % (pp * vp) = 0, num_microbatches % pp = 0. \n")

    result = not problems
    Error_message = "".join(problems)
    return result, Error_message
|
|
|
# Gradio UI: three input columns (model, parallelism, training), a result
# text box, the formula tab, and a record table that can be exported as CSV.
with gr.Blocks() as demo:
    with gr.Row():
        # Title
        gr.Markdown(
            """
            <div style="text-align: center;">
                <h1>GPU memory calculator 🌀</h1>
                <p style="font-size:16px;">Here's a GPU memory calculator, it helps you to compute memory consumption in LLM training. </p>
                <p style="font-size:16px;">Note: Flash-attention is enabled by default. </p>
            </div>
            """
        )

    with gr.Row():
        with gr.Column():
            # Model settings
            gr.Markdown(
                """
                <h2>Model Parameters:</h2>
                """
            )
            with gr.Accordion("Model Parameters"):
                act_func = gr.Radio(["LLaMA", "GPT"], value="LLaMA", label="Model type", info="eg. LLaMa: SwiGLU, RoPE, RMSNorm")
                with gr.Row():
                    vocab_size = gr.Number(label="Vocab size (V)", value=32000)
                    layer_num = gr.Number(label="Layer number (L)", value=32)
                with gr.Row():
                    hidden_size = gr.Number(label="Hidden size (H)", value=4096)
                    ffn_size = gr.Number(label="FFN Hidden size (FFN)", value=11008)
                with gr.Row():
                    sequence_len = gr.Number(label="Sequence length (S)", value=2048)
                    head_num = gr.Number(label="Number of Attention Heads (A)", value=32)
                with gr.Row():
                    is_group_query = gr.Radio(["True", "False"], value="False", label="Use Group Query Attention")
                    group_query_num = gr.Textbox(label="Number of Query Groups (G)", max_lines=1, value=None, interactive=False)
                with gr.Row():
                    is_bias = gr.Radio(["True", "False"], value="False", label="Use Bias")
                    is_tie_word_embedding = gr.Radio(["True", "False"], value="False", label="Tie word embeddings")

                def toggle_textbox_editable(radio_value):
                    """Make the query-group box editable only when group query attention is on."""
                    if radio_value == "True":
                        return gr.update(interactive=True, value="96")
                    else:
                        return gr.update(interactive=False, value="")

                is_group_query.change(toggle_textbox_editable, inputs=is_group_query, outputs=group_query_num)

        with gr.Column():
            # Parallelism settings
            gr.Markdown(
                """
                <h2>Parallelism config:</h2>
                """
            )
            with gr.Accordion("Parallelism config"):
                dp = gr.Number(label="Data parallelism (dp)", value=2)
                tp = gr.Number(label="Tensor parallelism (tp)", value=2)
                pp = gr.Number(label="Pipeline parallelism (pp)", value=2)
                cp = gr.Number(label="Context parallelism (cp)", value=1)

                is_sp = gr.Radio(["True", "False"], value="True", label="Sequence parallelism")
                # Explicit default of 0 (no interleaving): the calculation
                # compares vp with 0, which fails if the field delivers None.
                vp = gr.Number(label="Virtual Pipeline Size (vp)", value=0)
                is_dist_opt = gr.Radio(["True", "False"], value="True", label="Use Distributed Optimizer(Zero1)")

        with gr.Column():
            # Training settings
            gr.Markdown(
                """
                <h2>Training Config:</h2>
                """
            )
            with gr.Accordion("Training Config"):
                b = gr.Number(label="Micro Batch size (B)", value=4)
                b_global = gr.Number(label="Global Batch size", value=64)
                precision = gr.Dropdown(["FP32", "BF16"], value="BF16", label="Training precision")
                with gr.Row():
                    is_fp8 = gr.Radio(["True", "False"], value="True", label="FP8 Training")
                    is_fp8_init = gr.Radio(["True", "False"], value="True", label="FP8 Initialization(will reduce memory)")
                g_ty = gr.Dropdown(["FP32", "BF16"], value="FP32", label="Gradients Dtype")
                with gr.Row():
                    opt_func = gr.Radio(["Adam", "SGD"], value="Adam", label="Optimizer function")
                    o_ty = gr.Dropdown(["FP32", "BF16"], value="FP32", label="Optimizer State Dtype")

    compute_btn = gr.Button("Compute")
    with gr.Tab("Output"):
        with gr.Column():
            output_text = gr.Textbox(
                label="Compute result",
                interactive=False,
            )

    with gr.Tab("Formula"):
        gr.Markdown(
            formula
            , latex_delimiters=[{ "left": "$$", "right": "$$", "display": True }]
        )

    # Record table: every successful computation adds one row.
    record_df = gr.Dataframe(
        label="Record Table",
        headers=col,
        interactive=False
    )
    download_btn = gr.Button("Download")
    # Hidden counter; 1 tells the callback that the table is still empty.
    count = gr.Number(label="Row count", value=1, visible=False)
    compute_btn.click(
        fn=Compute_ALL_Model_memory,
        inputs=[vocab_size, layer_num, hidden_size, ffn_size, sequence_len, head_num, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func,
                dp, tp, pp, cp, is_sp, vp, is_dist_opt, b, b_global, precision, is_fp8, is_fp8_init, g_ty, opt_func, o_ty, record_df, count],
        outputs=[output_text, record_df, count]
    )

    output_file=gr.File(label="When you click the download button, the downloaded form will be displayed here.")

    download_btn.click(
        fn=generate_csv,
        inputs=record_df,
        outputs=output_file
    )
|
|
|
|
|
if __name__ == "__main__":
    import os

    # Only expose the two images the formula text links to (file/T1.jpg and
    # file/ms.png, resolved against the working directory). The previous
    # allowed_paths=["/"] permitted serving any readable file on the host.
    served_images = [os.path.abspath(name) for name in ("T1.jpg", "ms.png")]
    demo.launch(share=False, allowed_paths=served_images)
|
|