Alan Liu committed
Commit c93009d
1 Parent(s): ed50ee5

add client throughput

Files changed (2)
  1. app.py +9 -6
  2. calc_util.py +13 -1
app.py CHANGED
@@ -138,8 +138,9 @@ with col3: # Prefilling
     prefilling_operation_count = prefilling_operation(model_config, inference_config)
     prefilling_activation_memory_count = prefilling_activation_memory(model_config, inference_config)
     inference_info['inference_prefilling_time'] = prefilling_operation_count['total'] / (gpu_config['TFLOP']*1024**4)
-    inference_info['inference_prefilling_throughput'] = inference_config['input_seq_length']*inference_config['batchsize']/inference_info['inference_prefilling_time']
     inference_info['prefilling_memory_latency'] = prefilling_activation_memory_count['total'] / (gpu_config['memory_bandwidth']*1024**3)
+    calc_prefilling_throughput(model_config, inference_config, inference_info)
+
     cached_parameter_count['kv_cache'] = 2 * (inference_config['batchsize'] * (model_config['hidden_size'] * model_config['num_hidden_layers'] * inference_config['input_seq_length']))

     operation_items = {key: "{:,}".format(int(prefilling_operation_count[key])) for key in prefilling_operation_count if key not in subtotal_operations}
@@ -162,9 +163,9 @@ with col3: # Prefilling

     header5("Summary: Prefilling")
     st.markdown(create_table(df_subtotal_operation_count))
-    st.write(f"Prefillng throughput (tokens/s): {inference_info['inference_prefilling_throughput']:.2f}")
     st.write(f"FLOPS latency: {inference_info['inference_prefilling_time']}")
     st.write(f"Memory latency: {inference_info['prefilling_memory_latency']}")
+    st.write(f"Prefilling throughput (tokens/s): {inference_info['prefilling_throughput']:.2f} ({inference_info['prefilling_bound_type']}-bound)")

     if inference_config['KV_cache']:
         st.write(f"kv cache (Byte): {cached_parameter_count['kv_cache']:,}")
@@ -175,9 +176,9 @@ with col4: # Generation
     generation_operation_count = generation_operation(model_config, inference_config)
     generation_activation_memory_count = generation_activation_memory(model_config, inference_config)
     inference_info['inference_generation_time'] = generation_operation_count['total'] / (gpu_config['TFLOP']*1024**4)
-    inference_info['inference_generation_throughput'] = inference_config['output_seq_length']*inference_config['batchsize']/inference_info['inference_generation_time']
-    inference_info['inference_client_generation_throughput'] = inference_config['output_seq_length']*inference_config['batchsize'] / (inference_info['inference_prefilling_time'] + inference_info['inference_generation_time'])
     inference_info['generation_memory_latency'] = generation_activation_memory_count['total'] / (gpu_config['memory_bandwidth']*1024**3)
+    calc_generation_throughput(model_config, inference_config, inference_info)
+
     cached_parameter_count['kv_cache'] = 2 * (inference_config['batchsize'] * (model_config['hidden_size'] * model_config['num_hidden_layers'] * (inference_config['input_seq_length']+inference_config['output_seq_length'])))

     operation_items = {key: "{:,}".format(int(generation_operation_count[key])) for key in generation_operation_count if key not in subtotal_operations}
@@ -199,10 +200,12 @@ with col4: # Generation

     header5("Summary: Generation")
     st.markdown(create_table(df_subtotal_operation_count))
-    st.write(f"Generation-only throughput (tokens/s): {inference_info['inference_generation_throughput']:.2f}")
-    st.write(f"(Client) Generation throughput (tokens/s): {inference_info['inference_client_generation_throughput']:.2f}")
     st.write(f"FLOPS latency: {inference_info['inference_generation_time']}")
     st.write(f"Memory latency: {inference_info['generation_memory_latency']}")
+    #st.write(f"Generation-only throughput (tokens/s): {inference_info['inference_generation_throughput']:.2f}")
+    #st.write(f"(Client) Generation throughput (tokens/s): {inference_info['inference_client_generation_throughput']:.2f}")
+    st.write(f"Generation-only throughput (tokens/s): {inference_info['generation_throughput']:.2f} ({inference_info['generation_bound_type']}-bound)")
+    st.write(f"(Client) Generation throughput (tokens/s): {inference_info['client_generation_throughput']:.2f}")

     if inference_config['KV_cache']:
         st.write(f"kv cache (Byte): {cached_parameter_count['kv_cache']:,}")
calc_util.py CHANGED
@@ -296,4 +296,16 @@ def generation_activation_memory(model_config, inference_config):
         activation_memory['mlp'] + activation_memory['layernorm']
     )

-    return activation_memory
+    return activation_memory
+
+
+def calc_prefilling_throughput(model_config, inference_config, inference_info):
+    inference_info['prefilling_throughput'] = inference_config['input_seq_length']*inference_config['batchsize'] / max([inference_info['inference_prefilling_time'], inference_info['prefilling_memory_latency']])
+    inference_info['prefilling_bound_type'] = "memory" if inference_info['inference_prefilling_time'] < inference_info['prefilling_memory_latency'] else "arithmetic"
+
+def calc_generation_throughput(model_config, inference_config, inference_info):
+    inference_info['generation_throughput'] = inference_config['output_seq_length']*inference_config['batchsize'] / max([inference_info['inference_generation_time'], inference_info['generation_memory_latency']])
+    inference_info['generation_bound_type'] = "memory" if inference_info['inference_generation_time'] < inference_info['generation_memory_latency'] else "arithmetic"
+
+    total_time = max([inference_info['inference_prefilling_time'], inference_info['prefilling_memory_latency']]) + max([inference_info['inference_generation_time'], inference_info['generation_memory_latency']])
+    inference_info['client_generation_throughput'] = inference_config['output_seq_length']*inference_config['batchsize'] / total_time
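A quick usage sketch of the two new helpers, assuming inference_info already holds the four latency entries that app.py computes earlier; all values here are placeholders:

model_config = {}  # unused by these helpers, passed only for signature consistency
inference_config = {'batchsize': 8, 'input_seq_length': 512, 'output_seq_length': 128}  # placeholders
inference_info = {
    # toy latencies in seconds, standing in for the values app.py computes
    'inference_prefilling_time': 0.020,
    'prefilling_memory_latency': 0.005,
    'inference_generation_time': 0.004,
    'generation_memory_latency': 0.060,
}

calc_prefilling_throughput(model_config, inference_config, inference_info)
calc_generation_throughput(model_config, inference_config, inference_info)

print(inference_info['prefilling_throughput'], inference_info['prefilling_bound_type'])  # 204800.0 arithmetic
print(inference_info['generation_throughput'], inference_info['generation_bound_type'])  # ~17066.7 memory
print(inference_info['client_generation_throughput'])  # 12800.0

The "(Client)" figure divides generated tokens by end-to-end time (prefilling plus generation), which is what a user actually observes, so it is always lower than the generation-only number.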