# Ref: Ouyang, A. (2023). Understanding the Performance of Transformer Inference (Doctoral dissertation, Massachusetts Institute of Technology).
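#
# Streamlit calculator for transformer-inference cost: given a Hugging Face model id,
# it estimates parameter counts, per-operation FLOPs, activation traffic, and
# roofline-style latency/throughput for the prefilling and generation phases.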

import streamlit as st
import pandas as pd
from model_util import fetch_dictionary_content, load_parameter, get_model, classify_module, get_module_tensors
from calc_util import *
from render_util import create_table, header4, header5


st.set_page_config(layout='wide')
if 'model_config' not in st.session_state:
    st.session_state['model_config'] = {}


def load_model_config(model_id, access_token):
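    """Fetch the config for `model_id`, memoized in st.session_state.

    Falls back to hard-coded opt-1.3b values when the config cannot be
    fetched, and to an analytic estimate when the model itself cannot be
    loaded (e.g. a gated repo without a valid access token).
    """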
    if 'model_id' in st.session_state['model_config'] and st.session_state['model_config']['model_id'] == model_id:
        return st.session_state['model_config']
    if 'parameter_count' in st.session_state:
        st.session_state.pop('parameter_count')

    model_config = {}
    dictionary_content = fetch_dictionary_content(model_id)
    if dictionary_content:
        model_config['model_id'] = model_id
        model_config['hidden_size'] = dictionary_content['hidden_size']
        model_config['num_attention_heads'] = dictionary_content['num_attention_heads']
        model_config['num_hidden_layers'] = dictionary_content['num_hidden_layers']
        model_config['intermediate_size'] = load_parameter(dictionary_content, ['intermediate_size', 'ffn_dim'])
        model_config['vocab_size'] = dictionary_content['vocab_size']
        model_config['max_position_embeddings'] = dictionary_content['max_position_embeddings']
        model_config['layernorm_operation'] = 2
    else:
        st.warning("Fetching information failed! Maybe model info is not public!")
        model_config['model_id'] = 'opt-1.3b'
        model_config['hidden_size'] = 2048
        model_config['num_attention_heads'] = 32
        model_config['num_hidden_layers'] = 24
        model_config['intermediate_size'] = 8192
        model_config['vocab_size'] = 50272
        model_config['max_position_embeddings'] = 2048
        model_config['layernorm_operation'] = 2

    try:
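        # Load the model skeleton to read exact tensor shapes; any failure
        # (gated repo, bad token, unsupported architecture) drops to the
        # analytic parameter estimate used in the parameters column below.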
        model_config['model'] = get_model(model_id, None, access_token=access_token if len(access_token)>0 else None)
        module_tensors = get_module_tensors(model_config['model'])
        model_config['module_classes'] = classify_module(module_tensors)
    except Exception as e:
        st.warning(e)
        model_config['model'] = None
        model_config['module_classes'] = None

    st.session_state['model_config'] = model_config
    return model_config


subtotal_parameters = [
    'embedding_weights',
    'attention_weights',
    'mlp_weights',
]

subtotal_operations = [
    'embeddings',
    'attention',
    'mlp',
    'total',
]
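# The keys above mark subtotal rows; the tables below use them to split the
# per-module detail rows from the summary rows.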



col1, col2, col3, col4, col5 = st.columns([0.8, 2, 2.5, 2.5, 0.01])

inference_config = {}
parameter_count = {}
cached_parameter_count = {}

prefilling_operation_count = {}
generation_operation_count = {}
prefilling_memory_count = {}
generation_memory_count = {}

gpu_config = {}
inference_info = {}

with col1:
    header4("Model")
    model_id = st.text_input("huggingface model id", 'ArthurZ/opt-13b')
    access_token = st.text_input("access token", '')
    model_config = load_model_config(model_id, access_token)
    model_config['hidden_size'] = st.number_input('hidden size', value=model_config['hidden_size'], format="%d")
    model_config['num_attention_heads'] = st.number_input('num attention heads', value=model_config['num_attention_heads'], format="%d")
    model_config['num_hidden_layers'] = st.number_input('num hidden layers', value=model_config['num_hidden_layers'], format="%d")
    model_config['intermediate_size'] = st.number_input('intermediate size', value=model_config['intermediate_size'], format="%d")
    model_config['vocab_size'] = st.number_input('vocab size', value=model_config['vocab_size'], format="%d")
    model_config['max_position_embeddings'] = st.number_input('max position embeddings', value=model_config['max_position_embeddings'], format="%d")
    model_config['hidden_size_per_head'] = model_config['hidden_size'] / model_config['num_attention_heads']

    header4("Inference Setting")
    inference_config['batchsize'] = st.number_input('batchsize', value=1, format="%d")
    inference_config['input_seq_length'] = st.number_input('input seq length', value=1, format="%d")
    inference_config['output_seq_length'] = st.number_input('output seq length', value=1, format="%d")
    inference_config['byte_per_parameter'] = st.number_input('byte per parameter', value=2, format="%d")  # 2 bytes/parameter corresponds to fp16/bf16 weights; use 4 for fp32
    inference_config['KV_cache'] = st.checkbox("Use KV cache", value=True)

    header4("GPU Setting")
    gpu_config['Name'] = st.text_input('GPU Type', value="A6000")
    gpu_config['TFLOP'] = st.number_input('TFLOP', value=38.7, format="%.2f")
    gpu_config['memory_bandwidth'] = st.number_input('memory bandwidth (GB/s)', value=768, format="%d")
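    # Roofline balance point in FLOPs/byte: operations with lower arithmetic
    # intensity are memory-bound, higher are compute-bound.
    # e.g. with the defaults: 38.7e12 / (768 * 1024**3) ≈ 46.9 FLOPs/byte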
    gpu_config['arithmetic_intensity'] = gpu_config['TFLOP']*10**12/gpu_config['memory_bandwidth']/1024**3
    st.write(f"arithmetic_intensity: {gpu_config['arithmetic_intensity']:.3f}")

with col2:
    # Cache parameter counts in session_state so Streamlit reruns don't recompute them
    if 'parameter_count' not in st.session_state:
        if model_config['model']:
            st.info("Model info fetched!")
            parameter_count = calc_model_size_from_model(model_config, inference_config)
        else:
            st.info("Failed to fetch model info; using estimation!")
            parameter_count = model_size_estimate(model_config, inference_config)
        st.session_state.parameter_count = parameter_count
    else:
        parameter_count = st.session_state.parameter_count

    parameters_items = {key: "{:,}".format(int(parameter_count[key])) for key in parameter_count if key not in subtotal_parameters}
    subtotal_parameters_items = {key: "{:,}".format(int(parameter_count[key])) for key in parameter_count if key in subtotal_parameters}

    # Convert dictionaries to pandas dataframes for table display
    df_parameters_items = pd.DataFrame(list(parameters_items.items()), columns=["Parameter", "Count"])
    df_subtotal_parameters_items = pd.DataFrame(list(subtotal_parameters_items.items()), columns=["Parameter", "Count"])

    header4("Model Parameters")
    st.markdown(create_table(df_parameters_items))

    header4("Parameters Summary")
    st.markdown(create_table(df_subtotal_parameters_items))

    # Total weight footprint = bytes per parameter x all weight tensors (embeddings, attention, MLP, layernorm)
    model_total_size_in_byte = inference_config['byte_per_parameter'] * (
        parameter_count['embedding_weights']
        + parameter_count['attention_weights']
        + parameter_count['mlp_weights']
        + parameter_count['layernorm']
    )
    st.write(f'model_total_size (Byte): {model_total_size_in_byte:,}')


    # add parameter viewer
    if model_config['model']:
        header4("Parameters Viewer")
        weight_generic = st.selectbox('Select weight:', options=model_config['module_classes'])
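        # Merge the per-layer module dicts into a single {name: shape} mapping for display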
        modules = {}
        for module in model_config['module_classes'][weight_generic]:
            modules.update(module)
        modules = {k: list(v) for k, v in modules.items()}
        modules = pd.DataFrame(list(modules.items()), columns=["Parameter", "Shape"])
        st.markdown(create_table(modules))

with col3: # Prefilling
    prefilling_operation_count = prefilling_operation(model_config, inference_config)
    prefilling_activation_memory_count = prefilling_activation_memory(model_config, inference_config)
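    # Roofline estimate: compute latency = FLOPs / peak FLOPs-per-second,
    # memory latency = activation bytes / bandwidth; whichever is larger
    # determines the bound type reported below.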
    inference_info['inference_prefilling_time'] = prefilling_operation_count['total'] / (gpu_config['TFLOP']*10**12)  # 1 TFLOP = 1e12, matching the arithmetic-intensity units
    inference_info['prefilling_memory_latency'] = prefilling_activation_memory_count['total'] / (gpu_config['memory_bandwidth']*1024**3)
    calc_prefilling_throughput(model_config, inference_config, inference_info)
    
    # KV cache bytes after prefilling: K and V (factor 2) for every prompt token in every layer
    cached_parameter_count['kv_cache'] = 2 * inference_config['byte_per_parameter'] * inference_config['batchsize'] * model_config['hidden_size'] * model_config['num_hidden_layers'] * inference_config['input_seq_length']

    operation_items = {key: "{:,}".format(int(prefilling_operation_count[key])) for key in prefilling_operation_count if key not in subtotal_operations}
    subtotal_operation_items = {key: "{:,}".format(int(prefilling_operation_count[key])) for key in prefilling_operation_count if key in subtotal_operations}
    prefilling_arithmetic_intensity = {
        key: "{:.3f}".format(prefilling_operation_count[key] / prefilling_activation_memory_count[key]
                             if prefilling_activation_memory_count[key] > 0 else float('inf'))
        for key in prefilling_activation_memory_count
    }
    prefilling_activation_memory_count = {key: "{:,}".format(int(value)) for key, value in prefilling_activation_memory_count.items()}


    # Convert dictionaries to pandas dataframes for table display
    df_operation_count = pd.DataFrame(list(operation_items.items()), columns=["Operation", "FLOPs"])
    df_subtotal_operation_count = pd.DataFrame(list(subtotal_operation_items.items()), columns=["Operation", "FLOPs"])
    
    df_operation_count["Activation (Byte)"] = df_operation_count["Operation"].map(prefilling_activation_memory_count)
    df_operation_count["Arithmetic Intensity"] = df_operation_count["Operation"].map(prefilling_arithmetic_intensity)
    df_subtotal_operation_count["Activation (Byte)"] = df_subtotal_operation_count["Operation"].map(prefilling_activation_memory_count)
    df_subtotal_operation_count["Arithmetic Intensity"] = df_subtotal_operation_count["Operation"].map(prefilling_arithmetic_intensity)
    
    header4("Inference Ops: Prefilling")
    st.markdown(create_table(df_operation_count))

    header5("Summary: Prefilling")
    st.markdown(create_table(df_subtotal_operation_count))
    st.write(f"FLOPS latency: {inference_info['inference_prefilling_time']}")
    st.write(f"Memory latency: {inference_info['prefilling_memory_latency']}")
    st.write(f"Prefillng throughput (tokens/s): {inference_info['prefilling_throughput']:.2f} ({inference_info['prefilling_bound_type']}-bound)")

    if inference_config['KV_cache']:
        st.write(f"kv cache (Byte): {cached_parameter_count['kv_cache']:,}")
        


with col4: # Generation
    generation_operation_count = generation_operation(model_config, inference_config)
    generation_activation_memory_count = generation_activation_memory(model_config, inference_config)
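    # Same roofline estimate as prefilling, applied to the generation (decode) pass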
    inference_info['inference_generation_time'] = generation_operation_count['total'] / (gpu_config['TFLOP']*10**12)
    inference_info['generation_memory_latency'] = generation_activation_memory_count['total'] / (gpu_config['memory_bandwidth']*1024**3)
    calc_generation_throughput(model_config, inference_config, inference_info)
   
    # KV cache bytes after generation: the cache now spans prompt + generated tokens
    cached_parameter_count['kv_cache'] = 2 * inference_config['byte_per_parameter'] * inference_config['batchsize'] * model_config['hidden_size'] * model_config['num_hidden_layers'] * (inference_config['input_seq_length'] + inference_config['output_seq_length'])

    operation_items = {key: "{:,}".format(int(generation_operation_count[key])) for key in generation_operation_count if key not in subtotal_operations}
    subtotal_operation_items = {key: "{:,}".format(int(generation_operation_count[key])) for key in generation_operation_count if key in subtotal_operations}
    generation_arithmetic_intensity = {
        key: "{:.3f}".format(generation_operation_count[key] / generation_activation_memory_count[key]
                             if generation_activation_memory_count[key] > 0 else float('inf'))
        for key in generation_activation_memory_count
    }
    generation_activation_memory_count = {key: "{:,}".format(int(value)) for key, value in generation_activation_memory_count.items()}

    # Convert dictionaries to pandas dataframes for table display
    df_operation_count = pd.DataFrame(list(operation_items.items()), columns=["Operation", "FLOPs"])
    df_subtotal_operation_count = pd.DataFrame(list(subtotal_operation_items.items()), columns=["Operation", "FLOPs"])
   
    df_operation_count["Activation (Byte)"] = df_operation_count["Operation"].map(generation_activation_memory_count)
    df_operation_count["Arithmetic Intensity"] = df_operation_count["Operation"].map(generation_arithmetic_intensity)
    df_subtotal_operation_count["Activation (Byte)"] = df_subtotal_operation_count["Operation"].map(generation_activation_memory_count)
    df_subtotal_operation_count["Arithmetic Intensity"] = df_subtotal_operation_count["Operation"].map(generation_arithmetic_intensity)
    
    header4("Inference Ops: Generation")
    st.markdown(create_table(df_operation_count))

    header5("Summary: Generation")
    st.markdown(create_table(df_subtotal_operation_count))
    #st.write(f"Generation-only throughput (tokens/s): {inference_info['inference_generation_throughput']:.2f}")
    #st.write(f"(Client) Generation throughput (tokens/s): {inference_info['inference_client_generation_throughput']:.2f}")
    st.write(f"FLOPS latency: {inference_info['inference_generation_time']}")
    st.write(f"Memory latency: {inference_info['generation_memory_latency']}")
    st.write(f"Generation-only throughput (tokens/s): {inference_info['generation_throughput']:.2f} ({inference_info['generation_bound_type']}-bound)")
    st.write(f"(Client) Generation throughput (tokens/s): {inference_info['client_generation_throughput']:.2f}")

    if inference_config['KV_cache']:
        st.write(f"kv cache (Byte): {cached_parameter_count['kv_cache']:,}")