import streamlit as st
import pandas as pd
from utils import extract_from_url, get_model, calculate_memory
import plotly.express as px
import numpy as np
import gc
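
# "Can you run it? LLM version": estimates how many GPUs a Hugging Face
# model needs for inference, full training with Adam, and LoRA fine-tuning.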

st.set_page_config(page_title='Can you run it? LLM version', layout="wide", initial_sidebar_state="expanded")

st.title("Can you run it? LLM version")

percentage_width_main = 80
st.markdown(
    f"""<style>
    .appview-container .main .block-container{{
        max-width: {percentage_width_main}%;
    }}
    </style>
    """,
    unsafe_allow_html=True,
)
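
# Cached loaders: computed once via st.cache_resource and reused across reruns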
@st.cache_resource
def get_gpu_specs():
    return pd.read_csv("data/gpu_specs.csv")

@st.cache_resource
def get_mistralai_table():
    # Precompute the default model's table so the landing page loads instantly
    model = get_model("mistralai/Mistral-7B-v0.1", library="transformers", access_token="")
    return calculate_memory(model, ["float32", "float16/bfloat16", "int8", "int4"])

def show_gpu_info(info, trainable_params=0):
    """Show a colored status box per workload: green for 1 GPU, yellow for 2, red for 3+."""
    for var in ['Inference', 'Full Training Adam', 'LoRA Fine-tuning']:
        _info = info.loc[var]
        if _info['Number of GPUs'] >= 3:
            func = st.error
            icon = "⛔"
        elif _info['Number of GPUs'] == 2:
            func = st.warning
            icon = "⚠️"
        else:
            func = st.success
            icon = "✅"

        n_gpus = _info['Number of GPUs']
        msg = f"You require **{n_gpus}** GPU{'s' if n_gpus != 1 else ''} for **{var}**"
        if var == 'LoRA Fine-tuning':
            msg += f" ({trainable_params}%)"
        func(msg, icon=icon)


def get_name(index):
    row = gpu_specs.iloc[index]
    return f"{row['Product Name']} ({row['RAM (GB)']} GB, {row['Year']})"

gpu_specs = get_gpu_specs()
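
# Centered "Information" expander documenting data sources and the formulas used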

_, col, _ = st.columns([1,3,1])
with col.expander("Information", expanded=True):
    st.markdown("""- GPU information comes from [TechPowerUp GPU Specs](https://www.techpowerup.com/gpu-specs/)
- Mainly based on [Model Memory Calculator by hf-accelerate](https://huggingface.co/spaces/hf-accelerate/model-memory-usage)
    using `transformers` library
- Inference is calculated following [EleutherAI Transformer Math 101](https://blog.eleuther.ai/transformer-math/),
    where is estimated as """)
    
    st.latex(r"""\text{Memory}_\text{Inference} \approx \text{Model Size} \times 1.2""")
    st.markdown("""- For LoRa Fine-tuning, I'm asuming a **16-bit** dtype of trainable parameters. The formula (in terms of GB) is""")
    st.latex(r"\text{Memory}_\text{LoRa} \approx \text{Model Size} + \left(\text{ \# trainable Params}_\text{Billions}\times\frac{16}{8} \times 4\right) \times 1.2")
    st.markdown("- You can understand `int4` as models in `GPTQ-4bit`, `AWQ-4bit` or `Q4_0 GGUF/GGML` formats")

access_token = st.sidebar.text_input("Access token")
model_name = st.sidebar.text_input("Model name", value="mistralai/Mistral-7B-v0.1")
if not model_name:
    st.info("Please enter a model name")
    st.stop()

model_name = extract_from_url(model_name)
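# Cache the current model's memory table in session state and evict the previous one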
if model_name not in st.session_state:
    if 'actual_model' in st.session_state:
        del st.session_state[st.session_state['actual_model']]
        del st.session_state['actual_model']
        gc.collect()
    if model_name == "mistralai/Mistral-7B-v0.1": # cache Mistral
        st.session_state[model_name] = get_mistralai_table()
    else:
        model = get_model(model_name, library="transformers", access_token=access_token)
        st.session_state[model_name] = calculate_memory(model, ["float32", "float16/bfloat16", "int8", "int4"])
        del model
        gc.collect()
    st.session_state['actual_model'] = model_name


gpu_vendor = st.sidebar.selectbox("GPU Vendor", ["NVIDIA", "AMD", "Intel"])
# year = st.sidebar.selectbox("Filter by Release Year", list(range(2014, 2024))[::-1], index=None)
gpu_info = gpu_specs[gpu_specs['Vendor'] == gpu_vendor].sort_values('Product Name')
# if year:
#     gpu_info = gpu_info[gpu_info['Year'] == year]

# Cast bounds to float so they match the float default/step, and clamp the
# default range to the values actually available for this vendor
min_ram = float(gpu_info['RAM (GB)'].min())
max_ram = float(gpu_info['RAM (GB)'].max())
ram = st.sidebar.slider("Filter by RAM (GB)", min_ram, max_ram,
                        (max(10.0, min_ram), min(40.0, max_ram)), step=0.5)
gpu_info = gpu_info[gpu_info["RAM (GB)"].between(ram[0], ram[1])]
if len(gpu_info) == 0:
    st.sidebar.error(f"**{gpu_vendor}** has no GPUs in that RAM range")
    st.stop()
gpu = st.sidebar.selectbox("GPU", gpu_info['Product Name'].index.tolist(), format_func=lambda x: gpu_specs.iloc[x]['Product Name'])
gpu_spec = gpu_specs.iloc[gpu]
gpu_spec.name = 'INFO'

lora_pct = st.sidebar.slider("LoRA % trainable parameters", 0.1, 100.0, 2.0, step=0.1)

st.sidebar.dataframe(gpu_spec.T.astype(str))
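
# Per-dtype memory table; the LoRA column implements the formula from the
# Information box: (model size + trainable params (B) * 16/8 bytes * 4) * 1.2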

memory_table = pd.DataFrame(st.session_state[model_name]).set_index('dtype')
memory_table['LoRA Fine-Tuning (GB)'] = (memory_table["Total Size (GB)"] +
                                         (memory_table["Parameters (Billion)"] * lora_pct / 100 * (16 / 8) * 4)) * 1.2

_memory_table = memory_table.copy()
memory_table = memory_table.round(2).T
# GPUs required = ceil(memory needed / GPU RAM), per dtype and workload
_memory_table /= gpu_spec['RAM (GB)']
_memory_table = _memory_table.apply(np.ceil).astype(int).drop(columns=['Parameters (Billion)', 'Total Size (GB)'])
_memory_table.columns = ['Inference', 'Full Training Adam', 'LoRA Fine-tuning']
_memory_table = _memory_table.stack().reset_index()
_memory_table.columns = ['dtype', 'Variable', 'Number of GPUs']
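
# Left column: headline, per-dtype GPU-count tabs and the memory table;
# right column: grouped bar chart of GPUs required.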
col1, col2 = st.columns([1,1.3])
with col1:
    # Row 3 of the transposed table is 'Parameters (Billion)'; use it for the headline
    st.write(f"#### [{model_name}](https://huggingface.co/{model_name}) ({memory_table.iloc[3, 0]:.1f}B)")

    dtypes = memory_table.columns.tolist()[::-1]
    tabs = st.tabs(dtypes)
    for dtype, tab in zip(dtypes, tabs):
        with tab:
            info = _memory_table[_memory_table['dtype'] == dtype].set_index('Variable')
            show_gpu_info(info, lora_pct)
    st.write(memory_table.iloc[[0, 1, 2, 4]])  # every row except the parameter count
with col2:
    num_colors = 4
    # Sample evenly spaced colors from the RdBu palette, one per dtype
    palette = px.colors.sequential.RdBu
    colors = [palette[int(i * (len(palette) - 1) / (num_colors - 1))] for i in range(num_colors)]
    fig = px.bar(_memory_table, x='Variable', y='Number of GPUs', color='dtype', barmode='group', color_discrete_sequence=colors)
    fig.update_layout(title=dict(text=f"Number of GPUs required for<br>{get_name(gpu)}", font=dict(size=25)),
                      xaxis_tickfont_size=14, yaxis_tickfont_size=16, yaxis_dtick=1)
    st.plotly_chart(fig, use_container_width=True)