Spaces:
Running
Running
Add application file
Browse files- app.py +66 -0
- requirements.txt +2 -0
- utils.py +120 -0
app.py
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding:utf-8 -*-
|
3 |
+
|
4 |
+
'''
|
5 |
+
Description :
|
6 |
+
Version : 1.0
|
7 |
+
Author : Chaofan Tao
|
8 |
+
Mail : tcftrees@gmail.com
|
9 |
+
Github : https://github.com/sail-sg/scaling-with-vocab
|
10 |
+
Date : 2024-08-09 00:25
|
11 |
+
Copyright (C) 2024 Chaofan Tao. All rights reserved.
|
12 |
+
'''
|
13 |
+
import gradio as gr
|
14 |
+
import pandas as pd
|
15 |
+
|
16 |
+
from utils import approach1_isoflops, approach2_derivative, approach3_isoloss
|
17 |
+
|
18 |
+
|
19 |
+
def compute_optimal_vocab(Nnv: float,
                          flops: float,
                          ):
    """Predict the optimal vocabulary size and format the result as text.

    Args:
        Nnv: Number of non-vocabulary parameters. May be a number or text
            such as ``"7000000000"``, ``"7e9"`` or ``"7*10**9"`` (text boxes
            deliver strings).
        flops: Optional FLOPs budget in the same formats. ``None`` or a
            blank string means "no budget given".

    Returns:
        A text summary. Without a FLOPs budget all three approaches are
        evaluated; with one, only approach 3 is evaluated and approaches 1
        and 2 are reported as ``None``.

    Raises:
        ValueError: If ``Nnv`` is missing, or an input is not a number or a
            plain arithmetic expression.
    """
    import ast
    import operator

    binary_ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.Pow: operator.pow,
    }

    def _evaluate(node):
        # Evaluate a tiny arithmetic subset; literals are converted to float
        # up front so that a huge power overflows quickly instead of building
        # an enormous integer.
        if isinstance(node, ast.Constant):
            if isinstance(node.value, bool) or not isinstance(node.value, (int, float)):
                raise ValueError("only numeric literals are allowed")
            return float(node.value)
        if isinstance(node, ast.BinOp) and type(node.op) in binary_ops:
            return binary_ops[type(node.op)](_evaluate(node.left), _evaluate(node.right))
        if isinstance(node, ast.UnaryOp) and isinstance(node.op, (ast.UAdd, ast.USub)):
            operand = _evaluate(node.operand)
            return -operand if isinstance(node.op, ast.USub) else operand
        raise ValueError("unsupported expression")

    def _to_number(raw):
        # Return a float, or None when the field was left empty.
        if raw is None:
            return None
        if isinstance(raw, (int, float)):
            return float(raw)
        text = str(raw).strip()
        if not text:
            return None
        try:
            return float(text)
        except ValueError:
            pass
        # Fall back to the restricted evaluator (never eval()) so that inputs
        # like "7.05*10**21" work without executing arbitrary code.
        try:
            return float(_evaluate(ast.parse(text, mode="eval").body))
        except (SyntaxError, ValueError, TypeError, ArithmeticError) as err:
            raise ValueError(f"Cannot interpret {raw!r} as a number.") from err

    Nnv = _to_number(Nnv)
    flops = _to_number(flops)
    if Nnv is None:
        raise ValueError("The number of non-vocabulary parameters is required.")

    if flops is None:
        Vopt_app1 = approach1_isoflops(Nnv)
        Vopt_app2 = approach2_derivative(Nnv)
        Vopt_app3 = approach3_isoloss(Nnv)
    else:
        # Approaches 1 and 2 take no FLOPs argument, so only approach 3 is
        # evaluated when a budget is supplied.
        Vopt_app1, Vopt_app2 = None, None
        Vopt_app3 = approach3_isoloss(Nnv, flops)

    results = (
        f"## The optimal vocabulary size for non-vocabulary parameters {Nnv:1e} is:\n"
        f"Approach 1: {Vopt_app1}\n"
        f"Approach 2: {Vopt_app2}\n"
        f"Approach 3: {Vopt_app3}"
    )
    return results
|
33 |
+
|
34 |
+
|
35 |
+
# Build the page as a single Blocks app. The components come from the
# top-level gradio namespace, and the button is wired inside the Blocks
# context so the event is registered on the app that is launched below.
# NOTE(review): the banner image and <h1> title in the Markdown below mention a
# FLOPs/parameter calculator rather than vocabulary-size prediction - confirm
# that this header is intended.
with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown(
            """<img src="https://raw.githubusercontent.com/MrYxJ/calculate-flops.pytorch/main/screenshot/calflops_hf3.png?raw=true" style="float: left;" width="250" height="250"><h1> ⛽️Model(Transformers) FLOPs and Parameter Calculator</h1>
This tool is used to predict the optimal vocabulary size <h1> given the non-vocabulary parameters $N_{nv}$</h1>.
We provide 3 ways for prediction:

- Approach 1: Build the relationship between studied attributes and FLOPs: Build the relationship between the optimal data points (the points that reach the lowest loss under the same FLOPs budget) and the FLOPs.
- Approach 2: Derivative-Based Estimation: Fast calculation method using the derivative of FLOPs with respect to the vocabulary size.
- Approach 3: Parametric Fit of Loss Formula: Design a loss formula that considers the effect of vocabulary size and utilizes the loss to make prediction.

Approach 1 and 2 can only be used to compute the optimal vocabulary size when the compute is optimally allocated to non-vocabulary parameters, vocabulary parameters and data jointly.
Approach 3 will not only consider the case above, but also consider the case when the amount of data does not satisfy the optimal compute allocation, and can calculate the optimal vocabulary size with specified $N_{nv}$ and FLOPs.

Thanks for trying 🌟🌟🌟!
""")

        with gr.Row():
            # Numeric inputs hand the callback a number, or None when the
            # field is left empty, which is what compute_optimal_vocab's
            # `flops is None` branch expects.
            Nnv = gr.Number(label="Non-vocabulary Parameters", value=7*10**9)
            flops = gr.Number(label="FLOPs (optional, e.g. 7.05e21)")
            output_text = gr.Textbox(label='output')
        with gr.Row():
            btn = gr.Button("Compute the optimal vocabulary size")

    # Event listeners must be attached while the Blocks context is open.
    btn.click(
        compute_optimal_vocab,
        inputs=[Nnv, flops],
        outputs=output_text
    )

demo.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
scipy
|
2 |
+
numpy
|
utils.py
ADDED
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
import numpy as np
|
3 |
+
from scipy.optimize import fsolve
|
4 |
+
|
5 |
+
|
6 |
+
def Nnv_to_d(Nnv):
    """Look up the dimension ``d`` associated with a model size.

    ``d`` is the factor the callers use to convert between vocabulary
    parameters and vocabulary size (``Nv = V * d``); presumably it is the
    embedding dimension.

    Args:
        Nnv: Number of non-vocabulary parameters.

    Returns:
        The dimension as a float. Each bracket's upper bound is inclusive;
        anything above the largest bracket maps to 24576.
    """
    # (inclusive upper bound on Nnv, dimension), in ascending order.
    brackets = (
        (50_000_000, 512),
        (200_000_000, 768),
        (500_000_000, 1024),
        (1_000_000_000, 1536),
        (2_000_000_000, 2048),
        (5_000_000_000, 3200),
        (10_000_000_000, 4096),
        (20_000_000_000, 5120),
        (50_000_000_000, 6048),
        (100_000_000_000, 8192),
        (200_000_000_000, 12288),
        (500_000_000_000, 16384),
        (1_000_000_000_000, 20480),
    )
    for upper_bound, width in brackets:
        if Nnv <= upper_bound:
            return float(width)
    # Larger than every listed bracket.
    return float(24576)
|
37 |
+
|
38 |
+
|
39 |
+
def Nnvopt_to_flops(Nnv):
    """Return the training-optimal FLOPs budget for ``Nnv`` non-vocabulary parameters.

    Computes ``(Nnv / exp(-2.4846510161625193)) ** (1 / 0.5)``, the inverse
    of the power law ``Nnv = exp(-2.4846510161625193) * FLOPs ** 0.5``.
    """
    coefficient = np.exp(-2.4846510161625193)
    exponent = 1/0.5
    return (Nnv / coefficient) ** exponent
|
44 |
+
|
45 |
+
|
46 |
+
def flops_to_Nnvopt(FLOPs):
    """Return the training-optimal non-vocabulary parameter count for a FLOPs budget.

    Evaluates the power law ``exp(-2.4846510161625193) * FLOPs ** 0.5``.
    """
    coefficient = np.exp(-2.4846510161625193)
    return coefficient * FLOPs ** 0.5
|
50 |
+
|
51 |
+
|
52 |
+
def approach1_isoflops(Nnv):
    """Predict the training-optimal vocabulary size by approach 1.

    Approach 1 builds the relationship between the studied attributes and
    FLOPs: ``Nnv`` is first converted to its training-optimal FLOPs budget,
    then a power law in FLOPs gives the vocabulary parameters ``Nv``.

    Args:
        Nnv: Number of non-vocabulary parameters.

    Returns:
        The vocabulary size ``int(Nv / d)``, where ``d`` comes from
        ``Nnv_to_d(Nnv)``.
    """
    d = Nnv_to_d(Nnv)
    # Reuse the shared helper instead of repeating its fitted constants here.
    FLOPs = Nnvopt_to_flops(Nnv)
    Nv = np.exp(-1.589031299255507) * FLOPs ** 0.4163622634135234
    # Vocabulary parameters divided by the dimension give the vocabulary size.
    return int(Nv / d)
|
59 |
+
|
60 |
+
def approach2_derivative(Nnv):
    """Predict the training-optimal vocabulary size by approach 2.

    Derivative-based fast estimation: the vocabulary parameters follow a
    power law in ``Nnv`` that equals 3145728 at ``Nnv = 33_000_000``, and the
    result is divided by the dimension from ``Nnv_to_d`` to obtain a
    vocabulary size (truncated to int).
    """
    reference_nnv = 33_000_000
    reference_vocab_params = 3145728
    growth_exponent = 0.8353974035228025
    vocab_params = reference_vocab_params * (Nnv / reference_nnv) ** growth_exponent
    return int(vocab_params / Nnv_to_d(Nnv))
|
67 |
+
|
68 |
+
def approach3_isoloss(Nnv, FLOPs=None):
    """Predict the training-optimal vocabulary size by approach 3.

    Approach 3 uses a parametric fit of the loss function. Approaches 1 and 2
    assume the training data and the non-vocabulary parameters are scaled
    together to ensure the optimal compute allocation. Approach 3 is more
    flexible: it can also be used when the amount of training data is
    insufficient or overly sufficient for the given ``Nnv``, by passing an
    explicit FLOPs budget.

    Args:
        Nnv: Number of non-vocabulary parameters.
        FLOPs: Optional FLOPs budget. When ``None``, the training-optimal
            budget from ``Nnvopt_to_flops(Nnv)`` is used.

    Returns:
        The vocabulary size (int) at which the derivative of the fitted loss
        with respect to the vocabulary size is zero, as found by ``fsolve``
        starting from an initial guess of 1 (in normalised units).
    """
    # Fitted coefficients of the loss formula. Only A2, alpha2, B and beta
    # appear in the derivative below; A1, alpha1 and E belong to terms that
    # do not depend on V and are kept for reference.
    A1, A2, B, E = 1.8313851559554126, 0.19584238398665638, 2.1241123120064955, 5.5327846803337435
    alpha1, alpha2, beta = 0.44660634152009615, 0.6707374679896795, 0.44660634152009615

    def dl_dv(V, Nnv, d, F):
        # The -E and A1 / Nnv**alpha1 terms are constant in V, so their
        # derivatives are zero and are omitted.
        vocab_term = -alpha2 * A2 * d / (V * d) ** (alpha2 + 1)
        # Presumably the amount of training data implied by the budget
        # (FLOPs ~ 6 * parameters * tokens) - TODO confirm.
        u = F / (6 * (Nnv + V * d))
        # Magnitude of du/dV; the true derivative is negative, and that sign
        # cancels against the one from differentiating B / u**beta.
        du_dV = F * d / (6 * (Nnv + V * d) ** 2)
        data_term = beta * B * du_dV / (u ** (beta + 1))
        return vocab_term + data_term

    d = Nnv_to_d(Nnv)
    if FLOPs is None:
        FLOPs = Nnvopt_to_flops(Nnv)

    # Normalise the inputs before root finding: Nnv in millions, d in
    # thousands, FLOPs in units of 1e15.
    Nnv_scaled = Nnv / 1_000_000
    d_scaled = d / 1_000
    FLOPs_scaled = FLOPs / (1_000_000_000*1_000_000)
    V = fsolve(dl_dv, 1, args=(Nnv_scaled, d_scaled, FLOPs_scaled))[0]

    # With Nnv in millions and d in thousands, V is in thousands.
    return int(V*1000)
|
102 |
+
|
103 |
+
|
104 |
+
if __name__ == '__main__':
    # Example: use the coefficients fitted in the three proposed approaches
    # to predict the optimal vocabulary size, and the matching vocabulary
    # parameters (Nv), given the non-vocabulary parameters (Nnv).
    Nnv = 7*10**9
    d = Nnv_to_d(Nnv)

    # Each approach already returns a vocabulary size (it divides by d
    # internally), so the vocabulary parameters are recovered as size * d.
    Vopt_app1 = approach1_isoflops(Nnv)
    Vopt_app2 = approach2_derivative(Nnv)
    Vopt_app3 = approach3_isoloss(Nnv)
    Nvopt_app1, Nvopt_app2, Nvopt_app3 = (
        int(V * d) for V in (Vopt_app1, Vopt_app2, Vopt_app3)
    )

    print(f'Given Nnv={Nnv}: The predicted optimal vocabulary parameters are '
          f'{Nvopt_app1}, {Nvopt_app2}, {Nvopt_app3} by the 3 proposed approaches. '
          f'The predicted optimal vocabulary size is '
          f'{Vopt_app1}, {Vopt_app2}, {Vopt_app3} by the 3 proposed approaches.')
|
120 |
+
|