menouar committed · Commit af04de4 · Parent(s): 3caa08b

First Commit

Changed files:
- .gitignore (+60, -0)
- app.py (+297, -0)
- utils/__init__.py (+87, -0)
- utils/components_creator.py (+256, -0)
- utils/notebook_generator.py (+393, -0)
.gitignore
ADDED
@@ -0,0 +1,60 @@
# Byte-compiled / optimized files
*.py[cod]
__pycache__/
*.py[cod]?

# C extensions
*.so

# Distribution / packaging
dist/
build/
eggs/
*.egg-info/
.svn/
*.swp
*.tar.gz
*.tgz
*.zip
*.rar

# Development
*.bak
*.tmp

# IDE specific files
.vscode/
.idea/

# Jupyter Notebook
.ipynb_checkpoints/

# Environment
.env
env/
venv/
ENV/
env.bak/
venv.bak/

# Compiled Python modules
*.pyd

# Coverage
.coverage
.coverage.*
htmlcov/

# Type checking
.mypy_cache/
.dmypy.json

# Sphinx documentation
docs/_build/

# Ignore .pkl file
*.pkl


*.html
*.ipynb
app.py
ADDED
@@ -0,0 +1,297 @@
from typing import Any, Set

import gradio as gr
import nbformat as nbf
from gradio.components import Component
from nbconvert import HTMLExporter

from utils.notebook_generator import *
from utils.components_creator import *

finetuning_notebook = "Finetuning_NoteBook"

css = """
.container {
    align-items: center;
    justify-content: center;
}
.center_text {
    text-align: center;
}

.a_custom {
    border-radius: var(--button-large-radius);
    padding: var(--button-large-padding);
    font-weight: var(--button-large-text-weight);
    font-size: var(--button-large-text-size);
    border: var(--button-border-width) solid var(--button-primary-border-color);
    background: var(--button-primary-background-fill);
    color: var(--button-primary-text-color);
    justify-content: center;
    align-items: center;
    transition: var(--button-transition);
    box-shadow: var(--button-shadow);
    text-align: center
}
.a_custom a {
    text-decoration: none;
    color: white;
}
"""


def centered_column():
    return gr.Column(elem_classes=["container"])


def change_model_selection(model_id):
    if model_id == gemma.name:
        gr.Warning("""
        Access Gemma:

        To load Gemma from Hugging Face, you’re required to review and agree to Google’s usage license.
        """)
    if model_id == llama.name:
        gr.Warning("""
        Access Llama 2:

        To load Llama 2 from Hugging Face, you’re required to review and agree to Meta’s usage license.
        """)

    for m in models:
        if m.name == model_id:
            return gr.Dropdown(choices=m.versions, interactive=True,
                               visible=True, info=f"Select the version of the model {m.name} you wish to use.")
    return None


def check_valid_input(value):
    if isinstance(value, str):
        return value and value.strip()
    if isinstance(value, list):
        return len(value) > 0
    return value is not None


def get_dataset(dataset_path):
    for d in ft_datasets:
        if d.path == dataset_path:
            return d
    return None


def get_value(components: dict[Component, Any], elem_id: str) -> Any:
    for component, val in components.items():
        if component.elem_id == elem_id:
            return val
    return None


def preview_notebook():
    html_exporter = HTMLExporter()
    (body, resources) = html_exporter.from_notebook_node(notebook)

    html_path = f"{finetuning_notebook}.html"
    with open(html_path, 'w') as f:
        f.write(body)
    return f'<iframe src="file={html_path}" width="100%" height="250px"></iframe>'


def generate_code(components: dict[Component, Any]):
    create_install_libraries_cells(notebook['cells'])
    flash_attention_value = get_value(components, FLASH_ATTENTION_ID)
    if flash_attention_value:
        create_install_flash_attention(notebook['cells'])

    push_to_hub = get_value(components, PUSH_TO_HUB_ID)
    if push_to_hub:
        create_login_hf_cells(notebook['cells'])

    dataset_value = get_value(components, DATASET_SELECTION_ID)
    seed_value = get_value(components, DATASET_SHUFFLING_SEED)
    if not check_valid_input(dataset_value):
        gr.Warning("No dataset is selected")
    else:
        create_datasets_cells(notebook['cells'], get_dataset(dataset_value), seed_value)

    model_value = get_value(components, MODEL_SELECTION_ID)
    if not check_valid_input(model_value):
        gr.Warning("No model is selected!")
    else:
        version_value = get_value(components, MODEL_VERSION_SELECTION_ID)
        if not check_valid_input(version_value):
            gr.Warning("No version of the model is selected")
        else:
            load_in_4bit = get_value(components, LOAD_IN_4_BIT_ID)
            bnb_4bit_use_double_quant = get_value(components, BNB_4BIT_USE_DOUBLE_QUANT)
            bnb_4bit_quant_type = get_value(components, BNB_4BIT_QUANT_TYPE)
            bnb_4bit_compute_dtype = get_value(components, BNB_4BIT_COMPUTE_DTYPE)
            pad_side = get_value(components, PAD_SIDE_ID)
            pad_value = get_value(components, PAD_VALUE_ID)
            create_model_cells(notebook['cells'], model_id=model_value, version=version_value,
                               flash_attention=flash_attention_value, pad_value=pad_value,
                               pad_side=pad_side, load_in_4bit=load_in_4bit,
                               bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
                               bnb_4bit_quant_type=bnb_4bit_quant_type, bnb_4bit_compute_dtype=bnb_4bit_compute_dtype)

    r_value = get_value(components, LORA_R_ID)
    alpha_value = get_value(components, LORA_ALPHA_ID)
    dropout_value = get_value(components, LORA_DROPOUT_ID)
    bias_value = get_value(components, LORA_BIAS_ID)
    create_lora_config_cells(notebook['cells'], r_value, alpha_value, dropout_value, bias_value)

    epochs = get_value(components, NUM_TRAIN_EPOCHS_ID)
    max_steps = get_value(components, MAX_STEPS_ID)
    logging_steps = get_value(components, LOGGING_STEPS_ID)
    per_device_train_batch_size = get_value(components, PER_DEVICE_TRAIN_BATCH_SIZE)
    save_strategy = get_value(components, SAVE_STRATEGY_ID)
    gradient_accumulation_steps = get_value(components, GRADIENT_ACCUMULATION_STEPS_ID)
    gradient_checkpointing = get_value(components, GRADIENT_CHECKPOINTING_ID)
    learning_rate = get_value(components, LEARNING_RATE_ID)
    max_grad_norm = get_value(components, MAX_GRAD_NORM_ID)
    warmup_ratio = get_value(components, WARMUP_RATIO_ID)
    lr_scheduler_type = get_value(components, LR_SCHEDULER_TYPE_ID)
    output_dir = get_value(components, OUTPUT_DIR_ID)
    report_to = get_value(components, REPORT_TO_ID)

    if not check_valid_input(output_dir):
        gr.Warning("No output_dir is given")

    create_training_args_cells(notebook['cells'], epochs=epochs, max_steps=max_steps, logging_steps=logging_steps,
                               per_device_train_batch_size=per_device_train_batch_size, save_strategy=save_strategy,
                               gradient_accumulation_steps=gradient_accumulation_steps,
                               gradient_checkpointing=gradient_checkpointing, learning_rate=learning_rate,
                               max_grad_norm=max_grad_norm, warmup_ratio=warmup_ratio,
                               lr_scheduler_type=lr_scheduler_type, output_dir=output_dir, report_to=report_to,
                               seed=seed_value)

    max_seq_length = get_value(components, MAX_SEQ_LENGTH_ID)
    packing = get_value(components, PACKING_ID)
    create_sft_trainer_cells(notebook['cells'], max_seq_length, packing)

    create_start_training_cells(notebook['cells'], epochs, max_steps, push_to_hub, output_dir)

    create_free_gpu_cells(notebook['cells'])

    create_merge_lora_cells(notebook['cells'], output_dir)

    if push_to_hub:
        push_merged_model_cells(notebook['cells'], output_dir)

    file_name = f"{finetuning_notebook}.ipynb"

    with open(file_name, 'w') as f:
        nbf.write(notebook, f)

    return gr.Button(
        visible=True), f'''<div class="a_custom"><a href="file={file_name}" download={file_name}>
        💾️ Download {finetuning_notebook}.ipynb</a> </div> '''


with gr.Blocks(css=css, theme=gr.themes.Soft(text_size='lg', font=["monospace"],
                                             primary_hue=gr.themes.colors.blue)) as demo:
    gr.Label("UI-Guided LLM FineTuning Jupyter Notebook Generator 🛠️🧠", show_label=False)
    gr.Markdown(
        'Generating a **Jupyter Notebook file (.ipynb)** 📔⚙️ for **finetuning** a Large Language Model (**LLM**) '
        '🎚️🧠 on a chosen dataset and configured parameters, guided by an intuitive User Interface (UI) 👆💻.',
        elem_classes=["center_text"])

    all_components: Set[Component] = set()

    gr.HTML("<h2 style='text-align: center;'>LLM 🧠</h2>")
    with gr.Row():
        model_selection = gr.Dropdown(
            [model.name for model in models],
            elem_id=MODEL_SELECTION_ID,
            label="Select a Large Language Model (LLM)",
            info="Select a Large Language Model (LLM) to finetune using the SFTTrainer."
        )
        version_selection = gr.Dropdown(
            choices=[], label="Select a Model Version 🔄", info="", visible=False, elem_id=MODEL_VERSION_SELECTION_ID
        )
    all_components.add(model_selection)
    all_components.add(version_selection)

    gr.HTML("<h2 style='text-align: center;'>Dataset 📊</h2>")
    with gr.Row():
        all_components.update(add_dataset_components())

    gr.HTML("<h2 style='text-align: center;'>⚡ Flash Attention ⚡</h2>")
    with gr.Row():
        flash_attention = gr.Checkbox(value=True, label="Enable Flash Attention", interactive=True,
                                      elem_id=FLASH_ATTENTION_ID,
                                      info="Flash Attention is a technique that reduces the memory and runtime costs "
                                           "associated with the attention layer in a model. For more details, please "
                                           "refer to the Flash Attention repository on GitHub.")
    all_components.add(flash_attention)

    gr.HTML("<h2 style='text-align: center;'>Quantization</h2>")
    with gr.Row():
        with centered_column():
            all_components.update(add_quantization_components())
        with centered_column():
            all_components.update(add_quantization_components1())

    gr.HTML("<h2 style='text-align: center;'>Tokenizer Configuration</h2>")
    with gr.Row():
        all_components.update(add_pad_tokens())

    gr.HTML("<h2 style='text-align: center;'>Lora Configuration</h2>")
    with gr.Row():
        with centered_column():
            all_components.update(add_lora_components1())
        with centered_column():
            all_components.update(add_lora_components())

    gr.HTML("<h2 style='text-align: center;'>⚙️ Training Arguments ⚙️</h2>")
    with gr.Row():
        with centered_column():
            all_components.update(add_training_args_1())
            all_components.update(add_training_args_1_bis())
        with centered_column():
            all_components.update(add_training_args_3())

    gr.HTML("<h2 style='text-align: center;'>Optimizer Arguments</h2>")
    with gr.Row():
        with centered_column():
            optimizer1 = add_optimizer1()
            all_components.update(optimizer1)

        with centered_column():
            optimizer = add_optimizer()
            all_components.update(optimizer)

    gr.HTML("<h2 style='text-align: center;'>Outputs</h2>")
    with gr.Row():
        with centered_column():
            all_components.update(add_outputs())
        with centered_column():
            all_components.update(add_outputs1())

    gr.HTML("<h2 style='text-align: center;'>SFTTrainer Arguments</h2>")
    with gr.Row():
        sft_args = add_sft_trainer_args()
        all_components.update(sft_args)

    with gr.Row():
        iframe = gr.HTML(show_label=False, visible=True)

    with gr.Row():
        greet_btn = gr.Button("Generate 🛠️", variant="primary")

    with gr.Row():
        preview_btn = gr.Button(f"👀 Preview {finetuning_notebook}.ipynb", variant="primary", visible=False)
        download_btn = gr.HTML(show_label=False, visible=True)

    notebook = nbf.v4.new_notebook()
    greet_btn.click(fn=generate_code, inputs=all_components, outputs=[preview_btn, download_btn])

    preview_btn.click(fn=preview_notebook, inputs=None, outputs=iframe)

    model_selection.change(
        fn=change_model_selection,
        inputs=model_selection,
        outputs=version_selection
    )

demo.launch(allowed_paths=["/"])
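app.py relies on a Gradio convention that is easy to miss: because `all_components` is a Python set rather than a list, Gradio invokes `generate_code` with a single dict mapping each component to its current value, which is why values are retrieved by scanning for a matching `elem_id`. A minimal sketch of the same pattern, using hypothetical components:

import gradio as gr

with gr.Blocks() as sketch:
    # Hypothetical components, for illustration only
    name = gr.Textbox(label="name")
    excited = gr.Checkbox(label="excited")
    out = gr.Textbox(label="out")

    def greet(values: dict):
        # With a set of inputs, Gradio passes {component: value}
        return values[name] + ("!" if values[excited] else ".")

    gr.Button("Go").click(fn=greet, inputs={name, excited}, outputs=out)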
utils/__init__.py
ADDED
@@ -0,0 +1,87 @@
from typing import List, Optional

MODEL_SELECTION_ID: str = "model_selection"
MODEL_VERSION_SELECTION_ID: str = "model_version_selection"

LOAD_IN_4_BIT_ID: str = "load_in_4bit"
BNB_4BIT_QUANT_TYPE: str = "bnb_4bit_quant_type"
BNB_4BIT_COMPUTE_DTYPE: str = "bnb_4bit_compute_dtype"
BNB_4BIT_USE_DOUBLE_QUANT: str = "bnb_4bit_use_double_quant"

DATASET_SELECTION_ID = "dataset_selection"
DATASET_SHUFFLING_SEED = "dataset_seed"

FLASH_ATTENTION_ID = "flash_attention"

PAD_SIDE_ID = "pad_side"
PAD_VALUE_ID = "pad_value"

LORA_R_ID = "lora_r"
LORA_ALPHA_ID = "lora_alpha"
LORA_DROPOUT_ID = "lora_dropout"
LORA_BIAS_ID = 'lora_bias'

NUM_TRAIN_EPOCHS_ID = "num_train_epochs"
MAX_STEPS_ID = "max_steps_id"
LOGGING_STEPS_ID = "logging_steps"
PER_DEVICE_TRAIN_BATCH_SIZE = "per_device_train_batch_size"
SAVE_STRATEGY_ID = "save_strategy"
GRADIENT_ACCUMULATION_STEPS_ID = "gradient_accumulation_steps"
GRADIENT_CHECKPOINTING_ID = "gradient_checkpointing"
LEARNING_RATE_ID = "learning_rate"
MAX_GRAD_NORM_ID = "max_grad_norm"
WARMUP_RATIO_ID = "warmup_ratio"
LR_SCHEDULER_TYPE_ID = "lr_scheduler_type"
OUTPUT_DIR_ID = "output_dir"
PUSH_TO_HUB_ID = "push_to_hub"
REPORT_TO_ID = "report_to"

MAX_SEQ_LENGTH_ID = "max_seq_length"
PACKING_ID = "packing"

OPTIMIZER_ID = "optim"
BETA1_ID = "adam_beta1"
BETA2_ID = "adam_beta2"
EPSILON_ID = "adam_epsilon"
WEIGHT_DECAY_ID = "weight_decay"


class FTDataSet:
    def __init__(self, path: str, dataset_split: Optional[str] = None):
        self.path = path
        self.dataset_split = dataset_split

    def __str__(self):
        return self.path


deita_dataset = FTDataSet(path="HuggingFaceH4/deita-10k-v0-sft", dataset_split="train_sft")
dolly = FTDataSet(path="philschmid/dolly-15k-oai-style", dataset_split="train")
ultrachat_200k = FTDataSet(path="HuggingFaceH4/ultrachat_200k", dataset_split="train_sft")
ft_datasets = [deita_dataset, dolly, ultrachat_200k]


class Model:
    def __init__(self, name: str, versions: List[str]):
        self.name = name
        self.versions = versions

    def __str__(self):
        return self.name


models: List[Model] = []
gemma = Model(name="google/gemma", versions=["7b", "2b"])
models.append(gemma)
falcon = Model(name="tiiuae/falcon", versions=["7b"])  # "7b-instruct"
models.append(falcon)
phi = Model(name="microsoft/phi", versions=["1_5", "1", "2"])
models.append(phi)
llama = Model(name="meta-llama/Llama-2", versions=["7b", "7b-hf"])  # "7b-chat", "7b-chat-hf"
models.append(llama)
mistral = Model(name="mistralai/Mistral", versions=["7B-v0.1"])  # "7B-Instruct-v0.1"
models.append(mistral)
tinyLlama = Model(name="TinyLlama/TinyLlama-1.1B",
                  versions=['intermediate-step-1431k-3T', 'step-50K-105b', 'intermediate-step-240k-503b',
                            'intermediate-step-715k-1.5T', 'intermediate-step-1195k-token-2.5T'])
models.append(tinyLlama)
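A detail worth noting: `Model.name` holds the Hub namespace plus base name, and the notebook generator later joins it with a version using a hyphen, so these entries expand to full repo ids. An illustrative check:

from utils import models

# create_model_cells emits model_id = f"{model.name}-{version}",
# e.g. "google/gemma-7b" or "mistralai/Mistral-7B-v0.1"
for m in models:
    for v in m.versions:
        print(f"{m.name}-{v}")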
utils/components_creator.py
ADDED
@@ -0,0 +1,256 @@
from typing import Set

import gradio as gr
from gradio.components import Component

from utils import *


def add_quantization_components() -> Set[Component]:
    q_components: Set[Component] = set()
    load_in_4bit = gr.Radio(["load_in_4bit", "load_in_8bit"], value="load_in_4bit",
                            label="Quantization",
                            info="This flag is used to enable 4/8-bit quantization.",
                            interactive=True,
                            elem_id=LOAD_IN_4_BIT_ID)
    bnb_4bit_quant_type = gr.Radio(["fp4", "nf4"], label="bnb_4bit_quant_type",
                                   value="nf4",
                                   elem_id=BNB_4BIT_QUANT_TYPE,
                                   interactive=True,
                                   info="This sets the quantization data type in the bnb.nn.Linear4Bit layers.")
    q_components.add(load_in_4bit)
    q_components.add(bnb_4bit_quant_type)
    return q_components


def add_quantization_components1() -> Set[Component]:
    q_components: Set[Component] = set()
    bnb_4bit_compute_dtype = gr.Radio(
        ["torch.float32", "torch.bfloat16", "torch.float16"],
        label="bnb_4bit_compute_dtype",
        info="This sets the computational type, which might be different than the input type.",
        elem_id=BNB_4BIT_COMPUTE_DTYPE,
        interactive=True, value="torch.bfloat16")
    bnb_4bit_use_double_quant = gr.Checkbox(label="bnb_4bit_use_double_quant",
                                            value=True,
                                            interactive=True,
                                            elem_id=BNB_4BIT_USE_DOUBLE_QUANT,
                                            info="This flag is used for nested quantization, where the "
                                                 "quantization constants from the first quantization are "
                                                 "quantized again.")
    q_components.add(bnb_4bit_compute_dtype)
    q_components.add(bnb_4bit_use_double_quant)
    return q_components


def add_dataset_components() -> Set[Component]:
    dataset_selection = gr.Dropdown(
        [dt.path for dt in ft_datasets],
        elem_id=DATASET_SELECTION_ID,
        label="Select a Dataset",
        info="Select a dataset for finetuning the model."
    )
    seed = gr.Slider(0, 256, step=1, value=42, elem_id=DATASET_SHUFFLING_SEED, label="Random Seed",
                     info="Set a random seed for shuffling the dataset.", interactive=True)

    d_components: Set[Component] = set()
    d_components.add(dataset_selection)
    d_components.add(seed)
    return d_components


def add_pad_tokens() -> Set[Component]:
    pad_token_side = gr.Radio(["right", "left"], label="Tokenizer: padding_side",
                              info="The side on which the model should have padding applied.",
                              interactive=True, value="right", elem_id=PAD_SIDE_ID)
    pad_token_value = gr.Radio([None, "eos_token"], label="Tokenizer: pad_token",
                               info="A special token used to make arrays of tokens the same size for batching "
                                    "purposes; it will then be ignored by attention mechanisms or loss computation.",
                               interactive=True, value=None, elem_id=PAD_VALUE_ID)
    pad_components: Set[Component] = set()
    pad_components.add(pad_token_side)
    pad_components.add(pad_token_value)
    return pad_components


def add_lora_components() -> Set[Component]:
    r = gr.Slider(1, 2048, step=1, value=6, label="r", info="Lora attention dimension (the 'rank').",
                  interactive=True, elem_id=LORA_R_ID)
    alpha = gr.Slider(1, 512, step=1, value=8, label="lora_alpha", info="The alpha parameter for Lora scaling.",
                      interactive=True, elem_id=LORA_ALPHA_ID)

    out_components: Set[Component] = set()
    out_components.add(r)
    out_components.add(alpha)
    return out_components


def add_lora_components1() -> Set[Component]:
    dropout = gr.Slider(0, 1, step=0.01, value=0.05, label="lora_dropout",
                        info="The dropout probability for Lora layers.",
                        interactive=True, elem_id=LORA_DROPOUT_ID)
    bias = gr.Radio(['none', 'all', 'lora_only'], label="bias",
                    info="Bias type for LoRA. If 'all' or 'lora_only', the corresponding biases will be updated "
                         "during training.",
                    interactive=True, value="none", elem_id=LORA_BIAS_ID)

    out_components: Set[Component] = set()
    out_components.add(dropout)
    out_components.add(bias)
    return out_components


def add_training_args_1() -> Set[Component]:
    epochs = gr.Slider(1, 100, step=1, value=3, label="num_train_epochs",
                       info="Total number of training epochs to perform.",
                       interactive=True, elem_id=NUM_TRAIN_EPOCHS_ID)
    max_steps = gr.Slider(-1, 100, step=1, value=-1, label="max_steps",
                          info="Total number of training steps to perform. If set to a positive number, "
                               "it overrides num_train_epochs (-1 disables it).",
                          interactive=True, elem_id=MAX_STEPS_ID)
    out_components: Set[Component] = set()
    out_components.add(epochs)
    out_components.add(max_steps)
    return out_components


def add_training_args_1_bis() -> Set[Component]:
    logging_steps = gr.Slider(1, 100, step=1, value=10, label="logging_steps",
                              info="Number of update steps between two logs if logging_strategy='steps'.",
                              interactive=True, elem_id=LOGGING_STEPS_ID)
    per_device_train_batch_size = gr.Slider(1, 64, step=1, value=4, label="per_device_train_batch_size",
                                            info="Batch size per device during training.",
                                            interactive=True, elem_id=PER_DEVICE_TRAIN_BATCH_SIZE)
    save_strategy = gr.Radio(['no', 'epoch', 'steps'], label="save_strategy",
                             info="The checkpoint save strategy to adopt during training.",
                             interactive=True, value="epoch", elem_id=SAVE_STRATEGY_ID)
    out_components: Set[Component] = set()
    out_components.add(save_strategy)
    out_components.add(logging_steps)
    out_components.add(per_device_train_batch_size)
    return out_components


def add_training_args_3() -> Set[Component]:
    max_grad_norm = gr.Slider(0.01, 1, value=0.3, label="max_grad_norm",
                              info="Maximum gradient norm (for gradient clipping).",
                              interactive=True, elem_id=MAX_GRAD_NORM_ID)
    warmup_ratio = gr.Slider(0, 1, value=0.1, label="warmup_ratio",
                             info="Ratio of total training steps used for a linear warmup from 0 to learning_rate.",
                             interactive=True, elem_id=WARMUP_RATIO_ID)
    gradient_accumulation_steps = gr.Slider(1, 64, step=1, value=2, label="gradient_accumulation_steps",
                                            info="Number of update steps to accumulate the gradients for before "
                                                 "performing a backward/update pass.",
                                            interactive=True, elem_id=GRADIENT_ACCUMULATION_STEPS_ID)
    gradient_checkpointing = gr.Checkbox(label="gradient_checkpointing", value=True, interactive=True,
                                         info="Use gradient checkpointing to save memory at the expense of a slower "
                                              "backward pass.", elem_id=GRADIENT_CHECKPOINTING_ID)
    lr_scheduler_type = gr.Radio(['linear', 'constant', 'cosine'], label="lr_scheduler_type",
                                 info="The learning rate scheduler type to use.",
                                 interactive=True, value="cosine", elem_id=LR_SCHEDULER_TYPE_ID)

    out_components: Set[Component] = set()
    out_components.add(max_grad_norm)
    out_components.add(warmup_ratio)
    out_components.add(gradient_accumulation_steps)
    out_components.add(gradient_checkpointing)
    out_components.add(lr_scheduler_type)
    return out_components


def add_outputs() -> Set[Component]:
    output_dir = gr.Textbox(interactive=True,
                            label="output_dir",
                            info='The output directory where the model predictions and checkpoints will be written.',
                            elem_id=OUTPUT_DIR_ID)

    push_to_hub = gr.Checkbox(label="push_to_hub", value=False, interactive=True,
                              info="Whether or not to upload the trained model to the hub after training. If this is "
                                   "True, you must specify 'HF_TOKEN'.",
                              elem_id=PUSH_TO_HUB_ID)

    out_components: Set[Component] = set()
    out_components.add(output_dir)
    out_components.add(push_to_hub)
    return out_components


def add_outputs1() -> Set[Component]:
    report_to = gr.Dropdown(
        ["azure_ml", "comet_ml", "mlflow", "tensorboard", "wandb", "all", 'none'],
        value="tensorboard",
        elem_id=REPORT_TO_ID,
        label="report_to",
        info="The list of integrations to report the results and logs to. Supported platforms are 'azure_ml', "
             "'comet_ml', 'mlflow', 'tensorboard' and 'wandb'. Use 'all' to report to all integrations installed, "
             "'none' for no integrations."
    )
    out_components: Set[Component] = set()
    out_components.add(report_to)
    return out_components


def add_optimizer() -> Set[Component]:
    adam_beta1 = gr.Slider(0.00001, 1, value=0.9, label="adam_beta1",
                           info="The beta1 hyperparameter for the [`AdamW`] optimizer.",
                           interactive=True, elem_id=BETA1_ID)
    adam_beta2 = gr.Slider(0.00001, 1, value=0.999, label="adam_beta2",
                           info="The beta2 hyperparameter for the [`AdamW`] optimizer.",
                           interactive=True, elem_id=BETA2_ID)
    adam_epsilon = gr.Slider(1e-9, 1, value=1e-8, label="adam_epsilon",
                             info="The epsilon hyperparameter for the [`AdamW`] optimizer.",
                             interactive=True, elem_id=EPSILON_ID)
    out_components: Set[Component] = set()
    out_components.add(adam_beta1)
    out_components.add(adam_beta2)
    out_components.add(adam_epsilon)
    return out_components


def add_optimizer1() -> Set[Component]:
    optimizer = gr.Dropdown(
        ["adamw_hf", "adamw_torch", "adamw_torch_fused", "adamw_apex_fused", "adamw_anyprecision", "adafactor"],
        value="adamw_torch_fused",
        elem_id=OPTIMIZER_ID,
        label="optimizer",
        info="The optimizer to use: 'adamw_hf', 'adamw_torch', 'adamw_torch_fused', 'adamw_apex_fused', "
             "'adamw_anyprecision' or 'adafactor'."
    )
    learning_rate = gr.Slider(1e-6, 1, step=0.001, value=2.0e-05, label="learning_rate",
                              info="The initial learning rate for AdamW.",
                              interactive=True, elem_id=LEARNING_RATE_ID)
    weight_decay = gr.Slider(0, 1, value=0, label="weight_decay",
                             info="The weight decay to apply (if not zero) to all layers except all bias and "
                                  "LayerNorm weights in the [`AdamW`] optimizer.",
                             interactive=True, elem_id=WEIGHT_DECAY_ID)
    out_components: Set[Component] = set()
    out_components.add(optimizer)
    out_components.add(learning_rate)
    out_components.add(weight_decay)
    return out_components


def add_sft_trainer_args() -> Set[Component]:
    max_seq_length = gr.Slider(512, 3072, value=2048, label="max_seq_length",
                               info="The maximum sequence length to use for the `ConstantLengthDataset` and for "
                                    "automatically creating the Dataset.",
                               interactive=True, elem_id=MAX_SEQ_LENGTH_ID)
    packing = gr.Checkbox(label="packing", value=True, interactive=True, elem_id=PACKING_ID,
                          info="This argument is used by the `ConstantLengthDataset` to pack the sequences of the "
                               "dataset.")

    out_components: Set[Component] = set()
    out_components.add(max_seq_length)
    out_components.add(packing)
    return out_components
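Each `add_*` helper both instantiates its widgets (attaching them to whatever Blocks context is active at call time) and returns them as a `Set[Component]`, so callers control the layout while still collecting everything into one flat set for event wiring. A short usage sketch mirroring how app.py consumes these helpers:

import gradio as gr
from utils.components_creator import add_lora_components, add_lora_components1

with gr.Blocks() as demo:
    collected = set()
    with gr.Row():
        collected.update(add_lora_components1())  # lora_dropout + bias
        collected.update(add_lora_components())   # r + lora_alpha
    # `collected` can now be passed as the `inputs` set of a click handler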
utils/notebook_generator.py
ADDED
@@ -0,0 +1,393 @@
import nbformat as nbf

from utils import FTDataSet


def create_install_libraries_cells(cells: list):
    text_cell = nbf.v4.new_markdown_cell("### Installing Required Libraries!")
    text_cell1 = nbf.v4.new_markdown_cell(
        "Installing required libraries, including trl, transformers, accelerate, peft, datasets, "
        "and bitsandbytes.")
    code = """
!pip install -q --upgrade "transformers==4.38.2"
!pip install -q --upgrade "datasets==2.16.1"
!pip install -q --upgrade "accelerate==0.26.1"
!pip install -q --upgrade "evaluate==0.4.1"
!pip install -q --upgrade "bitsandbytes==0.42.0"
!pip install -q --upgrade "trl==0.7.11"
!pip install -q --upgrade "peft==0.8.2"
"""
    code_pytorch = """
# Checks if PyTorch is installed and installs it if not.
try:
    import torch
    print("PyTorch is installed!")
except ImportError:
    print("PyTorch is not installed.")
    !pip install -q torch
"""
    code_cell = nbf.v4.new_code_cell(code)
    cells.append(text_cell)
    cells.append(text_cell1)
    cells.append(nbf.v4.new_code_cell(code_pytorch))
    cells.append(code_cell)


def create_install_flash_attention(cells: list):
    text_cell = nbf.v4.new_markdown_cell(
        "### Installing Flash Attention")
    text_cell1 = nbf.v4.new_markdown_cell(
        "Installing Flash Attention to reduce the memory and runtime cost of the attention layer, and to "
        "improve the performance of the model training. Learn more at "
        "[FlashAttention](https://github.com/Dao-AILab/flash-attention/tree/main). "
        "Installing flash attention from source can take quite a bit of time (10-45 minutes).")
    code = """
import torch; assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'

!pip install ninja packaging
!MAX_JOBS=4 pip install flash-attn --no-build-isolation --upgrade
"""
    code_cell = nbf.v4.new_code_cell(code)
    cells.append(text_cell)
    cells.append(text_cell1)
    cells.append(code_cell)


def create_login_hf_cells(cells: list):
    text_cell = nbf.v4.new_markdown_cell(
        "### Login to HF")
    text_cell1 = nbf.v4.new_markdown_cell(
        "Installing **huggingface_hub** to use as a remote model versioning service. This means that your "
        "model, logs, and information will be automatically pushed to the Hub during training. You should "
        "have your 'HF_TOKEN' available.")
    code = """
# Install huggingface_hub
!pip install -q huggingface_hub

from google.colab import userdata  # Colab secrets helper; adjust if not running on Colab
from huggingface_hub import login

login(
    token=userdata.get('Your_HF_TOKEN'),
    add_to_git_credential=True
)
"""
    code_cell = nbf.v4.new_code_cell(code)
    cells.append(text_cell)
    cells.append(text_cell1)
    cells.append(code_cell)


def create_datasets_cells(cells: list, dataset: FTDataSet, seed: int):
    text_cell = nbf.v4.new_markdown_cell("### Load and prepare the dataset")
    text = 'The dataset is already formatted in a conversational format, which is supported by ' \
           '[trl](https://huggingface.co/docs/trl/index/).'
    text_format = """
**Conversational format:**

```json
{"messages": [{"role": "system", "content": "You are..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
{"messages": [{"role": "system", "content": "You are..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
{"messages": [{"role": "system", "content": "You are..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
```
"""
    text_cell1 = nbf.v4.new_markdown_cell(text)
    text_cell2 = nbf.v4.new_markdown_cell(text_format)
    code = f"""
from datasets import load_dataset

# Load dataset from the hub
dataset = load_dataset("{dataset.path}", split="{dataset.dataset_split}")

dataset = dataset.shuffle(seed={seed})
"""

    code_cell = nbf.v4.new_code_cell(code)
    cells.append(text_cell)
    cells.append(text_cell1)
    cells.append(text_cell2)
    cells.append(code_cell)


def create_model_cells(cells: list, model_id: str, version: str, flash_attention: bool, pad_side: str, pad_value: str,
                       load_in_4bit: str, bnb_4bit_use_double_quant: bool, bnb_4bit_quant_type: str,
                       bnb_4bit_compute_dtype: str):
    text_cell = nbf.v4.new_markdown_cell(f"### Load {model_id}-{version} for Finetuning")
    load_in_4bit_str = f"{load_in_4bit}=True"

    flash_attention_str = "attn_implementation='flash_attention_2',"
    if not flash_attention:
        flash_attention_str = ''

    pad_value_str = "tokenizer.pad_token = tokenizer.eos_token"
    if pad_value is None:
        pad_value_str = ""

    code = f"""
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import setup_chat_format

# Hugging Face model id
model_id = "{model_id}-{version}"

# BitsAndBytesConfig
bnb_config = BitsAndBytesConfig(
    {load_in_4bit_str}, bnb_4bit_use_double_quant={bnb_4bit_use_double_quant},
    bnb_4bit_quant_type="{bnb_4bit_quant_type}", bnb_4bit_compute_dtype={bnb_4bit_compute_dtype}
)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    {flash_attention_str}
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = "{pad_side}"
{pad_value_str}

# Set chat template to OAI chatML
model, tokenizer = setup_chat_format(model, tokenizer)
"""

    text_1 = """
This process involves two key steps:

1. **LLM Quantization:**
   - We first load the selected large language model (LLM).
   - We then use the "bitsandbytes" library to quantize the model, which can significantly reduce its memory footprint.

> **Note:** The memory requirements of the model scale with its size. For instance, a 7B parameter model may require
a 24GB GPU for fine-tuning.

2. **Chat Model Preparation:**
   - To train a model for chat/conversational tasks, we need to prepare both the model and its tokenizer.
   - This involves adding special tokens to the tokenizer and the model itself. These tokens help the model
     understand the different roles within a conversation.
   - The **trl** library provides a convenient method called `setup_chat_format` for this purpose. This method
     performs the following actions:
     * Adds special tokens to the tokenizer, such as `<|im_start|>` and `<|im_end|>`, to mark the beginning and
       ending of a conversation.
     * Resizes the model's embedding layer to accommodate the new tokens.
     * Sets the tokenizer's chat template, which defines the format used to convert input data into a chat-like
       structure. The default template is `chatml` from OpenAI.
"""

    code_cell = nbf.v4.new_code_cell(code)
    text_cell1 = nbf.v4.new_markdown_cell(text_1)
    cells.append(text_cell)
    cells.append(text_cell1)
    cells.append(code_cell)


def create_lora_config_cells(cells: list, r: int, alpha: int, dropout: float, bias: str):
    text_cell = nbf.v4.new_markdown_cell("### LoraConfig")
    code = f"""
from peft import LoraConfig

peft_config = LoraConfig(
    lora_alpha={alpha},
    lora_dropout={dropout},
    r={r},
    bias="{bias}",
    target_modules="all-linear",
    task_type="CAUSAL_LM"
)
"""

    text = """The `SFTTrainer` provides native integration with `peft`, simplifying the process of efficiently tuning
Language Models (LLMs) using techniques such as
[LoRA](https://magazine.sebastianraschka.com/p/practical-tips-for-finetuning-llms). The only requirement is to create
our `LoraConfig` and pass it to the `SFTTrainer`.
"""

    code_cell = nbf.v4.new_code_cell(code)
    cells.append(text_cell)
    cells.append(nbf.v4.new_markdown_cell(text))
    cells.append(code_cell)


def create_training_args_cells(cells: list, epochs, max_steps, logging_steps, per_device_train_batch_size,
                               save_strategy, gradient_accumulation_steps, gradient_checkpointing,
                               learning_rate, max_grad_norm, warmup_ratio, lr_scheduler_type, output_dir,
                               report_to, seed):
    text_cell = nbf.v4.new_markdown_cell("### TrainingArguments")
    to_install = None
    if report_to == "all":
        to_install = "azure_ml comet_ml mlflow tensorboard wandb"
    elif report_to != "none":
        to_install = report_to

    code_report = f"""
# Installing {to_install} to report the metrics

!pip install -q {to_install}
"""

    code = f"""
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="{output_dir}",
    num_train_epochs={epochs},
    per_device_train_batch_size={per_device_train_batch_size},
    gradient_accumulation_steps={gradient_accumulation_steps},
    gradient_checkpointing={gradient_checkpointing},
    optim="adamw_torch_fused",
    logging_steps={logging_steps},
    save_strategy='{save_strategy}',
    learning_rate={learning_rate},
    bf16=True,
    tf32=True,
    max_grad_norm={max_grad_norm},
    warmup_ratio={warmup_ratio},
    lr_scheduler_type='{lr_scheduler_type}',
    report_to='{report_to}',
    max_steps={max_steps},
    seed={seed},
    overwrite_output_dir=True,
    remove_unused_columns=True
)
"""

    code_cell = nbf.v4.new_code_cell(code)
    cells.append(text_cell)
    if to_install is not None:
        cells.append(nbf.v4.new_code_cell(code_report))
    cells.append(code_cell)


def create_sft_trainer_cells(cells: list, max_seq_length, packing):
    text_cell = nbf.v4.new_markdown_cell(
        """### Supervised Finetuning Trainer (SFT Trainer)

This `SFTTrainer` is a wrapper around the `transformers.Trainer` class and inherits all of its attributes and methods.
The trainer takes care of properly initializing the `PeftModel`.
""")
    dataset_kwargs = {
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False,  # No need to add additional separator token
    }
    code = f"""
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length={max_seq_length},
    tokenizer=tokenizer,
    packing={packing},
    dataset_kwargs={dataset_kwargs}
)
"""
    code_cell = nbf.v4.new_code_cell(code)
    cells.append(text_cell)
    cells.append(code_cell)


def create_start_training_cells(cells: list, epochs, max_steps, push_to_hub, output_dir):
    if push_to_hub:
        save_txt = " and to the hub."
    else:
        save_txt = "."

    epoch_str = f"{epochs} epochs"
    if max_steps > 0:
        epoch_str = f"{max_steps} steps"

    text_cell = nbf.v4.new_markdown_cell(
        f"""### Starting Training and Saving Model/Tokenizer

We start training the model by calling the `train()` method on the trainer instance. This will start the training
loop and train the model for `{epoch_str}`. The model will be automatically saved to the output directory
({output_dir}){save_txt}
""")

    code = f"""
# start training
trainer.train()

# save the model
trainer.save_model()

# save tokenizer
tokenizer.save_pretrained("{output_dir}")
"""
    code_cell = nbf.v4.new_code_cell(code)
    cells.append(text_cell)
    cells.append(code_cell)


def create_free_gpu_cells(cells: list):
    text_cell = nbf.v4.new_markdown_cell(
        """### Free the GPU Memory for Merging `PeftModel`""")

    code = """
# Free the GPU memory
del model
del trainer
torch.cuda.empty_cache()
"""
    code_cell = nbf.v4.new_code_cell(code)
    cells.append(text_cell)
    cells.append(code_cell)


def create_merge_lora_cells(cells: list, output_dir):
    text_cell = nbf.v4.new_markdown_cell(
        """### Merge LoRA Adapter into the Original Model

While utilizing `LoRA`, we focus on training the adapters rather than the entire model. Consequently, during the
model saving process, only the `adapter weights` are preserved, not the complete model. If you wish to save the
entire model for easier usage with Text Generation Inference, you can incorporate the adapter weights into the model
weights. This can be achieved using the `merge_and_unload` method. Following this, the model can be saved using the
`save_pretrained` method. The result is a default model that is ready for inference.
""")

    code = f"""
import torch
from peft import AutoPeftModelForCausalLM

# Load Peft model on CPU
model = AutoPeftModelForCausalLM.from_pretrained(
    "{output_dir}",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True
)

# Merge LoRA and base model and save
merged_model = model.merge_and_unload()
merged_model.save_pretrained("{output_dir}", safe_serialization=True, max_shard_size="2GB")
"""
    code_cell = nbf.v4.new_code_cell(code)
    cells.append(text_cell)
    cells.append(code_cell)


def push_merged_model_cells(cells: list, output_dir):
    text_cell = nbf.v4.new_markdown_cell(
        """### Push the Merged Model as well as the Tokenizer to the HF Hub""")

    code = f"""
merged_model.push_to_hub("{output_dir}", use_temp_dir=False)

tokenizer.push_to_hub("{output_dir}", use_temp_dir=False)
"""
    code_cell = nbf.v4.new_code_cell(code)
    cells.append(text_cell)
    cells.append(code_cell)
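All of the `create_*_cells` helpers follow one nbformat pattern: construct cells with `nbf.v4.new_markdown_cell` / `nbf.v4.new_code_cell`, append them to the notebook's `cells` list, and let the caller serialize with `nbf.write` (as app.py does). A minimal self-contained sketch of that flow; the output file name here is illustrative:

import nbformat as nbf

notebook = nbf.v4.new_notebook()

# Same shape as the helpers above: a markdown header followed by a code cell
notebook['cells'].append(nbf.v4.new_markdown_cell("### Hello"))
notebook['cells'].append(nbf.v4.new_code_cell("print('hello from a generated cell')"))

with open("example.ipynb", "w") as f:  # illustrative output path
    nbf.write(notebook, f)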