"""Gradio component builders for the fine-tuning configuration UI (quantization, dataset, LoRA, and training arguments)."""
from typing import Set, Tuple

import gradio as gr
from gradio.components import Component

from utils import *  # provides the elem_id constants (e.g. LOAD_IN_4_BIT_ID) and ft_datasets used below


def add_quantization_components() -> Set[Component]:
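    """Radio buttons for 4/8-bit loading and the bnb_4bit_quant_type."""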
    q_components: Set[Component] = set()
    load_in_4bit = gr.Radio(["load_in_4bit", "load_in_8bit"], value="load_in_4bit",
                            label="Quantization",
                            info="This flag is used to enable 4/8-bit "
                                 "quantization.",
                            interactive=True,
                            elem_id=LOAD_IN_4_BIT_ID)
    bnb_4bit_quant_type = gr.Radio(["fp4", "nf4"], label="bnb_4bit_quant_type",
                                   value="nf4",
                                   elem_id=BNB_4BIT_QUANT_TYPE,
                                   interactive=True,
                                   info="This sets the quantization data type in "
                                        "the bnb.nn.Linear4Bit "
                                        "layers.")
    q_components.add(load_in_4bit)
    q_components.add(bnb_4bit_quant_type)
    return q_components


def add_quantization_components1() -> Set[Component]:
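    """Radio for bnb_4bit_compute_dtype and checkbox for nested (double) quantization."""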
    q_components: Set[Component] = set()
    bnb_4bit_compute_dtype = gr.Radio(
        ["torch.float32", "torch.bfloat16", "torch.float16"],
        label="bnb_4bit_compute_dtype",
        info="This sets the computational type which might be different "
             "than the input type.",
        elem_id=BNB_4BIT_COMPUTE_DTYPE,
        interactive=True, value="torch.bfloat16")
    bnb_4bit_use_double_quant = gr.Checkbox(label="bnb_4bit_use_double_quant",
                                            value=True,
                                            interactive=True,
                                            elem_id=BNB_4BIT_USE_DOUBLE_QUANT,
                                            info="This flag is used for nested "
                                                 "quantization where the "
                                                 "quantization constants from "
                                                 "the first "
                                                 "quantization are quantized "
                                                 "again.")
    q_components.add(bnb_4bit_compute_dtype)
    q_components.add(bnb_4bit_use_double_quant)
    return q_components


def add_dataset_components() -> Set[Component]:
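    """Dropdown for dataset selection and a slider for the shuffling seed."""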
    dataset_selection = gr.Dropdown([dt.path for dt in ft_datasets],
                                    elem_id=DATASET_SELECTION_ID,
                                    label="Select a Dataset",
                                    info="Choose a dataset to finetune the model in the ChatML format."
                                    )

    seed = gr.Slider(0, 256, step=1, value=42, elem_id=DATASET_SHUFFLING_SEED, label="Random Seed",
                     info="Set a random seed for shuffling the dataset.", interactive=True)

    d_components: Set[Component] = set()
    d_components.add(dataset_selection)
    d_components.add(seed)
    return d_components


def add_pad_tokens() -> Set[Component]:
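    """Radios for the tokenizer's padding_side and pad_token settings."""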
    pad_token_side = gr.Radio(["right", "left"], label="Tokenizer: padding_side",
                              info="The side on which the model should have padding applied.",
                              interactive=True, value="right", elem_id=PAD_SIDE_ID)
    pad_token_value = gr.Radio([None, "eos_token"], label="Tokenizer: pad_token",
                               info="A special token used to make arrays of tokens the same size for batching "
                                    "purpose. Will then be "
                                    "ignored by attention mechanisms or loss computation.",
                               interactive=True, value=None, elem_id=PAD_VALUE_ID)
    pad_components: Set[Component] = set()
    pad_components.add(pad_token_side)
    pad_components.add(pad_token_value)
    return pad_components


def add_lora_components() -> Set[Component]:
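    """Sliders for the LoRA rank (r) and the lora_alpha scaling parameter."""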
    r = gr.Slider(1, 2048, step=1, value=6, label="r", info="Lora attention dimension (the 'rank').",
                  interactive=True, elem_id=LORA_R_ID)
    alpha = gr.Slider(1, 512, step=1, value=8, label="lora_alpha", info="The alpha parameter for Lora scaling.",
                      interactive=True, elem_id=LORA_ALPHA_ID)

    out_components: Set[Component] = set()
    out_components.add(r)
    out_components.add(alpha)
    return out_components


def add_lora_components1() -> Set[Component]:
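    """Slider for lora_dropout and radio for the LoRA bias type."""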
    dropout = gr.Slider(0, 1, step=0.01, value=0.05, label="lora_dropout",
                        info="The dropout probability for Lora layers.",
                        interactive=True, elem_id=LORA_DROPOUT_ID)
    bias = gr.Radio(['none', 'all', 'lora_only'], label="bias",
                    info="Bias type for LoRA. If 'all' or 'lora_only', the corresponding biases will be updated during "
                         "training.",
                    interactive=True, value="none", elem_id=LORA_BIAS_ID)

    out_components: Set[Component] = set()
    out_components.add(dropout)
    out_components.add(bias)
    return out_components


def add_training_args_1() -> Set[Component]:
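    """Sliders for num_train_epochs and max_steps."""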
    epochs = gr.Slider(1, 100, step=1, value=3, label="num_train_epochs",
                       info="Total number of training epochs to perform.",
                       interactive=True, elem_id=NUM_TRAIN_EPOCHS_ID)
    max_steps = gr.Slider(-1, 100, step=1, value=-1, label="max_steps",
                          info="Total number of training steps to perform. If set to a positive number it overrides "
                               "'num_train_epochs'.",
                          interactive=True, elem_id=MAX_STEPS_ID)
    out_components: Set[Component] = set()
    out_components.add(epochs)
    out_components.add(max_steps)
    return out_components


def add_training_args_1_bis() -> Set[Component]:
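    """Sliders for logging_steps and per_device_train_batch_size, plus the save_strategy radio."""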
    logging_steps = gr.Slider(1, 100, step=1, value=10, label="logging_steps",
                              info="Number of update steps between two logs if logging_strategy='steps'",
                              interactive=True, elem_id=LOGGING_STEPS_ID)
    per_device_train_batch_size = gr.Slider(1, 64, step=1, value=3, label="per_device_train_batch_size",
                                            info="Batch size per device during training.",
                                            interactive=True, elem_id=PER_DEVICE_TRAIN_BATCH_SIZE)
    save_strategy = gr.Radio(['no', 'epoch', 'steps'], label="save_strategy",
                             info="The checkpoint save strategy to adopt during training.",
                             interactive=True, value="epoch", elem_id=SAVE_STRATEGY_ID)
    out_components: Set[Component] = set()
    out_components.add(save_strategy)
    out_components.add(logging_steps)
    out_components.add(per_device_train_batch_size)
    return out_components


def add_training_args_3() -> Set[Component]:
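    """Gradient clipping, warmup, gradient accumulation, checkpointing, and LR scheduler controls."""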
    max_grad_norm = gr.Slider(0.01, 1, value=0.3, label="max_grad_norm",
                              info="Maximum gradient norm (for gradient clipping).",
                              interactive=True, elem_id=MAX_GRAD_NORM_ID)
    warmup_ratio = gr.Slider(0, 1, value=0.1, label="warmup_ratio",
                             info="Ratio of total training steps used for a linear warmup from 0 to learning_rate.",
                             interactive=True, elem_id=WARMUP_RATIO_ID)
    gradient_accumulation_steps = gr.Slider(1, 64, step=1, value=2, label="gradient_accumulation_steps",
                                            info="Number of updates steps to accumulate the gradients for, before "
                                                 "performing a backward/update "
                                                 "pass.",
                                            interactive=True, elem_id=GRADIENT_ACCUMULATION_STEPS_ID)
    gradient_checkpointing = gr.Checkbox(label="gradient_checkpointing", value=True, interactive=True,
                                         info="Use gradient checkpointing to save memory at the expense of slower "
                                              "backward pass.", elem_id=GRADIENT_CHECKPOINTING_ID)
    lr_scheduler_type = gr.Radio(['linear', 'constant', 'cosine'], label="lr_scheduler_type",
                                 info="The learning rate scheduler type to use.",
                                 interactive=True, value="cosine", elem_id=LR_SCHEDULER_TYPE_ID)

    out_components: Set[Component] = set()
    out_components.add(max_grad_norm)
    out_components.add(warmup_ratio)
    out_components.add(gradient_accumulation_steps)
    out_components.add(gradient_checkpointing)
    out_components.add(lr_scheduler_type)
    return out_components


def add_outputs() -> Tuple[Component, Component]:
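    """Textbox for output_dir and a checkbox to push the trained model to the Hugging Face Hub."""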
    output_dir = gr.Textbox(interactive=True,
                            label="output_dir",
                            info='The output directory where the model and checkpoints will be saved.',
                            elem_id=OUTPUT_DIR_ID)

    push_to_hub = gr.Checkbox(
        label="Push to Hub",
        value=False,
        interactive=True,
        info="Select this option if you want to upload the trained model to Hugging Face Hub after training. "
             "Please note, if this option is selected, you must provide a valid 'HF_TOKEN' in the generated notebook.",
        elem_id=PUSH_TO_HUB_ID
    )

    return output_dir, push_to_hub


def add_hf_repo_cmp() -> Component:
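    """Textbox (initially hidden) for the target Hugging Face repository name."""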
    repo_name = gr.Textbox(label="HF Repo name",
                           placeholder="username/your_repository",
                           info="Hugging Face repository to be created.",
                           interactive=True,
                           visible=False,
                           elem_id=REPOSITORY_NAME_ID)
    return repo_name


def add_outputs1() -> Set[Component]:
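    """Dropdown for report_to logging integrations and a checkbox to auto-generate a README.md model card."""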
    report_to = gr.Dropdown(
        ["azure_ml", "comet_ml", "mlflow", "tensorboard", "wandb", "all", 'none'],
        value="tensorboard",
        elem_id=REPORT_TO_ID,
        label="report_to",
        info="The list of integrations to report the results and logs to. Supported platforms are 'azure_ml', "
             "'comet_ml', 'mlflow', 'tensorboard' and 'wandb'. Use 'all' to report to all integrations installed, "
             "'none' for no integrations."
    )
    create_readme = gr.Checkbox(label="Automatically Generate a README.md", value=True, interactive=True,
                                info="Choose whether to automatically generate a model card (README.md) or not.",
                                elem_id=README_ID)

    out_components: Set[Component] = set()
    out_components.add(report_to)
    out_components.add(create_readme)
    return out_components


def add_optimizer() -> Set[Component]:
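    """Sliders for the AdamW beta1, beta2, and epsilon hyperparameters."""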
    adam_beta1 = gr.Slider(0.00001, 1, value=0.9, label="adam_beta1",
                           info="The beta1 hyperparameter for the [`AdamW`] optimizer.",
                           interactive=True, elem_id=BETA1_ID)
    adam_beta2 = gr.Slider(0.00001, 1, value=0.999, label="adam_beta2",
                           info="The beta2 hyperparameter for the [`AdamW`] optimizer.",
                           interactive=True, elem_id=BETA2_ID)
    adam_epsilon = gr.Slider(1e-9, 1, value=1e-8, label="adam_epsilon",
                             info="The epsilon hyperparameter for the [`AdamW`] optimizer.",
                             interactive=True, elem_id=EPSILON_ID)
    out_components: Set[Component] = set()
    out_components.add(adam_beta1)
    out_components.add(adam_beta2)
    out_components.add(adam_epsilon)
    return out_components


def add_optimizer1() -> Set[Component]:
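    """Dropdown for the optimizer implementation plus sliders for learning_rate and weight_decay."""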
    optimizer = gr.Dropdown(
        ["adamw_hf", "adamw_torch", "adamw_torch_fused", "adamw_apex_fused", "adamw_anyprecision", "adafactor"],
        value="adamw_torch_fused",
        elem_id=OPTIMIZER_ID,
        label="optimizer",
        info="The optimizer to use: 'adamw_hf', 'adamw_torch', 'adamw_torch_fused', 'adamw_apex_fused', "
             "'adamw_anyprecision' or "
             "'adafactor'. "
    )
    learning_rate = gr.Slider(1e-6, 1, step=0.001, value=2.0e-05, label="learning_rate",
                              info="The initial learning rate for AdamW.",
                              interactive=True, elem_id=LEARNING_RATE_ID)
    weight_decay = gr.Slider(0, 1, value=0, label="weight_decay",
                             info="The weight decay to apply (if not zero) to all layers except all bias and "
                                  "LayerNorm weights in [`AdamW`] optimizer.",
                             interactive=True, elem_id=WEIGHT_DECAY_ID)
    out_components: Set[Component] = set()
    out_components.add(optimizer)
    out_components.add(learning_rate)
    out_components.add(weight_decay)
    return out_components


def add_sft_trainer_args() -> Set[Component]:
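    """Slider for max_seq_length and checkbox for sequence packing (SFTTrainer arguments)."""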
    max_seq_length = gr.Slider(512, 3072, value=2048, label="max_seq_length",
                               info="The maximum sequence length to use for the `ConstantLengthDataset` and for "
                                    "automatically "
                                    "creating the Dataset.",
                               interactive=True, elem_id=MAX_SEQ_LENGTH_ID)
    packing = gr.Checkbox(label="packing", value=True, interactive=True, elem_id=PACKING_ID,
                          info="This argument is used by the `ConstantLengthDataset` to pack the sequences of the "
                               "dataset.")

    out_components: Set[Component] = set()
    out_components.add(max_seq_length)
    out_components.add(packing)
    return out_components