from typing import Set, Tuple

import gradio as gr
from gradio.components import Component

from utils import *


def add_quantization_components() -> Set[Component]:
    """Build the bitsandbytes quantization mode and 4-bit data type components."""
    q_components: Set[Component] = set()
    load_in_4bit = gr.Radio(["load_in_4bit", "load_in_8bit"],
                            value="load_in_4bit",
                            label="Quantization",
                            info="This flag is used to enable 4/8-bit quantization.",
                            interactive=True,
                            elem_id=LOAD_IN_4_BIT_ID)
    bnb_4bit_quant_type = gr.Radio(["fp4", "nf4"],
                                   label="bnb_4bit_quant_type",
                                   value="nf4",
                                   elem_id=BNB_4BIT_QUANT_TYPE,
                                   interactive=True,
                                   info="This sets the quantization data type in the "
                                        "bnb.nn.Linear4Bit layers.")
    q_components.add(load_in_4bit)
    q_components.add(bnb_4bit_quant_type)
    return q_components


def add_quantization_components1() -> Set[Component]:
    """Build the 4-bit compute dtype and nested (double) quantization components."""
    q_components: Set[Component] = set()
    bnb_4bit_compute_dtype = gr.Radio(["torch.float32", "torch.bfloat16", "torch.float16"],
                                      label="bnb_4bit_compute_dtype",
                                      info="This sets the computational type, which might differ "
                                           "from the input type.",
                                      elem_id=BNB_4BIT_COMPUTE_DTYPE,
                                      interactive=True,
                                      value="torch.bfloat16")
    bnb_4bit_use_double_quant = gr.Checkbox(label="bnb_4bit_use_double_quant",
                                            value=True,
                                            interactive=True,
                                            elem_id=BNB_4BIT_USE_DOUBLE_QUANT,
                                            info="This flag enables nested quantization, where the "
                                                 "quantization constants from the first quantization "
                                                 "are quantized again.")
    q_components.add(bnb_4bit_compute_dtype)
    q_components.add(bnb_4bit_use_double_quant)
    return q_components


def add_dataset_components() -> Set[Component]:
    """Build the dataset selection and shuffling seed components."""
    dataset_selection = gr.Dropdown([dt.path for dt in ft_datasets],
                                    elem_id=DATASET_SELECTION_ID,
                                    label="Select a Dataset",
                                    info="Choose a dataset (in the ChatML format) on which to "
                                         "finetune the model.")
    seed = gr.Slider(0, 256, step=1, value=42,
                     elem_id=DATASET_SHUFFLING_SEED,
                     label="Random Seed",
                     info="Set a random seed for shuffling the dataset.",
                     interactive=True)
    d_components: Set[Component] = set()
    d_components.add(dataset_selection)
    d_components.add(seed)
    return d_components


def add_pad_tokens() -> Set[Component]:
    """Build the tokenizer padding side and pad token components."""
    pad_token_side = gr.Radio(["right", "left"],
                              label="Tokenizer: padding_side",
                              info="The side on which the model should have padding applied.",
                              interactive=True,
                              value="right",
                              elem_id=PAD_SIDE_ID)
    pad_token_value = gr.Radio([None, "eos_token"],
                               label="Tokenizer: pad_token",
                               info="A special token used to make arrays of tokens the same size "
                                    "for batching purposes. It is then ignored by attention "
                                    "mechanisms and loss computation.",
                               interactive=True,
                               value=None,
                               elem_id=PAD_VALUE_ID)
    pad_components: Set[Component] = set()
    pad_components.add(pad_token_side)
    pad_components.add(pad_token_value)
    return pad_components


def add_lora_components() -> Set[Component]:
    """Build the LoRA rank and alpha components."""
    r = gr.Slider(1, 2048, step=1, value=6,
                  label="r",
                  info="LoRA attention dimension (the 'rank').",
                  interactive=True,
                  elem_id=LORA_R_ID)
    alpha = gr.Slider(1, 512, step=1, value=8,
                      label="lora_alpha",
                      info="The alpha parameter for LoRA scaling.",
                      interactive=True,
                      elem_id=LORA_ALPHA_ID)
    out_components: Set[Component] = set()
    out_components.add(r)
    out_components.add(alpha)
    return out_components


def add_lora_components1() -> Set[Component]:
    """Build the LoRA dropout and bias components."""
    dropout = gr.Slider(0, 1, step=0.01, value=0.05,
                        label="lora_dropout",
                        info="The dropout probability for LoRA layers.",
                        interactive=True,
                        elem_id=LORA_DROPOUT_ID)
    bias = gr.Radio(['none', 'all', 'lora_only'],
                    label="bias",
                    info="Bias type for LoRA. If 'all' or 'lora_only', the corresponding biases "
                         "will be updated during training.",
                    interactive=True,
                    value="none",
                    elem_id=LORA_BIAS_ID)
    out_components: Set[Component] = set()
    out_components.add(dropout)
    out_components.add(bias)
    return out_components


def add_training_args_1() -> Set[Component]:
    """Build the training epochs and max steps components."""
    epochs = gr.Slider(1, 100, step=1, value=3,
                       label="num_train_epochs",
                       info="Total number of training epochs to perform.",
                       interactive=True,
                       elem_id=NUM_TRAIN_EPOCHS_ID)
    max_steps = gr.Slider(-1, 100, step=1, value=-1,
                          label="max_steps",
                          info="Total number of training steps to perform. If set to a positive "
                               "number, it overrides 'num_train_epochs'.",
                          interactive=True,
                          elem_id=MAX_STEPS_ID)
    out_components: Set[Component] = set()
    out_components.add(epochs)
    out_components.add(max_steps)
    return out_components


def add_training_args_1_bis() -> Set[Component]:
    """Build the logging steps, per-device batch size, and save strategy components."""
    logging_steps = gr.Slider(1, 100, step=1, value=10,
                              label="logging_steps",
                              info="Number of update steps between two logs if "
                                   "logging_strategy='steps'.",
                              interactive=True,
                              elem_id=LOGGING_STEPS_ID)
    per_device_train_batch_size = gr.Slider(1, 64, step=1, value=3,
                                            label="per_device_train_batch_size",
                                            info="Batch size per device during training.",
                                            interactive=True,
                                            elem_id=PER_DEVICE_TRAIN_BATCH_SIZE)
    save_strategy = gr.Radio(['no', 'epoch', 'steps'],
                             label="save_strategy",
                             info="The checkpoint save strategy to adopt during training.",
                             interactive=True,
                             value="epoch",
                             elem_id=SAVE_STRATEGY_ID)
    out_components: Set[Component] = set()
    out_components.add(save_strategy)
    out_components.add(logging_steps)
    out_components.add(per_device_train_batch_size)
    return out_components


def add_training_args_3() -> Set[Component]:
    """Build the gradient clipping, warmup, accumulation, checkpointing, and LR scheduler components."""
    max_grad_norm = gr.Slider(0.01, 1, value=0.3,
                              label="max_grad_norm",
                              info="Maximum gradient norm (for gradient clipping).",
                              interactive=True,
                              elem_id=MAX_GRAD_NORM_ID)
    warmup_ratio = gr.Slider(0, 1, value=0.1,
                             label="warmup_ratio",
                             info="Ratio of total training steps used for a linear warmup from 0 "
                                  "to learning_rate.",
                             interactive=True,
                             elem_id=WARMUP_RATIO_ID)
    gradient_accumulation_steps = gr.Slider(1, 64, step=1, value=2,
                                            label="gradient_accumulation_steps",
                                            info="Number of update steps to accumulate the gradients "
                                                 "for before performing a backward/update pass.",
                                            interactive=True,
                                            elem_id=GRADIENT_ACCUMULATION_STEPS_ID)
    gradient_checkpointing = gr.Checkbox(label="gradient_checkpointing",
                                         value=True,
                                         interactive=True,
                                         info="Use gradient checkpointing to save memory at the "
                                              "expense of a slower backward pass.",
                                         elem_id=GRADIENT_CHECKPOINTING_ID)
    lr_scheduler_type = gr.Radio(['linear', 'constant', 'cosine'],
                                 label="lr_scheduler_type",
                                 info="The learning rate scheduler type to use.",
                                 interactive=True,
                                 value="cosine",
                                 elem_id=LR_SCHEDULER_TYPE_ID)
    out_components: Set[Component] = set()
    out_components.add(max_grad_norm)
    out_components.add(warmup_ratio)
    out_components.add(gradient_accumulation_steps)
    out_components.add(gradient_checkpointing)
    out_components.add(lr_scheduler_type)
    return out_components


def add_outputs() -> Tuple[Component, Component]:
    """Build the output directory and push-to-Hub components."""
    output_dir = gr.Textbox(interactive=True,
                            label="output_dir",
                            info="The output directory where the model and checkpoints will be saved.",
                            elem_id=OUTPUT_DIR_ID)
    push_to_hub = gr.Checkbox(label="Push to Hub",
                              value=False,
                              interactive=True,
                              info="Select this option to upload the trained model to the Hugging "
                                   "Face Hub after training. Note that if this option is selected, "
                                   "you must provide a valid 'HF_TOKEN' in the generated notebook.",
                              elem_id=PUSH_TO_HUB_ID)
    return output_dir, push_to_hub


def add_hf_repo_cmp() -> Component:
    """Build the Hugging Face repository name component."""
    repo_name = gr.Textbox(label="HF Repo name",
                           placeholder="username/your_repository",
                           info="Hugging Face repository to be created.",
                           interactive=True,
                           visible=False,
                           elem_id=REPOSITORY_NAME_ID)
    return repo_name


def add_outputs1() -> Set[Component]:
    """Build the reporting integration and README generation components."""
    report_to = gr.Dropdown(["azure_ml", "comet_ml", "mlflow", "tensorboard", "wandb", "all", "none"],
                            value="tensorboard",
                            elem_id=REPORT_TO_ID,
                            label="report_to",
                            info="The list of integrations to report the results and logs to. "
                                 "Supported platforms are 'azure_ml', 'comet_ml', 'mlflow', "
                                 "'tensorboard' and 'wandb'. Use 'all' to report to all installed "
                                 "integrations, 'none' for no integrations.")
    create_readme = gr.Checkbox(label="Automatically Generate a README.md",
                                value=True,
                                interactive=True,
                                info="Choose whether to automatically generate a model card "
                                     "(README.md) or not.",
                                elem_id=README_ID)
    out_components: Set[Component] = set()
    out_components.add(report_to)
    out_components.add(create_readme)
    return out_components


def add_optimizer() -> Set[Component]:
    """Build the AdamW beta1, beta2, and epsilon components."""
    adam_beta1 = gr.Slider(0.00001, 1, value=0.9,
                           label="adam_beta1",
                           info="The beta1 hyperparameter for the [`AdamW`] optimizer.",
                           interactive=True,
                           elem_id=BETA1_ID)
    adam_beta2 = gr.Slider(0.00001, 1, value=0.999,
                           label="adam_beta2",
                           info="The beta2 hyperparameter for the [`AdamW`] optimizer.",
                           interactive=True,
                           elem_id=BETA2_ID)
    adam_epsilon = gr.Slider(1e-9, 1, value=1e-8,
                             label="adam_epsilon",
                             info="The epsilon hyperparameter for the [`AdamW`] optimizer.",
                             interactive=True,
                             elem_id=EPSILON_ID)
    out_components: Set[Component] = set()
    out_components.add(adam_beta1)
    out_components.add(adam_beta2)
    out_components.add(adam_epsilon)
    return out_components


def add_optimizer1() -> Set[Component]:
    """Build the optimizer choice, learning rate, and weight decay components."""
    optimizer = gr.Dropdown(["adamw_hf", "adamw_torch", "adamw_torch_fused", "adamw_apex_fused",
                             "adamw_anyprecision", "adafactor"],
                            value="adamw_torch_fused",
                            elem_id=OPTIMIZER_ID,
                            label="optimizer",
                            info="The optimizer to use: 'adamw_hf', 'adamw_torch', "
                                 "'adamw_torch_fused', 'adamw_apex_fused', 'adamw_anyprecision' "
                                 "or 'adafactor'.")
    learning_rate = gr.Slider(1e-6, 1, step=0.001, value=2.0e-05,
                              label="learning_rate",
                              info="The initial learning rate for AdamW.",
                              interactive=True,
                              elem_id=LEARNING_RATE_ID)
    weight_decay = gr.Slider(0, 1, value=0,
                             label="weight_decay",
                             info="The weight decay to apply (if not zero) to all layers except "
                                  "all bias and LayerNorm weights in the [`AdamW`] optimizer.",
                             interactive=True,
                             elem_id=WEIGHT_DECAY_ID)
    out_components: Set[Component] = set()
    out_components.add(optimizer)
    out_components.add(learning_rate)
    out_components.add(weight_decay)
    return out_components


def add_sft_trainer_args() -> Set[Component]:
    """Build the SFTTrainer max sequence length and packing components."""
    max_seq_length = gr.Slider(512, 3072, value=2048,
                               label="max_seq_length",
                               info="The maximum sequence length to use for the "
                                    "`ConstantLengthDataset` and for automatically creating the "
                                    "Dataset.",
                               interactive=True,
                               elem_id=MAX_SEQ_LENGTH_ID)
    packing = gr.Checkbox(label="packing",
                          value=True,
                          interactive=True,
                          elem_id=PACKING_ID,
                          info="This argument is used by the `ConstantLengthDataset` to pack the "
                               "sequences of the dataset.")
    out_components: Set[Component] = set()
    out_components.add(max_seq_length)
    out_components.add(packing)
    return out_components