from typing import Set, Tuple

import gradio as gr
from gradio.components import Component

from utils import *


def add_quantization_components() -> Set[Component]:
    """Build the bitsandbytes quantization mode and 4-bit data type components."""
    q_components: Set[Component] = set()
    load_in_4bit = gr.Radio(["load_in_4bit", "load_in_8bit"],
                            value="load_in_4bit",
                            label="Quantization",
                            info="This flag is used to enable 4/8-bit quantization.",
                            interactive=True,
                            elem_id=LOAD_IN_4_BIT_ID)
    bnb_4bit_quant_type = gr.Radio(["fp4", "nf4"],
                                   label="bnb_4bit_quant_type",
                                   value="nf4",
                                   elem_id=BNB_4BIT_QUANT_TYPE,
                                   interactive=True,
                                   info="This sets the quantization data type in the "
                                        "bnb.nn.Linear4Bit layers.")
    q_components.add(load_in_4bit)
    q_components.add(bnb_4bit_quant_type)
    return q_components


def add_quantization_components1() -> Set[Component]:
    """Build the 4-bit compute dtype and nested (double) quantization components."""
    q_components: Set[Component] = set()
    bnb_4bit_compute_dtype = gr.Radio(["torch.float32", "torch.bfloat16", "torch.float16"],
                                      label="bnb_4bit_compute_dtype",
                                      info="This sets the computational type, which might differ "
                                           "from the input type.",
                                      elem_id=BNB_4BIT_COMPUTE_DTYPE,
                                      interactive=True,
                                      value="torch.bfloat16")
    bnb_4bit_use_double_quant = gr.Checkbox(label="bnb_4bit_use_double_quant",
                                            value=True,
                                            interactive=True,
                                            elem_id=BNB_4BIT_USE_DOUBLE_QUANT,
                                            info="This flag enables nested quantization, where the "
                                                 "quantization constants from the first quantization "
                                                 "are quantized again.")
    q_components.add(bnb_4bit_compute_dtype)
    q_components.add(bnb_4bit_use_double_quant)
    return q_components


def add_dataset_components() -> Set[Component]:
    """Build the dataset selection and shuffling seed components."""
    dataset_selection = gr.Dropdown([dt.path for dt in ft_datasets],
                                    elem_id=DATASET_SELECTION_ID,
                                    label="Select a Dataset",
                                    info="Choose a dataset (in the ChatML format) on which to "
                                         "finetune the model.")
    seed = gr.Slider(0, 256, step=1, value=42,
                     elem_id=DATASET_SHUFFLING_SEED,
                     label="Random Seed",
                     info="Set a random seed for shuffling the dataset.",
                     interactive=True)
    d_components: Set[Component] = set()
    d_components.add(dataset_selection)
    d_components.add(seed)
    return d_components


def add_pad_tokens() -> Set[Component]:
    """Build the tokenizer padding side and pad token components."""
    pad_token_side = gr.Radio(["right", "left"],
                              label="Tokenizer: padding_side",
                              info="The side on which the model should have padding applied.",
                              interactive=True,
                              value="right",
                              elem_id=PAD_SIDE_ID)
    pad_token_value = gr.Radio([None, "eos_token"],
                               label="Tokenizer: pad_token",
                               info="A special token used to make arrays of tokens the same size "
                                    "for batching purposes. It is then ignored by attention "
                                    "mechanisms and loss computation.",
                               interactive=True,
                               value=None,
                               elem_id=PAD_VALUE_ID)
    pad_components: Set[Component] = set()
    pad_components.add(pad_token_side)
    pad_components.add(pad_token_value)
    return pad_components


def add_lora_components() -> Set[Component]:
    """Build the LoRA rank and alpha components."""
    r = gr.Slider(1, 2048, step=1, value=6,
                  label="r",
                  info="LoRA attention dimension (the 'rank').",
                  interactive=True,
                  elem_id=LORA_R_ID)
    alpha = gr.Slider(1, 512, step=1, value=8,
                      label="lora_alpha",
                      info="The alpha parameter for LoRA scaling.",
                      interactive=True,
                      elem_id=LORA_ALPHA_ID)
    out_components: Set[Component] = set()
    out_components.add(r)
    out_components.add(alpha)
    return out_components


def add_lora_components1() -> Set[Component]:
    """Build the LoRA dropout and bias components."""
    dropout = gr.Slider(0, 1, step=0.01, value=0.05,
                        label="lora_dropout",
                        info="The dropout probability for LoRA layers.",
                        interactive=True,
                        elem_id=LORA_DROPOUT_ID)
    bias = gr.Radio(['none', 'all', 'lora_only'],
                    label="bias",
                    info="Bias type for LoRA. If 'all' or 'lora_only', the corresponding biases "
                         "will be updated during training.",
                    interactive=True,
                    value="none",
                    elem_id=LORA_BIAS_ID)
    out_components: Set[Component] = set()
    out_components.add(dropout)
    out_components.add(bias)
    return out_components


def add_training_args_1() -> Set[Component]:
    """Build the training epochs and max steps components."""
    epochs = gr.Slider(1, 100, step=1, value=3,
                       label="num_train_epochs",
                       info="Total number of training epochs to perform.",
                       interactive=True,
                       elem_id=NUM_TRAIN_EPOCHS_ID)
    max_steps = gr.Slider(-1, 100, step=1, value=-1,
                          label="max_steps",
                          info="Total number of training steps to perform. If set to a positive "
                               "number, it overrides 'num_train_epochs'.",
                          interactive=True,
                          elem_id=MAX_STEPS_ID)
    out_components: Set[Component] = set()
    out_components.add(epochs)
    out_components.add(max_steps)
    return out_components


def add_training_args_1_bis() -> Set[Component]:
    """Build the logging steps, per-device batch size, and save strategy components."""
    logging_steps = gr.Slider(1, 100, step=1, value=10,
                              label="logging_steps",
                              info="Number of update steps between two logs if "
                                   "logging_strategy='steps'.",
                              interactive=True,
                              elem_id=LOGGING_STEPS_ID)
    per_device_train_batch_size = gr.Slider(1, 64, step=1, value=3,
                                            label="per_device_train_batch_size",
                                            info="Batch size per device during training.",
                                            interactive=True,
                                            elem_id=PER_DEVICE_TRAIN_BATCH_SIZE)
    save_strategy = gr.Radio(['no', 'epoch', 'steps'],
                             label="save_strategy",
                             info="The checkpoint save strategy to adopt during training.",
                             interactive=True,
                             value="epoch",
                             elem_id=SAVE_STRATEGY_ID)
    out_components: Set[Component] = set()
    out_components.add(save_strategy)
    out_components.add(logging_steps)
    out_components.add(per_device_train_batch_size)
    return out_components


def add_training_args_3() -> Set[Component]:
    """Build the gradient clipping, warmup, accumulation, checkpointing, and LR scheduler components."""
    max_grad_norm = gr.Slider(0.01, 1, value=0.3,
                              label="max_grad_norm",
                              info="Maximum gradient norm (for gradient clipping).",
                              interactive=True,
                              elem_id=MAX_GRAD_NORM_ID)
    warmup_ratio = gr.Slider(0, 1, value=0.1,
                             label="warmup_ratio",
                             info="Ratio of total training steps used for a linear warmup from 0 "
                                  "to learning_rate.",
                             interactive=True,
                             elem_id=WARMUP_RATIO_ID)
    gradient_accumulation_steps = gr.Slider(1, 64, step=1, value=2,
                                            label="gradient_accumulation_steps",
                                            info="Number of update steps to accumulate the gradients "
                                                 "for before performing a backward/update pass.",
                                            interactive=True,
                                            elem_id=GRADIENT_ACCUMULATION_STEPS_ID)
    gradient_checkpointing = gr.Checkbox(label="gradient_checkpointing",
                                         value=True,
                                         interactive=True,
                                         info="Use gradient checkpointing to save memory at the "
                                              "expense of a slower backward pass.",
                                         elem_id=GRADIENT_CHECKPOINTING_ID)
    lr_scheduler_type = gr.Radio(['linear', 'constant', 'cosine'],
                                 label="lr_scheduler_type",
                                 info="The learning rate scheduler type to use.",
                                 interactive=True,
                                 value="cosine",
                                 elem_id=LR_SCHEDULER_TYPE_ID)
    out_components: Set[Component] = set()
    out_components.add(max_grad_norm)
    out_components.add(warmup_ratio)
    out_components.add(gradient_accumulation_steps)
    out_components.add(gradient_checkpointing)
    out_components.add(lr_scheduler_type)
    return out_components


def add_outputs() -> Tuple[Component, Component]:
    """Build the output directory and push-to-Hub components."""
    output_dir = gr.Textbox(interactive=True,
                            label="output_dir",
                            info="The output directory where the model and checkpoints will be saved.",
                            elem_id=OUTPUT_DIR_ID)
    push_to_hub = gr.Checkbox(label="Push to Hub",
                              value=False,
                              interactive=True,
                              info="Select this option to upload the trained model to the Hugging "
                                   "Face Hub after training. Note that if this option is selected, "
                                   "you must provide a valid 'HF_TOKEN' in the generated notebook.",
                              elem_id=PUSH_TO_HUB_ID)
    return output_dir, push_to_hub


def add_hf_repo_cmp() -> Component:
    """Build the Hugging Face repository name component."""
    repo_name = gr.Textbox(label="HF Repo name",
                           placeholder="username/your_repository",
                           info="Hugging Face repository to be created.",
                           interactive=True,
                           visible=False,
                           elem_id=REPOSITORY_NAME_ID)
    return repo_name


def add_outputs1() -> Set[Component]:
    """Build the reporting integration and README generation components."""
    report_to = gr.Dropdown(["azure_ml", "comet_ml", "mlflow", "tensorboard", "wandb", "all", "none"],
                            value="tensorboard",
                            elem_id=REPORT_TO_ID,
                            label="report_to",
                            info="The list of integrations to report the results and logs to. "
                                 "Supported platforms are 'azure_ml', 'comet_ml', 'mlflow', "
                                 "'tensorboard' and 'wandb'. Use 'all' to report to all installed "
                                 "integrations, 'none' for no integrations.")
    create_readme = gr.Checkbox(label="Automatically Generate a README.md",
                                value=True,
                                interactive=True,
                                info="Choose whether to automatically generate a model card "
                                     "(README.md) or not.",
                                elem_id=README_ID)
    out_components: Set[Component] = set()
    out_components.add(report_to)
    out_components.add(create_readme)
    return out_components


def add_optimizer() -> Set[Component]:
    """Build the AdamW beta1, beta2, and epsilon components."""
    adam_beta1 = gr.Slider(0.00001, 1, value=0.9,
                           label="adam_beta1",
                           info="The beta1 hyperparameter for the [`AdamW`] optimizer.",
                           interactive=True,
                           elem_id=BETA1_ID)
    adam_beta2 = gr.Slider(0.00001, 1, value=0.999,
                           label="adam_beta2",
                           info="The beta2 hyperparameter for the [`AdamW`] optimizer.",
                           interactive=True,
                           elem_id=BETA2_ID)
    adam_epsilon = gr.Slider(1e-9, 1, value=1e-8,
                             label="adam_epsilon",
                             info="The epsilon hyperparameter for the [`AdamW`] optimizer.",
                             interactive=True,
                             elem_id=EPSILON_ID)
    out_components: Set[Component] = set()
    out_components.add(adam_beta1)
    out_components.add(adam_beta2)
    out_components.add(adam_epsilon)
    return out_components


def add_optimizer1() -> Set[Component]:
    """Build the optimizer choice, learning rate, and weight decay components."""
    optimizer = gr.Dropdown(["adamw_hf", "adamw_torch", "adamw_torch_fused", "adamw_apex_fused",
                             "adamw_anyprecision", "adafactor"],
                            value="adamw_torch_fused",
                            elem_id=OPTIMIZER_ID,
                            label="optimizer",
                            info="The optimizer to use: 'adamw_hf', 'adamw_torch', "
                                 "'adamw_torch_fused', 'adamw_apex_fused', 'adamw_anyprecision' "
                                 "or 'adafactor'.")
    learning_rate = gr.Slider(1e-6, 1, step=0.001, value=2.0e-05,
                              label="learning_rate",
                              info="The initial learning rate for AdamW.",
                              interactive=True,
                              elem_id=LEARNING_RATE_ID)
    weight_decay = gr.Slider(0, 1, value=0,
                             label="weight_decay",
                             info="The weight decay to apply (if not zero) to all layers except "
                                  "all bias and LayerNorm weights in the [`AdamW`] optimizer.",
                             interactive=True,
                             elem_id=WEIGHT_DECAY_ID)
    out_components: Set[Component] = set()
    out_components.add(optimizer)
    out_components.add(learning_rate)
    out_components.add(weight_decay)
    return out_components


def add_sft_trainer_args() -> Set[Component]:
    """Build the SFTTrainer max sequence length and packing components."""
    max_seq_length = gr.Slider(512, 3072, value=2048,
                               label="max_seq_length",
                               info="The maximum sequence length to use for the "
                                    "`ConstantLengthDataset` and for automatically creating the "
                                    "Dataset.",
                               interactive=True,
                               elem_id=MAX_SEQ_LENGTH_ID)
    packing = gr.Checkbox(label="packing",
                          value=True,
                          interactive=True,
                          elem_id=PACKING_ID,
                          info="This argument is used by the `ConstantLengthDataset` to pack the "
                               "sequences of the dataset.")
    out_components: Set[Component] = set()
    out_components.add(max_seq_length)
    out_components.add(packing)
    return out_components