File size: 11,598 Bytes
72a0757
1
{"bf16":{"desc":null,"value":false},"bits":{"desc":null,"value":4},"fp16":{"desc":null,"value":false},"fsdp":{"desc":null,"value":"[]"},"seed":{"desc":null,"value":42},"tf32":{"desc":null,"value":"None"},"debug":{"desc":null,"value":"[]"},"optim":{"desc":null,"value":"paged_adamw_32bit"},"top_k":{"desc":null,"value":50},"top_p":{"desc":null,"value":1},"_wandb":{"desc":null,"value":{"m":[{"1":"train/global_step","6":[3]},{"1":"train/loss","5":1,"6":[1]},{"1":"train/learning_rate","5":1,"6":[1]},{"1":"train/epoch","5":1,"6":[1]}],"t":{"1":[1,2,3,5,11,12,49,51,53,55,71,98,100],"2":[1,2,3,5,11,12,49,51,53,55,71,98,100],"3":[7,23],"4":"3.10.12","5":"0.15.5","6":"4.31.0","8":[2,5,13]},"framework":"huggingface","start_time":1693023705.164511,"cli_version":"0.15.5","is_jupyter_run":false,"python_version":"3.10.12","is_kaggle_kernel":true,"huggingface_version":"4.31.0"}},"lora_r":{"desc":null,"value":128},"n_head":{"desc":null,"value":32},"prefix":{"desc":null,"value":null},"do_eval":{"desc":null,"value":false},"n_inner":{"desc":null,"value":null},"n_layer":{"desc":null,"value":30},"no_cuda":{"desc":null,"value":false},"adam8bit":{"desc":null,"value":false},"do_train":{"desc":null,"value":true},"id2label":{"desc":null,"value":{"0":"LABEL_0","1":"LABEL_1"}},"label2id":{"desc":null,"value":{"LABEL_0":0,"LABEL_1":1}},"run_name":{"desc":null,"value":"./outputs/"},"use_ipex":{"desc":null,"value":false},"adafactor":{"desc":null,"value":false},"cache_dir":{"desc":null,"value":"None"},"data_seed":{"desc":null,"value":"None"},"deepspeed":{"desc":null,"value":"None"},"do_sample":{"desc":null,"value":false},"hub_token":{"desc":null,"value":""},"log_level":{"desc":null,"value":"passive"},"max_steps":{"desc":null,"value":10000},"num_beams":{"desc":null,"value":1},"ray_scope":{"desc":null,"value":"last"},"report_to":{"desc":null,"value":"['wandb']"},"typical_p":{"desc":null,"value":1},"use_cache":{"desc":null,"value":false},"adam_beta1":{"desc":null,"value":0.9},"adam_beta2":{"desc":null,"value":0.999},"do_predict":{"desc":null,"value":false},"eval_delay":{"desc":null,"value":0},"eval_steps":{"desc":null,"value":"None"},"is_decoder":{"desc":null,"value":false},"local_rank":{"desc":null,"value":0},"lora_alpha":{"desc":null,"value":32},"max_length":{"desc":null,"value":20},"min_length":{"desc":null,"value":0},"mmlu_split":{"desc":null,"value":"eval"},"model_type":{"desc":null,"value":"bloom"},"optim_args":{"desc":null,"value":"None"},"output_dir":{"desc":null,"value":"./outputs/"},"past_index":{"desc":null,"value":-1},"quant_type":{"desc":null,"value":"nf4"},"save_steps":{"desc":null,"value":100},"vocab_size":{"desc":null,"value":250880},"ddp_backend":{"desc":null,"value":"None"},"ddp_timeout":{"desc":null,"value":1800},"fsdp_config":{"desc":null,"value":"{'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}"},"hidden_size":{"desc":null,"value":2560},"label_names":{"desc":null,"value":"None"},"logging_dir":{"desc":null,"value":"./outputs/runs/Aug26_04-17-30_74526ec53aa8"},"push_to_hub":{"desc":null,"value":false},"return_dict":{"desc":null,"value":true},"sharded_ddp":{"desc":null,"value":"[]"},"temperature":{"desc":null,"value":1},"torch_dtype":{"desc":null,"value":"float32"},"torchdynamo":{"desc":null,"value":"None"},"torchscript":{"desc":null,"value":false},"xpu_backend":{"desc":null,"value":"None"},"adam_epsilon":{"desc":null,"value":1e-8},"bos_token_id":{"desc":null,"value":1},"disable_tqdm":{"desc":null,"value":false},"do_mmlu_eval":{"desc":null,"value":false},"double_quant":{"desc":null,"value":true},"eos_token_id":{"desc":null,"value":2},"fp16_backend":{"desc":null,"value":"auto"},"hub_model_id":{"desc":null,"value":"None"},"hub_strategy":{"desc":null,"value":"every_save"},"lora_dropout":{"desc":null,"value":0.1},"mmlu_dataset":{"desc":null,"value":"mmlu-fs"},"offset_alibi":{"desc":null,"value":100},"pad_token_id":{"desc":null,"value":3},"problem_type":{"desc":null,"value":null},"pruned_heads":{"desc":null,"value":{}},"sep_token_id":{"desc":null,"value":null},"unk_token_id":{"desc":null,"value":0},"use_bfloat16":{"desc":null,"value":false},"warmup_ratio":{"desc":null,"value":0.03},"warmup_steps":{"desc":null,"value":0},"weight_decay":{"desc":null,"value":0},"_name_or_path":{"desc":null,"value":"bigscience/bloom-3b"},"architectures":{"desc":null,"value":["BloomForCausalLM"]},"bad_words_ids":{"desc":null,"value":null},"full_finetune":{"desc":null,"value":false},"jit_mode_eval":{"desc":null,"value":false},"learning_rate":{"desc":null,"value":0.0003},"logging_steps":{"desc":null,"value":20},"max_grad_norm":{"desc":null,"value":0.3},"max_memory_MB":{"desc":null,"value":16000},"mp_parameters":{"desc":null,"value":""},"output_scores":{"desc":null,"value":false},"save_strategy":{"desc":null,"value":"steps"},"skip_bias_add":{"desc":null,"value":true},"torch_compile":{"desc":null,"value":false},"tpu_num_cores":{"desc":null,"value":"None"},"bf16_full_eval":{"desc":null,"value":false},"early_stopping":{"desc":null,"value":false},"fp16_full_eval":{"desc":null,"value":false},"fp16_opt_level":{"desc":null,"value":"O1"},"hidden_dropout":{"desc":null,"value":0},"length_penalty":{"desc":null,"value":1},"pretraining_tp":{"desc":null,"value":1},"slow_but_exact":{"desc":null,"value":false},"tf_legacy_loss":{"desc":null,"value":false},"use_mps_device":{"desc":null,"value":false},"eval_batch_size":{"desc":null,"value":8},"finetuning_task":{"desc":null,"value":null},"group_by_length":{"desc":null,"value":true},"num_beam_groups":{"desc":null,"value":1},"sortish_sampler":{"desc":null,"value":false},"suppress_tokens":{"desc":null,"value":null},"tokenizer_class":{"desc":null,"value":null},"train_on_source":{"desc":null,"value":false},"full_determinism":{"desc":null,"value":false},"hub_private_repo":{"desc":null,"value":false},"ignore_data_skip":{"desc":null,"value":false},"log_on_each_node":{"desc":null,"value":true},"logging_strategy":{"desc":null,"value":"steps"},"max_mmlu_samples":{"desc":null,"value":"None"},"num_train_epochs":{"desc":null,"value":3},"save_safetensors":{"desc":null,"value":false},"save_total_limit":{"desc":null,"value":4},"train_batch_size":{"desc":null,"value":2},"attention_dropout":{"desc":null,"value":0},"ddp_bucket_cap_mb":{"desc":null,"value":"None"},"diversity_penalty":{"desc":null,"value":0},"generation_config":{"desc":null,"value":"{'max_length': 20, 'max_new_tokens': 256, 'min_length': 0, 'min_new_tokens': None, 'early_stopping': False, 'max_time': None, 'do_sample': False, 'num_beams': 1, 'num_beam_groups': 1, 'penalty_alpha': None, 'use_cache': True, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'epsilon_cutoff': 0.0, 'eta_cutoff': 0.0, 'diversity_penalty': 0.0, 'repetition_penalty': 1.0, 'encoder_repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'bad_words_ids': None, 'force_words_ids': None, 'renormalize_logits': False, 'constraints': None, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'forced_decoder_ids': None, 'sequence_bias': None, 'guidance_scale': None, 'num_return_sequences': 1, 'output_attentions': False, 'output_hidden_states': False, 'output_scores': False, 'return_dict_in_generate': False, 'pad_token_id': None, 'bos_token_id': None, 'eos_token_id': None, 'encoder_no_repeat_ngram_size': 0, 'decoder_start_token_id': None, 'generation_kwargs': {}, '_from_model_config': False, 'transformers_version': '4.31.0'}"},"greater_is_better":{"desc":null,"value":"None"},"initializer_range":{"desc":null,"value":0.02},"log_level_replica":{"desc":null,"value":"warning"},"lr_scheduler_type":{"desc":null,"value":"cosine"},"output_attentions":{"desc":null,"value":false},"push_to_hub_token":{"desc":null,"value":""},"save_on_each_node":{"desc":null,"value":false},"skip_bias_add_qkv":{"desc":null,"value":false},"tpu_metrics_debug":{"desc":null,"value":false},"is_encoder_decoder":{"desc":null,"value":false},"layer_norm_epsilon":{"desc":null,"value":0.00001},"length_column_name":{"desc":null,"value":"length"},"logging_first_step":{"desc":null,"value":false},"repetition_penalty":{"desc":null,"value":1},"torch_compile_mode":{"desc":null,"value":"None"},"add_cross_attention":{"desc":null,"value":false},"bias_dropout_fusion":{"desc":null,"value":true},"evaluation_strategy":{"desc":null,"value":"no"},"forced_bos_token_id":{"desc":null,"value":null},"forced_eos_token_id":{"desc":null,"value":null},"fsdp_min_num_params":{"desc":null,"value":0},"mmlu_source_max_len":{"desc":null,"value":2048},"quantization_config":{"desc":null,"value":{"load_in_4bit":true,"load_in_8bit":false,"llm_int8_threshold":6,"bnb_4bit_quant_type":"nf4","llm_int8_skip_modules":null,"bnb_4bit_compute_dtype":"float32","llm_int8_has_fp16_weight":false,"bnb_4bit_use_double_quant":true,"llm_int8_enable_fp32_cpu_offload":false}},"skip_memory_metrics":{"desc":null,"value":true},"tie_encoder_decoder":{"desc":null,"value":false},"tie_word_embeddings":{"desc":null,"value":true},"auto_find_batch_size":{"desc":null,"value":false},"dataloader_drop_last":{"desc":null,"value":false},"generation_num_beams":{"desc":null,"value":"None"},"no_repeat_ngram_size":{"desc":null,"value":0},"num_return_sequences":{"desc":null,"value":1},"output_hidden_states":{"desc":null,"value":false},"overwrite_output_dir":{"desc":null,"value":false},"prediction_loss_only":{"desc":null,"value":false},"push_to_hub_model_id":{"desc":null,"value":"None"},"task_specific_params":{"desc":null,"value":null},"transformers_version":{"desc":null,"value":"4.31.0"},"begin_suppress_tokens":{"desc":null,"value":null},"dataloader_pin_memory":{"desc":null,"value":true},"ddp_broadcast_buffers":{"desc":null,"value":"None"},"generation_max_length":{"desc":null,"value":"None"},"masked_softmax_fusion":{"desc":null,"value":true},"metric_for_best_model":{"desc":null,"value":"None"},"predict_with_generate":{"desc":null,"value":false},"remove_invalid_values":{"desc":null,"value":false},"remove_unused_columns":{"desc":null,"value":false},"torch_compile_backend":{"desc":null,"value":"None"},"dataloader_num_workers":{"desc":null,"value":0},"decoder_start_token_id":{"desc":null,"value":null},"gradient_checkpointing":{"desc":null,"value":true},"half_precision_backend":{"desc":null,"value":"auto"},"label_smoothing_factor":{"desc":null,"value":0},"load_best_model_at_end":{"desc":null,"value":false},"logging_nan_inf_filter":{"desc":null,"value":true},"resume_from_checkpoint":{"desc":null,"value":"None"},"chunk_size_feed_forward":{"desc":null,"value":0},"eval_accumulation_steps":{"desc":null,"value":"None"},"per_gpu_eval_batch_size":{"desc":null,"value":"None"},"return_dict_in_generate":{"desc":null,"value":false},"per_gpu_train_batch_size":{"desc":null,"value":"None"},"push_to_hub_organization":{"desc":null,"value":"None"},"attention_softmax_in_fp32":{"desc":null,"value":true},"ddp_find_unused_parameters":{"desc":null,"value":"None"},"include_inputs_for_metrics":{"desc":null,"value":false},"per_device_eval_batch_size":{"desc":null,"value":8},"use_legacy_prediction_loop":{"desc":null,"value":false},"cross_attention_hidden_size":{"desc":null,"value":null},"gradient_accumulation_steps":{"desc":null,"value":4},"per_device_train_batch_size":{"desc":null,"value":2},"encoder_no_repeat_ngram_size":{"desc":null,"value":0},"exponential_decay_length_penalty":{"desc":null,"value":null},"fsdp_transformer_layer_cls_to_wrap":{"desc":null,"value":"None"},"apply_residual_connection_post_layernorm":{"desc":null,"value":false}}