beamaia committed
Commit dae1c45
1 Parent(s): a285e74

Update README.md

Files changed (1)
  1. README.md +86 -0
README.md CHANGED
@@ -242,6 +242,90 @@ preprocessing: 'dataset = dataset.shuffle(seed=55)
 dataset = dataset[''train''].train_test_split(test_size=0.1)'
 base_model: NousResearch/Nous-Hermes-Llama2-13b
 ---
+
+ ## Training Hyperparameters
+ - evaluation_strategy: epoch
+ - prediction_loss_only: False
+ - per_device_train_batch_size: 2
+ - per_device_eval_batch_size: 8
+ - per_gpu_train_batch_size: None
+ - per_gpu_eval_batch_size: None
+ - gradient_accumulation_steps: 2
+ - eval_accumulation_steps: 1
+ - eval_delay: 0
+ - learning_rate: 0.0004
+ - weight_decay: 0.01
+ - adam_beta1: 0.9
+ - adam_beta2: 0.999
+ - adam_epsilon: 1e-08
+ - max_grad_norm: 0.3
+ - num_train_epochs: 10
+ - max_steps: -1
+ - lr_scheduler_type: cosine
+ - warmup_ratio: 0.1
+ - warmup_steps: 0
+ - log_level: passive
+ - log_level_replica: warning
+ - log_on_each_node: True
+ - logging_strategy: steps
+ - logging_first_step: False
+ - logging_steps: 500
+ - logging_nan_inf_filter: True
+ - save_strategy: epoch
+ - save_steps: 500
+ - save_total_limit: 5
+ - save_safetensors: True
+ - save_on_each_node: False
+ - no_cuda: False
+ - use_mps_device: False
+ - seed: 42
+ - data_seed: None
+ - jit_mode_eval: False
+ - use_ipex: False
+ - bf16: False
+ - fp16: True
+ - fp16_opt_level: O1
+ - half_precision_backend: auto
+ - bf16_full_eval: False
+ - fp16_full_eval: False
+ - tf32: None
+ - local_rank: 0
+ - ddp_backend: None
+ - tpu_num_cores: None
+ - tpu_metrics_debug: False
+ - debug: []
+ - dataloader_drop_last: False
+ - eval_steps: None
+ - dataloader_num_workers: 0
+ - past_index: -1
+ - remove_unused_columns: True
+ - label_names: None
+ - load_best_model_at_end: True
+ - metric_for_best_model: eval_loss
+ - greater_is_better: False
+ - ignore_data_skip: False
+ - sharded_ddp: []
+ - fsdp: []
+ - fsdp_min_num_params: 0
+ - fsdp_config: {'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}
+ - fsdp_transformer_layer_cls_to_wrap: None
+ - deepspeed: None
+ - label_smoothing_factor: 0.0
+ - optim: adamw_torch
+ - optim_args: None
+ - adafactor: False
+ - group_by_length: False
+ - ddp_find_unused_parameters: None
+ - ddp_bucket_cap_mb: None
+ - ddp_broadcast_buffers: None
+ - dataloader_pin_memory: True
+ - skip_memory_metrics: True
+ - use_legacy_prediction_loop: False
+ - push_to_hub: True
+ - resume_from_checkpoint: None
+ - hub_strategy: all_checkpoints
+ - gradient_checkpointing: True
+
 ## Training procedure

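For reference, the hyperparameters added above map almost one-to-one onto `transformers.TrainingArguments`. The snippet below is only an illustrative sketch, not part of the commit or of the original training code: it restates a subset of the listed values, and the `output_dir` name is a placeholder.

```python
# Illustrative sketch only: a subset of the hyperparameters listed in this commit,
# expressed as a transformers.TrainingArguments object.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="outputs",               # placeholder, not specified in the diff
    evaluation_strategy="epoch",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    eval_accumulation_steps=1,
    learning_rate=4e-4,                 # 0.0004
    weight_decay=0.01,
    max_grad_norm=0.3,
    num_train_epochs=10,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    logging_steps=500,
    save_strategy="epoch",
    save_total_limit=5,
    seed=42,
    fp16=True,
    optim="adamw_torch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    gradient_checkpointing=True,
    push_to_hub=True,
    hub_strategy="all_checkpoints",
)
```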
 
 
@@ -255,6 +339,8 @@ The following `bitsandbytes` quantization config was used during training:
 - bnb_4bit_quant_type: nf4
 - bnb_4bit_use_double_quant: True
 - bnb_4bit_compute_dtype: bfloat16
+
+ -
 ### Framework versions

346