---
language: pt
library_name: peft
datasets: Weni/zeroshot-3.0.3
pipeline_tag: zero-shot-classification
training_arguments:
  output_dir: ./Zeroshot/01-12-23-NousResearch-Nous-Hermes-Llama2-13b_multilang-dataset-3.0.3-portuguese-2_epochs-10_batch_2/checkpoints/
  overwrite_output_dir: false
  do_train: false
  do_eval: true
  do_predict: false
  evaluation_strategy: epoch
  prediction_loss_only: false
  per_device_train_batch_size: 2
  per_device_eval_batch_size: 8
  gradient_accumulation_steps: 2
  eval_accumulation_steps: 1
  eval_delay: 0
  learning_rate: 0.0004
  weight_decay: 0.01
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_epsilon: 1.0e-08
  max_grad_norm: 0.3
  num_train_epochs: 10
  max_steps: -1
  lr_scheduler_type: cosine
  warmup_ratio: 0.1
  warmup_steps: 0
  log_level: passive
  log_level_replica: warning
  log_on_each_node: true
  logging_dir: ./Zeroshot/01-12-23-NousResearch-Nous-Hermes-Llama2-13b_multilang-dataset-3.0.3-portuguese-2_epochs-10_batch_2/checkpoints/runs/Dec01_21-53-07_fd10189bb234
  logging_strategy: steps
  logging_first_step: false
  logging_steps: 500
  logging_nan_inf_filter: true
  save_strategy: epoch
  save_steps: 500
  save_total_limit: 5
  save_safetensors: true
  save_on_each_node: false
  no_cuda: false
  use_mps_device: false
  seed: 42
  jit_mode_eval: false
  use_ipex: false
  bf16: false
  fp16: true
  fp16_opt_level: O1
  half_precision_backend: auto
  bf16_full_eval: false
  fp16_full_eval: false
  local_rank: 0
  tpu_metrics_debug: false
  debug: []
  dataloader_drop_last: false
  dataloader_num_workers: 0
  past_index: -1
  run_name: ./Zeroshot/01-12-23-NousResearch-Nous-Hermes-Llama2-13b_multilang-dataset-3.0.3-portuguese-2_epochs-10_batch_2/checkpoints/
  disable_tqdm: false
  remove_unused_columns: true
  load_best_model_at_end: true
  metric_for_best_model: eval_loss
  greater_is_better: false
  ignore_data_skip: false
  sharded_ddp: []
  fsdp: []
  fsdp_min_num_params: 0
  fsdp_config:
    fsdp_min_num_params: 0
    xla: false
    xla_fsdp_grad_ckpt: false
  label_smoothing_factor: 0.0
  optim: adamw_torch
  adafactor: false
  group_by_length: false
  length_column_name: length
  report_to:
  - tensorboard
  dataloader_pin_memory: true
  skip_memory_metrics: true
  use_legacy_prediction_loop: false
  push_to_hub: true
  hub_model_id: Weni/ZeroShot-2.2.1-Llama2-13b-Multilanguage-3.0.3
  hub_strategy: all_checkpoints
  hub_token: <HUB_TOKEN>
  hub_private_repo: false
  gradient_checkpointing: true
  include_inputs_for_metrics: false
  fp16_backend: auto
  push_to_hub_token: <PUSH_TO_HUB_TOKEN>
  mp_parameters: ''
  auto_find_batch_size: false
  full_determinism: false
  ray_scope: last
  ddp_timeout: 1800
  torch_compile: false
dataset:
  name: Weni/zeroshot-3.0.3
Training Procedure:
  Training Hyperparameters:
    output_dir: ./Zeroshot/01-12-23-NousResearch-Nous-Hermes-Llama2-13b_multilang-dataset-3.0.3-portuguese-2_epochs-10_batch_2/checkpoints/
    overwrite_output_dir: false
    do_train: false
    do_eval: true
    do_predict: false
    evaluation_strategy: epoch
    prediction_loss_only: false
    per_device_train_batch_size: 2
    per_device_eval_batch_size: 8
    gradient_accumulation_steps: 2
    eval_accumulation_steps: 1
    eval_delay: 0
    learning_rate: 0.0004
    weight_decay: 0.01
    adam_beta1: 0.9
    adam_beta2: 0.999
    adam_epsilon: 1.0e-08
    max_grad_norm: 0.3
    num_train_epochs: 10
    max_steps: -1
    lr_scheduler_type: cosine
    warmup_ratio: 0.1
    warmup_steps: 0
    log_level: passive
    log_level_replica: warning
    log_on_each_node: true
    logging_dir: ./Zeroshot/01-12-23-NousResearch-Nous-Hermes-Llama2-13b_multilang-dataset-3.0.3-portuguese-2_epochs-10_batch_2/checkpoints/runs/Dec01_21-53-07_fd10189bb234
    logging_strategy: steps
    logging_first_step: false
    logging_steps: 500
    logging_nan_inf_filter: true
    save_strategy: epoch
    save_steps: 500
    save_total_limit: 5
    save_safetensors: true
    save_on_each_node: false
    no_cuda: false
    use_mps_device: false
    seed: 42
    jit_mode_eval: false
    use_ipex: false
    bf16: false
    fp16: true
    fp16_opt_level: O1
    half_precision_backend: auto
    bf16_full_eval: false
    fp16_full_eval: false
    local_rank: 0
    tpu_metrics_debug: false
    debug: []
    dataloader_drop_last: false
    dataloader_num_workers: 0
    past_index: -1
    run_name: ./Zeroshot/01-12-23-NousResearch-Nous-Hermes-Llama2-13b_multilang-dataset-3.0.3-portuguese-2_epochs-10_batch_2/checkpoints/
    disable_tqdm: false
    remove_unused_columns: true
    load_best_model_at_end: true
    metric_for_best_model: eval_loss
    greater_is_better: false
    ignore_data_skip: false
    sharded_ddp: []
    fsdp: []
    fsdp_min_num_params: 0
    fsdp_config:
      fsdp_min_num_params: 0
      xla: false
      xla_fsdp_grad_ckpt: false
    label_smoothing_factor: 0.0
    optim: adamw_torch
    adafactor: false
    group_by_length: false
    length_column_name: length
    report_to:
    - tensorboard
    dataloader_pin_memory: true
    skip_memory_metrics: true
    use_legacy_prediction_loop: false
    push_to_hub: true
    hub_model_id: Weni/ZeroShot-2.2.1-Llama2-13b-Multilanguage-3.0.3
    hub_strategy: all_checkpoints
    hub_token: <HUB_TOKEN>
    hub_private_repo: false
    gradient_checkpointing: true
    include_inputs_for_metrics: false
    fp16_backend: auto
    push_to_hub_token: <PUSH_TO_HUB_TOKEN>
    mp_parameters: ''
    auto_find_batch_size: false
    full_determinism: false
    ray_scope: last
    ddp_timeout: 1800
    torch_compile: false
  Training data:
    name: Weni/zeroshot-3.0.3
  Training processing: 'dataset = dataset.shuffle(seed=55)

    dataset = dataset[''train''].train_test_split(test_size=0.1)'
training_regime: "### Training Hyperparameters- output_dir: ./Zeroshot/01-12-23-NousResearch-Nous-Hermes-Llama2-13b_multilang-dataset-3.0.3-portuguese-2_epochs-10_batch_2/checkpoints/\n\
  - overwrite_output_dir: False\n- do_train: False\n- do_eval: True\n- do_predict:\
  \ False\n- evaluation_strategy: epoch\n- prediction_loss_only: False\n- per_device_train_batch_size:\
  \ 2\n- per_device_eval_batch_size: 8\n- per_gpu_train_batch_size: None\n- per_gpu_eval_batch_size:\
  \ None\n- gradient_accumulation_steps: 2\n- eval_accumulation_steps: 1\n- eval_delay:\
  \ 0\n- learning_rate: 0.0004\n- weight_decay: 0.01\n- adam_beta1: 0.9\n- adam_beta2:\
  \ 0.999\n- adam_epsilon: 1e-08\n- max_grad_norm: 0.3\n- num_train_epochs: 10\n-\
  \ max_steps: -1\n- lr_scheduler_type: cosine\n- warmup_ratio: 0.1\n- warmup_steps:\
  \ 0\n- log_level: passive\n- log_level_replica: warning\n- log_on_each_node: True\n\
  - logging_dir: ./Zeroshot/01-12-23-NousResearch-Nous-Hermes-Llama2-13b_multilang-dataset-3.0.3-portuguese-2_epochs-10_batch_2/checkpoints/runs/Dec01_21-53-07_fd10189bb234\n\
  - logging_strategy: steps\n- logging_first_step: False\n- logging_steps: 500\n-\
  \ logging_nan_inf_filter: True\n- save_strategy: epoch\n- save_steps: 500\n- save_total_limit:\
  \ 5\n- save_safetensors: True\n- save_on_each_node: False\n- no_cuda: False\n- use_mps_device:\
  \ False\n- seed: 42\n- data_seed: None\n- jit_mode_eval: False\n- use_ipex: False\n\
  - bf16: False\n- fp16: True\n- fp16_opt_level: O1\n- half_precision_backend: auto\n\
  - bf16_full_eval: False\n- fp16_full_eval: False\n- tf32: None\n- local_rank: 0\n\
  - ddp_backend: None\n- tpu_num_cores: None\n- tpu_metrics_debug: False\n- debug:\
  \ []\n- dataloader_drop_last: False\n- eval_steps: None\n- dataloader_num_workers:\
  \ 0\n- past_index: -1\n- run_name: ./Zeroshot/01-12-23-NousResearch-Nous-Hermes-Llama2-13b_multilang-dataset-3.0.3-portuguese-2_epochs-10_batch_2/checkpoints/\n\
  - disable_tqdm: False\n- remove_unused_columns: True\n- label_names: None\n- load_best_model_at_end:\
  \ True\n- metric_for_best_model: eval_loss\n- greater_is_better: False\n- ignore_data_skip:\
  \ False\n- sharded_ddp: []\n- fsdp: []\n- fsdp_min_num_params: 0\n- fsdp_config:\
  \ {'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}\n- fsdp_transformer_layer_cls_to_wrap:\
  \ None\n- deepspeed: None\n- label_smoothing_factor: 0.0\n- optim: adamw_torch\n\
  - optim_args: None\n- adafactor: False\n- group_by_length: False\n- length_column_name:\
  \ length\n- report_to: ['tensorboard']\n- ddp_find_unused_parameters: None\n- ddp_bucket_cap_mb:\
  \ None\n- ddp_broadcast_buffers: None\n- dataloader_pin_memory: True\n- skip_memory_metrics:\
  \ True\n- use_legacy_prediction_loop: False\n- push_to_hub: True\n- resume_from_checkpoint:\
  \ None\n- hub_model_id: Weni/ZeroShot-2.2.1-Llama2-13b-Multilanguage-3.0.3\n- hub_strategy:\
  \ all_checkpoints\n- hub_token: <HUB_TOKEN>\n- hub_private_repo: False\n- gradient_checkpointing:\
  \ True\n- include_inputs_for_metrics: False\n- fp16_backend: auto\n- push_to_hub_model_id:\
  \ None\n- push_to_hub_organization: None\n- push_to_hub_token: <PUSH_TO_HUB_TOKEN>\n\
  - mp_parameters: \n- auto_find_batch_size: False\n- full_determinism: False\n- torchdynamo:\
  \ None\n- ray_scope: last\n- ddp_timeout: 1800\n- torch_compile: False\n- torch_compile_backend:\
  \ None\n- torch_compile_mode: None\n- xpu_backend: None"
training_data:
  name: Weni/zeroshot-3.0.3
preprocessing: 'dataset = dataset.shuffle(seed=55)

  dataset = dataset[''train''].train_test_split(test_size=0.1)'
base_model: NousResearch/Nous-Hermes-Llama2-13b
---

## Training Hyperparameters
- evaluation_strategy: epoch
- prediction_loss_only: False
- per_device_train_batch_size: 2
- per_device_eval_batch_size: 8
- per_gpu_train_batch_size: None
- per_gpu_eval_batch_size: None
- gradient_accumulation_steps: 2
- eval_accumulation_steps: 1
- eval_delay: 0
- learning_rate: 0.0004
- weight_decay: 0.01
- adam_beta1: 0.9
- adam_beta2: 0.999
- adam_epsilon: 1e-08
- max_grad_norm: 0.3
- num_train_epochs: 10
- max_steps: -1
- lr_scheduler_type: cosine
- warmup_ratio: 0.1
- warmup_steps: 0
- log_level: passive
- log_level_replica: warning
- log_on_each_node: True
- logging_strategy: steps
- logging_first_step: False
- logging_steps: 500
- logging_nan_inf_filter: True
- save_strategy: epoch
- save_steps: 500
- save_total_limit: 5
- save_safetensors: True
- save_on_each_node: False
- no_cuda: False
- use_mps_device: False
- seed: 42
- data_seed: None
- jit_mode_eval: False
- use_ipex: False
- bf16: False
- fp16: True
- fp16_opt_level: O1
- half_precision_backend: auto
- bf16_full_eval: False
- fp16_full_eval: False
- tf32: None
- local_rank: 0
- ddp_backend: None
- tpu_num_cores: None
- tpu_metrics_debug: False
- debug: []
- dataloader_drop_last: False
- eval_steps: None
- dataloader_num_workers: 0
- past_index: -1
- remove_unused_columns: True
- label_names: None
- load_best_model_at_end: True
- metric_for_best_model: eval_loss
- greater_is_better: False
- ignore_data_skip: False
- sharded_ddp: []
- fsdp: []
- fsdp_min_num_params: 0
- fsdp_config: {'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}
- fsdp_transformer_layer_cls_to_wrap: None
- deepspeed: None
- label_smoothing_factor: 0.0
- optim: adamw_torch
- optim_args: None
- adafactor: False
- group_by_length: False
- ddp_find_unused_parameters: None
- ddp_bucket_cap_mb: None
- ddp_broadcast_buffers: None
- dataloader_pin_memory: True
- skip_memory_metrics: True
- use_legacy_prediction_loop: False
- push_to_hub: True
- resume_from_checkpoint: None
- hub_strategy: all_checkpoints
- gradient_checkpointing: True
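
The list above is auto-generated from the training run. As a minimal sketch (not the original training script), the key values map onto a `transformers.TrainingArguments` object roughly as follows; the `output_dir` below is a placeholder, not the original checkpoint path:

```python
from transformers import TrainingArguments

# Illustrative reconstruction of a subset of the hyperparameters listed above.
training_args = TrainingArguments(
    output_dir="./checkpoints",          # placeholder path
    evaluation_strategy="epoch",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=4e-4,
    weight_decay=0.01,
    max_grad_norm=0.3,
    num_train_epochs=10,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    fp16=True,
    save_strategy="epoch",
    save_total_limit=5,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    gradient_checkpointing=True,
    optim="adamw_torch",
    report_to=["tensorboard"],
    seed=42,
)
```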
    
## Training procedure


The following `bitsandbytes` quantization config was used during training:
- load_in_8bit: False
- load_in_4bit: True
- llm_int8_threshold: 6.0
- llm_int8_skip_modules: None
- llm_int8_enable_fp32_cpu_offload: False
- llm_int8_has_fp16_weight: False
- bnb_4bit_quant_type: nf4
- bnb_4bit_use_double_quant: True
- bnb_4bit_compute_dtype: bfloat16
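
For reference, this corresponds roughly to the `transformers.BitsAndBytesConfig` sketched below (illustrative only; the exact training code is not part of this card):

```python
import torch
from transformers import BitsAndBytesConfig

# 4-bit NF4 quantization with double quantization and bfloat16 compute,
# mirroring the values listed above.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
```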

### Framework versions


- PEFT 0.4.0
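
A minimal sketch of loading the adapter on top of the base model named in the metadata above (assumes `peft`, `transformers`, and `bitsandbytes` are installed; `device_map="auto"` is a convenience choice, not taken from the original setup):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

base_id = "NousResearch/Nous-Hermes-Llama2-13b"
adapter_id = "Weni/ZeroShot-2.2.1-Llama2-13b-Multilanguage-3.0.3"

# Load the base model in 4-bit to mirror the training-time quantization config.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(base_id)
base_model = AutoModelForCausalLM.from_pretrained(
    base_id, quantization_config=bnb_config, device_map="auto"
)

# Attach the LoRA adapter published with this card.
model = PeftModel.from_pretrained(base_model, adapter_id)
model.eval()
```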