|
--- |
|
language: pt |
|
library_name: peft |
|
datasets:
- Weni/zeroshot-3.0.3
|
pipeline_tag: zero-shot-classification |
|
training_arguments: |
|
output_dir: ./Zeroshot/01-12-23-NousResearch-Nous-Hermes-Llama2-13b_multilang-dataset-3.0.3-portuguese-2_epochs-10_batch_2/checkpoints/ |
|
overwrite_output_dir: false |
|
do_train: false |
|
do_eval: true |
|
do_predict: false |
|
evaluation_strategy: epoch |
|
prediction_loss_only: false |
|
per_device_train_batch_size: 2 |
|
per_device_eval_batch_size: 8 |
|
gradient_accumulation_steps: 2 |
|
eval_accumulation_steps: 1 |
|
eval_delay: 0 |
|
learning_rate: 0.0004 |
|
weight_decay: 0.01 |
|
adam_beta1: 0.9 |
|
adam_beta2: 0.999 |
|
adam_epsilon: 1.0e-08 |
|
max_grad_norm: 0.3 |
|
num_train_epochs: 10 |
|
max_steps: -1 |
|
lr_scheduler_type: cosine |
|
warmup_ratio: 0.1 |
|
warmup_steps: 0 |
|
log_level: passive |
|
log_level_replica: warning |
|
log_on_each_node: true |
|
logging_dir: ./Zeroshot/01-12-23-NousResearch-Nous-Hermes-Llama2-13b_multilang-dataset-3.0.3-portuguese-2_epochs-10_batch_2/checkpoints/runs/Dec01_21-53-07_fd10189bb234 |
|
logging_strategy: steps |
|
logging_first_step: false |
|
logging_steps: 500 |
|
logging_nan_inf_filter: true |
|
save_strategy: epoch |
|
save_steps: 500 |
|
save_total_limit: 5 |
|
save_safetensors: true |
|
save_on_each_node: false |
|
no_cuda: false |
|
use_mps_device: false |
|
seed: 42 |
|
jit_mode_eval: false |
|
use_ipex: false |
|
bf16: false |
|
fp16: true |
|
fp16_opt_level: O1 |
|
half_precision_backend: auto |
|
bf16_full_eval: false |
|
fp16_full_eval: false |
|
local_rank: 0 |
|
tpu_metrics_debug: false |
|
debug: [] |
|
dataloader_drop_last: false |
|
dataloader_num_workers: 0 |
|
past_index: -1 |
|
run_name: ./Zeroshot/01-12-23-NousResearch-Nous-Hermes-Llama2-13b_multilang-dataset-3.0.3-portuguese-2_epochs-10_batch_2/checkpoints/ |
|
disable_tqdm: false |
|
remove_unused_columns: true |
|
load_best_model_at_end: true |
|
metric_for_best_model: eval_loss |
|
greater_is_better: false |
|
ignore_data_skip: false |
|
sharded_ddp: [] |
|
fsdp: [] |
|
fsdp_min_num_params: 0 |
|
fsdp_config: |
|
fsdp_min_num_params: 0 |
|
xla: false |
|
xla_fsdp_grad_ckpt: false |
|
label_smoothing_factor: 0.0 |
|
optim: adamw_torch |
|
adafactor: false |
|
group_by_length: false |
|
length_column_name: length |
|
report_to: |
|
- tensorboard |
|
dataloader_pin_memory: true |
|
skip_memory_metrics: true |
|
use_legacy_prediction_loop: false |
|
push_to_hub: true |
|
hub_model_id: Weni/ZeroShot-2.2.1-Llama2-13b-Multilanguage-3.0.3 |
|
hub_strategy: all_checkpoints |
|
hub_token: <HUB_TOKEN> |
|
hub_private_repo: false |
|
gradient_checkpointing: true |
|
include_inputs_for_metrics: false |
|
fp16_backend: auto |
|
push_to_hub_token: <PUSH_TO_HUB_TOKEN> |
|
mp_parameters: '' |
|
auto_find_batch_size: false |
|
full_determinism: false |
|
ray_scope: last |
|
ddp_timeout: 1800 |
|
torch_compile: false |
|
dataset:
|
name: Weni/zeroshot-3.0.3 |
|
Training Procedure: |
|
Training Hyperparameters: |
|
output_dir: ./Zeroshot/01-12-23-NousResearch-Nous-Hermes-Llama2-13b_multilang-dataset-3.0.3-portuguese-2_epochs-10_batch_2/checkpoints/ |
|
overwrite_output_dir: false |
|
do_train: false |
|
do_eval: true |
|
do_predict: false |
|
evaluation_strategy: epoch |
|
prediction_loss_only: false |
|
per_device_train_batch_size: 2 |
|
per_device_eval_batch_size: 8 |
|
gradient_accumulation_steps: 2 |
|
eval_accumulation_steps: 1 |
|
eval_delay: 0 |
|
learning_rate: 0.0004 |
|
weight_decay: 0.01 |
|
adam_beta1: 0.9 |
|
adam_beta2: 0.999 |
|
adam_epsilon: 1.0e-08 |
|
max_grad_norm: 0.3 |
|
num_train_epochs: 10 |
|
max_steps: -1 |
|
lr_scheduler_type: cosine |
|
warmup_ratio: 0.1 |
|
warmup_steps: 0 |
|
log_level: passive |
|
log_level_replica: warning |
|
log_on_each_node: true |
|
logging_dir: ./Zeroshot/01-12-23-NousResearch-Nous-Hermes-Llama2-13b_multilang-dataset-3.0.3-portuguese-2_epochs-10_batch_2/checkpoints/runs/Dec01_21-53-07_fd10189bb234 |
|
logging_strategy: steps |
|
logging_first_step: false |
|
logging_steps: 500 |
|
logging_nan_inf_filter: true |
|
save_strategy: epoch |
|
save_steps: 500 |
|
save_total_limit: 5 |
|
save_safetensors: true |
|
save_on_each_node: false |
|
no_cuda: false |
|
use_mps_device: false |
|
seed: 42 |
|
jit_mode_eval: false |
|
use_ipex: false |
|
bf16: false |
|
fp16: true |
|
fp16_opt_level: O1 |
|
half_precision_backend: auto |
|
bf16_full_eval: false |
|
fp16_full_eval: false |
|
local_rank: 0 |
|
tpu_metrics_debug: false |
|
debug: [] |
|
dataloader_drop_last: false |
|
dataloader_num_workers: 0 |
|
past_index: -1 |
|
run_name: ./Zeroshot/01-12-23-NousResearch-Nous-Hermes-Llama2-13b_multilang-dataset-3.0.3-portuguese-2_epochs-10_batch_2/checkpoints/ |
|
disable_tqdm: false |
|
remove_unused_columns: true |
|
load_best_model_at_end: true |
|
metric_for_best_model: eval_loss |
|
greater_is_better: false |
|
ignore_data_skip: false |
|
sharded_ddp: [] |
|
fsdp: [] |
|
fsdp_min_num_params: 0 |
|
fsdp_config: |
|
fsdp_min_num_params: 0 |
|
xla: false |
|
xla_fsdp_grad_ckpt: false |
|
label_smoothing_factor: 0.0 |
|
optim: adamw_torch |
|
adafactor: false |
|
group_by_length: false |
|
length_column_name: length |
|
report_to: |
|
- tensorboard |
|
dataloader_pin_memory: true |
|
skip_memory_metrics: true |
|
use_legacy_prediction_loop: false |
|
push_to_hub: true |
|
hub_model_id: Weni/ZeroShot-2.2.1-Llama2-13b-Multilanguage-3.0.3 |
|
hub_strategy: all_checkpoints |
|
hub_token: <HUB_TOKEN> |
|
hub_private_repo: false |
|
gradient_checkpointing: true |
|
include_inputs_for_metrics: false |
|
fp16_backend: auto |
|
push_to_hub_token: <PUSH_TO_HUB_TOKEN> |
|
mp_parameters: '' |
|
auto_find_batch_size: false |
|
full_determinism: false |
|
ray_scope: last |
|
ddp_timeout: 1800 |
|
torch_compile: false |
|
Training data: |
|
name: Weni/zeroshot-3.0.3 |
|
Training processing: 'dataset = dataset.shuffle(seed=55) |
|
|
|
dataset = dataset[''train''].train_test_split(test_size=0.1)' |
|
training_regime: "### Training Hyperparameters- output_dir: ./Zeroshot/01-12-23-NousResearch-Nous-Hermes-Llama2-13b_multilang-dataset-3.0.3-portuguese-2_epochs-10_batch_2/checkpoints/\n\ |
|
- overwrite_output_dir: False\n- do_train: False\n- do_eval: True\n- do_predict:\ |
|
\ False\n- evaluation_strategy: epoch\n- prediction_loss_only: False\n- per_device_train_batch_size:\ |
|
\ 2\n- per_device_eval_batch_size: 8\n- per_gpu_train_batch_size: None\n- per_gpu_eval_batch_size:\ |
|
\ None\n- gradient_accumulation_steps: 2\n- eval_accumulation_steps: 1\n- eval_delay:\ |
|
\ 0\n- learning_rate: 0.0004\n- weight_decay: 0.01\n- adam_beta1: 0.9\n- adam_beta2:\ |
|
\ 0.999\n- adam_epsilon: 1e-08\n- max_grad_norm: 0.3\n- num_train_epochs: 10\n-\ |
|
\ max_steps: -1\n- lr_scheduler_type: cosine\n- warmup_ratio: 0.1\n- warmup_steps:\ |
|
\ 0\n- log_level: passive\n- log_level_replica: warning\n- log_on_each_node: True\n\ |
|
- logging_dir: ./Zeroshot/01-12-23-NousResearch-Nous-Hermes-Llama2-13b_multilang-dataset-3.0.3-portuguese-2_epochs-10_batch_2/checkpoints/runs/Dec01_21-53-07_fd10189bb234\n\ |
|
- logging_strategy: steps\n- logging_first_step: False\n- logging_steps: 500\n-\ |
|
\ logging_nan_inf_filter: True\n- save_strategy: epoch\n- save_steps: 500\n- save_total_limit:\ |
|
\ 5\n- save_safetensors: True\n- save_on_each_node: False\n- no_cuda: False\n- use_mps_device:\ |
|
\ False\n- seed: 42\n- data_seed: None\n- jit_mode_eval: False\n- use_ipex: False\n\ |
|
- bf16: False\n- fp16: True\n- fp16_opt_level: O1\n- half_precision_backend: auto\n\ |
|
- bf16_full_eval: False\n- fp16_full_eval: False\n- tf32: None\n- local_rank: 0\n\ |
|
- ddp_backend: None\n- tpu_num_cores: None\n- tpu_metrics_debug: False\n- debug:\ |
|
\ []\n- dataloader_drop_last: False\n- eval_steps: None\n- dataloader_num_workers:\ |
|
\ 0\n- past_index: -1\n- run_name: ./Zeroshot/01-12-23-NousResearch-Nous-Hermes-Llama2-13b_multilang-dataset-3.0.3-portuguese-2_epochs-10_batch_2/checkpoints/\n\ |
|
- disable_tqdm: False\n- remove_unused_columns: True\n- label_names: None\n- load_best_model_at_end:\ |
|
\ True\n- metric_for_best_model: eval_loss\n- greater_is_better: False\n- ignore_data_skip:\ |
|
\ False\n- sharded_ddp: []\n- fsdp: []\n- fsdp_min_num_params: 0\n- fsdp_config:\ |
|
\ {'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}\n- fsdp_transformer_layer_cls_to_wrap:\ |
|
\ None\n- deepspeed: None\n- label_smoothing_factor: 0.0\n- optim: adamw_torch\n\ |
|
- optim_args: None\n- adafactor: False\n- group_by_length: False\n- length_column_name:\ |
|
\ length\n- report_to: ['tensorboard']\n- ddp_find_unused_parameters: None\n- ddp_bucket_cap_mb:\ |
|
\ None\n- ddp_broadcast_buffers: None\n- dataloader_pin_memory: True\n- skip_memory_metrics:\ |
|
\ True\n- use_legacy_prediction_loop: False\n- push_to_hub: True\n- resume_from_checkpoint:\ |
|
\ None\n- hub_model_id: Weni/ZeroShot-2.2.1-Llama2-13b-Multilanguage-3.0.3\n- hub_strategy:\ |
|
\ all_checkpoints\n- hub_token: <HUB_TOKEN>\n- hub_private_repo: False\n- gradient_checkpointing:\ |
|
\ True\n- include_inputs_for_metrics: False\n- fp16_backend: auto\n- push_to_hub_model_id:\ |
|
\ None\n- push_to_hub_organization: None\n- push_to_hub_token: <PUSH_TO_HUB_TOKEN>\n\ |
|
- mp_parameters: \n- auto_find_batch_size: False\n- full_determinism: False\n- torchdynamo:\ |
|
\ None\n- ray_scope: last\n- ddp_timeout: 1800\n- torch_compile: False\n- torch_compile_backend:\ |
|
\ None\n- torch_compile_mode: None\n- xpu_backend: None" |
|
training_data: |
|
name: Weni/zeroshot-3.0.3 |
|
'preprocessing ': 'dataset = dataset.shuffle(seed=55) |
|
|
|
dataset = dataset[''train''].train_test_split(test_size=0.1)' |
|
preprocessing: 'dataset = dataset.shuffle(seed=55) |
|
|
|
dataset = dataset[''train''].train_test_split(test_size=0.1)' |
|
base_model: NousResearch/Nous-Hermes-Llama2-13b |
|
--- |
|
|
|
## Training Hyperparameters |
|
- evaluation_strategy: epoch |
|
- prediction_loss_only: False |
|
- per_device_train_batch_size: 2 |
|
- per_device_eval_batch_size: 8 |
|
- per_gpu_train_batch_size: None |
|
- per_gpu_eval_batch_size: None |
|
- gradient_accumulation_steps: 2 |
|
- eval_accumulation_steps: 1 |
|
- eval_delay: 0 |
|
- learning_rate: 0.0004 |
|
- weight_decay: 0.01 |
|
- adam_beta1: 0.9 |
|
- adam_beta2: 0.999 |
|
- adam_epsilon: 1e-08 |
|
- max_grad_norm: 0.3 |
|
- num_train_epochs: 10 |
|
- max_steps: -1 |
|
- lr_scheduler_type: cosine |
|
- warmup_ratio: 0.1 |
|
- warmup_steps: 0 |
|
- log_level: passive |
|
- log_level_replica: warning |
|
- log_on_each_node: True |
|
- logging_strategy: steps |
|
- logging_first_step: False |
|
- logging_steps: 500 |
|
- logging_nan_inf_filter: True |
|
- save_strategy: epoch |
|
- save_steps: 500 |
|
- save_total_limit: 5 |
|
- save_safetensors: True |
|
- save_on_each_node: False |
|
- no_cuda: False |
|
- use_mps_device: False |
|
- seed: 42 |
|
- data_seed: None |
|
- jit_mode_eval: False |
|
- use_ipex: False |
|
- bf16: False |
|
- fp16: True |
|
- fp16_opt_level: O1 |
|
- half_precision_backend: auto |
|
- bf16_full_eval: False |
|
- fp16_full_eval: False |
|
- tf32: None |
|
- local_rank: 0 |
|
- ddp_backend: None |
|
- tpu_num_cores: None |
|
- tpu_metrics_debug: False |
|
- debug: [] |
|
- dataloader_drop_last: False |
|
- eval_steps: None |
|
- dataloader_num_workers: 0 |
|
- past_index: -1 |
|
- remove_unused_columns: True |
|
- label_names: None |
|
- load_best_model_at_end: True |
|
- metric_for_best_model: eval_loss |
|
- greater_is_better: False |
|
- ignore_data_skip: False |
|
- sharded_ddp: [] |
|
- fsdp: [] |
|
- fsdp_min_num_params: 0 |
|
- fsdp_config: {'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False} |
|
- fsdp_transformer_layer_cls_to_wrap: None |
|
- deepspeed: None |
|
- label_smoothing_factor: 0.0 |
|
- optim: adamw_torch |
|
- optim_args: None |
|
- adafactor: False |
|
- group_by_length: False |
|
- ddp_find_unused_parameters: None |
|
- ddp_bucket_cap_mb: None |
|
- ddp_broadcast_buffers: None |
|
- dataloader_pin_memory: True |
|
- skip_memory_metrics: True |
|
- use_legacy_prediction_loop: False |
|
- push_to_hub: True |
|
- resume_from_checkpoint: None |
|
- hub_strategy: all_checkpoints |
|
- gradient_checkpointing: True |
|
|
|
## Training procedure |
|
|
|
|
|
The following `bitsandbytes` quantization config was used during training: |
|
- load_in_8bit: False |
|
- load_in_4bit: True |
|
- llm_int8_threshold: 6.0 |
|
- llm_int8_skip_modules: None |
|
- llm_int8_enable_fp32_cpu_offload: False |
|
- llm_int8_has_fp16_weight: False |
|
- bnb_4bit_quant_type: nf4 |
|
- bnb_4bit_use_double_quant: True |
|
- bnb_4bit_compute_dtype: bfloat16 |
|
|
|
|
### Framework versions |
|
|
|
|
|
- PEFT 0.4.0 |
|
|