sample_rate: 24_000 # Hz
audio_backend: "vocos" # vocoder used to decode audio tokens back into waveforms
weights_format: sft # safetensors

experimental: True

models:
- name: "nar-len"
  size: "full"
  resp_levels: 8 # RVQ codebook levels per audio frame
  tasks: 9
  langs: 4
  tones: 1
  arch_type: llama # LLaMA-style transformer backbone
  training: True
  version: 5
  attention: sdpa # PyTorch scaled_dot_product_attention
  dropout: 0.1
  #loss_factors:
  #  text: 0.01
  #  prom: 0.5
  #  resp: 1.0
  capabilities: ["ar", "nar", "len"] # autoregressive, non-autoregressive, and length prediction
  experimental:
    audio_embedding_sums: True
    split_classifiers: True
    unified_position_ids: False
    rvq_levels_p: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7 ] # pool sampled to pick which RVQ level a training step targets
    masking_train_p: 1.0
    masking_ratio_fixed: True
    ignore_inputs_for_loss: True
    cfg_cond_dropout_p: 0.1 # condition dropout for classifier-free guidance
    cfg_prom_dropout_p: 0.05 # prompt dropout for classifier-free guidance
    #token_dropout_error: 0.001
    #token_dropout_rate: 0.001
    #layerskip: True
    #layerskip_r: 2
    #layerskip_e_scale: 0.1

#loras:
#- name: "lora-shodan"
#  rank: 128
#  alpha: 128
#  training: True
#  rvq_levels: []

hyperparameters:
  batch_size: 32
  gradient_accumulation_steps: 4 # 8
  gradient_clipping: 1.0
  warmup_steps: 10

  optimizer: Prodigy
  learning_rate: 1.0 # Prodigy adapts its own step size; 1.0 is its recommended base setting
  torch_optimizer: True

  scheduler: "" # ScheduleFree
  torch_scheduler: True

evaluation:
  batch_size: 8
  frequency: 500
  size: 8

  kwargs:
    max_duration: 500
    max_steps: 25
    ar_temperature: 1.0
    repetition_penalty: 1.0
    cfg_strength: 1.0
    nar_temperature: 0.0

trainer:
  iterations: 1_000_000

  save_frequency: 250
  keep_last_checkpoints: 4

  resize_modules: True

  check_for_oom: False
  gradient_checkpointing: True

  weight_dtype: float16
  amp: True

  backend: deepspeed
  deepspeed:
    inferencing: False
    amp: False
    loss_scale_window: 250
    min_loss_scale: 32768

  load_webui: False

inference:
  backend: local
  normalize: False

  weight_dtype: float16
  amp: True

optimizations:
  injects: False
  replace: True

  linear: False
  embedding: False
  optimizers: True

  bitsandbytes: False
  dadaptation: False
  bitnet: False
  fp8: False

dataset:
  # eval'd lambdas that derive the speaker name / group from a sample's path
  speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
  speaker_group_getter: "lambda p: f'{p.parts[-3]}'"

  # speaker groups / path prefixes assigned to non-default languages
  speaker_languages:
    ja: [ "housamo", "JA-" ]
    de: [ "DE-" ]
    fr: [ "FR-" ]

  use_hdf5: True
  hdf5_flag: r # open the HDF5 dataset read-only

  use_metadata: True
  validate: True

  workers: 2
  cache: True

  duration_range: [1.0, 16.0] # seconds

  prompt_max_samples: 1
  prompt_duration_range: [1.0, 6.0] # seconds
  prompt_similar_p: 0.825
  prompt_similar_top_k: 6

  resps_max_samples: 1
  resps_append_p: 0.0

  sample_type: path # path # speaker
  sample_order: duration
  sample_max_duration_batch: 120
  sample_shuffle: True

  retokenize_text: True

  # tasks sampled per training item; repeated "tts" entries weight sampling toward it
  tasks_list: [
    "tts", "tts", "tts", "tts",
    "tts", "tts", "tts", "tts",
    "tts", "tts", "tts", "tts",
    "tts", "tts",
    "len"
  ] #, "stt", "tts-c", "ns", "sr" ]

  training: []
  validation: []
  noise: []
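
# ---------------------------------------------------------------------------
# A minimal sketch of loading and sanity-checking a config like this, kept as
# comments so the file remains valid YAML. It uses plain PyYAML with generic
# key lookups only; the "config.yaml" path is an assumption and this is not
# the project's own config loader or API.
#
#   import yaml
#
#   with open("config.yaml") as f:               # hypothetical path
#       cfg = yaml.safe_load(f)
#
#   assert cfg["sample_rate"] == 24000           # PyYAML reads 24_000 as 24000
#   assert cfg["models"][0]["resp_levels"] == 8
#   assert len(cfg["dataset"]["tasks_list"]) == 15
#   print(cfg["hyperparameters"]["optimizer"])   # -> "Prodigy"
# ---------------------------------------------------------------------------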