# Top-level settings that sit outside any section.
# Integers are written without `_` digit separators: those are a YAML 1.1
# extension and may be read as strings by YAML 1.2 parsers.
sample_rate: 24000
audio_backend: "vocos"

weights_format: sft
experimental: true

# Model definitions: a sequence with one mapping per model.
# NOTE(review): nesting was reconstructed from a flattened copy of this file.
# Confirm against the consumer's schema that every key from
# `audio_embedding_sums` through `cfg_prom_dropout_p` belongs under the
# per-model `experimental` mapping.
models:
  - name: "nar-len"
    size: "full"
    resp_levels: 8
    tasks: 9
    langs: 4
    tones: 1
    arch_type: llama
    training: true
    version: 5
    attention: sdpa
    dropout: 0.1

    capabilities: ["ar", "nar", "len"]
    experimental:
      audio_embedding_sums: true
      split_classifiers: true
      unified_position_ids: false
      # 21 entries: fourteen 0s followed by 1 through 7.
      rvq_levels_p:
        [
          0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0,
          1, 2, 3, 4, 5, 6, 7,
        ]

      masking_train_p: 1.0
      masking_ratio_fixed: true
      ignore_inputs_for_loss: true

      cfg_cond_dropout_p: 0.1
      cfg_prom_dropout_p: 0.05

# Optimisation hyperparameters.
hyperparameters:
  batch_size: 32
  gradient_accumulation_steps: 4
  gradient_clipping: 1.0
  warmup_steps: 10

  optimizer: Prodigy
  learning_rate: 1.0
  torch_optimizer: true

  # Explicit empty string (not null): no scheduler name is given.
  scheduler: ""
  torch_scheduler: true

# Evaluation settings.
# NOTE(review): `kwargs` is assumed to hold every key from `max_duration`
# through `nar_temperature`; confirm against the consumer's schema.
evaluation:
  batch_size: 8
  frequency: 500
  size: 8

  kwargs:
    max_duration: 500
    max_steps: 25
    ar_temperature: 1.0
    repetition_penalty: 1.0
    cfg_strength: 1.0
    nar_temperature: 0.0

# Trainer settings.
# Integers are written without `_` digit separators: those are a YAML 1.1
# extension and may be read as strings by YAML 1.2 parsers.
# NOTE(review): `deepspeed` is assumed to end at `min_loss_scale`, leaving
# `load_webui` at trainer level; confirm against the consumer's schema.
trainer:
  iterations: 1000000
  save_frequency: 250
  keep_last_checkpoints: 4

  resize_modules: true

  check_for_oom: false
  gradient_checkpointing: true

  weight_dtype: float16
  amp: true

  backend: deepspeed
  deepspeed:
    inferencing: false
    # This `amp` is separate from the trainer-level `amp` above.
    amp: false
    loss_scale_window: 250
    min_loss_scale: 32768

  load_webui: false

# Inference settings.
inference:
  backend: local
  normalize: false

  weight_dtype: float16
  amp: true

# Boolean toggles for optional optimisations.
optimizations:
  injects: false
  replace: true

  linear: false
  embedding: false
  optimizers: true

  bitsandbytes: false
  dadaptation: false
  bitnet: false
  fp8: false

# Dataset settings.
# NOTE(review): nesting was reconstructed from a flattened copy of this file.
# `speaker_languages` is assumed to contain only the `ja`, `de` and `fr`
# lists, with every later key at dataset level; confirm against the schema.
dataset:
  # Lambda source held as strings; each takes `p` and formats `p.parts`.
  speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
  speaker_group_getter: "lambda p: f'{p.parts[-3]}'"
  # Language code -> list of strings.
  # NOTE(review): presumably matched against speaker names; verify.
  speaker_languages:
    ja:
      - "housamo"
      - "JA-"
    de:
      - "DE-"
    fr:
      - "FR-"

  use_hdf5: true
  hdf5_flag: r

  use_metadata: true
  validate: true

  workers: 2
  cache: true

  duration_range: [1.0, 16.0]

  prompt_max_samples: 1
  prompt_duration_range: [1.0, 6.0]
  prompt_similar_p: 0.825
  prompt_similar_top_k: 6

  resps_max_samples: 1
  resps_append_p: 0.0

  sample_type: path
  sample_order: duration
  sample_max_duration_batch: 120
  sample_shuffle: true
  retokenize_text: true

  # 15 entries: fourteen "tts" followed by one "len".
  tasks_list:
    [
      "tts", "tts", "tts", "tts", "tts", "tts", "tts",
      "tts", "tts", "tts", "tts", "tts", "tts", "tts",
      "len",
    ]

  # Explicit empty lists (not null).
  training: []
  validation: []
  noise: []