# vall-e / models / config.llama[nar-len].yaml
sample_rate: 24_000
audio_backend: "vocos"
weights_format: sft
experimental: True
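# Model definition: a single LLaMA-style transformer ("nar-len") covering the
# "ar", "nar", and "len" capabilities listed below ("len" presumably being
# response-length prediction) over 8 RVQ levels.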
models:
- name: "nar-len"
size: "full"
resp_levels: 8
tasks: 9
langs: 4
tones: 1
arch_type: llama
training: True
version: 5
attention: sdpa
dropout: 0.1
#loss_factors:
# text: 0.01
# prom: 0.5
# resp: 1.0
capabilities: ["ar", "nar", "len"]
experimental:
audio_embedding_sums: True
split_classifiers: True
unified_position_ids: False
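# rvq_levels_p is presumably a sampling pool for which RVQ level a training
# item targets: level 0 fills 14 of the 21 slots (~2/3 of samples), while
# levels 1-7 get one slot each.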
rvq_levels_p: [
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
1, 2, 3, 4, 5, 6, 7
]
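# masking_train_p: 1.0 presumably means every sample is trained with the
# masked (NAR-len) objective at a fixed masking ratio; ignore_inputs_for_loss
# restricts the loss to output tokens rather than conditioning inputs (assumption).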
masking_train_p: 1.0
masking_ratio_fixed: True
ignore_inputs_for_loss: True
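# Classifier-free guidance dropout (assumption based on the names): the
# conditioning is dropped ~10% and the acoustic prompt ~5% of the time during
# training, which is what allows cfg_strength to be applied at evaluation.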
cfg_cond_dropout_p: 0.1
cfg_prom_dropout_p: 0.05
#token_dropout_error: 0.001
#token_dropout_rate: 0.001
#layerskip: True
#layerskip_r: 2
#layerskip_e_scale: 0.1
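# Uncommenting the block below would presumably switch training to a LoRA
# adapter (rank/alpha 128) named "lora-shodan" instead of the full model.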
#loras:
#- name: "lora-shodan"
# rank: 128
# alpha: 128
# training: True
# rvq_levels: []
hyperparameters:
batch_size: 32
gradient_accumulation_steps: 4 # 8
gradient_clipping: 1.0
warmup_steps: 10
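# Prodigy estimates its own step size, so the conventional learning_rate of
# 1.0 is used rather than a hand-tuned value.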
optimizer: Prodigy
learning_rate: 1.0
torch_optimizer: True
scheduler: "" # ScheduleFree
torch_scheduler: True
evaluation:
batch_size: 8
frequency: 500
size: 8
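# Sampling settings for the periodic eval pass; nar_temperature: 0.0
# presumably selects greedy (argmax) decoding for the NAR levels.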
kwargs:
max_duration: 500
max_steps: 25
ar_temperature: 1.0
repetition_penalty: 1.0
cfg_strength: 1.0
nar_temperature: 0.0
trainer:
iterations: 1_000_000
save_frequency: 250
keep_last_checkpoints: 4
resize_modules: True
check_for_oom: False
gradient_checkpointing: True
weight_dtype: float16
amp: True
backend: deepspeed
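# DeepSpeed's dynamic fp16 loss scaling: loss_scale_window sets how many
# overflow-free steps pass before the scale is raised, and min_loss_scale
# floors it at 32768.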
deepspeed:
inferencing: False
amp: False
loss_scale_window: 250
min_loss_scale: 32768
load_webui: False
inference:
backend: local
normalize: False
weight_dtype: float16
amp: True
optimizations:
injects: False
replace: True
linear: False
embedding: False
optimizers: True
bitsandbytes: False
dadaptation: False
bitnet: False
fp8: False
dataset:
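# The getters below are presumably eval()'d into lambdas over each file's
# pathlib.Path: a file at .../<group>/<speaker>/<utterance> yields the
# speaker name "<group>_<speaker>" and the group "<group>".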
speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
speaker_group_getter: "lambda p: f'{p.parts[-3]}'"
speaker_languages:
ja: [
"housamo",
"JA-"
]
de: [
"DE-"
]
fr: [
"FR-"
]
use_hdf5: True
hdf5_flag: r
use_metadata: True
validate: True
workers: 2
cache: True
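# Utterances are presumably filtered to 1-16 s. The acoustic prompt is built
# from a single 1-6 s sample, drawn from a similar utterance (top-6 by
# similarity) 82.5% of the time.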
duration_range: [1.0, 16.0]
prompt_max_samples: 1
prompt_duration_range: [1.0, 6.0]
prompt_similar_p: 0.825
prompt_similar_top_k: 6
resps_max_samples: 1
resps_append_p: 0.0
sample_type: path # options: path, speaker
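# Samples are presumably bucketed by duration and batched up to ~120
# (presumably seconds of audio) per batch via sample_max_duration_batch to
# keep padding low, with shuffling applied across those buckets.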
sample_order: duration
sample_max_duration_batch: 120
sample_shuffle: True
retokenize_text: True
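# tasks_list is presumably a weighted sampling pool: "tts" appears 14 times
# and "len" once, so ~1 in 15 samples trains length prediction; the
# commented-out tasks (stt, tts-c, ns, sr) are disabled.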
tasks_list: [
"tts", "tts", "tts", "tts", "tts", "tts", "tts",
"tts", "tts", "tts", "tts", "tts", "tts", "tts",
"len",
] #, "stt", "tts-c", "ns", "sr" ]
training: []
validation: []
noise: []