sample_rate: 24_000 # Hz
audio_backend: "vocos" # vocoder used to decode audio tokens back into waveforms
weights_format: sft # safetensors

experimental: True

models:
- name: "nar-len"
  size: "full"
  resp_levels: 8 # RVQ codebook levels per audio frame
  tasks: 9
  langs: 4
  tones: 1
  arch_type: llama # LLaMA-style transformer backbone
  training: True
  version: 5
  attention: sdpa # PyTorch scaled_dot_product_attention
  dropout: 0.1
  #loss_factors:
  #  text: 0.01
  #  prom: 0.5
  #  resp: 1.0
  capabilities: ["ar", "nar", "len"] # autoregressive, non-autoregressive, and length prediction
  experimental:
    audio_embedding_sums: True
    split_classifiers: True
    unified_position_ids: False
    rvq_levels_p: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7 ] # pool sampled to pick which RVQ level a training step targets
    masking_train_p: 1.0
    masking_ratio_fixed: True
    ignore_inputs_for_loss: True
    cfg_cond_dropout_p: 0.1 # condition dropout for classifier-free guidance
    cfg_prom_dropout_p: 0.05 # prompt dropout for classifier-free guidance
    #token_dropout_error: 0.001
    #token_dropout_rate: 0.001
    #layerskip: True
    #layerskip_r: 2
    #layerskip_e_scale: 0.1

#loras:
#- name: "lora-shodan"
#  rank: 128
#  alpha: 128
#  training: True
#  rvq_levels: []

hyperparameters:
  batch_size: 32
  gradient_accumulation_steps: 4 # 8
  gradient_clipping: 1.0
  warmup_steps: 10

  optimizer: Prodigy
  learning_rate: 1.0 # Prodigy adapts its own step size; 1.0 is its recommended base setting
  torch_optimizer: True

  scheduler: "" # ScheduleFree
  torch_scheduler: True

evaluation:
  batch_size: 8
  frequency: 500
  size: 8

  kwargs:
    max_duration: 500
    max_steps: 25
    ar_temperature: 1.0
    repetition_penalty: 1.0
    cfg_strength: 1.0
    nar_temperature: 0.0

trainer:
  iterations: 1_000_000

  save_frequency: 250
  keep_last_checkpoints: 4

  resize_modules: True

  check_for_oom: False
  gradient_checkpointing: True

  weight_dtype: float16
  amp: True

  backend: deepspeed
  deepspeed:
    inferencing: False
    amp: False
    loss_scale_window: 250
    min_loss_scale: 32768

  load_webui: False

inference:
  backend: local
  normalize: False

  weight_dtype: float16
  amp: True

optimizations:
  injects: False
  replace: True

  linear: False
  embedding: False
  optimizers: True

  bitsandbytes: False
  dadaptation: False
  bitnet: False
  fp8: False

dataset:
  # eval'd lambdas that derive the speaker name / group from a sample's path
  speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
  speaker_group_getter: "lambda p: f'{p.parts[-3]}'"

  # speaker groups / path prefixes assigned to non-default languages
  speaker_languages:
    ja: [ "housamo", "JA-" ]
    de: [ "DE-" ]
    fr: [ "FR-" ]

  use_hdf5: True
  hdf5_flag: r # open the HDF5 dataset read-only

  use_metadata: True
  validate: True

  workers: 2
  cache: True

  duration_range: [1.0, 16.0] # seconds

  prompt_max_samples: 1
  prompt_duration_range: [1.0, 6.0] # seconds
  prompt_similar_p: 0.825
  prompt_similar_top_k: 6

  resps_max_samples: 1
  resps_append_p: 0.0

  sample_type: path # path # speaker
  sample_order: duration
  sample_max_duration_batch: 120
  sample_shuffle: True

  retokenize_text: True

  # tasks sampled per training item; repeated "tts" entries weight sampling toward it
  tasks_list: [
    "tts", "tts", "tts", "tts",
    "tts", "tts", "tts", "tts",
    "tts", "tts", "tts", "tts",
    "tts", "tts",
    "len"
  ] #, "stt", "tts-c", "ns", "sr" ]

  training: []
  validation: []
  noise: []
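
# ---------------------------------------------------------------------------
# A minimal sketch of loading and sanity-checking a config like this, kept as
# comments so the file remains valid YAML. It uses plain PyYAML with generic
# key lookups only; the "config.yaml" path is an assumption and this is not
# the project's own config loader or API.
#
#   import yaml
#
#   with open("config.yaml") as f:               # hypothetical path
#       cfg = yaml.safe_load(f)
#
#   assert cfg["sample_rate"] == 24000           # PyYAML reads 24_000 as 24000
#   assert cfg["models"][0]["resp_levels"] == 8
#   assert len(cfg["dataset"]["tasks_list"]) == 15
#   print(cfg["hyperparameters"]["optimizer"])   # -> "Prodigy"
# ---------------------------------------------------------------------------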