---
# SpeechLMM v1 pretraining configuration (multimodal: audio + video -> LLM).
# Reconstructed into block YAML from a whitespace-mangled dump; key order preserved.

training_type: pretrain
wandb_project: speechlmm-v1
# Quoted string preserved: W&B reads WANDB_WATCH as a string, not a boolean.
wandb_watch: 'false'
num_gpus: 4
num_nodes: 1
accelerate_config: /net/scratch/hscra/plgrid/plgstefanop/llava/conf/accelerate/deepspeed.yaml
deepspeed_config: /net/scratch/hscra/plgrid/plgstefanop/llava/conf/deepspeed/zero3.json
adjustments: []

# ---------------------------------------------------------------------------
# Dataset loading / preprocessing options.
# ---------------------------------------------------------------------------
data:
  data_config_path: /net/scratch/hscra/plgrid/plgstefanop/llava/conf/datasets/speechlmm_v1/speechlmm_v1.yml
  num_proc_for_preprocessing: 64
  dataloader_debug: false
  filter_broken_samples: false
  organize_eval_dataset_per_task: true
  group_dataset_by_task:
    train: true
    eval: false
    test: false
  # Per-task sampling weights (sum to 1.0).
  task_weights:
    ASR: 0.3
    ST: 0.2
    SSUM: 0.1
    SQA: 0.1
    SLU_INTENT_ONLY: 0.1
    VSR: 0.2
  multi_task_sampler: alternating
  replacement: true
  rebuild_dataset_cache: false
  cache_final_datasets: true
  audio_input_sampling_rate: 16000
  codec_sampling_rate: null
  codec_frame_rate: null
  image_folder: null
  image_aspect_ratio: square
  is_multimodal: false
  lazy_preprocess: true
  align_text_to_audio: false
  use_text_tokens: true
  align_with_whisper: false
  restore_punctuation_and_spaces: true
  max_condition_audio_duration: 10
  variable_batch_size: false
  max_length_per_batch: null

# ---------------------------------------------------------------------------
# Model architecture: encoders, adapters, and the text decoder.
# ---------------------------------------------------------------------------
model:
  add_lm_head: true
  vision_select_layer: -1
  vision_use_patch_token: true
  vision_patch_merge_type: flat
  vision_select_feature: patch
  mm_use_im_start_end: false
  mm_use_audio_start_end: false
  use_audio_encoder_as_codec_encoder: false
  add_all_multimodal_tokens: true
  perturb_codes: true
  perturb_prob: 0.2
  pad_audio_weight: 0.5
  epad_audio_weight: 1
  pad_epad_audio_weight_decay: 0.5
  perturb_prob_decay: 0.5
  conversation_version: null
  # Uniform weights, one per codebook (32 codebooks).
  codebook_weights:
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
  chunk_size_in_seconds: 15
  chunk_overlap_in_seconds: 1
  chunk_encoding_strategy: loop
  audio_loss_decay: 1
  audio_loss_weight: 1
  tokenizer_padding_side: right

  # SeamlessM4T-v2 speech encoder (Hugging Face config fields).
  audio_encoder:
    _name_or_path: meetween/seamless-m4t-v2-large-speech-encoder
    adaptor_dropout: 0.1
    adaptor_kernel_size: 8
    adaptor_stride: 8
    add_adapter: true
    architectures:
      - SeamlessM4Tv2SpeechEncoder
    conv_depthwise_kernel_size: 31
    feature_projection_input_dim: 160
    hidden_size: 1024
    initializer_range: 0.02
    layer_norm_eps: 1.0e-05
    left_max_position_embeddings: 64
    model_type: seamless_m4t_v2
    num_adapter_layers: 1
    position_embeddings_type: relative_key
    right_max_position_embeddings: 8
    speech_encoder_attention_heads: 16
    speech_encoder_chunk_size: 20000
    speech_encoder_dropout: 0.0
    speech_encoder_hidden_act: swish
    speech_encoder_intermediate_size: 4096
    speech_encoder_layerdrop: 0.1
    speech_encoder_layers: 24
    speech_encoder_left_chunk_num: 128
    torch_dtype: float32
    transformers_version: 4.37.2
    use_cache: true
    sampling_rate: 16000

  # Q-Former adapter projecting audio features (1024) into the LLM space (8192).
  audio_adapter:
    model_type: qformer
    input_dim: 1024
    output_dim: 8192
    hidden_size: 768
    num_hidden_layers: 4
    num_attention_heads: 12
    intermediate_size: 3072
    hidden_act: gelu
    hidden_dropout_prob: 0.1
    attention_probs_dropout_prob: 0.1
    initializer_range: 0.02
    layer_norm_eps: 1.0e-12
    add_cross_attention: true
    num_queries: 1
    cross_attention_every_n_layers: 1
    compress_factor: 2
    triplet_loss: false

  # Llama 3.3 70B decoder (Hugging Face config fields).
  text_decoder:
    _name_or_path: meta-llama/Llama-3.3-70B-Instruct
    architectures:
      - LlamaForCausalLM
    attention_bias: false
    attention_dropout: 0.0
    bos_token_id: 128000
    eos_token_id:
      - 128001
      - 128008
      - 128009
    head_dim: 128
    hidden_act: silu
    hidden_size: 8192
    initializer_range: 0.02
    intermediate_size: 28672
    max_position_embeddings: 131072
    mlp_bias: false
    model_type: llama
    num_attention_heads: 64
    num_hidden_layers: 80
    num_key_value_heads: 8
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_scaling:
      factor: 8.0
      high_freq_factor: 4.0
      low_freq_factor: 1.0
      original_max_position_embeddings: 8192
      rope_type: llama3
    rope_theta: 500000.0
    tie_word_embeddings: false
    torch_dtype: bfloat16
    transformers_version: 4.47.0.dev0
    use_cache: true
    vocab_size: 128256
    # NOTE(review): nesting reconstructed — placed under text_decoder because
    # model-level `conversation_version` is already set (null) above; a
    # model-level duplicate key would be invalid YAML. Confirm against consumer.
    conversation_version: llama_3_1

  # Auto-AVSR visual speech encoder (ESPnet-style conformer fields).
  video_encoder:
    hidden_size: 768
    _name_or_path: /net/scratch/hscra/plgrid/plgstefanop/auto_avsr_ckpt/vsr_trlrs3vox2_base.pth
    model_type: auto_avsr
    adim: 768
    aheads: 12
    eunits: 3072
    elayers: 12
    transformer_input_layer: conv3d
    dropout_rate: 0.1
    transformer_attn_dropout_rate: 0.1
    transformer_encoder_attn_layer_type: rel_mha
    macaron_style: true
    use_cnn_module: true
    cnn_module_kernel: 31
    zero_triu: false
    a_upsample_ratio: 1
    relu_type: swish
    ddim: 768
    dheads: 12
    dunits: 3072
    dlayers: 6
    lsm_weight: 0.1
    transformer_length_normalized_loss: false
    mtlalpha: 0.1
    ctc_type: builtin
    rel_pos_type: latest

  # MLP adapter projecting video features (768) into the LLM space (8192).
  video_adapter:
    model_type: mlp
    input_dim: 768
    output_dim: 8192
    hidden_layers: 4
    hidden_size: 4096
    residual_type: interpolation
    force_input_projection: true
    force_output_projection: true

# ---------------------------------------------------------------------------
# Trainer options (largely Hugging Face TrainingArguments).
# ---------------------------------------------------------------------------
training:
  modality: audio
  group_by_modality_length: false
  load_in_4bit: false
  load_in_8bit: false
  bnb_4bit_quant_type: nf4
  bnb_4bit_use_double_quant: true
  lora_adapters: []
  num_steps_between_each_restart: null
  lr_min: 1.0e-06
  eval_temperature: 0
  eval_max_new_tokens: 200
  eval_num_batched_generations: 4
  cache_dir: null
  resume_from_checkpoint: null
  model_max_length: 2048
  # Only the adapters train; encoders and the decoder stay frozen.
  freeze_modules:
    - audio_encoder
    - video_encoder
    - text_decoder
  attn_implementation: flash_attention_2
  mpt_attn_impl: triton
  mm_projector_lr: null
  output_dir: /net/scratch/hscra/plgrid/plgstefanop/checkpoints/speechlmm-v1/llava-huggingface-xl
  report_to: wandb
  run_name: pretrain-audio-seamless-qformer-auto_avsr-mlp-llama_3_70b-speechlmm/v1/xl
  num_train_epochs: 0
  max_steps: 0
  per_device_train_batch_size: 2
  per_device_eval_batch_size: 8
  gradient_accumulation_steps: 12
  gradient_checkpointing: true
  optim: adamw_torch
  learning_rate: 5.0e-05
  weight_decay: 0.0
  lr_scheduler_type: cosine
  warmup_ratio: 0.03
  logging_steps: 1
  save_strategy: steps
  save_steps: 200
  save_total_limit: 10
  # Must stay quoted: a plain `no` parses as boolean false in YAML 1.1 loaders.
  eval_strategy: 'no'
  eval_steps: 0
  push_to_hub: true
  hub_model_id: meetween/Llama-speechlmm-1.0-xl
  hub_strategy: end
  hub_token: null
  hub_private_repo: false
  fp16: false
  bf16: true
  tf32: true
  dataloader_num_workers: 4
  remove_unused_columns: false
  seed: 42
  pretrained_checkpoint: /net/scratch/hscra/plgrid/plgstefanop/checkpoints/speechlmm-v1/llava-pretrain-audio-seamless-qformer-auto_avsr-mlp-llama_3_70b-speechlmm/v1/xl/checkpoint-200

# NOTE(review): nesting reconstructed — placed at top level because `training`
# already defines `run_name` above; a duplicate key within `training` would be
# invalid YAML. Confirm intended scope against the consuming code.
run_name: speechlmm-xl