---
# SpeechLMM v1 pretraining configuration (multimodal: audio + video -> LLM).
# Reconstructed into block YAML from a whitespace-mangled dump; key order preserved.

training_type: pretrain
wandb_project: speechlmm-v1
# Quoted string preserved: W&B reads WANDB_WATCH as a string, not a boolean.
wandb_watch: 'false'
num_gpus: 4
num_nodes: 1
accelerate_config: /net/scratch/hscra/plgrid/plgstefanop/llava/conf/accelerate/deepspeed.yaml
deepspeed_config: /net/scratch/hscra/plgrid/plgstefanop/llava/conf/deepspeed/zero3.json
adjustments: []

# ---------------------------------------------------------------------------
# Dataset loading / preprocessing options.
# ---------------------------------------------------------------------------
data:
  data_config_path: /net/scratch/hscra/plgrid/plgstefanop/llava/conf/datasets/speechlmm_v1/speechlmm_v1.yml
  num_proc_for_preprocessing: 64
  dataloader_debug: false
  filter_broken_samples: false
  organize_eval_dataset_per_task: true
  group_dataset_by_task:
    train: true
    eval: false
    test: false
  # Per-task sampling weights (sum to 1.0).
  task_weights:
    ASR: 0.3
    ST: 0.2
    SSUM: 0.1
    SQA: 0.1
    SLU_INTENT_ONLY: 0.1
    VSR: 0.2
  multi_task_sampler: alternating
  replacement: true
  rebuild_dataset_cache: false
  cache_final_datasets: true
  audio_input_sampling_rate: 16000
  codec_sampling_rate: null
  codec_frame_rate: null
  image_folder: null
  image_aspect_ratio: square
  is_multimodal: false
  lazy_preprocess: true
  align_text_to_audio: false
  use_text_tokens: true
  align_with_whisper: false
  restore_punctuation_and_spaces: true
  max_condition_audio_duration: 10
  variable_batch_size: false
  max_length_per_batch: null

# ---------------------------------------------------------------------------
# Model architecture: encoders, adapters, and the text decoder.
# ---------------------------------------------------------------------------
model:
  add_lm_head: true
  vision_select_layer: -1
  vision_use_patch_token: true
  vision_patch_merge_type: flat
  vision_select_feature: patch
  mm_use_im_start_end: false
  mm_use_audio_start_end: false
  use_audio_encoder_as_codec_encoder: false
  add_all_multimodal_tokens: true
  perturb_codes: true
  perturb_prob: 0.2
  pad_audio_weight: 0.5
  epad_audio_weight: 1
  pad_epad_audio_weight_decay: 0.5
  perturb_prob_decay: 0.5
  conversation_version: null
  # Uniform weights, one per codebook (32 codebooks).
  codebook_weights:
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
  chunk_size_in_seconds: 15
  chunk_overlap_in_seconds: 1
  chunk_encoding_strategy: loop
  audio_loss_decay: 1
  audio_loss_weight: 1
  tokenizer_padding_side: right

  # SeamlessM4T-v2 speech encoder (Hugging Face config fields).
  audio_encoder:
    _name_or_path: meetween/seamless-m4t-v2-large-speech-encoder
    adaptor_dropout: 0.1
    adaptor_kernel_size: 8
    adaptor_stride: 8
    add_adapter: true
    architectures:
      - SeamlessM4Tv2SpeechEncoder
    conv_depthwise_kernel_size: 31
    feature_projection_input_dim: 160
    hidden_size: 1024
    initializer_range: 0.02
    layer_norm_eps: 1.0e-05
    left_max_position_embeddings: 64
    model_type: seamless_m4t_v2
    num_adapter_layers: 1
    position_embeddings_type: relative_key
    right_max_position_embeddings: 8
    speech_encoder_attention_heads: 16
    speech_encoder_chunk_size: 20000
    speech_encoder_dropout: 0.0
    speech_encoder_hidden_act: swish
    speech_encoder_intermediate_size: 4096
    speech_encoder_layerdrop: 0.1
    speech_encoder_layers: 24
    speech_encoder_left_chunk_num: 128
    torch_dtype: float32
    transformers_version: 4.37.2
    use_cache: true
    sampling_rate: 16000

  # Q-Former adapter projecting audio features (1024) into the LLM space (8192).
  audio_adapter:
    model_type: qformer
    input_dim: 1024
    output_dim: 8192
    hidden_size: 768
    num_hidden_layers: 4
    num_attention_heads: 12
    intermediate_size: 3072
    hidden_act: gelu
    hidden_dropout_prob: 0.1
    attention_probs_dropout_prob: 0.1
    initializer_range: 0.02
    layer_norm_eps: 1.0e-12
    add_cross_attention: true
    num_queries: 1
    cross_attention_every_n_layers: 1
    compress_factor: 2
    triplet_loss: false

  # Llama 3.3 70B decoder (Hugging Face config fields).
  text_decoder:
    _name_or_path: meta-llama/Llama-3.3-70B-Instruct
    architectures:
      - LlamaForCausalLM
    attention_bias: false
    attention_dropout: 0.0
    bos_token_id: 128000
    eos_token_id:
      - 128001
      - 128008
      - 128009
    head_dim: 128
    hidden_act: silu
    hidden_size: 8192
    initializer_range: 0.02
    intermediate_size: 28672
    max_position_embeddings: 131072
    mlp_bias: false
    model_type: llama
    num_attention_heads: 64
    num_hidden_layers: 80
    num_key_value_heads: 8
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_scaling:
      factor: 8.0
      high_freq_factor: 4.0
      low_freq_factor: 1.0
      original_max_position_embeddings: 8192
      rope_type: llama3
    rope_theta: 500000.0
    tie_word_embeddings: false
    torch_dtype: bfloat16
    transformers_version: 4.47.0.dev0
    use_cache: true
    vocab_size: 128256
    # NOTE(review): nesting reconstructed — placed under text_decoder because
    # model-level `conversation_version` is already set (null) above; a
    # model-level duplicate key would be invalid YAML. Confirm against consumer.
    conversation_version: llama_3_1

  # Auto-AVSR visual speech encoder (ESPnet-style conformer fields).
  video_encoder:
    hidden_size: 768
    _name_or_path: /net/scratch/hscra/plgrid/plgstefanop/auto_avsr_ckpt/vsr_trlrs3vox2_base.pth
    model_type: auto_avsr
    adim: 768
    aheads: 12
    eunits: 3072
    elayers: 12
    transformer_input_layer: conv3d
    dropout_rate: 0.1
    transformer_attn_dropout_rate: 0.1
    transformer_encoder_attn_layer_type: rel_mha
    macaron_style: true
    use_cnn_module: true
    cnn_module_kernel: 31
    zero_triu: false
    a_upsample_ratio: 1
    relu_type: swish
    ddim: 768
    dheads: 12
    dunits: 3072
    dlayers: 6
    lsm_weight: 0.1
    transformer_length_normalized_loss: false
    mtlalpha: 0.1
    ctc_type: builtin
    rel_pos_type: latest

  # MLP adapter projecting video features (768) into the LLM space (8192).
  video_adapter:
    model_type: mlp
    input_dim: 768
    output_dim: 8192
    hidden_layers: 4
    hidden_size: 4096
    residual_type: interpolation
    force_input_projection: true
    force_output_projection: true

# ---------------------------------------------------------------------------
# Trainer options (largely Hugging Face TrainingArguments).
# ---------------------------------------------------------------------------
training:
  modality: audio
  group_by_modality_length: false
  load_in_4bit: false
  load_in_8bit: false
  bnb_4bit_quant_type: nf4
  bnb_4bit_use_double_quant: true
  lora_adapters: []
  num_steps_between_each_restart: null
  lr_min: 1.0e-06
  eval_temperature: 0
  eval_max_new_tokens: 200
  eval_num_batched_generations: 4
  cache_dir: null
  resume_from_checkpoint: null
  model_max_length: 2048
  # Only the adapters train; encoders and the decoder stay frozen.
  freeze_modules:
    - audio_encoder
    - video_encoder
    - text_decoder
  attn_implementation: flash_attention_2
  mpt_attn_impl: triton
  mm_projector_lr: null
  output_dir: /net/scratch/hscra/plgrid/plgstefanop/checkpoints/speechlmm-v1/llava-huggingface-xl
  report_to: wandb
  run_name: pretrain-audio-seamless-qformer-auto_avsr-mlp-llama_3_70b-speechlmm/v1/xl
  num_train_epochs: 0
  max_steps: 0
  per_device_train_batch_size: 2
  per_device_eval_batch_size: 8
  gradient_accumulation_steps: 12
  gradient_checkpointing: true
  optim: adamw_torch
  learning_rate: 5.0e-05
  weight_decay: 0.0
  lr_scheduler_type: cosine
  warmup_ratio: 0.03
  logging_steps: 1
  save_strategy: steps
  save_steps: 200
  save_total_limit: 10
  # Must stay quoted: a plain `no` parses as boolean false in YAML 1.1 loaders.
  eval_strategy: 'no'
  eval_steps: 0
  push_to_hub: true
  hub_model_id: meetween/Llama-speechlmm-1.0-xl
  hub_strategy: end
  hub_token: null
  hub_private_repo: false
  fp16: false
  bf16: true
  tf32: true
  dataloader_num_workers: 4
  remove_unused_columns: false
  seed: 42
  pretrained_checkpoint: /net/scratch/hscra/plgrid/plgstefanop/checkpoints/speechlmm-v1/llava-pretrain-audio-seamless-qformer-auto_avsr-mlp-llama_3_70b-speechlmm/v1/xl/checkpoint-200

# NOTE(review): nesting reconstructed — placed at top level because `training`
# already defines `run_name` above; a duplicate key within `training` would be
# invalid YAML. Confirm intended scope against the consuming code.
run_name: speechlmm-xl