# Llama-speechlmm-1.0-s / config_hydra.yaml
# Page-header text captured together with this file (not configuration):
#   "stp99's picture" / "Model save" / "6dda8c0 verified"
# Top-level run settings.
training_type: pretrain
wandb_project: speechlmm-v1
# Quoted, so the value is the string 'false' rather than a YAML boolean.
wandb_watch: 'false'
num_gpus: 4
num_nodes: 1
# Absolute filesystem paths; presumably specific to the original training
# cluster -- verify before reusing this file elsewhere.
accelerate_config: /net/scratch/hscra/plgrid/plgstefanop/llava/conf/accelerate/deepspeed.yaml
deepspeed_config: /net/scratch/hscra/plgrid/plgstefanop/llava/conf/deepspeed/zero3.json
# Explicit empty list (a bare `adjustments:` would be null instead).
adjustments: []
# Dataset loading, sampling and preprocessing options.
# NOTE(review): the nesting in this section was reconstructed from a copy
# whose indentation had been lost; every key and value is unchanged, only the
# leading whitespace was restored. Confirm against the original dump.
data:
  data_config_path: /net/scratch/hscra/plgrid/plgstefanop/llava/conf/datasets/speechlmm_v1/speechlmm_v1.yml
  num_proc_for_preprocessing: 64
  dataloader_debug: false
  filter_broken_samples: false
  organize_eval_dataset_per_task: true
  # Per-split switches: only the train split is grouped by task.
  group_dataset_by_task:
    train: true
    eval: false
    test: false
  # One weight per task name; the six weights sum to 1.0.
  task_weights:
    ASR: 0.3
    ST: 0.2
    SSUM: 0.1
    SQA: 0.1
    SLU_INTENT_ONLY: 0.1
    VSR: 0.2
  multi_task_sampler: alternating
  replacement: true
  rebuild_dataset_cache: false
  cache_final_datasets: true
  # Same value as `sampling_rate` in the audio encoder settings (16000).
  audio_input_sampling_rate: 16000
  codec_sampling_rate: null
  codec_frame_rate: null
  image_folder: null
  image_aspect_ratio: square
  is_multimodal: false
  lazy_preprocess: true
  align_text_to_audio: false
  use_text_tokens: true
  align_with_whisper: false
  restore_punctuation_and_spaces: true
  max_condition_audio_duration: 10
  variable_batch_size: false
  max_length_per_batch: null
# Model composition: model-level options followed by one sub-mapping per
# component (audio encoder/adapter, text decoder, video encoder/adapter).
# NOTE(review): the nesting in this section was reconstructed from a copy
# whose indentation had been lost; every key and value is unchanged. Keys that
# occur more than once (`conversation_version`, `hidden_size`, `model_type`,
# `_name_or_path`, ...) cannot share one mapping, which is why each component
# gets its own sub-mapping. Placing the five components under `model` (rather
# than at top level) is an assumption -- confirm against the original dump.
model:
  add_lm_head: true
  vision_select_layer: -1
  vision_use_patch_token: true
  vision_patch_merge_type: flat
  vision_select_feature: patch
  mm_use_im_start_end: false
  mm_use_audio_start_end: false
  use_audio_encoder_as_codec_encoder: false
  add_all_multimodal_tokens: true
  perturb_codes: true
  perturb_prob: 0.2
  pad_audio_weight: 0.5
  epad_audio_weight: 1
  pad_epad_audio_weight_decay: 0.5
  perturb_prob_decay: 0.5
  # Null at model level; the text decoder below carries its own value.
  conversation_version: null
  # 32 entries, all equal to 1.
  codebook_weights:
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
  chunk_size_in_seconds: 15
  chunk_overlap_in_seconds: 1
  chunk_encoding_strategy: loop
  audio_loss_decay: 1
  audio_loss_weight: 1
  tokenizer_padding_side: right

  # Speech encoder settings.
  audio_encoder:
    _name_or_path: meetween/seamless-m4t-v2-large-speech-encoder
    adaptor_dropout: 0.1
    adaptor_kernel_size: 8
    adaptor_stride: 8
    add_adapter: true
    architectures:
      - SeamlessM4Tv2SpeechEncoder
    conv_depthwise_kernel_size: 31
    feature_projection_input_dim: 160
    # Equals `audio_adapter.input_dim` (1024).
    hidden_size: 1024
    initializer_range: 0.02
    layer_norm_eps: 1.0e-05
    left_max_position_embeddings: 64
    model_type: seamless_m4t_v2
    num_adapter_layers: 1
    position_embeddings_type: relative_key
    right_max_position_embeddings: 8
    speech_encoder_attention_heads: 16
    speech_encoder_chunk_size: 20000
    speech_encoder_dropout: 0.0
    speech_encoder_hidden_act: swish
    speech_encoder_intermediate_size: 4096
    speech_encoder_layerdrop: 0.1
    speech_encoder_layers: 24
    speech_encoder_left_chunk_num: 128
    torch_dtype: float32
    transformers_version: 4.37.2
    use_cache: true
    # NOTE(review): assumed to belong to the audio encoder because it sits
    # between the encoder keys and `audio_adapter` -- confirm.
    sampling_rate: 16000

  # Adapter between the audio encoder and the text decoder.
  audio_adapter:
    model_type: qformer
    input_dim: 1024
    # Equals `text_decoder.hidden_size` (2048).
    output_dim: 2048
    hidden_size: 768
    num_hidden_layers: 4
    num_attention_heads: 12
    intermediate_size: 3072
    hidden_act: gelu
    hidden_dropout_prob: 0.1
    attention_probs_dropout_prob: 0.1
    initializer_range: 0.02
    layer_norm_eps: 1.0e-12
    add_cross_attention: true
    num_queries: 1
    cross_attention_every_n_layers: 1
    compress_factor: 2
    triplet_loss: false

  # Text decoder (language model) settings.
  text_decoder:
    _name_or_path: meta-llama/Llama-3.2-1B-Instruct
    architectures:
      - LlamaForCausalLM
    attention_bias: false
    attention_dropout: 0.0
    bos_token_id: 128000
    eos_token_id:
      - 128001
      - 128008
      - 128009
    head_dim: 64
    hidden_act: silu
    hidden_size: 2048
    initializer_range: 0.02
    intermediate_size: 8192
    max_position_embeddings: 131072
    mlp_bias: false
    model_type: llama
    num_attention_heads: 32
    num_hidden_layers: 16
    num_key_value_heads: 8
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_scaling:
      factor: 32.0
      high_freq_factor: 4.0
      low_freq_factor: 1.0
      original_max_position_embeddings: 8192
      rope_type: llama3
    rope_theta: 500000.0
    tie_word_embeddings: true
    torch_dtype: bfloat16
    transformers_version: 4.45.0.dev0
    use_cache: true
    vocab_size: 128256
    conversation_version: llama_3_1

  # Video encoder settings.
  video_encoder:
    # Equals `video_adapter.input_dim` (768).
    hidden_size: 768
    _name_or_path: /net/scratch/hscra/plgrid/plgstefanop/auto_avsr_ckpt/vsr_trlrs3vox2_base.pth
    model_type: auto_avsr
    adim: 768
    aheads: 12
    eunits: 3072
    elayers: 12
    transformer_input_layer: conv3d
    dropout_rate: 0.1
    transformer_attn_dropout_rate: 0.1
    transformer_encoder_attn_layer_type: rel_mha
    macaron_style: true
    use_cnn_module: true
    cnn_module_kernel: 31
    zero_triu: false
    a_upsample_ratio: 1
    relu_type: swish
    ddim: 768
    dheads: 12
    dunits: 3072
    dlayers: 6
    lsm_weight: 0.1
    transformer_length_normalized_loss: false
    mtlalpha: 0.1
    ctc_type: builtin
    rel_pos_type: latest

  # Adapter between the video encoder and the text decoder.
  video_adapter:
    model_type: mlp
    input_dim: 768
    # Equals `text_decoder.hidden_size` (2048).
    output_dim: 2048
    hidden_layers: 4
    hidden_size: 4096
    residual_type: interpolation
    force_input_projection: true
    force_output_projection: true
# Training-loop, checkpointing, logging and hub-upload options.
# NOTE(review): the nesting in this section was reconstructed from a copy
# whose indentation had been lost; every key and value is unchanged, only the
# leading whitespace was restored. Confirm against the original dump.
training:
  modality: audio
  group_by_modality_length: false
  load_in_4bit: false
  load_in_8bit: false
  bnb_4bit_quant_type: nf4
  bnb_4bit_use_double_quant: true
  lora_adapters: []
  num_steps_between_each_restart: null
  lr_min: 1.0e-06
  eval_temperature: 0
  eval_max_new_tokens: 200
  eval_num_batched_generations: 4
  cache_dir: null
  resume_from_checkpoint: null
  model_max_length: 2048
  # The encoders and the text decoder are listed; the adapters are not.
  freeze_modules:
    - audio_encoder
    - video_encoder
    - text_decoder
  attn_implementation: flash_attention_2
  mpt_attn_impl: triton
  mm_projector_lr: null
  output_dir: /net/scratch/hscra/plgrid/plgstefanop/checkpoints/speechlmm-v1/llava-huggingface-s
  report_to: wandb
  run_name: pretrain-audio-seamless-qformer-auto_avsr-mlp-llama_3_1b-speechlmm/v1/s
  # NOTE(review): both limits are 0; presumably this run performs no
  # optimisation steps and only exports an existing checkpoint -- confirm.
  num_train_epochs: 0
  max_steps: 0
  per_device_train_batch_size: 4
  per_device_eval_batch_size: 8
  gradient_accumulation_steps: 12
  gradient_checkpointing: true
  optim: adamw_torch
  learning_rate: 5.0e-05
  weight_decay: 0.0
  lr_scheduler_type: cosine
  warmup_ratio: 0.03
  logging_steps: 1
  save_strategy: steps
  save_steps: 200
  save_total_limit: 10
  # Quoted so the value is the string 'no', not a YAML 1.1 boolean.
  eval_strategy: 'no'
  eval_steps: 0
  push_to_hub: true
  hub_model_id: meetween/Llama-speechlmm-1.0-s
  hub_strategy: end
  hub_token: null
  hub_private_repo: false
  fp16: false
  bf16: true
  tf32: true
  dataloader_num_workers: 4
  remove_unused_columns: false
  seed: 42

# NOTE(review): kept at top level because `run_name` already exists under
# `training` and a mapping cannot hold the same key twice. Whether
# `pretrained_checkpoint` belongs here or under `training` cannot be
# determined from this file -- confirm against the original dump.
pretrained_checkpoint: /net/scratch/hscra/plgrid/plgstefanop/checkpoints/speechlmm-v1/llava-pretrain-audio-seamless-qformer-auto_avsr-mlp-llama_3_1b-speechlmm/v1/s/checkpoint-800
run_name: speechlmm-s