---
# Training run configuration
# (training_type: pretrain, wandb_project: speechlmm-v1).
#
# NOTE(review): the section nesting below (data / model / training and the
# sub-sections of model) is inferred from how the keys are grouped; confirm
# it against the schema expected by the code that loads this file.

training_type: pretrain
wandb_project: speechlmm-v1
# Quoted, so this loads as the string 'false' rather than a boolean.
wandb_watch: 'false'
num_gpus: 4
num_nodes: 1
accelerate_config: /net/scratch/hscra/plgrid/plgstefanop/llava/conf/accelerate/deepspeed.yaml
deepspeed_config: /net/scratch/hscra/plgrid/plgstefanop/llava/conf/deepspeed/zero3.json
adjustments: []

# Dataset location, preprocessing and multi-task sampling options.
data:
  data_config_path: /net/scratch/hscra/plgrid/plgstefanop/llava/conf/datasets/speechlmm_v1/speechlmm_v1.yml
  num_proc_for_preprocessing: 64
  dataloader_debug: false
  filter_broken_samples: false
  organize_eval_dataset_per_task: true
  # Per-split switches; only the train split is grouped by task here.
  group_dataset_by_task:
    train: true
    eval: false
    test: false
  # Per-task weights; the six values listed sum to 1.0.
  task_weights:
    ASR: 0.3
    ST: 0.2
    SSUM: 0.1
    SQA: 0.1
    SLU_INTENT_ONLY: 0.1
    VSR: 0.2
  multi_task_sampler: alternating
  replacement: true
  rebuild_dataset_cache: false
  cache_final_datasets: true
  audio_input_sampling_rate: 16000
  codec_sampling_rate: null
  codec_frame_rate: null
  image_folder: null
  image_aspect_ratio: square
  is_multimodal: false
  lazy_preprocess: true
  align_text_to_audio: false
  use_text_tokens: true
  align_with_whisper: false
  restore_punctuation_and_spaces: true
  max_condition_audio_duration: 10
  variable_batch_size: false
  max_length_per_batch: null

# Model-level options followed by one sub-section per component
# (audio_encoder, audio_adapter, text_decoder, video_encoder, video_adapter).
model:
  add_lm_head: true
  vision_select_layer: -1
  vision_use_patch_token: true
  vision_patch_merge_type: flat
  vision_select_feature: patch
  mm_use_im_start_end: false
  mm_use_audio_start_end: false
  use_audio_encoder_as_codec_encoder: false
  add_all_multimodal_tokens: true
  perturb_codes: true
  perturb_prob: 0.2
  pad_audio_weight: 0.5
  epad_audio_weight: 1
  pad_epad_audio_weight_decay: 0.5
  perturb_prob_decay: 0.5
  conversation_version: null
  # 32 entries, all equal to 1.
  codebook_weights:
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
    - 1
  chunk_size_in_seconds: 15
  chunk_overlap_in_seconds: 1
  chunk_encoding_strategy: loop
  audio_loss_decay: 1
  audio_loss_weight: 1
  tokenizer_padding_side: right

  audio_encoder:
    _name_or_path: meetween/seamless-m4t-v2-large-speech-encoder
    adaptor_dropout: 0.1
    adaptor_kernel_size: 8
    adaptor_stride: 8
    add_adapter: true
    architectures:
      - SeamlessM4Tv2SpeechEncoder
    conv_depthwise_kernel_size: 31
    feature_projection_input_dim: 160
    hidden_size: 1024
    initializer_range: 0.02
    layer_norm_eps: 1.0e-05
    left_max_position_embeddings: 64
    model_type: seamless_m4t_v2
    num_adapter_layers: 1
    position_embeddings_type: relative_key
    right_max_position_embeddings: 8
    speech_encoder_attention_heads: 16
    speech_encoder_chunk_size: 20000
    speech_encoder_dropout: 0.0
    speech_encoder_hidden_act: swish
    speech_encoder_intermediate_size: 4096
    speech_encoder_layerdrop: 0.1
    speech_encoder_layers: 24
    speech_encoder_left_chunk_num: 128
    torch_dtype: float32
    # Quoted so the version always loads as a string.
    transformers_version: '4.37.2'
    use_cache: true
    # Same value as data.audio_input_sampling_rate above.
    sampling_rate: 16000

  audio_adapter:
    model_type: qformer
    # input_dim equals audio_encoder.hidden_size (1024);
    # output_dim equals text_decoder.hidden_size (2048).
    input_dim: 1024
    output_dim: 2048
    hidden_size: 768
    num_hidden_layers: 4
    num_attention_heads: 12
    intermediate_size: 3072
    hidden_act: gelu
    hidden_dropout_prob: 0.1
    attention_probs_dropout_prob: 0.1
    initializer_range: 0.02
    layer_norm_eps: 1.0e-12
    add_cross_attention: true
    num_queries: 1
    cross_attention_every_n_layers: 1
    compress_factor: 2
    triplet_loss: false

  text_decoder:
    _name_or_path: meta-llama/Llama-3.2-1B-Instruct
    architectures:
      - LlamaForCausalLM
    attention_bias: false
    attention_dropout: 0.0
    bos_token_id: 128000
    eos_token_id:
      - 128001
      - 128008
      - 128009
    head_dim: 64
    hidden_act: silu
    hidden_size: 2048
    initializer_range: 0.02
    intermediate_size: 8192
    max_position_embeddings: 131072
    mlp_bias: false
    model_type: llama
    num_attention_heads: 32
    num_hidden_layers: 16
    num_key_value_heads: 8
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_scaling:
      factor: 32.0
      high_freq_factor: 4.0
      low_freq_factor: 1.0
      original_max_position_embeddings: 8192
      rope_type: llama3
    rope_theta: 500000.0
    tie_word_embeddings: true
    torch_dtype: bfloat16
    # Quoted so the version always loads as a string.
    transformers_version: '4.45.0.dev0'
    use_cache: true
    vocab_size: 128256
    conversation_version: llama_3_1

  video_encoder:
    hidden_size: 768
    _name_or_path: /net/scratch/hscra/plgrid/plgstefanop/auto_avsr_ckpt/vsr_trlrs3vox2_base.pth
    model_type: auto_avsr
    adim: 768
    aheads: 12
    eunits: 3072
    elayers: 12
    transformer_input_layer: conv3d
    dropout_rate: 0.1
    transformer_attn_dropout_rate: 0.1
    transformer_encoder_attn_layer_type: rel_mha
    macaron_style: true
    use_cnn_module: true
    cnn_module_kernel: 31
    zero_triu: false
    a_upsample_ratio: 1
    relu_type: swish
    ddim: 768
    dheads: 12
    dunits: 3072
    dlayers: 6
    lsm_weight: 0.1
    transformer_length_normalized_loss: false
    mtlalpha: 0.1
    ctc_type: builtin
    rel_pos_type: latest

  video_adapter:
    model_type: mlp
    # input_dim equals video_encoder.hidden_size (768);
    # output_dim equals text_decoder.hidden_size (2048).
    input_dim: 768
    output_dim: 2048
    hidden_layers: 4
    hidden_size: 4096
    residual_type: interpolation
    force_input_projection: true
    force_output_projection: true

# Optimisation, checkpointing, logging and hub-upload options.
training:
  modality: audio
  group_by_modality_length: false
  load_in_4bit: false
  load_in_8bit: false
  bnb_4bit_quant_type: nf4
  bnb_4bit_use_double_quant: true
  lora_adapters: []
  num_steps_between_each_restart: null
  lr_min: 1.0e-06
  eval_temperature: 0
  eval_max_new_tokens: 200
  eval_num_batched_generations: 4
  cache_dir: null
  resume_from_checkpoint: null
  model_max_length: 2048
  # Of the five model sub-sections, only audio_adapter and video_adapter
  # are absent from this list.
  freeze_modules:
    - audio_encoder
    - video_encoder
    - text_decoder
  attn_implementation: flash_attention_2
  mpt_attn_impl: triton
  mm_projector_lr: null
  output_dir: /net/scratch/hscra/plgrid/plgstefanop/checkpoints/speechlmm-v1/llava-huggingface-s
  report_to: wandb
  run_name: pretrain-audio-seamless-qformer-auto_avsr-mlp-llama_3_1b-speechlmm/v1/s
  # NOTE(review): num_train_epochs and max_steps are both 0 in this file;
  # confirm the launcher overrides one of them before reusing this config.
  num_train_epochs: 0
  max_steps: 0
  per_device_train_batch_size: 4
  per_device_eval_batch_size: 8
  gradient_accumulation_steps: 12
  gradient_checkpointing: true
  optim: adamw_torch
  learning_rate: 5.0e-05
  weight_decay: 0.0
  lr_scheduler_type: cosine
  warmup_ratio: 0.03
  logging_steps: 1
  save_strategy: steps
  save_steps: 200
  save_total_limit: 10
  # Quoted, so this loads as the string 'no' rather than a boolean.
  eval_strategy: 'no'
  eval_steps: 0
  push_to_hub: true
  hub_model_id: meetween/Llama-speechlmm-1.0-s
  hub_strategy: end
  hub_token: null
  hub_private_repo: false
  fp16: false
  bf16: true
  tf32: true
  dataloader_num_workers: 4
  remove_unused_columns: false
  seed: 42

# NOTE(review): these two keys trail the training section. They are kept at
# top level so that this run_name does not collide with training.run_name
# (a duplicate key inside one mapping is invalid YAML and most parsers
# silently keep only the last value). Confirm where the loader expects them.
pretrained_checkpoint: /net/scratch/hscra/plgrid/plgstefanop/checkpoints/speechlmm-v1/llava-pretrain-audio-seamless-qformer-auto_avsr-mlp-llama_3_1b-speechlmm/v1/s/checkpoint-800
run_name: speechlmm-s