# Llama-speechlmm-1.0-s / config_hydra.yaml
#
# Model save
#
# 6dda8c0 verified 2 days ago
#
# 6.94 kB

	training_type: pretrain
	wandb_project: speechlmm-v1
	wandb_watch: 'false'
	num_gpus: 4
	num_nodes: 1
	accelerate_config: /net/scratch/hscra/plgrid/plgstefanop/llava/conf/accelerate/deepspeed.yaml
	deepspeed_config: /net/scratch/hscra/plgrid/plgstefanop/llava/conf/deepspeed/zero3.json
	adjustments: []
	data:
	data_config_path: /net/scratch/hscra/plgrid/plgstefanop/llava/conf/datasets/speechlmm_v1/speechlmm_v1.yml
	num_proc_for_preprocessing: 64
	dataloader_debug: false
	filter_broken_samples: false
	organize_eval_dataset_per_task: true
	group_dataset_by_task:
	train: true
	eval: false
	test: false
	task_weights:
	ASR: 0.3
	ST: 0.2
	SSUM: 0.1
	SQA: 0.1
	SLU_INTENT_ONLY: 0.1
	VSR: 0.2
	multi_task_sampler: alternating
	replacement: true
	rebuild_dataset_cache: false
	cache_final_datasets: true
	audio_input_sampling_rate: 16000
	codec_sampling_rate: null
	codec_frame_rate: null
	image_folder: null
	image_aspect_ratio: square
	is_multimodal: false
	lazy_preprocess: true
	align_text_to_audio: false
	use_text_tokens: true
	align_with_whisper: false
	restore_punctuation_and_spaces: true
	max_condition_audio_duration: 10
	variable_batch_size: false
	max_length_per_batch: null
	model:
	add_lm_head: true
	vision_select_layer: -1
	vision_use_patch_token: true
	vision_patch_merge_type: flat
	vision_select_feature: patch
	mm_use_im_start_end: false
	mm_use_audio_start_end: false
	use_audio_encoder_as_codec_encoder: false
	add_all_multimodal_tokens: true
	perturb_codes: true
	perturb_prob: 0.2
	pad_audio_weight: 0.5
	epad_audio_weight: 1
	pad_epad_audio_weight_decay: 0.5
	perturb_prob_decay: 0.5
	conversation_version: null
	codebook_weights:
	- 1
	- 1
	- 1
	- 1
	- 1
	- 1
	- 1
	- 1
	- 1
	- 1
	- 1
	- 1
	- 1
	- 1
	- 1
	- 1
	- 1
	- 1
	- 1
	- 1
	- 1
	- 1
	- 1
	- 1
	- 1
	- 1
	- 1
	- 1
	- 1
	- 1
	- 1
	- 1
	chunk_size_in_seconds: 15
	chunk_overlap_in_seconds: 1
	chunk_encoding_strategy: loop
	audio_loss_decay: 1
	audio_loss_weight: 1
	tokenizer_padding_side: right
	audio_encoder:
	_name_or_path: meetween/seamless-m4t-v2-large-speech-encoder
	adaptor_dropout: 0.1
	adaptor_kernel_size: 8
	adaptor_stride: 8
	add_adapter: true
	architectures:
	- SeamlessM4Tv2SpeechEncoder
	conv_depthwise_kernel_size: 31
	feature_projection_input_dim: 160
	hidden_size: 1024
	initializer_range: 0.02
	layer_norm_eps: 1.0e-05
	left_max_position_embeddings: 64
	model_type: seamless_m4t_v2
	num_adapter_layers: 1
	position_embeddings_type: relative_key
	right_max_position_embeddings: 8
	speech_encoder_attention_heads: 16
	speech_encoder_chunk_size: 20000
	speech_encoder_dropout: 0.0
	speech_encoder_hidden_act: swish
	speech_encoder_intermediate_size: 4096
	speech_encoder_layerdrop: 0.1
	speech_encoder_layers: 24
	speech_encoder_left_chunk_num: 128
	torch_dtype: float32
	transformers_version: 4.37.2
	use_cache: true
	sampling_rate: 16000
	audio_adapter:
	model_type: qformer
	input_dim: 1024
	output_dim: 2048
	hidden_size: 768
	num_hidden_layers: 4
	num_attention_heads: 12
	intermediate_size: 3072
	hidden_act: gelu
	hidden_dropout_prob: 0.1
	attention_probs_dropout_prob: 0.1
	initializer_range: 0.02
	layer_norm_eps: 1.0e-12
	add_cross_attention: true
	num_queries: 1
	cross_attention_every_n_layers: 1
	compress_factor: 2
	triplet_loss: false
	text_decoder:
	_name_or_path: meta-llama/Llama-3.2-1B-Instruct
	architectures:
	- LlamaForCausalLM
	attention_bias: false
	attention_dropout: 0.0
	bos_token_id: 128000
	eos_token_id:
	- 128001
	- 128008
	- 128009
	head_dim: 64
	hidden_act: silu
	hidden_size: 2048
	initializer_range: 0.02
	intermediate_size: 8192
	max_position_embeddings: 131072
	mlp_bias: false
	model_type: llama
	num_attention_heads: 32
	num_hidden_layers: 16
	num_key_value_heads: 8
	pretraining_tp: 1
	rms_norm_eps: 1.0e-05
	rope_scaling:
	factor: 32.0
	high_freq_factor: 4.0
	low_freq_factor: 1.0
	original_max_position_embeddings: 8192
	rope_type: llama3
	rope_theta: 500000.0
	tie_word_embeddings: true
	torch_dtype: bfloat16
	transformers_version: 4.45.0.dev0
	use_cache: true
	vocab_size: 128256
	conversation_version: llama_3_1
	video_encoder:
	hidden_size: 768
	_name_or_path: /net/scratch/hscra/plgrid/plgstefanop/auto_avsr_ckpt/vsr_trlrs3vox2_base.pth
	model_type: auto_avsr
	adim: 768
	aheads: 12
	eunits: 3072
	elayers: 12
	transformer_input_layer: conv3d
	dropout_rate: 0.1
	transformer_attn_dropout_rate: 0.1
	transformer_encoder_attn_layer_type: rel_mha
	macaron_style: true
	use_cnn_module: true
	cnn_module_kernel: 31
	zero_triu: false
	a_upsample_ratio: 1
	relu_type: swish
	ddim: 768
	dheads: 12
	dunits: 3072
	dlayers: 6
	lsm_weight: 0.1
	transformer_length_normalized_loss: false
	mtlalpha: 0.1
	ctc_type: builtin
	rel_pos_type: latest
	video_adapter:
	model_type: mlp
	input_dim: 768
	output_dim: 2048
	hidden_layers: 4
	hidden_size: 4096
	residual_type: interpolation
	force_input_projection: true
	force_output_projection: true
	training:
	modality: audio
	group_by_modality_length: false
	load_in_4bit: false
	load_in_8bit: false
	bnb_4bit_quant_type: nf4
	bnb_4bit_use_double_quant: true
	lora_adapters: []
	num_steps_between_each_restart: null
	lr_min: 1.0e-06
	eval_temperature: 0
	eval_max_new_tokens: 200
	eval_num_batched_generations: 4
	cache_dir: null
	resume_from_checkpoint: null
	model_max_length: 2048
	freeze_modules:
	- audio_encoder
	- video_encoder
	- text_decoder
	attn_implementation: flash_attention_2
	mpt_attn_impl: triton
	mm_projector_lr: null
	output_dir: /net/scratch/hscra/plgrid/plgstefanop/checkpoints/speechlmm-v1/llava-huggingface-s
	report_to: wandb
	run_name: pretrain-audio-seamless-qformer-auto_avsr-mlp-llama_3_1b-speechlmm/v1/s
	num_train_epochs: 0
	max_steps: 0
	per_device_train_batch_size: 4
	per_device_eval_batch_size: 8
	gradient_accumulation_steps: 12
	gradient_checkpointing: true
	optim: adamw_torch
	learning_rate: 5.0e-05
	weight_decay: 0.0
	lr_scheduler_type: cosine
	warmup_ratio: 0.03
	logging_steps: 1
	save_strategy: steps
	save_steps: 200
	save_total_limit: 10
	eval_strategy: 'no'
	eval_steps: 0
	push_to_hub: true
	hub_model_id: meetween/Llama-speechlmm-1.0-s
	hub_strategy: end
	hub_token: null
	hub_private_repo: false
	fp16: false
	bf16: true
	tf32: true
	dataloader_num_workers: 4
	remove_unused_columns: false
	seed: 42
	pretrained_checkpoint: /net/scratch/hscra/plgrid/plgstefanop/checkpoints/speechlmm-v1/llava-pretrain-audio-seamless-qformer-auto_avsr-mlp-llama_3_1b-speechlmm/v1/s/checkpoint-800
	run_name: speechlmm-s