Commit 5d59dad · 1 Parent(s): f11ac57
root committed: initial commit
configs/inference_1.5.yaml DELETED
@@ -1,302 +0,0 @@
- train_config:
-   expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-1.5b
-   run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-1.5B-sft
-   delete_previous_checkpoint: true
-   batch_size: 32
-   gradient_accumulation_steps: 2
-   seed: 42
-   learning_rate: 0.00002
-   lr_scheduler: constant
-   loss_multiplier: 1.0
-   warmup_steps: 1875
-   weight_decay: 0.1
-   precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
-   gradient_checkpointing: False
-   num_epochs: 200 # num_epochs * dataset_blending_global_weight = 1
-   offline: false
-   freeze_lm_embeddings: false
-   logging_steps: 10
-   dist_backend: nccl
-   dist_url: env:// # tcp://localhost:7000
-   no_set_device_rank: false
-   fsdp: true
-   fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
-   fsdp_sharding_strategy: full # full, hybrid
-   horovod: false
-
- # instruction tuning hparams
- # sft_config:
- #   pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-7b-fixed/
- #   pretrained_ckpt: checkpoint_199.pt
- #   unfreeze_full_lm: false
-
- data_config:
-   dataset_blending_global_weight: 0.005
-
-   dataset_blending_config:
-
-     MMAUQA/train:
-       weight: 1.5
-
-     AudioSet-Temporal-Speech-Audio-QA/train:
-       weight: 1.0
-
-     CompA-R-AQA/train:
-       weight: 1.0
-
-     # Audio QA
-     Clotho-AQA-AQA/train:
-       weight: 1.0
-
-     OpenAQA-AQA/train:
-       weight: 1.0
-
-     SalmonnQA/train:
-       weight: 0.8
-
-     AudioEntailmentQA/train:
-       weight: 1.0
-
-     # Audio Captioning
-
-     Clotho-v2-AudioCaptioning/train:
-       weight: 1.0
-
-     audiocaps-AudioCaptioning/train:
-       weight: 1.0
-
-     Epidemic_sound-AudioCaptioning/train:
-       weight: 1.0
-
-     MACS-AudioCaptioning/train:
-       weight: 1.0
-
-     # Audio Classification
-
-     UrbanSound8K-EventClassification/train:
-       weight: 0.5
-
-     TUT-EventClassification/train:
-       weight: 2.0
-
-     FSD50k-EventClassification/train:
-       weight: 1.0
-
-     CochlScene-SceneClassification/train:
-       weight: 1.0
-
-     NonSpeech7k-EventClassification/train:
-       weight: 1.0
-
-     chime-home-EventClassification/train:
-       weight: 1.0
-
-     SONYC-UST-EventClassification/train:
-       weight: 1.0
-
-     # Speech Emotion Classification
-
-     MELD-EmotionClassification/train:
-       weight: 0.5
-
-     MELD-SentimentClassification/train:
-       weight: 0.5
-
-     emov-db-EmotionClassification/train:
-       weight: 1.0
-
-     jl-corpus-EmotionClassification/train:
-       weight: 6.0
-
-     tess-EmotionClassification/train:
-       weight: 2.5
-
-     IEMOCAP-EmotionClassification/train:
-       weight: 3.0
-
-     OMGEmotion-EmotionClassification/train:
-       weight: 3.0
-
-     VocalSound-VocalClassification/train:
-       weight: 1.5
-
-     # Music QA
-
-     Music-AVQA-AQA_All/train:
-       weight: 3.0
-
-     MU-LLAMA-AQA/train:
-       weight: 1.0
-
-     # Music Captioning
-
-     LP-MusicCaps-MSD-AudioCaptioning/train:
-       weight: 0.06
-
-     LP-MusicCaps-MC-AudioCaptioning/train:
-       weight: 2.0
-
-     LP-MusicCaps-MTT-AudioCaptioning/train:
-       weight: 1.0
-
-     MusicCaps-AudioCaptioning/train:
-       weight: 6.0
-
-     musdbhq-captioning/train:
-       weight: 2.0
-
-     # Music Understanding
-
-     Medley-solos-DB-InstrClassification/train:
-       weight: 1.5
-
-     GTZAN-GenreClassification/train:
-       weight: 2.0
-
-     NSynth-MIR/train:
-       weight: 0.4
-
-     NSynth-Instrument/train:
-       weight: 1.5
-
-     NSynth-Source/train:
-       weight: 1.5
-
-     mtg-jamendo-MusicTagging/train:
-       weight: 1.0
-
-     FMA-GenreClassification/train:
-       weight: 1.0
-
-     musdbhq-InstrClassification/train:
-       weight: 1.0
-
-     LLARK_FMA-mir/train:
-       weight: 1.0
-
-     LLARK_FMA-reasoning/train:
-       weight: 1.0
-
-     LLARK_MagnaTagATune-mir/train:
-       weight: 1.0
-
-     LLARK_MTG-Jamendo-reasoning/train:
-       weight: 1.0
-
-     LLARK_MagnaTagATune-reasoning/train:
-       weight: 1.0
-
-     LLARK_MTG-Jamendo-mir/train:
-       weight: 1.0
-
-     MusicBenchQA/train:
-       weight: 1.0
-
-   dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
-   data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
-   dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers/dataset_blending.json
-   max_tokens: 512
-   num_workers: 4
-
-   valid_dataset_config:
-
-     Clotho-AQA-AQA/test: true
-
-     Clotho-v2-AudioCaptioning/test: true
-     audiocaps-AudioCaptioning/test: true
-
-     FSD50k-EventClassification/test: true
-     CochlScene-SceneClassification/test: true
-     NonSpeech7k-EventClassification/test: true
-     SONYC-UST-EventClassification/test: true
-
-     MELD-EmotionClassification/test: true
-     MELD-SentimentClassification/test: true
-     emov-db-EmotionClassification/val: true
-     jl-corpus-EmotionClassification/val: true
-     tess-EmotionClassification/val: true
-     IEMOCAP-EmotionClassification/val: true
-     OMGEmotion-EmotionClassification/val: true
-     VocalSound-VocalClassification/test: true
-
-     Music-AVQA-AQA_All/test: true
-     MU-LLAMA-AQA/test: true
-
-     LP-MusicCaps-MSD-AudioCaptioning/test: true
-     LP-MusicCaps-MC-AudioCaptioning/test: true
-     LP-MusicCaps-MTT-AudioCaptioning/test: true
-     MusicCaps-AudioCaptioning/test: true
-
-     NSynth-MIR/test: true
-     mtg-jamendo-MusicTagging/val: true
-     musdbhq-InstrClassification/test: true
-
-     # zero shot
-     # CREMA-D-EmotionClassification/train:
-     #   prefix_prob: 1.0
-
-     # ravdess-EmotionClassification/train:
-     #   prefix_prob: 1.0
-
-     # UrbanSound8K-EventClassification/train:
-     #   prefix_prob: 1.0
-
-     # ESC50-EventClassification/train:
-     #   prefix_prob: 1.0
-
-     # DCASE17Task4-SceneClassification/test:
-     #   prefix_prob: 1.0
-
-     # GTZAN-GenreClassification/train:
-     #   prefix_prob: 1.0
-
-     # Medley-solos-DB-InstrClassification/test:
-     #   prefix_prob: 1.0
-
- clap_config:
-   method: nvclap-large
-   audio_embed_dim: 2048
-   checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt
-
-   window_length: 10.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 9 # 1.5 minutes
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-   finetune: true
-
- whisper_config:
-   method: whisper-large-v3
-   path: openai/whisper-large-v3
-   audio_embed_dim: 1280
-   sampling_rate: 16000
-
-   window_length: 30.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 1 # 30 seconds
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-
- mert_config:
-   method: mert-v1
-   path: m-a-p/MERT-v1-330M
-   audio_embed_dim: 1024
-   sampling_rate: 24000
-
-   window_length: 10.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 1 # 10 seconds
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-
- model_config:
-   cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
-
-   lang_encoder_path: Qwen/Qwen2.5-1.5B
-   tokenizer_path: Qwen/Qwen2.5-1.5B
-   cross_attn_every_n_layers: 1
-   audio_transformer_kwargs: {
-     n_head: 8,
-     n_layers: 3,
-     d_inner: 2048,
-     max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
-     max_window_per_audio: 1, # must = max_num_window
-     common_encoder_embed_dim: 1024
-   }
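For reference, a deleted config like the one above can be read back with standard YAML tooling. A minimal sketch, assuming PyYAML and a local copy of the file; the assertion checks the "num_epochs * dataset_blending_global_weight = 1" relation stated in the train_config comment:

import yaml  # PyYAML, assumed installed

with open("configs/inference_1.5.yaml") as f:
    cfg = yaml.safe_load(f)

# The train_config comment asserts num_epochs * dataset_blending_global_weight = 1.
product = cfg["train_config"]["num_epochs"] * cfg["data_config"]["dataset_blending_global_weight"]
assert abs(product - 1.0) < 1e-9, f"unexpected epoch/weight product: {product}"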
configs/inference_2.yaml DELETED
@@ -1,302 +0,0 @@
- train_config:
-   expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers
-   run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-7b-fixed
-   delete_previous_checkpoint: true
-   batch_size: 4
-   gradient_accumulation_steps: 2
-   seed: 42
-   learning_rate: 0.00002
-   lr_scheduler: constant
-   loss_multiplier: 1.0
-   warmup_steps: 1875
-   weight_decay: 0.1
-   precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
-   gradient_checkpointing: False
-   num_epochs: 200 # num_epochs * dataset_blending_global_weight = 1
-   offline: false
-   freeze_lm_embeddings: false
-   logging_steps: 10
-   dist_backend: nccl
-   dist_url: env:// # tcp://localhost:7000
-   no_set_device_rank: false
-   fsdp: true
-   fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
-   fsdp_sharding_strategy: full # full, hybrid
-   horovod: false
-
- # instruction tuning hparams
- sft_config:
-   pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-7b-fixed/
-   pretrained_ckpt: checkpoint_199.pt
-   unfreeze_full_lm: false
-
- data_config:
-   dataset_blending_global_weight: 0.005
-
-   dataset_blending_config:
-
-     MMAUQA/train:
-       weight: 1.5
-
-     AudioSet-Temporal-Speech-Audio-QA/train:
-       weight: 1.0
-
-     CompA-R-AQA/train:
-       weight: 1.0
-
-     # Audio QA
-     Clotho-AQA-AQA/train:
-       weight: 1.0
-
-     OpenAQA-AQA/train:
-       weight: 1.0
-
-     SalmonnQA/train:
-       weight: 0.8
-
-     AudioEntailmentQA/train:
-       weight: 1.0
-
-     # Audio Captioning
-
-     Clotho-v2-AudioCaptioning/train:
-       weight: 1.0
-
-     audiocaps-AudioCaptioning/train:
-       weight: 1.0
-
-     Epidemic_sound-AudioCaptioning/train:
-       weight: 1.0
-
-     MACS-AudioCaptioning/train:
-       weight: 1.0
-
-     # Audio Classification
-
-     UrbanSound8K-EventClassification/train:
-       weight: 0.5
-
-     TUT-EventClassification/train:
-       weight: 2.0
-
-     FSD50k-EventClassification/train:
-       weight: 1.0
-
-     CochlScene-SceneClassification/train:
-       weight: 1.0
-
-     NonSpeech7k-EventClassification/train:
-       weight: 1.0
-
-     chime-home-EventClassification/train:
-       weight: 1.0
-
-     SONYC-UST-EventClassification/train:
-       weight: 1.0
-
-     # Speech Emotion Classification
-
-     MELD-EmotionClassification/train:
-       weight: 0.5
-
-     MELD-SentimentClassification/train:
-       weight: 0.5
-
-     emov-db-EmotionClassification/train:
-       weight: 1.0
-
-     jl-corpus-EmotionClassification/train:
-       weight: 6.0
-
-     tess-EmotionClassification/train:
-       weight: 2.5
-
-     IEMOCAP-EmotionClassification/train:
-       weight: 3.0
-
-     OMGEmotion-EmotionClassification/train:
-       weight: 3.0
-
-     VocalSound-VocalClassification/train:
-       weight: 1.5
-
-     # Music QA
-
-     Music-AVQA-AQA_All/train:
-       weight: 3.0
-
-     MU-LLAMA-AQA/train:
-       weight: 1.0
-
-     # Music Captioning
-
-     LP-MusicCaps-MSD-AudioCaptioning/train:
-       weight: 0.06
-
-     LP-MusicCaps-MC-AudioCaptioning/train:
-       weight: 2.0
-
-     LP-MusicCaps-MTT-AudioCaptioning/train:
-       weight: 1.0
-
-     MusicCaps-AudioCaptioning/train:
-       weight: 6.0
-
-     musdbhq-captioning/train:
-       weight: 2.0
-
-     # Music Understanding
-
-     Medley-solos-DB-InstrClassification/train:
-       weight: 1.5
-
-     GTZAN-GenreClassification/train:
-       weight: 2.0
-
-     NSynth-MIR/train:
-       weight: 0.4
-
-     NSynth-Instrument/train:
-       weight: 1.5
-
-     NSynth-Source/train:
-       weight: 1.5
-
-     mtg-jamendo-MusicTagging/train:
-       weight: 1.0
-
-     FMA-GenreClassification/train:
-       weight: 1.0
-
-     musdbhq-InstrClassification/train:
-       weight: 1.0
-
-     LLARK_FMA-mir/train:
-       weight: 1.0
-
-     LLARK_FMA-reasoning/train:
-       weight: 1.0
-
-     LLARK_MagnaTagATune-mir/train:
-       weight: 1.0
-
-     LLARK_MTG-Jamendo-reasoning/train:
-       weight: 1.0
-
-     LLARK_MagnaTagATune-reasoning/train:
-       weight: 1.0
-
-     LLARK_MTG-Jamendo-mir/train:
-       weight: 1.0
-
-     MusicBenchQA/train:
-       weight: 1.0
-
-   dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
-   data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
-   dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers/dataset_blending.json
-   max_tokens: 512
-   num_workers: 4
-
-   valid_dataset_config:
-
-     Clotho-AQA-AQA/test: true
-
-     Clotho-v2-AudioCaptioning/test: true
-     audiocaps-AudioCaptioning/test: true
-
-     FSD50k-EventClassification/test: true
-     CochlScene-SceneClassification/test: true
-     NonSpeech7k-EventClassification/test: true
-     SONYC-UST-EventClassification/test: true
-
-     MELD-EmotionClassification/test: true
-     MELD-SentimentClassification/test: true
-     emov-db-EmotionClassification/val: true
-     jl-corpus-EmotionClassification/val: true
-     tess-EmotionClassification/val: true
-     IEMOCAP-EmotionClassification/val: true
-     OMGEmotion-EmotionClassification/val: true
-     VocalSound-VocalClassification/test: true
-
-     Music-AVQA-AQA_All/test: true
-     MU-LLAMA-AQA/test: true
-
-     LP-MusicCaps-MSD-AudioCaptioning/test: true
-     LP-MusicCaps-MC-AudioCaptioning/test: true
-     LP-MusicCaps-MTT-AudioCaptioning/test: true
-     MusicCaps-AudioCaptioning/test: true
-
-     NSynth-MIR/test: true
-     mtg-jamendo-MusicTagging/val: true
-     musdbhq-InstrClassification/test: true
-
-     # zero shot
-     # CREMA-D-EmotionClassification/train:
-     #   prefix_prob: 1.0
-
-     # ravdess-EmotionClassification/train:
-     #   prefix_prob: 1.0
-
-     # UrbanSound8K-EventClassification/train:
-     #   prefix_prob: 1.0
-
-     # ESC50-EventClassification/train:
-     #   prefix_prob: 1.0
-
-     # DCASE17Task4-SceneClassification/test:
-     #   prefix_prob: 1.0
-
-     # GTZAN-GenreClassification/train:
-     #   prefix_prob: 1.0
-
-     # Medley-solos-DB-InstrClassification/test:
-     #   prefix_prob: 1.0
-
- clap_config:
-   method: nvclap-large
-   audio_embed_dim: 2048
-   checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt
-
-   window_length: 10.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 9 # 1.5 minutes
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-   finetune: true
-
- whisper_config:
-   method: whisper-large-v3
-   path: openai/whisper-large-v3
-   audio_embed_dim: 1280
-   sampling_rate: 16000
-
-   window_length: 30.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 1 # 30 seconds
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-
- mert_config:
-   method: mert-v1
-   path: m-a-p/MERT-v1-330M
-   audio_embed_dim: 1024
-   sampling_rate: 24000
-
-   window_length: 10.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 1 # 10 seconds
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-
- model_config:
-   cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
-
-   lang_encoder_path: Qwen/Qwen2.5-3B
-   tokenizer_path: Qwen/Qwen2.5-3B
-   cross_attn_every_n_layers: 1
-   audio_transformer_kwargs: {
-     n_head: 8,
-     n_layers: 3,
-     d_inner: 2048,
-     max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
-     max_window_per_audio: 1, # must = max_num_window
-     common_encoder_embed_dim: 1024
-   }
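The weights in dataset_blending_config are relative mixture weights, not probabilities. A sketch of how such weights could be normalized into per-dataset sampling probabilities; this is an illustration under that assumption, not the repository's actual blending code:

# Three of the weights from the config above, as an example.
weights = {
    "MMAUQA/train": 1.5,
    "AudioSet-Temporal-Speech-Audio-QA/train": 1.0,
    "LP-MusicCaps-MSD-AudioCaptioning/train": 0.06,
}
total = sum(weights.values())
probs = {name: w / total for name, w in weights.items()}
for name, p in probs.items():
    print(f"{name}: {p:.3f}")  # e.g. MMAUQA/train gets 1.5 / 2.56 of the draws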
configs/inference_long.yaml DELETED
@@ -1,284 +0,0 @@
- train_config:
-   expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed-sft
-   run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed-sft-long
-   delete_previous_checkpoint: true
-   batch_size: 2
-   gradient_accumulation_steps: 2
-   seed: 42
-   learning_rate: 0.00002
-   lr_scheduler: constant
-   loss_multiplier: 1.0
-   warmup_steps: 1875
-   weight_decay: 0.1
-   precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
-   gradient_checkpointing: False
-   num_epochs: 200 # num_epochs * dataset_blending_global_weight = 1
-   offline: false
-   freeze_lm_embeddings: false
-   logging_steps: 10
-   dist_backend: nccl
-   dist_url: env:// # tcp://localhost:7000
-   no_set_device_rank: false
-   fsdp: true
-   fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
-   fsdp_sharding_strategy: full # full, hybrid
-   horovod: false
-
- # instruction tuning hparams
- # sft_config:
- #   pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed_ckpt_stage1/
- #   pretrained_ckpt: checkpoint_199.pt
- #   unfreeze_full_lm: false
-
- data_config:
-   dataset_blending_global_weight: 0.005
-
-   dataset_blending_config:
-
-     MMAUQA/train:
-       weight: 1.5
-
-     AudioSet-Temporal-Speech-Audio-QA/train:
-       weight: 1.0
-
-     CompA-R-AQA/train:
-       weight: 1.0
-
-     # Audio QA
-     Clotho-AQA-AQA/train:
-       weight: 1.0
-
-     OpenAQA-AQA/train:
-       weight: 1.0
-
-     SalmonnQA/train:
-       weight: 1.0
-
-     AudioEntailmentQA/train:
-       weight: 1.0
-
-     # Audio Captioning
-
-     Clotho-v2-AudioCaptioning/train:
-       weight: 1.0
-
-     audiocaps-AudioCaptioning/train:
-       weight: 1.0
-
-     Epidemic_sound-AudioCaptioning/train:
-       weight: 1.0
-
-     MACS-AudioCaptioning/train:
-       weight: 1.0
-
-     # Audio Classification
-
-     FSD50k-EventClassification/train:
-       weight: 1.0
-
-     CochlScene-SceneClassification/train:
-       weight: 1.0
-
-     NonSpeech7k-EventClassification/train:
-       weight: 1.0
-
-     chime-home-EventClassification/train:
-       weight: 1.0
-
-     SONYC-UST-EventClassification/train:
-       weight: 1.0
-
-     # Speech Emotion Classification
-
-     MELD-EmotionClassification/train:
-       weight: 0.5
-
-     MELD-SentimentClassification/train:
-       weight: 0.5
-
-     emov-db-EmotionClassification/train:
-       weight: 1.0
-
-     jl-corpus-EmotionClassification/train:
-       weight: 6.0
-
-     tess-EmotionClassification/train:
-       weight: 2.5
-
-     IEMOCAP-EmotionClassification/train:
-       weight: 3.0
-
-     OMGEmotion-EmotionClassification/train:
-       weight: 3.0
-
-     VocalSound-VocalClassification/train:
-       weight: 1.5
-
-     # Music QA
-
-     Music-AVQA-AQA_All/train:
-       weight: 3.0
-
-     MU-LLAMA-AQA/train:
-       weight: 1.0
-
-     # Music Captioning
-
-     LP-MusicCaps-MSD-AudioCaptioning/train:
-       weight: 0.06
-
-     LP-MusicCaps-MC-AudioCaptioning/train:
-       weight: 2.0
-
-     LP-MusicCaps-MTT-AudioCaptioning/train:
-       weight: 1.0
-
-     MusicCaps-AudioCaptioning/train:
-       weight: 6.0
-
-     musdbhq-captioning/train:
-       weight: 2.0
-
-     # Music Understanding
-
-     NSynth-MIR/train:
-       weight: 0.2
-
-     mtg-jamendo-MusicTagging/train:
-       weight: 0.1
-
-     FMA-GenreClassification/train:
-       weight: 0.5
-
-     musdbhq-InstrClassification/train:
-       weight: 0.8
-
-     LLARK_FMA-mir/train:
-       weight: 1.0
-
-     LLARK_FMA-reasoning/train:
-       weight: 1.0
-
-     LLARK_MagnaTagATune-mir/train:
-       weight: 1.0
-
-     LLARK_MTG-Jamendo-reasoning/train:
-       weight: 1.0
-
-     LLARK_MagnaTagATune-reasoning/train:
-       weight: 1.0
-
-     LLARK_MTG-Jamendo-mir/train:
-       weight: 1.0
-
-     MusicBenchQA/train:
-       weight: 1.0
-
-   dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
-   data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
-   dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/dataset_blending.json
-   max_tokens: 512
-   num_workers: 4
-
-   valid_dataset_config:
-
-     Clotho-AQA-AQA/test: true
-
-     Clotho-v2-AudioCaptioning/test: true
-     audiocaps-AudioCaptioning/test: true
-
-     FSD50k-EventClassification/test: true
-     CochlScene-SceneClassification/test: true
-     NonSpeech7k-EventClassification/test: true
-     SONYC-UST-EventClassification/test: true
-
-     MELD-EmotionClassification/test: true
-     MELD-SentimentClassification/test: true
-     emov-db-EmotionClassification/val: true
-     jl-corpus-EmotionClassification/val: true
-     tess-EmotionClassification/val: true
-     IEMOCAP-EmotionClassification/val: true
-     OMGEmotion-EmotionClassification/val: true
-     VocalSound-VocalClassification/test: true
-
-     Music-AVQA-AQA_All/test: true
-     MU-LLAMA-AQA/test: true
-
-     LP-MusicCaps-MSD-AudioCaptioning/test: true
-     LP-MusicCaps-MC-AudioCaptioning/test: true
-     LP-MusicCaps-MTT-AudioCaptioning/test: true
-     MusicCaps-AudioCaptioning/test: true
-
-     NSynth-MIR/test: true
-     mtg-jamendo-MusicTagging/val: true
-     musdbhq-InstrClassification/test: true
-
-     # # zero shot
-     # CREMA-D-EmotionClassification/train:
-     #   prefix_prob: 1.0
-
-     # ravdess-EmotionClassification/train:
-     #   prefix_prob: 1.0
-
-     # UrbanSound8K-EventClassification/train:
-     #   prefix_prob: 1.0
-
-     # ESC50-EventClassification/train:
-     #   prefix_prob: 1.0
-
-     # DCASE17Task4-SceneClassification/test:
-     #   prefix_prob: 1.0
-
-     # GTZAN-GenreClassification/train:
-     #   prefix_prob: 1.0
-
-     # Medley-solos-DB-InstrClassification/test:
-     #   prefix_prob: 1.0
-
- clap_config:
-   method: nvclap-large
-   audio_embed_dim: 2048
-   checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt
-
-   window_length: 10.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 30 # 5 minutes
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-   finetune: true
-
- whisper_config:
-   method: whisper-large-v3
-   path: openai/whisper-large-v3
-   audio_embed_dim: 1280
-   sampling_rate: 16000
-
-   window_length: 30.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 1 # 30 seconds
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-
- mert_config:
-   method: mert-v1
-   path: m-a-p/MERT-v1-330M
-   audio_embed_dim: 1024
-   sampling_rate: 24000
-
-   window_length: 10.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 1 # 10 seconds
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-
- model_config:
-   cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
-
-   lang_encoder_path: Qwen/Qwen2.5-3B
-   tokenizer_path: Qwen/Qwen2.5-3B
-   cross_attn_every_n_layers: 1
-   audio_transformer_kwargs: {
-     n_head: 8,
-     n_layers: 3,
-     d_inner: 2048,
-     max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
-     max_window_per_audio: 1, # must = max_num_window
-     common_encoder_embed_dim: 1024
-   }
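The duration comments in these encoder configs follow directly from the window arithmetic: with window_length 10.0 s and window_overlap 0.0 s, max_num_window: 30 covers 300 s, i.e. 5 minutes. A small helper making that explicit (the function is illustrative, not from the repo):

def max_coverage_seconds(window_length, window_overlap, max_num_window):
    """Longest audio span covered by max_num_window sliding windows."""
    hop = window_length - window_overlap
    return window_length + hop * (max_num_window - 1)

assert max_coverage_seconds(10.0, 0.0, 30) == 300.0  # clap_config: 5 minutes
assert max_coverage_seconds(30.0, 0.0, 1) == 30.0    # whisper_config: 30 seconds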
configs/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node.yaml DELETED
@@ -1,255 +0,0 @@
- train_config:
-   expdir: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/v1.0_optimlmax1.3b_foundation
-   run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node
-   delete_previous_checkpoint: true
-   batch_size: 6
-   gradient_accumulation_steps: 2 # 4 nodes
-   seed: 42
-   learning_rate: 0.0001
-   lr_scheduler: constant
-   loss_multiplier: 1.0
-   warmup_steps: 1875
-   weight_decay: 0.1
-   precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
-   gradient_checkpointing: False
-   num_epochs: 100 # num_epochs * dataset_blending_global_weight = 1
-   offline: false
-   freeze_lm_embeddings: true
-   logging_steps: 10
-   dist_backend: nccl
-   dist_url: env:// # tcp://localhost:7000
-   no_set_device_rank: false
-   fsdp: true
-   fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
-   fsdp_sharding_strategy: full # full, hybrid
-   horovod: false
-
- data_config:
-   dataset_blending_global_weight: 0.01
-
-   dataset_blending_config:
-
-     # Audio QA
-     OpenAQA-AQA/train:
-       weight: 1.0
-       prefix_prob: 0.0
-       augmentations:
-         do_nothing: 1.0
-
-     # Audio Captioning
-
-     BBCSoundEffects-AudioDescription/train:
-       weight: 5.0
-       prefix_prob: 0.5
-       augmentations:
-         do_nothing: 1.0
-
-     CLAP_freesound-AudioCaptioning/train:
-       weight: 1.0
-       prefix_prob: 0.5
-       augmentations:
-         do_nothing: 1.0
-
-     SoundDescs-AudioDescription/train:
-       weight: 1.0
-       prefix_prob: 0.5
-       augmentations:
-         do_nothing: 1.0
-
-     WavCaps-AudioSet_SL-AudioCaptioning/train:
-       weight: 1.0
-       prefix_prob: 0.5
-       augmentations:
-         do_nothing: 1.0
-
-     WavCaps-BBC_Sound_Effects-AudioCaptioning/train:
-       weight: 2
-       prefix_prob: 0.5
-       augmentations:
-         do_nothing: 1.0
-
-     WavCaps-FreeSound-AudioCaptioning/train:
-       weight: 2
-       prefix_prob: 0.5
-       augmentations:
-         do_nothing: 1.0
-
-     WavCaps-SoundBible-AudioCaptioning/train:
-       weight: 5
-       prefix_prob: 0.5
-       augmentations:
-         do_nothing: 1.0
-
-     # Audio Classification
-
-     AudioSetFullwoAudioMusicCaps-EventClassification/train:
-       weight: 1.0
-       prefix_prob: 0.5
-       augmentations:
-         num_words: 0.8
-         do_nothing: 0.2
-
-     AudioSet-EventClassification/train:
-       weight: 5.0
-       prefix_prob: 0.5
-       augmentations:
-         num_words: 0.8
-         do_nothing: 0.2
-
-     Clotho-AQA-EventClassification/train:
-       weight: 5.0
-       prefix_prob: 0.5
-       augmentations:
-         num_words: 0.8
-         do_nothing: 0.2
-
-     WavText5K-Tagging/train:
-       weight: 3.0
-       prefix_prob: 0.5
-       augmentations:
-         num_words: 0.8
-         do_nothing: 0.2
-
-     # Speech Emotion Classification
-
-     MSP-PODCAST-Publish-1.9-EmotionClassification/train:
-       weight: 1.8
-       prefix_prob: 0.5
-       augmentations:
-         provide_all_labels: 0.9
-         do_nothing: 0.1
-     MSP-PODCAST-Publish-1.9-EmotionClassification/interleaved_knn-train:
-       weight: 1.2
-       prefix_prob: 0.5
-       augmentations:
-         provide_all_labels: 0.9
-         do_nothing: 0.1
-
-     MELD-EmotionClassification/train:
-       weight: 1.8
-       prefix_prob: 0.5
-       augmentations:
-         provide_all_labels: 0.9
-         do_nothing: 0.1
-     MELD-EmotionClassification/interleaved_knn-train:
-       weight: 1.2
-       prefix_prob: 0.5
-       augmentations:
-         provide_all_labels: 0.9
-         do_nothing: 0.1
-
-     MELD-SentimentClassification/train:
-       weight: 1.8
-       prefix_prob: 0.5
-       augmentations:
-         provide_all_labels: 0.9
-         do_nothing: 0.1
-     MELD-SentimentClassification/interleaved_knn-train:
-       weight: 1.2
-       prefix_prob: 0.5
-       augmentations:
-         provide_all_labels: 0.9
-         do_nothing: 0.1
-
-     # Music QA
-
-     Music-AVQA-AVQA_All/train:
-       weight: 3.0
-       prefix_prob: 0.5
-       augmentations:
-         AQA_binary_instruction: 1.0
-
-     MU-LLAMA-AQA/train:
-       weight: 1.8
-       prefix_prob: 0.5
-       augmentations:
-         do_nothing: 1.0
-     MU-LLAMA-AQA/interleaved_knn-train:
-       weight: 1.2
-       prefix_prob: 0.5
-       augmentations:
-         do_nothing: 1.0
-
-     # Music Captioning
-
-     LP-MusicCaps-MSD-AudioCaptioning/train:
-       weight: 1.0
-       prefix_prob: 0.5
-       augmentations:
-         do_nothing: 1.0
-
-     # Music Understanding
-
-     NSynth-MIR/train:
-       weight: 0.6
-       prefix_prob: 0.5
-       augmentations:
-         do_nothing: 1.0
-     NSynth-MIR/interleaved_knn-train:
-       weight: 0.4
-       prefix_prob: 0.5
-       augmentations:
-         do_nothing: 1.0
-
-     mtg-jamendo-MusicTagging/train:
-       weight: 1.0
-       prefix_prob: 0.5
-       augmentations:
-         do_nothing: 1.0
-
-   dataset_file_root: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/dataset_files
-   data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
-   dataset_blending_output: dataset_blending.json
-   max_tokens: 512
-   num_workers: 4
-
-   valid_dataset_config:
-     CLAP_freesound-AudioCaptioning/test: true
-     SoundDescs-AudioDescription/test: true
-     Clotho-AQA-EventClassification/test: true
-
-     MSP-PODCAST-Publish-1.9-EmotionClassification/test: true
-     MSP-PODCAST-Publish-1.9-EmotionClassification/interleaved_knn-test: true
-     MELD-EmotionClassification/test: true
-     MELD-EmotionClassification/interleaved_knn-test: true
-     MELD-SentimentClassification/test: true
-     MELD-SentimentClassification/interleaved_knn-test: true
-
-     MU-LLAMA-AQA/test: true
-     LP-MusicCaps-MSD-AudioCaptioning/val: true
-     NSynth-MIR/test: true
-     NSynth-MIR/interleaved_knn-test: true
-     mtg-jamendo-MusicTagging/val: true
-
- clap_config:
-   # method: laion-clap
-   # audio_embed_dim: 512
-   # model_name: 630k-fusion-best
-   # checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/laion-clap-pretrained/laion_clap/630k-fusion-best.pt
-
-   method: microsoft-clap
-   audio_embed_dim: 1024
-   config_root: /home/zkong/audio_flamingo/audio_flamingo_v1/v1.0_optimlmax1.3b_foundation/my_ms_clap/src/configs
-   # model_name: '2023'
-   # checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/clap/CLAP_weights_2023.pth
-   model_name: 'clapcap'
-   checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/clap/clapcap_weights_2023.pth
-
-   window_length: 7.0 # seconds
-   window_overlap: 5.25 # seconds
-   max_num_window: 16 # 35 seconds
-   max_num_fewshot: 4 # number of fewshot samples (including the final one)
-
- model_config:
-   cache_dir: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/LLM_pretrained/.cache
-
-   lang_encoder_path: facebook/opt-iml-max-1.3b
-   tokenizer_path: facebook/opt-iml-max-1.3b
-   cross_attn_every_n_layers: 1
-   audio_transformer_kwargs: {
-     n_head: 8,
-     n_layers: 3,
-     d_inner: 2048,
-     max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
-     max_window_per_audio: 16, # must = max_num_window
-   }
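This config uses overlapping CLAP windows: 7.0 s windows with 5.25 s overlap give a 1.75 s hop, so 16 windows span 7.0 + 15 * 1.75 = 33.25 s, matching the "# 35 seconds" comment up to rounding. A sketch of slicing a waveform accordingly (NumPy assumed; this is illustrative, not the repo's loader):

import numpy as np

def make_windows(wav, sr, window_length, window_overlap, max_num_window):
    win = int(window_length * sr)                     # samples per window
    hop = int((window_length - window_overlap) * sr)  # samples between window starts
    starts = range(0, max(len(wav) - win + 1, 1), hop)
    return [wav[s:s + win] for s in list(starts)[:max_num_window]]

wav = np.zeros(35 * 16000)  # 35 s of silence at 16 kHz
print(len(make_windows(wav, 16000, 7.0, 5.25, 16)))  # -> 16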
configs/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node.yaml DELETED
@@ -1,183 +0,0 @@
- train_config:
-   expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed
-   run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed
-   delete_previous_checkpoint: true
-   batch_size: 4
-   gradient_accumulation_steps: 2 # 4 nodes
-   seed: 42
-   learning_rate: 0.0001
-   lr_scheduler: constant
-   loss_multiplier: 1.0
-   warmup_steps: 1875
-   weight_decay: 0.1
-   precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
-   gradient_checkpointing: False
-   num_epochs: 200 # num_epochs * dataset_blending_global_weight = 1
-   offline: false
-   freeze_lm_embeddings: false
-   logging_steps: 10
-   dist_backend: nccl
-   dist_url: env:// # tcp://localhost:7000
-   no_set_device_rank: false
-   fsdp: true
-   fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
-   fsdp_sharding_strategy: full # full, hybrid
-   horovod: false
-
- data_config:
-   dataset_blending_global_weight: 0.005
-
-   dataset_blending_config:
-
-     # Audio QA
-     OpenAQA-AQA/train:
-       weight: 1.0
-
-     AudioSet-Temporal-Speech-Audio-QA/train:
-       weight: 2.0
-
-     CompA-R-AQA/train:
-       weight: 2.0
-
-     # Audio Captioning
-
-     BBCSoundEffects-AudioDescription/train:
-       weight: 5.0
-
-     CLAP_freesound-AudioCaptioning/train:
-       weight: 1.0
-
-     SoundDescs-AudioDescription/train:
-       weight: 1.0
-
-     WavCaps-AudioSet_SL-AudioCaptioning/train:
-       weight: 1.0
-
-     WavCaps-BBC_Sound_Effects-AudioCaptioning/train:
-       weight: 2.0
-
-     WavCaps-FreeSound-AudioCaptioning/train:
-       weight: 2.0
-
-     WavCaps-SoundBible-AudioCaptioning/train:
-       weight: 5.0
-
-     Ego-10-AudioCaptioning/train:
-       weight: 2.0
-
-     Ego-30-AudioCaptioning/train:
-       weight: 2.0
-
-     # Audio Classification
-
-     AudioSetFullwoAudioMusicCaps-EventClassification/train:
-       weight: 1.0
-
-     AudioSet-EventClassification/train:
-       weight: 5.0
-
-     Clotho-AQA-EventClassification/train:
-       weight: 5.0
-
-     WavText5K-Tagging/train:
-       weight: 3.0
-
-     # Speech Emotion Classification
-
-     MSP-PODCAST-Publish-1.9-EmotionClassification/train:
-       weight: 3.0
-
-     MELD-EmotionClassification/train:
-       weight: 3.0
-
-     MELD-SentimentClassification/train:
-       weight: 3.0
-
-     # Music QA
-
-     Music-AVQA-AVQA_All/train:
-       weight: 3.0
-
-     MU-LLAMA-AQA/train:
-       weight: 3.0
-
-     # Music Captioning
-
-     LP-MusicCaps-MSD-AudioCaptioning/train:
-       weight: 1.0
-
-     # Music Understanding
-
-     NSynth-MIR/train:
-       weight: 1.0
-
-     mtg-jamendo-MusicTagging/train:
-       weight: 1.0
-
-   dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
-   data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
-   dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/dataset_blending.json
-   max_tokens: 512
-   num_workers: 4
-
-   valid_dataset_config:
-     CLAP_freesound-AudioCaptioning/test: true
-     SoundDescs-AudioDescription/test: true
-     Clotho-AQA-EventClassification/test: true
-
-     MSP-PODCAST-Publish-1.9-EmotionClassification/test: true
-     MELD-EmotionClassification/test: true
-     MELD-SentimentClassification/test: true
-
-     MU-LLAMA-AQA/test: true
-     LP-MusicCaps-MSD-AudioCaptioning/val: true
-     NSynth-MIR/test: true
-     mtg-jamendo-MusicTagging/val: true
-
- clap_config:
-   method: nvclap-large
-   audio_embed_dim: 2048
-   checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt
-
-   window_length: 10.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 3 # 30 seconds
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-
- whisper_config:
-   method: whisper-large-v3
-   path: openai/whisper-large-v3
-   audio_embed_dim: 1280
-   sampling_rate: 16000
-
-   window_length: 30.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 1 # 30 seconds
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-   finetune: true
-
- mert_config:
-   method: mert-v1
-   path: m-a-p/MERT-v1-330M
-   audio_embed_dim: 1024
-   sampling_rate: 24000
-
-   window_length: 10.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 1 # 10 seconds
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-
- model_config:
-   cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
-
-   lang_encoder_path: Qwen/Qwen2.5-3B
-   tokenizer_path: Qwen/Qwen2.5-3B
-   cross_attn_every_n_layers: 1
-   audio_transformer_kwargs: {
-     n_head: 8,
-     n_layers: 3,
-     d_inner: 2048,
-     max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
-     max_window_per_audio: 1, # must = max_num_window
-     common_encoder_embed_dim: 1024
-   }
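The comments in audio_transformer_kwargs encode a capacity constraint: max_num_media must be at least max_num_window * max_num_fewshot. A sketch of the corresponding sanity check (the function is an assumption for illustration, not repo code):

def check_media_capacity(clap_cfg, transformer_kwargs):
    needed = clap_cfg["max_num_window"] * clap_cfg["max_num_fewshot"]
    if transformer_kwargs["max_num_media"] < needed:
        raise ValueError(f"max_num_media must be >= {needed}")

check_media_capacity({"max_num_window": 3, "max_num_fewshot": 1},
                     {"max_num_media": 128})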
configs/run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node.yaml DELETED
@@ -1,483 +0,0 @@
- train_config:
-   expdir: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/v1.0_optimlmax1.3b_foundation
-   run_name: run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node
-   delete_previous_checkpoint: true
-   batch_size: 4
-   gradient_accumulation_steps: 1
-   seed: 42
-   learning_rate: 0.00002
-   lr_scheduler: constant
-   loss_multiplier: 1.0
-   warmup_steps: 1875
-   weight_decay: 0.1
-   precision: fp32 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
-   gradient_checkpointing: False
-   num_epochs: 160 # num_epochs * dataset_blending_global_weight = 1
-   offline: false
-   freeze_lm_embeddings: false
-   logging_steps: 10
-   dist_backend: nccl
-   dist_url: env:// # tcp://localhost:7000
-   no_set_device_rank: false
-   fsdp: true
-   fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
-   fsdp_sharding_strategy: full # full, hybrid
-   horovod: false
-
- # instruction tuning hparams
- sft_config:
-   pretrained_path: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/v1.0_optimlmax1.3b_foundation/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node/
-   pretrained_ckpt: checkpoint_99.pt
-   unfreeze_full_lm: true
-
- data_config:
-   dataset_blending_global_weight: 0.01
-
-   dataset_blending_config:
-
-     # Audio QA
-     Clotho-AQA-AQA/train:
-       weight: 0.8
-       prefix_prob: 1.0
-       augmentations:
-         AQA_binary_instruction: 1.0
-     Clotho-AQA-AQA/interleaved_knn-train:
-       weight: 0.2
-       prefix_prob: 1.0
-       augmentations:
-         AQA_binary_instruction: 1.0
-
-     OpenAQA-AQA/train:
-       weight: 1.0
-       prefix_prob: 1.0
-       augmentations:
-         do_nothing: 1.0
-
-     # Audio Captioning
-
-     Clotho-v2-AudioCaptioning/train:
-       weight: 0.8
-       prefix_prob: 1.0
-       augmentations:
-         AC_short: 1.0
-     Clotho-v2-AudioCaptioning/interleaved_knn-train:
-       weight: 0.2
-       prefix_prob: 1.0
-       augmentations:
-         AC_short: 1.0
-
-     audiocaps-AudioCaptioning/train:
-       weight: 0.8
-       prefix_prob: 1.0
-       augmentations:
-         AC_short: 1.0
-     audiocaps-AudioCaptioning/interleaved_knn-train:
-       weight: 0.2
-       prefix_prob: 1.0
-       augmentations:
-         AC_short: 1.0
-
-     Epidemic_sound-AudioCaptioning/train:
-       weight: 0.8
-       prefix_prob: 1.0
-       augmentations:
-         AC_short: 1.0
-     Epidemic_sound-AudioCaptioning/interleaved_knn-train:
-       weight: 0.2
-       prefix_prob: 1.0
-       augmentations:
-         AC_short: 1.0
-
-     MACS-AudioCaptioning/train:
-       weight: 0.8
-       prefix_prob: 1.0
-       augmentations:
-         AC_short: 1.0
-     MACS-AudioCaptioning/interleaved_knn-train:
-       weight: 0.2
-       prefix_prob: 1.0
-       augmentations:
-         AC_short: 1.0
-
-     # Audio Classification
-
-     FSD50k-EventClassification/train:
-       weight: 0.8
-       prefix_prob: 1.0
-       augmentations:
-         default: 1.0
-     FSD50k-EventClassification/interleaved_knn-train:
-       weight: 0.2
-       prefix_prob: 1.0
-       augmentations:
-         default: 1.0
-
-     CochlScene-SceneClassification/train:
-       weight: 0.8
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-     CochlScene-SceneClassification/interleaved_knn-train:
-       weight: 0.2
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-
-     NonSpeech7k-EventClassification/train:
-       weight: 0.8
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-     NonSpeech7k-EventClassification/interleaved_knn-train:
-       weight: 0.2
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-
-     chime-home-EventClassification/train:
-       weight: 0.8
-       prefix_prob: 1.0
-       augmentations:
-         default: 0.5
-         num_words: 0.5
-     chime-home-EventClassification/interleaved_knn-train:
-       weight: 0.2
-       prefix_prob: 1.0
-       augmentations:
-         default: 0.5
-         num_words: 0.5
-
-     SONYC-UST-EventClassification/train:
-       weight: 0.8
-       prefix_prob: 1.0
-       augmentations:
-         default: 0.5
-         num_words: 0.5
-     SONYC-UST-EventClassification/interleaved_knn-train:
-       weight: 0.2
-       prefix_prob: 1.0
-       augmentations:
-         default: 0.5
-         num_words: 0.5
-
-     # Speech Emotion Classification
-
-     MELD-EmotionClassification/train:
-       weight: 0.5
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-
-     MELD-SentimentClassification/train:
-       weight: 0.5
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.1
-         default: 0.9
-
-     emov-db-EmotionClassification/train:
-       weight: 1.6
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-     emov-db-EmotionClassification/interleaved_knn-train:
-       weight: 0.4
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-
-     jl-corpus-EmotionClassification/train:
-       weight: 6.0
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-     jl-corpus-EmotionClassification/interleaved_knn-train:
-       weight: 1.5
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-
-     tess-EmotionClassification/train:
-       weight: 2.0
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-     tess-EmotionClassification/interleaved_knn-train:
-       weight: 0.5
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-
-     IEMOCAP-EmotionClassification/train:
-       weight: 2.4
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-     IEMOCAP-EmotionClassification/interleaved_knn-train:
-       weight: 0.6
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-
-     OMGEmotion-EmotionClassification/train:
-       weight: 3.0
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-
-     VocalSound-VocalClassification/train:
-       weight: 1.0
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-
-     # Music QA
-
-     Music-AVQA-AQA_All/train:
-       weight: 2.0
-       prefix_prob: 1.0
-       augmentations:
-         AQA_binary_instruction: 1.0
-     Music-AVQA-AQA_All/interleaved_knn-train:
-       weight: 1.0
-       prefix_prob: 1.0
-       augmentations:
-         AQA_binary_instruction: 1.0
-
-     MU-LLAMA-AQA/train:
-       weight: 0.9
-       prefix_prob: 1.0
-       augmentations:
-         do_nothing: 1.0
-     MU-LLAMA-AQA/interleaved_knn-train:
-       weight: 0.1
-       prefix_prob: 1.0
-       augmentations:
-         do_nothing: 1.0
-
-     # Music Captioning
-
-     LP-MusicCaps-MSD-AudioCaptioning/train:
-       weight: 0.05 # 1.3M
-       prefix_prob: 1.0
-       augmentations:
-         AC_paragraph: 1.0
-     LP-MusicCaps-MSD-AudioCaptioning/interleaved_knn-train:
-       weight: 0.05 # 111k
-       prefix_prob: 1.0
-       augmentations:
-         AC_paragraph: 1.0
-
-     LP-MusicCaps-MC-AudioCaptioning/train:
-       weight: 1.6
-       prefix_prob: 1.0
-       augmentations:
-         AC_paragraph: 1.0
-     LP-MusicCaps-MC-AudioCaptioning/interleaved_knn-train:
-       weight: 0.4
-       prefix_prob: 1.0
-       augmentations:
-         AC_paragraph: 1.0
-
-     LP-MusicCaps-MTT-AudioCaptioning/train:
-       weight: 0.8
-       prefix_prob: 1.0
-       augmentations:
-         AC_long: 1.0
-     LP-MusicCaps-MTT-AudioCaptioning/interleaved_knn-train:
-       weight: 0.2
-       prefix_prob: 1.0
-       augmentations:
-         AC_long: 1.0
-
-     MusicCaps-AudioCaptioning/train:
-       weight: 6.0
-       prefix_prob: 1.0
-       augmentations:
-         AC_paragraph: 1.0
-     MusicCaps-AudioCaptioning/interleaved_knn-train:
-       weight: 1.5
-       prefix_prob: 1.0
-       augmentations:
-         AC_paragraph: 1.0
-
-     SongDescriber-AudioCaptioning/train:
-       weight: 0.8
-       prefix_prob: 1.0
-       augmentations:
-         AC_long: 1.0
-     SongDescriber-AudioCaptioning/interleaved_knn-train:
-       weight: 0.2
-       prefix_prob: 1.0
-       augmentations:
-         AC_long: 1.0
-
-     # Music Understanding
-
-     NSynth-MIR/train:
-       weight: 0.2 # 289k for weight = 1
-       prefix_prob: 1.0
-       augmentations:
-         do_nothing: 1.0
-     NSynth-MIR/interleaved_knn-train:
-       weight: 0.2 # 60k for weight = 1
-       prefix_prob: 1.0
-       augmentations:
-         do_nothing: 1.0
-
-     mtg-jamendo-MusicTagging/train:
-       weight: 0.1
-       prefix_prob: 1.0
-       augmentations:
-         default: 1.0
-
-     FMA-GenreClassification/train:
-       weight: 0.4 # 104k for weight = 1
-       prefix_prob: 1.0
-       augmentations:
-         do_nothing: 1.0
-     FMA-GenreClassification/interleaved_knn-train:
-       weight: 0.3 # 46k for weight = 1
-       prefix_prob: 1.0
-       augmentations:
-         do_nothing: 1.0
-
-     musdbhq-InstrClassification/train:
-       weight: 0.8
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-
-   dataset_file_root: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/dataset_files
-   data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
-   dataset_blending_output: dataset_blending.json
-   max_tokens: 512
-   num_workers: 4
-
-   valid_dataset_config:
-
-     Clotho-AQA-AQA/test: true
-     Clotho-AQA-AQA/interleaved_knn-test: true
-
-     Clotho-v2-AudioCaptioning/test: true
-     Clotho-v2-AudioCaptioning/interleaved_knn-test: true
-
-     FSD50k-EventClassification/test: true
-     FSD50k-EventClassification/interleaved_knn-test: true
-
-     CochlScene-SceneClassification/test: true
-     CochlScene-SceneClassification/interleaved_knn-test: true
-
-     NonSpeech7k-EventClassification/test: true
-     NonSpeech7k-EventClassification/interleaved_knn-test: true
-
-     SONYC-UST-EventClassification/test: true
-     SONYC-UST-EventClassification/interleaved_knn-test: true
-
-     emov-db-EmotionClassification/val: true
-     emov-db-EmotionClassification/interleaved_knn-val: true
-
-     jl-corpus-EmotionClassification/val: true
-     jl-corpus-EmotionClassification/interleaved_knn-val: true
-
-     tess-EmotionClassification/val: true
-     tess-EmotionClassification/interleaved_knn-val: true
-
-     IEMOCAP-EmotionClassification/test: true
-     IEMOCAP-EmotionClassification/interleaved_knn-test: true
-
-     OMGEmotion-EmotionClassification/val: true
-
-     Music-AVQA-AQA_All/test: true
-     Music-AVQA-AQA_All/interleaved_knn-test: true
-
-     MU-LLAMA-AQA/test: true
-
-     LP-MusicCaps-MSD-AudioCaptioning/test: true
-     LP-MusicCaps-MC-AudioCaptioning/test: true
-     LP-MusicCaps-MTT-AudioCaptioning/test: true
-     LP-MusicCaps-MTT-AudioCaptioning/interleaved_knn-test: true
-
-     NSynth-MIR/test: true
-     NSynth-MIR/interleaved_knn-test: true
-
-     mtg-jamendo-MusicTagging/val: true
-
-     audiocaps-AudioCaptioning/test: true
-     audiocaps-AudioCaptioning/interleaved_knn-test: true
-
-     MusicCaps-AudioCaptioning/test: true
-
-     MELD-EmotionClassification/test: true
-     MELD-SentimentClassification/test: true
-     VocalSound-VocalClassification/test: true
-     musdbhq-InstrClassification/test: true
-
-     # zero shot
-
-     GTZAN-GenreClassification/train:
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 1.0
-     GTZAN-GenreClassification/interleaved_knn-train:
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 1.0
-
-     Medley-solos-DB-InstrClassification/test:
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 1.0
-     Medley-solos-DB-InstrClassification/interleaved_knn-test:
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 1.0
-
- clap_config:
-   # method: laion-clap
-   # audio_embed_dim: 512
-   # model_name: 630k-fusion-best
-   # checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/laion-clap-pretrained/laion_clap/630k-fusion-best.pt
-
-   method: microsoft-clap
-   audio_embed_dim: 1024
-   config_root: /home/zkong/audio_flamingo/audio_flamingo_v1/v1.0_optimlmax1.3b_foundation/my_ms_clap/src/configs
-   # model_name: '2023'
-   # checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/clap/CLAP_weights_2023.pth
-   model_name: 'clapcap'
-   checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/clap/clapcap_weights_2023.pth
-
-   window_length: 7.0 # seconds
-   window_overlap: 5.25 # seconds
-   max_num_window: 16 # 35 seconds
-   max_num_fewshot: 4 # number of fewshot samples (including the final one)
-
- model_config:
-   cache_dir: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/LLM_pretrained/.cache
-
-   lang_encoder_path: facebook/opt-iml-max-1.3b
-   tokenizer_path: facebook/opt-iml-max-1.3b
-   cross_attn_every_n_layers: 1
-   audio_transformer_kwargs: {
-     n_head: 8,
-     n_layers: 3,
-     d_inner: 2048,
-     max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
-     max_window_per_audio: 16, # must = max_num_window
-   }
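Each dataset entry in this SFT config carries an "augmentations" mapping whose values read as selection probabilities, alongside a prefix_prob. A sketch of sampling one augmentation per example under that reading (illustrative, not the repository's sampler):

import random

def pick_augmentation(augmentations):
    names = list(augmentations)
    return random.choices(names, weights=list(augmentations.values()), k=1)[0]

print(pick_augmentation({"provide_all_labels": 0.5, "default": 0.5}))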
configs/run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node.yaml DELETED
@@ -1,284 +0,0 @@
- train_config:
-   expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed
-   run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed-sft
-   delete_previous_checkpoint: true
-   batch_size: 4
-   gradient_accumulation_steps: 2
-   seed: 42
-   learning_rate: 0.00002
-   lr_scheduler: constant
-   loss_multiplier: 1.0
-   warmup_steps: 1875
-   weight_decay: 0.1
-   precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
-   gradient_checkpointing: False
-   num_epochs: 200 # num_epochs * dataset_blending_global_weight = 1
-   offline: false
-   freeze_lm_embeddings: false
-   logging_steps: 10
-   dist_backend: nccl
-   dist_url: env:// # tcp://localhost:7000
-   no_set_device_rank: false
-   fsdp: true
-   fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
-   fsdp_sharding_strategy: full # full, hybrid
-   horovod: false
-
- # instruction tuning hparams
- sft_config:
-   pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed_ckpt_stage1/
-   pretrained_ckpt: checkpoint_199.pt
-   unfreeze_full_lm: false
-
- data_config:
-   dataset_blending_global_weight: 0.005
-
-   dataset_blending_config:
-
-     MMAUQA/train:
-       weight: 1.5
-
-     AudioSet-Temporal-Speech-Audio-QA/train:
-       weight: 1.0
-
-     CompA-R-AQA/train:
-       weight: 1.0
-
-     # Audio QA
-     Clotho-AQA-AQA/train:
-       weight: 1.0
-
-     OpenAQA-AQA/train:
-       weight: 1.0
-
-     SalmonnQA/train:
-       weight: 1.0
-
-     AudioEntailmentQA/train:
-       weight: 1.0
-
-     # Audio Captioning
-
-     Clotho-v2-AudioCaptioning/train:
-       weight: 1.0
-
-     audiocaps-AudioCaptioning/train:
-       weight: 1.0
-
-     Epidemic_sound-AudioCaptioning/train:
-       weight: 1.0
-
-     MACS-AudioCaptioning/train:
-       weight: 1.0
-
-     # Audio Classification
-
-     FSD50k-EventClassification/train:
-       weight: 1.0
-
-     CochlScene-SceneClassification/train:
-       weight: 1.0
-
-     NonSpeech7k-EventClassification/train:
-       weight: 1.0
-
-     chime-home-EventClassification/train:
-       weight: 1.0
-
-     SONYC-UST-EventClassification/train:
-       weight: 1.0
-
-     # Speech Emotion Classification
-
-     MELD-EmotionClassification/train:
-       weight: 0.5
-
-     MELD-SentimentClassification/train:
-       weight: 0.5
-
-     emov-db-EmotionClassification/train:
-       weight: 1.0
-
-     jl-corpus-EmotionClassification/train:
-       weight: 6.0
-
-     tess-EmotionClassification/train:
-       weight: 2.5
-
-     IEMOCAP-EmotionClassification/train:
-       weight: 3.0
-
-     OMGEmotion-EmotionClassification/train:
-       weight: 3.0
-
-     VocalSound-VocalClassification/train:
-       weight: 1.5
-
-     # Music QA
-
-     Music-AVQA-AQA_All/train:
-       weight: 3.0
-
-     MU-LLAMA-AQA/train:
-       weight: 1.0
-
-     # Music Captioning
-
-     LP-MusicCaps-MSD-AudioCaptioning/train:
-       weight: 0.06
-
-     LP-MusicCaps-MC-AudioCaptioning/train:
-       weight: 2.0
-
-     LP-MusicCaps-MTT-AudioCaptioning/train:
-       weight: 1.0
-
-     MusicCaps-AudioCaptioning/train:
-       weight: 6.0
-
-     musdbhq-captioning/train:
-       weight: 2.0
-
-     # Music Understanding
-
-     NSynth-MIR/train:
-       weight: 0.2
-
-     mtg-jamendo-MusicTagging/train:
-       weight: 0.1
-
-     FMA-GenreClassification/train:
-       weight: 0.5
-
-     musdbhq-InstrClassification/train:
-       weight: 0.8
-
-     LLARK_FMA-mir/train:
-       weight: 1.0
-
-     LLARK_FMA-reasoning/train:
-       weight: 1.0
-
-     LLARK_MagnaTagATune-mir/train:
-       weight: 1.0
-
-     LLARK_MTG-Jamendo-reasoning/train:
-       weight: 1.0
-
-     LLARK_MagnaTagATune-reasoning/train:
-       weight: 1.0
-
-     LLARK_MTG-Jamendo-mir/train:
-       weight: 1.0
-
-     MusicBenchQA/train:
-       weight: 1.0
-
-   dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data_w_duration
-   data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
-   dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/dataset_blending.json
-   max_tokens: 512
-   num_workers: 4
-
-   valid_dataset_config:
-
-     Clotho-AQA-AQA/test: true
-
-     Clotho-v2-AudioCaptioning/test: true
-     audiocaps-AudioCaptioning/test: true
-
-     FSD50k-EventClassification/test: true
-     CochlScene-SceneClassification/test: true
-     NonSpeech7k-EventClassification/test: true
-     SONYC-UST-EventClassification/test: true
-
-     MELD-EmotionClassification/test: true
-     MELD-SentimentClassification/test: true
-     emov-db-EmotionClassification/val: true
-     jl-corpus-EmotionClassification/val: true
-     tess-EmotionClassification/val: true
-     IEMOCAP-EmotionClassification/val: true
-     OMGEmotion-EmotionClassification/val: true
-     VocalSound-VocalClassification/test: true
-
-     Music-AVQA-AQA_All/test: true
-     MU-LLAMA-AQA/test: true
-
-     LP-MusicCaps-MSD-AudioCaptioning/test: true
-     LP-MusicCaps-MC-AudioCaptioning/test: true
-     LP-MusicCaps-MTT-AudioCaptioning/test: true
-     MusicCaps-AudioCaptioning/test: true
-
-     NSynth-MIR/test: true
-     mtg-jamendo-MusicTagging/val: true
-     musdbhq-InstrClassification/test: true
-
-     # # zero shot
-     # CREMA-D-EmotionClassification/train:
-     #   prefix_prob: 1.0
-
-     # ravdess-EmotionClassification/train:
-     #   prefix_prob: 1.0
-
-     # UrbanSound8K-EventClassification/train:
-     #   prefix_prob: 1.0
-
-     # ESC50-EventClassification/train:
-     #   prefix_prob: 1.0
-
-     # DCASE17Task4-SceneClassification/test:
-     #   prefix_prob: 1.0
-
-     # GTZAN-GenreClassification/train:
-     #   prefix_prob: 1.0
-
-     # Medley-solos-DB-InstrClassification/test:
-     #   prefix_prob: 1.0
-
- clap_config:
-   method: nvclap-large
-   audio_embed_dim: 2048
-   checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt
-
-   window_length: 10.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 9 # 1.5 minutes
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-   finetune: true
-
- whisper_config:
-   method: whisper-large-v3
-   path: openai/whisper-large-v3
-   audio_embed_dim: 1280
-   sampling_rate: 16000
-
-   window_length: 30.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 1 # 30 seconds
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-
- mert_config:
-   method: mert-v1
-   path: m-a-p/MERT-v1-330M
-   audio_embed_dim: 1024
-   sampling_rate: 24000
-
-   window_length: 10.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 1 # 10 seconds
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-
- model_config:
-   cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
-
-   lang_encoder_path: Qwen/Qwen2.5-3B
-   tokenizer_path: Qwen/Qwen2.5-3B
-   cross_attn_every_n_layers: 1
-   audio_transformer_kwargs: {
-     n_head: 8,
-     n_layers: 3,
-     d_inner: 2048,
-     max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
-     max_window_per_audio: 1, # must = max_num_window
-     common_encoder_embed_dim: 1024
-   }
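sft_config points instruction tuning at a stage-1 checkpoint (pretrained_path plus pretrained_ckpt). A sketch of the warm start this implies; the torch.load usage and the "model_state_dict" key are assumptions about the checkpoint layout, not the repo's code:

import os
import torch

def load_pretrained(model, sft_cfg):
    ckpt = torch.load(os.path.join(sft_cfg["pretrained_path"], sft_cfg["pretrained_ckpt"]),
                      map_location="cpu")
    state = ckpt.get("model_state_dict", ckpt)  # assumed layout
    model.load_state_dict(state, strict=False)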