root committed · Commit 5d59dad
1 Parent(s): f11ac57
initial commit
Browse files
- configs/inference_1.5.yaml +0 -302
- configs/inference_2.yaml +0 -302
- configs/inference_long.yaml +0 -284
- configs/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node.yaml +0 -255
- configs/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node.yaml +0 -183
- configs/run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node.yaml +0 -483
- configs/run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node.yaml +0 -284
configs/inference_1.5.yaml
DELETED
@@ -1,302 +0,0 @@
train_config:
  expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-1.5b
  run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-1.5B-sft
  delete_previous_checkpoint: true
  batch_size: 32
  gradient_accumulation_steps: 2
  seed: 42
  learning_rate: 0.00002
  lr_scheduler: constant
  loss_multiplier: 1.0
  warmup_steps: 1875
  weight_decay: 0.1
  precision: amp_bf16  # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
  gradient_checkpointing: False
  num_epochs: 200  # num_epochs * dataset_blending_global_weight = 1
  offline: false
  freeze_lm_embeddings: false
  logging_steps: 10
  dist_backend: nccl
  dist_url: env://  # tcp://localhost:7000
  no_set_device_rank: false
  fsdp: true
  fsdp_use_orig_params: false  # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
  fsdp_sharding_strategy: full  # full, hybrid
  horovod: false

# instruction tuning hparams
# sft_config:
#   pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-7b-fixed/
#   pretrained_ckpt: checkpoint_199.pt
#   unfreeze_full_lm: false

data_config:
  dataset_blending_global_weight: 0.005

  dataset_blending_config:

    MMAUQA/train:
      weight: 1.5

    AudioSet-Temporal-Speech-Audio-QA/train:
      weight: 1.0

    CompA-R-AQA/train:
      weight: 1.0

    # Audio QA
    Clotho-AQA-AQA/train:
      weight: 1.0

    OpenAQA-AQA/train:
      weight: 1.0

    SalmonnQA/train:
      weight: 0.8

    AudioEntailmentQA/train:
      weight: 1.0

    # Audio Captioning

    Clotho-v2-AudioCaptioning/train:
      weight: 1.0

    audiocaps-AudioCaptioning/train:
      weight: 1.0

    Epidemic_sound-AudioCaptioning/train:
      weight: 1.0

    MACS-AudioCaptioning/train:
      weight: 1.0

    # Audio Classification

    UrbanSound8K-EventClassification/train:
      weight: 0.5

    TUT-EventClassification/train:
      weight: 2.0

    FSD50k-EventClassification/train:
      weight: 1.0

    CochlScene-SceneClassification/train:
      weight: 1.0

    NonSpeech7k-EventClassification/train:
      weight: 1.0

    chime-home-EventClassification/train:
      weight: 1.0

    SONYC-UST-EventClassification/train:
      weight: 1.0

    # Speech Emotion Classification

    MELD-EmotionClassification/train:
      weight: 0.5

    MELD-SentimentClassification/train:
      weight: 0.5

    emov-db-EmotionClassification/train:
      weight: 1.0

    jl-corpus-EmotionClassification/train:
      weight: 6.0

    tess-EmotionClassification/train:
      weight: 2.5

    IEMOCAP-EmotionClassification/train:
      weight: 3.0

    OMGEmotion-EmotionClassification/train:
      weight: 3.0

    VocalSound-VocalClassification/train:
      weight: 1.5

    # Music QA

    Music-AVQA-AQA_All/train:
      weight: 3.0

    MU-LLAMA-AQA/train:
      weight: 1.0

    # Music Captioning

    LP-MusicCaps-MSD-AudioCaptioning/train:
      weight: 0.06

    LP-MusicCaps-MC-AudioCaptioning/train:
      weight: 2.0

    LP-MusicCaps-MTT-AudioCaptioning/train:
      weight: 1.0

    MusicCaps-AudioCaptioning/train:
      weight: 6.0

    musdbhq-captioning/train:
      weight: 2.0

    # Music Understanding

    Medley-solos-DB-InstrClassification/train:
      weight: 1.5

    GTZAN-GenreClassification/train:
      weight: 2.0

    NSynth-MIR/train:
      weight: 0.4

    NSynth-Instrument/train:
      weight: 1.5

    NSynth-Source/train:
      weight: 1.5

    mtg-jamendo-MusicTagging/train:
      weight: 1.0

    FMA-GenreClassification/train:
      weight: 1.0

    musdbhq-InstrClassification/train:
      weight: 1.0

    LLARK_FMA-mir/train:
      weight: 1.0

    LLARK_FMA-reasoning/train:
      weight: 1.0

    LLARK_MagnaTagATune-mir/train:
      weight: 1.0

    LLARK_MTG-Jamendo-reasoning/train:
      weight: 1.0

    LLARK_MagnaTagATune-reasoning/train:
      weight: 1.0

    LLARK_MTG-Jamendo-mir/train:
      weight: 1.0

    MusicBenchQA/train:
      weight: 1.0

  dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
  data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
  dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers/dataset_blending.json
  max_tokens: 512
  num_workers: 4

  valid_dataset_config:

    Clotho-AQA-AQA/test: true

    Clotho-v2-AudioCaptioning/test: true
    audiocaps-AudioCaptioning/test: true

    FSD50k-EventClassification/test: true
    CochlScene-SceneClassification/test: true
    NonSpeech7k-EventClassification/test: true
    SONYC-UST-EventClassification/test: true

    MELD-EmotionClassification/test: true
    MELD-SentimentClassification/test: true
    emov-db-EmotionClassification/val: true
    jl-corpus-EmotionClassification/val: true
    tess-EmotionClassification/val: true
    IEMOCAP-EmotionClassification/val: true
    OMGEmotion-EmotionClassification/val: true
    VocalSound-VocalClassification/test: true

    Music-AVQA-AQA_All/test: true
    MU-LLAMA-AQA/test: true

    LP-MusicCaps-MSD-AudioCaptioning/test: true
    LP-MusicCaps-MC-AudioCaptioning/test: true
    LP-MusicCaps-MTT-AudioCaptioning/test: true
    MusicCaps-AudioCaptioning/test: true

    NSynth-MIR/test: true
    mtg-jamendo-MusicTagging/val: true
    musdbhq-InstrClassification/test: true

    # zero shot
    # CREMA-D-EmotionClassification/train:
    #   prefix_prob: 1.0

    # ravdess-EmotionClassification/train:
    #   prefix_prob: 1.0

    # UrbanSound8K-EventClassification/train:
    #   prefix_prob: 1.0

    # ESC50-EventClassification/train:
    #   prefix_prob: 1.0

    # DCASE17Task4-SceneClassification/test:
    #   prefix_prob: 1.0

    # GTZAN-GenreClassification/train:
    #   prefix_prob: 1.0

    # Medley-solos-DB-InstrClassification/test:
    #   prefix_prob: 1.0

clap_config:
  method: nvclap-large
  audio_embed_dim: 2048
  checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt

  window_length: 10.0  # seconds
  window_overlap: 0.0  # seconds
  max_num_window: 9  # 1.5 minutes
  max_num_fewshot: 1  # number of fewshot samples (including the final one)
  finetune: true

whisper_config:
  method: whisper-large-v3
  path: openai/whisper-large-v3
  audio_embed_dim: 1280
  sampling_rate: 16000

  window_length: 30.0  # seconds
  window_overlap: 0.0  # seconds
  max_num_window: 1  # 5 minutes
  max_num_fewshot: 1  # number of fewshot samples (including the final one)

mert_config:
  method: mert-v1
  path: m-a-p/MERT-v1-330M
  audio_embed_dim: 1024
  sampling_rate: 24000

  window_length: 10.0  # seconds
  window_overlap: 0.0  # seconds
  max_num_window: 1  # 5 minutes
  max_num_fewshot: 1  # number of fewshot samples (including the final one)

model_config:
  cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache

  lang_encoder_path: Qwen/Qwen2.5-1.5B
  tokenizer_path: Qwen/Qwen2.5-1.5B
  cross_attn_every_n_layers: 1
  audio_transformer_kwargs: {
    n_head: 8,
    n_layers: 3,
    d_inner: 2048,
    max_num_media: 128,  # must be >= max_num_window * num_fewshot_samples (4)
    max_window_per_audio: 1,  # must = max_num_window
    common_encoder_embed_dim: 1024
  }
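The deleted inference configs all share this top-level layout (train_config, data_config, clap_config, whisper_config, mert_config, model_config). A minimal sketch, assuming PyYAML, of loading such a file and checking the relation stated in the config's own comment (num_epochs * dataset_blending_global_weight = 1); the file path is taken from the listing above.

import yaml

# Load the deleted inference config (path as listed in this commit).
with open("configs/inference_1.5.yaml") as f:
    cfg = yaml.safe_load(f)

train = cfg["train_config"]
data = cfg["data_config"]

# Inline comment in the config: num_epochs * dataset_blending_global_weight = 1
print(train["num_epochs"] * data["dataset_blending_global_weight"])  # 200 * 0.005 = 1.0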
configs/inference_2.yaml
DELETED
@@ -1,302 +0,0 @@
train_config:
  expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers
  run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-7b-fixed
  delete_previous_checkpoint: true
  batch_size: 4
  gradient_accumulation_steps: 2
  seed: 42
  learning_rate: 0.00002
  lr_scheduler: constant
  loss_multiplier: 1.0
  warmup_steps: 1875
  weight_decay: 0.1
  precision: amp_bf16  # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
  gradient_checkpointing: False
  num_epochs: 200  # num_epochs * dataset_blending_global_weight = 1
  offline: false
  freeze_lm_embeddings: false
  logging_steps: 10
  dist_backend: nccl
  dist_url: env://  # tcp://localhost:7000
  no_set_device_rank: false
  fsdp: true
  fsdp_use_orig_params: false  # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
  fsdp_sharding_strategy: full  # full, hybrid
  horovod: false

# instruction tuning hparams
sft_config:
  pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-7b-fixed/
  pretrained_ckpt: checkpoint_199.pt
  unfreeze_full_lm: false

data_config:
  dataset_blending_global_weight: 0.005

  dataset_blending_config:

    MMAUQA/train:
      weight: 1.5

    AudioSet-Temporal-Speech-Audio-QA/train:
      weight: 1.0

    CompA-R-AQA/train:
      weight: 1.0

    # Audio QA
    Clotho-AQA-AQA/train:
      weight: 1.0

    OpenAQA-AQA/train:
      weight: 1.0

    SalmonnQA/train:
      weight: 0.8

    AudioEntailmentQA/train:
      weight: 1.0

    # Audio Captioning

    Clotho-v2-AudioCaptioning/train:
      weight: 1.0

    audiocaps-AudioCaptioning/train:
      weight: 1.0

    Epidemic_sound-AudioCaptioning/train:
      weight: 1.0

    MACS-AudioCaptioning/train:
      weight: 1.0

    # Audio Classification

    UrbanSound8K-EventClassification/train:
      weight: 0.5

    TUT-EventClassification/train:
      weight: 2.0

    FSD50k-EventClassification/train:
      weight: 1.0

    CochlScene-SceneClassification/train:
      weight: 1.0

    NonSpeech7k-EventClassification/train:
      weight: 1.0

    chime-home-EventClassification/train:
      weight: 1.0

    SONYC-UST-EventClassification/train:
      weight: 1.0

    # Speech Emotion Classification

    MELD-EmotionClassification/train:
      weight: 0.5

    MELD-SentimentClassification/train:
      weight: 0.5

    emov-db-EmotionClassification/train:
      weight: 1.0

    jl-corpus-EmotionClassification/train:
      weight: 6.0

    tess-EmotionClassification/train:
      weight: 2.5

    IEMOCAP-EmotionClassification/train:
      weight: 3.0

    OMGEmotion-EmotionClassification/train:
      weight: 3.0

    VocalSound-VocalClassification/train:
      weight: 1.5

    # Music QA

    Music-AVQA-AQA_All/train:
      weight: 3.0

    MU-LLAMA-AQA/train:
      weight: 1.0

    # Music Captioning

    LP-MusicCaps-MSD-AudioCaptioning/train:
      weight: 0.06

    LP-MusicCaps-MC-AudioCaptioning/train:
      weight: 2.0

    LP-MusicCaps-MTT-AudioCaptioning/train:
      weight: 1.0

    MusicCaps-AudioCaptioning/train:
      weight: 6.0

    musdbhq-captioning/train:
      weight: 2.0

    # Music Understanding

    Medley-solos-DB-InstrClassification/train:
      weight: 1.5

    GTZAN-GenreClassification/train:
      weight: 2.0

    NSynth-MIR/train:
      weight: 0.4

    NSynth-Instrument/train:
      weight: 1.5

    NSynth-Source/train:
      weight: 1.5

    mtg-jamendo-MusicTagging/train:
      weight: 1.0

    FMA-GenreClassification/train:
      weight: 1.0

    musdbhq-InstrClassification/train:
      weight: 1.0

    LLARK_FMA-mir/train:
      weight: 1.0

    LLARK_FMA-reasoning/train:
      weight: 1.0

    LLARK_MagnaTagATune-mir/train:
      weight: 1.0

    LLARK_MTG-Jamendo-reasoning/train:
      weight: 1.0

    LLARK_MagnaTagATune-reasoning/train:
      weight: 1.0

    LLARK_MTG-Jamendo-mir/train:
      weight: 1.0

    MusicBenchQA/train:
      weight: 1.0

  dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
  data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
  dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers/dataset_blending.json
  max_tokens: 512
  num_workers: 4

  valid_dataset_config:

    Clotho-AQA-AQA/test: true

    Clotho-v2-AudioCaptioning/test: true
    audiocaps-AudioCaptioning/test: true

    FSD50k-EventClassification/test: true
    CochlScene-SceneClassification/test: true
    NonSpeech7k-EventClassification/test: true
    SONYC-UST-EventClassification/test: true

    MELD-EmotionClassification/test: true
    MELD-SentimentClassification/test: true
    emov-db-EmotionClassification/val: true
    jl-corpus-EmotionClassification/val: true
    tess-EmotionClassification/val: true
    IEMOCAP-EmotionClassification/val: true
    OMGEmotion-EmotionClassification/val: true
    VocalSound-VocalClassification/test: true

    Music-AVQA-AQA_All/test: true
    MU-LLAMA-AQA/test: true

    LP-MusicCaps-MSD-AudioCaptioning/test: true
    LP-MusicCaps-MC-AudioCaptioning/test: true
    LP-MusicCaps-MTT-AudioCaptioning/test: true
    MusicCaps-AudioCaptioning/test: true

    NSynth-MIR/test: true
    mtg-jamendo-MusicTagging/val: true
    musdbhq-InstrClassification/test: true

    # zero shot
    # CREMA-D-EmotionClassification/train:
    #   prefix_prob: 1.0

    # ravdess-EmotionClassification/train:
    #   prefix_prob: 1.0

    # UrbanSound8K-EventClassification/train:
    #   prefix_prob: 1.0

    # ESC50-EventClassification/train:
    #   prefix_prob: 1.0

    # DCASE17Task4-SceneClassification/test:
    #   prefix_prob: 1.0

    # GTZAN-GenreClassification/train:
    #   prefix_prob: 1.0

    # Medley-solos-DB-InstrClassification/test:
    #   prefix_prob: 1.0

clap_config:
  method: nvclap-large
  audio_embed_dim: 2048
  checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt

  window_length: 10.0  # seconds
  window_overlap: 0.0  # seconds
  max_num_window: 9  # 1.5 minutes
  max_num_fewshot: 1  # number of fewshot samples (including the final one)
  finetune: true

whisper_config:
  method: whisper-large-v3
  path: openai/whisper-large-v3
  audio_embed_dim: 1280
  sampling_rate: 16000

  window_length: 30.0  # seconds
  window_overlap: 0.0  # seconds
  max_num_window: 1  # 5 minutes
  max_num_fewshot: 1  # number of fewshot samples (including the final one)

mert_config:
  method: mert-v1
  path: m-a-p/MERT-v1-330M
  audio_embed_dim: 1024
  sampling_rate: 24000

  window_length: 10.0  # seconds
  window_overlap: 0.0  # seconds
  max_num_window: 1  # 5 minutes
  max_num_fewshot: 1  # number of fewshot samples (including the final one)

model_config:
  cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache

  lang_encoder_path: Qwen/Qwen2.5-3B
  tokenizer_path: Qwen/Qwen2.5-3B
  cross_attn_every_n_layers: 1
  audio_transformer_kwargs: {
    n_head: 8,
    n_layers: 3,
    d_inner: 2048,
    max_num_media: 128,  # must be >= max_num_window * num_fewshot_samples (4)
    max_window_per_audio: 1,  # must = max_num_window
    common_encoder_embed_dim: 1024
  }
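Unlike inference_1.5.yaml, this config enables sft_config, pointing at a stage-1 run directory and a checkpoint file name. A hedged sketch of how the resume path could be assembled from those two keys; nothing here is taken from the deleted training code itself.

import os
import yaml

with open("configs/inference_2.yaml") as f:
    cfg = yaml.safe_load(f)

sft = cfg["sft_config"]
# Join the pretrained run directory with the checkpoint file name.
ckpt_path = os.path.join(sft["pretrained_path"], sft["pretrained_ckpt"])
print(ckpt_path)  # ends in .../checkpoint_199.pt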
configs/inference_long.yaml
DELETED
@@ -1,284 +0,0 @@
train_config:
  expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed-sft
  run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed-sft-long
  delete_previous_checkpoint: true
  batch_size: 2
  gradient_accumulation_steps: 2
  seed: 42
  learning_rate: 0.00002
  lr_scheduler: constant
  loss_multiplier: 1.0
  warmup_steps: 1875
  weight_decay: 0.1
  precision: amp_bf16  # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
  gradient_checkpointing: False
  num_epochs: 200  # num_epochs * dataset_blending_global_weight = 1
  offline: false
  freeze_lm_embeddings: false
  logging_steps: 10
  dist_backend: nccl
  dist_url: env://  # tcp://localhost:7000
  no_set_device_rank: false
  fsdp: true
  fsdp_use_orig_params: false  # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
  fsdp_sharding_strategy: full  # full, hybrid
  horovod: false

# instruction tuning hparams
# sft_config:
#   pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed_ckpt_stage1/
#   pretrained_ckpt: checkpoint_199.pt
#   unfreeze_full_lm: false

data_config:
  dataset_blending_global_weight: 0.005

  dataset_blending_config:

    MMAUQA/train:
      weight: 1.5

    AudioSet-Temporal-Speech-Audio-QA/train:
      weight: 1.0

    CompA-R-AQA/train:
      weight: 1.0

    # Audio QA
    Clotho-AQA-AQA/train:
      weight: 1.0

    OpenAQA-AQA/train:
      weight: 1.0

    SalmonnQA/train:
      weight: 1.0

    AudioEntailmentQA/train:
      weight: 1.0

    # Audio Captioning

    Clotho-v2-AudioCaptioning/train:
      weight: 1.0

    audiocaps-AudioCaptioning/train:
      weight: 1.0

    Epidemic_sound-AudioCaptioning/train:
      weight: 1.0

    MACS-AudioCaptioning/train:
      weight: 1.0

    # Audio Classification

    FSD50k-EventClassification/train:
      weight: 1.0

    CochlScene-SceneClassification/train:
      weight: 1.0

    NonSpeech7k-EventClassification/train:
      weight: 1.0

    chime-home-EventClassification/train:
      weight: 1.0

    SONYC-UST-EventClassification/train:
      weight: 1.0

    # Speech Emotion Classification

    MELD-EmotionClassification/train:
      weight: 0.5

    MELD-SentimentClassification/train:
      weight: 0.5

    emov-db-EmotionClassification/train:
      weight: 1.0

    jl-corpus-EmotionClassification/train:
      weight: 6.0

    tess-EmotionClassification/train:
      weight: 2.5

    IEMOCAP-EmotionClassification/train:
      weight: 3.0

    OMGEmotion-EmotionClassification/train:
      weight: 3.0

    VocalSound-VocalClassification/train:
      weight: 1.5

    # Music QA

    Music-AVQA-AQA_All/train:
      weight: 3.0

    MU-LLAMA-AQA/train:
      weight: 1.0

    # Music Captioning

    LP-MusicCaps-MSD-AudioCaptioning/train:
      weight: 0.06

    LP-MusicCaps-MC-AudioCaptioning/train:
      weight: 2.0

    LP-MusicCaps-MTT-AudioCaptioning/train:
      weight: 1.0

    MusicCaps-AudioCaptioning/train:
      weight: 6.0

    musdbhq-captioning/train:
      weight: 2.0

    # Music Understanding

    NSynth-MIR/train:
      weight: 0.2

    mtg-jamendo-MusicTagging/train:
      weight: 0.1

    FMA-GenreClassification/train:
      weight: 0.5

    musdbhq-InstrClassification/train:
      weight: 0.8

    LLARK_FMA-mir/train:
      weight: 1.0

    LLARK_FMA-reasoning/train:
      weight: 1.0

    LLARK_MagnaTagATune-mir/train:
      weight: 1.0

    LLARK_MTG-Jamendo-reasoning/train:
      weight: 1.0

    LLARK_MagnaTagATune-reasoning/train:
      weight: 1.0

    LLARK_MTG-Jamendo-mir/train:
      weight: 1.0

    MusicBenchQA/train:
      weight: 1.0

  dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
  data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
  dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/dataset_blending.json
  max_tokens: 512
  num_workers: 4

  valid_dataset_config:

    Clotho-AQA-AQA/test: true

    Clotho-v2-AudioCaptioning/test: true
    audiocaps-AudioCaptioning/test: true

    FSD50k-EventClassification/test: true
    CochlScene-SceneClassification/test: true
    NonSpeech7k-EventClassification/test: true
    SONYC-UST-EventClassification/test: true

    MELD-EmotionClassification/test: true
    MELD-SentimentClassification/test: true
    emov-db-EmotionClassification/val: true
    jl-corpus-EmotionClassification/val: true
    tess-EmotionClassification/val: true
    IEMOCAP-EmotionClassification/val: true
    OMGEmotion-EmotionClassification/val: true
    VocalSound-VocalClassification/test: true

    Music-AVQA-AQA_All/test: true
    MU-LLAMA-AQA/test: true

    LP-MusicCaps-MSD-AudioCaptioning/test: true
    LP-MusicCaps-MC-AudioCaptioning/test: true
    LP-MusicCaps-MTT-AudioCaptioning/test: true
    MusicCaps-AudioCaptioning/test: true

    NSynth-MIR/test: true
    mtg-jamendo-MusicTagging/val: true
    musdbhq-InstrClassification/test: true

    # # zero shot
    # CREMA-D-EmotionClassification/train:
    #   prefix_prob: 1.0

    # ravdess-EmotionClassification/train:
    #   prefix_prob: 1.0

    # UrbanSound8K-EventClassification/train:
    #   prefix_prob: 1.0

    # ESC50-EventClassification/train:
    #   prefix_prob: 1.0

    # DCASE17Task4-SceneClassification/test:
    #   prefix_prob: 1.0

    # GTZAN-GenreClassification/train:
    #   prefix_prob: 1.0

    # Medley-solos-DB-InstrClassification/test:
    #   prefix_prob: 1.0

clap_config:
  method: nvclap-large
  audio_embed_dim: 2048
  checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt

  window_length: 10.0  # seconds
  window_overlap: 0.0  # seconds
  max_num_window: 30  # 1.5 minutes
  max_num_fewshot: 1  # number of fewshot samples (including the final one)
  finetune: true

whisper_config:
  method: whisper-large-v3
  path: openai/whisper-large-v3
  audio_embed_dim: 1280
  sampling_rate: 16000

  window_length: 30.0  # seconds
  window_overlap: 0.0  # seconds
  max_num_window: 1  # 5 minutes
  max_num_fewshot: 1  # number of fewshot samples (including the final one)

mert_config:
  method: mert-v1
  path: m-a-p/MERT-v1-330M
  audio_embed_dim: 1024
  sampling_rate: 24000

  window_length: 10.0  # seconds
  window_overlap: 0.0  # seconds
  max_num_window: 1  # 5 minutes
  max_num_fewshot: 1  # number of fewshot samples (including the final one)

model_config:
  cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache

  lang_encoder_path: Qwen/Qwen2.5-3B
  tokenizer_path: Qwen/Qwen2.5-3B
  cross_attn_every_n_layers: 1
  audio_transformer_kwargs: {
    n_head: 8,
    n_layers: 3,
    d_inner: 2048,
    max_num_media: 128,  # must be >= max_num_window * num_fewshot_samples (4)
    max_window_per_audio: 1,  # must = max_num_window
    common_encoder_embed_dim: 1024
  }
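The model_config comment in these files requires max_num_media to be at least max_num_window * num_fewshot_samples. A minimal sanity-check sketch using the literal values from inference_long.yaml above; the check itself is just the inline comment restated as an assertion, not code from the deleted repo.

# Values from inference_long.yaml above.
max_num_window = 30   # clap_config.max_num_window
max_num_fewshot = 1   # clap_config.max_num_fewshot
max_num_media = 128   # model_config.audio_transformer_kwargs.max_num_media

# "max_num_media must be >= max_num_window * num_fewshot_samples"
assert max_num_media >= max_num_window * max_num_fewshot  # 128 >= 30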
configs/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node.yaml
DELETED
@@ -1,255 +0,0 @@
train_config:
  expdir: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/v1.0_optimlmax1.3b_foundation
  run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node
  delete_previous_checkpoint: true
  batch_size: 6
  gradient_accumulation_steps: 2  # 4 nodes
  seed: 42
  learning_rate: 0.0001
  lr_scheduler: constant
  loss_multiplier: 1.0
  warmup_steps: 1875
  weight_decay: 0.1
  precision: amp_bf16  # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
  gradient_checkpointing: False
  num_epochs: 100  # num_epochs * dataset_blending_global_weight = 1
  offline: false
  freeze_lm_embeddings: true
  logging_steps: 10
  dist_backend: nccl
  dist_url: env://  # tcp://localhost:7000
  no_set_device_rank: false
  fsdp: true
  fsdp_use_orig_params: false  # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
  fsdp_sharding_strategy: full  # full, hybrid
  horovod: false

data_config:
  dataset_blending_global_weight: 0.01

  dataset_blending_config:

    # Audio QA
    OpenAQA-AQA/train:
      weight: 1.0
      prefix_prob: 0.0
      augmentations:
        do_nothing: 1.0

    # Audio Captioning

    BBCSoundEffects-AudioDescription/train:
      weight: 5.0
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    CLAP_freesound-AudioCaptioning/train:
      weight: 1.0
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    SoundDescs-AudioDescription/train:
      weight: 1.0
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    WavCaps-AudioSet_SL-AudioCaptioning/train:
      weight: 1.0
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    WavCaps-BBC_Sound_Effects-AudioCaptioning/train:
      weight: 2
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    WavCaps-FreeSound-AudioCaptioning/train:
      weight: 2
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    WavCaps-SoundBible-AudioCaptioning/train:
      weight: 5
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    # Audio Classification

    AudioSetFullwoAudioMusicCaps-EventClassification/train:
      weight: 1.0
      prefix_prob: 0.5
      augmentations:
        num_words: 0.8
        do_nothing: 0.2

    AudioSet-EventClassification/train:
      weight: 5.0
      prefix_prob: 0.5
      augmentations:
        num_words: 0.8
        do_nothing: 0.2

    Clotho-AQA-EventClassification/train:
      weight: 5.0
      prefix_prob: 0.5
      augmentations:
        num_words: 0.8
        do_nothing: 0.2

    WavText5K-Tagging/train:
      weight: 3.0
      prefix_prob: 0.5
      augmentations:
        num_words: 0.8
        do_nothing: 0.2

    # Speech Emotion Classification

    MSP-PODCAST-Publish-1.9-EmotionClassification/train:
      weight: 1.8
      prefix_prob: 0.5
      augmentations:
        provide_all_labels: 0.9
        do_nothing: 0.1
    MSP-PODCAST-Publish-1.9-EmotionClassification/interleaved_knn-train:
      weight: 1.2
      prefix_prob: 0.5
      augmentations:
        provide_all_labels: 0.9
        do_nothing: 0.1

    MELD-EmotionClassification/train:
      weight: 1.8
      prefix_prob: 0.5
      augmentations:
        provide_all_labels: 0.9
        do_nothing: 0.1
    MELD-EmotionClassification/interleaved_knn-train:
      weight: 1.2
      prefix_prob: 0.5
      augmentations:
        provide_all_labels: 0.9
        do_nothing: 0.1

    MELD-SentimentClassification/train:
      weight: 1.8
      prefix_prob: 0.5
      augmentations:
        provide_all_labels: 0.9
        do_nothing: 0.1
    MELD-SentimentClassification/interleaved_knn-train:
      weight: 1.2
      prefix_prob: 0.5
      augmentations:
        provide_all_labels: 0.9
        do_nothing: 0.1

    # Music QA

    Music-AVQA-AVQA_All/train:
      weight: 3.0
      prefix_prob: 0.5
      augmentations:
        AQA_binary_instruction: 1.0

    MU-LLAMA-AQA/train:
      weight: 1.8
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0
    MU-LLAMA-AQA/interleaved_knn-train:
      weight: 1.2
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    # Music Captioning

    LP-MusicCaps-MSD-AudioCaptioning/train:
      weight: 1.0
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    # Music Understanding

    NSynth-MIR/train:
      weight: 0.6
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0
    NSynth-MIR/interleaved_knn-train:
      weight: 0.4
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    mtg-jamendo-MusicTagging/train:
      weight: 1.0
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

  dataset_file_root: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/dataset_files
  data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
  dataset_blending_output: dataset_blending.json
  max_tokens: 512
  num_workers: 4

  valid_dataset_config:
    CLAP_freesound-AudioCaptioning/test: true
    SoundDescs-AudioDescription/test: true
    Clotho-AQA-EventClassification/test: true

    MSP-PODCAST-Publish-1.9-EmotionClassification/test: true
    MSP-PODCAST-Publish-1.9-EmotionClassification/interleaved_knn-test: true
    MELD-EmotionClassification/test: true
    MELD-EmotionClassification/interleaved_knn-test: true
    MELD-SentimentClassification/test: true
    MELD-SentimentClassification/interleaved_knn-test: true

    MU-LLAMA-AQA/test: true
    LP-MusicCaps-MSD-AudioCaptioning/val: true
    NSynth-MIR/test: true
    NSynth-MIR/interleaved_knn-test: true
    mtg-jamendo-MusicTagging/val: true

clap_config:
  # method: laion-clap
  # audio_embed_dim: 512
  # model_name: 630k-fusion-best
  # checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/laion-clap-pretrained/laion_clap/630k-fusion-best.pt

  method: microsoft-clap
  audio_embed_dim: 1024
  config_root: /home/zkong/audio_flamingo/audio_flamingo_v1/v1.0_optimlmax1.3b_foundation/my_ms_clap/src/configs
  # model_name: '2023'
  # checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/clap/CLAP_weights_2023.pth
  model_name: 'clapcap'
  checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/clap/clapcap_weights_2023.pth

  window_length: 7.0  # seconds
  window_overlap: 5.25  # seconds
  max_num_window: 16  # 35 seconds
  max_num_fewshot: 4  # number of fewshot samples (including the final one)

model_config:
  cache_dir: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/LLM_pretrained/.cache

  lang_encoder_path: facebook/opt-iml-max-1.3b
  tokenizer_path: facebook/opt-iml-max-1.3b
  cross_attn_every_n_layers: 1
  audio_transformer_kwargs: {
    n_head: 8,
    n_layers: 3,
    d_inner: 2048,
    max_num_media: 128,  # must be >= max_num_window * num_fewshot_samples (4)
    max_window_per_audio: 16,  # must = max_num_window
  }
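This pretraining config slices audio into 7-second CLAP windows with 5.25-second overlap and at most 16 windows per clip; the "# 35 seconds" note corresponds to the total audio the windows can cover. A small sketch of that arithmetic, using the standard sliding-window coverage formula (stated here as an assumption, not taken from the deleted code):

# Values from clap_config above.
window_length = 7.0    # seconds
window_overlap = 5.25  # seconds
max_num_window = 16

hop = window_length - window_overlap                   # 1.75 s between window starts
coverage = window_length + (max_num_window - 1) * hop  # total audio covered
print(coverage)  # 33.25 s, roughly the "# 35 seconds" noted in the config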
configs/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node.yaml
DELETED
@@ -1,183 +0,0 @@
train_config:
  expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed
  run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed
  delete_previous_checkpoint: true
  batch_size: 4
  gradient_accumulation_steps: 2  # 4 nodes
  seed: 42
  learning_rate: 0.0001
  lr_scheduler: constant
  loss_multiplier: 1.0
  warmup_steps: 1875
  weight_decay: 0.1
  precision: amp_bf16  # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
  gradient_checkpointing: False
  num_epochs: 200  # num_epochs * dataset_blending_global_weight = 1
  offline: false
  freeze_lm_embeddings: false
  logging_steps: 10
  dist_backend: nccl
  dist_url: env://  # tcp://localhost:7000
  no_set_device_rank: false
  fsdp: true
  fsdp_use_orig_params: false  # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
  fsdp_sharding_strategy: full  # full, hybrid
  horovod: false

data_config:
  dataset_blending_global_weight: 0.005

  dataset_blending_config:

    # Audio QA
    OpenAQA-AQA/train:
      weight: 1.0

    AudioSet-Temporal-Speech-Audio-QA/train:
      weight: 2.0

    CompA-R-AQA/train:
      weight: 2.0

    # Audio Captioning

    BBCSoundEffects-AudioDescription/train:
      weight: 5.0

    CLAP_freesound-AudioCaptioning/train:
      weight: 1.0

    SoundDescs-AudioDescription/train:
      weight: 1.0

    WavCaps-AudioSet_SL-AudioCaptioning/train:
      weight: 1.0

    WavCaps-BBC_Sound_Effects-AudioCaptioning/train:
      weight: 2.0

    WavCaps-FreeSound-AudioCaptioning/train:
      weight: 2.0

    WavCaps-SoundBible-AudioCaptioning/train:
      weight: 5.0

    Ego-10-AudioCaptioning/train:
      weight: 2.0

    Ego-30-AudioCaptioning/train:
      weight: 2.0

    # Audio Classification

    AudioSetFullwoAudioMusicCaps-EventClassification/train:
      weight: 1.0

    AudioSet-EventClassification/train:
      weight: 5.0

    Clotho-AQA-EventClassification/train:
      weight: 5.0

    WavText5K-Tagging/train:
      weight: 3.0

    # Speech Emotion Classification

    MSP-PODCAST-Publish-1.9-EmotionClassification/train:
      weight: 3.0

    MELD-EmotionClassification/train:
      weight: 3.0

    MELD-SentimentClassification/train:
      weight: 3.0

    # Music QA

    Music-AVQA-AVQA_All/train:
      weight: 3.0

    MU-LLAMA-AQA/train:
      weight: 3.0

    # Music Captioning

    LP-MusicCaps-MSD-AudioCaptioning/train:
      weight: 1.0

    # Music Understanding

    NSynth-MIR/train:
      weight: 1.0

    mtg-jamendo-MusicTagging/train:
      weight: 1.0

  dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
  data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
  dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/dataset_blending.json
  max_tokens: 512
  num_workers: 4

  valid_dataset_config:
    CLAP_freesound-AudioCaptioning/test: true
    SoundDescs-AudioDescription/test: true
    Clotho-AQA-EventClassification/test: true

    MSP-PODCAST-Publish-1.9-EmotionClassification/test: true
    MELD-EmotionClassification/test: true
    MELD-SentimentClassification/test: true

    MU-LLAMA-AQA/test: true
    LP-MusicCaps-MSD-AudioCaptioning/val: true
    NSynth-MIR/test: true
    mtg-jamendo-MusicTagging/val: true

clap_config:
  method: nvclap-large
  audio_embed_dim: 2048
  checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt

  window_length: 10.0  # seconds
  window_overlap: 0.0  # seconds
  max_num_window: 3  # 5 minutes
  max_num_fewshot: 1  # number of fewshot samples (including the final one)

whisper_config:
  method: whisper-large-v3
  path: openai/whisper-large-v3
  audio_embed_dim: 1280
  sampling_rate: 16000

  window_length: 30.0  # seconds
  window_overlap: 0.0  # seconds
  max_num_window: 1  # 5 minutes
  max_num_fewshot: 1  # number of fewshot samples (including the final one)
  finetune: true

mert_config:
  method: mert-v1
  path: m-a-p/MERT-v1-330M
  audio_embed_dim: 1024
  sampling_rate: 24000

  window_length: 10.0  # seconds
  window_overlap: 0.0  # seconds
  max_num_window: 1  # 5 minutes
  max_num_fewshot: 1  # number of fewshot samples (including the final one)

model_config:
  cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache

  lang_encoder_path: Qwen/Qwen2.5-3B
  tokenizer_path: Qwen/Qwen2.5-3B
  cross_attn_every_n_layers: 1
  audio_transformer_kwargs: {
    n_head: 8,
    n_layers: 3,
    d_inner: 2048,
    max_num_media: 128,  # must be >= max_num_window * num_fewshot_samples (4)
    max_window_per_audio: 1,  # must = max_num_window
    common_encoder_embed_dim: 1024
  }
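In this config the per-GPU batch size and gradient accumulation steps together set the effective batch per optimizer step, multiplied again by the number of GPUs under FSDP. A one-line sketch with the values above; the GPU count is an assumption (4 nodes as in the run name, 8 GPUs per node), not something stated in the file.

# Values from train_config above.
batch_size = 4                   # per GPU
gradient_accumulation_steps = 2  # "# 4 nodes"
num_gpus = 32                    # assumption: 4 nodes x 8 GPUs, not stated in the config

effective_batch = batch_size * gradient_accumulation_steps * num_gpus
print(effective_batch)  # 256 under that assumption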
configs/run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node.yaml
DELETED
@@ -1,483 +0,0 @@
train_config:
  expdir: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/v1.0_optimlmax1.3b_foundation
  run_name: run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node
  delete_previous_checkpoint: true
  batch_size: 4
  gradient_accumulation_steps: 1
  seed: 42
  learning_rate: 0.00002
  lr_scheduler: constant
  loss_multiplier: 1.0
  warmup_steps: 1875
  weight_decay: 0.1
  precision: fp32  # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
  gradient_checkpointing: False
  num_epochs: 160  # num_epochs * dataset_blending_global_weight = 1
  offline: false
  freeze_lm_embeddings: false
  logging_steps: 10
  dist_backend: nccl
  dist_url: env://  # tcp://localhost:7000
  no_set_device_rank: false
  fsdp: true
  fsdp_use_orig_params: false  # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
  fsdp_sharding_strategy: full  # full, hybrid
  horovod: false

# instruction tuning hparams
sft_config:
  pretrained_path: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/v1.0_optimlmax1.3b_foundation/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node/
  pretrained_ckpt: checkpoint_99.pt
  unfreeze_full_lm: true

data_config:
  dataset_blending_global_weight: 0.01

  dataset_blending_config:

    # Audio QA
    Clotho-AQA-AQA/train:
      weight: 0.8
      prefix_prob: 1.0
      augmentations:
        AQA_binary_instruction: 1.0
    Clotho-AQA-AQA/interleaved_knn-train:
      weight: 0.2
      prefix_prob: 1.0
      augmentations:
        AQA_binary_instruction: 1.0

    OpenAQA-AQA/train:
      weight: 1.0
      prefix_prob: 1.0
      augmentations:
        do_nothing: 1.0

    # Audio Captioning

    Clotho-v2-AudioCaptioning/train:
      weight: 0.8
      prefix_prob: 1.0
      augmentations:
        AC_short: 1.0
    Clotho-v2-AudioCaptioning/interleaved_knn-train:
      weight: 0.2
      prefix_prob: 1.0
      augmentations:
        AC_short: 1.0

    audiocaps-AudioCaptioning/train:
      weight: 0.8
      prefix_prob: 1.0
      augmentations:
        AC_short: 1.0
    audiocaps-AudioCaptioning/interleaved_knn-train:
      weight: 0.2
      prefix_prob: 1.0
      augmentations:
        AC_short: 1.0

    Epidemic_sound-AudioCaptioning/train:
      weight: 0.8
      prefix_prob: 1.0
      augmentations:
        AC_short: 1.0
    Epidemic_sound-AudioCaptioning/interleaved_knn-train:
      weight: 0.2
      prefix_prob: 1.0
      augmentations:
        AC_short: 1.0

    MACS-AudioCaptioning/train:
      weight: 0.8
      prefix_prob: 1.0
      augmentations:
        AC_short: 1.0
    MACS-AudioCaptioning/interleaved_knn-train:
      weight: 0.2
      prefix_prob: 1.0
      augmentations:
        AC_short: 1.0

    # Audio Classification

    FSD50k-EventClassification/train:
      weight: 0.8
      prefix_prob: 1.0
      augmentations:
        default: 1.0
    FSD50k-EventClassification/interleaved_knn-train:
      weight: 0.2
      prefix_prob: 1.0
      augmentations:
        default: 1.0

    CochlScene-SceneClassification/train:
      weight: 0.8
      prefix_prob: 1.0
      augmentations:
        provide_all_labels: 0.5
        default: 0.5
    CochlScene-SceneClassification/interleaved_knn-train:
      weight: 0.2
      prefix_prob: 1.0
      augmentations:
        provide_all_labels: 0.5
        default: 0.5

    NonSpeech7k-EventClassification/train:
      weight: 0.8
      prefix_prob: 1.0
      augmentations:
        provide_all_labels: 0.5
        default: 0.5
    NonSpeech7k-EventClassification/interleaved_knn-train:
      weight: 0.2
      prefix_prob: 1.0
      augmentations:
        provide_all_labels: 0.5
        default: 0.5

    chime-home-EventClassification/train:
      weight: 0.8
      prefix_prob: 1.0
      augmentations:
        default: 0.5
        num_words: 0.5
    chime-home-EventClassification/interleaved_knn-train:
      weight: 0.2
      prefix_prob: 1.0
      augmentations:
        default: 0.5
        num_words: 0.5

    SONYC-UST-EventClassification/train:
      weight: 0.8
      prefix_prob: 1.0
      augmentations:
        default: 0.5
        num_words: 0.5
    SONYC-UST-EventClassification/interleaved_knn-train:
      weight: 0.2
      prefix_prob: 1.0
      augmentations:
        default: 0.5
        num_words: 0.5

    # Speech Emotion Classification

    MELD-EmotionClassification/train:
      weight: 0.5
      prefix_prob: 1.0
      augmentations:
        provide_all_labels: 0.5
        default: 0.5

    MELD-SentimentClassification/train:
      weight: 0.5
      prefix_prob: 1.0
      augmentations:
        provide_all_labels: 0.1
        default: 0.9

    emov-db-EmotionClassification/train:
      weight: 1.6
      prefix_prob: 1.0
      augmentations:
        provide_all_labels: 0.5
        default: 0.5
    emov-db-EmotionClassification/interleaved_knn-train:
      weight: 0.4
      prefix_prob: 1.0
      augmentations:
        provide_all_labels: 0.5
        default: 0.5

    jl-corpus-EmotionClassification/train:
      weight: 6.0
      prefix_prob: 1.0
      augmentations:
        provide_all_labels: 0.5
        default: 0.5
    jl-corpus-EmotionClassification/interleaved_knn-train:
      weight: 1.5
      prefix_prob: 1.0
      augmentations:
        provide_all_labels: 0.5
        default: 0.5

    tess-EmotionClassification/train:
      weight: 2.0
      prefix_prob: 1.0
      augmentations:
        provide_all_labels: 0.5
        default: 0.5
    tess-EmotionClassification/interleaved_knn-train:
      weight: 0.5
      prefix_prob: 1.0
      augmentations:
        provide_all_labels: 0.5
        default: 0.5

    IEMOCAP-EmotionClassification/train:
      weight: 2.4
      prefix_prob: 1.0
      augmentations:
        provide_all_labels: 0.5
        default: 0.5
    IEMOCAP-EmotionClassification/interleaved_knn-train:
      weight: 0.6
      prefix_prob: 1.0
      augmentations:
        provide_all_labels: 0.5
        default: 0.5

    OMGEmotion-EmotionClassification/train:
      weight: 3.0
      prefix_prob: 1.0
      augmentations:
        provide_all_labels: 0.5
        default: 0.5

    VocalSound-VocalClassification/train:
      weight: 1.0
      prefix_prob: 1.0
      augmentations:
        provide_all_labels: 0.5
        default: 0.5

    # Music QA

    Music-AVQA-AQA_All/train:
      weight: 2.0
      prefix_prob: 1.0
      augmentations:
        AQA_binary_instruction: 1.0
    Music-AVQA-AQA_All/interleaved_knn-train:
      weight: 1.0
      prefix_prob: 1.0
      augmentations:
        AQA_binary_instruction: 1.0

    MU-LLAMA-AQA/train:
      weight: 0.9
      prefix_prob: 1.0
      augmentations:
        do_nothing: 1.0
    MU-LLAMA-AQA/interleaved_knn-train:
      weight: 0.1
      prefix_prob: 1.0
      augmentations:
        do_nothing: 1.0

    # Music Captioning

    LP-MusicCaps-MSD-AudioCaptioning/train:
      weight: 0.05  # 1.3M
      prefix_prob: 1.0
      augmentations:
        AC_paragraph: 1.0
    LP-MusicCaps-MSD-AudioCaptioning/interleaved_knn-train:
      weight: 0.05  # 111k
      prefix_prob: 1.0
      augmentations:
        AC_paragraph: 1.0

    LP-MusicCaps-MC-AudioCaptioning/train:
      weight: 1.6
      prefix_prob: 1.0
      augmentations:
        AC_paragraph: 1.0
    LP-MusicCaps-MC-AudioCaptioning/interleaved_knn-train:
      weight: 0.4
      prefix_prob: 1.0
      augmentations:
        AC_paragraph: 1.0

    LP-MusicCaps-MTT-AudioCaptioning/train:
      weight: 0.8
      prefix_prob: 1.0
      augmentations:
        AC_long: 1.0
    LP-MusicCaps-MTT-AudioCaptioning/interleaved_knn-train:
      weight: 0.2
      prefix_prob: 1.0
      augmentations:
        AC_long: 1.0

    MusicCaps-AudioCaptioning/train:
      weight: 6.0
      prefix_prob: 1.0
      augmentations:
        AC_paragraph: 1.0
    MusicCaps-AudioCaptioning/interleaved_knn-train:
      weight: 1.5
      prefix_prob: 1.0
      augmentations:
        AC_paragraph: 1.0

    SongDescriber-AudioCaptioning/train:
      weight: 0.8
      prefix_prob: 1.0
      augmentations:
        AC_long: 1.0
    SongDescriber-AudioCaptioning/interleaved_knn-train:
      weight: 0.2
      prefix_prob: 1.0
      augmentations:
        AC_long: 1.0

    # Music Understanding

    NSynth-MIR/train:
      weight: 0.2  # 289k for weight = 1
      prefix_prob: 1.0
      augmentations:
        do_nothing: 1.0
    NSynth-MIR/interleaved_knn-train:
      weight: 0.2  # 60k for weight = 1
      prefix_prob: 1.0
      augmentations:
        do_nothing: 1.0

    mtg-jamendo-MusicTagging/train:
      weight: 0.1
      prefix_prob: 1.0
      augmentations:
        default: 1.0

    FMA-GenreClassification/train:
      weight: 0.4  # 104k for weight = 1
      prefix_prob: 1.0
      augmentations:
        do_nothing: 1.0
    FMA-GenreClassification/interleaved_knn-train:
      weight: 0.3  # 46k for weight = 1
      prefix_prob: 1.0
      augmentations:
        do_nothing: 1.0

    musdbhq-InstrClassification/train:
      weight: 0.8
      prefix_prob: 1.0
      augmentations:
        provide_all_labels: 0.5
        default: 0.5

  dataset_file_root: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/dataset_files
  data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
  dataset_blending_output: dataset_blending.json
  max_tokens: 512
  num_workers: 4

  valid_dataset_config:

    Clotho-AQA-AQA/test: true
    Clotho-AQA-AQA/interleaved_knn-test: true

    Clotho-v2-AudioCaptioning/test: true
    Clotho-v2-AudioCaptioning/interleaved_knn-test: true

    FSD50k-EventClassification/test: true
    FSD50k-EventClassification/interleaved_knn-test: true

    CochlScene-SceneClassification/test: true
    CochlScene-SceneClassification/interleaved_knn-test: true

    NonSpeech7k-EventClassification/test: true
    NonSpeech7k-EventClassification/interleaved_knn-test: true
390 |
-
SONYC-UST-EventClassification/test: true
|
391 |
-
SONYC-UST-EventClassification/interleaved_knn-test: true
|
392 |
-
|
393 |
-
emov-db-EmotionClassification/val: true
|
394 |
-
emov-db-EmotionClassification/interleaved_knn-val: true
|
395 |
-
|
396 |
-
jl-corpus-EmotionClassification/val: true
|
397 |
-
jl-corpus-EmotionClassification/interleaved_knn-val: true
|
398 |
-
|
399 |
-
tess-EmotionClassification/val: true
|
400 |
-
tess-EmotionClassification/interleaved_knn-val: true
|
401 |
-
|
402 |
-
IEMOCAP-EmotionClassification/test: true
|
403 |
-
IEMOCAP-EmotionClassification/interleaved_knn-test: true
|
404 |
-
|
405 |
-
OMGEmotion-EmotionClassification/val: true
|
406 |
-
|
407 |
-
Music-AVQA-AQA_All/test: true
|
408 |
-
Music-AVQA-AQA_All/interleaved_knn-test: true
|
409 |
-
|
410 |
-
MU-LLAMA-AQA/test: true
|
411 |
-
|
412 |
-
LP-MusicCaps-MSD-AudioCaptioning/test: true
|
413 |
-
LP-MusicCaps-MC-AudioCaptioning/test: true
|
414 |
-
LP-MusicCaps-MTT-AudioCaptioning/test: true
|
415 |
-
LP-MusicCaps-MTT-AudioCaptioning/interleaved_knn-test: true
|
416 |
-
|
417 |
-
NSynth-MIR/test: true
|
418 |
-
NSynth-MIR/interleaved_knn-test: true
|
419 |
-
|
420 |
-
mtg-jamendo-MusicTagging/val: true
|
421 |
-
|
422 |
-
audiocaps-AudioCaptioning/test: true
|
423 |
-
audiocaps-AudioCaptioning/interleaved_knn-test: true
|
424 |
-
|
425 |
-
MusicCaps-AudioCaptioning/test: true
|
426 |
-
|
427 |
-
MELD-EmotionClassification/test: true
|
428 |
-
MELD-SentimentClassification/test: true
|
429 |
-
VocalSound-VocalClassification/test: true
|
430 |
-
musdbhq-InstrClassification/test: true
|
431 |
-
|
432 |
-
# zero shot
|
433 |
-
|
434 |
-
GTZAN-GenreClassification/train:
|
435 |
-
prefix_prob: 1.0
|
436 |
-
augmentations:
|
437 |
-
provide_all_labels: 1.0
|
438 |
-
GTZAN-GenreClassification/interleaved_knn-train:
|
439 |
-
prefix_prob: 1.0
|
440 |
-
augmentations:
|
441 |
-
provide_all_labels: 1.0
|
442 |
-
|
443 |
-
Medley-solos-DB-InstrClassification/test:
|
444 |
-
prefix_prob: 1.0
|
445 |
-
augmentations:
|
446 |
-
provide_all_labels: 1.0
|
447 |
-
Medley-solos-DB-InstrClassification/interleaved_knn-test:
|
448 |
-
prefix_prob: 1.0
|
449 |
-
augmentations:
|
450 |
-
provide_all_labels: 1.0
|
451 |
-
|
452 |
-
clap_config:
|
453 |
-
# method: laion-clap
|
454 |
-
# audio_embed_dim: 512
|
455 |
-
# model_name: 630k-fusion-best
|
456 |
-
# checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/laion-clap-pretrained/laion_clap/630k-fusion-best.pt
|
457 |
-
|
458 |
-
method: microsoft-clap
|
459 |
-
audio_embed_dim: 1024
|
460 |
-
config_root: /home/zkong/audio_flamingo/audio_flamingo_v1/v1.0_optimlmax1.3b_foundation/my_ms_clap/src/configs
|
461 |
-
# model_name: '2023'
|
462 |
-
# checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/clap/CLAP_weights_2023.pth
|
463 |
-
model_name: 'clapcap'
|
464 |
-
checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/clap/clapcap_weights_2023.pth
|
465 |
-
|
466 |
-
window_length: 7.0 # seconds
|
467 |
-
window_overlap: 5.25 # seconds
|
468 |
-
max_num_window: 16 # 35 seconds
|
469 |
-
max_num_fewshot: 4 # number of fewshot samples (including the final one)
|
470 |
-
|
471 |
-
model_config:
|
472 |
-
cache_dir: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/LLM_pretrained/.cache
|
473 |
-
|
474 |
-
lang_encoder_path: facebook/opt-iml-max-1.3b
|
475 |
-
tokenizer_path: facebook/opt-iml-max-1.3b
|
476 |
-
cross_attn_every_n_layers: 1
|
477 |
-
audio_transformer_kwargs: {
|
478 |
-
n_head: 8,
|
479 |
-
n_layers: 3,
|
480 |
-
d_inner: 2048,
|
481 |
-
max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
|
482 |
-
max_window_per_audio: 16, # must = max_num_window
|
483 |
-
}
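The window and few-shot settings in this deleted config encode two numeric relations that the inline comments only hint at: consecutive CLAP windows advance by window_length - window_overlap seconds, and the audio transformer must reserve enough media positions for every window of every few-shot example. The snippet below is not part of the repository; it is a minimal, hypothetical sanity check of those relations using the values copied from the YAML above, with purely illustrative variable names.

# Hypothetical sanity check for the ICL (interleaved few-shot) config above.
window_length = 7.0        # seconds per CLAP window
window_overlap = 5.25      # seconds shared by consecutive windows
max_num_window = 16        # config comment: "# 35 seconds"
max_num_fewshot = 4        # few-shot samples, including the final one
max_num_media = 128        # from audio_transformer_kwargs
max_window_per_audio = 16  # from audio_transformer_kwargs

hop = window_length - window_overlap                    # 1.75 s stride between window starts
coverage = window_length + (max_num_window - 1) * hop   # 7.0 + 15 * 1.75 = 33.25 s per audio

assert max_window_per_audio == max_num_window             # "must = max_num_window"
assert max_num_media >= max_num_window * max_num_fewshot  # 128 >= 16 * 4 = 64

print(f"window hop: {hop:.2f} s, per-audio coverage: {coverage:.2f} s (comment rounds this to ~35 s)")

Under these values, four few-shot examples of up to 16 windows each occupy at most 64 of the 128 available media positions.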
configs/run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node.yaml
DELETED
@@ -1,284 +0,0 @@
train_config:
expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed
run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed-sft
delete_previous_checkpoint: true
batch_size: 4
gradient_accumulation_steps: 2
seed: 42
learning_rate: 0.00002
lr_scheduler: constant
loss_multiplier: 1.0
warmup_steps: 1875
weight_decay: 0.1
precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
gradient_checkpointing: False
num_epochs: 200 # num_epochs * dataset_blending_global_weight = 1
offline: false
freeze_lm_embeddings: false
logging_steps: 10
dist_backend: nccl
dist_url: env:// # tcp://localhost:7000
no_set_device_rank: false
fsdp: true
fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
fsdp_sharding_strategy: full # full, hybrid
horovod: false

# instruction tuning hparams
sft_config:
pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed_ckpt_stage1/
pretrained_ckpt: checkpoint_199.pt
unfreeze_full_lm: false

data_config:
dataset_blending_global_weight: 0.005

dataset_blending_config:

MMAUQA/train:
weight: 1.5

AudioSet-Temporal-Speech-Audio-QA/train:
weight: 1.0

CompA-R-AQA/train:
weight: 1.0

# Audio QA
Clotho-AQA-AQA/train:
weight: 1.0

OpenAQA-AQA/train:
weight: 1.0

SalmonnQA/train:
weight: 1.0

AudioEntailmentQA/train:
weight: 1.0

# Audio Captioning

Clotho-v2-AudioCaptioning/train:
weight: 1.0

audiocaps-AudioCaptioning/train:
weight: 1.0

Epidemic_sound-AudioCaptioning/train:
weight: 1.0

MACS-AudioCaptioning/train:
weight: 1.0

# Audio Classification

FSD50k-EventClassification/train:
weight: 1.0

CochlScene-SceneClassification/train:
weight: 1.0

NonSpeech7k-EventClassification/train:
weight: 1.0

chime-home-EventClassification/train:
weight: 1.0

SONYC-UST-EventClassification/train:
weight: 1.0

# Speech Emotion Classification

MELD-EmotionClassification/train:
weight: 0.5

MELD-SentimentClassification/train:
weight: 0.5

emov-db-EmotionClassification/train:
weight: 1.0

jl-corpus-EmotionClassification/train:
weight: 6.0

tess-EmotionClassification/train:
weight: 2.5

IEMOCAP-EmotionClassification/train:
weight: 3.0

OMGEmotion-EmotionClassification/train:
weight: 3.0

VocalSound-VocalClassification/train:
weight: 1.5

# Music QA

Music-AVQA-AQA_All/train:
weight: 3.0

MU-LLAMA-AQA/train:
weight: 1.0

# Music Captioning

LP-MusicCaps-MSD-AudioCaptioning/train:
weight: 0.06

LP-MusicCaps-MC-AudioCaptioning/train:
weight: 2.0

LP-MusicCaps-MTT-AudioCaptioning/train:
weight: 1.0

MusicCaps-AudioCaptioning/train:
weight: 6.0

musdbhq-captioning/train:
weight: 2.0

# Music Understanding

NSynth-MIR/train:
weight: 0.2

mtg-jamendo-MusicTagging/train:
weight: 0.1

FMA-GenreClassification/train:
weight: 0.5

musdbhq-InstrClassification/train:
weight: 0.8

LLARK_FMA-mir/train:
weight: 1.0

LLARK_FMA-reasoning/train:
weight: 1.0

LLARK_MagnaTagATune-mir/train:
weight: 1.0

LLARK_MTG-Jamendo-reasoning/train:
weight: 1.0

LLARK_MagnaTagATune-reasoning/train:
weight: 1.0

LLARK_MTG-Jamendo-mir/train:
weight: 1.0

MusicBenchQA/train:
weight: 1.0

dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data_w_duration
data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/dataset_blending.json
max_tokens: 512
num_workers: 4

valid_dataset_config:

Clotho-AQA-AQA/test: true

Clotho-v2-AudioCaptioning/test: true
audiocaps-AudioCaptioning/test: true

FSD50k-EventClassification/test: true
CochlScene-SceneClassification/test: true
NonSpeech7k-EventClassification/test: true
SONYC-UST-EventClassification/test: true

MELD-EmotionClassification/test: true
MELD-SentimentClassification/test: true
emov-db-EmotionClassification/val: true
jl-corpus-EmotionClassification/val: true
tess-EmotionClassification/val: true
IEMOCAP-EmotionClassification/val: true
OMGEmotion-EmotionClassification/val: true
VocalSound-VocalClassification/test: true

Music-AVQA-AQA_All/test: true
MU-LLAMA-AQA/test: true

LP-MusicCaps-MSD-AudioCaptioning/test: true
LP-MusicCaps-MC-AudioCaptioning/test: true
LP-MusicCaps-MTT-AudioCaptioning/test: true
MusicCaps-AudioCaptioning/test: true

NSynth-MIR/test: true
mtg-jamendo-MusicTagging/val: true
musdbhq-InstrClassification/test: true

# # zero shot
# CREMA-D-EmotionClassification/train:
# prefix_prob: 1.0

# ravdess-EmotionClassification/train:
# prefix_prob: 1.0

# UrbanSound8K-EventClassification/train:
# prefix_prob: 1.0

# ESC50-EventClassification/train:
# prefix_prob: 1.0

# DCASE17Task4-SceneClassification/test:
# prefix_prob: 1.0

# GTZAN-GenreClassification/train:
# prefix_prob: 1.0

# Medley-solos-DB-InstrClassification/test:
# prefix_prob: 1.0

clap_config:
method: nvclap-large
audio_embed_dim: 2048
checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt

window_length: 10.0 # seconds
window_overlap: 0.0 # seconds
max_num_window: 9 # 1.5 minutes
max_num_fewshot: 1 # number of fewshot samples (including the final one)
finetune: true

whisper_config:
method: whisper-large-v3
path: openai/whisper-large-v3
audio_embed_dim: 1280
sampling_rate: 16000

window_length: 30.0 # seconds
window_overlap: 0.0 # seconds
max_num_window: 1 # 5 minutes
max_num_fewshot: 1 # number of fewshot samples (including the final one)

mert_config:
method: mert-v1
path: m-a-p/MERT-v1-330M
audio_embed_dim: 1024
sampling_rate: 24000

window_length: 10.0 # seconds
window_overlap: 0.0 # seconds
max_num_window: 1 # 5 minutes
max_num_fewshot: 1 # number of fewshot samples (including the final one)

model_config:
cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache

lang_encoder_path: Qwen/Qwen2.5-3B
tokenizer_path: Qwen/Qwen2.5-3B
cross_attn_every_n_layers: 1
audio_transformer_kwargs: {
n_head: 8,
n_layers: 3,
d_inner: 2048,
max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
max_window_per_audio: 1, # must = max_num_window
common_encoder_embed_dim: 1024
}
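Two of the comments in this second config state simple arithmetic relations: num_epochs * dataset_blending_global_weight = 1, and each encoder's per-example audio coverage is window_length * max_num_window (all overlaps here are 0.0). The snippet below is not part of the repository; it is a minimal, hypothetical cross-check of those relations with the values copied from the YAML above, using illustrative names only.

# Hypothetical sanity check for the single-window SFT config above.
num_epochs = 200
dataset_blending_global_weight = 0.005
assert abs(num_epochs * dataset_blending_global_weight - 1.0) < 1e-9  # comment: "= 1"

# Per-encoder coverage: window_length * max_num_window (window_overlap is 0.0 for all three).
encoders = {
    "clap (nvclap-large)": (10.0, 9),  # config comment: "# 1.5 minutes"
    "whisper-large-v3": (30.0, 1),
    "mert-v1": (10.0, 1),
}
for name, (window_length, max_num_window) in encoders.items():
    print(f"{name}: up to {window_length * max_num_window:.0f} s of audio per example")

Only the CLAP branch uses more than one window in this configuration; the Whisper and MERT branches each see a single window per example.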