Commit 5d59dad · 1 Parent(s): f11ac57
root committed: initial commit
configs/inference_1.5.yaml DELETED
@@ -1,302 +0,0 @@
- train_config:
-   expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-1.5b
-   run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-1.5B-sft
-   delete_previous_checkpoint: true
-   batch_size: 32
-   gradient_accumulation_steps: 2
-   seed: 42
-   learning_rate: 0.00002
-   lr_scheduler: constant
-   loss_multiplier: 1.0
-   warmup_steps: 1875
-   weight_decay: 0.1
-   precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
-   gradient_checkpointing: False
-   num_epochs: 200 # num_epochs * dataset_blending_global_weight = 1
-   offline: false
-   freeze_lm_embeddings: false
-   logging_steps: 10
-   dist_backend: nccl
-   dist_url: env:// # tcp://localhost:7000
-   no_set_device_rank: false
-   fsdp: true
-   fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
-   fsdp_sharding_strategy: full # full, hybrid
-   horovod: false
-
- # instruction tuning hparams
- # sft_config:
- #   pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-7b-fixed/
- #   pretrained_ckpt: checkpoint_199.pt
- #   unfreeze_full_lm: false
-
- data_config:
-   dataset_blending_global_weight: 0.005
-
-   dataset_blending_config:
-
-     MMAUQA/train:
-       weight: 1.5
-
-     AudioSet-Temporal-Speech-Audio-QA/train:
-       weight: 1.0
-
-     CompA-R-AQA/train:
-       weight: 1.0
-
-     # Audio QA
-     Clotho-AQA-AQA/train:
-       weight: 1.0
-
-     OpenAQA-AQA/train:
-       weight: 1.0
-
-     SalmonnQA/train:
-       weight: 0.8
-
-     AudioEntailmentQA/train:
-       weight: 1.0
-
-     # Audio Captioning
-
-     Clotho-v2-AudioCaptioning/train:
-       weight: 1.0
-
-     audiocaps-AudioCaptioning/train:
-       weight: 1.0
-
-     Epidemic_sound-AudioCaptioning/train:
-       weight: 1.0
-
-     MACS-AudioCaptioning/train:
-       weight: 1.0
-
-     # Audio Classification
-
-     UrbanSound8K-EventClassification/train:
-       weight: 0.5
-
-     TUT-EventClassification/train:
-       weight: 2.0
-
-     FSD50k-EventClassification/train:
-       weight: 1.0
-
-     CochlScene-SceneClassification/train:
-       weight: 1.0
-
-     NonSpeech7k-EventClassification/train:
-       weight: 1.0
-
-     chime-home-EventClassification/train:
-       weight: 1.0
-
-     SONYC-UST-EventClassification/train:
-       weight: 1.0
-
-     # Speech Emotion Classification
-
-     MELD-EmotionClassification/train:
-       weight: 0.5
-
-     MELD-SentimentClassification/train:
-       weight: 0.5
-
-     emov-db-EmotionClassification/train:
-       weight: 1.0
-
-     jl-corpus-EmotionClassification/train:
-       weight: 6.0
-
-     tess-EmotionClassification/train:
-       weight: 2.5
-
-     IEMOCAP-EmotionClassification/train:
-       weight: 3.0
-
-     OMGEmotion-EmotionClassification/train:
-       weight: 3.0
-
-     VocalSound-VocalClassification/train:
-       weight: 1.5
-
-     # Music QA
-
-     Music-AVQA-AQA_All/train:
-       weight: 3.0
-
-     MU-LLAMA-AQA/train:
-       weight: 1.0
-
-     # Music Captioning
-
-     LP-MusicCaps-MSD-AudioCaptioning/train:
-       weight: 0.06
-
-     LP-MusicCaps-MC-AudioCaptioning/train:
-       weight: 2.0
-
-     LP-MusicCaps-MTT-AudioCaptioning/train:
-       weight: 1.0
-
-     MusicCaps-AudioCaptioning/train:
-       weight: 6.0
-
-     musdbhq-captioning/train:
-       weight: 2.0
-
-     # Music Understanding
-
-     Medley-solos-DB-InstrClassification/train:
-       weight: 1.5
-
-     GTZAN-GenreClassification/train:
-       weight: 2.0
-
-     NSynth-MIR/train:
-       weight: 0.4
-
-     NSynth-Instrument/train:
-       weight: 1.5
-
-     NSynth-Source/train:
-       weight: 1.5
-
-     mtg-jamendo-MusicTagging/train:
-       weight: 1.0
-
-     FMA-GenreClassification/train:
-       weight: 1.0
-
-     musdbhq-InstrClassification/train:
-       weight: 1.0
-
-     LLARK_FMA-mir/train:
-       weight: 1.0
-
-     LLARK_FMA-reasoning/train:
-       weight: 1.0
-
-     LLARK_MagnaTagATune-mir/train:
-       weight: 1.0
-
-     LLARK_MTG-Jamendo-reasoning/train:
-       weight: 1.0
-
-     LLARK_MagnaTagATune-reasoning/train:
-       weight: 1.0
-
-     LLARK_MTG-Jamendo-mir/train:
-       weight: 1.0
-
-     MusicBenchQA/train:
-       weight: 1.0
-
-   dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
-   data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
-   dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers/dataset_blending.json
-   max_tokens: 512
-   num_workers: 4
-
-   valid_dataset_config:
-
-     Clotho-AQA-AQA/test: true
-
-     Clotho-v2-AudioCaptioning/test: true
-     audiocaps-AudioCaptioning/test: true
-
-     FSD50k-EventClassification/test: true
-     CochlScene-SceneClassification/test: true
-     NonSpeech7k-EventClassification/test: true
-     SONYC-UST-EventClassification/test: true
-
-     MELD-EmotionClassification/test: true
-     MELD-SentimentClassification/test: true
-     emov-db-EmotionClassification/val: true
-     jl-corpus-EmotionClassification/val: true
-     tess-EmotionClassification/val: true
-     IEMOCAP-EmotionClassification/val: true
-     OMGEmotion-EmotionClassification/val: true
-     VocalSound-VocalClassification/test: true
-
-     Music-AVQA-AQA_All/test: true
-     MU-LLAMA-AQA/test: true
-
-     LP-MusicCaps-MSD-AudioCaptioning/test: true
-     LP-MusicCaps-MC-AudioCaptioning/test: true
-     LP-MusicCaps-MTT-AudioCaptioning/test: true
-     MusicCaps-AudioCaptioning/test: true
-
-     NSynth-MIR/test: true
-     mtg-jamendo-MusicTagging/val: true
-     musdbhq-InstrClassification/test: true
-
-     # zero shot
-     # CREMA-D-EmotionClassification/train:
-     #   prefix_prob: 1.0
-
-     # ravdess-EmotionClassification/train:
-     #   prefix_prob: 1.0
-
-     # UrbanSound8K-EventClassification/train:
-     #   prefix_prob: 1.0
-
-     # ESC50-EventClassification/train:
-     #   prefix_prob: 1.0
-
-     # DCASE17Task4-SceneClassification/test:
-     #   prefix_prob: 1.0
-
-     # GTZAN-GenreClassification/train:
-     #   prefix_prob: 1.0
-
-     # Medley-solos-DB-InstrClassification/test:
-     #   prefix_prob: 1.0
-
- clap_config:
-   method: nvclap-large
-   audio_embed_dim: 2048
-   checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt
-
-   window_length: 10.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 9 # 1.5 minutes
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-   finetune: true
-
- whisper_config:
-   method: whisper-large-v3
-   path: openai/whisper-large-v3
-   audio_embed_dim: 1280
-   sampling_rate: 16000
-
-   window_length: 30.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 1 # 30 seconds
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-
- mert_config:
-   method: mert-v1
-   path: m-a-p/MERT-v1-330M
-   audio_embed_dim: 1024
-   sampling_rate: 24000
-
-   window_length: 10.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 1 # 10 seconds
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-
- model_config:
-   cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
-
-   lang_encoder_path: Qwen/Qwen2.5-1.5B
-   tokenizer_path: Qwen/Qwen2.5-1.5B
-   cross_attn_every_n_layers: 1
-   audio_transformer_kwargs: {
-     n_head: 8,
-     n_layers: 3,
-     d_inner: 2048,
-     max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
-     max_window_per_audio: 1, # must = max_num_window
-     common_encoder_embed_dim: 1024
-   }
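For reference, a deleted config like the one above can be read back with standard YAML tooling. A minimal sketch, assuming PyYAML and a local copy of the file; the assertion checks the "num_epochs * dataset_blending_global_weight = 1" relation stated in the train_config comment:

import yaml  # PyYAML, assumed installed

with open("configs/inference_1.5.yaml") as f:
    cfg = yaml.safe_load(f)

# The train_config comment asserts num_epochs * dataset_blending_global_weight = 1.
product = cfg["train_config"]["num_epochs"] * cfg["data_config"]["dataset_blending_global_weight"]
assert abs(product - 1.0) < 1e-9, f"unexpected epoch/weight product: {product}"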
configs/inference_2.yaml DELETED
@@ -1,302 +0,0 @@
- train_config:
-   expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers
-   run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-7b-fixed
-   delete_previous_checkpoint: true
-   batch_size: 4
-   gradient_accumulation_steps: 2
-   seed: 42
-   learning_rate: 0.00002
-   lr_scheduler: constant
-   loss_multiplier: 1.0
-   warmup_steps: 1875
-   weight_decay: 0.1
-   precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
-   gradient_checkpointing: False
-   num_epochs: 200 # num_epochs * dataset_blending_global_weight = 1
-   offline: false
-   freeze_lm_embeddings: false
-   logging_steps: 10
-   dist_backend: nccl
-   dist_url: env:// # tcp://localhost:7000
-   no_set_device_rank: false
-   fsdp: true
-   fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
-   fsdp_sharding_strategy: full # full, hybrid
-   horovod: false
-
- # instruction tuning hparams
- sft_config:
-   pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-7b-fixed/
-   pretrained_ckpt: checkpoint_199.pt
-   unfreeze_full_lm: false
-
- data_config:
-   dataset_blending_global_weight: 0.005
-
-   dataset_blending_config:
-
-     MMAUQA/train:
-       weight: 1.5
-
-     AudioSet-Temporal-Speech-Audio-QA/train:
-       weight: 1.0
-
-     CompA-R-AQA/train:
-       weight: 1.0
-
-     # Audio QA
-     Clotho-AQA-AQA/train:
-       weight: 1.0
-
-     OpenAQA-AQA/train:
-       weight: 1.0
-
-     SalmonnQA/train:
-       weight: 0.8
-
-     AudioEntailmentQA/train:
-       weight: 1.0
-
-     # Audio Captioning
-
-     Clotho-v2-AudioCaptioning/train:
-       weight: 1.0
-
-     audiocaps-AudioCaptioning/train:
-       weight: 1.0
-
-     Epidemic_sound-AudioCaptioning/train:
-       weight: 1.0
-
-     MACS-AudioCaptioning/train:
-       weight: 1.0
-
-     # Audio Classification
-
-     UrbanSound8K-EventClassification/train:
-       weight: 0.5
-
-     TUT-EventClassification/train:
-       weight: 2.0
-
-     FSD50k-EventClassification/train:
-       weight: 1.0
-
-     CochlScene-SceneClassification/train:
-       weight: 1.0
-
-     NonSpeech7k-EventClassification/train:
-       weight: 1.0
-
-     chime-home-EventClassification/train:
-       weight: 1.0
-
-     SONYC-UST-EventClassification/train:
-       weight: 1.0
-
-     # Speech Emotion Classification
-
-     MELD-EmotionClassification/train:
-       weight: 0.5
-
-     MELD-SentimentClassification/train:
-       weight: 0.5
-
-     emov-db-EmotionClassification/train:
-       weight: 1.0
-
-     jl-corpus-EmotionClassification/train:
-       weight: 6.0
-
-     tess-EmotionClassification/train:
-       weight: 2.5
-
-     IEMOCAP-EmotionClassification/train:
-       weight: 3.0
-
-     OMGEmotion-EmotionClassification/train:
-       weight: 3.0
-
-     VocalSound-VocalClassification/train:
-       weight: 1.5
-
-     # Music QA
-
-     Music-AVQA-AQA_All/train:
-       weight: 3.0
-
-     MU-LLAMA-AQA/train:
-       weight: 1.0
-
-     # Music Captioning
-
-     LP-MusicCaps-MSD-AudioCaptioning/train:
-       weight: 0.06
-
-     LP-MusicCaps-MC-AudioCaptioning/train:
-       weight: 2.0
-
-     LP-MusicCaps-MTT-AudioCaptioning/train:
-       weight: 1.0
-
-     MusicCaps-AudioCaptioning/train:
-       weight: 6.0
-
-     musdbhq-captioning/train:
-       weight: 2.0
-
-     # Music Understanding
-
-     Medley-solos-DB-InstrClassification/train:
-       weight: 1.5
-
-     GTZAN-GenreClassification/train:
-       weight: 2.0
-
-     NSynth-MIR/train:
-       weight: 0.4
-
-     NSynth-Instrument/train:
-       weight: 1.5
-
-     NSynth-Source/train:
-       weight: 1.5
-
-     mtg-jamendo-MusicTagging/train:
-       weight: 1.0
-
-     FMA-GenreClassification/train:
-       weight: 1.0
-
-     musdbhq-InstrClassification/train:
-       weight: 1.0
-
-     LLARK_FMA-mir/train:
-       weight: 1.0
-
-     LLARK_FMA-reasoning/train:
-       weight: 1.0
-
-     LLARK_MagnaTagATune-mir/train:
-       weight: 1.0
-
-     LLARK_MTG-Jamendo-reasoning/train:
-       weight: 1.0
-
-     LLARK_MagnaTagATune-reasoning/train:
-       weight: 1.0
-
-     LLARK_MTG-Jamendo-mir/train:
-       weight: 1.0
-
-     MusicBenchQA/train:
-       weight: 1.0
-
-   dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
-   data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
-   dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers/dataset_blending.json
-   max_tokens: 512
-   num_workers: 4
-
-   valid_dataset_config:
-
-     Clotho-AQA-AQA/test: true
-
-     Clotho-v2-AudioCaptioning/test: true
-     audiocaps-AudioCaptioning/test: true
-
-     FSD50k-EventClassification/test: true
-     CochlScene-SceneClassification/test: true
-     NonSpeech7k-EventClassification/test: true
-     SONYC-UST-EventClassification/test: true
-
-     MELD-EmotionClassification/test: true
-     MELD-SentimentClassification/test: true
-     emov-db-EmotionClassification/val: true
-     jl-corpus-EmotionClassification/val: true
-     tess-EmotionClassification/val: true
-     IEMOCAP-EmotionClassification/val: true
-     OMGEmotion-EmotionClassification/val: true
-     VocalSound-VocalClassification/test: true
-
-     Music-AVQA-AQA_All/test: true
-     MU-LLAMA-AQA/test: true
-
-     LP-MusicCaps-MSD-AudioCaptioning/test: true
-     LP-MusicCaps-MC-AudioCaptioning/test: true
-     LP-MusicCaps-MTT-AudioCaptioning/test: true
-     MusicCaps-AudioCaptioning/test: true
-
-     NSynth-MIR/test: true
-     mtg-jamendo-MusicTagging/val: true
-     musdbhq-InstrClassification/test: true
-
-     # zero shot
-     # CREMA-D-EmotionClassification/train:
-     #   prefix_prob: 1.0
-
-     # ravdess-EmotionClassification/train:
-     #   prefix_prob: 1.0
-
-     # UrbanSound8K-EventClassification/train:
-     #   prefix_prob: 1.0
-
-     # ESC50-EventClassification/train:
-     #   prefix_prob: 1.0
-
-     # DCASE17Task4-SceneClassification/test:
-     #   prefix_prob: 1.0
-
-     # GTZAN-GenreClassification/train:
-     #   prefix_prob: 1.0
-
-     # Medley-solos-DB-InstrClassification/test:
-     #   prefix_prob: 1.0
-
- clap_config:
-   method: nvclap-large
-   audio_embed_dim: 2048
-   checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt
-
-   window_length: 10.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 9 # 1.5 minutes
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-   finetune: true
-
- whisper_config:
-   method: whisper-large-v3
-   path: openai/whisper-large-v3
-   audio_embed_dim: 1280
-   sampling_rate: 16000
-
-   window_length: 30.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 1 # 30 seconds
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-
- mert_config:
-   method: mert-v1
-   path: m-a-p/MERT-v1-330M
-   audio_embed_dim: 1024
-   sampling_rate: 24000
-
-   window_length: 10.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 1 # 10 seconds
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-
- model_config:
-   cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
-
-   lang_encoder_path: Qwen/Qwen2.5-3B
-   tokenizer_path: Qwen/Qwen2.5-3B
-   cross_attn_every_n_layers: 1
-   audio_transformer_kwargs: {
-     n_head: 8,
-     n_layers: 3,
-     d_inner: 2048,
-     max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
-     max_window_per_audio: 1, # must = max_num_window
-     common_encoder_embed_dim: 1024
-   }
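The weights in dataset_blending_config are relative mixture weights, not probabilities. A sketch of how such weights could be normalized into per-dataset sampling probabilities; this is an illustration under that assumption, not the repository's actual blending code:

# Three of the weights from the config above, as an example.
weights = {
    "MMAUQA/train": 1.5,
    "AudioSet-Temporal-Speech-Audio-QA/train": 1.0,
    "LP-MusicCaps-MSD-AudioCaptioning/train": 0.06,
}
total = sum(weights.values())
probs = {name: w / total for name, w in weights.items()}
for name, p in probs.items():
    print(f"{name}: {p:.3f}")  # e.g. MMAUQA/train gets 1.5 / 2.56 of the draws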
configs/inference_long.yaml DELETED
@@ -1,284 +0,0 @@
- train_config:
-   expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed-sft
-   run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed-sft-long
-   delete_previous_checkpoint: true
-   batch_size: 2
-   gradient_accumulation_steps: 2
-   seed: 42
-   learning_rate: 0.00002
-   lr_scheduler: constant
-   loss_multiplier: 1.0
-   warmup_steps: 1875
-   weight_decay: 0.1
-   precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
-   gradient_checkpointing: False
-   num_epochs: 200 # num_epochs * dataset_blending_global_weight = 1
-   offline: false
-   freeze_lm_embeddings: false
-   logging_steps: 10
-   dist_backend: nccl
-   dist_url: env:// # tcp://localhost:7000
-   no_set_device_rank: false
-   fsdp: true
-   fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
-   fsdp_sharding_strategy: full # full, hybrid
-   horovod: false
-
- # instruction tuning hparams
- # sft_config:
- #   pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed_ckpt_stage1/
- #   pretrained_ckpt: checkpoint_199.pt
- #   unfreeze_full_lm: false
-
- data_config:
-   dataset_blending_global_weight: 0.005
-
-   dataset_blending_config:
-
-     MMAUQA/train:
-       weight: 1.5
-
-     AudioSet-Temporal-Speech-Audio-QA/train:
-       weight: 1.0
-
-     CompA-R-AQA/train:
-       weight: 1.0
-
-     # Audio QA
-     Clotho-AQA-AQA/train:
-       weight: 1.0
-
-     OpenAQA-AQA/train:
-       weight: 1.0
-
-     SalmonnQA/train:
-       weight: 1.0
-
-     AudioEntailmentQA/train:
-       weight: 1.0
-
-     # Audio Captioning
-
-     Clotho-v2-AudioCaptioning/train:
-       weight: 1.0
-
-     audiocaps-AudioCaptioning/train:
-       weight: 1.0
-
-     Epidemic_sound-AudioCaptioning/train:
-       weight: 1.0
-
-     MACS-AudioCaptioning/train:
-       weight: 1.0
-
-     # Audio Classification
-
-     FSD50k-EventClassification/train:
-       weight: 1.0
-
-     CochlScene-SceneClassification/train:
-       weight: 1.0
-
-     NonSpeech7k-EventClassification/train:
-       weight: 1.0
-
-     chime-home-EventClassification/train:
-       weight: 1.0
-
-     SONYC-UST-EventClassification/train:
-       weight: 1.0
-
-     # Speech Emotion Classification
-
-     MELD-EmotionClassification/train:
-       weight: 0.5
-
-     MELD-SentimentClassification/train:
-       weight: 0.5
-
-     emov-db-EmotionClassification/train:
-       weight: 1.0
-
-     jl-corpus-EmotionClassification/train:
-       weight: 6.0
-
-     tess-EmotionClassification/train:
-       weight: 2.5
-
-     IEMOCAP-EmotionClassification/train:
-       weight: 3.0
-
-     OMGEmotion-EmotionClassification/train:
-       weight: 3.0
-
-     VocalSound-VocalClassification/train:
-       weight: 1.5
-
-     # Music QA
-
-     Music-AVQA-AQA_All/train:
-       weight: 3.0
-
-     MU-LLAMA-AQA/train:
-       weight: 1.0
-
-     # Music Captioning
-
-     LP-MusicCaps-MSD-AudioCaptioning/train:
-       weight: 0.06
-
-     LP-MusicCaps-MC-AudioCaptioning/train:
-       weight: 2.0
-
-     LP-MusicCaps-MTT-AudioCaptioning/train:
-       weight: 1.0
-
-     MusicCaps-AudioCaptioning/train:
-       weight: 6.0
-
-     musdbhq-captioning/train:
-       weight: 2.0
-
-     # Music Understanding
-
-     NSynth-MIR/train:
-       weight: 0.2
-
-     mtg-jamendo-MusicTagging/train:
-       weight: 0.1
-
-     FMA-GenreClassification/train:
-       weight: 0.5
-
-     musdbhq-InstrClassification/train:
-       weight: 0.8
-
-     LLARK_FMA-mir/train:
-       weight: 1.0
-
-     LLARK_FMA-reasoning/train:
-       weight: 1.0
-
-     LLARK_MagnaTagATune-mir/train:
-       weight: 1.0
-
-     LLARK_MTG-Jamendo-reasoning/train:
-       weight: 1.0
-
-     LLARK_MagnaTagATune-reasoning/train:
-       weight: 1.0
-
-     LLARK_MTG-Jamendo-mir/train:
-       weight: 1.0
-
-     MusicBenchQA/train:
-       weight: 1.0
-
-   dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
-   data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
-   dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/dataset_blending.json
-   max_tokens: 512
-   num_workers: 4
-
-   valid_dataset_config:
-
-     Clotho-AQA-AQA/test: true
-
-     Clotho-v2-AudioCaptioning/test: true
-     audiocaps-AudioCaptioning/test: true
-
-     FSD50k-EventClassification/test: true
-     CochlScene-SceneClassification/test: true
-     NonSpeech7k-EventClassification/test: true
-     SONYC-UST-EventClassification/test: true
-
-     MELD-EmotionClassification/test: true
-     MELD-SentimentClassification/test: true
-     emov-db-EmotionClassification/val: true
-     jl-corpus-EmotionClassification/val: true
-     tess-EmotionClassification/val: true
-     IEMOCAP-EmotionClassification/val: true
-     OMGEmotion-EmotionClassification/val: true
-     VocalSound-VocalClassification/test: true
-
-     Music-AVQA-AQA_All/test: true
-     MU-LLAMA-AQA/test: true
-
-     LP-MusicCaps-MSD-AudioCaptioning/test: true
-     LP-MusicCaps-MC-AudioCaptioning/test: true
-     LP-MusicCaps-MTT-AudioCaptioning/test: true
-     MusicCaps-AudioCaptioning/test: true
-
-     NSynth-MIR/test: true
-     mtg-jamendo-MusicTagging/val: true
-     musdbhq-InstrClassification/test: true
-
-     # # zero shot
-     # CREMA-D-EmotionClassification/train:
-     #   prefix_prob: 1.0
-
-     # ravdess-EmotionClassification/train:
-     #   prefix_prob: 1.0
-
-     # UrbanSound8K-EventClassification/train:
-     #   prefix_prob: 1.0
-
-     # ESC50-EventClassification/train:
-     #   prefix_prob: 1.0
-
-     # DCASE17Task4-SceneClassification/test:
-     #   prefix_prob: 1.0
-
-     # GTZAN-GenreClassification/train:
-     #   prefix_prob: 1.0
-
-     # Medley-solos-DB-InstrClassification/test:
-     #   prefix_prob: 1.0
-
- clap_config:
-   method: nvclap-large
-   audio_embed_dim: 2048
-   checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt
-
-   window_length: 10.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 30 # 5 minutes
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-   finetune: true
-
- whisper_config:
-   method: whisper-large-v3
-   path: openai/whisper-large-v3
-   audio_embed_dim: 1280
-   sampling_rate: 16000
-
-   window_length: 30.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 1 # 30 seconds
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-
- mert_config:
-   method: mert-v1
-   path: m-a-p/MERT-v1-330M
-   audio_embed_dim: 1024
-   sampling_rate: 24000
-
-   window_length: 10.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 1 # 10 seconds
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-
- model_config:
-   cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
-
-   lang_encoder_path: Qwen/Qwen2.5-3B
-   tokenizer_path: Qwen/Qwen2.5-3B
-   cross_attn_every_n_layers: 1
-   audio_transformer_kwargs: {
-     n_head: 8,
-     n_layers: 3,
-     d_inner: 2048,
-     max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
-     max_window_per_audio: 1, # must = max_num_window
-     common_encoder_embed_dim: 1024
-   }
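The duration comments in these encoder configs follow directly from the window arithmetic: with window_length 10.0 s and window_overlap 0.0 s, max_num_window: 30 covers 300 s, i.e. 5 minutes. A small helper making that explicit (the function is illustrative, not from the repo):

def max_coverage_seconds(window_length, window_overlap, max_num_window):
    """Longest audio span covered by max_num_window sliding windows."""
    hop = window_length - window_overlap
    return window_length + hop * (max_num_window - 1)

assert max_coverage_seconds(10.0, 0.0, 30) == 300.0  # clap_config: 5 minutes
assert max_coverage_seconds(30.0, 0.0, 1) == 30.0    # whisper_config: 30 seconds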
configs/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node.yaml DELETED
@@ -1,255 +0,0 @@
- train_config:
-   expdir: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/v1.0_optimlmax1.3b_foundation
-   run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node
-   delete_previous_checkpoint: true
-   batch_size: 6
-   gradient_accumulation_steps: 2 # 4 nodes
-   seed: 42
-   learning_rate: 0.0001
-   lr_scheduler: constant
-   loss_multiplier: 1.0
-   warmup_steps: 1875
-   weight_decay: 0.1
-   precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
-   gradient_checkpointing: False
-   num_epochs: 100 # num_epochs * dataset_blending_global_weight = 1
-   offline: false
-   freeze_lm_embeddings: true
-   logging_steps: 10
-   dist_backend: nccl
-   dist_url: env:// # tcp://localhost:7000
-   no_set_device_rank: false
-   fsdp: true
-   fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
-   fsdp_sharding_strategy: full # full, hybrid
-   horovod: false
-
- data_config:
-   dataset_blending_global_weight: 0.01
-
-   dataset_blending_config:
-
-     # Audio QA
-     OpenAQA-AQA/train:
-       weight: 1.0
-       prefix_prob: 0.0
-       augmentations:
-         do_nothing: 1.0
-
-     # Audio Captioning
-
-     BBCSoundEffects-AudioDescription/train:
-       weight: 5.0
-       prefix_prob: 0.5
-       augmentations:
-         do_nothing: 1.0
-
-     CLAP_freesound-AudioCaptioning/train:
-       weight: 1.0
-       prefix_prob: 0.5
-       augmentations:
-         do_nothing: 1.0
-
-     SoundDescs-AudioDescription/train:
-       weight: 1.0
-       prefix_prob: 0.5
-       augmentations:
-         do_nothing: 1.0
-
-     WavCaps-AudioSet_SL-AudioCaptioning/train:
-       weight: 1.0
-       prefix_prob: 0.5
-       augmentations:
-         do_nothing: 1.0
-
-     WavCaps-BBC_Sound_Effects-AudioCaptioning/train:
-       weight: 2
-       prefix_prob: 0.5
-       augmentations:
-         do_nothing: 1.0
-
-     WavCaps-FreeSound-AudioCaptioning/train:
-       weight: 2
-       prefix_prob: 0.5
-       augmentations:
-         do_nothing: 1.0
-
-     WavCaps-SoundBible-AudioCaptioning/train:
-       weight: 5
-       prefix_prob: 0.5
-       augmentations:
-         do_nothing: 1.0
-
-     # Audio Classification
-
-     AudioSetFullwoAudioMusicCaps-EventClassification/train:
-       weight: 1.0
-       prefix_prob: 0.5
-       augmentations:
-         num_words: 0.8
-         do_nothing: 0.2
-
-     AudioSet-EventClassification/train:
-       weight: 5.0
-       prefix_prob: 0.5
-       augmentations:
-         num_words: 0.8
-         do_nothing: 0.2
-
-     Clotho-AQA-EventClassification/train:
-       weight: 5.0
-       prefix_prob: 0.5
-       augmentations:
-         num_words: 0.8
-         do_nothing: 0.2
-
-     WavText5K-Tagging/train:
-       weight: 3.0
-       prefix_prob: 0.5
-       augmentations:
-         num_words: 0.8
-         do_nothing: 0.2
-
-     # Speech Emotion Classification
-
-     MSP-PODCAST-Publish-1.9-EmotionClassification/train:
-       weight: 1.8
-       prefix_prob: 0.5
-       augmentations:
-         provide_all_labels: 0.9
-         do_nothing: 0.1
-     MSP-PODCAST-Publish-1.9-EmotionClassification/interleaved_knn-train:
-       weight: 1.2
-       prefix_prob: 0.5
-       augmentations:
-         provide_all_labels: 0.9
-         do_nothing: 0.1
-
-     MELD-EmotionClassification/train:
-       weight: 1.8
-       prefix_prob: 0.5
-       augmentations:
-         provide_all_labels: 0.9
-         do_nothing: 0.1
-     MELD-EmotionClassification/interleaved_knn-train:
-       weight: 1.2
-       prefix_prob: 0.5
-       augmentations:
-         provide_all_labels: 0.9
-         do_nothing: 0.1
-
-     MELD-SentimentClassification/train:
-       weight: 1.8
-       prefix_prob: 0.5
-       augmentations:
-         provide_all_labels: 0.9
-         do_nothing: 0.1
-     MELD-SentimentClassification/interleaved_knn-train:
-       weight: 1.2
-       prefix_prob: 0.5
-       augmentations:
-         provide_all_labels: 0.9
-         do_nothing: 0.1
-
-     # Music QA
-
-     Music-AVQA-AVQA_All/train:
-       weight: 3.0
-       prefix_prob: 0.5
-       augmentations:
-         AQA_binary_instruction: 1.0
-
-     MU-LLAMA-AQA/train:
-       weight: 1.8
-       prefix_prob: 0.5
-       augmentations:
-         do_nothing: 1.0
-     MU-LLAMA-AQA/interleaved_knn-train:
-       weight: 1.2
-       prefix_prob: 0.5
-       augmentations:
-         do_nothing: 1.0
-
-     # Music Captioning
-
-     LP-MusicCaps-MSD-AudioCaptioning/train:
-       weight: 1.0
-       prefix_prob: 0.5
-       augmentations:
-         do_nothing: 1.0
-
-     # Music Understanding
-
-     NSynth-MIR/train:
-       weight: 0.6
-       prefix_prob: 0.5
-       augmentations:
-         do_nothing: 1.0
-     NSynth-MIR/interleaved_knn-train:
-       weight: 0.4
-       prefix_prob: 0.5
-       augmentations:
-         do_nothing: 1.0
-
-     mtg-jamendo-MusicTagging/train:
-       weight: 1.0
-       prefix_prob: 0.5
-       augmentations:
-         do_nothing: 1.0
-
-   dataset_file_root: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/dataset_files
-   data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
-   dataset_blending_output: dataset_blending.json
-   max_tokens: 512
-   num_workers: 4
-
-   valid_dataset_config:
-     CLAP_freesound-AudioCaptioning/test: true
-     SoundDescs-AudioDescription/test: true
-     Clotho-AQA-EventClassification/test: true
-
-     MSP-PODCAST-Publish-1.9-EmotionClassification/test: true
-     MSP-PODCAST-Publish-1.9-EmotionClassification/interleaved_knn-test: true
-     MELD-EmotionClassification/test: true
-     MELD-EmotionClassification/interleaved_knn-test: true
-     MELD-SentimentClassification/test: true
-     MELD-SentimentClassification/interleaved_knn-test: true
-
-     MU-LLAMA-AQA/test: true
-     LP-MusicCaps-MSD-AudioCaptioning/val: true
-     NSynth-MIR/test: true
-     NSynth-MIR/interleaved_knn-test: true
-     mtg-jamendo-MusicTagging/val: true
-
- clap_config:
-   # method: laion-clap
-   # audio_embed_dim: 512
-   # model_name: 630k-fusion-best
-   # checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/laion-clap-pretrained/laion_clap/630k-fusion-best.pt
-
-   method: microsoft-clap
-   audio_embed_dim: 1024
-   config_root: /home/zkong/audio_flamingo/audio_flamingo_v1/v1.0_optimlmax1.3b_foundation/my_ms_clap/src/configs
-   # model_name: '2023'
-   # checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/clap/CLAP_weights_2023.pth
-   model_name: 'clapcap'
-   checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/clap/clapcap_weights_2023.pth
-
-   window_length: 7.0 # seconds
-   window_overlap: 5.25 # seconds
-   max_num_window: 16 # 35 seconds
-   max_num_fewshot: 4 # number of fewshot samples (including the final one)
-
- model_config:
-   cache_dir: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/LLM_pretrained/.cache
-
-   lang_encoder_path: facebook/opt-iml-max-1.3b
-   tokenizer_path: facebook/opt-iml-max-1.3b
-   cross_attn_every_n_layers: 1
-   audio_transformer_kwargs: {
-     n_head: 8,
-     n_layers: 3,
-     d_inner: 2048,
-     max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
-     max_window_per_audio: 16, # must = max_num_window
-   }
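This config uses overlapping CLAP windows: 7.0 s windows with 5.25 s overlap give a 1.75 s hop, so 16 windows span 7.0 + 15 * 1.75 = 33.25 s, matching the "# 35 seconds" comment up to rounding. A sketch of slicing a waveform accordingly (NumPy assumed; this is illustrative, not the repo's loader):

import numpy as np

def make_windows(wav, sr, window_length, window_overlap, max_num_window):
    win = int(window_length * sr)                     # samples per window
    hop = int((window_length - window_overlap) * sr)  # samples between window starts
    starts = range(0, max(len(wav) - win + 1, 1), hop)
    return [wav[s:s + win] for s in list(starts)[:max_num_window]]

wav = np.zeros(35 * 16000)  # 35 s of silence at 16 kHz
print(len(make_windows(wav, 16000, 7.0, 5.25, 16)))  # -> 16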
configs/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node.yaml DELETED
@@ -1,183 +0,0 @@
- train_config:
-   expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed
-   run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed
-   delete_previous_checkpoint: true
-   batch_size: 4
-   gradient_accumulation_steps: 2 # 4 nodes
-   seed: 42
-   learning_rate: 0.0001
-   lr_scheduler: constant
-   loss_multiplier: 1.0
-   warmup_steps: 1875
-   weight_decay: 0.1
-   precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
-   gradient_checkpointing: False
-   num_epochs: 200 # num_epochs * dataset_blending_global_weight = 1
-   offline: false
-   freeze_lm_embeddings: false
-   logging_steps: 10
-   dist_backend: nccl
-   dist_url: env:// # tcp://localhost:7000
-   no_set_device_rank: false
-   fsdp: true
-   fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
-   fsdp_sharding_strategy: full # full, hybrid
-   horovod: false
-
- data_config:
-   dataset_blending_global_weight: 0.005
-
-   dataset_blending_config:
-
-     # Audio QA
-     OpenAQA-AQA/train:
-       weight: 1.0
-
-     AudioSet-Temporal-Speech-Audio-QA/train:
-       weight: 2.0
-
-     CompA-R-AQA/train:
-       weight: 2.0
-
-     # Audio Captioning
-
-     BBCSoundEffects-AudioDescription/train:
-       weight: 5.0
-
-     CLAP_freesound-AudioCaptioning/train:
-       weight: 1.0
-
-     SoundDescs-AudioDescription/train:
-       weight: 1.0
-
-     WavCaps-AudioSet_SL-AudioCaptioning/train:
-       weight: 1.0
-
-     WavCaps-BBC_Sound_Effects-AudioCaptioning/train:
-       weight: 2.0
-
-     WavCaps-FreeSound-AudioCaptioning/train:
-       weight: 2.0
-
-     WavCaps-SoundBible-AudioCaptioning/train:
-       weight: 5.0
-
-     Ego-10-AudioCaptioning/train:
-       weight: 2.0
-
-     Ego-30-AudioCaptioning/train:
-       weight: 2.0
-
-     # Audio Classification
-
-     AudioSetFullwoAudioMusicCaps-EventClassification/train:
-       weight: 1.0
-
-     AudioSet-EventClassification/train:
-       weight: 5.0
-
-     Clotho-AQA-EventClassification/train:
-       weight: 5.0
-
-     WavText5K-Tagging/train:
-       weight: 3.0
-
-     # Speech Emotion Classification
-
-     MSP-PODCAST-Publish-1.9-EmotionClassification/train:
-       weight: 3.0
-
-     MELD-EmotionClassification/train:
-       weight: 3.0
-
-     MELD-SentimentClassification/train:
-       weight: 3.0
-
-     # Music QA
-
-     Music-AVQA-AVQA_All/train:
-       weight: 3.0
-
-     MU-LLAMA-AQA/train:
-       weight: 3.0
-
-     # Music Captioning
-
-     LP-MusicCaps-MSD-AudioCaptioning/train:
-       weight: 1.0
-
-     # Music Understanding
-
-     NSynth-MIR/train:
-       weight: 1.0
-
-     mtg-jamendo-MusicTagging/train:
-       weight: 1.0
-
-   dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
-   data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
-   dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/dataset_blending.json
-   max_tokens: 512
-   num_workers: 4
-
-   valid_dataset_config:
-     CLAP_freesound-AudioCaptioning/test: true
-     SoundDescs-AudioDescription/test: true
-     Clotho-AQA-EventClassification/test: true
-
-     MSP-PODCAST-Publish-1.9-EmotionClassification/test: true
-     MELD-EmotionClassification/test: true
-     MELD-SentimentClassification/test: true
-
-     MU-LLAMA-AQA/test: true
-     LP-MusicCaps-MSD-AudioCaptioning/val: true
-     NSynth-MIR/test: true
-     mtg-jamendo-MusicTagging/val: true
-
- clap_config:
-   method: nvclap-large
-   audio_embed_dim: 2048
-   checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt
-
-   window_length: 10.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 3 # 30 seconds
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-
- whisper_config:
-   method: whisper-large-v3
-   path: openai/whisper-large-v3
-   audio_embed_dim: 1280
-   sampling_rate: 16000
-
-   window_length: 30.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 1 # 30 seconds
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-   finetune: true
-
- mert_config:
-   method: mert-v1
-   path: m-a-p/MERT-v1-330M
-   audio_embed_dim: 1024
-   sampling_rate: 24000
-
-   window_length: 10.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 1 # 10 seconds
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-
- model_config:
-   cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
-
-   lang_encoder_path: Qwen/Qwen2.5-3B
-   tokenizer_path: Qwen/Qwen2.5-3B
-   cross_attn_every_n_layers: 1
-   audio_transformer_kwargs: {
-     n_head: 8,
-     n_layers: 3,
-     d_inner: 2048,
-     max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
-     max_window_per_audio: 1, # must = max_num_window
-     common_encoder_embed_dim: 1024
-   }
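The comments in audio_transformer_kwargs encode a capacity constraint: max_num_media must be at least max_num_window * max_num_fewshot. A sketch of the corresponding sanity check (the function is an assumption for illustration, not repo code):

def check_media_capacity(clap_cfg, transformer_kwargs):
    needed = clap_cfg["max_num_window"] * clap_cfg["max_num_fewshot"]
    if transformer_kwargs["max_num_media"] < needed:
        raise ValueError(f"max_num_media must be >= {needed}")

check_media_capacity({"max_num_window": 3, "max_num_fewshot": 1},
                     {"max_num_media": 128})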
configs/run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node.yaml DELETED
@@ -1,483 +0,0 @@
- train_config:
-   expdir: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/v1.0_optimlmax1.3b_foundation
-   run_name: run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node
-   delete_previous_checkpoint: true
-   batch_size: 4
-   gradient_accumulation_steps: 1
-   seed: 42
-   learning_rate: 0.00002
-   lr_scheduler: constant
-   loss_multiplier: 1.0
-   warmup_steps: 1875
-   weight_decay: 0.1
-   precision: fp32 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
-   gradient_checkpointing: False
-   num_epochs: 160 # num_epochs * dataset_blending_global_weight = 1
-   offline: false
-   freeze_lm_embeddings: false
-   logging_steps: 10
-   dist_backend: nccl
-   dist_url: env:// # tcp://localhost:7000
-   no_set_device_rank: false
-   fsdp: true
-   fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
-   fsdp_sharding_strategy: full # full, hybrid
-   horovod: false
-
- # instruction tuning hparams
- sft_config:
-   pretrained_path: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/v1.0_optimlmax1.3b_foundation/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node/
-   pretrained_ckpt: checkpoint_99.pt
-   unfreeze_full_lm: true
-
- data_config:
-   dataset_blending_global_weight: 0.01
-
-   dataset_blending_config:
-
-     # Audio QA
-     Clotho-AQA-AQA/train:
-       weight: 0.8
-       prefix_prob: 1.0
-       augmentations:
-         AQA_binary_instruction: 1.0
-     Clotho-AQA-AQA/interleaved_knn-train:
-       weight: 0.2
-       prefix_prob: 1.0
-       augmentations:
-         AQA_binary_instruction: 1.0
-
-     OpenAQA-AQA/train:
-       weight: 1.0
-       prefix_prob: 1.0
-       augmentations:
-         do_nothing: 1.0
-
-     # Audio Captioning
-
-     Clotho-v2-AudioCaptioning/train:
-       weight: 0.8
-       prefix_prob: 1.0
-       augmentations:
-         AC_short: 1.0
-     Clotho-v2-AudioCaptioning/interleaved_knn-train:
-       weight: 0.2
-       prefix_prob: 1.0
-       augmentations:
-         AC_short: 1.0
-
-     audiocaps-AudioCaptioning/train:
-       weight: 0.8
-       prefix_prob: 1.0
-       augmentations:
-         AC_short: 1.0
-     audiocaps-AudioCaptioning/interleaved_knn-train:
-       weight: 0.2
-       prefix_prob: 1.0
-       augmentations:
-         AC_short: 1.0
-
-     Epidemic_sound-AudioCaptioning/train:
-       weight: 0.8
-       prefix_prob: 1.0
-       augmentations:
-         AC_short: 1.0
-     Epidemic_sound-AudioCaptioning/interleaved_knn-train:
-       weight: 0.2
-       prefix_prob: 1.0
-       augmentations:
-         AC_short: 1.0
-
-     MACS-AudioCaptioning/train:
-       weight: 0.8
-       prefix_prob: 1.0
-       augmentations:
-         AC_short: 1.0
-     MACS-AudioCaptioning/interleaved_knn-train:
-       weight: 0.2
-       prefix_prob: 1.0
-       augmentations:
-         AC_short: 1.0
-
-     # Audio Classification
-
-     FSD50k-EventClassification/train:
-       weight: 0.8
-       prefix_prob: 1.0
-       augmentations:
-         default: 1.0
-     FSD50k-EventClassification/interleaved_knn-train:
-       weight: 0.2
-       prefix_prob: 1.0
-       augmentations:
-         default: 1.0
-
-     CochlScene-SceneClassification/train:
-       weight: 0.8
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-     CochlScene-SceneClassification/interleaved_knn-train:
-       weight: 0.2
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-
-     NonSpeech7k-EventClassification/train:
-       weight: 0.8
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-     NonSpeech7k-EventClassification/interleaved_knn-train:
-       weight: 0.2
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-
-     chime-home-EventClassification/train:
-       weight: 0.8
-       prefix_prob: 1.0
-       augmentations:
-         default: 0.5
-         num_words: 0.5
-     chime-home-EventClassification/interleaved_knn-train:
-       weight: 0.2
-       prefix_prob: 1.0
-       augmentations:
-         default: 0.5
-         num_words: 0.5
-
-     SONYC-UST-EventClassification/train:
-       weight: 0.8
-       prefix_prob: 1.0
-       augmentations:
-         default: 0.5
-         num_words: 0.5
-     SONYC-UST-EventClassification/interleaved_knn-train:
-       weight: 0.2
-       prefix_prob: 1.0
-       augmentations:
-         default: 0.5
-         num_words: 0.5
-
-     # Speech Emotion Classification
-
-     MELD-EmotionClassification/train:
-       weight: 0.5
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-
-     MELD-SentimentClassification/train:
-       weight: 0.5
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.1
-         default: 0.9
-
-     emov-db-EmotionClassification/train:
-       weight: 1.6
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-     emov-db-EmotionClassification/interleaved_knn-train:
-       weight: 0.4
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-
-     jl-corpus-EmotionClassification/train:
-       weight: 6.0
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-     jl-corpus-EmotionClassification/interleaved_knn-train:
-       weight: 1.5
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-
-     tess-EmotionClassification/train:
-       weight: 2.0
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-     tess-EmotionClassification/interleaved_knn-train:
-       weight: 0.5
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-
-     IEMOCAP-EmotionClassification/train:
-       weight: 2.4
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-     IEMOCAP-EmotionClassification/interleaved_knn-train:
-       weight: 0.6
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-
-     OMGEmotion-EmotionClassification/train:
-       weight: 3.0
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-
-     VocalSound-VocalClassification/train:
-       weight: 1.0
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-
-     # Music QA
-
-     Music-AVQA-AQA_All/train:
-       weight: 2.0
-       prefix_prob: 1.0
-       augmentations:
-         AQA_binary_instruction: 1.0
-     Music-AVQA-AQA_All/interleaved_knn-train:
-       weight: 1.0
-       prefix_prob: 1.0
-       augmentations:
-         AQA_binary_instruction: 1.0
-
-     MU-LLAMA-AQA/train:
-       weight: 0.9
-       prefix_prob: 1.0
-       augmentations:
-         do_nothing: 1.0
-     MU-LLAMA-AQA/interleaved_knn-train:
-       weight: 0.1
-       prefix_prob: 1.0
-       augmentations:
-         do_nothing: 1.0
-
-     # Music Captioning
-
-     LP-MusicCaps-MSD-AudioCaptioning/train:
-       weight: 0.05 # 1.3M
-       prefix_prob: 1.0
-       augmentations:
-         AC_paragraph: 1.0
-     LP-MusicCaps-MSD-AudioCaptioning/interleaved_knn-train:
-       weight: 0.05 # 111k
-       prefix_prob: 1.0
-       augmentations:
-         AC_paragraph: 1.0
-
-     LP-MusicCaps-MC-AudioCaptioning/train:
-       weight: 1.6
-       prefix_prob: 1.0
-       augmentations:
-         AC_paragraph: 1.0
-     LP-MusicCaps-MC-AudioCaptioning/interleaved_knn-train:
-       weight: 0.4
-       prefix_prob: 1.0
-       augmentations:
-         AC_paragraph: 1.0
-
-     LP-MusicCaps-MTT-AudioCaptioning/train:
-       weight: 0.8
-       prefix_prob: 1.0
-       augmentations:
-         AC_long: 1.0
-     LP-MusicCaps-MTT-AudioCaptioning/interleaved_knn-train:
-       weight: 0.2
-       prefix_prob: 1.0
-       augmentations:
-         AC_long: 1.0
-
-     MusicCaps-AudioCaptioning/train:
-       weight: 6.0
-       prefix_prob: 1.0
-       augmentations:
-         AC_paragraph: 1.0
-     MusicCaps-AudioCaptioning/interleaved_knn-train:
-       weight: 1.5
-       prefix_prob: 1.0
-       augmentations:
-         AC_paragraph: 1.0
-
-     SongDescriber-AudioCaptioning/train:
-       weight: 0.8
-       prefix_prob: 1.0
-       augmentations:
-         AC_long: 1.0
-     SongDescriber-AudioCaptioning/interleaved_knn-train:
-       weight: 0.2
-       prefix_prob: 1.0
-       augmentations:
-         AC_long: 1.0
-
-     # Music Understanding
-
-     NSynth-MIR/train:
-       weight: 0.2 # 289k for weight = 1
-       prefix_prob: 1.0
-       augmentations:
-         do_nothing: 1.0
-     NSynth-MIR/interleaved_knn-train:
-       weight: 0.2 # 60k for weight = 1
-       prefix_prob: 1.0
-       augmentations:
-         do_nothing: 1.0
-
-     mtg-jamendo-MusicTagging/train:
-       weight: 0.1
-       prefix_prob: 1.0
-       augmentations:
-         default: 1.0
-
-     FMA-GenreClassification/train:
-       weight: 0.4 # 104k for weight = 1
-       prefix_prob: 1.0
-       augmentations:
-         do_nothing: 1.0
-     FMA-GenreClassification/interleaved_knn-train:
-       weight: 0.3 # 46k for weight = 1
-       prefix_prob: 1.0
-       augmentations:
-         do_nothing: 1.0
-
-     musdbhq-InstrClassification/train:
-       weight: 0.8
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 0.5
-         default: 0.5
-
-   dataset_file_root: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/dataset_files
-   data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
-   dataset_blending_output: dataset_blending.json
-   max_tokens: 512
-   num_workers: 4
-
-   valid_dataset_config:
-
-     Clotho-AQA-AQA/test: true
-     Clotho-AQA-AQA/interleaved_knn-test: true
-
-     Clotho-v2-AudioCaptioning/test: true
-     Clotho-v2-AudioCaptioning/interleaved_knn-test: true
-
-     FSD50k-EventClassification/test: true
-     FSD50k-EventClassification/interleaved_knn-test: true
-
-     CochlScene-SceneClassification/test: true
-     CochlScene-SceneClassification/interleaved_knn-test: true
-
-     NonSpeech7k-EventClassification/test: true
-     NonSpeech7k-EventClassification/interleaved_knn-test: true
-
-     SONYC-UST-EventClassification/test: true
-     SONYC-UST-EventClassification/interleaved_knn-test: true
-
-     emov-db-EmotionClassification/val: true
-     emov-db-EmotionClassification/interleaved_knn-val: true
-
-     jl-corpus-EmotionClassification/val: true
-     jl-corpus-EmotionClassification/interleaved_knn-val: true
-
-     tess-EmotionClassification/val: true
-     tess-EmotionClassification/interleaved_knn-val: true
-
-     IEMOCAP-EmotionClassification/test: true
-     IEMOCAP-EmotionClassification/interleaved_knn-test: true
-
-     OMGEmotion-EmotionClassification/val: true
-
-     Music-AVQA-AQA_All/test: true
-     Music-AVQA-AQA_All/interleaved_knn-test: true
-
-     MU-LLAMA-AQA/test: true
-
-     LP-MusicCaps-MSD-AudioCaptioning/test: true
-     LP-MusicCaps-MC-AudioCaptioning/test: true
-     LP-MusicCaps-MTT-AudioCaptioning/test: true
-     LP-MusicCaps-MTT-AudioCaptioning/interleaved_knn-test: true
-
-     NSynth-MIR/test: true
-     NSynth-MIR/interleaved_knn-test: true
-
-     mtg-jamendo-MusicTagging/val: true
-
-     audiocaps-AudioCaptioning/test: true
-     audiocaps-AudioCaptioning/interleaved_knn-test: true
-
-     MusicCaps-AudioCaptioning/test: true
-
-     MELD-EmotionClassification/test: true
-     MELD-SentimentClassification/test: true
-     VocalSound-VocalClassification/test: true
-     musdbhq-InstrClassification/test: true
-
-     # zero shot
-
-     GTZAN-GenreClassification/train:
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 1.0
-     GTZAN-GenreClassification/interleaved_knn-train:
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 1.0
-
-     Medley-solos-DB-InstrClassification/test:
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 1.0
-     Medley-solos-DB-InstrClassification/interleaved_knn-test:
-       prefix_prob: 1.0
-       augmentations:
-         provide_all_labels: 1.0
-
- clap_config:
-   # method: laion-clap
-   # audio_embed_dim: 512
-   # model_name: 630k-fusion-best
-   # checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/laion-clap-pretrained/laion_clap/630k-fusion-best.pt
-
-   method: microsoft-clap
-   audio_embed_dim: 1024
-   config_root: /home/zkong/audio_flamingo/audio_flamingo_v1/v1.0_optimlmax1.3b_foundation/my_ms_clap/src/configs
-   # model_name: '2023'
-   # checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/clap/CLAP_weights_2023.pth
-   model_name: 'clapcap'
-   checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/clap/clapcap_weights_2023.pth
-
-   window_length: 7.0 # seconds
-   window_overlap: 5.25 # seconds
-   max_num_window: 16 # 35 seconds
-   max_num_fewshot: 4 # number of fewshot samples (including the final one)
-
- model_config:
-   cache_dir: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/LLM_pretrained/.cache
-
-   lang_encoder_path: facebook/opt-iml-max-1.3b
-   tokenizer_path: facebook/opt-iml-max-1.3b
-   cross_attn_every_n_layers: 1
-   audio_transformer_kwargs: {
-     n_head: 8,
-     n_layers: 3,
-     d_inner: 2048,
-     max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
-     max_window_per_audio: 16, # must = max_num_window
-   }
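Each dataset entry in this SFT config carries an "augmentations" mapping whose values read as selection probabilities, alongside a prefix_prob. A sketch of sampling one augmentation per example under that reading (illustrative, not the repository's sampler):

import random

def pick_augmentation(augmentations):
    names = list(augmentations)
    return random.choices(names, weights=list(augmentations.values()), k=1)[0]

print(pick_augmentation({"provide_all_labels": 0.5, "default": 0.5}))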
configs/run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node.yaml DELETED
@@ -1,284 +0,0 @@
- train_config:
-   expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed
-   run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed-sft
-   delete_previous_checkpoint: true
-   batch_size: 4
-   gradient_accumulation_steps: 2
-   seed: 42
-   learning_rate: 0.00002
-   lr_scheduler: constant
-   loss_multiplier: 1.0
-   warmup_steps: 1875
-   weight_decay: 0.1
-   precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
-   gradient_checkpointing: False
-   num_epochs: 200 # num_epochs * dataset_blending_global_weight = 1
-   offline: false
-   freeze_lm_embeddings: false
-   logging_steps: 10
-   dist_backend: nccl
-   dist_url: env:// # tcp://localhost:7000
-   no_set_device_rank: false
-   fsdp: true
-   fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
-   fsdp_sharding_strategy: full # full, hybrid
-   horovod: false
-
- # instruction tuning hparams
- sft_config:
-   pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed_ckpt_stage1/
-   pretrained_ckpt: checkpoint_199.pt
-   unfreeze_full_lm: false
-
- data_config:
-   dataset_blending_global_weight: 0.005
-
-   dataset_blending_config:
-
-     MMAUQA/train:
-       weight: 1.5
-
-     AudioSet-Temporal-Speech-Audio-QA/train:
-       weight: 1.0
-
-     CompA-R-AQA/train:
-       weight: 1.0
-
-     # Audio QA
-     Clotho-AQA-AQA/train:
-       weight: 1.0
-
-     OpenAQA-AQA/train:
-       weight: 1.0
-
-     SalmonnQA/train:
-       weight: 1.0
-
-     AudioEntailmentQA/train:
-       weight: 1.0
-
-     # Audio Captioning
-
-     Clotho-v2-AudioCaptioning/train:
-       weight: 1.0
-
-     audiocaps-AudioCaptioning/train:
-       weight: 1.0
-
-     Epidemic_sound-AudioCaptioning/train:
-       weight: 1.0
-
-     MACS-AudioCaptioning/train:
-       weight: 1.0
-
-     # Audio Classification
-
-     FSD50k-EventClassification/train:
-       weight: 1.0
-
-     CochlScene-SceneClassification/train:
-       weight: 1.0
-
-     NonSpeech7k-EventClassification/train:
-       weight: 1.0
-
-     chime-home-EventClassification/train:
-       weight: 1.0
-
-     SONYC-UST-EventClassification/train:
-       weight: 1.0
-
-     # Speech Emotion Classification
-
-     MELD-EmotionClassification/train:
-       weight: 0.5
-
-     MELD-SentimentClassification/train:
-       weight: 0.5
-
-     emov-db-EmotionClassification/train:
-       weight: 1.0
-
-     jl-corpus-EmotionClassification/train:
-       weight: 6.0
-
-     tess-EmotionClassification/train:
-       weight: 2.5
-
-     IEMOCAP-EmotionClassification/train:
-       weight: 3.0
-
-     OMGEmotion-EmotionClassification/train:
-       weight: 3.0
-
-     VocalSound-VocalClassification/train:
-       weight: 1.5
-
-     # Music QA
-
-     Music-AVQA-AQA_All/train:
-       weight: 3.0
-
-     MU-LLAMA-AQA/train:
-       weight: 1.0
-
-     # Music Captioning
-
-     LP-MusicCaps-MSD-AudioCaptioning/train:
-       weight: 0.06
-
-     LP-MusicCaps-MC-AudioCaptioning/train:
-       weight: 2.0
-
-     LP-MusicCaps-MTT-AudioCaptioning/train:
-       weight: 1.0
-
-     MusicCaps-AudioCaptioning/train:
-       weight: 6.0
-
-     musdbhq-captioning/train:
-       weight: 2.0
-
-     # Music Understanding
-
-     NSynth-MIR/train:
-       weight: 0.2
-
-     mtg-jamendo-MusicTagging/train:
-       weight: 0.1
-
-     FMA-GenreClassification/train:
-       weight: 0.5
-
-     musdbhq-InstrClassification/train:
-       weight: 0.8
-
-     LLARK_FMA-mir/train:
-       weight: 1.0
-
-     LLARK_FMA-reasoning/train:
-       weight: 1.0
-
-     LLARK_MagnaTagATune-mir/train:
-       weight: 1.0
-
-     LLARK_MTG-Jamendo-reasoning/train:
-       weight: 1.0
-
-     LLARK_MagnaTagATune-reasoning/train:
-       weight: 1.0
-
-     LLARK_MTG-Jamendo-mir/train:
-       weight: 1.0
-
-     MusicBenchQA/train:
-       weight: 1.0
-
-   dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data_w_duration
-   data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
-   dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/dataset_blending.json
-   max_tokens: 512
-   num_workers: 4
-
-   valid_dataset_config:
-
-     Clotho-AQA-AQA/test: true
-
-     Clotho-v2-AudioCaptioning/test: true
-     audiocaps-AudioCaptioning/test: true
-
-     FSD50k-EventClassification/test: true
-     CochlScene-SceneClassification/test: true
-     NonSpeech7k-EventClassification/test: true
-     SONYC-UST-EventClassification/test: true
-
-     MELD-EmotionClassification/test: true
-     MELD-SentimentClassification/test: true
-     emov-db-EmotionClassification/val: true
-     jl-corpus-EmotionClassification/val: true
-     tess-EmotionClassification/val: true
-     IEMOCAP-EmotionClassification/val: true
-     OMGEmotion-EmotionClassification/val: true
-     VocalSound-VocalClassification/test: true
-
-     Music-AVQA-AQA_All/test: true
-     MU-LLAMA-AQA/test: true
-
-     LP-MusicCaps-MSD-AudioCaptioning/test: true
-     LP-MusicCaps-MC-AudioCaptioning/test: true
-     LP-MusicCaps-MTT-AudioCaptioning/test: true
-     MusicCaps-AudioCaptioning/test: true
-
-     NSynth-MIR/test: true
-     mtg-jamendo-MusicTagging/val: true
-     musdbhq-InstrClassification/test: true
-
-     # # zero shot
-     # CREMA-D-EmotionClassification/train:
-     #   prefix_prob: 1.0
-
-     # ravdess-EmotionClassification/train:
-     #   prefix_prob: 1.0
-
-     # UrbanSound8K-EventClassification/train:
-     #   prefix_prob: 1.0
-
-     # ESC50-EventClassification/train:
-     #   prefix_prob: 1.0
-
-     # DCASE17Task4-SceneClassification/test:
-     #   prefix_prob: 1.0
-
-     # GTZAN-GenreClassification/train:
-     #   prefix_prob: 1.0
-
-     # Medley-solos-DB-InstrClassification/test:
-     #   prefix_prob: 1.0
-
- clap_config:
-   method: nvclap-large
-   audio_embed_dim: 2048
-   checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt
-
-   window_length: 10.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 9 # 1.5 minutes
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-   finetune: true
-
- whisper_config:
-   method: whisper-large-v3
-   path: openai/whisper-large-v3
-   audio_embed_dim: 1280
-   sampling_rate: 16000
-
-   window_length: 30.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 1 # 30 seconds
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-
- mert_config:
-   method: mert-v1
-   path: m-a-p/MERT-v1-330M
-   audio_embed_dim: 1024
-   sampling_rate: 24000
-
-   window_length: 10.0 # seconds
-   window_overlap: 0.0 # seconds
-   max_num_window: 1 # 10 seconds
-   max_num_fewshot: 1 # number of fewshot samples (including the final one)
-
- model_config:
-   cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
-
-   lang_encoder_path: Qwen/Qwen2.5-3B
-   tokenizer_path: Qwen/Qwen2.5-3B
-   cross_attn_every_n_layers: 1
-   audio_transformer_kwargs: {
-     n_head: 8,
-     n_layers: 3,
-     d_inner: 2048,
-     max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
-     max_window_per_audio: 1, # must = max_num_window
-     common_encoder_embed_dim: 1024
-   }
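sft_config points instruction tuning at a stage-1 checkpoint (pretrained_path plus pretrained_ckpt). A sketch of the warm start this implies; the torch.load usage and the "model_state_dict" key are assumptions about the checkpoint layout, not the repo's code:

import os
import torch

def load_pretrained(model, sft_cfg):
    ckpt = torch.load(os.path.join(sft_cfg["pretrained_path"], sft_cfg["pretrained_ckpt"]),
                      map_location="cpu")
    state = ckpt.get("model_state_dict", ckpt)  # assumed layout
    model.load_state_dict(state, strict=False)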