diff --git "a/Pretrain/config.yaml" "b/Pretrain/config.yaml"
deleted file mode 100644
--- "a/Pretrain/config.yaml"
+++ /dev/null
@@ -1,5436 +0,0 @@
-# task 0: attr, task 1: pose, task 2: caption, task 3: parsing, task 4: smpl, task 5: det
-# fixed parameters with diverse shapes among different tasks should also be set in the task_sp_list,
-# e.g., text_vectors, pos_embed, etc.
-
-# 0 attr 1 caption 2 sk2d 3 smpl 4 det 5 cocopose 6 aicpose 7 lipparsing 8 cihpparsing 9 humanparsing
-
-# attr: 0: 5set 1: luperson
-# caption: 2: caption_joint
-# skeleton action: 3: skeleton stack 4: k400
-# smpl: 5: smpl
-# det: 6: crowdhuman 7: 5set_det
-# pose: 8: coco 9: aic 10: h36m 11: posetrack 12: jrdb 13: mhp 15: mpi-inf-3dhp
-#       17: 3dpw 18: aist++
-# parsing: 19: lip 20: cihp 21: human3.6 22: modanet 23: vip 24: deep fashion 25: paperdoll
-
-
-common:  # prefix
-  share_backbone_group: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, 0, 0]
-  share_decoder_group: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                        0, 0, 0, 0, 0, 0, 0, 0]
-  # use modality groups to control the communication of neck, adapter, and output proj
-  share_rgb_group: [-1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                    0, 0, 0, 0, 0, 0, 0, 0]  # rgb
-  share_dense_labeling_group: [-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0,
-                               0, 0, 0, 0, 0, 0, 0, 0]  # dense_labeling
-  share_text_group: [0, 0, -1, -1, -1, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1,
-                     -1, -1, -1, -1, -1, -1, -1, -1]  # text
-  share_sparse_labeling_group: [0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                                -1, -1, -1, -1, -1, -1, -1, -1]
-  share_video_group: [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                      -1, -1, -1, -1, -1, -1, -1, -1]
-  # the modality group is effectively the shared task group, e.g., all parsing datasets share one group
-  share_modality_group: [2, 2, 3, 4, 4, 0, 0, 1, 5, 5, 5, 5, 5, 5, 5, 5,
-                         5, 6, 6, 6, 6, 6, 6, 6]
-
-  solver:
-    type: SolverMAEDev
-
-  model_entry_type: aio_entry_v2mae_shareneck
-
-  lr_scheduler:
-    type: 'Cosine'
-    kwargs:
-      eta_min: 0.
-      base_lr: 1.e-5
-      warmup_lr: 1.e-3
-      warmup_steps: 1500
-
-  backbone_multiplier: 1.
-  pos_embed_multiplier: 1.
-  layer_decay:
-    num_layers: 12
-    layer_decay_rate: 0.75
-    lpe_lr: True
-
-  optimizer:
-    type: Adafactor_dev
-    kwargs:
-      beta1: 0.9
-      clip_beta2: 0.999
-      clip_threshold: 1.
-      decay_rate: -0.8
-      scale_parameter: False
-      relative_step: False
-      weight_decay: 0.05
-
-  auto_denan: False
-
-  workers: 6
-  max_iter: 60000  # 61446 for 149813 // 512 * 210
-
-  deterministic: True  # seed control
-  cudnn_deterministic: False
-  worker_rank: True
-  random_seed: 233
-
-  print_freq: 10
-  verbose_loss: False
-  vis_batch: False
-  save_interval: 10000
-
-  use_ceph: True
-  sync: True
-  collate: det
-
-# task_specific_param = ['backbone', 'neck', 'decoder', 'dataset', 'sampler', 'lr_scheduler', 'optimizer']
-tasks:  # prefix
-  5:  # prefix
-    name: pedattr_multi_rap2_PA_100k_parse27k_market_HARDHC
-    loss_weight: 5
-    gres_ratio: 1  # int, > 0, = Task_GPUs / (world_Size / sum(all_gres_ratios))
-    backbone:
-      type: vit_base_patch16_mask
-      kwargs:
-        task_sp_list: ['rel_pos_h', 'rel_pos_w']  # a wrong list can cause .cuda() to hang without an error
-        pretrained: True
-        lms_checkpoint_train: fairscale
-        window: False
-        test_pos_mode: False  # when torch.compile is True, this should be False
-        learnable_pos: True
-        drop_path_rate: 0.2
-        img_size: 1344
-        num_encoded_tokens: 192
-        vis_patch_token_ratio: 1
-        vis_label_token_ratio: 0.
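For context on the `lr_scheduler` block in `common` above: warmup to `warmup_lr` over `warmup_steps`, then cosine decay toward `eta_min`. A minimal sketch of that schedule with the config's values, assuming the usual linear-warmup convention (function name and exact ramp shape are illustrative, not the repo's API):

```python
import math

def lr_at_step(step, max_iter=60000, warmup_steps=1500,
               base_lr=1e-5, warmup_lr=1e-3, eta_min=0.0):
    """Illustrative warmup + cosine schedule matching the config values."""
    if step < warmup_steps:
        # linear ramp from base_lr up to warmup_lr
        return base_lr + (warmup_lr - base_lr) * step / warmup_steps
    # cosine decay from warmup_lr down to eta_min over the remaining steps
    t = (step - warmup_steps) / (max_iter - warmup_steps)
    return eta_min + 0.5 * (warmup_lr - eta_min) * (1 + math.cos(math.pi * t))
```

The `layer_decay` block additionally scales each backbone layer's LR by `layer_decay_rate ** (num_layers - layer_index)`, so shallower ViT blocks train with smaller steps.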
-        # mask_all_gt_tokens: True
-    dataset:
-      type: MultiAttrDataset
-      kwargs:
-        text_label_return: True
-        task_spec:
-          dataset:
-            - rap2
-            - PA_100k
-            - parse27k
-            - market
-            - HARDHC
-          data_path:
-            - Humans1:s3://HumanCentricModel/pedattr_public/rap2/dataset.pkl
-            - Humans1:s3://HumanCentricModel/pedattr_public/PA-100k/dataset.pkl
-            - Humans1:s3://HumanCentricModel/pedattr_public/Parse27k/parse27k/parse27k/dataset.pkl
-            - Humans1:s3://HumanCentricModel/pedattr_public/market/dataset.pkl
-            - Humans1:s3://HumanCentricModel/pedattr_public/HARDHC/dataset.pkl
-          root_path:
-            - Humans1:s3://HumanCentricModel/pedattr_public/rap2/RAP_dataset/
-            - Humans1:s3://HumanCentricModel/pedattr_public/PA-100k/data/
-            - Humans1:s3://HumanCentricModel/pedattr_public/Parse27k/parse27k/parse27k/images
-            - Humans1:s3://HumanCentricModel/pedattr_public/market/bounding_box_train
-            - Humans1:s3://HumanCentricModel/pedattr_public/HARDHC/croped_image/
-        augmentation:
-          height: 256
-          width: 192
-
-    sampler:
-      batch_size: 147  # per card
-      shuffle_strategy: 1
-
-    patch_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256
-        modality: rgb
-        # task_sp_list: ['mask_map']
-
-    patch_adapter:
-      type: rgb_adapter  # change to adapter_rgb
-      kwargs:
-        pretrained: True
-        stride_level: 1
-        in_chans: 3
-        learnable_pos: False
-        test_pos_mode: False
-        img_size: [256, 192]
-        task_sp_list: ['pos_embed']
-        # type_embed: True
-
-    # fixed kwargs of the projector should match those in the adapter; hidden_dim,
-    # patch_size, in_chans, stride_level are set in the solver - create_modal
-    patch_proj:
-      type: rgb_projector
-      kwargs:
-        loss_cfg:
-          type: MaskedMSELoss
-          kwargs:
-            stride: 1
-            norm_pix_loss: True
-            pix_loss: True
-            pix_loss_weight: 1.
-            norm_pix_loss_weight: 1.
-
-    label_adapter:
-      type: text_adapter
-      kwargs:
-        pretrained: True
-        close_set: True
-        task_sp_list: ['text_vectors']
-        one_way_semantics: True
-        pre_extracted: True
-        description_dict_name: 'multi_rap2_PA_100k_parse27k_market_HARDHC_attr_name'
-    label_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256
-        modality: text
-    label_proj:
-      type: text_projector
-      kwargs:
-        task_sp_list: ['text_vectors',
-                       'translate_weight',
-                       'translate_bias',
-                       'post_mul_norm']
-        close_set: True
-        one_way_semantics: True
-        pre_extracted: True
-        post_mul_norm: True
-        replace_post_mul_norm: False
-        translate_weight_scale: 5
-        description_dict_name: 'multi_rap2_PA_100k_parse27k_market_HARDHC_attr_name'
-        pre_proj_type: ''
-        loss_cfg:
-          type: MaskedOneSideBCELoss
-          kwargs:
-            use_focal_weight: True
-            loss_weight: 1.
-            dataset_weight: [0.5, 0.5, 0.5, 0.5, 0.5,
-                             0.5, 0.5, 0.5, 0.5, 0.5,
-                             0.5, 0.5, 0.5, 0.5, 0.5,
-                             0.5, 0.5, 0.5, 0.5, 0.5,
-                             0.5, 0.5, 0.5, 0.5, 0.5,
-                             0.5, 0.5, 0.5, 0.5, 0.5,
-                             0.5, 0.5, 0.5, 0.5, 0.5,
-                             0.5, 0.5, 0.5, 0.5, 0.5,
-                             0.5, 0.5, 0.5, 0.5, 0.5,
-                             0.5, 0.5, 0.5, 0.5, 0.5,
-                             0.5, 0.5, 0.5, 0.5, 0.25,
-                             0.25, 0.25, 0.25, 0.25, 0.25, 0.25,
-                             0.25, 0.25, 0.25, 0.25, 0.25,
-                             0.25, 0.25, 0.25, 0.25, 0.25,
-                             0.25, 0.25, 0.25, 0.25, 0.25,
-                             0.25, 0.25, 0.25, 0.25, 0.25,
-                             1.0, 1.0, 1.0, 1.0, 1.0,
-                             1.0, 1.0, 1.0, 1.0, 1.0,
-                             1.0, 1.0, 1.0, 1.0, 1.0,
-                             1.0, 1.0, 1.0, 1.0, 1.0,
-                             1.0, 1.0, 1.0, 1.0, 1.0,
-                             1.0, 1.0, 1.0, 1.0, 1.0,
-                             1.0, 1.0, 1.0, 1.0, 1.0,
-                             1.0, 1.0, 1.0, 1.0, 1.0,
-                             1.0, 1.0, 1.0, 1.0, 1.0,
-                             1.0, 1.0, 1.0, 1.0, 1.0,
-                             1.0, 1.0, 1.0, 1.0, 1.0,
-                             1.0, 1.0, 1.0, 1.0, 1.0,
-                             1.0, 1.0, 1.0, 1.0, 1.0,
-                             1.0, 1.0, 1.0, 1.0, 1.0,
-                             1.0, 1.0, 1.0, 1.0, 1.0,
-                             1.0, 1.0, 1.0, 1.0, 1.0,
-                             1.0, 1.0, 1.0, 1.0, 1.0,
-                             1.0, 1.0, 1.0, ]
-            sample_weight: [0.00172477, 0.05791431, 0.2792891, 0.00459644, 0.01987675,
-                            0.06484867, 0.02327336, 0.01420398, 0.06937013, 0.03476447,
-                            0.08533858, 0.0091179, 0.0125145, 0.02894172, 0.00816949,
-                            0.17255632, 0.00890175, 0.00613153, 0.00838123, 0.07975844,
-                            0.03529381, 0.07885856, 0.06067129, 0.02532455, 0.00429207,
-                            0.06790121, 0.02532014, 0.00639179, 0.02070164, 0.00790041,
-                            0.01142935, 0.00823125, 0.00310547, 0.00732696, 0.08890281,
-                            0.00265994, 0.12081324, 0.16404275, 0.010578, 0.09486231,
-                            0.040896, 0.23313939, 0.02223673, 0.28135352, 0.01603462,
-                            0.01012806, 0.00799305, 0.01450835, 0.00697848, 0.00314958,
-                            0.00536399, 0.00762692, 0.03982408, 0.00306577,  # rap2
-                            0.01728739, 0.0714522, 0.23161312, 0.16539257, 0.01964296,
-                            0.0599655, 0.04277957, 0.01663895, 0.00187475, 0.00670499,
-                            0.0128674, 0.28255336, 0.06885843, 0.0455939, 0.00238203,
-                            0.07344605, 0.07651623, 0.06356061, 0.00378038, 0.00534193,
-                            0.36698324, 0.02468052, 0.18279907, 0.14001068, 0.1169667,
-                            0.14002832,  # pa100k
-                            0.00080283, 0.04727897, 0.05596016, 0.00868119, 0.00850474,
-                            0.00013234, 0.02891966, 0.0113279, 0.00466261, 0.00932522,
-                            0.04154444, 0.00932522, 0.00466261, 0.0113279, 0.0128277,
-                            0.05136371, 0.05703648, 0.00839005, 0.00951049, 0.10332735,
-                            0.04794505, 0.01736679, 0.05591605, 0.04794505, 0.01736679,
-                            0.05591605, 0.04949779, 0.01482155, 0.05690856, 0.04949779,
-                            0.01482155, 0.05690856, 0.00515225, 0.00014998, 0.11592566,
-                            0.02974014, 0.00336131, 0.08812644, 0.00546986, 0.00292902,
-                            0.11282902, 0.03215746, 0.00087341, 0.08819702,  # parse27k
-                            0.01577436, 0.01377169, 0.00681968, 0.02183531, 0.00826654,
-                            0.00613153, 0.0091179, 0.00096605, 0.00241732, 0.00012792,
-                            0.00481259, 0.00091752, 0.00754752, 0.00346277, 0.00502433,
-                            0.00635209, 0.00219676, 0.00692113, 0.01726093, 0.00282756,
-                            0.04876553, 0.03532027, 0.05422657, 0.01836813, 0.00129247,
-                            0.0237233, 0.00093958, 0.04455727, 0.01074562, 0.00082048,  # market
-                            0.07086552, 0.02805507, 0.0062771, 0.02825357, 0.0273978,
-                            0.05809076, 0.00874295, 0.01927683, 0.01020305, 0.04525424,
-                            0.01257185, 0.00412004, 0.03352934, 0.00677998,  # HARDHC
-                            ]
-
-    decoder:
-      type: UniHCPv2_Head
-      kwargs:
-        predictor: 'mae'
-        task: recons_rgb
-        modality_share_list: ['predictor.mask_token']
-        task_sp_list: [  # 'predictor.text_features',
-                       'predictor.query_embed_patch',
-                       'predictor.query_embed_label',
-                       # 'predictor.mask_token',
-                       # 'predictor.text_pe',
-                       'predictor.class_embed', 'predictor.fc_bias',  # useless in unihcpv2
-                      ]  # a wrong list can cause .cuda() to hang without an error
-        loss_weight: 1.0
-        transformer_predictor_cfg:
-          hidden_dim: 256
-          num_queries: 20
-          nheads: 8
-          dim_feedforward: 2048
-          dec_layers: 9
-          pre_norm: False
-          arch: fan_in
-          enforce_input_project: False
-          mask_on: False
-          num_feature_levels: 1
-          cross_pos_embed: anchor
-          # patch_pos_mode: simple_interpolate
-          # label_pos_mode: simple_interpolate
-          self_attn_mask_type: patch_diag_label_row
-          # adding_per_layer_pe: True
-          # mask_token_normal_init: True
-          cls_out_dim: 1
-          # fixed_class_embed: True
-          # fixed_class_embed_cfg:
-          #   pre_extracted: True
-          #   description_dict_name: 'rap2_attr_name'
-          #   fixed_class_embed_LN: True
-          #   one_way_semantics: True
-          detach_from_peddet: True
-          # label_ffn_pre_norm: True
-          # label_ffn_zero_gated: True
-          # one_way_semantics: True
-          # no_mask_embed: True
-          # learnable_class_embed: True
-        loss_cfg:
-          type: CEL_Sigmoid
-
-  6:  # prefix
-    name: attr_luperson
-    loss_weight: 5
-    gres_ratio: 1  # int, > 0, = Task_GPUs / (world_Size / sum(all_gres_ratios))
-    backbone:
-      type: vit_base_patch16_mask
-      kwargs:
-        task_sp_list: ['rel_pos_h', 'rel_pos_w']  # a wrong list can cause .cuda() to hang without an error
-        pretrained: True
-        lms_checkpoint_train: fairscale
-        window: False
-        test_pos_mode: False  # when torch.compile is True, this should be False
-        learnable_pos: True
-        drop_path_rate: 0.2
-        img_size: 1344
-        num_encoded_tokens: 192
-        vis_patch_token_ratio: 1
-        vis_label_token_ratio: 0.
-        # mask_all_gt_tokens: True
-    dataset:
-      type: MultiAttrDataset
-      kwargs:
-        text_label_return: True
-        task_spec:
-          dataset:
-            - lup_0_600w
-            - lup_600_1200w
-          data_path:
-            - /mnt/petrelfs/tangshixiang/hwz/humanbenchv2/experiments/v2_attribute/dataset_0_600w_pjlab.pkl
-            - /mnt/petrelfs/tangshixiang/hwz/humanbenchv2/experiments/v2_attribute/dataset_600_1200w_pjlab.pkl
-          root_path:
-            - /mnt/petrelfs/share_data/vitruvian/data/reid/LUPerson-NL/LUPws
-            - /mnt/petrelfs/share_data/vitruvian/data/reid/LUPerson-NL/LUPws
-        augmentation:
-          height: 256
-          width: 192
-
-    sampler:
-      batch_size: 300  # per card
-      shuffle_strategy: 1
-
-    patch_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256
-        modality: rgb
-        # task_sp_list: ['mask_map']
-
-    patch_adapter:
-      type: rgb_adapter  # change to adapter_rgb
-      kwargs:
-        pretrained: True
-        stride_level: 1
-        in_chans: 3
-        learnable_pos: False
-        test_pos_mode: False
-        img_size: [256, 192]
-        task_sp_list: ['pos_embed']
-        # type_embed: True
-
-    # fixed kwargs of the projector should match those in the adapter; hidden_dim,
-    # patch_size, in_chans, stride_level are set in the solver - create_modal
-    patch_proj:
-      type: rgb_projector
-      kwargs:
-        loss_cfg:
-          type: MaskedMSELoss
-          kwargs:
-            stride: 1
-            norm_pix_loss: True
-            pix_loss: True
-            pix_loss_weight: 1.
-            norm_pix_loss_weight: 1.
-
-    label_adapter:
-      type: text_adapter
-      kwargs:
-        pretrained: True
-        close_set: True
-        task_sp_list: ['text_vectors']
-        one_way_semantics: True
-        pre_extracted: True
-        description_dict_name: 'lup_lup_attr_base'
-    label_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256
-        modality: text
-    label_proj:
-      type: text_projector
-      kwargs:
-        task_sp_list: ['text_vectors',
-                       'translate_weight',
-                       'translate_bias',
-                       'post_mul_norm']
-        close_set: True
-        one_way_semantics: True
-        pre_extracted: True
-        post_mul_norm: True
-        replace_post_mul_norm: False
-        translate_weight_scale: 5
-        description_dict_name: 'lup_lup_attr_base'
-        pre_proj_type: ''
-        loss_cfg:
-          type: MaskedOneSideBCELoss
-          kwargs:
-            loss_weight: 1.
-            use_focal_weight: True
-            sample_weight: [3.705390e-01, 6.184500e-03, 6.679500e-03, 9.445730e-01,
-                            3.924500e-02, 4.686065e-01, 7.492855e-01, 6.642300e-02,
-                            7.882115e-01, 1.606450e-02, 1.043025e-01, 8.040050e-02,
-                            1.102100e-02, 5.510935e-01, 4.074950e-02, 1.142160e-01,
-                            3.731000e-02, 5.566250e-02, 1.852115e-01, 1.524850e-02,
-                            5.085000e-04, 9.421990e-01, 1.484350e-02, 3.347200e-02,
-                            5.750000e-03, 3.735500e-03, 1.509560e-01, 3.741515e-01,
-                            3.318200e-02, 2.215850e-02, 4.213145e-01, 5.177550e-02,
-                            3.974550e-02, 3.878800e-01, 1.321270e-01, 1.337740e-01,
-                            9.478400e-02, 3.324350e-02, 1.095815e-01, 2.231600e-02,
-                            1.592250e-02, 2.386005e-01, 1.999500e-01, 1.321300e-02,
-                            7.382405e-01, 4.859650e-02, 2.932510e-01, 8.297100e-02,
-                            9.567325e-01, 2.430700e-02, 3.554500e-03, 1.751500e-03,  # lup_0_600w
-                            3.705390e-01, 6.184500e-03, 6.679500e-03, 9.445730e-01,
-                            3.924500e-02, 4.686065e-01, 7.492855e-01, 6.642300e-02,
-                            7.882115e-01, 1.606450e-02, 1.043025e-01, 8.040050e-02,
-                            1.102100e-02, 5.510935e-01, 4.074950e-02, 1.142160e-01,
-                            3.731000e-02, 5.566250e-02, 1.852115e-01, 1.524850e-02,
-                            5.085000e-04, 9.421990e-01, 1.484350e-02, 3.347200e-02,
-                            5.750000e-03, 3.735500e-03, 1.509560e-01, 3.741515e-01,
-                            3.318200e-02, 2.215850e-02, 4.213145e-01, 5.177550e-02,
-                            3.974550e-02, 3.878800e-01, 1.321270e-01, 1.337740e-01,
-                            9.478400e-02, 3.324350e-02, 1.095815e-01, 2.231600e-02,
-                            1.592250e-02, 2.386005e-01, 1.999500e-01, 1.321300e-02,
-                            7.382405e-01, 4.859650e-02, 2.932510e-01, 8.297100e-02,
-                            9.567325e-01, 2.430700e-02, 3.554500e-03, 1.751500e-03  # lup_600_1200w
-                            ]
-
-    decoder:
-      type: UniHCPv2_Head
-      kwargs:
-        predictor: 'mae'
-        task: recons_rgb
-        modality_share_list: ['predictor.mask_token']
-        task_sp_list: [  # 'predictor.text_features',
-                       'predictor.query_embed_patch',
-                       'predictor.query_embed_label',
-                       # 'predictor.mask_token',
-                       # 'predictor.text_pe',
-                       'predictor.class_embed', 'predictor.fc_bias',  # useless in unihcpv2
-                      ]  # a wrong list can cause .cuda() to hang without an error
-        loss_weight: 1.0
-        transformer_predictor_cfg:
-          hidden_dim: 256
-          num_queries: 20
-          nheads: 8
-          dim_feedforward: 2048
-          dec_layers: 9
-          pre_norm: False
-          arch: fan_in
-          enforce_input_project: False
-          mask_on: False
-          num_feature_levels: 1
-          cross_pos_embed: anchor
-          # patch_pos_mode: simple_interpolate
-          # label_pos_mode: simple_interpolate
-          self_attn_mask_type: patch_diag_label_row
-          # adding_per_layer_pe: True
-          # mask_token_normal_init: True
-          cls_out_dim: 1
-          # fixed_class_embed: True
-          # fixed_class_embed_cfg:
-          #   pre_extracted: True
-          #   description_dict_name: 'rap2_attr_name'
-          #   fixed_class_embed_LN: True
-          #   one_way_semantics: True
-          detach_from_peddet: True
-          # label_ffn_pre_norm: True
-          # label_ffn_zero_gated: True
-          # one_way_semantics: True
-          # no_mask_embed: True
-          # learnable_class_embed: True
-        loss_cfg:
-          type: CEL_Sigmoid
-
-  7:
-    name: image_caption_joint
-    loss_weight: 90
-    gres_ratio: 3  # int, > 0, = Task_GPUs / (world_Size / sum(all_gres_ratios))
-    dataset:
-      type: CocoCaption
-      kwargs:
-        bert_dir: /mnt/petrelfs/tangshixiang/wangyizhou/humanbenchv2/bert-base-uncased
-        max_words: 40
-        img_size: 384
-        prompt: ''
-        split_type: train
-        joint_train: True
-        joint_train_anno_root: /mnt/petrelfs/share_data/vitruvian/data/textreid/joint_reid_caption_train.json
-        # joint_train_anno_root: /mnt/petrelfs/share_data/vitruvian/data/textreid/LUPerson-T/luperson.json
-        synth_peds_root: /mnt/petrelfs/share_data/vitruvian/data/textreid/SYNTH-PEDES/
-        cuhk_peds_root: /mnt/petrelfs/share_data/vitruvian/data/textreid/CUHK-PEDES/imgs/
-        mals_root: /mnt/petrelfs/share_data/vitruvian/data/textreid/MALS
-        luperson_root: /mnt/petrelfs/share_data/vitruvian/data/textreid/LUPerson-T/imgs/
-
-    sampler:
-      batch_size: 100  # per card
-      shuffle_strategy: 1
-
-    backbone:
-      type: vit_base_patch16_mask
-      kwargs:
-        task_sp_list: ['rel_pos_h', 'rel_pos_w']  # a wrong list can cause .cuda() to hang without an error
-        pretrained: True
-        lms_checkpoint_train: fairscale
-        window: False
-        test_pos_mode: learnable_interpolate
-        learnable_pos: True
-        drop_path_rate: 0.2
-        vis_patch_token_ratio: 1
-        vis_label_token_ratio: 0.
-
-    patch_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256
-        modality: rgb
-        # task_sp_list: ['mask_map']
-
-    label_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256
-        modality: text
-
-    patch_adapter:
-      type: rgb_adapter  # change to adapter_rgb
-      kwargs:
-        pretrained: True
-        stride_level: 1
-        in_chans: 3
-        learnable_pos: False
-        test_pos_mode: False
-        img_size: [384, 384]
-        task_sp_list: ['pos_embed']
-
-    label_adapter:
-      type: text_adapter
-      kwargs:
-        image_caption: True
-        pretrained: True
-        max_tokens: 40
-        task_sp_list: []
-
-    # fixed kwargs of the projector should match those in the adapter; hidden_dim,
-    # patch_size, in_chans, stride_level are set in the solver - create_modal
-    patch_proj:
-      type: rgb_projector
-      kwargs:
-        loss_cfg:
-          type: MaskedMSELoss
-          kwargs:
-            stride: 1
-            norm_pix_loss: True
-            pix_loss: True
-            pix_loss_weight: 1.
-            norm_pix_loss_weight: 1.
-
-    label_proj:
-      type: text_projector
-      kwargs:
-        pre_extracted: True
-        description_dict_name: caption_bert
-        close_set: True
-        image_caption: True
-        one_way_semantics: True
-        post_mul_norm: True
-        loss_cfg:
-          type: LabelSmoothingCrossEntropy
-          kwargs:
-            epsilon: 0.1
-            loss_weight: 1.
-            # sample_weight_path: sample_weight_40.npy
-        task_sp_list: ['post_mul_norm',
-                       'text_vectors',
-                       'loss_fn']
-
-    decoder:
-      type: UniHCPv2_Head
-      kwargs:
-        predictor: 'mae'
-        task: recons_rgb
-        modality_share_list: ['predictor.mask_token']
-        task_sp_list: [  # 'predictor.text_features',
-                       'predictor.query_embed_patch',
-                       'predictor.query_embed_label',
-                       # 'predictor.text_pe',
-                       # 'predictor.mask_token',
-                       'predictor.mask_token_buffer',
-                       'predictor.mask_token_proj',
-                       'predictor.captiontoken_ln',
-                       'predictor.class_embed', 'predictor.fc_bias',  # useless in unihcpv2
-                      ]  # a wrong list can cause .cuda() to hang without an error
-        loss_weight: 1.0
-        transformer_predictor_cfg:
-          hidden_dim: 256
-          num_queries: 20
-          nheads: 8
-          dim_feedforward: 2048
-          dec_layers: 9
-          pre_norm: False
-          arch: fan_in
-          enforce_input_project: False
-          mask_on: False
-          num_feature_levels: 1
-          cross_pos_embed: anchor
-          cls_out_dim: 1
-          self_attn_mask_type: caption_mask
-          caption_cfgs: { nn.parameter: True, vocal_size: 30522, lndo: True, bert_feats_for_embedding: True }
-          mask_token_normal_init: True
-          detach_from_peddet: True
-          # label_ffn_zero_gated: True
-          # label_ffn_pre_norm: True
-          # no_mask_embed: True
-          # learnable_class_embed: True
-        loss_cfg:
-          type: CEL_Sigmoid
-
-  0:
-    name: NUTRGBD_skeleton  # SPECIFIC
-    loss_weight: 4.4
-    gres_ratio: 2  # int, > 0, = Task_GPUs / (world_Size / sum(all_gres_ratios))
-    backbone:
-      type: vit_base_patch16_mask
-      kwargs:
-        task_sp_list: ['rel_pos_h', 'rel_pos_w']  # a wrong list can cause .cuda() to hang without an error
-        pretrained: True
-        lms_checkpoint_train: fairscale
-        window: False
-        test_pos_mode: learnable_interpolate
-        learnable_pos: False
-        drop_path_rate: 0.1
-        vis_patch_token_ratio: 1
-        vis_label_token_ratio: 0.
-
-    dataset:
-      type: mmSkeletonDataset  # train for 150 epochs
-      kwargs:
-        ann_file:
-          - Humans1:s3://HumanCentricModel/skaction_public/ntu60_hrnet.pkl
-          - Humans1:s3://HumanCentricModel/skaction_public/ntu120_hrnet.pkl
-          - Humans1:s3://HumanCentricModel/skaction_public/gym_hrnet.pkl
-          # - Humans1:s3://HumanCentricModel/skaction_public/diving48_hrnet.pkl
-          # - Humans1:s3://HumanCentricModel/skaction_public/ucf101_hrnet.pkl
-          # - k400_hrnet.pkl
-        dataset_name:
-          - 2dntu60
-          - 2dntu120
-          - gym
-          # - diving
-          # - ucf
-          # - k400
-        kp_dim: 2d  # SPECIFIC
-        one_hot: True
-        num_classes:
-          - 60
-          - 120
-          - 99
-          # - 48
-          # - 101
-          # - 400
-        centernorm: False
-        scale_range: [0.75, 1.25]
-        data_pipeline:
-          - type: PreNormalize2D
-            kwargs: {}
-          - type: GenSkeFeat
-            kwargs:
-              dataset: coco
-              feats: ['j']
-          - type: UniformSampleGivenFrames
-            kwargs:
-              clip_len: 25
-              given_len: 7
-          - type: PoseDecode
-            kwargs: {}
-          - type: FormatGCNInput2D
-            kwargs:
-              num_person: 2
-              window: False
-              rotate: True
-              mode: zero
-          - type: Collect
-            kwargs:
-              keys: ['keypoint', 'label']
-              meta_keys: []
-          - type: ToTensor
-            kwargs:
-              keys: ['keypoint']
-        flip: True
-
-    sampler:
-      batch_size: 120  # per card
-      # batch_accumulation: 2
-      shuffle_strategy: 1
-
-    patch_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256
-        modality: sparse_labeling
-
-    label_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256
-        modality: text
-
-    patch_adapter:
-      type: sparse_labeling_adapter_skaction
-      kwargs:
-        pretrained: True
-        in_chans: 3
-        num_joints: 17  # SPECIFIC
-        num_frames: 175
-        embed_dim: 768
-        patch_size: [7, 2]
-        stride_level: [1, 1]
-        use_abs_pos_emb: True
-        learnable_pos: False
-        test_pos_mode: learnable_interpolate
-        type_embed: False
-        joint_with_text_embedding: True
-        pre_extracted: True  # extract features before training
-        joint_names: coco_body_17joints  # SPECIFIC
-        proj_norm: 'LN'
-        stride_text_embedding: True
-        is_2d_dataset: True  # SPECIFIC
-        modality_share_list: [
-          'merge_kernel',
-          'proj_kernel',
-          'proj', ]
-        task_sp_list: ['text_embedding', 'pos_embed', ]
-
-    patch_proj:
-      type: sparse_labeling_projector
-      kwargs:
-        loss_cfg:
-          type: MaskDetFocalDiceLoss
-          kwargs:
-            cfg:
-              deep_supervision: True
-              focal_alpha: 0.25
-              class_weight: 2.0
-              bbox_weight: 5.0
-              giou_weight: 2.
-              ign_thr: 0.7
-              dec_layers: 6
-              num_classes: 1
-              predict3d: True
-              xyxy: True
-        in_chans: 3  # predefined in patch adapter, set in solver
-        num_joints: 17  # SPECIFIC
-        num_frames: 175
-        modality_share_list: [
-          'output_proj',
-          'translate_weight',
-          'translate_bias',
-          'post_mul_norm',
-          'patch_proj',
-          'class_proj'
-        ]
-        task_sp_list: [
-          'text_vectors',  # useless
-          'text_features',
-        ]
-
-    label_adapter:
-      type: text_adapter
-      kwargs:
-        pretrained: True
-        close_set: True
-        description_dict_name:
-          - ntu60_name
-          - ntu120_name
-          - gym_cls_name
-          # - diving48_cls_name
-          # - ucf101_cls_name
-          # - k400_cls_name
-        one_way_semantics: False
-        skeleton_action: True  # skeleton action doubles the text embedding (when M=2)
-        skeleton_action_one_hot_label: True
-        pre_extracted: True  # extract features before training
-        task_sp_list: ['text_vectors', ]
-
-    label_proj:
-      type: text_projector
-      kwargs:
-        close_set: True
-        one_way_semantics: False
-        description_dict_name:
-          - ntu60_name
-          - ntu120_name
-          - gym_cls_name
-          # - diving48_cls_name
-          # - ucf101_cls_name
-          # - k400_cls_name
-        skeleton_action: True
-        skeleton_action_one_hot_label: True
-        pre_proj_type: 'pool'
-        pre_extracted: True  # extract features before training
-        replace_post_mul_norm: False
-        post_mul_norm: True
-        # translate_weight_scale: 7.0
-        task_sp_list: ['text_vectors',
-                       'translate_weight',
-                       'translate_bias',
-                       'post_mul_norm', ]
-        loss_cfg:
-          type: CELoss
-          kwargs:
-            loss_weight: 1.0
-
-    decoder:
-      type: UniHCPv2_Head
-      kwargs:
-        predictor: 'mae'
-        task: recons_rgb
-        modality_share_list: ['predictor.mask_token', ]
-        task_sp_list: [
-          'predictor.query_embed_patch',
-          'predictor.query_embed_label',
-          # 'predictor.mask_token',
-          'predictor.class_embed', 'predictor.fc_bias',  # useless in unihcpv2
-        ]  # a wrong list can cause .cuda() to hang without an error
-        loss_weight: 1.0
-        transformer_predictor_cfg:
-          hidden_dim: 256
-          num_queries: 20  # useless in unihcpv2
-          nheads: 8
-          dim_feedforward: 2048
-          dec_layers: 9
-          pre_norm: False
-          arch: fan_in
-          enforce_input_project: False
-          mask_on: False
-          num_feature_levels: 1
-          cross_pos_embed: anchor
-          # fixed_class_embed: True
-          # fixed_class_embed_cfg:
-          #   pre_extracted: True
-          #   description_dict_name: gym_cls_name
-          #   fixed_class_embed_LN: True
-          #   one_way_semantics: False
-          self_attn_mask_type: patch_diag_label_row
-          detach_from_peddet: True
-          # label_ffn_pre_norm: True
-          # label_ffn_zero_gated: True
-        loss_cfg:
-          type: CEL_Sigmoid
-
-  1:
-    name: k400_skeleton  # SPECIFIC
-    loss_weight: 1
-    gres_ratio: 1  # int, > 0, = Task_GPUs / (world_Size / sum(all_gres_ratios))
-    backbone:
-      type: vit_base_patch16_mask
-      kwargs:
-        task_sp_list: ['rel_pos_h', 'rel_pos_w']  # a wrong list can cause .cuda() to hang without an error
-        pretrained: True
-        lms_checkpoint_train: fairscale
-        window: False
-        test_pos_mode: learnable_interpolate
-        learnable_pos: False
-        drop_path_rate: 0.1
-        vis_patch_token_ratio: 1
-        vis_label_token_ratio: 0.
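Both skeleton tasks rely on the `UniformSampleGivenFrames` step in `data_pipeline` (`clip_len: 25`, `given_len: 7`). A minimal sketch of uniform clip sampling under the usual pyskl-style semantics; this illustrates the idea only, and the function name and exact span-splitting rule are assumptions, not the repo's actual transform:

```python
import numpy as np

def uniform_sample_given_frames(total_frames, clip_len=25, given_len=7):
    """Illustrative: pick `given_len` clips of `clip_len` frame indices,
    each spread evenly over consecutive equal spans of the sequence."""
    clips = []
    for i in range(given_len):
        # split [0, total_frames) into `given_len` spans, one clip per span
        start = int(i * total_frames / given_len)
        stop = int((i + 1) * total_frames / given_len)
        idx = np.linspace(start, max(stop - 1, start), clip_len).astype(int)
        clips.append(idx)
    return np.stack(clips)  # (given_len, clip_len) frame indices
```

The sampled clips then flow through `FormatGCNInput2D` (`num_person: 2`), which is why the text embedding is doubled "when M=2" in the label adapter comment.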
-
-    dataset:
-      type: mmSkeletonDataset  # train for 150 epochs
-      kwargs:
-        ann_file:
-          # - Humans1:s3://HumanCentricModel/skaction_public/ntu60_hrnet.pkl
-          # - Humans1:s3://HumanCentricModel/skaction_public/ntu120_hrnet.pkl
-          # - Humans1:s3://HumanCentricModel/skaction_public/gym_hrnet.pkl
-          - Humans1:s3://HumanCentricModel/skaction_public/diving48_hrnet.pkl
-          - Humans1:s3://HumanCentricModel/skaction_public/ucf101_hrnet.pkl
-          - Humans1:s3://HumanCentricModel/skaction_public/k400_hrnet.pkl
-        dataset_name:
-          # - 2dntu60
-          # - 2dntu120
-          # - gym
-          - diving
-          - ucf
-          - k400
-        kp_dim: 2d  # SPECIFIC
-        one_hot: True
-        num_classes:
-          # - 60
-          # - 120
-          # - 99
-          - 48
-          - 101
-          - 400
-        centernorm: False
-        scale_range: [0.75, 1.25]
-        data_pipeline:
-          - type: PreNormalize2D
-            kwargs: {}
-          - type: GenSkeFeat
-            kwargs:
-              dataset: coco
-              feats: ['j']
-          - type: UniformSampleGivenFrames
-            kwargs:
-              clip_len: 25
-              given_len: 7
-          - type: PoseDecode
-            kwargs: {}
-          - type: FormatGCNInput2D
-            kwargs:
-              num_person: 2
-              window: False
-              rotate: True
-              mode: zero
-          - type: Collect
-            kwargs:
-              keys: ['keypoint', 'label']
-              meta_keys: []
-          - type: ToTensor
-            kwargs:
-              keys: ['keypoint']
-        flip: True
-
-    sampler:
-      batch_size: 90  # per card
-      # batch_accumulation: 2
-      shuffle_strategy: 1
-
-    patch_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256
-        modality: sparse_labeling
-
-    label_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256
-        modality: text
-
-    patch_adapter:
-      type: sparse_labeling_adapter_skaction
-      kwargs:
-        pretrained: True
-        in_chans: 3
-        num_joints: 17  # SPECIFIC
-        num_frames: 175
-        embed_dim: 768
-        patch_size: [7, 2]
-        stride_level: [1, 1]
-        use_abs_pos_emb: True
-        learnable_pos: False
-        test_pos_mode: learnable_interpolate
-        type_embed: False
-        joint_with_text_embedding: True
-        pre_extracted: True  # extract features before training
-        joint_names: coco_body_17joints  # SPECIFIC
-        proj_norm: 'LN'
-        stride_text_embedding: True
-        is_2d_dataset: True  # SPECIFIC
-        modality_share_list: [
-          'merge_kernel',
-          'proj_kernel',
-          'proj', ]
-        task_sp_list: ['text_embedding', 'pos_embed', ]
-
-    patch_proj:
-      type: sparse_labeling_projector
-      kwargs:
-        loss_cfg:
-          type: MaskDetFocalDiceLoss
-          kwargs:
-            cfg:
-              deep_supervision: True
-              focal_alpha: 0.25
-              class_weight: 2.0
-              bbox_weight: 5.0
-              giou_weight: 2.
-              ign_thr: 0.7
-              dec_layers: 6
-              num_classes: 1
-              predict3d: True
-              xyxy: True
-        in_chans: 3  # predefined in patch adapter, set in solver
-        num_joints: 17  # SPECIFIC
-        num_frames: 175
-        modality_share_list: [
-          'output_proj',
-          'translate_weight',
-          'translate_bias',
-          'post_mul_norm',
-          'patch_proj',
-          'class_proj'
-        ]
-        task_sp_list: [
-          'text_vectors',  # useless
-          'text_features',
-        ]
-
-    label_adapter:
-      type: text_adapter
-      kwargs:
-        pretrained: True
-        close_set: True
-        description_dict_name:
-          # - ntu60_name
-          # - ntu120_name
-          # - gym_cls_name
-          - diving48_cls_name
-          - ucf101_cls_name
-          - k400_cls_name
-        one_way_semantics: False
-        skeleton_action: True  # skeleton action doubles the text embedding (when M=2)
-        skeleton_action_one_hot_label: True
-        pre_extracted: True  # extract features before training
-        task_sp_list: ['text_vectors', ]
-
-    label_proj:
-      type: text_projector
-      kwargs:
-        close_set: True
-        one_way_semantics: False
-        description_dict_name:
-          # - ntu60_name
-          # - ntu120_name
-          # - gym_cls_name
-          - diving48_cls_name
-          - ucf101_cls_name
-          - k400_cls_name
-        skeleton_action: True
-        skeleton_action_one_hot_label: True
-        pre_proj_type: 'pool'
-        pre_extracted: True  # extract features before training
-        replace_post_mul_norm: False
-        post_mul_norm: True
-        # translate_weight_scale: 7.0
-        task_sp_list: ['text_vectors',
-                       'translate_weight',
-                       'translate_bias',
-                       'post_mul_norm', ]
-        loss_cfg:
-          type: CELoss
-          kwargs:
-            loss_weight: 1.0
-
-    decoder:
-      type: UniHCPv2_Head
-      kwargs:
-        predictor: 'mae'
-        task: recons_rgb
-        modality_share_list: ['predictor.mask_token', ]
-        task_sp_list: [
-          'predictor.query_embed_patch',
-          'predictor.query_embed_label',
-          # 'predictor.mask_token',
-          'predictor.class_embed', 'predictor.fc_bias',  # useless in unihcpv2
-        ]  # a wrong list can cause .cuda() to hang without an error
-        loss_weight: 1.0
-        transformer_predictor_cfg:
-          hidden_dim: 256
-          num_queries: 20  # useless in unihcpv2
-          nheads: 8
-          dim_feedforward: 2048
-          dec_layers: 9
-          pre_norm: False
-          arch: fan_in
-          enforce_input_project: False
-          mask_on: False
-          num_feature_levels: 1
-          cross_pos_embed: anchor
-          # fixed_class_embed: True
-          # fixed_class_embed_cfg:
-          #   pre_extracted: True
-          #   description_dict_name: gym_cls_name
-          #   fixed_class_embed_LN: True
-          #   one_way_semantics: False
-          self_attn_mask_type: patch_diag_label_row
-          detach_from_peddet: True
-          # label_ffn_pre_norm: True
-          # label_ffn_zero_gated: True
-        loss_cfg:
-          type: CEL_Sigmoid
-
-  2:
-    name: smpl
-    loss_weight: 0.5
-    gres_ratio: 3
-    dataset:
-      type: MeshTSVYamlDataset  # train for 150 epochs
-      kwargs:
-        is_composite: True
-        is_train: True
-        cv2_output: False
-        augmentation:
-          scale_factor: 0.25
-          noise_factor: 0.4
-          rot_factor: 30
-          img_res: 224
-        cfg:
-          data_path:
-            - /mnt/petrelfs/share_data/vitruvian/data/Processed_SMPL/3dpw/dataset.pkl  # problem
-            - /mnt/petrelfs/share_data/vitruvian/data/Processed_SMPL/human3.6m/dataset.pkl  # running
-            - /mnt/petrelfs/share_data/vitruvian/data/Processed_SMPL/coco_smpl/dataset.pkl  # problem
-            - /mnt/petrelfs/share_data/vitruvian/data/Processed_SMPL/muco/dataset.pkl  # running
-            - /mnt/petrelfs/share_data/vitruvian/data/Processed_SMPL/up3d/dataset.pkl  # done
-            - /mnt/petrelfs/share_data/vitruvian/data/Processed_SMPL/mpii/dataset.pkl  # done
-            - /mnt/petrelfs/share_data/vitruvian/data/Global_SMPL/gta_human/dataset_pkl/v2_dataset_1396913.pkl
-            - /mnt/petrelfs/share_data/vitruvian/data/Global_SMPL/gta_human/dataset_pkl/v2_dataset_200000.pkl
-            - /mnt/petrelfs/share_data/vitruvian/data/Global_SMPL/gta_human/dataset_pkl/v2_dataset_400000.pkl
-            - /mnt/petrelfs/share_data/vitruvian/data/Global_SMPL/gta_human/dataset_pkl/v2_dataset_600000.pkl
-            - /mnt/petrelfs/share_data/vitruvian/data/Global_SMPL/gta_human/dataset_pkl/v2_dataset_800000.pkl
-            - /mnt/petrelfs/share_data/vitruvian/data/Global_SMPL/gta_human/dataset_pkl/v2_dataset_1000000.pkl
-            - /mnt/petrelfs/share_data/vitruvian/data/Global_SMPL/gta_human/dataset_pkl/v2_dataset_1200000.pkl
-            # - /mnt/petrelfs/tangshixiang/hwz/smpl_datasets/agora/v3_dataset_106674.pkl
-          root_path:
-            - /mnt/petrelfs/share_data/vitruvian/data/Processed_SMPL/3dpw/images
-            - /mnt/petrelfs/share_data/vitruvian/data/Processed_SMPL/human3.6m/images
-            - /mnt/petrelfs/share_data/vitruvian/data/Processed_SMPL/coco_smpl/images
-            - /mnt/petrelfs/share_data/vitruvian/data/Processed_SMPL/muco/images
-            - /mnt/petrelfs/share_data/vitruvian/data/Processed_SMPL/up3d/images
-            - /mnt/petrelfs/share_data/vitruvian/data/Processed_SMPL/mpii/images
-            - /mnt/petrelfs/share_data/vitruvian/data/Global_SMPL/gta_human_openxlab/gta_human
-            - /mnt/petrelfs/share_data/vitruvian/data/Global_SMPL/gta_human_openxlab/gta_human
-            - /mnt/petrelfs/share_data/vitruvian/data/Global_SMPL/gta_human_openxlab/gta_human
-            - /mnt/petrelfs/share_data/vitruvian/data/Global_SMPL/gta_human_openxlab/gta_human
-            - /mnt/petrelfs/share_data/vitruvian/data/Global_SMPL/gta_human_openxlab/gta_human
-            - /mnt/petrelfs/share_data/vitruvian/data/Global_SMPL/gta_human_openxlab/gta_human
-            - /mnt/petrelfs/share_data/vitruvian/data/Global_SMPL/gta_human_openxlab/gta_human
-            # - /mnt/petrelfs/share_data/vitruvian/data/Global_SMPL/AGORA/images_1280*720/train/
-
-    sampler:
-      batch_size: 165  # per card
-      shuffle_strategy: 1
-
-    backbone:
-      type: vit_base_patch16_mask
-      kwargs:
-        task_sp_list: ['rel_pos_h', 'rel_pos_w']  # a wrong list can cause .cuda() to hang without an error
-        pretrained: True
-        lms_checkpoint_train: fairscale
-        window: False
-        test_pos_mode: learnable_interpolate
-        learnable_pos: True
-        drop_path_rate: 0.2
-        vis_patch_token_ratio: 1
-        vis_label_token_ratio: 0.
-
-    patch_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256
-        modality: rgb
-        # task_sp_list: ['mask_map']
-
-    label_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256
-        modality: sparse_labeling
-
-    patch_adapter:
-      type: rgb_adapter  # change to adapter_rgb
-      kwargs:
-        pretrained: True
-        stride_level: 1
-        in_chans: 3
-        use_abs_pos_emb: True
-        learnable_pos: False  # useless
-        test_pos_mode: False
-        img_size: [224, 224]
-        task_sp_list: ['pos_embed']
-
-    label_adapter:
-      type: sparse_labeling_adapter
-      kwargs:
-        pretrained: True
-        in_chans: 3
-        num_joints: 446  # 1 + 14 + 431
-        num_frames: 1
-        embed_dim: 768
-        patch_size: [1, 1]
-        stride_level: [1, 1]
-        use_abs_pos_emb: True
-        learnable_pos: False
-        test_pos_mode: learnable_interpolate
-        type_embed: False
-        proj_norm: 'LN'
-        task_sp_list: ['pos_embed',
-                       'text_embedding',
-                       'proj_kernel',
-                       'proj', ]
-
-    patch_proj:
-      type: rgb_projector
-      kwargs:
-        loss_cfg:
-          type: MaskedMSELoss
-          kwargs:
-            stride: 1
-            norm_pix_loss: True
-            pix_loss: True
-            pix_loss_weight: 1.
-            norm_pix_loss_weight: 1.
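`MaskedMSELoss` with `norm_pix_loss: True` recurs in every `patch_proj` block in this file. A sketch of the MAE-style objective those options name, where the MSE is computed on per-patch-normalized target pixels and averaged only over masked patches (tensor shapes here are assumptions, not the repo's exact interface):

```python
import torch

def masked_mse(pred, target, mask, norm_pix_loss=True, eps=1e-6):
    """pred/target: (B, N, patch_dim) patchified pixels; mask: (B, N), 1 = masked."""
    if norm_pix_loss:
        # normalize each target patch by its own mean/var, as in MAE
        mean = target.mean(dim=-1, keepdim=True)
        var = target.var(dim=-1, keepdim=True)
        target = (target - mean) / (var + eps).sqrt()
    loss = ((pred - target) ** 2).mean(dim=-1)           # per-patch loss, (B, N)
    return (loss * mask).sum() / mask.sum().clamp(min=1)  # masked patches only
```

The config keeps both a raw-pixel term and a normalized-pixel term (`pix_loss_weight` / `norm_pix_loss_weight`), so the actual loss is presumably a weighted sum of the two variants.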
-
-    label_proj:
-      type: sparse_labeling_projector
-      kwargs:
-        task_sp_list: ['output_proj',
-                       'text_features',
-                       'loss_fn',
-                       'translate',
-                       'post_mul_norm',
-                       'patch_proj',
-                       'class_proj',
-                       'proj'
-                      ]
-        pre_proj_type: 'fix_text_tokens'
-        num_classes: 14
-        # pred_joints_class: True
-        reference_type: 'smpl'
-        in_chans: 3  # XYZ
-        num_joints: 446
-        num_frames: 1
-        hidden_dim: 256
-        patch_size: [1, 1]
-        stride_level: [1, 1]
-        replace_post_mul_norm: False
-        task: smpl
-        # for the smpl task, joint classes are not predicted, so text_prototype and learn_text are not useful
-        text_prototype: True
-        learn_text: True
-        loss_cfg:
-          type: SMPL_LOSS_FASTMETRO
-          kwargs:
-            # use_pred_joints_class_loss: True
-            cfg:
-              use_smpl_param_regressor: True
-              joints_2d_loss_weight: 100.0
-              vertices_3d_loss_weight: 100.0
-              edge_normal_loss_weight: 100.0
-              joints_3d_loss_weight: 1000.0
-              vertices_fine_loss_weight: 0.25
-              vertices_intermediate_loss_weight: 0.50
-              vertices_coarse_loss_weight: 0.25
-              edge_gt_loss_weight: 5.0
-              edge_self_loss_weight: 1.e-4
-              normal_loss_weight: 0.1
-              smpl_param_loss_weight: 1000.0
-              except_smpl_param_loss_weight: 1.e-8
-
-    decoder:
-      type: UniHCPv2_Head
-      kwargs:
-        predictor: 'mae'
-        task: recons_rgb
-        modality_share_list: ['predictor.mask_token']
-        task_sp_list: [  # 'predictor.text_features',
-                       'predictor.query_embed_patch',
-                       'predictor.query_embed_label',
-                       # 'predictor.text_pe',
-                       # 'predictor.mask_token',
-                       'predictor.class_embed', 'predictor.fc_bias',  # useless in unihcpv2
-                      ]  # a wrong list can cause .cuda() to hang without an error
-        loss_weight: 1.0
-        transformer_predictor_cfg:
-          hidden_dim: 256
-          num_queries: 20
-          nheads: 8
-          dim_feedforward: 2048
-          dec_layers: 9
-          pre_norm: False
-          arch: fan_in
-          enforce_input_project: False
-          mask_on: False
-          num_feature_levels: 1
-          cross_pos_embed: anchor
-          cls_out_dim: 1
-          smpl_attention_mask_flag: True
-          smpl_mae_pe: True
-          use_adapt_pos2d: True
-          use_adapt_pos1d: True
-          self_attn_mask_type: full  # full, patch_diag_label_row_textlabelfull, patch_diag_label_row
-          adding_per_layer_pe: True
-          detach_from_peddet: True
-          use_adapt_position: 'before'
-          use_smpl_label_attention_mask: True
-          label_pos_mode: 'smpl_xyz'
-        loss_cfg:
-          type: CEL_Sigmoid  # useless
-
-  3:
-    name: Peddet
-    loss_weight: 15
-    gres_ratio: 8  # int, > 0, = Task_GPUs / (world_Size / sum(all_gres_ratios))
-    dataset:
-      type: PedestrainDetectionDataset_v2  # train for 150 epochs
-      kwargs:
-        task_spec:
-          img_folder:
-            - /mnt/petrelfs/share_data/vitruvian/data/PedDet2d/CrowdHuman/Images
-          ann_file:
-            - /mnt/petrelfs/share_data/vitruvian/data/PedDet2d/CrowdHuman/annotations/train.json
-          return_masks: False
-        augmentation:
-          max_size: 1120
-        vit: True
-        num_append_fake_boxes: 867
-        return_box_xyxy: True
-        append_z: True
-    sampler:
-      batch_size: 4  # per card
-      shuffle_strategy: 1
-      batch_accumulation: 1
-
-    backbone:
-      type: vit_base_patch16_mask
-      kwargs:
-        task_sp_list: ['rel_pos_h', 'rel_pos_w']  # a wrong list can cause .cuda() to hang without an error
-        pretrained: True
-        lms_checkpoint_train: fairscale
-        window: False
-        test_pos_mode: learnable_interpolate
-        learnable_pos: True
-        drop_path_rate: 0.2
-        attn_calcul_method: 'math'
-        vis_patch_token_ratio: 1
-        vis_label_token_ratio: 0.
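The detection datasets above pad every image's ground truth to a fixed count (`num_append_fake_boxes: 867`, `return_box_xyxy: True`, `append_z: True`) so that boxes can be fed to the sparse-labeling adapter as two "frames" of 3-channel points (`num_frames: 2 # 2 for x1y1 and x2y2`, `in_chans: 3 # xyz`). A hedged sketch of that conversion; the padding value and z convention are assumptions for illustration:

```python
import torch

def pad_boxes_to_fixed(boxes_xyxy: torch.Tensor, num_append_fake_boxes: int = 867):
    """Illustrative: pad a variable-length (M, 4) xyxy box set to a fixed count and
    append a z coordinate, producing the (num_frames=2, N, 3) layout the
    sparse-labeling adapter expects. Fake-box marker (-1) is an assumption."""
    n = num_append_fake_boxes
    padded = torch.full((n, 4), -1.0)                  # fake boxes fill the tail
    padded[: min(len(boxes_xyxy), n)] = boxes_xyxy[:n]
    z = torch.zeros(n, 1)                              # append_z: True -> constant z
    p1 = torch.cat([padded[:, 0:2], z], dim=1)         # (n, 3): x1, y1, z
    p2 = torch.cat([padded[:, 2:4], z], dim=1)         # (n, 3): x2, y2, z
    return torch.stack([p1, p2], dim=0)
```

This fixed-shape layout is what lets detection share the same masked-reconstruction decoder as the other modalities.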
-
-    patch_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256
-        modality: rgb
-        # conv_neck: True
-
-    label_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256
-        modality: sparse_labeling
-        # conv_neck: True
-
-    patch_adapter:
-      type: rgb_adapter
-      kwargs:
-        pretrained: True
-        stride_level: 1
-        in_chans: 3
-        learnable_pos: False
-        use_abs_pos_emb: True
-        test_pos_mode: interpolate_with_nomask
-        img_size: 1344  # dynamic input size: TODO: nested
-        round_padding: True  # should fix in rgb
-        pad_attn_mask: True
-        task_sp_list: ['pos_embed']
-
-    label_adapter:
-      type: sparse_labeling_adapter
-      kwargs:
-        pretrained: True
-        in_chans: 3  # xyz
-        num_joints: 867  # boxes with random gts
-        num_frames: 2  # 2 for x1y1 and x2y2
-        embed_dim: 768
-        patch_size: [2, 1]
-        stride_level: [1, 1]
-        use_abs_pos_emb: True
-        learnable_pos: False
-        test_pos_mode: learnable_interpolate
-        type_embed: False
-        # joint_with_text_embedding: True
-        # pre_extracted: True  # extract features before training
-        # joint_names: 'ntu_body_joints'
-        proj_norm: 'LN'
-        # stride_text_embedding: True
-        task_sp_list: ['pos_embed',
-                       'text_embedding',
-                       'proj_kernel',
-                       'proj',
-                       'merge_kernel',
-                      ]
-
-    patch_proj:
-      type: rgb_projector
-      kwargs:
-        loss_cfg:
-          type: MaskedMSELoss
-          kwargs:
-            stride: 1
-            norm_pix_loss: True
-            pix_loss: True
-            pix_loss_weight: 1.
-            norm_pix_loss_weight: 1.
-
-    label_proj:
-      type: sparse_labeling_projector
-      kwargs:
-        task_sp_list: ['text_vectors',  # useless
-                       'text_features',
-                      ]
-        modality_share_list: [
-          'text_vectors',  # useless
-          'output_proj',
-          'translate_weight',
-          'translate_bias',
-          'post_mul_norm',
-          'patch_proj',
-          'class_proj'
-        ]
-        in_chans: 3
-        num_joints: 900  # boxes with random gts
-        num_frames: 2  # 2 for x1y1 and x2y2
-        pre_proj_type: fix_text_tokens
-        num_classes: 1
-        reference_type: four_points
-        box_mlp: True
-        replace_post_mul_norm: True
-        translate_weight_scale: 4
-        text_prototype: True
-        loss_cfg:
-          type: MaskDetFocalDiceLoss
-          kwargs:
-            cfg:
-              deep_supervision: True
-              focal_alpha: 0.25
-              class_weight: 2.0
-              bbox_weight: 5.0
-              giou_weight: 2.
-              ign_thr: 0.7
-              dec_layers: 9
-              num_classes: 1
-              predict3d: True
-              xyxy: True
-
-    decoder:
-      type: UniHCPv2_Head
-      kwargs:
-        predictor: 'mae'
-        task: recons_rgb
-        modality_share_list: ['predictor.mask_token']
-        task_sp_list: [  # 'predictor.text_features',
-                       'predictor.query_embed_patch',
-                       'predictor.query_embed_label',
-                       # 'predictor.text_pe',
-                       'predictor.anchor',
-                       # 'predictor.mask_token',
-                       'predictor.class_embed', 'predictor.fc_bias',  # useless in unihcpv2
-                      ]  # a wrong list can cause .cuda() to hang without an error
-        loss_weight: 1.0
-        transformer_predictor_cfg:
-          hidden_dim: 256
-          num_queries: 20  # useless in unihcpv2
-          nheads: 8
-          dim_feedforward: 2048
-          dec_layers: 9
-          pre_norm: False
-          arch: fan_in
-          enforce_input_project: False
-          mask_on: False
-          num_feature_levels: 1
-          cross_pos_embed: anchor
-          patch_pos_mode: interpolate_with_nomask
-          label_pos_mode: simple_interpolate
-          self_attn_mask_type: patch_diag_label_row_nested
-          adding_per_layer_pe: True
-          mask_token_normal_init: True
-          intermediate_output: True
-          peddet_cfgs:
-            share_content_query: 3
-            num_queries: 867
-            pre_defined_path: '289_points_3d.npy'
-            query_pe_dim: 3
-            xattn: False
-            anchor_requires_grad: False
-        loss_cfg:
-          type: CEL_Sigmoid
-
-  4:
-    name: Peddet_5set
-    loss_weight: 42.4
-    gres_ratio: 20  # int, > 0, = Task_GPUs / (world_Size / sum(all_gres_ratios))
-    dataset:
-      type: PedestrainDetectionDataset_v2  # train for 150 epochs
-      kwargs:
-        task_spec:
-          img_folder:
-            - Humans1:s3://HumanCentricModel/peddet_public/CrowdHuman/Images
-            - Humans1:s3://HumanCentricModel/peddet_public/ECP/
-            - Humans1:s3://HumanCentricModel/peddet_public/CityPersons/
-            - Humans1:s3://HumanCentricModel/peddet_public/WiderPerson/Images
-            - Humans1:s3://HumanCentricModel/pose_public/coco/train2017/
-            - Humans1:s3://HumanCentricModel/peddet_public/WIDER_Pedestrian/Images/
-          ann_file:
-            - Humans1:s3://HumanCentricModel/peddet_public/CrowdHuman/annotations/train.json
-            - Humans1:s3://HumanCentricModel/peddet_public/ECP/ECP_remove_no_person_img.json
-            - Humans1:s3://HumanCentricModel/peddet_public/CityPersons/CityPersons_remove_no_person_img.json
-            - Humans1:s3://HumanCentricModel/peddet_public/WiderPerson/WiderPerson_remove_no_person_img.json
-            - Humans1:s3://HumanCentricModel/peddet_public/cocopersons/coco_person_remove_no_person_img.json
-            - Humans1:s3://HumanCentricModel/peddet_public/WIDER_Pedestrian/WIDER_Pedestrian_remove_no_person_img.json
-          return_masks: False
-        augmentation:
-          max_size: 1120
-        vit: True
-        num_append_fake_boxes: 867
-        return_box_xyxy: True
-        append_z: True
-    sampler:
-      batch_size: 4  # per card
-      shuffle_strategy: 1
-      batch_accumulation: 1
-
-    backbone:
-      type: vit_base_patch16_mask
-      kwargs:
-        task_sp_list: ['rel_pos_h', 'rel_pos_w']  # a wrong list can cause .cuda() to hang without an error
-        pretrained: True
-        lms_checkpoint_train: fairscale
-        window: False
-        test_pos_mode: learnable_interpolate
-        learnable_pos: True
-        drop_path_rate: 0.2
-        attn_calcul_method: 'math'
-        vis_patch_token_ratio: 1
-        vis_label_token_ratio: 0.
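The `MaskDetFocalDiceLoss` cfg used by both detection tasks weights a focal classification term (`focal_alpha: 0.25`, `class_weight: 2.0`) against L1 box regression (`bbox_weight: 5.0`) and GIoU (`giou_weight: 2.0`) terms, summed over all `dec_layers` when `deep_supervision` is on. A minimal sketch of that DETR-style weighted combination (the per-term names are illustrative, not the repo's keys):

```python
def combine_det_losses(per_layer_terms,
                       class_weight=2.0, bbox_weight=5.0, giou_weight=2.0):
    """per_layer_terms: one dict per decoder layer (deep supervision), each with
    scalar 'loss_ce', 'loss_bbox', 'loss_giou'. Returns the weighted total."""
    total = 0.0
    for terms in per_layer_terms:
        total += (class_weight * terms["loss_ce"]      # focal classification
                  + bbox_weight * terms["loss_bbox"]   # L1 on box coordinates
                  + giou_weight * terms["loss_giou"])  # generalized IoU
    return total
```

Note the skeleton tasks reuse the same loss type with `dec_layers: 6`, while the detection heads supervise all 9 decoder layers.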
-
-    patch_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256
-        modality: rgb
-        # conv_neck: True
-
-    label_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256
-        modality: sparse_labeling
-        # conv_neck: True
-
-    patch_adapter:
-      type: rgb_adapter
-      kwargs:
-        pretrained: True
-        stride_level: 1
-        in_chans: 3
-        learnable_pos: False
-        use_abs_pos_emb: True
-        test_pos_mode: interpolate_with_nomask
-        img_size: 1344  # dynamic input size: TODO: nested
-        round_padding: True  # should fix in rgb
-        pad_attn_mask: True
-        task_sp_list: ['pos_embed']
-
-    label_adapter:
-      type: sparse_labeling_adapter
-      kwargs:
-        pretrained: True
-        in_chans: 3  # xyz
-        num_joints: 867  # boxes with random gts
-        num_frames: 2  # 2 for x1y1 and x2y2
-        embed_dim: 768
-        patch_size: [2, 1]
-        stride_level: [1, 1]
-        use_abs_pos_emb: True
-        learnable_pos: False
-        test_pos_mode: learnable_interpolate
-        type_embed: False
-        # joint_with_text_embedding: True
-        # pre_extracted: True  # extract features before training
-        # joint_names: 'ntu_body_joints'
-        proj_norm: 'LN'
-        # stride_text_embedding: True
-        task_sp_list: ['pos_embed',
-                       'text_embedding',
-                       'proj_kernel',
-                       'proj',
-                       'merge_kernel',
-                      ]
-
-    patch_proj:
-      type: rgb_projector
-      kwargs:
-        loss_cfg:
-          type: MaskedMSELoss
-          kwargs:
-            stride: 1
-            norm_pix_loss: True
-            pix_loss: True
-            pix_loss_weight: 1.
-            norm_pix_loss_weight: 1.
-
-    label_proj:
-      type: sparse_labeling_projector
-      kwargs:
-        task_sp_list: ['text_vectors',  # useless
-                       'text_features',
-                      ]
-        modality_share_list: [
-          'text_vectors',  # useless
-          'output_proj',
-          'translate_weight',
-          'translate_bias',
-          'post_mul_norm',
-          'patch_proj',
-          'class_proj'
-        ]
-        in_chans: 3
-        num_joints: 900  # boxes with random gts
-        num_frames: 2  # 2 for x1y1 and x2y2
-        pre_proj_type: fix_text_tokens
-        num_classes: 1
-        reference_type: four_points
-        box_mlp: True
-        replace_post_mul_norm: True
-        translate_weight_scale: 4
-        text_prototype: True
-        loss_cfg:
-          type: MaskDetFocalDiceLoss
-          kwargs:
-            cfg:
-              deep_supervision: True
-              focal_alpha: 0.25
-              class_weight: 2.0
-              bbox_weight: 5.0
-              giou_weight: 2.
-              ign_thr: 0.7
-              dec_layers: 9
-              num_classes: 1
-              predict3d: True
-              xyxy: True
-
-    decoder:
-      type: UniHCPv2_Head
-      kwargs:
-        predictor: 'mae'
-        task: recons_rgb
-        modality_share_list: ['predictor.mask_token']
-        task_sp_list: [  # 'predictor.text_features',
-                       'predictor.query_embed_patch',
-                       'predictor.query_embed_label',
-                       # 'predictor.text_pe',
-                       'predictor.anchor',
-                       # 'predictor.mask_token',
-                       'predictor.class_embed', 'predictor.fc_bias',  # useless in unihcpv2
-                      ]  # a wrong list can cause .cuda() to hang without an error
-        loss_weight: 1.0
-        transformer_predictor_cfg:
-          # lms_checkpoint_train: fairscale
-          hidden_dim: 256
-          num_queries: 20  # useless in unihcpv2
-          nheads: 8
-          dim_feedforward: 2048
-          dec_layers: 9
-          pre_norm: False
-          arch: fan_in
-          enforce_input_project: False
-          mask_on: False
-          num_feature_levels: 1
-          cross_pos_embed: anchor
-          patch_pos_mode: interpolate_with_nomask
-          label_pos_mode: simple_interpolate
-          self_attn_mask_type: patch_diag_label_row_nested
-          adding_per_layer_pe: True
-          mask_token_normal_init: True
-          intermediate_output: True
-          peddet_cfgs:
-            share_content_query: 3
-            num_queries: 867
-            pre_defined_path: '289_points_3d.npy'
-            query_pe_dim: 3
-            xattn: False
-            anchor_requires_grad: False
-        loss_cfg:
-          type: CEL_Sigmoid
-
-  8:
-    name: cocopose_256x192
-    loss_weight: 28000
-    gres_ratio: 3  # int, > 0, = Task_GPUs / (world_Size / sum(all_gres_ratios))
-    backbone:
-      type: vit_base_patch16_mask
-      kwargs:
-        task_sp_list: ['rel_pos_h', 'rel_pos_w']  # a wrong list can cause .cuda() to hang without an error
-        pretrained: True
-        lms_checkpoint_train: fairscale
-        window: False
-        test_pos_mode: False  # when torch.compile is True, this should be False
-        learnable_pos: True
-        drop_path_rate: 0.2
-        img_size: 1344
-        num_encoded_tokens: 192
-        vis_patch_token_ratio: 1
-        vis_label_token_ratio: 0.
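Every task carries a `gres_ratio` with the same comment: `Task_GPUs = world_size * ratio / sum(all_gres_ratios)`. A small arithmetic sketch of that allocation; the world size is not stated in this config, so the 48-GPU job and the task subset below are assumptions for illustration:

```python
def gpus_per_task(gres_ratios: dict, world_size: int) -> dict:
    """Per the config comment: each task's GPU count is its ratio's share of the
    world size. Integer division here; the solver presumably handles rounding."""
    total = sum(gres_ratios.values())
    return {task: world_size * r // total for task, r in gres_ratios.items()}

# e.g. a subset of the ratios above, assuming a hypothetical 48-GPU job
print(gpus_per_task({"attr": 1, "caption": 3, "peddet": 8, "peddet_5set": 20},
                    world_size=48))
```

This explains why `Peddet_5set` (ratio 20) dominates the GPU budget while each attribute task (ratio 1) runs on a small slice.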
-
-    dataset:
-      type: COCOPosDatasetDev
-      kwargs:
-        ann_file: Humans1:s3://HumanCentricModel/pose_public/coco/annotations/person_keypoints_train2017.json
-        img_prefix: Humans1:s3://HumanCentricModel/pose_public/coco/train2017/
-        use_udp: True
-        data_use_ratio: 1
-        data_cfg: {
-          'image_size': [192, 256],
-          'heatmap_size': [48, 64],
-          'num_output_channels': 17,
-          'num_joints': 17,
-          'dataset_channel': [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],],
-          'inference_channel': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
-          'soft_nms': False,
-          'nms_thr': 1.0,
-          'oks_thr': 0.9,
-          'vis_thr': 0.2,
-          'use_gt_bbox': False,
-          'det_bqbox_thr': 0.0,
-          'bbox_file': './COCO_val2017_detections_AP_H_56_person.json'
-        }
-    sampler:
-      batch_size: 176  # per card
-      shuffle_strategy: 1
-
-    patch_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256  # project to 256 dim for decoder
-        modality: rgb  # patch modality
-        # task_sp_list: ['mask_map']
-
-    label_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256  # project to 256 dim for decoder
-        modality: dense_labeling  # label modality
-
-    patch_adapter:
-      type: rgb_adapter  # change to adapter_rgb
-      kwargs:
-        pretrained: True
-        stride_level: 1
-        in_chans: 3  # 3 for rgb
-        learnable_pos: False  # fixed position embedding, redundant parameter
-        test_pos_mode: False  # PE parameters are interpolated from mae to 'img_size'/16, then repeated with repeat(batchsize, 1, 1)
-        img_size: [256, 192]
-        task_sp_list: ['pos_embed']
-        # type_embed: True
-
-    label_adapter:  # for supervised training, the label adapter's outputs are unused
-      type: dense_labeling_adapter
-      kwargs:
-        pretrained: True
-        stride_level: 1
-        in_chans: 17  # class num
-        learnable_pos: False
-        test_pos_mode: False
-        img_size: [256, 192]
-        # type_embed: True
-        dim_class_embed: 64  # embedding shape for class embedding; TODO: change to text features
-        emb_padding_idx: 255
-        task_sp_list: ['pos_embed',
-                       'class_embed', ]
-
-    # fixed kwargs of the projector should match those in the adapter; hidden_dim,
-    # patch_size, in_chans, stride_level are set in the solver - create_modal
-    patch_proj:
-      type: rgb_projector
-      kwargs:
-        loss_cfg:
-          type: MaskedMSELoss
-          kwargs:
-            stride: 1
-            norm_pix_loss: True
-            pix_loss: True
-            pix_loss_weight: 1.
-            norm_pix_loss_weight: 1.
-
-    label_proj:
-      type: dense_labeling_projector
-      kwargs:
-        task_sp_list: ['post_mul_norm',
-                       'loss_fn',
-                       'upsample_network',
-                       'text_features', ]
-        emb_padding_idx: 255  # should match the value in the input adapter
-        upsampling: upconv_down_4  # upsample the label to the same size as the input
-        learnable_class_embed: False  # True: learnable class embedding, very similar to fc; False: alignment with text features
-        pre_proj_type: text_embed_first_mul_second_inter  # contrast decoded text tokens with the label patch tokens
-        post_mul_norm: True
-        replace_post_mul_norm: False  # replace the post_mul_norm (LN) with a linear layer
-        translate_weight_scale: 1  # scale factor for the translate weight; NOTE: consider whether to share
-                                   # one scale across datasets in joint training
-        text_prototype: True  # extract text features
-        pre_extracted: True  # when pre_extracted is True, the model only loads pre-extracted features
-        cls_loss_branch: True
-        l2_norm_debuged: True
-        description_dict_name: checked_pose_coco_name  # only valid when text_prototype is True
-        # upsample_before_product: True  # temporary solution, specific upsample networks
-        upsample_hidden_dim: 256
-        task: pose
-        # no_mask_embed: True
-        # learnable_class_embed: True
-        loss_cfg:
-          type: POS_FocalDiceLoss_bce_cls_emb
-          kwargs:
-            target_type: GaussianHeatMap
-            cfg:
-              num_classes: 17
-              deep_supervision: True
-              ignore_blank: False
-              class_weight: 0.001
-              dice_weight: 0.0
-              mask_weight: 1.0
-              redundant_queries: 1
-              dec_layers: 9
-              sample_weight: [0.38647058, 0.33606767, 0.33835369, 0.29253424, 0.29636332,
-                              0.4987484, 0.49978854, 0.39467358, 0.40091822, 0.36039853,
-                              0.36918446, 0.43343303, 0.4345989, 0.32999829, 0.33092793,
-                              0.27714171, 0.27754939]
-              eos_coef: 0.1
-
-    decoder:
-      type: UniHCPv2_Head
-      kwargs:
-        predictor: 'mae'
-        task: recons_rgb
-        modality_share_list: ['predictor.mask_token']
-        task_sp_list: [  # 'predictor.text_features',
-                       'predictor.query_embed_patch',
-                       'predictor.query_embed_label',
-                       # 'predictor.mask_token',
-                       # 'predictor.text_pe',
-                       'predictor.class_embed', 'predictor.fc_bias',  # useless in unihcpv2
-                      ]  # a wrong list can cause .cuda() to hang without an error
-        loss_weight: 1.0
-        transformer_predictor_cfg:
-          hidden_dim: 256
-          num_queries: 20  # useless in unihcpv2
-          nheads: 8
-          dim_feedforward: 2048
-          dec_layers: 9  # 6 layers were used for debugging; 9 for training
-          pre_norm: False  # whether to use pre_norm or post_norm in (self-attn, FFN)
-          arch: fan_in  # fan_in weight init
-          enforce_input_project: False  # placeholder, useless in unihcpv2
-          mask_on: False  # placeholder, useless in unihcpv2
-          intermediate_output: True
-          num_feature_levels: 1  # placeholder, useless in unihcpv2
-          cross_pos_embed: anchor  # use adaptive pos2d; should always be "anchor" in unihcpv2
-          cls_out_dim: 1  # placeholder, useless in unihcpv2
-          patch_pos_mode: False  # mode for generating pos_embed for patch tokens in the decoder:
-                                 # the fixed self.query_embed_patch (same shape as in the adapter)
-                                 # is repeated with repeat(batchsize, 1, 1)
-          label_pos_mode: False
-          # currently, the class text is applied after the decoder
-          # fixed_class_embed_cfg:
-          #   pre_extracted: True
-          #   description_dict_name: 'cihp_name'
-          #   fixed_class_embed_LN: True  # whether to apply LN to the fixed class embedding before the decoder input
-          self_attn_mask_type: full  # full attention over
-                                     # [patch_tokens(rgb), label_tokens(sparse_labeling), fixed text tokens]
-          detach_from_peddet: True  # do not use peddet_cfgs to modify the model structure
-          label_ffn_pre_norm: False  # whether to use pre_norm in the decoder
-          label_ffn_zero_gated: False  # whether to use a zero-gated FFN in the decoder
-          adding_per_layer_pe: True  # whether to add a per-layer pe to the input of each decoder layer
-          use_adapt_pos2d: True
-          # use_adapt_pos1d: True  # not effective for 2d tasks
-          # no_mask_embed: True
-          # learnable_class_embed: True
-        loss_cfg:
-          type: CEL_Sigmoid
-
-  9:
-    name: aic
-    loss_weight: 56000
-    gres_ratio: 7  # int, > 0, = Task_GPUs / (world_Size / sum(all_gres_ratios))
-    backbone:
-      type: vit_base_patch16_mask
-      kwargs:
-        task_sp_list: ['rel_pos_h', 'rel_pos_w']  # a wrong list can cause .cuda() to hang without an error
-        pretrained: True
-        lms_checkpoint_train: fairscale
-        window: False
-        test_pos_mode: False  # when torch.compile is True, this should be False
-        learnable_pos: True
-        drop_path_rate: 0.2
-        vis_patch_token_ratio: 1
-        vis_label_token_ratio: 0.
-
-    dataset:
-      type: MultiPoseDatasetDev
-      kwargs:
-        dataset_name: aic
-        ann_file: openmmlab:s3://openmmlab/datasets/pose/ai_challenge/annotations/aic_train.json
-        img_prefix: openmmlab:s3://openmmlab/datasets/pose/ai_challenge/ai_challenger_keypoint_train_20170902/keypoint_train_images_20170902/
-        use_udp: True
-        data_use_ratio: 1
-        data_cfg: {
-          'image_size': [192, 256],
-          'heatmap_size': [48, 64],
-          'num_output_channels': 14,
-          'num_joints': 14,
-          'dataset_channel': [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],],
-          'inference_channel': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
-          'flip_pairs': [[0, 3], [1, 4], [2, 5], [6, 9], [7, 10], [8, 11],],
-          'upper_body_ids': [0, 1, 2, 3, 4, 5, 12, 13],
-          'lower_body_ids': [6, 7, 8, 9, 10, 11],
-          'use_different_joint_weights': False,
-          'joint_weights': [1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.],
-          'soft_nms': False,
-          'nms_thr': 1.0,
-          'oks_thr': 0.9,
-          'vis_thr': 0.2,
-          'use_gt_bbox': False,
-          'det_bqbox_thr': 0.0,
-          'bbox_file': './COCO_val2017_detections_AP_H_56_person.json'
-        }
-    sampler:
-      batch_size: 189  # per card
-      shuffle_strategy: 1
-
-    patch_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256  # project to 256 dim for decoder
-        modality: rgb  # patch modality
-        # task_sp_list: ['mask_map']
-
-    label_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256  # project to 256 dim for decoder
-        modality: dense_labeling  # label modality
-
-    patch_adapter:
-      type: rgb_adapter  # change to adapter_rgb
-      kwargs:
-        pretrained: True
-        stride_level: 1
-        in_chans: 3  # 3 for rgb
-        learnable_pos: False  # fixed position embedding, redundant parameter
-        test_pos_mode: False  # PE parameters are interpolated from mae to 'img_size'/16, then repeated with repeat(batchsize, 1, 1)
-        img_size: [256, 192]
-        task_sp_list: ['pos_embed']
-        # type_embed: True
-
-    label_adapter:  # for supervised training, the label adapter's outputs are unused
-      type: dense_labeling_adapter
-      kwargs:
-        pretrained: True
-        stride_level: 1
-        in_chans: 14  # class num
-        learnable_pos: False
-        test_pos_mode: False
-        img_size: [256, 192]
-        # type_embed: True
-        dim_class_embed: 64  # embedding shape for class embedding; TODO: change to text features
-        emb_padding_idx: 255
-        task_sp_list: ['pos_embed',
-                       'class_embed', ]
-
-    # fixed kwargs of the projector should match those in the adapter; hidden_dim,
-    # patch_size, in_chans, stride_level are set in the solver - create_modal
-    patch_proj:
-      type: rgb_projector
-      kwargs:
-        loss_cfg:
-          type: MaskedMSELoss
-          kwargs:
-            stride: 1
-            norm_pix_loss: True
-            pix_loss: True
-            pix_loss_weight: 1.
-            norm_pix_loss_weight: 1.
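The pose tasks regress `GaussianHeatMap` targets at `heatmap_size: [48, 64]` for an `image_size: [192, 256]` input, i.e. a 4x stride. A sketch of the standard target generation those options name; `sigma` is an assumption, since the repo's value does not appear in this hunk:

```python
import numpy as np

def gaussian_heatmap(joint_xy, image_size=(192, 256), heatmap_size=(48, 64), sigma=2.0):
    """joint_xy: (x, y) in input-image pixels; sizes are (W, H) as in the config.
    Returns one (H, W) target map with a Gaussian peak at the joint."""
    stride = image_size[0] / heatmap_size[0]            # 4x downsample here
    cx, cy = joint_xy[0] / stride, joint_xy[1] / stride
    xs = np.arange(heatmap_size[0], dtype=np.float32)           # (W,)
    ys = np.arange(heatmap_size[1], dtype=np.float32)[:, None]  # (H, 1)
    # unnormalized Gaussian peaked at the downsampled joint location
    return np.exp(-((xs - cx) ** 2 + (ys - cy) ** 2) / (2 * sigma ** 2))
```

One map is built per joint (17 for COCO/H36M, 14 for AIC), matching `num_output_channels`; `flip_pairs` in the aic `data_cfg` tells the flip augmentation which channel indices to swap when the image is mirrored.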
-
-    label_proj:
-      type: dense_labeling_projector
-      kwargs:
-        task_sp_list: ['post_mul_norm',
-                       'upsample_network',
-                       'loss_fn',
-                       'text_features', ]
-        emb_padding_idx: 255  # should match the value in the input adapter
-        upsampling: upconv_down_4  # upsample the label to the same size as the input
-        learnable_class_embed: False  # True: learnable class embedding, very similar to fc; False: alignment with text features
-        pre_proj_type: text_embed_first_mul_second_inter  # contrast decoded text tokens with the label patch tokens
-        post_mul_norm: True
-        replace_post_mul_norm: False  # replace the post_mul_norm (LN) with a linear layer
-        translate_weight_scale: 1  # scale factor for the translate weight; NOTE: consider whether to share
-                                   # one scale across datasets in joint training
-        text_prototype: True  # extract text features
-        pre_extracted: True  # when pre_extracted is True, the model only loads pre-extracted features
-        cls_loss_branch: True
-        description_dict_name: checked_pose_aic_name  # only valid when text_prototype is True
-        task: pose
-        l2_norm_debuged: True
-        # upsample_before_product: True
-        upsample_hidden_dim: 256  # dim of hidden features in the upsampling network
-        loss_cfg:
-          type: POS_FocalDiceLoss_bce_cls_emb
-          kwargs:
-            target_type: GaussianHeatMap
-            cfg:
-              num_classes: 14
-              deep_supervision: True
-              ignore_blank: False
-              class_weight: 0.001
-              dice_weight: 0.0
-              mask_weight: 1.0
-              redundant_queries: 1
-              dec_layers: 9
-              sample_weight: [0.98064613, 0.977893565, 0.97715356, 0.98064613, 0.977893565,
-                              0.97715356, 0.9594528200000001, 0.85703431, 0.7504981850000001,
-                              0.9594528200000001, 0.85703431, 0.7504981850000001, 0.97149646, 0.98605877]
-              eos_coef: 0.1
-
-    decoder:
-      type: UniHCPv2_Head
-      kwargs:
-        predictor: 'mae'
-        task: recons_rgb
-        modality_share_list: ['predictor.mask_token']
-        task_sp_list: [  # 'predictor.text_features',
-                       'predictor.query_embed_patch',
-                       'predictor.query_embed_label',
-                       # 'predictor.mask_token',
-                       # 'predictor.text_pe',
-                       'predictor.class_embed', 'predictor.fc_bias',  # useless in unihcpv2
-                      ]  # a wrong list can cause .cuda() to hang without an error
-        loss_weight: 1.0
-        transformer_predictor_cfg:
-          hidden_dim: 256
-          num_queries: 20  # useless in unihcpv2
-          nheads: 8
-          dim_feedforward: 2048
-          dec_layers: 9  # 6 layers were used for debugging; 9 for training
-          pre_norm: False  # whether to use pre_norm or post_norm in (self-attn, FFN)
-          arch: fan_in  # fan_in weight init
-          enforce_input_project: False  # placeholder, useless in unihcpv2
-          mask_on: False  # placeholder, useless in unihcpv2
-          intermediate_output: True
-          num_feature_levels: 1  # placeholder, useless in unihcpv2
-          cross_pos_embed: anchor  # use adaptive pos2d; should always be "anchor" in unihcpv2
-          cls_out_dim: 1  # placeholder, useless in unihcpv2
-          patch_pos_mode: False  # mode for generating pos_embed for patch tokens in the decoder:
-          # given the fixed self.query_embed_patch (which has the same shape as that in the adapter),
-          # repeat(batch_size, 1, 1)
-          label_pos_mode: False
-          # currently, we put the class text after the decoder
-          # fixed_class_embed_cfg:
-          #   pre_extracted: True
-          #   description_dict_name: 'cihp_name'
-          # fixed_class_embed_LN: True # whether to use LN for the fixed class embedding before adding it to the decoder input
-          self_attn_mask_type: full # full attention
-          # type of mask for self-attention,
-          # shape [patch_tokens(rgb), label_tokens(sparse_labeling), fixed text tokens]
-          detach_from_peddet: True # do not use the peddet_cfgs to modify the model structure
-          label_ffn_pre_norm: False # whether to use pre_norm in the decoder
-          label_ffn_zero_gated: False # whether to use a zero-gated FFN in the decoder
-          adding_per_layer_pe: True # whether to add per-layer pe to the input of each decoder layer
-          use_adapt_pos2d: True
-          # use_adapt_pos1d: True # not effective for 2d tasks
-          # no_mask_embed: True
-          # learnable_class_embed: True
-        loss_cfg:
-          type: CEL_Sigmoid
-
-  10:
-    name: h36m_pose_256x256
-    loss_weight: 3192
-    gres_ratio: 2 # int, > 0, = Task_GPUs / (world_Size/sum(all_gres_ratios))
-    backbone:
-      type: vit_base_patch16_mask
-      kwargs:
-        task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] # a wrong list can cause .cuda() to hang without raising an error
-        pretrained: True
-        lms_checkpoint_train: fairscale
-        window: False
-        test_pos_mode: False # when torch.compile is True, this should be False
-        learnable_pos: True
-        drop_path_rate: 0.2
-        img_size: 1344
-        num_encoded_tokens: 192
-        vis_patch_token_ratio: 1
-        vis_label_token_ratio: 0.
-
-    dataset:
-      type: COCOPosDatasetDev
-      kwargs:
-        ann_file: openmmlab:s3://openmmlab/datasets/pose/h36m/processed/annotation_body2d/h36m_coco_train.json
-        img_prefix: openmmlab:s3://openmmlab/datasets/pose/h36m/processed/images/
-        use_udp: True
-        data_use_ratio: 1
-        data_cfg: {
-          'image_size': [ 256, 256 ],
-          'heatmap_size': [ 64, 64 ], # originally, 'heatmap_size': [48, 64]
-          'num_output_channels': 17,
-          'num_joints': 17,
-          'dataset_channel': [ [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ], ],
-          'inference_channel': [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ],
-          'soft_nms': False,
-          'nms_thr': 1.0,
-          'oks_thr': 0.9,
-          'vis_thr': 0.2,
-          'use_gt_bbox': True,
-          'det_bbox_thr': 0.0,
-          'bbox_file': './COCO_val2017_detections_AP_H_56_person.json'
-        }
-    sampler:
-      batch_size: 132 # per card
-      shuffle_strategy: 1
-
-    patch_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256 # project to 256 dim for decoder
-        modality: rgb # patch modality
-        # task_sp_list: ['mask_map']
-
-    label_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256 # project to 256 dim for decoder
-        modality: dense_labeling # label modality
-
-    patch_adapter:
-      type: rgb_adapter # change to adapter_rgb
-      kwargs:
-        pretrained: True
-        stride_level: 1
-        in_chans: 3 # 3 for rgb
-        learnable_pos: False # fixed position embedding, redundant parameter
-        test_pos_mode: False # PE parameters are interpolated from MAE to 'img_size'/16, then repeated via repeat(batch_size, 1, 1)
-        img_size: [ 256, 256 ]
-        task_sp_list: [ 'pos_embed' ]
-        # type_embed: True
-
-    label_adapter: # for supervised training, the output of the label adapter is unused
-      type: dense_labeling_adapter
-      kwargs:
-        pretrained: True
-        stride_level: 1
-        in_chans: 17 # class num
-        learnable_pos: False
-        test_pos_mode: False
-        img_size: [ 256, 256 ]
-        # type_embed: True
-        dim_class_embed: 64 # embedding dim for the class embedding. TODO: change to text features
-        emb_padding_idx: 255 #
-        task_sp_list: [ 'pos_embed',
-                        'class_embed', ]
-
-    # fixed kwargs of the projector, which should match those in the adapter; e.g.,
-    # hidden_dim, patch_size, in_chans, stride_level are set in the solver - create_modal
-    patch_proj:
-      type: rgb_projector
-      kwargs:
-        loss_cfg:
-          type: MaskedMSELoss
-          kwargs:
-            stride: 1
-            norm_pix_loss: True
-            pix_loss: True
-            pix_loss_weight: 1.
-            norm_pix_loss_weight: 1.
-
-    label_proj:
-      type: dense_labeling_projector
-      kwargs:
-        # modality_share_list: [ 'upsample_network', ]
-        task_sp_list: [ 'post_mul_norm',
-                        'post_mul_norm_cls',
-                        'loss_fn',
-                        'upsample_network',
-                        'text_features', ]
-        emb_padding_idx: 255 # should match the padding index in the input adapter
-        upsampling: upconv_down_4 # upsample the label to the same size as the input
-        learnable_class_embed: False # True: learnable class embedding, very similar to fc; False: alignment with text features
-        pre_proj_type: text_embed_first_mul_second_inter # use decoded text tokens to contrast with the label patch tokens
-        post_mul_norm: True
-        # post_mul_norm_cls: True
-        replace_post_mul_norm: False # replace the post_mul_norm (LN) with a linear layer
-        translate_weight_scale: 1 # scales the translate weight relative to the original value (1); NOTE that we should
-                                  # consider whether to use a shared or per-dataset scale in joint training
-        text_prototype: True # extract text features
-        pre_extracted: True # when True, the model only loads the pre-extracted features
-        cls_loss_branch: True
-        description_dict_name: checked_pose_h3m6_name # only valid when text_prototype is True
-        # upsample_before_product: True # temporary solution, specific "upsample networks"
-        upsample_hidden_dim: 256
-        l2_norm_debuged: True
-
-        # text_mlp: True
-        task: pose
-        # no_mask_embed: True
-        # learnable_class_embed: True
-        loss_cfg:
-          type: POS_FocalDiceLoss_bce_cls_emb
-          kwargs:
-            target_type: GaussianHeatMap
-            cfg:
-              num_classes: 17
-              deep_supervision: True
-              ignore_blank: False
-              class_weight: 0.001
-              dice_weight: 0.0
-              mask_weight: 1.0
-              redundant_queries: 1
-              dec_layers: 9
-              sample_weight: [ 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1. ]
-              eos_coef: 0.1
-
-    decoder:
-      type: UniHCPv2_Head
-      kwargs:
-        predictor: 'mae'
-        task: recons_rgb
-        modality_share_list: [ 'predictor.mask_token' ]
-        task_sp_list: [ # 'predictor.text_features',
-                        'predictor.query_embed_patch',
-                        'predictor.query_embed_label',
-                        # 'predictor.mask_token',
-                        # 'predictor.text_pe',
-                        'predictor.class_embed', 'predictor.fc_bias', # useless in unihcpv2
-                      ] # a wrong list can cause .cuda() to hang without raising an error
-        loss_weight: 1.0
-        transformer_predictor_cfg:
-          hidden_dim: 256
-          num_queries: 20 # useless in unihcpv2
-          nheads: 8
-          dim_feedforward: 2048
-          dec_layers: 9
-          pre_norm: False # whether to use pre_norm (True) or post_norm (False) in (self-attn, FFN)
-          arch: fan_in # fan_in weight initialization
-          enforce_input_project: False # placeholder, useless in unihcpv2
-          mask_on: False # placeholder, useless in unihcpv2
-          intermediate_output: True
-          num_feature_levels: 1 # placeholder, useless in unihcpv2
-          cross_pos_embed: anchor # use adaptive pos2d; should always be "anchor" in unihcpv2
-          cls_out_dim: 1 # placeholder, useless in unihcpv2
-          patch_pos_mode: False # mode used to generate pos_embed for patch tokens in the decoder:
- # given the fixed self.query_embed_patch (which has a same shape of that in adapter), - # repeat(batchsize, 1,1) - label_pos_mode: False - # currently, we put the class text after the decoder - # fixed_class_embed_cfg: - # pre_extracted: True - # description_dict_name: 'cihp_name' - # fixed_class_embed_LN: True # whether to use LN for fixed class embedding before adding to the input of decoder - self_attn_mask_type: full # full for all attention - # type of mask for self-attention, - # shape [patch_tokens(rgb), label_tokens(sparse_labeling), fixed text tokens] - detach_from_peddet: True # Not use the peddet_cfgs to modify the model structure - label_ffn_pre_norm: False # whether to use pre_norm in decoder. - label_ffn_zero_gated: False # whether to use zero-gated FFN in decoder. - adding_per_layer_pe: True # whether to add per-layer pe to the input of each decoder layer - use_adapt_pos2d: True - # use_adapt_pos1d: True # not effective for 2d tasks - # no_mask_embed: True - # learnable_class_embed: True - loss_cfg: - type: CEL_Sigmoid - - 11: - name: posetrack_256x192 - loss_weight: 12335 - gres_ratio: 2 # int, > 0, = Task_GPUs / (world_Size/sum(all_gres_ratios)) - backbone: - type: vit_base_patch16_mask - kwargs: - task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] # wrong list would somehow cause .cuda() stuck without error - pretrained: True - lms_checkpoint_train: fairscale - window: False - test_pos_mode: False # when torch.compile is True, this should be False - learnable_pos: True - drop_path_rate: 0.2 - img_size: 1344 - num_encoded_tokens: 192 - vis_patch_token_ratio: 1 - vis_label_token_ratio: 0. - - dataset: - type: MultiPoseDatasetDev - kwargs: - ann_file: openmmlab:s3://openmmlab/datasets/pose/PoseChallenge2018/annotations/posetrack18_train.json - img_prefix: openmmlab:s3://openmmlab/datasets/pose/PoseChallenge2018/ - use_udp: True - dataset_name: 'posetrack' - data_cfg: { - 'image_size':[192, 256], - 'heatmap_size':[48, 64], - 'num_output_channels': 15, - 'num_joints': 15, - 'dataset_channel': [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],], - 'inference_channel': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], - - 'flip_pairs': [[3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], ], - 'upper_body_ids': [0, 1, 2, 3, 4, 5, 6, 7, 8,], - 'lower_body_ids': [9, 10, 11, 12, 13, 14], - 'use_different_joint_weights': False, - 'joint_weights': [1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, 1.5], - - 'soft_nms': False, - 'nms_thr': 1.0, - 'oks_thr': 0.9, - 'vis_thr': 0.2, - 'use_gt_bbox': True, - 'det_bbox_thr': 0.0, - 'bbox_file': 'COCO_val2017_detections_AP_H_56_person.json' - } - sampler: - batch_size: 170 # per card - shuffle_strategy: 1 - - patch_neck: - type: MAEdecoder_proj_neck - kwargs: - mask_dim: 256 # project to 256 dim for decoder - modality: rgb # patch modality - # task_sp_list: ['mask_map'] - - label_neck: - type: MAEdecoder_proj_neck - kwargs: - mask_dim: 256 # project to 256 dim for decoder - modality: dense_labeling # label modality - - patch_adapter: - type: rgb_adapter # change to adapter_rgb - kwargs: - pretrained: True - stride_level: 1 - in_chans: 3 # 3 for rgb - learnable_pos: False # fixed position embedding, redundant parameter - test_pos_mode: False # PE parameters are interpolated from mae to 'img_size'/16, then use repeat(batchsize, 1, 1) - img_size: [ 256, 192 ] - task_sp_list: [ 'pos_embed' ] - # type_embed: True - - label_adapter: # for supvervised training, the results of label adapter is useless - type: dense_labeling_adapter - 
kwargs: - pretrained: True - stride_level: 1 - in_chans: 15 # class num - learnable_pos: False - test_pos_mode: False - img_size: [ 256, 192 ] - # type_embed: True - dim_class_embed: 64 # embedding shape for class embedding. TODO: chance to text features - emb_padding_idx: 255 # - task_sp_list: [ 'pos_embed', - 'class_embed',] - - # fix kwargs of the project, which should be the same as that in the adapter, such as - # hidden_dim, patch_size, in_chans, stride_level are set in the solver - create_modal - patch_proj: - type: rgb_projector - kwargs: - loss_cfg: - type: MaskedMSELoss - kwargs: - stride: 1 - norm_pix_loss: True - pix_loss: True - pix_loss_weight: 1. - norm_pix_loss_weight: 1. - - - label_proj: - type: dense_labeling_projector - kwargs: -# modality_share_list: [ 'upsample_network',] - task_sp_list: [ 'post_mul_norm', - 'post_mul_norm_cls', - 'loss_fn', - 'upsample_network', - 'text_features',] - emb_padding_idx: 255 # should be the same with that in the input adapter - upsampling: upconv_down_4 # upsampling the label to the same size as the input - learnable_class_embed: False # True: learnbale class embedding, very similar to fc; False: aligment with text features - pre_proj_type: text_embed_first_mul_second_inter # use decoded text tokens to contrast with the label patch tokens - post_mul_norm: True - #post_mul_norm_cls: True - replace_post_mul_norm: False # replace the post_mul_norm(LN) with a linear layer - translate_weight_scale: 1 # scale the translate weight to 6 times of the original value(1), NOTE that we should - # consider whether to use a same or unique scale for different datasets in joint training - text_prototype: True # extract text features - pre_extracted: True # when pre_extracted is set to be True, the model will only load the pre-extracted features. 
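-# How pre_proj_type / post_mul_norm / translate_weight_scale above fit together,
-# as a sketch with illustrative names (assuming decoded label tokens of shape
-# (B, N, C) and K pre-extracted text prototypes of shape (K, C)):
-#
-#   import torch.nn.functional as F
-#   sim = F.normalize(label_tokens, dim=-1) @ F.normalize(text_feats, dim=-1).T  # (B, N, K)
-#   sim = post_mul_norm(sim) * translate_weight_scale  # LN over K classes, then a fixed scale
-#   # sim is then reshaped to (B, K, h, w) and upsampled to the heatmap resolution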
- cls_loss_branch: True - description_dict_name: checked_pose_posetrack_name # this key is only valid when we set text_prototype to be True - #upsample_before_product: True # Temperary solution, specific "upsample networks" - upsample_hidden_dim: 256 - l2_norm_debuged: True - - #text_mlp: True - task: pose - # no_mask_embed: True - # learnable_class_embed: True - loss_cfg: - type: POS_FocalDiceLoss_bce_cls_emb - kwargs: - target_type: GaussianHeatMap - cfg: - num_classes: 15 - deep_supervision: True - ignore_blank: False - class_weight: 0.001 - dice_weight: 0.0 - mask_weight: 1.0 - redundant_queries: 1 - dec_layers: 9 - sample_weight: [ 0.81831569, 0.75692071, 0.74175951, - 0.789882655, 0.789882655, 0.659771425, 0.659771425, 0.625614735, - 0.625614735, 0.737772405, 0.737772405, 0.665022735, 0.665022735, - 0.59563039, 0.5956303 - ] - eos_coef: 0.1 - - decoder: - type: UniHCPv2_Head - kwargs: - predictor: 'mae' - task: recons_rgb - modality_share_list: ['predictor.mask_token'] - task_sp_list: [ # 'predictor.text_features', - 'predictor.query_embed_patch', - 'predictor.query_embed_label', - # 'predictor.mask_token', - # 'predictor.text_pe', - 'predictor.class_embed', 'predictor.fc_bias', # useless in unihcpv2 - ] # wrong list would somehow cause .cuda() stuck without error - loss_weight: 1.0 - transformer_predictor_cfg: - hidden_dim: 256 - num_queries: 20 # useless in unihcpv2 - nheads: 8 - dim_feedforward: 2048 - dec_layers: 9 # currently 6 layers for debug but we should use 9 layers afterwards - pre_norm: False # indicate to use pre_norm or post_norm in (self-attn, FFN) - arch: fan_in # fan_in type to init the weights - enforce_input_project: False # placeholder, useless in unihcpv2 - mask_on: False # placeholder, useless in unihcpv2 - intermediate_output: True - num_feature_levels: 1 # placeholder, useless in unihcpv2 - cross_pos_embed: anchor # indicate to use adaptive pose2d. should always be "anchor" in unihcpv2 - cls_out_dim: 1 # placeholder, useless in unihcpv2 - patch_pos_mode: False # Mode to generate pos_embed for patch tokens in decoder. - # given the fixed self.query_embed_patch (which has a same shape of that in adapter), - # repeat(batchsize, 1,1) - label_pos_mode: False - # currently, we put the class text after the decoder - # fixed_class_embed_cfg: - # pre_extracted: True - # description_dict_name: 'cihp_name' - # fixed_class_embed_LN: True # whether to use LN for fixed class embedding before adding to the input of decoder - self_attn_mask_type: full # full for all attention - # type of mask for self-attention, - # shape [patch_tokens(rgb), label_tokens(sparse_labeling), fixed text tokens] - detach_from_peddet: True # Not use the peddet_cfgs to modify the model structure - label_ffn_pre_norm: False # whether to use pre_norm in decoder. - label_ffn_zero_gated: False # whether to use zero-gated FFN in decoder. 
- adding_per_layer_pe: True # whether to add per-layer pe to the input of each decoder layer - use_adapt_pos2d: True -# use_adapt_pos1d: True # not effective for 2d tasks -# no_mask_embed: True -# learnable_class_embed: True - loss_cfg: - type: CEL_Sigmoid - - 12: - name: jrdb_256x192 - loss_weight: 8223 - gres_ratio: 2 # int, > 0, = Task_GPUs / (world_Size/sum(all_gres_ratios)) - backbone: - type: vit_base_patch16_mask - kwargs: - task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] # wrong list would somehow cause .cuda() stuck without error - pretrained: True - lms_checkpoint_train: fairscale - window: False - test_pos_mode: False # when torch.compile is True, this should be False - learnable_pos: True - drop_path_rate: 0.2 - img_size: 1344 - num_encoded_tokens: 192 - vis_patch_token_ratio: 1 - vis_label_token_ratio: 0. - - dataset: - type: MultiPoseDatasetDev - kwargs: - ann_file: Humans1:s3://HumanCentricModel/pose_public/JRDB2019/train.json - img_prefix: Humans1:s3://HumanCentricModel/pose_public/JRDB2022/images/ - use_udp: True - dataset_name: 'JRDB2022' - data_cfg: { - 'image_size':[192, 256], - 'heatmap_size':[48, 64], - 'num_output_channels': 17, - 'num_joints': 17, - 'dataset_channel': [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],], - 'inference_channel': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,], - 'flip_pairs': [[2, 5], [3, 6], [4, 7], [8, 11], [9, 12], [10, 13], ], - 'upper_body_ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 14, 15, 16,], - 'lower_body_ids': [9, 10, 12, 13], - 'use_different_joint_weights': False, - 'joint_weights': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - - 'soft_nms': False, - 'nms_thr': 1.0, - 'oks_thr': 0.9, - 'vis_thr': 0.2, - 'use_gt_bbox': True, - 'det_bbox_thr': 0.0, - 'bbox_file': 'COCO_val2017_detections_AP_H_56_person.json' - - } - sampler: - batch_size: 170 # per card - shuffle_strategy: 1 - - patch_neck: - type: MAEdecoder_proj_neck - kwargs: - mask_dim: 256 # project to 256 dim for decoder - modality: rgb # patch modality - # task_sp_list: ['mask_map'] - - label_neck: - type: MAEdecoder_proj_neck - kwargs: - mask_dim: 256 # project to 256 dim for decoder - modality: dense_labeling # label modality - - patch_adapter: - type: rgb_adapter # change to adapter_rgb - kwargs: - pretrained: True - stride_level: 1 - in_chans: 3 # 3 for rgb - learnable_pos: False # fixed position embedding, redundant parameter - test_pos_mode: False # PE parameters are interpolated from mae to 'img_size'/16, then use repeat(batchsize, 1, 1) - img_size: [ 256, 192 ] - task_sp_list: [ 'pos_embed' ] - # type_embed: True - - label_adapter: # for supvervised training, the results of label adapter is useless - type: dense_labeling_adapter - kwargs: - pretrained: True - stride_level: 1 - in_chans: 17 # class num - learnable_pos: False - test_pos_mode: False - img_size: [ 256, 192 ] - # type_embed: True - dim_class_embed: 64 # embedding shape for class embedding. TODO: chance to text features - emb_padding_idx: 255 # - task_sp_list: [ 'pos_embed', - 'class_embed', ] - - # fix kwargs of the project, which should be the same as that in the adapter, such as - # hidden_dim, patch_size, in_chans, stride_level are set in the solver - create_modal - patch_proj: - type: rgb_projector - kwargs: - loss_cfg: - type: MaskedMSELoss - kwargs: - stride: 1 - norm_pix_loss: True - pix_loss: True - pix_loss_weight: 1. - norm_pix_loss_weight: 1. 
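-# Resolution bookkeeping for the label branch below: ViT patch tokens live at
-# stride 16 (a 192x256 crop -> 12x16 tokens), while the supervision heatmaps are
-# at stride 4 ('heatmap_size': [48, 64]); `upconv_down_4` plausibly names an
-# upsampling head whose output sits at 1/4 input resolution, i.e. 12x16 -> 48x64.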
- - - label_proj: - type: dense_labeling_projector - kwargs: - # modality_share_list: [ 'upsample_network',] - task_sp_list: [ 'post_mul_norm', - 'post_mul_norm_cls', - 'loss_fn', - 'upsample_network', - 'text_features', ] - emb_padding_idx: 255 # should be the same with that in the input adapter - upsampling: upconv_down_4 # upsampling the label to the same size as the input - learnable_class_embed: False # True: learnbale class embedding, very similar to fc; False: aligment with text features - pre_proj_type: text_embed_first_mul_second_inter # use decoded text tokens to contrast with the label patch tokens - post_mul_norm: True - #post_mul_norm_cls: True - replace_post_mul_norm: False # replace the post_mul_norm(LN) with a linear layer - translate_weight_scale: 1 # scale the translate weight to 6 times of the original value(1), NOTE that we should - # consider whether to use a same or unique scale for different datasets in joint training - text_prototype: True # extract text features - pre_extracted: True # when pre_extracted is set to be True, the model will only load the pre-extracted features. - cls_loss_branch: True - description_dict_name: checked_pose_jrdb_name # this key is only valid when we set text_prototype to be True - #upsample_before_product: True # Temperary solution, specific "upsample networks" - upsample_hidden_dim: 256 - l2_norm_debuged: True - - #text_mlp: True - task: pose - # no_mask_embed: True - # learnable_class_embed: True - loss_cfg: - type: POS_FocalDiceLoss_bce_cls_emb - kwargs: - target_type: GaussianHeatMap - cfg: - num_classes: 17 - deep_supervision: True - ignore_blank: False - class_weight: 0.001 - dice_weight: 0.0 - mask_weight: 1.0 - redundant_queries: 1 - dec_layers: 9 - sample_weight: [ - 0.90384634, 0.82524231, 0.89927266, 0.90945538, 0.92796942, 0.89927266, - 0.90945538, 0.92796942, 0.9912784, 0.84353379, 0.97898463, 0.9912784, - 0.84353379, 0.97898463, 0.97418356, 0.94284516, 0.93372039, - ] - eos_coef: 0.1 - - decoder: - type: UniHCPv2_Head - kwargs: - predictor: 'mae' - task: recons_rgb - modality_share_list: [ 'predictor.mask_token' ] - task_sp_list: [ # 'predictor.text_features', - 'predictor.query_embed_patch', - 'predictor.query_embed_label', - # 'predictor.mask_token', - # 'predictor.text_pe', - 'predictor.class_embed', 'predictor.fc_bias', # useless in unihcpv2 - ] # wrong list would somehow cause .cuda() stuck without error - loss_weight: 1.0 - transformer_predictor_cfg: - hidden_dim: 256 - num_queries: 20 # useless in unihcpv2 - nheads: 8 - dim_feedforward: 2048 - dec_layers: 9 # currently 6 layers for debug but we should use 9 layers afterwards - pre_norm: False # indicate to use pre_norm or post_norm in (self-attn, FFN) - arch: fan_in # fan_in type to init the weights - enforce_input_project: False # placeholder, useless in unihcpv2 - mask_on: False # placeholder, useless in unihcpv2 - intermediate_output: True - num_feature_levels: 1 # placeholder, useless in unihcpv2 - cross_pos_embed: anchor # indicate to use adaptive pose2d. should always be "anchor" in unihcpv2 - cls_out_dim: 1 # placeholder, useless in unihcpv2 - patch_pos_mode: False # Mode to generate pos_embed for patch tokens in decoder. 
- # given the fixed self.query_embed_patch (which has a same shape of that in adapter), - # repeat(batchsize, 1,1) - label_pos_mode: False - # currently, we put the class text after the decoder - # fixed_class_embed_cfg: - # pre_extracted: True - # description_dict_name: 'cihp_name' - # fixed_class_embed_LN: True # whether to use LN for fixed class embedding before adding to the input of decoder - self_attn_mask_type: full # full for all attention - # type of mask for self-attention, - # shape [patch_tokens(rgb), label_tokens(sparse_labeling), fixed text tokens] - detach_from_peddet: True # Not use the peddet_cfgs to modify the model structure - label_ffn_pre_norm: False # whether to use pre_norm in decoder. - label_ffn_zero_gated: False # whether to use zero-gated FFN in decoder. - adding_per_layer_pe: True # whether to add per-layer pe to the input of each decoder layer - use_adapt_pos2d: True - # use_adapt_pos1d: True # not effective for 2d tasks - # no_mask_embed: True - # learnable_class_embed: True - loss_cfg: - type: CEL_Sigmoid - - 13: - name: MHP_256x192 - loss_weight: 3192 - gres_ratio: 1 # int, > 0, = Task_GPUs / (world_Size/sum(all_gres_ratios)) - backbone: - type: vit_base_patch16_mask - kwargs: - task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] # wrong list would somehow cause .cuda() stuck without error - pretrained: True - lms_checkpoint_train: fairscale - window: False - test_pos_mode: False # when torch.compile is True, this should be False - learnable_pos: True - drop_path_rate: 0.2 - img_size: 1344 - num_encoded_tokens: 192 - vis_patch_token_ratio: 1 - vis_label_token_ratio: 0. - - dataset: - type: MultiPoseDatasetDev - kwargs: - ann_file: Humans1:s3://HumanCentricModel/pose_public/pose_MHPv2/train.json - img_prefix: openmmlab:s3://openmmlab/datasets/pose/LV-MHP-v2/train/images - use_udp: True - dataset_name: 'mhp' - data_cfg: { - 'image_size':[192, 256], - 'heatmap_size':[48, 64], - 'num_output_channels': 16, - 'num_joints': 16, - 'dataset_channel': [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,],], - 'inference_channel': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,], - - 'flip_pairs': [[0, 5], [1, 4], [2, 3], [10, 15], [11, 14], [12, 13], ], - 'upper_body_ids': [7, 8, 9, 10, 11, 12, 13, 14, 15], - 'lower_body_ids': [0, 1, 2, 3, 4, 5, 6], - 'use_different_joint_weights': False, - 'joint_weights': [1.5, 1.2, 1., 1., 1.2, 1.5, 1., 1., 1., 1., 1.5, 1.2, 1., 1., 1.2, 1.5], - - 'soft_nms': False, - 'nms_thr': 1.0, - 'oks_thr': 0.9, - 'vis_thr': 0.2, - 'use_gt_bbox': True, - 'det_bbox_thr': 0.0, - 'bbox_file': 'COCO_val2017_detections_AP_H_56_person.json' - - } - sampler: - batch_size: 132 # per card - shuffle_strategy: 1 - - patch_neck: - type: MAEdecoder_proj_neck - kwargs: - mask_dim: 256 # project to 256 dim for decoder - modality: rgb # patch modality - # task_sp_list: ['mask_map'] - - label_neck: - type: MAEdecoder_proj_neck - kwargs: - mask_dim: 256 # project to 256 dim for decoder - modality: dense_labeling # label modality - - patch_adapter: - type: rgb_adapter # change to adapter_rgb - kwargs: - pretrained: True - stride_level: 1 - in_chans: 3 # 3 for rgb - learnable_pos: False # fixed position embedding, redundant parameter - test_pos_mode: False # PE parameters are interpolated from mae to 'img_size'/16, then use repeat(batchsize, 1, 1) - img_size: [ 256, 192 ] - task_sp_list: [ 'pos_embed' ] - # type_embed: True - - label_adapter: # for supvervised training, the results of label adapter is useless - type: dense_labeling_adapter - kwargs: - 
pretrained: True - stride_level: 1 - in_chans: 16 # class num - learnable_pos: False - test_pos_mode: False - img_size: [ 256, 192 ] - # type_embed: True - dim_class_embed: 64 # embedding shape for class embedding. TODO: chance to text features - emb_padding_idx: 255 # - task_sp_list: [ 'pos_embed', - 'class_embed',] - - # fix kwargs of the project, which should be the same as that in the adapter, such as - # hidden_dim, patch_size, in_chans, stride_level are set in the solver - create_modal - patch_proj: - type: rgb_projector - kwargs: - loss_cfg: - type: MaskedMSELoss - kwargs: - stride: 1 - norm_pix_loss: True - pix_loss: True - pix_loss_weight: 1. - norm_pix_loss_weight: 1. - - - label_proj: - type: dense_labeling_projector - kwargs: -# modality_share_list: [ 'upsample_network',] - task_sp_list: [ 'post_mul_norm', - 'post_mul_norm_cls', - 'loss_fn', - 'upsample_network', - 'text_features',] - emb_padding_idx: 255 # should be the same with that in the input adapter - upsampling: upconv_down_4 # upsampling the label to the same size as the input - learnable_class_embed: False # True: learnbale class embedding, very similar to fc; False: aligment with text features - pre_proj_type: text_embed_first_mul_second_inter # use decoded text tokens to contrast with the label patch tokens - post_mul_norm: True - #post_mul_norm_cls: True - replace_post_mul_norm: False # replace the post_mul_norm(LN) with a linear layer - translate_weight_scale: 1 # scale the translate weight to 6 times of the original value(1), NOTE that we should - # consider whether to use a same or unique scale for different datasets in joint training - text_prototype: True # extract text features - pre_extracted: True # when pre_extracted is set to be True, the model will only load the pre-extracted features. 
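-# The per-joint sample_weight in the loss_cfg below rescales each keypoint's loss
-# term before reduction; a sketch of the assumed usage (illustrative, not the
-# exact loss code):
-#
-#   # heatmap_loss: (B, K) per-joint loss; sample_weight: list of K floats
-#   weighted = heatmap_loss * heatmap_loss.new_tensor(sample_weight)
-#   loss = weighted.mean()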
- cls_loss_branch: True - description_dict_name: checked_pose_mhp_name # this key is only valid when we set text_prototype to be True - #upsample_before_product: True # Temperary solution, specific "upsample networks" - upsample_hidden_dim: 256 - l2_norm_debuged: True - - #text_mlp: True - task: pose - # no_mask_embed: True - # learnable_class_embed: True - loss_cfg: - type: POS_FocalDiceLoss_bce_cls_emb - kwargs: - target_type: GaussianHeatMap - cfg: - num_classes: 16 - deep_supervision: True - ignore_blank: False - class_weight: 0.001 - dice_weight: 0.0 - mask_weight: 1.0 - redundant_queries: 1 - dec_layers: 9 - sample_weight: [ 0.463188095, 0.6055728499999999, 0.732992125, 0.732992125, 0.6055728499999999, - 0.463188095, 0.74209784, 0.92598716, 0.9642093, 0.98767263, - 0.67156195, 0.6861140800000001, 0.85427203, 0.85427203, 0.6861140800000001, - 0.67156195 - ] - eos_coef: 0.1 - - decoder: - type: UniHCPv2_Head - kwargs: - predictor: 'mae' - task: recons_rgb - modality_share_list: ['predictor.mask_token'] - task_sp_list: [ # 'predictor.text_features', - 'predictor.query_embed_patch', - 'predictor.query_embed_label', - # 'predictor.mask_token', - # 'predictor.text_pe', - 'predictor.class_embed', 'predictor.fc_bias', # useless in unihcpv2 - ] # wrong list would somehow cause .cuda() stuck without error - loss_weight: 1.0 - transformer_predictor_cfg: - hidden_dim: 256 - num_queries: 20 # useless in unihcpv2 - nheads: 8 - dim_feedforward: 2048 - dec_layers: 9 # currently 6 layers for debug but we should use 9 layers afterwards - pre_norm: False # indicate to use pre_norm or post_norm in (self-attn, FFN) - arch: fan_in # fan_in type to init the weights - enforce_input_project: False # placeholder, useless in unihcpv2 - mask_on: False # placeholder, useless in unihcpv2 - intermediate_output: True - num_feature_levels: 1 # placeholder, useless in unihcpv2 - cross_pos_embed: anchor # indicate to use adaptive pose2d. should always be "anchor" in unihcpv2 - cls_out_dim: 1 # placeholder, useless in unihcpv2 - patch_pos_mode: False # Mode to generate pos_embed for patch tokens in decoder. - # given the fixed self.query_embed_patch (which has a same shape of that in adapter), - # repeat(batchsize, 1,1) - label_pos_mode: False - # currently, we put the class text after the decoder - # fixed_class_embed_cfg: - # pre_extracted: True - # description_dict_name: 'cihp_name' - # fixed_class_embed_LN: True # whether to use LN for fixed class embedding before adding to the input of decoder - self_attn_mask_type: full # full for all attention - # type of mask for self-attention, - # shape [patch_tokens(rgb), label_tokens(sparse_labeling), fixed text tokens] - detach_from_peddet: True # Not use the peddet_cfgs to modify the model structure - label_ffn_pre_norm: False # whether to use pre_norm in decoder. - label_ffn_zero_gated: False # whether to use zero-gated FFN in decoder. 
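-# The next two flags inject positional information at every decoder layer;
-# use_adapt_pos2d builds a 2D sine-cosine embedding from token coordinates.
-# A common formulation, sketched with illustrative names:
-#
-#   import torch
-#   def pos2d_sincos(x, y, dim=128, temp=10000):
-#       # x, y: (N,) normalized token coordinates; returns (N, 2*dim)
-#       freqs = temp ** (torch.arange(0, dim, 2) / dim)
-#       fx, fy = x[:, None] / freqs, y[:, None] / freqs
-#       return torch.cat([fx.sin(), fx.cos(), fy.sin(), fy.cos()], dim=-1)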
- adding_per_layer_pe: True # whether to add per-layer pe to the input of each decoder layer - use_adapt_pos2d: True -# use_adapt_pos1d: True # not effective for 2d tasks -# no_mask_embed: True -# learnable_class_embed: True - loss_cfg: - type: CEL_Sigmoid - -# 14: -# name: pennaction_256x192 -# loss_weight: 2902 -# gres_ratio: 1 # int, > 0, = Task_GPUs / (world_Size/sum(all_gres_ratios)) -# backbone: -# type: vit_base_patch16_mask -# kwargs: -# task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] # wrong list would somehow cause .cuda() stuck without error -# pretrained: True -# lms_checkpoint_train: fairscale -# window: False -# test_pos_mode: False # when torch.compile is True, this should be False -# learnable_pos: True -# drop_path_rate: 0.2 -# img_size: 1344 -# num_encoded_tokens: 192 -# vis_patch_token_ratio: 1 -# vis_label_token_ratio: 0. -# -# dataset: -# type: MultiPoseDatasetDev -# kwargs: -# ann_file: Humans1:s3://HumanCentricModel/pose_public/pose_penn_action/train.json -# img_prefix: openmmlab:s3://openmmlab/datasets/pose/PENN/Penn_Action/frames -# use_udp: True -# dataset_name: 'penn_action' -# data_cfg: { -# 'image_size':[192, 256], -# 'heatmap_size':[48, 64], -# 'num_output_channels': 13, -# 'num_joints': 13, -# 'dataset_channel': [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,],], -# 'inference_channel': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,], -# -# 'flip_pairs': [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], ], -# 'upper_body_ids': [0, 1, 2, 3, 4, 5, 6, 7, 8], -# 'lower_body_ids': [9, 10, 11, 12], -# 'use_different_joint_weights': False, -# 'joint_weights': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,], -# -# 'soft_nms': False, -# 'nms_thr': 1.0, -# 'oks_thr': 0.9, -# 'vis_thr': 0.2, -# 'use_gt_bbox': True, -# 'det_bbox_thr': 0.0, -# 'bbox_file': 'COCO_val2017_detections_AP_H_56_person.json' -# } -# sampler: -# batch_size: 120 # per card -# shuffle_strategy: 1 -# -# patch_neck: -# type: MAEdecoder_proj_neck -# kwargs: -# mask_dim: 256 # project to 256 dim for decoder -# modality: rgb # patch modality -# # task_sp_list: ['mask_map'] -# -# label_neck: -# type: MAEdecoder_proj_neck -# kwargs: -# mask_dim: 256 # project to 256 dim for decoder -# modality: dense_labeling # label modality -# -# patch_adapter: -# type: rgb_adapter # change to adapter_rgb -# kwargs: -# pretrained: True -# stride_level: 1 -# in_chans: 3 # 3 for rgb -# learnable_pos: False # fixed position embedding, redundant parameter -# test_pos_mode: False # PE parameters are interpolated from mae to 'img_size'/16, then use repeat(batchsize, 1, 1) -# img_size: [ 256, 192 ] -# task_sp_list: [ 'pos_embed' ] -# # type_embed: True -# -# label_adapter: # for supvervised training, the results of label adapter is useless -# type: dense_labeling_adapter -# kwargs: -# pretrained: True -# stride_level: 1 -# in_chans: 13 # class num -# learnable_pos: False -# test_pos_mode: False -# img_size: [ 256, 192 ] -# # type_embed: True -# dim_class_embed: 64 # embedding shape for class embedding. TODO: chance to text features -# emb_padding_idx: 255 # -# task_sp_list: [ 'pos_embed', -# 'class_embed',] -# -# # fix kwargs of the project, which should be the same as that in the adapter, such as -# # hidden_dim, patch_size, in_chans, stride_level are set in the solver - create_modal -# patch_proj: -# type: rgb_projector -# kwargs: -# loss_cfg: -# type: MaskedMSELoss -# kwargs: -# stride: 1 -# norm_pix_loss: True -# pix_loss: True -# pix_loss_weight: 1. -# norm_pix_loss_weight: 1. 
-# -# -# label_proj: -# type: dense_labeling_projector -# kwargs: -## modality_share_list: [ 'upsample_network',] -# task_sp_list: [ 'post_mul_norm', -# 'post_mul_norm_cls', -# 'loss_fn', -# 'upsample_network', -# 'text_features',] -# emb_padding_idx: 255 # should be the same with that in the input adapter -# upsampling: upconv_down_4 # upsampling the label to the same size as the input -# learnable_class_embed: False # True: learnbale class embedding, very similar to fc; False: aligment with text features -# pre_proj_type: text_embed_first_mul_second_inter # use decoded text tokens to contrast with the label patch tokens -# post_mul_norm: True -# #post_mul_norm_cls: True -# replace_post_mul_norm: False # replace the post_mul_norm(LN) with a linear layer -# translate_weight_scale: 1 # scale the translate weight to 6 times of the original value(1), NOTE that we should -# # consider whether to use a same or unique scale for different datasets in joint training -# text_prototype: True # extract text features -# pre_extracted: True # when pre_extracted is set to be True, the model will only load the pre-extracted features. -# cls_loss_branch: True -# description_dict_name: uppen_action_pose # this key is only valid when we set text_prototype to be True -# #upsample_before_product: True # Temperary solution, specific "upsample networks" -# upsample_hidden_dim: 256 -# l2_norm_debuged: True -# -# #text_mlp: True -# task: pose -# # no_mask_embed: True -# # learnable_class_embed: True -# loss_cfg: -# type: POS_FocalDiceLoss_bce_cls_emb -# kwargs: -# target_type: GaussianHeatMap -# cfg: -# num_classes: 13 -# deep_supervision: True -# ignore_blank: False -# class_weight: 0.001 -# dice_weight: 0.0 -# mask_weight: 1.0 -# redundant_queries: 1 -# dec_layers: 9 -# sample_weight: [ 0.9304317, 0.7091321349999999, 0.7091321349999999, 0.7636155, 0.7636155, -# 0.72129652, 0.72129652, 0.786229165, 0.786229165, 0.842012585, -# 0.842012585, 0.77971057, 0.77971057 -# -# ] -# eos_coef: 0.1 -# -# decoder: -# type: UniHCPv2_Head -# kwargs: -# predictor: 'mae' -# task: recons_rgb -# modality_share_list: ['predictor.mask_token'] -# task_sp_list: [ # 'predictor.text_features', -# 'predictor.query_embed_patch', -# 'predictor.query_embed_label', -# # 'predictor.mask_token', -# # 'predictor.text_pe', -# 'predictor.class_embed', 'predictor.fc_bias', # useless in unihcpv2 -# ] # wrong list would somehow cause .cuda() stuck without error -# loss_weight: 1.0 -# transformer_predictor_cfg: -# hidden_dim: 256 -# num_queries: 20 # useless in unihcpv2 -# nheads: 8 -# dim_feedforward: 2048 -# dec_layers: 9 # currently 6 layers for debug but we should use 9 layers afterwards -# pre_norm: False # indicate to use pre_norm or post_norm in (self-attn, FFN) -# arch: fan_in # fan_in type to init the weights -# enforce_input_project: False # placeholder, useless in unihcpv2 -# mask_on: False # placeholder, useless in unihcpv2 -# intermediate_output: True -# num_feature_levels: 1 # placeholder, useless in unihcpv2 -# cross_pos_embed: anchor # indicate to use adaptive pose2d. should always be "anchor" in unihcpv2 -# cls_out_dim: 1 # placeholder, useless in unihcpv2 -# patch_pos_mode: False # Mode to generate pos_embed for patch tokens in decoder. 
-#          # given the fixed self.query_embed_patch (which has the same shape as that in the adapter),
-#          # repeat(batch_size, 1, 1)
-#          label_pos_mode: False
-#          # currently, we put the class text after the decoder
-#          # fixed_class_embed_cfg:
-#          #   pre_extracted: True
-#          #   description_dict_name: 'cihp_name'
-#          # fixed_class_embed_LN: True # whether to use LN for the fixed class embedding before adding it to the decoder input
-#          self_attn_mask_type: full # full attention
-#          # type of mask for self-attention,
-#          # shape [patch_tokens(rgb), label_tokens(sparse_labeling), fixed text tokens]
-#          detach_from_peddet: True # do not use the peddet_cfgs to modify the model structure
-#          label_ffn_pre_norm: False # whether to use pre_norm in the decoder
-#          label_ffn_zero_gated: False # whether to use a zero-gated FFN in the decoder
-#          adding_per_layer_pe: True # whether to add per-layer pe to the input of each decoder layer
-#          use_adapt_pos2d: True
-##          use_adapt_pos1d: True # not effective for 2d tasks
-##          no_mask_embed: True
-##          learnable_class_embed: True
-#        loss_cfg:
-#          type: CEL_Sigmoid
-
-  14:
-    name: mpi_inf_3dhp_256x192
-    loss_weight: 8223
-    gres_ratio: 2 # int, > 0, = Task_GPUs / (world_Size/sum(all_gres_ratios))
-    backbone:
-      type: vit_base_patch16_mask
-      kwargs:
-        task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] # a wrong list can cause .cuda() to hang without raising an error
-        pretrained: True
-        lms_checkpoint_train: fairscale
-        window: False
-        test_pos_mode: False # when torch.compile is True, this should be False
-        learnable_pos: True
-        drop_path_rate: 0.2
-        img_size: 1344
-        num_encoded_tokens: 192
-        vis_patch_token_ratio: 1
-        vis_label_token_ratio: 0.
-
-    dataset:
-      type: MultiPoseDatasetDev
-      kwargs:
-        ann_file: Humans1:s3://HumanCentricModel/pose_public/mpi_inf_3dhp/train.json
-        img_prefix: openmmlab:s3://openmmlab/datasets/pose/mpi_inf_3dhp/processed/images/
-        use_udp: True
-        dataset_name: '3DHP'
-        data_cfg: {
-          'image_size': [ 192, 256 ],
-          'heatmap_size': [ 48, 64 ],
-          'num_output_channels': 17,
-          'num_joints': 17,
-          'dataset_channel': [ [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ], ],
-          'inference_channel': [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ],
-          'flip_pairs': [ [ 2, 5 ], [ 3, 6 ], [ 4, 7 ], [ 8, 11 ], [ 9, 12 ], [ 10, 13 ], ],
-          'upper_body_ids': [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 14, 15, 16 ],
-          'lower_body_ids': [ 9, 10, 12, 13 ],
-          'use_different_joint_weights': False,
-          'joint_weights': [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ],
-
-          'soft_nms': False,
-          'nms_thr': 1.0,
-          'oks_thr': 0.9,
-          'vis_thr': 0.2,
-          'use_gt_bbox': True,
-          'det_bbox_thr': 0.0,
-          'bbox_file': 'COCO_val2017_detections_AP_H_56_person.json'
-        }
-    sampler:
-      batch_size: 170 # per card
-      shuffle_strategy: 1
-
-    patch_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256 # project to 256 dim for decoder
-        modality: rgb # patch modality
-        # task_sp_list: ['mask_map']
-
-    label_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256 # project to 256 dim for decoder
-        modality: dense_labeling # label modality
-
-    patch_adapter:
-      type: rgb_adapter # change to adapter_rgb
-      kwargs:
-        pretrained: True
-        stride_level: 1
-        in_chans: 3 # 3 for rgb
-        learnable_pos: False # fixed position embedding, redundant parameter
-        test_pos_mode: False # PE parameters are interpolated from MAE to 'img_size'/16, then repeated via repeat(batch_size, 1, 1)
-        img_size: [ 256, 192 ]
-        task_sp_list: [ 'pos_embed' ]
-        # type_embed: True
-
-    label_adapter: # for supervised training, the output of the label adapter is unused
-      type: dense_labeling_adapter
-      kwargs:
-        pretrained: True
-        stride_level: 1
-        in_chans: 17 # class num
-        learnable_pos: False
-        test_pos_mode: False
-        img_size: [ 256, 192 ]
-        # type_embed: True
-        dim_class_embed: 64 # embedding dim for the class embedding. TODO: change to text features
-        emb_padding_idx: 255 #
-        task_sp_list: [ 'pos_embed',
-                        'class_embed', ]
-
-    # fixed kwargs of the projector, which should match those in the adapter; e.g.,
-    # hidden_dim, patch_size, in_chans, stride_level are set in the solver - create_modal
-    patch_proj:
-      type: rgb_projector
-      kwargs:
-        loss_cfg:
-          type: MaskedMSELoss
-          kwargs:
-            stride: 1
-            norm_pix_loss: True
-            pix_loss: True
-            pix_loss_weight: 1.
-            norm_pix_loss_weight: 1.
-
-    label_proj:
-      type: dense_labeling_projector
-      kwargs:
-        # modality_share_list: [ 'upsample_network', ]
-        task_sp_list: [ 'post_mul_norm',
-                        'post_mul_norm_cls',
-                        'loss_fn',
-                        'upsample_network',
-                        'text_features', ]
-        emb_padding_idx: 255 # should match the padding index in the input adapter
-        upsampling: upconv_down_4 # upsample the label to the same size as the input
-        learnable_class_embed: False # True: learnable class embedding, very similar to fc; False: alignment with text features
-        pre_proj_type: text_embed_first_mul_second_inter # use decoded text tokens to contrast with the label patch tokens
-        post_mul_norm: True
-        # post_mul_norm_cls: True
-        replace_post_mul_norm: False # replace the post_mul_norm (LN) with a linear layer
-        translate_weight_scale: 1 # scales the translate weight relative to the original value (1); NOTE that we should
-                                  # consider whether to use a shared or per-dataset scale in joint training
-        text_prototype: True # extract text features
-        pre_extracted: True # when True, the model only loads the pre-extracted features
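-# emb_padding_idx: 255 above marks 255 as the ignore/padding class in the label
-# embedding; a sketch of the assumed PyTorch behaviour:
-#
-#   import torch.nn as nn
-#   class_embed = nn.Embedding(256, 64, padding_idx=255)  # 64 = dim_class_embed
-#   # entries with index 255 map to a zero vector and receive no gradient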
- cls_loss_branch: True - description_dict_name: checked_pose_mpi_inf_3dhp_name # this key is only valid when we set text_prototype to be True - #upsample_before_product: True # Temperary solution, specific "upsample networks" - upsample_hidden_dim: 256 - l2_norm_debuged: True - - #text_mlp: True - task: pose - # no_mask_embed: True - # learnable_class_embed: True - loss_cfg: - type: POS_FocalDiceLoss_bce_cls_emb - kwargs: - target_type: GaussianHeatMap - cfg: - num_classes: 17 - deep_supervision: True - ignore_blank: False - class_weight: 0.001 - dice_weight: 0.0 - mask_weight: 1.0 - redundant_queries: 1 - dec_layers: 9 - sample_weight: [ - 0.97905498, 0.98151887, 0.98018951, 0.97778281, 0.97704955, - 0.98018951, 0.97778281, 0.97704955, 0.98309006, 0.98060388, - 0.97209657, 0.98309006, 0.98060388, 0.97209657, 0.98405158, - 0.98242514, 0.98066688 - ] - eos_coef: 0.1 - - decoder: - type: UniHCPv2_Head - kwargs: - predictor: 'mae' - task: recons_rgb - modality_share_list: ['predictor.mask_token'] - task_sp_list: [ # 'predictor.text_features', - 'predictor.query_embed_patch', - 'predictor.query_embed_label', - # 'predictor.mask_token', - # 'predictor.text_pe', - 'predictor.class_embed', 'predictor.fc_bias', # useless in unihcpv2 - ] # wrong list would somehow cause .cuda() stuck without error - loss_weight: 1.0 - transformer_predictor_cfg: - hidden_dim: 256 - num_queries: 20 # useless in unihcpv2 - nheads: 8 - dim_feedforward: 2048 - dec_layers: 9 # currently 6 layers for debug but we should use 9 layers afterwards - pre_norm: False # indicate to use pre_norm or post_norm in (self-attn, FFN) - arch: fan_in # fan_in type to init the weights - enforce_input_project: False # placeholder, useless in unihcpv2 - mask_on: False # placeholder, useless in unihcpv2 - intermediate_output: True - num_feature_levels: 1 # placeholder, useless in unihcpv2 - cross_pos_embed: anchor # indicate to use adaptive pose2d. should always be "anchor" in unihcpv2 - cls_out_dim: 1 # placeholder, useless in unihcpv2 - patch_pos_mode: False # Mode to generate pos_embed for patch tokens in decoder. - # given the fixed self.query_embed_patch (which has a same shape of that in adapter), - # repeat(batchsize, 1,1) - label_pos_mode: False - # currently, we put the class text after the decoder - # fixed_class_embed_cfg: - # pre_extracted: True - # description_dict_name: 'cihp_name' - # fixed_class_embed_LN: True # whether to use LN for fixed class embedding before adding to the input of decoder - self_attn_mask_type: full # full for all attention - # type of mask for self-attention, - # shape [patch_tokens(rgb), label_tokens(sparse_labeling), fixed text tokens] - detach_from_peddet: True # Not use the peddet_cfgs to modify the model structure - label_ffn_pre_norm: False # whether to use pre_norm in decoder. - label_ffn_zero_gated: False # whether to use zero-gated FFN in decoder. 
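-# With deep_supervision: True and intermediate_output: True, the loss is applied
-# to every intermediate decoder layer's prediction rather than only the last one;
-# schematically (illustrative names):
-#
-#   total = sum(criterion(out) for out in intermediate_outputs)  # dec_layers terms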
- adding_per_layer_pe: True # whether to add per-layer pe to the input of each decoder layer - use_adapt_pos2d: True -# use_adapt_pos1d: True # not effective for 2d tasks -# no_mask_embed: True -# learnable_class_embed: True - loss_cfg: - type: CEL_Sigmoid - -# 15: -# name: halpepose_256x192 -# loss_weight: 1596 -# gres_ratio: 1 # int, > 0, = Task_GPUs / (world_Size/sum(all_gres_ratios)) -# backbone: -# type: vit_base_patch16_mask -# kwargs: -# task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] # wrong list would somehow cause .cuda() stuck without error -# pretrained: True -# lms_checkpoint_train: fairscale -# window: False -# test_pos_mode: False # when torch.compile is True, this should be False -# learnable_pos: True -# drop_path_rate: 0.2 -# img_size: 1344 -# num_encoded_tokens: 192 -# vis_patch_token_ratio: 1 -# vis_label_token_ratio: 0. -# -# dataset: -# type: MultiPoseDatasetDev -# kwargs: -# ann_file: Humans1:s3://HumanCentricModel/pose_public/Halpe/train.json -# img_prefix: openmmlab:s3://openmmlab/datasets/pose/Halpe/hico_20160224_det/images/train2015/ -# use_udp: True -# dataset_name: 'halpe' -# data_cfg: { -# 'image_size':[192, 256], -# 'heatmap_size':[48, 64], -# 'num_output_channels': 136, -# 'num_joints': 17, -# 'dataset_channel': [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, ],], -# 'inference_channel': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,], -# 'flip_pairs': [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]], -# 'upper_body_ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], -# 'lower_body_ids': [11, 12, 13, 14, 15, 16, ], -# 'use_different_joint_weights': False, -# 'joint_weights': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,], -# -# 'soft_nms': False, -# 'nms_thr': 1.0, -# 'oks_thr': 0.9, -# 'vis_thr': 0.2, -# 'use_gt_bbox': True, -# 'det_bbox_thr': 0.0, -# 'bbox_file': 'COCO_val2017_detections_AP_H_56_person.json' -# -# } -# sampler: -# batch_size: 132 # per card -# shuffle_strategy: 1 -# -# patch_neck: -# type: MAEdecoder_proj_neck -# kwargs: -# mask_dim: 256 # project to 256 dim for decoder -# modality: rgb # patch modality -# # task_sp_list: ['mask_map'] -# -# label_neck: -# type: MAEdecoder_proj_neck -# kwargs: -# mask_dim: 256 # project to 256 dim for decoder -# modality: dense_labeling # label modality -# -# patch_adapter: -# type: rgb_adapter # change to adapter_rgb -# kwargs: -# pretrained: True -# stride_level: 1 -# in_chans: 3 # 3 for rgb -# learnable_pos: False # fixed position embedding, redundant parameter -# test_pos_mode: False # PE parameters are interpolated from mae to 'img_size'/16, then use repeat(batchsize, 1, 1) -# img_size: [ 256, 192 ] -# task_sp_list: [ 'pos_embed' ] -# # type_embed: True -# -# label_adapter: # for supvervised training, the results of label adapter is useless -# type: dense_labeling_adapter -# kwargs: -# pretrained: True -# stride_level: 1 -# in_chans: 17 # class num -# learnable_pos: False -# test_pos_mode: False -# img_size: [ 256, 192 ] -# # type_embed: True -# dim_class_embed: 64 # embedding shape for class embedding. TODO: chance to text features -# emb_padding_idx: 255 # -# task_sp_list: [ 'pos_embed', -# 'class_embed',] -# -# # fix kwargs of the project, which should be the same as that in the adapter, such as -# # hidden_dim, patch_size, in_chans, stride_level are set in the solver - create_modal -# patch_proj: -# type: rgb_projector -# kwargs: -# loss_cfg: -# type: MaskedMSELoss -# kwargs: -# stride: 1 -# norm_pix_loss: True -# pix_loss: True -# pix_loss_weight: 1. 
-# norm_pix_loss_weight: 1. -# -# -# label_proj: -# type: dense_labeling_projector -# kwargs: -## modality_share_list: [ 'upsample_network',] -# task_sp_list: [ 'post_mul_norm', -# 'post_mul_norm_cls', -# 'loss_fn', -# 'upsample_network', -# 'text_features',] -# emb_padding_idx: 255 # should be the same with that in the input adapter -# upsampling: upconv_down_4 # upsampling the label to the same size as the input -# learnable_class_embed: False # True: learnbale class embedding, very similar to fc; False: aligment with text features -# pre_proj_type: text_embed_first_mul_second_inter # use decoded text tokens to contrast with the label patch tokens -# post_mul_norm: True -# #post_mul_norm_cls: True -# replace_post_mul_norm: False # replace the post_mul_norm(LN) with a linear layer -# translate_weight_scale: 1 # scale the translate weight to 6 times of the original value(1), NOTE that we should -# # consider whether to use a same or unique scale for different datasets in joint training -# text_prototype: True # extract text features -# pre_extracted: True # when pre_extracted is set to be True, the model will only load the pre-extracted features. -# cls_loss_branch: True -# description_dict_name: halpe_pose # this key is only valid when we set text_prototype to be True -# #upsample_before_product: True # Temperary solution, specific "upsample networks" -# upsample_hidden_dim: 256 -# l2_norm_debuged: True -# -# #text_mlp: True -# task: pose -# # no_mask_embed: True -# # learnable_class_embed: True -# loss_cfg: -# type: POS_FocalDiceLoss_bce_cls_emb -# kwargs: -# target_type: GaussianHeatMap -# cfg: -# num_classes: 17 -# deep_supervision: True -# ignore_blank: False -# class_weight: 0.001 -# dice_weight: 0.0 -# mask_weight: 1.0 -# redundant_queries: 1 -# dec_layers: 9 -# sample_weight: [ 0.63643556, 0.5382983299999999, 0.5382983299999999, 0.340705315, 0.340705315, -# 0.82491849, 0.82491849, 0.75516638, 0.75516638, 0.77731828, -# 0.77731828, 0.6869366100000001, 0.6869366100000001, 0.58420838, 0.58420838, -# 0.52246356, 0.52246356, -# -# ] -# eos_coef: 0.1 -# -# decoder: -# type: UniHCPv2_Head -# kwargs: -# predictor: 'mae' -# task: recons_rgb -# modality_share_list: ['predictor.mask_token'] -# task_sp_list: [ # 'predictor.text_features', -# 'predictor.query_embed_patch', -# 'predictor.query_embed_label', -# # 'predictor.mask_token', -# # 'predictor.text_pe', -# 'predictor.class_embed', 'predictor.fc_bias', # useless in unihcpv2 -# ] # wrong list would somehow cause .cuda() stuck without error -# loss_weight: 1.0 -# transformer_predictor_cfg: -# hidden_dim: 256 -# num_queries: 20 # useless in unihcpv2 -# nheads: 8 -# dim_feedforward: 2048 -# dec_layers: 9 # currently 6 layers for debug but we should use 9 layers afterwards -# pre_norm: False # indicate to use pre_norm or post_norm in (self-attn, FFN) -# arch: fan_in # fan_in type to init the weights -# enforce_input_project: False # placeholder, useless in unihcpv2 -# mask_on: False # placeholder, useless in unihcpv2 -# intermediate_output: True -# num_feature_levels: 1 # placeholder, useless in unihcpv2 -# cross_pos_embed: anchor # indicate to use adaptive pose2d. should always be "anchor" in unihcpv2 -# cls_out_dim: 1 # placeholder, useless in unihcpv2 -# patch_pos_mode: False # Mode to generate pos_embed for patch tokens in decoder. 
-#          # given the fixed self.query_embed_patch (which has the same shape as that in the adapter),
-#          # repeat(batch_size, 1, 1)
-#          label_pos_mode: False
-#          # currently, we put the class text after the decoder
-#          # fixed_class_embed_cfg:
-#          #   pre_extracted: True
-#          #   description_dict_name: 'cihp_name'
-#          # fixed_class_embed_LN: True # whether to use LN for the fixed class embedding before adding it to the decoder input
-#          self_attn_mask_type: full # full attention
-#          # type of mask for self-attention,
-#          # shape [patch_tokens(rgb), label_tokens(sparse_labeling), fixed text tokens]
-#          detach_from_peddet: True # do not use the peddet_cfgs to modify the model structure
-#          label_ffn_pre_norm: False # whether to use pre_norm in the decoder
-#          label_ffn_zero_gated: False # whether to use a zero-gated FFN in the decoder
-#          adding_per_layer_pe: True # whether to add per-layer pe to the input of each decoder layer
-#          use_adapt_pos2d: True
-##          use_adapt_pos1d: True # not effective for 2d tasks
-##          no_mask_embed: True
-##          learnable_class_embed: True
-#        loss_cfg:
-#          type: CEL_Sigmoid
-
-  15:
-    name: 3dpw_256x192
-    loss_weight: 2055
-    gres_ratio: 1 # int, > 0, = Task_GPUs / (world_Size/sum(all_gres_ratios))
-    backbone:
-      type: vit_base_patch16_mask
-      kwargs:
-        task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] # a wrong list can cause .cuda() to hang without raising an error
-        pretrained: True
-        lms_checkpoint_train: fairscale
-        window: False
-        test_pos_mode: False # when torch.compile is True, this should be False
-        learnable_pos: True
-        drop_path_rate: 0.2
-        img_size: 1344
-        num_encoded_tokens: 192
-        vis_patch_token_ratio: 1
-        vis_label_token_ratio: 0.
-
-    dataset:
-      type: MultiPoseDatasetDev
-      kwargs:
-        ann_file: Humans1:s3://HumanCentricModel/pose_public/3DPW/dataset_merged.json
-        img_prefix: Humans1:s3://HumanCentricModel/pose_public/3DPW/imageFiles
-        use_udp: True
-        dataset_name: '3DPW'
-        data_cfg: {
-          'image_size': [ 192, 256 ],
-          'heatmap_size': [ 48, 64 ],
-          'num_output_channels': 18,
-          'num_joints': 18,
-          'dataset_channel': [ [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 ], ],
-          'inference_channel': [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 ],
-
-          'flip_pairs': [ [ 2, 5 ], [ 3, 6 ], [ 4, 7 ], [ 8, 11 ], [ 9, 12 ], [ 10, 13 ], [ 14, 15 ], [ 16, 17 ] ],
-          'upper_body_ids': [ 0, 1, 2, 3, 4, 5, 6, 7, 14, 15, 16, 17 ],
-          'lower_body_ids': [ 8, 9, 10, 11, 12, 13 ],
-          'use_different_joint_weights': False,
-          'joint_weights': [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ],
-
-          'soft_nms': False,
-          'nms_thr': 1.0,
-          'oks_thr': 0.9,
-          'vis_thr': 0.2,
-          'use_gt_bbox': True,
-          'det_bbox_thr': 0.0,
-          'bbox_file': 'COCO_val2017_detections_AP_H_56_person.json'
-        }
-    sampler:
-      batch_size: 170 # per card
-      shuffle_strategy: 1
-
-    patch_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256 # project to 256 dim for decoder
-        modality: rgb # patch modality
-        # task_sp_list: ['mask_map']
-
-    label_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256 # project to 256 dim for decoder
-        modality: dense_labeling # label modality
-
-    patch_adapter:
-      type: rgb_adapter # change to adapter_rgb
-      kwargs:
-        pretrained: True
-        stride_level: 1
-        in_chans: 3 # 3 for rgb
-        learnable_pos: False # fixed position embedding, redundant parameter
-        test_pos_mode: False # PE parameters are interpolated from MAE to 'img_size'/16, then repeated via repeat(batch_size, 1, 1)
-        img_size: [ 256, 192 ]
-        task_sp_list: [ 'pos_embed' ]
-        # type_embed: True
-
-    label_adapter: # for supervised training, the output of the label adapter is unused
-      type: dense_labeling_adapter
-      kwargs:
-        pretrained: True
-        stride_level: 1
-        in_chans: 18 # class num
-        learnable_pos: False
-        test_pos_mode: False
-        img_size: [ 256, 192 ]
-        # type_embed: True
-        dim_class_embed: 64 # embedding dim for the class embedding. TODO: change to text features
-        emb_padding_idx: 255 #
-        task_sp_list: [ 'pos_embed',
-                        'class_embed', ]
-
-    # fixed kwargs of the projector, which should match those in the adapter; e.g.,
-    # hidden_dim, patch_size, in_chans, stride_level are set in the solver - create_modal
-    patch_proj:
-      type: rgb_projector
-      kwargs:
-        loss_cfg:
-          type: MaskedMSELoss
-          kwargs:
-            stride: 1
-            norm_pix_loss: True
-            pix_loss: True
-            pix_loss_weight: 1.
-            norm_pix_loss_weight: 1.
-
-    label_proj:
-      type: dense_labeling_projector
-      kwargs:
-        # modality_share_list: [ 'upsample_network', ]
-        task_sp_list: [ 'post_mul_norm',
-                        'post_mul_norm_cls',
-                        'loss_fn',
-                        'upsample_network',
-                        'text_features', ]
-        emb_padding_idx: 255 # should match the padding index in the input adapter
-        upsampling: upconv_down_4 # upsample the label to the same size as the input
-        learnable_class_embed: False # True: learnable class embedding, very similar to fc; False: alignment with text features
-        pre_proj_type: text_embed_first_mul_second_inter # use decoded text tokens to contrast with the label patch tokens
-        post_mul_norm: True
-        # post_mul_norm_cls: True
-        replace_post_mul_norm: False # replace the post_mul_norm (LN) with a linear layer
-        translate_weight_scale: 1 # scales the translate weight relative to the original value (1); NOTE that we should
-                                  # consider whether to use a shared or per-dataset scale in joint training
-        text_prototype: True # extract text features
-        pre_extracted: True # when True, the model only loads the pre-extracted features
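-# task_sp_list (here and throughout the config) names parameters that stay
-# task-specific, while everything else is shared across tasks; a sketch of the
-# assumed parameter routing (illustrative names):
-#
-#   for name, param in module.named_parameters():
-#       task_specific = any(key in name for key in task_sp_list)
-#       (per_task_params if task_specific else shared_params).append(param)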
-        cls_loss_branch: True
-        description_dict_name: checked_pose_3dpw_name  # this key is only valid when text_prototype is True
-        #upsample_before_product: True  # temporary workaround via task-specific "upsample networks"
-        upsample_hidden_dim: 256
-        l2_norm_debuged: True
-
-        #text_mlp: True
-        task: pose
-        # no_mask_embed: True
-        # learnable_class_embed: True
-        loss_cfg:
-          type: POS_FocalDiceLoss_bce_cls_emb
-          kwargs:
-            target_type: GaussianHeatMap
-            cfg:
-              num_classes: 18
-              deep_supervision: True
-              ignore_blank: False
-              class_weight: 0.001
-              dice_weight: 0.0
-              mask_weight: 1.0
-              redundant_queries: 1
-              dec_layers: 9
-              sample_weight: [ 0.81362905, 0.92006165, 0.90966899, 0.83948673, 0.78390512,
-                               0.90966899, 0.83948673, 0.78390512, 0.916771645, 0.895912625,
-                               0.86267757, 0.916771645, 0.895912625, 0.86267757, 0.683630395,
-                               0.683630395, 0.6390913949999999, 0.6390913949999999 ]
-              eos_coef: 0.1
-
-    decoder:
-      type: UniHCPv2_Head
-      kwargs:
-        predictor: 'mae'
-        task: recons_rgb
-        modality_share_list: ['predictor.mask_token']
-        task_sp_list: [ # 'predictor.text_features',
-                        'predictor.query_embed_patch',
-                        'predictor.query_embed_label',
-                        # 'predictor.mask_token',
-                        # 'predictor.text_pe',
-                        'predictor.class_embed', 'predictor.fc_bias',  # useless in unihcpv2
-                      ]  # a wrong list can cause .cuda() to hang without an error
-        loss_weight: 1.0
-        transformer_predictor_cfg:
-          hidden_dim: 256
-          num_queries: 20  # useless in unihcpv2
-          nheads: 8
-          dim_feedforward: 2048
-          dec_layers: 9  # 9 decoder layers (6 was used only for debugging)
-          pre_norm: False  # whether to use pre_norm or post_norm in (self-attn, FFN)
-          arch: fan_in  # fan_in initialization for the weights
-          enforce_input_project: False  # placeholder, useless in unihcpv2
-          mask_on: False  # placeholder, useless in unihcpv2
-          intermediate_output: True
-          num_feature_levels: 1  # placeholder, useless in unihcpv2
-          cross_pos_embed: anchor  # use adaptive pos2d; should always be "anchor" in unihcpv2
-          cls_out_dim: 1  # placeholder, useless in unihcpv2
-          patch_pos_mode: False  # mode to generate pos_embed for patch tokens in the decoder:
-          # given the fixed self.query_embed_patch (which has the same shape as the one in the adapter),
-          # repeat(batchsize, 1, 1)
-          label_pos_mode: False
-          # currently, we put the class text after the decoder
-          # fixed_class_embed_cfg:
-          #   pre_extracted: True
-          #   description_dict_name: 'cihp_name'
-          # fixed_class_embed_LN: True  # whether to use LN for fixed class embedding before adding to the input of decoder
-          self_attn_mask_type: full  # full for all attention
-          # type of mask for self-attention,
-          # shape [patch_tokens(rgb), label_tokens(sparse_labeling), fixed text tokens]
-          detach_from_peddet: True  # do not use the peddet_cfgs to modify the model structure
-          label_ffn_pre_norm: False  # whether to use pre_norm in decoder.
-          label_ffn_zero_gated: False  # whether to use zero-gated FFN in decoder.
-          adding_per_layer_pe: True  # whether to add per-layer pe to the input of each decoder layer
-          use_adapt_pos2d: True
-          # use_adapt_pos1d: True  # not effective for 2d tasks
-          # no_mask_embed: True
-          # learnable_class_embed: True
-        loss_cfg:
-          type: CEL_Sigmoid
-
-  16:
-    name: aist++_256x192
-    loss_weight: 2055
-    gres_ratio: 1  # int, > 0, = Task_GPUs / (world_Size/sum(all_gres_ratios))
-    backbone:
-      type: vit_base_patch16_mask
-      kwargs:
-        task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ]  # a wrong list can cause .cuda() to hang without an error
-        pretrained: True
-        lms_checkpoint_train: fairscale
-        window: False
-        test_pos_mode: False  # when torch.compile is True, this should be False
-        learnable_pos: True
-        drop_path_rate: 0.2
-        img_size: 1344
-        num_encoded_tokens: 192
-        vis_patch_token_ratio: 1
-        vis_label_token_ratio: 0.
-
-    dataset:
-      type: MultiPoseDatasetDev
-      kwargs:
-        ann_file: Humans1:s3://HumanCentricModel/pose_public/aistplusplus/merged_train_1m_filter.json
-        img_prefix: Humans1:s3://HumanCentricModel/pose_public/aistplusplus/images/
-        use_udp: True
-        dataset_name: 'AIST'
-        data_cfg: {
-          'image_size': [ 192, 256 ],
-          'heatmap_size': [ 48, 64 ],
-          'num_output_channels': 136,
-          'num_joints': 17,
-          'dataset_channel': [ [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ], ],
-          'inference_channel': [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ],
-          'flip_pairs': [ [ 1, 2 ], [ 3, 4 ], [ 5, 6 ], [ 7, 8 ], [ 9, 10 ], [ 11, 12 ], [ 13, 14 ], [ 15, 16 ] ],
-          'upper_body_ids': [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 ],
-          'lower_body_ids': [ 13, 14, 15, 16 ],
-          'use_different_joint_weights': False,
-          'joint_weights': [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ],
-
-          'soft_nms': False,
-          'nms_thr': 1.0,
-          'oks_thr': 0.9,
-          'vis_thr': 0.2,
-          'use_gt_bbox': True,
-          'det_bbox_thr': 0.0,
-          'bbox_file': 'COCO_val2017_detections_AP_H_56_person.json'
-
-        }
-    sampler:
-      batch_size: 170  # per card
-      shuffle_strategy: 1
-
-    patch_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256  # project to 256 dim for decoder
-        modality: rgb  # patch modality
-        # task_sp_list: ['mask_map']
-
-    label_neck:
-      type: MAEdecoder_proj_neck
-      kwargs:
-        mask_dim: 256  # project to 256 dim for decoder
-        modality: dense_labeling  # label modality
-
-    patch_adapter:
-      type: rgb_adapter  # change to adapter_rgb
-      kwargs:
-        pretrained: True
-        stride_level: 1
-        in_chans: 3  # 3 for rgb
-        learnable_pos: False  # fixed position embedding, redundant parameter
-        test_pos_mode: False  # PE parameters are interpolated from mae to 'img_size'/16, then use repeat(batchsize, 1, 1)
-        img_size: [ 256, 192 ]
-        task_sp_list: [ 'pos_embed' ]
-        # type_embed: True
-
-    label_adapter:  # for supervised training, the outputs of the label adapter are unused
-      type: dense_labeling_adapter
-      kwargs:
-        pretrained: True
-        stride_level: 1
-        in_chans: 17  # class num
-        learnable_pos: False
-        test_pos_mode: False
-        img_size: [ 256, 192 ]
-        # type_embed: True
-        dim_class_embed: 64  # embedding shape for class embedding. TODO: change to text features
-        emb_padding_idx: 255
-        task_sp_list: [ 'pos_embed',
-                        'class_embed', ]
-
-    # fix kwargs of the projector, which should be the same as those in the adapter, e.g.,
-    # hidden_dim, patch_size, in_chans, stride_level are set in the solver - create_modal
-    patch_proj:
-      type: rgb_projector
-      kwargs:
-        loss_cfg:
-          type: MaskedMSELoss
-          kwargs:
-            stride: 1
-            norm_pix_loss: True
-            pix_loss: True
-            pix_loss_weight: 1.
-            norm_pix_loss_weight: 1.
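-# The POS_FocalDiceLoss below supervises against `target_type: GaussianHeatMap`.
-# A minimal sketch of such a target (illustrative only; sigma, coordinate
-# handling, and the UDP encoding used by the real dataset are assumptions):
-#   import numpy as np
-#   def gaussian_heatmap(h, w, cx, cy, sigma=2.0):
-#       # one joint rendered as a 2D Gaussian on an h x w grid
-#       ys, xs = np.mgrid[0:h, 0:w]
-#       return np.exp(-((xs - cx) ** 2 + (ys - cy) ** 2) / (2 * sigma ** 2))
-#   # e.g. stack one 64x48 map per joint for the (w, h) = [48, 64]
-#   # heatmap_size above: target shape (num_joints, 64, 48)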
-
-    label_proj:
-      type: dense_labeling_projector
-      kwargs:
-        # modality_share_list: [ 'upsample_network',]
-        task_sp_list: [ 'post_mul_norm',
-                        'post_mul_norm_cls',
-                        'loss_fn',
-                        'upsample_network',
-                        'text_features', ]
-        emb_padding_idx: 255  # should be the same as that in the input adapter
-        upsampling: upconv_down_4  # upsample the label to the same size as the input
-        learnable_class_embed: False  # True: learnable class embedding, very similar to fc; False: alignment with text features
-        pre_proj_type: text_embed_first_mul_second_inter  # use decoded text tokens to contrast with the label patch tokens
-        post_mul_norm: True
-        #post_mul_norm_cls: True
-        replace_post_mul_norm: False  # replace the post_mul_norm (LN) with a linear layer
-        translate_weight_scale: 1  # scale factor for the translate weight (1 = unscaled); NOTE that we should
-        # consider whether to use the same or a unique scale for different datasets in joint training
-        text_prototype: True  # extract text features
-        pre_extracted: True  # when pre_extracted is True, the model only loads the pre-extracted features.
-        cls_loss_branch: True
-        description_dict_name: checked_pose_aist_name  # this key is only valid when text_prototype is True
-        #upsample_before_product: True  # temporary workaround via task-specific "upsample networks"
-        upsample_hidden_dim: 256
-        l2_norm_debuged: True
-
-        #text_mlp: True
-        task: pose
-        # no_mask_embed: True
-        # learnable_class_embed: True
-        loss_cfg:
-          type: POS_FocalDiceLoss_bce_cls_emb
-          kwargs:
-            target_type: GaussianHeatMap
-            cfg:
-              num_classes: 17
-              deep_supervision: True
-              ignore_blank: False
-              class_weight: 0.001
-              dice_weight: 0.0
-              mask_weight: 1.0
-              redundant_queries: 1
-              dec_layers: 9
-              sample_weight: [
-                0.97905498, 0.98151887, 0.98018951, 0.97778281, 0.97704955,
-                0.98018951, 0.97778281, 0.97704955, 0.98309006, 0.98060388,
-                0.97209657, 0.98309006, 0.98060388, 0.97209657, 0.98405158,
-                0.98242514, 0.98066688
-              ]
-              eos_coef: 0.1
-
-    decoder:
-      type: UniHCPv2_Head
-      kwargs:
-        predictor: 'mae'
-        task: recons_rgb
-        modality_share_list: [ 'predictor.mask_token' ]
-        task_sp_list: [ # 'predictor.text_features',
-                        'predictor.query_embed_patch',
-                        'predictor.query_embed_label',
-                        # 'predictor.mask_token',
-                        # 'predictor.text_pe',
-                        'predictor.class_embed', 'predictor.fc_bias',  # useless in unihcpv2
-                      ]  # a wrong list can cause .cuda() to hang without an error
-        loss_weight: 1.0
-        transformer_predictor_cfg:
-          hidden_dim: 256
-          num_queries: 20  # useless in unihcpv2
-          nheads: 8
-          dim_feedforward: 2048
-          dec_layers: 9  # 9 decoder layers (6 was used only for debugging)
-          pre_norm: False  # whether to use pre_norm or post_norm in (self-attn, FFN)
-          arch: fan_in  # fan_in initialization for the weights
-          enforce_input_project: False  # placeholder, useless in unihcpv2
-          mask_on: False  # placeholder, useless in unihcpv2
-          intermediate_output: True
-          num_feature_levels: 1  # placeholder, useless in unihcpv2
-          cross_pos_embed: anchor  # use adaptive pos2d; should always be "anchor" in unihcpv2
-          cls_out_dim: 1  # placeholder, useless in unihcpv2
-          patch_pos_mode: False  # Mode to generate pos_embed for patch tokens in decoder.
- # given the fixed self.query_embed_patch (which has a same shape of that in adapter), - # repeat(batchsize, 1,1) - label_pos_mode: False - # currently, we put the class text after the decoder - # fixed_class_embed_cfg: - # pre_extracted: True - # description_dict_name: 'cihp_name' - # fixed_class_embed_LN: True # whether to use LN for fixed class embedding before adding to the input of decoder - self_attn_mask_type: full # full for all attention - # type of mask for self-attention, - # shape [patch_tokens(rgb), label_tokens(sparse_labeling), fixed text tokens] - detach_from_peddet: True # Not use the peddet_cfgs to modify the model structure - label_ffn_pre_norm: False # whether to use pre_norm in decoder. - label_ffn_zero_gated: False # whether to use zero-gated FFN in decoder. - adding_per_layer_pe: True # whether to add per-layer pe to the input of each decoder layer - use_adapt_pos2d: True - # use_adapt_pos1d: True # not effective for 2d tasks - # no_mask_embed: True - # learnable_class_embed: True - loss_cfg: - type: CEL_Sigmoid - - 17: - name: LIP_parsing - loss_weight: 1.8 - gres_ratio: 4 # int, > 0, = Task_GPUs / (world_Size/sum(all_gres_ratios)) - dataset: - type: LIPParsingDataset # train for 150 epochs - kwargs: - data_path: Humans1:s3://HumanCentricModel/parsing_public/LIP # #sh1424:s3://parsing_public/human3.6 #/mnt/lustre/share/wangyizhou/human3.6 #sh1984:s3://seg_public/human3.6 - cfg: - stride_level: 1 - is_flip: True - crop_size: [ 480, 480 ] - is_multi_scale: True - scale_factor: 11 - center_crop_test: False - base_size: 480 - # mean: [0.485, 0.456, 0.406] - # std: [0.229, 0.224, 0.225] - eval_crop_size: [ 480, 480 ] - ignore2endclass: True - - is_photometricdistortion: True - brightness: 32 - contrast_range: [ 0.5, 1.5 ] - saturation_range: [ 0.5, 1.5 ] - hue_delta: 18 - is_rotate: True - - #### - ignore_value: 255 # duplicated with decoder.kwargs.ignore_value - num_classes: 20 - label_list: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 ] - # reduce_zero_label: True - - sampler: - batch_size: 27 # per card - shuffle_strategy: 1 - - backbone: - type: vit_base_patch16_mask - kwargs: - task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] # wrong list would somehow cause .cuda() stuck without error - pretrained: True - lms_checkpoint_train: fairscale - window: False - test_pos_mode: learnable_interpolate - learnable_pos: True - drop_path_rate: 0.2 - vis_patch_token_ratio: 1 - vis_label_token_ratio: 0. - - patch_neck: - type: MAEdecoder_proj_neck - kwargs: - mask_dim: 256 - modality: rgb - # task_sp_list: ['mask_map'] - - label_neck: - type: MAEdecoder_proj_neck - kwargs: - mask_dim: 256 - modality: dense_labeling - - patch_adapter: - type: rgb_adapter # change to adapter_rgb - kwargs: - pretrained: True - stride_level: 1 - in_chans: 3 - learnable_pos: False - test_pos_mode: False - img_size: 480 - task_sp_list: [ 'pos_embed' ] - - label_adapter: - type: dense_labeling_adapter - kwargs: - pretrained: True - stride_level: 1 - in_chans: 20 - learnable_pos: False - test_pos_mode: False - img_size: 480 - # type_embed: True - dim_class_embed: 64 - emb_padding_idx: 255 - task_sp_list: [ 'pos_embed', 'class_embed', ] - - patch_proj: - type: rgb_projector - kwargs: - loss_cfg: - type: MaskedMSELoss - kwargs: - stride: 1 - norm_pix_loss: True - pix_loss: True - pix_loss_weight: 1. - norm_pix_loss_weight: 1. 
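-# The MaskedMSELoss above follows the MAE recipe: with `norm_pix_loss: True`
-# the target patch is normalized by its own mean/var before the MSE, and only
-# masked patches contribute. A minimal sketch (patchified layout assumed):
-#   import torch
-#   def masked_norm_pix_mse(pred, target, mask, eps=1e-6):
-#       # pred/target: (B, N, P) pixels per patch; mask: (B, N), 1 = masked
-#       mu = target.mean(dim=-1, keepdim=True)
-#       var = target.var(dim=-1, keepdim=True)
-#       target = (target - mu) / (var + eps) ** 0.5
-#       loss = ((pred - target) ** 2).mean(dim=-1)   # MSE per patch
-#       return (loss * mask).sum() / mask.sum().clamp(min=1)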
-
-    label_proj:
-      type: dense_labeling_projector
-      kwargs:  # kept one
-        task_sp_list: [ 'post_mul_norm',
-                        # 'upsample_network',
-                        'loss_fn', 'text_features' ]
-        modality_share_list: ['upsample_network',]
-        emb_padding_idx: 255  # should be the same as that in the input adapter
-        upsampling: upconv_down_4  # upsample the label to the same size as the input
-        learnable_class_embed: False  # True: learnable class embedding, very similar to fc; False: alignment with text features
-        pre_proj_type: text_embed_first_mul_second_inter  # use decoded text tokens to contrast with the label patch tokens
-        post_mul_norm: True
-        replace_post_mul_norm: False  # replace the post_mul_norm (LN) with a linear layer
-        translate_weight_scale: 1  # scale factor for the translate weight (1 = unscaled); NOTE that we should
-        # consider whether to use the same or a unique scale for different datasets in joint training
-        text_prototype: True  # extract text features
-        pre_extracted: True  # when pre_extracted is True, the model only loads the pre-extracted features.
-        description_dict_name: checked_par_lip_name  # this key is only valid when text_prototype is True
-        cls_loss_branch: True
-        upsample_before_product: True
-        l2_norm_debuged: True
-        upsample_hidden_dim: 256
-        task: parsing
-        loss_cfg:
-          type: FocalDiceLoss_bce_cls_emb_sample_weight  #POS_FocalDiceLoss_bce_cls_emb
-          kwargs:
-            # target_type: GaussianHeatMap
-            cfg:  # for maskedsetloss v2
-              ignore_index: 20
-              loss_weight: 1.
-              loss_per_class: True
-              dice_weight: 50.0
-              mask_weight: 50.0
-              class_weight: 0.3
-              deep_supervision: True
-              dec_layers: 9
-              cls_weight_sample: True
-              sample_weight: [ 1.0, 0.25279349, 0.97595474, 0.06368458, 0.08419378,
-                               0.91287129, 0.18341584, 0.50346535, 0.12729844, 0.6937058,
-                               0.96898868, 0.07022631, 0.07464639, 0.99359972, 0.88490099,
-                               0.88490099, 0.27644979000000003, 0.27644979000000003, 0.33016266, 0.33016266 ]  # follow v1 parsing
-    decoder:
-      type: UniHCPv2_Head
-      kwargs:
-        predictor: 'mae'
-        task: recons_rgb
-        modality_share_list: ['predictor.mask_token']
-        task_sp_list: [ # 'predictor.text_features',
-                        'predictor.query_embed_patch',
-                        'predictor.query_embed_label',
-                        # 'predictor.text_pe',
-                        # 'predictor.mask_token',
-                        'predictor.class_embed', 'predictor.fc_bias',  # useless in unihcpv2
-                      ]  # a wrong list can cause .cuda() to hang without an error
-        loss_weight: 1.0
-        transformer_predictor_cfg:
-          hidden_dim: 256
-          num_queries: 20
-          nheads: 8
-          dim_feedforward: 2048
-          dec_layers: 9
-          pre_norm: False
-          arch: fan_in
-          enforce_input_project: False
-          mask_on: False
-          intermediate_output: True
-          num_feature_levels: 1
-          cross_pos_embed: anchor
-          cls_out_dim: 1
-          patch_pos_mode: False  # mode to generate pos_embed for patch tokens in the decoder:
-          # given the fixed self.query_embed_patch (which has the same shape as the one in the adapter),
-          # repeat(batchsize, 1, 1)
-          label_pos_mode: False
-          self_attn_mask_type: patch_diag_label_row_textlabelfull  # type of mask for self-attention,
-          # shape [patch_tokens(rgb), label_tokens(sparse_labeling), fixed text tokens]
-          detach_from_peddet: True  # do not use the peddet_cfgs to modify the model structure
-          # label_ffn_pre_norm: False  # whether to use pre_norm in decoder.
-          # label_ffn_zero_gated: True  # whether to use zero-gated FFN in decoder.
- adding_per_layer_pe: True # whether to add per-layer pe to the input of each decoder layer - use_adapt_pos2d: True - - # no_mask_embed: True - # learnable_class_embed: True - - - loss_cfg: - type: FocalDiceLoss_bce_cls_emb_sample_weight - kwargs: - cfg: - deep_supervision: True - no_object_weight: 0.1 - - class_weight: 0.25 - dice_weight: 5.0 - mask_weight: 5.0 - redundant_queries: 1 - num_points: 12544 - - dec_layers: 6 - - oversample_ratio: 3.0 - importance_sample_ratio: 0.75 - sample_weight: [ 1.0, 0.25279349, 0.97595474, 0.06368458, 0.08419378, - 0.91287129, 0.18341584, 0.50346535, 0.12729844, 0.6937058, - 0.96898868, 0.07022631, 0.07464639, 0.99359972, 0.88490099, - 0.88490099, 0.27644979000000003, 0.27644979000000003, 0.33016266, 0.33016266 ] - - 18: - name: CIHP_parsing - loss_weight: 3.6 - gres_ratio: 4 # int, > 0, = Task_GPUs / (world_Size/sum(all_gres_ratios)) - dataset: - type: CIHPParsingDataset # train for 150 epochs - kwargs: - data_path: Humans1:s3://HumanCentricModel/parsing_public/CIHP # #sh1424:s3://parsing_public/human3.6 #/mnt/lustre/share/wangyizhou/human3.6 #sh1984:s3://seg_public/human3.6 - cfg: - stride_level: 1 - is_flip: True - crop_size: [ 480, 480 ] - is_multi_scale: True - scale_factor: 11 - center_crop_test: False - base_size: 480 - # mean: [0.485, 0.456, 0.406] - # std: [0.229, 0.224, 0.225] - eval_crop_size: [ 480, 480 ] - ignore2endclass: True - - is_photometricdistortion: True - brightness: 32 - contrast_range: [ 0.5, 1.5 ] - saturation_range: [ 0.5, 1.5 ] - hue_delta: 18 - is_rotate: True - - #### - ignore_value: 255 # duplicated with decoder.kwargs.ignore_value - num_classes: 20 - label_list: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 ] - # reduce_zero_label: True - - sampler: - batch_size: 26 # per card - shuffle_strategy: 1 - - backbone: - type: vit_base_patch16_mask - kwargs: - task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] # wrong list would somehow cause .cuda() stuck without error - pretrained: True - lms_checkpoint_train: fairscale - window: False - test_pos_mode: learnable_interpolate - learnable_pos: True - drop_path_rate: 0.2 - vis_patch_token_ratio: 1 - vis_label_token_ratio: 0. - - patch_neck: - type: MAEdecoder_proj_neck - kwargs: - mask_dim: 256 - modality: rgb - # task_sp_list: ['mask_map'] - - label_neck: - type: MAEdecoder_proj_neck - kwargs: - mask_dim: 256 - modality: dense_labeling - - patch_adapter: - type: rgb_adapter # change to adapter_rgb - kwargs: - pretrained: True - stride_level: 1 - in_chans: 3 - learnable_pos: False - test_pos_mode: False - img_size: 480 - task_sp_list: [ 'pos_embed' ] - - label_adapter: - type: dense_labeling_adapter - kwargs: - pretrained: True - stride_level: 1 - in_chans: 20 - learnable_pos: False - test_pos_mode: False - img_size: 480 - # type_embed: True - dim_class_embed: 64 - emb_padding_idx: 255 - task_sp_list: [ 'pos_embed', 'class_embed', ] - - patch_proj: - type: rgb_projector - kwargs: - loss_cfg: - type: MaskedMSELoss - kwargs: - stride: 1 - norm_pix_loss: True - pix_loss: True - pix_loss_weight: 1. - norm_pix_loss_weight: 1. 
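-# With `learnable_class_embed: False`, class scores in the label projector come
-# from similarity to (pre-extracted, frozen) text features rather than a learned
-# fc, followed by a LayerNorm (`post_mul_norm`) and a scale
-# (`translate_weight_scale`). A rough sketch of that idea only; the actual
-# `pre_proj_type: text_embed_first_mul_second_inter` path also mixes in decoded
-# text tokens:
-#   import torch.nn as nn
-#   import torch.nn.functional as F
-#   class TextPrototypeHead(nn.Module):
-#       def __init__(self, dim, text_features, scale=1.0):  # text_features: (C, dim)
-#           super().__init__()
-#           self.register_buffer('text_features', F.normalize(text_features, dim=-1))
-#           self.post_mul_norm = nn.LayerNorm(text_features.shape[0])
-#           self.scale = scale
-#       def forward(self, tokens):  # tokens: (B, N, dim)
-#           sim = F.normalize(tokens, dim=-1) @ self.text_features.t()  # (B, N, C)
-#           return self.post_mul_norm(sim) * self.scale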
-
-    label_proj:
-      type: dense_labeling_projector
-      kwargs:  # kept one
-        task_sp_list: [ 'post_mul_norm',
-                        # 'upsample_network',
-                        'loss_fn', 'text_features' ]
-        modality_share_list: ['upsample_network',]
-        emb_padding_idx: 255  # should be the same as that in the input adapter
-        upsampling: upconv_down_4  # upsample the label to the same size as the input
-        learnable_class_embed: False  # True: learnable class embedding, very similar to fc; False: alignment with text features
-        pre_proj_type: text_embed_first_mul_second_inter  # use decoded text tokens to contrast with the label patch tokens
-        post_mul_norm: True
-        replace_post_mul_norm: False  # replace the post_mul_norm (LN) with a linear layer
-        translate_weight_scale: 1  # scale factor for the translate weight (1 = unscaled); NOTE that we should
-        # consider whether to use the same or a unique scale for different datasets in joint training
-        text_prototype: True  # extract text features
-        pre_extracted: True  # when pre_extracted is True, the model only loads the pre-extracted features.
-        description_dict_name: checked_par_cihp_name  # this key is only valid when text_prototype is True
-        cls_loss_branch: True
-        l2_norm_debuged: True
-        task: parsing
-        upsample_before_product: True
-        upsample_hidden_dim: 256  # dim of hidden features in upsampling network
-        loss_cfg:
-          type: FocalDiceLoss_bce_cls_emb_sample_weight  #POS_FocalDiceLoss_bce_cls_emb
-          kwargs:
-            # target_type: GaussianHeatMap
-            cfg:  # for maskedsetloss v2
-              ignore_index: 20
-              loss_weight: 1.
-              loss_per_class: True
-              dice_weight: 50.0
-              mask_weight: 50.0
-              class_weight: 0.1
-              deep_supervision: True
-              dec_layers: 9
-              cls_weight_sample: True
-              sample_weight: [ 1.0, 0.25279349, 0.97595474, 0.06368458, 0.08419378,
-                               0.91287129, 0.18341584, 0.50346535, 0.12729844, 0.6937058,
-                               0.96898868, 0.07022631, 0.07464639, 0.99359972, 0.88490099,
-                               0.88490099, 0.27644979000000003, 0.27644979000000003, 0.33016266, 0.33016266 ]  # follow v1 parsing
-    decoder:
-      type: UniHCPv2_Head
-      kwargs:
-        predictor: 'mae'
-        task: recons_rgb
-        modality_share_list: ['predictor.mask_token']
-        task_sp_list: [ # 'predictor.text_features',
-                        'predictor.query_embed_patch',
-                        'predictor.query_embed_label',
-                        # 'predictor.text_pe',
-                        # 'predictor.mask_token',
-                        'predictor.class_embed', 'predictor.fc_bias',  # useless in unihcpv2
-                      ]  # a wrong list can cause .cuda() to hang without an error
-        loss_weight: 1.0
-        transformer_predictor_cfg:
-          hidden_dim: 256
-          num_queries: 20
-          nheads: 8
-          dim_feedforward: 2048
-          dec_layers: 9
-          pre_norm: False
-          arch: fan_in
-          enforce_input_project: False
-          mask_on: False
-          intermediate_output: True
-          num_feature_levels: 1
-          cross_pos_embed: anchor
-          cls_out_dim: 1
-          patch_pos_mode: False  # mode to generate pos_embed for patch tokens in the decoder:
-          # given the fixed self.query_embed_patch (which has the same shape as the one in the adapter),
-          # repeat(batchsize, 1, 1)
-          label_pos_mode: False
-          self_attn_mask_type: patch_diag_label_row_textlabelfull  # type of mask for self-attention,
-          # shape [patch_tokens(rgb), label_tokens(sparse_labeling), fixed text tokens]
-          detach_from_peddet: True  # do not use the peddet_cfgs to modify the model structure
-          # label_ffn_pre_norm: False  # whether to use pre_norm in decoder.
-          # label_ffn_zero_gated: True  # whether to use zero-gated FFN in decoder.
- adding_per_layer_pe: True # whether to add per-layer pe to the input of each decoder layer - use_adapt_pos2d: True - - # no_mask_embed: True - # learnable_class_embed: True - - - loss_cfg: - type: FocalDiceLoss_bce_cls_emb_sample_weight - kwargs: - cfg: - deep_supervision: True - no_object_weight: 0.1 - - class_weight: 0.25 - dice_weight: 5.0 - mask_weight: 5.0 - redundant_queries: 1 - num_points: 12544 - - dec_layers: 6 - - oversample_ratio: 3.0 - importance_sample_ratio: 0.75 - sample_weight: [ 1.0, 0.25279349, 0.97595474, 0.06368458, 0.08419378, - 0.91287129, 0.18341584, 0.50346535, 0.12729844, 0.6937058, - 0.96898868, 0.07022631, 0.07464639, 0.99359972, 0.88490099, - 0.88490099, 0.27644979000000003, 0.27644979000000003, 0.33016266, 0.33016266 ] - - 19: - name: human3.6m_parsing - loss_weight: 2.25 - gres_ratio: 7 # int, > 0, = Task_GPUs / (world_Size/sum(all_gres_ratios)) - dataset: - type: Human3M6ParsingDataset # train for 150 epochs - kwargs: - data_path: Humans1:s3://HumanCentricModel/parsing_public/human3.6 # #sh1424:s3://parsing_public/human3.6 #/mnt/lustre/share/wangyizhou/human3.6 #sh1984:s3://seg_public/human3.6 - cfg: - stride_level: 1 - is_flip: True - crop_size: [ 480, 480 ] - is_multi_scale: True - scale_factor: 11 - center_crop_test: False - base_size: 480 - # mean: [0.485, 0.456, 0.406] - # std: [0.229, 0.224, 0.225] - eval_crop_size: [ 480, 480 ] - ignore2endclass: True - - is_photometricdistortion: True - brightness: 32 - contrast_range: [ 0.5, 1.5 ] - saturation_range: [ 0.5, 1.5 ] - hue_delta: 18 - is_rotate: True - - #### - ignore_value: 255 # duplicated with decoder.kwargs.ignore_value - num_classes: 25 - label_list: [0, 1, 2, 3, 6, 7, 8, 17, 18, 19, 25, 26, 27, 32, 33, 34, 38, 39, 43, 44, - 46, 49, 50, 56, 58] - # reduce_zero_label: True - - sampler: - batch_size: 31 # per card - shuffle_strategy: 1 - - backbone: - type: vit_base_patch16_mask - kwargs: - task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] # wrong list would somehow cause .cuda() stuck without error - pretrained: True - lms_checkpoint_train: fairscale - window: False - test_pos_mode: learnable_interpolate - learnable_pos: True - drop_path_rate: 0.2 - vis_patch_token_ratio: 1 - vis_label_token_ratio: 0. - - patch_neck: - type: MAEdecoder_proj_neck - kwargs: - mask_dim: 256 - modality: rgb - # task_sp_list: ['mask_map'] - - label_neck: - type: MAEdecoder_proj_neck - kwargs: - mask_dim: 256 - modality: dense_labeling - - patch_adapter: - type: rgb_adapter # change to adapter_rgb - kwargs: - pretrained: True - stride_level: 1 - in_chans: 3 - learnable_pos: False - test_pos_mode: False - img_size: 480 - task_sp_list: [ 'pos_embed' ] - - label_adapter: - type: dense_labeling_adapter - kwargs: - pretrained: True - stride_level: 1 - in_chans: 25 - learnable_pos: False - test_pos_mode: False - img_size: 480 - # type_embed: True - dim_class_embed: 64 - emb_padding_idx: 255 - task_sp_list: [ 'pos_embed', 'class_embed', ] - - patch_proj: - type: rgb_projector - kwargs: - loss_cfg: - type: MaskedMSELoss - kwargs: - stride: 1 - norm_pix_loss: True - pix_loss: True - pix_loss_weight: 1. - norm_pix_loss_weight: 1. 
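-# The `sample_weight` lists in these parsing losses weight each class id
-# individually. A minimal sketch of per-class weighting on a BCE-style mask
-# loss (the real FocalDiceLoss_bce_cls_emb_sample_weight applies the weights
-# inside its focal/dice matching terms):
-#   import torch
-#   import torch.nn.functional as F
-#   def weighted_bce(logits, targets, sample_weight):
-#       # logits/targets: (B, C, H, W); sample_weight: tensor of shape (C,)
-#       loss = F.binary_cross_entropy_with_logits(logits, targets, reduction='none')
-#       return (loss * sample_weight.view(1, -1, 1, 1)).mean()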
-
-    label_proj:
-      type: dense_labeling_projector
-      kwargs:  # kept one
-        task_sp_list: [ 'post_mul_norm',
-                        # 'upsample_network',
-                        'loss_fn', 'text_features' ]
-        modality_share_list: ['upsample_network',]
-        emb_padding_idx: 255  # should be the same as that in the input adapter
-        upsampling: upconv_down_4  # upsample the label to the same size as the input
-        learnable_class_embed: False  # True: learnable class embedding, very similar to fc; False: alignment with text features
-        pre_proj_type: text_embed_first_mul_second_inter  # use decoded text tokens to contrast with the label patch tokens
-        post_mul_norm: True
-        replace_post_mul_norm: False  # replace the post_mul_norm (LN) with a linear layer
-        translate_weight_scale: 1  # scale factor for the translate weight (1 = unscaled); NOTE that we should
-        # consider whether to use the same or a unique scale for different datasets in joint training
-        text_prototype: True  # extract text features
-        pre_extracted: True  # when pre_extracted is True, the model only loads the pre-extracted features.
-        description_dict_name: checked_par_human_name  # this key is only valid when text_prototype is True
-        cls_loss_branch: True
-        l2_norm_debuged: True
-        task: parsing
-        upsample_before_product: True
-        upsample_hidden_dim: 256  # dim of hidden features in upsampling network
-        loss_cfg:
-          type: FocalDiceLoss_bce_cls_emb_sample_weight  #POS_FocalDiceLoss_bce_cls_emb
-          kwargs:
-            # target_type: GaussianHeatMap
-            cfg:  # for maskedsetloss v2
-              ignore_index: 25
-              loss_weight: 1.
-              loss_per_class: True
-              dice_weight: 50.0
-              mask_weight: 50.0
-              class_weight: 0.1
-              deep_supervision: True
-              dec_layers: 9
-              cls_weight_sample: True
-              sample_weight: [ 1.0, 0.97325, 0.96685, 0.9903500000000001, 0.97325,
-                               0.96685, 0.9903500000000001, 0.9929, 0.9459, 0.89645,
-                               0.9929, 0.9459, 0.89645, 0.981, 0.9997,
-                               0.99265, 0.9997, 0.99265, 0.9995, 0.9999,
-                               0.9999, 0.9758, 0.9256500000000001, 0.9758, 0.9256500000000001 ]  # follow v1 parsing
-    decoder:
-      type: UniHCPv2_Head
-      kwargs:
-        predictor: 'mae'
-        task: recons_rgb
-        modality_share_list: ['predictor.mask_token']
-        task_sp_list: [ # 'predictor.text_features',
-                        'predictor.query_embed_patch',
-                        'predictor.query_embed_label',
-                        # 'predictor.text_pe',
-                        # 'predictor.mask_token',
-                        'predictor.class_embed', 'predictor.fc_bias',  # useless in unihcpv2
-                      ]  # a wrong list can cause .cuda() to hang without an error
-        loss_weight: 1.0
-        transformer_predictor_cfg:
-          hidden_dim: 256
-          num_queries: 25
-          nheads: 8
-          dim_feedforward: 2048
-          dec_layers: 9
-          pre_norm: False
-          arch: fan_in
-          enforce_input_project: False
-          mask_on: False
-          intermediate_output: True
-          num_feature_levels: 1
-          cross_pos_embed: anchor
-          cls_out_dim: 1
-          patch_pos_mode: False  # mode to generate pos_embed for patch tokens in the decoder:
-          # given the fixed self.query_embed_patch (which has the same shape as the one in the adapter),
-          # repeat(batchsize, 1, 1)
-          label_pos_mode: False
-          self_attn_mask_type: patch_diag_label_row_textlabelfull  # type of mask for self-attention,
-          # shape [patch_tokens(rgb), label_tokens(sparse_labeling), fixed text tokens]
-          detach_from_peddet: True  # do not use the peddet_cfgs to modify the model structure
-          # label_ffn_pre_norm: False  # whether to use pre_norm in decoder.
-          # label_ffn_zero_gated: True  # whether to use zero-gated FFN in decoder.
- adding_per_layer_pe: True # whether to add per-layer pe to the input of each decoder layer - use_adapt_pos2d: True - - # no_mask_embed: True - # learnable_class_embed: True - - - loss_cfg: - type: FocalDiceLoss_bce_cls_emb_sample_weight - kwargs: - cfg: - deep_supervision: True - no_object_weight: 0.1 - - class_weight: 0.25 - dice_weight: 5.0 - mask_weight: 5.0 - redundant_queries: 1 - num_points: 12544 - - dec_layers: 6 - - oversample_ratio: 3.0 - importance_sample_ratio: 0.75 - sample_weight: [ 1.0, 0.25279349, 0.97595474, 0.06368458, 0.08419378, - 0.91287129, 0.18341584, 0.50346535, 0.12729844, 0.6937058, - 0.96898868, 0.07022631, 0.07464639, 0.99359972, 0.88490099, - 0.88490099, 0.27644979000000003, 0.27644979000000003, 0.33016266, 0.33016266 ] - - 20: - name: modanet_parsing - loss_weight: 0.021 - gres_ratio: 1 # int, > 0, = Task_GPUs / (world_Size/sum(all_gres_ratios)) - dataset: - type: ModaNetParsingDataset # train for 150 epochs - kwargs: - data_path: Humans1:s3://HumanCentricModel/parsing_public/ModaNet/ # #sh1424:s3://parsing_public/human3.6 #/mnt/lustre/share/wangyizhou/human3.6 #sh1984:s3://seg_public/human3.6 - cfg: - stride_level: 1 - is_flip: True - crop_size: [ 480, 480 ] - is_multi_scale: True - scale_factor: 11 - center_crop_test: False - base_size: 480 - # mean: [0.485, 0.456, 0.406] - # std: [0.229, 0.224, 0.225] - eval_crop_size: [ 480, 480 ] - ignore2endclass: True - - is_photometricdistortion: True - brightness: 32 - contrast_range: [ 0.5, 1.5 ] - saturation_range: [ 0.5, 1.5 ] - hue_delta: 18 - is_rotate: True - - #### - ignore_value: 255 # duplicated with decoder.kwargs.ignore_value - num_classes: 14 - label_list: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ] - # reduce_zero_label: True - - sampler: - batch_size: 27 # per card - shuffle_strategy: 1 - - backbone: - type: vit_base_patch16_mask - kwargs: - task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] # wrong list would somehow cause .cuda() stuck without error - pretrained: True - lms_checkpoint_train: fairscale - window: False - test_pos_mode: learnable_interpolate - learnable_pos: True - drop_path_rate: 0.2 - vis_patch_token_ratio: 1 - vis_label_token_ratio: 0. - - patch_neck: - type: MAEdecoder_proj_neck - kwargs: - mask_dim: 256 - modality: rgb - # task_sp_list: ['mask_map'] - - label_neck: - type: MAEdecoder_proj_neck - kwargs: - mask_dim: 256 - modality: dense_labeling - - patch_adapter: - type: rgb_adapter # change to adapter_rgb - kwargs: - pretrained: True - stride_level: 1 - in_chans: 3 - learnable_pos: False - test_pos_mode: False - img_size: 480 - task_sp_list: [ 'pos_embed' ] - - label_adapter: - type: dense_labeling_adapter - kwargs: - pretrained: True - stride_level: 1 - in_chans: 14 - learnable_pos: False - test_pos_mode: False - img_size: 480 - # type_embed: True - dim_class_embed: 64 - emb_padding_idx: 255 - task_sp_list: [ 'pos_embed', 'class_embed', ] - - patch_proj: - type: rgb_projector - kwargs: - loss_cfg: - type: MaskedMSELoss - kwargs: - stride: 1 - norm_pix_loss: True - pix_loss: True - pix_loss_weight: 1. - norm_pix_loss_weight: 1. 
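-# `upsampling: upconv_down_4` with `upsample_hidden_dim: 256` suggests the label
-# projector restores the 1/16-resolution patch-token map to input resolution via
-# four 2x transposed-conv stages. A hypothetical reading, not the repo's exact
-# module:
-#   import torch.nn as nn
-#   def make_upsample_network(in_dim=256, hidden=256, out_dim=256):
-#       layers, d = [], in_dim
-#       for _ in range(4):  # 4 stages of 2x -> 16x total
-#           layers += [nn.ConvTranspose2d(d, hidden, kernel_size=2, stride=2), nn.GELU()]
-#           d = hidden
-#       return nn.Sequential(*layers, nn.Conv2d(hidden, out_dim, kernel_size=1))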
-
-    label_proj:
-      type: dense_labeling_projector
-      kwargs:  # kept one
-        task_sp_list: [ 'post_mul_norm',
-                        'post_mul_norm_cls',
-                        # 'upsample_network',
-                        'loss_fn', 'text_features' ]
-        modality_share_list: ['upsample_network',]
-        emb_padding_idx: 255  # should be the same as that in the input adapter
-        upsampling: upconv_down_4  # upsample the label to the same size as the input
-        learnable_class_embed: False  # True: learnable class embedding, very similar to fc; False: alignment with text features
-        pre_proj_type: text_embed_first_mul_second_inter  # use decoded text tokens to contrast with the label patch tokens
-        post_mul_norm: True
-        #post_mul_norm_cls: True
-        replace_post_mul_norm: False  # replace the post_mul_norm (LN) with a linear layer
-        translate_weight_scale: 1  # scale factor for the translate weight (1 = unscaled); NOTE that we should
-        # consider whether to use the same or a unique scale for different datasets in joint training
-        text_prototype: True  # extract text features
-        pre_extracted: True  # when pre_extracted is True, the model only loads the pre-extracted features.
-        description_dict_name: checked_par_modanet_name  # this key is only valid when text_prototype is True
-        cls_loss_branch: True
-        upsample_before_product: True
-        upsample_hidden_dim: 256  # dim of hidden features in upsampling network
-        #text_mlp: True
-        task: parsing
-        loss_cfg:
-          type: FocalDiceLoss_bce_cls_emb_sample_weight  #POS_FocalDiceLoss_bce_cls_emb
-          kwargs:
-            # target_type: GaussianHeatMap
-            cfg:  # for maskedsetloss v2
-              ignore_index: 14
-              loss_weight: 1.
-              loss_per_class: True
-              dice_weight: 50.0
-              mask_weight: 50.0
-              class_weight: 0.1
-              deep_supervision: True
-              dec_layers: 9
-              cls_weight_sample: True
-              sample_weight: [ 1.0, 0.3933582160972342, 0.2633553450090918, 0.13557278208440998, 0.7506555651258494,
-                               0.45334481768590296, 0.2760455545985262, 0.16753756340319648, 0.4404249210450761, 0.6636233132357163,
-                               0.13457747152837593, 0.25979519571250836, 0.10422049956933678, 0.0956263757297349 ]  # follow v1 parsing
-    decoder:
-      type: UniHCPv2_Head
-      kwargs:
-        predictor: 'mae'
-        task: recons_rgb
-        modality_share_list: ['predictor.mask_token']
-        task_sp_list: [ # 'predictor.text_features',
-                        'predictor.query_embed_patch',
-                        'predictor.query_embed_label',
-                        # 'predictor.text_pe',
-                        # 'predictor.mask_token',
-                        'predictor.class_embed', 'predictor.fc_bias',  # useless in unihcpv2
-                      ]  # a wrong list can cause .cuda() to hang without an error
-        loss_weight: 1.0
-        transformer_predictor_cfg:
-          hidden_dim: 256
-          num_queries: 20
-          nheads: 8
-          dim_feedforward: 2048
-          dec_layers: 9
-          pre_norm: False
-          arch: fan_in
-          enforce_input_project: False
-          mask_on: False
-          intermediate_output: True
-          num_feature_levels: 1
-          cross_pos_embed: anchor
-          cls_out_dim: 1
-          patch_pos_mode: False  # mode to generate pos_embed for patch tokens in the decoder:
-          # given the fixed self.query_embed_patch (which has the same shape as the one in the adapter),
-          # repeat(batchsize, 1, 1)
-          label_pos_mode: False
-          self_attn_mask_type: patch_diag_label_row_textlabelfull  # type of mask for self-attention,
-          # shape [patch_tokens(rgb), label_tokens(sparse_labeling), fixed text tokens]
-          detach_from_peddet: True  # do not use the peddet_cfgs to modify the model structure
-          # label_ffn_pre_norm: False  # whether to use pre_norm in decoder.
-          # label_ffn_zero_gated: True  # whether to use zero-gated FFN in decoder.
- adding_per_layer_pe: True # whether to add per-layer pe to the input of each decoder layer - use_adapt_pos2d: True - - # no_mask_embed: True - # learnable_class_embed: True - - - loss_cfg: - type: FocalDiceLoss_bce_cls_emb_sample_weight - kwargs: - cfg: - deep_supervision: True - no_object_weight: 0.1 - - class_weight: 0.25 - dice_weight: 5.0 - mask_weight: 5.0 - redundant_queries: 1 - num_points: 12544 - - dec_layers: 6 - - oversample_ratio: 3.0 - importance_sample_ratio: 0.75 - sample_weight: [ 1.0, 0.25279349, 0.97595474, 0.06368458, 0.08419378, - 0.91287129, 0.18341584, 0.50346535, 0.12729844, 0.6937058, - 0.96898868, 0.07022631, 0.07464639, 0.99359972, 0.88490099, - 0.88490099, 0.27644979000000003, 0.27644979000000003, 0.33016266, 0.33016266 ] - - 21: - name: VIP_parsing - loss_weight: 0.021 - gres_ratio: 1 # int, > 0, = Task_GPUs / (world_Size/sum(all_gres_ratios)) - dataset: - type: VIPParsingDataset # train for 150 epochs - kwargs: - data_path: Humans1:s3://HumanCentricModel/parsing_public/VIP # #sh1424:s3://parsing_public/human3.6 #/mnt/lustre/share/wangyizhou/human3.6 #sh1984:s3://seg_public/human3.6 - cfg: - stride_level: 1 - is_flip: True - crop_size: [ 480, 480 ] - is_multi_scale: True - scale_factor: 11 - center_crop_test: False - base_size: 480 - # mean: [0.485, 0.456, 0.406] - # std: [0.229, 0.224, 0.225] - eval_crop_size: [ 480, 480 ] - ignore2endclass: True - - is_photometricdistortion: True - brightness: 32 - contrast_range: [ 0.5, 1.5 ] - saturation_range: [ 0.5, 1.5 ] - hue_delta: 18 - is_rotate: True - - #### - ignore_value: 255 # duplicated with decoder.kwargs.ignore_value - num_classes: 20 - label_list: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 ] - # reduce_zero_label: True - - sampler: - batch_size: 27 # per card - shuffle_strategy: 1 - - backbone: - type: vit_base_patch16_mask - kwargs: - task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] # wrong list would somehow cause .cuda() stuck without error - pretrained: True - lms_checkpoint_train: fairscale - window: False - test_pos_mode: learnable_interpolate - learnable_pos: True - drop_path_rate: 0.2 - vis_patch_token_ratio: 1 - vis_label_token_ratio: 0. - - patch_neck: - type: MAEdecoder_proj_neck - kwargs: - mask_dim: 256 - modality: rgb - # task_sp_list: ['mask_map'] - - label_neck: - type: MAEdecoder_proj_neck - kwargs: - mask_dim: 256 - modality: dense_labeling - - patch_adapter: - type: rgb_adapter # change to adapter_rgb - kwargs: - pretrained: True - stride_level: 1 - in_chans: 3 - learnable_pos: False - test_pos_mode: False - img_size: 480 - task_sp_list: [ 'pos_embed' ] - - label_adapter: - type: dense_labeling_adapter - kwargs: - pretrained: True - stride_level: 1 - in_chans: 20 - learnable_pos: False - test_pos_mode: False - img_size: 480 - # type_embed: True - dim_class_embed: 64 - emb_padding_idx: 255 - task_sp_list: [ 'pos_embed', 'class_embed', ] - - patch_proj: - type: rgb_projector - kwargs: - loss_cfg: - type: MaskedMSELoss - kwargs: - stride: 1 - norm_pix_loss: True - pix_loss: True - pix_loss_weight: 1. - norm_pix_loss_weight: 1. 
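-# The dense_labeling_adapter above embeds each ground-truth class id into a
-# `dim_class_embed`-d vector; id 255 (`emb_padding_idx`) is the ignore label
-# and maps to a frozen zero vector. Sketch (the table size of 256 is an
-# assumption):
-#   import torch
-#   import torch.nn as nn
-#   class_embed = nn.Embedding(num_embeddings=256, embedding_dim=64, padding_idx=255)
-#   label_map = torch.randint(0, 20, (1, 480, 480))  # e.g. a VIP parsing label map
-#   tokens = class_embed(label_map)                  # (1, 480, 480, 64)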
-
-    label_proj:
-      type: dense_labeling_projector
-      kwargs:  # kept one
-        task_sp_list: [ 'post_mul_norm',
-                        'post_mul_norm_cls',
-                        # 'upsample_network',
-                        'loss_fn', 'text_features' ]
-        modality_share_list: ['upsample_network',]
-        emb_padding_idx: 255  # should be the same as that in the input adapter
-        upsampling: upconv_down_4  # upsample the label to the same size as the input
-        learnable_class_embed: False  # True: learnable class embedding, very similar to fc; False: alignment with text features
-        pre_proj_type: text_embed_first_mul_second_inter  # use decoded text tokens to contrast with the label patch tokens
-        post_mul_norm: True
-        #post_mul_norm_cls: True
-        replace_post_mul_norm: False  # replace the post_mul_norm (LN) with a linear layer
-        translate_weight_scale: 1  # scale factor for the translate weight (1 = unscaled); NOTE that we should
-        # consider whether to use the same or a unique scale for different datasets in joint training
-        text_prototype: True  # extract text features
-        pre_extracted: True  # when pre_extracted is True, the model only loads the pre-extracted features.
-        description_dict_name: checked_par_vip_name  # this key is only valid when text_prototype is True
-        cls_loss_branch: True
-        upsample_before_product: True
-        upsample_hidden_dim: 256  # dim of hidden features in upsampling network
-        #text_mlp: True
-        task: parsing
-        loss_cfg:
-          type: FocalDiceLoss_bce_cls_emb_sample_weight  #POS_FocalDiceLoss_bce_cls_emb
-          kwargs:
-            # target_type: GaussianHeatMap
-            cfg:  # for maskedsetloss v2
-              ignore_index: 20
-              loss_weight: 1.
-              loss_per_class: True
-              dice_weight: 50.0
-              mask_weight: 50.0
-              class_weight: 0.1
-              deep_supervision: True
-              dec_layers: 9
-              cls_weight_sample: True
-              sample_weight: [ 1.0, 0.3266013319616655, 0.9908495316476258, 0.029184038117927337, 0.052466294872489036,
-                               0.991336834695977, 0.10801884238453625, 0.30001624343494504, 0.3465807569440684, 0.9136932156586712,
-                               0.9863555146461639, 0.015810276679841896, 0.11895608858086523, 0.9925821647084303, 0.9789106069630192,
-                               0.9789106069630192, 0.4952081866912123, 0.4952081866912123, 0.7048026422654177, 0.7048026422654177, ]  # follow v1 parsing
-    decoder:
-      type: UniHCPv2_Head
-      kwargs:
-        predictor: 'mae'
-        task: recons_rgb
-        modality_share_list: ['predictor.mask_token']
-        task_sp_list: [ # 'predictor.text_features',
-                        'predictor.query_embed_patch',
-                        'predictor.query_embed_label',
-                        # 'predictor.text_pe',
-                        # 'predictor.mask_token',
-                        'predictor.class_embed', 'predictor.fc_bias',  # useless in unihcpv2
-                      ]  # a wrong list can cause .cuda() to hang without an error
-        loss_weight: 1.0
-        transformer_predictor_cfg:
-          hidden_dim: 256
-          num_queries: 20
-          nheads: 8
-          dim_feedforward: 2048
-          dec_layers: 9
-          pre_norm: False
-          arch: fan_in
-          enforce_input_project: False
-          mask_on: False
-          intermediate_output: True
-          num_feature_levels: 1
-          cross_pos_embed: anchor
-          cls_out_dim: 1
-          patch_pos_mode: False  # mode to generate pos_embed for patch tokens in the decoder:
-          # given the fixed self.query_embed_patch (which has the same shape as the one in the adapter),
-          # repeat(batchsize, 1, 1)
-          label_pos_mode: False
-          self_attn_mask_type: patch_diag_label_row_textlabelfull  # type of mask for self-attention,
-          # shape [patch_tokens(rgb), label_tokens(sparse_labeling), fixed text tokens]
-          detach_from_peddet: True  # do not use the peddet_cfgs to modify the model structure
-          # label_ffn_pre_norm: False  # whether to use pre_norm in decoder.
-          # label_ffn_zero_gated: True  # whether to use zero-gated FFN in decoder.
- adding_per_layer_pe: True # whether to add per-layer pe to the input of each decoder layer - use_adapt_pos2d: True - - # no_mask_embed: True - # learnable_class_embed: True - - - loss_cfg: - type: FocalDiceLoss_bce_cls_emb_sample_weight - kwargs: - cfg: - deep_supervision: True - no_object_weight: 0.1 - - class_weight: 0.25 - dice_weight: 5.0 - mask_weight: 5.0 - redundant_queries: 1 - num_points: 12544 - - dec_layers: 6 - - oversample_ratio: 3.0 - importance_sample_ratio: 0.75 - sample_weight: [ 1.0, 0.25279349, 0.97595474, 0.06368458, 0.08419378, - 0.91287129, 0.18341584, 0.50346535, 0.12729844, 0.6937058, - 0.96898868, 0.07022631, 0.07464639, 0.99359972, 0.88490099, - 0.88490099, 0.27644979000000003, 0.27644979000000003, 0.33016266, 0.33016266 ] - - 22: - name: deepfashion_parsing - loss_weight: 0.042 - gres_ratio: 2 # int, > 0, = Task_GPUs / (world_Size/sum(all_gres_ratios)) - dataset: - type: DeepFashionParsingDataset # train for 150 epochs - kwargs: - data_path: Humans1:s3://HumanCentricModel/parsing_public/deepfashion2/ # #sh1424:s3://parsing_public/human3.6 #/mnt/lustre/share/wangyizhou/human3.6 #sh1984:s3://seg_public/human3.6 - cfg: - stride_level: 1 - is_flip: True - crop_size: [ 480, 480 ] - is_multi_scale: True - scale_factor: 11 - center_crop_test: False - base_size: 480 - # mean: [0.485, 0.456, 0.406] - # std: [0.229, 0.224, 0.225] - eval_crop_size: [ 480, 480 ] - ignore2endclass: True - - is_photometricdistortion: True - brightness: 32 - contrast_range: [ 0.5, 1.5 ] - saturation_range: [ 0.5, 1.5 ] - hue_delta: 18 - is_rotate: True - - #### - ignore_value: 255 # duplicated with decoder.kwargs.ignore_value - num_classes: 14 - label_list: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ] - # reduce_zero_label: True - - sampler: - batch_size: 27 # per card - shuffle_strategy: 1 - - backbone: - type: vit_base_patch16_mask - kwargs: - task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] # wrong list would somehow cause .cuda() stuck without error - pretrained: True - lms_checkpoint_train: fairscale - window: False - test_pos_mode: learnable_interpolate - learnable_pos: True - drop_path_rate: 0.2 - vis_patch_token_ratio: 1 - vis_label_token_ratio: 0. - - patch_neck: - type: MAEdecoder_proj_neck - kwargs: - mask_dim: 256 - modality: rgb - # task_sp_list: ['mask_map'] - - label_neck: - type: MAEdecoder_proj_neck - kwargs: - mask_dim: 256 - modality: dense_labeling - - patch_adapter: - type: rgb_adapter # change to adapter_rgb - kwargs: - pretrained: True - stride_level: 1 - in_chans: 3 - learnable_pos: False - test_pos_mode: False - img_size: 480 - task_sp_list: [ 'pos_embed' ] - - label_adapter: - type: dense_labeling_adapter - kwargs: - pretrained: True - stride_level: 1 - in_chans: 14 - learnable_pos: False - test_pos_mode: False - img_size: 480 - # type_embed: True - dim_class_embed: 64 - emb_padding_idx: 255 - task_sp_list: [ 'pos_embed', 'class_embed', ] - - patch_proj: - type: rgb_projector - kwargs: - loss_cfg: - type: MaskedMSELoss - kwargs: - stride: 1 - norm_pix_loss: True - pix_loss: True - pix_loss_weight: 1. - norm_pix_loss_weight: 1. 
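-# The photometric-distortion fields above follow the usual mmseg-style pipeline:
-# each op fires with probability 0.5 inside the configured ranges. A partial
-# sketch (saturation/hue need an HSV round-trip, omitted here):
-#   import numpy as np
-#   def photometric_distortion(img, brightness=32, contrast=(0.5, 1.5)):
-#       # img: float32 RGB in [0, 255]
-#       if np.random.rand() < 0.5:
-#           img = img + np.random.uniform(-brightness, brightness)
-#       if np.random.rand() < 0.5:
-#           img = img * np.random.uniform(*contrast)
-#       return np.clip(img, 0, 255)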
-
-    label_proj:
-      type: dense_labeling_projector
-      kwargs:  # kept one
-        task_sp_list: [ 'post_mul_norm',
-                        'post_mul_norm_cls',
-                        # 'upsample_network',
-                        'loss_fn', 'text_features' ]
-        modality_share_list: ['upsample_network',]
-        emb_padding_idx: 255  # should be the same as that in the input adapter
-        upsampling: upconv_down_4  # upsample the label to the same size as the input
-        learnable_class_embed: False  # True: learnable class embedding, very similar to fc; False: alignment with text features
-        pre_proj_type: text_embed_first_mul_second_inter  # use decoded text tokens to contrast with the label patch tokens
-        post_mul_norm: True
-        #post_mul_norm_cls: True
-        replace_post_mul_norm: False  # replace the post_mul_norm (LN) with a linear layer
-        translate_weight_scale: 1  # scale factor for the translate weight (1 = unscaled); NOTE that we should
-        # consider whether to use the same or a unique scale for different datasets in joint training
-        text_prototype: True  # extract text features
-        pre_extracted: True  # when pre_extracted is True, the model only loads the pre-extracted features.
-        description_dict_name: checked_par_deepfashion_name  # this key is only valid when text_prototype is True
-        cls_loss_branch: True
-        upsample_before_product: True
-        upsample_hidden_dim: 256  # dim of hidden features in upsampling network
-        #text_mlp: True
-        task: parsing
-        loss_cfg:
-          type: FocalDiceLoss_bce_cls_emb_sample_weight  #POS_FocalDiceLoss_bce_cls_emb
-          kwargs:
-            # target_type: GaussianHeatMap
-            cfg:  # for maskedsetloss v2
-              ignore_index: 14
-              loss_weight: 1.
-              loss_per_class: True
-              dice_weight: 50.0
-              mask_weight: 50.0
-              class_weight: 0.1
-              deep_supervision: True
-              dec_layers: 9
-              cls_weight_sample: True
-              sample_weight: [ 1.0, 0.367704898390819, 0.18624095519402378, 0.002807862013638187, 0.06970686754080256,
-                               0.08321481967691353, 0.010231244888284599, 0.18925719286730117, 0.28635504086767627, 0.15953761441126063,
-                               0.0887055183084064, 0.04064888180411646, 0.09255004922874958, 0.03362141268278453, ]  # follow v1 parsing
-    decoder:
-      type: UniHCPv2_Head
-      kwargs:
-        predictor: 'mae'
-        task: recons_rgb
-        modality_share_list: ['predictor.mask_token']
-        task_sp_list: [ # 'predictor.text_features',
-                        'predictor.query_embed_patch',
-                        'predictor.query_embed_label',
-                        # 'predictor.text_pe',
-                        # 'predictor.mask_token',
-                        'predictor.class_embed', 'predictor.fc_bias',  # useless in unihcpv2
-                      ]  # a wrong list can cause .cuda() to hang without an error
-        loss_weight: 1.0
-        transformer_predictor_cfg:
-          hidden_dim: 256
-          num_queries: 20
-          nheads: 8
-          dim_feedforward: 2048
-          dec_layers: 9
-          pre_norm: False
-          arch: fan_in
-          enforce_input_project: False
-          mask_on: False
-          intermediate_output: True
-          num_feature_levels: 1
-          cross_pos_embed: anchor
-          cls_out_dim: 1
-          patch_pos_mode: False  # mode to generate pos_embed for patch tokens in the decoder:
-          # given the fixed self.query_embed_patch (which has the same shape as the one in the adapter),
-          # repeat(batchsize, 1, 1)
-          label_pos_mode: False
-          self_attn_mask_type: patch_diag_label_row_textlabelfull  # type of mask for self-attention,
-          # shape [patch_tokens(rgb), label_tokens(sparse_labeling), fixed text tokens]
-          detach_from_peddet: True  # do not use the peddet_cfgs to modify the model structure
-          # label_ffn_pre_norm: False  # whether to use pre_norm in decoder.
-          # label_ffn_zero_gated: True  # whether to use zero-gated FFN in decoder.
- adding_per_layer_pe: True # whether to add per-layer pe to the input of each decoder layer - use_adapt_pos2d: True - - # no_mask_embed: True - # learnable_class_embed: True - - - loss_cfg: - type: FocalDiceLoss_bce_cls_emb_sample_weight - kwargs: - cfg: - deep_supervision: True - no_object_weight: 0.1 - - class_weight: 0.25 - dice_weight: 5.0 - mask_weight: 5.0 - redundant_queries: 1 - num_points: 12544 - - dec_layers: 6 - - oversample_ratio: 3.0 - importance_sample_ratio: 0.75 - sample_weight: [ 1.0, 0.25279349, 0.97595474, 0.06368458, 0.08419378, - 0.91287129, 0.18341584, 0.50346535, 0.12729844, 0.6937058, - 0.96898868, 0.07022631, 0.07464639, 0.99359972, 0.88490099, - 0.88490099, 0.27644979000000003, 0.27644979000000003, 0.33016266, 0.33016266 ] - - 23: - name: PaperDoll_parsing - loss_weight: 0.021 - gres_ratio: 1 # int, > 0, = Task_GPUs / (world_Size/sum(all_gres_ratios)) - dataset: - type: PaperDollParsingDataset # train for 150 epochs - kwargs: - data_path: Humans1:s3://HumanCentricModel/parsing_public/PaperDoll # #sh1424:s3://parsing_public/human3.6 #/mnt/lustre/share/wangyizhou/human3.6 #sh1984:s3://seg_public/human3.6 - cfg: - stride_level: 1 - is_flip: True - crop_size: [ 480, 480 ] - is_multi_scale: True - scale_factor: 11 - center_crop_test: False - base_size: 480 - # mean: [0.485, 0.456, 0.406] - # std: [0.229, 0.224, 0.225] - eval_crop_size: [ 480, 480 ] - ignore2endclass: True - - is_photometricdistortion: True - brightness: 32 - contrast_range: [ 0.5, 1.5 ] - saturation_range: [ 0.5, 1.5 ] - hue_delta: 18 - is_rotate: True - - #### - ignore_value: 255 # duplicated with decoder.kwargs.ignore_value - num_classes: 20 - label_list: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 ] - # reduce_zero_label: True - - sampler: - batch_size: 27 # per card - shuffle_strategy: 1 - - backbone: - type: vit_base_patch16_mask - kwargs: - task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] # wrong list would somehow cause .cuda() stuck without error - pretrained: True - lms_checkpoint_train: fairscale - window: False - test_pos_mode: learnable_interpolate - learnable_pos: True - drop_path_rate: 0.2 - vis_patch_token_ratio: 1 - vis_label_token_ratio: 0. - - patch_neck: - type: MAEdecoder_proj_neck - kwargs: - mask_dim: 256 - modality: rgb - # task_sp_list: ['mask_map'] - - label_neck: - type: MAEdecoder_proj_neck - kwargs: - mask_dim: 256 - modality: dense_labeling - - patch_adapter: - type: rgb_adapter # change to adapter_rgb - kwargs: - pretrained: True - stride_level: 1 - in_chans: 3 - learnable_pos: False - test_pos_mode: False - img_size: 480 - task_sp_list: [ 'pos_embed' ] - - label_adapter: - type: dense_labeling_adapter - kwargs: - pretrained: True - stride_level: 1 - in_chans: 20 - learnable_pos: False - test_pos_mode: False - img_size: 480 - # type_embed: True - dim_class_embed: 64 - emb_padding_idx: 255 - task_sp_list: [ 'pos_embed', 'class_embed', ] - - patch_proj: - type: rgb_projector - kwargs: - loss_cfg: - type: MaskedMSELoss - kwargs: - stride: 1 - norm_pix_loss: True - pix_loss: True - pix_loss_weight: 1. - norm_pix_loss_weight: 1. 
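-# `ignore2endclass: True` appears to remap the 255 ignore value to one past the
-# last class id, which is why these parsing losses use ignore_index equal to
-# num_classes (20 for this dataset). Sketch:
-#   import numpy as np
-#   def ignore_to_end_class(label, num_classes=20, ignore_value=255):
-#       label = label.copy()
-#       label[label == ignore_value] = num_classes
-#       return label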
-
-    label_proj:
-      type: dense_labeling_projector
-      kwargs:  # kept one
-        task_sp_list: [ 'post_mul_norm',
-                        'post_mul_norm_cls',
-                        # 'upsample_network',
-                        'loss_fn', 'text_features' ]
-        modality_share_list: ['upsample_network',]
-        emb_padding_idx: 255  # should be the same as that in the input adapter
-        upsampling: upconv_down_4  # upsample the label to the same size as the input
-        learnable_class_embed: False  # True: learnable class embedding, very similar to fc; False: alignment with text features
-        pre_proj_type: text_embed_first_mul_second_inter  # use decoded text tokens to contrast with the label patch tokens
-        post_mul_norm: True
-        #post_mul_norm_cls: True
-        replace_post_mul_norm: False  # replace the post_mul_norm (LN) with a linear layer
-        translate_weight_scale: 1  # scale factor for the translate weight (1 = unscaled); NOTE that we should
-        # consider whether to use the same or a unique scale for different datasets in joint training
-        text_prototype: True  # extract text features
-        pre_extracted: True  # when pre_extracted is True, the model only loads the pre-extracted features.
-        description_dict_name: checked_par_paperdoll_name  # this key is only valid when text_prototype is True
-        cls_loss_branch: True
-        upsample_before_product: True
-        upsample_hidden_dim: 256  # dim of hidden features in upsampling network
-        #text_mlp: True
-        task: parsing
-        loss_cfg:
-          type: FocalDiceLoss_bce_cls_emb_sample_weight  #POS_FocalDiceLoss_bce_cls_emb
-          kwargs:
-            # target_type: GaussianHeatMap
-            cfg:  # for maskedsetloss v2
-              ignore_index: 20
-              loss_weight: 1.
-              loss_per_class: True
-              dice_weight: 50.0
-              mask_weight: 50.0
-              class_weight: 0.1
-              deep_supervision: True
-              dec_layers: 9
-              cls_weight_sample: True
-              sample_weight: [ 1.0, 0.12651171233101552, 0.9445288709780197, 0.022596273603759997, 0.1542096228225839,
-                               0.7740073338443981, 0.3171279444960444, 0.38393872629003634, 0.19776277195374156, 0.5762416654276241,
-                               0.932492136102867, 0.0684559727964192, 0.2131960924782717, 0.9246929266441772, 0.9079233711740138,
-                               0.9079233711740138, 0.5743937220129259, 0.5743937220129259, 0.7146935638660443, 0.7146935638660443, ]  # follow v1 parsing
-    decoder:
-      type: UniHCPv2_Head
-      kwargs:
-        predictor: 'mae'
-        task: recons_rgb
-        modality_share_list: ['predictor.mask_token']
-        task_sp_list: [ # 'predictor.text_features',
-                        'predictor.query_embed_patch',
-                        'predictor.query_embed_label',
-                        # 'predictor.text_pe',
-                        # 'predictor.mask_token',
-                        'predictor.class_embed', 'predictor.fc_bias',  # useless in unihcpv2
-                      ]  # a wrong list can cause .cuda() to hang without an error
-        loss_weight: 1.0
-        transformer_predictor_cfg:
-          hidden_dim: 256
-          num_queries: 20
-          nheads: 8
-          dim_feedforward: 2048
-          dec_layers: 9
-          pre_norm: False
-          arch: fan_in
-          enforce_input_project: False
-          mask_on: False
-          intermediate_output: True
-          num_feature_levels: 1
-          cross_pos_embed: anchor
-          cls_out_dim: 1
-          patch_pos_mode: False  # mode to generate pos_embed for patch tokens in the decoder:
-          # given the fixed self.query_embed_patch (which has the same shape as the one in the adapter),
-          # repeat(batchsize, 1, 1)
-          label_pos_mode: False
-          self_attn_mask_type: patch_diag_label_row_textlabelfull  # type of mask for self-attention,
-          # shape [patch_tokens(rgb), label_tokens(sparse_labeling), fixed text tokens]
-          detach_from_peddet: True  # do not use the peddet_cfgs to modify the model structure
-          # label_ffn_pre_norm: False  # whether to use pre_norm in decoder.
-          # label_ffn_zero_gated: True  # whether to use zero-gated FFN in decoder.
- adding_per_layer_pe: True # whether to add per-layer pe to the input of each decoder layer - use_adapt_pos2d: True - - # no_mask_embed: True - # learnable_class_embed: True - - - loss_cfg: - type: FocalDiceLoss_bce_cls_emb_sample_weight - kwargs: - cfg: - deep_supervision: True - no_object_weight: 0.1 - - class_weight: 0.25 - dice_weight: 5.0 - mask_weight: 5.0 - redundant_queries: 1 - num_points: 12544 - - dec_layers: 6 - - oversample_ratio: 3.0 - importance_sample_ratio: 0.75 - sample_weight: [ 1.0, 0.25279349, 0.97595474, 0.06368458, 0.08419378, - 0.91287129, 0.18341584, 0.50346535, 0.12729844, 0.6937058, - 0.96898868, 0.07022631, 0.07464639, 0.99359972, 0.88490099, - 0.88490099, 0.27644979000000003, 0.27644979000000003, 0.33016266, 0.33016266 ]
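-# Per the recurring comment, gres_ratio = Task_GPUs / (world_size / sum(all_gres_ratios)),
-# i.e. each task's GPU share is proportional to its ratio. Sketch of the
-# arithmetic only (rounding is schematic; the solver's exact assignment may differ):
-#   def gpus_per_task(world_size, gres_ratios):
-#       unit = world_size / sum(gres_ratios)  # GPUs backing one ratio unit
-#       return [r * unit for r in gres_ratios]
-#   # e.g. the parsing tasks with ratios 4, 4, 7 get 4-7x the cards of the
-#   # ratio-1 pose/attr tasks in this file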