log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
    interval=25,
    metric=['PCK', 'NME', 'AUC', 'EPE'],
    key_indicator='PCK',
    gpu_collect=True,
    res_folder='')
optimizer = dict(
    type='Adam',
    lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=0.001,
    step=[160, 180])
total_epochs = 200
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook')
    ])

channel_cfg = dict(
    num_output_channels=1,
    dataset_joints=1,
    dataset_channel=[
        [
            0,
        ],
    ],
    inference_channel=[
        0,
    ],
    max_kpt_num=100)

# model settings
model = dict(
    type='PoseAnythingModel',
    pretrained='pretrained/swinv2_small_1k_500k.pth',
    text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
    finetune_text_pretrained=False,
    encoder_config=dict(
        type='SwinTransformerV2',
        embed_dim=96,
        depths=[2, 2, 18, 2],
        num_heads=[3, 6, 12, 24],
        window_size=16,
        drop_path_rate=0.3,
        img_size=256,
        upsample="bilinear"
    ),
    keypoint_head=dict(
        type='PoseHead',
        img_in_channels=768,
        text_in_channels=768,
        # text_in_channels=512,
        transformer=dict(
            type='EncoderDecoder',
            d_model=256,
            nhead=8,
            num_encoder_layers=3,
            num_decoder_layers=3,
            graph_decoder='pre',
            dim_feedforward=768,
            dropout=0.1,
            similarity_proj_dim=256,
            dynamic_proj_dim=128,
            activation="relu",
            normalize_before=False,
            return_intermediate_dec=True),
        share_kpt_branch=False,
        num_decoder_layer=3,
        with_heatmap_loss=True,
        heatmap_loss_weight=2.0,
        support_order_dropout=-1,
        positional_encoding=dict(
            type='SinePositionalEncoding',
            num_feats=128,
            normalize=True)),
    # training and testing settings
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=False,
        post_process='default',
        shift_heatmap=True,
        modulate_kernel=11))

data_cfg = dict(
    image_size=[256, 256],
    heatmap_size=[64, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'])

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=15,
        scale_factor=0.15),
    dict(type='TopDownAffineFewShot'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTargetFewShot', sigma=1),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
        ]),
]

valid_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownAffineFewShot'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTargetFewShot', sigma=1),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
        ]),
]

test_pipeline = valid_pipeline

data_root = 'data/mp100'
data = dict(
    samples_per_gpu=16,
    workers_per_gpu=16,
    # samples_per_gpu=8,
    # workers_per_gpu=8,
    train=dict(
        type='TransformerPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split2_train.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        pipeline=train_pipeline),
    val=dict(
        type='TransformerPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split2_val.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        num_queries=15,
        num_episodes=100,
        pipeline=valid_pipeline),
    test=dict(
        type='TestPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split2_test.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        num_queries=15,
        num_episodes=200,
        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
        pipeline=test_pipeline),
)

vis_backends = [
    dict(type='LocalVisBackend'),
    dict(type='TensorboardVisBackend'),
]
visualizer = dict(
    type='PoseLocalVisualizer',
    vis_backends=vis_backends,
    name='visualizer')

shuffle_cfg = dict(interval=1)
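
# -----------------------------------------------------------------------------
# Usage sketch, kept as comments so nothing extra runs when mmcv executes this
# config. It assumes the file is consumed through mmcv's Config API (as in
# mmpose 0.x-style training scripts); the path below is a hypothetical location
# for this file in your checkout, not a value defined by this config.
#
#   from mmcv import Config
#   cfg = Config.fromfile('configs/mp100/split2_1shot.py')  # hypothetical path
#   assert cfg.model['type'] == 'PoseAnythingModel'
#   print(cfg.total_epochs, cfg.data['train']['num_shots'])
# -----------------------------------------------------------------------------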