# Common configuration
#
# Shared optimizer, schedule, dataset and model settings. Four model variants
# are defined (model_small / model_base / model_large / model_huge); they
# differ only in backbone embed_dim, depth, num_heads, drop_path_rate and the
# matching keypoint_head in_channels.

optimizer = dict(
    type='AdamW',
    lr=1e-3,
    betas=(0.9, 0.999),
    weight_decay=0.1,
    constructor='LayerDecayOptimizerConstructor',
    paramwise_cfg=dict(
        # NOTE(review): 12 matches the depth of model_small / model_base only;
        # model_large (24) and model_huge (32) presumably need this overridden
        # by whatever consumes this config -- confirm.
        num_layers=12,
        layer_decay_rate=1 - 2e-4,
        # Every entry uses the same `decay_mult` key so that a consumer
        # looking up `decay_mult` sees a 0 multiplier for all four patterns.
        custom_keys={
            'bias': dict(decay_mult=0.),
            'pos_embed': dict(decay_mult=0.),
            'relative_position_bias_table': dict(decay_mult=0.),
            'norm': dict(decay_mult=0.),
        },
    ),
)

optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2))

# learning policy
# NOTE(review): step=[3] with total_epochs=4 presumably means a single LR step
# before the last epoch -- depends on the runner interpreting `step` in epochs.
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=300,
    warmup_ratio=0.001,
    step=[3],
)

total_epochs = 4
target_type = 'GaussianHeatmap'

# heatmap_size is exactly 1/4 of image_size in each dimension.
# NOTE(review): image_size here is [192, 256] while the backbones below use
# img_size=(256, 192); presumably (w, h) vs (h, w) -- confirm with consumers.
data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=False,
    det_bbox_thr=0.0,
    bbox_file='data/coco/person_detection_results/'
    'COCO_val2017_detections_AP_H_56_person.json',
)

# NOTE(review): absolute, user-specific path; it is interpolated into the
# annotation and image paths below at import time. Left unchanged so existing
# setups keep working.
data_root = '/home/adryw/dataset/COCO17'

# The train / val / test entries all reference the same `data_cfg` object
# (not copies), and val and test point at the same val2017 files.
data = dict(
    samples_per_gpu=64,
    workers_per_gpu=6,
    val_dataloader=dict(samples_per_gpu=128),
    test_dataloader=dict(samples_per_gpu=128),
    train=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
        img_prefix=f'{data_root}/train2017/',
        data_cfg=data_cfg,
    ),
    val=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
        img_prefix=f'{data_root}/val2017/',
        data_cfg=data_cfg,
    ),
    test=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
        img_prefix=f'{data_root}/val2017/',
        data_cfg=data_cfg,
    ),
)

# Small variant: embed_dim=384, depth=12, num_heads=12, drop_path_rate=0.1.
model_small = dict(
    type='TopDown',
    pretrained=None,
    backbone=dict(
        type='ViT',
        img_size=(256, 192),
        patch_size=16,
        embed_dim=384,
        depth=12,
        num_heads=12,
        ratio=1,
        use_checkpoint=False,
        mlp_ratio=4,
        qkv_bias=True,
        drop_path_rate=0.1,
    ),
    keypoint_head=dict(
        type='TopdownHeatmapSimpleHead',
        in_channels=384,  # equals the backbone embed_dim
        num_deconv_layers=2,
        num_deconv_filters=(256, 256),
        num_deconv_kernels=(4, 4),
        extra=dict(final_conv_kernel=1),
        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True),
    ),
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=True,
        post_process='default',
        shift_heatmap=False,
        target_type=target_type,
        modulate_kernel=11,
        use_udp=True,
    ),
)

# Base variant: embed_dim=768, depth=12, num_heads=12, drop_path_rate=0.3.
model_base = dict(
    type='TopDown',
    pretrained=None,
    backbone=dict(
        type='ViT',
        img_size=(256, 192),
        patch_size=16,
        embed_dim=768,
        depth=12,
        num_heads=12,
        ratio=1,
        use_checkpoint=False,
        mlp_ratio=4,
        qkv_bias=True,
        drop_path_rate=0.3,
    ),
    keypoint_head=dict(
        type='TopdownHeatmapSimpleHead',
        in_channels=768,  # equals the backbone embed_dim
        num_deconv_layers=2,
        num_deconv_filters=(256, 256),
        num_deconv_kernels=(4, 4),
        extra=dict(final_conv_kernel=1),
        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True),
    ),
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=True,
        post_process='default',
        shift_heatmap=False,
        target_type=target_type,
        modulate_kernel=11,
        use_udp=True,
    ),
)

# Large variant: embed_dim=1024, depth=24, num_heads=16, drop_path_rate=0.5.
model_large = dict(
    type='TopDown',
    pretrained=None,
    backbone=dict(
        type='ViT',
        img_size=(256, 192),
        patch_size=16,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        ratio=1,
        use_checkpoint=False,
        mlp_ratio=4,
        qkv_bias=True,
        drop_path_rate=0.5,
    ),
    keypoint_head=dict(
        type='TopdownHeatmapSimpleHead',
        in_channels=1024,  # equals the backbone embed_dim
        num_deconv_layers=2,
        num_deconv_filters=(256, 256),
        num_deconv_kernels=(4, 4),
        extra=dict(final_conv_kernel=1),
        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True),
    ),
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=True,
        post_process='default',
        shift_heatmap=False,
        target_type=target_type,
        modulate_kernel=11,
        use_udp=True,
    ),
)

# Huge variant: embed_dim=1280, depth=32, num_heads=16, drop_path_rate=0.55.
model_huge = dict(
    type='TopDown',
    pretrained=None,
    backbone=dict(
        type='ViT',
        img_size=(256, 192),
        patch_size=16,
        embed_dim=1280,
        depth=32,
        num_heads=16,
        ratio=1,
        use_checkpoint=False,
        mlp_ratio=4,
        qkv_bias=True,
        drop_path_rate=0.55,
    ),
    keypoint_head=dict(
        type='TopdownHeatmapSimpleHead',
        in_channels=1280,  # equals the backbone embed_dim
        num_deconv_layers=2,
        num_deconv_filters=(256, 256),
        num_deconv_kernels=(4, 4),
        extra=dict(final_conv_kernel=1),
        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True),
    ),
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=True,
        post_process='default',
        shift_heatmap=False,
        target_type=target_type,
        modulate_kernel=11,
        use_udp=True,
    ),
)