Spaces:
Running
Running
# Common configuration
# Optimizer settings: AdamW, built through the constructor named in
# `constructor`, with per-parameter overrides supplied via `paramwise_cfg`.
optimizer = dict(
    type='AdamW',
    lr=1e-3,
    betas=(0.9, 0.999),
    weight_decay=0.1,
    constructor='LayerDecayOptimizerConstructor',
    paramwise_cfg=dict(
        # NOTE(review): num_layers is fixed at 12 here, while model_large and
        # model_huge in this file declare depth=24 / depth=32 -- confirm what
        # the constructor expects when those models are used.
        num_layers=12,
        layer_decay_rate=1 - 2e-4,
        # Parameter-name patterns mapped to a weight-decay multiplier of 0.
        # All entries must use the same `decay_mult` keyword; the 'bias'
        # entry was previously misspelled `decay_multi`, so it did not match
        # the key used by the other three entries.
        custom_keys={
            'bias': dict(decay_mult=0.),
            'pos_embed': dict(decay_mult=0.),
            'relative_position_bias_table': dict(decay_mult=0.),
            'norm': dict(decay_mult=0.),
        },
    ),
)
# Gradient clipping settings: max_norm=1. with norm_type=2.
optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2))
# learning policy
# 'step' policy with a 'linear' warm-up of 300 iterations (warmup_ratio=0.001).
# NOTE(review): step=[3] together with total_epochs=4 presumably means a single
# LR drop at epoch 3 -- confirm against the runner that consumes this config.
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=300,
    warmup_ratio=0.001,
    step=[3],
)
total_epochs = 4
# Heatmap target type; also referenced by the `test_cfg` of each model config
# defined later in this file.
target_type = 'GaussianHeatmap'

# Dataset-level settings shared by the train / val / test dataset entries.
data_cfg = dict(
    # NOTE(review): image_size is [192, 256] here while the backbones use
    # img_size=(256, 192) -- presumably (w, h) vs (h, w); confirm.
    image_size=[192, 256],
    heatmap_size=[48, 64],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=False,
    det_bbox_thr=0.0,
    # Adjacent string literals are concatenated into a single relative path.
    bbox_file='data/coco/person_detection_results/'
              'COCO_val2017_detections_AP_H_56_person.json',
)
# Root directory of the COCO 2017 dataset; the annotation and image paths
# below are all built from it.
# NOTE(review): this is a machine-specific absolute path -- adjust per host.
data_root = '/home/adryw/dataset/COCO17'

# Dataloader and dataset settings. `val` and `test` point at the same val2017
# annotation file and image folder; all three splits share `data_cfg`, which
# is defined earlier in this file.
data = dict(
    samples_per_gpu=64,
    workers_per_gpu=6,
    val_dataloader=dict(samples_per_gpu=128),
    test_dataloader=dict(samples_per_gpu=128),
    train=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
        img_prefix=f'{data_root}/train2017/',
        data_cfg=data_cfg,
    ),
    val=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
        img_prefix=f'{data_root}/val2017/',
        data_cfg=data_cfg,
    ),
    test=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
        img_prefix=f'{data_root}/val2017/',
        data_cfg=data_cfg,
    ),
)
# Small variant: 'TopDown' model with a 'ViT' backbone (embed_dim=384,
# depth=12, num_heads=12) and a 'TopdownHeatmapSimpleHead' keypoint head.
model_small = dict(
    type='TopDown',
    pretrained=None,
    backbone=dict(
        type='ViT',
        img_size=(256, 192),
        patch_size=16,
        embed_dim=384,
        depth=12,
        num_heads=12,
        ratio=1,
        use_checkpoint=False,
        mlp_ratio=4,
        qkv_bias=True,
        drop_path_rate=0.1,
    ),
    keypoint_head=dict(
        type='TopdownHeatmapSimpleHead',
        # Matches the backbone's embed_dim.
        in_channels=384,
        num_deconv_layers=2,
        num_deconv_filters=(256, 256),
        num_deconv_kernels=(4, 4),
        extra=dict(final_conv_kernel=1),
        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True),
    ),
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=True,
        post_process='default',
        shift_heatmap=False,
        # `target_type` is the module-level name defined earlier in this file.
        target_type=target_type,
        modulate_kernel=11,
        use_udp=True,
    ),
)
# Base variant: 'TopDown' model with a 'ViT' backbone (embed_dim=768,
# depth=12, num_heads=12) and a 'TopdownHeatmapSimpleHead' keypoint head.
model_base = dict(
    type='TopDown',
    pretrained=None,
    backbone=dict(
        type='ViT',
        img_size=(256, 192),
        patch_size=16,
        embed_dim=768,
        depth=12,
        num_heads=12,
        ratio=1,
        use_checkpoint=False,
        mlp_ratio=4,
        qkv_bias=True,
        drop_path_rate=0.3,
    ),
    keypoint_head=dict(
        type='TopdownHeatmapSimpleHead',
        # Matches the backbone's embed_dim.
        in_channels=768,
        num_deconv_layers=2,
        num_deconv_filters=(256, 256),
        num_deconv_kernels=(4, 4),
        extra=dict(final_conv_kernel=1),
        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True),
    ),
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=True,
        post_process='default',
        shift_heatmap=False,
        # `target_type` is the module-level name defined earlier in this file.
        target_type=target_type,
        modulate_kernel=11,
        use_udp=True,
    ),
)
# Large variant: 'TopDown' model with a 'ViT' backbone (embed_dim=1024,
# depth=24, num_heads=16) and a 'TopdownHeatmapSimpleHead' keypoint head.
model_large = dict(
    type='TopDown',
    pretrained=None,
    backbone=dict(
        type='ViT',
        img_size=(256, 192),
        patch_size=16,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        ratio=1,
        use_checkpoint=False,
        mlp_ratio=4,
        qkv_bias=True,
        drop_path_rate=0.5,
    ),
    keypoint_head=dict(
        type='TopdownHeatmapSimpleHead',
        # Matches the backbone's embed_dim.
        in_channels=1024,
        num_deconv_layers=2,
        num_deconv_filters=(256, 256),
        num_deconv_kernels=(4, 4),
        extra=dict(final_conv_kernel=1),
        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True),
    ),
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=True,
        post_process='default',
        shift_heatmap=False,
        # `target_type` is the module-level name defined earlier in this file.
        target_type=target_type,
        modulate_kernel=11,
        use_udp=True,
    ),
)
# Huge variant: 'TopDown' model with a 'ViT' backbone (embed_dim=1280,
# depth=32, num_heads=16) and a 'TopdownHeatmapSimpleHead' keypoint head.
model_huge = dict(
    type='TopDown',
    pretrained=None,
    backbone=dict(
        type='ViT',
        img_size=(256, 192),
        patch_size=16,
        embed_dim=1280,
        depth=32,
        num_heads=16,
        ratio=1,
        use_checkpoint=False,
        mlp_ratio=4,
        qkv_bias=True,
        drop_path_rate=0.55,
    ),
    keypoint_head=dict(
        type='TopdownHeatmapSimpleHead',
        # Matches the backbone's embed_dim.
        in_channels=1280,
        num_deconv_layers=2,
        num_deconv_filters=(256, 256),
        num_deconv_kernels=(4, 4),
        extra=dict(final_conv_kernel=1),
        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True),
    ),
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=True,
        post_process='default',
        shift_heatmap=False,
        # `target_type` is the module-level name defined earlier in this file.
        target_type=target_type,
        modulate_kernel=11,
        use_udp=True,
    ),
)