I. Overall training flow
def main():
    rank, world_size = dist_init()  # rank: process ID; world_size: number of processes / GPUs
    logger.info("init done")

    # load cfg
    cfg.merge_from_file(args.cfg)

    # only the rank-0 process initializes logging
    if rank == 0:
        if not os.path.exists(cfg.TRAIN.LOG_DIR):
            os.makedirs(cfg.TRAIN.LOG_DIR)
        init_log('global', logging.INFO)
        if cfg.TRAIN.LOG_DIR:
            add_file_handler('global',
                             os.path.join(cfg.TRAIN.LOG_DIR, 'logs.txt'),
                             logging.INFO)

        logger.info("Version Information: \n{}\n".format(commit()))
        logger.info("config \n{}".format(json.dumps(cfg, indent=4)))

    # Build the model. The constructor calls get_backbone / get_neck / get_rpn_head
    # (and optionally get_mask_head).
    # ModelBuilder implements the training-time forward(data): data contains the
    # template patch, the search patch, the classification label label_cls, the
    # box-regression label label_loc, and the regression weights label_loc_weight.
    # It returns a dict with total_loss / cls_loss / loc_loss.
    # ModelBuilder also implements the inference-time methods template(z) (backbone
    # and neck for the template branch) and track(x) (backbone and neck for the
    # search branch; its output is fed into rpn_head together with the stored
    # template features, returning cls / loc / mask (optional)).
    model = ModelBuilder().cuda().train()

    # load the pretrained backbone weights into the freshly built model
    if cfg.BACKBONE.PRETRAINED:
        cur_path = os.path.dirname(os.path.realpath(__file__))
        backbone_path = os.path.join(cur_path, '../', cfg.BACKBONE.PRETRAINED)
        load_pretrain(model.backbone, backbone_path)

    # create tensorboard writer
    if rank == 0 and cfg.TRAIN.LOG_DIR:
        tb_writer = SummaryWriter(cfg.TRAIN.LOG_DIR)
    else:
        tb_writer = None

    # create dataset loader
    train_loader = build_data_loader()

    # build optimizer and lr_scheduler
    optimizer, lr_scheduler = build_opt_lr(model, cfg.TRAIN.START_EPOCH)

    # resume training
    if cfg.TRAIN.RESUME:
        logger.info("resume from {}".format(cfg.TRAIN.RESUME))
        assert os.path.isfile(cfg.TRAIN.RESUME), \
            '{} is not a valid file.'.format(cfg.TRAIN.RESUME)
        model, optimizer, cfg.TRAIN.START_EPOCH = \
            restore_from(model, optimizer, cfg.TRAIN.RESUME)
    # load pretrain
    elif cfg.TRAIN.PRETRAINED:
        load_pretrain(model, cfg.TRAIN.PRETRAINED)

    # the authors' own distributed-training wrapper: copies a model replica to each GPU
    dist_model = DistModule(model)

    logger.info(lr_scheduler)
    logger.info("model prepare done")

    # start training
    train(train_loader, dist_model, optimizer, lr_scheduler, tb_writer)
II. ModelBuilder
1. Backbone
The authors consider the stride of the original ResNet too large, so in conv4 and conv5 stride=2 is changed to stride=1. To preserve the original receptive field, dilated convolution is used: dilation=1 is the default convolution with no holes, while dilation=2 leaves a gap of one cell between the cells of the kernel.
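To make this trade-off concrete, here is a minimal sketch (dummy tensors only, not pysot code) comparing a stride-2 3*3 convolution with its stride-1, dilation-2 replacement: the stride-1 variant keeps the spatial resolution, and dilation 2 spreads the 3*3 kernel over a 5*5 window so downstream receptive fields are not reduced.

import torch
import torch.nn as nn

x = torch.randn(1, 64, 31, 31)   # dummy feature map

# original ResNet-style entry of a stage: stride 2 halves the resolution
conv_s2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1, bias=False)

# pysot-style replacement: stride 1 keeps the resolution, dilation 2
# (with padding 2) makes the 3x3 kernel cover a 5x5 window
conv_d2 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=2, dilation=2, bias=False)

print(conv_s2(x).shape)  # torch.Size([1, 64, 16, 16]) -- resolution halved
print(conv_d2(x).shape)  # torch.Size([1, 64, 31, 31]) -- resolution preserved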
The layers of the modified ResNet-50 are summarized below, followed by the full module printout.
In each layer, the first Bottleneck applies a downsample branch: on the first call the output channel count no longer matches the input channel count, so x cannot be added directly to the convolved x; the downsample projection keeps the dimensions consistent.
layer1: dilation=1, self.inplanes = 64*4 = 256, output channels: 64 => 256
layer2: dilation=1, self.inplanes = 128*4 = 512, output channels: 256 => 512, self.feature_size = 128 * block.expansion = 128*4 = 512
layer3: dilation=2 to enlarge the receptive field, self.inplanes = 256*4 = 1024, output channels: 512 => 1024, self.feature_size = (256+128) * block.expansion = (256+128)*4 = 1536
layer4: dilation=4 to enlarge the receptive field, self.inplanes = 512*4 = 2048, output channels: 1024 => 2048, self.feature_size = 512 * block.expansion = 512*4 = 2048
ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): Bottleneck(
      (conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (2): Bottleneck(
      (conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
  )
  (layer2): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), bias=False)
      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): Bottleneck(
      (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (2): Bottleneck(
      (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (3): Bottleneck(
      (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
  )
  (layer3): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(512, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (1): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): Bottleneck(
      (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)
      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (2): Bottleneck(
      (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)
      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (3): Bottleneck(
      (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)
      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (4): Bottleneck(
      (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)
      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (5): Bottleneck(
      (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)
      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
  )
  (layer4): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)
      (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(1024, 2048, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)
        (1): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): Bottleneck(
      (conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(4, 4), dilation=(4, 4), bias=False)
      (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (2): Bottleneck(
      (conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(4, 4), dilation=(4, 4), bias=False)
      (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
  )
)
2. Neck
The neck bridges the backbone and the head. The outputs of ResNet layer2/3/4 (conv3/4/5 in the paper's figure) are used as inputs. The config sets the output channels to 256 and the inputs to the channel counts of layer2/3/4, i.e. (512 => 256), (1024 => 256), (2048 => 256); concretely this is a 1*1 convolution (named downsample). For the template frame, the feature map after layer2/3/4 is 15*15; to reduce computation, a 7*7 region around the center is cropped out and used as the template feature.
class AdjustLayer(nn.Module):
    def __init__(self, in_channels, out_channels, center_size=7):
        super(AdjustLayer, self).__init__()
        self.downsample = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
            nn.BatchNorm2d(out_channels),
            )
        self.center_size = center_size

    def forward(self, x):
        x = self.downsample(x)
        if x.size(3) < 20:  # the template feature is 15x15 after layer2/3/4
            l = (x.size(3) - self.center_size) // 2
            r = l + self.center_size
            x = x[:, :, l:r, l:r]
        return x
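A quick shape check (a sketch with dummy tensors, not repository code) of how the AdjustLayer above treats template-sized vs. search-sized feature maps:

import torch

adjust = AdjustLayer(in_channels=512, out_channels=256, center_size=7)

zf = torch.randn(1, 512, 15, 15)   # template feature from layer2: gets center-cropped
xf = torch.randn(1, 512, 31, 31)   # search feature: size >= 20, so no crop

print(adjust(zf).shape)  # torch.Size([1, 256, 7, 7])
print(adjust(xf).shape)  # torch.Size([1, 256, 31, 31])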
3. RPN head
First look at the config: MultiRPN is used with 5 anchor shapes; the three inputs, all 256 channels, correspond to the three neck outputs, and weighted is true.
RPN:
    TYPE: 'MultiRPN'
    KWARGS:
        anchor_num: 5
        in_channels: [256, 256, 256]
        weighted: true
For each of the three neck outputs a DepthwiseRPN is applied. Trainable weight parameters self.cls_weight and self.loc_weight are defined (see the nn.Parameter discussion in the pysot basics post) and passed through softmax to obtain per-level weights; the results of the three levels are then combined by a weighted average, giving the classification-branch and regression-branch features.
class MultiRPN(RPN):
    def __init__(self, anchor_num, in_channels, weighted=False):
        super(MultiRPN, self).__init__()
        self.weighted = weighted
        for i in range(len(in_channels)):
            self.add_module('rpn'+str(i+2),
                    DepthwiseRPN(anchor_num, in_channels[i], in_channels[i]))
        if self.weighted:
            self.cls_weight = nn.Parameter(torch.ones(len(in_channels)))
            self.loc_weight = nn.Parameter(torch.ones(len(in_channels)))

    def forward(self, z_fs, x_fs):
        cls = []
        loc = []
        for idx, (z_f, x_f) in enumerate(zip(z_fs, x_fs), start=2):
            rpn = getattr(self, 'rpn'+str(idx))
            c, l = rpn(z_f, x_f)
            cls.append(c)
            loc.append(l)

        if self.weighted:
            cls_weight = F.softmax(self.cls_weight, 0)
            loc_weight = F.softmax(self.loc_weight, 0)

        def avg(lst):
            return sum(lst) / len(lst)

        def weighted_avg(lst, weight):
            s = 0
            for i in range(len(weight)):
                s += lst[i] * weight[i]
            return s

        if self.weighted:
            return weighted_avg(cls, cls_weight), weighted_avg(loc, loc_weight)
        else:
            return avg(cls), avg(loc)
Inside DepthwiseRPN, DepthwiseXCorr is used twice: once with input 256, hidden 256 and output 2*5=10 for the classification feature, and once with input 256, hidden 256 and output 4*5=20 for the localization feature.
In DepthwiseXCorr, the template and search features each pass through a 3*3 convolution (channels stay at 256), and the template feature is then used as the kernel of a depthwise cross-correlation. self.head raises the channel count (to 2k or 4k) with two 1*1 convolutions, and this happens after xcorr_depthwise. Unlike SiamRPN, which raises the channel count to 2k or 4k before the cross-correlation, SiamRPN++ does it afterwards, which removes a large amount of computation.
The depthwise cross-correlation works as follows: the search feature is reshaped into a single tensor of shape (1, batch*channel, W, H) and the template feature into batch*channel kernels of shape (1, w, h); the search tensor is split into batch*channel groups and a grouped convolution is applied (see the linked post on grouped convolution). Each input channel is therefore convolved with exactly one kernel, a one-to-one pairing, and the result is finally reshaped back to (B, C, H, W).

class DepthwiseRPN(RPN):
    def __init__(self, anchor_num=5, in_channels=256, out_channels=256):
        super(DepthwiseRPN, self).__init__()
        self.cls = DepthwiseXCorr(in_channels, out_channels, 2 * anchor_num)
        self.loc = DepthwiseXCorr(in_channels, out_channels, 4 * anchor_num)

    def forward(self, z_f, x_f):
        cls = self.cls(z_f, x_f)
        loc = self.loc(z_f, x_f)
        return cls, loc


class DepthwiseXCorr(nn.Module):
    def __init__(self, in_channels, hidden, out_channels, kernel_size=3, hidden_kernel_size=5):
        super(DepthwiseXCorr, self).__init__()
        self.conv_kernel = nn.Sequential(
                nn.Conv2d(in_channels, hidden, kernel_size=kernel_size, bias=False),
                nn.BatchNorm2d(hidden),
                nn.ReLU(inplace=True),
                )
        self.conv_search = nn.Sequential(
                nn.Conv2d(in_channels, hidden, kernel_size=kernel_size, bias=False),
                nn.BatchNorm2d(hidden),
                nn.ReLU(inplace=True),
                )
        self.head = nn.Sequential(
                nn.Conv2d(hidden, hidden, kernel_size=1, bias=False),
                nn.BatchNorm2d(hidden),
                nn.ReLU(inplace=True),
                nn.Conv2d(hidden, out_channels, kernel_size=1)
                )

    def forward(self, kernel, search):
        kernel = self.conv_kernel(kernel)
        search = self.conv_search(search)
        feature = xcorr_depthwise(search, kernel)
        out = self.head(feature)
        return out


def xcorr_depthwise(x, kernel):
    """depthwise cross correlation
    """
    batch = kernel.size(0)
    channel = kernel.size(1)
    x = x.view(1, batch*channel, x.size(2), x.size(3))
    kernel = kernel.view(batch*channel, 1, kernel.size(2), kernel.size(3))
    out = F.conv2d(x, kernel, groups=batch*channel)
    out = out.view(batch, channel, out.size(2), out.size(3))
    return out
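The grouped-convolution trick can be verified with a small sketch using the xcorr_depthwise defined above (dummy tensors, channel count reduced for speed; in pysot the channel count is 256): the output is exactly a per-sample, per-channel correlation of the search feature with its own template kernel.

import torch
import torch.nn.functional as F

x = torch.randn(2, 4, 31, 31)      # search features (B, C, H, W)
kernel = torch.randn(2, 4, 7, 7)   # template features used as per-channel kernels

out = xcorr_depthwise(x, kernel)
print(out.shape)                   # torch.Size([2, 4, 25, 25])  (31 - 7 + 1 = 25)

# reference: correlate each (b, c) slice with its own kernel, one-to-one
ref = torch.stack([torch.stack([
        F.conv2d(x[b, c][None, None], kernel[b, c][None, None])[0, 0]
        for c in range(x.size(1))]) for b in range(x.size(0))])
print(torch.allclose(out, ref, atol=1e-5))  # True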
4. Forward pass
First, the template and search frames are passed through the model built above (backbone, neck, rpn_head) to obtain the classification output cls and the localization output loc. cls is then reshaped so that its channels split into a foreground/background pair per anchor, that 2-way dimension is moved to the end, and log_softmax is applied over it, giving the softmax scores cls.
cls and label_cls are then fed into select_cross_entropy_loss to obtain the classification loss cls_loss; loc, label_loc and label_loc_weight are fed into weight_l1_loss to obtain the regression loss loc_loss. The returned outputs dict contains total_loss, cls_loss and loc_loss.
    def log_softmax(self, cls):
        b, a2, h, w = cls.size()
        cls = cls.view(b, 2, a2//2, h, w)
        cls = cls.permute(0, 2, 3, 4, 1).contiguous()
        cls = F.log_softmax(cls, dim=4)
        return cls

    def forward(self, data):
        """ only used in training
        """
        template = data['template'].cuda()
        search = data['search'].cuda()
        label_cls = data['label_cls'].cuda()
        label_loc = data['label_loc'].cuda()
        label_loc_weight = data['label_loc_weight'].cuda()

        # get feature
        zf = self.backbone(template)
        xf = self.backbone(search)
        if cfg.MASK.MASK:
            zf = zf[-1]
            self.xf_refine = xf[:-1]
            xf = xf[-1]
        if cfg.ADJUST.ADJUST:
            zf = self.neck(zf)
            xf = self.neck(xf)
        cls, loc = self.rpn_head(zf, xf)

        # get loss
        cls = self.log_softmax(cls)
        cls_loss = select_cross_entropy_loss(cls, label_cls)
        loc_loss = weight_l1_loss(loc, label_loc, label_loc_weight)

        outputs = {}
        outputs['total_loss'] = cfg.TRAIN.CLS_WEIGHT * cls_loss + \
            cfg.TRAIN.LOC_WEIGHT * loc_loss
        outputs['cls_loss'] = cls_loss
        outputs['loc_loss'] = loc_loss

        if cfg.MASK.MASK:
            # TODO
            mask, self.mask_corr_feature = self.mask_head(zf, xf)
            mask_loss = None
            outputs['total_loss'] += cfg.TRAIN.MASK_WEIGHT * mask_loss
            outputs['mask_loss'] = mask_loss
        return outputs
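The reshaping done by log_softmax is easy to check on a dummy tensor (a sketch only; the 5-anchor, 25*25 shapes match the configuration discussed above):

import torch
import torch.nn.functional as F

b, a2, h, w = 8, 2 * 5, 25, 25                 # 5 anchors -> 2*5 = 10 score channels
cls = torch.randn(b, a2, h, w)

cls = cls.view(b, 2, a2 // 2, h, w)            # split channels into (bg/fg, anchor)
cls = cls.permute(0, 2, 3, 4, 1).contiguous()  # move the 2-way bg/fg axis to the end
cls = F.log_softmax(cls, dim=4)                # normalize over (bg, fg)

print(cls.shape)                               # torch.Size([8, 5, 25, 25, 2])
print(cls.exp().sum(dim=4)[0, 0, 0, 0])        # tensor(1.) -- probabilities sum to 1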
III. load_pretrain
def load_pretrain(model, pretrained_path):
    logger.info('load pretrained model from {}'.format(pretrained_path))
    # 1. pick the current CUDA device
    device = torch.cuda.current_device()
    # 2. load the checkpoint onto that device
    pretrained_dict = torch.load(pretrained_path,
        map_location=lambda storage, loc: storage.cuda(device))
    # 3. strip the 'module.' prefix (typically left over from multi-GPU distributed training)
    if "state_dict" in pretrained_dict.keys():
        pretrained_dict = remove_prefix(pretrained_dict['state_dict'], 'module.')
    else:
        pretrained_dict = remove_prefix(pretrained_dict, 'module.')
    # 4. check key completeness: do the parameters of the freshly built model
    #    match the parameters in the checkpoint?
    try:
        check_keys(model, pretrained_dict)
    except:
        # add the "features." prefix and try again
        logger.info('[Warning]: using pretrain as features. '
                    'Adding "features." as prefix')
        new_dict = {}
        for k, v in pretrained_dict.items():
            k = 'features.' + k
            new_dict[k] = v
        pretrained_dict = new_dict
        check_keys(model, pretrained_dict)
    # 5. load the parameters
    model.load_state_dict(pretrained_dict, strict=False)
    return model
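remove_prefix and check_keys are used above but not shown. The sketch below is a simplified reconstruction of what they do (not the repository code), assuming a standard state_dict layout:

def remove_prefix(state_dict, prefix):
    """Strip a leading prefix (e.g. 'module.') from every key, if present."""
    strip = lambda k: k[len(prefix):] if k.startswith(prefix) else k
    return {strip(key): value for key, value in state_dict.items()}


def check_keys(model, pretrained_state_dict):
    """Log missing/unused keys and require at least some overlap."""
    ckpt_keys = set(pretrained_state_dict.keys())
    model_keys = set(model.state_dict().keys())
    used_keys = ckpt_keys & model_keys
    logger.info('missing keys: {}'.format(model_keys - ckpt_keys))
    logger.info('unused checkpoint keys: {}'.format(ckpt_keys - model_keys))
    assert len(used_keys) > 0, 'load NONE from pretrained checkpoint'
    return True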
IV. build_data_loader
1. Anchors
The figure drawn by the referenced blogger (original link in the references) shows, from left to right, the detection frame (the 255×255 search region), the anchor centers on the feature map (red dots), and the generated anchor boxes (blue boxes). Every point on the feature map is mapped back to its corresponding position in the detection frame; this is where the ori value in the code comes from.

class Anchors:
    """
    This class generates anchors.
    """
    def __init__(self, stride, ratios, scales, image_center=0, size=0):
        self.stride = stride
        self.ratios = ratios
        self.scales = scales
        self.image_center = image_center
        self.size = size

        self.anchor_num = len(self.scales) * len(self.ratios)  # 1 * 5
        self.anchors = None
        self.generate_anchors()

    def generate_anchors(self):
        """
        generate anchors based on predefined configuration
        """
        self.anchors = np.zeros((self.anchor_num, 4), dtype=np.float32)  # 5 rows of (x1, y1, x2, y2)
        size = self.stride * self.stride  # 8 * 8
        count = 0
        for r in self.ratios:
            ws = int(math.sqrt(size*1. / r))
            hs = int(ws * r)  # r = h / w

            for s in self.scales:
                w = ws * s
                h = hs * s
                self.anchors[count][:] = [-w*0.5, -h*0.5, w*0.5, h*0.5][:]  # centered at (0, 0)
                # strictly speaking this gives 'bottom-left / top-right' rather than
                # 'top-left / bottom-right' corners -- unless image coordinates are assumed,
                # where the y axis points downwards, in which case 'top-left / bottom-right' holds
                count += 1
        # generate_anchors computes, for every anchor shape, its corner offsets relative to the
        # anchor center; generate_all_anchors below uses the start position and the stride to
        # place these anchor boxes at every sliding-window position

    def generate_all_anchors(self, im_c, size):  # 127, 25
        """
        im_c: image center
        size: image size
        """
        if self.image_center == im_c and self.size == size:
            return False
        self.image_center = im_c
        self.size = size

        a0x = im_c - size // 2 * self.stride  # 127 - (25 // 2 * 8) = 31
        # ori is the position on the detection frame (255x255) that the top-left
        # point of the feature map maps back to
        ori = np.array([a0x] * 4, dtype=np.float32)  # [31. 31. 31. 31.]
        zero_anchors = self.anchors + ori

        x1 = zero_anchors[:, 0]
        y1 = zero_anchors[:, 1]
        x2 = zero_anchors[:, 2]
        y2 = zero_anchors[:, 3]

        x1, y1, x2, y2 = map(lambda x: x.reshape(self.anchor_num, 1, 1),
                             [x1, y1, x2, y2])
        cx, cy, w, h = corner2center([x1, y1, x2, y2])  # cx.shape = (5, 1, 1)

        disp_x = np.arange(0, size).reshape(1, 1, -1) * self.stride
        # disp_x.shape = (1, 1, 25); multiplied by the stride to map feature-map
        # coordinates back onto the search frame
        disp_y = np.arange(0, size).reshape(1, -1, 1) * self.stride

        cx = cx + disp_x  # cx.shape = (5, 1, 25): every disp_x offset is added to each of the 5 rows
        cy = cy + disp_y

        # broadcast
        zero = np.zeros((self.anchor_num, size, size), dtype=np.float32)  # zero.shape = (5, 25, 25)
        cx, cy, w, h = map(lambda x: x + zero, [cx, cy, w, h])
        # cx.shape = (5, 25, 25): the 5 anchor shapes are placed at each of the 25x25 positions
        x1, y1, x2, y2 = center2corner([cx, cy, w, h])

        self.all_anchors = (np.stack([x1, y1, x2, y2]).astype(np.float32),
                            np.stack([cx, cy, w, h]).astype(np.float32))
        return True
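A short usage sketch of the Anchors class above (parameter values taken from the default config discussed in this post: stride 8, ratios [0.33, 0.5, 1, 2, 3], a single scale of 8, 25*25 output, image center 127; corner2center/center2corner are assumed importable from the pysot utilities):

import numpy as np

anchors = Anchors(stride=8, ratios=[0.33, 0.5, 1, 2, 3], scales=[8])
print(anchors.anchor_num)            # 5

anchors.generate_all_anchors(im_c=127, size=25)
corner, center = anchors.all_anchors
print(corner.shape, center.shape)    # (4, 5, 25, 25) (4, 5, 25, 25)

cx = center[0]                       # anchor centers on the 255x255 search frame
print(cx[0, 0, :3])                  # [31. 39. 47.] -- stepping by the stride of 8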
2. Dataset
1) Initialization
class TrkDataset(Dataset):
    def __init__(self,):
        super(TrkDataset, self).__init__()

        # consistency check between the configured sizes and OUTPUT_SIZE
        # (the original author was unsure where this formula comes from;
        # the numbers are worked out after this block)
        desired_size = (cfg.TRAIN.SEARCH_SIZE - cfg.TRAIN.EXEMPLAR_SIZE) / \
            cfg.ANCHOR.STRIDE + 1 + cfg.TRAIN.BASE_SIZE
        if desired_size != cfg.TRAIN.OUTPUT_SIZE:
            raise Exception('size not match!')

        # create anchor target: compute the corner offsets of every anchor shape
        # and generate all anchor boxes (5, 25, 25) on the search frame
        self.anchor_target = AnchorTarget()

        # create sub dataset
        self.all_dataset = []
        start = 0
        self.num = 0
        for name in cfg.DATASET.NAMES:  # ('VID', 'COCO', 'DET', 'YOUTUBEBB')
            subdata_cfg = getattr(cfg.DATASET, name)
            sub_dataset = SubDataset(
                    name,
                    subdata_cfg.ROOT,
                    subdata_cfg.ANNO,
                    subdata_cfg.FRAME_RANGE,
                    subdata_cfg.NUM_USE,
                    start
                )
            start += sub_dataset.num         # starting index of each sub dataset
            self.num += sub_dataset.num_use  # number of videos actually used
            sub_dataset.log()
            self.all_dataset.append(sub_dataset)

        # data augmentation
        # cfg.DATASET.TEMPLATE.SHIFT : 4
        # cfg.DATASET.TEMPLATE.SCALE : 0.05
        # cfg.DATASET.TEMPLATE.BLUR  : 0.0
        # cfg.DATASET.TEMPLATE.FLIP  : 0.0
        # cfg.DATASET.TEMPLATE.COLOR : 1.0
        self.template_aug = Augmentation(
                cfg.DATASET.TEMPLATE.SHIFT,
                cfg.DATASET.TEMPLATE.SCALE,
                cfg.DATASET.TEMPLATE.BLUR,
                cfg.DATASET.TEMPLATE.FLIP,
                cfg.DATASET.TEMPLATE.COLOR
            )
        self.search_aug = Augmentation(
                cfg.DATASET.SEARCH.SHIFT,
                cfg.DATASET.SEARCH.SCALE,
                cfg.DATASET.SEARCH.BLUR,
                cfg.DATASET.SEARCH.FLIP,
                cfg.DATASET.SEARCH.COLOR
            )
        videos_per_epoch = cfg.DATASET.VIDEOS_PER_EPOCH
        self.num = videos_per_epoch if videos_per_epoch > 0 else self.num
        # either a fixed number of videos per epoch, or all of them
        self.num *= cfg.TRAIN.EPOCH  # total number of videos over all epochs
        self.pick = self.shuffle()
        # shuffle the selected video indices (only the index list is shuffled;
        # the videos and their indices still correspond one-to-one)

    def shuffle(self):
        pick = []
        m = 0
        while m < self.num:
            p = []
            for sub_dataset in self.all_dataset:
                sub_p = sub_dataset.pick
                p += sub_p
            np.random.shuffle(p)
            pick += p
            m = len(pick)
        logger.info("shuffle done!")
        logger.info("dataset length {}".format(self.num))
        return pick[:self.num]
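With the default config values (SEARCH_SIZE=255, EXEMPLAR_SIZE=127, ANCHOR.STRIDE=8, BASE_SIZE=8, OUTPUT_SIZE=25, assumed here from the standard pysot configuration), the size check at the top of TrkDataset.__init__ works out as follows:

SEARCH_SIZE, EXEMPLAR_SIZE, STRIDE, BASE_SIZE = 255, 127, 8, 8

# (255 - 127) / 8 + 1 + 8 = 16 + 1 + 8 = 25, matching the 25x25 correlation map
desired_size = (SEARCH_SIZE - EXEMPLAR_SIZE) / STRIDE + 1 + BASE_SIZE
print(desired_size)   # 25.0 == OUTPUT_SIZE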
class SubDataset(object):
    def __init__(self, name, root, anno, frame_range, num_use, start_idx):
        cur_path = os.path.dirname(os.path.realpath(__file__))
        self.name = name
        self.root = os.path.join(cur_path, '../../', root)
        self.anno = os.path.join(cur_path, '../../', anno)
        self.frame_range = frame_range  # default 100: the maximum frame gap between template and search frame
        self.num_use = num_use  # number of videos to use: all of them, or resampled until num_use is reached
        self.start_idx = start_idx
        logger.info("loading " + name)
        with open(self.anno, 'r') as f:
            meta_data = json.load(f)
            meta_data = self._filter_zero(meta_data)
            # _filter_zero is essentially a sanity check / filter. It removes annotations with
            # abnormal bounding boxes, in two respects: (1) the code expects boxes in
            # 'top-left / bottom-right' (x1, y1, x2, y2) format, while dataset labels are often
            # 'top-left + width/height', so the conversion must be done when building the json;
            # (2) some frames contain no target and have box [-1, -1, -1, -1]; these invalid
            # boxes and the corresponding frames are dropped, and only valid data is kept

        for video in list(meta_data.keys()):
            for track in meta_data[video]:
                frames = meta_data[video][track]
                frames = list(map(int,
                              filter(lambda x: x.isdigit(), frames.keys())))
                # train.json and val.json encode frame indices as zero-padded strings of
                # length 6 (the length must be 6); this line converts them to ints and
                # collects them in a list. When using your own dataset, make sure the image
                # names are pure digits with no letter prefix/suffix (e.g. a drone dataset
                # with an 'img' prefix would be filtered out here)
                frames.sort()  # sort the frame indices
                meta_data[video][track]['frames'] = frames
                # a new key 'frames' is added to the per-track dict (e.g. the '00' dict),
                # holding the sorted frame list obtained above
                if len(frames) <= 0:
                    logger.warning("{}/{} has no frames".format(video, track))
                    del meta_data[video][track]

        for video in list(meta_data.keys()):
            if len(meta_data[video]) <= 0:
                logger.warning("{} has no tracks".format(video))
                del meta_data[video]

        self.labels = meta_data
        self.num = len(self.labels)  # number of videos in the whole training set, i.e. in train.json
        self.num_use = self.num if self.num_use == -1 else self.num_use
        self.videos = list(meta_data.keys())
        logger.info("{} loaded".format(self.name))
        self.path_format = '{}.{}.{}.jpg'
        self.pick = self.shuffle()  # shuffle and resample videos until there are num_use of them
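_filter_zero itself is not shown above; based on the description in the comments, a simplified reconstruction (not the repository code) could look like this:

def _filter_zero(self, meta_data):
    """Drop annotations whose (x1, y1, x2, y2) box has non-positive width or height."""
    meta_data_new = {}
    for video, tracks in meta_data.items():
        new_tracks = {}
        for trk, frames in tracks.items():
            new_frames = {}
            for frm, bbox in frames.items():
                if not isinstance(bbox, dict):
                    x1, y1, x2, y2 = bbox
                    w, h = x2 - x1, y2 - y1
                    if w <= 0 or h <= 0:   # covers the [-1, -1, -1, -1] "no target" case
                        continue
                new_frames[frm] = bbox
            if len(new_frames) > 0:
                new_tracks[trk] = new_frames
        if len(new_tracks) > 0:
            meta_data_new[video] = new_tracks
    return meta_data_new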
2) get item
3) label
References
https://blog.csdn.net/weixin_43084225/article/details/108753735#commentBox
https://blog.csdn.net/laizi_laizi/article/details/108279414#2Generate_Anchors_45