首页 - 技术栈

谷德设计网官网入口网站推广和优化教程

作者: 五速梦信息网
时间: 2026年06月19日 11:10

当前位置：首页 > news > 正文

# Implementations of several attention mechanisms: EA / MHSA / SK / DA / EPSA.
#
# Related articles in the same series:
#   "[Deep Learning] Attention Mechanisms (1)"
#   "[Deep Learning] Attention Mechanisms (3)"
#
# Contents
#   1. EA   (External Attention)
#   2. Multi Head Self Attention
#   3. SK   (Selective Kernel Networks)
#   4. DA   (Dual Attention)
#   5. EPSA (Efficient Pyramid Squeeze Attention)
#
# NOTE(review): the article's paper links, figures and code links did not
# survive text extraction, and the code itself had lost `=`, `+`, quotes and
# paired underscores. The code below is a reconstruction of those snippets.

import math
from collections import OrderedDict

import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from torch.nn import init
from torch.nn.modules.batchnorm import _BatchNorm

# NOTE(review): the EA snippet calls `norm_layer(c)` without defining it.
# BatchNorm2d is assumed here -- confirm against the original repository.
norm_layer = nn.BatchNorm2d


# ---------------------------------------------------------------------------
# 1. EA (External Attention)
#
# EA can attend to global spatial information.
# ---------------------------------------------------------------------------
class External_attention(nn.Module):
    """External attention applied over the flattened spatial positions.

    Arguments:
        c (int): The input and output channel number.
    """

    def __init__(self, c):
        super(External_attention, self).__init__()

        self.conv1 = nn.Conv2d(c, c, 1)

        self.k = 64
        self.linear_0 = nn.Conv1d(c, self.k, 1, bias=False)
        self.linear_1 = nn.Conv1d(self.k, c, 1, bias=False)
        # linear_1 starts out as the transpose of linear_0. `permute` returns
        # a view, so at construction time both weights share one storage.
        self.linear_1.weight.data = self.linear_0.weight.data.permute(1, 0, 2)

        self.conv2 = nn.Sequential(
            nn.Conv2d(c, c, 1, bias=False),
            norm_layer(c),
        )

        # He-style initialisation for the convolutions, identity for norms.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2.0 / n))
            elif isinstance(m, nn.Conv1d):
                n = m.kernel_size[0] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2.0 / n))
            elif isinstance(m, _BatchNorm):
                m.weight.data.fill_(1)
                if m.bias is not None:
                    m.bias.data.zero_()

    def forward(self, x):
        """Apply external attention to ``x`` of shape (b, c, h, w).

        Returns a tensor of the same shape: the attended features are added
        to the input (residual connection) and passed through ReLU.
        """
        idn = x
        x = self.conv1(x)

        b, c, h, w = x.size()
        x = x.view(b, c, h * w)  # b, c, n

        attn = self.linear_0(x)  # b, k, n
        attn = F.softmax(attn, dim=-1)  # b, k, n
        # Second normalisation across the k memory slots; the epsilon guards
        # against division by zero.
        attn = attn / (1e-9 + attn.sum(dim=1, keepdim=True))  # b, k, n
        x = self.linear_1(attn)  # b, c, n

        x = x.view(b, c, h, w)
        x = self.conv2(x)
        x = x + idn
        x = F.relu(x)
        return x


# ---------------------------------------------------------------------------
# 2. Multi Head Self Attention
#
# The classic attention mechanism and the cornerstone of the Transformer.
# ---------------------------------------------------------------------------
class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention."""

    def __init__(self, d_model, d_k, d_v, h, dropout=.1):
        """
        :param d_model: Output dimensionality of the model
        :param d_k: Dimensionality of queries and keys
        :param d_v: Dimensionality of values
        :param h: Number of heads
        :param dropout: Dropout probability applied to the attention map
        """
        super(ScaledDotProductAttention, self).__init__()
        self.fc_q = nn.Linear(d_model, h * d_k)
        self.fc_k = nn.Linear(d_model, h * d_k)
        self.fc_v = nn.Linear(d_model, h * d_v)
        self.fc_o = nn.Linear(h * d_v, d_model)
        self.dropout = nn.Dropout(dropout)

        self.d_model = d_model
        self.d_k = d_k
        self.d_v = d_v
        self.h = h

        self.init_weights()

    def init_weights(self):
        """Initialise every submodule according to its type."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                init.normal_(m.weight, std=0.001)
                if m.bias is not None:
                    init.constant_(m.bias, 0)

    def forward(self, queries, keys, values, attention_mask=None, attention_weights=None):
        """Computes multi-head scaled dot-product attention.

        :param queries: Queries (b_s, nq, d_model)
        :param keys: Keys (b_s, nk, d_model)
        :param values: Values (b_s, nk, d_model)
        :param attention_mask: Mask over attention values (b_s, h, nq, nk). True indicates masking.
        :param attention_weights: Multiplicative weights for attention values (b_s, h, nq, nk).
        :return: Attended output (b_s, nq, d_model)
        """
        b_s, nq = queries.shape[:2]
        nk = keys.shape[1]

        q = self.fc_q(queries).view(b_s, nq, self.h, self.d_k).permute(0, 2, 1, 3)  # (b_s, h, nq, d_k)
        k = self.fc_k(keys).view(b_s, nk, self.h, self.d_k).permute(0, 2, 3, 1)  # (b_s, h, d_k, nk)
        v = self.fc_v(values).view(b_s, nk, self.h, self.d_v).permute(0, 2, 1, 3)  # (b_s, h, nk, d_v)

        att = torch.matmul(q, k) / np.sqrt(self.d_k)  # (b_s, h, nq, nk)
        if attention_weights is not None:
            att = att * attention_weights
        if attention_mask is not None:
            # -inf scores become exactly zero after the softmax.
            att = att.masked_fill(attention_mask, -np.inf)
        att = torch.softmax(att, -1)
        att = self.dropout(att)

        out = torch.matmul(att, v).permute(0, 2, 1, 3).contiguous().view(b_s, nq, self.h * self.d_v)  # (b_s, nq, h*d_v)
        out = self.fc_o(out)  # (b_s, nq, d_model)
        return out


# ---------------------------------------------------------------------------
# 3. SK (Selective Kernel Networks)
#
# SK is a channel attention mechanism.
# ---------------------------------------------------------------------------
class SKAttention(nn.Module):
    """Selective-kernel attention over parallel convolution branches.

    :param channel: Number of input/output channels.
    :param kernels: Kernel size of each parallel branch.
    :param reduction: Channel reduction ratio for the squeeze layer.
    :param group: ``groups`` argument of every branch convolution.
    :param L: Lower bound on the squeezed dimensionality.
    """

    def __init__(self, channel=512, kernels=(1, 3, 5, 7), reduction=16, group=1, L=32):
        super().__init__()
        self.d = max(L, channel // reduction)
        self.convs = nn.ModuleList([
            nn.Sequential(OrderedDict([
                ('conv', nn.Conv2d(channel, channel, kernel_size=k, padding=k // 2, groups=group)),
                ('bn', nn.BatchNorm2d(channel)),
                ('relu', nn.ReLU()),
            ]))
            for k in kernels
        ])
        self.fc = nn.Linear(channel, self.d)
        self.fcs = nn.ModuleList([nn.Linear(self.d, channel) for _ in kernels])
        # Softmax across the branch axis (dim 0 of the stacked weights).
        self.softmax = nn.Softmax(dim=0)

    def forward(self, x):
        """Fuse the branch outputs for ``x`` of shape (bs, c, h, w)."""
        bs, c, _, _ = x.size()

        # split
        conv_outs = [conv(x) for conv in self.convs]
        feats = torch.stack(conv_outs, 0)  # k, bs, channel, h, w

        # fuse
        U = sum(conv_outs)  # bs, c, h, w

        # reduction channel
        S = U.mean(-1).mean(-1)  # bs, c
        Z = self.fc(S)  # bs, d

        # calculate attention weight
        weights = [fc(Z).view(bs, c, 1, 1) for fc in self.fcs]
        attention_weights = torch.stack(weights, 0)  # k, bs, channel, 1, 1
        attention_weights = self.softmax(attention_weights)  # k, bs, channel, 1, 1

        # fuse
        V = (attention_weights * feats).sum(0)
        return V


# ---------------------------------------------------------------------------
# 4. DA (Dual Attention)
#
# DA combines channel attention with spatial attention.
#
# NOTE(review): the article imports ScaledDotProductAttention from
# `model.attention.SelfAttention`; the class defined in section 2 above is
# used instead so this module does not depend on that package for it.
# ---------------------------------------------------------------------------
class PositionAttentionModule(nn.Module):
    """Self-attention across spatial positions of a feature map."""

    def __init__(self, d_model=512, kernel_size=3, H=7, W=7):
        super().__init__()
        self.cnn = nn.Conv2d(d_model, d_model, kernel_size=kernel_size, padding=(kernel_size - 1) // 2)
        self.pa = ScaledDotProductAttention(d_model, d_k=d_model, d_v=d_model, h=1)

    def forward(self, x):
        """Map (bs, c, h, w) to (bs, h*w, c)."""
        bs, c, h, w = x.shape
        y = self.cnn(x)
        y = y.view(bs, c, -1).permute(0, 2, 1)  # bs, h*w, c
        y = self.pa(y, y, y)  # bs, h*w, c
        return y


class ChannelAttentionModule(nn.Module):
    """Self-attention across the channels of a feature map."""

    def __init__(self, d_model=512, kernel_size=3, H=7, W=7):
        super().__init__()
        # Project-local dependency, imported lazily so that importing this
        # module does not require the `model` package to be installed.
        from model.attention.SimplifiedSelfAttention import SimplifiedScaledDotProductAttention

        self.cnn = nn.Conv2d(d_model, d_model, kernel_size=kernel_size, padding=(kernel_size - 1) // 2)
        self.pa = SimplifiedScaledDotProductAttention(H * W, h=1)

    def forward(self, x):
        """Map (bs, c, h, w) to (bs, c, h*w)."""
        bs, c, h, w = x.shape
        y = self.cnn(x)
        y = y.view(bs, c, -1)  # bs, c, h*w
        y = self.pa(y, y, y)  # bs, c, h*w
        return y


class DAModule(nn.Module):
    """Dual attention: sum of position attention and channel attention.

    :param d_model: Number of channels of the input feature map.
    :param kernel_size: Kernel size of the convolution in each branch.
    :param H: Height of the input feature map.
    :param W: Width of the input feature map.
    """

    def __init__(self, d_model=512, kernel_size=3, H=7, W=7):
        super().__init__()
        # Forward the constructor arguments; the original snippet hard-coded
        # 512/3/7/7 here, so any non-default configuration was ignored.
        self.position_attention_module = PositionAttentionModule(
            d_model=d_model, kernel_size=kernel_size, H=H, W=W)
        self.channel_attention_module = ChannelAttentionModule(
            d_model=d_model, kernel_size=kernel_size, H=H, W=W)

    def forward(self, input):
        """Return the fused attention map with the same shape as ``input``."""
        bs, c, h, w = input.shape
        p_out = self.position_attention_module(input)
        c_out = self.channel_attention_module(input)
        p_out = p_out.permute(0, 2, 1).reshape(bs, c, h, w)
        c_out = c_out.view(bs, c, h, w)
        return p_out + c_out


# ---------------------------------------------------------------------------
# 5. EPSA (Efficient Pyramid Squeeze Attention)
# ---------------------------------------------------------------------------
class SEWeightModule(nn.Module):
    """Squeeze-and-excitation weights: one sigmoid gate per channel.

    :param channels: Number of input channels.
    :param reduction: Channel reduction ratio of the bottleneck.
    """

    def __init__(self, channels, reduction=16):
        super(SEWeightModule, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc1 = nn.Conv2d(channels, channels // reduction, kernel_size=1, padding=0)
        self.relu = nn.ReLU(inplace=True)
        self.fc2 = nn.Conv2d(channels // reduction, channels, kernel_size=1, padding=0)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return gates of shape (b, channels, 1, 1) for ``x`` (b, channels, h, w)."""
        out = self.avg_pool(x)
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        weight = self.sigmoid(out)
        return weight


def conv(in_planes, out_planes, kernel_size=3, stride=1, padding=1, dilation=1, groups=1):
    """standard convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride,
                     padding=padding, dilation=dilation, groups=groups, bias=False)


def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)


class PSAModule(nn.Module):
    """Pyramid squeeze attention over four grouped-convolution branches.

    :param inplans: Number of input channels.
    :param planes: Number of output channels (each branch yields planes // 4).
    :param conv_kernels: Kernel size of each of the four branches.
    :param stride: Stride shared by all branches.
    :param conv_groups: ``groups`` argument of each of the four branches.
    """

    def __init__(self, inplans, planes, conv_kernels=(3, 5, 7, 9), stride=1, conv_groups=(1, 4, 8, 16)):
        super(PSAModule, self).__init__()
        self.conv_1 = conv(inplans, planes // 4, kernel_size=conv_kernels[0], padding=conv_kernels[0] // 2,
                           stride=stride, groups=conv_groups[0])
        self.conv_2 = conv(inplans, planes // 4, kernel_size=conv_kernels[1], padding=conv_kernels[1] // 2,
                           stride=stride, groups=conv_groups[1])
        self.conv_3 = conv(inplans, planes // 4, kernel_size=conv_kernels[2], padding=conv_kernels[2] // 2,
                           stride=stride, groups=conv_groups[2])
        self.conv_4 = conv(inplans, planes // 4, kernel_size=conv_kernels[3], padding=conv_kernels[3] // 2,
                           stride=stride, groups=conv_groups[3])
        # One SE module is shared by all four branches.
        self.se = SEWeightModule(planes // 4)
        self.split_channel = planes // 4
        # Softmax across the branch axis of the (b, 4, split, 1, 1) weights.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return the re-weighted multi-scale features, (b, planes, h', w')."""
        batch_size = x.shape[0]
        x1 = self.conv_1(x)
        x2 = self.conv_2(x)
        x3 = self.conv_3(x)
        x4 = self.conv_4(x)

        feats = torch.cat((x1, x2, x3, x4), dim=1)
        feats = feats.view(batch_size, 4, self.split_channel, feats.shape[2], feats.shape[3])

        x1_se = self.se(x1)
        x2_se = self.se(x2)
        x3_se = self.se(x3)
        x4_se = self.se(x4)

        x_se = torch.cat((x1_se, x2_se, x3_se, x4_se), dim=1)
        attention_vectors = x_se.view(batch_size, 4, self.split_channel, 1, 1)
        attention_vectors = self.softmax(attention_vectors)
        feats_weight = feats * attention_vectors

        # The branches are concatenated in reverse order (4, 3, 2, 1), as the
        # original loop prepended each new branch to the running result.
        out = torch.cat([feats_weight[:, i] for i in reversed(range(4))], dim=1)
        return out