Caffe 中的 LRNLayer

编程入门行业动态更新时间:2024-10-24 22:24:48

Caffe 中的 LRNLayer

Caffe 中的 LRNLayer 支持两种模式：

CrossChannel：为人们所熟知的局部响应归一化，在 AlexNet 中提出，并在一些早期网络中使用；
WithinChannel： Caffe 中独有的实现，未见网络中应用。本文略过。

ReLU 具有不需要输入归一化以防止其饱和的理想特性。但 AlexNet 论文中发现 LRN 有助于提高泛化性。LRN CrossChannel 模式公式如下：
$$b_{x,y}^{i} = \frac{a_{x,y}^{i}}{\left(k+\frac{\alpha}{n} \sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)}(a_{x,y}^{j})^{2}\right)^{\beta}}$$

其中，$a_{x,y}^{i}$ 表示在第 $i$ 个卷积核输出的 $(x, y)$ 位置施加 ReLU 非线性计算得到的神经元活动。求和在相同空间位置上的 $n$ 个"相邻"通道上进行，$N$ 为该层通道的总数。常数 $k$、$n$、$\alpha$ 和 $\beta$ 是超参数。$b_{x,y}^{i}$ 为响应归一化后的激活。

Caffe 的 LRNLayer 在 CPU 和 GPU 中均采用滑窗方式实现。不同特征图位置间并行，在通道方向上循环处理。层在前向过程中缓存公式中的 $k+\frac{\alpha}{n} \sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)}(a_{x,y}^{j})^{2}$，供后向求导时使用。

LRNLayer::CrossChannelForward_cpu

scale_为前向和后向共享的变量。初始化为什么不使用拷贝而使用赋值？
padded_square在前后填充size_ - 1个通道。
alpha_over_size为 $\frac{\alpha}{n}$。

template <typename Dtype>
void LRNLayer<Dtype>::CrossChannelForward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {const Dtype* bottom_data = bottom[0]->cpu_data();Dtype* top_data = top[0]->mutable_cpu_data();Dtype* scale_data = scale_.mutable_cpu_data();// start with the constant valuefor (int i = 0; i < scale_.count(); ++i) {scale_data[i] = k_;}Blob<Dtype> padded_square(1, channels_ + size_ - 1, height_, width_);Dtype* padded_square_data = padded_square.mutable_cpu_data();caffe_set(padded_square.count(), Dtype(0), padded_square_data);Dtype alpha_over_size = alpha_ / size_;

每个 image 是独立的。
caffe_sqr 计算平方。
padded_square_data跳过前面的pre_pad_个通道，计算 $(a_{x,y}^{i})^{2}$。

  // go through the imagesfor (int n = 0; n < num_; ++n) {// compute the padded squarecaffe_sqr(channels_ * height_ * width_,bottom_data + bottom[0]->offset(n),padded_square_data + padded_square.offset(0, pre_pad_));

caffe_axpy 功能为 $Y=\alpha X+Y$。
输出地址不变，累加size_个通道的 $\frac{\alpha}{n}(a_{x,y}^{j})^{2}$，计算出第一个通道的 $N$。

    // Create the first channel scalefor (int c = 0; c < size_; ++c) {caffe_axpy<Dtype>(height_ * width_, alpha_over_size,padded_square_data + padded_square.offset(0, c),scale_data + scale_.offset(n, 0));}

后续通道基于前一个通道的 $N$ 递推计算：加上后面新进入窗口的通道，减去前面移出窗口的通道的值。
scale_data为 $N = k+ \sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)}\frac{\alpha}{n}(a_{x,y}^{j})^{2}$。

    for (int c = 1; c < channels_; ++c) {// copy previous scalecaffe_copy<Dtype>(height_ * width_,scale_data + scale_.offset(n, c - 1),scale_data + scale_.offset(n, c));// add headcaffe_axpy<Dtype>(height_ * width_, alpha_over_size,padded_square_data + padded_square.offset(0, c + size_ - 1),scale_data + scale_.offset(n, c));// subtract tailcaffe_axpy<Dtype>(height_ * width_, -alpha_over_size,padded_square_data + padded_square.offset(0, c - 1),scale_data + scale_.offset(n, c));}}

计算 $y = \frac{x}{N^{\beta}}$。

  // In the end, compute outputcaffe_powx<Dtype>(scale_.count(), scale_data, -beta_, top_data);caffe_mul<Dtype>(scale_.count(), top_data, bottom_data, top_data);
}

LRNLayer::CrossChannelForward_gpu

LRNFillScale 函数计算 $N = k+ \sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)}\frac{\alpha}{n}(a_{x,y}^{j})^{2}$。每个线程处理一个空间位置 $(n, h, w)$，并遍历该位置上的所有通道。
CAFFE_GET_BLOCKS 根据线程数计算 block 数。

template <typename Dtype>
void LRNLayer<Dtype>::CrossChannelForward_gpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {// First, compute scaleconst Dtype* bottom_data = bottom[0]->gpu_data();Dtype* top_data = top[0]->mutable_gpu_data();Dtype* scale_data = scale_.mutable_gpu_data();// We will launch one kernel for each pixel location, and have the kernel// go through all the channels.int n_threads = num_ * height_ * width_;// NOLINT_NEXT_LINE(whitespace/operators)LRNFillScale<<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS>>>(n_threads, bottom_data, num_, channels_, height_, width_, size_,alpha_ / size_, k_, scale_data);CUDA_POST_KERNEL_CHECK;

LRNComputeOutput 函数计算 $y = \frac{x}{N^{\beta}}$，每个线程处理一个输出。

  n_threads = bottom[0]->count();// NOLINT_NEXT_LINE(whitespace/operators)LRNComputeOutput<<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS>>>(n_threads, bottom_data, scale_data, -beta_, top_data);CUDA_POST_KERNEL_CHECK;
}

LRNFillScale

$$N = k+ \sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)}\frac{\alpha}{n}(a_{x,y}^{j})^{2}$$
根据一维index推算出空间位置。

  CUDA_KERNEL_LOOP(index, nthreads) {// find out the local offsetconst int w = index % width;const int h = (index / width) % height;const int n = index / width / height;const int offset = (n * channels * height + h) * width + w;const int step = height * width;const Dtype* const in_off = in + offset;Dtype* const scale_off = scale + offset;

仍然是滑窗方式。
首先计算post_pad个元素的平方和。

    int head = 0;const int pre_pad = (size - 1) / 2;const int post_pad = size - pre_pad - 1;Dtype accum_scale = 0;// fill the scale at [n, :, h, w]// accumulate valueswhile (head < post_pad && head < channels) {accum_scale += in_off[head * step] * in_off[head * step];++head;}

维持一个宽度为size的滑动窗口，计算窗口内元素的平方和。
得到head - post_pad个通道的 scale 值。

    // both add and subtractwhile (head < channels) {accum_scale += in_off[head * step] * in_off[head * step];if (head - size >= 0) {accum_scale -= in_off[(head - size) * step]* in_off[(head - size) * step];}scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size;++head;}

计算末尾post_pad个通道的 scale 值。

    // subtract onlywhile (head < channels + post_pad) {if (head - size >= 0) {accum_scale -= in_off[(head - size) * step]* in_off[(head - size) * step];}scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size;++head;}}

LRNComputeOutput

$$y = \frac{x}{N^{\beta}}$$

  CUDA_KERNEL_LOOP(index, nthreads) {out[index] = in[index] * pow(scale[index], negative_beta);}

LRNLayer::CrossChannelBackward_cpu

$$\begin{aligned} \frac{dy_i}{dx_i} &= \frac{N^\beta - x_i\beta\cdot N^{\beta-1}\cdot 2\alpha x_i}{N^{2\beta}}\\ &= \frac{1 - 2\alpha\beta N^{-1}\cdot x_i x_i}{N^{\beta}}\\ \frac{dy_i}{dx_j} &= \frac{ - x_i\beta \cdot N^{\beta-1}\cdot 2\alpha x_j}{N^{2\beta}}\\ &= \frac{ - 2\alpha\beta \cdot N^{-1}\cdot x_i x_j}{N^{\beta}} \end{aligned}$$
scale_为前向过程中求出的 $N = k+ \sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)}\frac{\alpha}{n}(a_{x,y}^{j})^{2}$。
padded_ratio和accum_ratio为临时空间。padded_ratio为填充通道后的单个特征图大小，accum_ratio为特征图的通道求和。
cache_ratio_value为 $\frac{2\alpha\beta}{n}$。

template <typename Dtype>
void LRNLayer<Dtype>::CrossChannelBackward_cpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,const vector<Blob<Dtype>*>& bottom) {const Dtype* top_diff = top[0]->cpu_diff();const Dtype* top_data = top[0]->cpu_data();const Dtype* bottom_data = bottom[0]->cpu_data();const Dtype* scale_data = scale_.cpu_data();Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();Blob<Dtype> padded_ratio(1, channels_ + size_ - 1, height_, width_);Blob<Dtype> accum_ratio(1, 1, height_, width_);Dtype* padded_ratio_data = padded_ratio.mutable_cpu_data();Dtype* accum_ratio_data = accum_ratio.mutable_cpu_data();// We hack a little bit by using the diff() to store an additional resultDtype* accum_ratio_times_bottom = accum_ratio.mutable_cpu_diff();caffe_set(padded_ratio.count(), Dtype(0), padded_ratio_data);Dtype cache_ratio_value = 2. * alpha_ * beta_ / size_;

bottom_diff为 $N^{-\beta} \frac{\partial E}{\partial y}$。

  caffe_powx<Dtype>(scale_.count(), scale_data, -beta_, bottom_diff);caffe_mul<Dtype>(scale_.count(), top_diff, bottom_diff, bottom_diff);

inverse_pre_pad即反向计算中前填充通道数。
每个批量单独计算。
padded_ratio为 $N^{-1} y\frac{\partial E}{\partial y}$。

  // go through individual dataint inverse_pre_pad = size_ - (size_ + 1) / 2;for (int n = 0; n < num_; ++n) {int block_offset = scale_.offset(n);// first, compute diff_i * y_i / s_icaffe_mul<Dtype>(channels_ * height_ * width_,top_diff + block_offset, top_data + block_offset,padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad));caffe_div<Dtype>(channels_ * height_ * width_,padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad),scale_data + block_offset,padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad));

累加padded_ratio中的size_-1个局部通道得到前缀通道结果accum_ratio。

    // Now, compute the accumulated ratios and the bottom diffcaffe_set(accum_ratio.count(), Dtype(0), accum_ratio_data);for (int c = 0; c < size_ - 1; ++c) {caffe_axpy<Dtype>(height_ * width_, 1.,padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data);}

padded_ratio前后有填充，所以c + size_ - 1对应的是c通道的最后一个局部输入通道。
accum_ratio为 $\sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)} N^{-1} y\frac{\partial E}{\partial y}$。
对于每个通道，accum_ratio前减后加。
accum_ratio_times_bottom为 $x\sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)} N^{-1} y\frac{\partial E}{\partial y}$。
由于 $y_j = N^{-\beta}\cdot x_j$，
因此，
$$\begin{aligned} \frac{\partial E}{\partial x} &= \frac{\partial E}{\partial y} \frac{\partial y}{\partial x} \\ &=\frac{\partial E}{\partial y} \left(\frac{1-\frac{2\alpha\beta}{n} x\sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)} N^{-1} \cdot x_j}{N^{\beta}}\right) \\ &=\frac{\partial E}{\partial y} \left(N^{-\beta} - \frac{2\alpha\beta}{n} x\sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)} N^{-1} y\right) \\ &=\frac{\partial E}{\partial y} N^{-\beta} - \frac{2\alpha\beta}{n} x\sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)} N^{-1} y\frac{\partial E}{\partial y} \end{aligned}$$

    for (int c = 0; c < channels_; ++c) {caffe_axpy<Dtype>(height_ * width_, 1.,padded_ratio_data + padded_ratio.offset(0, c + size_ - 1),accum_ratio_data);// compute bottom diffcaffe_mul<Dtype>(height_ * width_,bottom_data + top[0]->offset(n, c),accum_ratio_data, accum_ratio_times_bottom);caffe_axpy<Dtype>(height_ * width_, -cache_ratio_value,accum_ratio_times_bottom, bottom_diff + top[0]->offset(n, c));caffe_axpy<Dtype>(height_ * width_, -1.,padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data);}}
}

LRNLayer::CrossChannelBackward_gpu

将 NHW 维度合并，并行处理特征图位置上的元素。
网络后面的层由于特征图小而通道宽，导致并行程度低，效率下降。

/**
 * Backward pass of cross-channel LRN on the GPU.
 *
 * Launches LRNComputeDiff over num_ * height_ * width_ work items, passing
 * the bottom data, top data, the cached scale_ values and the top diff; the
 * kernel writes its result into bottom[0]'s diff buffer.
 *
 * @param top             top[0] supplies gpu_data() and gpu_diff().
 * @param propagate_down  not consulted by this function.
 * @param bottom          bottom[0] supplies gpu_data() and receives the
 *                        gradient through mutable_gpu_diff().
 */
template <typename Dtype>
void LRNLayer<Dtype>::CrossChannelBackward_gpu(
    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  // One work item per (n, h, w) location; the kernel itself iterates over
  // the channel dimension.
  int n_threads = num_ * height_ * width_;
  // NOLINT_NEXT_LINE(whitespace/operators)
  LRNComputeDiff<<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS>>>(
      n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(),
      scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_,
      size_,
      -beta_,                                  // negative_beta
      Dtype(2. * alpha_ * beta_ / size_),      // cache_ratio
      bottom[0]->mutable_gpu_diff());
}

LRNComputeDiff

CUDA_KERNEL_LOOP 循环执行 grid。
根据一维index推算出位置偏移。

template <typename Dtype>
__global__ void LRNComputeDiff(const int nthreads,const Dtype* const bottom_data, const Dtype* const top_data,const Dtype* const scale, const Dtype* const top_diff,const int num, const int channels, const int height,const int width, const int size, const Dtype negative_beta,const Dtype cache_ratio, Dtype* const bottom_diff) {CUDA_KERNEL_LOOP(index, nthreads) {// find out the local offsetconst int w = index % width;const int h = (index / width) % height;const int n = index / width / height;const int offset = (n * channels * height + h) * width + w;const int step = height * width;const Dtype* const bottom_off = bottom_data + offset;const Dtype* const top_off = top_data + offset;const Dtype* const scale_off = scale + offset;const Dtype* const top_diff_off = top_diff + offset;Dtype* const bottom_diff_off = bottom_diff + offset;

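head为滑动窗口前端（最新加入窗口的通道）的索引，当前输出通道为head - post_pad。
pre_pad与post_pad均不含中心通道（pre_pad + post_pad + 1 = size）；size为偶数时pre_pad比post_pad多1，为奇数时二者相等。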
累加属于第一个输出的post_pad个 $N^{-1} y\frac{\partial E}{\partial y}$。

    int head = 0;const int pre_pad = size - (size + 1) / 2;const int post_pad = size - pre_pad - 1;Dtype accum_ratio = 0;// accumulate valueswhile (head < post_pad && head < channels) {accum_ratio += top_diff_off[head * step] * top_off[head * step] /scale_off[head * step];++head;}

循环直到head达到channels，得到前channels - post_pad个通道的 $\frac{\partial E}{\partial x}$。
accum_ratio为 $\sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)} N^{-1} y\frac{\partial E}{\partial y}$。
没有缓冲区存储每个位置上的 $N^{-1} y\frac{\partial E}{\partial y}$，因此加减过程中需要计算两次。
cache_ratio为 $\frac{2\alpha\beta}{n}$。
$$\begin{aligned} \frac{\partial E}{\partial x} &= \frac{\partial E}{\partial y} \frac{\partial y}{\partial x} \\ &=\frac{\partial E}{\partial y} \left(\frac{1-\frac{2\alpha\beta}{n} x\sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)} N^{-1} \cdot x_j}{N^{\beta}}\right) \\ &=\frac{\partial E}{\partial y} \left(N^{-\beta} - \frac{2\alpha\beta}{n} x\sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)} N^{-1} y\right) \\ &=\frac{\partial E}{\partial y} N^{-\beta} - \frac{2\alpha\beta}{n} x\sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)} N^{-1} y\frac{\partial E}{\partial y} \end{aligned}$$

其中 $y_j = N^{-\beta}\cdot x_j$。

    // both add and subtractwhile (head < channels) {accum_ratio += top_diff_off[head * step] * top_off[head * step] /scale_off[head * step];if (head - size >= 0) {accum_ratio -= top_diff_off[(head - size) * step] *top_off[(head - size) * step] / scale_off[(head - size) * step];}bottom_diff_off[(head - post_pad) * step] =top_diff_off[(head - post_pad) * step]* pow(scale_off[(head - post_pad) * step], negative_beta)- cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio;++head;}

计算最后的post_pad个通道。

    // subtract onlywhile (head < channels + post_pad) {if (head - size >= 0) {accum_ratio -= top_diff_off[(head - size) * step] *top_off[(head - size) * step] / scale_off[(head - size) * step];}bottom_diff_off[(head - post_pad) * step] =top_diff_off[(head - post_pad) * step]* pow(scale_off[(head - post_pad) * step], negative_beta)- cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio;++head;}}
}