This post is a condensed version of the official reduce optimization walkthrough, kept for my own review; for a detailed treatment see the Zhihu《深入浅出GPU优化》series.

LeetGPU problem solution

Quick recap

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

#define THREAD_PER_BLOCK 256 // threads per block
#define WARP_SIZE 32         // warp size (the basic scheduling unit on NVIDIA GPUs)
// Use a constexpr rather than a #define: reduce7 below declares a template
// parameter with the same name, and a macro would be substituted into that
// declaration and break compilation.
constexpr int NUM_PER_THREAD = 8; // number of elements processed by each thread


template <unsigned int blockSize>
__device__ __forceinline__ float warpReduceSum(float sum) {
    // __shfl_down_sync: read the value held by a higher-numbered lane in the same warp and add it
    // the mask 0xffffffff includes every thread of the warp
    if (blockSize >= 32) sum += __shfl_down_sync(0xffffffff, sum, 16); // offset 16
    if (blockSize >= 16) sum += __shfl_down_sync(0xffffffff, sum, 8);  // offset 8
    if (blockSize >= 8)  sum += __shfl_down_sync(0xffffffff, sum, 4);  // offset 4
    if (blockSize >= 4)  sum += __shfl_down_sync(0xffffffff, sum, 2);  // offset 2
    if (blockSize >= 2)  sum += __shfl_down_sync(0xffffffff, sum, 1);  // offset 1
    return sum; // lane 0 now holds the sum of the whole warp
}


// Non-type template parameters: the values are known at compile time, which enables
// constant-related optimizations such as loop unrolling and resolving branches early.
template <unsigned int blockSize, int NUM_PER_THREAD>
__global__ void reduce7(float *d_in, float *d_out, unsigned int n) {
    float sum = 0;
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * NUM_PER_THREAD + tid; // first global index handled by this thread
    // Each thread accumulates NUM_PER_THREAD elements (stride of blockSize between them),
    // with the loop unrolled via #pragma unroll.
    #pragma unroll
    for (int iter = 0; iter < NUM_PER_THREAD; iter++) {
        if (i + iter * blockSize < n) { // bounds check
            sum += d_in[i + iter * blockSize];
        }
    }

    // Shared memory: holds the partial sum of each warp
    static __shared__ float warpLevelSums[WARP_SIZE];
    const int laneId = threadIdx.x % WARP_SIZE; // lane index within the warp (0-31)
    const int warpId = threadIdx.x / WARP_SIZE; // warp index within the block

    // Step 1: reduction inside each warp
    // Every warp independently reduces the elements it loaded
    sum = warpReduceSum<blockSize>(sum);

    // Lane 0 of each warp writes its warp's partial sum to shared memory
    if (laneId == 0) {
        warpLevelSums[warpId] = sum;
    }
    __syncthreads(); // make sure all shared-memory writes are visible

    // Step 2: reduce the per-warp partial sums
    // Only the first warp (warpId == 0) does the work;
    // each of its threads reads one value from shared memory (indexed by laneId)
    sum = (threadIdx.x < blockDim.x / WARP_SIZE) ? warpLevelSums[laneId] : 0;

    // The first warp runs the same warpReduceSum for the final reduction;
    // blockSize / WARP_SIZE is the number of warps per block
    if (warpId == 0) {
        sum = warpReduceSum<blockSize / WARP_SIZE>(sum);
    }

    // Thread 0 of each block writes the block's result to global memory
    if (tid == 0) {
        d_out[blockIdx.x] = sum;
    }
}

__global__ void reduce2(float *d_in, float *d_out) {
    __shared__ float sdata[THREAD_PER_BLOCK];

    // each thread loads one element from global memory to shared mem
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int tid = threadIdx.x;
    sdata[tid] = d_in[i];
    __syncthreads();

    // do reduction in shared mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid == 0) d_out[blockIdx.x] = sdata[tid];
}

int main() {
    unsigned int N = 1000000;
    size_t bytes = N * sizeof(float);
    int blockSize = THREAD_PER_BLOCK; // 256 threads per block
    int elementsPerBlock = blockSize * NUM_PER_THREAD; // 256 * 8 = 2048 elements per block
    int gridSize = (N + elementsPerBlock - 1) / elementsPerBlock; // grid size large enough to cover all N elements

    float *h_in = (float*)malloc(bytes);
    float *h_out = (float*)malloc(gridSize * sizeof(float));

    for (unsigned int i = 0; i < N; i++) {
        h_in[i] = 1.0f;
    }

    float *d_in, *d_out;
    cudaMalloc(&d_in, bytes);
    cudaMalloc(&d_out, gridSize * sizeof(float)); // output needs only one slot per block
    cudaMemcpy(d_in, h_in, bytes, cudaMemcpyHostToDevice); // copy (destination, source, size, direction)

    dim3 block(blockSize);
    dim3 grid(gridSize);
    reduce7<THREAD_PER_BLOCK, NUM_PER_THREAD><<<grid, block>>>(d_in, d_out, N);

    cudaMemcpy(h_out, d_out, gridSize * sizeof(float), cudaMemcpyDeviceToHost);

    // sum the per-block partial results on the CPU
    float gpu_sum = 0;
    for (int i = 0; i < gridSize; i++) {
        gpu_sum += h_out[i];
    }

    printf("Result: %f\n", gpu_sum);

    free(h_in);
    free(h_out);
    cudaFree(d_in);
    cudaFree(d_out);

    return 0;
}
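A possible alternative to the CPU loop above (my addition, only a sketch): because reduce7 already bounds-checks against n, the per-block partial sums in d_out can be reduced on the GPU by one more single-block launch. The name d_final is hypothetical; this works here because gridSize (about 489) is at most 2048 = THREAD_PER_BLOCK * NUM_PER_THREAD elements, which one block can cover.

// inside main(), replacing the CPU loop over h_out:
float *d_final;
cudaMalloc(&d_final, sizeof(float));
// one extra single-block pass reduces the gridSize partial sums to a single value
reduce7<THREAD_PER_BLOCK, NUM_PER_THREAD><<<1, THREAD_PER_BLOCK>>>(d_out, d_final, gridSize);
float gpu_sum = 0;
cudaMemcpy(&gpu_sum, d_final, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_final);
printf("Result: %f\n", gpu_sum);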

The seven reduce optimizations, step by step

V0_0 naive

Strided pairwise addition performed directly on global memory, with no shared memory involved.

__global__ void reduce_naive(float *g_idata, float *g_odata, unsigned int n) {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;
    // strided reduction performed directly on global memory
    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) {
        if (threadIdx.x % (2 * stride) == 0) {
            g_idata[idx] += g_idata[idx + stride];
        }
        __syncthreads();
    }

    // thread 0 of each block writes the block's result back to global memory
    if (threadIdx.x == 0) {
        g_odata[blockIdx.x] = g_idata[blockIdx.x * blockDim.x];
    }
}

The whole reduction reads and writes slow global memory: every one of the O(N) additions goes through high-latency global loads and stores instead of shared memory, so efficiency is poor.

V0 shared_memory

__global__ void reduce0(int *g_idata, int *g_odata) {
    extern __shared__ int sdata[];
    // each thread loads one element from global to shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();
    // do reduction in shared mem
    for (unsigned int s = 1; s < blockDim.x; s *= 2) {
        if (tid % (2*s) == 0) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

Problem: warp divergence — threads within the same warp take different branches, so the operations they execute are not uniform.


V1 Fixing warp divergence

__global__ void reduce1(float *d_in, float *d_out) {
    __shared__ float sdata[THREAD_PER_BLOCK];

    // each thread loads one element from global memory to shared mem
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int tid = threadIdx.x;
    sdata[tid] = d_in[i];
    __syncthreads();

    // do reduction in shared mem
    for (unsigned int s = 1; s < blockDim.x; s *= 2) {
        int index = 2 * s * tid;
        if (index < blockDim.x) { // now whole warps, not threads inside a warp, take different branches
            sdata[index] += sdata[index + s];
        }
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid == 0) d_out[blockIdx.x] = sdata[tid];
}

The essence: the index remapping moves the divergence between warps instead of inside a warp. Previously the if/else appeared inside every warp; the constraint that all threads of a warp execute a branch together or skip it together is now satisfied (except in the last few rounds).

Again assume a block of 256 threads, i.e. 256/32 = 8 warps. In the 1st iteration, warps 0-3 have index < blockDim.x while warps 4-7 have index >= blockDim.x; each warp falls entirely into one branch, so there is no warp divergence. In the 2nd iteration only warps 0 and 1 enter the compute branch; in the 3rd only warp 0; in the 4th only the first 16 threads of warp 0 enter the branch, and warp divergence begins here. This removes warp divergence from the first 3 iterations. — from the Zhihu series《深入浅出GPU优化系列》

V2 Avoiding bank conflicts

In V1, elements 0 and 32 map to the same shared-memory bank and conflict (for example, in the s = 16 iteration thread 0 reads sdata[0] while thread 1 reads sdata[32]).

__global__ void reduce2(float *d_in, float *d_out) {
    __shared__ float sdata[THREAD_PER_BLOCK];

    // each thread loads one element from global memory to shared mem
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int tid = threadIdx.x;
    sdata[tid] = d_in[i];
    __syncthreads();

    // do reduction in shared mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid == 0) d_out[blockIdx.x] = sdata[tid];
}

Make the access stride largest at the start: while many elements are still in play the accesses stay staggered, so no two threads in a warp touch the same bank at the same time.
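A small host-side sketch (my addition, not in the original post) that prints which bank each thread of one warp touches in the first reduction iteration, contrasting V1's interleaved addressing with V2's sequential addressing. It assumes 32 banks of 4-byte words and a block of 256 threads.

#include <cstdio>

int main() {
    const int BANKS = 32;
    // First reduction iteration, warp 0 only (threads 0..31):
    //   V1 (interleaved, s = 1):   thread tid accesses sdata[2 * tid]
    //   V2 (sequential,  s = 128): thread tid accesses sdata[tid] (and sdata[tid + 128])
    for (int tid = 0; tid < 32; tid++) {
        int v1_bank = (2 * tid) % BANKS; // 0,2,4,...,30,0,2,... -> 2-way conflicts
        int v2_bank = tid % BANKS;       // 0,1,2,...,31        -> no conflict
        printf("tid %2d  V1 bank %2d  V2 bank %2d\n", tid, v1_bank, v2_bank);
    }
    return 0;
}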

Problem: half of the threads are idle from the very first iteration.

V3 Putting idle threads to work

__global__ void reduce3(float *d_in, float *d_out) {
    __shared__ float sdata[THREAD_PER_BLOCK];

    // each thread loads two elements and adds them during the load
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; // key point: each block now covers two blocks' worth of data
    unsigned int tid = threadIdx.x;
    sdata[tid] = d_in[i] + d_in[i + blockDim.x];
    __syncthreads();

    // do reduction in shared mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid == 0) d_out[blockIdx.x] = sdata[tid];
}

V4 Unrolling the last warp to cut synchronization

__device__ void warpReduce(volatile float* cache, int tid) {
    // volatile forbids the compiler from caching these values in registers:
    // every access must really read from / write to shared memory instead of
    // reusing a stale register copy.
    // the stride goes 32 -> 1
    cache[tid] += cache[tid + 32];
    cache[tid] += cache[tid + 16];
    cache[tid] += cache[tid + 8];
    cache[tid] += cache[tid + 4];
    cache[tid] += cache[tid + 2];
    cache[tid] += cache[tid + 1];
}


__global__ void reduce4(float *d_in, float *d_out) {
    __shared__ float sdata[THREAD_PER_BLOCK];

    // each thread loads two elements and adds them during the load
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    unsigned int tid = threadIdx.x;
    sdata[tid] = d_in[i] + d_in[i + blockDim.x];
    __syncthreads();

    // do reduction in shared mem
    for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid < 32) warpReduce(sdata, tid);
    if (tid == 0) d_out[blockIdx.x] = sdata[tid];
}

Once only a single warp (one SIMD unit) is still active, the explicit __syncthreads calls can be dropped.
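One caveat (my addition, not from the original post): on Volta and newer GPUs with independent thread scheduling, relying on volatile alone for implicit warp-synchronous execution is no longer guaranteed. A defensive variant separates each step into a read phase and a write phase with explicit __syncwarp() calls, roughly like this sketch (warpReduceSyncwarp is a hypothetical name):

__device__ void warpReduceSyncwarp(volatile float* cache, int tid) {
    float v;
    // each step: all lanes read, sync, then all lanes write, sync
    v = cache[tid] + cache[tid + 32]; __syncwarp(); cache[tid] = v; __syncwarp();
    v = cache[tid] + cache[tid + 16]; __syncwarp(); cache[tid] = v; __syncwarp();
    v = cache[tid] + cache[tid + 8];  __syncwarp(); cache[tid] = v; __syncwarp();
    v = cache[tid] + cache[tid + 4];  __syncwarp(); cache[tid] = v; __syncwarp();
    v = cache[tid] + cache[tid + 2];  __syncwarp(); cache[tid] = v; __syncwarp();
    v = cache[tid] + cache[tid + 1];  __syncwarp(); cache[tid] = v; __syncwarp();
}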

V5 Brute-force loop unrolling

Honestly I do not quite see the point of this one; with modern compilers it no longer seems necessary (a sketch follows after the code below).

template <unsigned int blockSize>
__device__ void warpReduce(volatile float* cache, int tid) {
    if (blockSize >= 64) cache[tid] += cache[tid + 32];
    if (blockSize >= 32) cache[tid] += cache[tid + 16];
    if (blockSize >= 16) cache[tid] += cache[tid + 8];
    if (blockSize >= 8)  cache[tid] += cache[tid + 4];
    if (blockSize >= 4)  cache[tid] += cache[tid + 2];
    if (blockSize >= 2)  cache[tid] += cache[tid + 1];
}

template <unsigned int blockSize>
__global__ void reduce5(float *d_in, float *d_out) {
    __shared__ float sdata[THREAD_PER_BLOCK];

    // each thread loads two elements and adds them during the load
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    unsigned int tid = threadIdx.x;
    sdata[tid] = d_in[i] + d_in[i + blockDim.x];
    __syncthreads();

    // do reduction in shared mem
    if (blockSize >= 512) {
        if (tid < 256) {
            sdata[tid] += sdata[tid + 256];
        }
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128) {
            sdata[tid] += sdata[tid + 128];
        }
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64) {
            sdata[tid] += sdata[tid + 64];
        }
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid < 32) warpReduce<blockSize>(sdata, tid);
    if (tid == 0) d_out[blockIdx.x] = sdata[tid];
}
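Echoing the remark above: a minimal sketch (my addition, an assumption about compiler behavior rather than something from the original post) of what a modern equivalent might look like. Since blockSize is a compile-time template parameter, a plain loop with #pragma unroll should let the compiler generate essentially the same fully unrolled code as the hand-written if chain; reduce5_loop is a hypothetical name and it reuses the templated warpReduce defined above.

template <unsigned int blockSize>
__global__ void reduce5_loop(float *d_in, float *d_out) {
    __shared__ float sdata[blockSize];
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    unsigned int tid = threadIdx.x;
    sdata[tid] = d_in[i] + d_in[i + blockDim.x];
    __syncthreads();

    // blockSize is a compile-time constant, so the trip count is known
    // and the compiler can unroll this loop completely
    #pragma unroll
    for (unsigned int s = blockSize / 2; s > 32; s >>= 1) {
        if (tid < s) sdata[tid] += sdata[tid + s];
        __syncthreads();
    }

    if (tid < 32) warpReduce<blockSize>(sdata, tid);
    if (tid == 0) d_out[blockIdx.x] = sdata[0];
}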

V6 Each thread processes multiple elements

This differs a bit from what the article describes; in essence, each thread now processes multiple elements.

template <unsigned int blockSize, int NUM_PER_THREAD>
__global__ void reduce6(float *d_in, float *d_out, unsigned int n) {
    __shared__ float sdata[blockSize];

    // each thread loads NUM_PER_THREAD elements from global to shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (blockSize * NUM_PER_THREAD) + threadIdx.x;

    sdata[tid] = 0;

    #pragma unroll
    for (int iter = 0; iter < NUM_PER_THREAD; iter++) {
        sdata[tid] += d_in[i + iter * blockSize];
    }

    __syncthreads();

    // do reduction in shared mem
    if (blockSize >= 512) {
        if (tid < 256) {
            sdata[tid] += sdata[tid + 256];
        }
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128) {
            sdata[tid] += sdata[tid + 128];
        }
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64) {
            sdata[tid] += sdata[tid + 64];
        }
        __syncthreads();
    }
    if (tid < 32) warpReduce<blockSize>(sdata, tid);

    // write result for this block to global mem
    if (tid == 0) d_out[blockIdx.x] = sdata[0];
}

reduce6<THREAD_PER_BLOCK, NUM_PER_THREAD><<<Grid,Block>>>(d_a, d_out, N);

V7 Shuffle instructions

//todo

How_to_optimize_in_GPU/reduce/reduce_v7_shuffle.cu at master · Liu-xiandong/How_to_optimize_in_GPU

With shuffle instructions, threads within a warp can read each other's registers directly, avoiding a round trip through shared memory and reducing access latency.
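A tiny standalone sketch (my addition; the kernel name is made up) of what __shfl_down_sync does: each lane starts with its lane id, and after offsets 16, 8, 4, 2, 1 lane 0 holds 0 + 1 + ... + 31 = 496.

#include <cstdio>

__global__ void shfl_demo() {
    int v = threadIdx.x; // lane i starts with the value i
    for (int offset = 16; offset > 0; offset >>= 1)
        v += __shfl_down_sync(0xffffffff, v, offset); // add the value held 'offset' lanes above
    if (threadIdx.x == 0) printf("warp sum = %d\n", v); // prints 496
}

// launch with a single warp: shfl_demo<<<1, 32>>>();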

#define THREAD_PER_BLOCK 256
#define WARP_SIZE 32

template <unsigned int blockSize>
__device__ __forceinline__ float warpReduceSum(float sum) {
    if (blockSize >= 32) sum += __shfl_down_sync(0xffffffff, sum, 16); // 0-16, 1-17, 2-18, etc.
    if (blockSize >= 16) sum += __shfl_down_sync(0xffffffff, sum, 8);  // 0-8, 1-9, 2-10, etc.
    if (blockSize >= 8)  sum += __shfl_down_sync(0xffffffff, sum, 4);  // 0-4, 1-5, 2-6, etc.
    if (blockSize >= 4)  sum += __shfl_down_sync(0xffffffff, sum, 2);  // 0-2, 1-3, 4-6, 5-7, etc.
    if (blockSize >= 2)  sum += __shfl_down_sync(0xffffffff, sum, 1);  // 0-1, 2-3, 4-5, etc.
    return sum;
}
template <unsigned int blockSize, int NUM_PER_THREAD>
__global__ void reduce7(float *d_in, float *d_out, unsigned int n) {
    float sum = 0;

    // each thread accumulates NUM_PER_THREAD elements from global memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (blockSize * NUM_PER_THREAD) + threadIdx.x;

    // thread-level partial reduction
    #pragma unroll
    for (int iter = 0; iter < NUM_PER_THREAD; iter++) {
        sum += d_in[i + iter * blockSize];
    }

    // Shared mem for partial sums (one per warp in the block)
    static __shared__ float warpLevelSums[WARP_SIZE];
    const int laneId = threadIdx.x % WARP_SIZE;
    const int warpId = threadIdx.x / WARP_SIZE;

    sum = warpReduceSum<blockSize>(sum);

    if (laneId == 0) warpLevelSums[warpId] = sum;
    __syncthreads();
    // read from shared memory only if that warp existed
    sum = (threadIdx.x < blockDim.x / WARP_SIZE) ? warpLevelSums[laneId] : 0;
    // Final reduce using first warp
    if (warpId == 0) sum = warpReduceSum<blockSize / WARP_SIZE>(sum);
    // write result for this block to global mem
    if (tid == 0) d_out[blockIdx.x] = sum;
}
reduce7<THREAD_PER_BLOCK, NUM_PER_THREAD><<<Grid,Block>>>(d_a, d_out, N);

Optimization summary

  1. Use shared memory
  2. Reduce warp divergence (keep the divergence between warps, not inside a warp)
  3. Reduce bank conflicts: remap indices or stagger accesses (pad by +1, or choose strides that avoid the same bank)
  4. Reduce idle threads: have each thread load two (or more) elements
  5. Reduce __syncthreads calls: within a single warp (one SIMD unit) they can be skipped
  6. Choose a reasonable number of blocks (see the grid-stride sketch below)
  7. Use shuffle instructions
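For item 6, a minimal sketch of one common way to decouple the grid size from the data size (my addition; the kernel name and the grid-size choice are illustrative, not from the original post): cap the number of blocks and let each thread walk the input with a grid-stride loop, then reduce in shared memory as in V2.

__global__ void reduce_grid_stride(const float *d_in, float *d_out, unsigned int n) {
    __shared__ float sdata[THREAD_PER_BLOCK];
    unsigned int tid = threadIdx.x;
    float sum = 0.0f;

    // grid-stride loop: the total number of threads is gridDim.x * blockDim.x,
    // so any grid size covers all n elements
    for (unsigned int i = blockIdx.x * blockDim.x + tid; i < n; i += gridDim.x * blockDim.x)
        sum += d_in[i];
    sdata[tid] = sum;
    __syncthreads();

    // same sequential-addressing reduction as V2
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) sdata[tid] += sdata[tid + s];
        __syncthreads();
    }
    if (tid == 0) d_out[blockIdx.x] = sdata[0];
}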
