This post is a condensed version of the official reduce-optimization material, kept for my own review; for the full discussion see the Zhihu series 《深入浅出GPU优化》.

Solves the corresponding LeetGPU problem.

It walks through seven optimization stages of the parallel reduction.

V0_0 naive

Strided pairwise addition performed directly in global memory, with no shared memory involved.

__global__ void reduce_naive(float *g_idata, float *g_odata, unsigned int n) {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Strided tree reduction performed directly on global memory.
    // Note: no early return before the loop, because every thread in the
    // block must reach the __syncthreads() barrier; out-of-range threads
    // are filtered inside the guard instead.
    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) {
        if (threadIdx.x % (2 * stride) == 0 && idx + stride < n) {
            g_idata[idx] += g_idata[idx + stride];
        }
        // make this round's writes visible to the next round
        __syncthreads();
    }

    // thread 0 of each block writes the block's result back to global memory
    if (threadIdx.x == 0) {
        g_odata[blockIdx.x] = g_idata[blockIdx.x * blockDim.x];
    }
}

Every step of the reduction reads and writes slow global memory: each addition costs two global loads plus one global store, and every one of the log2(blockDim.x) rounds ends in a barrier, so the kernel is dominated by memory latency.
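For context, a host-side driver for this kernel might look like the sketch below; the loop, the THREADS constant, and the reduce_on_device name are my assumptions of the usual multi-pass pattern, not code from the original post. Each launch turns n inputs into one partial sum per block, so the kernel is re-applied until a single value remains.

#include <utility>  // std::swap

const unsigned int THREADS = 256;  // assumed block size

void reduce_on_device(float *d_in, float *d_out, unsigned int n) {
    while (n > 1) {
        unsigned int blocks = (n + THREADS - 1) / THREADS;
        reduce_naive<<<blocks, THREADS>>>(d_in, d_out, n);
        std::swap(d_in, d_out);  // the partial sums become the next pass's input
        n = blocks;
    }
    // the final result now sits in d_in[0]
}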

V0 shared_memory

__global__ void reduce0(int *g_idata, int *g_odata) {
    extern __shared__ int sdata[];
    // each thread loads one element from global to shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();
    // do reduction in shared mem
    for (unsigned int s = 1; s < blockDim.x; s *= 2) {
        if (tid % (2 * s) == 0) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

Problem: this causes warp divergence; threads within the same warp take different branches of the if in every iteration.


V1 Reducing warp divergence

__global__ void reduce1(float *d_in, float *d_out) {
    __shared__ float sdata[THREAD_PER_BLOCK];

    // each thread loads one element from global memory to shared mem
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int tid = threadIdx.x;
    sdata[tid] = d_in[i];
    __syncthreads();

    // do reduction in shared mem
    for (unsigned int s = 1; s < blockDim.x; s *= 2) {
        int index = 2 * s * tid;
        if (index < blockDim.x) { // the branch now differs between warps, not within one
            sdata[index] += sdata[index + s];
        }
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid == 0) d_out[blockIdx.x] = sdata[tid];
}

The essence: the index remapping moves the divergence from inside each warp to between warps. Before, every warp contained both sides of the if/else; now a whole warp either executes the body together or skips it together (except in the last few rounds).

Again assume 256 threads per block, i.e. 256/32 = 8 warps. In iteration 1, warps 0-3 satisfy index < blockDim.x while warps 4-7 have index >= blockDim.x; every warp falls entirely into one branch, so there is no warp divergence. In iteration 2 only warps 0 and 1 enter the compute branch; in iteration 3 only warp 0; in iteration 4 only the first 16 threads of warp 0 enter, and warp divergence first appears here. This scheme therefore eliminates divergence for the first three iterations. (From the Zhihu series 《深入浅出GPU优化》.)

V2 Avoiding bank conflicts

In V1's access pattern, elements 0 and 32 land in the same shared-memory bank, so threads of one warp collide.

__global__ void reduce2(float *d_in, float *d_out) {
    __shared__ float sdata[THREAD_PER_BLOCK];

    // each thread loads one element from global memory to shared mem
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int tid = threadIdx.x;
    sdata[tid] = d_in[i];
    __syncthreads();

    // do reduction in shared mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid == 0) d_out[blockIdx.x] = sdata[tid];
}

Start with the largest stride (sequential addressing): while many elements are still in play, each warp's accesses land on consecutive words, so no two threads of a warp hit the same bank at the same time.
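As a back-of-the-envelope check, assuming the usual 32 banks of 4-byte words, the bank of a float element is simply its index modulo 32; the host-side snippet below (illustrative only, not from the post) contrasts V1's interleaved indices with V2's sequential ones:

// Illustrative only (host code): which shared-memory bank does each active
// thread touch? Assumes 32 banks of 4-byte words, so for a float array
// bank = index % 32.
#include <cstdio>

int main() {
    const int NUM_BANKS = 32;
    const int s = 16;  // example stride
    for (int tid = 0; tid < 4; tid++) {
        int v1_index = 2 * s * tid;  // V1 (interleaved): 0, 32, 64, 96, ...
        int v2_index = tid;          // V2 (sequential):  0, 1, 2, 3, ...
        printf("tid %d: V1 bank %d, V2 bank %d\n",
               tid, v1_index % NUM_BANKS, v2_index % NUM_BANKS);
    }
    // V1: every active thread lands in bank 0 at this stride -> multi-way conflict
    // V2: banks 0, 1, 2, 3, ... -> conflict-free
    return 0;
}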

Problem: from the very first iteration, half of the threads are idle.

V3 Reducing idle threads

__global__ void reduce3(float *d_in, float *d_out) {
    __shared__ float sdata[THREAD_PER_BLOCK];

    // each thread loads TWO elements from global memory and adds them on the fly
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; // key change: each block covers two blocks' worth of data
    unsigned int tid = threadIdx.x;
    sdata[tid] = d_in[i] + d_in[i + blockDim.x];
    __syncthreads();

    // do reduction in shared mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid == 0) d_out[blockIdx.x] = sdata[tid];
}

V4 Unrolling the last warp to reduce synchronization

33
__device__ void warpReduce(volatile float* cache, int tid) {
    // volatile forbids the compiler from caching these values in registers:
    // every access really reads from / writes to shared memory, instead of
    // reusing a stale register copy
    // the stride s runs 32, 16, 8, 4, 2, 1
    cache[tid] += cache[tid + 32];
    cache[tid] += cache[tid + 16];
    cache[tid] += cache[tid + 8];
    cache[tid] += cache[tid + 4];
    cache[tid] += cache[tid + 2];
    cache[tid] += cache[tid + 1];
}


__global__ void reduce4(float *d_in, float *d_out) {
    __shared__ float sdata[THREAD_PER_BLOCK];

    // each thread loads two elements from global memory and adds them on the fly
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    unsigned int tid = threadIdx.x;
    sdata[tid] = d_in[i] + d_in[i + blockDim.x];
    __syncthreads();

    // do reduction in shared mem, stopping once a single warp remains
    for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid < 32) warpReduce(sdata, tid);
    if (tid == 0) d_out[blockIdx.x] = sdata[tid];
}

Once only a single warp (one SIMD unit) is still active, the classic assumption is that its lanes execute in lockstep, so the per-round __syncthreads() can be dropped.
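One caveat the original post does not mention: since Volta, the lanes of a warp are no longer guaranteed to run in lockstep (independent thread scheduling), so the volatile trick alone is not guaranteed safe. A minimal sketch of a warp reduce hardened with explicit __syncwarp() barriers (warpReduceSafe is my name, not from the post):

// Sketch: warp reduction that stays correct under Volta+ independent thread
// scheduling. Separating the read from the write with __syncwarp() prevents
// one lane's write from racing another lane's read within the same round.
// Assumes cache has at least 64 valid elements, as in reduce4 above.
__device__ void warpReduceSafe(volatile float* cache, int tid) {
    #pragma unroll
    for (int offset = 32; offset > 0; offset >>= 1) {
        float v = cache[tid + offset];  // all 32 lanes read first...
        __syncwarp();
        cache[tid] += v;                // ...then write
        __syncwarp();
    }
}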

V5 Complete loop unrolling

Honestly I find this step hard to appreciate; with modern compilers it no longer seems necessary. The dispatch sketch after the kernel shows what the blockSize template parameter buys.

template <unsigned int blockSize>
__device__ void warpReduce(volatile float* cache, int tid) {
    if (blockSize >= 64) cache[tid] += cache[tid + 32];
    if (blockSize >= 32) cache[tid] += cache[tid + 16];
    if (blockSize >= 16) cache[tid] += cache[tid + 8];
    if (blockSize >= 8) cache[tid] += cache[tid + 4];
    if (blockSize >= 4) cache[tid] += cache[tid + 2];
    if (blockSize >= 2) cache[tid] += cache[tid + 1];
}

template <unsigned int blockSize>
__global__ void reduce5(float *d_in, float *d_out) {
    __shared__ float sdata[THREAD_PER_BLOCK];

    // each thread loads two elements from global memory and adds them on the fly
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    unsigned int tid = threadIdx.x;
    sdata[tid] = d_in[i] + d_in[i + blockDim.x];
    __syncthreads();

    // do reduction in shared mem; each blockSize comparison is resolved at compile time
    if (blockSize >= 512) {
        if (tid < 256) {
            sdata[tid] += sdata[tid + 256];
        }
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128) {
            sdata[tid] += sdata[tid + 128];
        }
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64) {
            sdata[tid] += sdata[tid + 64];
        }
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid < 32) warpReduce<blockSize>(sdata, tid);
    if (tid == 0) d_out[blockIdx.x] = sdata[tid];
}
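The point of the blockSize template parameter is that every if (blockSize >= ...) above is evaluated at compile time, so the dead branches vanish from the generated code. That in turn requires the host to pick the instantiation matching the real block size; a plausible dispatch (the switch is my assumption of the standard pattern, not code from the post):

// Hypothetical host-side dispatch: instantiate reduce5 for the block size
// actually launched, letting the compiler strip the unreachable branches.
switch (threads) {
    case 512: reduce5<512><<<blocks, 512>>>(d_in, d_out); break;
    case 256: reduce5<256><<<blocks, 256>>>(d_in, d_out); break;
    case 128: reduce5<128><<<blocks, 128>>>(d_in, d_out); break;
    case  64: reduce5< 64><<<blocks,  64>>>(d_in, d_out); break;
}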

V6 Multiple elements per thread

This version differs somewhat from the article; the essence is that each thread now loads and pre-reduces several elements.

template <unsigned int blockSize, int NUM_PER_THREAD>
__global__ void reduce6(float *d_in, float *d_out, unsigned int n) {
    __shared__ float sdata[blockSize];

    // each thread loads NUM_PER_THREAD elements from global to shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (blockSize * NUM_PER_THREAD) + threadIdx.x;

    sdata[tid] = 0;

    #pragma unroll
    for (int iter = 0; iter < NUM_PER_THREAD; iter++) {
        sdata[tid] += d_in[i + iter * blockSize];
    }

    __syncthreads();

    // do reduction in shared mem
    if (blockSize >= 512) {
        if (tid < 256) {
            sdata[tid] += sdata[tid + 256];
        }
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128) {
            sdata[tid] += sdata[tid + 128];
        }
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64) {
            sdata[tid] += sdata[tid + 64];
        }
        __syncthreads();
    }
    if (tid < 32) warpReduce<blockSize>(sdata, tid);

    // write result for this block to global mem
    if (tid == 0) d_out[blockIdx.x] = sdata[0];
}

reduce6<THREAD_PER_BLOCK, NUM_PER_THREAD><<<Grid,Block>>>(d_a, d_out, N);
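The Grid and Block arguments above must account for the per-thread workload; a plausible sizing (the NUM_PER_THREAD value is an assumption) is:

// Hypothetical sizing for the launch above: each block consumes
// blockSize * NUM_PER_THREAD inputs, so the grid shrinks by that factor
// (N assumed to be an exact multiple here).
const int NUM_PER_THREAD = 4;  // assumed per-thread workload
dim3 Block(THREAD_PER_BLOCK);  // e.g. 256 threads
dim3 Grid(N / (THREAD_PER_BLOCK * NUM_PER_THREAD));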

V7 Shuffle instructions


How_to_optimize_in_GPU/reduce/reduce_v7_shuffle.cu at master · Liu-xiandong/How_to_optimize_in_GPU

With shuffle instructions, threads within a warp can read each other's registers directly, which removes the round trip through shared memory and its latency.

#define THREAD_PER_BLOCK 256
#define WARP_SIZE 32

template <unsigned int blockSize>
__device__ __forceinline__ float warpReduceSum(float sum) {
    if (blockSize >= 32) sum += __shfl_down_sync(0xffffffff, sum, 16); // 0-16, 1-17, 2-18, etc.
    if (blockSize >= 16) sum += __shfl_down_sync(0xffffffff, sum, 8);  // 0-8, 1-9, 2-10, etc.
    if (blockSize >= 8)  sum += __shfl_down_sync(0xffffffff, sum, 4);  // 0-4, 1-5, 2-6, etc.
    if (blockSize >= 4)  sum += __shfl_down_sync(0xffffffff, sum, 2);  // 0-2, 1-3, 4-6, 5-7, etc.
    if (blockSize >= 2)  sum += __shfl_down_sync(0xffffffff, sum, 1);  // 0-1, 2-3, 4-5, etc.
    return sum;
}

template <unsigned int blockSize, int NUM_PER_THREAD>
__global__ void reduce7(float *d_in, float *d_out, unsigned int n) {
    float sum = 0;

    // each thread loads NUM_PER_THREAD elements from global memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (blockSize * NUM_PER_THREAD) + threadIdx.x;

    // thread-level local reduction into a register
    #pragma unroll
    for (int iter = 0; iter < NUM_PER_THREAD; iter++) {
        sum += d_in[i + iter * blockSize];
    }

    // shared mem for partial sums (one per warp in the block)
    static __shared__ float warpLevelSums[WARP_SIZE];
    const int laneId = threadIdx.x % WARP_SIZE;
    const int warpId = threadIdx.x / WARP_SIZE;

    sum = warpReduceSum<blockSize>(sum);

    if (laneId == 0) warpLevelSums[warpId] = sum;
    __syncthreads();
    // read from shared memory only if that warp existed
    sum = (threadIdx.x < blockDim.x / WARP_SIZE) ? warpLevelSums[laneId] : 0;
    // final reduce using the first warp
    if (warpId == 0) sum = warpReduceSum<blockSize / WARP_SIZE>(sum);
    // write result for this block to global mem
    if (tid == 0) d_out[blockIdx.x] = sum;
}
reduce7<THREAD_PER_BLOCK, NUM_PER_THREAD><<<Grid,Block>>>(d_a, d_out, N);

Summary of optimizations

  1. Use shared memory.
  2. Reduce warp divergence (keep the divergence between warps rather than within them).
  3. Avoid bank conflicts: remap indices so accesses are staggered (sequential addressing, or +1 padding against the stride).
  4. Reduce idle threads: have each thread load two (or more) elements.
  5. Cut __syncthreads() calls: within a single warp (one SIMD unit) they can be dropped.
  6. Choose the number of blocks sensibly.
  7. Use shuffle instructions.
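Finally, outside of learning exercises a hand-rolled reduction is rarely needed: CUB ships a tuned device-wide reduction. A minimal sketch for comparison (assuming d_in, d_out, and n are already set up; error handling omitted):

#include <cub/cub.cuh>

// Device-wide sum with CUB: the first call only queries the required
// temporary-storage size, the second performs the reduction.
void cub_sum(const float *d_in, float *d_out, int n) {
    void *d_temp = nullptr;
    size_t temp_bytes = 0;
    cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n);
    cudaMalloc(&d_temp, temp_bytes);
    cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n);
    cudaFree(d_temp);
}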
