This post is a condensed version of the official reduce optimization walkthrough, kept for my own review; for a detailed treatment see the Zhihu《深入浅出GPU优化》series.

LeetGPU problem solution

Quick recap

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

#define THREAD_PER_BLOCK 256 // threads per block
#define WARP_SIZE 32         // warp size (the basic scheduling unit on NVIDIA GPUs)
// Use a constexpr rather than a #define: reduce7 below declares a template
// parameter with the same name, and a macro would be substituted into that
// declaration and break compilation.
constexpr int NUM_PER_THREAD = 8; // number of elements processed by each thread


template <unsigned int blockSize>
__device__ __forceinline__ float warpReduceSum(float sum) {
    // __shfl_down_sync: read the value held by a higher-numbered lane in the same warp and add it
    // the mask 0xffffffff includes every thread of the warp
    if (blockSize >= 32) sum += __shfl_down_sync(0xffffffff, sum, 16); // offset 16
    if (blockSize >= 16) sum += __shfl_down_sync(0xffffffff, sum, 8);  // offset 8
    if (blockSize >= 8)  sum += __shfl_down_sync(0xffffffff, sum, 4);  // offset 4
    if (blockSize >= 4)  sum += __shfl_down_sync(0xffffffff, sum, 2);  // offset 2
    if (blockSize >= 2)  sum += __shfl_down_sync(0xffffffff, sum, 1);  // offset 1
    return sum; // lane 0 now holds the sum of the whole warp
}


// Non-type template parameters: the values are known at compile time, which enables
// constant-related optimizations such as loop unrolling and resolving branches early.
template <unsigned int blockSize, int NUM_PER_THREAD>
__global__ void reduce7(float *d_in, float *d_out, unsigned int n) {
    float sum = 0;
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * NUM_PER_THREAD + tid; // first global index handled by this thread
    // Each thread accumulates NUM_PER_THREAD elements (stride of blockSize between them),
    // with the loop unrolled via #pragma unroll.
    #pragma unroll
    for (int iter = 0; iter < NUM_PER_THREAD; iter++) {
        if (i + iter * blockSize < n) { // bounds check
            sum += d_in[i + iter * blockSize];
        }
    }

    // Shared memory: holds the partial sum of each warp
    static __shared__ float warpLevelSums[WARP_SIZE];
    const int laneId = threadIdx.x % WARP_SIZE; // lane index within the warp (0-31)
    const int warpId = threadIdx.x / WARP_SIZE; // warp index within the block

    // Step 1: reduction inside each warp
    // Every warp independently reduces the elements it loaded
    sum = warpReduceSum<blockSize>(sum);

    // Lane 0 of each warp writes its warp's partial sum to shared memory
    if (laneId == 0) {
        warpLevelSums[warpId] = sum;
    }
    __syncthreads(); // make sure all shared-memory writes are visible

    // Step 2: reduce the per-warp partial sums
    // Only the first warp (warpId == 0) does the work;
    // each of its threads reads one value from shared memory (indexed by laneId)
    sum = (threadIdx.x < blockDim.x / WARP_SIZE) ? warpLevelSums[laneId] : 0;

    // The first warp runs the same warpReduceSum for the final reduction;
    // blockSize / WARP_SIZE is the number of warps per block
    if (warpId == 0) {
        sum = warpReduceSum<blockSize / WARP_SIZE>(sum);
    }

    // Thread 0 of each block writes the block's result to global memory
    if (tid == 0) {
        d_out[blockIdx.x] = sum;
    }
}

__global__ void reduce2(float *d_in, float *d_out) {
    __shared__ float sdata[THREAD_PER_BLOCK];

    // each thread loads one element from global memory to shared mem
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int tid = threadIdx.x;
    sdata[tid] = d_in[i];
    __syncthreads();

    // do reduction in shared mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid == 0) d_out[blockIdx.x] = sdata[tid];
}

int main() {
    unsigned int N = 1000000;
    size_t bytes = N * sizeof(float);
    int blockSize = THREAD_PER_BLOCK; // 256 threads per block
    int elementsPerBlock = blockSize * NUM_PER_THREAD; // 256 * 8 = 2048 elements per block
    int gridSize = (N + elementsPerBlock - 1) / elementsPerBlock; // grid size large enough to cover all N elements

    float *h_in = (float*)malloc(bytes);
    float *h_out = (float*)malloc(gridSize * sizeof(float));

    for (unsigned int i = 0; i < N; i++) {
        h_in[i] = 1.0f;
    }

    float *d_in, *d_out;
    cudaMalloc(&d_in, bytes);
    cudaMalloc(&d_out, gridSize * sizeof(float)); // output needs only one slot per block
    cudaMemcpy(d_in, h_in, bytes, cudaMemcpyHostToDevice); // copy (destination, source, size, direction)

    dim3 block(blockSize);
    dim3 grid(gridSize);
    reduce7<THREAD_PER_BLOCK, NUM_PER_THREAD><<<grid, block>>>(d_in, d_out, N);

    cudaMemcpy(h_out, d_out, gridSize * sizeof(float), cudaMemcpyDeviceToHost);

    // sum the per-block partial results on the CPU
    float gpu_sum = 0;
    for (int i = 0; i < gridSize; i++) {
        gpu_sum += h_out[i];
    }

    printf("Result: %f\n", gpu_sum);

    free(h_in);
    free(h_out);
    cudaFree(d_in);
    cudaFree(d_out);

    return 0;
}
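A possible alternative to the CPU loop above (my addition, only a sketch): because reduce7 already bounds-checks against n, the per-block partial sums in d_out can be reduced on the GPU by one more single-block launch. The name d_final is hypothetical; this works here because gridSize (about 489) is at most 2048 = THREAD_PER_BLOCK * NUM_PER_THREAD elements, which one block can cover.

// inside main(), replacing the CPU loop over h_out:
float *d_final;
cudaMalloc(&d_final, sizeof(float));
// one extra single-block pass reduces the gridSize partial sums to a single value
reduce7<THREAD_PER_BLOCK, NUM_PER_THREAD><<<1, THREAD_PER_BLOCK>>>(d_out, d_final, gridSize);
float gpu_sum = 0;
cudaMemcpy(&gpu_sum, d_final, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_final);
printf("Result: %f\n", gpu_sum);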

The seven reduce optimizations, step by step

V0_0 naive

Strided pairwise addition performed directly on global memory, with no shared memory involved.

__global__ void reduce_naive(float *g_idata, float *g_odata, unsigned int n) {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;
    // strided reduction performed directly on global memory
    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) {
        if (threadIdx.x % (2 * stride) == 0) {
            g_idata[idx] += g_idata[idx + stride];
        }
        __syncthreads();
    }

    // thread 0 of each block writes the block's result back to global memory
    if (threadIdx.x == 0) {
        g_odata[blockIdx.x] = g_idata[blockIdx.x * blockDim.x];
    }
}

The whole reduction reads and writes slow global memory: every one of the O(N) additions goes through high-latency global loads and stores instead of shared memory, so efficiency is poor.

V0 shared_memory

__global__ void reduce0(int *g_idata, int *g_odata) {
    extern __shared__ int sdata[];
    // each thread loads one element from global to shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();
    // do reduction in shared mem
    for (unsigned int s = 1; s < blockDim.x; s *= 2) {
        if (tid % (2*s) == 0) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

Problem: warp divergence — threads within the same warp take different branches, so the operations they execute are not uniform.


V1 Fixing warp divergence

__global__ void reduce1(float *d_in, float *d_out) {
    __shared__ float sdata[THREAD_PER_BLOCK];

    // each thread loads one element from global memory to shared mem
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int tid = threadIdx.x;
    sdata[tid] = d_in[i];
    __syncthreads();

    // do reduction in shared mem
    for (unsigned int s = 1; s < blockDim.x; s *= 2) {
        int index = 2 * s * tid;
        if (index < blockDim.x) { // now whole warps, not threads inside a warp, take different branches
            sdata[index] += sdata[index + s];
        }
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid == 0) d_out[blockIdx.x] = sdata[tid];
}

The essence: the index remapping moves the divergence between warps instead of inside a warp. Previously the if/else appeared inside every warp; the constraint that all threads of a warp execute a branch together or skip it together is now satisfied (except in the last few rounds).

Again assume a block of 256 threads, i.e. 256/32 = 8 warps. In the 1st iteration, warps 0-3 have index < blockDim.x while warps 4-7 have index >= blockDim.x; each warp falls entirely into one branch, so there is no warp divergence. In the 2nd iteration only warps 0 and 1 enter the compute branch; in the 3rd only warp 0; in the 4th only the first 16 threads of warp 0 enter the branch, and warp divergence begins here. This removes warp divergence from the first 3 iterations. — from the Zhihu series《深入浅出GPU优化系列》

V2 Avoiding bank conflicts

In V1, elements 0 and 32 map to the same shared-memory bank and conflict (for example, in the s = 16 iteration thread 0 reads sdata[0] while thread 1 reads sdata[32]).

__global__ void reduce2(float *d_in, float *d_out) {
    __shared__ float sdata[THREAD_PER_BLOCK];

    // each thread loads one element from global memory to shared mem
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int tid = threadIdx.x;
    sdata[tid] = d_in[i];
    __syncthreads();

    // do reduction in shared mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid == 0) d_out[blockIdx.x] = sdata[tid];
}

Make the access stride largest at the start: while many elements are still in play the accesses stay staggered, so no two threads in a warp touch the same bank at the same time.
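A small host-side sketch (my addition, not in the original post) that prints which bank each thread of one warp touches in the first reduction iteration, contrasting V1's interleaved addressing with V2's sequential addressing. It assumes 32 banks of 4-byte words and a block of 256 threads.

#include <cstdio>

int main() {
    const int BANKS = 32;
    // First reduction iteration, warp 0 only (threads 0..31):
    //   V1 (interleaved, s = 1):   thread tid accesses sdata[2 * tid]
    //   V2 (sequential,  s = 128): thread tid accesses sdata[tid] (and sdata[tid + 128])
    for (int tid = 0; tid < 32; tid++) {
        int v1_bank = (2 * tid) % BANKS; // 0,2,4,...,30,0,2,... -> 2-way conflicts
        int v2_bank = tid % BANKS;       // 0,1,2,...,31        -> no conflict
        printf("tid %2d  V1 bank %2d  V2 bank %2d\n", tid, v1_bank, v2_bank);
    }
    return 0;
}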

Problem: half of the threads are idle from the very first iteration.

V3 Putting idle threads to work

__global__ void reduce3(float *d_in, float *d_out) {
    __shared__ float sdata[THREAD_PER_BLOCK];

    // each thread loads two elements and adds them during the load
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; // key point: each block now covers two blocks' worth of data
    unsigned int tid = threadIdx.x;
    sdata[tid] = d_in[i] + d_in[i + blockDim.x];
    __syncthreads();

    // do reduction in shared mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid == 0) d_out[blockIdx.x] = sdata[tid];
}

V4 Unrolling the last warp to cut synchronization

__device__ void warpReduce(volatile float* cache, int tid) {
    // volatile forbids the compiler from caching these values in registers:
    // every access must really read from / write to shared memory instead of
    // reusing a stale register copy.
    // the stride goes 32 -> 1
    cache[tid] += cache[tid + 32];
    cache[tid] += cache[tid + 16];
    cache[tid] += cache[tid + 8];
    cache[tid] += cache[tid + 4];
    cache[tid] += cache[tid + 2];
    cache[tid] += cache[tid + 1];
}


__global__ void reduce4(float *d_in, float *d_out) {
    __shared__ float sdata[THREAD_PER_BLOCK];

    // each thread loads two elements and adds them during the load
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    unsigned int tid = threadIdx.x;
    sdata[tid] = d_in[i] + d_in[i + blockDim.x];
    __syncthreads();

    // do reduction in shared mem
    for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid < 32) warpReduce(sdata, tid);
    if (tid == 0) d_out[blockIdx.x] = sdata[tid];
}

Once only a single warp (one SIMD unit) is still active, the explicit __syncthreads calls can be dropped.
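One caveat (my addition, not from the original post): on Volta and newer GPUs with independent thread scheduling, relying on volatile alone for implicit warp-synchronous execution is no longer guaranteed. A defensive variant separates each step into a read phase and a write phase with explicit __syncwarp() calls, roughly like this sketch (warpReduceSyncwarp is a hypothetical name):

__device__ void warpReduceSyncwarp(volatile float* cache, int tid) {
    float v;
    // each step: all lanes read, sync, then all lanes write, sync
    v = cache[tid] + cache[tid + 32]; __syncwarp(); cache[tid] = v; __syncwarp();
    v = cache[tid] + cache[tid + 16]; __syncwarp(); cache[tid] = v; __syncwarp();
    v = cache[tid] + cache[tid + 8];  __syncwarp(); cache[tid] = v; __syncwarp();
    v = cache[tid] + cache[tid + 4];  __syncwarp(); cache[tid] = v; __syncwarp();
    v = cache[tid] + cache[tid + 2];  __syncwarp(); cache[tid] = v; __syncwarp();
    v = cache[tid] + cache[tid + 1];  __syncwarp(); cache[tid] = v; __syncwarp();
}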

V5 Brute-force loop unrolling

Honestly I do not quite see the point of this one; with modern compilers it no longer seems necessary (a sketch follows after the code below).

template <unsigned int blockSize>
__device__ void warpReduce(volatile float* cache, int tid) {
    if (blockSize >= 64) cache[tid] += cache[tid + 32];
    if (blockSize >= 32) cache[tid] += cache[tid + 16];
    if (blockSize >= 16) cache[tid] += cache[tid + 8];
    if (blockSize >= 8)  cache[tid] += cache[tid + 4];
    if (blockSize >= 4)  cache[tid] += cache[tid + 2];
    if (blockSize >= 2)  cache[tid] += cache[tid + 1];
}

template <unsigned int blockSize>
__global__ void reduce5(float *d_in, float *d_out) {
    __shared__ float sdata[THREAD_PER_BLOCK];

    // each thread loads two elements and adds them during the load
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    unsigned int tid = threadIdx.x;
    sdata[tid] = d_in[i] + d_in[i + blockDim.x];
    __syncthreads();

    // do reduction in shared mem
    if (blockSize >= 512) {
        if (tid < 256) {
            sdata[tid] += sdata[tid + 256];
        }
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128) {
            sdata[tid] += sdata[tid + 128];
        }
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64) {
            sdata[tid] += sdata[tid + 64];
        }
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid < 32) warpReduce<blockSize>(sdata, tid);
    if (tid == 0) d_out[blockIdx.x] = sdata[tid];
}
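Echoing the remark above: a minimal sketch (my addition, an assumption about compiler behavior rather than something from the original post) of what a modern equivalent might look like. Since blockSize is a compile-time template parameter, a plain loop with #pragma unroll should let the compiler generate essentially the same fully unrolled code as the hand-written if chain; reduce5_loop is a hypothetical name and it reuses the templated warpReduce defined above.

template <unsigned int blockSize>
__global__ void reduce5_loop(float *d_in, float *d_out) {
    __shared__ float sdata[blockSize];
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    unsigned int tid = threadIdx.x;
    sdata[tid] = d_in[i] + d_in[i + blockDim.x];
    __syncthreads();

    // blockSize is a compile-time constant, so the trip count is known
    // and the compiler can unroll this loop completely
    #pragma unroll
    for (unsigned int s = blockSize / 2; s > 32; s >>= 1) {
        if (tid < s) sdata[tid] += sdata[tid + s];
        __syncthreads();
    }

    if (tid < 32) warpReduce<blockSize>(sdata, tid);
    if (tid == 0) d_out[blockIdx.x] = sdata[0];
}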

V6 Each thread processes multiple elements

This differs a bit from what the article describes; in essence, each thread now processes multiple elements.

template <unsigned int blockSize, int NUM_PER_THREAD>
__global__ void reduce6(float *d_in, float *d_out, unsigned int n) {
    __shared__ float sdata[blockSize];

    // each thread loads NUM_PER_THREAD elements from global to shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (blockSize * NUM_PER_THREAD) + threadIdx.x;

    sdata[tid] = 0;

    #pragma unroll
    for (int iter = 0; iter < NUM_PER_THREAD; iter++) {
        sdata[tid] += d_in[i + iter * blockSize];
    }

    __syncthreads();

    // do reduction in shared mem
    if (blockSize >= 512) {
        if (tid < 256) {
            sdata[tid] += sdata[tid + 256];
        }
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128) {
            sdata[tid] += sdata[tid + 128];
        }
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64) {
            sdata[tid] += sdata[tid + 64];
        }
        __syncthreads();
    }
    if (tid < 32) warpReduce<blockSize>(sdata, tid);

    // write result for this block to global mem
    if (tid == 0) d_out[blockIdx.x] = sdata[0];
}

reduce6<THREAD_PER_BLOCK, NUM_PER_THREAD><<<Grid,Block>>>(d_a, d_out, N);

V7 Shuffle instructions

//todo

How_to_optimize_in_GPU/reduce/reduce_v7_shuffle.cu at master · Liu-xiandong/How_to_optimize_in_GPU

With shuffle instructions, threads within a warp can read each other's registers directly, avoiding a round trip through shared memory and reducing access latency.
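A tiny standalone sketch (my addition; the kernel name is made up) of what __shfl_down_sync does: each lane starts with its lane id, and after offsets 16, 8, 4, 2, 1 lane 0 holds 0 + 1 + ... + 31 = 496.

#include <cstdio>

__global__ void shfl_demo() {
    int v = threadIdx.x; // lane i starts with the value i
    for (int offset = 16; offset > 0; offset >>= 1)
        v += __shfl_down_sync(0xffffffff, v, offset); // add the value held 'offset' lanes above
    if (threadIdx.x == 0) printf("warp sum = %d\n", v); // prints 496
}

// launch with a single warp: shfl_demo<<<1, 32>>>();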

#define THREAD_PER_BLOCK 256
#define WARP_SIZE 32

template <unsigned int blockSize>
__device__ __forceinline__ float warpReduceSum(float sum) {
    if (blockSize >= 32) sum += __shfl_down_sync(0xffffffff, sum, 16); // 0-16, 1-17, 2-18, etc.
    if (blockSize >= 16) sum += __shfl_down_sync(0xffffffff, sum, 8);  // 0-8, 1-9, 2-10, etc.
    if (blockSize >= 8)  sum += __shfl_down_sync(0xffffffff, sum, 4);  // 0-4, 1-5, 2-6, etc.
    if (blockSize >= 4)  sum += __shfl_down_sync(0xffffffff, sum, 2);  // 0-2, 1-3, 4-6, 5-7, etc.
    if (blockSize >= 2)  sum += __shfl_down_sync(0xffffffff, sum, 1);  // 0-1, 2-3, 4-5, etc.
    return sum;
}
template <unsigned int blockSize, int NUM_PER_THREAD>
__global__ void reduce7(float *d_in, float *d_out, unsigned int n) {
    float sum = 0;

    // each thread accumulates NUM_PER_THREAD elements from global memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (blockSize * NUM_PER_THREAD) + threadIdx.x;

    // thread-level partial reduction
    #pragma unroll
    for (int iter = 0; iter < NUM_PER_THREAD; iter++) {
        sum += d_in[i + iter * blockSize];
    }

    // Shared mem for partial sums (one per warp in the block)
    static __shared__ float warpLevelSums[WARP_SIZE];
    const int laneId = threadIdx.x % WARP_SIZE;
    const int warpId = threadIdx.x / WARP_SIZE;

    sum = warpReduceSum<blockSize>(sum);

    if (laneId == 0) warpLevelSums[warpId] = sum;
    __syncthreads();
    // read from shared memory only if that warp existed
    sum = (threadIdx.x < blockDim.x / WARP_SIZE) ? warpLevelSums[laneId] : 0;
    // Final reduce using first warp
    if (warpId == 0) sum = warpReduceSum<blockSize / WARP_SIZE>(sum);
    // write result for this block to global mem
    if (tid == 0) d_out[blockIdx.x] = sum;
}
reduce7<THREAD_PER_BLOCK, NUM_PER_THREAD><<<Grid,Block>>>(d_a, d_out, N);

Optimization summary

  1. Use shared memory
  2. Reduce warp divergence (keep the divergence between warps, not inside a warp)
  3. Reduce bank conflicts: remap indices or stagger accesses (pad by +1, or choose strides that avoid the same bank)
  4. Reduce idle threads: have each thread load two (or more) elements
  5. Reduce __syncthreads calls: within a single warp (one SIMD unit) they can be skipped
  6. Choose a reasonable number of blocks (see the grid-stride sketch below)
  7. Use shuffle instructions
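For item 6, a minimal sketch of one common way to decouple the grid size from the data size (my addition; the kernel name and the grid-size choice are illustrative, not from the original post): cap the number of blocks and let each thread walk the input with a grid-stride loop, then reduce in shared memory as in V2.

__global__ void reduce_grid_stride(const float *d_in, float *d_out, unsigned int n) {
    __shared__ float sdata[THREAD_PER_BLOCK];
    unsigned int tid = threadIdx.x;
    float sum = 0.0f;

    // grid-stride loop: the total number of threads is gridDim.x * blockDim.x,
    // so any grid size covers all n elements
    for (unsigned int i = blockIdx.x * blockDim.x + tid; i < n; i += gridDim.x * blockDim.x)
        sum += d_in[i];
    sdata[tid] = sum;
    __syncthreads();

    // same sequential-addressing reduction as V2
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) sdata[tid] += sdata[tid + s];
        __syncthreads();
    }
    if (tid == 0) d_out[blockIdx.x] = sdata[0];
}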
