使用了共享内存和向量化(float4)传输,是目前为止效果最好的一个实现:
// Tiled matrix transpose that stages data in shared memory and uses float4
// (128-bit) accesses for both the global load and the global store.
//
// `input` is read as a row-major matrix with Y rows and X columns; `output`
// is written as a row-major matrix with X rows and Y columns, such that
//   output[x * Y + y] = input[y * X + x].
//
// Launch layout: every thread loads 4 consecutive floats of one input row, so
// a block covers a tile of blockDim.y rows by 4 * blockDim.x columns:
//   grid.x = ceil(X / (4 * blockDim.x)),  grid.y = ceil(Y / blockDim.y).
//
// Preconditions (not checked inside the kernel):
//   - blockDim.y is a non-zero multiple of 4 (a thread stores 4 consecutive
//     output columns, and blockDim.y / 4 is used as a divisor).
//   - X and Y are multiples of 4 and `input` / `output` are 16-byte aligned,
//     as required by the float4 global accesses.
//   - The static shared tile holds 32 * 4 * 32 floats, i.e. 4 floats for each
//     of at most 1024 threads in a block.
__global__ void transposeSmemVec(float* input, float* output, const int X, const int Y){
    // 16-byte alignment so that the float4 store into the tile is well defined.
    __shared__ __align__(16) float smem[32 * 4 * 32];

    const unsigned int width  = static_cast<unsigned int>(X);
    const unsigned int height = static_cast<unsigned int>(Y);

    // Number of floats in one row of the shared tile.
    const unsigned int tile_w = blockDim.x * 4;

    // ---- Load phase: thread (tx, ty) fetches input[iy][ix .. ix + 3]. ----
    const unsigned int ix = 4 * (blockDim.x * blockIdx.x + threadIdx.x);
    const unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y;
    const unsigned int thread_index = threadIdx.y * blockDim.x + threadIdx.x;

    if (ix < width && iy < height) {
        // 64-bit offset so that large matrices do not wrap a 32-bit index.
        const size_t ti = static_cast<size_t>(iy) * width + ix;
        // Tile layout: smem[ty * tile_w + 4 * tx + j] = input[iy][ix + j].
        *reinterpret_cast<float4*>(&smem[thread_index * 4]) =
            *reinterpret_cast<const float4*>(&input[ti]);
    }

    // The barrier must be reached by every thread of the block, including
    // threads whose load was out of range, so it lives outside any guard.
    __syncthreads();

    // ---- Store phase: the block's threads are re-indexed so that each one
    // writes 4 consecutive floats of one output row. new_tx selects a group of
    // 4 tile rows (4 output columns), new_ty selects a tile column (one output
    // row).
    const unsigned int groups = blockDim.y / 4;
    const unsigned int new_tx = thread_index % groups;
    const unsigned int new_ty = thread_index / groups;
    const unsigned int new_ix = blockIdx.y * blockDim.y + new_tx * 4;  // output column
    const unsigned int new_iy = blockIdx.x * tile_w + new_ty;          // output row

    // Guard on the OUTPUT coordinates: on partial tiles they differ from the
    // coordinates this thread used for its load.
    if (new_ix < height && new_iy < width) {
        // Gather one tile column; assembling a float4 directly keeps the
        // value properly aligned for the vector store.
        float4 v;
        v.x = smem[(4 * new_tx + 0) * tile_w + new_ty];
        v.y = smem[(4 * new_tx + 1) * tile_w + new_ty];
        v.z = smem[(4 * new_tx + 2) * tile_w + new_ty];
        v.w = smem[(4 * new_tx + 3) * tile_w + new_ty];

        const size_t to = static_cast<size_t>(new_iy) * height + new_ix;
        *reinterpret_cast<float4*>(&output[to]) = v;
    }
}
注意:每个线程一次处理 4 个 float,所以在调用核函数的时候,grid 的 x 维度要缩小为原来的 1/4
主函数调用核函数代码
// Benchmark transposeSmemVec and validate its result against the CPU reference.

// Clear the device and host result buffers so the comparison below only sees
// what this launch wrote.
checkRuntime(cudaMemset(d_output, 0, sizeof(float) * X * Y));
memset(gpu_ref, 0, sizeof(float) * X * Y);

// Each thread moves 4 floats along x, so one block spans 32 * 4 columns.
// NOTE(review): the literal 32s assume `block` is 32 x 32 — confirm against
// its definition earlier in main.
dim3 grid2((X + 32 * 4 - 1) / (32 * 4), (Y + 32 - 1) / 32);

checkRuntime(cudaEventRecord(start));
transposeSmemVec<<<grid2, block>>>(d_input, d_output, X, Y);
checkRuntime(cudaEventRecord(end));
// A kernel launch returns no status itself; fetch launch-configuration errors
// explicitly so a failed launch is not reported as a bogus bandwidth figure.
checkRuntime(cudaGetLastError());

checkRuntime(cudaMemcpy(gpu_ref, d_output, sizeof(float) * X * Y, cudaMemcpyDeviceToHost));
checkRuntime(cudaEventSynchronize(end));
checkRuntime(cudaEventElapsedTime(&ms, start, end));

// Bytes moved = one read plus one write of the whole matrix. Widen to size_t
// before multiplying so X * Y * 2 cannot overflow a 32-bit int.
// bytes / ms / 1e6 == GB/s.
printf("transpose vectorialize bandwidth = %fGB/s\n",
       static_cast<size_t>(X) * Y * 2 * sizeof(float) / ms / 1e6);
compareResult(cpu_ref, gpu_ref, X * Y);
block 的 size 在主函数最前面定义为 32。注意 grid 和 block 的设置形式是 (x, y, z),x 在最前面;而矩阵的表示方法是(行, 列),也就是 (y, x),与 grid、block 的设置顺序正好相反。
向量化操作需要注意区分“执行访存指令的线程”和“数据对应的线程”:同一个元素由一个线程读入共享内存,却由另一个线程写回输出,这两者是不一样的。
无情的摸鱼机器