cuda vectorized实现矩阵转置-摩杜云开发者社区

使用了共享内存和向量化传输，目前为止效果最好的一个实现

__global__ void transposeSmemVec(float* input, float* output, const int X, const int Y){
        __shared__ float smem[32 * 4 * 32];
        unsigned int ix = 4 * (blockDim.x * blockIdx.x + threadIdx.x);
        unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y;
        unsigned int ti = iy * X + ix;
        float reg[4];

        unsigned int thread_index = threadIdx.y * blockDim.x + threadIdx.x;
        unsigned int new_tx = thread_index % (blockDim.y / 4);
        unsigned int new_ty = thread_index / (blockDim.y / 4);
        unsigned int new_ix = blockIdx.y * blockDim.y + new_tx * 4;
        unsigned int new_iy = blockIdx.x * blockDim.x * 4 + new_ty;
        unsigned int to = new_iy * Y + new_ix;

        if (ix < X && iy < Y) {
                *reinterpret_cast<float4*>(&smem[(threadIdx.y * blockDim.x + threadIdx.x) * 4]) = *reinterpret_cast<float4*>(&input[ti]);
                __syncthreads();
                // *reinterpret_cast<float4*>(®[0]) = *reinterpret_cast<float4*>(&smem[threadIdx.y][threadIdx.x * 4]);
                // *reinterpret_cast<float4*>(&output[iy * X + ix]) = *reinterpret_cast<float4*>(®[0]);
                reg[0] = smem[4 * new_tx * blockDim.x * 4 + new_ty];
                reg[1] = smem[(4 * new_tx + 1) * blockDim.x * 4 + new_ty];
                reg[2] = smem[(4 * new_tx + 2) * blockDim.x * 4 + new_ty];
                reg[3] = smem[(4 * new_tx + 3) * blockDim.x * 4 + new_ty];
                // printf("---------\n");
                *reinterpret_cast<float4*>(&output[to]) = *reinterpret_cast<float4*>(®[0]);
        }

}

注意在调用核函数的时候grid的x维度要缩小1/4

主函数调用核函数代码

checkRuntime(cudaMemset(d_output, 0, sizeof(float) * X * Y));
memset(gpu_ref, 0, sizeof(float) * X * Y);
checkRuntime(cudaEventRecord(start));
dim3 grid2((X + 32 * 4 - 1) / (32 * 4), (Y + 32 - 1) / 32); transposeSmemVec<<<grid2, block>>>(d_input, d_output, X, Y);
checkRuntime(cudaEventRecord(end));
checkRuntime(cudaMemcpy(gpu_ref, d_output, sizeof(float) * X * Y, cudaMemcpyDeviceToHost));
checkRuntime(cudaEventSynchronize(end));
checkRuntime(cudaEventElapsedTime(&ms, start, end));
printf("transpose vectorialize bandwidth = %fGB/s\n", X * Y * 2 * sizeof(float) / ms / 1e6);
compareResult(cpu_ref, gpu_ref, X * Y);

block的size在主函数最前面定义为32，注意grid和block的设置形式是(x, y, z)，x在最前面，而矩阵的表示方法是（行，列）也就是(y, x)，和block，grid设置是相反的

cuda vectorized实现矩阵转置_向量化

向量化操作需要注意指令对应的线程和数据对应的线程，这两者是不一样的

无情的摸鱼机器