Task: write a reduce kernel, and give the complete kernel invocation:
```cpp
// A simple implementation: the whole reduce is done with atomicAdd.
// Correct, but really, really dumb code -- every element contends for
// the single output slot.
template<typename T>
__global__ void BlockReduce(const T* input, T* output, int num) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Grid-stride loop over the input.
    for (int i = idx; i < num; i += blockDim.x * gridDim.x) {
        atomicAdd(output, input[i]);
    }
}
```
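Since the task asks for the complete invocation, here is a minimal host-side launch sketch for this atomic version (the `float` element type and the grid/block sizes are illustrative assumptions; the output slot must be zeroed first because the kernel accumulates into it):

```cpp
const int num = 1 << 20;
float *d_in, *d_out;
cudaMalloc(&d_in, num * sizeof(float));
cudaMalloc(&d_out, sizeof(float));
cudaMemset(d_out, 0, sizeof(float));          // kernel accumulates into *output
const int block = 256;
const int grid  = (num + block - 1) / block;  // illustrative sizing
BlockReduce<float><<<grid, block>>>(d_in, d_out, num);
cudaDeviceSynchronize();
```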
One block performs the reduce using shared memory, blockSize = 256:

```cpp
constexpr int ThreadPerBlock = 256;

template<typename T>
__global__ void BlockReduceSharemem(const T* input, T* output, int num) {
    int idx = threadIdx.x;
    __shared__ T sharemem[ThreadPerBlock];
    sharemem[idx] = 0;
    __syncthreads();
    // Each thread accumulates a strided slice of the input.
    for (int index = idx; index < num; index += blockDim.x) {
        sharemem[idx] += input[index];
    }
    __syncthreads();
    // Tree reduction in shared memory.
    for (int i = blockDim.x / 2; i > 0; i >>= 1) {
        if (idx < i) {
            sharemem[idx] += sharemem[idx + i];
        }
        __syncthreads();
    }
    if (idx == 0)
        output[0] = sharemem[0];
}
```
One block performs the reduce using shuffle instructions, blockSize = WarpSize. A shfl instruction only operates within a single warp, so reducing data across a whole block has to be done in stages (see the next version).

```cpp
constexpr int WarpSize = 32;

// Here blockDim.x == WarpSize, so one warp covers the whole block.
template<typename T>
__global__ void WarpReduceShuffle(const T* input, T* output, int num) {
    int idx = threadIdx.x;
    T sum = 0;
    for (int index = idx; index < num; index += blockDim.x) {
        sum += input[index];
    }
    // Shuffle-based tree reduction within the warp.
    for (int i = WarpSize / 2; i > 0; i >>= 1) {
        sum += __shfl_down_sync(0xFFFFFFFF, sum, i);
    }
    if (idx == 0)
        output[0] = sum;
}
```
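Because this version relies on one warp covering the whole block, it must be launched with a single warp-sized block, e.g. (the `float` element type is an illustrative choice):

```cpp
WarpReduceShuffle<float><<<1, WarpSize>>>(d_in, d_out, num);
```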
One block performs the reduce using shuffle instructions, blockSize % WarpSize == 0. Again, a shfl instruction only reaches within one warp, so the block-wide reduce proceeds in stages: shared memory first handles everything beyond warp size, then the last warp finishes with shuffles.

```cpp
constexpr int WarpSize = 32;
constexpr int BlockSize = 256;

template<typename T>
__global__ void BlockReduceShuffle(const T* input, T* output, int num) {
    int idx = threadIdx.x;
    T sum = 0;
    for (int index = idx; index < num; index += blockDim.x) {
        sum += input[index];
    }

    __shared__ T data[BlockSize];
    data[idx] = sum;  // publish each thread's partial sum
    __syncthreads();

    // Stage 1: shared-memory tree reduction down to WarpSize partial sums.
    for (int i = blockDim.x / 2; i >= WarpSize; i >>= 1) {
        if (idx < i) {
            data[idx] += data[idx + i];
        }
        __syncthreads();
    }

    // Stage 2: the first warp finishes the reduction with shuffles.
    sum = data[idx];
    for (int i = WarpSize / 2; i > 0; i >>= 1) {
        sum += __shfl_down_sync(0xFFFFFFFF, sum, i);
    }

    if (idx == 0)
        output[0] = sum;
}
```
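For completeness, a minimal end-to-end invocation sketch for the single-block kernels above (the `float` element type and the all-ones test data are illustrative assumptions; the same `<<<1, BlockSize>>>` launch also fits `BlockReduceSharemem`):

```cpp
#include <cuda_runtime.h>
#include <iostream>
#include <vector>

int main() {
    const int num = 4096;
    std::vector<float> h_in(num, 1.0f);  // expected sum: 4096
    float *d_in, *d_out;
    cudaMalloc(&d_in, num * sizeof(float));
    cudaMalloc(&d_out, sizeof(float));
    cudaMemcpy(d_in, h_in.data(), num * sizeof(float), cudaMemcpyHostToDevice);

    // One block only: the in-kernel stride loop covers all num elements.
    BlockReduceShuffle<float><<<1, BlockSize>>>(d_in, d_out, num);

    float result = 0.0f;
    cudaMemcpy(&result, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << "sum = " << result << std::endl;

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}
```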
A 2D reduce means the input has two dimensions, in[h, l], and we reduce over the h dimension. At this point thread mapping becomes the issue.

Method 1: map threadIdx.x to the l dimension and threadIdx.y to the h dimension. Data reads are then coalesced, but the reduce cannot be done directly along the block's x direction (each column's partial sums live across threadIdx.y); instead, transpose the partial sums through shared memory and then reduce:
```cpp
#include <cuda_runtime.h>
#include <vector>
#include <iostream>

template<typename T>
__inline__ __device__ T warpReduceSum(T x) {
#pragma unroll
    for (int offset = 16; offset > 0; offset /= 2)
        x += __shfl_down_sync(0xFFFFFFFF, x, offset);
    return x;
}

__global__ void reduce_col(
    const int* in,  // (m, n)
    const int m,
    const int n,
    int* out        // (1, n)
) {
    int tidx = threadIdx.x + blockDim.x * blockIdx.x;
    int tidy = threadIdx.y;

    // 32x32 threads for ease of implementation and understanding; other
    // shapes such as 16x64 also work. (Padding to [32][33] would avoid
    // shared-memory bank conflicts on the transposed store below.)
    __shared__ int cache[32][32];

    // Each column is first accumulated within the block.
    int sum = 0;
    if (tidx < n) {
        for (int i = tidy; i < m; i += blockDim.y) {
            sum += in[i * n + tidx];
        }
    }

    // Transpose the partial sums so each column's values land in one row,
    // which makes the warp reduce possible.
    cache[threadIdx.x][threadIdx.y] = sum;
    __syncthreads();

    // Each row of the block does a warp reduce, yielding 32 results.
    int x = cache[threadIdx.y][threadIdx.x];
    x = warpReduceSum<int>(x);

    // Guard the output index in case n is not a multiple of 32.
    if (threadIdx.x == 0 && blockIdx.x * blockDim.x + threadIdx.y < n) {
        out[blockIdx.x * blockDim.x + threadIdx.y] = x;
    }
}

void print(std::vector<int>& data, const int m, const int n) {
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            std::cout << data[i * n + j] << " ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}

int main() {
    const int m = 32;
    const int n = 32;
    std::vector<int> h_in(m * n), h_out(n, 0);

    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            h_in[i * n + j] = i * n + j;
            h_out[j] += h_in[i * n + j];  // CPU reference result
        }
    }
    print(h_in, m, n);

    int *d_in, *d_out;
    cudaMalloc(&d_in, m * n * sizeof(int));
    cudaMalloc(&d_out, n * sizeof(int));

    cudaMemcpy(d_in, h_in.data(), h_in.size() * sizeof(int), cudaMemcpyHostToDevice);

    dim3 blockDim(32, 32, 1);
    dim3 gridDim((n + 31) / 32, 1, 1);
    reduce_col<<<gridDim, blockDim>>>(d_in, m, n, d_out);

    std::vector<int> check_out(n);
    cudaMemcpy(check_out.data(), d_out, n * sizeof(int), cudaMemcpyDeviceToHost);
    print(h_out, 1, n);
    print(check_out, 1, n);

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}
```
Method 2: map threadIdx.x to the h dimension and threadIdx.y to the l dimension. The reduce can then be done directly within the block, but the global-memory accesses are non-coalesced.
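A minimal sketch of method 2 under those assumptions (the kernel name `reduce_col_v2` and the 32×32 launch shape are illustrative choices, not from the original): each warp owns one column, threadIdx.x strides down h, so adjacent threads read addresses n elements apart (non-coalesced), while the final reduce is a single in-warp shuffle pass with no shared memory needed.

```cpp
// Method 2 sketch: threadIdx.x walks the h (row) dimension, threadIdx.y
// selects the column. Requires blockDim.x == 32 so each column maps to one warp.
__global__ void reduce_col_v2(const int* in, const int m, const int n, int* out) {
    int col = blockIdx.x * blockDim.y + threadIdx.y;  // one warp per column
    if (col >= n) return;  // the whole warp shares one col, so it exits together

    int sum = 0;
    // Strided walk down the column: adjacent threads touch addresses n apart.
    for (int row = threadIdx.x; row < m; row += blockDim.x) {
        sum += in[row * n + col];
    }

    sum = warpReduceSum<int>(sum);  // reuses the helper from the method-1 listing
    if (threadIdx.x == 0) out[col] = sum;
}
```

Launched as, e.g., `reduce_col_v2<<<(n + 31) / 32, dim3(32, 32)>>>(d_in, m, n, d_out);`.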
Supplementary optimization: the index computation inside the reduce loops can itself be made more efficient; one possible approach is sketched below.
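A sketch of what such an indexing optimization could look like, written as a drop-in variant of the column-accumulation loop in `reduce_col` above (this interpretation of "more efficient indexing" is my assumption; variables are as in that kernel):

```cpp
// Step a pointer instead of recomputing i * n + tidx on every iteration.
int sum = 0;
const int* p = in + tidy * n + tidx;  // start at row tidy, column tidx
for (int i = tidy; i < m; i += blockDim.y) {
    sum += *p;
    p += blockDim.y * n;              // advance blockDim.y rows without a multiply
}

// For a flattened index idx over an (m, n) array with n a power of two:
// int row = idx >> log2_n;   // instead of idx / n
// int col = idx & (n - 1);   // instead of idx % n
```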