All threads of one work group are used to compute the reduction over the innermost (last) tensor dimension; the work group is typically fixed at 64 threads.
The accumulation inside a warp/sub-group uses sub_group_reduce_add, which is analogous to a CUDA warp-shuffle reduction and in theory helps performance.
The OpenCL kernel and test code are listed below; on an Arm Mali GPU this implementation is significantly faster than the TFLite GPU delegate. Atomics are not used to combine the partial sums of different warps, because OpenCL's built-in atomic add does not support float, so the partial sums are exchanged through local memory instead.
#include <cstring>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

#include "mem_helper.h"

#define CL_HPP_TARGET_OPENCL_VERSION 300
#include <CL/opencl.hpp>  // OpenCL C++ bindings (cl2.hpp on older SDKs)

using TEST_DTYPE = float;

using namespace std;
std::string kernel_source{R"(
// we use all threads in a work-group to calculate the reduce mean/sum of the last tensor axis
kernel void reduce_kernel(__global const float* d_in, __global float* d_out, int channel_size, int block_size) {
    int gid = get_global_id(0);
    int lid = get_local_id(0);
    int batch = get_group_id(0);
    int warp_id = get_sub_group_id();
    int lane_id = get_sub_group_local_id();  // id of this thread within its warp/sub-group
    int sg_num = get_num_sub_groups();
    // for a sub-group size of 8, block_size should be <= 8 * 8 = 64
    // for a sub-group size of 16, block_size should be <= 16 * 16 = 256
    int addr_offset = batch * channel_size;
    float mean = 0.0f;
    for (int i = lid; i < channel_size; i += block_size) {
        float x0 = d_in[addr_offset + i];
        mean += x0;
    }
    // all values in a warp are reduced together
    mean = sub_group_reduce_add(mean);
    // we use local mem to exchange data between warps:
    // lane 0 of each warp writes its partial sum into local mem, then the threads of one warp read them back for the final reduce.
    // MAX_WARP_NUM should not be greater than the threads in a warp (typically 8 or 16),
    // and should not be less than work_group size / threads in a warp
    #define MAX_WARP_NUM 16
    local float shared_sums[MAX_WARP_NUM];
    if (lane_id == 0) {
        // atomic_store(&shared_sums[warp_id], mean);
        shared_sums[warp_id] = mean;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // mean = (lid < sg_num) ? shared_sums[lid] : 0;  // only the first warp gets the partial sums
    mean = (lane_id < sg_num) ? shared_sums[lane_id] : 0;  // every warp gets the partial sums
    mean = sub_group_reduce_add(mean);
    mean /= channel_size;
    if (lid == 0) {
        d_out[batch] = mean;
    }
}
)"};

int main() {
    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);
    std::cout << "get platform num:" << platforms.size() << std::endl;

    cl::Platform plat;
    for (auto& p : platforms) {
        std::string platver = p.getInfo<CL_PLATFORM_VERSION>();
        if (platver.find("OpenCL 2.") != std::string::npos || platver.find("OpenCL 3.") != std::string::npos) {
            // Note: an OpenCL 3.x platform may not support all required features!
            plat = p;
        }
    }
    if (plat() == 0) {
        std::cout << "No OpenCL 2.0 or newer platform found.\n";
        return -1;
    }

    std::cout << "platform name:" << plat.getInfo<CL_PLATFORM_NAME>() << std::endl;

    cl::Platform newP = cl::Platform::setDefault(plat);
    if (newP != plat) {
        std::cout << "Error setting default platform.\n";
        return -1;
    }

    // get default device (CPUs, GPUs) of the default platform
    std::vector<cl::Device> all_devices;
    newP.getDevices(CL_DEVICE_TYPE_GPU, &all_devices);  // CL_DEVICE_TYPE_ALL
    std::cout << "get all_devices num:" << all_devices.size() << std::endl;

    if (all_devices.size() == 0) {
        std::cout << " No devices found. Check OpenCL installation!\n";
        exit(1);
    }

    // cl::Device default_device = cl::Device::getDefault();
    cl::Device default_device = all_devices[0];
    std::cout << "device name: " << default_device.getInfo<CL_DEVICE_NAME>() << std::endl;

    // a context is like a "runtime link" to the device and platform;
    // i.e. communication is possible
    cl::Context context({default_device});
    cl::CommandQueue queue(context, default_device);

    int batch = 512;
    int channel_size = 768;

    vector<int> shape1 = {batch, channel_size};
    vector<int> shape2 = {batch,};

    MemoryHelper<TEST_DTYPE> mem_in(shape1);
    MemoryHelper<TEST_DTYPE> mem_out(shape2);
    mem_in.StepInit();

    // CL_MEM_WRITE_ONLY CL_MEM_READ_ONLY CL_MEM_READ_WRITE
    cl::Buffer d_in = cl::Buffer(context, CL_MEM_READ_WRITE, mem_in.bytes);
    cl::Buffer d_out = cl::Buffer(context, CL_MEM_READ_WRITE, mem_out.bytes);

    memset(mem_out.Mem(), 0, mem_out.bytes);

    // push write commands to queue
    queue.enqueueWriteBuffer(d_in, CL_TRUE, 0, mem_in.bytes, mem_in.Mem());

    std::vector<std::string> programStrings;
    programStrings.push_back(kernel_source);
    cl::Program program(context, programStrings);

    if (program.build({default_device}, "-cl-std=CL3.0") != CL_SUCCESS) {
        std::cout << "Error building: " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device) << std::endl;
        exit(1);
    }

    auto cl_kernel = cl::KernelFunctor<cl::Buffer, cl::Buffer, int, int>(program, "reduce_kernel");

    int block_size = 64;
    if (block_size > channel_size) {
        block_size = channel_size;
        block_size = (block_size / 8) * 8;
    }
    int local_thread_num = block_size;
    int total_thread_num = batch * local_thread_num;

    // global, or global, local, or offset, global, local
    cl::EnqueueArgs kernel_args(queue, cl::NDRange(total_thread_num), cl::NDRange(local_thread_num));

    cl_kernel(kernel_args, d_in, d_out, channel_size, block_size);
    queue.enqueueReadBuffer(d_out, CL_TRUE, 0, mem_out.bytes, mem_out.Mem());

    std::cout << "results:" << std::endl;

    TEST_DTYPE* h_c = mem_out.Mem();
    for (int i = 0; i < (int)mem_out.elem_num; i++) {
        std::cout << float(h_c[i]) << " ";
    }
    std::cout << std::endl;

    return 0;
}
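To sanity-check the kernel output, the GPU reduce-mean can be compared against a plain CPU loop. The snippet below is a sketch of my own (not part of the original test); it reuses mem_in, mem_out, batch and channel_size from the listing above and would sit right after the enqueueReadBuffer call:

// hypothetical correctness check: recompute the per-batch mean on the CPU and track the largest deviation
float max_abs_err = 0.0f;
for (int b = 0; b < batch; b++) {
    float ref = 0.0f;
    for (int c = 0; c < channel_size; c++) {
        ref += mem_in.Mem()[b * channel_size + c];
    }
    ref /= channel_size;
    float err = ref - mem_out.Mem()[b];
    if (err < 0) err = -err;
    if (err > max_abs_err) max_abs_err = err;
}
std::cout << "max abs err vs CPU reference: " << max_abs_err << std::endl;

Since StepInit() fills 512 x 768 floats with 0.01*i, some float rounding difference between the two summation orders is expected, so an absolute-error bound is more useful than exact equality.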
mem_helper:
#include <cstdlib>
#include <memory>
#include <string>
#include <vector>
using namespace std;

template <class T>
class MemoryHelper {
   public:
    const vector<int> shape;
    const size_t elem_num = 0;
    const string name;
    const size_t bytes = 0;
    std::unique_ptr<T[]> h_mem = nullptr;

   public:
    MemoryHelper(const vector<int>& shape, const string& name = "")
        : shape(shape),
          elem_num(GetElemNum(shape)),
          name(name),
          bytes(elem_num * sizeof(T)) {
        h_mem = std::make_unique<T[]>(elem_num);
    }
    void RandInit(int seed = 0) {
        srand(seed);
        for (size_t i = 0; i < elem_num; i++) {
            h_mem[i] = T(rand() % 100);
        }
    }
    void StepInit(float ratio = 0.01f, float bias = 0.0f) {
        for (size_t i = 0; i < elem_num; i++) {
            h_mem[i] = i * ratio + bias;
        }
    }
    T* Mem() {
        return h_mem.get();
    }
    size_t GetBytes() {
        return bytes;
    }

   public:
    static size_t GetElemNum(const vector<int>& shape) {
        size_t elem_num = 1;
        for (auto elem : shape) {
            elem_num *= elem;
        }
        return elem_num;
    }
};
Additional information
The sub_group size of a Mali GPU corresponds to the CUDA warp size and can also be queried with the related built-in functions (a host-side query sketch follows the excerpt below):

G.2.2 OpenCL 2.1 built-in functions
Several new built-in functions are added in OpenCL 2.1.
The new functions are:
• get_enqueued_num_sub_groups
• get_kernel_max_sub_group_size_for_ndrange
• get_kernel_sub_group_count_for_ndrange
• get_max_sub_group_size
• get_num_sub_groups
• get_sub_group_size
• get_sub_group_local_id
• get_sub_group_id
• sub_group_all
• sub_group_any
• sub_group_barrier
• sub_group_broadcast
• sub_group_commit_read_pipe
• sub_group_commit_write_pipe
• sub_group_reduce_<op>
• sub_group_reserve_read_pipe
• sub_group_reserve_write_pipe
• sub_group_scan_exclusive_<op>
• sub_group_scan_inclusive_<op>
The <op> in sub_group_reduce_<op>, sub_group_scan_inclusive_<op> and sub_group_scan_exclusive_<op> defines the operator and can be add, min or max.
For the sub_group_reduce, sub_group_scan_exclusive, and sub_group_scan_inclusive functions, gentype is int, uint, long, ulong, or float.
If cl_khr_fp16 is supported, gentype also includes half.
If cl_khr_fp64 or doubles are supported, gentype also includes double.
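As a host-side illustration of that query path, the sketch below (my own addition, assuming the device exposes OpenCL 2.1+ core sub-groups or cl_khr_subgroups) reads the maximum sub-group size reduce_kernel would use for a 64-thread work-group; program and default_device are the objects from the test code above:

// sketch: query the sub-group ("warp") width for reduce_kernel at a given local size
cl::Kernel sg_query_kernel(program, "reduce_kernel");
size_t local_size = 64;            // the intended work-group size
size_t max_sub_group_size = 0;
clGetKernelSubGroupInfo(sg_query_kernel(), default_device(),
                        CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
                        sizeof(local_size), &local_size,
                        sizeof(max_sub_group_size), &max_sub_group_size, nullptr);
std::cout << "max sub-group size: " << max_sub_group_size << std::endl;  // typically 8 or 16 on Mali

On an OpenCL 2.0 driver the same information is available through the cl_khr_subgroups entry point clGetKernelSubGroupInfoKHR, or from inside a kernel via get_max_sub_group_size().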
Other warp-shuffle-related functions (a hand-rolled shuffle reduction sketch follows the list):
// These functions are available to devices supporting cl_khr_subgroup_extended_types:
// Note: Existing functions supporting additional data types.
gentype sub_group_broadcast( gentype value, uint index )
gentype sub_group_reduce_add( gentype value )
gentype sub_group_reduce_min( gentype value )
gentype sub_group_reduce_max( gentype value )
gentype sub_group_scan_inclusive_add( gentype value )
gentype sub_group_scan_inclusive_min( gentype value )
gentype sub_group_scan_inclusive_max( gentype value )
gentype sub_group_scan_exclusive_add( gentype value )
gentype sub_group_scan_exclusive_min( gentype value )
gentype sub_group_scan_exclusive_max( gentype value )
// These functions are available to devices supporting cl_khr_subgroup_shuffle:
gentype sub_group_shuffle( gentype value, uint index )
gentype sub_group_shuffle_xor( gentype value, uint mask )
// These functions are available to devices supporting cl_khr_subgroup_shuffle_relative:
gentype sub_group_shuffle_up( gentype value, uint delta )
gentype sub_group_shuffle_down( gentype value, uint delta )
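As an aside (my own sketch, not from the original kernel), the per-warp accumulation that reduce_kernel delegates to sub_group_reduce_add could also be written by hand with the relative shuffles above, mirroring the classic CUDA __shfl_down pattern; it assumes the device reports cl_khr_subgroup_shuffle_relative and a power-of-two sub-group size:

// manual warp-level sum using shuffles; after the loop only lane 0 is guaranteed to hold the full sum
float warp_reduce_add(float v) {
    for (uint delta = get_sub_group_size() / 2; delta > 0; delta /= 2) {
        v += sub_group_shuffle_down(v, delta);  // lanes past the end read undefined data, which never reaches lane 0
    }
    return v;
}

In the kernel above this buys nothing over sub_group_reduce_add, which the driver can map to the same hardware shuffles; a hand-written loop is mainly useful when porting CUDA-style reductions that need a custom combine operation.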
References:
The OpenCL™ Extension Specification
Arm® Mali™ Bifrost and Valhall OpenCL Developer Guide
For atomic operations, see 《OpenCL异构并行计算：原理、机制与优化实践》 (OpenCL Heterogeneous Parallel Computing: Principles, Mechanisms, and Optimization Practice).