Getting Started with Tensor Core WMMA API Programming


    WMMA (Warp-level Matrix Multiply Accumulate) API

      For CUDA devices with compute capability 7.0 and above, Tensor Cores can be invoked from CUDA C++ through the WMMA API, which performs mixed-precision matrix multiply-accumulate of the form D = A*B + C.

    // fragment: Tensor Core data storage class; Use can be matrix_a, matrix_b, or accumulator
    template<typename Use, int m, int n, int k, typename T, typename Layout = void> class fragment;
    
    // load_matrix_sync: loads a matrix tile from global or shared memory into a fragment
    void load_matrix_sync(fragment<...> &a, const T* mptr, unsigned ldm);
    void load_matrix_sync(fragment<...> &a, const T* mptr, unsigned ldm, layout_t layout);
    
    // store_matrix_sync: stores a result tile from a fragment back to global or shared memory
    void store_matrix_sync(T* mptr, const fragment<...> &a, unsigned ldm, layout_t layout);
    
    // fill_fragment: fills a fragment with a constant value
    void fill_fragment(fragment<...> &a, const T& v);
    
    // mma_sync: warp-synchronous matrix multiply-accumulate, D = A*B + C (or C = A*B + C)
    void mma_sync(fragment<...> &d, const fragment<...> &a, const fragment<...> &b, const fragment<...> &c, bool satf=false);
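
      To see how these five APIs fit together, here is a minimal single-tile sketch (not from the original article; the 16x16x16 shape, the FP16/FP32 type choices, and the layouts are assumptions for illustration): one warp computes D = A*B + C for a single 16x16 tile.

    #include <mma.h>
    #include <cuda_fp16.h>
    using namespace nvcuda;
    
    // One warp computes a single 16x16 tile: D = A*B + C (A, B in FP16; C, D accumulated in FP32).
    __global__ void wmmaSingleTile(const half *A, const half *B, const float *C, float *D) {
        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag;
        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::col_major> b_frag;
        wmma::fragment<wmma::accumulator, 16, 16, 16, float> c_frag;
    
        // Load the 16x16 input tiles; the leading dimension is 16 because each tile is stored contiguously.
        wmma::load_matrix_sync(a_frag, A, 16);
        wmma::load_matrix_sync(b_frag, B, 16);
        wmma::load_matrix_sync(c_frag, C, 16, wmma::mem_row_major);
    
        // The whole warp cooperatively performs the 16x16x16 matrix multiply-accumulate.
        wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
    
        // Write the accumulator back to global memory in row-major order.
        wmma::store_matrix_sync(D, c_frag, 16, wmma::mem_row_major);
    }

      Such a kernel would be launched with a single warp, e.g. wmmaSingleTile<<<1, 32>>>(dA, dB, dC, dD).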
    
    Example

      Taking m16n16k16 as an example, we implement HGEMM: C = A*B, where A (M x K, row major), B (K x N, col major), and C (M x N, row major) are all FP16. First, let's look at a naive HGEMM kernel written with CUDA Cores.

    CUDA Core

    The naive kernel assigns one thread to each element of C: the thread first determines the coordinates of its element in C, then loops over K, loading the required elements of A and B directly from global memory into registers and accumulating, and finally writes the result from registers back to C. Once all blocks have finished, the full matrix C is available. There is nothing sophisticated about this kernel, but it serves as the baseline for comparison.

    #include <cuda_fp16.h>
    
    #define DIV_CEIL(x, y) (((x) + (y) - 1) / (y))
    __global__ void naiveKernel(const half *__restrict__ A, const half *__restrict__ B, half *__restrict__ C, size_t M,
                                    size_t N, size_t K) {
        size_t row = threadIdx.x + blockDim.x * blockIdx.x;
        size_t col = threadIdx.y + blockDim.y * blockIdx.y;
        if (row < M && col < N) {
            half tmp = 0.0;
            for (size_t i = 0; i < K; ++i) {
                tmp += A[row * K + i] * B[i + col * K];
            }
            C[row * N + col] = tmp;
        }
    }
    void hgemmNaive(half *A, half *B, half *C, size_t M, size_t N, size_t K) {
        dim3 block(16, 16);
        dim3 grid(DIV_CEIL(M, block.x), DIV_CEIL(N, block.y));
        naiveKernel<<<grid, block>>>(A, B, C, M, N, K);
    }
    
    Tensor Core

      Now let's see how to build the naive kernel with the WMMA API, following the CUDA samples. Unlike the CUDA Core version, the WMMA kernel assigns each warp one WMMA_M x WMMA_N tile of C, because Tensor Cores operate at warp granularity and on two-dimensional tiles. Apart from that, the flow mirrors the CUDA Core naive kernel: first determine the coordinates of the tile of C handled by the current warp and declare the fragments needed to compute that tile, then loop over K with a step of WMMA_K, loading the required tiles of A and B directly from global memory into fragments and accumulating with mma_sync, and finally write the result from the fragment back to C. Once all blocks have finished, the full matrix C is available. Note that both load_matrix_sync and store_matrix_sync access the matrix with a stride (the leading dimension ldm).

    #include <mma.h>
    #include <cuda_fp16.h>
    #define WARP_SIZE 32
    #define WMMA_M 16
    #define WMMA_N 16
    #define WMMA_K 16
    using namespace nvcuda;
     
    __global__ void wmmaNaiveKernel(const half *__restrict__ A, const half *__restrict__ B, half *__restrict__ C, size_t M,
                                    size_t N, size_t K) {
        size_t warpM = (blockIdx.x * blockDim.x + threadIdx.x) / WARP_SIZE;
        size_t warpN = (blockIdx.y * blockDim.y + threadIdx.y);
        wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> a_frag;
        wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::col_major> b_frag;
        wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, half> c_frag;
        wmma::fill_fragment(c_frag, 0.0f);
        for (size_t i = 0; i < K; i += WMMA_K) {
            size_t aCol = i;
            size_t aRow = warpM * WMMA_M;
            size_t bCol = warpN * WMMA_N;
            size_t bRow = i;
            if (aRow < M && aCol < K && bRow < K && bCol < N) {
                wmma::load_matrix_sync(a_frag, A + aCol + aRow * K, K);
                wmma::load_matrix_sync(b_frag, B + bRow + bCol * K, K);
                wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
            }
        }
        size_t cCol = warpN * WMMA_N;
        size_t cRow = warpM * WMMA_M;
        if (cRow < M && cCol < N) {
            wmma::store_matrix_sync(C + cCol + cRow * N, c_frag, N, wmma::mem_row_major);
        }
    }
    void hgemmWmmaNaive(half *A, half *B, half *C, size_t M, size_t N, size_t K) {
        dim3 block(128, 4);
        dim3 grid((M - 1) / (WMMA_M * block.x / WARP_SIZE) + 1, (N - 1) / (WMMA_N * block.y) + 1);
        wmmaNaiveKernel<<<grid, block>>>(A, B, C, M, N, K);
    }
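
      To make the launch geometry concrete, here is a short worked example (the sizes M = N = K = 1024 are an assumption for illustration, not from the original article):

    // block(128, 4) provides blockDim.x / WARP_SIZE = 128 / 32 = 4 warps along x and blockDim.y = 4 warps along y,
    // so each block covers a (4 * WMMA_M) x (4 * WMMA_N) = 64 x 64 tile of C.
    // With M = N = K = 1024:
    //   grid.x = (1024 - 1) / (WMMA_M * 128 / WARP_SIZE) + 1 = 1023 / 64 + 1 = 16
    //   grid.y = (1024 - 1) / (WMMA_N * 4) + 1              = 1023 / 64 + 1 = 16
    // i.e. a 16 x 16 grid of blocks, each computing one 64 x 64 tile of C.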
    

    Comparing the two naive kernels above, the differences between calling CUDA Cores and Tensor Cores are as follows (a small harness that cross-checks the two kernels is sketched after the list):
    Execution granularity: CUDA Cores work at the thread level, Tensor Cores at the warp level.
    Computation shape: CUDA Cores compute scalar elements one at a time, Tensor Cores compute a two-dimensional tile at a time.
    Dependencies: calling Tensor Cores through WMMA requires the fragment storage class, whereas CUDA Cores need no such intermediary.
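
    For completeness, here is a minimal host-side sketch (not from the original article) that runs both kernels above on the same random input and compares the results; the helper name checkHgemm and the use of managed memory are assumptions for illustration, and M, N, K are assumed to be multiples of 16.

    #include <cstdio>
    #include <cstdlib>
    #include <cmath>
    #include <cuda_fp16.h>
    
    // Hypothetical harness: run hgemmNaive (CUDA Core) and hgemmWmmaNaive (Tensor Core)
    // on identical inputs and report the largest element-wise difference.
    void checkHgemm(size_t M, size_t N, size_t K) {
        half *A, *B, *C_cuda, *C_wmma;
        cudaMallocManaged(&A, M * K * sizeof(half));
        cudaMallocManaged(&B, K * N * sizeof(half));
        cudaMallocManaged(&C_cuda, M * N * sizeof(half));
        cudaMallocManaged(&C_wmma, M * N * sizeof(half));
    
        // Fill A (row major) and B (col major) with small values that are exact in FP16.
        for (size_t i = 0; i < M * K; ++i) A[i] = __float2half((rand() % 5) * 0.25f);
        for (size_t i = 0; i < K * N; ++i) B[i] = __float2half((rand() % 5) * 0.25f);
    
        hgemmNaive(A, B, C_cuda, M, N, K);      // CUDA Core baseline
        hgemmWmmaNaive(A, B, C_wmma, M, N, K);  // Tensor Core (WMMA) version
        cudaDeviceSynchronize();
    
        // FP16 accumulation order differs between the two paths, so expect small differences.
        float maxDiff = 0.0f;
        for (size_t i = 0; i < M * N; ++i)
            maxDiff = fmaxf(maxDiff, fabsf(__half2float(C_cuda[i]) - __half2float(C_wmma[i])));
        printf("max |C_cuda - C_wmma| = %f\n", maxDiff);
    
        cudaFree(A); cudaFree(B); cudaFree(C_cuda); cudaFree(C_wmma);
    }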

    Example: WMMA via a HIP compiler intrinsic (RDNA 3)
    // Wave Matrix Multiply Accumulate (WMMA) using HIP compiler intrinsic
    // Does a matrix multiplication of two 16x16, fp16 matrices, and stores them into a 16x16 fp16 result matrix
    
    #include <iostream>
    #include <hip/hip_runtime.h>
    #include <hip/hip_fp16.h>
    
    using namespace std;
    
    // Use half16 as an alias of the internal clang vector type of 16 fp16 values
    typedef _Float16 half16 __attribute__((ext_vector_type(16)));
    
    __global__ void wmma_matmul(__half* a, __half* b, __half* c)
    {
        const int gIdx = blockIdx.x * blockDim.x + threadIdx.x;
        const int lIdx = threadIdx.x;
    
        // a and b fragments are stored in 8 VGPRs each, in packed format, so 16 elements each for a and b
        // a_frag will store one column of the 16x16 matrix A tile
        // b_frag will store one row of the 16x16 matrix B tile
        half16 a_frag;
        half16 b_frag;
        // initialize c fragment to 0
        half16 c_frag = {};
    
        // lane is (0-31) mod 16 instead of 0-31 due to matrix replication in RDNA 3
        const int lane = lIdx % 16;
    
        for (int ele = 0; ele < 16; ++ele)
        {
            b_frag[ele] = b[16*ele + lane];
        }
    
        for (int ele = 0; ele < 16; ++ele)
        {
            a_frag[ele] = a[16 * lane + ele];
        }
    
        // call the WMMA intrinsic with OPSEL set to "false"
        c_frag = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32(a_frag, b_frag, c_frag, false);
    
        for (int ele = 0; ele < 8; ++ele)
        {
            const int r = ele * 2 + (lIdx / 16);
            // store results from unpacked c_frag output
            c[16 * r + lane] = c_frag[ele*2];
            // if OPSEL was set to "true", the line above would instead be
            // c[16 * r + lane] = c_frag[ele*2 + 1];
        }
    
    }
    
    int main(int argc, char* argv[])
    
    {
        __half a[16 * 16] = {};
        __half b[16 * 16] = {};
        __half c[16 * 16] = {};
        __half *a_gpu, *b_gpu, *c_gpu;
        hipMalloc(&a_gpu, 16*16 * sizeof(__half));
        hipMalloc(&b_gpu, 16*16 * sizeof(__half));
        hipMalloc(&c_gpu, 16*16 * sizeof(__half));
    
        // fill in some data into matrices A and B
        for (int i = 0; i < 16; ++i)
        {
            for (int j = 0; j < 16; ++j)
            {
                a[i * 16 + j] = (__half)1.f;
                b[i * 16 + j] = (__half)1.f;
            }
        }
    
        hipMemcpy(a_gpu, a, (16*16) * sizeof(__half), hipMemcpyHostToDevice);
        hipMemcpy(b_gpu, b, (16*16) * sizeof(__half), hipMemcpyHostToDevice);
        hipMemcpy(c_gpu, c, (16*16) * sizeof(__half), hipMemcpyHostToDevice);
    
        wmma_matmul<<<dim3(1), dim3(32, 1, 1), 0, 0>>>(a_gpu, b_gpu, c_gpu);
    
        hipMemcpy(c, c_gpu, (16 * 16) * sizeof(__half), hipMemcpyDeviceToHost);
    
        hipFree(a_gpu);
        hipFree(b_gpu);
        hipFree(c_gpu);
    
        for (int i = 0; i < 16; ++i)
        {
            for (int j = 0; j < 16; ++j)
            {
                printf("%f ", (float)c[i * 16 + j]);
            }
            printf("\n");
        }
    
        return 0;
    }
    
    Shape

    The WMMA fragment shape (m, n, k) cannot be arbitrary; for FP16 inputs the CUDA WMMA API supports the tile shapes m16n16k16, m32n8k16, and m8n32k16. See the following note on setting the shape:
    https://note.youdao.com/ynoteshare/index.html?id=5a5ad61899123a3e3e9cb1f38f338936&type=note&_time=1697429671762

  • Original article: https://blog.csdn.net/wanchaochaochao/article/details/133807794