• 3D Sparse Convolution: spconv Source Code Analysis (Part 3)


    Building the Rulebook

    Let's look at ops.get_indice_pairs, located in spconv/ops.py.

    Building the Rulebook is done through the ops.get_indice_pairs interface.

    The concrete implementation of get_indice_pairs:

    def get_indice_pairs(indices,  # (N, 4) voxel grid coordinate indices
                         batch_size,
                         spatial_shape,
                         ksize=3,
                         stride=1,
                         padding=0,
                         dilation=1,
                         out_padding=0,
                         subm=False,
                         transpose=False,
                         grid=None,
                         use_hash=False):
        ndim = indices.shape[1] - 1 # 4 -> 3
        if not isinstance(ksize, (list, tuple)):
            ksize = [ksize] * ndim # 3 -> [3, 3, 3], a 3x3x3 kernel
        if not isinstance(stride, (list, tuple)):
            stride = [stride] * ndim # 1 -> [1, 1, 1]
        if not isinstance(padding, (list, tuple)):
            padding = [padding] * ndim # 0 -> [0, 0, 0]
        if not isinstance(dilation, (list, tuple)):
            dilation = [dilation] * ndim # 1 -> [1, 1, 1]
        if not isinstance(out_padding, (list, tuple)):
            out_padding = [out_padding] * ndim # 0 -> [0, 0, 0]
    
        for d, s in zip(dilation, stride): # stride and dilation must not both differ from 1
            assert any([s == 1, d == 1]), "don't support this." # any() is True if at least one element is True
    
        if not subm: # regular sparse convolution
            if transpose: # False
                out_shape = get_deconv_output_size(spatial_shape, ksize, stride, padding, dilation, out_padding)
            else:
                # compute the output shape of a regular sparse convolution
                out_shape = get_conv_output_size(spatial_shape, ksize, stride, padding, dilation)
        else: # submanifold sparse convolution
            out_shape = spatial_shape # output shape equals input shape
        if grid is None: # None
            # src/spconv/all.cc registers the underlying C++ API via PyTorch's OP Register mechanism.
            # The .so file is loaded with torch.ops.load_library, so torch.ops.spconv.get_indice_pairs
            # calls the getIndicePairs function in src/spconv/spconv_ops.cc.
            res = torch.ops.spconv.get_indice_pairs(indices, batch_size, out_shape, spatial_shape, ksize, stride,
                                                    padding, dilation, out_padding, int(subm), int(transpose), int(use_hash))
            return res
        else:
            if ndim == 2:
                get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_grid_2d
            elif ndim == 3:
                get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_grid_3d
            else:
                raise NotImplementedError
            return get_indice_pairs_func(indices, grid, batch_size, out_shape, spatial_shape, ksize, stride, padding,
                                         dilation, out_padding, int(subm), int(transpose), int(use_hash))
    

    This function mainly performs parameter validation and preprocessing. For a regular 3D sparse convolution, the output shape is computed from the input spatial shape, kernel size, stride, and the other parameters; a submanifold sparse convolution skips this computation, because its output shape is identical to the input shape. A small sketch of the shape computation is given below.
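
    For reference, here is a minimal Python sketch of how the output spatial shape of a regular sparse convolution can be derived. The helper name conv_output_size is made up for illustration; it simply mirrors the standard convolution output-size formula that get_conv_output_size implements.

    def conv_output_size(spatial_shape, ksize, stride, padding, dilation):
        # standard convolution output-size formula, applied per spatial dimension
        out_shape = []
        for i, size in enumerate(spatial_shape):
            eff_k = dilation[i] * (ksize[i] - 1) + 1  # effective kernel extent
            out_shape.append((size + 2 * padding[i] - eff_k) // stride[i] + 1)
        return out_shape

    # e.g. a [41, 1440, 1440] grid with a 3x3x3 kernel, stride 2, padding 1:
    # conv_output_size([41, 1440, 1440], [3]*3, [2]*3, [1]*3, [1]*3) -> [21, 720, 720]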

    Once the parameters are prepared, we reach the core get_indice_pairs call. Because spconv loads its .so file with torch.ops.load_library to register the operators, the function is invoked here as torch.ops.spconv.get_indice_pairs.

    Operator registration: in src/spconv/all.cc, the underlying C++ API is registered through the OP Register mechanism provided by PyTorch, so the C++ operators can be called through a Python interface.

    static auto registry =
        torch::RegisterOperators()
            .op("spconv::get_indice_pairs", &spconv::getIndicePairs)
    

    Like the C++ extension mechanism, OP Register is a low-level operator-registration facility provided by PyTorch. A registered operator can be called via torch.xxx or tensor.xxx. This approach is likewise decoupled from the PyTorch source tree, so adding or modifying operators does not require recompiling PyTorch. Registering a new operator this way is very simple: implement the operator in C++, then register it through PyTorch's low-level registration interface (torch::RegisterOperators).
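
    For example, once the extension library has been built, the registered operator can be called from Python roughly as follows. This is only a sketch: the library path and the argument values are illustrative, not taken from a real configuration.

    import torch

    # load the compiled extension; this registers all operators under the spconv:: namespace
    torch.ops.load_library("path/to/libspconv.so")  # hypothetical path, depends on your build

    indices = torch.zeros((1, 4), dtype=torch.int32).cuda()  # (N, 4) voxel coordinates (batch, z, y, x)
    spatial_shape = [41, 1440, 1440]

    # the registered C++ function is now reachable as torch.ops.spconv.get_indice_pairs
    outids, indice_pairs, indice_num = torch.ops.spconv.get_indice_pairs(
        indices, 1, spatial_shape, spatial_shape, [3, 3, 3], [1, 1, 1], [1, 1, 1],
        [1, 1, 1], [0, 0, 0], 1, 0, 0)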

    Building the Rulebook actually goes through the Python interface get_indice_pairs to the getIndicePairs function in src/spconv/spconv_ops.cc.

    The code is located in src/spconv/spconv_ops.cc:

    std::vector<torch::Tensor>
    getIndicePairs(torch::Tensor indices,                 // torch.Size([N, 4])
                   int64_t batchSize,                     // 4
                   std::vector<int64_t> outSpatialShape,  // [41, 1440, 1440]
                   std::vector<int64_t> spatialShape,     // [41, 1440, 1440]
                   std::vector<int64_t> kernelSize,       // [3, 3, 3]
                   std::vector<int64_t> stride,           // [1, 1, 1]
                   std::vector<int64_t> padding,          // [1, 1, 1]
                   std::vector<int64_t> dilation,         // [1, 1, 1]
                   std::vector<int64_t> outPadding,       // [0, 0, 0]
                   int64_t _subM,                         // 1 for SubMConv3d, 0 for SparseConv3d
                   int64_t _transpose,                    // 0
                   int64_t _useHash                       // 0
                   ) {
      // auto timer = spconv::CudaContextTimer<>();
      bool subM = _subM != 0;           // 1 for SubMConv3d, 0 for SparseConv3d
      bool transpose = _transpose != 0; // False
      auto NDim = kernelSize.size();    // 3
      // CPU always use hash (tsl::robin_map).
      bool useHash = _useHash != 0 || indices.device().type() == torch::kCPU; // _useHash defaults to False
      auto numAct = indices.size(0);      // torch.Size([N, 4]) -> N, the number of active input sites
      auto coorDim = indices.size(1) - 1; // torch.Size([N, 4]) -> 3
      TV_ASSERT_RT_ERR(NDim == coorDim, "error");                  // 3 == 3
      TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");     // 3 == 3
      TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");// 3 == 3
      TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");         // 3 == 3
      TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");        // 3 == 3
      TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");     // 3 == 3
      TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");       // 3 == 3
    
      // [3, 3, 3] -> 3*3*3 = 27
      auto kernelVolume = kernelSize[0]; // 3
      for (int i = 1; i < kernelSize.size(); ++i) {
        kernelVolume *= kernelSize[i]; 
      }
      TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
      // [41, 1440, 1440] -> 41*1440*1440
      auto outputVolume = outSpatialShape[0]; 
      for (int i = 1; i < outSpatialShape.size(); ++i) {
        outputVolume *= outSpatialShape[i];
      }
      std::string msg = "due to limits of cuda hash, the volume of dense space include batch size ";
      msg += "must less than std::numeric_limits<int>::max() = 2e9";
      TV_ASSERT_RT_ERR(batchSize * outputVolume < std::numeric_limits<int>::max(), msg);
      // indicePairs: torch.Size([2, 27, N]), filled with -1.
      // The 2 stands for the input and output directions; kernelVolume is the volume of the kernel,
      // e.g. a 3x3x3 kernel has volume 27 (3*3*3); numAct is the number of active input features.
      torch::Tensor indicePairs = torch::full({2, kernelVolume, numAct}, -1, torch::dtype(torch::kInt32).device(indices.device()));
      // indiceNum: torch.Size([27]). Stores, for each kernel position, the total number of computations,
      // because each element of a sparse-convolution kernel may interact with a different number of active sites.
      torch::Tensor indiceNum = torch::zeros({kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
      // 4*41*1440*1440
      auto gridSize = batchSize * outputVolume;
      if (useHash) { // False by default when running on GPU
          gridSize = batchSize; // when useHash is true, or when running on CPU
      }
      // torch.Size([4*41*1440*1440])
      torch::Tensor gridOut = torch::full({gridSize}, -1, torch::dtype(torch::kInt32).device(indices.device()));
      // torch.Size([4, 41*1440*1440])
      gridOut = gridOut.view({batchSize, -1});
      int64_t numActOut = -1;
      // For submanifold sparse convolution the output shape equals the input shape,
      // so force stride = 1 and derive the padding from the kernel size.
      for (int i = 0; i < NDim; ++i) {
        if (subM) { 
          padding[i] = kernelSize[i] / 2;
          stride[i] = 1;
        }
      }
      // tv::ssprint("prepare", timer.report() / 1000.0);
      if (subM) { // submanifold sparse convolution
        if (indices.device().type() == torch::kCPU) { // False
    		......// cpu
        }
    #ifdef TV_CUDA
        else if (indices.device().type() == torch::kCUDA) { // True
          numActOut = create_submconv_indice_pair_cuda(
              indices, gridOut, indicePairs, indiceNum, kernelSize, stride, padding,
              dilation, outSpatialShape, transpose, false, useHash);
          if (numActOut == -1) {
            auto device = indices.device();
            indicePairs = indicePairs.to({torch::kCPU});
            indiceNum = indiceNum.to({torch::kCPU});
            indices = indices.to({torch::kCPU});
            numActOut = create_submconv_indice_pair_cpu(indices, gridOut, indicePairs, indiceNum, kernelSize, stride, padding, dilation, outSpatialShape, transpose, false, useHash);
            
            return {indices.to(device), indicePairs.to(device), indiceNum.to(device)};
          }
    
        }
    #endif
        else {
          TV_THROW_INVALID_ARG("unknown device type");
        }
        // tv::ssprint("subm", timer.report() / 1000.0);
        return {indices, indicePairs, indiceNum};
      } else { // regular sparse convolution: initialize indicePairUnique and outInds
        auto indicePairUnique = torch::full({indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(), torch::dtype(torch::kInt32).device(indices.device()));
        torch::Tensor outInds = torch::zeros({numAct * kernelVolume, coorDim + 1}, torch::dtype(torch::kInt32).device(indices.device()));
        if (indices.device().type() == torch::kCPU) { // CPU
    		......// cpu
        }
    #ifdef TV_CUDA
        else if (indices.device().type() == torch::kCUDA) { // GPU
          numActOut = create_conv_indice_pair_p1_cuda(indices, indicePairs, indiceNum, indicePairUnique, kernelSize, stride, padding, dilation, outSpatialShape, transpose);
          if (numActOut > 0) {
            auto res = torch::_unique(indicePairUnique);
            indicePairUnique = std::get<0>(res);
            numActOut = create_conv_indice_pair_p2_cuda(indices, outInds, gridOut, indicePairs, indiceNum, indicePairUnique, outSpatialShape, transpose, false, useHash);
            if (numActOut == -1) {
              auto device = indices.device();
              outInds = outInds.to({torch::kCPU});
              indicePairs = indicePairs.to({torch::kCPU});
              indiceNum = indiceNum.to({torch::kCPU});
              indices = indices.to({torch::kCPU});
              numActOut = create_conv_indice_pair_cpu(indices, outInds, gridOut, indicePairs, indiceNum, kernelSize, stride, padding, dilation, outSpatialShape, transpose, false, useHash);
    
              return {outInds.to(device).slice(0, 0, numActOut), indicePairs.to(device), indiceNum.to(device)};
            }
          }
        }
    #endif
        else {
          TV_THROW_INVALID_ARG("unknown device type");
        }
        return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
      }
    }
    

    In analyzing getIndicePairs we focus directly on the GPU logic, treating submanifold 3D sparse convolution and regular 3D sparse convolution separately, starting with the submanifold case.

    The three most important variables in the code are indicePairs, indiceNum and gridOut; they are created as follows:

      // torch.Size([2, 27, N]), filled with -1.
      // The 2 stands for the input and output directions; kernelVolume is the volume of the kernel,
      // e.g. a 3x3x3 kernel has volume 27 (3*3*3); numAct is the number of active input features.
      torch::Tensor indicePairs = torch::full({2, kernelVolume, numAct}, -1, torch::dtype(torch::kInt32).device(indices.device()));
      // indiceNum: torch.Size([27]). Stores, for each kernel position, the total number of computations,
      // because each element of a sparse-convolution kernel may interact with a different number of active sites.
      torch::Tensor indiceNum = torch::zeros({kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
      // 4*41*1440*1440
      auto gridSize = batchSize * outputVolume;
    

    indicePairs encodes the input-output mapping rule of the sparse convolution, i.e. the Input Hash Table and the Output Hash Table. The theoretical maximum amount of memory is allocated here; its shape is {2, kernelVolume, numAct}, where 2 stands for the input and output directions, kernelVolume is the volume of the convolution kernel (for a 3x3x3 kernel the volume is 27 = 3*3*3), and numAct is the number of active input features. indiceNum stores, for each kernel position, the total number of computations; it corresponds to count in the figure. A small sketch of these buffers follows the figure.

    [Figure: rulebook construction; the per-kernel-position count shown in the figure corresponds to indiceNum]
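
    To make the shapes concrete, here is a small PyTorch sketch of the three buffers that getIndicePairs allocates. The sizes are example values only.

    import torch

    num_act = 1000        # N: number of active input sites (example value)
    kernel_volume = 27    # 3*3*3 kernel
    batch_size = 4
    output_volume = 41 * 1440 * 1440

    # rulebook: pairs of (input index, output index) per kernel offset, padded with -1
    indice_pairs = torch.full((2, kernel_volume, num_act), -1, dtype=torch.int32)
    # number of valid pairs collected for each kernel offset ("count" in the figure)
    indice_num = torch.zeros(kernel_volume, dtype=torch.int32)
    # dense grid mapping a flattened (batch, z, y, x) location to its output row, -1 if empty
    grid_out = torch.full((batch_size, output_volume), -1, dtype=torch.int32)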

    In the code, building the rulebook on the GPU is done by calling create_submconv_indice_pair_cuda; this function is analyzed in detail below.

    Submanifold sparse convolution

    Submanifold sparse convolution builds its rulebook by calling create_submconv_indice_pair_cuda:

    int create_submconv_indice_pair_cuda(
        torch::Tensor indicesIn,          // torch.Size([N, 4]) voxel spatial indices
        torch::Tensor gridsOut,           // torch.Size([4, 41*1440*1440])
        torch::Tensor indicePairs,        // torch.Size([2, 27, N]), filled with -1, stores the rulebook
        torch::Tensor indiceNum,          // torch.Size([27]), total number of computations for each kernel position
        std::vector<int64_t> kernelSize,  // [3, 3, 3]
        std::vector<int64_t> stride,      // [1, 1, 1]
        std::vector<int64_t> padding,     // [1, 1, 1]
        std::vector<int64_t> dilation,    // [1, 1, 1]
        std::vector<int64_t> outSpatialShape, // [41, 1440, 1440]
        bool transpose,                   // false
        bool resetGrid,                   // false
        bool useHash                      // false
        ) {
        auto stream = at::cuda::getCurrentCUDAStream();
        auto ndim = outSpatialShape.size(); // 3
        auto numActIn = indicesIn.size(0);  // N, the number of active input features
        int batchSize = gridsOut.size(0);   // 4
    
        auto kernelVolume = indiceNum.size(0); // 3x3x3 => 27
        if (numActIn == 0)
            return 0;
        bool failed = false;
    
        tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
            using Index = TV_DECLTYPE(IndexValue); // type deduction
            using IndexGrid = int32_t;
            tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
                constexpr int NDim = TV_DECLTYPE(I)::value; // 3
                // copy the parameters into tv::SimpleVector variables
                tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
                tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
                tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
                tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
                tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(), outSpatialShape.end());
                Index spatialVolume = 1; // output volume
                for (int i = 0; i < NDim; ++i) {
                    spatialVolume *= outSpatialShape[i]; // 41*1440*1440
                }
                if (useHash) {
    			//...omitted...
                } else {
                    // auto timer = spconv::CudaContextTimer<>();
                    // block_size is tv::cuda::CUDA_NUM_THREADS = 1024; grid_size is computed by
                    // tv::cuda::getBlocks(numActIn), where numActIn is the number of active input sites.
                    // prepareSubMGridKernel essentially builds a hash table from the output tensor
                    // coordinate (expressed as a flattened voxel index) to the output sequence number.
                    prepareSubMGridKernel<Index, IndexGrid, NDim>
                        <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
                            tv::torch2tv<Index>(indicesIn), tv::torch2tv<IndexGrid>(gridsOut), ou, spatialVolume);
                    // tv::ssprint("prepareSubMGridKernel", timer.report() / 1000.0);
                    TV_CHECK_CUDA_ERR_V2("prepareSubMGridKernel failed");
                    // when dilation all one, we use a simple kernel to calc result
                    bool dilation_one = true;
                    for (int i = 0; i < NDim; ++i) {
                        dilation_one &= di[i] == 1; // true
                    }
                    auto found = false;
                    if (dilation_one && (NDim == 2 || NDim == 3)) {
                        auto indiceNumCpu = indiceNum.cpu();
                        if (NDim == 2) {
    					//...omitted...
                        } else if (NDim == 3) {
                            tv::SimpleVector<Index, 3> ou_(outSpatialShape.begin(), outSpatialShape.end()); // output shape
    
                            tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[0], [&](auto K0C) {
                                tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[1], [&](auto K1C) {
                                    tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[2], [&](auto K2C) {
                                        // kernel size per dimension
                                        constexpr int K0 = TV_DECLTYPE(K0C)::value; 
                                        constexpr int K1 = TV_DECLTYPE(K1C)::value;
                                        constexpr int K2 = TV_DECLTYPE(K2C)::value;
                                        found = true;
                                        // block_size is tv::cuda::CUDA_NUM_THREADS = 1024;
                                        // grid_size is computed by tv::cuda::getBlocks(numActIn),
                                        // where numActIn is the number of active input sites;
                                        // spatialVolume is the product of the output shape dimensions.
                                        getSubMIndicePairsKernel3<Index, IndexGrid, K0, K1, K2>
                                            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
                                                tv::torch2tv<Index>(indicesIn), tv::torch2tv<IndexGrid>(gridsOut),
                                                tv::torch2tv<Index>(indicePairs), tv::torch2tv<Index>(indiceNum), ou_, spatialVolume);
                                    });
                                });
                            });
                        }
                    }
                    if (!found) {
    				//...omitted...
                    }
                    // tv::ssprint("getSubMIndicePairsKernel", timer.report() / 1000.0);
                }
    
                if (resetGrid && (!useHash)) {
                    resetGridSubMKernel<Index, IndexGrid, NDim>
                        <<<tv::cuda::getBlocks(numActIn),
                           tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
                            indicesIn.data_ptr<Index>(),
                            tv::torch2tv<IndexGrid>(gridsOut), ou, numActIn);
                    TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
                }
            });
        });
        if (failed) {
            return -1;
        }
    
        return numActIn;
    }
    
    

    There is no need to dig into how the dynamic dispatch mechanism inside create_submconv_indice_pair_cuda works:

    tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
    ....
    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
    ....
    });
    });
    

    Let's focus directly on the kernel launch:

            prepareSubMGridKernel<Index, IndexGrid, NDim>
                <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
                   stream>>>(tv::torch2tv<Index>(indicesIn),
                             tv::torch2tv<IndexGrid>(gridsOut), ou, spatialVolume);
    

    In the prepareSubMGridKernel launch, grid_size and block_size are in fact plain integer variables. block_size is tv::cuda::CUDA_NUM_THREADS, defined in include/tensorview/cuda_utils.h with a value of 1024, while grid_size is computed by tv::cuda::getBlocks(numActIn), where numActIn is the number of active input sites.

    template <typename T1, typename T2> inline int DivUp(const T1 a, const T2 b) {
      return (a + b - 1) / b;
    }
    
    // Use 1024 threads per block, which requires cuda sm_2x or above
    constexpr int CUDA_NUM_THREADS = 1024;
    // CUDA: number of blocks for threads.
    
    inline int getNumThreads(const int N) {
      if (N > CUDA_NUM_THREADS) {
        return CUDA_NUM_THREADS;
      }
      return DivUp(N, 32) * 32;
    }
    
    inline int getBlocks(const int N) {
      TV_ASSERT_RT_ERR(N > 0,
                       "CUDA kernel launch blocks must be positive, but got N=", N);
      return DivUp(N, getNumThreads(N));
    }
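
    As a quick check, the same computation re-implemented in Python (a hypothetical illustration, not part of spconv) shows how the launch configuration scales with the number of active sites:

    CUDA_NUM_THREADS = 1024

    def div_up(a, b):
        return (a + b - 1) // b

    def get_num_threads(n):
        return CUDA_NUM_THREADS if n > CUDA_NUM_THREADS else div_up(n, 32) * 32

    def get_blocks(n):
        assert n > 0, "CUDA kernel launch blocks must be positive"
        return div_up(n, get_num_threads(n))

    print(get_blocks(5000), get_num_threads(5000))  # -> 5 1024
    print(get_blocks(100), get_num_threads(100))    # -> 1 128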
    

    The purpose of prepareSubMGridKernel is to build a hash table from the output tensor coordinate (expressed as a flattened index) to the output sequence number.

    See include/spconv/indice.cu.h:

    // build a hash table from output tensor coordinates (expressed as a flattened index) to output sequence numbers
    template <typename Index, typename IndexGrid, unsigned NDim>
    __global__ void prepareSubMGridKernel(
        tv::TensorView<const Index> indicesIn,                // input voxel indices (N, 4)
        tv::TensorView<IndexGrid> gridsOut,                   // output grid (4, 41*1440*1440)
        const tv::SimpleVector<Index, NDim> outSpatialShape,  // output spatial shape [41, 1440, 1440]
        Index spatialVolume                                   // product of the output shape dimensions, 41*1440*1440
        ) {
      auto numActIn = indicesIn.dim(0); // torch.Size([N, 4]) => N
      Index index = 0;
      for (int ix : tv::KernelLoopX<int>(numActIn)) {
        // NDim is 3; index is the flattened output coordinate, ix is the output sequence number.
        // The flattening uses a templated, recursive helper, which looks complicated but simply
        // computes a row-major index:
        // (batch_id, z, y, x) --> index = (z * shape[1] + y) * shape[2] + x + batch_id * spatialVolume
        index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(indicesIn.data() + ix * (NDim + 1) + 1, outSpatialShape.data(), 0) 
        + spatialVolume * indicesIn(ix, 0); // indicesIn(ix, 0) is the batch id
        gridsOut[index] = ix;
      }
    }
    

    The index computation here just uses a templated, recursive formulation, which makes it look more complicated than it is. Letting new_indicesIn = indicesIn.data(), the index can be written out as:

    index = new_indicesIn[1 + 4*ix] * outSpatialShape[1] * outSpatialShape[2] +
            new_indicesIn[2 + 4*ix] * outSpatialShape[2] +
            new_indicesIn[3 + 4*ix] +
            spatialVolume * indicesIn(ix, 0);
    

    ArrayIndexRowMajor is located in include/tensorview/tensorview.h; its recursive formulation is shown below, followed by a small Python check of the flattening.

    template <int N, int Ndim> struct ArrayIndexRowMajor {
        // this array index provide almost same compiled code. compile it in
        // https://godbolt.org/ for more details.
        template <typename TShape, typename Tinit>
        TV_HOST_DEVICE_INLINE static unsigned runPtrs(const TShape *indexes, const TShape *shape, Tinit start) {
            return ArrayIndexRowMajor<N - 1, Ndim>::runPtrs(indexes, shape, (indexes[Ndim - N] + start) * shape[Ndim - N + 1]);
        }
    };
    
    template <int Ndim> struct ArrayIndexRowMajor<1, Ndim> {
        template <typename TShape, typename Tinit>
        TV_HOST_DEVICE_INLINE static unsigned runPtrs(const TShape *indexes, const TShape *shape, Tinit start) {
          return start + indexes[Ndim - 1];
        }
    };
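
    The following Python sketch (not from the spconv codebase) mirrors the recursion iteratively and confirms that it is just the usual row-major flattening:

    def array_index_row_major(indexes, shape):
        # iterative equivalent of ArrayIndexRowMajor<N, N>::runPtrs(indexes, shape, 0)
        start = 0
        for i in range(len(shape) - 1):
            start = (indexes[i] + start) * shape[i + 1]
        return start + indexes[-1]

    out_shape = [41, 1440, 1440]
    z, y, x = 5, 100, 200
    flat = array_index_row_major([z, y, x], out_shape)
    assert flat == (z * out_shape[1] + y) * out_shape[2] + x
    # adding batch_id * spatialVolume, as prepareSubMGridKernel does, gives the final gridsOut index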
    

    Next, let's look at the kernel getSubMIndicePairsKernel3:

    getSubMIndicePairsKernel3<Index, IndexGrid,K0, K1, K2>
       <<<tv::cuda::getBlocks(numActIn),tv::cuda::CUDA_NUM_THREADS, 0,stream>>>(
          tv::torch2tv<Index>(indicesIn),tv::torch2tv<IndexGrid>(gridsOut),
          tv::torch2tv<Index>(indicePairs),tv::torch2tv<Index>(indiceNum), ou_,spatialVolume);
    

    It is located in include/spconv/indice.cu.h:

    template <typename Index, typename IndexGrid, unsigned K0, unsigned K1, unsigned K2>
    __global__ void getSubMIndicePairsKernel3(
        tv::TensorView<const Index> indicesIn, // input features [N, 4]
        tv::TensorView<IndexGrid> gridsOut,    // hash table from output coordinate (flattened voxel index) to output sequence number
        tv::TensorView<Index> indicePairs,     // rulebook storing the input-output mapping of the sparse convolution, shape [2, 27, N]
        tv::TensorView<Index> indiceNum,       // total number of computations for each kernel position, shape [27]
        const tv::SimpleVector<Index, 3> outSpatialShape, // output spatial shape [41, 1440, 1440]
        Index spatialVolume                    // product of the output shape dimensions
        ) {
        auto numActIn = indicesIn.dim(0);      // numActIn is the number of active input sites
    
        Index point[3];
        Index index = 0;
        Index offset;
        constexpr unsigned KV = K0 * K1 * K2; // kernel volume, 3*3*3 = 27
        // the kernel volume and its center position
        constexpr unsigned center = KV / 2; // 27 / 2 = 13
        // for submanifold sparse convolution, the kernel center always interacts once with every active input element
        *(indiceNum.data() + center) = numActIn; // write numActIn at the center position of indiceNum
        for (int ix : tv::KernelLoopX<int>(numActIn)) { // similar to a C++11 range-based for loop
            // the input feature row for index ix
            const Index *indice_data = indicesIn.data() + ix * (3 + 1);
            // the three nested loops cover the kernel's D, H and W dimensions, of sizes K0, K1 and K2
            // #pragma unroll tells the compiler to unroll the loop at compile time
    #pragma unroll
        for (int i = 0; i < K0; ++i) {
    #pragma unroll
          for (int j = 0; j < K1; ++j) {
    #pragma unroll
            for (int k = 0; k < K2; ++k) {
              // compute the offset inside the kernel; for a 3x3x3 kernel (K0=K1=K2=3) offset runs from 0 to 26,
              // but offsets greater than center (13) are skipped below, so only offsets up to 13 are processed.
              offset = i * K1 * K2 + j * K2 + k;
              if (offset > center){
                continue;
              }
              // the kernel center always pairs each input element with itself: when offset == center,
              // the input index equals the output index equals ix
              if (center == offset){
                  // center of subm indice pairs dont need atomicadd
                  indicePairs(1, offset, ix) = ix;
                  indicePairs(0, offset, ix) = ix;
              }else{
                point[2] = indice_data[3] - k + K2 / 2; 
                point[1] = indice_data[2] - j + K1 / 2;
                point[0] = indice_data[1] - i + K0 / 2;
                if (point[1] >= 0 && point[1] < outSpatialShape[1] && 
                    point[2] >= 0 && point[2] < outSpatialShape[2] && 
                    point[0] >= 0 && point[0] < outSpatialShape[0]) {
                  // compute the flattened grid index
                  index = tv::ArrayIndexRowMajor<3, 3>::runPtrs(point, outSpatialShape.data(), 0) + spatialVolume * indice_data[0];
                  if (gridsOut[index] != -1) {
                      // for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i - 1]
                      // indiceNum stores the total number of computations for each kernel position
                      Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
                      atomicAdd(indiceNum.data() + KV - offset - 1, Index(1));
                      // The core of building the rulebook. Offsets only need to run up to the center
                      // because of the symmetry of submanifold sparse convolution; it boils down to these 4 lines:
                      indicePairs(1, offset, oldNum) = gridsOut[index];          // output sequence number
                      indicePairs(0, offset, oldNum) = ix;                       // input sequence number
                      indicePairs(1, KV - offset - 1, oldNum) = ix;              // output sequence number
                      indicePairs(0, KV - offset - 1, oldNum) = gridsOut[index]; // input sequence number
                  }
                }
              }
            }
          }
        }
      }
    }
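
    To summarize the logic above, here is a simplified, single-threaded Python sketch of submanifold rulebook construction. It is a didactic re-implementation under the same conventions, not spconv code: it replaces the GPU grid, the grid-stride loop and the atomics with a dictionary and ordinary loops.

    import numpy as np

    def build_subm_rulebook(indices, out_shape, ksize=(3, 3, 3)):
        """indices: (N, 4) int array of (batch, z, y, x); out_shape: (D, H, W)."""
        N = indices.shape[0]
        KV = ksize[0] * ksize[1] * ksize[2]
        center = KV // 2
        spatial_volume = int(np.prod(out_shape))

        # flattened (batch, z, y, x) -> output row (mirrors gridsOut, missing key meaning empty)
        grid = {}
        for ix, (b, z, y, x) in enumerate(indices):
            grid[b * spatial_volume + (z * out_shape[1] + y) * out_shape[2] + x] = ix

        indice_pairs = np.full((2, KV, N), -1, dtype=np.int32)
        indice_num = np.zeros(KV, dtype=np.int32)
        indice_num[center] = N  # the kernel center pairs every active site with itself

        for ix, (b, z, y, x) in enumerate(indices):
            for i in range(ksize[0]):
                for j in range(ksize[1]):
                    for k in range(ksize[2]):
                        offset = i * ksize[1] * ksize[2] + j * ksize[2] + k
                        if offset > center:
                            continue  # the second half follows by symmetry
                        if offset == center:
                            indice_pairs[0, offset, ix] = ix
                            indice_pairs[1, offset, ix] = ix
                            continue
                        pz = z - i + ksize[0] // 2
                        py = y - j + ksize[1] // 2
                        px = x - k + ksize[2] // 2
                        if not (0 <= pz < out_shape[0] and 0 <= py < out_shape[1] and 0 <= px < out_shape[2]):
                            continue
                        neighbor = grid.get(b * spatial_volume + (pz * out_shape[1] + py) * out_shape[2] + px, -1)
                        if neighbor == -1:
                            continue
                        n = indice_num[offset]
                        indice_num[offset] += 1
                        indice_num[KV - offset - 1] += 1
                        indice_pairs[0, offset, n] = ix        # input row
                        indice_pairs[1, offset, n] = neighbor  # output row
                        indice_pairs[0, KV - offset - 1, n] = neighbor
                        indice_pairs[1, KV - offset - 1, n] = ix
        return indice_pairs, indice_num

    coords = np.array([[0, 1, 2, 3], [0, 1, 2, 4]], dtype=np.int32)  # two neighboring active sites
    pairs, counts = build_subm_rulebook(coords, (8, 8, 8))
    # counts[13] == 2 (kernel center); the two offsets linking the neighbors each get count 1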
    

    Look at:

    for (int ix : tv::KernelLoopX<int>(numActIn)) {
    	......	
    }
    

    This is essentially a grid-stride loop, similar to an ordinary for loop; see include/tensorview/kernel_utils.h for the details:

    template <typename T, int NumILP = 1>
    __forceinline__ __device__ detail::KernelLoop<T> KernelLoopX(T count) {
      return detail::KernelLoop<T>(blockIdx.x * blockDim.x + threadIdx.x,
                                   gridDim.x * blockDim.x * NumILP, count);
    }
    

    With NumILP at its default value of 1, the stride is gridDim.x * blockDim.x. Each thread starts at its global index, which is below the total number of launched threads blockDim.x * gridDim.x, and advances by that stride, so the loop behaves like the following:

    {
        int idx    = threadIdx.x + blockIdx.x * blockDim.x;
        int stride = blockDim.x * gridDim.x;
                
        for(int i = idx; i < num; i += stride) {
            //...computation...
        }
    }
    

    The next article will cover how the actual sparse convolution computation is carried out using the constructed Rulebook.

    Reference: https://blog.csdn.net/ChuiGeDaQiQiu/article/details/127680713

  • Original article: https://blog.csdn.net/weixin_42905141/article/details/127938538