• 3D Sparse Convolution: spconv Source Code Analysis (Part 5)


    This part walks through how the actual sparse convolution is computed from the Rulebook built earlier. We continue with the SparseConvolution class, located in spconv/conv.py:

    class SparseConvolution(SparseModule):
        __constants__ = [
            'stride', 'padding', 'dilation', 'groups', 'bias', 'subm', 'inverse',
            'transposed', 'output_padding', 'fused_bn'
        ]
    
        def __init__(self,
                     ndim,              # feature dimensionality of the data; ndim is 3 for SubMConv3d and SparseConv3d
                     in_channels,       # number of input channels
                     out_channels,      # number of output channels
                     kernel_size=3,     # convolution kernel size
                     stride=1,          # stride
                     padding=0,         # padding
                     dilation=1,        # dilated convolution: spacing between kernel elements; dilation=1 is the ordinary convolution
                     groups=1,          # grouped convolution: splits in_channels and out_channels into matching groups; default 1
                     bias=True,         # bias
                     subm=False,        # distinguishes standard sparse 3D convolution from submanifold sparse convolution
                     output_padding=0,  # output padding, default 0
                     transposed=False,  # transposed convolution, default False
                     inverse=False,     # inverse convolution, default False
                     indice_key=None,   # indice key; submanifold convolution keeps the input/output indices and the output spatial shape, so they can be cached in a dict and reused
                     fused_bn=False,    # fuse conv and bn
                     use_hash=False,    # whether to use a hash table when building indice pairs (CPU path); default False
                     algo=ops.ConvAlgo.Native): # 3 memory-allocation strategies, values 0, 1, 2; default 0
            super(SparseConvolution, self).__init__()
            assert groups == 1
            if not isinstance(kernel_size, (list, tuple)):
                kernel_size = [kernel_size] * ndim
            if not isinstance(stride, (list, tuple)):
                stride = [stride] * ndim
            if not isinstance(padding, (list, tuple)):
                padding = [padding] * ndim
            if not isinstance(dilation, (list, tuple)):
                dilation = [dilation] * ndim
            if not isinstance(output_padding, (list, tuple)):
                output_padding = [output_padding] * ndim
    
            for d, s in zip(dilation, stride): # at least one of dilation and stride must be 1
                assert any([s == 1, d == 1]), "don't support this."
    
            self.ndim = ndim # 3 for 3D sparse convolution
            self.in_channels = in_channels # e.g. 5
            self.out_channels = out_channels # e.g. 16
            self.kernel_size = kernel_size # e.g. 3
            self.conv1x1 = np.prod(kernel_size) == 1 # product of all kernel dims; True only for a 1x1x1 kernel
            self.stride = stride # e.g. 1
            self.padding = padding # e.g. 1
            self.dilation = dilation # dilated convolution: spacing between kernel elements; dilation=1 is the ordinary convolution
            self.transposed = transposed # False
            self.inverse = inverse # False
            self.output_padding = output_padding # 0
            self.groups = groups # grouped convolution: splits in_channels and out_channels into matching groups; default 1
            self.subm = subm # False; distinguishes standard sparse 3D convolution from submanifold sparse convolution
            self.indice_key = indice_key # indice key; submanifold convolution keeps input/output indices and spatial shape, so they can be cached and reused
            self.fused_bn = fused_bn # fuse conv and bn
            self.use_hash = use_hash # the CPU version uses a hash table
            self.algo = algo.value # enum value; 0, 1, 2 correspond to the 3 memory-allocation strategies
            # torch.nn.Parameter() turns a non-trainable tensor into a trainable Parameter and registers it on the module,
            # so that it is updated during training.
            # weight shape: (*kernel_size, in_channels, out_channels), e.g. kernel volume 27, in 16, out 32
            self.weight = Parameter(
                torch.Tensor(*kernel_size, in_channels, out_channels))
            if bias:
                self.bias = Parameter(torch.Tensor(out_channels))
            else:
                # register the bias parameter on the module under the given name so the attribute still exists when it is None
                self.register_parameter('bias', None)
            self.reset_parameters()
    
        def reset_parameters(self):
            n = self.in_channels
            init.kaiming_uniform_(self.weight, a=math.sqrt(5))
            if self.bias is not None:
                fan_in, _ = _calculate_fan_in_and_fan_out_hwio(self.weight)
                bound = 1 / math.sqrt(fan_in)
                init.uniform_(self.bias, -bound, bound)
    
        def forward(self, input):
            assert isinstance(input, spconv.SparseConvTensor)
            features = input.features # (N, 5)
            device = features.device # e.g. cuda:0
            indices = input.indices # (N, 4) voxel grid coordinates (batch_id, z, y, x)
            spatial_shape = input.spatial_shape # e.g. [41, 1440, 1440]
            batch_size = input.batch_size # e.g. 4

            # 1. compute the output spatial shape
            if not self.subm: # standard sparse 3D convolution
                if self.transposed: # False
                    out_spatial_shape = ops.get_deconv_output_size(
                        spatial_shape, self.kernel_size, self.stride, self.padding, self.dilation, self.output_padding)
                else:
                    # standard sparse convolution: compute the output spatial shape, e.g. [41, 1440, 1440] -> [21, 720, 720]
                    out_spatial_shape = ops.get_conv_output_size(
                        spatial_shape, self.kernel_size, self.stride, self.padding, self.dilation)
            else: # SubMConv (submanifold) convolution
                # submanifold convolution keeps the output spatial shape equal to the input shape
                out_spatial_shape = spatial_shape # [41, 1440, 1440]
            # input.update_grid(out_spatial_shape)
            # t = time.time()
            if self.conv1x1: # handle 1x1 convolution separately
                features = torch.mm(
                    input.features,
                    self.weight.view(self.in_channels, self.out_channels))
                if self.bias is not None:
                    features += self.bias
                out_tensor = spconv.SparseConvTensor(features, input.indices,input.spatial_shape,input.batch_size)
                out_tensor.indice_dict = input.indice_dict
                out_tensor.grid = input.grid
                return out_tensor
            # input is a SparseConvTensor; submconv3d does not change the input/output indices or the output spatial shape
            # if this layer's outids, indice_pairs, indice_pair_num, out_spatial_shape match a previously computed layer, they are cached in a dict and reused to avoid recomputation
            datas = input.find_indice_pair(self.indice_key)
    
            # 2. build the Rulebook by calling the C++ interface directly from Python
            if self.inverse: # False; handles inverse convolution
                assert datas is not None and self.indice_key is not None
                _, outids, indice_pairs, indice_pair_num, out_spatial_shape = datas
                assert indice_pair_num.shape[0] == np.prod(self.kernel_size), "inverse conv must have same kernel size as its couple conv"
            else: # not an inverse convolution
                # e.g. self.indice_key = 'subm1' and datas = None
                if self.indice_key is not None and datas is not None:
                    outids, _, indice_pairs, indice_pair_num, _ = datas
                else:
                    # indices: [N, 4], the indices attribute of the input, i.e. the voxel grid coordinates
                    # outids: [N, 4]; thanks to the submanifold property outids equals indices; for standard spconv they differ
                    # indice_pairs: [2, 27, N]; 2 is the input/output direction (position 0 stores offsets into the input indices, position 1 stores offsets into outids), 27 is the kernel volume 3x3x3, N is the number of active input features
                    # indice_pair_num: [27]; number of computations at each kernel position, since in sparse convolution each kernel element can match a different number of active inputs
                    # return {indices.to(device), indicePairs.to(device), indiceNum.to(device)};
                    outids, indice_pairs, indice_pair_num = ops.get_indice_pairs(
                        indices,            # (N, 4)
                        batch_size,         # 4
                        spatial_shape,      # [41, 1440, 1440]
                        self.kernel_size,   # 3
                        self.stride,
                        self.padding,
                        self.dilation,
                        self.output_padding,
                        self.subm,
                        self.transposed,
                        grid=input.grid,
                        use_hash=self.use_hash)
                    # store the index information in the input's indice dict
                    input.indice_dict[self.indice_key] = (outids,indices, indice_pairs,indice_pair_num,spatial_shape)
            
            # 3. perform the actual sparse convolution using the Rulebook built above
            if self.fused_bn: # False
                assert self.bias is not None
                out_features = ops.fused_indice_conv(features, self.weight, self.bias, indice_pairs.to(device), indice_pair_num, outids.shape[0], self.inverse, self.subm)
            else:
                if self.subm: # submanifold sparse convolution
                    # Fsp.indice_subm_conv and Fsp.indice_conv go through the SubMConvFunction and SparseConvFunction objects in functional.py and end up calling indice_conv etc. in the ops module.
                    # Ultimately they all call the C++ extension shared library API in the form torch.ops.spconv.xx.
                    out_features = Fsp.indice_subm_conv(features,           # input features (N, 5)
                                                        self.weight,        # weights (27, 16, 32)
                                                        indice_pairs.to(device),  # [2, 27, N]
                                                        indice_pair_num,    # [27]; number of computations at each kernel position
                                                        outids.shape[0],    # N
                                                        self.algo           # enum value; 0, 1, 2 correspond to the 3 memory-allocation strategies
                                                        )
                else:
                    if self.inverse:
                        out_features = Fsp.indice_inverse_conv(features, self.weight, indice_pairs.to(device),indice_pair_num, outids.shape[0], self.algo)
                    else:
                        out_features = Fsp.indice_conv(features, self.weight,indice_pairs.to(device),indice_pair_num,outids.shape[0], self.algo)
    
                if self.bias is not None:
                    out_features += self.bias
            out_tensor = spconv.SparseConvTensor(out_features, outids,out_spatial_shape, batch_size)
            out_tensor.indice_dict = input.indice_dict
            out_tensor.grid = input.grid
            return out_tensor
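
    To make the forward path above concrete, here is a minimal usage sketch. It assumes the spconv 1.x style API used throughout this series (spconv.SparseConvTensor, spconv.SubMConv3d, spconv.SparseConv3d); the shapes and the indice_key name are illustrative.

    import torch
    import spconv

    # two active voxels from a single sample (hypothetical data)
    features = torch.randn(2, 5).cuda()                                   # (N, in_channels)
    indices = torch.tensor([[0, 10, 100, 200],
                            [0, 11, 101, 201]], dtype=torch.int32).cuda() # (N, 4): (batch_id, z, y, x)
    x = spconv.SparseConvTensor(features, indices,
                                spatial_shape=[41, 1440, 1440], batch_size=1)

    # submanifold conv: output indices and spatial shape stay the same,
    # so its rulebook is cached under indice_key and can be reused by later subm layers
    subm = spconv.SubMConv3d(5, 16, kernel_size=3, padding=1,
                             bias=False, indice_key="subm1").cuda()
    # standard sparse conv with stride 2: spatial shape shrinks and new output indices are produced
    conv = spconv.SparseConv3d(16, 32, kernel_size=3, stride=2,
                               padding=1, bias=False).cuda()

    y = subm(x)   # spatial_shape stays [41, 1440, 1440]
    z = conv(y)   # spatial_shape becomes [21, 720, 720]
    print(y.spatial_shape, z.spatial_shape)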
    

    Fsp.indice_subm_conv and Fsp.indice_conv go through the SubMConvFunction and SparseConvFunction objects in spconv/functional.py, and eventually still call indice_conv and related functions in the spconv/ops.py module.

    Let's first look at the submanifold convolution entry point, indice_subm_conv, in spconv/functional.py:

    class SubMConvFunction(Function):
        @staticmethod
        def forward(ctx, features, filters, indice_pairs, indice_pair_num,num_activate_out, algo):
            ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
            ctx.algo = algo
            return ops.indice_conv(features,        # input features (N, 5)
                                   filters,         # weights (27, 16, 32)
                                   indice_pairs,    # [2, 27, N]
                                   indice_pair_num, # [27]; number of computations at each kernel position
                                   num_activate_out,# N, number of active outputs
                                   False,           # inverse: False
                                   True,            # subm: True for submanifold convolution
                                   algo=algo        # enum value; 0, 1, 2 correspond to the 3 memory-allocation strategies
                                   )
    
        @staticmethod
        def backward(ctx, grad_output):
            indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
            input_bp, filters_bp = ops.indice_conv_backward(features,
                                                            filters,
                                                            grad_output,
                                                            indice_pairs,
                                                            indice_pair_num,
                                                            False,
                                                            True,
                                                            algo=ctx.algo)
    
            return input_bp, filters_bp, None, None, None, None
    
    indice_subm_conv = SubMConvFunction.apply
    

    Calling the C++ function directly through a plain Python wrapper is not very elegant, so torch.autograd.Function is used here to wrap the low-level call to the operator.

    The Function class represents a differentiable PyTorch function: once its forward and backward implementations are defined, we can use it like an ordinary PyTorch function, and PyTorch will automatically dispatch it to run the forward and backward computations as needed.

    As an aside, for model deployment the Function class has a very useful property: if it defines a symbolic static method, the Function can be converted into an ONNX operator during torch.onnx.export() according to the rules defined in symbolic.

    apply is a method of torch.autograd.Function that performs the dispatch of the Function during forward inference or backward propagation. indice_subm_conv = SubMConvFunction.apply gives this call a shorter alias, indice_subm_conv; from then on we can use the operator through the indice_subm_conv interface and ignore the implementation details of SubMConvFunction.
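
    As a minimal, self-contained illustration of this pattern (a toy function, not spconv code): a custom Function only needs forward and backward, and apply becomes the user-facing entry point.

    import torch
    from torch.autograd import Function

    class ScaleFunction(Function):
        # toy example: y = x * scale, wrapped the same way SubMConvFunction wraps ops.indice_conv
        @staticmethod
        def forward(ctx, x, scale):
            ctx.scale = scale  # stash what backward needs (cf. ctx.save_for_backward for tensors)
            return x * scale

        @staticmethod
        def backward(ctx, grad_output):
            # return one gradient per forward input; non-tensor inputs get None
            return grad_output * ctx.scale, None

    scale = ScaleFunction.apply  # shorter alias, just like indice_subm_conv = SubMConvFunction.apply

    x = torch.randn(3, requires_grad=True)
    y = scale(x, 2.0)
    y.sum().backward()
    print(x.grad)  # tensor([2., 2., 2.])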

    The forward pass of SubMConvFunction calls the indice_conv function in spconv/ops.py:

    def indice_conv(features,
                    filters,
                    indice_pairs,
                    indice_pair_num,
                    num_activate_out,
                    inverse=False,
                    subm=False,
                    algo=ConvAlgo.Native.value):
        return torch.ops.spconv.indice_conv(features, filters, indice_pairs, indice_pair_num,
                                            num_activate_out, int(inverse), int(subm), algo)
    

    In src/spconv/all.cc the low-level C++ API is registered through the operator-registration mechanism provided by PyTorch:

    static auto registry =
        torch::RegisterOperators()
            .op("spconv::get_indice_pairs", &spconv::getIndicePairs)
            .op("spconv::indice_conv", &spconv::indiceConv)
    

    The .so file is loaded via torch.ops.load_library, and the indiceConv function in src/spconv/spconv_ops.cc is then invoked as torch.ops.spconv.indice_conv.
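
    On the Python side this looks roughly as follows (a sketch only; the actual library name and path depend on how spconv was built and are handled in spconv's own package initialization, so the file name here is illustrative):

    import torch

    # load the compiled extension so that the ops registered in all.cc
    # become visible under torch.ops.spconv.* (path is illustrative)
    torch.ops.load_library("libspconv.so")

    # afterwards the registered C++ function can be called directly, e.g.:
    # out = torch.ops.spconv.indice_conv(features, filters, indice_pairs,
    #                                    indice_pair_num, num_activate_out,
    #                                    int(inverse), int(subm), algo)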

    Now let's look at the indiceConv function in src/spconv/spconv_ops.cc in detail:

    torch::Tensor indiceConv(torch::Tensor features,   // input features (N, 5)
                             torch::Tensor filters,    // weights (27, 16, 32)
                             torch::Tensor indicePairs,// [2, 27, N]
                             torch::Tensor indiceNum,  // [27]; number of computations at each kernel position
                             int64_t numActOut,        // N, number of active outputs
                             int64_t _inverse,         // inverse convolution, false here
                             int64_t _subM,            // submanifold convolution, true here
                             int64_t algo              // enum value; 0, 1, 2 correspond to the 3 memory-allocation strategies
                             ) {
      auto kernelVolume = indiceNum.size(0); // kernel volume, e.g. 27 for a 3x3x3 kernel
      switch (algo) {         // default 0 (kNative)
      case kBatchGemmGather:  // 2
      case kBatch: {          // 1
        if (kernelVolume != 1) {
          return indiceConvBatch(features, filters, indicePairs, indiceNum,numActOut, _inverse, _subM,algo != kBatchGemmGather);
        } 
        else {
          break;
        }
      }
      case kNative:
        break;
      default:
        TV_THROW_RT_ERR("unknown algo");
      }
      // auto timer = spconv::CudaContextTimer<>();
    
      bool subM = _subM != 0;
      bool inverse = _inverse != 0;
      auto device = features.device().type();
      auto ndim = filters.dim() - 2;
      auto numInPlanes = features.size(1);
      auto numOutPlanes = filters.size(ndim + 1);
      auto indicePairNumCpu = indiceNum.to({torch::kCPU});
    
      auto options =
          torch::TensorOptions().dtype(features.dtype()).device(features.device());
      torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
      filters = filters.view({-1, numInPlanes, numOutPlanes});
    
      // init for subM
      int indicePairMaxOffset = kernelVolume / 2;
      int indicePairMaxSize = numActOut;
      if (subM) { // the center index of subm conv don't need gather and scatter
        // add.
        torch::mm_out(output, features, filters[indicePairMaxOffset]);
    
        // get indice pair second max size based on subM symmetric property
        indicePairMaxSize =
          *std::max_element(indicePairNumCpu.data_ptr<int>(),
                            indicePairNumCpu.data_ptr<int>() + indicePairMaxOffset);
        if (indicePairMaxSize == 0) {
          return output;
        }
      } else {
        indicePairMaxSize =
          *std::max_element(indicePairNumCpu.data_ptr<int>(),
                            indicePairNumCpu.data_ptr<int>() + kernelVolume);
      }
    
      torch::Tensor inputBuffer =
          torch::empty({indicePairMaxSize, numInPlanes}, options);
      torch::Tensor outputBuffer =
          torch::empty({indicePairMaxSize, numOutPlanes}, options);
    
      double totalGatherTime = 0;
      double totalGEMMTime = 0;
      double totalSAddTime = 0;
      // tv::ssprint("first subm gemm time", timer.report() / 1000.0);
    
      for (int i = 0; i < kernelVolume; ++i) {
        auto nHot = indicePairNumCpu.data_ptr<int>()[i];
        if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
          continue;
        }
        // TODO torch::from_blob is a little slow
        auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(),
                                                 {nHot, numOutPlanes}, options);
        auto inputBufferBlob =
            torch::from_blob(inputBuffer.data_ptr(), {nHot, numInPlanes}, options);
    
        if (device == torch::kCPU) {
          sparse_gather_cpu(inputBuffer, features, indicePairs[inverse][i], nHot);
        }
    #ifdef TV_CUDA
        else if (device == torch::kCUDA) {
          sparse_gather_cuda(inputBuffer, features, indicePairs[inverse][i], nHot);
          /* slower than SparseGatherFunctor, may due to int->long conversion
          auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
          auto indicePairBlob = torch::from_blob(indicePairLong.data(),
          {nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
          features, 0, indicePairBlob);*/
        }
    #endif
        else {
          TV_THROW_INVALID_ARG("unknown device type");
        }
        // totalGatherTime += timer.report() / 1000.0;
        torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
        // totalGEMMTime += timer.report() / 1000.0;
    
        if (device == torch::kCPU) {
          sparse_scatter_add_cpu(outputBuffer, output, indicePairs[!inverse][i],
                                 nHot);
        }
    #ifdef TV_CUDA
        else if (device == torch::kCUDA) {
          sparse_scatter_add_cuda(outputBuffer, output, indicePairs[!inverse][i],
                                  nHot);
        }
    #endif
        else {
          TV_THROW_INVALID_ARG("unknown device type");
        }
        // totalSAddTime += timer.report() / 1000.0;
      }
      // tv::ssprint(totalGatherTime, totalGEMMTime, totalSAddTime);
      return output;
    }
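
    The core of indiceConv is the gather-GEMM-scatter-add loop over kernel positions: for each kernel offset, gather the input rows listed in the rulebook, multiply them by that offset's filter slice, and scatter-add the result into the output rows. The sketch below shows the same idea in plain PyTorch (simplified: it ignores the subM center shortcut and the pre-allocated inputBuffer/outputBuffer reuse; tensor shapes follow the comments above):

    import torch

    def indice_conv_sketch(features, filters, indice_pairs, indice_pair_num, num_act_out):
        # features: (N_in, C_in); filters: (kernel_volume, C_in, C_out)
        # indice_pairs: (2, kernel_volume, N_pair); indice_pair_num: (kernel_volume,)
        kernel_volume, c_in, c_out = filters.shape
        output = torch.zeros(num_act_out, c_out,
                             dtype=features.dtype, device=features.device)
        for i in range(kernel_volume):
            n_hot = int(indice_pair_num[i])
            if n_hot <= 0:
                continue
            in_idx = indice_pairs[0, i, :n_hot].long()   # gather: input rows touched by this kernel offset
            out_idx = indice_pairs[1, i, :n_hot].long()  # scatter: output rows they contribute to
            buf = features[in_idx] @ filters[i]          # GEMM on the gathered slice
            output.index_add_(0, out_idx, buf)           # scatter-add into the output
        return output

    This mirrors the sparse_gather_cpu/cuda, torch::mm_out and sparse_scatter_add_cpu/cuda calls in the C++ code; the real implementation additionally reuses pre-allocated buffers and, for subM, handles the kernel center with a single torch::mm_out over all active features.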
    


  • Original article: https://blog.csdn.net/weixin_42905141/article/details/127974542