• 【MindSpore易点通】混合精度训练使用总结


    一、概述

    混合精度训练方法是通过混合使用单精度和半精度数据格式来加速深度神经网络训练的过程,同时保持了单精度训练所能达到的网络精度。混合精度训练能够加速计算过程,同时减少内存使用和存取,并使得在特定的硬件上可以训练更大的模型或batch size。MindSpore混合精度典型的计算流程如下图所示:

    1、参数以FP32存储;

    2、正向计算过程中,遇到FP16算子,需要把算子输入和参数从FP32 cast成FP16进行计算;

    3、将Loss层设置为FP32进行计算;

    4、反向计算过程中,首先乘以Loss Scale值,避免反向梯度过小而产生下溢;

    5、FP16参数参与梯度计算,其结果将被cast回FP32;

    6、除以Loss scale值,还原被放大的梯度;

    7、判断梯度是否存在溢出,如果溢出则跳过更新,否则优化器以FP32对原始参数进行更新。

    二、使用场景

    由于混合精度能带来加速计算,减少内存占用的优势,因此用户在遇到以下情况可以考虑使用混合精度:

    1、内存资源不足;

    2、训练速度较慢。

    三、使用条件

    本文档是针对以下两类使用场景的用户:

    1、即将启动MindSpore训练代码迁移任务,并对MindSpore有基础了解;

    2、已完成MindSpore训练代码迁移任务,即有可使用的MindSpore训练代码。

    四、使用样例

    1、MindSpore高阶API使用混合精度

    MindSpore在mindspore.Model接口中做了封装,方便用户调用。具体实现步骤与编写普通训练代码过程没有区别。只需要在Model中设置混合精度相关参数,如amp_level, loss_scale_manager, keep_batchnorm_fp32。

    修改高阶API代码中的Model接口,将amp_level设置成"O3",网络将采用混合精度进行训练。

    net = Model(net, loss, opt, metrics=metrics, amp_level="O3")  # amp_level="O3" enables mixed-precision training via the high-level Model wrapper (NOTE(review): rebinds `net` from the bare network to the Model object)

    2、MindSpore低阶API使用混合精度

    MindSpore低阶API使用混合精度,只需在MindSpore低阶API代码构造模型步骤中,将网络设置成混合精度进行训练。下面对比两者构造模型的区别。

    MindSpore低阶API代码中构造模型:

    # Full-precision (FP32) baseline listing: a forward+loss wrapper cell plus a
    # custom single-step training cell. Compare with the mixed-precision listing
    # that follows — the network cells are identical; only the construction step differs.
    1. class BuildTrainNetwork(nn.Cell):
    2.     '''Build train network.'''
    3.     def __init__(self, my_network, my_criterion, train_batch_size, class_num):
    4.         super(BuildTrainNetwork, self).__init__()
    5.         self.network = my_network
    6.         self.criterion = my_criterion
    7.         self.print = P.Print()
    8.         # Initialize self.output
    9.         self.output = mindspore.Parameter(Tensor(np.ones((train_batch_size, class_num)), mindspore.float32), requires_grad=False)  # non-trainable buffer used to expose the forward output to the caller
    10.     def construct(self, input_data, label):
    11.         output = self.network(input_data)
    12.         # Get the network output and assign it to self.output
    13.         self.output = output
    14.         loss0 = self.criterion(output, label)
    15.         return loss0
    16. class TrainOneStepCellV2(TrainOneStepCell):
    17.     def __init__(self, network, optimizer, sens=1.0):
    18.         super(TrainOneStepCellV2, self).__init__(network, optimizer, sens=1.0)
    19.     def construct(self, *inputs):
    20.         weights = self.weights
    21.         loss = self.network(*inputs)
    22.         # Obtain self.network from BuildTrainNetwork
    23.         output = self.network.output
    24.         sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)  # gradient sensitivity: a tensor of `sens` with the loss's shape/dtype
    25.         # Get the gradient of the network parameters
    26.         grads = self.grad(self.network, weights)(*inputs, sens)
    27.         grads = self.grad_reducer(grads)  # no-op in single-device training; all-reduce in data parallel
    28.         # Optimize model parameters
    29.         loss = F.depend(loss, self.optimizer(grads))
    30.         return loss, output
    31.         
    32. model_constructed = BuildTrainNetwork(net, loss_function, TRAIN_BATCH_SIZE, CLASS_NUM)
    33. model_constructed = TrainOneStepCellV2(model_constructed, opt)

    MindSpore低阶API混合精度代码中构造模型:

    1. class BuildTrainNetwork(nn.Cell):
    2.     '''Build train network.'''
    3.     def __init__(self, my_network, my_criterion, train_batch_size, class_num):
    4.         super(BuildTrainNetwork, self).__init__()
    5.         self.network = my_network
    6.         self.criterion = my_criterion
    7.         self.print = P.Print()
    8.         # Initialize self.output
    9.         self.output = mindspore.Parameter(Tensor(np.ones((train_batch_size, class_num)), mindspore.float32), requires_grad=False)
    10.     def construct(self, input_data, label):
    11.         output = self.network(input_data)
    12.         # Get the network output and assign it to self.output
    13.         self.output = output
    14.         loss0 = self.criterion(output, label)
    15.         return loss0
    16. class TrainOneStepCellV2(TrainOneStepCell):
    17.     '''Build train network.'''
    18.     def __init__(self, network, optimizer, sens=1.0):
    19.         super(TrainOneStepCellV2, self).__init__(network, optimizer, sens=1.0)
    20.     def construct(self, *inputs):
    21.         weights = self.weights
    22.         loss = self.network(*inputs)
    23.         # Obtain self.network from BuildTrainNetwork
    24.         output = self.network.output
    25.         sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)
    26.         # Get the gradient of the network parameters
    27.         grads = self.grad(self.network, weights)(*inputs, sens)
    28.         grads = self.grad_reducer(grads)
    29.         # Optimize model parameters
    30.         loss = F.depend(loss, self.optimizer(grads))
    31.         return loss, output
    32.         def build_train_network_step2(network, optimizer, loss_fn=None, level='O0', **kwargs):
    33.     """
    34.     Build the mixed precision training cell automatically.
    35.     """
    36.     amp.validator.check_value_type('network', network, nn.Cell)
    37.     amp.validator.check_value_type('optimizer', optimizer, nn.Optimizer)
    38.     amp.validator.check('level', level, "", ['O0', 'O2', 'O3', "auto"], amp.Rel.IN)
    39.     if level == "auto":
    40.         device_target = context.get_context('device_target')
    41.         if device_target == "GPU":
    42.             level = "O2"
    43.         elif device_target == "Ascend":
    44.             level = "O3"
    45.         else:
    46.             raise ValueError("Level `auto` only support when `device_target` is GPU or Ascend.")
    47.     amp._check_kwargs(kwargs)
    48.     config = dict(amp._config_level[level], **kwargs)
    49.     config = amp.edict(config)
    50.     if config.cast_model_type == mstype.float16:
    51.         network.to_float(mstype.float16)
    52.         if config.keep_batchnorm_fp32:
    53.             amp._do_keep_batchnorm_fp32(network)
    54.     if loss_fn:
    55.         network = amp._add_loss_network(network, loss_fn, config.cast_model_type)
    56.     if amp._get_parallel_mode() in (amp.ParallelMode.SEMI_AUTO_PARALLEL, amp.ParallelMode.AUTO_PARALLEL):
    57.         network = amp._VirtualDatasetCell(network)
    58.     loss_scale = 1.0
    59.     if config.loss_scale_manager is not None:
    60.         loss_scale_manager = config.loss_scale_manager
    61.         loss_scale = loss_scale_manager.get_loss_scale()
    62.         update_cell = loss_scale_manager.get_update_cell()
    63.         if update_cell is not None:
    64.             # only cpu not support `TrainOneStepWithLossScaleCell` for control flow.
    65.             if not context.get_context("enable_ge"and context.get_context("device_target") == "CPU":
    66.                 raise ValueError("Only `loss_scale_manager=None` and "
    67.                 "`loss_scale_manager=FixedLossScaleManager`"
    68.                 "are supported in current version. If you use `O2` option,"
    69.                 "use `loss_scale_manager=None` or `FixedLossScaleManager`")
    70.             network = TrainOneStepCellV2(network, optimizer)
    71.             return network
    72.     network = TrainOneStepCellV2(network, optimizer)
    73.     return network
    74. model_constructed = BuildTrainNetwork(net, loss_function, TRAIN_BATCH_SIZE, CLASS_NUM)
    75. model_constructed = build_train_network_step2(model_constructed, opt, level="O3")

    五、性能对比

    相比全精度训练,使用混合精度后,获得了可观的性能提升。

    低阶API: 2000 imgs/sec ;低阶API混合精度: 3200 imgs/sec

    高阶API: 2200 imgs/sec ;高阶API混合精度: 3300 imgs/sec 

  • 相关阅读:
    UE4 通过互动(键盘按键)开门
    欧拉公式 Euler‘s Formula
    数据链路层概述
    51、ElasticSearch RestHighLevelClient 索引库、文档
    MySQL中tinyint(1)与tinyint(2)的区别
    UE4 回合游戏项目 02- 创建人物-敌人角色(动画蓝图练习)
    pyflink 安装和测试
    Verilog HDL
    Centos7+Hadoop3.3.4+KDC1.15集成认证
    【个人笔记本】本地化部署详细流程 LLaMA中文模型:Chinese-LLaMA-Alpaca-2
  • 原文地址:https://blog.csdn.net/Kenji_Shinji/article/details/127650279