• Help wanted - MindSpore training problem


    Environment: Windows 10 x64, Anaconda virtual environment, MindSpore 1.3.0, Python 3.7.5

    Code: the Huawei "medical image segmentation" (UNet) sample
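
    The snippets below do not show their imports. A plausible prelude, assuming MindSpore 1.3 and the sample's own src package (the commented src.* paths are guesses based on the model zoo layout, not part of the post):

    #%%

    import time
    import numpy as np

    import mindspore
    import mindspore.nn as nn
    import mindspore.train.loss_scale_manager  # used below via the full dotted path
    from mindspore import Tensor, context
    from mindspore.context import ParallelMode
    from mindspore.communication.management import init, get_group_size
    from mindspore.train import Model
    from mindspore.train.callback import Callback, ModelCheckpoint, CheckpointConfig
    from mindspore.train.serialization import load_checkpoint, load_param_into_net

    # Project-specific pieces are provided by the sample's src package (exact paths assumed):
    # from src.unet_medical import UNet
    # from src.data_loader import create_dataset
    # from src.loss import CrossEntropyWithLogits
    # from src.config import cfg_unet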

    #%%
    
    class StepLossTimeMonitor(Callback):
    
        def __init__(self, batch_size, per_print_times=1):
            super(StepLossTimeMonitor, self).__init__()
            if not isinstance(per_print_times, int) or per_print_times < 0:
                raise ValueError("print_step must be int and >= 0.")
            self._per_print_times = per_print_times
            self.batch_size = batch_size
    
        def step_begin(self, run_context):
            self.step_time = time.time()
    
        def step_end(self, run_context):
    
            step_seconds = time.time() - self.step_time
            step_fps = self.batch_size*1.0/step_seconds
    
            cb_params = run_context.original_args()
            loss = cb_params.net_outputs
    
            if isinstance(loss, (tuple, list)):
                if isinstance(loss[0], Tensor) and isinstance(loss[0].asnumpy(), np.ndarray):
                    loss = loss[0]
    
            if isinstance(loss, Tensor) and isinstance(loss.asnumpy(), np.ndarray):
                loss = np.mean(loss.asnumpy())
    
            cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
    
            if isinstance(loss, float) and (np.isnan(loss) or np.isinf(loss)):
                raise ValueError("epoch: {} step: {}. Invalid loss, terminating training.".format(
                    cb_params.cur_epoch_num, cur_step_in_epoch))
            if self._per_print_times != 0 and cb_params.cur_step_num % self._per_print_times == 0:
                # TEST
                print("step: %s, loss is %s, fps is %s" % (cur_step_in_epoch, loss, step_fps), flush=True)
    
    
    #%%
    
    def train_net(data_dir, cross_valid_ind=1, epochs=400, batch_size=16, lr=0.0001, run_distribute=False, cfg=None):
        if run_distribute:
            init()
            group_size = get_group_size()
            parallel_mode = ParallelMode.DATA_PARALLEL
            context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                              device_num=group_size,
                                              gradients_mean=False)
        net = UNet(n_channels=cfg['num_channels'], n_classes=cfg['num_classes'])
    
        if cfg['resume']:
            param_dict = load_checkpoint(cfg['resume_ckpt'])
            load_param_into_net(net, param_dict)
    
        criterion = CrossEntropyWithLogits()
        train_dataset, _ = create_dataset(data_dir, epochs, batch_size, True, cross_valid_ind, run_distribute)
        train_data_size = train_dataset.get_dataset_size()
        print("dataset length is:", train_data_size)
        ckpt_config = CheckpointConfig(save_checkpoint_steps=train_data_size,
                                       keep_checkpoint_max=cfg['keep_checkpoint_max'])
        # NOTE: device_id is not defined in this excerpt; it is assumed to be set
        # earlier in the original script (e.g. read from an environment variable).
        ckpoint_cb = ModelCheckpoint(prefix='ckpt_unet_medical_adam',
                                     directory='./ckpt_{}/'.format(device_id),
                                     config=ckpt_config)
    
        optimizer = nn.Adam(params=net.trainable_params(), learning_rate=lr, weight_decay=cfg['weight_decay'],
                            loss_scale=cfg['loss_scale'])
    
        loss_scale_manager = mindspore.train.loss_scale_manager.FixedLossScaleManager(cfg['FixedLossScaleManager'], False)
    
        model = Model(net, loss_fn=criterion, loss_scale_manager=loss_scale_manager, optimizer=optimizer, amp_level="O3")
    
        print("============== Starting Training ==============")
        model.train(2, train_dataset, callbacks=[StepLossTimeMonitor(batch_size=batch_size), ckpoint_cb],
                    dataset_sink_mode=False)
        print("============== End Training ==============")
    
    
    #%%
    
    data_url = './data'
    
    run_distribute = False
    epoch_size = cfg_unet['epochs'] if not run_distribute else cfg_unet['distribute_epochs']
    
    train_net(data_dir=data_url, cross_valid_ind=cfg_unet['cross_valid_ind'], epochs=epoch_size,
              batch_size=cfg_unet['batchsize'], lr=cfg_unet['lr'], run_distribute=run_distribute,
              cfg=cfg_unet)
    
    When the last cell runs, it fails with the following error:
    
    [WARNING] ME(3580:16232,MainProcess):2021-12-25-00:30:59.151.94 [mindspore\nn\loss\loss.py:103] '_Loss' is deprecated from version 1.3 and will be removed in a future version, use 'LossBase' instead.
    

    dataset length is: 600
    ============== Starting Training ==============
    

    ---------------------------------------------------------------------------

    RuntimeError                              Traceback (most recent call last)

    ~\AppData\Local\Temp/ipykernel_3580/3195514145.py in <module>
          6 train_net(data_dir=data_url, cross_valid_ind=cfg_unet['cross_valid_ind'], epochs=epoch_size,
          7           batch_size=cfg_unet['batchsize'], lr=cfg_unet['lr'], run_distribute=run_distribute,
    ----> 8           cfg=cfg_unet)
          9
         10

    ~\AppData\Local\Temp/ipykernel_3580/2411766277.py in train_net(data_dir, cross_valid_ind, epochs, batch_size, lr, run_distribute, cfg)
         32     print("============== Starting Training ==============")
         33     model.train(2, train_dataset, callbacks=[StepLossTimeMonitor(batch_size=batch_size), ckpoint_cb],
    ---> 34                 dataset_sink_mode=False)
         35     print("============== End Training ==============")
         36

    D:\ProgramData\Anaconda3\envs\Mindspore13-py375\lib\site-packages\mindspore\train\model.py in train(self, epoch, train_dataset, callbacks, dataset_sink_mode, sink_size)
        647                             callbacks=callbacks,
        648                             dataset_sink_mode=dataset_sink_mode,
    --> 649                             sink_size=sink_size)
        650
        651     def _eval_dataset_sink_process(self, valid_dataset, list_callback=None, cb_params=None):

    D:\ProgramData\Anaconda3\envs\Mindspore13-py375\lib\site-packages\mindspore\train\model.py in _train(self, epoch, train_dataset, callbacks, dataset_sink_mode, sink_size)
        431         self._check_reuse_dataset(train_dataset)
        432         if not dataset_sink_mode:
    --> 433             self._train_process(epoch, train_dataset, list_callback, cb_params)
        434         elif context.get_context("device_target") == "CPU":
        435             logger.warning("The CPU cannot support dataset sink mode currently."

    D:\ProgramData\Anaconda3\envs\Mindspore13-py375\lib\site-packages\mindspore\train\model.py in _train_process(self, epoch, train_dataset, list_callback, cb_params)
        556                 cb_params.train_dataset_element = next_element
        557                 list_callback.step_begin(run_context)
    --> 558                 outputs = self._train_network(*next_element)
        559                 cb_params.net_outputs = outputs
        560                 if self._loss_scale_manager and self._loss_scale_manager.get_drop_overflow_update():

    D:\ProgramData\Anaconda3\envs\Mindspore13-py375\lib\site-packages\mindspore\nn\cell.py in __call__(self, *inputs, **kwargs)
        384         if self.enable_hook:
        385             raise ValueError("The graph mode does not support hook function.")
    --> 386         out = self.compile_and_run(*inputs)
        387         return out
        388

    D:\ProgramData\Anaconda3\envs\Mindspore13-py375\lib\site-packages\mindspore\nn\cell.py in compile_and_run(self, *inputs)
        642         """
        643         self._auto_parallel_compile_and_run = True
    --> 644         self.compile(*inputs)
        645
        646         new_inputs = []

    D:\ProgramData\Anaconda3\envs\Mindspore13-py375\lib\site-packages\mindspore\nn\cell.py in compile(self, *inputs)
        629             inputs (tuple): Inputs of the Cell object.
        630         """
    --> 631         _executor.compile(self, *inputs, phase=self.phase, auto_parallel_mode=self._auto_parallel_mode)
        632
        633     def compile_and_run(self, *inputs):

    D:\ProgramData\Anaconda3\envs\Mindspore13-py375\lib\site-packages\mindspore\common\api.py in compile(self, obj, phase, do_convert, auto_parallel_mode, *args)
        529         enable_ge = context.get_context("enable_ge")
        530         use_vm = not enable_ge or (enable_debug_runtime and context.get_context("mode") == context.PYNATIVE_MODE)
    --> 531         result = self._executor.compile(obj, args_list, phase, use_vm, self.queue_name)
        532         self.compile_cache[phase] = phase
        533         if not result:

    RuntimeError: mindspore\ccsrc\runtime\device\cpu\cpu_memory_manager.cc:33 MemMalloc] Malloc memory failed: size 13225224704

    # In file D:\ProgramData\Anaconda3\envs\Mindspore13-py375\lib\site-packages\mindspore\ops\_grad\grad_nn_ops.py(64)
            return dx, dw
            ^

    The warning above says:

    '_Loss' is deprecated from version 1.3 and will be removed in a future version, use 'LossBase' instead.

    "_Loss" has been deprecated since version 1.3 and will be removed in a future version; use "LossBase" instead. Please check whether the API below meets your needs:

    mindspore.nn.LossBase — MindSpore master documentation
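
    A minimal sketch of migrating a custom loss to nn.LossBase, assuming the warning is triggered because the sample's CrossEntropyWithLogits (or a base class it uses) still derives from nn._Loss. The class name, the two-class NCHW-to-NHWC handling, and the constructor arguments below are illustrative assumptions, not the sample's actual implementation:

    import mindspore.nn as nn
    import mindspore.ops as ops

    class CrossEntropyWithLogitsV2(nn.LossBase):
        """Pixel-wise softmax cross entropy built on LossBase instead of the deprecated _Loss."""
        def __init__(self, num_classes=2, reduction='mean'):
            super(CrossEntropyWithLogitsV2, self).__init__(reduction)
            self.num_classes = num_classes
            self.transpose = ops.Transpose()
            self.reshape = ops.Reshape()
            # Returns per-pixel losses; the reduction is applied by get_loss() below.
            self.ce = nn.SoftmaxCrossEntropyWithLogits()

        def construct(self, logits, label):
            # NCHW -> NHWC, then flatten to (N*H*W, num_classes) for the 2-D loss op.
            logits = self.reshape(self.transpose(logits, (0, 2, 3, 1)), (-1, self.num_classes))
            label = self.reshape(self.transpose(label, (0, 2, 3, 1)), (-1, self.num_classes))
            loss = self.ce(logits, label)
            # get_loss() applies the 'mean'/'sum'/'none' reduction configured in LossBase.
            return self.get_loss(loss)

    Passing an instance of such a class as loss_fn to Model would remove the deprecation warning. Note, however, that the warning is only informational: the RuntimeError in the traceback is a failed allocation of roughly 13 GB of host memory in the CPU backend, so changing the loss class alone is unlikely to make that allocation succeed.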

  • Original post: https://blog.csdn.net/weixin_45666880/article/details/126501886