Environment: Win10 x64, Anaconda virtual environment, MindSpore 1.3.0, Python 3.7.5
Code: Huawei "medical image segmentation" sample
#%%
import time

import numpy as np
from mindspore import Tensor
from mindspore.train.callback import Callback

class StepLossTimeMonitor(Callback):
    """Callback that prints the per-step loss and throughput (images/s)."""

    def __init__(self, batch_size, per_print_times=1):
        super(StepLossTimeMonitor, self).__init__()
        if not isinstance(per_print_times, int) or per_print_times < 0:
            raise ValueError("per_print_times must be int and >= 0.")
        self._per_print_times = per_print_times
        self.batch_size = batch_size

    def step_begin(self, run_context):
        self.step_time = time.time()

    def step_end(self, run_context):
        step_seconds = time.time() - self.step_time
        step_fps = self.batch_size * 1.0 / step_seconds
        cb_params = run_context.original_args()
        loss = cb_params.net_outputs
        if isinstance(loss, (tuple, list)):
            if isinstance(loss[0], Tensor) and isinstance(loss[0].asnumpy(), np.ndarray):
                loss = loss[0]
        if isinstance(loss, Tensor) and isinstance(loss.asnumpy(), np.ndarray):
            loss = np.mean(loss.asnumpy())
        cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
        if isinstance(loss, float) and (np.isnan(loss) or np.isinf(loss)):
            raise ValueError("epoch: {} step: {}. Invalid loss, terminating training.".format(
                cb_params.cur_epoch_num, cur_step_in_epoch))
        if self._per_print_times != 0 and cb_params.cur_step_num % self._per_print_times == 0:
            print("step: %s, loss is %s, fps is %s" % (cur_step_in_epoch, loss, step_fps), flush=True)
#%%
import mindspore.nn as nn
from mindspore import context
from mindspore.context import ParallelMode
from mindspore.communication.management import init, get_group_size
from mindspore.train.model import Model
from mindspore.train.callback import CheckpointConfig, ModelCheckpoint
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.train.loss_scale_manager import FixedLossScaleManager

# UNet, CrossEntropyWithLogits, create_dataset and device_id are defined in
# earlier cells of the notebook (taken from the segmentation sample).

def train_net(data_dir, cross_valid_ind=1, epochs=400, batch_size=16, lr=0.0001, run_distribute=False, cfg=None):
    if run_distribute:
        init()
        group_size = get_group_size()
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          device_num=group_size,
                                          gradients_mean=False)
    net = UNet(n_channels=cfg['num_channels'], n_classes=cfg['num_classes'])
    if cfg['resume']:
        param_dict = load_checkpoint(cfg['resume_ckpt'])
        load_param_into_net(net, param_dict)
    criterion = CrossEntropyWithLogits()
    train_dataset, _ = create_dataset(data_dir, epochs, batch_size, True, cross_valid_ind, run_distribute)
    train_data_size = train_dataset.get_dataset_size()
    print("dataset length is:", train_data_size)
    ckpt_config = CheckpointConfig(save_checkpoint_steps=train_data_size,
                                   keep_checkpoint_max=cfg['keep_checkpoint_max'])
    ckpoint_cb = ModelCheckpoint(prefix='ckpt_unet_medical_adam',
                                 directory='./ckpt_{}/'.format(device_id),
                                 config=ckpt_config)
    optimizer = nn.Adam(params=net.trainable_params(), learning_rate=lr, weight_decay=cfg['weight_decay'],
                        loss_scale=cfg['loss_scale'])
    loss_scale_manager = FixedLossScaleManager(cfg['FixedLossScaleManager'], False)
    model = Model(net, loss_fn=criterion, loss_scale_manager=loss_scale_manager, optimizer=optimizer, amp_level="O3")
    print("============== Starting Training ==============")
    model.train(2, train_dataset, callbacks=[StepLossTimeMonitor(batch_size=batch_size), ckpoint_cb],
                dataset_sink_mode=False)
    print("============== End Training ==============")
#%%
data_url = './data'
run_distribute = False
epoch_size = cfg_unet['epochs'] if not run_distribute else cfg_unet['distribute_epochs']
train_net(data_dir=data_url, cross_valid_ind=cfg_unet['cross_valid_ind'], epochs=epoch_size,
          batch_size=cfg_unet['batchsize'], lr=cfg_unet['lr'], run_distribute=run_distribute,
          cfg=cfg_unet)
When execution reaches this step, the following error message appears:
[WARNING] ME(3580:16232,MainProcess):2021-12-25-00:30:59.151.94 [mindspore\nn\loss\loss.py:103] '_Loss' is deprecated from version 1.3 and will be removed in a future version, use 'LossBase' instead.
dataset length is: 600
============== Starting Training ==============
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_3580/3195514145.py in <module>

~\AppData\Local\Temp/ipykernel_3580/2411766277.py in train_net(data_dir, cross_valid_ind, epochs, batch_size, lr, run_distribute, cfg)
     32     print("============== Starting Training ==============")
     33     model.train(2, train_dataset, callbacks=[StepLossTimeMonitor(batch_size=batch_size), ckpoint_cb],
---> 34                 dataset_sink_mode=False)
     35     print("============== End Training ==============")
     36

D:\ProgramData\Anaconda3\envs\Mindspore13-py375\lib\site-packages\mindspore\train\model.py in train(self, epoch, train_dataset, callbacks, dataset_sink_mode, sink_size)
    647                          callbacks=callbacks,
    648                          dataset_sink_mode=dataset_sink_mode,
--> 649                          sink_size=sink_size)
    650
    651     def _eval_dataset_sink_process(self, valid_dataset, list_callback=None, cb_params=None):

D:\ProgramData\Anaconda3\envs\Mindspore13-py375\lib\site-packages\mindspore\train\model.py in _train(self, epoch, train_dataset, callbacks, dataset_sink_mode, sink_size)
    431             self._check_reuse_dataset(train_dataset)
    432             if not dataset_sink_mode:
--> 433                 self._train_process(epoch, train_dataset, list_callback, cb_params)
    434             elif context.get_context("device_target") == "CPU":
    435                 logger.warning("The CPU cannot support dataset sink mode currently."

D:\ProgramData\Anaconda3\envs\Mindspore13-py375\lib\site-packages\mindspore\train\model.py in _train_process(self, epoch, train_dataset, list_callback, cb_params)
    556                 cb_params.train_dataset_element = next_element
    557                 list_callback.step_begin(run_context)
--> 558                 outputs = self._train_network(*next_element)
    559                 cb_params.net_outputs = outputs
    560                 if self._loss_scale_manager and self._loss_scale_manager.get_drop_overflow_update():

D:\ProgramData\Anaconda3\envs\Mindspore13-py375\lib\site-packages\mindspore\nn\cell.py in __call__(self, *inputs, **kwargs)
    384             if self.enable_hook:
    385                 raise ValueError("The graph mode does not support hook function.")
--> 386             out = self.compile_and_run(*inputs)
    387             return out
    388

D:\ProgramData\Anaconda3\envs\Mindspore13-py375\lib\site-packages\mindspore\nn\cell.py in compile_and_run(self, *inputs)
    642         """
    643         self._auto_parallel_compile_and_run = True
--> 644         self.compile(*inputs)
    645
    646         new_inputs = []

D:\ProgramData\Anaconda3\envs\Mindspore13-py375\lib\site-packages\mindspore\nn\cell.py in compile(self, *inputs)
    629             inputs (tuple): Inputs of the Cell object.
    630         """
--> 631         _executor.compile(self, *inputs, phase=self.phase, auto_parallel_mode=self._auto_parallel_mode)
    632
    633     def compile_and_run(self, *inputs):

D:\ProgramData\Anaconda3\envs\Mindspore13-py375\lib\site-packages\mindspore\common\api.py in compile(self, obj, phase, do_convert, auto_parallel_mode, *args)
    529         enable_ge = context.get_context("enable_ge")
    530         use_vm = not enable_ge or (enable_debug_runtime and context.get_context("mode") == context.PYNATIVE_MODE)
--> 531         result = self._executor.compile(obj, args_list, phase, use_vm, self.queue_name)
    532         self.compile_cache[phase] = phase
    533         if not result:

RuntimeError: mindspore\ccsrc\runtime\device\cpu\cpu_memory_manager.cc:33 MemMalloc] Malloc memory failed: size 13225224704

# In file D:\ProgramData\Anaconda3\envs\Mindspore13-py375\lib\site-packages\mindspore\ops\_grad\grad_nn_ops.py(64)
        return dx, dw
        ^
The hint in the warning above is:
'_Loss' is deprecated from version 1.3 and will be removed in a future version, use 'LossBase' instead.
Please take a look at whether the LossBase API can meet the usage requirements here.
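On the LossBase question: below is a minimal sketch of how a custom softmax cross-entropy loss could be rewritten on top of nn.LossBase. The class name, the NCHW-to-NHWC handling, and the num_classes parameter are assumptions for illustration, not the sample's exact code. Note also that the deprecation warning is independent of the crash itself: the RuntimeError at the bottom reports a failed allocation of roughly 13.2 GB of host memory inside a gradient op (grad_nn_ops.py, "return dx, dw").
#%%
import mindspore.nn as nn
import mindspore.ops as ops

class CrossEntropyWithLogitsV2(nn.LossBase):
    """Hypothetical rewrite of the custom loss on nn.LossBase instead of _Loss."""

    def __init__(self, num_classes=2):
        super(CrossEntropyWithLogitsV2, self).__init__()
        self.num_classes = num_classes       # assumption: one-hot labels, NCHW layout
        self.transpose = ops.Transpose()
        self.reshape = ops.Reshape()
        self.ce = ops.SoftmaxCrossEntropyWithLogits()

    def construct(self, logits, label):
        # NCHW -> NHWC, then flatten to (N*H*W, C): the primitive expects 2-D inputs.
        logits = self.transpose(logits, (0, 2, 3, 1))
        label = self.transpose(label, (0, 2, 3, 1))
        loss, _ = self.ce(self.reshape(logits, (-1, self.num_classes)),
                          self.reshape(label, (-1, self.num_classes)))
        # get_loss() comes from nn.LossBase and applies the reduction ('mean' by default).
        return self.get_loss(loss)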