在训练过程中,进行到更新参数这一步时,出现memory is not enough,我使用的是24G的RTX 3090显卡来训练,能帮忙看一下是我代码的问题吗?还是纯粹的内存问题?
- [WARNING] PRE_ACT(14099,7f5cfb754700,python3):2021-12-13-11:08:39.551.431 [mindspore/ccsrc/backend/optimizer/mem_reuse/mem_dynamic_allocator.cc:150] CalMemBlockAllocSize] Memory not enough: current free memory size[103809024] is smaller than required size[4213571584].
- [ERROR] RUNTIME_FRAMEWORK(14099,7f5cfb754700,python3):2021-12-13-11:08:39.551.455 [mindspore/ccsrc/runtime/framework/actor/memory_manager_actor.cc:182] SetOpContextMemoryAllocFail] Device(id:0) memory isn't enough and alloc failed, kernel name: Gradients/Default/network-compute_loss_pi/ac-ActorCritic/pi-GaussianActor/conv2-Conv2d/gradConv2D/Conv2DBackpropInput-op731, alloc size: 4213571584B.
- [EXCEPTION] VM(14099,7f5dfee71740,python3):2021-12-13-11:08:39.551.515 [mindspore/ccsrc/vm/backend.cc:835] RunGraph] The actor runs failed, actor name: kernel_graph_82
- [ERROR] RUNTIME_FRAMEWORK(14099,7f5cfaf53700,python3):2021-12-13-11:08:39.551.570 [mindspore/ccsrc/runtime/framework/actor/abstract_actor.cc:53] EraseInput] Erase input data failed: Gradients/Default/network-compute_loss_pi/ac-ActorCritic/pi-GaussianActor/conv2-Conv2d/gradConv2D/Conv2DBackpropFilter-op730, sequential_num: -820833344
- Traceback (most recent call last):
- File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/runpy.py", line 197, in _run_module_as_main
- return _run_code(code, main_globals, None,
- File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/runpy.py", line 87, in _run_code
- exec(code, run_globals)
- File "/home/carserver1/mindspore_ws/src/drl_nav/drl/algos/ppo/pppo.py", line 374, in
- train(env, ppo)
- File "/home/carserver1/mindspore_ws/src/drl_nav/drl/algos/ppo/pppo.py", line 173, in train
- flag=ppo.update()
- File "/home/carserver1/mindspore_ws/src/drl_nav/drl/algos/ppo/pppo_mindspore.py", line 268, in update
- trainOneStepCellForPi(inputs)
- File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/nn/cell.py", line 404, in __call__
- out = self.compile_and_run(*inputs)
- File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/nn/cell.py", line 698, in compile_and_run
- return _cell_graph_executor(self, *new_inputs, phase=self.phase)
- File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/common/api.py", line 627, in __call__
- return self.run(obj, *args, phase=phase)
- File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/common/api.py", line 655, in run
- return self._exec_pip(obj, *args, phase=phase_real)
- File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/common/api.py", line 78, in wrapper
- results = fn(*arg, **kwargs)
- File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/common/api.py", line 638, in _exec_pip
- return self._graph_executor(args_list, phase)
- RuntimeError: mindspore/ccsrc/vm/backend.cc:835 RunGraph] The actor runs failed, actor name: kernel_graph_82
- # In file /home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/ops/_grad/grad_math_ops.py(81)
- num_selected = reshape(reduce_sum(indicators, axis), output_shape_kept_dims) + min_num
- ^
复制
- ## 损失网络
- class compute_loss_pi(nn.Cell):
- def __init__(self, actor_critic, clip_ratio=0.2):
- super(compute_loss_pi, self).__init__()
- self.ac = actor_critic
- self.clip_ratio = clip_ratio
- def construct(self, data):
- state, image, act, adv, logp_old = data['state'], data['image'], data['act'], data['adv'], data['logp']
-
- # Policy loss
- pi, logp = self.ac.pi(state, image, act)
- ratio = mnp.exp(logp - logp_old)
- clip_adv = mnp.clip(ratio, 1 - self.clip_ratio, 1 + self.clip_ratio) * adv ###clamp
- loss_pi = -(mnp.min(ratio * adv, clip_adv)).mean()
- return loss_pi
- def backbone_network(self):
- return self.ac.pi
-
- class compute_loss_v(nn.Cell):
- def __init__(self, actor_critic):
- super(compute_loss_v, self).__init__()
- self.ac = actor_critic
- def construct(self, data):
- state, image, ret = data['state'], data['image'], data['ret']
- return ((self.ac.v(state, image) - ret) ** 2).mean()
- def backbone_network(self):
- return self.ac.v
-
- ## train(one epoch)
- data = dict(state=state_bufs, image=img_bufs, act=act_bufs, ret=ret_bufs, adv=adv_bufs, logp=logp_bufs)
- inputs = {k: Tensor(v, dtype=mindspore.dtype.float32) for k, v in list(data.items())}#zhe li ba device shanle
- trainOneStepCellForPi = nn.TrainOneStepCell(self.compute_loss_pi, self.pi_optimizer)
- trainOneStepCellForV = nn.TrainOneStepCell(self.compute_loss_v, self.vf_optimizer)
- if not self.only_once:
- trainOneStepCellForPi.set_train()
- trainOneStepCellForV.set_train()
- self.only_once = True
- # Train policy with multiple steps of gradient descent
- for i in range(self.train_pi_iters):
- trainOneStepCellForPi(inputs)
- pi_info = self.cal_pi_info(inputs)
- kl = np.average(pi_info['kl'])
- if kl > 1.5 * self.target_kl:
- self.logger.log('Early stopping at step %d due to reaching max kl.' % i)
- break
-
- # Value function learning
- for i in range(self.train_v_iters):
- trainOneStepCellForV(inputs)
*******************************************************************************************************************
export GLOG_v=1 再执行下,需要根据详细日志分析