• MindSpore 训练时出现 "memory is not enough" 报错


    在训练过程中,执行到更新参数这一步时报错 memory is not enough(显存不足)。我使用的是 24 GB 显存的 RTX 3090 显卡进行训练,能帮忙看一下是我的代码有问题,还是单纯的显存不够吗?

    输出日志

    1. [WARNING] PRE_ACT(14099,7f5cfb754700,python3):2021-12-13-11:08:39.551.431 [mindspore/ccsrc/backend/optimizer/mem_reuse/mem_dynamic_allocator.cc:150] CalMemBlockAllocSize] Memory not enough: current free memory size[103809024] is smaller than required size[4213571584].
    2. [ERROR] RUNTIME_FRAMEWORK(14099,7f5cfb754700,python3):2021-12-13-11:08:39.551.455 [mindspore/ccsrc/runtime/framework/actor/memory_manager_actor.cc:182] SetOpContextMemoryAllocFail] Device(id:0) memory isn't enough and alloc failed, kernel name: Gradients/Default/network-compute_loss_pi/ac-ActorCritic/pi-GaussianActor/conv2-Conv2d/gradConv2D/Conv2DBackpropInput-op731, alloc size: 4213571584B.
    3. [EXCEPTION] VM(14099,7f5dfee71740,python3):2021-12-13-11:08:39.551.515 [mindspore/ccsrc/vm/backend.cc:835] RunGraph] The actor runs failed, actor name: kernel_graph_82
    4. [ERROR] RUNTIME_FRAMEWORK(14099,7f5cfaf53700,python3):2021-12-13-11:08:39.551.570 [mindspore/ccsrc/runtime/framework/actor/abstract_actor.cc:53] EraseInput] Erase input data failed: Gradients/Default/network-compute_loss_pi/ac-ActorCritic/pi-GaussianActor/conv2-Conv2d/gradConv2D/Conv2DBackpropFilter-op730, sequential_num: -820833344
    5. Traceback (most recent call last):
    6. File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    7. return _run_code(code, main_globals, None,
    8. File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/runpy.py", line 87, in _run_code
    9. exec(code, run_globals)
    10. File "/home/carserver1/mindspore_ws/src/drl_nav/drl/algos/ppo/pppo.py", line 374, in
    11. train(env, ppo)
    12. File "/home/carserver1/mindspore_ws/src/drl_nav/drl/algos/ppo/pppo.py", line 173, in train
    13. flag=ppo.update()
    14. File "/home/carserver1/mindspore_ws/src/drl_nav/drl/algos/ppo/pppo_mindspore.py", line 268, in update
    15. trainOneStepCellForPi(inputs)
    16. File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/nn/cell.py", line 404, in __call__
    17. out = self.compile_and_run(*inputs)
    18. File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/nn/cell.py", line 698, in compile_and_run
    19. return _cell_graph_executor(self, *new_inputs, phase=self.phase)
    20. File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/common/api.py", line 627, in __call__
    21. return self.run(obj, *args, phase=phase)
    22. File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/common/api.py", line 655, in run
    23. return self._exec_pip(obj, *args, phase=phase_real)
    24. File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/common/api.py", line 78, in wrapper
    25. results = fn(*arg, **kwargs)
    26. File "/home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/common/api.py", line 638, in _exec_pip
    27. return self._graph_executor(args_list, phase)
    28. RuntimeError: mindspore/ccsrc/vm/backend.cc:835 RunGraph] The actor runs failed, actor name: kernel_graph_82
    29. # In file /home/carserver1/anaconda3/envs/mindspore_py39/lib/python3.9/site-packages/mindspore/ops/_grad/grad_math_ops.py(81)
    30. num_selected = reshape(reduce_sum(indicators, axis), output_shape_kept_dims) + min_num
    31. ^
    复制

    部分实现代码

    1. ## 损失网络
    class compute_loss_pi(nn.Cell):
        """PPO clipped-surrogate policy loss, wrapped as a cell for training.

        Args:
            actor_critic: Object exposing a ``pi`` sub-network. It is called as
                ``pi(state, image, act)`` and must return a pair whose second
                element is the log-probability of ``act``.
            clip_ratio: Half-width of the clipping interval around 1 that is
                applied to the probability ratio.
        """

        def __init__(self, actor_critic, clip_ratio=0.2):
            super(compute_loss_pi, self).__init__()
            self.ac = actor_critic
            self.clip_ratio = clip_ratio

        def construct(self, data):
            """Return the policy loss for one batch.

            Args:
                data: Mapping that provides the keys 'state', 'image', 'act',
                    'adv' and 'logp'.

            Returns:
                The negated mean of the clipped surrogate objective.
            """
            state, image, act, adv, logp_old = data['state'], data['image'], data['act'], data['adv'], data['logp']
            # Policy loss
            pi, logp = self.ac.pi(state, image, act)
            ratio = mnp.exp(logp - logp_old)
            clip_adv = mnp.clip(ratio, 1 - self.clip_ratio, 1 + self.clip_ratio) * adv
            # The clipped objective needs the ELEMENT-WISE minimum of the two
            # tensors. `mnp.min` follows NumPy's reduction signature, so its
            # second positional argument is treated as `axis`, not as a second
            # operand; `mnp.minimum` is the element-wise form.
            loss_pi = -(mnp.minimum(ratio * adv, clip_adv)).mean()
            return loss_pi

        def backbone_network(self):
            """Return the policy sub-network used by this loss."""
            return self.ac.pi
    class compute_loss_v(nn.Cell):
        """Mean-squared-error value loss, wrapped as a cell for training.

        Args:
            actor_critic: Object exposing a ``v`` sub-network that is called as
                ``v(state, image)``.
        """

        def __init__(self, actor_critic):
            super().__init__()
            self.ac = actor_critic

        def construct(self, data):
            """Return the mean squared difference between ``v(state, image)`` and 'ret'.

            Args:
                data: Mapping that provides the keys 'state', 'image' and 'ret'.
            """
            state = data['state']
            image = data['image']
            ret = data['ret']
            value_error = self.ac.v(state, image) - ret
            squared_error = value_error ** 2
            return squared_error.mean()

        def backbone_network(self):
            """Return the value sub-network used by this loss."""
            return self.ac.v
    26. ## train(one epoch)
    27. data = dict(state=state_bufs, image=img_bufs, act=act_bufs, ret=ret_bufs, adv=adv_bufs, logp=logp_bufs)
    28. inputs = {k: Tensor(v, dtype=mindspore.dtype.float32) for k, v in list(data.items())}#zhe li ba device shanle
    29. trainOneStepCellForPi = nn.TrainOneStepCell(self.compute_loss_pi, self.pi_optimizer)
    30. trainOneStepCellForV = nn.TrainOneStepCell(self.compute_loss_v, self.vf_optimizer)
    31. if not self.only_once:
    32. trainOneStepCellForPi.set_train()
    33. trainOneStepCellForV.set_train()
    34. self.only_once = True
    35. # Train policy with multiple steps of gradient descent
    36. for i in range(self.train_pi_iters):
    37. trainOneStepCellForPi(inputs)
    38. pi_info = self.cal_pi_info(inputs)
    39. kl = np.average(pi_info['kl'])
    40. if kl > 1.5 * self.target_kl:
    41. self.logger.log('Early stopping at step %d due to reaching max kl.' % i)
    42. break
    43. # Value function learning
    44. for i in range(self.train_v_iters):
    45. trainOneStepCellForV(inputs)

    *******************************************************************************************************************

     请先执行 export GLOG_v=1 打开详细日志,然后重新运行一次;需要根据详细日志才能进一步分析。

  • 相关阅读:
    【小程序】微信小程序自定义组件Component详细总结
    2023-09-28 monetdb-databae的概念和作用-分析
    PLC中ST编程——单按钮控制多台电机顺序启动
    Ag44团簇以及衍生团簇(银纳米团簇直径1-2nm)
    SpringBoot_整合SpringSecurity(前后端分离版)
    如何设计一个分布式 ID 发号器?
    paddle 自定义数据集和预处理
    .net Core .net6 Cookie加Jwt认证 同时生效
    【附源码】计算机毕业设计JAVA住房公积金筹集子系统的网站系统
    c语言基础:L1-001 Hello World
  • 原文地址:https://blog.csdn.net/weixin_45666880/article/details/126498947