问题描述:
【操作步骤&问题现象】
在 PYNATIVE_MODE(GPU)下执行训练时,出现报错 "Malloc for kernel output failed, Memory isn't enough"。网上查询得知,这可能与 PYNATIVE 模式内存消耗较大有关,于是改用 GRAPH_MODE 进行训练,但训练过程中又出现新的报错 "The name 'batch_outputs' is not defined, or not supported in graph mode."。
【截图信息】
代码如下(原帖此处的代码内容似乎未能保留,出错的代码行可参见下方日志末尾):

【日志信息】(可选,上传日志内容或者附件)
[CRITICAL] PARSER(43607,7f2058a25b80,python):2022-05-05-09:42:30.215.793 [mindspore/ccsrc/pipeline/jit/parse/function_block.cc:257] HandleBuiltinNamespaceInfo] The name 'batch_outputs' is not defined, or not supported in graph mode.
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Input In [21], in <cell line: 1>()
----> 1 train.train(net_name, train_ds, 1, cbs=cbs)
File ~/projs/textseg/src/train.py:35, in train(net_name, train_dataset, epochs, cbs) 33 optimizer = msnn.Adam(loss_net.trainable_params(), learning_rate=1e-3) 34 model = ms.Model(loss_net, optimizer=optimizer)---> 35 model.train(epoch=epochs, train_dataset=train_dataset, callbacks=cbs, dataset_sink_mode=False)
File ~/projs/textseg/.venv/lib64/python3.9/site-packages/mindspore/train/model.py:902, in Model.train(self, epoch, train_dataset, callbacks, dataset_sink_mode, sink_size) 897 raise ValueError("For 'Model.train', The argument 'sink_size' must be -1 or positive, " 898 "but got {}.".format(sink_size)) 900 _device_number_check(self._parallel_mode, self._device_number)--> 902 self._train(epoch, 903 train_dataset, 904 callbacks=callbacks, 905 dataset_sink_mode=dataset_sink_mode, 906 sink_size=sink_size)
File ~/projs/textseg/.venv/lib64/python3.9/site-packages/mindspore/train/model.py:87, in _save_final_ckpt.<locals>.wrapper(self, *args, **kwargs) 85 raise e 86 else:---> 87 func(self, *args, **kwargs)
File ~/projs/textseg/.venv/lib64/python3.9/site-packages/mindspore/train/model.py:542, in Model._train(self, epoch, train_dataset, callbacks, dataset_sink_mode, sink_size) 540 self._check_reuse_dataset(train_dataset) 541 if not dataset_sink_mode:--> 542 self._train_process(epoch, train_dataset, list_callback, cb_params) 543 elif context.get_context("device_target") == "CPU": 544 logger.info("The CPU cannot support dataset sink mode currently." 545 "So the training process will be performed with dataset not sink.")
File ~/projs/textseg/.venv/lib64/python3.9/site-packages/mindspore/train/model.py:794, in Model._train_process(self, epoch, train_dataset, list_callback, cb_params) 792 cb_params.train_dataset_element = next_element 793 list_callback.step_begin(run_context)--> 794 outputs = self._train_network(*next_element) 795 cb_params.net_outputs = outputs 796 if self._loss_scale_manager and self._loss_scale_manager.get_drop_overflow_update():
File ~/projs/textseg/.venv/lib64/python3.9/site-packages/mindspore/nn/cell.py:586, in Cell.__call__(self, *args, **kwargs) 583 if self._hook_fn_registered(): 584 logger.warning(f"For 'Cell', it's not support hook function in graph mode. If you want to use hook " 585 f"function, please use context.set_context to set pynative mode.")--> 586 out = self.compile_and_run(*args) 587 return out 589 # Run in PyNative mode.
File ~/projs/textseg/.venv/lib64/python3.9/site-packages/mindspore/nn/cell.py:964, in Cell.compile_and_run(self, *inputs) 952 """ 953 Compile and run Cell, the input must be consistent with the input defined in construct. 954 (...) 961 Object, the result of executing. 962 """ 963 self._auto_parallel_compile_and_run = True--> 964 self.compile(*inputs) 966 new_inputs = [] 967 for i in inputs:
File ~/projs/textseg/.venv/lib64/python3.9/site-packages/mindspore/nn/cell.py:937, in Cell.compile(self, *inputs) 930 """ 931 Compile Cell as a computation graph, the input must be consistent with the input defined in construct. 932 933 Args: 934 inputs (tuple): Inputs of the Cell object. 935 """ 936 if self._dynamic_shape_inputs is None or self._dynamic_shape_inputs[0] is None:--> 937 _cell_graph_executor.compile(self, *inputs, phase=self.phase, auto_parallel_mode=self._auto_parallel_mode) 938 else: 939 self._check_compile_dynamic_shape(*inputs)
File ~/projs/textseg/.venv/lib64/python3.9/site-packages/mindspore/common/api.py:1006, in _CellGraphExecutor.compile(self, obj, phase, do_convert, auto_parallel_mode, *args) 1004 enable_ge = context.get_context("enable_ge") 1005 self._graph_executor.set_weights_values(obj.parameters_dict())-> 1006 result = self._graph_executor.compile(obj, args_list, phase, self._use_vm_mode()) 1007 obj.compile_cache.add(phase) 1008 if not result:
NameError: mindspore/ccsrc/pipeline/jit/parse/function_block.cc:257 HandleBuiltinNamespaceInfo] The name 'batch_outputs' is not defined, or not supported in graph mode.
# In file /home/bozxo/projs/textseg/src/modelzoo/koshorek/textseg_ms.py(188)
doc_outputs = [batch_outputs[:int(batch_doc_lens[i]) - 1, i, :] for i in range(batch_size)]
解答:
因为静态图模式(GRAPH_MODE)中不支持类型转换操作。报错行中的 `int(batch_doc_lens[i])` 应属于此类操作,因此该语句在图编译阶段无法被解析,从而报错。