问题描述:
在MindSpore 1.1.1上做图算融合,硬件环境是Ascend 910。未开启图算融合时程序运行正常,添加图算融合参数(enable_graph_kernel=True)之后程序报错:
ms.context.set_context(mode=ms.context.GRAPH_MODE, device_target=args.device_target,
save_graphs=False, max_call_depth=10000, device_id=args.device_id, enable_graph_kernel=True)
报错记录如下:
Traceback (most recent call last):
File "/opt/python3.7.5/lib/python3.7/site-packages/mindspore/_extends/parallel_compile/akg_compiler/compiler.py", line 35, in <module>
run_compiler(sys.argv[1])
File "/opt/python3.7.5/lib/python3.7/site-packages/mindspore/_extends/parallel_compile/akg_compiler/compiler.py", line 28, in run_compiler
p = __import__("akg", globals(), locals(), ['ms'], 0)
File "/opt/python3.7.5/lib/python3.7/site-packages/akg/__init__.py", line 84, in <module>
from . import autodiff
File "/opt/python3.7.5/lib/python3.7/site-packages/akg/autodiff.py", line 19, in <module>
from akg.tvm._ffi.function import _init_api
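File "<frozen importlib._bootstrap>", line 983, in _find_and_load
File "<frozen importlib._bootstrap>", line 967, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 668, in _load_unlocked
File "<frozen importlib._bootstrap>", line 638, in _load_backward_compatible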
File "/opt/python3.7.5/lib/python3.7/site-packages/akg/__init__.py", line 62, in load_module
__import__(self.__rname, globals(), locals())
File "/opt/python3.7.5/lib/python3.7/site-packages/akg/tvm/__init__.py", line 27, in <module>
from . import tensor
File "/opt/python3.7.5/lib/python3.7/site-packages/akg/tvm/tensor.py", line 20, in <module>
from ._ffi.node import NodeBase, NodeGeneric, register_node, convert_to_node
File "/opt/python3.7.5/lib/python3.7/site-packages/akg/tvm/_ffi/node.py", line 24, in <module>
from .object import Object, register_object, _set_class_node
File "/opt/python3.7.5/lib/python3.7/site-packages/akg/tvm/_ffi/object.py", line 23, in <module>
from .base import _FFI_MODE, _RUNTIME_ONLY, check_call, _LIB, c_str
File "/opt/python3.7.5/lib/python3.7/site-packages/akg/tvm/_ffi/base.py", line 81, in <module>
_LIB, _LIB_NAME = _load_lib()
File "/opt/python3.7.5/lib/python3.7/site-packages/akg/tvm/_ffi/base.py", line 73, in _load_lib
lib = ctypes.CDLL(lib_path[0], ctypes.RTLD_GLOBAL)
File "/opt/python3.7.5/lib/python3.7/ctypes/__init__.py", line 364, in __init__
self._handle = _dlopen(self._name, mode)
OSError: /opt/python3.7.5/lib/python3.7/site-packages/mindspore/lib/libakg.so: undefined symbol: pthread_atfork
Traceback (most recent call last):
Traceback (most recent call last):
File "/opt/python3.7.5/lib/python3.7/site-packages/mindspore/_extends/parallel_compile/akg_compiler/compiler.py", line 35, in <module>
File "/opt/python3.7.5/lib/python3.7/site-packages/mindspore/_extends/parallel_compile/akg_compiler/compiler.py", line 35, in <module>
run_compiler(sys.argv[1])
run_compiler(sys.argv[1])
File "/opt/python3.7.5/lib/python3.7/site-packages/mindspore/_extends/parallel_compile/akg_compiler/compiler.py", line 28, in run_compiler
File "/opt/python3.7.5/lib/python3.7/site-packages/mindspore/_extends/parallel_compile/akg_compiler/compiler.py", line 28, in run_compiler
p = __import__("akg", globals(), locals(), ['ms'], 0)
p = __import__("akg", globals(), locals(), ['ms'], 0)
这个报错记录太长了,超过帖子最大字数限制,我把报错记录放在附件error.txt中了
请问有哪位大神能指导一下这个问题该如何解决吗?
解决方案:
这个问题的原因在于使用了高版本的gcc编译akg时找不到pthread_atfork。
pthread_atfork是POSIX标准定义的线程库函数,编译和链接时都需要加上-pthread选项。
-lpthread是旧的写法,只在链接阶段链接pthread库;在新版本的gcc中应该用-pthread取代-lpthread,它会同时设置多线程所需的预处理宏和链接选项。
可以在akg的CMakeLists.txt中,为target_link_libraries增加pthread,然后重新编译一次。