花了很长的时间 终于陆陆续续把CLION环境整合完毕, 踩了无数的坑,在这里记录一下
C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Tools\MSVC\14.16.27023\include
D:\Windows Kits\10\Lib\10.0.17763.0\ucrt
VS_LIB_PATH
这个路径的架构需要和CUDA的位数对应:64位对应x64,32位对应x86。这个地方没有一篇文章是说清楚的,所以我一开始把路径选成了x86,结果就出现了一堆错误——一百多个编译错误,直接心态崩了。
C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Tools\MSVC\14.16.27023\lib\x64
D:\Windows Kits\10\Lib\10.0.17763.0\ucrt\x64
VS_PATH
C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Tools\MSVC\14.16.27023\bin\Hostx64\x64
环境变量配置好以后在path下直接用%VS_PATH%,这样来引用即可,配置完以后需要reboot
3. CLION
需要在CLion里面配置Visual Studio工具链。如果出现Visual Studio路径找不到,那多半是Visual Studio的环境变量有问题,这就回到之前我说的双引号问题。我这边CUDA版本是64位的,所以架构选择的是amd64。这个选项是和CUDA的位数有关的,不要听其他博客人云亦云,说什么一定要选amd64却不解释原因;许多百度的结果都不说明白理由,这些问题都是我去Stack Overflow上才弄清楚的。
4. 愉快的开始coding
我的cmake
# Build script for the CUDA device-information sample (information.cu).
cmake_minimum_required(VERSION 3.17)

# Declaring CUDA as a project language makes CMake locate nvcc and the CUDA
# runtime itself, so the deprecated FindCUDA module (find_package(CUDA)) is
# not needed and has been dropped.
project(CUDA_project CUDA)

set(CMAKE_CUDA_STANDARD 14)

add_executable(CUDA_project information.cu)

# Scope the header search path to this target instead of the whole directory.
target_include_directories(CUDA_project PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)

set_target_properties(
        CUDA_project
        PROPERTIES
        CUDA_SEPARABLE_COMPILATION ON)
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
int main(int argc,char** argv)
{
printf("%s Starting ...\n",argv[0]);
int deviceCount = 0;
cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
if(error_id!=cudaSuccess)
{
printf("cudaGetDeviceCount returned %d\n ->%s\n",
(int)error_id,cudaGetErrorString(error_id));
printf("Result = FAIL\n");
exit(EXIT_FAILURE);
}
if(deviceCount==0)
{
printf("There are no available device(s) that support CUDA\n");
}
else
{
printf("Detected %d CUDA Capable device(s)\n",deviceCount);
}
int dev=0,driverVersion=0,runtimeVersion=0;
cudaSetDevice(dev);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp,dev);
printf("Device %d:\"%s\"\n",dev,deviceProp.name);
cudaDriverGetVersion(&driverVersion);
cudaRuntimeGetVersion(&runtimeVersion);
printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n",
driverVersion/1000,(driverVersion%100)/10,
runtimeVersion/1000,(runtimeVersion%100)/10);
printf(" CUDA Capability Major/Minor version number: %d.%d\n",
deviceProp.major,deviceProp.minor);
printf(" Total amount of global memory: %.2f GBytes (%llu bytes)\n",
(float)deviceProp.totalGlobalMem/pow(1024.0,3),deviceProp.totalGlobalMem);
printf(" GPU Clock rate: %.0f MHz (%0.2f GHz)\n",
deviceProp.clockRate*1e-3f,deviceProp.clockRate*1e-6f);
printf(" Memory Bus width: %d-bits\n",
deviceProp.memoryBusWidth);
if (deviceProp.l2CacheSize)
{
printf(" L2 Cache Size: %d bytes\n",
deviceProp.l2CacheSize);
}
printf(" Max Texture Dimension Size (x,y,z) 1D=(%d),2D=(%d,%d),3D=(%d,%d,%d)\n",
deviceProp.maxTexture1D,deviceProp.maxTexture2D[0],deviceProp.maxTexture2D[1]
,deviceProp.maxTexture3D[0],deviceProp.maxTexture3D[1],deviceProp.maxTexture3D[2]);
printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d,2D=(%d,%d) x %d\n",
deviceProp.maxTexture1DLayered[0],deviceProp.maxTexture1DLayered[1],
deviceProp.maxTexture2DLayered[0],deviceProp.maxTexture2DLayered[1],
deviceProp.maxTexture2DLayered[2]);
printf(" Total amount of constant memory %lu bytes\n",
deviceProp.totalConstMem);
printf(" Total amount of shared memory per block: %lu bytes\n",
deviceProp.sharedMemPerBlock);
printf(" Total number of registers available per block:%d\n",
deviceProp.regsPerBlock);
printf(" Wrap size: %d\n",deviceProp.warpSize);
printf(" Maximun number of thread per multiprocesser: %d\n",
deviceProp.maxThreadsPerMultiProcessor);
printf(" Maximun number of thread per block: %d\n",
deviceProp.maxThreadsPerBlock);
printf(" Maximun size of each dimension of a block: %d x %d x %d\n",
deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]);
printf(" Maximun size of each dimension of a grid: %d x %d x %d\n",
deviceProp.maxGridSize[0],
deviceProp.maxGridSize[1],
deviceProp.maxGridSize[2]);
printf(" Maximu memory pitch %lu bytes\n",deviceProp.memPitch);
printf("----------------------------------------------------------\n");
printf("Number of multiprocessors: %d\n", deviceProp.multiProcessorCount);
printf("Total amount of constant memory: %4.2f KB\n",
deviceProp.totalConstMem/1024.0);
printf("Total amount of shared memory per block: %4.2f KB\n",
deviceProp.sharedMemPerBlock/1024.0);
printf("Total number of registers available per block: %d\n",
deviceProp.regsPerBlock);
printf("Warp size %d\n", deviceProp.warpSize);
printf("Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
printf("Maximum number of threads per multiprocessor: %d\n",
deviceProp.maxThreadsPerMultiProcessor);
printf("Maximum number of warps per multiprocessor: %d\n",
deviceProp.maxThreadsPerMultiProcessor/32);
return EXIT_SUCCESS;
}
大功告成。之所以用CLion,是因为VS实在是太臃肿,换成CLion以后非常的快乐。如果有需要在Windows环境下安装的朋友,一定要按照我的方法来;我发现其他博客没有一篇讲得很清楚的,我是折腾了差不多一周左右,踩了无数坑。如果出现nvcc编译.cu程序时输出乱码,一定要注意,肯定是MSVC的配置有问题:一考虑环境变量,二考虑版本是否对应。