• VSCode之C++ & CUDA极简环境配置


    1. 背景
      想要了解CUDA并行计算原理;同时,针对深度学习中出现的一些“不支持算子”可能需要手写CUDA实现的情况,配置一个简单的CUDA编译环境,探索CUDA编程的范式【注:CUDA驱动/工具链的安装配置略】。
    2. 结果展示
      【此处原文为编译运行结果的截图,图片未能随正文提取】
    3. 示例代码
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include <stdio.h>
    
    // Element-wise vector addition: C[i] = A[i] + B[i].
    // Expects a single-block 1-D launch with one thread per element
    // (the host launches <<<1, size>>>), so threadIdx.x alone indexes the data.
    __global__ void VecAdd(int* A, int* B, int* C)
    {
        int idx = threadIdx.x;      // one element per thread
        C[idx] = A[idx] + B[idx];
    }
    
    // Adds two fixed 3-element vectors on the GPU and prints the result.
    // Flow: select device -> allocate device buffers -> copy host->device ->
    // launch VecAdd -> synchronize -> copy device->host -> print -> free.
    // On any CUDA failure the error is reported to stderr and control jumps
    // to the shared cleanup label, so device allocations are never leaked
    // (the original code fell through on malloc/copy errors and kept using
    // null or partially-initialized device buffers).
    void test_cuda() {
        // Host data: the kernel computes c[i] = a[i] + b[i].
        const int size = 3;
        int a[size] = { 1, 2, 3 };
        int b[size] = { 10, 20, 30 };
        int c[size] = { 0 };

        // Device buffers; null-initialized so cudaFree in cleanup is always safe.
        int* dev_a = 0;
        int* dev_b = 0;
        int* dev_c = 0;
        cudaError_t cudaStatus;

        // Bind this host thread to GPU 0.
        cudaStatus = cudaSetDevice(0);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "GPU device error: %s\n", cudaGetErrorString(cudaStatus));
            return;  // nothing allocated yet, safe to return directly
        }

        // Allocate device memory for the three vectors; stop on first failure.
        cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "device_c allocate error\n");
            goto Error;
        }
        cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "device_a allocate error\n");
            goto Error;
        }
        cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "device_b allocate error\n");
            goto Error;
        }

        // Copy the inputs from host memory to device memory.
        cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "device_a copy error\n");
            goto Error;
        }
        cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "device_b copy error\n");
            goto Error;
        }

        // Launch one block of `size` threads; each thread adds one element.
        VecAdd<<<1, size>>>(dev_a, dev_b, dev_c);

        // A kernel launch itself returns no status: check for launch-config
        // errors immediately...
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "VecAdd call error: %s\n", cudaGetErrorString(cudaStatus));
            goto Error;
        }
        // ...and wait for completion to surface asynchronous execution errors.
        cudaStatus = cudaDeviceSynchronize();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceSynchronize not success: %s\n", cudaGetErrorString(cudaStatus));
            goto Error;
        }

        // Copy the result back to the host (this cudaMemcpy blocks until done).
        cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "copy result to host error\n");
            goto Error;
        }
        printf("{1,2,3} + {10,20,30} = {%d,%d,%d}\n", c[0], c[1], c[2]);

    Error:
        // Shared cleanup: cudaFree(NULL) is a no-op, so unconditional frees
        // are safe regardless of how far allocation progressed.
        cudaFree(dev_a);
        cudaFree(dev_b);
        cudaFree(dev_c);
    }
    
    // Entry point: run the GPU vector-add demo, then exit successfully.
    int main()
    {
        test_cuda();
        return 0;
    }
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54
    • 55
    • 56
    • 57
    • 58
    • 59
    • 60
    • 61
    • 62
    • 63
    • 64
    • 65
    • 66
    • 67
    • 68
    • 69
    • 70
    • 71
    • 72
    • 73
    • 74
    • 75
    • 76
    1. 小结
    1. NVCC编译cuda命令与g++编译C++较为相似,从而借鉴引入对应的include,实现Windows下cmake编译CUDA代码;
    2. 示例代码展示了从CPU读取数据,在GPU端进行计算,最终传输给CPU的过程,与深度学习数据加载过程类似,是较为通用的过程;
    3. 理解C++到CUDA的过渡、预加载过程,进一步从底层了解CUDA。
    1. 更新(同时支持Windows和Linux)
      新增跨平台编译的CMakeLists.txt
    # Cross-platform (Windows + Linux) build for the CUDA demo.
    # NOTE: first-class CUDA language support via enable_language(CUDA)
    # requires CMake >= 3.8, so the previous 3.0.0 minimum was too low.
    cmake_minimum_required(VERSION 3.8)
    project(demo VERSION 0.1.0)

    if(MSVC)
        # On Windows, point CMake at the toolkit's nvcc explicitly before
        # enabling the CUDA language; adjust the path to the installed version.
        set(CMAKE_CUDA_COMPILER "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.2/bin/nvcc.exe")
        enable_language(CUDA)
        # Optionally pin target GPU architectures (CMake >= 3.18):
        # set(CMAKE_CUDA_ARCHITECTURES "60;72;75")
    else()
        # On Linux, nvcc is located via PATH; uncomment to pin a toolkit:
        # set(CMAKE_CUDA_COMPILER /usr/local/cuda-11.3/bin/nvcc)
        # The deprecated find_package(CUDA) module is unnecessary once the
        # CUDA language is enabled.
        enable_language(CUDA)
    endif()

    # Kernel code built as a shared library, linked into the executable.
    add_library(a SHARED a.cu)
    # add_library(advance_reduce advance_reduce.cu)
    add_executable(main main.cu)
    target_link_libraries(main a)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
  • 相关阅读:
    重排链表(leetcode)
    Vovsoft Text Edit Plus 专业文本编辑器工具软件:简洁高效的创作利器
    猿创征文|多模态图像合成和编辑(MISE):Multimodal Image Synthesis and Editing: A Survey
    134. 加油站
    毕业设计 基于单片机的交通安全车辆测速系统 - 嵌入式 物联网
    Nacos 如何实现配置文件动态更新的
    wget同时下载多个文件
    一个不用写代码的案例,来看看Flowable到底给我们提供了哪些功能?
    Visual Studio 和 VSCode 哪个好?
    PG14源码安装
  • 原文地址:https://blog.csdn.net/qq_37172182/article/details/132632572