• CUDA向量相加 向量内积


    CUDA向量相加

    #include <iostream>
    #include <ctime>
    #include <vector>
    
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    
    #define  N 1000000
    
    
    // CPU reference implementation: element-wise sum c[i] = a[i] + b[i].
    void vector_add_cpu(int* a, int* b, int* c, int n)
    {
    	int i = 0;
    	while (i < n)
    	{
    		c[i] = a[i] + b[i];
    		++i;
    	}
    }
    
    
    // Kernel 1: launched as <<<1, 1>>> — a single GPU thread walks the whole
    // array sequentially (baseline to compare against the parallel variants).
    __global__ void vector_add_gpu_1(int* a, int* b, int* c, int n)
    {
    	int i = 0;
    	while (i < n)
    	{
    		c[i] = a[i] + b[i];
    		++i;
    	}
    }
    
    
    // Kernel 2: single block, many threads — each thread starts at its own
    // threadIdx.x and strides by the block's thread count until n is covered.
    __global__ void vector_add_gpu_2(int* a, int* b, int* c, int n)
    {
    	const int step = blockDim.x; // total threads in this block
    	for (int i = threadIdx.x; i < n; i += step)
    	{
    		c[i] = a[i] + b[i];
    	}
    }
    
    
    // Kernel 3: many blocks, many threads — grid-stride loop, so any launch
    // configuration covers all n elements.
    __global__ void vector_add_gpu_3(int* a, int* b, int* c, int n)
    {
    	const int step = gridDim.x * blockDim.x; // total threads in the grid
    	for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += step)
    	{
    		c[i] = a[i] + b[i];
    	}
    }
    
    
    // Benchmark driver: times the CPU loop and the three kernel variants.
    int main()
    {
    	// Heap-backed host buffers: three local arrays of N ints (~12 MB total)
    	// would risk a stack overflow, so use std::vector instead.
    	std::vector<int> a(N), b(N), c(N);
    	for (int i = 0; i < N; ++i) // fill inputs: a[i] = b[i] = i
    	{
    		a[i] = i;
    		b[i] = i;
    	}
    
    	int* dev_a = nullptr, * dev_b = nullptr, * dev_c = nullptr;
    	cudaMalloc(&dev_a, sizeof(int) * N);
    	cudaMalloc(&dev_b, sizeof(int) * N);
    	cudaMalloc(&dev_c, sizeof(int) * N); // output only — no H2D copy of uninitialized c
    	cudaMemcpy(dev_a, a.data(), sizeof(int) * N, cudaMemcpyHostToDevice);
    	cudaMemcpy(dev_b, b.data(), sizeof(int) * N, cudaMemcpyHostToDevice);
    
    	const int loops = 100;
    
    	clock_t t = clock();
    	for (int i = 0; i < loops; i++)
    	{
    		vector_add_cpu(a.data(), b.data(), c.data(), N);
    	}
    	clock_t t0 = clock();
    	// clock() returns ticks, not milliseconds — scale by CLOCKS_PER_SEC.
    	std::cout << 1000.0 * (t0 - t) / CLOCKS_PER_SEC << " ms" << std::endl;
    
    	for (int i = 0; i < loops; i++)
    	{
    		vector_add_gpu_1 << <1, 1 >> > (dev_a, dev_b, dev_c, N);
    	}
    	// Kernel launches are asynchronous: synchronize so the measurement
    	// covers execution, not just the launch overhead.
    	cudaDeviceSynchronize();
    	clock_t t1 = clock();
    	std::cout << 1000.0 * (t1 - t0) / CLOCKS_PER_SEC << " ms" << std::endl;
    
    	for (int i = 0; i < loops; i++)
    	{
    		vector_add_gpu_2 << <1, 4 >> > (dev_a, dev_b, dev_c, N);
    	}
    	cudaDeviceSynchronize();
    	clock_t t2 = clock();
    	std::cout << 1000.0 * (t2 - t1) / CLOCKS_PER_SEC << " ms" << std::endl;
    
    	for (int i = 0; i < loops; i++)
    	{
    		vector_add_gpu_3 << <2, 4 >> > (dev_a, dev_b, dev_c, N);
    	}
    	cudaDeviceSynchronize();
    	clock_t t3 = clock();
    	std::cout << 1000.0 * (t3 - t2) / CLOCKS_PER_SEC << " ms" << std::endl;
    
    	// Surface any deferred launch/execution error from the kernels above.
    	cudaError_t err = cudaGetLastError();
    	if (err != cudaSuccess)
    	{
    		std::cout << cudaGetErrorString(err) << std::endl;
    	}
    
    	cudaMemcpy(c.data(), dev_c, sizeof(int) * N, cudaMemcpyDeviceToHost);
    
    	cudaFree(dev_a);
    	cudaFree(dev_b);
    	cudaFree(dev_c);
    
    	return 0;
    }
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54
    • 55
    • 56
    • 57
    • 58
    • 59
    • 60
    • 61
    • 62
    • 63
    • 64
    • 65
    • 66
    • 67
    • 68
    • 69
    • 70
    • 71
    • 72
    • 73
    • 74
    • 75
    • 76
    • 77
    • 78
    • 79
    • 80
    • 81
    • 82
    • 83
    • 84
    • 85
    • 86
    • 87
    • 88
    • 89
    • 90
    • 91
    • 92
    • 93
    • 94
    • 95
    • 96
    • 97
    • 98
    • 99
    • 100
    • 101
    • 102
    • 103
    • 104
    • 105
    • 106
    • 107
    • 108
    • 109
    • 110
    • 111
    • 112
    • 113
    • 114
    • 115
    • 116
    • 117
    • 118
    • 119
    • 120
    • 121

    调用cublas库:

    #include <iostream>
    #include <ctime>
    #include "cublas_v2.h"
    #include "cuda_runtime.h"
    
    #define  N 1000000
    
    
    // cuBLAS demo: repeated saxpy (y = alpha*x + y) accumulating into dev_b.
    int main()
    {
    	// static storage: three local arrays of N floats (~12 MB total) would
    	// risk a stack overflow if declared as automatic variables.
    	static float a[N], b[N], c[N];
    	float* dev_a = nullptr, * dev_b = nullptr;
    	for (int i = 0; i < N; ++i) // fill inputs: a[i] = b[i] = i
    	{
    		float tmp = 1.0f * i;
    		a[i] = tmp;
    		b[i] = tmp;
    	}
    
    	cublasHandle_t handle;   // declare the cuBLAS handle
    	cublasCreate_v2(&handle); // create the handle
    	cudaMalloc(&dev_a, sizeof(float) * N);
    	cudaMalloc(&dev_b, sizeof(float) * N);
    
    	float alpha = 1.0f;
    	cublasSetVector(N, sizeof(float), a, 1, dev_a, 1); // H2D host to device
    	cublasSetVector(N, sizeof(float), b, 1, dev_b, 1);
    
    	clock_t t0 = clock();
    	for (size_t i = 0; i < 10000; i++)
    	{
    		// vector add: dev_b += alpha * dev_a (result accumulates in dev_b)
    		cublasSaxpy_v2(handle, N, &alpha, dev_a, 1, dev_b, 1);
    	}
    	// saxpy is asynchronous — wait for the GPU before reading the clock,
    	// and scale clock() ticks to milliseconds via CLOCKS_PER_SEC.
    	cudaDeviceSynchronize();
    	clock_t t1 = clock();
    	std::cout << 1000.0 * (t1 - t0) / CLOCKS_PER_SEC << " ms" << std::endl;
    
    	cublasGetVector(N, sizeof(float), dev_b, 1, c, 1); // D2H
    	cudaFree(dev_a);
    	cudaFree(dev_b);
    	cublasDestroy(handle); // destroy the handle
    
    	//for (int i = 0; i < N; ++i)
    	//{
    	//	printf("%f + %f  = %f \n", a[i], b[i], c[i]);
    	//}
    	return 0;
    }
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47

    CUDA向量内积

    #include <iostream>
    #include <ctime>
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    
    
    const int N = 2048;
    const int threadnum = 32;//开32个线程
    
    
    /* CPU reference: inner product of a[0..n) and b[0..n), accumulated in a
       double for precision, result stored through c. */
    template <typename T>
    void dot_cpu(T* a, T* b, T* c, int n)
    {
    	double acc = 0.0;
    	for (int i = 0; i < n; ++i)
    		acc += a[i] * b[i];
    	*c = static_cast<T>(acc);
    }
    
    
    /* Single-block interleaved ("scattered") reduction dot product.
       Launch as <<<1, threadnum>>>; assumes blockDim.x == threadnum (a power
       of two, needed for the doubling reduction below). Writes the full inner
       product of a[0..n) and b[0..n) to c[0]. */
    template <typename T>
    __global__ void dot_gpu_1(T* a, T* b, T* c, int n)
    {
    	__shared__ T tmp[threadnum]; // one partial sum slot per thread
    	const int tid = threadIdx.x; // thread index within the block
    	const int t_n = blockDim.x; // total threads launched in the block
    	int nTid = tid;
    	double dTemp = 0.0;
    	// Each thread strides through the input (tid, tid+t_n, ...) and
    	// accumulates its share in a double.
    	while (nTid < n)
    	{
    		dTemp += a[nTid] * b[nTid];
    		nTid += t_n;
    	}
    	tmp[tid] = dTemp; // publish this thread's partial sum to shared memory
    	__syncthreads(); // all partials must be written before reducing
    
    	// Interleaved reduction: each pass combines pairs j apart; only threads
    	// whose id is a multiple of i work, so warps diverge. The barrier sits
    	// outside the if and is reached uniformly by every thread each pass.
    	int i = 2, j = 1;
    	while (i <= threadnum)
    	{
    		if (tid % i == 0)
    		{
    			tmp[tid] += tmp[tid + j];
    		}
    		__syncthreads();
    		i *= 2;
    		j *= 2;
    	}
    	if (0 == tid)
    	{
    		c[0] = tmp[0]; // thread 0 holds the final sum
    	}
    }
    
    /* Single-block halving ("low-thread") reduction dot product.
       Launch as <<<1, threadnum>>>; assumes blockDim.x == threadnum (a power
       of two). Writes the full inner product of a[0..n) and b[0..n) to c[0]. */
    template <typename T>
    __global__ void dot_gpu_2(T* a, T* b, T* c, int n)
    {
    	__shared__ T tmp[threadnum]; // one partial sum slot per thread
    	const int nThreadIdX = threadIdx.x; // thread index within the block
    	const int nBlockDimX = blockDim.x; // total threads in the block
    	int nTid = nThreadIdX;
    	double dTemp = 0.0;
    	// Each thread strides through the input, accumulating in a double.
    	while (nTid < n)
    	{
    		dTemp += a[nTid] * b[nTid];
    		nTid += nBlockDimX;
    	}
    	tmp[nThreadIdX] = dTemp; // publish this thread's partial sum
    	__syncthreads(); // wait until every partial is visible
    
    	// Halving reduction: the active threads are the contiguous lower half
    	// each pass, which idles whole warps instead of lanes (less divergence
    	// than the interleaved scheme in dot_gpu_1).
    	int i = threadnum / 2;
    	while (i != 0)
    	{
    		if (nThreadIdX < i)
    		{
    			tmp[nThreadIdX] += tmp[nThreadIdX + i];
    		}
    		__syncthreads(); // barrier outside the if — reached by all threads
    		i /= 2;
    	}
    	if (0 == nThreadIdX)
    	{
    		c[0] = tmp[0]; // thread 0 writes the final result
    	}
    }
    
    /* Multi-block dot product: each block reduces its share and writes one
       partial result to c[blockIdx.x]. Assumes blockDim.x == threadnum (a
       power of two) and that c has room for gridDim.x elements; the host must
       sum the per-block partials afterwards to get the final dot product. */
    template <typename T>
    __global__ void dot_gpu_3(T* a, T* b, T* c, int n)
    {
    	__shared__ T aTmp[threadnum]; // per-block partial sums
    	const int nThreadIdX = threadIdx.x; // thread index within its block
    	const int nStep = gridDim.x * blockDim.x; // stride = total threads in the grid
    	int nTidIdx = blockIdx.x * blockDim.x + threadIdx.x; // global thread index
    
    	double dTemp = 0.0;
    	// Grid-stride loop: accumulate every nStep-th element in a double.
    	while (nTidIdx < n)
    	{
    		dTemp += a[nTidIdx] * b[nTidIdx];
    		nTidIdx += nStep;
    	}
    	aTmp[nThreadIdX] = dTemp; // publish partial to this block's shared memory
    	__syncthreads(); // all partials visible before the in-block reduction
    
    	// Halving reduction within the block (same scheme as dot_gpu_2).
    	int i = threadnum / 2;
    	while (i != 0)
    	{
    		if (nThreadIdX < i)
    		{
    			aTmp[nThreadIdX] += aTmp[nThreadIdX + i];
    		}
    		__syncthreads(); // reached uniformly by every thread in the block
    		i /= 2;
    	}
    
    	if (0 == nThreadIdX)
    	{
    		c[blockIdx.x] = aTmp[0]; // one partial result per block
    	}
    
    }
    
    
    // Driver: computes the dot product on the CPU (GPU variants left disabled).
    int main()
    {
    	float a[N], b[N];
    	float c = 0;
    	// a = 0..N-1, b = all ones, so dot(a, b) = N * (N - 1) / 2.
    	for (int i = 0; i < N; ++i)
    	{
    		a[i] = static_cast<float>(i);
    		b[i] = 1.0f;
    	}
    
    	float* d_a = 0, * d_b = 0, * d_c = 0;
    	cudaMalloc(&d_a, N * sizeof(float));
    	cudaMalloc(&d_b, N * sizeof(float));
    	// NOTE(review): d_c holds a single float — enlarge to gridDim.x floats
    	// before enabling dot_gpu_3 with more than one block.
    	cudaMalloc(&d_c, sizeof(float));
    	cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice);
    	cudaMemcpy(d_b, b, N * sizeof(float), cudaMemcpyHostToDevice);
    
    	dot_cpu(a, b, &c, N);
    	// GPU variants (each expects a single block of `threadnum` threads):
    	//dot_gpu_1 << <1, threadnum >> > (d_a, d_b, d_c, N);
    	//dot_gpu_2 << <1, threadnum >> > (d_a, d_b, d_c, N);
    	//dot_gpu_3<< <1, threadnum >> > (d_a, d_b, d_c, N);
    	//cudaMemcpy(&c, d_c, sizeof(float), cudaMemcpyDeviceToHost);
    	std::cout << c << std::endl;
    
    	cudaFree(d_a);
    	cudaFree(d_b);
    	cudaFree(d_c);
    	return 0;
    }
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54
    • 55
    • 56
    • 57
    • 58
    • 59
    • 60
    • 61
    • 62
    • 63
    • 64
    • 65
    • 66
    • 67
    • 68
    • 69
    • 70
    • 71
    • 72
    • 73
    • 74
    • 75
    • 76
    • 77
    • 78
    • 79
    • 80
    • 81
    • 82
    • 83
    • 84
    • 85
    • 86
    • 87
    • 88
    • 89
    • 90
    • 91
    • 92
    • 93
    • 94
    • 95
    • 96
    • 97
    • 98
    • 99
    • 100
    • 101
    • 102
    • 103
    • 104
    • 105
    • 106
    • 107
    • 108
    • 109
    • 110
    • 111
    • 112
    • 113
    • 114
    • 115
    • 116
    • 117
    • 118
    • 119
    • 120
    • 121
    • 122
    • 123
    • 124
    • 125
    • 126
    • 127
    • 128
    • 129
    • 130
    • 131
    • 132
    • 133
    • 134
    • 135
    • 136
    • 137
    • 138
    • 139
    • 140
    • 141
    • 142
    • 143
    • 144
    • 145
    • 146
    • 147
    • 148
    • 149
    • 150
    • 151
    • 152
    • 153
    • 154
    • 155
    • 156
    • 157
  • 相关阅读:
    Maven导入和引用本地包的方法
    Scala012--Scala中的常用集合函数及操作Ⅲ
    【错误记录】HarmonyOS 运行报错 ( Failure[MSG_ERR_INSTALL_FAILED_VERIFY_APP_PKCS7_FAIL] )
    cesium态势标会(距离测量 ---- 不可修改)
    STM32时钟系统配置程序源码深入分析
    C++ 即将超越 Java,TIOBE 6 月编程语言排行榜发布!
    MySQL主从复制(读写分离)
    分布式事务解决方案Seata
    74cms骑士人才招聘系统源码SE版 v3.16.0
    (题目练习)条件概率+权值线段树+FWT+后缀数组
  • 原文地址:https://blog.csdn.net/taifyang/article/details/127973940