neofetch && uname -a|lolcat
install nvidia GPU driver:
- sudo add-apt-repository ppa:graphics-drivers/ppa # 加入官方ppa源
- sudo apt update # 检查软件包更新列表
- apt list --upgradable # 查看可更新的软件包列表
- sudo apt upgrade # 更新所有可更新的软件包
- ubuntu-drivers devices # ubuntu检测n卡的可选驱动
- sudo apt install nvidia-driver-510 # 根据自己的n卡可选驱动下载显卡驱动
- ubuntu-drivers devices # ubuntu检测n卡的可选驱动
- sudo apt install nvidia-driver-510 # 根据自己的n卡可选驱动下载显卡驱动
Disable the nouveau driver by adding it to the module blacklist.
最后一行加上: blacklist nouveau
and execute:
- $ sudo update-initramfs -u
- $ reboot
reboot the system and execute the nvidia-smi:
The CUDA version shown in the nvidia-smi output does not mean that the CUDA environment has already been installed; it only indicates the CUDA version that this driver supports.
- sudo nvidia-settings # 更改Nvidia驱动设置
- nvidia-smi # 查看显卡基本信息
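- wget https://developer.download.nvidia.com/compute/cuda/repos/$distro/$arch/cuda-keyring_1.0-1_all.deb # substitute your own $distro/$arch, e.g. ubuntu2204/x86_64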
- sudo dpkg -i cuda-keyring_1.0-1_all.deb
- sudo apt-get update
- sudo apt-get -y install cuda
Add the environment variables in the bash shell:
- export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.7/lib64
- export PATH=$PATH:/usr/local/cuda-11.7/bin
- export CUDA_HOME=/usr/local/cuda-11.7 # CUDA_HOME is a single directory, not a ':'-separated list
test,printf on device:
- #include <cuda_runtime.h>
- #include <stdio.h>
- // Device helper: prints its own name, the source line and the x index of the calling block.
- __device__ void add(void)
- {
- const unsigned int block_x = blockIdx.x;
- printf("kernel %s line %d, i am in kernel thread in block %d.\n", __func__, __LINE__, block_x);
- }
- // Kernel: every thread prints the kernel name, the source line and its block x index, then calls add().
- __global__ void myfirstkernel(void)
- {
- const unsigned int block_x = blockIdx.x;
- printf("kernel %s line %d, i am in kernel thread in block %d.\n", __func__, __LINE__, block_x);
- add();
- }
- // Host entry point: launches myfirstkernel on 16 blocks of 1 thread, waits for the
- // device to finish, then prints "exit.".
- // Returns 0 on success, 1 if the launch or the kernel execution reported an error.
- int main(void)
- {
- cudaError_t err;
- myfirstkernel <<<16,1>>>();
- // A launch has no return value; launch errors are read back with cudaGetLastError().
- err = cudaGetLastError();
- // Errors raised while the kernel runs only surface at a synchronizing call.
- if (err == cudaSuccess) err = cudaDeviceSynchronize();
- if (err != cudaSuccess) {
- fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
- return 1;
- }
- printf("exit.\n");
- return 0;
- }
algo cuda sample:
- #include <cuda_runtime.h>
- #include <stdio.h>
- // Device helper: stores a + b through c, then prints its name, the source line,
- // the calling block's x index and the value read back from *c.
- __device__ void add(int a, int b, int *c)
- {
- const unsigned int block_x = blockIdx.x;
- c[0] = a + b;
- printf("kernel %s line %d, i am in kernel thread in block %d. *c = %d.\n", __func__, __LINE__, block_x, c[0]);
- }
- // Kernel: every thread prints the kernel name, the source line and its block x index,
- // then forwards (a, b, c) to add().
- __global__ void myfirstkernel(int a, int b, int *c)
- {
- const unsigned int block_x = blockIdx.x;
- printf("kernel %s line %d, i am in kernel thread in block %d.\n", __func__, __LINE__, block_x);
- add(a, b, c);
- }
- // Host entry point: allocates one int on the device, runs myfirstkernel(3, 6, gpu_c)
- // on 16 blocks of 1 thread, copies the result back and prints it.
- // Returns 0 on success, 1 if any CUDA call or the kernel itself failed.
- int main(void)
- {
- int c = 0;
- int *gpu_c = NULL;
- cudaError_t err = cudaMalloc((void **)&gpu_c, sizeof(int));
- if (err != cudaSuccess) {
- fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
- return 1;
- }
- myfirstkernel <<<16,1>>>(3, 6, gpu_c);
- // A launch has no return value; launch errors are read back with cudaGetLastError().
- err = cudaGetLastError();
- // Synchronize before the copy so kernel execution errors are reported on their own.
- if (err == cudaSuccess) err = cudaDeviceSynchronize();
- if (err == cudaSuccess) err = cudaMemcpy(&c, gpu_c, sizeof(int), cudaMemcpyDeviceToHost);
- // Release the buffer on both the success and the failure path.
- cudaFree(gpu_c);
- if (err != cudaSuccess) {
- fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
- return 1;
- }
- printf("exit.c = %d.\n", c);
- return 0;
- }
change thread and block
- #include <cuda_runtime.h>
- #include <stdio.h>
- // Device helper: stores a + b through c, then prints its name, the source line,
- // the calling thread's x index, its block's x index and the value read back from *c.
- __device__ void add(int a, int b, int *c)
- {
- const unsigned int thread_x = threadIdx.x;
- const unsigned int block_x = blockIdx.x;
- c[0] = a + b;
- printf("kernel %s line %d, i am in kernel thread %d in block %d. *c = %d.\n", __func__, __LINE__, thread_x, block_x, c[0]);
- }
- // Kernel: every thread prints the kernel name, the source line, its thread x index
- // and its block x index, then forwards (a, b, c) to add().
- __global__ void myfirstkernel(int a, int b, int *c)
- {
- const unsigned int thread_x = threadIdx.x;
- const unsigned int block_x = blockIdx.x;
- printf("kernel %s line %d, i am in kernel thread %d in block %d.\n", __func__, __LINE__, thread_x, block_x);
- add(a, b, c);
- }
- // Host entry point: allocates one int on the device, runs myfirstkernel(3, 6, gpu_c)
- // on 1 block of 16 threads, copies the result back and prints it.
- // Returns 0 on success, 1 if any CUDA call or the kernel itself failed.
- int main(void)
- {
- int c = 0;
- int *gpu_c = NULL;
- cudaError_t err = cudaMalloc((void **)&gpu_c, sizeof(int));
- if (err != cudaSuccess) {
- fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
- return 1;
- }
- myfirstkernel <<<1,16>>>(3, 6, gpu_c);
- // A launch has no return value; launch errors are read back with cudaGetLastError().
- err = cudaGetLastError();
- // Synchronize before the copy so kernel execution errors are reported on their own.
- if (err == cudaSuccess) err = cudaDeviceSynchronize();
- if (err == cudaSuccess) err = cudaMemcpy(&c, gpu_c, sizeof(int), cudaMemcpyDeviceToHost);
- // Release the buffer on both the success and the failure path.
- cudaFree(gpu_c);
- if (err != cudaSuccess) {
- fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
- return 1;
- }
- printf("exit.c = %d.\n", c);
- return 0;
- }
- #include <cuda_runtime.h>
- #include <stdio.h>
- // Device helper: stores a + b through c, then prints its name, the source line,
- // the calling thread's x index, its block's x index and the value read back from *c.
- __device__ void add(int a, int b, int *c)
- {
- const unsigned int thread_x = threadIdx.x;
- const unsigned int block_x = blockIdx.x;
- c[0] = a + b;
- printf("kernel %s line %d, i am in kernel thread %d in block %d. *c = %d.\n", __func__, __LINE__, thread_x, block_x, c[0]);
- }
- // Kernel: every thread prints who it is (thread x / block x), then the three
- // components of blockDim and of threadIdx, and finally forwards (a, b, c) to add().
- __global__ void myfirstkernel(int a, int b, int *c)
- {
- const unsigned int tx = threadIdx.x;
- const unsigned int ty = threadIdx.y;
- const unsigned int tz = threadIdx.z;
- printf("kernel %s line %d, i am in kernel thread %d in block %d.\n", __func__, __LINE__, tx, blockIdx.x);
- // Note: the "block.*" line reports the block dimensions (blockDim), not the block index.
- printf("block.x = %d, block.y = %d,block.z = %d\n", blockDim.x, blockDim.y, blockDim.z);
- printf("thread.x = %d, thread.y = %d,thread.z = %d\n", tx, ty, tz);
- add(a, b, c);
- }
- // Host entry point: allocates one int on the device, runs myfirstkernel(3, 6, gpu_c)
- // on 3 blocks of 16 threads, copies the result back and prints it.
- // Returns 0 on success, 1 if any CUDA call or the kernel itself failed.
- int main(void)
- {
- int c = 0;
- int *gpu_c = NULL;
- cudaError_t err = cudaMalloc((void **)&gpu_c, sizeof(int));
- if (err != cudaSuccess) {
- fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
- return 1;
- }
- myfirstkernel <<<3,16>>>(3, 6, gpu_c);
- // A launch has no return value; launch errors are read back with cudaGetLastError().
- err = cudaGetLastError();
- // Synchronize before the copy so kernel execution errors are reported on their own.
- if (err == cudaSuccess) err = cudaDeviceSynchronize();
- if (err == cudaSuccess) err = cudaMemcpy(&c, gpu_c, sizeof(int), cudaMemcpyDeviceToHost);
- // Release the buffer on both the success and the failure path.
- cudaFree(gpu_c);
- if (err != cudaSuccess) {
- fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
- return 1;
- }
- printf("exit.c = %d.\n", c);
- return 0;
- }
- czl@czl-RedmiBook-14:~/workspace/work$ ./a.out
- kernel myfirstkernel line 12, i am in kernel thread 0 in block 0.
- kernel myfirstkernel line 12, i am in kernel thread 1 in block 0.
- kernel myfirstkernel line 12, i am in kernel thread 2 in block 0.
- kernel myfirstkernel line 12, i am in kernel thread 3 in block 0.
- kernel myfirstkernel line 12, i am in kernel thread 4 in block 0.
- kernel myfirstkernel line 12, i am in kernel thread 5 in block 0.
- kernel myfirstkernel line 12, i am in kernel thread 6 in block 0.
- kernel myfirstkernel line 12, i am in kernel thread 7 in block 0.
- kernel myfirstkernel line 12, i am in kernel thread 8 in block 0.
- kernel myfirstkernel line 12, i am in kernel thread 9 in block 0.
- kernel myfirstkernel line 12, i am in kernel thread 10 in block 0.
- kernel myfirstkernel line 12, i am in kernel thread 11 in block 0.
- kernel myfirstkernel line 12, i am in kernel thread 12 in block 0.
- kernel myfirstkernel line 12, i am in kernel thread 13 in block 0.
- kernel myfirstkernel line 12, i am in kernel thread 14 in block 0.
- kernel myfirstkernel line 12, i am in kernel thread 15 in block 0.
- kernel myfirstkernel line 12, i am in kernel thread 0 in block 1.
- kernel myfirstkernel line 12, i am in kernel thread 1 in block 1.
- kernel myfirstkernel line 12, i am in kernel thread 2 in block 1.
- kernel myfirstkernel line 12, i am in kernel thread 3 in block 1.
- kernel myfirstkernel line 12, i am in kernel thread 4 in block 1.
- kernel myfirstkernel line 12, i am in kernel thread 5 in block 1.
- kernel myfirstkernel line 12, i am in kernel thread 6 in block 1.
- kernel myfirstkernel line 12, i am in kernel thread 7 in block 1.
- kernel myfirstkernel line 12, i am in kernel thread 8 in block 1.
- kernel myfirstkernel line 12, i am in kernel thread 9 in block 1.
- kernel myfirstkernel line 12, i am in kernel thread 10 in block 1.
- kernel myfirstkernel line 12, i am in kernel thread 11 in block 1.
- kernel myfirstkernel line 12, i am in kernel thread 12 in block 1.
- kernel myfirstkernel line 12, i am in kernel thread 13 in block 1.
- kernel myfirstkernel line 12, i am in kernel thread 14 in block 1.
- kernel myfirstkernel line 12, i am in kernel thread 15 in block 1.
- kernel myfirstkernel line 12, i am in kernel thread 0 in block 2.
- kernel myfirstkernel line 12, i am in kernel thread 1 in block 2.
- kernel myfirstkernel line 12, i am in kernel thread 2 in block 2.
- kernel myfirstkernel line 12, i am in kernel thread 3 in block 2.
- kernel myfirstkernel line 12, i am in kernel thread 4 in block 2.
- kernel myfirstkernel line 12, i am in kernel thread 5 in block 2.
- kernel myfirstkernel line 12, i am in kernel thread 6 in block 2.
- kernel myfirstkernel line 12, i am in kernel thread 7 in block 2.
- kernel myfirstkernel line 12, i am in kernel thread 8 in block 2.
- kernel myfirstkernel line 12, i am in kernel thread 9 in block 2.
- kernel myfirstkernel line 12, i am in kernel thread 10 in block 2.
- kernel myfirstkernel line 12, i am in kernel thread 11 in block 2.
- kernel myfirstkernel line 12, i am in kernel thread 12 in block 2.
- kernel myfirstkernel line 12, i am in kernel thread 13 in block 2.
- kernel myfirstkernel line 12, i am in kernel thread 14 in block 2.
- kernel myfirstkernel line 12, i am in kernel thread 15 in block 2.
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- block.x = 16, block.y = 1,block.z = 1
- thread.x = 0, thread.y = 0,thread.z = 0
- thread.x = 1, thread.y = 0,thread.z = 0
- thread.x = 2, thread.y = 0,thread.z = 0
- thread.x = 3, thread.y = 0,thread.z = 0
- thread.x = 4, thread.y = 0,thread.z = 0
- thread.x = 5, thread.y = 0,thread.z = 0
- thread.x = 6, thread.y = 0,thread.z = 0
- thread.x = 7, thread.y = 0,thread.z = 0
- thread.x = 8, thread.y = 0,thread.z = 0
- thread.x = 9, thread.y = 0,thread.z = 0
- thread.x = 10, thread.y = 0,thread.z = 0
- thread.x = 11, thread.y = 0,thread.z = 0
- thread.x = 12, thread.y = 0,thread.z = 0
- thread.x = 13, thread.y = 0,thread.z = 0
- thread.x = 14, thread.y = 0,thread.z = 0
- thread.x = 15, thread.y = 0,thread.z = 0
- thread.x = 0, thread.y = 0,thread.z = 0
- thread.x = 1, thread.y = 0,thread.z = 0
- thread.x = 2, thread.y = 0,thread.z = 0
- thread.x = 3, thread.y = 0,thread.z = 0
- thread.x = 4, thread.y = 0,thread.z = 0
- thread.x = 5, thread.y = 0,thread.z = 0
- thread.x = 6, thread.y = 0,thread.z = 0
- thread.x = 7, thread.y = 0,thread.z = 0
- thread.x = 8, thread.y = 0,thread.z = 0
- thread.x = 9, thread.y = 0,thread.z = 0
- thread.x = 10, thread.y = 0,thread.z = 0
- thread.x = 11, thread.y = 0,thread.z = 0
- thread.x = 12, thread.y = 0,thread.z = 0
- thread.x = 13, thread.y = 0,thread.z = 0
- thread.x = 14, thread.y = 0,thread.z = 0
- thread.x = 15, thread.y = 0,thread.z = 0
- thread.x = 0, thread.y = 0,thread.z = 0
- thread.x = 1, thread.y = 0,thread.z = 0
- thread.x = 2, thread.y = 0,thread.z = 0
- thread.x = 3, thread.y = 0,thread.z = 0
- thread.x = 4, thread.y = 0,thread.z = 0
- thread.x = 5, thread.y = 0,thread.z = 0
- thread.x = 6, thread.y = 0,thread.z = 0
- thread.x = 7, thread.y = 0,thread.z = 0
- thread.x = 8, thread.y = 0,thread.z = 0
- thread.x = 9, thread.y = 0,thread.z = 0
- thread.x = 10, thread.y = 0,thread.z = 0
- thread.x = 11, thread.y = 0,thread.z = 0
- thread.x = 12, thread.y = 0,thread.z = 0
- thread.x = 13, thread.y = 0,thread.z = 0
- thread.x = 14, thread.y = 0,thread.z = 0
- thread.x = 15, thread.y = 0,thread.z = 0
- kernel add line 7, i am in kernel thread 0 in block 0. *c = 9.
- kernel add line 7, i am in kernel thread 1 in block 0. *c = 9.
- kernel add line 7, i am in kernel thread 2 in block 0. *c = 9.
- kernel add line 7, i am in kernel thread 3 in block 0. *c = 9.
- kernel add line 7, i am in kernel thread 4 in block 0. *c = 9.
- kernel add line 7, i am in kernel thread 5 in block 0. *c = 9.
- kernel add line 7, i am in kernel thread 6 in block 0. *c = 9.
- kernel add line 7, i am in kernel thread 7 in block 0. *c = 9.
- kernel add line 7, i am in kernel thread 8 in block 0. *c = 9.
- kernel add line 7, i am in kernel thread 9 in block 0. *c = 9.
- kernel add line 7, i am in kernel thread 10 in block 0. *c = 9.
- kernel add line 7, i am in kernel thread 11 in block 0. *c = 9.
- kernel add line 7, i am in kernel thread 12 in block 0. *c = 9.
- kernel add line 7, i am in kernel thread 13 in block 0. *c = 9.
- kernel add line 7, i am in kernel thread 14 in block 0. *c = 9.
- kernel add line 7, i am in kernel thread 15 in block 0. *c = 9.
- kernel add line 7, i am in kernel thread 0 in block 2. *c = 9.
- kernel add line 7, i am in kernel thread 1 in block 2. *c = 9.
- kernel add line 7, i am in kernel thread 2 in block 2. *c = 9.
- kernel add line 7, i am in kernel thread 3 in block 2. *c = 9.
- kernel add line 7, i am in kernel thread 4 in block 2. *c = 9.
- kernel add line 7, i am in kernel thread 5 in block 2. *c = 9.
- kernel add line 7, i am in kernel thread 6 in block 2. *c = 9.
- kernel add line 7, i am in kernel thread 7 in block 2. *c = 9.
- kernel add line 7, i am in kernel thread 8 in block 2. *c = 9.
- kernel add line 7, i am in kernel thread 9 in block 2. *c = 9.
- kernel add line 7, i am in kernel thread 10 in block 2. *c = 9.
- kernel add line 7, i am in kernel thread 11 in block 2. *c = 9.
- kernel add line 7, i am in kernel thread 12 in block 2. *c = 9.
- kernel add line 7, i am in kernel thread 13 in block 2. *c = 9.
- kernel add line 7, i am in kernel thread 14 in block 2. *c = 9.
- kernel add line 7, i am in kernel thread 15 in block 2. *c = 9.
- kernel add line 7, i am in kernel thread 0 in block 1. *c = 9.
- kernel add line 7, i am in kernel thread 1 in block 1. *c = 9.
- kernel add line 7, i am in kernel thread 2 in block 1. *c = 9.
- kernel add line 7, i am in kernel thread 3 in block 1. *c = 9.
- kernel add line 7, i am in kernel thread 4 in block 1. *c = 9.
- kernel add line 7, i am in kernel thread 5 in block 1. *c = 9.
- kernel add line 7, i am in kernel thread 6 in block 1. *c = 9.
- kernel add line 7, i am in kernel thread 7 in block 1. *c = 9.
- kernel add line 7, i am in kernel thread 8 in block 1. *c = 9.
- kernel add line 7, i am in kernel thread 9 in block 1. *c = 9.
- kernel add line 7, i am in kernel thread 10 in block 1. *c = 9.
- kernel add line 7, i am in kernel thread 11 in block 1. *c = 9.
- kernel add line 7, i am in kernel thread 12 in block 1. *c = 9.
- kernel add line 7, i am in kernel thread 13 in block 1. *c = 9.
- kernel add line 7, i am in kernel thread 14 in block 1. *c = 9.
- kernel add line 7, i am in kernel thread 15 in block 1. *c = 9.
- exit.c = 9.
- czl@czl-RedmiBook-14:~/workspace/work$
- #include <cuda_runtime.h>
- #include <stdio.h>
- // Device helper: stores a + b through c, then prints its name, the source line,
- // the calling thread's x index, all three blockIdx components and the value read back from *c.
- __device__ void add(int a, int b, int *c)
- {
- const unsigned int thread_x = threadIdx.x;
- const unsigned int bx = blockIdx.x;
- const unsigned int by = blockIdx.y;
- const unsigned int bz = blockIdx.z;
- c[0] = a + b;
- printf("kernel %s line %d, i am in kernel thread %d in blockidx.x %d. blokidx.y %d blockidx.z %d *c = %d.\n", __func__, __LINE__, thread_x, bx, by, bz, c[0]);
- }
- // Kernel: every thread prints who it is (thread x / block x), then the three
- // components of blockDim, threadIdx and gridDim, and finally forwards (a, b, c) to add().
- __global__ void myfirstkernel(int a, int b, int *c)
- {
- const unsigned int tx = threadIdx.x;
- const unsigned int ty = threadIdx.y;
- const unsigned int tz = threadIdx.z;
- printf("kernel %s line %d, i am in kernel thread %d in block %d.\n", __func__, __LINE__, tx, blockIdx.x);
- // Note: the "block.*" line reports the block dimensions (blockDim), not the block index.
- printf("block.x = %d, block.y = %d,block.z = %d\n", blockDim.x, blockDim.y, blockDim.z);
- printf("thread.x = %d, thread.y = %d,thread.z = %d\n", tx, ty, tz);
- printf("gridDim.x = %d, gridDim.y = %d,gridDim.z = %d\n", gridDim.x, gridDim.y, gridDim.z);
- add(a, b, c);
- }
- // Host entry point: allocates one int on the device, runs myfirstkernel(3, 6, gpu_c)
- // on 2 blocks of 3 threads, copies the result back and prints it.
- // Returns 0 on success, 1 if any CUDA call or the kernel itself failed.
- int main(void)
- {
- int c = 0;
- int *gpu_c = NULL;
- cudaError_t err = cudaMalloc((void **)&gpu_c, sizeof(int));
- if (err != cudaSuccess) {
- fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
- return 1;
- }
- myfirstkernel <<<2,3>>>(3, 6, gpu_c);
- // A launch has no return value; launch errors are read back with cudaGetLastError().
- err = cudaGetLastError();
- // Synchronize before the copy so kernel execution errors are reported on their own.
- if (err == cudaSuccess) err = cudaDeviceSynchronize();
- if (err == cudaSuccess) err = cudaMemcpy(&c, gpu_c, sizeof(int), cudaMemcpyDeviceToHost);
- // Release the buffer on both the success and the failure path.
- cudaFree(gpu_c);
- if (err != cudaSuccess) {
- fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
- return 1;
- }
- printf("exit.c = %d.\n", c);
- return 0;
- }
Kernel call convention:
The kernel invocation convention is:
dim3 gridSize(3,2,1);
dim3 blockSize(2,2,2);
那么执行的线程是如何知道自己在GRID,BLOCK,THREAD中的位置的呢?一种更简单的方案是让每个线程把自己的X索引(也就是threadIdx.x)记录下来,线程索引是线程在每个BLOCK里的索引,由于BLOCK的尺寸是三维的,因此线程索引也是一个三元常向量,threadIdx,访问方式为:threadIdx.x, threadIdx.y, threadIdx.z。对于一个BLOCK来说,它其中的每个线程的索引是唯一的,但是当一个GRID中有两个以上的BLOCK时,其中就会出现重复的线程索引,相应的,每个GRID里面的BLOCK也有唯一的BLOCK索引,用blockIdx表示,它同样是一个三维常向量,blockIdx.x, blockIdx.y, blockIdx.z。由于一次函数调用中只有一个GRID,因此不存在GRID索引。
对于BLOCK和GRID的尺寸,也用相应的三维常向量来表示,BLOCK的尺寸保存在常向量blockDim中,GRID的尺寸保存在gridDim中,他们都是CUDA C的内建变量,可以直接在设备代码中使用,在计算中,用户常常要给每个线程一个唯一的标识符,即线程号,以便给每个线程分配不同的任务。在多个BLOCK的情况下,线程号也不能重复。线程号在实际使用中很重要,它关系到被处理的数据在输入数组中的位置,也关系到线程的分配和存储器的使用问题。
- // Flat thread id built from the x components only: blockIdx.x * blockDim.x + threadIdx.x.
- __device__ int get_globalidx_1d_1d(void)
- {
- const unsigned int block_base = blockDim.x * blockIdx.x;
- return block_base + threadIdx.x;
- }
- // Flat thread id: blockIdx.x selects the block; threadIdx.(x, y) is linearized inside it, x varying fastest.
- __device__ int get_globalidx_1d_2d(void)
- {
- const unsigned int block_size = blockDim.x * blockDim.y;
- const unsigned int local_id = threadIdx.y * blockDim.x + threadIdx.x;
- return blockIdx.x * block_size + local_id;
- }
- // Flat thread id: blockIdx.x selects the block; threadIdx.(x, y, z) is linearized inside it, x varying fastest.
- __device__ int get_globalidx_1d_3d(void)
- {
- const unsigned int block_size = blockDim.x * blockDim.y * blockDim.z;
- const unsigned int local_id = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
- return blockIdx.x * block_size + local_id;
- }
- // Flat thread id: blockIdx.(x, y) is linearized over gridDim.x (x varying fastest),
- // then scaled by blockDim.x and offset by threadIdx.x.
- __device__ int get_globalidx_2d_1d(void)
- {
- const unsigned int block_rank = blockIdx.y * gridDim.x + blockIdx.x;
- return block_rank * blockDim.x + threadIdx.x;
- }
- // Flat thread id: blockIdx.(x, y) is linearized over gridDim.x, and threadIdx.(x, y)
- // is linearized over blockDim.x; x varies fastest in both.
- __device__ int get_globalidx_2d_2d(void)
- {
- const unsigned int block_rank = blockIdx.y * gridDim.x + blockIdx.x;
- const unsigned int block_size = blockDim.x * blockDim.y;
- const unsigned int local_id = threadIdx.y * blockDim.x + threadIdx.x;
- return block_rank * block_size + local_id;
- }
- // Flat thread id: blockIdx.(x, y) is linearized over gridDim.x, and threadIdx.(x, y, z)
- // is linearized over blockDim.(x, y); x varies fastest in both.
- __device__ int get_globalidx_2d_3d(void)
- {
- const unsigned int block_rank = blockIdx.y * gridDim.x + blockIdx.x;
- const unsigned int block_size = blockDim.x * blockDim.y * blockDim.z;
- const unsigned int local_id = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
- return block_rank * block_size + local_id;
- }
- // Flat thread id: blockIdx.(x, y, z) is linearized over gridDim.(x, y) (x varying fastest),
- // then scaled by blockDim.x and offset by threadIdx.x.
- __device__ int get_globalidx_3d_1d(void)
- {
- const unsigned int block_rank = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
- return block_rank * blockDim.x + threadIdx.x;
- }
- // Flat thread id: blockIdx.(x, y, z) is linearized over gridDim.(x, y), and threadIdx.(x, y)
- // is linearized over blockDim.x; x varies fastest in both.
- __device__ int get_globalidx_3d_2d(void)
- {
- const unsigned int block_rank = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
- const unsigned int block_size = blockDim.x * blockDim.y;
- const unsigned int local_id = threadIdx.y * blockDim.x + threadIdx.x;
- return block_rank * block_size + local_id;
- }
- // Flat thread id: blockIdx.(x, y, z) is linearized over gridDim.(x, y), and threadIdx.(x, y, z)
- // is linearized over blockDim.(x, y); x varies fastest in both.
- __device__ int get_globalidx_3d_3d(void)
- {
- const unsigned int block_rank = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
- const unsigned int block_size = blockDim.x * blockDim.y * blockDim.z;
- const unsigned int local_id = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
- return block_rank * block_size + local_id;
- }
- #include <cuda_runtime.h>
- #include <stdio.h>
- // Device helper: stores a + b through c, then prints its name, the source line,
- // the calling thread's x index, all three blockIdx components and the value read back from *c.
- __device__ void add(int a, int b, int *c)
- {
- const unsigned int thread_x = threadIdx.x;
- const unsigned int bx = blockIdx.x;
- const unsigned int by = blockIdx.y;
- const unsigned int bz = blockIdx.z;
- c[0] = a + b;
- printf("kernel %s line %d, i am in kernel thread %d in blockidx.x %d. blokidx.y %d blockidx.z %d *c = %d.\n", __func__, __LINE__, thread_x, bx, by, bz, c[0]);
- }
- // Kernel: every thread prints who it is (thread x / block x), then the three
- // components of blockDim, threadIdx and gridDim, and finally forwards (a, b, c) to add().
- __global__ void myfirstkernel(int a, int b, int *c)
- {
- const unsigned int tx = threadIdx.x;
- const unsigned int ty = threadIdx.y;
- const unsigned int tz = threadIdx.z;
- printf("kernel %s line %d, i am in kernel thread %d in block %d.\n", __func__, __LINE__, tx, blockIdx.x);
- // Note: the "block.*" line reports the block dimensions (blockDim), not the block index.
- printf("block.x = %d, block.y = %d,block.z = %d\n", blockDim.x, blockDim.y, blockDim.z);
- printf("thread.x = %d, thread.y = %d,thread.z = %d\n", tx, ty, tz);
- printf("gridDim.x = %d, gridDim.y = %d,gridDim.z = %d\n", gridDim.x, gridDim.y, gridDim.z);
- add(a, b, c);
- }
- // Flat thread id built from the x components only: blockIdx.x * blockDim.x + threadIdx.x.
- __device__ int get_globalidx_1d_1d(void)
- {
- const unsigned int block_base = blockDim.x * blockIdx.x;
- return block_base + threadIdx.x;
- }
- // Flat thread id: blockIdx.x selects the block; threadIdx.(x, y) is linearized inside it, x varying fastest.
- __device__ int get_globalidx_1d_2d(void)
- {
- const unsigned int block_size = blockDim.x * blockDim.y;
- const unsigned int local_id = threadIdx.y * blockDim.x + threadIdx.x;
- return blockIdx.x * block_size + local_id;
- }
- // Flat thread id: blockIdx.x selects the block; threadIdx.(x, y, z) is linearized inside it, x varying fastest.
- __device__ int get_globalidx_1d_3d(void)
- {
- const unsigned int block_size = blockDim.x * blockDim.y * blockDim.z;
- const unsigned int local_id = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
- return blockIdx.x * block_size + local_id;
- }
- // Flat thread id: blockIdx.(x, y) is linearized over gridDim.x (x varying fastest),
- // then scaled by blockDim.x and offset by threadIdx.x.
- __device__ int get_globalidx_2d_1d(void)
- {
- const unsigned int block_rank = blockIdx.y * gridDim.x + blockIdx.x;
- return block_rank * blockDim.x + threadIdx.x;
- }
- // Flat thread id: blockIdx.(x, y) is linearized over gridDim.x, and threadIdx.(x, y)
- // is linearized over blockDim.x; x varies fastest in both.
- __device__ int get_globalidx_2d_2d(void)
- {
- const unsigned int block_rank = blockIdx.y * gridDim.x + blockIdx.x;
- const unsigned int block_size = blockDim.x * blockDim.y;
- const unsigned int local_id = threadIdx.y * blockDim.x + threadIdx.x;
- return block_rank * block_size + local_id;
- }
- // Flat thread id: blockIdx.(x, y) is linearized over gridDim.x, and threadIdx.(x, y, z)
- // is linearized over blockDim.(x, y); x varies fastest in both.
- __device__ int get_globalidx_2d_3d(void)
- {
- const unsigned int block_rank = blockIdx.y * gridDim.x + blockIdx.x;
- const unsigned int block_size = blockDim.x * blockDim.y * blockDim.z;
- const unsigned int local_id = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
- return block_rank * block_size + local_id;
- }
- // Flat thread id: blockIdx.(x, y, z) is linearized over gridDim.(x, y) (x varying fastest),
- // then scaled by blockDim.x and offset by threadIdx.x.
- __device__ int get_globalidx_3d_1d(void)
- {
- const unsigned int block_rank = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
- return block_rank * blockDim.x + threadIdx.x;
- }
- // Flat thread id: blockIdx.(x, y, z) is linearized over gridDim.(x, y), and threadIdx.(x, y)
- // is linearized over blockDim.x; x varies fastest in both.
- __device__ int get_globalidx_3d_2d(void)
- {
- const unsigned int block_rank = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
- const unsigned int block_size = blockDim.x * blockDim.y;
- const unsigned int local_id = threadIdx.y * blockDim.x + threadIdx.x;
- return block_rank * block_size + local_id;
- }
- // Flat thread id: blockIdx.(x, y, z) is linearized over gridDim.(x, y), and threadIdx.(x, y, z)
- // is linearized over blockDim.(x, y); x varies fastest in both.
- __device__ int get_globalidx_3d_3d(void)
- {
- const unsigned int block_rank = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
- const unsigned int block_size = blockDim.x * blockDim.y * blockDim.z;
- const unsigned int local_id = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
- return block_rank * block_size + local_id;
- }
- // Host entry point: allocates one int on the device, runs myfirstkernel(3, 6, gpu_c)
- // on 2 blocks of 3 threads, copies the result back and prints it.
- // Returns 0 on success, 1 if any CUDA call or the kernel itself failed.
- __host__ int main(void)
- {
- int c = 0;
- int *gpu_c = NULL;
- cudaError_t err = cudaMalloc((void **)&gpu_c, sizeof(int));
- if (err != cudaSuccess) {
- fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
- return 1;
- }
- myfirstkernel <<<2,3>>>(3, 6, gpu_c);
- // A launch has no return value; launch errors are read back with cudaGetLastError().
- err = cudaGetLastError();
- // Synchronize before the copy so kernel execution errors are reported on their own.
- if (err == cudaSuccess) err = cudaDeviceSynchronize();
- if (err == cudaSuccess) err = cudaMemcpy(&c, gpu_c, sizeof(int), cudaMemcpyDeviceToHost);
- // Release the buffer on both the success and the failure path.
- cudaFree(gpu_c);
- if (err != cudaSuccess) {
- fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
- return 1;
- }
- printf("exit.c = %d.\n", c);
- return 0;
- }
nvcc -g -G
- --debug (-g)
- Generate debug information for host code.
- --device-debug (-G)
- Generate debug information for device code. If --dopt is not specified, then
- turns off all optimizations. Don't use for profiling; use -lineinfo instead.
- czl@czl-RedmiBook-14:~/workspace/work$ cuda-gdb a.out
- NVIDIA (R) CUDA Debugger
- 11.7 release
- Portions Copyright (C) 2007-2022 NVIDIA Corporation
- GNU gdb (GDB) 10.2
- Copyright (C) 2021 Free Software Foundation, Inc.
- License GPLv3+: GNU GPL version 3 or later <>
- This is free software: you are free to change and redistribute it.
- There is NO WARRANTY, to the extent permitted by law.
- Type "show copying" and "show warranty" for details.
- This GDB was configured as "x86_64-pc-linux-gnu".
- Type "show configuration" for configuration details.
- For bug reporting instructions, please see:
- <>.
- Find the GDB manual and other documentation resources online at:
- <>.
- For help, type "help".
- Type "apropos word" to search for commands related to "word"...
- Reading symbols from a.out...
- (cuda-gdb) b myfirstkernel(int, int, int*)
- Breakpoint 1 at 0xa3e3: file /home/czl/workspace/work/, line 11.
- (cuda-gdb) r
- Starting program: /home/czl/workspace/work/a.out
- [Thread debugging using libthread_db enabled]
- Using host libthread_db library "/lib/x86_64-linux-gnu/".
- [Detaching after fork from child process 38487]
- [New Thread 0x7fffdffff000 (LWP 38492)]
- [New Thread 0x7fffdf7fe000 (LWP 38493)]
- [New Thread 0x7fffdeffd000 (LWP 38494)]
- [Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 0, lane 0]
- Thread 1 "a.out" hit Breakpoint 1, myfirstkernel<<<(2,1,1),(3,1,1)>>> (a=3, b=6, c=0x7fffcf000000) at
- 12 printf("kernel %s line %d, i am in kernel thread %d in block %d.\n", __func__, __LINE__,threadIdx.x, blockIdx.x);
- (cuda-gdb) info cuda threads
- BlockIdx ThreadIdx To BlockIdx To ThreadIdx Count Virtual PC Filename Line
- Kernel 0
- * (0,0,0) (0,0,0) (1,0,0) (2,0,0) 6 0x0000555555a31628 12
- (cuda-gdb)
- BlockIdx ThreadIdx To BlockIdx To ThreadIdx Count Virtual PC Filename Line
- Kernel 0
- * (0,0,0) (0,0,0) (1,0,0) (2,0,0) 6 0x0000555555a31628 12
- (cuda-gdb)
- (cuda-gdb) info threads
- Id Target Id Frame
- 1 Thread 0x7ffff7d7d000 (LWP 38482) "a.out" 0x00007ffff6ce679c in ?? () from /lib/x86_64-linux-gnu/
- 2 Thread 0x7fffdffff000 (LWP 38492) "cuda-EvtHandlr" 0x00007ffff7e98d7f in poll () from /lib/x86_64-linux-gnu/
- 3 Thread 0x7fffdf7fe000 (LWP 38493) "cuda-EvtHandlr" 0x00007ffff7e98d7f in poll () from /lib/x86_64-linux-gnu/
- 4 Thread 0x7fffdeffd000 (LWP 38494) "a.out" 0x00007ffff7e11197 in ?? () from /lib/x86_64-linux-gnu/
- (cuda-gdb)
- Id Target Id Frame
- 1 Thread 0x7ffff7d7d000 (LWP 38482) "a.out" 0x00007ffff6ce679c in ?? () from /lib/x86_64-linux-gnu/
- 2 Thread 0x7fffdffff000 (LWP 38492) "cuda-EvtHandlr" 0x00007ffff7e98d7f in poll () from /lib/x86_64-linux-gnu/
- 3 Thread 0x7fffdf7fe000 (LWP 38493) "cuda-EvtHandlr" 0x00007ffff7e98d7f in poll () from /lib/x86_64-linux-gnu/
- 4 Thread 0x7fffdeffd000 (LWP 38494) "a.out" 0x00007ffff7e11197 in ?? () from /lib/x86_64-linux-gnu/
- (cuda-gdb)
- (cuda-gdb) info cuda
- blocks devices lanes launch trace managed threads
- contexts kernels launch children line sms warps
- (cuda-gdb) info cuda devices
- Dev PCI Bus/Dev ID Name Description SM Type SMs Warps/SM Lanes/Warp Max Regs/Lane Active SMs Mask
- * 0 02:00.0 NVIDIA GeForce MX250 GP108-A sm_61 3 64 32 256 0x00000000000000000000000000000003
- (cuda-gdb) info cuda devices
- Dev PCI Bus/Dev ID Name Description SM Type SMs Warps/SM Lanes/Warp Max Regs/Lane Active SMs Mask
- * 0 02:00.0 NVIDIA GeForce MX250 GP108-A sm_61 3 64 32 256 0x00000000000000000000000000000003
- (cuda-gdb) info cuda blocks
- BlockIdx To BlockIdx Count State
- Kernel 0
- * (0,0,0) (1,0,0) 2 running
- (cuda-gdb) info cuda lanes
- Ln State Physical PC ThreadIdx Exception
- Device 0 SM 0 Warp 0
- * 0 active 0x0000000000000188 (0,0,0) None
- 1 active 0x0000000000000188 (1,0,0) None
- 2 active 0x0000000000000188 (2,0,0) None
- (cuda-gdb) info cuda threads
- BlockIdx ThreadIdx To BlockIdx To ThreadIdx Count Virtual PC Filename Line
- Kernel 0
- * (0,0,0) (0,0,0) (1,0,0) (2,0,0) 6 0x0000555555a31628 12
- (cuda-gdb) info cuda sms
- SM Active Warps Mask
- Device 0
- * 0 0x0000000000000001
- 1 0x0000000000000001
- (cuda-gdb) info cuda kernels
- Kernel Parent Dev Grid Status SMs Mask GridDim BlockDim Invocation
- * 0 - 0 1 Active 0x00000003 (2,1,1) (3,1,1) myfirstkernel(a=3, b=6, c=0x7fffcf000000)
- (cuda-gdb) info cuda contexts
- Context Dev State
- * 0x0000555555626930 0 active
- (cuda-gdb) info cuda managed
- Static managed variables on device 0 are:
- (cuda-gdb)
readelf -S a.out
Notice the .nv_fatbin section: it contains the binaries for all the target device ISAs bundled together, which is why it is called a "fat binary".
objdump -C -d -j .nv_fatbin a.out|more
So you can see that the device-target code is obviously included in the bundled fat binary, embedded in the host executable.
- czl@czl-RedmiBook-14:~/workspace/work$ readelf -C -s a.out |grep get_globalidx_3d
- 2973: 000000000000a0fc 29 FUNC GLOBAL DEFAULT 15 get_globalidx_3d[...]
- 3101: 000000000000a0c2 29 FUNC GLOBAL DEFAULT 15 get_globalidx_3d[...]
- 3153: 000000000000a0df 29 FUNC GLOBAL DEFAULT 15 get_globalidx_3d[...]
- czl@czl-RedmiBook-14:~/workspace/work$
从上图可以看出,即便是device端的代码,编译器也为其产生符号,binutils 工具中使用-C对 C++中的经过改编的名字进行反改编。
- czl@czl-RedmiBook-14:~/workspace/new$ nvcc --fatbin test.cu
- czl@czl-RedmiBook-14:~/workspace/new$ ls
- a.out test.fatbin
- czl@czl-RedmiBook-14:~/workspace/new$
the answer is yes.
- #include <cuda_runtime.h>
- #include <stdio.h>
- // Recursion demo: stores a + b through c, prints the calling thread's x index, all three
- // blockIdx components and the value read back from *c, then calls itself with b + 1.
- // depth is the current recursion level; callers omit it and start at 0.
- // The original recursed unconditionally, so it could only stop by exhausting the
- // per-thread device stack; the recursion is now cut off after kMaxDepth levels.
- __device__ void add(int a, int b, int *c, int depth = 0)
- {
- const int kMaxDepth = 8;
- *c = a + b;
- printf("kernel %s line %d, i am in kernel thread %d in blockidx.x %d. blokidx.y %d blockidx.z %d *c = %d.\n", __func__, __LINE__, threadIdx.x, blockIdx.x, blockIdx.y,blockIdx.z,*c);
- // Base case: stop once kMaxDepth levels (depth 0 .. kMaxDepth - 1) have printed.
- if (depth + 1 >= kMaxDepth) return;
- add(a, b + 1, c, depth + 1);
- }
- // Kernel for the recursion demo: every launched thread only forwards (a, b, c) to add().
- __global__ void myfirstkernel(int a, int b, int *c)
- {
- // Tracing of the thread/block/grid built-ins is switched off in this variant:
- //printf("kernel %s line %d, i am in kernel thread %d in block %d.\n", __func__, __LINE__,threadIdx.x, blockIdx.x);
- //printf("block.x = %d, block.y = %d,block.z = %d\n", blockDim.x, blockDim.y,blockDim.z);
- //printf("thread.x = %d, thread.y = %d,thread.z = %d\n", threadIdx.x, threadIdx.y,threadIdx.z);
- //printf("gridDim.x = %d, gridDim.y = %d,gridDim.z = %d\n", gridDim.x, gridDim.y,gridDim.z);
- add(a, b, c);
- return;
- }
- // Flat thread id built from the x components only: blockIdx.x * blockDim.x + threadIdx.x.
- __device__ int get_globalidx_1d_1d(void)
- {
- const unsigned int block_base = blockDim.x * blockIdx.x;
- return block_base + threadIdx.x;
- }
- // Flat thread id: blockIdx.x selects the block; threadIdx.(x, y) is linearized inside it, x varying fastest.
- __device__ int get_globalidx_1d_2d(void)
- {
- const unsigned int block_size = blockDim.x * blockDim.y;
- const unsigned int local_id = threadIdx.y * blockDim.x + threadIdx.x;
- return blockIdx.x * block_size + local_id;
- }
- // Flat thread id: blockIdx.x selects the block; threadIdx.(x, y, z) is linearized inside it, x varying fastest.
- __device__ int get_globalidx_1d_3d(void)
- {
- const unsigned int block_size = blockDim.x * blockDim.y * blockDim.z;
- const unsigned int local_id = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
- return blockIdx.x * block_size + local_id;
- }
- // Flat thread id: blockIdx.(x, y) is linearized over gridDim.x (x varying fastest),
- // then scaled by blockDim.x and offset by threadIdx.x.
- __device__ int get_globalidx_2d_1d(void)
- {
- const unsigned int block_rank = blockIdx.y * gridDim.x + blockIdx.x;
- return block_rank * blockDim.x + threadIdx.x;
- }
- // Flat thread id: blockIdx.(x, y) is linearized over gridDim.x, and threadIdx.(x, y)
- // is linearized over blockDim.x; x varies fastest in both.
- __device__ int get_globalidx_2d_2d(void)
- {
- const unsigned int block_rank = blockIdx.y * gridDim.x + blockIdx.x;
- const unsigned int block_size = blockDim.x * blockDim.y;
- const unsigned int local_id = threadIdx.y * blockDim.x + threadIdx.x;
- return block_rank * block_size + local_id;
- }
- // Flat thread id: blockIdx.(x, y) is linearized over gridDim.x, and threadIdx.(x, y, z)
- // is linearized over blockDim.(x, y); x varies fastest in both.
- __device__ int get_globalidx_2d_3d(void)
- {
- const unsigned int block_rank = blockIdx.y * gridDim.x + blockIdx.x;
- const unsigned int block_size = blockDim.x * blockDim.y * blockDim.z;
- const unsigned int local_id = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
- return block_rank * block_size + local_id;
- }
- // Flat thread id: blockIdx.(x, y, z) is linearized over gridDim.(x, y) (x varying fastest),
- // then scaled by blockDim.x and offset by threadIdx.x.
- __device__ int get_globalidx_3d_1d(void)
- {
- const unsigned int block_rank = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
- return block_rank * blockDim.x + threadIdx.x;
- }
- // Flat thread id: blockIdx.(x, y, z) is linearized over gridDim.(x, y), and threadIdx.(x, y)
- // is linearized over blockDim.x; x varies fastest in both.
- __device__ int get_globalidx_3d_2d(void)
- {
- const unsigned int block_rank = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
- const unsigned int block_size = blockDim.x * blockDim.y;
- const unsigned int local_id = threadIdx.y * blockDim.x + threadIdx.x;
- return block_rank * block_size + local_id;
- }
- // Flat thread id: blockIdx.(x, y, z) is linearized over gridDim.(x, y), and threadIdx.(x, y, z)
- // is linearized over blockDim.(x, y); x varies fastest in both.
- __device__ int get_globalidx_3d_3d(void)
- {
- const unsigned int block_rank = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
- const unsigned int block_size = blockDim.x * blockDim.y * blockDim.z;
- const unsigned int local_id = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
- return block_rank * block_size + local_id;
- }
- // Host entry point: allocates one int on the device, runs myfirstkernel(3, 6, gpu_c)
- // on 2 blocks of 3 threads, copies the result back and prints it.
- // Returns 0 on success, 1 if any CUDA call or the kernel itself failed.
- __host__ int main(void)
- {
- int c = 0;
- int *gpu_c = NULL;
- cudaError_t err = cudaMalloc((void **)&gpu_c, sizeof(int));
- if (err != cudaSuccess) {
- fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
- return 1;
- }
- myfirstkernel <<<2,3>>>(3, 6, gpu_c);
- // A launch has no return value; launch errors are read back with cudaGetLastError().
- err = cudaGetLastError();
- // Synchronize before the copy so kernel execution errors are reported on their own.
- if (err == cudaSuccess) err = cudaDeviceSynchronize();
- if (err == cudaSuccess) err = cudaMemcpy(&c, gpu_c, sizeof(int), cudaMemcpyDeviceToHost);
- // Release the buffer on both the success and the failure path.
- cudaFree(gpu_c);
- if (err != cudaSuccess) {
- fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
- return 1;
- }
- printf("exit.c = %d.\n", c);
- return 0;
- }
- czl@czl-RedmiBook-14:~/workspace/work$ ./a.out
- kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 9.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 9.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 9.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 9.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 9.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 9.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 10.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 10.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 10.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 10.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 10.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 10.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 11.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 11.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 11.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 11.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 11.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 11.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 12.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 12.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 12.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 12.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 12.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 12.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 13.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 13.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 13.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 13.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 13.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 13.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 14.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 14.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 14.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 14.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 14.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 14.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 15.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 15.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 15.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 15.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 15.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 15.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 16.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 16.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 16.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 16.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 16.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 16.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 17.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 17.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 17.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 17.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 17.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 17.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 18.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 18.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 18.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 18.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 18.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 18.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 19.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 19.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 19.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 19.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 19.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 19.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 20.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 20.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 20.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 20.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 20.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 20.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 21.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 21.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 21.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 21.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 21.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 21.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 22.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 22.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 22.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 22.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 22.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 22.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 23.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 23.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 23.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 23.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 23.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 23.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 24.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 24.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 24.
- kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 24.
- kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 24.
- kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 24.
- exit.c = 3.
- czl@czl-RedmiBook-14:~/workspace/work$
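MPS stands for Multi-Process Service (NVIDIA's facility that lets multiple processes share one GPU concurrently); the commands below start and stop its control daemon: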
$ sudo nvidia-cuda-mps-control -d
$ echo quit | sudo nvidia-cuda-mps-control
Other blog posts, on OpenCL:
OpenCL编程初探_papaofdoudou的博客-CSDN博客_opencl 源码