• Ubuntu22.04安装CUDA深度学习环境&&cuda principle


    neofetch && uname -a|lolcat

    install nvidia GPU driver:

    1. sudo add-apt-repository ppa:graphics-drivers/ppa # 加入官方ppa源
    2. sudo apt update # 检查软件包更新列表
    3. apt list --upgradable # 查看可更新的软件包列表
    4. sudo apt upgrade # 更新所有可更新的软件包
    1. ubuntu-drivers devices # ubuntu检测n卡的可选驱动
    2. sudo apt install nvidia-driver-510 # 根据自己的n卡可选驱动下载显卡驱动

    1. ubuntu-drivers devices # ubuntu检测n卡的可选驱动
    2. sudo apt install nvidia-driver-510 # 根据自己的n卡可选驱动下载显卡驱动

    disable the nouveau

    Disable the nouveau driver by adding it to the module blacklist.


    在 /etc/modprobe.d/blacklist.conf 的最后一行加上: blacklist nouveau

    and execute:

    1. $ sudo update-initramfs -u
    2. $ reboot

     reboot the system and execute the nvidia-smi:

    The CUDA version shown in the nvidia-smi output does not mean that the CUDA environment has already been installed; it only indicates the corresponding CUDA version that this driver supports.

    1. sudo nvidia-settings # 更改Nvidia驱动设置
    2. nvidia-smi # 查看显卡基本信息

    install cuda:

    1. wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb
    2. sudo dpkg -i cuda-keyring_1.0-1_all.deb
    3. sudo apt-get update
    4. sudo apt-get -y install cuda


    add environment in bash shell

    1. export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.7/lib64
    2. export PATH=$PATH:/usr/local/cuda-11.7/bin
    3. export CUDA_HOME=/usr/local/cuda-11.7

    test,printf on device:

    1. #include <cuda_runtime.h>
    2. #include <stdio.h>
    3. // Device helper: the calling thread prints its function name, source line and block index.
    4. __device__ void add(void)
    5. {
    6. printf("kernel %s line %d, i am in kernel thread in block %d.\n", __func__, __LINE__,blockIdx.x);
    7. }
    8. // Kernel entry point: every thread prints once, then calls the device helper.
    9. __global__ void myfirstkernel(void)
    10. {
    11. printf("kernel %s line %d, i am in kernel thread in block %d.\n", __func__, __LINE__,blockIdx.x);
    12. add();
    13. }
    14. // Host driver: launches 16 blocks of 1 thread and waits for the kernel to finish.
    15. // Returns 0 on success, 1 if the launch or the execution reported a CUDA error.
    16. int main(void)
    17. {
    18. myfirstkernel <<<16,1>>>();
    19. // A launch has no return value; launch errors are only reported by cudaGetLastError().
    20. cudaError_t err = cudaGetLastError();
    21. // Wait for the kernel so that execution errors surface before the program exits.
    22. if (err == cudaSuccess) err = cudaDeviceSynchronize();
    23. if (err != cudaSuccess) {
    24. fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    25. return 1;
    26. }
    27. printf("exit.\n");
    28. return 0;
    29. }

    algo cuda sample:

    1. #include <cuda_runtime.h>
    2. #include <stdio.h>
    3. // Device helper: stores a + b through c and prints the stored value.
    4. // Every thread writes the same location with the same value.
    5. __device__ void add(int a, int b, int *c)
    6. {
    7. *c = a + b;
    8. printf("kernel %s line %d, i am in kernel thread in block %d. *c = %d.\n", __func__, __LINE__,blockIdx.x, *c);
    9. }
    10. // Kernel entry point: prints from each thread, then stores a + b into *c via add().
    11. __global__ void myfirstkernel(int a, int b, int *c)
    12. {
    13. printf("kernel %s line %d, i am in kernel thread in block %d.\n", __func__, __LINE__,blockIdx.x);
    14. add(a, b, c);
    15. }
    16. // Host driver: allocates one int on the device, runs the kernel with 16 blocks of 1 thread,
    17. // copies the result back and prints it. Returns 1 if any CUDA call fails.
    18. int main(void)
    19. {
    20. int c = 0;
    21. int *gpu_c = NULL;
    22. cudaError_t err = cudaMalloc((void **)&gpu_c, sizeof(int));
    23. if (err == cudaSuccess) {
    24. myfirstkernel <<<16,1>>>(3, 6, gpu_c);
    25. err = cudaGetLastError(); // launch errors are only reported here
    26. }
    27. // Wait for the kernel so that execution errors surface before the result is read.
    28. if (err == cudaSuccess) err = cudaDeviceSynchronize();
    29. if (err == cudaSuccess) err = cudaMemcpy(&c, gpu_c, sizeof(int), cudaMemcpyDeviceToHost);
    30. cudaFree(gpu_c); // no operation is performed when gpu_c is still NULL
    31. if (err != cudaSuccess) {
    32. fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    33. return 1;
    34. }
    35. printf("exit.c = %d.\n", c);
    36. return 0;
    37. }

    change thread and block

    1. #include <cuda_runtime.h>
    2. #include <stdio.h>
    3. // Device helper: stores a + b through c and prints it together with the thread and block index.
    4. // Every thread writes the same location with the same value.
    5. __device__ void add(int a, int b, int *c)
    6. {
    7. *c = a + b;
    8. printf("kernel %s line %d, i am in kernel thread %d in block %d. *c = %d.\n", __func__, __LINE__, threadIdx.x, blockIdx.x, *c);
    9. }
    10. // Kernel entry point: prints the thread/block index, then stores a + b into *c via add().
    11. __global__ void myfirstkernel(int a, int b, int *c)
    12. {
    13. printf("kernel %s line %d, i am in kernel thread %d in block %d.\n", __func__, __LINE__,threadIdx.x, blockIdx.x);
    14. add(a, b, c);
    15. }
    16. // Host driver: same computation as before, but launched as 1 block of 16 threads.
    17. // Returns 1 if any CUDA call fails.
    18. int main(void)
    19. {
    20. int c = 0;
    21. int *gpu_c = NULL;
    22. cudaError_t err = cudaMalloc((void **)&gpu_c, sizeof(int));
    23. if (err == cudaSuccess) {
    24. myfirstkernel <<<1,16>>>(3, 6, gpu_c);
    25. err = cudaGetLastError(); // launch errors are only reported here
    26. }
    27. // Wait for the kernel so that execution errors surface before the result is read.
    28. if (err == cudaSuccess) err = cudaDeviceSynchronize();
    29. if (err == cudaSuccess) err = cudaMemcpy(&c, gpu_c, sizeof(int), cudaMemcpyDeviceToHost);
    30. cudaFree(gpu_c); // no operation is performed when gpu_c is still NULL
    31. if (err != cudaSuccess) {
    32. fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    33. return 1;
    34. }
    35. printf("exit.c = %d.\n", c);
    36. return 0;
    37. }

    1. #include <cuda_runtime.h>
    2. #include <stdio.h>
    3. // Device helper: stores a + b through c and prints it together with the thread and block index.
    4. // Every thread writes the same location with the same value.
    5. __device__ void add(int a, int b, int *c)
    6. {
    7. *c = a + b;
    8. printf("kernel %s line %d, i am in kernel thread %d in block %d. *c = %d.\n", __func__, __LINE__, threadIdx.x, blockIdx.x, *c);
    9. }
    10. // Kernel entry point: prints the thread/block index, the block dimensions (blockDim)
    11. // and the thread index (threadIdx), then stores a + b into *c via add().
    12. __global__ void myfirstkernel(int a, int b, int *c)
    13. {
    14. printf("kernel %s line %d, i am in kernel thread %d in block %d.\n", __func__, __LINE__,threadIdx.x, blockIdx.x);
    15. printf("block.x = %d, block.y = %d,block.z = %d\n", blockDim.x, blockDim.y,blockDim.z);
    16. printf("thread.x = %d, thread.y = %d,thread.z = %d\n", threadIdx.x, threadIdx.y,threadIdx.z);
    17. add(a, b, c);
    18. }
    19. // Host driver: runs the kernel with 3 blocks of 16 threads, copies the result back and prints it.
    20. // Returns 1 if any CUDA call fails.
    21. int main(void)
    22. {
    23. int c = 0;
    24. int *gpu_c = NULL;
    25. cudaError_t err = cudaMalloc((void **)&gpu_c, sizeof(int));
    26. if (err == cudaSuccess) {
    27. myfirstkernel <<<3,16>>>(3, 6, gpu_c);
    28. err = cudaGetLastError(); // launch errors are only reported here
    29. }
    30. // Wait for the kernel so that execution errors surface before the result is read.
    31. if (err == cudaSuccess) err = cudaDeviceSynchronize();
    32. if (err == cudaSuccess) err = cudaMemcpy(&c, gpu_c, sizeof(int), cudaMemcpyDeviceToHost);
    33. cudaFree(gpu_c); // no operation is performed when gpu_c is still NULL
    34. if (err != cudaSuccess) {
    35. fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    36. return 1;
    37. }
    38. printf("exit.c = %d.\n", c);
    39. return 0;
    40. }
    1. czl@czl-RedmiBook-14:~/workspace/work$ ./a.out
    2. kernel myfirstkernel line 12, i am in kernel thread 0 in block 0.
    3. kernel myfirstkernel line 12, i am in kernel thread 1 in block 0.
    4. kernel myfirstkernel line 12, i am in kernel thread 2 in block 0.
    5. kernel myfirstkernel line 12, i am in kernel thread 3 in block 0.
    6. kernel myfirstkernel line 12, i am in kernel thread 4 in block 0.
    7. kernel myfirstkernel line 12, i am in kernel thread 5 in block 0.
    8. kernel myfirstkernel line 12, i am in kernel thread 6 in block 0.
    9. kernel myfirstkernel line 12, i am in kernel thread 7 in block 0.
    10. kernel myfirstkernel line 12, i am in kernel thread 8 in block 0.
    11. kernel myfirstkernel line 12, i am in kernel thread 9 in block 0.
    12. kernel myfirstkernel line 12, i am in kernel thread 10 in block 0.
    13. kernel myfirstkernel line 12, i am in kernel thread 11 in block 0.
    14. kernel myfirstkernel line 12, i am in kernel thread 12 in block 0.
    15. kernel myfirstkernel line 12, i am in kernel thread 13 in block 0.
    16. kernel myfirstkernel line 12, i am in kernel thread 14 in block 0.
    17. kernel myfirstkernel line 12, i am in kernel thread 15 in block 0.
    18. kernel myfirstkernel line 12, i am in kernel thread 0 in block 1.
    19. kernel myfirstkernel line 12, i am in kernel thread 1 in block 1.
    20. kernel myfirstkernel line 12, i am in kernel thread 2 in block 1.
    21. kernel myfirstkernel line 12, i am in kernel thread 3 in block 1.
    22. kernel myfirstkernel line 12, i am in kernel thread 4 in block 1.
    23. kernel myfirstkernel line 12, i am in kernel thread 5 in block 1.
    24. kernel myfirstkernel line 12, i am in kernel thread 6 in block 1.
    25. kernel myfirstkernel line 12, i am in kernel thread 7 in block 1.
    26. kernel myfirstkernel line 12, i am in kernel thread 8 in block 1.
    27. kernel myfirstkernel line 12, i am in kernel thread 9 in block 1.
    28. kernel myfirstkernel line 12, i am in kernel thread 10 in block 1.
    29. kernel myfirstkernel line 12, i am in kernel thread 11 in block 1.
    30. kernel myfirstkernel line 12, i am in kernel thread 12 in block 1.
    31. kernel myfirstkernel line 12, i am in kernel thread 13 in block 1.
    32. kernel myfirstkernel line 12, i am in kernel thread 14 in block 1.
    33. kernel myfirstkernel line 12, i am in kernel thread 15 in block 1.
    34. kernel myfirstkernel line 12, i am in kernel thread 0 in block 2.
    35. kernel myfirstkernel line 12, i am in kernel thread 1 in block 2.
    36. kernel myfirstkernel line 12, i am in kernel thread 2 in block 2.
    37. kernel myfirstkernel line 12, i am in kernel thread 3 in block 2.
    38. kernel myfirstkernel line 12, i am in kernel thread 4 in block 2.
    39. kernel myfirstkernel line 12, i am in kernel thread 5 in block 2.
    40. kernel myfirstkernel line 12, i am in kernel thread 6 in block 2.
    41. kernel myfirstkernel line 12, i am in kernel thread 7 in block 2.
    42. kernel myfirstkernel line 12, i am in kernel thread 8 in block 2.
    43. kernel myfirstkernel line 12, i am in kernel thread 9 in block 2.
    44. kernel myfirstkernel line 12, i am in kernel thread 10 in block 2.
    45. kernel myfirstkernel line 12, i am in kernel thread 11 in block 2.
    46. kernel myfirstkernel line 12, i am in kernel thread 12 in block 2.
    47. kernel myfirstkernel line 12, i am in kernel thread 13 in block 2.
    48. kernel myfirstkernel line 12, i am in kernel thread 14 in block 2.
    49. kernel myfirstkernel line 12, i am in kernel thread 15 in block 2.
    50. block.x = 16, block.y = 1,block.z = 1
    51. block.x = 16, block.y = 1,block.z = 1
    52. block.x = 16, block.y = 1,block.z = 1
    53. block.x = 16, block.y = 1,block.z = 1
    54. block.x = 16, block.y = 1,block.z = 1
    55. block.x = 16, block.y = 1,block.z = 1
    56. block.x = 16, block.y = 1,block.z = 1
    57. block.x = 16, block.y = 1,block.z = 1
    58. block.x = 16, block.y = 1,block.z = 1
    59. block.x = 16, block.y = 1,block.z = 1
    60. block.x = 16, block.y = 1,block.z = 1
    61. block.x = 16, block.y = 1,block.z = 1
    62. block.x = 16, block.y = 1,block.z = 1
    63. block.x = 16, block.y = 1,block.z = 1
    64. block.x = 16, block.y = 1,block.z = 1
    65. block.x = 16, block.y = 1,block.z = 1
    66. block.x = 16, block.y = 1,block.z = 1
    67. block.x = 16, block.y = 1,block.z = 1
    68. block.x = 16, block.y = 1,block.z = 1
    69. block.x = 16, block.y = 1,block.z = 1
    70. block.x = 16, block.y = 1,block.z = 1
    71. block.x = 16, block.y = 1,block.z = 1
    72. block.x = 16, block.y = 1,block.z = 1
    73. block.x = 16, block.y = 1,block.z = 1
    74. block.x = 16, block.y = 1,block.z = 1
    75. block.x = 16, block.y = 1,block.z = 1
    76. block.x = 16, block.y = 1,block.z = 1
    77. block.x = 16, block.y = 1,block.z = 1
    78. block.x = 16, block.y = 1,block.z = 1
    79. block.x = 16, block.y = 1,block.z = 1
    80. block.x = 16, block.y = 1,block.z = 1
    81. block.x = 16, block.y = 1,block.z = 1
    82. block.x = 16, block.y = 1,block.z = 1
    83. block.x = 16, block.y = 1,block.z = 1
    84. block.x = 16, block.y = 1,block.z = 1
    85. block.x = 16, block.y = 1,block.z = 1
    86. block.x = 16, block.y = 1,block.z = 1
    87. block.x = 16, block.y = 1,block.z = 1
    88. block.x = 16, block.y = 1,block.z = 1
    89. block.x = 16, block.y = 1,block.z = 1
    90. block.x = 16, block.y = 1,block.z = 1
    91. block.x = 16, block.y = 1,block.z = 1
    92. block.x = 16, block.y = 1,block.z = 1
    93. block.x = 16, block.y = 1,block.z = 1
    94. block.x = 16, block.y = 1,block.z = 1
    95. block.x = 16, block.y = 1,block.z = 1
    96. block.x = 16, block.y = 1,block.z = 1
    97. block.x = 16, block.y = 1,block.z = 1
    98. thread.x = 0, thread.y = 0,thread.z = 0
    99. thread.x = 1, thread.y = 0,thread.z = 0
    100. thread.x = 2, thread.y = 0,thread.z = 0
    101. thread.x = 3, thread.y = 0,thread.z = 0
    102. thread.x = 4, thread.y = 0,thread.z = 0
    103. thread.x = 5, thread.y = 0,thread.z = 0
    104. thread.x = 6, thread.y = 0,thread.z = 0
    105. thread.x = 7, thread.y = 0,thread.z = 0
    106. thread.x = 8, thread.y = 0,thread.z = 0
    107. thread.x = 9, thread.y = 0,thread.z = 0
    108. thread.x = 10, thread.y = 0,thread.z = 0
    109. thread.x = 11, thread.y = 0,thread.z = 0
    110. thread.x = 12, thread.y = 0,thread.z = 0
    111. thread.x = 13, thread.y = 0,thread.z = 0
    112. thread.x = 14, thread.y = 0,thread.z = 0
    113. thread.x = 15, thread.y = 0,thread.z = 0
    114. thread.x = 0, thread.y = 0,thread.z = 0
    115. thread.x = 1, thread.y = 0,thread.z = 0
    116. thread.x = 2, thread.y = 0,thread.z = 0
    117. thread.x = 3, thread.y = 0,thread.z = 0
    118. thread.x = 4, thread.y = 0,thread.z = 0
    119. thread.x = 5, thread.y = 0,thread.z = 0
    120. thread.x = 6, thread.y = 0,thread.z = 0
    121. thread.x = 7, thread.y = 0,thread.z = 0
    122. thread.x = 8, thread.y = 0,thread.z = 0
    123. thread.x = 9, thread.y = 0,thread.z = 0
    124. thread.x = 10, thread.y = 0,thread.z = 0
    125. thread.x = 11, thread.y = 0,thread.z = 0
    126. thread.x = 12, thread.y = 0,thread.z = 0
    127. thread.x = 13, thread.y = 0,thread.z = 0
    128. thread.x = 14, thread.y = 0,thread.z = 0
    129. thread.x = 15, thread.y = 0,thread.z = 0
    130. thread.x = 0, thread.y = 0,thread.z = 0
    131. thread.x = 1, thread.y = 0,thread.z = 0
    132. thread.x = 2, thread.y = 0,thread.z = 0
    133. thread.x = 3, thread.y = 0,thread.z = 0
    134. thread.x = 4, thread.y = 0,thread.z = 0
    135. thread.x = 5, thread.y = 0,thread.z = 0
    136. thread.x = 6, thread.y = 0,thread.z = 0
    137. thread.x = 7, thread.y = 0,thread.z = 0
    138. thread.x = 8, thread.y = 0,thread.z = 0
    139. thread.x = 9, thread.y = 0,thread.z = 0
    140. thread.x = 10, thread.y = 0,thread.z = 0
    141. thread.x = 11, thread.y = 0,thread.z = 0
    142. thread.x = 12, thread.y = 0,thread.z = 0
    143. thread.x = 13, thread.y = 0,thread.z = 0
    144. thread.x = 14, thread.y = 0,thread.z = 0
    145. thread.x = 15, thread.y = 0,thread.z = 0
    146. kernel add line 7, i am in kernel thread 0 in block 0. *c = 9.
    147. kernel add line 7, i am in kernel thread 1 in block 0. *c = 9.
    148. kernel add line 7, i am in kernel thread 2 in block 0. *c = 9.
    149. kernel add line 7, i am in kernel thread 3 in block 0. *c = 9.
    150. kernel add line 7, i am in kernel thread 4 in block 0. *c = 9.
    151. kernel add line 7, i am in kernel thread 5 in block 0. *c = 9.
    152. kernel add line 7, i am in kernel thread 6 in block 0. *c = 9.
    153. kernel add line 7, i am in kernel thread 7 in block 0. *c = 9.
    154. kernel add line 7, i am in kernel thread 8 in block 0. *c = 9.
    155. kernel add line 7, i am in kernel thread 9 in block 0. *c = 9.
    156. kernel add line 7, i am in kernel thread 10 in block 0. *c = 9.
    157. kernel add line 7, i am in kernel thread 11 in block 0. *c = 9.
    158. kernel add line 7, i am in kernel thread 12 in block 0. *c = 9.
    159. kernel add line 7, i am in kernel thread 13 in block 0. *c = 9.
    160. kernel add line 7, i am in kernel thread 14 in block 0. *c = 9.
    161. kernel add line 7, i am in kernel thread 15 in block 0. *c = 9.
    162. kernel add line 7, i am in kernel thread 0 in block 2. *c = 9.
    163. kernel add line 7, i am in kernel thread 1 in block 2. *c = 9.
    164. kernel add line 7, i am in kernel thread 2 in block 2. *c = 9.
    165. kernel add line 7, i am in kernel thread 3 in block 2. *c = 9.
    166. kernel add line 7, i am in kernel thread 4 in block 2. *c = 9.
    167. kernel add line 7, i am in kernel thread 5 in block 2. *c = 9.
    168. kernel add line 7, i am in kernel thread 6 in block 2. *c = 9.
    169. kernel add line 7, i am in kernel thread 7 in block 2. *c = 9.
    170. kernel add line 7, i am in kernel thread 8 in block 2. *c = 9.
    171. kernel add line 7, i am in kernel thread 9 in block 2. *c = 9.
    172. kernel add line 7, i am in kernel thread 10 in block 2. *c = 9.
    173. kernel add line 7, i am in kernel thread 11 in block 2. *c = 9.
    174. kernel add line 7, i am in kernel thread 12 in block 2. *c = 9.
    175. kernel add line 7, i am in kernel thread 13 in block 2. *c = 9.
    176. kernel add line 7, i am in kernel thread 14 in block 2. *c = 9.
    177. kernel add line 7, i am in kernel thread 15 in block 2. *c = 9.
    178. kernel add line 7, i am in kernel thread 0 in block 1. *c = 9.
    179. kernel add line 7, i am in kernel thread 1 in block 1. *c = 9.
    180. kernel add line 7, i am in kernel thread 2 in block 1. *c = 9.
    181. kernel add line 7, i am in kernel thread 3 in block 1. *c = 9.
    182. kernel add line 7, i am in kernel thread 4 in block 1. *c = 9.
    183. kernel add line 7, i am in kernel thread 5 in block 1. *c = 9.
    184. kernel add line 7, i am in kernel thread 6 in block 1. *c = 9.
    185. kernel add line 7, i am in kernel thread 7 in block 1. *c = 9.
    186. kernel add line 7, i am in kernel thread 8 in block 1. *c = 9.
    187. kernel add line 7, i am in kernel thread 9 in block 1. *c = 9.
    188. kernel add line 7, i am in kernel thread 10 in block 1. *c = 9.
    189. kernel add line 7, i am in kernel thread 11 in block 1. *c = 9.
    190. kernel add line 7, i am in kernel thread 12 in block 1. *c = 9.
    191. kernel add line 7, i am in kernel thread 13 in block 1. *c = 9.
    192. kernel add line 7, i am in kernel thread 14 in block 1. *c = 9.
    193. kernel add line 7, i am in kernel thread 15 in block 1. *c = 9.
    194. exit.c = 9.
    195. czl@czl-RedmiBook-14:~/workspace/work$


    1. #include <cuda_runtime.h>
    2. #include <stdio.h>
    3. // Device helper: stores a + b through c and prints it together with threadIdx.x and all
    4. // three blockIdx components. Every thread writes the same location with the same value.
    5. __device__ void add(int a, int b, int *c)
    6. {
    7. *c = a + b;
    8. printf("kernel %s line %d, i am in kernel thread %d in blockidx.x %d. blokidx.y %d blockidx.z %d *c = %d.\n", __func__, __LINE__, threadIdx.x, blockIdx.x, blockIdx.y,blockIdx.z,*c);
    9. }
    10. // Kernel entry point: prints the thread/block index, blockDim, threadIdx and gridDim,
    11. // then stores a + b into *c via add().
    12. __global__ void myfirstkernel(int a, int b, int *c)
    13. {
    14. printf("kernel %s line %d, i am in kernel thread %d in block %d.\n", __func__, __LINE__,threadIdx.x, blockIdx.x);
    15. printf("block.x = %d, block.y = %d,block.z = %d\n", blockDim.x, blockDim.y,blockDim.z);
    16. printf("thread.x = %d, thread.y = %d,thread.z = %d\n", threadIdx.x, threadIdx.y,threadIdx.z);
    17. printf("gridDim.x = %d, gridDim.y = %d,gridDim.z = %d\n", gridDim.x, gridDim.y,gridDim.z);
    18. add(a, b, c);
    19. }
    20. // Host driver: runs the kernel with 2 blocks of 3 threads, copies the result back and prints it.
    21. // Returns 1 if any CUDA call fails.
    22. int main(void)
    23. {
    24. int c = 0;
    25. int *gpu_c = NULL;
    26. cudaError_t err = cudaMalloc((void **)&gpu_c, sizeof(int));
    27. if (err == cudaSuccess) {
    28. myfirstkernel <<<2,3>>>(3, 6, gpu_c);
    29. err = cudaGetLastError(); // launch errors are only reported here
    30. }
    31. // Wait for the kernel so that execution errors surface before the result is read.
    32. if (err == cudaSuccess) err = cudaDeviceSynchronize();
    33. if (err == cudaSuccess) err = cudaMemcpy(&c, gpu_c, sizeof(int), cudaMemcpyDeviceToHost);
    34. cudaFree(gpu_c); // no operation is performed when gpu_c is still NULL
    35. if (err != cudaSuccess) {
    36. fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    37. return 1;
    38. }
    39. printf("exit.c = %d.\n", c);
    40. return 0;
    41. }


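    kernel call convention:

    The kernel invocation convention is kernel<<<gridSize, blockSize>>>(args...), where the grid and block dimensions are dim3 values, for example: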

    dim3 gridSize(3,2,1);

    dim3 blockSize(2,2,2);




    那么执行的线程是如何知道自己在GRID,BLOCK,THREAD中的位置的呢?一种更简单的方案是让每个线程把自己的X索引(也就是threadIdx.x)记录下来,线程索引是线程在每个BLOCK里的索引,由于BLOCK的尺寸是三维的,因此线程索引也是一个三元常向量,threadIdx,访问方式为:threadIdx.x, threadIdx.y, threadIdx.z。对于一个BLOCK来说,它其中的每个线程的索引是唯一的,但是当一个GRID中有两个以上的BLOCK时,其中就会出现重复的线程索引,相应的,每个GRID里面的BLOCK也有唯一的BLOCK索引,用blockIdx表示,它同样是一个三维常向量,blockIdx.x, blockIdx.y, blockIdx.z。由于一次函数调用中只有一个GRID,因此不存在GRID索引。

    对于BLOCK和GRID的尺寸,也用相应的三维常向量来表示,BLOCK的尺寸保存在常向量blockDim中,GRID的尺寸保存在gridDim中,他们都是CUDA C的内建变量,可以直接在设备代码中使用,在计算中,用户常常要给每个线程一个唯一的标识符,即线程号,以便给每个线程分配不同的任务。在多个BLOCK的情况下,线程号也不能重复。线程号在实际使用中很重要,它关系到被处理的数据在输入数组中的位置,也关系到线程的分配和存储器的使用问题。


    1D grid && 1d block.

    1. // Global linear thread index for a 1D grid of 1D blocks.
    2. __device__ int get_globalidx_1d_1d(void)
    3. {
    4. // Number of threads held by the blocks that precede this one along x.
    5. const unsigned int threads_before = blockDim.x * blockIdx.x;
    6. return threadIdx.x + threads_before;
    7. }

    1D grid && 2d block.

    1. // Global linear thread index for a 1D grid of 2D blocks (threadIdx.x varies fastest).
    2. __device__ int get_globalidx_1d_2d(void)
    3. {
    4. const unsigned int threads_per_block = blockDim.x * blockDim.y;
    5. // Position of this thread inside its own block.
    6. const unsigned int local_idx = threadIdx.x + threadIdx.y * blockDim.x;
    7. return blockIdx.x * threads_per_block + local_idx;
    8. }

    1d grid && 3d block

    1. // Global linear thread index for a 1D grid of 3D blocks (x fastest, then y, then z).
    2. __device__ int get_globalidx_1d_3d(void)
    3. {
    4. const unsigned int threads_per_block = blockDim.x * blockDim.y * blockDim.z;
    5. // Position of this thread inside its own block: z*(Dy*Dx) + y*Dx + x, in nested form.
    6. const unsigned int local_idx = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
    7. return blockIdx.x * threads_per_block + local_idx;
    8. }

    2d grid && 1d block

    1. // Global linear thread index for a 2D grid of 1D blocks.
    2. __device__ int get_globalidx_2d_1d(void)
    3. {
    4. // Linear rank of this block within the grid (blockIdx.x varies fastest).
    5. const int block_rank = blockIdx.x + blockIdx.y * gridDim.x;
    6. return threadIdx.x + block_rank * blockDim.x;
    7. }

    2d grid && 2d block

    1. // Global linear thread index for a 2D grid of 2D blocks.
    2. __device__ int get_globalidx_2d_2d(void)
    3. {
    4. // Linear rank of this block within the grid (blockIdx.x varies fastest).
    5. const int block_rank = blockIdx.x + blockIdx.y * gridDim.x;
    6. const unsigned int threads_per_block = blockDim.x * blockDim.y;
    7. // Position of this thread inside its own block.
    8. const unsigned int local_idx = threadIdx.x + threadIdx.y * blockDim.x;
    9. return block_rank * threads_per_block + local_idx;
    10. }

    2d grid && 3d block

    1. // Global linear thread index for a 2D grid of 3D blocks.
    2. __device__ int get_globalidx_2d_3d(void)
    3. {
    4. // Linear rank of this block within the grid (blockIdx.x varies fastest).
    5. const int block_rank = blockIdx.x + blockIdx.y * gridDim.x;
    6. const unsigned int threads_per_block = blockDim.x * blockDim.y * blockDim.z;
    7. // Position of this thread inside its own block: z*(Dx*Dy) + y*Dx + x, in nested form.
    8. const unsigned int local_idx = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
    9. return block_rank * threads_per_block + local_idx;
    10. }

    3d grid && 1d block

    1. // Global linear thread index for a 3D grid of 1D blocks.
    2. __device__ int get_globalidx_3d_1d(void)
    3. {
    4. // Linear rank of this block within the grid: x + y*Gx + z*Gx*Gy, in nested form.
    5. const int block_rank = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z);
    6. return threadIdx.x + block_rank * blockDim.x;
    7. }

    3d grid && 2d block

    1. // Global linear thread index for a 3D grid of 2D blocks.
    2. __device__ int get_globalidx_3d_2d(void)
    3. {
    4. // Linear rank of this block within the grid: x + y*Gx + z*Gx*Gy, in nested form.
    5. const int block_rank = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z);
    6. const unsigned int threads_per_block = blockDim.x * blockDim.y;
    7. // Position of this thread inside its own block.
    8. const unsigned int local_idx = threadIdx.x + threadIdx.y * blockDim.x;
    9. return block_rank * threads_per_block + local_idx;
    10. }

    3d grid && 3d block

    1. // Global linear thread index for a 3D grid of 3D blocks.
    2. __device__ int get_globalidx_3d_3d(void)
    3. {
    4. // Linear rank of this block within the grid: x + y*Gx + z*Gx*Gy, in nested form.
    5. const int block_rank = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z);
    6. const unsigned int threads_per_block = blockDim.x * blockDim.y * blockDim.z;
    7. // Position of this thread inside its own block: z*(Dx*Dy) + y*Dx + x, in nested form.
    8. const unsigned int local_idx = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
    9. return block_rank * threads_per_block + local_idx;
    10. }


    1. #include <cuda_runtime.h>
    2. #include <stdio.h>
    3. // Device helper: stores a + b through c and prints it together with threadIdx.x and all
    4. // three blockIdx components. Every thread writes the same location with the same value.
    5. __device__ void add(int a, int b, int *c)
    6. {
    7. *c = a + b;
    8. printf("kernel %s line %d, i am in kernel thread %d in blockidx.x %d. blokidx.y %d blockidx.z %d *c = %d.\n", __func__, __LINE__, threadIdx.x, blockIdx.x, blockIdx.y,blockIdx.z,*c);
    9. }
    10. // Kernel entry point: prints the thread/block index, blockDim, threadIdx and gridDim,
    11. // then stores a + b into *c via add().
    12. __global__ void myfirstkernel(int a, int b, int *c)
    13. {
    14. printf("kernel %s line %d, i am in kernel thread %d in block %d.\n", __func__, __LINE__,threadIdx.x, blockIdx.x);
    15. printf("block.x = %d, block.y = %d,block.z = %d\n", blockDim.x, blockDim.y,blockDim.z);
    16. printf("thread.x = %d, thread.y = %d,thread.z = %d\n", threadIdx.x, threadIdx.y,threadIdx.z);
    17. printf("gridDim.x = %d, gridDim.y = %d,gridDim.z = %d\n", gridDim.x, gridDim.y,gridDim.z);
    18. add(a, b, c);
    19. }
    20. // The get_globalidx_<grid>_<block> helpers below return a linear thread index for the named
    21. // grid/block dimensionality. None of them is called by the kernel in this listing.
    22. // 1D grid, 1D block.
    23. __device__ int get_globalidx_1d_1d(void)
    24. {
    25. return blockIdx.x * blockDim.x + threadIdx.x;
    26. }
    27. // 1D grid, 2D block.
    28. __device__ int get_globalidx_1d_2d(void)
    29. {
    30. return blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
    31. }
    32. // 1D grid, 3D block.
    33. __device__ int get_globalidx_1d_3d(void)
    34. {
    35. return blockIdx.x * blockDim.x * blockDim.y * blockDim.z + threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + threadIdx.x;
    36. }
    37. // 2D grid, 1D block.
    38. __device__ int get_globalidx_2d_1d(void)
    39. {
    40. int blockid = blockIdx.y * gridDim.x + blockIdx.x;
    41. int threadid = blockid * blockDim.x + threadIdx.x;
    42. return threadid;
    43. }
    44. // 2D grid, 2D block.
    45. __device__ int get_globalidx_2d_2d(void)
    46. {
    47. int blockid = blockIdx.y * gridDim.x + blockIdx.x;
    48. int threadid = blockid * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;
    49. return threadid;
    50. }
    51. // 2D grid, 3D block.
    52. __device__ int get_globalidx_2d_3d(void)
    53. {
    54. int blockid = blockIdx.y * gridDim.x + blockIdx.x;
    55. int threadid = blockid * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;
    56. return threadid;
    57. }
    58. // 3D grid, 1D block.
    59. __device__ int get_globalidx_3d_1d(void)
    60. {
    61. int blockid = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
    62. int threadid = blockid * blockDim.x + threadIdx.x;
    63. return threadid;
    64. }
    65. // 3D grid, 2D block.
    66. __device__ int get_globalidx_3d_2d(void)
    67. {
    68. int blockid = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
    69. int threadid = blockid * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;
    70. return threadid;
    71. }
    72. // 3D grid, 3D block.
    73. __device__ int get_globalidx_3d_3d(void)
    74. {
    75. int blockid = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
    76. int threadid = blockid * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;
    77. return threadid;
    78. }
    79. // Host driver: runs the kernel with 2 blocks of 3 threads, copies the result back and prints it.
    80. // Returns 1 if any CUDA call fails.
    81. __host__ int main(void)
    82. {
    83. int c = 0;
    84. int *gpu_c = NULL;
    85. cudaError_t err = cudaMalloc((void **)&gpu_c, sizeof(int));
    86. if (err == cudaSuccess) {
    87. myfirstkernel <<<2,3>>>(3, 6, gpu_c);
    88. err = cudaGetLastError(); // launch errors are only reported here
    89. }
    90. // Wait for the kernel so that execution errors surface before the result is read.
    91. if (err == cudaSuccess) err = cudaDeviceSynchronize();
    92. if (err == cudaSuccess) err = cudaMemcpy(&c, gpu_c, sizeof(int), cudaMemcpyDeviceToHost);
    93. cudaFree(gpu_c); // no operation is performed when gpu_c is still NULL
    94. if (err != cudaSuccess) {
    95. fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    96. return 1;
    97. }
    98. printf("exit.c = %d.\n", c);
    99. return 0;
    100. }




    cuda debug with cuda-gdb

    nvcc -g -G hello.cu 
    1. --debug (-g)
    2. Generate debug information for host code.
    3. --device-debug (-G)
    4. Generate debug information for device code. If --dopt is not specified, then
    5. turns off all optimizations. Don't use for profiling; use -lineinfo instead.
    1. czl@czl-RedmiBook-14:~/workspace/work$ cuda-gdb a.out
    2. NVIDIA (R) CUDA Debugger
    3. 11.7 release
    4. Portions Copyright (C) 2007-2022 NVIDIA Corporation
    5. GNU gdb (GDB) 10.2
    6. Copyright (C) 2021 Free Software Foundation, Inc.
    7. License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
    8. This is free software: you are free to change and redistribute it.
    9. There is NO WARRANTY, to the extent permitted by law.
    10. Type "show copying" and "show warranty" for details.
    11. This GDB was configured as "x86_64-pc-linux-gnu".
    12. Type "show configuration" for configuration details.
    13. For bug reporting instructions, please see:
    14. <https://www.gnu.org/software/gdb/bugs/>.
    15. Find the GDB manual and other documentation resources online at:
    16. <http://www.gnu.org/software/gdb/documentation/>.
    17. For help, type "help".
    18. Type "apropos word" to search for commands related to "word"...
    19. Reading symbols from a.out...
    20. (cuda-gdb) b myfirstkernel(int, int, int*)
    21. Breakpoint 1 at 0xa3e3: file /home/czl/workspace/work/hello.cu, line 11.
    22. (cuda-gdb) r
    23. Starting program: /home/czl/workspace/work/a.out
    24. [Thread debugging using libthread_db enabled]
    25. Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
    26. [Detaching after fork from child process 38487]
    27. [New Thread 0x7fffdffff000 (LWP 38492)]
    28. [New Thread 0x7fffdf7fe000 (LWP 38493)]
    29. [New Thread 0x7fffdeffd000 (LWP 38494)]
    30. [Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 0, lane 0]
    31. Thread 1 "a.out" hit Breakpoint 1, myfirstkernel<<<(2,1,1),(3,1,1)>>> (a=3, b=6, c=0x7fffcf000000) at hello.cu:12
    32. 12 printf("kernel %s line %d, i am in kernel thread %d in block %d.\n", __func__, __LINE__,threadIdx.x, blockIdx.x);
    33. (cuda-gdb) info cuda threads
    34. BlockIdx ThreadIdx To BlockIdx To ThreadIdx Count Virtual PC Filename Line
    35. Kernel 0
    36. * (0,0,0) (0,0,0) (1,0,0) (2,0,0) 6 0x0000555555a31628 hello.cu 12
    37. (cuda-gdb)
    38. BlockIdx ThreadIdx To BlockIdx To ThreadIdx Count Virtual PC Filename Line
    39. Kernel 0
    40. * (0,0,0) (0,0,0) (1,0,0) (2,0,0) 6 0x0000555555a31628 hello.cu 12
    41. (cuda-gdb)
    42. (cuda-gdb) info threads
    43. Id Target Id Frame
    44. 1 Thread 0x7ffff7d7d000 (LWP 38482) "a.out" 0x00007ffff6ce679c in ?? () from /lib/x86_64-linux-gnu/libcuda.so.1
    45. 2 Thread 0x7fffdffff000 (LWP 38492) "cuda-EvtHandlr" 0x00007ffff7e98d7f in poll () from /lib/x86_64-linux-gnu/libc.so.6
    46. 3 Thread 0x7fffdf7fe000 (LWP 38493) "cuda-EvtHandlr" 0x00007ffff7e98d7f in poll () from /lib/x86_64-linux-gnu/libc.so.6
    47. 4 Thread 0x7fffdeffd000 (LWP 38494) "a.out" 0x00007ffff7e11197 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
    48. (cuda-gdb)
    49. Id Target Id Frame
    50. 1 Thread 0x7ffff7d7d000 (LWP 38482) "a.out" 0x00007ffff6ce679c in ?? () from /lib/x86_64-linux-gnu/libcuda.so.1
    51. 2 Thread 0x7fffdffff000 (LWP 38492) "cuda-EvtHandlr" 0x00007ffff7e98d7f in poll () from /lib/x86_64-linux-gnu/libc.so.6
    52. 3 Thread 0x7fffdf7fe000 (LWP 38493) "cuda-EvtHandlr" 0x00007ffff7e98d7f in poll () from /lib/x86_64-linux-gnu/libc.so.6
    53. 4 Thread 0x7fffdeffd000 (LWP 38494) "a.out" 0x00007ffff7e11197 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
    54. (cuda-gdb)
    55. (cuda-gdb) info cuda
    56. blocks devices lanes launch trace managed threads
    57. contexts kernels launch children line sms warps
    58. (cuda-gdb) info cuda devices
    59. Dev PCI Bus/Dev ID Name Description SM Type SMs Warps/SM Lanes/Warp Max Regs/Lane Active SMs Mask
    60. * 0 02:00.0 NVIDIA GeForce MX250 GP108-A sm_61 3 64 32 256 0x00000000000000000000000000000003
    61. (cuda-gdb) info cuda devices
    62. Dev PCI Bus/Dev ID Name Description SM Type SMs Warps/SM Lanes/Warp Max Regs/Lane Active SMs Mask
    63. * 0 02:00.0 NVIDIA GeForce MX250 GP108-A sm_61 3 64 32 256 0x00000000000000000000000000000003
    64. (cuda-gdb) info cuda blocks
    65. BlockIdx To BlockIdx Count State
    66. Kernel 0
    67. * (0,0,0) (1,0,0) 2 running
    68. (cuda-gdb) info cuda lanes
    69. Ln State Physical PC ThreadIdx Exception
    70. Device 0 SM 0 Warp 0
    71. * 0 active 0x0000000000000188 (0,0,0) None
    72. 1 active 0x0000000000000188 (1,0,0) None
    73. 2 active 0x0000000000000188 (2,0,0) None
    74. (cuda-gdb) info cuda threads
    75. BlockIdx ThreadIdx To BlockIdx To ThreadIdx Count Virtual PC Filename Line
    76. Kernel 0
    77. * (0,0,0) (0,0,0) (1,0,0) (2,0,0) 6 0x0000555555a31628 hello.cu 12
    78. (cuda-gdb) info cuda sms
    79. SM Active Warps Mask
    80. Device 0
    81. * 0 0x0000000000000001
    82. 1 0x0000000000000001
    83. (cuda-gdb) info cuda kernels
    84. Kernel Parent Dev Grid Status SMs Mask GridDim BlockDim Invocation
    85. * 0 - 0 1 Active 0x00000003 (2,1,1) (3,1,1) myfirstkernel(a=3, b=6, c=0x7fffcf000000)
    86. (cuda-gdb) info cuda contexts
    87. Context Dev State
    88. * 0x0000555555626930 0 active
    89. (cuda-gdb) info cuda managed
    90. Static managed variables on device 0 are:
    91. (cuda-gdb)

    Analyze a.out with binutils

    readelf -S a.out

    Note the .nv_fatbin section: it contains the binaries for all of the target device ISAs bundled together, which is why it is called a fat binary.

    objdump -C -d -j .nv_fatbin a.out|more

    As you can see, the device-target code is clearly embedded in the bundled fat binary inside the host executable.


    1. czl@czl-RedmiBook-14:~/workspace/work$ readelf -C -s a.out |grep get_globalidx_3d
    2. 2973: 000000000000a0fc 29 FUNC GLOBAL DEFAULT 15 get_globalidx_3d[...]
    3. 3101: 000000000000a0c2 29 FUNC GLOBAL DEFAULT 15 get_globalidx_3d[...]
    4. 3153: 000000000000a0df 29 FUNC GLOBAL DEFAULT 15 get_globalidx_3d[...]
    5. czl@czl-RedmiBook-14:~/workspace/work$

    从上图可以看出,即便是device端的代码,编译器也为其产生符号,binutils 工具中使用-C对 C++中的经过改编的名字进行反改编。


    1. czl@czl-RedmiBook-14:~/workspace/new$ nvcc --fatbin test.cu
    2. czl@czl-RedmiBook-14:~/workspace/new$ ls
    3. a.out test.cu test.fatbin
    4. czl@czl-RedmiBook-14:~/workspace/new$

    Does CUDA support recursive functions?

    The answer is yes.

    1. #include <cuda_runtime.h>
    2. #include <stdio.h>
    3. __device__ void add(int a, int b, int *c) // recursion demo: stores a + b in *c, prints it, then calls itself with b + 1
    4. {
    5. *c = a + b;
    6. printf("kernel %s line %d, i am in kernel thread %d in blockidx.x %d. blokidx.y %d blockidx.z %d *c = %d.\n", __func__, __LINE__, threadIdx.x, blockIdx.x, blockIdx.y,blockIdx.z,*c);
    7. if (b < 16) add(a,b+1,c); // base case: stop once b reaches 16 (arbitrary demo depth); without it the recursion never terminates
    8. }
    9. __global__ void myfirstkernel(int a, int b, int *c)
    10. {
    11. //printf("kernel %s line %d, i am in kernel thread %d in block %d.\n", __func__, __LINE__,threadIdx.x, blockIdx.x);
    12. //printf("block.x = %d, block.y = %d,block.z = %d\n", blockDim.x, blockDim.y,blockDim.z);
    13. //printf("thread.x = %d, thread.y = %d,thread.z = %d\n", threadIdx.x, threadIdx.y,threadIdx.z);
    14. //printf("gridDim.x = %d, gridDim.y = %d,gridDim.z = %d\n", gridDim.x, gridDim.y,gridDim.z);
    15. add(a, b, c);
    16. }
    17. __device__ int get_globalidx_1d_1d(void)
    18. {
    19. return blockIdx.x * blockDim.x + threadIdx.x;
    20. }
    21. __device__ int get_globalidx_1d_2d(void)
    22. {
    23. return blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
    24. }
    25. __device__ int get_globalidx_1d_3d(void)
    26. {
    27. return blockIdx.x * blockDim.x * blockDim.y * blockDim.z + threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + threadIdx.x;
    28. }
    29. __device__ int get_globalidx_2d_1d(void)
    30. {
    31. int blockid = blockIdx.y * gridDim.x + blockIdx.x;
    32. int threadid = blockid * blockDim.x + threadIdx.x;
    33. return threadid;
    34. }
    35. __device__ int get_globalidx_2d_2d(void)
    36. {
    37. int blockid = blockIdx.y * gridDim.x + blockIdx.x;
    38. int threadid = blockid * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;
    39. return threadid;
    40. }
    41. __device__ int get_globalidx_2d_3d(void)
    42. {
    43. int blockid = blockIdx.y * gridDim.x + blockIdx.x;
    44. int threadid = blockid * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;
    45. return threadid;
    46. }
    47. __device__ int get_globalidx_3d_1d(void)
    48. {
    49. int blockid = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
    50. int threadid = blockid * blockDim.x + threadIdx.x;
    51. return threadid;
    52. }
    53. __device__ int get_globalidx_3d_2d(void)
    54. {
    55. int blockid = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
    56. int threadid = blockid * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;
    57. return threadid;
    58. }
    59. __device__ int get_globalidx_3d_3d(void)
    60. {
    61. int blockid = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
    62. int threadid = blockid * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;
    63. return threadid;
    64. }
    65. __host__ int main(void)
    66. {
    67. int c;
    68. int *gpu_c;
    69. cudaMalloc((void **)&gpu_c, sizeof(int));
    70. myfirstkernel <<<2,3>>>(3, 6, gpu_c);
    71. cudaMemcpy(&c, gpu_c, sizeof(int), cudaMemcpyDeviceToHost);
    72. cudaFree(gpu_c);
    73. cudaDeviceSynchronize();
    74. printf("exit.c = %d.\n", c);
    75. return 0;
    76. }

    1. czl@czl-RedmiBook-14:~/workspace/work$ ./a.out
    2. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 9.
    3. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 9.
    4. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 9.
    5. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 9.
    6. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 9.
    7. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 9.
    8. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 10.
    9. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 10.
    10. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 10.
    11. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 10.
    12. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 10.
    13. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 10.
    14. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 11.
    15. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 11.
    16. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 11.
    17. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 11.
    18. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 11.
    19. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 11.
    20. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 12.
    21. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 12.
    22. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 12.
    23. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 12.
    24. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 12.
    25. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 12.
    26. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 13.
    27. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 13.
    28. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 13.
    29. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 13.
    30. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 13.
    31. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 13.
    32. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 14.
    33. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 14.
    34. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 14.
    35. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 14.
    36. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 14.
    37. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 14.
    38. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 15.
    39. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 15.
    40. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 15.
    41. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 15.
    42. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 15.
    43. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 15.
    44. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 16.
    45. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 16.
    46. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 16.
    47. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 16.
    48. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 16.
    49. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 16.
    50. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 17.
    51. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 17.
    52. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 17.
    53. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 17.
    54. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 17.
    55. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 17.
    56. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 18.
    57. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 18.
    58. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 18.
    59. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 18.
    60. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 18.
    61. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 18.
    62. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 19.
    63. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 19.
    64. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 19.
    65. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 19.
    66. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 19.
    67. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 19.
    68. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 20.
    69. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 20.
    70. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 20.
    71. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 20.
    72. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 20.
    73. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 20.
    74. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 21.
    75. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 21.
    76. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 21.
    77. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 21.
    78. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 21.
    79. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 21.
    80. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 22.
    81. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 22.
    82. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 22.
    83. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 22.
    84. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 22.
    85. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 22.
    86. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 23.
    87. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 23.
    88. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 23.
    89. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 23.
    90. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 23.
    91. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 23.
    92. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 24.
    93. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 24.
    94. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 24.
    95. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 24.
    96. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 24.
    97. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 24.
    98. exit.c = 3.
    99. czl@czl-RedmiBook-14:~/workspace/work$






    MPS stands for Multi-Process Service.


    $ sudo nvidia-cuda-mps-control -d


    $ sudo nvidia-cuda-mps-control quit

    Another blog post, on OpenCL:

    OpenCL编程初探_papaofdoudou的博客-CSDN博客_opencl 源码




  • 相关阅读:
    1671 得到山行数组的最少删除次数(贪心+二分)
    NFT Insider#110:The Sandbox与T&B Media Global合作,YGG Web3游戏峰会阵容揭晓
    Python try except else或finally异常处理
    Linux 模块的初始化过程
  • 原文地址:https://blog.csdn.net/tugouxp/article/details/126791881