• Installing a CUDA deep-learning environment on Ubuntu 22.04 && CUDA principles


    environment:

    neofetch && uname -a|lolcat

    install nvidia GPU driver:

    1. sudo add-apt-repository ppa:graphics-drivers/ppa # add the official graphics-drivers PPA
    2. sudo apt update # refresh the package lists
    3. apt list --upgradable # list the packages that can be upgraded
    4. sudo apt upgrade # upgrade all upgradable packages
    5. ubuntu-drivers devices # let Ubuntu detect the available NVIDIA driver options
    6. sudo apt install nvidia-driver-510 # install the driver recommended for your NVIDIA card

    disable the nouveau:

    Disable the open-source nouveau driver by adding it to the module blacklist: edit /etc/modprobe.d/blacklist.conf and append the following line at the end:

    blacklist nouveau

    Then rebuild the initramfs and reboot:

    1. $ sudo update-initramfs -u
    2. $ reboot

    After rebooting, run nvidia-smi:

    The CUDA version printed by nvidia-smi does not mean that the CUDA toolkit is already installed; it only indicates the highest CUDA version that this driver supports.

    1. sudo nvidia-settings # open the NVIDIA driver settings
    2. nvidia-smi # show basic GPU information

    install cuda:

    1. wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb
    2. sudo dpkg -i cuda-keyring_1.0-1_all.deb
    3. sudo apt-get update
    4. sudo apt-get -y install cuda

    nvcc:

    Add the following environment variables to your bash shell (e.g. in ~/.bashrc):

    1. export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.7/lib64
    2. export PATH=$PATH:/usr/local/cuda-11.7/bin
    3. export CUDA_HOME=/usr/local/cuda-11.7
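    After reloading the shell, the toolchain can be sanity-checked by compiling a small program that queries the driver/runtime versions and the installed devices. This is a minimal sketch (not part of the original post) that uses only standard runtime calls (cudaDriverGetVersion, cudaRuntimeGetVersion, cudaGetDeviceCount, cudaGetDeviceProperties):

    #include <cuda_runtime.h>
    #include <stdio.h>

    int main(void)
    {
        int driver_ver = 0, runtime_ver = 0, ndev = 0;
        cudaDriverGetVersion(&driver_ver);   /* highest CUDA version the driver supports */
        cudaRuntimeGetVersion(&runtime_ver); /* version of the CUDA runtime linked in    */
        cudaGetDeviceCount(&ndev);
        printf("driver %d, runtime %d, %d device(s)\n", driver_ver, runtime_ver, ndev);

        for (int i = 0; i < ndev; ++i) {
            cudaDeviceProp prop;
            cudaGetDeviceProperties(&prop, i);
            printf("device %d: %s, compute capability %d.%d\n",
                   i, prop.name, prop.major, prop.minor);
        }
        return 0;
    }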

    test: printf on the device:

    1. #include <cuda_runtime.h>
    2. #include <stdio.h>
    3. __device__ void add(void)
    4. {
    5. printf("kernel %s line %d, i am in kernel thread in block %d.\n", __func__, __LINE__,blockIdx.x);
    6. }
    7. __global__ void myfirstkernel(void)
    8. {
    9. printf("kernel %s line %d, i am in kernel thread in block %d.\n", __func__, __LINE__,blockIdx.x);
    10. add();
    11. }
    12. int main(void)
    13. {
    14. myfirstkernel <<<16,1>>>();
    15. cudaDeviceSynchronize();
    16. printf("exit.\n");
    17. return 0;
    18. }
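    These examples are compiled with nvcc, e.g. nvcc hello.cu, which produces a.out. Note that device-side printf output only becomes visible after a synchronization point such as the cudaDeviceSynchronize() call at the end of main().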

    a simple CUDA arithmetic sample:

    1. #include <cuda_runtime.h>
    2. #include <stdio.h>
    3. __device__ void add(int a, int b, int *c)
    4. {
    5. *c = a + b;
    6. printf("kernel %s line %d, i am in kernel thread in block %d. *c = %d.\n", __func__, __LINE__,blockIdx.x, *c);
    7. }
    8. __global__ void myfirstkernel(int a, int b, int *c)
    9. {
    10. printf("kernel %s line %d, i am in kernel thread in block %d.\n", __func__, __LINE__,blockIdx.x);
    11. add(a, b, c);
    12. }
    13. int main(void)
    14. {
    15. int c;
    16. int *gpu_c;
    17. cudaMalloc((void **)&gpu_c, sizeof(int));
    18. myfirstkernel <<<16,1>>>(3, 6, gpu_c);
    19. cudaMemcpy(&c, gpu_c, sizeof(int), cudaMemcpyDeviceToHost);
    20. cudaFree(gpu_c);
    21. cudaDeviceSynchronize();
    22. printf("exit.c = %d.\n", c);
    23. return 0;
    24. }
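    The CUDA runtime calls above return error codes that the sample silently ignores. A common pattern is to wrap every call in a small checking macro; the following is a minimal sketch (CUDA_CHECK is a name chosen here, not a CUDA API):

    #include <cuda_runtime.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* abort with a readable message if a runtime call fails */
    #define CUDA_CHECK(call)                                              \
        do {                                                              \
            cudaError_t err_ = (call);                                    \
            if (err_ != cudaSuccess) {                                    \
                fprintf(stderr, "CUDA error '%s' at %s:%d\n",             \
                        cudaGetErrorString(err_), __FILE__, __LINE__);    \
                exit(EXIT_FAILURE);                                       \
            }                                                             \
        } while (0)

    /* usage: CUDA_CHECK(cudaMalloc((void **)&gpu_c, sizeof(int))); */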

    change the thread and block configuration (<<<1,16>>> instead of <<<16,1>>>):

    1. #include <cuda_runtime.h>
    2. #include <stdio.h>
    3. __device__ void add(int a, int b, int *c)
    4. {
    5. *c = a + b;
    6. printf("kernel %s line %d, i am in kernel thread %d in block %d. *c = %d.\n", __func__, __LINE__, threadIdx.x, blockIdx.x, *c);
    7. }
    8. __global__ void myfirstkernel(int a, int b, int *c)
    9. {
    10. printf("kernel %s line %d, i am in kernel thread %d in block %d.\n", __func__, __LINE__,threadIdx.x, blockIdx.x);
    11. add(a, b, c);
    12. }
    13. int main(void)
    14. {
    15. int c;
    16. int *gpu_c;
    17. cudaMalloc((void **)&gpu_c, sizeof(int));
    18. myfirstkernel <<<1,16>>>(3, 6, gpu_c);
    19. cudaMemcpy(&c, gpu_c, sizeof(int), cudaMemcpyDeviceToHost);
    20. cudaFree(gpu_c);
    21. cudaDeviceSynchronize();
    22. printf("exit.c = %d.\n", c);
    23. return 0;
    24. }

    1. #include <cuda_runtime.h>
    2. #include <stdio.h>
    3. __device__ void add(int a, int b, int *c)
    4. {
    5. *c = a + b;
    6. printf("kernel %s line %d, i am in kernel thread %d in block %d. *c = %d.\n", __func__, __LINE__, threadIdx.x, blockIdx.x, *c);
    7. }
    8. __global__ void myfirstkernel(int a, int b, int *c)
    9. {
    10. printf("kernel %s line %d, i am in kernel thread %d in block %d.\n", __func__, __LINE__,threadIdx.x, blockIdx.x);
    11. printf("block.x = %d, block.y = %d,block.z = %d\n", blockDim.x, blockDim.y,blockDim.z);
    12. printf("thread.x = %d, thread.y = %d,thread.z = %d\n", threadIdx.x, threadIdx.y,threadIdx.z);
    13. add(a, b, c);
    14. }
    15. int main(void)
    16. {
    17. int c;
    18. int *gpu_c;
    19. cudaMalloc((void **)&gpu_c, sizeof(int));
    20. myfirstkernel <<<3,16>>>(3, 6, gpu_c);
    21. cudaMemcpy(&c, gpu_c, sizeof(int), cudaMemcpyDeviceToHost);
    22. cudaFree(gpu_c);
    23. cudaDeviceSynchronize();
    24. printf("exit.c = %d.\n", c);
    25. return 0;
    26. }
    1. czl@czl-RedmiBook-14:~/workspace/work$ ./a.out
    2. kernel myfirstkernel line 12, i am in kernel thread 0 in block 0.
    3. kernel myfirstkernel line 12, i am in kernel thread 1 in block 0.
    4. kernel myfirstkernel line 12, i am in kernel thread 2 in block 0.
    5. kernel myfirstkernel line 12, i am in kernel thread 3 in block 0.
    6. kernel myfirstkernel line 12, i am in kernel thread 4 in block 0.
    7. kernel myfirstkernel line 12, i am in kernel thread 5 in block 0.
    8. kernel myfirstkernel line 12, i am in kernel thread 6 in block 0.
    9. kernel myfirstkernel line 12, i am in kernel thread 7 in block 0.
    10. kernel myfirstkernel line 12, i am in kernel thread 8 in block 0.
    11. kernel myfirstkernel line 12, i am in kernel thread 9 in block 0.
    12. kernel myfirstkernel line 12, i am in kernel thread 10 in block 0.
    13. kernel myfirstkernel line 12, i am in kernel thread 11 in block 0.
    14. kernel myfirstkernel line 12, i am in kernel thread 12 in block 0.
    15. kernel myfirstkernel line 12, i am in kernel thread 13 in block 0.
    16. kernel myfirstkernel line 12, i am in kernel thread 14 in block 0.
    17. kernel myfirstkernel line 12, i am in kernel thread 15 in block 0.
    18. kernel myfirstkernel line 12, i am in kernel thread 0 in block 1.
    19. kernel myfirstkernel line 12, i am in kernel thread 1 in block 1.
    20. kernel myfirstkernel line 12, i am in kernel thread 2 in block 1.
    21. kernel myfirstkernel line 12, i am in kernel thread 3 in block 1.
    22. kernel myfirstkernel line 12, i am in kernel thread 4 in block 1.
    23. kernel myfirstkernel line 12, i am in kernel thread 5 in block 1.
    24. kernel myfirstkernel line 12, i am in kernel thread 6 in block 1.
    25. kernel myfirstkernel line 12, i am in kernel thread 7 in block 1.
    26. kernel myfirstkernel line 12, i am in kernel thread 8 in block 1.
    27. kernel myfirstkernel line 12, i am in kernel thread 9 in block 1.
    28. kernel myfirstkernel line 12, i am in kernel thread 10 in block 1.
    29. kernel myfirstkernel line 12, i am in kernel thread 11 in block 1.
    30. kernel myfirstkernel line 12, i am in kernel thread 12 in block 1.
    31. kernel myfirstkernel line 12, i am in kernel thread 13 in block 1.
    32. kernel myfirstkernel line 12, i am in kernel thread 14 in block 1.
    33. kernel myfirstkernel line 12, i am in kernel thread 15 in block 1.
    34. kernel myfirstkernel line 12, i am in kernel thread 0 in block 2.
    35. kernel myfirstkernel line 12, i am in kernel thread 1 in block 2.
    36. kernel myfirstkernel line 12, i am in kernel thread 2 in block 2.
    37. kernel myfirstkernel line 12, i am in kernel thread 3 in block 2.
    38. kernel myfirstkernel line 12, i am in kernel thread 4 in block 2.
    39. kernel myfirstkernel line 12, i am in kernel thread 5 in block 2.
    40. kernel myfirstkernel line 12, i am in kernel thread 6 in block 2.
    41. kernel myfirstkernel line 12, i am in kernel thread 7 in block 2.
    42. kernel myfirstkernel line 12, i am in kernel thread 8 in block 2.
    43. kernel myfirstkernel line 12, i am in kernel thread 9 in block 2.
    44. kernel myfirstkernel line 12, i am in kernel thread 10 in block 2.
    45. kernel myfirstkernel line 12, i am in kernel thread 11 in block 2.
    46. kernel myfirstkernel line 12, i am in kernel thread 12 in block 2.
    47. kernel myfirstkernel line 12, i am in kernel thread 13 in block 2.
    48. kernel myfirstkernel line 12, i am in kernel thread 14 in block 2.
    49. kernel myfirstkernel line 12, i am in kernel thread 15 in block 2.
    50. block.x = 16, block.y = 1,block.z = 1
    51. block.x = 16, block.y = 1,block.z = 1
    52. block.x = 16, block.y = 1,block.z = 1
    53. block.x = 16, block.y = 1,block.z = 1
    54. block.x = 16, block.y = 1,block.z = 1
    55. block.x = 16, block.y = 1,block.z = 1
    56. block.x = 16, block.y = 1,block.z = 1
    57. block.x = 16, block.y = 1,block.z = 1
    58. block.x = 16, block.y = 1,block.z = 1
    59. block.x = 16, block.y = 1,block.z = 1
    60. block.x = 16, block.y = 1,block.z = 1
    61. block.x = 16, block.y = 1,block.z = 1
    62. block.x = 16, block.y = 1,block.z = 1
    63. block.x = 16, block.y = 1,block.z = 1
    64. block.x = 16, block.y = 1,block.z = 1
    65. block.x = 16, block.y = 1,block.z = 1
    66. block.x = 16, block.y = 1,block.z = 1
    67. block.x = 16, block.y = 1,block.z = 1
    68. block.x = 16, block.y = 1,block.z = 1
    69. block.x = 16, block.y = 1,block.z = 1
    70. block.x = 16, block.y = 1,block.z = 1
    71. block.x = 16, block.y = 1,block.z = 1
    72. block.x = 16, block.y = 1,block.z = 1
    73. block.x = 16, block.y = 1,block.z = 1
    74. block.x = 16, block.y = 1,block.z = 1
    75. block.x = 16, block.y = 1,block.z = 1
    76. block.x = 16, block.y = 1,block.z = 1
    77. block.x = 16, block.y = 1,block.z = 1
    78. block.x = 16, block.y = 1,block.z = 1
    79. block.x = 16, block.y = 1,block.z = 1
    80. block.x = 16, block.y = 1,block.z = 1
    81. block.x = 16, block.y = 1,block.z = 1
    82. block.x = 16, block.y = 1,block.z = 1
    83. block.x = 16, block.y = 1,block.z = 1
    84. block.x = 16, block.y = 1,block.z = 1
    85. block.x = 16, block.y = 1,block.z = 1
    86. block.x = 16, block.y = 1,block.z = 1
    87. block.x = 16, block.y = 1,block.z = 1
    88. block.x = 16, block.y = 1,block.z = 1
    89. block.x = 16, block.y = 1,block.z = 1
    90. block.x = 16, block.y = 1,block.z = 1
    91. block.x = 16, block.y = 1,block.z = 1
    92. block.x = 16, block.y = 1,block.z = 1
    93. block.x = 16, block.y = 1,block.z = 1
    94. block.x = 16, block.y = 1,block.z = 1
    95. block.x = 16, block.y = 1,block.z = 1
    96. block.x = 16, block.y = 1,block.z = 1
    97. block.x = 16, block.y = 1,block.z = 1
    98. thread.x = 0, thread.y = 0,thread.z = 0
    99. thread.x = 1, thread.y = 0,thread.z = 0
    100. thread.x = 2, thread.y = 0,thread.z = 0
    101. thread.x = 3, thread.y = 0,thread.z = 0
    102. thread.x = 4, thread.y = 0,thread.z = 0
    103. thread.x = 5, thread.y = 0,thread.z = 0
    104. thread.x = 6, thread.y = 0,thread.z = 0
    105. thread.x = 7, thread.y = 0,thread.z = 0
    106. thread.x = 8, thread.y = 0,thread.z = 0
    107. thread.x = 9, thread.y = 0,thread.z = 0
    108. thread.x = 10, thread.y = 0,thread.z = 0
    109. thread.x = 11, thread.y = 0,thread.z = 0
    110. thread.x = 12, thread.y = 0,thread.z = 0
    111. thread.x = 13, thread.y = 0,thread.z = 0
    112. thread.x = 14, thread.y = 0,thread.z = 0
    113. thread.x = 15, thread.y = 0,thread.z = 0
    114. thread.x = 0, thread.y = 0,thread.z = 0
    115. thread.x = 1, thread.y = 0,thread.z = 0
    116. thread.x = 2, thread.y = 0,thread.z = 0
    117. thread.x = 3, thread.y = 0,thread.z = 0
    118. thread.x = 4, thread.y = 0,thread.z = 0
    119. thread.x = 5, thread.y = 0,thread.z = 0
    120. thread.x = 6, thread.y = 0,thread.z = 0
    121. thread.x = 7, thread.y = 0,thread.z = 0
    122. thread.x = 8, thread.y = 0,thread.z = 0
    123. thread.x = 9, thread.y = 0,thread.z = 0
    124. thread.x = 10, thread.y = 0,thread.z = 0
    125. thread.x = 11, thread.y = 0,thread.z = 0
    126. thread.x = 12, thread.y = 0,thread.z = 0
    127. thread.x = 13, thread.y = 0,thread.z = 0
    128. thread.x = 14, thread.y = 0,thread.z = 0
    129. thread.x = 15, thread.y = 0,thread.z = 0
    130. thread.x = 0, thread.y = 0,thread.z = 0
    131. thread.x = 1, thread.y = 0,thread.z = 0
    132. thread.x = 2, thread.y = 0,thread.z = 0
    133. thread.x = 3, thread.y = 0,thread.z = 0
    134. thread.x = 4, thread.y = 0,thread.z = 0
    135. thread.x = 5, thread.y = 0,thread.z = 0
    136. thread.x = 6, thread.y = 0,thread.z = 0
    137. thread.x = 7, thread.y = 0,thread.z = 0
    138. thread.x = 8, thread.y = 0,thread.z = 0
    139. thread.x = 9, thread.y = 0,thread.z = 0
    140. thread.x = 10, thread.y = 0,thread.z = 0
    141. thread.x = 11, thread.y = 0,thread.z = 0
    142. thread.x = 12, thread.y = 0,thread.z = 0
    143. thread.x = 13, thread.y = 0,thread.z = 0
    144. thread.x = 14, thread.y = 0,thread.z = 0
    145. thread.x = 15, thread.y = 0,thread.z = 0
    146. kernel add line 7, i am in kernel thread 0 in block 0. *c = 9.
    147. kernel add line 7, i am in kernel thread 1 in block 0. *c = 9.
    148. kernel add line 7, i am in kernel thread 2 in block 0. *c = 9.
    149. kernel add line 7, i am in kernel thread 3 in block 0. *c = 9.
    150. kernel add line 7, i am in kernel thread 4 in block 0. *c = 9.
    151. kernel add line 7, i am in kernel thread 5 in block 0. *c = 9.
    152. kernel add line 7, i am in kernel thread 6 in block 0. *c = 9.
    153. kernel add line 7, i am in kernel thread 7 in block 0. *c = 9.
    154. kernel add line 7, i am in kernel thread 8 in block 0. *c = 9.
    155. kernel add line 7, i am in kernel thread 9 in block 0. *c = 9.
    156. kernel add line 7, i am in kernel thread 10 in block 0. *c = 9.
    157. kernel add line 7, i am in kernel thread 11 in block 0. *c = 9.
    158. kernel add line 7, i am in kernel thread 12 in block 0. *c = 9.
    159. kernel add line 7, i am in kernel thread 13 in block 0. *c = 9.
    160. kernel add line 7, i am in kernel thread 14 in block 0. *c = 9.
    161. kernel add line 7, i am in kernel thread 15 in block 0. *c = 9.
    162. kernel add line 7, i am in kernel thread 0 in block 2. *c = 9.
    163. kernel add line 7, i am in kernel thread 1 in block 2. *c = 9.
    164. kernel add line 7, i am in kernel thread 2 in block 2. *c = 9.
    165. kernel add line 7, i am in kernel thread 3 in block 2. *c = 9.
    166. kernel add line 7, i am in kernel thread 4 in block 2. *c = 9.
    167. kernel add line 7, i am in kernel thread 5 in block 2. *c = 9.
    168. kernel add line 7, i am in kernel thread 6 in block 2. *c = 9.
    169. kernel add line 7, i am in kernel thread 7 in block 2. *c = 9.
    170. kernel add line 7, i am in kernel thread 8 in block 2. *c = 9.
    171. kernel add line 7, i am in kernel thread 9 in block 2. *c = 9.
    172. kernel add line 7, i am in kernel thread 10 in block 2. *c = 9.
    173. kernel add line 7, i am in kernel thread 11 in block 2. *c = 9.
    174. kernel add line 7, i am in kernel thread 12 in block 2. *c = 9.
    175. kernel add line 7, i am in kernel thread 13 in block 2. *c = 9.
    176. kernel add line 7, i am in kernel thread 14 in block 2. *c = 9.
    177. kernel add line 7, i am in kernel thread 15 in block 2. *c = 9.
    178. kernel add line 7, i am in kernel thread 0 in block 1. *c = 9.
    179. kernel add line 7, i am in kernel thread 1 in block 1. *c = 9.
    180. kernel add line 7, i am in kernel thread 2 in block 1. *c = 9.
    181. kernel add line 7, i am in kernel thread 3 in block 1. *c = 9.
    182. kernel add line 7, i am in kernel thread 4 in block 1. *c = 9.
    183. kernel add line 7, i am in kernel thread 5 in block 1. *c = 9.
    184. kernel add line 7, i am in kernel thread 6 in block 1. *c = 9.
    185. kernel add line 7, i am in kernel thread 7 in block 1. *c = 9.
    186. kernel add line 7, i am in kernel thread 8 in block 1. *c = 9.
    187. kernel add line 7, i am in kernel thread 9 in block 1. *c = 9.
    188. kernel add line 7, i am in kernel thread 10 in block 1. *c = 9.
    189. kernel add line 7, i am in kernel thread 11 in block 1. *c = 9.
    190. kernel add line 7, i am in kernel thread 12 in block 1. *c = 9.
    191. kernel add line 7, i am in kernel thread 13 in block 1. *c = 9.
    192. kernel add line 7, i am in kernel thread 14 in block 1. *c = 9.
    193. kernel add line 7, i am in kernel thread 15 in block 1. *c = 9.
    194. exit.c = 9.
    195. czl@czl-RedmiBook-14:~/workspace/work$

    gridDim.x/gridDim.y/gridDim.z

    1. #include <cuda_runtime.h>
    2. #include <stdio.h>
    3. __device__ void add(int a, int b, int *c)
    4. {
    5. *c = a + b;
    6. printf("kernel %s line %d, i am in kernel thread %d in blockidx.x %d. blokidx.y %d blockidx.z %d *c = %d.\n", __func__, __LINE__, threadIdx.x, blockIdx.x, blockIdx.y,blockIdx.z,*c);
    7. }
    8. __global__ void myfirstkernel(int a, int b, int *c)
    9. {
    10. printf("kernel %s line %d, i am in kernel thread %d in block %d.\n", __func__, __LINE__,threadIdx.x, blockIdx.x);
    11. printf("block.x = %d, block.y = %d,block.z = %d\n", blockDim.x, blockDim.y,blockDim.z);
    12. printf("thread.x = %d, thread.y = %d,thread.z = %d\n", threadIdx.x, threadIdx.y,threadIdx.z);
    13. printf("gridDim.x = %d, gridDim.y = %d,gridDim.z = %d\n", gridDim.x, gridDim.y,gridDim.z);
    14. add(a, b, c);
    15. }
    16. int main(void)
    17. {
    18. int c;
    19. int *gpu_c;
    20. cudaMalloc((void **)&gpu_c, sizeof(int));
    21. myfirstkernel <<<2,3>>>(3, 6, gpu_c);
    22. cudaMemcpy(&c, gpu_c, sizeof(int), cudaMemcpyDeviceToHost);
    23. cudaFree(gpu_c);
    24. cudaDeviceSynchronize();
    25. printf("exit.c = %d.\n", c);
    26. return 0;
    27. }

    Coordinate system convention

    kernel call convention:

    The general form of a kernel launch is:

    dim3 gridSize(3,2,1);

    dim3 blockSize(2,2,2);

    my_first_kernel<<<gridSize, blockSize>>>(para1, para2, ...);
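    A minimal, self-contained sketch of such a launch follows; the kernel body and parameter values are illustrative and not taken from the original post:

    #include <cuda_runtime.h>
    #include <stdio.h>

    __global__ void my_first_kernel(int para1, int para2)
    {
        /* each thread prints its 3-D block and thread coordinates */
        printf("block (%d,%d,%d) thread (%d,%d,%d) para1=%d para2=%d\n",
               blockIdx.x, blockIdx.y, blockIdx.z,
               threadIdx.x, threadIdx.y, threadIdx.z, para1, para2);
    }

    int main(void)
    {
        dim3 gridSize(3, 2, 1);  /* 3*2*1 = 6 blocks            */
        dim3 blockSize(2, 2, 2); /* 2*2*2 = 8 threads per block */
        my_first_kernel<<<gridSize, blockSize>>>(1, 2);
        cudaDeviceSynchronize(); /* wait for the kernel and flush device-side printf */
        return 0;
    }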

    A host function may be declared with the __host__ qualifier, while a global function must be declared with the __global__ qualifier; a function declared without any qualifier is treated as a host function. The qualifier is written before the return type. As the programs above show, a call to a global function contains, in addition to the function name and argument list, a part enclosed in three less-than signs "<" and three greater-than signs ">". This part specifies the number of thread groups used in the parallel computation and the number of threads in each group. In CUDA each such thread group is called a block, each block consists of a number of threads, and all the blocks that carry out one kernel call together form a grid.

    Both the block and grid dimensions can be expressed as three-component vectors; in other words, blocks and grids are logically three-dimensional arrays, where the elements of a block are threads and the elements of a grid are blocks. Under current CUDA compute capabilities, both the number of dimensions and the size of each dimension of blocks and grids are limited.

    How, then, does an executing thread know its own position within the grid, block and thread hierarchy? A simple approach is to let each thread record its own x index (that is, threadIdx.x). The thread index is the index of a thread within its block; since a block is three-dimensional, the thread index is also a three-component constant vector, threadIdx, accessed as threadIdx.x, threadIdx.y and threadIdx.z. Within a single block every thread index is unique, but once a grid contains two or more blocks, duplicate thread indices appear. Correspondingly, every block within a grid has a unique block index, blockIdx, likewise a three-component constant vector: blockIdx.x, blockIdx.y, blockIdx.z. Since a kernel call launches exactly one grid, there is no grid index.

    The block and grid dimensions themselves are also available as three-component constant vectors: the block dimensions are stored in blockDim and the grid dimensions in gridDim. Both are CUDA C built-in variables that can be used directly in device code. In practice, the programmer usually needs a unique identifier for every thread, the global thread ID, so that different tasks can be assigned to different threads; with multiple blocks this ID must not repeat either. The global thread ID matters in practice: it determines which part of the input data a thread processes, and it affects thread scheduling and memory usage.

    How is the global thread ID computed when the block or the grid is multi-dimensional? The cases are listed one by one below:

    1D grid && 1d block.

    1. __device__ int get_globalidx_1d_1d(void)
    2. {
    3. return blockIdx.x * blockDim.x + threadIdx.x;
    4. }

    1D grid && 2d block.

    1. __device__ int get_globalidx_1d_2d(void)
    2. {
    3. return blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
    4. }

    1d grid && 3d block

    1. __device__ int get_globalidx_1d_3d(void)
    2. {
    3. return blockIdx.x * blockDim.x * blockDim.y * blockDim.z + threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + threadIdx.x;
    4. }

    2d grid && 1d block

    1. __device__ int get_globalidx_2d_1d(void)
    2. {
    3. int blockid = blockIdx.y * gridDim.x + blockIdx.x;
    4. int threadid = blockid * blockDim.x + threadIdx.x;
    5. return threadid;
    6. }

    2d grid && 2d block

    1. __device__ int get_globalidx_2d_2d(void)
    2. {
    3. int blockid = blockIdx.y * gridDim.x + blockIdx.x;
    4. int threadid = blockid * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;
    5. return threadid;
    6. }

    2d grid && 3d block

    1. __device__ int get_globalidx_2d_3d(void)
    2. {
    3. int blockid = blockIdx.y * gridDim.x + blockIdx.x;
    4. int threadid = blockid * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;
    5. return threadid;
    6. }

    3d grid && 1d block

    1. __device__ int get_globalidx_3d_1d(void)
    2. {
    3. int blockid = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
    4. int threadid = blockid * blockDim.x + threadIdx.x;
    5. return threadid;
    6. }

    3d grid && 2d block

    1. __device__ int get_globalidx_3d_2d(void)
    2. {
    3. int blockid = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
    4. int threadid = blockid * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;
    5. return threadid;
    6. }

    3d grid && 3d block

    1. __device__ int get_globalidx_3d_3d(void)
    2. {
    3. int blockid = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
    4. int threadid = blockid * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;
    5. return threadid;
    6. }

    code:

    1. #include <cuda_runtime.h>
    2. #include <stdio.h>
    3. __device__ void add(int a, int b, int *c)
    4. {
    5. *c = a + b;
    6. printf("kernel %s line %d, i am in kernel thread %d in blockidx.x %d. blokidx.y %d blockidx.z %d *c = %d.\n", __func__, __LINE__, threadIdx.x, blockIdx.x, blockIdx.y,blockIdx.z,*c);
    7. }
    8. __global__ void myfirstkernel(int a, int b, int *c)
    9. {
    10. printf("kernel %s line %d, i am in kernel thread %d in block %d.\n", __func__, __LINE__,threadIdx.x, blockIdx.x);
    11. printf("block.x = %d, block.y = %d,block.z = %d\n", blockDim.x, blockDim.y,blockDim.z);
    12. printf("thread.x = %d, thread.y = %d,thread.z = %d\n", threadIdx.x, threadIdx.y,threadIdx.z);
    13. printf("gridDim.x = %d, gridDim.y = %d,gridDim.z = %d\n", gridDim.x, gridDim.y,gridDim.z);
    14. add(a, b, c);
    15. }
    16. __device__ int get_globalidx_1d_1d(void)
    17. {
    18. return blockIdx.x * blockDim.x + threadIdx.x;
    19. }
    20. __device__ int get_globalidx_1d_2d(void)
    21. {
    22. return blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
    23. }
    24. __device__ int get_globalidx_1d_3d(void)
    25. {
    26. return blockIdx.x * blockDim.x * blockDim.y * blockDim.z + threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + threadIdx.x;
    27. }
    28. __device__ int get_globalidx_2d_1d(void)
    29. {
    30. int blockid = blockIdx.y * gridDim.x + blockIdx.x;
    31. int threadid = blockid * blockDim.x + threadIdx.x;
    32. return threadid;
    33. }
    34. __device__ int get_globalidx_2d_2d(void)
    35. {
    36. int blockid = blockIdx.y * gridDim.x + blockIdx.x;
    37. int threadid = blockid * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;
    38. return threadid;
    39. }
    40. __device__ int get_globalidx_2d_3d(void)
    41. {
    42. int blockid = blockIdx.y * gridDim.x + blockIdx.x;
    43. int threadid = blockid * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;
    44. return threadid;
    45. }
    46. __device__ int get_globalidx_3d_1d(void)
    47. {
    48. int blockid = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
    49. int threadid = blockid * blockDim.x + threadIdx.x;
    50. return threadid;
    51. }
    52. __device__ int get_globalidx_3d_2d(void)
    53. {
    54. int blockid = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
    55. int threadid = blockid * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;
    56. return threadid;
    57. }
    58. __device__ int get_globalidx_3d_3d(void)
    59. {
    60. int blockid = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
    61. int threadid = blockid * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;
    62. return threadid;
    63. }
    64. __host__ int main(void)
    65. {
    66. int c;
    67. int *gpu_c;
    68. cudaMalloc((void **)&gpu_c, sizeof(int));
    69. myfirstkernel <<<2,3>>>(3, 6, gpu_c);
    70. cudaMemcpy(&c, gpu_c, sizeof(int), cudaMemcpyDeviceToHost);
    71. cudaFree(gpu_c);
    72. cudaDeviceSynchronize();
    73. printf("exit.c = %d.\n", c);
    74. return 0;
    75. }

    Another way to think about it

    The most complex case, a 3-D grid of 3-D blocks, can be viewed as a single six-dimensional array:

    Array[gridDim.z][gridDim.y][gridDim.x][blockDim.z][blockDim.y][blockDim.x];
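    Under this view, the global thread ID of the 3-D grid / 3-D block case is simply the row-major linearization of the six coordinates. A sketch that is equivalent to get_globalidx_3d_3d() above:

    __device__ int get_globalidx_6d_view(void)
    {
        /* row-major index into
           Array[gridDim.z][gridDim.y][gridDim.x][blockDim.z][blockDim.y][blockDim.x] */
        int blockid  = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
        int threadid = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
        return blockid * (blockDim.x * blockDim.y * blockDim.z) + threadid;
    }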

    debugging CUDA code with cuda-gdb

    nvcc -g -G hello.cu 
    1. --debug (-g)
    2. Generate debug information for host code.
    3. --device-debug (-G)
    4. Generate debug information for device code. If --dopt is not specified, then
    5. turns off all optimizations. Don't use for profiling; use -lineinfo instead.
    1. czl@czl-RedmiBook-14:~/workspace/work$ cuda-gdb a.out
    2. NVIDIA (R) CUDA Debugger
    3. 11.7 release
    4. Portions Copyright (C) 2007-2022 NVIDIA Corporation
    5. GNU gdb (GDB) 10.2
    6. Copyright (C) 2021 Free Software Foundation, Inc.
    7. License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
    8. This is free software: you are free to change and redistribute it.
    9. There is NO WARRANTY, to the extent permitted by law.
    10. Type "show copying" and "show warranty" for details.
    11. This GDB was configured as "x86_64-pc-linux-gnu".
    12. Type "show configuration" for configuration details.
    13. For bug reporting instructions, please see:
    14. <https://www.gnu.org/software/gdb/bugs/>.
    15. Find the GDB manual and other documentation resources online at:
    16. <http://www.gnu.org/software/gdb/documentation/>.
    17. For help, type "help".
    18. Type "apropos word" to search for commands related to "word"...
    19. Reading symbols from a.out...
    20. (cuda-gdb) b myfirstkernel(int, int, int*)
    21. Breakpoint 1 at 0xa3e3: file /home/czl/workspace/work/hello.cu, line 11.
    22. (cuda-gdb) r
    23. Starting program: /home/czl/workspace/work/a.out
    24. [Thread debugging using libthread_db enabled]
    25. Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
    26. [Detaching after fork from child process 38487]
    27. [New Thread 0x7fffdffff000 (LWP 38492)]
    28. [New Thread 0x7fffdf7fe000 (LWP 38493)]
    29. [New Thread 0x7fffdeffd000 (LWP 38494)]
    30. [Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 0, lane 0]
    31. Thread 1 "a.out" hit Breakpoint 1, myfirstkernel<<<(2,1,1),(3,1,1)>>> (a=3, b=6, c=0x7fffcf000000) at hello.cu:12
    32. 12 printf("kernel %s line %d, i am in kernel thread %d in block %d.\n", __func__, __LINE__,threadIdx.x, blockIdx.x);
    33. (cuda-gdb) info cuda threads
    34. BlockIdx ThreadIdx To BlockIdx To ThreadIdx Count Virtual PC Filename Line
    35. Kernel 0
    36. * (0,0,0) (0,0,0) (1,0,0) (2,0,0) 6 0x0000555555a31628 hello.cu 12
    37. (cuda-gdb)
    38. BlockIdx ThreadIdx To BlockIdx To ThreadIdx Count Virtual PC Filename Line
    39. Kernel 0
    40. * (0,0,0) (0,0,0) (1,0,0) (2,0,0) 6 0x0000555555a31628 hello.cu 12
    41. (cuda-gdb)
    42. (cuda-gdb) info threads
    43. Id Target Id Frame
    44. 1 Thread 0x7ffff7d7d000 (LWP 38482) "a.out" 0x00007ffff6ce679c in ?? () from /lib/x86_64-linux-gnu/libcuda.so.1
    45. 2 Thread 0x7fffdffff000 (LWP 38492) "cuda-EvtHandlr" 0x00007ffff7e98d7f in poll () from /lib/x86_64-linux-gnu/libc.so.6
    46. 3 Thread 0x7fffdf7fe000 (LWP 38493) "cuda-EvtHandlr" 0x00007ffff7e98d7f in poll () from /lib/x86_64-linux-gnu/libc.so.6
    47. 4 Thread 0x7fffdeffd000 (LWP 38494) "a.out" 0x00007ffff7e11197 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
    48. (cuda-gdb)
    49. Id Target Id Frame
    50. 1 Thread 0x7ffff7d7d000 (LWP 38482) "a.out" 0x00007ffff6ce679c in ?? () from /lib/x86_64-linux-gnu/libcuda.so.1
    51. 2 Thread 0x7fffdffff000 (LWP 38492) "cuda-EvtHandlr" 0x00007ffff7e98d7f in poll () from /lib/x86_64-linux-gnu/libc.so.6
    52. 3 Thread 0x7fffdf7fe000 (LWP 38493) "cuda-EvtHandlr" 0x00007ffff7e98d7f in poll () from /lib/x86_64-linux-gnu/libc.so.6
    53. 4 Thread 0x7fffdeffd000 (LWP 38494) "a.out" 0x00007ffff7e11197 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
    54. (cuda-gdb)
    55. (cuda-gdb) info cuda
    56. blocks devices lanes launch trace managed threads
    57. contexts kernels launch children line sms warps
    58. (cuda-gdb) info cuda devices
    59. Dev PCI Bus/Dev ID Name Description SM Type SMs Warps/SM Lanes/Warp Max Regs/Lane Active SMs Mask
    60. * 0 02:00.0 NVIDIA GeForce MX250 GP108-A sm_61 3 64 32 256 0x00000000000000000000000000000003
    61. (cuda-gdb) info cuda devices
    62. Dev PCI Bus/Dev ID Name Description SM Type SMs Warps/SM Lanes/Warp Max Regs/Lane Active SMs Mask
    63. * 0 02:00.0 NVIDIA GeForce MX250 GP108-A sm_61 3 64 32 256 0x00000000000000000000000000000003
    64. (cuda-gdb) info cuda blocks
    65. BlockIdx To BlockIdx Count State
    66. Kernel 0
    67. * (0,0,0) (1,0,0) 2 running
    68. (cuda-gdb) info cuda lanes
    69. Ln State Physical PC ThreadIdx Exception
    70. Device 0 SM 0 Warp 0
    71. * 0 active 0x0000000000000188 (0,0,0) None
    72. 1 active 0x0000000000000188 (1,0,0) None
    73. 2 active 0x0000000000000188 (2,0,0) None
    74. (cuda-gdb) info cuda threads
    75. BlockIdx ThreadIdx To BlockIdx To ThreadIdx Count Virtual PC Filename Line
    76. Kernel 0
    77. * (0,0,0) (0,0,0) (1,0,0) (2,0,0) 6 0x0000555555a31628 hello.cu 12
    78. (cuda-gdb) info cuda sms
    79. SM Active Warps Mask
    80. Device 0
    81. * 0 0x0000000000000001
    82. 1 0x0000000000000001
    83. (cuda-gdb) info cuda kernels
    84. Kernel Parent Dev Grid Status SMs Mask GridDim BlockDim Invocation
    85. * 0 - 0 1 Active 0x00000003 (2,1,1) (3,1,1) myfirstkernel(a=3, b=6, c=0x7fffcf000000)
    86. (cuda-gdb) info cuda contexts
    87. Context Dev State
    88. * 0x0000555555626930 0 active
    89. (cuda-gdb) info cuda managed
    90. Static managed variables on device 0 are:
    91. (cuda-gdb)

    analyzing a.out with binutils

    readelf -S a.out

    Note the .nv_fatbin section: it bundles the binaries for all kinds of target device ISAs together, which is why it is called a fat binary.

    objdump -C -d -j .nv_fatbin a.out|more

    As the disassembly shows, the bundled fat binary clearly carries the embedded device-target code alongside the host program.

    The CUDA architecture spans many compute capabilities, such as sm_20, sm_30 and so on. The kernel binaries produced by nvcc may therefore contain several .cubin files for different compute capabilities, and CUDA packs these .cubin files into a single .fatbin file. The benefit is that if the program was originally built for, say, sm_60 but the actual hardware does not reach that version, the GPU driver can pick a .cubin that matches the hardware from the fat binary, without recompiling the whole CUDA program.
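    The embedded device code can also be inspected directly with NVIDIA's cuobjdump tool, for example cuobjdump -ptx a.out or cuobjdump -sass a.out to dump the embedded PTX and SASS respectively.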

    1. czl@czl-RedmiBook-14:~/workspace/work$ readelf -C -s a.out |grep get_globalidx_3d
    2. 2973: 000000000000a0fc 29 FUNC GLOBAL DEFAULT 15 get_globalidx_3d[...]
    3. 3101: 000000000000a0c2 29 FUNC GLOBAL DEFAULT 15 get_globalidx_3d[...]
    4. 3153: 000000000000a0df 29 FUNC GLOBAL DEFAULT 15 get_globalidx_3d[...]
    5. czl@czl-RedmiBook-14:~/workspace/work$

    As the listing above shows, the compiler emits symbols even for device-side code; the -C option of the binutils tools demangles the mangled C++ names.

    --fatbin (-fatbin): compile all .cu, .ptx and .cubin input files into a device-side .fatbin file. With this option nvcc discards the host-side code of all .cu input files.

    1. czl@czl-RedmiBook-14:~/workspace/new$ nvcc --fatbin test.cu
    2. czl@czl-RedmiBook-14:~/workspace/new$ ls
    3. a.out test.cu test.fatbin
    4. czl@czl-RedmiBook-14:~/workspace/new$

    does CUDA support recursive functions?

    The answer is yes, as the following example shows:

    1. #include <cuda_runtime.h>
    2. #include <stdio.h>
    3. __device__ void add(int a, int b, int *c)
    4. {
    5. *c = a + b;
    6. printf("kernel %s line %d, i am in kernel thread %d in blockidx.x %d. blokidx.y %d blockidx.z %d *c = %d.\n", __func__, __LINE__, threadIdx.x, blockIdx.x, blockIdx.y,blockIdx.z,*c);
    7. add(a,b+1,c);
    8. }
    9. __global__ void myfirstkernel(int a, int b, int *c)
    10. {
    11. //printf("kernel %s line %d, i am in kernel thread %d in block %d.\n", __func__, __LINE__,threadIdx.x, blockIdx.x);
    12. //printf("block.x = %d, block.y = %d,block.z = %d\n", blockDim.x, blockDim.y,blockDim.z);
    13. //printf("thread.x = %d, thread.y = %d,thread.z = %d\n", threadIdx.x, threadIdx.y,threadIdx.z);
    14. //printf("gridDim.x = %d, gridDim.y = %d,gridDim.z = %d\n", gridDim.x, gridDim.y,gridDim.z);
    15. add(a, b, c);
    16. }
    17. __device__ int get_globalidx_1d_1d(void)
    18. {
    19. return blockIdx.x * blockDim.x + threadIdx.x;
    20. }
    21. __device__ int get_globalidx_1d_2d(void)
    22. {
    23. return blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
    24. }
    25. __device__ int get_globalidx_1d_3d(void)
    26. {
    27. return blockIdx.x * blockDim.x * blockDim.y * blockDim.z + threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + threadIdx.x;
    28. }
    29. __device__ int get_globalidx_2d_1d(void)
    30. {
    31. int blockid = blockIdx.y * gridDim.x + blockIdx.x;
    32. int threadid = blockid * blockDim.x + threadIdx.x;
    33. return threadid;
    34. }
    35. __device__ int get_globalidx_2d_2d(void)
    36. {
    37. int blockid = blockIdx.y * gridDim.x + blockIdx.x;
    38. int threadid = blockid * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;
    39. return threadid;
    40. }
    41. __device__ int get_globalidx_2d_3d(void)
    42. {
    43. int blockid = blockIdx.y * gridDim.x + blockIdx.x;
    44. int threadid = blockid * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;
    45. return threadid;
    46. }
    47. __device__ int get_globalidx_3d_1d(void)
    48. {
    49. int blockid = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
    50. int threadid = blockid * blockDim.x + threadIdx.x;
    51. return threadid;
    52. }
    53. __device__ int get_globalidx_3d_2d(void)
    54. {
    55. int blockid = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
    56. int threadid = blockid * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;
    57. return threadid;
    58. }
    59. __device__ int get_globalidx_3d_3d(void)
    60. {
    61. int blockid = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
    62. int threadid = blockid * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;
    63. return threadid;
    64. }
    65. __host__ int main(void)
    66. {
    67. int c;
    68. int *gpu_c;
    69. cudaMalloc((void **)&gpu_c, sizeof(int));
    70. myfirstkernel <<<2,3>>>(3, 6, gpu_c);
    71. cudaMemcpy(&c, gpu_c, sizeof(int), cudaMemcpyDeviceToHost);
    72. cudaFree(gpu_c);
    73. cudaDeviceSynchronize();
    74. printf("exit.c = %d.\n", c);
    75. return 0;
    76. }

    1. czl@czl-RedmiBook-14:~/workspace/work$ ./a.out
    2. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 9.
    3. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 9.
    4. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 9.
    5. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 9.
    6. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 9.
    7. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 9.
    8. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 10.
    9. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 10.
    10. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 10.
    11. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 10.
    12. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 10.
    13. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 10.
    14. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 11.
    15. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 11.
    16. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 11.
    17. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 11.
    18. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 11.
    19. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 11.
    20. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 12.
    21. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 12.
    22. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 12.
    23. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 12.
    24. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 12.
    25. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 12.
    26. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 13.
    27. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 13.
    28. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 13.
    29. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 13.
    30. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 13.
    31. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 13.
    32. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 14.
    33. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 14.
    34. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 14.
    35. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 14.
    36. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 14.
    37. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 14.
    38. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 15.
    39. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 15.
    40. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 15.
    41. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 15.
    42. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 15.
    43. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 15.
    44. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 16.
    45. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 16.
    46. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 16.
    47. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 16.
    48. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 16.
    49. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 16.
    50. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 17.
    51. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 17.
    52. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 17.
    53. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 17.
    54. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 17.
    55. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 17.
    56. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 18.
    57. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 18.
    58. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 18.
    59. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 18.
    60. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 18.
    61. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 18.
    62. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 19.
    63. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 19.
    64. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 19.
    65. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 19.
    66. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 19.
    67. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 19.
    68. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 20.
    69. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 20.
    70. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 20.
    71. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 20.
    72. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 20.
    73. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 20.
    74. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 21.
    75. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 21.
    76. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 21.
    77. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 21.
    78. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 21.
    79. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 21.
    80. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 22.
    81. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 22.
    82. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 22.
    83. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 22.
    84. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 22.
    85. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 22.
    86. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 23.
    87. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 23.
    88. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 23.
    89. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 23.
    90. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 23.
    91. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 23.
    92. kernel add line 7, i am in kernel thread 0 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 24.
    93. kernel add line 7, i am in kernel thread 1 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 24.
    94. kernel add line 7, i am in kernel thread 2 in blockidx.x 0. blokidx.y 0 blockidx.z 0 *c = 24.
    95. kernel add line 7, i am in kernel thread 0 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 24.
    96. kernel add line 7, i am in kernel thread 1 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 24.
    97. kernel add line 7, i am in kernel thread 2 in blockidx.x 1. blokidx.y 0 blockidx.z 0 *c = 24.
    98. exit.c = 3.
    99. czl@czl-RedmiBook-14:~/workspace/work$
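    Note that add() above has no termination condition, so the recursion only ends when the per-thread device stack is exhausted; that is presumably why the final host-side value (exit.c = 3) does not come from a normally completed kernel. The recursion depth available to device code is bounded by the per-thread stack size, which can be inspected and enlarged through the runtime limit API; a minimal sketch:

    #include <cuda_runtime.h>
    #include <stdio.h>

    int main(void)
    {
        size_t stack_bytes = 0;
        cudaDeviceGetLimit(&stack_bytes, cudaLimitStackSize); /* default per-thread stack */
        printf("default per-thread stack: %zu bytes\n", stack_bytes);

        cudaDeviceSetLimit(cudaLimitStackSize, 4096);         /* enlarge it for deeper recursion */
        cudaDeviceGetLimit(&stack_bytes, cudaLimitStackSize);
        printf("new per-thread stack: %zu bytes\n", stack_bytes);
        return 0;
    }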

    (Figure: schematic of the two-dimensional grid / block / thread hierarchy.)

    How printf works on the GPU

    Both CUDA and OpenCL define a printf function so that running kernels can conveniently output all kinds of information. The GPU version of printf has the same prototype as standard C printf, although some format specifiers may behave slightly differently.

    A kernel's printf cannot write directly to the screen. Instead, printf on the GPU first writes its output into a memory buffer, and CPU-side code later reads this buffer and displays the contents.
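    The size of this device-side printf buffer is a runtime limit (cudaLimitPrintfFifoSize), and the buffered output is flushed to the host at synchronization points such as cudaDeviceSynchronize(). A minimal sketch of reading and enlarging the buffer:

    #include <cuda_runtime.h>
    #include <stdio.h>

    int main(void)
    {
        size_t fifo_bytes = 0;
        cudaDeviceGetLimit(&fifo_bytes, cudaLimitPrintfFifoSize);     /* current buffer size */
        printf("printf FIFO size: %zu bytes\n", fifo_bytes);

        cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 8 * 1024 * 1024); /* grow it to 8 MB */
        return 0;
    }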

    NVIDIA MPS

    MPS stands for Multi-Process Service, NVIDIA's mechanism for letting multiple processes share a GPU concurrently.

    Enable it with:

    $ sudo nvidia-cuda-mps-control -d
    

    Disable it with:

    $ sudo nvidia-cuda-mps-control quit

    Other related posts (OpenCL):

    OpenCL编程初探_papaofdoudou的博客-CSDN博客_opencl 源码

    CUDA编程初探_papaofdoudou的博客-CSDN博客_哪些场合必须用cuda编程

    OpenCL编程之二_papaofdoudou的博客-CSDN博客


    End

  • Original article: https://blog.csdn.net/tugouxp/article/details/126791881