• 源码安装 HIPIFY 和应用示例,将cuda生态源码转化成HIP生态源码


    1,源码下载

    HIPIFY 源码仓库:GitHub - ROCm/HIPIFY(HIPIFY: Convert CUDA to Portable C++ Code):https://github.com/ROCm/HIPIFY.git

    git clone --recursive https://github.com/ROCm/HIPIFY.git
    sudo apt install libclang-dev

     2,编译并安装

    2.1 通常方式

    hipify-clang 文档:

    https://github.com/ROCm/HIPIFY/blob/amd-staging/docs/hipify-clang.md

    编译命令

    cmake -DCMAKE_INSTALL_PREFIX=../dist -DCMAKE_BUILD_TYPE=Release  ..
    make -j install
    

    此时 hipify-clang 会被安装到 HIPIFY/dist/bin 中,

    测试:

    1. cd ../dist/bin
    2. ./hipify-clang --help

    如果系统中存在多个llvm版本,在执行翻译命令时,比如hipify-clang ./vectorAdd.cu  --cuda-path=/usr/local/cuda-12.1可能会发生错误,如下提示:

    CommandLine Error: Option 'static-func-full-module-prefix' registered more than once!
    LLVM ERROR: inconsistency in registered CommandLine option

    这时需要使用自制的LLVM,如下2.2节所示。

    2.2 自制LLVM的方式

    2.2.1  下载llvm源码

    wget https://github.com/llvm/llvm-project/archive/refs/tags/llvmorg-17.0.6.tar.gz

    解压:tar zxf llvmorg-17.0.6.tar.gz

    2.2.2  配置编译LLVM

    1. cd llvm-project-llvmorg-17.0.6
    2. mkdir -p build ../dist/local
    3. cd build
    4. cmake -G "Unix Makefiles" ../llvm \
    5. -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;compiler-rt" \
    6. -DLLVM_BUILD_EXAMPLES=ON -DLLVM_TARGETS_TO_BUILD="host" \
    7. -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON \
    8. -DLLVM_ENABLE_RUNTIMES=all -DLLVM_BUILD_LLVM_DYLIB=ON \
    9. -DCMAKE_INSTALL_PREFIX=../../dist/local

    make -j

    make -j install

    测试时,llvm 被install在如下文件夹:

    /home/hipper/ex_dock_hipify/dist/local

    ls /home/hipper/ex_dock_hipify/dist/local 如图:

    2.2.3 配置编译HIPIFY

    指定 LLVM 安装目录的配置方法:

    -DCMAKE_PREFIX_PATH=/home/hipper/ex_dock_hipify/dist/local

    1. cmake \
    2. -DCMAKE_BUILD_TYPE=Release \
    3. -DCMAKE_INSTALL_PREFIX=../dist \
    4. -DCMAKE_PREFIX_PATH=/home/hipper/ex_dock_hipify/dist/local \
    5. -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-12.1 ..

    make -j install

    3. 示例

    3.1翻译 .cu 文件到 .hip 文件

    命令:

    /home/hipper/ex_dock_hipify/HIPIFY/dist/bin/hipify-clang ./vectorAdd.cu  --cuda-path=/usr/local/cuda-12.1

    会在 ./ 目录中生成 vectorAdd.cu.hip 的文件。

    其中,hipify-clang 并不检查输入文件的扩展名,比如这里的.cu,它只检查文件内部的内容,将cuda生态的关键字有机地翻译成 hip生态的关键字,输出文件会在原文件名的基础上加上 .hip 后缀;

    源代码分别如下。

    使用 cuda samples中的vectoradd.cu为例,源码如下:

    vectorAdd.cu

    1. #include <stdio.h>
    2. #include <cuda_runtime.h>
    3. __global__ void vectorAdd(const float *A, const float *B, float *C,
    4. int numElements) {
    5. int i = blockDim.x * blockIdx.x + threadIdx.x;
    6. if (i < numElements) {
    7. C[i] = A[i] + B[i] + 0.0f;
    8. }
    9. }
    10. int main(void) {
    11. cudaError_t err = cudaSuccess;
    12. int numElements = 50000;
    13. size_t size = numElements * sizeof(float);
    14. printf("[Vector addition of %d elements]\n", numElements);
    15. float *h_A = (float *)malloc(size);
    16. float *h_B = (float *)malloc(size);
    17. float *h_C = (float *)malloc(size);
    18. if (h_A == NULL || h_B == NULL || h_C == NULL) {
    19. fprintf(stderr, "Failed to allocate host vectors!\n");
    20. exit(EXIT_FAILURE);
    21. }
    22. for (int i = 0; i < numElements; ++i) {
    23. h_A[i] = rand() / (float)RAND_MAX;
    24. h_B[i] = rand() / (float)RAND_MAX;
    25. }
    26. float *d_A = NULL;
    27. err = cudaMalloc((void **)&d_A, size);
    28. if (err != cudaSuccess) {
    29. fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n",
    30. cudaGetErrorString(err));
    31. exit(EXIT_FAILURE);
    32. }
    33. float *d_B = NULL;
    34. err = cudaMalloc((void **)&d_B, size);
    35. if (err != cudaSuccess) {
    36. fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n",
    37. cudaGetErrorString(err));
    38. exit(EXIT_FAILURE);
    39. }
    40. float *d_C = NULL;
    41. err = cudaMalloc((void **)&d_C, size);
    42. if (err != cudaSuccess) {
    43. fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n",
    44. cudaGetErrorString(err));
    45. exit(EXIT_FAILURE);
    46. }
    47. printf("Copy input data from the host memory to the CUDA device\n");
    48. err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    49. if (err != cudaSuccess) {
    50. fprintf(stderr,
    51. "Failed to copy vector A from host to device (error code %s)!\n",
    52. cudaGetErrorString(err));
    53. exit(EXIT_FAILURE);
    54. }
    55. err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
    56. if (err != cudaSuccess) {
    57. fprintf(stderr,
    58. "Failed to copy vector B from host to device (error code %s)!\n",
    59. cudaGetErrorString(err));
    60. exit(EXIT_FAILURE);
    61. }
    62. int threadsPerBlock = 256;
    63. int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    64. printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
    65. threadsPerBlock);
    66. vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    67. err = cudaGetLastError();
    68. if (err != cudaSuccess) {
    69. fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n",
    70. cudaGetErrorString(err));
    71. exit(EXIT_FAILURE);
    72. }
    73. printf("Copy output data from the CUDA device to the host memory\n");
    74. err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
    75. if (err != cudaSuccess) {
    76. fprintf(stderr,
    77. "Failed to copy vector C from device to host (error code %s)!\n",
    78. cudaGetErrorString(err));
    79. exit(EXIT_FAILURE);
    80. }
    81. for (int i = 0; i < numElements; ++i) {
    82. if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
    83. fprintf(stderr, "Result verification failed at element %d!\n", i);
    84. exit(EXIT_FAILURE);
    85. }
    86. }
    87. printf("Test PASSED\n");
    88. err = cudaFree(d_A);
    89. if (err != cudaSuccess) {
    90. fprintf(stderr, "Failed to free device vector A (error code %s)!\n",
    91. cudaGetErrorString(err));
    92. exit(EXIT_FAILURE);
    93. }
    94. err = cudaFree(d_B);
    95. if (err != cudaSuccess) {
    96. fprintf(stderr, "Failed to free device vector B (error code %s)!\n",
    97. cudaGetErrorString(err));
    98. exit(EXIT_FAILURE);
    99. }
    100. err = cudaFree(d_C);
    101. if (err != cudaSuccess) {
    102. fprintf(stderr, "Failed to free device vector C (error code %s)!\n",
    103. cudaGetErrorString(err));
    104. exit(EXIT_FAILURE);
    105. }
    106. free(h_A);
    107. free(h_B);
    108. free(h_C);
    109. printf("Done\n");
    110. return 0;
    111. }

    生成的 vectorAdd.cu.hip :

    1. #include <hip/hip_runtime.h>
    2. #include <stdio.h>
    3. __global__ void vectorAdd(const float *A, const float *B, float *C,
    4. int numElements) {
    5. int i = blockDim.x * blockIdx.x + threadIdx.x;
    6. if (i < numElements) {
    7. C[i] = A[i] + B[i] + 0.0f;
    8. }
    9. }
    10. int main(void) {
    11. hipError_t err = hipSuccess;
    12. int numElements = 50000;
    13. size_t size = numElements * sizeof(float);
    14. printf("[Vector addition of %d elements]\n", numElements);
    15. float *h_A = (float *)malloc(size);
    16. float *h_B = (float *)malloc(size);
    17. float *h_C = (float *)malloc(size);
    18. if (h_A == NULL || h_B == NULL || h_C == NULL) {
    19. fprintf(stderr, "Failed to allocate host vectors!\n");
    20. exit(EXIT_FAILURE);
    21. }
    22. for (int i = 0; i < numElements; ++i) {
    23. h_A[i] = rand() / (float)RAND_MAX;
    24. h_B[i] = rand() / (float)RAND_MAX;
    25. }
    26. float *d_A = NULL;
    27. err = hipMalloc((void **)&d_A, size);
    28. if (err != hipSuccess) {
    29. fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n",
    30. hipGetErrorString(err));
    31. exit(EXIT_FAILURE);
    32. }
    33. float *d_B = NULL;
    34. err = hipMalloc((void **)&d_B, size);
    35. if (err != hipSuccess) {
    36. fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n",
    37. hipGetErrorString(err));
    38. exit(EXIT_FAILURE);
    39. }
    40. float *d_C = NULL;
    41. err = hipMalloc((void **)&d_C, size);
    42. if (err != hipSuccess) {
    43. fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n",
    44. hipGetErrorString(err));
    45. exit(EXIT_FAILURE);
    46. }
    47. printf("Copy input data from the host memory to the CUDA device\n");
    48. err = hipMemcpy(d_A, h_A, size, hipMemcpyHostToDevice);
    49. if (err != hipSuccess) {
    50. fprintf(stderr,
    51. "Failed to copy vector A from host to device (error code %s)!\n",
    52. hipGetErrorString(err));
    53. exit(EXIT_FAILURE);
    54. }
    55. err = hipMemcpy(d_B, h_B, size, hipMemcpyHostToDevice);
    56. if (err != hipSuccess) {
    57. fprintf(stderr,
    58. "Failed to copy vector B from host to device (error code %s)!\n",
    59. hipGetErrorString(err));
    60. exit(EXIT_FAILURE);
    61. }
    62. int threadsPerBlock = 256;
    63. int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    64. printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
    65. threadsPerBlock);
    66. vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    67. err = hipGetLastError();
    68. if (err != hipSuccess) {
    69. fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n",
    70. hipGetErrorString(err));
    71. exit(EXIT_FAILURE);
    72. }
    73. printf("Copy output data from the CUDA device to the host memory\n");
    74. err = hipMemcpy(h_C, d_C, size, hipMemcpyDeviceToHost);
    75. if (err != hipSuccess) {
    76. fprintf(stderr,
    77. "Failed to copy vector C from device to host (error code %s)!\n",
    78. hipGetErrorString(err));
    79. exit(EXIT_FAILURE);
    80. }
    81. for (int i = 0; i < numElements; ++i) {
    82. if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
    83. fprintf(stderr, "Result verification failed at element %d!\n", i);
    84. exit(EXIT_FAILURE);
    85. }
    86. }
    87. printf("Test PASSED\n");
    88. err = hipFree(d_A);
    89. if (err != hipSuccess) {
    90. fprintf(stderr, "Failed to free device vector A (error code %s)!\n",
    91. hipGetErrorString(err));
    92. exit(EXIT_FAILURE);
    93. }
    94. err = hipFree(d_B);
    95. if (err != hipSuccess) {
    96. fprintf(stderr, "Failed to free device vector B (error code %s)!\n",
    97. hipGetErrorString(err));
    98. exit(EXIT_FAILURE);
    99. }
    100. err = hipFree(d_C);
    101. if (err != hipSuccess) {
    102. fprintf(stderr, "Failed to free device vector C (error code %s)!\n",
    103. hipGetErrorString(err));
    104. exit(EXIT_FAILURE);
    105. }
    106. free(h_A);
    107. free(h_B);
    108. free(h_C);
    109. printf("Done\n");
    110. return 0;
    111. }

    3.2 编译运行 vectorAdd.cu.hip

    编译:

    $ /opt/rocm/bin/hipcc ./vectorAdd.cu.hip -o vectorAdd

    运行效果如下图:

  • 相关阅读:
    FlaskUser type object ‘User‘ has no attribute ‘get_user_by_token‘
    揭秘Spring事务失效场景分析与解决方案
    国际化配置(ant-design-vue设置成中文)
    面试官:说说Vue 3.0中Treeshaking特性?
    磁盘被未知资源耗尽lsof -n|grep deleted
    Java中如何执行多条shell/bat命令 2023年验证通过
    Mac电脑版鼠标连点工具 RapidClick for Mac
    Apache Seata -- 一款开源的分布式事务解决方案
    一、ubuntu-django+nginx+uwsgi:ubuntu系统部署django项目,前后端不分离项目
    【小白学机器学习13】一文理解假设检验的反证法,H0如何设计的,什么时候用左侧检验和右侧检验,等各种关于假设检验的基础知识
  • 原文地址:https://blog.csdn.net/eloudy/article/details/136364439