源码安装 HIPIFY 和应用示例,将cuda生态源码转化成HIP生态源码

1，源码下载

HIPIFY 源码仓库（GitHub - ROCm/HIPIFY: Convert CUDA to Portable C++ Code）：https://github.com/ROCm/HIPIFY.git

git clone --recursive https://github.com/ROCm/HIPIFY.git

sudo apt install clang-dev

2，编译并安装

2.1 通常方式

hipify-clang 文档：

https://github.com/ROCm/HIPIFY/blob/amd-staging/docs/hipify-clang.md

编译命令（先在 HIPIFY 源码目录下执行 mkdir -p build && cd build，再在 build 目录中运行以下命令）：

cmake -DCMAKE_INSTALL_PREFIX=../dist -DCMAKE_BUILD_TYPE=Release  ..

make -j install

此时 hipify-clang 会被安装到 HIPIFY/dist/bin 中。

测试：


cd  ../dist/bin
./hipify-clang --help

如果系统中存在多个llvm版本，在执行翻译命令时，比如hipify-clang ./vectorAdd.cu --cuda-path=/usr/local/cuda-12.1可能会发生错误，如下提示：

CommandLine Error: Option 'static-func-full-module-prefix' registered more than once!
LLVM ERROR: inconsistency in registered CommandLine option

这时需要使用自制的LLVM，如下2.2节所示。

2.2 自制LLVM的方式

2.2.1 下载llvm源码

wget https://github.com/llvm/llvm-project/archive/refs/tags/llvmorg-17.0.6.tar.gz

解压：tar zxf llvmorg-17.0.6.tar.gz（解压后得到目录 llvm-project-llvmorg-17.0.6）

2.2.2 配置编译LLVM


cd llvm-project-llvmorg-17.0.6
 
mkdir -p build ../dist/local
 
cd build
 
cmake -G "Unix Makefiles" ../llvm      \
-DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;compiler-rt"           \
-DLLVM_BUILD_EXAMPLES=ON           -DLLVM_TARGETS_TO_BUILD="host"      \
-DCMAKE_BUILD_TYPE=Release           -DLLVM_ENABLE_ASSERTIONS=ON       \
-DLLVM_ENABLE_RUNTIMES=all             -DLLVM_BUILD_LLVM_DYLIB=ON      \
-DCMAKE_INSTALL_PREFIX=../../dist/local

make -j

make -j install

测试时，llvm 被install在如下文件夹：

/home/hipper/ex_dock_hipify/dist/local

ls /home/hipper/ex_dock_hipify/dist/local 如图：

2.2.3 配置编译HIPIFY

指定 LLVM 安装目录的配置方法：

-DCMAKE_PREFIX_PATH=/home/hipper/ex_dock_hipify/dist/local


 
cmake \
 -DCMAKE_BUILD_TYPE=Release \
 -DCMAKE_INSTALL_PREFIX=../dist \
 -DCMAKE_PREFIX_PATH=/home/hipper/ex_dock_hipify/dist/local \
 -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-12.1  ..

make -j install

3. 示例

3.1翻译 .cu 文件到 .hip 文件

命令：

/home/hipper/ex_dock_hipify/HIPIFY/dist/bin/hipify-clang ./vectorAdd.cu  --cuda-path=/usr/local/cuda-12.1

会在 ./ 目录中生成名为 vectorAdd.cu.hip 的文件。

其中，hipify-clang 并不检查输入文件的扩展名，比如这里的.cu，它只检查文件内部的内容，将cuda生态的关键字有机地翻译成 hip生态的关键字，输出文件会在原文件名的基础上加上 .hip 后缀；

源代码分别如下。

以 cuda samples 中的 vectorAdd.cu 为例，源码如下：

vectorAdd.cu


#include <cuda_runtime.h>
#include <stdio.h>
 
// Element-wise vector addition kernel.
//
// Each thread derives one flat index from its block and thread coordinates
// (x dimension only) and, when that index is inside the vectors, stores
// A[idx] + B[idx] into C[idx].
//
//   A, B         input vectors
//   C            output vector
//   numElements  number of elements to process
__global__ void vectorAdd(const float *A, const float *B, float *C,
                          int numElements) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;

  // Threads whose index falls past the end of the vectors have nothing to do.
  if (idx >= numElements) {
    return;
  }

  C[idx] = A[idx] + B[idx] + 0.0f;
}
 
// Host driver for the vectorAdd example.
//
// Allocates three host vectors, fills the two inputs with random values,
// copies them to the device, launches the vectorAdd kernel, copies the result
// back and verifies it on the host. Every CUDA runtime call is checked; on any
// failure a message is written to stderr and the process exits with
// EXIT_FAILURE. Returns 0 on success.
int main(void) {
  // Holds the status of the most recent CUDA runtime call.
  cudaError_t err = cudaSuccess;

  // Problem size and the matching buffer size in bytes.
  int numElements = 50000;
  size_t size = numElements * sizeof(float);
  printf("[Vector addition of %d elements]\n", numElements);

  // Allocate the host input vectors A and B and the host output vector C.
  float *h_A = (float *)malloc(size);
  float *h_B = (float *)malloc(size);
  float *h_C = (float *)malloc(size);
  if (h_A == NULL || h_B == NULL || h_C == NULL) {
    fprintf(stderr, "Failed to allocate host vectors!\n");
    exit(EXIT_FAILURE);
  }

  // Fill the host input vectors with random values in [0, 1].
  for (int i = 0; i < numElements; ++i) {
    h_A[i] = rand() / (float)RAND_MAX;
    h_B[i] = rand() / (float)RAND_MAX;
  }

  // Allocate the device input vector A.
  float *d_A = NULL;
  err = cudaMalloc((void **)&d_A, size);

  if (err != cudaSuccess) {
    fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  // Allocate the device input vector B.
  float *d_B = NULL;
  err = cudaMalloc((void **)&d_B, size);

  if (err != cudaSuccess) {
    fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  // Allocate the device output vector C.
  float *d_C = NULL;
  err = cudaMalloc((void **)&d_C, size);

  if (err != cudaSuccess) {
    fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  // Copy the host input vectors A and B to their device counterparts.
  printf("Copy input data from the host memory to the CUDA device\n");
  err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);

  if (err != cudaSuccess) {
    fprintf(stderr,
            "Failed to copy vector A from host to device (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

  if (err != cudaSuccess) {
    fprintf(stderr,
            "Failed to copy vector B from host to device (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  // Launch configuration: ceil-divide so that the grid covers every element
  // even when numElements is not a multiple of the block size.
  int threadsPerBlock = 256;
  int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
  printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
         threadsPerBlock);
  vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
  // A kernel launch returns no status itself; query the launch error here.
  err = cudaGetLastError();

  if (err != cudaSuccess) {
    fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  // Copy the device result vector back to the host result vector.
  printf("Copy output data from the CUDA device to the host memory\n");
  err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

  if (err != cudaSuccess) {
    fprintf(stderr,
            "Failed to copy vector C from device to host (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  // Verify the device result against a host-side sum, within a tolerance.
  for (int i = 0; i < numElements; ++i) {
    if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
      fprintf(stderr, "Result verification failed at element %d!\n", i);
      exit(EXIT_FAILURE);
    }
  }

  printf("Test PASSED\n");

  // Release device memory.
  err = cudaFree(d_A);

  if (err != cudaSuccess) {
    fprintf(stderr, "Failed to free device vector A (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  err = cudaFree(d_B);

  if (err != cudaSuccess) {
    fprintf(stderr, "Failed to free device vector B (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  err = cudaFree(d_C);

  if (err != cudaSuccess) {
    fprintf(stderr, "Failed to free device vector C (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  // Release host memory.
  free(h_A);
  free(h_B);
  free(h_C);

  printf("Done\n");
  return 0;
}

生成的 vectorAdd.cu.hip :


#include <hip/hip_runtime.h>
#include <stdio.h>
 
// Element-wise vector addition kernel.
//
// Each thread derives one flat index from its block and thread coordinates
// (x dimension only) and, when that index is inside the vectors, stores
// A[idx] + B[idx] into C[idx].
//
//   A, B         input vectors
//   C            output vector
//   numElements  number of elements to process
__global__ void vectorAdd(const float *A, const float *B, float *C,
                          int numElements) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;

  // Threads whose index falls past the end of the vectors have nothing to do.
  if (idx >= numElements) {
    return;
  }

  C[idx] = A[idx] + B[idx] + 0.0f;
}
 
int main(void) {
  hipError_t err = hipSuccess;
  int numElements = 50000;
  size_t size = numElements * sizeof(float);
  printf("[Vector addition of %d elements]\n", numElements);
 
  float *h_A = (float *)malloc(size);
  float *h_B = (float *)malloc(size);
  float *h_C = (float *)malloc(size);
  if (h_A == NULL || h_B == NULL || h_C == NULL) {
    fprintf(stderr, "Failed to allocate host vectors!\n");
    exit(EXIT_FAILURE);
  }
 
  for (int i = 0; i < numElements; ++i) {
    h_A[i] = rand() / (float)RAND_MAX;
    h_B[i] = rand() / (float)RAND_MAX;
  }
 
  float *d_A = NULL;
  err = hipMalloc((void **)&d_A, size);
 
  if (err != hipSuccess) {
    fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n",
            hipGetErrorString(err));
    exit(EXIT_FAILURE);
  }
 
  float *d_B = NULL;
  err = hipMalloc((void **)&d_B, size);
 
  if (err != hipSuccess) {
    fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n",
            hipGetErrorString(err));
    exit(EXIT_FAILURE);
  }
 
  float *d_C = NULL;
  err = hipMalloc((void **)&d_C, size);
 
  if (err != hipSuccess) {
    fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n",
            hipGetErrorString(err));
    exit(EXIT_FAILURE);
  }
 
  printf("Copy input data from the host memory to the CUDA device\n");
  err = hipMemcpy(d_A, h_A, size, hipMemcpyHostToDevice);
 
  if (err != hipSuccess) {
    fprintf(stderr,
            "Failed to copy vector A from host to device (error code %s)!\n",
            hipGetErrorString(err));
    exit(EXIT_FAILURE);
  }
 
  err = hipMemcpy(d_B, h_B, size, hipMemcpyHostToDevice);
 
  if (err != hipSuccess) {
    fprintf(stderr,
            "Failed to copy vector B from host to device (error code %s)!\n",
            hipGetErrorString(err));
    exit(EXIT_FAILURE);
  }
 
  int threadsPerBlock = 256;
  int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
  printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
         threadsPerBlock);
  vectorAdd<<>>(d_A, d_B, d_C, numElements);
  err = hipGetLastError();
 
  if (err != hipSuccess) {
    fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n",
            hipGetErrorString(err));
    exit(EXIT_FAILURE);
  }
 
  printf("Copy output data from the CUDA device to the host memory\n");
  err = hipMemcpy(h_C, d_C, size, hipMemcpyDeviceToHost);
 
  if (err != hipSuccess) {
    fprintf(stderr,
            "Failed to copy vector C from device to host (error code %s)!\n",
            hipGetErrorString(err));
    exit(EXIT_FAILURE);
  }
 
  for (int i = 0; i < numElements; ++i) {
    if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
      fprintf(stderr, "Result verification failed at element %d!\n", i);
      exit(EXIT_FAILURE);
    }
  }
 
  printf("Test PASSED\n");
 
  err = hipFree(d_A);
 
  if (err != hipSuccess) {
    fprintf(stderr, "Failed to free device vector A (error code %s)!\n",
            hipGetErrorString(err));
    exit(EXIT_FAILURE);
  }
 
  err = hipFree(d_B);
 
  if (err != hipSuccess) {
    fprintf(stderr, "Failed to free device vector B (error code %s)!\n",
            hipGetErrorString(err));
    exit(EXIT_FAILURE);
  }
 
  err = hipFree(d_C);
 
  if (err != hipSuccess) {
    fprintf(stderr, "Failed to free device vector C (error code %s)!\n",
            hipGetErrorString(err));
    exit(EXIT_FAILURE);
  }
 
  free(h_A);
  free(h_B);
  free(h_C);
 
  printf("Done\n");
  return 0;
}

3.2 编译运行 vectorAdd.cu.hip

编译：

$ /opt/rocm/bin/hipcc ./vectorAdd.cu.hip -o vectorAdd

运行效果如下图：

相关阅读:
FlaskUser type object ‘User‘ has no attribute ‘get_user_by_token‘
揭秘Spring事务失效场景分析与解决方案
 国际化配置（ant-design-vue设置成中文）
面试官：说说Vue 3.0中Treeshaking特性？
磁盘被未知资源耗尽lsof -n|grep deleted
Java中如何执行多条shell/bat命令 2023年验证通过
 Mac电脑版鼠标连点工具 RapidClick for Mac
Apache Seata -- 一款开源的分布式事务解决方案
 一、ubuntu-django+nginx+uwsgi：ubuntu系统部署django项目，前后端不分离项目
 【小白学机器学习13】一文理解假设检验的反证法，H0如何设计的，什么时候用左侧检验和右侧检验，等各种关于假设检验的基础知识
原文地址：https://blog.csdn.net/eloudy/article/details/136364439