git clone --recursive https://github.com/ROCm/HIPIFY.git
sudo apt install clang-dev
hipify-clang 文档:
https://github.com/ROCm/HIPIFY/blob/amd-staging/docs/hipify-clang.md
编译命令(先在 HIPIFY 源码目录下执行 mkdir build && cd build,再运行):
cmake -DCMAKE_INSTALL_PREFIX=../dist -DCMAKE_BUILD_TYPE=Release ..
make -j install
此时 hipify-clang 会被安装到 HIPIFY/dist/bin 中。
测试:
- cd ../dist/bin
- ./hipify-clang --help

如果系统中同时存在多个 LLVM 版本,执行翻译命令(例如 hipify-clang ./vectorAdd.cu --cuda-path=/usr/local/cuda-12.1)时可能会报错,提示如下:
CommandLine Error: Option 'static-func-full-module-prefix' registered more than once!
LLVM ERROR: inconsistency in registered CommandLine option
这时需要使用自行从源码编译的 LLVM,如下2.2节所示。
wget https://github.com/llvm/llvm-project/archive/refs/tags/llvmorg-17.0.6.tar.gz
解压:tar zxf llvmorg-17.0.6.tar.gz
- cd llvm-project-llvmorg-17.0.6
-
- mkdir -p build ../dist/local
-
- cd build
-
- cmake -G "Unix Makefiles" ../llvm \
- -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;compiler-rt" \
- -DLLVM_BUILD_EXAMPLES=ON -DLLVM_TARGETS_TO_BUILD="host" \
- -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON \
- -DLLVM_ENABLE_RUNTIMES=all -DLLVM_BUILD_LLVM_DYLIB=ON \
- -DCMAKE_INSTALL_PREFIX=../../dist/local
make -j
make -j install
本次测试中,LLVM 被安装在如下目录:
/home/hipper/ex_dock_hipify/dist/local
ls /home/hipper/ex_dock_hipify/dist/local 如图:

重新配置并编译 HIPIFY 时,通过如下 CMake 选项指定自行编译的 LLVM 的安装目录:
-DCMAKE_PREFIX_PATH=/home/hipper/ex_dock_hipify/dist/local
-
- cmake \
- -DCMAKE_BUILD_TYPE=Release \
- -DCMAKE_INSTALL_PREFIX=../dist \
- -DCMAKE_PREFIX_PATH=/home/hipper/ex_dock_hipify/dist/local \
- -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-12.1 ..
make -j install
命令:
/home/hipper/ex_dock_hipify/HIPIFY/dist/bin/hipify-clang ./vectorAdd.cu --cuda-path=/usr/local/cuda-12.1
会在当前目录 ./ 中生成名为 vectorAdd.cu.hip 的文件。
其中,hipify-clang 并不检查输入文件的扩展名(比如这里的 .cu),它只分析文件的内容,将 CUDA 生态的关键字和 API 翻译成 HIP 生态中对应的关键字和 API;输出文件名是在原文件名的基础上加上 .hip 后缀。
源代码分别如下。
以 CUDA Samples 中的 vectorAdd.cu 为例,源码如下:
vectorAdd.cu
- #include <stdio.h>
- #include <cuda_runtime.h>
-
- __global__ void vectorAdd(const float *A, const float *B, float *C,
- int numElements) {
- int i = blockDim.x * blockIdx.x + threadIdx.x;
-
- if (i < numElements) {
- C[i] = A[i] + B[i] + 0.0f;
- }
- }
-
- int main(void) {
- cudaError_t err = cudaSuccess;
- int numElements = 50000;
- size_t size = numElements * sizeof(float);
- printf("[Vector addition of %d elements]\n", numElements);
-
- float *h_A = (float *)malloc(size);
- float *h_B = (float *)malloc(size);
- float *h_C = (float *)malloc(size);
- if (h_A == NULL || h_B == NULL || h_C == NULL) {
- fprintf(stderr, "Failed to allocate host vectors!\n");
- exit(EXIT_FAILURE);
- }
-
- for (int i = 0; i < numElements; ++i) {
- h_A[i] = rand() / (float)RAND_MAX;
- h_B[i] = rand() / (float)RAND_MAX;
- }
-
- float *d_A = NULL;
- err = cudaMalloc((void **)&d_A, size);
-
- if (err != cudaSuccess) {
- fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n",
- cudaGetErrorString(err));
- exit(EXIT_FAILURE);
- }
-
- float *d_B = NULL;
- err = cudaMalloc((void **)&d_B, size);
-
- if (err != cudaSuccess) {
- fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n",
- cudaGetErrorString(err));
- exit(EXIT_FAILURE);
- }
-
- float *d_C = NULL;
- err = cudaMalloc((void **)&d_C, size);
-
- if (err != cudaSuccess) {
- fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n",
- cudaGetErrorString(err));
- exit(EXIT_FAILURE);
- }
-
- printf("Copy input data from the host memory to the CUDA device\n");
- err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
-
- if (err != cudaSuccess) {
- fprintf(stderr,
- "Failed to copy vector A from host to device (error code %s)!\n",
- cudaGetErrorString(err));
- exit(EXIT_FAILURE);
- }
-
- err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
-
- if (err != cudaSuccess) {
- fprintf(stderr,
- "Failed to copy vector B from host to device (error code %s)!\n",
- cudaGetErrorString(err));
- exit(EXIT_FAILURE);
- }
-
- int threadsPerBlock = 256;
- int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
- printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
- threadsPerBlock);
- vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
- err = cudaGetLastError();
-
- if (err != cudaSuccess) {
- fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n",
- cudaGetErrorString(err));
- exit(EXIT_FAILURE);
- }
-
- printf("Copy output data from the CUDA device to the host memory\n");
- err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
-
- if (err != cudaSuccess) {
- fprintf(stderr,
- "Failed to copy vector C from device to host (error code %s)!\n",
- cudaGetErrorString(err));
- exit(EXIT_FAILURE);
- }
-
- for (int i = 0; i < numElements; ++i) {
- if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
- fprintf(stderr, "Result verification failed at element %d!\n", i);
- exit(EXIT_FAILURE);
- }
- }
-
- printf("Test PASSED\n");
-
- err = cudaFree(d_A);
-
- if (err != cudaSuccess) {
- fprintf(stderr, "Failed to free device vector A (error code %s)!\n",
- cudaGetErrorString(err));
- exit(EXIT_FAILURE);
- }
-
- err = cudaFree(d_B);
-
- if (err != cudaSuccess) {
- fprintf(stderr, "Failed to free device vector B (error code %s)!\n",
- cudaGetErrorString(err));
- exit(EXIT_FAILURE);
- }
-
- err = cudaFree(d_C);
-
- if (err != cudaSuccess) {
- fprintf(stderr, "Failed to free device vector C (error code %s)!\n",
- cudaGetErrorString(err));
- exit(EXIT_FAILURE);
- }
-
- free(h_A);
- free(h_B);
- free(h_C);
-
- printf("Done\n");
- return 0;
- }
生成的 vectorAdd.cu.hip :

- #include <stdio.h>
- #include <hip/hip_runtime.h>
-
- __global__ void vectorAdd(const float *A, const float *B, float *C,
- int numElements) {
- int i = blockDim.x * blockIdx.x + threadIdx.x;
-
- if (i < numElements) {
- C[i] = A[i] + B[i] + 0.0f;
- }
- }
-
- int main(void) {
- hipError_t err = hipSuccess;
- int numElements = 50000;
- size_t size = numElements * sizeof(float);
- printf("[Vector addition of %d elements]\n", numElements);
-
- float *h_A = (float *)malloc(size);
- float *h_B = (float *)malloc(size);
- float *h_C = (float *)malloc(size);
- if (h_A == NULL || h_B == NULL || h_C == NULL) {
- fprintf(stderr, "Failed to allocate host vectors!\n");
- exit(EXIT_FAILURE);
- }
-
- for (int i = 0; i < numElements; ++i) {
- h_A[i] = rand() / (float)RAND_MAX;
- h_B[i] = rand() / (float)RAND_MAX;
- }
-
- float *d_A = NULL;
- err = hipMalloc((void **)&d_A, size);
-
- if (err != hipSuccess) {
- fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n",
- hipGetErrorString(err));
- exit(EXIT_FAILURE);
- }
-
- float *d_B = NULL;
- err = hipMalloc((void **)&d_B, size);
-
- if (err != hipSuccess) {
- fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n",
- hipGetErrorString(err));
- exit(EXIT_FAILURE);
- }
-
- float *d_C = NULL;
- err = hipMalloc((void **)&d_C, size);
-
- if (err != hipSuccess) {
- fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n",
- hipGetErrorString(err));
- exit(EXIT_FAILURE);
- }
-
- printf("Copy input data from the host memory to the CUDA device\n");
- err = hipMemcpy(d_A, h_A, size, hipMemcpyHostToDevice);
-
- if (err != hipSuccess) {
- fprintf(stderr,
- "Failed to copy vector A from host to device (error code %s)!\n",
- hipGetErrorString(err));
- exit(EXIT_FAILURE);
- }
-
- err = hipMemcpy(d_B, h_B, size, hipMemcpyHostToDevice);
-
- if (err != hipSuccess) {
- fprintf(stderr,
- "Failed to copy vector B from host to device (error code %s)!\n",
- hipGetErrorString(err));
- exit(EXIT_FAILURE);
- }
-
- int threadsPerBlock = 256;
- int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
- printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
- threadsPerBlock);
- vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
- err = hipGetLastError();
-
- if (err != hipSuccess) {
- fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n",
- hipGetErrorString(err));
- exit(EXIT_FAILURE);
- }
-
- printf("Copy output data from the CUDA device to the host memory\n");
- err = hipMemcpy(h_C, d_C, size, hipMemcpyDeviceToHost);
-
- if (err != hipSuccess) {
- fprintf(stderr,
- "Failed to copy vector C from device to host (error code %s)!\n",
- hipGetErrorString(err));
- exit(EXIT_FAILURE);
- }
-
- for (int i = 0; i < numElements; ++i) {
- if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
- fprintf(stderr, "Result verification failed at element %d!\n", i);
- exit(EXIT_FAILURE);
- }
- }
-
- printf("Test PASSED\n");
-
- err = hipFree(d_A);
-
- if (err != hipSuccess) {
- fprintf(stderr, "Failed to free device vector A (error code %s)!\n",
- hipGetErrorString(err));
- exit(EXIT_FAILURE);
- }
-
- err = hipFree(d_B);
-
- if (err != hipSuccess) {
- fprintf(stderr, "Failed to free device vector B (error code %s)!\n",
- hipGetErrorString(err));
- exit(EXIT_FAILURE);
- }
-
- err = hipFree(d_C);
-
- if (err != hipSuccess) {
- fprintf(stderr, "Failed to free device vector C (error code %s)!\n",
- hipGetErrorString(err));
- exit(EXIT_FAILURE);
- }
-
- free(h_A);
- free(h_B);
- free(h_C);
-
- printf("Done\n");
- return 0;
- }
-
编译:
$ /opt/rocm/bin/hipcc ./vectorAdd.cu.hip -o vectorAdd
运行效果如下图:
