Python调用C++/CUDA

CUDA的核函数（kernel）由主机端代码调用、在GPU上并行执行，这在深度学习网络模型计算等方面十分有用。CUDA编程中核函数需要写在.cu文件中。
这里介绍如何编写一个核函数.cu，通过.cpp调用该核函数，并将cpp通过pybind11打包成python可调用的.pyd文件。

一、示例代码

本文的示例代码来源于：https://github.com/torstem/demo-cuda-pybind11，共有4个文件：
在这里插入图片描述
使用代码时需要对gpu_library.cu和CMakeLists.txt进行一点修改：
gpu_library.cu的头需要插入：

#include 
1

CMakeLists.txt中需要设置PythonLib的路径，故txt中第3行被替换为：
set(PythonLibs required PATHS "C:\\Python27\\ArcGIS10.7\\libs")

懒得下载的可以复制以下代码制作示例文件：
gpu_library.cu

// NOTE(review): the header names on these #include lines were lost when the
// article was extracted; the list below is reconstructed from what the code in
// this listing uses. Confirm against the upstream repository.
#include <cuda_runtime.h>

#include <cmath>
#include <iostream>
#include <sstream>
#include <stdexcept>

// The __global__ qualifier is what makes this function a CUDA kernel.
//
// Scales vec[0 .. num_elements) by `scalar` in place. Only the x dimension of
// the launch configuration is used, and each thread touches at most one
// element, so the launch must supply at least num_elements threads along x.
__global__ void kernel(double *vec, double scalar, int num_elements)
{
  const unsigned int i = threadIdx.x + blockDim.x * blockIdx.x;

  // Threads that fall past the end of the data have nothing to do.
  if (i >= num_elements)
  {
    return;
  }

  vec[i] *= scalar;
}

// Host-side launcher for `kernel`: multiplies the first num_elements doubles
// of `vec` by `scalar`, in place.
//
//   vec          - handed unchanged to the kernel, which dereferences it on the
//                  device; the caller in gpu_library.cpp passes a buffer
//                  obtained from cudaMalloc.
//   scalar       - factor applied to every element.
//   num_elements - element count; values <= 0 make this call a no-op.
//
// Throws std::runtime_error when cudaGetLastError() reports a failure right
// after the launch. The launch is not synchronized here, so a fault raised
// while the kernel is executing is reported by a later CUDA call instead.
void run_kernel(double *vec, double scalar, int num_elements)
{
  // An empty range needs no launch, and a grid of zero blocks would be
  // rejected as an invalid launch configuration.
  if (num_elements <= 0)
  {
    return;
  }

  const unsigned int threads_per_block = 256;
  // Integer ceiling division: enough blocks for one thread per element.
  const unsigned int num_blocks =
      (static_cast<unsigned int>(num_elements) + threads_per_block - 1) / threads_per_block;

  dim3 dimBlock(threads_per_block, 1, 1);
  dim3 dimGrid(num_blocks);

  kernel<<<dimGrid, dimBlock>>> (vec, scalar, num_elements);

  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess)
  {
    std::stringstream strstr;
    strstr << "run_kernel launch failed" << std::endl;
    strstr << "dimBlock: " << dimBlock.x << ", " << dimBlock.y << std::endl;
    strstr << "dimGrid: " << dimGrid.x << ", " << dimGrid.y << std::endl;
    strstr << cudaGetErrorString(error);
    // Throw a std::exception subclass (not a bare std::string) so the message
    // survives translation into a Python exception, matching the error style
    // used by multiply_with_scalar.
    throw std::runtime_error(strstr.str());
  }
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34

gpu_library.cpp

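// NOTE(review): the header names on these #include lines were lost when the
// article was extracted; they are reconstructed here from what the code below
// uses. Confirm against the upstream repository.
#include <sstream>
#include <iostream>
#include <cuda_runtime.h>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>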

void run_kernel(double *vec, double scalar, int num_elements);

// Multiplies every element of a one-dimensional NumPy array of doubles by
// `scalar`, writing the results back into the array's own buffer. The data is
// copied to a temporary GPU buffer, scaled there by run_kernel, and copied back.
//
//   vec    - must be one-dimensional and densely packed (element stride equal
//            to the item size). An empty array is accepted and left untouched.
//   scalar - factor applied to every element.
//
// Throws std::runtime_error if the array is not 1-D, is not densely packed,
// has more elements than an int can hold, or if any CUDA runtime call fails.
//
// NOTE(review): pybind11 may hand this function a converted copy when the
// caller's array is not already float64; in that case the caller's array would
// not be modified. Confirm whether such inputs should be rejected instead.
void multiply_with_scalar(pybind11::array_t<double> vec, double scalar)
{
  // Owns the device allocation so that it is released on every exit path,
  // including the ones that leave through an exception.
  struct DeviceBuffer
  {
    double *ptr = nullptr;
    ~DeviceBuffer()
    {
      if (ptr != nullptr) {
        cudaFree(ptr);  // best effort only: a destructor must not throw
      }
    }
  };

  auto ha = vec.request();

  if (ha.ndim != 1) {
    std::stringstream strstr;
    strstr << "ha.ndim != 1" << std::endl;
    strstr << "ha.ndim: " << ha.ndim << std::endl;
    throw std::runtime_error(strstr.str());
  }

  // The copies below treat the data as one contiguous run of doubles, so a
  // strided view (e.g. a[::2]) would be read and written at the wrong places.
  if (ha.size > 1 && ha.strides[0] != ha.itemsize) {
    throw std::runtime_error("multiply_with_scalar: array must be contiguous");
  }

  // run_kernel takes the element count as int; reject counts that do not fit.
  const int size = static_cast<int>(ha.size);
  if (size < 0 || static_cast<decltype(ha.size)>(size) != ha.size) {
    throw std::runtime_error("multiply_with_scalar: array has too many elements");
  }

  // Nothing to scale, so skip the allocation and the kernel launch entirely.
  if (size == 0) {
    return;
  }

  // Size everything from the actual array length rather than a fixed count.
  const size_t num_bytes = static_cast<size_t>(size) * sizeof(double);

  DeviceBuffer gpu;
  cudaError_t error = cudaMalloc(&gpu.ptr, num_bytes);
  if (error != cudaSuccess) {
    throw std::runtime_error(cudaGetErrorString(error));
  }

  double* ptr = reinterpret_cast<double*>(ha.ptr);
  error = cudaMemcpy(gpu.ptr, ptr, num_bytes, cudaMemcpyHostToDevice);
  if (error != cudaSuccess) {
    throw std::runtime_error(cudaGetErrorString(error));
  }

  run_kernel(gpu.ptr, scalar, size);

  error = cudaMemcpy(ptr, gpu.ptr, num_bytes, cudaMemcpyDeviceToHost);
  if (error != cudaSuccess) {
    throw std::runtime_error(cudaGetErrorString(error));
  }

  // Free explicitly so that a cudaFree failure is still reported to the
  // caller; disarm the guard first so the buffer is never freed twice.
  double *to_free = gpu.ptr;
  gpu.ptr = nullptr;
  error = cudaFree(to_free);
  if (error != cudaSuccess) {
    throw std::runtime_error(cudaGetErrorString(error));
  }
}

// Entry point of the Python extension module named gpu_library; `m` is the
// module object that the bindings are attached to.
PYBIND11_MODULE(gpu_library, m)
{
  // Expose the C++ function to Python under the same name.
  m.def("multiply_with_scalar", &multiply_with_scalar);
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50

CMakeLists.txt

# Build script for the gpu_library extension: compiles the pybind11 wrapper
# (gpu_library.cpp) and the CUDA kernel (gpu_library.cu) into one shared library.
cmake_minimum_required(VERSION 2.8)
# Loads the CUDA find module, which provides cuda_add_library() used below.
find_package(CUDA)
# Defines a variable named PythonLibs holding the items "required", "PATHS" and
# the directory below; nothing else in this script reads that variable.
# NOTE(review): set() does not search for Python, so this line does not
# populate PYTHON_INCLUDE_DIRS or PYTHON_LIBRARIES used further down.
# Presumably find_package(PythonLibs REQUIRED) was intended - confirm.
set(PythonLibs required PATHS "C:\\Python27\\ArcGIS10.7\\libs")

# Adds the Python header directory to the include path (expands to nothing if
# PYTHON_INCLUDE_DIRS was never set).
include_directories(${PYTHON_INCLUDE_DIRS})

# Requests C++11 using a GCC/Clang-style flag.
# NOTE(review): MSVC does not use this spelling; likely the source of the build
# warning mentioned later in the article - confirm.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")

# Builds both sources into a single shared library target.
cuda_add_library(gpu_library SHARED
  gpu_library.cpp
  gpu_library.cu)

# Links against the Python library and the CUDA runtime.
target_link_libraries(gpu_library
  ${PYTHON_LIBRARIES}
  cudart)

# Drops any platform library prefix (such as "lib") so that the output file's
# base name is exactly "gpu_library", the name Python imports.
set_target_properties(gpu_library PROPERTIES PREFIX "")

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18

二、CMake编译

编译之前需要：（1）创建编译文件夹“build”；（2）拷贝pybind11文件夹。
在这里插入图片描述
编译依旧使用CMake-gui，平台按照自己电脑选，这里是x64的：

输入源目录、编译目录，然后依次点“Configure”和"Generate"

这样就编译成功了，然后打开编译好的VS工程（“Open Project”）

三、VS生成

1、VS配置

配置库目录：要把CUDA的lib目录、python的lib目录都放进去
在这里插入图片描述
配置附加包含目录：把python的include、CUDA的include和pybind11的include目录都放进去

2、生成

配置完成之后直接：“生成”-“生成gpu_library”
在这里插入图片描述
有一个Warning，应该是CMakeLists.txt中的set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")引起的：-std=c++11是GCC/Clang风格的编译选项，MSVC不识别，会忽略它并给出警告。不过问题不大，成功生成了gpu_library.dll。

3、import测试

要在python中import成功，首先需要是.pyd文件，并且放在python直接可以找到的路径下，所以将生成的.dll文件拷贝到“D:\Anaconda3\Lib\site-packages”里面，并且修改后缀名为.pyd（没错，直接重命名修改）。
在这里插入图片描述
运行一下“test.py”：

# Smoke test for the compiled extension: scale a NumPy vector on the GPU.
import gpu_library
import numpy

# Ten evenly spaced samples covering the closed interval [0, 1].
samples = numpy.linspace(0, 1, 10)

print("before: ", samples)
# The extension writes the scaled values back into `samples` itself.
gpu_library.multiply_with_scalar(samples, 10)
print("after: ", samples)
1
2
3
4
5
6
7
8

在pycharm的控制台看一下打印结果，调用成功：
在这里插入图片描述

相关阅读:
【剑指Offer】25.合并两个排序的链表
 带头双向循环链表
 Linux用户管理— 用户组管理命令
 【译】.NET 8 网络改进（一）
论文解读（g-U-Nets）《Graph U-Nets》
ConsulManager0.9.6 新增RDS云监控指标抓取与自建MySQL接入支持
 在VMware上创建虚拟机并安装CentOS
【ArcGIS绘图系列1】在ArcGIS中制作柱状图与饼状图
 selenium窗口切换
 【产品经理修炼之道】- 政务G端产品建设指南
原文地址：https://blog.csdn.net/qq_33339770/article/details/125896282