
作者|KIDGINBROOK
更新|潘丽晨
NCCL是英伟达开源的GPU通信库,支持集合通信和点对点通信。
看下官方给的一个demo:
- #include <stdio.h>
- #include "cuda_runtime.h"
- #include "nccl.h"
- #include "mpi.h"
- #include <unistd.h>
- #include <stdint.h>
-
-
- #define MPICHECK(cmd) do { \
- int e = cmd; \
- if( e != MPI_SUCCESS ) { \
- printf("Failed: MPI error %s:%d '%d'\n", \
- __FILE__,__LINE__, e); \
- exit(EXIT_FAILURE); \
- } \
- } while(0)
-
-
- #define CUDACHECK(cmd) do { \
- cudaError_t e = cmd; \
- if( e != cudaSuccess ) { \
- printf("Failed: Cuda error %s:%d '%s'\n", \
- __FILE__,__LINE__,cudaGetErrorString(e)); \
- exit(EXIT_FAILURE); \
- } \
- } while(0)
-
-
- #define NCCLCHECK(cmd) do { \
- ncclResult_t r = cmd; \
- if (r!= ncclSuccess) { \
- printf("Failed, NCCL error %s:%d '%s'\n", \
- __FILE__,__LINE__,ncclGetErrorString(r)); \
- exit(EXIT_FAILURE); \
- } \
- } while(0)
-
-
- static uint64_t getHostHash(const char* string) {
- // Based on DJB2a, result = result * 33 ^ char
- uint64_t result = 5381;
- for (int c = 0; string[c] != '\0'; c++){
- result = ((result << 5) + result) ^ string[c];
- }
- return result;
- }
-
-
- static void getHostName(char* hostname, int maxlen) {
- gethostname(hostname, maxlen);
- for (int i=0; i< maxlen; i++) {
- if (hostname[i] == '.') {
- hostname[i] = '\0';
- return;
- }
- }
- }
-
-
- int main(int argc, char* argv[])
- {
- int size = 32*1024*1024;
-
- int myRank, nRanks, localRank = 0;
-
-
- //initializing MPI
- MPICHECK(MPI_Init(&argc, &argv));
- MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myRank));
- MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &nRanks));
-
-
- //calculating localRank which is used in selecting a GPU
- uint64_t hostHashs[nRanks];
- char hostname[1024];
- getHostName(hostname, 1024);
- hostHashs[myRank] = getHostHash(hostname);
- MPICHECK(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD));
- for (int p=0; p<nRanks; p++) {
- if (p == myRank) break;
- if (hostHashs[p] == hostHashs[myRank]) localRank++;
- }
-
-
- //each process is using two GPUs
- int nDev = 2;
-
-
- float** sendbuff = (float**)malloc(nDev * sizeof(float*));
- float** recvbuff = (float**)malloc(nDev * sizeof(float*));
- cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
-
-
- //picking GPUs based on localRank
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(localRank*nDev + i));
- CUDACHECK(cudaMalloc(sendbuff + i, size * sizeof(float)));
- CUDACHECK(cudaMalloc(recvbuff + i, size * sizeof(float)));
- CUDACHECK(cudaMemset(sendbuff[i], 1, size * sizeof(float)));
- CUDACHECK(cudaMemset(recvbuff[i], 0, size * sizeof(float)));
- CUDACHECK(cudaStreamCreate(s+i));
- }
-
-
- ncclUniqueId id;
- ncclComm_t comms[nDev];
-
-
- //generating NCCL unique ID at one process and bro