作者|KIDGINBROOK
更新|潘丽晨
上次介绍到rank0的机器生成了ncclUniqueId,并完成了机器的bootstrap网络和通信网络的初始化,这节接着看下所有节点间bootstrap的连接是如何建立的。
rank0节点执行ncclGetUniqueId生成ncclUniqueId,通过MPI将该id广播到所有节点,然后所有节点都会执行ncclCommInitRank;其他节点在这里同样会完成bootstrap网络和通信网络的初始化,之后执行到ncclCommInitRankSync。
- ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev) { // Builds the communicator *newcomm for rank myrank out of nranks on CUDA device cudaDev; returns ncclSuccess, or res from the failing step after cleanup.
- ncclResult_t res; // Holds the result code handed to the cleanup path by NCCLCHECKGOTO.
-
- CUDACHECK(cudaSetDevice(cudaDev)); // Make cudaDev the current device before the communicator is allocated. NOTE(review): CUDACHECK is presumably an early-return-on-error macro — confirm.
- NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup); // Step 1: allocate the communicator into *newcomm for this rank.
- NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup); // Step 2: bootstrap initialisation and peer-info exchange, keyed by commId.
- NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup); // Step 3: NOTE(review): presumably sets up the device-side communicator state — confirm in devCommSetup.
-
- INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId);
-
- return ncclSuccess;
- cleanup: // Failure path: reached through the NCCLCHECKGOTO calls above with the error stored in res.
- if ((*newcomm) && (*newcomm)->bootstrap) bootstrapAbort((*newcomm)->bootstrap); // Abort the bootstrap state only if both the communicator and its bootstrap handle exist.
- *newcomm = NULL; // Hand the caller a NULL communicator on failure. NOTE(review): no free of the commAlloc allocation is visible here — confirm ownership.
- return res;
- }
ncclComm_t是指向ncclComm的指针,ncclComm是一个大杂烩,包含了通信用到的所有上下文信息,里面的字段等用到的时候再介绍。这里首先通过commAlloc分配newcomm并完成初始化,比如当前是哪张卡、对应的PCIe busId是什么,然后执行initTransportsRank。
- static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
- // We use 3 AllGathers
- // 1. { peerInfo, comm }
- // 2. ConnectTransport[nranks], ConnectValue[nranks]
- // 3. { nThreads, nrings, compCap, prev[MAXCHANNELS], next[MAXCHANNELS] }
-
- int rank = comm->rank;
- int nranks = comm->nRanks;
- uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
- TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
- NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));
-
- // AllGather1 - begin
- struct {
- struct ncclPeerInfo peerInfo;
- struct ncclComm* comm;
- } *allGather1Data;
-
- NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
- allGather1Data[rank].comm = comm;
- struct ncclPeerInfo* myInfo = &allGather1Data[rank].peerInfo;
- NCCLCHECK(fillInfo(comm, myInfo, commHash));
- NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
-
- NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks+1)); // Extra rank to represent CollNet root
- for (int i = 0; i < nranks; i++) {
- memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
- if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) {
- WARN("Duplicate GPU d