microsoft · chhwang · Dec 19, 2024 · Dec 19, 2024 · Dec 20, 2024 · Dec 20, 2024
diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu
@@ -173,9 +173,9 @@ static std::shared_ptr<mscclpp::DeviceHandle<mscclpp::SmChannel>> setupSmChannel
   std::transform(smChannels.begin(), smChannels.end(), std::back_inserter(smChannelDeviceHandles),
                  [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); });
   std::shared_ptr<mscclpp::DeviceHandle<mscclpp::SmChannel>> ptr =
-      mscclpp::allocSharedCuda<mscclpp::DeviceHandle<mscclpp::SmChannel>>(smChannelDeviceHandles.size());
-  mscclpp::memcpyCuda<mscclpp::DeviceHandle<mscclpp::SmChannel>>(ptr.get(), smChannelDeviceHandles.data(),
-                                                                 smChannelDeviceHandles.size(), cudaMemcpyHostToDevice);
+      mscclpp::detail::gpuCallocShared<mscclpp::DeviceHandle<mscclpp::SmChannel>>(smChannelDeviceHandles.size());
+  mscclpp::gpuMemcpy<mscclpp::DeviceHandle<mscclpp::SmChannel>>(ptr.get(), smChannelDeviceHandles.data(),
+                                                                smChannelDeviceHandles.size(), cudaMemcpyHostToDevice);
   return ptr;
 }
 
@@ -341,7 +341,7 @@ static void ncclCommInitRankFallbackSingleNode(ncclComm* commPtr, std::shared_pt
   commPtr->smSemaphores = std::move(smSemaphores);
   commPtr->buffFlag = 0;
   commPtr->numScratchBuff = 2;
-  commPtr->scratchBuff = mscclpp::allocExtSharedCuda<char>(SCRATCH_SIZE);
+  commPtr->scratchBuff = mscclpp::gpuMemAlloc(SCRATCH_SIZE);
   commPtr->remoteScratchRegMemories =
       setupRemoteMemories(commPtr->comm, rank, commPtr->scratchBuff.get(), SCRATCH_SIZE, mscclpp::Transport::CudaIpc);
 }
@@ -726,17 +726,12 @@ NCCL_API ncclResult_t ncclCommDeregister(const ncclComm_t, void*) {
 }
 
 ncclResult_t ncclMemAlloc(void** ptr, size_t size) {
-  // Allocate memory using mscclpp::allocSharedPhysicalCuda
   if (ptr == nullptr || size == 0) {
     return ncclInvalidArgument;
   }
   std::shared_ptr<char> sharedPtr;
   try {
-    if (mscclpp::isNvlsSupported()) {
-      sharedPtr = mscclpp::allocSharedPhysicalCuda<char>(size);
-    } else {
-      sharedPtr = mscclpp::allocExtSharedCuda<char>(size);
-    }
+    sharedPtr = mscclpp::gpuMemAlloc(size);
     if (sharedPtr == nullptr) {
       return ncclSystemError;
     }