Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NPKIT] Adding the NPKIT support for kernel allreduce7 in mscclpp #399

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ option(BUILD_PYTHON_BINDINGS "Build Python bindings" ON)
option(BUILD_APPS_NCCL "Build NCCL interfaces" ON)
option(USE_CUDA "Use NVIDIA/CUDA." OFF)
option(USE_ROCM "Use AMD/ROCm." OFF)
option(NPKIT_FLAGS "Enable NPKIT" OFF)
option(BYPASS_GPU_CHECK "Bypass GPU check." OFF)

if(BYPASS_GPU_CHECK)
Expand Down
4 changes: 3 additions & 1 deletion apps/nccl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ if(USE_CUDA)
elseif(USE_ROCM)
target_compile_definitions(mscclpp_nccl_obj PRIVATE USE_ROCM)
endif()

if(NPKIT_FLAGS)
target_compile_definitions(mscclpp_nccl_obj PRIVATE NPKIT_FLAGS)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where is this flag used?

endif()
add_library(mscclpp_nccl SHARED)
target_link_libraries(mscclpp_nccl PUBLIC mscclpp_obj mscclpp_nccl_obj)
set_target_properties(mscclpp_nccl PROPERTIES VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
Expand Down
53 changes: 52 additions & 1 deletion apps/nccl/src/allreduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
#include <mscclpp/sm_channel.hpp>
#include <mscclpp/sm_channel_device.hpp>

#if defined(ENABLE_NPKIT)
#include <mscclpp/npkit/npkit.hpp>
#endif

#include "common.hpp"
#include "gpu_data_types.hpp"

Expand Down Expand Up @@ -238,10 +242,40 @@ template <typename T>
__global__ void __launch_bounds__(1024, 1)
allreduce7(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels,
size_t channelDataOffset, size_t channelScratchOffset, int rank, int nRanksPerNode, int worldSize,
size_t nelems, uint32_t flag) {
size_t nelems, uint32_t flag
#if defined(ENABLE_NPKIT)
,
NpKitEventCollectContext* npKitEventCollectContexts, uint64_t* cpuTimestamp) {
#else
) {
#endif
// This version of allreduce only works for single nodes
if (worldSize != nRanksPerNode) return;

#if defined(ENABLE_NPKIT)
extern __shared__ int4 NpkitSharedMem[];
NpKitEvent* event_buffer = (NpKitEvent*)((char*)NpkitSharedMem);
uint64_t event_buffer_head = 0;
#if defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_ENTRY) && defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_EXIT)
uint64_t npkit_timestamp_entry = 0;
if (threadIdx.x == 0) {
npkit_timestamp_entry = NPKIT_GET_GPU_TIMESTAMP();
}
#endif
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
#if defined(MSCCLPP_DEVICE_HIP)
NpKit::CollectGpuEventShm(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, NPKIT_LOAD_CPU_TIMESTAMP_PER_BLOCK(cpuTimestamp, blockIdx.x),
#else
NpKit::CollectGpuEventShm(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
#endif
event_buffer, &event_buffer_head);
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
NpKit::CollectGpuEventShm(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(), event_buffer,
&event_buffer_head);
#endif

if (sizeof(T) == 2)
nelems = (nelems * sizeof(T) + sizeof(T)) / sizeof(int);
else
Expand Down Expand Up @@ -312,6 +346,16 @@ __global__ void __launch_bounds__(1024, 1)
result[idx].x = data.x;
result[idx].y = data.y;
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_ENTRY) && \
defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_EXIT)
NpKit::CollectGpuEventShm(NPKIT_EVENT_KERNEL_ALLREDUCE_ENTRY, 0, 0, npkit_timestamp_entry, event_buffer,
&event_buffer_head);
NpKit::CollectGpuEventShm(NPKIT_EVENT_KERNEL_ALLREDUCE_EXIT, 0, 0, NPKIT_GET_GPU_TIMESTAMP(), event_buffer,
&event_buffer_head);
#endif
#if defined(ENABLE_NPKIT)
NpKit::StoreGpuEventShm(npKitEventCollectContexts, event_buffer, event_buffer_head);
#endif
}

template <typename T>
Expand Down Expand Up @@ -460,9 +504,16 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<
nBlocks = 56;
nThreadsPerBlock = (nelems <= 76800) ? 512 : 1024;
}
#if defined(ENABLE_NPKIT)
size_t NpkitSharedMemSize = NPKIT_SHM_NUM_EVENTS * sizeof(NpKitEvent);
allreduce7<<<nBlocks, nThreadsPerBlock, NpkitSharedMemSize, stream>>>(buff, scratch, resultBuff, smChannels, channelInOffset,
channelScratchOffset, rank, nRanksPerNode, worldSize, nelems,
flag++, NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp());
#else
allreduce7<<<nBlocks, nThreadsPerBlock, 0, stream>>>(buff, scratch, resultBuff, smChannels, channelInOffset,
channelScratchOffset, rank, nRanksPerNode, worldSize, nelems,
flag++);
#endif
} else {
int nBlocks = 35;
int nThreadsPerBlock = 512;
Expand Down
17 changes: 16 additions & 1 deletion apps/nccl/src/nccl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
#include <sstream>
#include <unordered_map>
#include <vector>

#if defined(ENABLE_NPKIT)
#include <mscclpp/npkit/npkit.hpp>
#endif
#include "allgather.hpp"
#include "allreduce.hpp"
#include "nccl.h"
Expand Down Expand Up @@ -366,6 +368,12 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI
if (commPtr->smallMessageSizeBoundary > commPtr->largeMessageSizeBoundary) return ncclInvalidArgument;

*comm = commPtr;
#if defined(ENABLE_NPKIT)
const char* npkitDumpDir = getenv("NPKIT_DUMP_DIR");
if (npkitDumpDir != nullptr) {
NpKit::Init(rank);
}
#endif
return ncclSuccess;
}

Expand All @@ -381,6 +389,13 @@ NCCL_API ncclResult_t ncclCommFinalize(ncclComm_t comm) {

NCCL_API ncclResult_t ncclCommDestroy(ncclComm_t comm) {
if (comm == nullptr) return ncclInvalidArgument;
#if defined(ENABLE_NPKIT)
const char* npkitDumpDir = getenv("NPKIT_DUMP_DIR");
if (npkitDumpDir != nullptr) {
NpKit::Dump(npkitDumpDir);
NpKit::Shutdown();
}
#endif
delete comm;
return ncclSuccess;
}
Expand Down
9 changes: 7 additions & 2 deletions include/mscclpp/npkit/npkit_event.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,12 @@
#define NPKIT_EVENT_EXECUTOR_INIT_ENTRY 0x3
#define NPKIT_EVENT_EXECUTOR_INIT_EXIT 0x4

#define NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY 0x5
#define NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT 0x17
#define NPKIT_EVENT_KERNEL_ALLREDUCE_ENTRY 0x5
#define NPKIT_EVENT_KERNEL_ALLREDUCE_EXIT 0x6

#define NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY 0x7
#define NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT 0x19



yzygitzh marked this conversation as resolved.
Show resolved Hide resolved
#endif