Skip to content

Commit

Permalink
Merge pull request #10391 from rakhmets/topic/nvml-fabric-info-v1
Browse files Browse the repository at this point in the history
UCT/CUDA/CUDA_IPC: Switched to nvmlGpuFabricInfo v1 - v1.18.x
  • Loading branch information
yosefe authored Dec 19, 2024
2 parents 0dc692c + a1a097a commit ed3cebd
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 13 deletions.
6 changes: 3 additions & 3 deletions config/m4/cuda.m4
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,10 @@ AS_IF([test "x$cuda_checked" != "xyes"],
[AC_MSG_ERROR([libnvidia-ml not found. Install appropriate nvidia-driver package])])
cuda_happy="no"])])
-# Check for nvmlDeviceGetGpuFabricInfoV
-AC_CHECK_DECLS([nvmlDeviceGetGpuFabricInfoV],
+# Check for nvmlDeviceGetGpuFabricInfo
+AC_CHECK_DECLS([nvmlDeviceGetGpuFabricInfo],
[AC_DEFINE([HAVE_NVML_FABRIC_INFO], 1, [Enable NVML GPU fabric info support])],
-[AC_MSG_NOTICE([nvmlDeviceGetGpuFabricInfoV function not found in libnvidia-ml. MNNVL support will be disabled.])],
+[AC_MSG_NOTICE([nvmlDeviceGetGpuFabricInfo function not found in libnvidia-ml. MNNVL support will be disabled.])],
[[#include <nvml.h>]])
Expand Down
18 changes: 8 additions & 10 deletions src/uct/cuda/cuda_ipc/cuda_ipc_md.c
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,7 @@ uct_cuda_ipc_md_check_fabric_info(uct_cuda_ipc_md_t *md,
static int mnnvl_supported = 0;
#else
static int mnnvl_supported = -1;
-nvmlGpuFabricInfoV_t fabric_info;
+nvmlGpuFabricInfo_t fabric_info;
nvmlDevice_t device;
ucs_status_t status;
char buf[64];
Expand All @@ -436,19 +436,17 @@ uct_cuda_ipc_md_check_fabric_info(uct_cuda_ipc_md_t *md,
goto out_not_supported;
}

-fabric_info.version = nvmlGpuFabricInfo_v2;
-status = UCT_NVML_FUNC_LOG_ERR(
-nvmlDeviceGetGpuFabricInfoV(device, &fabric_info));
+status = UCT_NVML_FUNC_LOG_ERR(
+nvmlDeviceGetGpuFabricInfo(device, &fabric_info));
if (status != UCS_OK) {
goto out_not_supported;
}

-ucs_debug("fabric_info: healthmask=%u state=%u status=%u clique=%u uuid=%s",
-fabric_info.healthMask, fabric_info.state, fabric_info.status,
-fabric_info.cliqueId,
-ucs_str_dump_hex(
-fabric_info.clusterUuid, NVML_GPU_FABRIC_UUID_LEN, buf,
-sizeof(buf), SIZE_MAX));
+ucs_debug("fabric_info: state=%u status=%u uuid=%s", fabric_info.state,
+fabric_info.status,
+ucs_str_dump_hex(fabric_info.clusterUuid,
+NVML_GPU_FABRIC_UUID_LEN, buf, sizeof(buf),
+SIZE_MAX));

if ((fabric_info.state == NVML_GPU_FABRIC_STATE_COMPLETED) &&
(fabric_info.status == NVML_SUCCESS)) {
Expand Down

0 comments on commit ed3cebd

Please sign in to comment.