Merge pull request #717 from ngc92/nvml
NVIDIA Management Library (NVML) for more detailed GPU state printing
karpathy authored Jul 30, 2024
2 parents 3cefe09 + e4b5e57 commit ef12d1b
Showing 3 changed files with 118 additions and 1 deletion.
1 change: 1 addition & 0 deletions Makefile
@@ -62,6 +62,7 @@ $(info ---------------------------------------------)

ifneq ($(OS), Windows_NT)
NVCC := $(shell which nvcc 2>/dev/null)
NVCC_LDFLAGS += -lnvidia-ml

# Function to test if the compiler accepts a given flag.
define check_and_add_flag
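The new NVCC_LDFLAGS entry links the binary against NVML, the NVIDIA Management Library that ships with the driver. As an illustrative manual compile (a sketch only; the real Makefile adds optimization and architecture flags on top of this):

    nvcc train_gpt2.cu -lcublas -lcublasLt -lnvidia-ml -o train_gpt2cu

Note that the flag is added unconditionally on non-Windows systems, so the build now assumes the NVML library is present wherever nvcc is.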
107 changes: 107 additions & 0 deletions llmc/mfu.h
@@ -4,12 +4,29 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#if __has_include(<nvml.h>)
#define USE_NVML 1
#include <nvml.h>
#else
#define USE_NVML 0
#endif

// tied to enum PrecisionMode, in a future refactor make them the same
#define MFUH_PRECISION_FP32 0
#define MFUH_PRECISION_FP16 1
#define MFUH_PRECISION_BF16 2

#if USE_NVML
inline void nvml_check(nvmlReturn_t status, const char *file, int line) {
    if (status != NVML_SUCCESS) {
        printf("[NVML ERROR] at file %s:%d:\n%s\n", file, line, nvmlErrorString(status));
        exit(EXIT_FAILURE);
    }
}
#define nvmlCheck(err) (nvml_check(err, __FILE__, __LINE__))
#endif


typedef struct {
float TF_32; // tensor-core performance 32 bit
float BF_16_32; // bf16 with 32 bit accumulate
@@ -134,4 +151,94 @@ float get_flops_promised(const char* device, int precision_mode) {
return -1.0f; // ¯\_(ツ)_/¯
}

struct GPUUtilInfo {
    unsigned int clock;          // current SM clock, in MHz
    unsigned int max_clock;      // maximum SM clock, in MHz
    unsigned int power;          // current power draw, in mW
    unsigned int power_limit;    // power management limit, in mW
    unsigned int fan;            // fan speed, in percent
    unsigned int temperature;    // current temperature, in °C
    unsigned int temp_slowdown;  // temperature at which the GPU starts to throttle, in °C

    float gpu_utilization;       // in percent, averaged over the driver's recent samples
    float mem_utilization;       // in percent, averaged over the driver's recent samples
    const char* throttle_reason; // short text label for the current throttle reason
};

// lazily initialize nvml and generate a handle to the GPU
#if USE_NVML
nvmlDevice_t nvml_get_device() {
    static bool needs_init = true;
    static nvmlDevice_t device;
    if(needs_init) {
        needs_init = false;
        nvmlCheck(nvmlInit());
        nvmlCheck(nvmlDeviceGetHandleByIndex_v2(0, &device));
    }
    return device;
}

// convert the throttle-reason bitfield into a short text label.
// this is a lossy conversion; we just want to give some idea of what is happening
const char* get_throttle_reason(unsigned long long bits) {
    if(bits & (nvmlClocksThrottleReasonSwPowerCap | nvmlClocksThrottleReasonHwPowerBrakeSlowdown)) {
        return "power cap";
    } else if (bits & (nvmlClocksThrottleReasonSwThermalSlowdown | nvmlClocksThrottleReasonHwThermalSlowdown)) {
        return "thermal cap";
    } else if (bits & (nvmlClocksThrottleReasonAll)) {
        return "other cap";
    } else {
        return "no cap";
    }
}
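Because the branches are tested in order, power reasons win over thermal ones when several bits are set at once. An illustrative fragment (not part of the original code):

    // both a power bit and a thermal bit set -> reported as "power cap"
    const char* reason = get_throttle_reason(
        nvmlClocksThrottleReasonSwPowerCap | nvmlClocksThrottleReasonSwThermalSlowdown);

nvmlClocksThrottleReasonAll is NVML's catch-all mask of every defined reason, so any bit not matched by the first two tests reports as "other cap".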

// gather data for a GPUUtilInfo object
GPUUtilInfo get_gpu_utilization_info() {
    GPUUtilInfo info;
    nvmlDevice_t device = nvml_get_device();
    // query the individual metrics directly
    nvmlCheck(nvmlDeviceGetClockInfo(device, NVML_CLOCK_SM, &info.clock));
    nvmlCheck(nvmlDeviceGetMaxClockInfo(device, NVML_CLOCK_SM, &info.max_clock));
    nvmlCheck(nvmlDeviceGetPowerManagementLimit(device, &info.power_limit));
    nvmlCheck(nvmlDeviceGetPowerUsage(device, &info.power));
    nvmlCheck(nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &info.temperature));
    nvmlCheck(nvmlDeviceGetTemperatureThreshold(device, NVML_TEMPERATURE_THRESHOLD_SLOWDOWN, &info.temp_slowdown));
    unsigned long long throttle;
    nvmlCheck(nvmlDeviceGetCurrentClocksThrottleReasons(device, &throttle));
    info.throttle_reason = get_throttle_reason(throttle);
    nvmlCheck(nvmlDeviceGetFanSpeed(device, &info.fan));

    // For "utilization", we average the driver's recorded samples. In principle, we could query the driver
    // for how many samples to request, but then we'd need to dynamically allocate sufficient space. Instead,
    // hard-code a limit of 128 samples so that no memory management is required.
    constexpr int BUFFER_LIMIT = 128;
    nvmlSample_t buffer[BUFFER_LIMIT];
    nvmlValueType_t v_type;
    unsigned int sample_count = BUFFER_LIMIT;
    nvmlCheck(nvmlDeviceGetSamples(device, NVML_GPU_UTILIZATION_SAMPLES, 0, &v_type, &sample_count, buffer));
    float gpu_utilization = 0.f;
    for(unsigned i = 0; i < sample_count; ++i) {
        gpu_utilization += (float)buffer[i].sampleValue.uiVal;
    }
    gpu_utilization /= (float)sample_count;

    // sample_count may have been modified by the query above; reset it back to the buffer size
    sample_count = BUFFER_LIMIT;
    nvmlCheck(nvmlDeviceGetSamples(device, NVML_MEMORY_UTILIZATION_SAMPLES, 0, &v_type, &sample_count, buffer));
    float mem_utilization = 0.f;
    for(unsigned i = 0; i < sample_count; ++i) {
        mem_utilization += (float)buffer[i].sampleValue.uiVal;
    }
    mem_utilization /= (float)sample_count;

    info.gpu_utilization = gpu_utilization;
    info.mem_utilization = mem_utilization;
    return info;
}
#else
GPUUtilInfo get_gpu_utilization_info() {
    fprintf(stderr, "Error: Compiled without NVML support. Cannot perform additional GPU state tracking.\n");
    exit(EXIT_FAILURE);
}
#endif
#endif // MFU_H
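Since llmc/mfu.h is self-contained, the new query path can also be exercised outside the trainer. A minimal standalone sketch (hypothetical file name and compile line, assuming the CUDA toolkit headers and the driver's NVML library are installed):

    // gpu_monitor.cu (hypothetical): print one GPUUtilInfo snapshot per second
    // compile: nvcc gpu_monitor.cu -o gpu_monitor -lnvidia-ml
    #include <stdio.h>
    #include <unistd.h>
    #include "llmc/mfu.h"

    int main(void) {
        for (int i = 0; i < 10; i++) {
            GPUUtilInfo info = get_gpu_utilization_info();
            printf("compute %.1f%% | %u MHz / %u MHz | %u W / %u W | %u°C | %s\n",
                   info.gpu_utilization, info.clock, info.max_clock,
                   info.power / 1000, info.power_limit / 1000,
                   info.temperature, info.throttle_reason);
            sleep(1);
        }
        return 0;
    }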
11 changes: 10 additions & 1 deletion train_gpt2.cu
@@ -1358,6 +1358,7 @@ void error_usage() {
fprintf(stderr, " -j <string> val data filename pattern (default = dev/data/tinyshakespeare/tiny_shakespeare_val.bin)\n");
fprintf(stderr, " -e <string> input .bin filename or descriptor, see code comments as docs. (default = gpt2_124M_bf16.bin)\n");
fprintf(stderr, " -o <string> output log dir (default = NULL, no logging)\n");
fprintf(stderr, " -lg <int> log gpu info every x steps (default = -1; disabled)\n");
fprintf(stderr, " -n <int> write optimization checkpoints every how many steps? (default 0, don't)\n");
fprintf(stderr, " -nk <int> max number of checkpoints to keep in the directory, removing old ones (0 = disable, default)\n");
fprintf(stderr, " -nm <int> every how many step checkpoints are considered major? major checkpoints never get deleted.\n");
@@ -1418,6 +1419,7 @@ int main(int argc, char *argv[]) {
int T = 1024; // sequence length max
int total_batch_size = -1; // will be calculated down below later, if not provided
float learning_rate = 3e-4f;
int log_gpu_every = -1;
int warmup_iterations = 0;
float final_learning_rate_frac = 1.0f; // final fraction of learning rate, at end of training
float weight_decay = 0.0f;
@@ -1456,7 +1458,8 @@ int main(int argc, char *argv[]) {
else if (argv[i][1] == 'b') { B = atoi(argv[i+1]); } // Per-GPU (micro) batch size
else if (argv[i][1] == 't') { T = atoi(argv[i+1]); }
else if (argv[i][1] == 'd') { total_batch_size = atoi(argv[i+1]); }
else if (argv[i][1] == 'l') { learning_rate = atof(argv[i+1]); }
else if (argv[i][1] == 'l' && argv[i][2] == '\0') { learning_rate = atof(argv[i+1]); }
else if (argv[i][1] == 'l' && argv[i][2] == 'g') { log_gpu_every = atoi(argv[i+1]); }
else if (argv[i][1] == 'u') { warmup_iterations = atoi(argv[i+1]); }
else if (argv[i][1] == 'q') { final_learning_rate_frac = atof(argv[i+1]); }
else if (argv[i][1] == 'c') { weight_decay = atof(argv[i+1]); }
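The extra argv[i][2] check is what lets the one-letter -l and the two-letter -lg coexist: plain -l matches only when the flag ends right after the 'l' (argv[i][2] == '\0') and keeps setting the learning rate, while -lg takes the new branch. An illustrative combined invocation:

    ./train_gpt2cu -l 3e-4 -lg 10

This would train with learning rate 3e-4 and print the GPU state line every 10 steps.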
@@ -1857,6 +1860,12 @@ int main(int argc, char *argv[]) {
printf0("step %4d/%d | loss %7.6f (%+.2fz)| norm %6.4f (%+.2fz)| lr %.2e | %.2f ms | %.1f%% bf16 MFU | %.0f tok/s\n",
step + 1, train_num_batches, model.mean_loss, zloss, grad_norm, zgrad, step_learning_rate,
time_elapsed_ms, 100*mfu, bias_corrected_ema_tokens_per_second);
if(log_gpu_every > 0 && (step + 1) % log_gpu_every == 0) {
    GPUUtilInfo gpu_info = get_gpu_utilization_info();
    printf0(" compute %2.1f%% | memory: %2.1f%% | fan: %2d%% | %4d MHz / %4d MHz | %3d W / %3d W | %d°C / %d°C | %s\n",
            gpu_info.gpu_utilization, gpu_info.mem_utilization, gpu_info.fan, gpu_info.clock, gpu_info.max_clock, gpu_info.power / 1000, gpu_info.power_limit / 1000,
            gpu_info.temperature, gpu_info.temp_slowdown, gpu_info.throttle_reason);
}
logger_log_train(&logger, step, model.mean_loss, step_learning_rate, grad_norm);

// disable the profiler after 3 steps of optimization
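For reference, the GPU state line produced by the printf0 above renders like this (numbers are made up purely to illustrate the format):

    compute 97.5% | memory: 42.0% | fan: 35% | 1830 MHz / 1980 MHz | 285 W / 350 W | 64°C / 90°C | no cap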
