diff --git a/Makefile b/Makefile
index 40721edba..f2cce2d16 100644
--- a/Makefile
+++ b/Makefile
@@ -62,6 +62,7 @@ $(info ---------------------------------------------)
 
 ifneq ($(OS), Windows_NT)
   NVCC := $(shell which nvcc 2>/dev/null)
+  NVCC_LDFLAGS += -lnvidia-ml
 
   # Function to test if the compiler accepts a given flag.
   define check_and_add_flag
diff --git a/llmc/mfu.h b/llmc/mfu.h
index 7753305dc..1c40b7b7f 100644
--- a/llmc/mfu.h
+++ b/llmc/mfu.h
@@ -4,12 +4,29 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#if __has_include(<nvml.h>)
+#define USE_NVML 1
+#include <nvml.h>
+#else
+#define USE_NVML 0
+#endif
 
 // tied to enum PrecisionMode, in a future refactor make them the same
 #define MFUH_PRECISION_FP32 0
 #define MFUH_PRECISION_FP16 1
 #define MFUH_PRECISION_BF16 2
 
+#if USE_NVML
+inline void nvml_check(nvmlReturn_t status, const char *file, int line) {
+    if (status != NVML_SUCCESS) {
+        printf("[NVML ERROR] at file %s:%d:\n%s\n", file, line, nvmlErrorString(status));
+        exit(EXIT_FAILURE);
+    }
+};
+#define nvmlCheck(err) (nvml_check(err, __FILE__, __LINE__))
+#endif
+
+
 typedef struct {
     float TF_32;     // tensor-core performance 32 bit
     float BF_16_32;  // bf16 with 32 bit accumulate
@@ -134,4 +151,94 @@ float get_flops_promised(const char* device, int precision_mode) {
     return -1.0f; // ¯\_(ツ)_/¯
 }
 
+struct GPUUtilInfo {
+    unsigned int clock;
+    unsigned int max_clock;
+    unsigned int power;
+    unsigned int power_limit;
+    unsigned int fan;
+    unsigned int temperature;
+    unsigned int temp_slowdown;
+
+    float gpu_utilization;
+    float mem_utilization;
+    const char* throttle_reason;
+};
+
+// lazily initialize nvml and generate a handle to the GPU
+#if USE_NVML
+nvmlDevice_t nvml_get_device() {
+    static bool needs_init = true;
+    static nvmlDevice_t device;
+    if(needs_init) {
+        needs_init = false;
+        nvmlCheck(nvmlInit());
+        nvmlCheck(nvmlDeviceGetHandleByIndex_v2(0, &device));
+    }
+    return device;
+}
+
+// convert throttle reason bitfield into a text reason.
+// this is a lossy conversion; we just want to give some idea of what is happening
+const char* get_throttle_reason(unsigned long long bits) {
+    if(bits & (nvmlClocksThrottleReasonSwPowerCap | nvmlClocksThrottleReasonHwPowerBrakeSlowdown)) {
+        return "power cap";
+    } else if (bits & (nvmlClocksThrottleReasonSwThermalSlowdown | nvmlClocksThrottleReasonHwThermalSlowdown)) {
+        return "thermal cap";
+    } else if (bits & (nvmlClocksThrottleReasonAll)) {
+        return "other cap";
+    } else {
+        return "no cap";
+    }
+}
+
+// gather data for a GPUUtilInfo object
+GPUUtilInfo get_gpu_utilization_info() {
+    GPUUtilInfo info;
+    nvmlDevice_t device = nvml_get_device();
+    // query different infos directly
+    nvmlCheck(nvmlDeviceGetClockInfo(device, NVML_CLOCK_SM, &info.clock));
+    nvmlCheck(nvmlDeviceGetMaxClockInfo(device, NVML_CLOCK_SM, &info.max_clock));
+    nvmlCheck(nvmlDeviceGetPowerManagementLimit(device, &info.power_limit));
+    nvmlCheck(nvmlDeviceGetPowerUsage(device, &info.power));
+    nvmlCheck(nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &info.temperature));
+    nvmlCheck(nvmlDeviceGetTemperatureThreshold(device, NVML_TEMPERATURE_THRESHOLD_SLOWDOWN, &info.temp_slowdown));
+    unsigned long long throttle;
+    nvmlCheck(nvmlDeviceGetCurrentClocksThrottleReasons(device, &throttle));
+    info.throttle_reason = get_throttle_reason(throttle);
+    nvmlCheck(nvmlDeviceGetFanSpeed(device, &info.fan));
+
+    // for "utilization", we look at recorded samples. In principle, we could query the driver for how many samples
+    // to request, but then we'd need to dynamically allocate sufficient space. Let's just hard-code a limit of 128,
+    // and have no memory management required
+    constexpr const int BUFFER_LIMIT = 128;
+    nvmlSample_t buffer[BUFFER_LIMIT];
+    nvmlValueType_t v_type;
+    unsigned int sample_count = BUFFER_LIMIT;
+    nvmlCheck(nvmlDeviceGetSamples(device, NVML_GPU_UTILIZATION_SAMPLES, 0, &v_type, &sample_count, buffer));
+    float gpu_utilization = 0.f;
+    for(unsigned i = 0; i < sample_count; ++i) {
+        gpu_utilization += (float)buffer[i].sampleValue.uiVal;
+    }
+    gpu_utilization /= (float)sample_count;
+
+    // sample count may have been modified by the query above; reset back to buffer size
+    sample_count = BUFFER_LIMIT;
+    nvmlCheck(nvmlDeviceGetSamples(device, NVML_MEMORY_UTILIZATION_SAMPLES, 0, &v_type, &sample_count, buffer));
+    float mem_utilization = 0.f;
+    for(unsigned i = 0; i < sample_count; ++i) {
+        mem_utilization += (float)buffer[i].sampleValue.uiVal;
+    }
+    mem_utilization /= (float)sample_count;
+
+    info.gpu_utilization = gpu_utilization;
+    info.mem_utilization = mem_utilization;
+    return info;
+}
+#else
+GPUUtilInfo get_gpu_utilization_info() {
+    fprintf(stderr, "Error: Compiled without nvml support. Cannot perform additional GPU state tracking.");
+    exit(EXIT_FAILURE);
+}
+#endif
 #endif // MFU_H
diff --git a/train_gpt2.cu b/train_gpt2.cu
index 2737e64a3..16f801387 100644
--- a/train_gpt2.cu
+++ b/train_gpt2.cu
@@ -1358,6 +1358,7 @@ void error_usage() {
     fprintf(stderr, "  -j <string> val data filename pattern (default = dev/data/tinyshakespeare/tiny_shakespeare_val.bin)\n");
     fprintf(stderr, "  -e <string> input .bin filename or descriptor, see code comments as docs. (default = gpt2_124M_bf16.bin)\n");
     fprintf(stderr, "  -o <string> output log dir (default = NULL, no logging)\n");
+    fprintf(stderr, "  -lg <int>   log gpu info every x steps (default = -1; disabled)\n");
     fprintf(stderr, "  -n <int>    write optimization checkpoints every how many steps? (default 0, don't)\n");
     fprintf(stderr, "  -nk <int>   max number of checkpoints to keep in the directory, removing old ones (0 = disable, default)\n");
     fprintf(stderr, "  -nm <int>   every how many step checkpoints are considered major? major checkpoints never get deleted.\n");
@@ -1418,6 +1419,7 @@ int main(int argc, char *argv[]) {
     int T = 1024; // sequence length max
     int total_batch_size = -1; // will be calculated down below later, if not provided
     float learning_rate = 3e-4f;
+    int log_gpu_every = -1;
     int warmup_iterations = 0;
     float final_learning_rate_frac = 1.0f; // final fraction of learning rate, at end of training
     float weight_decay = 0.0f;
@@ -1456,7 +1458,8 @@ int main(int argc, char *argv[]) {
         else if (argv[i][1] == 'b') { B = atoi(argv[i+1]); } // Per-GPU (micro) batch size
         else if (argv[i][1] == 't') { T = atoi(argv[i+1]); }
         else if (argv[i][1] == 'd') { total_batch_size = atoi(argv[i+1]); }
-        else if (argv[i][1] == 'l') { learning_rate = atof(argv[i+1]); }
+        else if (argv[i][1] == 'l' && argv[i][2] == '\0') { learning_rate = atof(argv[i+1]); }
+        else if (argv[i][1] == 'l' && argv[i][2] == 'g') { log_gpu_every = atoi(argv[i+1]); }
         else if (argv[i][1] == 'u') { warmup_iterations = atoi(argv[i+1]); }
         else if (argv[i][1] == 'q') { final_learning_rate_frac = atof(argv[i+1]); }
         else if (argv[i][1] == 'c') { weight_decay = atof(argv[i+1]); }
@@ -1857,6 +1860,12 @@ int main(int argc, char *argv[]) {
         printf0("step %4d/%d | loss %7.6f (%+.2fz)| norm %6.4f (%+.2fz)| lr %.2e | %.2f ms | %.1f%% bf16 MFU | %.0f tok/s\n",
                 step + 1, train_num_batches, model.mean_loss, zloss, grad_norm, zgrad, step_learning_rate,
                 time_elapsed_ms, 100*mfu, bias_corrected_ema_tokens_per_second);
+        if(log_gpu_every > 0 && (step + 1) % log_gpu_every == 0) {
+            GPUUtilInfo gpu_info = get_gpu_utilization_info();
+            printf0(" compute %2.1f%% | memory: %2.1f%% | fan: %2d%% | %4d MHz / %4d MHz | %3d W / %3d W | %d°C / %d°C | %s\n",
+                    gpu_info.gpu_utilization, gpu_info.mem_utilization, gpu_info.fan, gpu_info.clock, gpu_info.max_clock, gpu_info.power / 1000, gpu_info.power_limit / 1000,
+                    gpu_info.temperature, gpu_info.temp_slowdown, gpu_info.throttle_reason);
+        }
         logger_log_train(&logger, step, model.mean_loss, step_learning_rate, grad_norm);
 
         // disable the profiler after 3 steps of optimization
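
Usage sketch (not part of the patch): the new GPUUtilInfo struct and get_gpu_utilization_info() added to llmc/mfu.h can also be exercised outside the training loop, which is handy for checking that NVML linking works before starting a long run. The snippet below is a hypothetical standalone probe; the file name gpu_probe.cu and the build command are assumptions, not something this diff adds. It relies on mfu.h being self-contained and on linking against -lnvidia-ml, exactly as the Makefile change above sets up.

    // gpu_probe.cu (hypothetical): print a one-shot GPU utilization report using llmc/mfu.h
    // build sketch from the repo root, assuming the CUDA toolkit and NVML headers are installed:
    //   nvcc -I. gpu_probe.cu -lnvidia-ml -o gpu_probe
    #include <stdio.h>
    #include "llmc/mfu.h"

    int main(void) {
        // exits with an error message if the header was compiled without NVML support
        GPUUtilInfo info = get_gpu_utilization_info();
        // power and power_limit are reported by NVML in milliwatts; divide by 1000 for watts
        printf("compute %.1f%% | memory %.1f%% | fan %u%% | %u MHz / %u MHz | %u W / %u W | %u C / %u C | %s\n",
               info.gpu_utilization, info.mem_utilization, info.fan,
               info.clock, info.max_clock,
               info.power / 1000, info.power_limit / 1000,
               info.temperature, info.temp_slowdown, info.throttle_reason);
        return 0;
    }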