diff --git a/Makefile b/Makefile
index 40721edba..f2cce2d16 100644
--- a/Makefile
+++ b/Makefile
@@ -62,6 +62,7 @@ $(info ---------------------------------------------)
 
 ifneq ($(OS), Windows_NT)
   NVCC := $(shell which nvcc 2>/dev/null)
+  NVCC_LDFLAGS += -lnvidia-ml
 
   # Function to test if the compiler accepts a given flag.
   define check_and_add_flag
diff --git a/llmc/mfu.h b/llmc/mfu.h
index 7753305dc..1c40b7b7f 100644
--- a/llmc/mfu.h
+++ b/llmc/mfu.h
@@ -4,12 +4,29 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#if __has_include(<nvml.h>)
+#define USE_NVML 1
+#include <nvml.h>
+#else
+#define USE_NVML 0
+#endif
 
 // tied to enum PrecisionMode, in a future refactor make them the same
 #define MFUH_PRECISION_FP32 0
 #define MFUH_PRECISION_FP16 1
 #define MFUH_PRECISION_BF16 2
 
+#if USE_NVML
+inline void nvml_check(nvmlReturn_t status, const char *file, int line) {
+    if (status != NVML_SUCCESS) {
+        printf("[NVML ERROR] at file %s:%d:\n%s\n", file, line, nvmlErrorString(status));
+        exit(EXIT_FAILURE);
+    }
+};
+#define nvmlCheck(err) (nvml_check(err, __FILE__, __LINE__))
+#endif
+
+
 typedef struct {
     float TF_32;     // tensor-core performance 32 bit
     float BF_16_32;  // bf16 with 32 bit accumulate
@@ -134,4 +151,94 @@ float get_flops_promised(const char* device, int precision_mode) {
     return -1.0f; // ¯\_(ツ)_/¯
 }
 
+struct GPUUtilInfo {
+    unsigned int clock;
+    unsigned int max_clock;
+    unsigned int power;
+    unsigned int power_limit;
+    unsigned int fan;
+    unsigned int temperature;
+    unsigned int temp_slowdown;
+
+    float gpu_utilization;
+    float mem_utilization;
+    const char* throttle_reason;
+};
+
+// lazily initialize nvml and generate a handle to the GPU
+#if USE_NVML
+nvmlDevice_t nvml_get_device() {
+    static bool needs_init = true;
+    static nvmlDevice_t device;
+    if(needs_init) {
+        needs_init = false;
+        nvmlCheck(nvmlInit());
+        nvmlCheck(nvmlDeviceGetHandleByIndex_v2(0, &device));
+    }
+    return device;
+}
+
+// convert throttle reason bitfield into a text reason.
+// this is a lossy conversion; we just want to give some idea of what is happening
+const char* get_throttle_reason(unsigned long long bits) {
+    if(bits & (nvmlClocksThrottleReasonSwPowerCap | nvmlClocksThrottleReasonHwPowerBrakeSlowdown)) {
+        return "power cap";
+    } else if (bits & (nvmlClocksThrottleReasonSwThermalSlowdown | nvmlClocksThrottleReasonHwThermalSlowdown)) {
+        return "thermal cap";
+    } else if (bits & (nvmlClocksThrottleReasonAll)) {
+        return "other cap";
+    } else {
+        return "no cap";
+    }
+}
+
+// gather data for a GPUUtilInfo object
+GPUUtilInfo get_gpu_utilization_info() {
+    GPUUtilInfo info;
+    nvmlDevice_t device = nvml_get_device();
+    // query different infos directly
+    nvmlCheck(nvmlDeviceGetClockInfo(device, NVML_CLOCK_SM, &info.clock));
+    nvmlCheck(nvmlDeviceGetMaxClockInfo(device, NVML_CLOCK_SM, &info.max_clock));
+    nvmlCheck(nvmlDeviceGetPowerManagementLimit(device, &info.power_limit));
+    nvmlCheck(nvmlDeviceGetPowerUsage(device, &info.power));
+    nvmlCheck(nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &info.temperature));
+    nvmlCheck(nvmlDeviceGetTemperatureThreshold(device, NVML_TEMPERATURE_THRESHOLD_SLOWDOWN, &info.temp_slowdown));
+    unsigned long long throttle;
+    nvmlCheck(nvmlDeviceGetCurrentClocksThrottleReasons(device, &throttle));
+    info.throttle_reason = get_throttle_reason(throttle);
+    nvmlCheck(nvmlDeviceGetFanSpeed(device, &info.fan));
+
+    // for "utilization", we look at recorded samples. In principle, we could query the driver for how many samples
+    // to request, but then we'd need to dynamically allocate sufficient space. Let's just hard-code a limit of 128,
+    // and have no memory management required
+    constexpr const int BUFFER_LIMIT = 128;
+    nvmlSample_t buffer[BUFFER_LIMIT];
+    nvmlValueType_t v_type;
+    unsigned int sample_count = BUFFER_LIMIT;
+    nvmlCheck(nvmlDeviceGetSamples(device, NVML_GPU_UTILIZATION_SAMPLES, 0, &v_type, &sample_count, buffer));
+    float gpu_utilization = 0.f;
+    for(unsigned i = 0; i < sample_count; ++i) {
+        gpu_utilization += (float)buffer[i].sampleValue.uiVal;
+    }
+    gpu_utilization /= (float)sample_count;
+
+    // sample count may have been modified by the query above; reset back to buffer size
+    sample_count = BUFFER_LIMIT;
+    nvmlCheck(nvmlDeviceGetSamples(device, NVML_MEMORY_UTILIZATION_SAMPLES, 0, &v_type, &sample_count, buffer));
+    float mem_utilization = 0.f;
+    for(unsigned i = 0; i < sample_count; ++i) {
+        mem_utilization += (float)buffer[i].sampleValue.uiVal;
+    }
+    mem_utilization /= (float)sample_count;
+
+    info.gpu_utilization = gpu_utilization;
+    info.mem_utilization = mem_utilization;
+    return info;
+}
+#else
+GPUUtilInfo get_gpu_utilization_info() {
+    fprintf(stderr, "Error: Compiled without nvml support. Cannot perform additional GPU state tracking.");
+    exit(EXIT_FAILURE);
+}
+#endif
 #endif // MFU_H
diff --git a/train_gpt2.cu b/train_gpt2.cu
index 2737e64a3..16f801387 100644
--- a/train_gpt2.cu
+++ b/train_gpt2.cu
@@ -1358,6 +1358,7 @@ void error_usage() {
     fprintf(stderr, "  -j <string> val data filename pattern (default = dev/data/tinyshakespeare/tiny_shakespeare_val.bin)\n");
     fprintf(stderr, "  -e <string> input .bin filename or descriptor, see code comments as docs. (default = gpt2_124M_bf16.bin)\n");
     fprintf(stderr, "  -o <string> output log dir (default = NULL, no logging)\n");
+    fprintf(stderr, "  -lg <int>   log gpu info every x steps (default = -1; disabled)\n");
     fprintf(stderr, "  -n <int>    write optimization checkpoints every how many steps? (default 0, don't)\n");
     fprintf(stderr, "  -nk <int>   max number of checkpoints to keep in the directory, removing old ones (0 = disable, default)\n");
     fprintf(stderr, "  -nm <int>   every how many step checkpoints are considered major? major checkpoints never get deleted.\n");
@@ -1418,6 +1419,7 @@ int main(int argc, char *argv[]) {
     int T = 1024; // sequence length max
     int total_batch_size = -1; // will be calculated down below later, if not provided
     float learning_rate = 3e-4f;
+    int log_gpu_every = -1;
     int warmup_iterations = 0;
     float final_learning_rate_frac = 1.0f; // final fraction of learning rate, at end of training
     float weight_decay = 0.0f;
@@ -1456,7 +1458,8 @@ int main(int argc, char *argv[]) {
         else if (argv[i][1] == 'b') { B = atoi(argv[i+1]); } // Per-GPU (micro) batch size
         else if (argv[i][1] == 't') { T = atoi(argv[i+1]); }
         else if (argv[i][1] == 'd') { total_batch_size = atoi(argv[i+1]); }
-        else if (argv[i][1] == 'l') { learning_rate = atof(argv[i+1]); }
+        else if (argv[i][1] == 'l' && argv[i][2] == '\0') { learning_rate = atof(argv[i+1]); }
+        else if (argv[i][1] == 'l' && argv[i][2] == 'g') { log_gpu_every = atoi(argv[i+1]); }
         else if (argv[i][1] == 'u') { warmup_iterations = atoi(argv[i+1]); }
         else if (argv[i][1] == 'q') { final_learning_rate_frac = atof(argv[i+1]); }
         else if (argv[i][1] == 'c') { weight_decay = atof(argv[i+1]); }
@@ -1857,6 +1860,12 @@ int main(int argc, char *argv[]) {
         printf0("step %4d/%d | loss %7.6f (%+.2fz)| norm %6.4f (%+.2fz)| lr %.2e | %.2f ms | %.1f%% bf16 MFU | %.0f tok/s\n",
                 step + 1, train_num_batches, model.mean_loss, zloss, grad_norm, zgrad, step_learning_rate,
                 time_elapsed_ms, 100*mfu, bias_corrected_ema_tokens_per_second);
+        if(log_gpu_every > 0 && (step + 1) % log_gpu_every == 0) {
+            GPUUtilInfo gpu_info = get_gpu_utilization_info();
+            printf0(" compute %2.1f%% | memory: %2.1f%% | fan: %2d%% | %4d MHz / %4d MHz | %3d W / %3d W | %d°C / %d°C | %s\n",
+                    gpu_info.gpu_utilization, gpu_info.mem_utilization, gpu_info.fan, gpu_info.clock, gpu_info.max_clock, gpu_info.power / 1000, gpu_info.power_limit / 1000,
+                    gpu_info.temperature, gpu_info.temp_slowdown, gpu_info.throttle_reason);
+        }
         logger_log_train(&logger, step, model.mean_loss, step_learning_rate, grad_norm);
 
         // disable the profiler after 3 steps of optimization
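
Usage sketch (not part of the patch): the new GPUUtilInfo struct and get_gpu_utilization_info() added to llmc/mfu.h can also be exercised outside the training loop, which is handy for checking that NVML linking works before starting a long run. The snippet below is a hypothetical standalone probe; the file name gpu_probe.cu and the build command are assumptions, not something this diff adds. It relies on mfu.h being self-contained and on linking against -lnvidia-ml, exactly as the Makefile change above sets up.

    // gpu_probe.cu (hypothetical): print a one-shot GPU utilization report using llmc/mfu.h
    // build sketch from the repo root, assuming the CUDA toolkit and NVML headers are installed:
    //   nvcc -I. gpu_probe.cu -lnvidia-ml -o gpu_probe
    #include <stdio.h>
    #include "llmc/mfu.h"

    int main(void) {
        // exits with an error message if the header was compiled without NVML support
        GPUUtilInfo info = get_gpu_utilization_info();
        // power and power_limit are reported by NVML in milliwatts; divide by 1000 for watts
        printf("compute %.1f%% | memory %.1f%% | fan %u%% | %u MHz / %u MHz | %u W / %u W | %u C / %u C | %s\n",
               info.gpu_utilization, info.mem_utilization, info.fan,
               info.clock, info.max_clock,
               info.power / 1000, info.power_limit / 1000,
               info.temperature, info.temp_slowdown, info.throttle_reason);
        return 0;
    }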