Merge branch 'master' into add_high_perf
gordicaleksa committed Jul 26, 2024
2 parents ee1bd62 + cb44511 commit ec04185
Showing 7 changed files with 134 additions and 118 deletions.
2 changes: 2 additions & 0 deletions dev/unistd.h
@@ -13,6 +13,8 @@
#include <string.h>
#include <direct.h> // for _mkdir and _stat
#include <io.h> // needed for _access below and _findfirst, _findnext, _findclose
#pragma comment(lib, "Ws2_32.lib") // Link Ws2_32.lib for socket functions
#include <winsock2.h>

#define CLOCK_MONOTONIC 0
static inline int clock_gettime(int ignore_variable, struct timespec* tv)
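Because dev/unistd.h is the Windows compatibility shim, the winsock header and its link pragma now live here alongside the other Windows-only includes (they are dropped from llmc/utils.h below). The clock_gettime emulation in the surrounding context is what keeps the timing code POSIX-style on both platforms; a minimal usage sketch, assuming the shim mirrors POSIX clock_gettime semantics:

#include <stdio.h>
#include <time.h> // struct timespec; on Windows, dev/unistd.h supplies the shim instead

int main(void) {
    struct timespec start, end;
    clock_gettime(CLOCK_MONOTONIC, &start);
    // ... region being timed ...
    clock_gettime(CLOCK_MONOTONIC, &end);
    // convert the (sec, nsec) pair into elapsed milliseconds
    double ms = (end.tv_sec - start.tv_sec) * 1000.0
              + (end.tv_nsec - start.tv_nsec) / 1e6;
    printf("elapsed: %.3f ms\n", ms);
    return 0;
}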
30 changes: 30 additions & 0 deletions llmc/cuda_utils.cuh
@@ -79,6 +79,36 @@ __device__ void store128cg(ElementType* target, Packed128<ElementType> value) {
typedef Packed128<float> f128;
typedef Packed128<floatX> x128;

// ----------------------------------------------------------------------------
// DType support

// enum to identify the datatype of a tensor.
enum class DType : uint8_t {
    FP32, FP16, BF16
};

// Given a datatype enum, returns the underlying number of bytes
// for a scalar of that type
size_t sizeof_dtype(DType type) {
    switch (type) {
        case DType::FP32:
            return sizeof(float);
        case DType::FP16:
            return sizeof(half);
        case DType::BF16:
            return sizeof(nv_bfloat16);
        default: // handle or get compiler warning
            fprintf(stderr, "Unknown datatype\n");
            exit(EXIT_FAILURE);
    }
}

DType dtype_of(float* f) { return DType::FP32; }
DType dtype_of(nv_bfloat16* f) { return DType::BF16; }
DType dtype_of(half* f) { return DType::FP16; }

// ----------------------------------------------------------------------------
// Copy, cast functions

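The DType tag and its helpers let host code size and dispatch on tensors whose element type is only known at runtime. A minimal sketch of how they compose; the TensorSpec struct and tensor_bytes helper here are illustrative, not part of this file:

// Illustrative only: a runtime-typed buffer descriptor built on the helpers above.
struct TensorSpec {
    DType dtype;          // runtime element-type tag
    size_t num_elements;  // element count, not bytes
};

size_t tensor_bytes(const TensorSpec& t) {
    return t.num_elements * sizeof_dtype(t.dtype);  // bytes to allocate, e.g. via cudaMalloc
}

// dtype_of recovers the tag from a typed pointer, so e.g. a floatX* activation
// buffer maps to BF16, FP16, or FP32 depending on how floatX was compiled.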
1 change: 1 addition & 0 deletions llmc/cudnn_att.cpp
@@ -3,6 +3,7 @@
// TODO this currently duplicates some of the utilities from the main file

#define NOMINMAX
#include <unistd.h>
#include "cudnn_att.h"
#include <cudnn_frontend.h>

3 changes: 0 additions & 3 deletions llmc/utils.h
@@ -16,9 +16,6 @@
#ifndef _WIN32
#include <dirent.h>
#include <arpa/inet.h>
#else
#pragma comment(lib, "Ws2_32.lib") // Link Ws2_32.lib for socket functions
#include <winsock2.h>
#endif

// ----------------------------------------------------------------------------
1 change: 1 addition & 0 deletions profile_gpt2.cu
@@ -58,6 +58,7 @@ int main(int argc, char *argv[]) {
model.config.num_layers = 1;
set_zero_configs(&multi_gpu_config, 0, model.num_parameters);

gpt2_allocate_state(&model, B, T);
// do a training step
gpt2_forward(&model, x, B, T, NULL);
gpt2_backward_and_reduce(&model, x, y, 1, 0);
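Both drivers now call gpt2_allocate_state() explicitly between building the model and running the first step, which suggests state allocation has been split out of the forward pass. A minimal sketch of the resulting call order, using only calls visible in this diff (variable setup and error handling omitted):

GPT2 model;
gpt2_build_from_checkpoint(&model, "test_gpt2cu_model.ckpt"); // load weights
gpt2_allocate_state(&model, B, T);            // allocate run state for batch size B, seq len T
gpt2_forward(&model, x, B, T, NULL);          // forward pass (NULL: no targets)
gpt2_backward_and_reduce(&model, x, y, 1, 0); // backward pass + gradient reduction

The same ordering applies when resuming in test_gpt2.cu below: gpt2_allocate_state() precedes load_state(), so the buffers exist before the saved optimizer state is restored into them.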
3 changes: 3 additions & 0 deletions test_gpt2.cu
@@ -168,6 +168,8 @@ int main(int argc, char *argv[]) {
// overall OK signal for the test
int allok = 1;

gpt2_allocate_state(&model, B, T);

// First, do target-free forward pass to validate logits
gpt2_forward(&model, x, B, T, NULL);
// at this point, target should be equal to expected_logits, let's compare
@@ -346,6 +348,7 @@ int main(int argc, char *argv[]) {
gpt2_free(&model);
gpt2_build_from_checkpoint(&model, "test_gpt2cu_model.ckpt");
int ld_step;
gpt2_allocate_state(&model, B, T);
load_state(&ld_step, &model, &loader, "test_gpt2cu_state.ckpt");
for (int step = 0; step < 10; step++) {
dataloader_next_batch(&loader);
212 changes: 97 additions & 115 deletions train_gpt2.cu

Large diffs are not rendered by default.
