Merge branch 'master' into add_high_perf
gordicaleksa committed Jul 26, 2024
2 parents ee1bd62 + cb44511 commit ec04185
Showing 7 changed files with 134 additions and 118 deletions.
2 changes: 2 additions & 0 deletions dev/unistd.h
@@ -13,6 +13,8 @@
#include <string.h>
#include <direct.h> // for _mkdir and _stat
#include <io.h> // needed for _access below and _findfirst, _findnext, _findclose
#pragma comment(lib, "Ws2_32.lib") // Link Ws2_32.lib for socket functions
#include <winsock2.h>

#define CLOCK_MONOTONIC 0
static inline int clock_gettime(int ignore_variable, struct timespec* tv)
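Because dev/unistd.h is the Windows compatibility shim, the winsock header and its link pragma now live here alongside the other Windows-only includes (they are dropped from llmc/utils.h below). The clock_gettime emulation in the surrounding context is what keeps the timing code POSIX-style on both platforms; a minimal usage sketch, assuming the shim mirrors POSIX clock_gettime semantics:

#include <stdio.h>
#include <time.h> // struct timespec; on Windows, dev/unistd.h supplies the shim instead

int main(void) {
    struct timespec start, end;
    clock_gettime(CLOCK_MONOTONIC, &start);
    // ... region being timed ...
    clock_gettime(CLOCK_MONOTONIC, &end);
    // convert the (sec, nsec) pair into elapsed milliseconds
    double ms = (end.tv_sec - start.tv_sec) * 1000.0
              + (end.tv_nsec - start.tv_nsec) / 1e6;
    printf("elapsed: %.3f ms\n", ms);
    return 0;
}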
30 changes: 30 additions & 0 deletions llmc/cuda_utils.cuh
@@ -79,6 +79,36 @@ __device__ void store128cg(ElementType* target, Packed128<ElementType> value) {
typedef Packed128<float> f128;
typedef Packed128<floatX> x128;

// ----------------------------------------------------------------------------
// DType support

// enum to identify the datatype of a tensor.
enum class DType : uint8_t {
    FP32, FP16, BF16
};

// Given a datatype enum, returns the underlying number of bytes
// for a scalar of that type
size_t sizeof_dtype(DType type) {
    switch (type) {
        case DType::FP32:
            return sizeof(float);
        case DType::FP16:
            return sizeof(half);
        case DType::BF16:
            return sizeof(nv_bfloat16);
        default: // handle or get compiler warning
            fprintf(stderr, "Unknown datatype\n");
            exit(EXIT_FAILURE);
    }
}

DType dtype_of(float* f) { return DType::FP32; }
DType dtype_of(nv_bfloat16* f) { return DType::BF16; }
DType dtype_of(half* f) { return DType::FP16; }

// ----------------------------------------------------------------------------
// Copy, cast functions

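The DType tag and its helpers let host code size and dispatch on tensors whose element type is only known at runtime. A minimal sketch of how they compose; the TensorSpec struct and tensor_bytes helper here are illustrative, not part of this file:

// Illustrative only: a runtime-typed buffer descriptor built on the helpers above.
struct TensorSpec {
    DType dtype;          // runtime element-type tag
    size_t num_elements;  // element count, not bytes
};

size_t tensor_bytes(const TensorSpec& t) {
    return t.num_elements * sizeof_dtype(t.dtype);  // bytes to allocate, e.g. via cudaMalloc
}

// dtype_of recovers the tag from a typed pointer, so e.g. a floatX* activation
// buffer maps to BF16, FP16, or FP32 depending on how floatX was compiled.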
1 change: 1 addition & 0 deletions llmc/cudnn_att.cpp
@@ -3,6 +3,7 @@
// TODO this currently duplicates some of the utilities from the main file

#define NOMINMAX
#include <unistd.h>
#include "cudnn_att.h"
#include <cudnn_frontend.h>

3 changes: 0 additions & 3 deletions llmc/utils.h
@@ -16,9 +16,6 @@
#ifndef _WIN32
#include <dirent.h>
#include <arpa/inet.h>
#else
#pragma comment(lib, "Ws2_32.lib") // Link Ws2_32.lib for socket functions
#include <winsock2.h>
#endif

// ----------------------------------------------------------------------------
1 change: 1 addition & 0 deletions profile_gpt2.cu
@@ -58,6 +58,7 @@ int main(int argc, char *argv[]) {
model.config.num_layers = 1;
set_zero_configs(&multi_gpu_config, 0, model.num_parameters);

gpt2_allocate_state(&model, B, T);
// do a training step
gpt2_forward(&model, x, B, T, NULL);
gpt2_backward_and_reduce(&model, x, y, 1, 0);
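Both drivers now call gpt2_allocate_state() explicitly between building the model and running the first step, which suggests state allocation has been split out of the forward pass. A minimal sketch of the resulting call order, using only calls visible in this diff (variable setup and error handling omitted):

GPT2 model;
gpt2_build_from_checkpoint(&model, "test_gpt2cu_model.ckpt"); // load weights
gpt2_allocate_state(&model, B, T);            // allocate run state for batch size B, seq len T
gpt2_forward(&model, x, B, T, NULL);          // forward pass (NULL: no targets)
gpt2_backward_and_reduce(&model, x, y, 1, 0); // backward pass + gradient reduction

The same ordering applies when resuming in test_gpt2.cu below: gpt2_allocate_state() precedes load_state(), so the buffers exist before the saved optimizer state is restored into them.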
3 changes: 3 additions & 0 deletions test_gpt2.cu
@@ -168,6 +168,8 @@ int main(int argc, char *argv[]) {
// overall OK signal for the test
int allok = 1;

gpt2_allocate_state(&model, B, T);

// First, do target-free forward pass to validate logits
gpt2_forward(&model, x, B, T, NULL);
// at this point, target should be equal to expected_logits, let's compare
@@ -346,6 +348,7 @@ int main(int argc, char *argv[]) {
gpt2_free(&model);
gpt2_build_from_checkpoint(&model, "test_gpt2cu_model.ckpt");
int ld_step;
gpt2_allocate_state(&model, B, T);
load_state(&ld_step, &model, &loader, "test_gpt2cu_state.ckpt");
for (int step = 0; step < 10; step++) {
dataloader_next_batch(&loader);
212 changes: 97 additions & 115 deletions train_gpt2.cu

Large diffs are not rendered by default.
