diff --git a/llmc/encoder.cuh b/llmc/encoder.cuh
index 4985b19e4..682cebb83 100644
--- a/llmc/encoder.cuh
+++ b/llmc/encoder.cuh
@@ -169,9 +169,6 @@ void encoder_forward(floatX* out,
     const int grid_size = CEIL_DIV(N, (int)(block_size * x128::size));
     encoder_forward_kernel3<<<grid_size, block_size, 0, stream>>>(out, inp, wte, wpe, B, T, C, use_kv);
 
-    // Create a CPU B*T*C size buffer and memcopy the out tensor to it
-
-
     if (use_kv) {
         inp -= kv_offset;
         wpe -= kv_offset * C;
diff --git a/train_gpt2.cu b/train_gpt2.cu
index d2ecabe97..794e080fb 100644
--- a/train_gpt2.cu
+++ b/train_gpt2.cu
@@ -1744,7 +1744,7 @@ int main(int argc, char *argv[]) {
             }
             // now sample from the model autoregressively
             printf("generating:\n---\n");
-            model.use_kv = 0; // we need to use the KV cache for generation
+            model.use_kv = 1; // we need to use the KV cache for generation
             for (int t = 1; t < genT; t++) {
                 NvtxRange generation_range("Generation step", t);
                 // we try not to be too wasteful for inference by not calculating all of B,T