diff --git a/llmc/encoder.cuh b/llmc/encoder.cuh
index 4985b19e4..682cebb83 100644
--- a/llmc/encoder.cuh
+++ b/llmc/encoder.cuh
@@ -169,9 +169,6 @@ void encoder_forward(floatX* out,
     const int grid_size = CEIL_DIV(N, (int)(block_size * x128::size));
     encoder_forward_kernel3<<<grid_size, block_size, 0, stream>>>(out, inp, wte, wpe, B, T, C, use_kv);
 
-    // Create a CPU B*T*C size buffer and memcopy the out tensor to it
-
-
     if (use_kv) {
         inp -= kv_offset;
         wpe -= kv_offset * C;
diff --git a/train_gpt2.cu b/train_gpt2.cu
index d2ecabe97..794e080fb 100644
--- a/train_gpt2.cu
+++ b/train_gpt2.cu
@@ -1744,7 +1744,7 @@ int main(int argc, char *argv[]) {
             }
             // now sample from the model autoregressively
             printf("generating:\n---\n");
-            model.use_kv = 0; // we need to use the KV cache for generation
+            model.use_kv = 1; // we need to use the KV cache for generation
             for (int t = 1; t < genT; t++) {
                 NvtxRange generation_range("Generation step", t);
                 // we try not to be too wasteful for inference by not calculating all of B,T