feat(gpu): add circulant matrix for one vs many poly product #2030

Open · wants to merge 1 commit into base: main
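Based on the PR title and the new entry points below, the change adds a "one versus many" wrapping polynomial product to the CUDA backend: a single left-hand polynomial is multiplied against n_rhs right-hand polynomials in one go, presumably by expressing the left operand as a (nega)circulant matrix so that the whole batch collapses into a single matrix product handled by the tiled tgemm kernel that this PR moves into linearalgebra/multiplication.cuh (the host code that builds the circulant operand is not part of this excerpt). Assuming the usual TFHE ring Z_q[X]/(X^N + 1) with q = 2^64, each output polynomial is result_r = lhs * rhs_r with the convention X^N = -1, so coefficient k receives lhs[i] * rhs_r[j] for i + j = k and minus lhs[i] * rhs_r[j] for i + j = k + N.

A minimal CPU reference for what such a product computes (an illustrative sketch only, not code from this PR; the function name and the back-to-back row layouts of rhs and result are assumptions):

#include <stdint.h>
#include <vector>

// Schoolbook negacyclic product of one polynomial `lhs` against `n_rhs`
// polynomials stored contiguously in `rhs` (one polynomial of N coefficients
// per row). Arithmetic wraps modulo 2^64.
std::vector<uint64_t> wrapping_poly_mul_one_to_many_ref(
    const std::vector<uint64_t> &lhs, const std::vector<uint64_t> &rhs,
    size_t N, size_t n_rhs) {
  std::vector<uint64_t> result(N * n_rhs, 0);
  for (size_t r = 0; r < n_rhs; ++r) {
    const uint64_t *b = &rhs[r * N];
    uint64_t *out = &result[r * N];
    for (size_t i = 0; i < N; ++i) {
      for (size_t j = 0; j < N; ++j) {
        uint64_t prod = lhs[i] * b[j]; // wrapping multiplication, q = 2^64
        if (i + j < N)
          out[i + j] += prod;          // contributes to X^(i+j)
        else
          out[i + j - N] -= prod;      // X^(i+j) = -X^(i+j-N) modulo X^N + 1
      }
    }
  }
  return result;
}

The GPU path presumably replaces this O(n_rhs * N^2) coefficient loop with one GEMM of the N x N circulant matrix of lhs against the N x n_rhs matrix whose columns are the rhs polynomials, which is what makes the generalized tgemm below reusable here.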
7 changes: 7 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
@@ -42,6 +42,13 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
void const *lwe_array_in, void const *cleartext_array_in,
const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count);
void cuda_wrapping_polynomial_mul_one_to_many_64(
void *stream, uint32_t gpu_index, void *result, void const *poly_lhs,
void const *poly_rhs, uint32_t polynomial_size, uint32_t n_rhs);
void cuda_glwe_wrapping_polynomial_mul_one_to_many_64(
void *stream, uint32_t gpu_index, void *result, void const *poly_lhs,
void const *poly_rhs, uint32_t polynomial_size, uint32_t glwe_dimension,
uint32_t n_rhs);
void cuda_add_lwe_ciphertext_vector_plaintext_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, const uint64_t plaintext_in,
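A sketch of how these entry points might be called (purely illustrative, not taken from this PR: stream, gpu_index and the d_* device pointers are assumed to be set up by the caller with the backend's usual CUDA helpers, and the flat back-to-back layouts are assumptions):

uint32_t polynomial_size = 2048; // example value
uint32_t glwe_dimension = 1;     // example value
uint32_t n_rhs = 8;              // number of right-hand-side polynomials

// Plain polynomial case: d_lhs holds one polynomial of polynomial_size u64,
// d_rhs holds n_rhs such polynomials back to back, d_result receives the
// n_rhs products (layouts assumed).
cuda_wrapping_polynomial_mul_one_to_many_64(stream, gpu_index, d_result,
                                            d_lhs, d_rhs, polynomial_size,
                                            n_rhs);

// GLWE case: d_glwe_lhs holds the (glwe_dimension + 1) polynomials of a GLWE
// ciphertext, each presumably multiplied against every rhs polynomial.
cuda_glwe_wrapping_polynomial_mul_one_to_many_64(stream, gpu_index,
                                                 d_glwe_result, d_glwe_lhs,
                                                 d_rhs, polynomial_size,
                                                 glwe_dimension, n_rhs);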
108 changes: 4 additions & 104 deletions backends/tfhe-cuda-backend/cuda/src/crypto/fast_packing_keyswitch.cuh
@@ -7,6 +7,7 @@
#include "gadget.cuh"
#include "helper_multi_gpu.h"
#include "keyswitch.cuh"
#include "linearalgebra/multiplication.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/polynomial_math.cuh"
#include "torus.cuh"
@@ -17,8 +18,6 @@

#define CEIL_DIV(M, N) ((M) + (N)-1) / (N)

const int BLOCK_SIZE_GEMM = 64;
const int THREADS_GEMM = 8;
const int BLOCK_SIZE_DECOMP = 8;

template <typename Torus> uint64_t get_shared_mem_size_tgemm() {
@@ -90,106 +89,6 @@ decompose_vectorize_step_inplace(Torus *buffer_in, uint32_t lwe_dimension,
buffer_in[state_idx] = state;
}

// Multiply matrices A, B of size (M, K), (K, N) respectively
// with K as the inner dimension.
//
// A block of threads processes blocks of size (BLOCK_SIZE_GEMM,
// BLOCK_SIZE_GEMM), splitting them into multiple tiles: (BLOCK_SIZE_GEMM,
// THREADS_GEMM)-shaped tiles of values from A, and (THREADS_GEMM,
// BLOCK_SIZE_GEMM)-shaped tiles of values from B.
//
// This code is adapted from the 1D block-tiling kernel in
// https://github.com/siboehm/SGEMM_CUDA, generalized to any matrix
// dimension.
template <typename Torus, typename TorusVec>
__global__ void tgemm(int M, int N, int K, const Torus *A, const Torus *B,
int stride_B, Torus *C) {

const int BM = BLOCK_SIZE_GEMM;
const int BN = BLOCK_SIZE_GEMM;
const int BK = THREADS_GEMM;
const int TM = THREADS_GEMM;

const uint cRow = blockIdx.y;
const uint cCol = blockIdx.x;

const int threadCol = threadIdx.x % BN;
const int threadRow = threadIdx.x / BN;

// Allocate space for the current block tile in shared memory
__shared__ Torus As[BM * BK];
__shared__ Torus Bs[BK * BN];

// Initialize the pointers to the input blocks from A, B
// Tiles from these blocks are loaded to shared memory
A += cRow * BM * K;
B += cCol * BN;

// Each thread will handle multiple sub-blocks
const uint innerColA = threadIdx.x % BK;
const uint innerRowA = threadIdx.x / BK;
const uint innerColB = threadIdx.x % BN;
const uint innerRowB = threadIdx.x / BN;

// allocate a thread-local cache for results in the register file
Torus threadResults[TM] = {0};

auto row_A = cRow * BM + innerRowA;
auto col_B = cCol * BN + innerColB;

// For each thread, loop over block tiles
for (uint bkIdx = 0; bkIdx < K; bkIdx += BK) {
auto col_A = bkIdx + innerColA;
auto row_B = bkIdx + innerRowB;

if (row_A < M && col_A < K) {
As[innerRowA * BK + innerColA] = A[innerRowA * K + innerColA];
} else {
As[innerRowA * BK + innerColA] = 0;
}

if (col_B < N && row_B < K) {
Bs[innerRowB * BN + innerColB] = B[innerRowB * stride_B + innerColB];
} else {
Bs[innerRowB * BN + innerColB] = 0;
}
synchronize_threads_in_block();

// Advance blocktile for the next iteration of this loop
A += BK;
B += BK * stride_B;

// calculate per-thread results
for (uint dotIdx = 0; dotIdx < BK; ++dotIdx) {
// we make the dot-product loop the outer loop, which facilitates
// reuse of the Bs entry, which we can cache in a tmp var.
Torus tmp = Bs[dotIdx * BN + threadCol];
for (uint resIdx = 0; resIdx < TM; ++resIdx) {
threadResults[resIdx] +=
As[(threadRow * TM + resIdx) * BK + dotIdx] * tmp;
}
}
synchronize_threads_in_block();
}

// Initialize the pointer to the output block of size (BLOCK_SIZE_GEMM,
// BLOCK_SIZE_GEMM)
C += cRow * BM * N + cCol * BN;

// write out the results
for (uint resIdx = 0; resIdx < TM; ++resIdx) {
int outRow = cRow * BM + threadRow * TM + resIdx;
int outCol = cCol * BN + threadCol;

if (outRow >= M)
continue;
if (outCol >= N)
continue;

C[(threadRow * TM + resIdx) * N + threadCol] += threadResults[resIdx];
}
}

// Finish the keyswitching operation and prepare GLWEs for accumulation.
// 1. Finish the keyswitching computation partially performed with a GEMM:
// - negate the dot product between the GLWE and KSK polynomial
@@ -312,7 +211,7 @@ __host__ void host_fast_packing_keyswitch_lwe_list_to_glwe(
uint32_t shared_mem_size = get_shared_mem_size_tgemm<Torus>();
tgemm<Torus, TorusVec><<<grid_gemm, threads_gemm, shared_mem_size, stream>>>(
num_lwes, glwe_accumulator_size, lwe_dimension, d_mem_0, fp_ksk_array,
stride_KSK_buffer, d_mem_1);
stride_KSK_buffer, d_mem_1, glwe_accumulator_size);
check_cuda_error(cudaGetLastError());

auto ksk_block_size = glwe_accumulator_size;
@@ -326,7 +225,8 @@
tgemm<Torus, TorusVec>
<<<grid_gemm, threads_gemm, shared_mem_size, stream>>>(
num_lwes, glwe_accumulator_size, lwe_dimension, d_mem_0,
fp_ksk_array + li * ksk_block_size, stride_KSK_buffer, d_mem_1);
fp_ksk_array + li * ksk_block_size, stride_KSK_buffer, d_mem_1,
glwe_accumulator_size);
check_cuda_error(cudaGetLastError());
}
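Both updated call sites pass glwe_accumulator_size for the new stride_C argument, which is the same value already passed as N (the second tgemm argument), so the packing-keyswitch path keeps its previous behaviour; a stride_C larger than N only comes into play for the new one-to-many polynomial product.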

@@ -1,4 +1,5 @@
#include "linearalgebra/multiplication.cuh"
#include "polynomial/polynomial_math.cuh"

/*
* Perform the multiplication of a u32 input LWE ciphertext vector with a u32
Expand Down Expand Up @@ -58,3 +59,24 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
static_cast<const uint64_t *>(cleartext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}

void cuda_wrapping_polynomial_mul_one_to_many_64(
void *stream, uint32_t gpu_index, void *result, void const *poly_lhs,
void const *poly_rhs, uint32_t polynomial_size, uint32_t n_rhs) {

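// The literal 0 below fills the glwe_dimension slot of the shared host
// helper (compare the GLWE variant underneath); presumably it marks a plain
// polynomial with no GLWE structure.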
host_wrapping_polynomial_mul_one_to_many<uint64_t, ulonglong4>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(result), static_cast<uint64_t const *>(poly_lhs),
static_cast<uint64_t const *>(poly_rhs), polynomial_size, 0, n_rhs);
}

void cuda_glwe_wrapping_polynomial_mul_one_to_many_64(
void *stream, uint32_t gpu_index, void *result, void const *glwe_lhs,
void const *poly_rhs, uint32_t polynomial_size, uint32_t glwe_dimension,
uint32_t n_rhs) {
host_glwe_wrapping_polynomial_mul_one_to_many<uint64_t, ulonglong4>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(result), static_cast<uint64_t const *>(glwe_lhs),
static_cast<uint64_t const *>(poly_rhs), polynomial_size, glwe_dimension,
n_rhs);
}
104 changes: 104 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh
@@ -86,4 +86,108 @@ host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
check_cuda_error(cudaGetLastError());
}

const int BLOCK_SIZE_GEMM = 64;
const int THREADS_GEMM = 8;
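// Note: with these constants each block stages (64 * 8 + 8 * 64) Torus values
// in shared memory for the As and Bs tiles below, i.e. 8 KiB for a 64-bit
// Torus, which is presumably what get_shared_mem_size_tgemm() in
// fast_packing_keyswitch.cuh accounts for.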

// Multiply matrices A, B of size (M, K), (K, N) respectively
// with K as the inner dimension.
//
// A block of threads processes blocks of size (BLOCK_SIZE_GEMM,
// BLOCK_SIZE_GEMM), splitting them into multiple tiles: (BLOCK_SIZE_GEMM,
// THREADS_GEMM)-shaped tiles of values from A, and (THREADS_GEMM,
// BLOCK_SIZE_GEMM)-shaped tiles of values from B.
//
// This code is adapted from the 1D block-tiling kernel in
// https://github.com/siboehm/SGEMM_CUDA, generalized to any matrix
// dimension.
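//
// B is read with a row stride of stride_B and C is written with a row stride
// of stride_C, so the output rows can land inside a destination buffer whose
// row stride is larger than the logical width N of C.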
template <typename Torus, typename TorusVec>
__global__ void tgemm(int M, int N, int K, const Torus *A, const Torus *B,
int stride_B, Torus *C, int stride_C) {
Contributor Author: I added a stride_C parameter, since I use this function to write the output matrix in a bigger buffer (a GLWE list) that has a bigger stride than the width of C, which is N.


const int BM = BLOCK_SIZE_GEMM;
const int BN = BLOCK_SIZE_GEMM;
const int BK = THREADS_GEMM;
const int TM = THREADS_GEMM;

const uint cRow = blockIdx.y;
const uint cCol = blockIdx.x;

const int threadCol = threadIdx.x % BN;
const int threadRow = threadIdx.x / BN;

// Allocate space for the current block tile in shared memory
__shared__ Torus As[BM * BK];
__shared__ Torus Bs[BK * BN];

// Initialize the pointers to the input blocks from A, B
// Tiles from these blocks are loaded to shared memory
A += cRow * BM * K;
B += cCol * BN;

// Each thread will handle multiple sub-blocks
const uint innerColA = threadIdx.x % BK;
const uint innerRowA = threadIdx.x / BK;
const uint innerColB = threadIdx.x % BN;
const uint innerRowB = threadIdx.x / BN;

// allocate a thread-local cache for results in the register file
Torus threadResults[TM] = {0};

auto row_A = cRow * BM + innerRowA;
auto col_B = cCol * BN + innerColB;

// For each thread, loop over block tiles
for (uint bkIdx = 0; bkIdx < K; bkIdx += BK) {
auto col_A = bkIdx + innerColA;
auto row_B = bkIdx + innerRowB;

if (row_A < M && col_A < K) {
As[innerRowA * BK + innerColA] = A[innerRowA * K + innerColA];
} else {
As[innerRowA * BK + innerColA] = 0;
}

if (col_B < N && row_B < K) {
Bs[innerRowB * BN + innerColB] = B[innerRowB * stride_B + innerColB];
} else {
Bs[innerRowB * BN + innerColB] = 0;
}
synchronize_threads_in_block();

// Advance blocktile for the next iteration of this loop
A += BK;
B += BK * stride_B;

// calculate per-thread results
for (uint dotIdx = 0; dotIdx < BK; ++dotIdx) {
// we make the dot-product loop the outer loop, which facilitates
// reuse of the Bs entry, which we can cache in a tmp var.
Torus tmp = Bs[dotIdx * BN + threadCol];
for (uint resIdx = 0; resIdx < TM; ++resIdx) {
threadResults[resIdx] +=
As[(threadRow * TM + resIdx) * BK + dotIdx] * tmp;
}
}
synchronize_threads_in_block();
}

// Initialize the pointer to the output block of size (BLOCK_SIZE_GEMM,
// BLOCK_SIZE_GEMM)
C += cRow * BM * stride_C + cCol * BN;

// write out the results
for (uint resIdx = 0; resIdx < TM; ++resIdx) {
int outRow = cRow * BM + threadRow * TM + resIdx;
int outCol = cCol * BN + threadCol;

if (outRow >= M)
continue;
if (outCol >= N)
continue;

C[(threadRow * TM + resIdx) * stride_C + threadCol] +=
threadResults[resIdx];
}
}

#endif // CUDA_MULT_H
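A minimal launch sketch inferred from the kernel's indexing (illustrative only; M, N, K, the d_* device pointers and stream are assumed to exist in the caller, and the grid/block shapes follow from how cRow, cCol and threadIdx.x are used above):

// One block computes a BLOCK_SIZE_GEMM x BLOCK_SIZE_GEMM output tile using
// BLOCK_SIZE_GEMM * THREADS_GEMM threads (threadIdx.x in [0, 512)).
dim3 grid_gemm((N + BLOCK_SIZE_GEMM - 1) / BLOCK_SIZE_GEMM,
               (M + BLOCK_SIZE_GEMM - 1) / BLOCK_SIZE_GEMM);
dim3 threads_gemm(BLOCK_SIZE_GEMM * THREADS_GEMM);
// As and Bs are statically sized in the kernel, so no dynamic shared memory
// is requested in this sketch; the call sites in fast_packing_keyswitch.cuh
// pass get_shared_mem_size_tgemm<Torus>() instead.
tgemm<uint64_t, ulonglong4><<<grid_gemm, threads_gemm, 0, stream>>>(
    M, N, K, d_A, d_B, /*stride_B=*/N, d_C, /*stride_C=*/N);
check_cuda_error(cudaGetLastError());

With stride_C == N this reproduces the behaviour of the kernel before this PR; passing a larger stride_C scatters the GEMM output rows into a wider buffer, as in the GLWE-list case described in the review comment above.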