feat(gpu): add circulant matrix for one vs many poly product #2030

Open · wants to merge 1 commit into base: main
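Based on the PR title and the new entry points below, the change adds a "one versus many" wrapping polynomial product to the CUDA backend: a single left-hand polynomial is multiplied against n_rhs right-hand polynomials in one go, presumably by expressing the left operand as a (nega)circulant matrix so that the whole batch collapses into a single matrix product handled by the tiled tgemm kernel that this PR moves into linearalgebra/multiplication.cuh (the host code that builds the circulant operand is not part of this excerpt). Assuming the usual TFHE ring Z_q[X]/(X^N + 1) with q = 2^64, each output polynomial is result_r = lhs * rhs_r with the convention X^N = -1, so coefficient k receives lhs[i] * rhs_r[j] for i + j = k and minus lhs[i] * rhs_r[j] for i + j = k + N.

A minimal CPU reference for what such a product computes (an illustrative sketch only, not code from this PR; the function name and the back-to-back row layouts of rhs and result are assumptions):

#include <stdint.h>
#include <vector>

// Schoolbook negacyclic product of one polynomial `lhs` against `n_rhs`
// polynomials stored contiguously in `rhs` (one polynomial of N coefficients
// per row). Arithmetic wraps modulo 2^64.
std::vector<uint64_t> wrapping_poly_mul_one_to_many_ref(
    const std::vector<uint64_t> &lhs, const std::vector<uint64_t> &rhs,
    size_t N, size_t n_rhs) {
  std::vector<uint64_t> result(N * n_rhs, 0);
  for (size_t r = 0; r < n_rhs; ++r) {
    const uint64_t *b = &rhs[r * N];
    uint64_t *out = &result[r * N];
    for (size_t i = 0; i < N; ++i) {
      for (size_t j = 0; j < N; ++j) {
        uint64_t prod = lhs[i] * b[j]; // wrapping multiplication, q = 2^64
        if (i + j < N)
          out[i + j] += prod;          // contributes to X^(i+j)
        else
          out[i + j - N] -= prod;      // X^(i+j) = -X^(i+j-N) modulo X^N + 1
      }
    }
  }
  return result;
}

The GPU path presumably replaces this O(n_rhs * N^2) coefficient loop with one GEMM of the N x N circulant matrix of lhs against the N x n_rhs matrix whose columns are the rhs polynomials, which is what makes the generalized tgemm below reusable here.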
7 changes: 7 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
@@ -42,6 +42,13 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
void const *lwe_array_in, void const *cleartext_array_in,
const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count);
void cuda_wrapping_polynomial_mul_one_to_many_64(
void *stream, uint32_t gpu_index, void *result, void const *poly_lhs,
void const *poly_rhs, uint32_t polynomial_size, uint32_t n_rhs);
void cuda_glwe_wrapping_polynomial_mul_one_to_many_64(
void *stream, uint32_t gpu_index, void *result, void const *poly_lhs,
void const *poly_rhs, uint32_t polynomial_size, uint32_t glwe_dimension,
uint32_t n_rhs);
void cuda_add_lwe_ciphertext_vector_plaintext_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, const uint64_t plaintext_in,
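A sketch of how these entry points might be called (purely illustrative, not taken from this PR: stream, gpu_index and the d_* device pointers are assumed to be set up by the caller with the backend's usual CUDA helpers, and the flat back-to-back layouts are assumptions):

uint32_t polynomial_size = 2048; // example value
uint32_t glwe_dimension = 1;     // example value
uint32_t n_rhs = 8;              // number of right-hand-side polynomials

// Plain polynomial case: d_lhs holds one polynomial of polynomial_size u64,
// d_rhs holds n_rhs such polynomials back to back, d_result receives the
// n_rhs products (layouts assumed).
cuda_wrapping_polynomial_mul_one_to_many_64(stream, gpu_index, d_result,
                                            d_lhs, d_rhs, polynomial_size,
                                            n_rhs);

// GLWE case: d_glwe_lhs holds the (glwe_dimension + 1) polynomials of a GLWE
// ciphertext, each presumably multiplied against every rhs polynomial.
cuda_glwe_wrapping_polynomial_mul_one_to_many_64(stream, gpu_index,
                                                 d_glwe_result, d_glwe_lhs,
                                                 d_rhs, polynomial_size,
                                                 glwe_dimension, n_rhs);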
108 changes: 4 additions & 104 deletions backends/tfhe-cuda-backend/cuda/src/crypto/fast_packing_keyswitch.cuh
@@ -7,6 +7,7 @@
#include "gadget.cuh"
#include "helper_multi_gpu.h"
#include "keyswitch.cuh"
#include "linearalgebra/multiplication.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/polynomial_math.cuh"
#include "torus.cuh"
@@ -17,8 +18,6 @@

#define CEIL_DIV(M, N) ((M) + (N)-1) / (N)

const int BLOCK_SIZE_GEMM = 64;
const int THREADS_GEMM = 8;
const int BLOCK_SIZE_DECOMP = 8;

template <typename Torus> uint64_t get_shared_mem_size_tgemm() {
@@ -90,106 +89,6 @@ decompose_vectorize_step_inplace(Torus *buffer_in, uint32_t lwe_dimension,
buffer_in[state_idx] = state;
}

// Multiply matrices A, B of size (M, K), (K, N) respectively
// with K as the inner dimension.
//
// A block of threads processes blocks of size (BLOCK_SIZE_GEMM,
// BLOCK_SIZE_GEMM), splitting them into multiple tiles: (BLOCK_SIZE_GEMM,
// THREADS_GEMM)-shaped tiles of values from A, and (THREADS_GEMM,
// BLOCK_SIZE_GEMM)-shaped tiles of values from B.
//
// This code is adapted from the 1D block-tiling kernel in
// https://github.com/siboehm/SGEMM_CUDA, generalized to any matrix
// dimension.
template <typename Torus, typename TorusVec>
__global__ void tgemm(int M, int N, int K, const Torus *A, const Torus *B,
int stride_B, Torus *C) {

const int BM = BLOCK_SIZE_GEMM;
const int BN = BLOCK_SIZE_GEMM;
const int BK = THREADS_GEMM;
const int TM = THREADS_GEMM;

const uint cRow = blockIdx.y;
const uint cCol = blockIdx.x;

const int threadCol = threadIdx.x % BN;
const int threadRow = threadIdx.x / BN;

// Allocate space for the current block tile in shared memory
__shared__ Torus As[BM * BK];
__shared__ Torus Bs[BK * BN];

// Initialize the pointers to the input blocks from A, B
// Tiles from these blocks are loaded to shared memory
A += cRow * BM * K;
B += cCol * BN;

// Each thread will handle multiple sub-blocks
const uint innerColA = threadIdx.x % BK;
const uint innerRowA = threadIdx.x / BK;
const uint innerColB = threadIdx.x % BN;
const uint innerRowB = threadIdx.x / BN;

// allocate a thread-local cache for results in the register file
Torus threadResults[TM] = {0};

auto row_A = cRow * BM + innerRowA;
auto col_B = cCol * BN + innerColB;

// For each thread, loop over block tiles
for (uint bkIdx = 0; bkIdx < K; bkIdx += BK) {
auto col_A = bkIdx + innerColA;
auto row_B = bkIdx + innerRowB;

if (row_A < M && col_A < K) {
As[innerRowA * BK + innerColA] = A[innerRowA * K + innerColA];
} else {
As[innerRowA * BK + innerColA] = 0;
}

if (col_B < N && row_B < K) {
Bs[innerRowB * BN + innerColB] = B[innerRowB * stride_B + innerColB];
} else {
Bs[innerRowB * BN + innerColB] = 0;
}
synchronize_threads_in_block();

// Advance blocktile for the next iteration of this loop
A += BK;
B += BK * stride_B;

// calculate per-thread results
for (uint dotIdx = 0; dotIdx < BK; ++dotIdx) {
// we make the dot-product loop the outer loop, which facilitates
// reuse of the Bs entry, which we can cache in a tmp var.
Torus tmp = Bs[dotIdx * BN + threadCol];
for (uint resIdx = 0; resIdx < TM; ++resIdx) {
threadResults[resIdx] +=
As[(threadRow * TM + resIdx) * BK + dotIdx] * tmp;
}
}
synchronize_threads_in_block();
}

// Initialize the pointer to the output block of size (BLOCK_SIZE_GEMM,
// BLOCK_SIZE_GEMM)
C += cRow * BM * N + cCol * BN;

// write out the results
for (uint resIdx = 0; resIdx < TM; ++resIdx) {
int outRow = cRow * BM + threadRow * TM + resIdx;
int outCol = cCol * BN + threadCol;

if (outRow >= M)
continue;
if (outCol >= N)
continue;

C[(threadRow * TM + resIdx) * N + threadCol] += threadResults[resIdx];
}
}

// Finish the keyswitching operation and prepare GLWEs for accumulation.
// 1. Finish the keyswitching computation partially performed with a GEMM:
// - negate the dot product between the GLWE and KSK polynomial
@@ -312,7 +211,7 @@ __host__ void host_fast_packing_keyswitch_lwe_list_to_glwe(
uint32_t shared_mem_size = get_shared_mem_size_tgemm<Torus>();
tgemm<Torus, TorusVec><<<grid_gemm, threads_gemm, shared_mem_size, stream>>>(
num_lwes, glwe_accumulator_size, lwe_dimension, d_mem_0, fp_ksk_array,
stride_KSK_buffer, d_mem_1);
stride_KSK_buffer, d_mem_1, glwe_accumulator_size);
check_cuda_error(cudaGetLastError());

auto ksk_block_size = glwe_accumulator_size;
@@ -326,7 +225,8 @@
tgemm<Torus, TorusVec>
<<<grid_gemm, threads_gemm, shared_mem_size, stream>>>(
num_lwes, glwe_accumulator_size, lwe_dimension, d_mem_0,
fp_ksk_array + li * ksk_block_size, stride_KSK_buffer, d_mem_1);
fp_ksk_array + li * ksk_block_size, stride_KSK_buffer, d_mem_1,
glwe_accumulator_size);
check_cuda_error(cudaGetLastError());
}
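Both updated call sites pass glwe_accumulator_size for the new stride_C argument, which is the same value already passed as N (the second tgemm argument), so the packing-keyswitch path keeps its previous behaviour; a stride_C larger than N only comes into play for the new one-to-many polynomial product.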

@@ -1,4 +1,5 @@
#include "linearalgebra/multiplication.cuh"
#include "polynomial/polynomial_math.cuh"

/*
* Perform the multiplication of a u32 input LWE ciphertext vector with a u32
Expand Down Expand Up @@ -58,3 +59,24 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
static_cast<const uint64_t *>(cleartext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}

void cuda_wrapping_polynomial_mul_one_to_many_64(
void *stream, uint32_t gpu_index, void *result, void const *poly_lhs,
void const *poly_rhs, uint32_t polynomial_size, uint32_t n_rhs) {

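// The literal 0 below fills the glwe_dimension slot of the shared host
// helper (compare the GLWE variant underneath); presumably it marks a plain
// polynomial with no GLWE structure.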
host_wrapping_polynomial_mul_one_to_many<uint64_t, ulonglong4>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(result), static_cast<uint64_t const *>(poly_lhs),
static_cast<uint64_t const *>(poly_rhs), polynomial_size, 0, n_rhs);
}

void cuda_glwe_wrapping_polynomial_mul_one_to_many_64(
void *stream, uint32_t gpu_index, void *result, void const *glwe_lhs,
void const *poly_rhs, uint32_t polynomial_size, uint32_t glwe_dimension,
uint32_t n_rhs) {
host_glwe_wrapping_polynomial_mul_one_to_many<uint64_t, ulonglong4>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(result), static_cast<uint64_t const *>(glwe_lhs),
static_cast<uint64_t const *>(poly_rhs), polynomial_size, glwe_dimension,
n_rhs);
}
104 changes: 104 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh
@@ -86,4 +86,108 @@ host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
check_cuda_error(cudaGetLastError());
}

const int BLOCK_SIZE_GEMM = 64;
const int THREADS_GEMM = 8;
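// Note: with these constants each block stages (64 * 8 + 8 * 64) Torus values
// in shared memory for the As and Bs tiles below, i.e. 8 KiB for a 64-bit
// Torus, which is presumably what get_shared_mem_size_tgemm() in
// fast_packing_keyswitch.cuh accounts for.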

// Multiply matrices A, B of size (M, K), (K, N) respectively
// with K as the inner dimension.
//
// A block of threads processes blocks of size (BLOCK_SIZE_GEMM,
// BLOCK_SIZE_GEMM), splitting them into multiple tiles: (BLOCK_SIZE_GEMM,
// THREADS_GEMM)-shaped tiles of values from A, and (THREADS_GEMM,
// BLOCK_SIZE_GEMM)-shaped tiles of values from B.
//
// This code is adapted from the 1D block-tiling kernel in
// https://github.com/siboehm/SGEMM_CUDA, generalized to any matrix
// dimension.
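//
// B is read with a row stride of stride_B and C is written with a row stride
// of stride_C, so the output rows can land inside a destination buffer whose
// row stride is larger than the logical width N of C.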
template <typename Torus, typename TorusVec>
__global__ void tgemm(int M, int N, int K, const Torus *A, const Torus *B,
int stride_B, Torus *C, int stride_C) {
Contributor Author: I added a stride_C parameter, since I use this function to write the output matrix in a bigger buffer (a GLWE list) that has a bigger stride than the width of C, which is N.


const int BM = BLOCK_SIZE_GEMM;
const int BN = BLOCK_SIZE_GEMM;
const int BK = THREADS_GEMM;
const int TM = THREADS_GEMM;

const uint cRow = blockIdx.y;
const uint cCol = blockIdx.x;

const int threadCol = threadIdx.x % BN;
const int threadRow = threadIdx.x / BN;

// Allocate space for the current block tile in shared memory
__shared__ Torus As[BM * BK];
__shared__ Torus Bs[BK * BN];

// Initialize the pointers to the input blocks from A, B
// Tiles from these blocks are loaded to shared memory
A += cRow * BM * K;
B += cCol * BN;

// Each thread will handle multiple sub-blocks
const uint innerColA = threadIdx.x % BK;
const uint innerRowA = threadIdx.x / BK;
const uint innerColB = threadIdx.x % BN;
const uint innerRowB = threadIdx.x / BN;

// allocate a thread-local cache for results in the register file
Torus threadResults[TM] = {0};

auto row_A = cRow * BM + innerRowA;
auto col_B = cCol * BN + innerColB;

// For each thread, loop over block tiles
for (uint bkIdx = 0; bkIdx < K; bkIdx += BK) {
auto col_A = bkIdx + innerColA;
auto row_B = bkIdx + innerRowB;

if (row_A < M && col_A < K) {
As[innerRowA * BK + innerColA] = A[innerRowA * K + innerColA];
} else {
As[innerRowA * BK + innerColA] = 0;
}

if (col_B < N && row_B < K) {
Bs[innerRowB * BN + innerColB] = B[innerRowB * stride_B + innerColB];
} else {
Bs[innerRowB * BN + innerColB] = 0;
}
synchronize_threads_in_block();

// Advance blocktile for the next iteration of this loop
A += BK;
B += BK * stride_B;

// calculate per-thread results
for (uint dotIdx = 0; dotIdx < BK; ++dotIdx) {
// we make the dot-product loop the outer loop, which facilitates
// reuse of the Bs entry, which we can cache in a tmp var.
Torus tmp = Bs[dotIdx * BN + threadCol];
for (uint resIdx = 0; resIdx < TM; ++resIdx) {
threadResults[resIdx] +=
As[(threadRow * TM + resIdx) * BK + dotIdx] * tmp;
}
}
synchronize_threads_in_block();
}

// Initialize the pointer to the output block of size (BLOCK_SIZE_GEMM,
// BLOCK_SIZE_GEMM)
C += cRow * BM * stride_C + cCol * BN;

// write out the results
for (uint resIdx = 0; resIdx < TM; ++resIdx) {
int outRow = cRow * BM + threadRow * TM + resIdx;
int outCol = cCol * BN + threadCol;

if (outRow >= M)
continue;
if (outCol >= N)
continue;

C[(threadRow * TM + resIdx) * stride_C + threadCol] +=
threadResults[resIdx];
}
}

#endif // CUDA_MULT_H
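A minimal launch sketch inferred from the kernel's indexing (illustrative only; M, N, K, the d_* device pointers and stream are assumed to exist in the caller, and the grid/block shapes follow from how cRow, cCol and threadIdx.x are used above):

// One block computes a BLOCK_SIZE_GEMM x BLOCK_SIZE_GEMM output tile using
// BLOCK_SIZE_GEMM * THREADS_GEMM threads (threadIdx.x in [0, 512)).
dim3 grid_gemm((N + BLOCK_SIZE_GEMM - 1) / BLOCK_SIZE_GEMM,
               (M + BLOCK_SIZE_GEMM - 1) / BLOCK_SIZE_GEMM);
dim3 threads_gemm(BLOCK_SIZE_GEMM * THREADS_GEMM);
// As and Bs are statically sized in the kernel, so no dynamic shared memory
// is requested in this sketch; the call sites in fast_packing_keyswitch.cuh
// pass get_shared_mem_size_tgemm<Torus>() instead.
tgemm<uint64_t, ulonglong4><<<grid_gemm, threads_gemm, 0, stream>>>(
    M, N, K, d_A, d_B, /*stride_B=*/N, d_C, /*stride_C=*/N);
check_cuda_error(cudaGetLastError());

With stride_C == N this reproduces the behaviour of the kernel before this PR; passing a larger stride_C scatters the GEMM output rows into a wider buffer, as in the GLWE-list case described in the review comment above.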