feat: ICICLE MSM and NTT integration (#18)

**Summary**: This PR updates the existing ICICLE MSM operations to use the latest version of ICICLE v2 and also enables NTT and INTT operations to be executed on the GPU via ICICLE.
zkonduit · Dec 6, 2024 · 0654e92 · 0654e92
1 parent 930970a
commit 0654e92
Show file tree

Hide file tree

Showing 10 changed files with 167 additions and 177 deletions.
diff --git a/halo2_proofs/Cargo.toml b/halo2_proofs/Cargo.toml
@@ -63,7 +63,9 @@ env_logger = "0.10.0"
 rustc-hash = "2.0.0"
 lazy_static = "1.4.0"
 # GPU Icicle integration
-icicle = { git = "https://github.com/ingonyama-zk/icicle.git", branch = "rust/large-bucket-factor-msm", optional = true }
+icicle-core = { git = "https://github.com/ingonyama-zk/icicle", branch="ezkl-icicle2", package="icicle-core", optional = true }
+icicle-bn254 = { git = "https://github.com/ingonyama-zk/icicle", branch="ezkl-icicle2", package="icicle-bn254", optional = true }
+icicle-cuda-runtime = { git = "https://github.com/ingonyama-zk/icicle", branch="ezkl-icicle2", package="icicle-cuda-runtime", optional = true }
 rustacuda = { version = "0.1", optional = true }
 serde_derive = { version = "1", optional = true}
 bincode = { version = "1.3.3", default_features = false }
@@ -107,7 +109,7 @@ sanity-checks = []
 batch = ["rand_core/getrandom"]
 circuit-params = []
 counter = []
-icicle_gpu = ["icicle", "rustacuda"]
+icicle_gpu = ["icicle-cuda-runtime", "icicle-core", "icicle-bn254"]
 mv-lookup = []
 cost-estimator = ["serde_derive"]
 derive_serde = ["halo2curves/derive_serde"]

diff --git a/halo2_proofs/benches/arithmetic.rs b/halo2_proofs/benches/arithmetic.rs
@@ -1,7 +1,7 @@
 #[macro_use]
 extern crate criterion;
 
-use crate::arithmetic::best_multiexp_cpu;
+use crate::arithmetic::best_multiexp;
 use crate::halo2curves::pasta::{EqAffine, Fp};
 use group::ff::Field;
 use halo2_proofs::*;
@@ -27,7 +27,7 @@ fn criterion_benchmark(c: &mut Criterion) {
         c.bench_function("double-and-add", |b| {
             b.iter(|| {
                 for (g_lo, g_hi) in g_lo.iter().zip(g_hi.iter()) {
-                    best_multiexp_cpu(&[black_box(coeff_1), black_box(coeff_2)], &[*g_lo, *g_hi]);
+                    best_multiexp(&[black_box(coeff_1), black_box(coeff_2)], &[*g_lo, *g_hi]);
                 }
             })
         });

diff --git a/halo2_proofs/src/arithmetic.rs b/halo2_proofs/src/arithmetic.rs
@@ -3,15 +3,15 @@
 
 #[cfg(feature = "icicle_gpu")]
 use super::icicle;
+#[cfg(feature = "icicle_gpu")]
+use std::env;
 use super::multicore;
 pub use ff::Field;
 use group::{
     ff::{BatchInvert, PrimeField},
     prime::PrimeCurveAffine,
     Curve, GroupOpsOwned, ScalarMulOwned,
 };
-#[cfg(feature = "icicle_gpu")]
-use rustacuda::prelude::DeviceBuffer;
 
 use halo2curves::msm::msm_best;
 pub use halo2curves::{CurveAffine, CurveExt};
@@ -31,6 +31,24 @@ where
 {
 }
 
+/// Best MSM: takes the ICICLE GPU path when the `icicle_gpu` feature is compiled in and the runtime checks below pass, otherwise the CPU path.
+pub fn best_multiexp<C: CurveAffine>(
+    coeffs: &[C::Scalar], bases: &[C]
+) -> C::Curve {
+    #[cfg(feature = "icicle_gpu")]
+    if env::var("ENABLE_ICICLE_GPU").is_ok() // run-time opt-in: the variable only has to be readable, its value is never inspected
+        && !icicle::should_use_cpu_msm(coeffs.len()) // inputs at or below the ICICLE_SMALL_K size threshold stay on the CPU
+        && icicle::is_gpu_supported_field(&coeffs[0]) // only evaluated once the size check passed, so `coeffs` is non-empty here
+    {
+        best_multiexp_gpu(coeffs, bases)
+    } else {
+        best_multiexp_cpu(coeffs, bases)
+    }
+
+    #[cfg(not(feature = "icicle_gpu"))]
+    best_multiexp_cpu(coeffs, bases) // builds without the feature always use the CPU implementation
+}
+
 // [JPW] Keep this adapter to halo2curves to minimize code changes.
 /// Performs a multi-exponentiation operation.
 ///
@@ -43,15 +61,12 @@ pub fn best_multiexp_cpu<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C
 
 #[cfg(feature = "icicle_gpu")]
 /// Performs a multi-exponentiation operation on GPU using Icicle library
-pub fn best_multiexp_gpu<C: CurveAffine>(coeffs: &[C::Scalar], is_lagrange: bool) -> C::Curve {
-    let scalars_ptr: DeviceBuffer<::icicle::curves::bn254::ScalarField_BN254> =
-        icicle::copy_scalars_to_device::<C>(coeffs);
-
-    return icicle::multiexp_on_device::<C>(scalars_ptr, is_lagrange);
+pub fn best_multiexp_gpu<C: CurveAffine>(coeffs: &[C::Scalar], g: &[C]) -> C::Curve {
+    icicle::multiexp_on_device::<C>(coeffs, g)
 }
 
 /// Dispatcher
-pub fn best_fft<Scalar: Field, G: FftGroup<Scalar>>(
+pub fn best_fft_cpu<Scalar: Field, G: FftGroup<Scalar>>(
     a: &mut [G],
     omega: Scalar,
     log_n: u32,
@@ -61,6 +76,40 @@ pub fn best_fft<Scalar: Field, G: FftGroup<Scalar>>(
     fft::fft(a, omega, log_n, data, inverse);
 }
 
+/// Best FFT: dispatches to the ICICLE GPU NTT when the `icicle_gpu` feature is compiled in and the runtime checks below pass, otherwise to `best_fft_cpu`.
+pub fn best_fft<Scalar: Field + ff::PrimeField, G: FftGroup<Scalar> + ff::PrimeField>(
+    scalars: &mut [G], // transformed in place by whichever backend is chosen
+    omega: Scalar,
+    log_n: u32,
+    data: &FFTData<Scalar>, // forwarded to the CPU path only; the GPU call below does not receive it
+    inverse: bool,
+) {
+    #[cfg(feature = "icicle_gpu")]
+    if env::var("ENABLE_ICICLE_GPU").is_ok() // run-time opt-in: the variable only has to be readable, its value is never inspected
+        && !icicle::should_use_cpu_fft(scalars.len()) // inputs at or below the ICICLE_SMALL_K_FFT size threshold stay on the CPU
+        && icicle::is_gpu_supported_field(&omega) // type check on `Scalar`; the value of `omega` is not read by it
+    {
+        best_fft_gpu(scalars, omega, log_n, inverse);
+    } else {
+        best_fft_cpu(scalars, omega, log_n, data, inverse);
+    }
+
+    #[cfg(not(feature = "icicle_gpu"))]
+    best_fft_cpu(scalars, omega, log_n, data, inverse); // builds without the feature always use the CPU implementation
+}
+
+/// Performs an NTT (or inverse NTT when `inverse` is true) on the GPU by forwarding to `icicle::fft_on_device`.
+#[cfg(feature = "icicle_gpu")]
+pub fn best_fft_gpu<Scalar: Field + ff::PrimeField, G: FftGroup<Scalar> + ff::PrimeField>(
+    a: &mut [G], // values to transform; overwritten with the result
+    omega: Scalar,
+    log_n: u32,
+    inverse: bool,
+) {
+    println!("icicle_fft"); // NOTE(review): prints to stdout on every GPU FFT call — looks like leftover debug output, confirm before release
+    icicle::fft_on_device::<Scalar, G>(a, omega, log_n, inverse);
+}
+
 /// Convert coefficient bases group elements to lagrange basis by inverse FFT.
 pub fn g_to_lagrange<C: PrimeCurveAffine>(g_projective: Vec<C::Curve>, k: u32) -> Vec<C> {
     let n_inv = C::Scalar::TWO_INV.pow_vartime([k as u64, 0, 0, 0]);
@@ -74,7 +123,7 @@ pub fn g_to_lagrange<C: PrimeCurveAffine>(g_projective: Vec<C::Curve>, k: u32) -
     let n = g_lagrange_projective.len();
     let fft_data = FFTData::new(n, omega, omega_inv);
 
-    best_fft(&mut g_lagrange_projective, omega_inv, k, &fft_data, true);
+    best_fft_cpu(&mut g_lagrange_projective, omega_inv, k, &fft_data, true);
     parallelize(&mut g_lagrange_projective, |g, _| {
         for g in g.iter_mut() {
             *g *= n_inv;

diff --git a/halo2_proofs/src/icicle.rs b/halo2_proofs/src/icicle.rs
@@ -1,48 +1,34 @@
 use group::ff::PrimeField;
-use icicle::{
-    curves::bn254::{Point_BN254, ScalarField_BN254},
-    test_bn254::commit_bn254,
-};
-use std::sync::{Arc, Once};
-
-pub use icicle::curves::bn254::PointAffineNoInfinity_BN254;
-use rustacuda::memory::CopyDestination;
-use rustacuda::prelude::*;
-
+use icicle_bn254::curve::{CurveCfg, G1Projective, ScalarField};
+use halo2curves::bn256::Fr as Bn256Fr;
+use icicle_cuda_runtime::memory::{DeviceVec, HostSlice};
+use crate::arithmetic::FftGroup;
+use std::any::{TypeId, Any};
 pub use halo2curves::CurveAffine;
+use icicle_core::{
+    curve::Affine,
+    msm,
+    ntt::{initialize_domain, ntt_inplace, NTTConfig, NTTDir},
+};
+use maybe_rayon::iter::IntoParallelRefIterator;
+use maybe_rayon::iter::ParallelIterator;
 use std::{env, mem};
 
-static mut GPU_CONTEXT: Option<Context> = None;
-static mut GPU_G: Option<DeviceBuffer<PointAffineNoInfinity_BN254>> = None;
-static mut GPU_G_LAGRANGE: Option<DeviceBuffer<PointAffineNoInfinity_BN254>> = None;
-static GPU_INIT: Once = Once::new();
-
 pub fn should_use_cpu_msm(size: usize) -> bool {
     size <= (1
-        << u8::from_str_radix(&env::var("ICICLE_SMALL_K").unwrap_or("8".to_string()), 10).unwrap())
+        << u8::from_str_radix(&env::var("ICICLE_SMALL_K").unwrap_or("2".to_string()), 10).unwrap())
 }
 
-pub fn init_gpu<C: CurveAffine>(g: &[C], g_lagrange: &[C]) {
-    unsafe {
-        GPU_INIT.call_once(|| {
-            GPU_CONTEXT = Some(rustacuda::quick_init().unwrap());
-            GPU_G = Some(copy_points_to_device(g));
-            GPU_G_LAGRANGE = Some(copy_points_to_device(g_lagrange));
-        });
-    }
+pub fn should_use_cpu_fft(size: usize) -> bool { // true when `size` is at most 2^ICICLE_SMALL_K_FFT, i.e. the FFT should stay on the CPU
+    size <= (1
+        << u8::from_str_radix(&env::var("ICICLE_SMALL_K_FFT").unwrap_or("2".to_string()), 10).unwrap()) // exponent falls back to 2 when the variable cannot be read; panics if it is set but is not a decimal u8
 }
 
-fn u32_from_u8(u8_arr: &[u8; 32]) -> [u32; 8] {
-    let mut t = [0u32; 8];
-    for i in 0..8 {
-        t[i] = u32::from_le_bytes([
-            u8_arr[4 * i],
-            u8_arr[4 * i + 1],
-            u8_arr[4 * i + 2],
-            u8_arr[4 * i + 3],
-        ]);
+pub fn is_gpu_supported_field<G: Any>(_sample_element: &G) -> bool {
+    match TypeId::of::<G>() {
+        id if id == TypeId::of::<Bn256Fr>() => true,
+        _ => false,
     }
-    return t;
 }
 
 fn repr_from_u32<C: CurveAffine>(u32_arr: &[u32; 8]) -> <C as CurveAffine>::Base {
@@ -51,96 +37,87 @@ fn repr_from_u32<C: CurveAffine>(u32_arr: &[u32; 8]) -> <C as CurveAffine>::Base
     return PrimeField::from_repr(t[0]).unwrap();
 }
 
-fn is_infinity_point(point: Point_BN254) -> bool {
-    let inf_point = Point_BN254::infinity();
-    point.z.s.eq(&inf_point.z.s)
+fn icicle_scalars_from_c_scalars<G: PrimeField>(coeffs: &[G]) -> Vec<ScalarField> { // converts each field element to an ICICLE `ScalarField`, in parallel, preserving order
+    coeffs.par_iter().map(|coef| {
+        let repr: [u32; 8] = unsafe { mem::transmute_copy(&coef.to_repr()) }; // SAFETY(review): copies 32 bytes out of `G::Repr`; assumes the repr is at least 32 bytes and already in the limb order ICICLE expects — TODO confirm for every `G`
+        ScalarField::from(repr)
+    }).collect()
 }
 
-fn icicle_scalars_from_c<C: CurveAffine>(coeffs: &[C::Scalar]) -> Vec<ScalarField_BN254> {
-    let _coeffs = [Arc::new(
-        coeffs.iter().map(|x| x.to_repr()).collect::<Vec<_>>(),
-    )];
-
-    let _coeffs: &Arc<Vec<[u32; 8]>> = unsafe { mem::transmute(&_coeffs) };
-    _coeffs
-        .iter()
-        .map(|x| ScalarField_BN254::from_limbs(x))
-        .collect::<Vec<_>>()
+fn c_scalars_from_icicle_scalars<G: PrimeField>(scalars: &[ScalarField]) -> Vec<G> { // inverse of the conversion above: ICICLE scalars back to `G`, in parallel, preserving order
+    scalars.par_iter().map(|scalar| {
+        let repr: G::Repr = unsafe { mem::transmute_copy(scalar) }; // SAFETY(review): copies size_of::<G::Repr>() bytes out of a `ScalarField`; assumes `ScalarField` is at least that large with a matching byte layout — TODO confirm
+        G::from_repr(repr).unwrap() // panics if the bytes are not a valid representation of an element of `G`
+    }).collect()
 }
 
-pub fn copy_scalars_to_device<C: CurveAffine>(
-    coeffs: &[C::Scalar],
-) -> DeviceBuffer<ScalarField_BN254> {
-    let scalars = icicle_scalars_from_c::<C>(coeffs);
-
-    DeviceBuffer::from_slice(scalars.as_slice()).unwrap()
-}
+fn icicle_points_from_c<C: CurveAffine>(bases: &[C]) -> Vec<Affine<CurveCfg>> {
+    bases.par_iter().map(|p| {
+        let coordinates = p.coordinates().unwrap();
+        let x_repr: [u32; 8] = unsafe { mem::transmute_copy(&coordinates.x().to_repr()) };
+        let y_repr: [u32; 8] = unsafe { mem::transmute_copy(&coordinates.y().to_repr()) };
 
-fn icicle_points_from_c<C: CurveAffine>(bases: &[C]) -> Vec<PointAffineNoInfinity_BN254> {
-    let _bases = [Arc::new(
-        bases
-            .iter()
-            .map(|p| {
-                let coordinates = p.coordinates().unwrap();
-                [coordinates.x().to_repr(), coordinates.y().to_repr()]
-            })
-            .collect::<Vec<_>>(),
-    )];
-
-    let _bases: &Arc<Vec<[[u8; 32]; 2]>> = unsafe { mem::transmute(&_bases) };
-    _bases
-        .iter()
-        .map(|x| {
-            let tx = u32_from_u8(&x[0]);
-            let ty = u32_from_u8(&x[1]);
-            PointAffineNoInfinity_BN254::from_limbs(&tx, &ty)
-        })
-        .collect::<Vec<_>>()
+        Affine::<CurveCfg>::from_limbs(x_repr, y_repr)
+    }).collect()
 }
 
-pub fn copy_points_to_device<C: CurveAffine>(
-    bases: &[C],
-) -> DeviceBuffer<PointAffineNoInfinity_BN254> {
-    let points = icicle_points_from_c(bases);
+fn c_from_icicle_point<C: CurveAffine>(point: &G1Projective) -> C::Curve {
+    let (x, y) = {
+        let affine: Affine<CurveCfg> = Affine::<CurveCfg>::from(*point);
 
-    DeviceBuffer::from_slice(points.as_slice()).unwrap()
-}
-
-fn c_from_icicle_point<C: CurveAffine>(commit_res: Point_BN254) -> C::Curve {
-    let (x, y) = if is_infinity_point(commit_res) {
         (
-            repr_from_u32::<C>(&[0u32; 8]),
-            repr_from_u32::<C>(&[0u32; 8]),
-        )
-    } else {
-        let affine_res_from_cuda = commit_res.to_affine();
-        (
-            repr_from_u32::<C>(&affine_res_from_cuda.x.s),
-            repr_from_u32::<C>(&affine_res_from_cuda.y.s),
+            repr_from_u32::<C>(&affine.x.into()),
+            repr_from_u32::<C>(&affine.y.into()),
         )
     };
 
-    let affine = C::from_xy(x, y).unwrap();
-    return affine.to_curve();
+    let affine = C::from_xy(x, y);
+
+    return affine.unwrap().to_curve();
 }
 
-pub fn multiexp_on_device<C: CurveAffine>(
-    mut coeffs: DeviceBuffer<ScalarField_BN254>,
-    is_lagrange: bool,
-) -> C::Curve {
-    let base_ptr: &mut DeviceBuffer<PointAffineNoInfinity_BN254>;
-    unsafe {
-        if is_lagrange {
-            base_ptr = GPU_G_LAGRANGE.as_mut().unwrap();
-        } else {
-            base_ptr = GPU_G.as_mut().unwrap();
-        };
-    }
+pub fn multiexp_on_device<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C::Curve {
+    let binding = icicle_scalars_from_c_scalars::<C::ScalarExt>(coeffs);
+    let coeffs = HostSlice::from_slice(&binding[..]);
+    let binding = icicle_points_from_c(bases);
+    let bases = HostSlice::from_slice(&binding[..]);
+
+    let mut msm_results = DeviceVec::<G1Projective>::cuda_malloc(1).unwrap();
+    let cfg = msm::MSMConfig::default();
 
-    let d_commit_result = commit_bn254(base_ptr, &mut coeffs, 10);
+    msm::msm(coeffs, bases, &cfg, &mut msm_results[..]).unwrap();
 
-    let mut h_commit_result = Point_BN254::zero();
-    d_commit_result.copy_to(&mut h_commit_result).unwrap();
+    let mut msm_host_result = vec![G1Projective::zero(); 1];
+    msm_results
+        .copy_to_host(HostSlice::from_mut_slice(&mut msm_host_result[..]))
+        .unwrap();
 
-    c_from_icicle_point::<C>(h_commit_result)
+    let msm_point = c_from_icicle_point::<C>(&msm_host_result[0]);
+
+    msm_point
 }
+
+pub fn fft_on_device<Scalar: ff::PrimeField, G: FftGroup<Scalar> + ff::PrimeField>( // runs an ICICLE NTT over `scalars` and writes the result back into the same slice
+    scalars: &mut [G], // input values; overwritten with the transform output
+    omega: Scalar, // converted to an ICICLE scalar and handed to `initialize_domain`
+    _log_n: u32, // not read by this function (hence the underscore prefix)
+    inverse: bool
+) {
+    let cfg = NTTConfig::<'_, ScalarField>::default();
+    let dir = if inverse { NTTDir::kInverse } else { NTTDir::kForward }; // `inverse` selects the transform direction
+
+    let omega = icicle_scalars_from_c_scalars(&[omega]); // one-element conversion; only `omega[0]` is used
+    initialize_domain(omega[0], &cfg.ctx, true).unwrap(); // NOTE(review): executed on every call; the meaning of the `true` flag is not visible here — confirm against the icicle-core docs
+
+    let mut icicle_scalars: Vec<ScalarField> = icicle_scalars_from_c_scalars(scalars); // host-side copy in ICICLE's scalar type
+    let host_scalars = HostSlice::from_mut_slice(&mut icicle_scalars);
+
+    ntt_inplace::<ScalarField, ScalarField>(
+        host_scalars,
+        dir,
+        &cfg,
+    ).unwrap(); // any error reported by ICICLE becomes a panic
+
+    let c_scalars = &c_scalars_from_icicle_scalars::<G>(&mut host_scalars.as_slice())[..]; // convert the transformed values back to `G`; NOTE(review): the `&mut` on the borrowed slice looks unnecessary
+    scalars.copy_from_slice(&c_scalars); // one output element per input element, so the lengths match
+}