Skip to content

Commit

Permalink
feat: ICICLE MSM and NTT integration (#18)
Browse files Browse the repository at this point in the history
**Summary**:
This PR updates the existing ICICLE MSM operations to use the latest
version of ICICLE v2 and also enables NTT and INTT operations to be
executed on the GPU via ICICLE.
  • Loading branch information
emirsoyturk authored Dec 6, 2024
1 parent 930970a commit 0654e92
Show file tree
Hide file tree
Showing 10 changed files with 167 additions and 177 deletions.
6 changes: 4 additions & 2 deletions halo2_proofs/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,9 @@ env_logger = "0.10.0"
rustc-hash = "2.0.0"
lazy_static = "1.4.0"
# GPU Icicle integration
icicle = { git = "https://github.com/ingonyama-zk/icicle.git", branch = "rust/large-bucket-factor-msm", optional = true }
icicle-core = { git = "https://github.com/ingonyama-zk/icicle", branch="ezkl-icicle2", package="icicle-core", optional = true }
icicle-bn254 = { git = "https://github.com/ingonyama-zk/icicle", branch="ezkl-icicle2", package="icicle-bn254", optional = true }
icicle-cuda-runtime = { git = "https://github.com/ingonyama-zk/icicle", branch="ezkl-icicle2", package="icicle-cuda-runtime", optional = true }
rustacuda = { version = "0.1", optional = true }
serde_derive = { version = "1", optional = true}
bincode = { version = "1.3.3", default_features = false }
Expand Down Expand Up @@ -107,7 +109,7 @@ sanity-checks = []
batch = ["rand_core/getrandom"]
circuit-params = []
counter = []
icicle_gpu = ["icicle", "rustacuda"]
icicle_gpu = ["icicle-cuda-runtime", "icicle-core", "icicle-bn254"]
mv-lookup = []
cost-estimator = ["serde_derive"]
derive_serde = ["halo2curves/derive_serde"]
Expand Down
4 changes: 2 additions & 2 deletions halo2_proofs/benches/arithmetic.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#[macro_use]
extern crate criterion;

use crate::arithmetic::best_multiexp_cpu;
use crate::arithmetic::best_multiexp;
use crate::halo2curves::pasta::{EqAffine, Fp};
use group::ff::Field;
use halo2_proofs::*;
Expand All @@ -27,7 +27,7 @@ fn criterion_benchmark(c: &mut Criterion) {
c.bench_function("double-and-add", |b| {
b.iter(|| {
for (g_lo, g_hi) in g_lo.iter().zip(g_hi.iter()) {
best_multiexp_cpu(&[black_box(coeff_1), black_box(coeff_2)], &[*g_lo, *g_hi]);
best_multiexp(&[black_box(coeff_1), black_box(coeff_2)], &[*g_lo, *g_hi]);
}
})
});
Expand Down
67 changes: 58 additions & 9 deletions halo2_proofs/src/arithmetic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@
#[cfg(feature = "icicle_gpu")]
use super::icicle;
#[cfg(feature = "icicle_gpu")]
use std::env;
use super::multicore;
pub use ff::Field;
use group::{
ff::{BatchInvert, PrimeField},
prime::PrimeCurveAffine,
Curve, GroupOpsOwned, ScalarMulOwned,
};
#[cfg(feature = "icicle_gpu")]
use rustacuda::prelude::DeviceBuffer;

use halo2curves::msm::msm_best;
pub use halo2curves::{CurveAffine, CurveExt};
Expand All @@ -31,6 +31,24 @@ where
{
}

/// Performs a multi-scalar multiplication, picking the GPU or CPU backend.
///
/// When the crate is built with the `icicle_gpu` feature, the GPU path
/// ([`best_multiexp_gpu`]) is taken only if all of the following hold:
/// the `ENABLE_ICICLE_GPU` environment variable is set,
/// `icicle::should_use_cpu_msm` does not ask for the CPU at this input size,
/// and `icicle::is_gpu_supported_field` accepts the scalar type.
/// In every other case the work is delegated to [`best_multiexp_cpu`].
pub fn best_multiexp<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C::Curve {
    #[cfg(feature = "icicle_gpu")]
    {
        // The checks short-circuit in this order, so `coeffs[0]` is only read
        // after the size check has declined the CPU path.
        let use_gpu = env::var("ENABLE_ICICLE_GPU").is_ok()
            && !icicle::should_use_cpu_msm(coeffs.len())
            && icicle::is_gpu_supported_field(&coeffs[0]);
        if use_gpu {
            return best_multiexp_gpu(coeffs, bases);
        }
    }

    best_multiexp_cpu(coeffs, bases)
}

// [JPW] Keep this adapter to halo2curves to minimize code changes.
/// Performs a multi-exponentiation operation.
///
Expand All @@ -43,15 +61,12 @@ pub fn best_multiexp_cpu<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C

#[cfg(feature = "icicle_gpu")]
/// Performs a multi-exponentiation operation on GPU using Icicle library
pub fn best_multiexp_gpu<C: CurveAffine>(coeffs: &[C::Scalar], is_lagrange: bool) -> C::Curve {
let scalars_ptr: DeviceBuffer<::icicle::curves::bn254::ScalarField_BN254> =
icicle::copy_scalars_to_device::<C>(coeffs);

return icicle::multiexp_on_device::<C>(scalars_ptr, is_lagrange);
pub fn best_multiexp_gpu<C: CurveAffine>(coeffs: &[C::Scalar], g: &[C]) -> C::Curve {
icicle::multiexp_on_device::<C>(coeffs, g)
}

/// Dispatcher
pub fn best_fft<Scalar: Field, G: FftGroup<Scalar>>(
pub fn best_fft_cpu<Scalar: Field, G: FftGroup<Scalar>>(
a: &mut [G],
omega: Scalar,
log_n: u32,
Expand All @@ -61,6 +76,40 @@ pub fn best_fft<Scalar: Field, G: FftGroup<Scalar>>(
fft::fft(a, omega, log_n, data, inverse);
}

/// Performs an in-place FFT over `scalars`, picking the GPU or CPU backend.
///
/// When the crate is built with the `icicle_gpu` feature, the GPU path
/// ([`best_fft_gpu`]) is taken only if all of the following hold:
/// the `ENABLE_ICICLE_GPU` environment variable is set,
/// `icicle::should_use_cpu_fft` does not ask for the CPU at this input size,
/// and `icicle::is_gpu_supported_field` accepts the type of `omega`.
/// In every other case the work is delegated to [`best_fft_cpu`].
///
/// `data` is only consumed by the CPU implementation.
pub fn best_fft<Scalar: Field + ff::PrimeField, G: FftGroup<Scalar> + ff::PrimeField>(
    scalars: &mut [G],
    omega: Scalar,
    log_n: u32,
    data: &FFTData<Scalar>,
    inverse: bool,
) {
    #[cfg(feature = "icicle_gpu")]
    {
        let use_gpu = env::var("ENABLE_ICICLE_GPU").is_ok()
            && !icicle::should_use_cpu_fft(scalars.len())
            && icicle::is_gpu_supported_field(&omega);
        if use_gpu {
            best_fft_gpu(scalars, omega, log_n, inverse);
            return;
        }
    }

    best_fft_cpu(scalars, omega, log_n, data, inverse);
}

/// Performs an NTT on the GPU using the ICICLE library.
///
/// Thin adapter over `icicle::fft_on_device`: `a` is transformed in place,
/// `omega` is the root of unity handed to the backend, and `inverse` selects
/// the inverse direction. `log_n` is forwarded unchanged.
///
/// This function writes nothing to stdout; a leftover debug print was removed
/// because it fired on every GPU transform.
#[cfg(feature = "icicle_gpu")]
pub fn best_fft_gpu<Scalar: Field + ff::PrimeField, G: FftGroup<Scalar> + ff::PrimeField>(
    a: &mut [G],
    omega: Scalar,
    log_n: u32,
    inverse: bool,
) {
    icicle::fft_on_device::<Scalar, G>(a, omega, log_n, inverse);
}

/// Convert coefficient bases group elements to lagrange basis by inverse FFT.
pub fn g_to_lagrange<C: PrimeCurveAffine>(g_projective: Vec<C::Curve>, k: u32) -> Vec<C> {
let n_inv = C::Scalar::TWO_INV.pow_vartime([k as u64, 0, 0, 0]);
Expand All @@ -74,7 +123,7 @@ pub fn g_to_lagrange<C: PrimeCurveAffine>(g_projective: Vec<C::Curve>, k: u32) -
let n = g_lagrange_projective.len();
let fft_data = FFTData::new(n, omega, omega_inv);

best_fft(&mut g_lagrange_projective, omega_inv, k, &fft_data, true);
best_fft_cpu(&mut g_lagrange_projective, omega_inv, k, &fft_data, true);
parallelize(&mut g_lagrange_projective, |g, _| {
for g in g.iter_mut() {
*g *= n_inv;
Expand Down
195 changes: 86 additions & 109 deletions halo2_proofs/src/icicle.rs
Original file line number Diff line number Diff line change
@@ -1,48 +1,34 @@
use group::ff::PrimeField;
use icicle::{
curves::bn254::{Point_BN254, ScalarField_BN254},
test_bn254::commit_bn254,
};
use std::sync::{Arc, Once};

pub use icicle::curves::bn254::PointAffineNoInfinity_BN254;
use rustacuda::memory::CopyDestination;
use rustacuda::prelude::*;

use icicle_bn254::curve::{CurveCfg, G1Projective, ScalarField};
use halo2curves::bn256::Fr as Bn256Fr;
use icicle_cuda_runtime::memory::{DeviceVec, HostSlice};
use crate::arithmetic::FftGroup;
use std::any::{TypeId, Any};
pub use halo2curves::CurveAffine;
use icicle_core::{
curve::Affine,
msm,
ntt::{initialize_domain, ntt_inplace, NTTConfig, NTTDir},
};
use maybe_rayon::iter::IntoParallelRefIterator;
use maybe_rayon::iter::ParallelIterator;
use std::{env, mem};

static mut GPU_CONTEXT: Option<Context> = None;
static mut GPU_G: Option<DeviceBuffer<PointAffineNoInfinity_BN254>> = None;
static mut GPU_G_LAGRANGE: Option<DeviceBuffer<PointAffineNoInfinity_BN254>> = None;
static GPU_INIT: Once = Once::new();

pub fn should_use_cpu_msm(size: usize) -> bool {
size <= (1
<< u8::from_str_radix(&env::var("ICICLE_SMALL_K").unwrap_or("8".to_string()), 10).unwrap())
<< u8::from_str_radix(&env::var("ICICLE_SMALL_K").unwrap_or("2".to_string()), 10).unwrap())
}

pub fn init_gpu<C: CurveAffine>(g: &[C], g_lagrange: &[C]) {
unsafe {
GPU_INIT.call_once(|| {
GPU_CONTEXT = Some(rustacuda::quick_init().unwrap());
GPU_G = Some(copy_points_to_device(g));
GPU_G_LAGRANGE = Some(copy_points_to_device(g_lagrange));
});
}
/// Returns `true` when an FFT over `size` elements should stay on the CPU.
///
/// The cut-off is `2^k` elements (inclusive), where `k` is read from the
/// `ICICLE_SMALL_K_FFT` environment variable and defaults to `2`.
///
/// A missing or non-numeric value falls back to the default instead of
/// panicking. An exponent that does not fit in `usize` saturates the
/// threshold, so every size is kept on the CPU.
pub fn should_use_cpu_fft(size: usize) -> bool {
    const DEFAULT_SMALL_K: u32 = 2;

    let k = env::var("ICICLE_SMALL_K_FFT")
        .ok()
        .and_then(|raw| raw.parse::<u32>().ok())
        .unwrap_or(DEFAULT_SMALL_K);

    // `checked_shl` returns `None` once `k` reaches the bit width of `usize`;
    // a plain `1 << k` would overflow there.
    let threshold = 1usize.checked_shl(k).unwrap_or(usize::MAX);
    size <= threshold
}

fn u32_from_u8(u8_arr: &[u8; 32]) -> [u32; 8] {
let mut t = [0u32; 8];
for i in 0..8 {
t[i] = u32::from_le_bytes([
u8_arr[4 * i],
u8_arr[4 * i + 1],
u8_arr[4 * i + 2],
u8_arr[4 * i + 3],
]);
pub fn is_gpu_supported_field<G: Any>(_sample_element: &G) -> bool {
match TypeId::of::<G>() {
id if id == TypeId::of::<Bn256Fr>() => true,
_ => false,
}
return t;
}

fn repr_from_u32<C: CurveAffine>(u32_arr: &[u32; 8]) -> <C as CurveAffine>::Base {
Expand All @@ -51,96 +37,87 @@ fn repr_from_u32<C: CurveAffine>(u32_arr: &[u32; 8]) -> <C as CurveAffine>::Base
return PrimeField::from_repr(t[0]).unwrap();
}

fn is_infinity_point(point: Point_BN254) -> bool {
let inf_point = Point_BN254::infinity();
point.z.s.eq(&inf_point.z.s)
/// Converts field elements into ICICLE `ScalarField` values.
///
/// Each element's canonical byte representation (`to_repr`) is split into
/// eight little-endian `u32` limbs, which are then handed to
/// `ScalarField::from`. The conversion runs in parallel over `coeffs`.
///
/// # Panics
///
/// Panics if the representation of `G` is not exactly 32 bytes long; only
/// 256-bit representations map onto the eight-limb layout used here.
fn icicle_scalars_from_c_scalars<G: PrimeField>(coeffs: &[G]) -> Vec<ScalarField> {
    coeffs
        .par_iter()
        .map(|coef| {
            let repr = coef.to_repr();
            let bytes: &[u8] = repr.as_ref();
            // A raw reinterpretation would read out of bounds for a shorter
            // representation, so the length is checked explicitly.
            assert_eq!(
                bytes.len(),
                32,
                "field representation must be 32 bytes to convert to ICICLE limbs"
            );

            let mut limbs = [0u32; 8];
            for (limb, chunk) in limbs.iter_mut().zip(bytes.chunks_exact(4)) {
                // Explicit little-endian decoding keeps the limb values
                // independent of the host byte order.
                *limb = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
            }
            ScalarField::from(limbs)
        })
        .collect()
}

fn icicle_scalars_from_c<C: CurveAffine>(coeffs: &[C::Scalar]) -> Vec<ScalarField_BN254> {
let _coeffs = [Arc::new(
coeffs.iter().map(|x| x.to_repr()).collect::<Vec<_>>(),
)];

let _coeffs: &Arc<Vec<[u32; 8]>> = unsafe { mem::transmute(&_coeffs) };
_coeffs
.iter()
.map(|x| ScalarField_BN254::from_limbs(x))
.collect::<Vec<_>>()
/// Converts ICICLE `ScalarField` values back into field elements of type `G`.
///
/// Each scalar's in-memory bytes are reinterpreted as `G::Repr` and decoded
/// with `G::from_repr`. The conversion runs in parallel over `scalars`.
///
/// # Panics
///
/// Panics if `G::Repr` and `ScalarField` differ in size, or if a
/// reinterpreted value is rejected by `G::from_repr`.
fn c_scalars_from_icicle_scalars<G: PrimeField>(scalars: &[ScalarField]) -> Vec<G> {
    // `transmute_copy` performs no size check of its own; reading a `G::Repr`
    // out of a smaller `ScalarField` would be an out-of-bounds read.
    assert_eq!(
        mem::size_of::<G::Repr>(),
        mem::size_of::<ScalarField>(),
        "field representation and ICICLE scalar must have the same size"
    );

    scalars
        .par_iter()
        .map(|scalar| {
            // SAFETY: the sizes were checked to be equal above, so exactly
            // `size_of::<G::Repr>()` initialized bytes are read from `scalar`.
            // NOTE(review): this also assumes the ICICLE limb layout matches
            // the byte order `G::from_repr` expects — confirm for the target.
            let repr: G::Repr = unsafe { mem::transmute_copy(scalar) };
            G::from_repr(repr).unwrap()
        })
        .collect()
}

pub fn copy_scalars_to_device<C: CurveAffine>(
coeffs: &[C::Scalar],
) -> DeviceBuffer<ScalarField_BN254> {
let scalars = icicle_scalars_from_c::<C>(coeffs);

DeviceBuffer::from_slice(scalars.as_slice()).unwrap()
}
fn icicle_points_from_c<C: CurveAffine>(bases: &[C]) -> Vec<Affine<CurveCfg>> {
bases.par_iter().map(|p| {
let coordinates = p.coordinates().unwrap();
let x_repr: [u32; 8] = unsafe { mem::transmute_copy(&coordinates.x().to_repr()) };
let y_repr: [u32; 8] = unsafe { mem::transmute_copy(&coordinates.y().to_repr()) };

fn icicle_points_from_c<C: CurveAffine>(bases: &[C]) -> Vec<PointAffineNoInfinity_BN254> {
let _bases = [Arc::new(
bases
.iter()
.map(|p| {
let coordinates = p.coordinates().unwrap();
[coordinates.x().to_repr(), coordinates.y().to_repr()]
})
.collect::<Vec<_>>(),
)];

let _bases: &Arc<Vec<[[u8; 32]; 2]>> = unsafe { mem::transmute(&_bases) };
_bases
.iter()
.map(|x| {
let tx = u32_from_u8(&x[0]);
let ty = u32_from_u8(&x[1]);
PointAffineNoInfinity_BN254::from_limbs(&tx, &ty)
})
.collect::<Vec<_>>()
Affine::<CurveCfg>::from_limbs(x_repr, y_repr)
}).collect()
}

pub fn copy_points_to_device<C: CurveAffine>(
bases: &[C],
) -> DeviceBuffer<PointAffineNoInfinity_BN254> {
let points = icicle_points_from_c(bases);
fn c_from_icicle_point<C: CurveAffine>(point: &G1Projective) -> C::Curve {
let (x, y) = {
let affine: Affine<CurveCfg> = Affine::<CurveCfg>::from(*point);

DeviceBuffer::from_slice(points.as_slice()).unwrap()
}

fn c_from_icicle_point<C: CurveAffine>(commit_res: Point_BN254) -> C::Curve {
let (x, y) = if is_infinity_point(commit_res) {
(
repr_from_u32::<C>(&[0u32; 8]),
repr_from_u32::<C>(&[0u32; 8]),
)
} else {
let affine_res_from_cuda = commit_res.to_affine();
(
repr_from_u32::<C>(&affine_res_from_cuda.x.s),
repr_from_u32::<C>(&affine_res_from_cuda.y.s),
repr_from_u32::<C>(&affine.x.into()),
repr_from_u32::<C>(&affine.y.into()),
)
};

let affine = C::from_xy(x, y).unwrap();
return affine.to_curve();
let affine = C::from_xy(x, y);

return affine.unwrap().to_curve();
}

pub fn multiexp_on_device<C: CurveAffine>(
mut coeffs: DeviceBuffer<ScalarField_BN254>,
is_lagrange: bool,
) -> C::Curve {
let base_ptr: &mut DeviceBuffer<PointAffineNoInfinity_BN254>;
unsafe {
if is_lagrange {
base_ptr = GPU_G_LAGRANGE.as_mut().unwrap();
} else {
base_ptr = GPU_G.as_mut().unwrap();
};
}
pub fn multiexp_on_device<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C::Curve {
let binding = icicle_scalars_from_c_scalars::<C::ScalarExt>(coeffs);
let coeffs = HostSlice::from_slice(&binding[..]);
let binding = icicle_points_from_c(bases);
let bases = HostSlice::from_slice(&binding[..]);

let mut msm_results = DeviceVec::<G1Projective>::cuda_malloc(1).unwrap();
let cfg = msm::MSMConfig::default();

let d_commit_result = commit_bn254(base_ptr, &mut coeffs, 10);
msm::msm(coeffs, bases, &cfg, &mut msm_results[..]).unwrap();

let mut h_commit_result = Point_BN254::zero();
d_commit_result.copy_to(&mut h_commit_result).unwrap();
let mut msm_host_result = vec![G1Projective::zero(); 1];
msm_results
.copy_to_host(HostSlice::from_mut_slice(&mut msm_host_result[..]))
.unwrap();

c_from_icicle_point::<C>(h_commit_result)
let msm_point = c_from_icicle_point::<C>(&msm_host_result[0]);

msm_point
}

/// Runs an in-place NTT over `scalars` through the ICICLE backend.
///
/// The input is converted to ICICLE `ScalarField` values, transformed with
/// `ntt_inplace` in the direction selected by `inverse`, converted back to
/// `G`, and copied over the original slice. `_log_n` is accepted for
/// signature compatibility and is not used.
///
/// # Panics
///
/// Panics if domain initialization or the NTT call returns an error, or if
/// converting a result back to `G` fails.
pub fn fft_on_device<Scalar: ff::PrimeField, G: FftGroup<Scalar> + ff::PrimeField>(
scalars: &mut [G],
omega: Scalar,
_log_n: u32,
inverse: bool
) {
let cfg = NTTConfig::<'_, ScalarField>::default();
let dir = if inverse { NTTDir::kInverse } else { NTTDir::kForward };

// The domain is (re)initialized from `omega` on every call.
// NOTE(review): confirm that repeated initialization is intended and cheap
// enough, and what the trailing `true` flag selects in this ICICLE version.
// NOTE(review): confirm which root callers pass for inverse transforms and
// whether the library's inverse direction applies its own inversion/scaling.
let omega = icicle_scalars_from_c_scalars(&[omega]);
initialize_domain(omega[0], &cfg.ctx, true).unwrap();

// Work on a converted copy; the caller's slice is only overwritten at the end.
let mut icicle_scalars: Vec<ScalarField> = icicle_scalars_from_c_scalars(scalars);
let host_scalars = HostSlice::from_mut_slice(&mut icicle_scalars);

ntt_inplace::<ScalarField, ScalarField>(
host_scalars,
dir,
&cfg,
).unwrap();

// Convert the transformed values back to `G` and write them into `scalars`.
let c_scalars = &c_scalars_from_icicle_scalars::<G>(&mut host_scalars.as_slice())[..];
scalars.copy_from_slice(&c_scalars);
}
Loading

0 comments on commit 0654e92

Please sign in to comment.