From 885188fae6f911d9a6c2f52c3ec223b85c7644c1 Mon Sep 17 00:00:00 2001 From: huyanlin Date: Tue, 14 Jun 2022 18:00:31 +0800 Subject: [PATCH 01/13] gpu acceleration for fft --- halo2_proofs/Cargo.toml | 18 ++- halo2_proofs/src/lib.rs | 10 +- halo2_proofs/src/multicore.rs | 205 ++++++++++++++++++++++++++++++++ halo2_proofs/src/poly.rs | 2 +- halo2_proofs/src/poly/domain.rs | 128 ++++++++++++++++++++ 5 files changed, 359 insertions(+), 4 deletions(-) diff --git a/halo2_proofs/Cargo.toml b/halo2_proofs/Cargo.toml index a0ea8a1cb0..80be358855 100644 --- a/halo2_proofs/Cargo.toml +++ b/halo2_proofs/Cargo.toml @@ -39,6 +39,21 @@ blake2b_simd = "1" pairing = { git = 'https://github.com/appliedzkp/pairing', package = "pairing_bn256", "tag" = "v0.1.1"} subtle = "2.3" cfg-if = "0.1" +ark-std = { version = "0.3", features = ["print-trace"] } + +log = "0.4.8" +thiserror = "1.0.10" +crossbeam-channel = "0.5.0" +crossbeam = {version = "0.7", optional = true} +lazy_static = {version = "1", optional = true} +futures = {package = "futures", version = "0.3", default_features = false, features = ["executor"]} +num_cpus = "1" +env_logger = "0.8.1" + +# gpu feature +rust-gpu-tools = { version = "0.3.0", optional = true } +ff-cl-gen = { path = "../../ff-cl-gen", version = "0.2.0", optional = true } +fs2 = { version = "0.4.3", optional = true } # Developer tooling dependencies plotters = { version = "0.3.0", optional = true } @@ -62,11 +77,12 @@ rand_core = { version = "0.6", default-features = false, features = ["getrandom" getrandom = { version = "0.2", features = ["js"] } [features] -default = ["shplonk"] +default = ["shplonk", "gpu"] dev-graph = ["plotters", "tabbycat"] gadget-traces = ["backtrace"] sanity-checks = [] shplonk = [] +gpu = ["rust-gpu-tools", "ff-cl-gen", "fs2", "lazy_static", "crossbeam", "futures/thread-pool"] gwc = [] [lib] diff --git a/halo2_proofs/src/lib.rs b/halo2_proofs/src/lib.rs index 2623a93833..920a3c38fd 100644 --- a/halo2_proofs/src/lib.rs +++ b/halo2_proofs/src/lib.rs @@ -18,8 +18,8 @@ clippy::upper_case_acronyms )] #![deny(broken_intra_doc_links)] -#![deny(missing_debug_implementations)] -#![deny(missing_docs)] +//#![deny(missing_debug_implementations)] +//#![deny(missing_docs)] #![deny(unsafe_code)] // Remove this once we update pasta_curves #![allow(unused_imports)] @@ -28,9 +28,15 @@ pub mod arithmetic; pub mod circuit; pub use pairing; mod multicore; +pub mod worker { + pub use super::multicore::*; +} pub mod plonk; +pub mod gpu; pub mod poly; pub mod transcript; pub mod dev; mod helpers; +#[macro_use] +extern crate lazy_static; diff --git a/halo2_proofs/src/multicore.rs b/halo2_proofs/src/multicore.rs index a22eac9e79..1e379506d0 100644 --- a/halo2_proofs/src/multicore.rs +++ b/halo2_proofs/src/multicore.rs @@ -3,3 +3,208 @@ //! be extended in the future to allow for various parallelism strategies. pub use rayon::{current_num_threads, scope, Scope}; + +use crossbeam_channel::{bounded, Receiver}; +use lazy_static::lazy_static; +use log::{error, trace}; +use std::env; + +use std::sync::atomic::{AtomicUsize, Ordering}; + +static WORKER_SPAWN_COUNTER: AtomicUsize = AtomicUsize::new(0); + +#[deny(missing_docs)] +lazy_static! { + static ref NUM_CPUS: usize = if let Ok(num) = env::var("BELLMAN_NUM_CPUS") { + if let Ok(num) = num.parse() { + num + } else { + num_cpus::get() + } + } else { + num_cpus::get() + }; + // See Worker::compute below for a description of this. 
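+    // WORKER_SPAWN_MAX_COUNT caps how many closures may be queued through `compute`
+    // before new work is executed inline via `THREAD_POOL.install` instead of `spawn`.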
+ static ref WORKER_SPAWN_MAX_COUNT: usize = *NUM_CPUS * 4; + pub static ref THREAD_POOL: rayon::ThreadPool = rayon::ThreadPoolBuilder::new() + .num_threads(*NUM_CPUS) + .build() + .unwrap(); +} + +#[derive(Clone)] +pub struct Worker {} + +impl Worker { + pub fn new() -> Worker { + Worker {} + } + + pub fn get_num_cpus(&self) -> usize { + *NUM_CPUS + } + + pub fn log_num_cpus(&self) -> u32 { + log2_floor(*NUM_CPUS) + } + + pub fn compute(&self, f: F) -> Waiter + where + F: FnOnce() -> R + Send + 'static, + R: Send + 'static, + { + let (sender, receiver) = bounded(1); + + let thread_index = if THREAD_POOL.current_thread_index().is_some() { + THREAD_POOL.current_thread_index().unwrap() + } else { + 0 + }; + + // We keep track here of how many times spawn has been called. + // It can be called without limit, each time, putting a + // request for a new thread to execute a method on the + // ThreadPool. However, if we allow it to be called without + // limits, we run the risk of memory exhaustion due to limited + // stack space consumed by all of the pending closures to be + // executed. + let previous_count = WORKER_SPAWN_COUNTER.fetch_add(1, Ordering::SeqCst); + + // If the number of spawns requested has exceeded the number + // of cores available for processing by some factor (the + // default being 4), instead of requesting that we spawn a new + // thread, we instead execute the closure in the context of an + // install call to help clear the growing work queue and + // minimize the chances of memory exhaustion. + if previous_count > *WORKER_SPAWN_MAX_COUNT { + THREAD_POOL.install(move || { + trace!("[{}] switching to install to help clear backlog[current threads {}, threads requested {}]", + thread_index, + THREAD_POOL.current_num_threads(), + WORKER_SPAWN_COUNTER.load(Ordering::SeqCst)); + let res = f(); + sender.send(res).unwrap(); + WORKER_SPAWN_COUNTER.fetch_sub(1, Ordering::SeqCst); + }); + } else { + THREAD_POOL.spawn(move || { + let res = f(); + sender.send(res).unwrap(); + WORKER_SPAWN_COUNTER.fetch_sub(1, Ordering::SeqCst); + }); + } + + Waiter { receiver } + } + + pub fn scope<'a, F, R>(&self, elements: usize, f: F) -> R + where + F: FnOnce(&rayon::Scope<'a>, usize) -> R + Send, + R: Send, + { + let chunk_size = self.get_chunk_size(elements); + + THREAD_POOL.scope(|scope| f(scope, chunk_size)) + } + + pub fn in_place_scope<'a, F, R>(&self, elements: usize, f: F) -> R + where + F: FnOnce(&rayon::Scope<'a>, usize) -> R, + { + let chunk_size = self.get_chunk_size(elements); + + THREAD_POOL.in_place_scope(|scope| f(scope, chunk_size)) + } + + pub fn get_chunk_size(&self, elements: usize) -> usize { + let chunk_size = if elements <= *NUM_CPUS { + 1 + } else { + Self::chunk_size_for_num_spawned_threads(elements, *NUM_CPUS) + }; + + chunk_size + } + // TODO: check +1? 
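+    // The `+ 1` below is a ceiling division: chunk_size = ceil(elements / num_threads),
+    // so the final (possibly shorter) chunk still covers the remaining elements.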
+ pub fn chunk_size_for_num_spawned_threads(elements: usize, num_threads: usize) -> usize { + assert!( + elements >= num_threads, + "received {} elements to spawn {} threads", + elements, + num_threads + ); + if elements % num_threads == 0 { + elements / num_threads + } else { + elements / num_threads + 1 + } + } + + pub fn get_num_spawned_threads(&self, elements: usize) -> usize { + let num_spawned = if elements <= *NUM_CPUS { + elements + } else { + let chunk = self.get_chunk_size(elements); + let mut spawned = elements / chunk; + if spawned * chunk < elements { + spawned += 1; + } + assert!(spawned <= 2 * *NUM_CPUS); + + spawned + }; + + num_spawned + } +} + +pub struct Waiter { + receiver: Receiver, +} + +impl Waiter { + /// Wait for the result. + pub fn wait(&self) -> T { + if THREAD_POOL.current_thread_index().is_some() { + // Calling `wait()` from within the worker thread pool can lead to dead logs + error!("The wait call should never be done inside the worker thread pool"); + debug_assert!(false); + } + self.receiver.recv().unwrap() + } + + /// One off sending. + pub fn done(val: T) -> Self { + let (sender, receiver) = bounded(1); + sender.send(val).unwrap(); + + Waiter { receiver } + } +} + +fn log2_floor(num: usize) -> u32 { + assert!(num > 0); + + let mut pow = 0; + + while (1 << (pow + 1)) <= num { + pow += 1; + } + + pow +} + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_log2_floor() { + assert_eq!(log2_floor(1), 0); + assert_eq!(log2_floor(3), 1); + assert_eq!(log2_floor(4), 2); + assert_eq!(log2_floor(5), 2); + assert_eq!(log2_floor(6), 2); + assert_eq!(log2_floor(7), 2); + assert_eq!(log2_floor(8), 3); + } +} diff --git a/halo2_proofs/src/poly.rs b/halo2_proofs/src/poly.rs index 2d913e25bf..16feb6f3eb 100644 --- a/halo2_proofs/src/poly.rs +++ b/halo2_proofs/src/poly.rs @@ -12,7 +12,7 @@ use std::marker::PhantomData; use std::ops::{Add, Deref, DerefMut, Index, IndexMut, Mul, RangeFrom, RangeFull, Sub}; pub mod commitment; -mod domain; +pub mod domain; mod msm; pub mod multiopen; diff --git a/halo2_proofs/src/poly/domain.rs b/halo2_proofs/src/poly/domain.rs index 5204ebef26..de7a74f320 100644 --- a/halo2_proofs/src/poly/domain.rs +++ b/halo2_proofs/src/poly/domain.rs @@ -10,8 +10,13 @@ use super::{Coeff, ExtendedLagrangeCoeff, LagrangeCoeff, Polynomial, Rotation}; use group::ff::{BatchInvert, Field, PrimeField}; +use group::Group as CryptGroup; + use std::marker::PhantomData; +use crate::gpu; +use log::{info, warn}; + /// This structure contains precomputed constants and other details needed for /// performing operations on an evaluation domain of size $2^k$ and an extended /// domain of size $2^{k} * j$ with $j \neq 0$. @@ -485,6 +490,129 @@ pub struct PinnedEvaluationDomain<'a, G: Group> { omega: &'a G::Scalar, } +pub fn create_fft_kernel(_log_d: usize, priority: bool) -> Option> +where + G: Group, +{ + match gpu::MultiFFTKernel::create(priority) { + Ok(k) => { + info!("GPU FFT kernel instantiated!"); + Some(k) + } + Err(e) => { + warn!("Cannot instantiate GPU FFT kernel! 
Error: {}", e); + None + } + } +} + +use crate::worker::Worker; +pub fn best_fft_multiple_gpu( + kern: &mut Option>, + polys: &mut [&mut [G::Scalar]], + worker: &Worker, + omega: &G::Scalar, + log_n: u32, +) -> gpu::GPUResult<()> { + if let Some(ref mut kern) = kern { + if kern + .with(|k: &mut gpu::MultiFFTKernel| gpu_fft_multiple(k, polys, omega, log_n)) + .is_ok() + { + println!("use multiple GPUs"); + return Ok(()); + } + } + Ok(()) +} + +pub fn gpu_fft_multiple( + kern: &mut gpu::MultiFFTKernel, + polys: &mut [&mut [G::Scalar]], + omega: &G::Scalar, + log_n: u32, +) -> gpu::GPUResult<()> { + kern.fft_multiple(polys, omega, log_n)?; + + Ok(()) +} + +#[test] +fn test_best_fft_multiple_gpu() { + use crate::gpu::LockedMultiFFTKernel; + use crate::pairing::bn256::Fr; + use pairing::bn256::Bn256; + use crate::worker::Worker; + + let worker = Worker::new(); + + use crate::poly::{EvaluationDomain}; + use ark_std::{end_timer, start_timer}; + use rand_core::OsRng; + + for k in 1..20 { + let rng = OsRng; + // polynomial degree n = 2^k + let n = 1u64 << k; + // polynomial coeffs + let coeffs: Vec<_> = (0..n).map(|_| Fr::random(rng)).collect(); + // evaluation domain + let domain: EvaluationDomain = EvaluationDomain::new(1, k); + + let mut prev_fft_coeffs = coeffs.clone(); + + let mut optimized_fft_coeffs = coeffs.clone(); + + let message = format!("prev_fft degree {}", k); + let start = start_timer!(|| message); + best_fft(&mut prev_fft_coeffs, domain.get_omega(), k); + end_timer!(start); + + let message = format!("gpu_fft degree {}", k); + let start = start_timer!(|| message); + + let mut fft_kern = Some(LockedMultiFFTKernel::::new(k as usize, false)); + + best_fft_multiple_gpu( + &mut fft_kern, + &mut [&mut optimized_fft_coeffs], + &worker, + &domain.get_omega_inv(), + k as u32, + ) + .unwrap(); + + end_timer!(start); + + assert_eq!(prev_fft_coeffs, optimized_fft_coeffs); + } +} + +#[test] +fn test_fft() { + use crate::poly::{EvaluationDomain}; + use ark_std::{end_timer, start_timer}; + use pairing::bn256::Fr; + use rand_core::OsRng; + + for k in 1..20 { + let rng = OsRng; + // polynomial degree n = 2^k + let n = 1u64 << k; + // polynomial coeffs + let coeffs: Vec<_> = (0..n).map(|_| Fr::random(rng)).collect(); + // evaluation domain + let domain: EvaluationDomain = EvaluationDomain::new(1, k); + + let mut prev_fft_coeffs = coeffs.clone(); + + let message = format!("prev_fft degree {}", k); + let start = start_timer!(|| message); + best_fft(&mut prev_fft_coeffs, domain.get_omega(), k); + end_timer!(start); + } +} + #[test] fn test_rotate() { use rand_core::OsRng; From 94e848fc6fb23786563a0ed1dfcc267483532112 Mon Sep 17 00:00:00 2001 From: huyanlin Date: Tue, 14 Jun 2022 21:46:55 +0800 Subject: [PATCH 02/13] upload gpu fft files --- halo2_proofs/src/gpu/error.rs | 31 ++++ halo2_proofs/src/gpu/fft.rs | 205 ++++++++++++++++++++++ halo2_proofs/src/gpu/fft/fft.cl | 76 ++++++++ halo2_proofs/src/gpu/locks.rs | 205 ++++++++++++++++++++++ halo2_proofs/src/gpu/mod.rs | 39 ++++ halo2_proofs/src/gpu/multiexp/ec.cl | 134 ++++++++++++++ halo2_proofs/src/gpu/multiexp/field2.cl | 61 +++++++ halo2_proofs/src/gpu/multiexp/multiexp.cl | 70 ++++++++ halo2_proofs/src/gpu/nogpu.rs | 92 ++++++++++ halo2_proofs/src/gpu/sources.rs | 66 +++++++ halo2_proofs/src/gpu/utils.rs | 88 ++++++++++ 11 files changed, 1067 insertions(+) create mode 100644 halo2_proofs/src/gpu/error.rs create mode 100644 halo2_proofs/src/gpu/fft.rs create mode 100644 halo2_proofs/src/gpu/fft/fft.cl create mode 100644 halo2_proofs/src/gpu/locks.rs 
create mode 100644 halo2_proofs/src/gpu/mod.rs create mode 100644 halo2_proofs/src/gpu/multiexp/ec.cl create mode 100644 halo2_proofs/src/gpu/multiexp/field2.cl create mode 100644 halo2_proofs/src/gpu/multiexp/multiexp.cl create mode 100644 halo2_proofs/src/gpu/nogpu.rs create mode 100644 halo2_proofs/src/gpu/sources.rs create mode 100644 halo2_proofs/src/gpu/utils.rs diff --git a/halo2_proofs/src/gpu/error.rs b/halo2_proofs/src/gpu/error.rs new file mode 100644 index 0000000000..1d6d38636e --- /dev/null +++ b/halo2_proofs/src/gpu/error.rs @@ -0,0 +1,31 @@ +#[cfg(feature = "gpu")] +use rust_gpu_tools::opencl; + +#[derive(thiserror::Error, Debug)] +pub enum GPUError { + #[error("GPUError: {0}")] + Simple(&'static str), + #[cfg(feature = "gpu")] + #[error("OpenCL Error: {0}")] + OpenCL(#[from] opencl::GPUError), + #[cfg(feature = "gpu")] + #[error("GPU taken by a high priority process!")] + GPUTaken, + #[cfg(feature = "gpu")] + #[error("No kernel is initialized!")] + KernelUninitialized, + #[error("GPU accelerator is disabled!")] + GPUDisabled, +} + +pub type GPUResult = std::result::Result; + +#[cfg(feature = "gpu")] +impl From> for GPUError { + fn from(e: std::boxed::Box) -> Self { + match e.downcast::() { + Ok(err) => *err, + Err(_) => GPUError::Simple("An unknown GPU error happened!"), + } + } +} diff --git a/halo2_proofs/src/gpu/fft.rs b/halo2_proofs/src/gpu/fft.rs new file mode 100644 index 0000000000..d35d9be04a --- /dev/null +++ b/halo2_proofs/src/gpu/fft.rs @@ -0,0 +1,205 @@ +use crate::gpu::{ + error::{GPUError, GPUResult}, + get_lock_name_and_gpu_range, locks, sources, +}; +use group::ff::Field; +//use group::Group; +use crate::arithmetic::Group; +use crate::worker::THREAD_POOL; +use log::{error, info}; +use rayon::join; +use rust_gpu_tools::*; +use std::cmp::min; +use std::{cmp, env}; +use std::ops::MulAssign; + +const LOG2_MAX_ELEMENTS: usize = 32; // At most 2^32 elements is supported. +const MAX_LOG2_RADIX: u32 = 9; // Radix512 +const MAX_LOG2_LOCAL_WORK_SIZE: u32 = 8; // 256 + +pub struct SingleFFTKernel +where + G: Group, +{ + program: opencl::Program, + pq_buffer: opencl::Buffer, + omegas_buffer: opencl::Buffer, + priority: bool, +} + +impl SingleFFTKernel +where + G: Group, +{ + pub fn create(device: opencl::Device, priority: bool) -> GPUResult> { + let src = sources::kernel::(device.brand() == opencl::Brand::Nvidia); + + let program = opencl::Program::from_opencl(device, &src)?; + let pq_buffer = program.create_buffer::(1 << MAX_LOG2_RADIX >> 1)?; + let omegas_buffer = program.create_buffer::(LOG2_MAX_ELEMENTS)?; + + info!("FFT: Device: {}", program.device().name()); + + Ok(SingleFFTKernel { + program, + pq_buffer, + omegas_buffer, + priority, + }) + } + + /// Peforms a FFT round + /// * `log_n` - Specifies log2 of number of elements + /// * `log_p` - Specifies log2 of `p`, (http://www.bealto.com/gpu-fft_group-1.html) + /// * `deg` - 1=>radix2, 2=>radix4, 3=>radix8, ... 
+ /// * `max_deg` - The precalculated values pq` and `omegas` are valid for radix degrees up to `max_deg` + fn radix_fft_round( + &mut self, + src_buffer: &opencl::Buffer, + dst_buffer: &opencl::Buffer, + log_n: u32, + log_p: u32, + deg: u32, + max_deg: u32, + ) -> GPUResult<()> { + // if locks::PriorityLock::should_break(self.priority) { + // return Err(GPUError::GPUTaken); + // } + + let n = 1u32 << log_n; + let local_work_size = 1 << cmp::min(deg - 1, MAX_LOG2_LOCAL_WORK_SIZE); + let global_work_size = (n >> deg) * local_work_size; + let kernel = self.program.create_kernel( + "radix_fft", + global_work_size as usize, + Some(local_work_size as usize), + ); + kernel + .arg(src_buffer) + .arg(dst_buffer) + .arg(&self.pq_buffer) + .arg(&self.omegas_buffer) + .arg(opencl::LocalBuffer::::new(1 << deg)) + .arg(n) + .arg(log_p) + .arg(deg) + .arg(max_deg) + .run()?; + Ok(()) + } + + /// Share some precalculated values between threads to boost the performance + fn setup_pq_omegas(&mut self, omega: &G::Scalar, n: usize, max_deg: u32) -> GPUResult<()> { + // Precalculate: + // [omega^(0/(2^(deg-1))), omega^(1/(2^(deg-1))), ..., omega^((2^(deg-1)-1)/(2^(deg-1)))] + let mut pq = vec![G::Scalar::zero(); 1 << max_deg >> 1]; + let twiddle = omega.pow_vartime([(n >> max_deg) as u64]); + pq[0] = G::Scalar::one(); + if max_deg > 1 { + pq[1] = twiddle; + for i in 2..(1 << max_deg >> 1) { + pq[i] = pq[i - 1]; + pq[i].mul_assign(&twiddle); + } + } + self.pq_buffer.write_from(0, &pq)?; + + // Precalculate [omega, omega^2, omega^4, omega^8, ..., omega^(2^31)] + let mut omegas = vec![G::Scalar::zero(); 32]; + omegas[0] = *omega; + for i in 1..LOG2_MAX_ELEMENTS { + omegas[i] = omegas[i - 1].pow_vartime([2u64]); + } + self.omegas_buffer.write_from(0, &omegas)?; + + Ok(()) + } + + /// Performs FFT on `a` + /// * `omega` - Special value `omega` is used for FFT over finite-fields + /// * `log_n` - Specifies log2 of number of elements + pub fn radix_fft(&mut self, a: &mut [G::Scalar], omega: &G::Scalar, log_n: u32) -> GPUResult<()> { + let n = 1 << log_n; + let mut src_buffer = self.program.create_buffer::(n)?; + let mut dst_buffer = self.program.create_buffer::(n)?; + + let max_deg = cmp::min(MAX_LOG2_RADIX, log_n); + self.setup_pq_omegas(omega, n, max_deg)?; + + src_buffer.write_from(0, &*a)?; + let mut log_p = 0u32; + while log_p < log_n { + let deg = cmp::min(max_deg, log_n - log_p); + self.radix_fft_round(&src_buffer, &dst_buffer, log_n, log_p, deg, max_deg)?; + log_p += deg; + std::mem::swap(&mut src_buffer, &mut dst_buffer); + } + + src_buffer.read_into(0, a)?; + + Ok(()) + } +} + +pub struct MultiFFTKernel +where + G: Group, +{ + kernels: Vec>, + _lock: locks::GPULock, +} + +impl MultiFFTKernel +where + G: Group, +{ + pub fn create(priority: bool) -> GPUResult> { + let mut all_devices = opencl::Device::all(); + let all_num = all_devices.len(); + let (lock_index, gpu_range) = get_lock_name_and_gpu_range(all_num); + + let lock = locks::GPULock::lock(lock_index); + + let devices: Vec<&opencl::Device> = all_devices.drain(gpu_range).collect(); + + // use all of the GPUs + let kernels: Vec<_> = devices + .into_iter() + .map(|d| (d, SingleFFTKernel::::create(d.clone(), priority))) + .filter_map(|(device, res)| { + if let Err(ref e) = res { + error!( + "Cannot initialize kernel for device '{}'! 
Error: {}", + device.name(), + e + ); + } + res.ok() + }) + .collect(); + + Ok(MultiFFTKernel { + kernels, + _lock: lock, + }) + } + + pub fn fft_multiple( + &mut self, + polys: &mut [&mut [G::Scalar]], + omega: &G::Scalar, + log_n: u32, + ) -> GPUResult<()> { + use rayon::prelude::*; + + for poly in polys.chunks_mut(self.kernels.len()) { + crate::worker::THREAD_POOL.install(|| { + poly.par_iter_mut() + .zip(self.kernels.par_iter_mut()) + .for_each(|(p, kern)| kern.radix_fft(p, omega, log_n).unwrap()) + }); + } + + Ok(()) + } +} diff --git a/halo2_proofs/src/gpu/fft/fft.cl b/halo2_proofs/src/gpu/fft/fft.cl new file mode 100644 index 0000000000..b340a9d42a --- /dev/null +++ b/halo2_proofs/src/gpu/fft/fft.cl @@ -0,0 +1,76 @@ +uint bitreverse(uint n, uint bits) { + uint r = 0; + for(int i = 0; i < bits; i++) { + r = (r << 1) | (n & 1); + n >>= 1; + } + return r; +} + +/* + * FFT algorithm is inspired from: http://www.bealto.com/gpu-fft_group-1.html + */ +__kernel void radix_fft(__global FIELD* x, // Source buffer + __global FIELD* y, // Destination buffer + __global FIELD* pq, // Precalculated twiddle factors + __global FIELD* omegas, // [omega, omega^2, omega^4, ...] + __local FIELD* u, // Local buffer to store intermediary values + uint n, // Number of elements + uint lgp, // Log2 of `p` (Read more in the link above) + uint deg, // 1=>radix2, 2=>radix4, 3=>radix8, ... + uint max_deg) // Maximum degree supported, according to `pq` and `omegas` +{ + uint lid = get_local_id(0); + uint lsize = get_local_size(0); + uint index = get_group_id(0); + uint t = n >> deg; + uint p = 1 << lgp; + uint k = index & (p - 1); + + x += index; + y += ((index - k) << deg) + k; + + uint count = 1 << deg; // 2^deg + uint counth = count >> 1; // Half of count + + uint counts = count / lsize * lid; + uint counte = counts + count / lsize; + + // Compute powers of twiddle + const FIELD twiddle = FIELD_pow_lookup(omegas, (n >> lgp >> deg) * k); + FIELD tmp = FIELD_pow(twiddle, counts); + for(uint i = counts; i < counte; i++) { + u[i] = FIELD_mul(tmp, x[i*t]); + tmp = FIELD_mul(tmp, twiddle); + } + barrier(CLK_LOCAL_MEM_FENCE); + + const uint pqshift = max_deg - deg; + for(uint rnd = 0; rnd < deg; rnd++) { + const uint bit = counth >> rnd; + for(uint i = counts >> 1; i < counte >> 1; i++) { + const uint di = i & (bit - 1); + const uint i0 = (i << 1) - di; + const uint i1 = i0 + bit; + tmp = u[i0]; + u[i0] = FIELD_add(u[i0], u[i1]); + u[i1] = FIELD_sub(tmp, u[i1]); + if(di != 0) u[i1] = FIELD_mul(pq[di << rnd << pqshift], u[i1]); + } + + barrier(CLK_LOCAL_MEM_FENCE); + } + + for(uint i = counts >> 1; i < counte >> 1; i++) { + y[i*p] = u[bitreverse(i, deg)]; + y[(i+counth)*p] = u[bitreverse(i + counth, deg)]; + } +} + +/// Multiplies all of the elements by `field` +__kernel void mul_by_field(__global FIELD* elements, + uint n, + FIELD field) { + const uint gid = get_global_id(0); + elements[gid] = FIELD_mul(elements[gid], field); +} diff --git a/halo2_proofs/src/gpu/locks.rs b/halo2_proofs/src/gpu/locks.rs new file mode 100644 index 0000000000..8d36662453 --- /dev/null +++ b/halo2_proofs/src/gpu/locks.rs @@ -0,0 +1,205 @@ +use fs2::FileExt; +use log::{debug, info, warn}; +use std::fs::File; +use std::path::PathBuf; +//use group::Group; +use crate::arithmetic::Group; + +const GPU_LOCK_NAME: &str = "bellman.gpu.lock"; +const PRIORITY_LOCK_NAME: &str = "bellman.priority.lock"; +fn tmp_path(filename: &str) -> PathBuf { + let mut p = std::env::temp_dir(); + p.push(filename); + p +} + +pub fn 
get_lock_name_and_gpu_range(all_gpus: usize) -> (String, Range) { + let mut num_gpus_per_group = all_gpus; + let mut group_index = 0usize; + + let maybe_num = env::var("BELLMAN_NUM_GPUS_PER_GROUP"); + let maybe_index = env::var("BELLMAN_GPU_GROUP_INDEX"); + + if maybe_num.is_ok() && maybe_index.is_ok() { + let maybe_num = maybe_num.unwrap().parse(); + let maybe_index = maybe_index.unwrap().parse(); + if maybe_num.is_ok() && maybe_index.is_ok() { + num_gpus_per_group = maybe_num.unwrap(); + group_index = maybe_index.unwrap(); + } + } + + assert!( + (group_index + 1) * num_gpus_per_group <= all_gpus, + "BELLMAN_GPU_GROUP_INDEX and BELLMAN_NUM_GPUS_PER_GROUP error" + ); + + ( + group_index.to_string(), + Range { + start: group_index * num_gpus_per_group, + end: (group_index + 1) * num_gpus_per_group, + }, + ) +} + +/// `GPULock` prevents two kernel objects to be instantiated simultaneously. +#[derive(Debug)] +pub struct GPULock(File); +impl GPULock { + pub fn lock(index: String) -> GPULock { + let filename = GPU_LOCK_NAME.to_string() + &index; + let gpu_lock_file = tmp_path(&filename); + debug!("Acquiring GPU lock at {:?} ...", &gpu_lock_file); + let f = File::create(&gpu_lock_file) + .unwrap_or_else(|_| panic!("Cannot create GPU lock file at {:?}", &gpu_lock_file)); + f.lock_exclusive().unwrap(); + debug!("GPU lock acquired!"); + GPULock(f) + } +} +impl Drop for GPULock { + fn drop(&mut self) { + self.0.unlock().unwrap(); + debug!("GPU lock released!"); + } +} + +/// `PrioriyLock` is like a flag. When acquired, it means a high-priority process +/// needs to acquire the GPU really soon. Acquiring the `PriorityLock` is like +/// signaling all other processes to release their `GPULock`s. +/// Only one process can have the `PriorityLock` at a time. +#[derive(Debug)] +pub struct PriorityLock(File); +impl PriorityLock { + pub fn lock() -> PriorityLock { + let priority_lock_file = tmp_path(PRIORITY_LOCK_NAME); + debug!("Acquiring priority lock at {:?} ...", &priority_lock_file); + let f = File::create(&priority_lock_file).unwrap_or_else(|_| { + panic!( + "Cannot create priority lock file at {:?}", + &priority_lock_file + ) + }); + f.lock_exclusive().unwrap(); + debug!("Priority lock acquired!"); + PriorityLock(f) + } + pub fn wait(priority: bool) { + if !priority { + File::create(tmp_path(PRIORITY_LOCK_NAME)) + .unwrap() + .lock_exclusive() + .unwrap(); + } + } + pub fn should_break(priority: bool) -> bool { + !priority + && File::create(tmp_path(PRIORITY_LOCK_NAME)) + .unwrap() + .try_lock_exclusive() + .is_err() + } +} +impl Drop for PriorityLock { + fn drop(&mut self) { + self.0.unlock().unwrap(); + debug!("Priority lock released!"); + } +} + +use super::error::{GPUError, GPUResult}; +use super::fft::MultiFFTKernel; +//use super::multiexp::MultiexpKernel; +use crate::poly::domain::create_fft_kernel; +//use crate::multiexp::create_multiexp_kernel; +//use crate::pairing::Engine; +use std::env; +use std::ops::Range; + +macro_rules! 
locked_kernel { + ($class:ident, $kern:ident, $func:ident, $name:expr) => { + pub struct $class + where + G: Group, + { + log_d: usize, + priority: bool, + kernel: Option<$kern>, + } + + impl $class + where + G: Group, + { + pub fn new(log_d: usize, priority: bool) -> $class { + $class:: { + log_d, + priority, + kernel: None, + } + } + + fn init(&mut self) { + if self.kernel.is_none() { + PriorityLock::wait(self.priority); + info!("GPU is available for {}!", $name); + self.kernel = $func::(self.log_d, self.priority); + } + } + + fn free(&mut self) { + if let Some(_kernel) = self.kernel.take() { + warn!( + "GPU acquired by a high priority process! Freeing up {} kernels...", + $name + ); + } + } + + pub fn with(&mut self, mut f: F) -> GPUResult + where + F: FnMut(&mut $kern) -> GPUResult, + { + if let Ok(flag) = std::env::var("BELLMAN_USE_CPU") { + if flag == "1" { + return Err(GPUError::GPUDisabled); + } + } + + self.init(); + + loop { + if let Some(ref mut k) = self.kernel { + match f(k) { + Err(GPUError::GPUTaken) => { + self.free(); + self.init(); + } + Err(e) => { + warn!("GPU {} failed! Falling back to CPU... Error: {}", $name, e); + return Err(e); + } + Ok(v) => return Ok(v), + } + } else { + return Err(GPUError::KernelUninitialized); + } + } + } + } + }; +} + +locked_kernel!( + LockedMultiFFTKernel, + MultiFFTKernel, + create_fft_kernel, + "FFT" +); +/*locked_kernel!( + LockedMultiexpKernel, + MultiexpKernel, + create_multiexp_kernel, + "Multiexp" +);*/ diff --git a/halo2_proofs/src/gpu/mod.rs b/halo2_proofs/src/gpu/mod.rs new file mode 100644 index 0000000000..21cecfea40 --- /dev/null +++ b/halo2_proofs/src/gpu/mod.rs @@ -0,0 +1,39 @@ +mod error; + +pub use self::error::*; + +#[cfg(feature = "gpu")] +mod locks; + +#[cfg(feature = "gpu")] +pub use self::locks::*; + +#[cfg(feature = "gpu")] +mod sources; + +#[cfg(feature = "gpu")] +pub use self::sources::*; + +#[cfg(feature = "gpu")] +mod utils; + +#[cfg(feature = "gpu")] +pub use self::utils::*; + +#[cfg(feature = "gpu")] +mod fft; + +#[cfg(feature = "gpu")] +pub use self::fft::*; + +//#[cfg(feature = "gpu")] +//mod multiexp; + +//#[cfg(feature = "gpu")] +//pub use self::multiexp::*; + +#[cfg(not(feature = "gpu"))] +mod nogpu; + +#[cfg(not(feature = "gpu"))] +pub use self::nogpu::*; diff --git a/halo2_proofs/src/gpu/multiexp/ec.cl b/halo2_proofs/src/gpu/multiexp/ec.cl new file mode 100644 index 0000000000..26f617ad75 --- /dev/null +++ b/halo2_proofs/src/gpu/multiexp/ec.cl @@ -0,0 +1,134 @@ +// Elliptic curve operations (Short Weierstrass Jacobian form) + +#define POINT_ZERO ((POINT_projective){FIELD_ZERO, FIELD_ONE, FIELD_ZERO}) + +__BLSTRS__ // Affine points in `blstrs` library do not have `inf` field. 
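+// `__BLSTRS__` is a placeholder that sources.rs replaces with either `#define BLSTRS`
+// or an empty string, selecting whether the affine layout below carries an `inf` flag.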
+ +typedef struct { + FIELD x; + FIELD y; + #ifndef BLSTRS + bool inf; + #endif + #if FIELD_LIMB_BITS == 32 + uint _padding; + #endif +} POINT_affine; + +typedef struct { + FIELD x; + FIELD y; + FIELD z; +} POINT_projective; + +// http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l +POINT_projective POINT_double(POINT_projective inp) { + const FIELD local_zero = FIELD_ZERO; + if(FIELD_eq(inp.z, local_zero)) { + return inp; + } + + const FIELD a = FIELD_sqr(inp.x); // A = X1^2 + const FIELD b = FIELD_sqr(inp.y); // B = Y1^2 + FIELD c = FIELD_sqr(b); // C = B^2 + + // D = 2*((X1+B)2-A-C) + FIELD d = FIELD_add(inp.x, b); + d = FIELD_sqr(d); d = FIELD_sub(FIELD_sub(d, a), c); d = FIELD_double(d); + + const FIELD e = FIELD_add(FIELD_double(a), a); // E = 3*A + const FIELD f = FIELD_sqr(e); + + inp.z = FIELD_mul(inp.y, inp.z); inp.z = FIELD_double(inp.z); // Z3 = 2*Y1*Z1 + inp.x = FIELD_sub(FIELD_sub(f, d), d); // X3 = F-2*D + + // Y3 = E*(D-X3)-8*C + c = FIELD_double(c); c = FIELD_double(c); c = FIELD_double(c); + inp.y = FIELD_sub(FIELD_mul(FIELD_sub(d, inp.x), e), c); + + return inp; +} + +// http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-madd-2007-bl +POINT_projective POINT_add_mixed(POINT_projective a, POINT_affine b) { + #ifndef BLSTRS + if(b.inf) { + return a; + } + #endif + + const FIELD local_zero = FIELD_ZERO; + if(FIELD_eq(a.z, local_zero)) { + const FIELD local_one = FIELD_ONE; + a.x = b.x; + a.y = b.y; + a.z = local_one; + return a; + } + + const FIELD z1z1 = FIELD_sqr(a.z); + const FIELD u2 = FIELD_mul(b.x, z1z1); + const FIELD s2 = FIELD_mul(FIELD_mul(b.y, a.z), z1z1); + + if(FIELD_eq(a.x, u2) && FIELD_eq(a.y, s2)) { + return POINT_double(a); + } + + const FIELD h = FIELD_sub(u2, a.x); // H = U2-X1 + const FIELD hh = FIELD_sqr(h); // HH = H^2 + FIELD i = FIELD_double(hh); i = FIELD_double(i); // I = 4*HH + FIELD j = FIELD_mul(h, i); // J = H*I + FIELD r = FIELD_sub(s2, a.y); r = FIELD_double(r); // r = 2*(S2-Y1) + const FIELD v = FIELD_mul(a.x, i); + + POINT_projective ret; + + // X3 = r^2 - J - 2*V + ret.x = FIELD_sub(FIELD_sub(FIELD_sqr(r), j), FIELD_double(v)); + + // Y3 = r*(V-X3)-2*Y1*J + j = FIELD_mul(a.y, j); j = FIELD_double(j); + ret.y = FIELD_sub(FIELD_mul(FIELD_sub(v, ret.x), r), j); + + // Z3 = (Z1+H)^2-Z1Z1-HH + ret.z = FIELD_add(a.z, h); ret.z = FIELD_sub(FIELD_sub(FIELD_sqr(ret.z), z1z1), hh); + return ret; +} + +// http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl +POINT_projective POINT_add(POINT_projective a, POINT_projective b) { + + const FIELD local_zero = FIELD_ZERO; + if(FIELD_eq(a.z, local_zero)) return b; + if(FIELD_eq(b.z, local_zero)) return a; + + const FIELD z1z1 = FIELD_sqr(a.z); // Z1Z1 = Z1^2 + const FIELD z2z2 = FIELD_sqr(b.z); // Z2Z2 = Z2^2 + const FIELD u1 = FIELD_mul(a.x, z2z2); // U1 = X1*Z2Z2 + const FIELD u2 = FIELD_mul(b.x, z1z1); // U2 = X2*Z1Z1 + FIELD s1 = FIELD_mul(FIELD_mul(a.y, b.z), z2z2); // S1 = Y1*Z2*Z2Z2 + const FIELD s2 = FIELD_mul(FIELD_mul(b.y, a.z), z1z1); // S2 = Y2*Z1*Z1Z1 + + if(FIELD_eq(u1, u2) && FIELD_eq(s1, s2)) + return POINT_double(a); + else { + const FIELD h = FIELD_sub(u2, u1); // H = U2-U1 + FIELD i = FIELD_double(h); i = FIELD_sqr(i); // I = (2*H)^2 + const FIELD j = FIELD_mul(h, i); // J = H*I + FIELD r = FIELD_sub(s2, s1); r = FIELD_double(r); // r = 2*(S2-S1) + const FIELD v = FIELD_mul(u1, i); // V = U1*I + a.x = FIELD_sub(FIELD_sub(FIELD_sub(FIELD_sqr(r), j), v), v); // X3 = r^2 - J - 2*V + + // Y3 = r*(V - X3) - 
2*S1*J + a.y = FIELD_mul(FIELD_sub(v, a.x), r); + s1 = FIELD_mul(s1, j); s1 = FIELD_double(s1); // S1 = S1 * J * 2 + a.y = FIELD_sub(a.y, s1); + + // Z3 = ((Z1+Z2)^2 - Z1Z1 - Z2Z2)*H + a.z = FIELD_add(a.z, b.z); a.z = FIELD_sqr(a.z); + a.z = FIELD_sub(FIELD_sub(a.z, z1z1), z2z2); + a.z = FIELD_mul(a.z, h); + + return a; + } +} diff --git a/halo2_proofs/src/gpu/multiexp/field2.cl b/halo2_proofs/src/gpu/multiexp/field2.cl new file mode 100644 index 0000000000..c520e33e79 --- /dev/null +++ b/halo2_proofs/src/gpu/multiexp/field2.cl @@ -0,0 +1,61 @@ +// Fp2 Extension Field where u^2 + 1 = 0 + +#define FIELD2_LIMB_BITS FIELD_LIMB_BITS +#define FIELD2_ZERO ((FIELD2){FIELD_ZERO, FIELD_ZERO}) +#define FIELD2_ONE ((FIELD2){FIELD_ONE, FIELD_ZERO}) + +typedef struct { + FIELD c0; + FIELD c1; +} FIELD2; // Represents: c0 + u * c1 + +bool FIELD2_eq(FIELD2 a, FIELD2 b) { + return FIELD_eq(a.c0, b.c0) && FIELD_eq(a.c1, b.c1); +} +FIELD2 FIELD2_sub(FIELD2 a, FIELD2 b) { + a.c0 = FIELD_sub(a.c0, b.c0); + a.c1 = FIELD_sub(a.c1, b.c1); + return a; +} +FIELD2 FIELD2_add(FIELD2 a, FIELD2 b) { + a.c0 = FIELD_add(a.c0, b.c0); + a.c1 = FIELD_add(a.c1, b.c1); + return a; +} +FIELD2 FIELD2_double(FIELD2 a) { + a.c0 = FIELD_double(a.c0); + a.c1 = FIELD_double(a.c1); + return a; +} + +/* + * (a_0 + u * a_1)(b_0 + u * b_1) = a_0 * b_0 - a_1 * b_1 + u * (a_0 * b_1 + a_1 * b_0) + * Therefore: + * c_0 = a_0 * b_0 - a_1 * b_1 + * c_1 = (a_0 * b_1 + a_1 * b_0) = (a_0 + a_1) * (b_0 + b_1) - a_0 * b_0 - a_1 * b_1 + */ +FIELD2 FIELD2_mul(FIELD2 a, FIELD2 b) { + const FIELD aa = FIELD_mul(a.c0, b.c0); + const FIELD bb = FIELD_mul(a.c1, b.c1); + const FIELD o = FIELD_add(b.c0, b.c1); + a.c1 = FIELD_add(a.c1, a.c0); + a.c1 = FIELD_mul(a.c1, o); + a.c1 = FIELD_sub(a.c1, aa); + a.c1 = FIELD_sub(a.c1, bb); + a.c0 = FIELD_sub(aa, bb); + return a; +} + +/* + * (a_0 + u * a_1)(a_0 + u * a_1) = a_0 ^ 2 - a_1 ^ 2 + u * 2 * a_0 * a_1 + * Therefore: + * c_0 = (a_0 * a_0 - a_1 * a_1) = (a_0 + a_1)(a_0 - a_1) + * c_1 = 2 * a_0 * a_1 + */ +FIELD2 FIELD2_sqr(FIELD2 a) { + const FIELD ab = FIELD_mul(a.c0, a.c1); + const FIELD c0c1 = FIELD_add(a.c0, a.c1); + a.c0 = FIELD_mul(FIELD_sub(a.c0, a.c1), c0c1); + a.c1 = FIELD_double(ab); + return a; +} diff --git a/halo2_proofs/src/gpu/multiexp/multiexp.cl b/halo2_proofs/src/gpu/multiexp/multiexp.cl new file mode 100644 index 0000000000..4e0d26e2e2 --- /dev/null +++ b/halo2_proofs/src/gpu/multiexp/multiexp.cl @@ -0,0 +1,70 @@ +/* + * Same multiexp algorithm used in Bellman, with some modifications. + * https://github.com/zkcrypto/bellman/blob/10c5010fd9c2ca69442dc9775ea271e286e776d8/src/multiexp.rs#L174 + * The CPU version of multiexp parallelism is done by dividing the exponent + * values into smaller windows, and then applying a sequence of rounds to each + * window. The GPU kernel not only assigns a thread to each window but also + * divides the bases into several groups which highly increases the number of + * threads running in parallel for calculating a multiexp instance. + */ + +__kernel void POINT_bellman_multiexp( + __global POINT_affine *bases, + __global POINT_projective *buckets, + __global POINT_projective *results, + __global EXPONENT *exps, + uint n, + uint num_groups, + uint num_windows, + uint window_size) { + + // We have `num_windows` * `num_groups` threads per multiexp. + const uint gid = get_global_id(0); + if(gid >= num_windows * num_groups) return; + + // We have (2^window_size - 1) buckets. 
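+  // Bucket i collects bases whose current window value equals i + 1; a zero window
+  // contributes nothing, so no bucket is reserved for it.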
+ const uint bucket_len = ((1 << window_size) - 1); + + // Each thread has its own set of buckets in global memory. + buckets += bucket_len * gid; + + const POINT_projective local_zero = POINT_ZERO; + for(uint i = 0; i < bucket_len; i++) buckets[i] = local_zero; + + const uint len = (uint)ceil(n / (float)num_groups); // Num of elements in each group + + // This thread runs the multiexp algorithm on elements from `nstart` to `nened` + // on the window [`bits`, `bits` + `w`) + const uint nstart = len * (gid / num_windows); + const uint nend = min(nstart + len, n); + const uint bits = (gid % num_windows) * window_size; + const ushort w = min((ushort)window_size, (ushort)(EXPONENT_BITS - bits)); + + POINT_projective res = POINT_ZERO; + for(uint i = nstart; i < nend; i++) { + uint ind = EXPONENT_get_bits(exps[i], bits, w); + + #ifdef NVIDIA + // O_o, weird optimization, having a single special case makes it + // tremendously faster! + // 511 is chosen because it's half of the maximum bucket len, but + // any other number works... Bigger indices seems to be better... + if(ind == 511) buckets[510] = POINT_add_mixed(buckets[510], bases[i]); + else if(ind--) buckets[ind] = POINT_add_mixed(buckets[ind], bases[i]); + #else + if(ind--) buckets[ind] = POINT_add_mixed(buckets[ind], bases[i]); + #endif + } + + // Summation by parts + // e.g. 3a + 2b + 1c = a + + // (a) + b + + // ((a) + b) + c + POINT_projective acc = POINT_ZERO; + for(int j = bucket_len - 1; j >= 0; j--) { + acc = POINT_add(acc, buckets[j]); + res = POINT_add(res, acc); + } + + results[gid] = res; +} diff --git a/halo2_proofs/src/gpu/nogpu.rs b/halo2_proofs/src/gpu/nogpu.rs new file mode 100644 index 0000000000..de543340d8 --- /dev/null +++ b/halo2_proofs/src/gpu/nogpu.rs @@ -0,0 +1,92 @@ +use super::error::{GPUError, GPUResult}; +use crate::pairing::ff::{PrimeField, ScalarEngine}; +use crate::pairing::CurveAffine; +use crate::worker::Worker; +use std::marker::PhantomData; +use std::sync::Arc; + +// This module is compiled instead of `fft.rs` and `multiexp.rs` if `gpu` feature is disabled. + +pub struct MultiFFTKernel(PhantomData) +where + E: ScalarEngine; + +impl MultiFFTKernel +where + E: ScalarEngine, +{ + pub fn create(_: bool) -> GPUResult> { + return Err(GPUError::GPUDisabled); + } + + pub fn radix_fft(&mut self, _: &mut [E::Fr], _: &E::Fr, _: u32) -> GPUResult<()> { + return Err(GPUError::GPUDisabled); + } +} + +pub struct MultiexpKernel(PhantomData) +where + E: ScalarEngine; + +impl MultiexpKernel +where + E: ScalarEngine, +{ + pub fn create(_: bool) -> GPUResult> { + return Err(GPUError::GPUDisabled); + } + + pub fn multiexp( + &mut self, + _: &Worker, + _: Arc>, + _: Arc::Fr as PrimeField>::Repr>>, + _: usize, + _: usize, + ) -> GPUResult<::Projective> + where + G: CurveAffine, + { + return Err(GPUError::GPUDisabled); + } + + pub fn dense_multiexp( + &mut self, + _: &Worker, + _: Arc>, + _: Arc::Fr as PrimeField>::Repr>>, + _: usize, + ) -> GPUResult<::Projective> + where + G::Engine: crate::pairing::Engine, + { + return Err(GPUError::GPUDisabled); + } +} + +use crate::pairing::Engine; + +macro_rules! 
locked_kernel { + ($class:ident) => { + pub struct $class(PhantomData); + + impl $class + where + E: Engine, + { + pub fn new(_: usize, _: bool) -> $class { + $class::(PhantomData) + } + + pub fn with(&mut self, _: F) -> GPUResult + where + F: FnMut(&mut K) -> GPUResult, + { + return Err(GPUError::GPUDisabled); + } + } + }; +} + +locked_kernel!(LockedMultiFFTKernel); +locked_kernel!(LockedMultiexpKernel); diff --git a/halo2_proofs/src/gpu/sources.rs b/halo2_proofs/src/gpu/sources.rs new file mode 100644 index 0000000000..b14146ac1c --- /dev/null +++ b/halo2_proofs/src/gpu/sources.rs @@ -0,0 +1,66 @@ +//use group::Group; +use crate::arithmetic::Group; +use ff_cl_gen as ffgen; + +// Instead of having a very large OpenCL program written for a specific curve, with a lot of +// rudandant codes (As OpenCL doesn't have generic types or templates), this module will dynamically +// generate OpenCL codes given different PrimeFields and curves. + +static FFT_SRC: &str = include_str!("fft/fft.cl"); +static FIELD2_SRC: &str = include_str!("multiexp/field2.cl"); +static EC_SRC: &str = include_str!("multiexp/ec.cl"); +static MULTIEXP_SRC: &str = include_str!("multiexp/multiexp.cl"); + +fn field2(field2: &str, field: &str) -> String { + String::from(FIELD2_SRC) + .replace("FIELD2", field2) + .replace("FIELD", field) +} + +fn fft(field: &str) -> String { + String::from(FFT_SRC).replace("FIELD", field) +} + +#[cfg(not(feature = "blstrs"))] +const BLSTRS_DEF: &str = ""; +#[cfg(feature = "blstrs")] +const BLSTRS_DEF: &str = "#define BLSTRS"; + +fn ec(field: &str, point: &str) -> String { + String::from(EC_SRC) + .replace("FIELD", field) + .replace("POINT", point) + .replace("__BLSTRS__", BLSTRS_DEF) +} + +fn multiexp(point: &str, exp: &str) -> String { + String::from(MULTIEXP_SRC) + .replace("POINT", point) + .replace("EXPONENT", exp) +} + +// WARNING: This function works only with Short Weierstrass Jacobian curves with Fq2 extension field. +pub fn kernel(limb64: bool) -> String +where + G: Group, +{ + vec![ + if limb64 { + ffgen::field::("Fr") + } else { + ffgen::field::("Fr") + }, + fft("Fr"), + if limb64 { + ffgen::field::("Fq") + } else { + ffgen::field::("Fq") + }, + ec("Fq", "G1"), + multiexp("G1", "Fr"), + field2("Fq2", "Fq"), + ec("Fq2", "G2"), + multiexp("G2", "Fr"), + ] + .join("\n\n") +} diff --git a/halo2_proofs/src/gpu/utils.rs b/halo2_proofs/src/gpu/utils.rs new file mode 100644 index 0000000000..b0e4bf83e5 --- /dev/null +++ b/halo2_proofs/src/gpu/utils.rs @@ -0,0 +1,88 @@ +use log::{info, warn}; +use rust_gpu_tools::*; +use std::collections::HashMap; +use std::env; + +lazy_static::lazy_static! { + static ref CORE_COUNTS: HashMap = { + let mut core_counts : HashMap = vec![ + // AMD + ("gfx1010".to_string(), 2560), + // This value was chosen to give (approximately) empirically best performance for a Radeon Pro VII. 
+ ("gfx906".to_string(), 7400), + + // NVIDIA + ("Quadro RTX 6000".to_string(), 4608), + ("Quadro RTX A6000".to_string(), 10752), + + ("TITAN RTX".to_string(), 4608), + + ("Tesla V100".to_string(), 5120), + ("Tesla P100".to_string(), 3584), + ("Tesla T4".to_string(), 2560), + ("Quadro M5000".to_string(), 2048), + + ("GeForce RTX 3090".to_string(), 10496), + ("GeForce RTX 3080".to_string(), 8704), + ("GeForce RTX 3070".to_string(), 5888), + + ("GeForce RTX 2080 Ti".to_string(), 4352), + ("GeForce RTX 2080 SUPER".to_string(), 3072), + ("GeForce RTX 2080".to_string(), 2944), + ("GeForce RTX 2070 SUPER".to_string(), 2560), + + ("GeForce GTX 1080 Ti".to_string(), 3584), + ("GeForce GTX 1080".to_string(), 2560), + ("GeForce GTX 2060".to_string(), 1920), + ("GeForce GTX 1660 Ti".to_string(), 1536), + ("GeForce GTX 1060".to_string(), 1280), + ("GeForce GTX 1650 SUPER".to_string(), 1280), + ("GeForce GTX 1650".to_string(), 896), + ].into_iter().collect(); + + match env::var("BELLMAN_CUSTOM_GPU").and_then(|var| { + for card in var.split(",") { + let splitted = card.split(":").collect::>(); + if splitted.len() != 2 { panic!("Invalid BELLMAN_CUSTOM_GPU!"); } + let name = splitted[0].trim().to_string(); + let cores : usize = splitted[1].trim().parse().expect("Invalid BELLMAN_CUSTOM_GPU!"); + info!("Adding \"{}\" to GPU list with {} CUDA cores.", name, cores); + core_counts.insert(name, cores); + } + Ok(()) + }) { Err(_) => { }, Ok(_) => { } } + + core_counts + }; +} + +const DEFAULT_CORE_COUNT: usize = 2560; +pub fn get_core_count(d: &opencl::Device) -> usize { + let name = d.name(); + match CORE_COUNTS.get(&name[..]) { + Some(&cores) => cores, + None => { + warn!( + "Number of CUDA cores for your device ({}) is unknown! Best performance is \ + only achieved when the number of CUDA cores is known! 
You can find the \ + instructions on how to support custom GPUs here: \ + https://lotu.sh/en+hardware-mining", + name + ); + DEFAULT_CORE_COUNT + } + } +} + +pub fn dump_device_list() { + for d in opencl::Device::all() { + info!("Device: {:?}", d); + } +} + +#[cfg(feature = "gpu")] +#[test] +pub fn test_list_devices() { + let _ = env_logger::try_init(); + dump_device_list(); +} From 9c41501029189bd8e75967e4894878c3144dced8 Mon Sep 17 00:00:00 2001 From: huyanlin Date: Thu, 16 Jun 2022 20:49:23 +0800 Subject: [PATCH 03/13] fix fr --- halo2_proofs/src/gpu/sources.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/halo2_proofs/src/gpu/sources.rs b/halo2_proofs/src/gpu/sources.rs index b14146ac1c..adef02d193 100644 --- a/halo2_proofs/src/gpu/sources.rs +++ b/halo2_proofs/src/gpu/sources.rs @@ -1,6 +1,7 @@ //use group::Group; use crate::arithmetic::Group; use ff_cl_gen as ffgen; +use pairing::bn256::{Fr, Fq}; // Instead of having a very large OpenCL program written for a specific curve, with a lot of // rudandant codes (As OpenCL doesn't have generic types or templates), this module will dynamically @@ -46,15 +47,15 @@ where { vec![ if limb64 { - ffgen::field::("Fr") + ffgen::field::("Fr") } else { - ffgen::field::("Fr") + ffgen::field::("Fr") }, fft("Fr"), if limb64 { - ffgen::field::("Fq") + ffgen::field::("Fq") } else { - ffgen::field::("Fq") + ffgen::field::("Fq") }, ec("Fq", "G1"), multiexp("G1", "Fr"), From c9e2bf34523864a5ca7d8c88119980d89decc8d9 Mon Sep 17 00:00:00 2001 From: huyanlin Date: Sun, 19 Jun 2022 21:18:08 +0800 Subject: [PATCH 04/13] update fr --- halo2_proofs/Cargo.toml | 1 + halo2_proofs/src/gpu/sources.rs | 8 ++++---- halo2_proofs/src/poly/domain.rs | 4 ++++ 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/halo2_proofs/Cargo.toml b/halo2_proofs/Cargo.toml index 80be358855..7bc5a13861 100644 --- a/halo2_proofs/Cargo.toml +++ b/halo2_proofs/Cargo.toml @@ -37,6 +37,7 @@ rand = "0.8" rand_core = { version = "0.6", default-features = false } blake2b_simd = "1" pairing = { git = 'https://github.com/appliedzkp/pairing', package = "pairing_bn256", "tag" = "v0.1.1"} +pairing-bn256 = {package = "pairing_ce", version = "0.24.*" } subtle = "2.3" cfg-if = "0.1" ark-std = { version = "0.3", features = ["print-trace"] } diff --git a/halo2_proofs/src/gpu/sources.rs b/halo2_proofs/src/gpu/sources.rs index adef02d193..eebdf69af4 100644 --- a/halo2_proofs/src/gpu/sources.rs +++ b/halo2_proofs/src/gpu/sources.rs @@ -47,15 +47,15 @@ where { vec![ if limb64 { - ffgen::field::("Fr") + ffgen::field_fr::("Fr") } else { - ffgen::field::("Fr") + ffgen::field_fr::("Fr") }, fft("Fr"), if limb64 { - ffgen::field::("Fq") + ffgen::field_fq::("Fq") } else { - ffgen::field::("Fq") + ffgen::field_fq::("Fq") }, ec("Fq", "G1"), multiexp("G1", "Fr"), diff --git a/halo2_proofs/src/poly/domain.rs b/halo2_proofs/src/poly/domain.rs index de7a74f320..791f94b07d 100644 --- a/halo2_proofs/src/poly/domain.rs +++ b/halo2_proofs/src/poly/domain.rs @@ -567,6 +567,7 @@ fn test_best_fft_multiple_gpu() { let start = start_timer!(|| message); best_fft(&mut prev_fft_coeffs, domain.get_omega(), k); end_timer!(start); + println!("coeffs cpu {:?}\n",prev_fft_coeffs); let message = format!("gpu_fft degree {}", k); let start = start_timer!(|| message); @@ -584,6 +585,7 @@ fn test_best_fft_multiple_gpu() { end_timer!(start); + println!("coeffs gpu {:?}\n",optimized_fft_coeffs); assert_eq!(prev_fft_coeffs, optimized_fft_coeffs); } } @@ -610,6 +612,8 @@ fn test_fft() { let start = 
start_timer!(|| message); best_fft(&mut prev_fft_coeffs, domain.get_omega(), k); end_timer!(start); + + assert_eq!(prev_fft_coeffs, coeffs); } } From ec7a95d19517411abb29e2cc88f1f9eee695a3c7 Mon Sep 17 00:00:00 2001 From: huyanlin Date: Thu, 23 Jun 2022 02:40:49 +0800 Subject: [PATCH 05/13] update dep and license --- halo2_proofs/Cargo.toml | 7 +- halo2_proofs/src/gpu/COPYRIGHT | 14 ++ halo2_proofs/src/gpu/LICENSE-APACHE | 201 ++++++++++++++++++++++++++++ halo2_proofs/src/gpu/LICENSE-MIT | 23 ++++ halo2_proofs/src/gpu/sources.rs | 8 +- halo2_proofs/src/poly/domain.rs | 8 +- 6 files changed, 248 insertions(+), 13 deletions(-) create mode 100644 halo2_proofs/src/gpu/COPYRIGHT create mode 100644 halo2_proofs/src/gpu/LICENSE-APACHE create mode 100644 halo2_proofs/src/gpu/LICENSE-MIT diff --git a/halo2_proofs/Cargo.toml b/halo2_proofs/Cargo.toml index 7bc5a13861..0e703027c5 100644 --- a/halo2_proofs/Cargo.toml +++ b/halo2_proofs/Cargo.toml @@ -31,13 +31,12 @@ harness = false [dependencies] backtrace = { version = "0.3", optional = true } rayon = "1.5.1" -ff = "0.11" -group = "0.11" +ff = { git = 'https://github.com/starslabhq/ff', branch = "v0.11.1Char" } +group = { git = "https://github.com/starslabhq/group", branch = "v0.11.0ff" } rand = "0.8" rand_core = { version = "0.6", default-features = false } blake2b_simd = "1" -pairing = { git = 'https://github.com/appliedzkp/pairing', package = "pairing_bn256", "tag" = "v0.1.1"} -pairing-bn256 = {package = "pairing_ce", version = "0.24.*" } +pairing = { git = 'https://github.com/starslabhq/pairing', package = "pairing_bn256", branch = "primeFieldChar" } subtle = "2.3" cfg-if = "0.1" ark-std = { version = "0.3", features = ["print-trace"] } diff --git a/halo2_proofs/src/gpu/COPYRIGHT b/halo2_proofs/src/gpu/COPYRIGHT new file mode 100644 index 0000000000..9e3a9f6f90 --- /dev/null +++ b/halo2_proofs/src/gpu/COPYRIGHT @@ -0,0 +1,14 @@ +Copyrights in the "ff-cl-gen" library are retained by their contributors. No +copyright assignment is required to contribute to the "ff-cl-gen" library. + +The "ff-cl-gen" library is licensed under either of + + * Apache License, Version 2.0, (see ./LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0) + * MIT license (see ./LICENSE-MIT or http://opensource.org/licenses/MIT) + +at your option. + +Unless you explicitly state otherwise, any contribution intentionally +submitted for inclusion in the work by you, as defined in the Apache-2.0 +license, shall be dual licensed as above, without any additional terms or +conditions. diff --git a/halo2_proofs/src/gpu/LICENSE-APACHE b/halo2_proofs/src/gpu/LICENSE-APACHE new file mode 100644 index 0000000000..16fe87b06e --- /dev/null +++ b/halo2_proofs/src/gpu/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/halo2_proofs/src/gpu/LICENSE-MIT b/halo2_proofs/src/gpu/LICENSE-MIT new file mode 100644 index 0000000000..31aa79387f --- /dev/null +++ b/halo2_proofs/src/gpu/LICENSE-MIT @@ -0,0 +1,23 @@ +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/halo2_proofs/src/gpu/sources.rs b/halo2_proofs/src/gpu/sources.rs index eebdf69af4..adef02d193 100644 --- a/halo2_proofs/src/gpu/sources.rs +++ b/halo2_proofs/src/gpu/sources.rs @@ -47,15 +47,15 @@ where { vec![ if limb64 { - ffgen::field_fr::("Fr") + ffgen::field::("Fr") } else { - ffgen::field_fr::("Fr") + ffgen::field::("Fr") }, fft("Fr"), if limb64 { - ffgen::field_fq::("Fq") + ffgen::field::("Fq") } else { - ffgen::field_fq::("Fq") + ffgen::field::("Fq") }, ec("Fq", "G1"), multiexp("G1", "Fr"), diff --git a/halo2_proofs/src/poly/domain.rs b/halo2_proofs/src/poly/domain.rs index 791f94b07d..134d10d1d6 100644 --- a/halo2_proofs/src/poly/domain.rs +++ b/halo2_proofs/src/poly/domain.rs @@ -567,7 +567,7 @@ fn test_best_fft_multiple_gpu() { let start = start_timer!(|| message); best_fft(&mut prev_fft_coeffs, domain.get_omega(), k); end_timer!(start); - println!("coeffs cpu {:?}\n",prev_fft_coeffs); + //println!("coeffs cpu {:?}\n",prev_fft_coeffs); let message = format!("gpu_fft degree {}", k); let start = start_timer!(|| message); @@ -578,14 +578,14 @@ fn test_best_fft_multiple_gpu() { &mut fft_kern, &mut [&mut optimized_fft_coeffs], &worker, - &domain.get_omega_inv(), + &domain.get_omega(), k as u32, ) .unwrap(); end_timer!(start); - println!("coeffs gpu {:?}\n",optimized_fft_coeffs); + //println!("coeffs gpu {:?}\n",optimized_fft_coeffs); assert_eq!(prev_fft_coeffs, optimized_fft_coeffs); } } @@ -612,8 +612,6 @@ fn test_fft() { let start = start_timer!(|| message); best_fft(&mut prev_fft_coeffs, domain.get_omega(), k); end_timer!(start); - - assert_eq!(prev_fft_coeffs, coeffs); } } From a2b2f11eb825d9c9a0c3fd1cbc29f833424835fc Mon Sep 17 00:00:00 2001 From: huyanlin Date: Thu, 23 Jun 2022 02:55:00 +0800 Subject: [PATCH 06/13] fix path --- halo2_proofs/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/halo2_proofs/Cargo.toml b/halo2_proofs/Cargo.toml index 0e703027c5..40934c3db5 100644 --- a/halo2_proofs/Cargo.toml +++ b/halo2_proofs/Cargo.toml @@ -52,7 +52,7 @@ env_logger = "0.8.1" # gpu feature rust-gpu-tools = { version = "0.3.0", optional = true } -ff-cl-gen = { path = "../../ff-cl-gen", version = "0.2.0", optional = true } +ff-cl-gen = { git = 'https://github.com/starslabhq/ff-cl-gen', optional = true, branch = "gpuFft" } 
fs2 = { version = "0.4.3", optional = true } # Developer tooling dependencies From 70c53e5295b846f6eee3b65a97aec5422f279c16 Mon Sep 17 00:00:00 2001 From: huyanlin Date: Thu, 23 Jun 2022 11:32:48 +0800 Subject: [PATCH 07/13] small clean code --- halo2_proofs/Cargo.toml | 1 + halo2_proofs/src/gpu/error.rs | 9 ++++----- halo2_proofs/src/gpu/fft.rs | 1 - halo2_proofs/src/gpu/locks.rs | 12 +----------- halo2_proofs/src/gpu/mod.rs | 14 -------------- halo2_proofs/src/gpu/sources.rs | 1 - halo2_proofs/src/gpu/utils.rs | 1 - halo2_proofs/src/lib.rs | 2 ++ 8 files changed, 8 insertions(+), 33 deletions(-) diff --git a/halo2_proofs/Cargo.toml b/halo2_proofs/Cargo.toml index 40934c3db5..15770eb30e 100644 --- a/halo2_proofs/Cargo.toml +++ b/halo2_proofs/Cargo.toml @@ -77,6 +77,7 @@ rand_core = { version = "0.6", default-features = false, features = ["getrandom" getrandom = { version = "0.2", features = ["js"] } [features] +# todo: set gpu as optional feature default = ["shplonk", "gpu"] dev-graph = ["plotters", "tabbycat"] gadget-traces = ["backtrace"] diff --git a/halo2_proofs/src/gpu/error.rs b/halo2_proofs/src/gpu/error.rs index 1d6d38636e..50a4316bea 100644 --- a/halo2_proofs/src/gpu/error.rs +++ b/halo2_proofs/src/gpu/error.rs @@ -1,26 +1,25 @@ -#[cfg(feature = "gpu")] use rust_gpu_tools::opencl; #[derive(thiserror::Error, Debug)] pub enum GPUError { #[error("GPUError: {0}")] Simple(&'static str), - #[cfg(feature = "gpu")] + #[error("OpenCL Error: {0}")] OpenCL(#[from] opencl::GPUError), - #[cfg(feature = "gpu")] + #[error("GPU taken by a high priority process!")] GPUTaken, - #[cfg(feature = "gpu")] + #[error("No kernel is initialized!")] KernelUninitialized, + #[error("GPU accelerator is disabled!")] GPUDisabled, } pub type GPUResult = std::result::Result; -#[cfg(feature = "gpu")] impl From> for GPUError { fn from(e: std::boxed::Box) -> Self { match e.downcast::() { diff --git a/halo2_proofs/src/gpu/fft.rs b/halo2_proofs/src/gpu/fft.rs index d35d9be04a..25d3f80921 100644 --- a/halo2_proofs/src/gpu/fft.rs +++ b/halo2_proofs/src/gpu/fft.rs @@ -3,7 +3,6 @@ use crate::gpu::{ get_lock_name_and_gpu_range, locks, sources, }; use group::ff::Field; -//use group::Group; use crate::arithmetic::Group; use crate::worker::THREAD_POOL; use log::{error, info}; diff --git a/halo2_proofs/src/gpu/locks.rs b/halo2_proofs/src/gpu/locks.rs index 8d36662453..0f460136f7 100644 --- a/halo2_proofs/src/gpu/locks.rs +++ b/halo2_proofs/src/gpu/locks.rs @@ -2,7 +2,6 @@ use fs2::FileExt; use log::{debug, info, warn}; use std::fs::File; use std::path::PathBuf; -//use group::Group; use crate::arithmetic::Group; const GPU_LOCK_NAME: &str = "bellman.gpu.lock"; @@ -110,10 +109,7 @@ impl Drop for PriorityLock { use super::error::{GPUError, GPUResult}; use super::fft::MultiFFTKernel; -//use super::multiexp::MultiexpKernel; use crate::poly::domain::create_fft_kernel; -//use crate::multiexp::create_multiexp_kernel; -//use crate::pairing::Engine; use std::env; use std::ops::Range; @@ -196,10 +192,4 @@ locked_kernel!( MultiFFTKernel, create_fft_kernel, "FFT" -); -/*locked_kernel!( - LockedMultiexpKernel, - MultiexpKernel, - create_multiexp_kernel, - "Multiexp" -);*/ +); \ No newline at end of file diff --git a/halo2_proofs/src/gpu/mod.rs b/halo2_proofs/src/gpu/mod.rs index 21cecfea40..2160946dd9 100644 --- a/halo2_proofs/src/gpu/mod.rs +++ b/halo2_proofs/src/gpu/mod.rs @@ -2,36 +2,22 @@ mod error; pub use self::error::*; -#[cfg(feature = "gpu")] mod locks; -#[cfg(feature = "gpu")] pub use self::locks::*; -#[cfg(feature = "gpu")] mod 
sources; -#[cfg(feature = "gpu")] pub use self::sources::*; -#[cfg(feature = "gpu")] mod utils; -#[cfg(feature = "gpu")] pub use self::utils::*; -#[cfg(feature = "gpu")] mod fft; -#[cfg(feature = "gpu")] pub use self::fft::*; -//#[cfg(feature = "gpu")] -//mod multiexp; - -//#[cfg(feature = "gpu")] -//pub use self::multiexp::*; - #[cfg(not(feature = "gpu"))] mod nogpu; diff --git a/halo2_proofs/src/gpu/sources.rs b/halo2_proofs/src/gpu/sources.rs index adef02d193..062362580b 100644 --- a/halo2_proofs/src/gpu/sources.rs +++ b/halo2_proofs/src/gpu/sources.rs @@ -1,4 +1,3 @@ -//use group::Group; use crate::arithmetic::Group; use ff_cl_gen as ffgen; use pairing::bn256::{Fr, Fq}; diff --git a/halo2_proofs/src/gpu/utils.rs b/halo2_proofs/src/gpu/utils.rs index b0e4bf83e5..7183d85a46 100644 --- a/halo2_proofs/src/gpu/utils.rs +++ b/halo2_proofs/src/gpu/utils.rs @@ -80,7 +80,6 @@ pub fn dump_device_list() { } } -#[cfg(feature = "gpu")] #[test] pub fn test_list_devices() { let _ = env_logger::try_init(); diff --git a/halo2_proofs/src/lib.rs b/halo2_proofs/src/lib.rs index 920a3c38fd..6c00c0ec91 100644 --- a/halo2_proofs/src/lib.rs +++ b/halo2_proofs/src/lib.rs @@ -32,11 +32,13 @@ pub mod worker { pub use super::multicore::*; } pub mod plonk; +#[cfg(feature = "gpu")] pub mod gpu; pub mod poly; pub mod transcript; pub mod dev; mod helpers; +#[cfg(feature = "gpu")] #[macro_use] extern crate lazy_static; From 135f64c69f9a34b2291670eb1bc24ef32ab71d00 Mon Sep 17 00:00:00 2001 From: huyanlin Date: Thu, 23 Jun 2022 15:19:36 +0800 Subject: [PATCH 08/13] add some doc comments --- halo2_proofs/src/gpu/error.rs | 7 +++++++ halo2_proofs/src/gpu/fft.rs | 5 +++++ halo2_proofs/src/gpu/locks.rs | 10 +++++++++- halo2_proofs/src/gpu/sources.rs | 2 +- halo2_proofs/src/gpu/utils.rs | 2 ++ halo2_proofs/src/lib.rs | 14 +++++++++----- halo2_proofs/src/multicore.rs | 21 +++++++++++++++++++-- halo2_proofs/src/poly/domain.rs | 13 +++++++------ 8 files changed, 59 insertions(+), 15 deletions(-) diff --git a/halo2_proofs/src/gpu/error.rs b/halo2_proofs/src/gpu/error.rs index 50a4316bea..44a6c9e79e 100644 --- a/halo2_proofs/src/gpu/error.rs +++ b/halo2_proofs/src/gpu/error.rs @@ -1,23 +1,30 @@ use rust_gpu_tools::opencl; +/// Gpu err #[derive(thiserror::Error, Debug)] pub enum GPUError { + /// Simple #[error("GPUError: {0}")] Simple(&'static str), + /// OpenCL #[error("OpenCL Error: {0}")] OpenCL(#[from] opencl::GPUError), + /// GPUTaken #[error("GPU taken by a high priority process!")] GPUTaken, + /// KernelUninitialized #[error("No kernel is initialized!")] KernelUninitialized, + /// GPUDisabled #[error("GPU accelerator is disabled!")] GPUDisabled, } +/// Type GPUResult pub type GPUResult = std::result::Result; impl From> for GPUError { diff --git a/halo2_proofs/src/gpu/fft.rs b/halo2_proofs/src/gpu/fft.rs index 25d3f80921..d8c60945db 100644 --- a/halo2_proofs/src/gpu/fft.rs +++ b/halo2_proofs/src/gpu/fft.rs @@ -16,6 +16,7 @@ const LOG2_MAX_ELEMENTS: usize = 32; // At most 2^32 elements is supported. 
const MAX_LOG2_RADIX: u32 = 9; // Radix512 const MAX_LOG2_LOCAL_WORK_SIZE: u32 = 8; // 256 +/// SingleFFTKernel pub struct SingleFFTKernel where G: Group, @@ -30,6 +31,7 @@ impl SingleFFTKernel where G: Group, { + /// New gpu fft kernel device pub fn create(device: opencl::Device, priority: bool) -> GPUResult> { let src = sources::kernel::(device.brand() == opencl::Brand::Nvidia); @@ -140,6 +142,7 @@ where } } +/// Gpu fft kernel vec pub struct MultiFFTKernel where G: Group, @@ -152,6 +155,7 @@ impl MultiFFTKernel where G: Group, { + /// New gpu kernel device pub fn create(priority: bool) -> GPUResult> { let mut all_devices = opencl::Device::all(); let all_num = all_devices.len(); @@ -183,6 +187,7 @@ where }) } + /// fft_multiple call for kernel radix_fft pub fn fft_multiple( &mut self, polys: &mut [&mut [G::Scalar]], diff --git a/halo2_proofs/src/gpu/locks.rs b/halo2_proofs/src/gpu/locks.rs index 0f460136f7..9e6a3cd780 100644 --- a/halo2_proofs/src/gpu/locks.rs +++ b/halo2_proofs/src/gpu/locks.rs @@ -12,6 +12,7 @@ fn tmp_path(filename: &str) -> PathBuf { p } +/// Get_lock_name_and_gpu_range pub fn get_lock_name_and_gpu_range(all_gpus: usize) -> (String, Range) { let mut num_gpus_per_group = all_gpus; let mut group_index = 0usize; @@ -46,6 +47,7 @@ pub fn get_lock_name_and_gpu_range(all_gpus: usize) -> (String, Range) { #[derive(Debug)] pub struct GPULock(File); impl GPULock { + /// GPULock pub fn lock(index: String) -> GPULock { let filename = GPU_LOCK_NAME.to_string() + &index; let gpu_lock_file = tmp_path(&filename); @@ -71,6 +73,7 @@ impl Drop for GPULock { #[derive(Debug)] pub struct PriorityLock(File); impl PriorityLock { + /// PriorityLock pub fn lock() -> PriorityLock { let priority_lock_file = tmp_path(PRIORITY_LOCK_NAME); debug!("Acquiring priority lock at {:?} ...", &priority_lock_file); @@ -84,6 +87,7 @@ impl PriorityLock { debug!("Priority lock acquired!"); PriorityLock(f) } + /// PriorityLock wait pub fn wait(priority: bool) { if !priority { File::create(tmp_path(PRIORITY_LOCK_NAME)) @@ -92,6 +96,7 @@ impl PriorityLock { .unwrap(); } } + /// PriorityLock break pub fn should_break(priority: bool) -> bool { !priority && File::create(tmp_path(PRIORITY_LOCK_NAME)) @@ -115,6 +120,7 @@ use std::ops::Range; macro_rules! locked_kernel { ($class:ident, $kern:ident, $func:ident, $name:expr) => { + /// gpu fft locked kernel pub struct $class where G: Group, @@ -128,6 +134,7 @@ macro_rules! locked_kernel { where G: Group, { + /// New gpu kernel macro pub fn new(log_d: usize, priority: bool) -> $class { $class:: { log_d, @@ -152,7 +159,8 @@ macro_rules! locked_kernel { ); } } - + + /// Gpu kernel config pub fn with(&mut self, mut f: F) -> GPUResult where F: FnMut(&mut $kern) -> GPUResult, diff --git a/halo2_proofs/src/gpu/sources.rs b/halo2_proofs/src/gpu/sources.rs index 062362580b..74da5317eb 100644 --- a/halo2_proofs/src/gpu/sources.rs +++ b/halo2_proofs/src/gpu/sources.rs @@ -39,7 +39,7 @@ fn multiexp(point: &str, exp: &str) -> String { .replace("EXPONENT", exp) } -// WARNING: This function works only with Short Weierstrass Jacobian curves with Fq2 extension field. +/// WARNING: This function works only with Short Weierstrass Jacobian curves with Fq2 extension field. pub fn kernel(limb64: bool) -> String where G: Group, diff --git a/halo2_proofs/src/gpu/utils.rs b/halo2_proofs/src/gpu/utils.rs index 7183d85a46..701f252f53 100644 --- a/halo2_proofs/src/gpu/utils.rs +++ b/halo2_proofs/src/gpu/utils.rs @@ -57,6 +57,7 @@ lazy_static::lazy_static! 
{ } const DEFAULT_CORE_COUNT: usize = 2560; +/// Get core number pub fn get_core_count(d: &opencl::Device) -> usize { let name = d.name(); match CORE_COUNTS.get(&name[..]) { @@ -74,6 +75,7 @@ pub fn get_core_count(d: &opencl::Device) -> usize { } } +/// Dump gpu device pub fn dump_device_list() { for d in opencl::Device::all() { info!("Device: {:?}", d); diff --git a/halo2_proofs/src/lib.rs b/halo2_proofs/src/lib.rs index 6c00c0ec91..088221dcc0 100644 --- a/halo2_proofs/src/lib.rs +++ b/halo2_proofs/src/lib.rs @@ -18,8 +18,8 @@ clippy::upper_case_acronyms )] #![deny(broken_intra_doc_links)] -//#![deny(missing_debug_implementations)] -//#![deny(missing_docs)] +// #![deny(missing_debug_implementations)] +// #![deny(missing_docs)] #![deny(unsafe_code)] // Remove this once we update pasta_curves #![allow(unused_imports)] @@ -28,12 +28,16 @@ pub mod arithmetic; pub mod circuit; pub use pairing; mod multicore; -pub mod worker { - pub use super::multicore::*; -} pub mod plonk; + +/// Gpu fft #[cfg(feature = "gpu")] pub mod gpu; +/// Gpu worker thread +pub mod worker { + pub use super::multicore::*; +} + pub mod poly; pub mod transcript; diff --git a/halo2_proofs/src/multicore.rs b/halo2_proofs/src/multicore.rs index 1e379506d0..27eb3c8842 100644 --- a/halo2_proofs/src/multicore.rs +++ b/halo2_proofs/src/multicore.rs @@ -13,8 +13,12 @@ use std::sync::atomic::{AtomicUsize, Ordering}; static WORKER_SPAWN_COUNTER: AtomicUsize = AtomicUsize::new(0); -#[deny(missing_docs)] +/// Threadpool and number_cpus +#[cfg(feature = "gpu")] +#[allow(unused_doc_comments)] +#[allow(missing_docs)] lazy_static! { + // THREAD_POOL and NUM_CPUS static ref NUM_CPUS: usize = if let Ok(num) = env::var("BELLMAN_NUM_CPUS") { if let Ok(num) = num.parse() { num @@ -26,28 +30,35 @@ lazy_static! { }; // See Worker::compute below for a description of this. static ref WORKER_SPAWN_MAX_COUNT: usize = *NUM_CPUS * 4; + // THREAD_POOL pub static ref THREAD_POOL: rayon::ThreadPool = rayon::ThreadPoolBuilder::new() .num_threads(*NUM_CPUS) .build() .unwrap(); } +/// Thread pool #[derive(Clone)] +#[derive(Debug)] pub struct Worker {} impl Worker { + /// New worker pub fn new() -> Worker { Worker {} } + /// Get_num_cpus pub fn get_num_cpus(&self) -> usize { *NUM_CPUS } + /// Log_num_cpus pub fn log_num_cpus(&self) -> u32 { log2_floor(*NUM_CPUS) } + /// Compute pub fn compute(&self, f: F) -> Waiter where F: FnOnce() -> R + Send + 'static, @@ -97,6 +108,7 @@ impl Worker { Waiter { receiver } } + /// Scope pub fn scope<'a, F, R>(&self, elements: usize, f: F) -> R where F: FnOnce(&rayon::Scope<'a>, usize) -> R + Send, @@ -107,6 +119,7 @@ impl Worker { THREAD_POOL.scope(|scope| f(scope, chunk_size)) } + /// In_place_scope pub fn in_place_scope<'a, F, R>(&self, elements: usize, f: F) -> R where F: FnOnce(&rayon::Scope<'a>, usize) -> R, @@ -116,6 +129,7 @@ impl Worker { THREAD_POOL.in_place_scope(|scope| f(scope, chunk_size)) } + /// Get_chunk_size pub fn get_chunk_size(&self, elements: usize) -> usize { let chunk_size = if elements <= *NUM_CPUS { 1 @@ -125,7 +139,7 @@ impl Worker { chunk_size } - // TODO: check +1? 
+ /// Chunk_size_for_num_spawned_threads pub fn chunk_size_for_num_spawned_threads(elements: usize, num_threads: usize) -> usize { assert!( elements >= num_threads, @@ -140,6 +154,7 @@ impl Worker { } } + /// Get_num_spawned_threads pub fn get_num_spawned_threads(&self, elements: usize) -> usize { let num_spawned = if elements <= *NUM_CPUS { elements @@ -158,6 +173,8 @@ impl Worker { } } +/// Waiter for receiving +#[derive(Debug)] pub struct Waiter { receiver: Receiver, } diff --git a/halo2_proofs/src/poly/domain.rs b/halo2_proofs/src/poly/domain.rs index 134d10d1d6..ba906aa7a4 100644 --- a/halo2_proofs/src/poly/domain.rs +++ b/halo2_proofs/src/poly/domain.rs @@ -490,6 +490,8 @@ pub struct PinnedEvaluationDomain<'a, G: Group> { omega: &'a G::Scalar, } +/// Config gpu fft kernel +#[cfg(feature = "gpu")] pub fn create_fft_kernel(_log_d: usize, priority: bool) -> Option> where G: Group, @@ -506,11 +508,11 @@ where } } -use crate::worker::Worker; +/// Wrap `gpu_fft_multiple` +#[cfg(feature = "gpu")] pub fn best_fft_multiple_gpu( kern: &mut Option>, polys: &mut [&mut [G::Scalar]], - worker: &Worker, omega: &G::Scalar, log_n: u32, ) -> gpu::GPUResult<()> { @@ -526,6 +528,8 @@ pub fn best_fft_multiple_gpu( Ok(()) } +/// Use multiple gpu fft +#[cfg(feature = "gpu")] pub fn gpu_fft_multiple( kern: &mut gpu::MultiFFTKernel, polys: &mut [&mut [G::Scalar]], @@ -537,14 +541,12 @@ pub fn gpu_fft_multiple( Ok(()) } +#[cfg(feature = "gpu")] #[test] fn test_best_fft_multiple_gpu() { use crate::gpu::LockedMultiFFTKernel; use crate::pairing::bn256::Fr; use pairing::bn256::Bn256; - use crate::worker::Worker; - - let worker = Worker::new(); use crate::poly::{EvaluationDomain}; use ark_std::{end_timer, start_timer}; @@ -577,7 +579,6 @@ fn test_best_fft_multiple_gpu() { best_fft_multiple_gpu( &mut fft_kern, &mut [&mut optimized_fft_coeffs], - &worker, &domain.get_omega(), k as u32, ) From f5931aca110fb8d9a82ec9777395bf154d2ccfff Mon Sep 17 00:00:00 2001 From: huyanlin Date: Thu, 23 Jun 2022 15:45:38 +0800 Subject: [PATCH 09/13] fix doc comment typo --- halo2_proofs/src/lib.rs | 2 +- halo2_proofs/src/multicore.rs | 12 ++++-------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/halo2_proofs/src/lib.rs b/halo2_proofs/src/lib.rs index 088221dcc0..eb9412154f 100644 --- a/halo2_proofs/src/lib.rs +++ b/halo2_proofs/src/lib.rs @@ -19,7 +19,7 @@ )] #![deny(broken_intra_doc_links)] // #![deny(missing_debug_implementations)] -// #![deny(missing_docs)] +#![deny(missing_docs)] #![deny(unsafe_code)] // Remove this once we update pasta_curves #![allow(unused_imports)] diff --git a/halo2_proofs/src/multicore.rs b/halo2_proofs/src/multicore.rs index 27eb3c8842..ab4b4b3120 100644 --- a/halo2_proofs/src/multicore.rs +++ b/halo2_proofs/src/multicore.rs @@ -13,12 +13,9 @@ use std::sync::atomic::{AtomicUsize, Ordering}; static WORKER_SPAWN_COUNTER: AtomicUsize = AtomicUsize::new(0); -/// Threadpool and number_cpus #[cfg(feature = "gpu")] -#[allow(unused_doc_comments)] -#[allow(missing_docs)] lazy_static! { - // THREAD_POOL and NUM_CPUS + /// THREAD_POOL and NUM_CPUS static ref NUM_CPUS: usize = if let Ok(num) = env::var("BELLMAN_NUM_CPUS") { if let Ok(num) = num.parse() { num @@ -28,9 +25,9 @@ lazy_static! { } else { num_cpus::get() }; - // See Worker::compute below for a description of this. + /// See Worker::compute below for a description of this. 
static ref WORKER_SPAWN_MAX_COUNT: usize = *NUM_CPUS * 4; - // THREAD_POOL + /// THREAD_POOL pub static ref THREAD_POOL: rayon::ThreadPool = rayon::ThreadPoolBuilder::new() .num_threads(*NUM_CPUS) .build() @@ -38,8 +35,7 @@ lazy_static! { } /// Thread pool -#[derive(Clone)] -#[derive(Debug)] +#[derive(Clone, Debug)] pub struct Worker {} impl Worker { From 8c37bf9d7d32ade5b9b494d569349b8f8d526262 Mon Sep 17 00:00:00 2001 From: huyanlin Date: Thu, 23 Jun 2022 20:08:31 +0800 Subject: [PATCH 10/13] open lint check --- halo2_proofs/src/gpu/fft.rs | 3 +++ halo2_proofs/src/gpu/locks.rs | 1 + halo2_proofs/src/lib.rs | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/halo2_proofs/src/gpu/fft.rs b/halo2_proofs/src/gpu/fft.rs index d8c60945db..21b86502f4 100644 --- a/halo2_proofs/src/gpu/fft.rs +++ b/halo2_proofs/src/gpu/fft.rs @@ -17,6 +17,7 @@ const MAX_LOG2_RADIX: u32 = 9; // Radix512 const MAX_LOG2_LOCAL_WORK_SIZE: u32 = 8; // 256 /// SingleFFTKernel +#[allow(missing_debug_implementations)] pub struct SingleFFTKernel where G: Group, @@ -24,6 +25,7 @@ where program: opencl::Program, pq_buffer: opencl::Buffer, omegas_buffer: opencl::Buffer, + #[allow(dead_code)] priority: bool, } @@ -143,6 +145,7 @@ where } /// Gpu fft kernel vec +#[allow(missing_debug_implementations)] pub struct MultiFFTKernel where G: Group, diff --git a/halo2_proofs/src/gpu/locks.rs b/halo2_proofs/src/gpu/locks.rs index 9e6a3cd780..8a60c3b63f 100644 --- a/halo2_proofs/src/gpu/locks.rs +++ b/halo2_proofs/src/gpu/locks.rs @@ -121,6 +121,7 @@ use std::ops::Range; macro_rules! locked_kernel { ($class:ident, $kern:ident, $func:ident, $name:expr) => { /// gpu fft locked kernel + #[allow(missing_debug_implementations)] pub struct $class where G: Group, diff --git a/halo2_proofs/src/lib.rs b/halo2_proofs/src/lib.rs index eb9412154f..7ac27bd758 100644 --- a/halo2_proofs/src/lib.rs +++ b/halo2_proofs/src/lib.rs @@ -18,7 +18,7 @@ clippy::upper_case_acronyms )] #![deny(broken_intra_doc_links)] -// #![deny(missing_debug_implementations)] +#![deny(missing_debug_implementations)] #![deny(missing_docs)] #![deny(unsafe_code)] // Remove this once we update pasta_curves From 2125c6c13e0f515605b4f85fb74455010f76c056 Mon Sep 17 00:00:00 2001 From: huyanlin Date: Thu, 23 Jun 2022 22:51:02 +0800 Subject: [PATCH 11/13] set gpu acceleration as optional feature --- halo2_proofs/Cargo.toml | 4 ++-- halo2_proofs/src/multicore.rs | 8 ++++++++ halo2_proofs/src/poly/domain.rs | 1 + 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/halo2_proofs/Cargo.toml b/halo2_proofs/Cargo.toml index 15770eb30e..902f100588 100644 --- a/halo2_proofs/Cargo.toml +++ b/halo2_proofs/Cargo.toml @@ -77,12 +77,12 @@ rand_core = { version = "0.6", default-features = false, features = ["getrandom" getrandom = { version = "0.2", features = ["js"] } [features] -# todo: set gpu as optional feature -default = ["shplonk", "gpu"] +default = ["shplonk"] dev-graph = ["plotters", "tabbycat"] gadget-traces = ["backtrace"] sanity-checks = [] shplonk = [] +# set gpu acceleration as optional feature gpu = ["rust-gpu-tools", "ff-cl-gen", "fs2", "lazy_static", "crossbeam", "futures/thread-pool"] gwc = [] diff --git a/halo2_proofs/src/multicore.rs b/halo2_proofs/src/multicore.rs index ab4b4b3120..e2cf25071a 100644 --- a/halo2_proofs/src/multicore.rs +++ b/halo2_proofs/src/multicore.rs @@ -5,12 +5,14 @@ pub use rayon::{current_num_threads, scope, Scope}; use crossbeam_channel::{bounded, Receiver}; +#[cfg(feature = "gpu")] use lazy_static::lazy_static; use log::{error, 
trace}; use std::env; use std::sync::atomic::{AtomicUsize, Ordering}; +#[cfg(feature = "gpu")] static WORKER_SPAWN_COUNTER: AtomicUsize = AtomicUsize::new(0); #[cfg(feature = "gpu")] @@ -35,9 +37,11 @@ lazy_static! { } /// Thread pool +#[cfg(feature = "gpu")] #[derive(Clone, Debug)] pub struct Worker {} +#[cfg(feature = "gpu")] impl Worker { /// New worker pub fn new() -> Worker { @@ -170,11 +174,13 @@ impl Worker { } /// Waiter for receiving +#[cfg(feature = "gpu")] #[derive(Debug)] pub struct Waiter { receiver: Receiver, } +#[cfg(feature = "gpu")] impl Waiter { /// Wait for the result. pub fn wait(&self) -> T { @@ -195,6 +201,7 @@ impl Waiter { } } +#[cfg(feature = "gpu")] fn log2_floor(num: usize) -> u32 { assert!(num > 0); @@ -207,6 +214,7 @@ fn log2_floor(num: usize) -> u32 { pow } +#[cfg(feature = "gpu")] #[cfg(test)] mod tests { use super::*; diff --git a/halo2_proofs/src/poly/domain.rs b/halo2_proofs/src/poly/domain.rs index ba906aa7a4..93df085c82 100644 --- a/halo2_proofs/src/poly/domain.rs +++ b/halo2_proofs/src/poly/domain.rs @@ -14,6 +14,7 @@ use group::Group as CryptGroup; use std::marker::PhantomData; +#[cfg(feature = "gpu")] use crate::gpu; use log::{info, warn}; From 6c1bc46a4101a91701e1a60d7f64ec2ecc6a41f5 Mon Sep 17 00:00:00 2001 From: huyanlin Date: Thu, 23 Jun 2022 23:32:33 +0800 Subject: [PATCH 12/13] code fmt --- halo2_proofs/src/gpu/fft.rs | 13 +++++++++---- halo2_proofs/src/gpu/locks.rs | 6 +++--- halo2_proofs/src/gpu/sources.rs | 2 +- halo2_proofs/src/multicore.rs | 2 +- halo2_proofs/src/poly/domain.rs | 4 ++-- 5 files changed, 16 insertions(+), 11 deletions(-) diff --git a/halo2_proofs/src/gpu/fft.rs b/halo2_proofs/src/gpu/fft.rs index 21b86502f4..09feff1b9f 100644 --- a/halo2_proofs/src/gpu/fft.rs +++ b/halo2_proofs/src/gpu/fft.rs @@ -1,16 +1,16 @@ +use crate::arithmetic::Group; use crate::gpu::{ error::{GPUError, GPUResult}, get_lock_name_and_gpu_range, locks, sources, }; -use group::ff::Field; -use crate::arithmetic::Group; use crate::worker::THREAD_POOL; +use group::ff::Field; use log::{error, info}; use rayon::join; use rust_gpu_tools::*; use std::cmp::min; -use std::{cmp, env}; use std::ops::MulAssign; +use std::{cmp, env}; const LOG2_MAX_ELEMENTS: usize = 32; // At most 2^32 elements is supported. const MAX_LOG2_RADIX: u32 = 9; // Radix512 @@ -121,7 +121,12 @@ where /// Performs FFT on `a` /// * `omega` - Special value `omega` is used for FFT over finite-fields /// * `log_n` - Specifies log2 of number of elements - pub fn radix_fft(&mut self, a: &mut [G::Scalar], omega: &G::Scalar, log_n: u32) -> GPUResult<()> { + pub fn radix_fft( + &mut self, + a: &mut [G::Scalar], + omega: &G::Scalar, + log_n: u32, + ) -> GPUResult<()> { let n = 1 << log_n; let mut src_buffer = self.program.create_buffer::(n)?; let mut dst_buffer = self.program.create_buffer::(n)?; diff --git a/halo2_proofs/src/gpu/locks.rs b/halo2_proofs/src/gpu/locks.rs index 8a60c3b63f..77a8d46ee2 100644 --- a/halo2_proofs/src/gpu/locks.rs +++ b/halo2_proofs/src/gpu/locks.rs @@ -1,8 +1,8 @@ +use crate::arithmetic::Group; use fs2::FileExt; use log::{debug, info, warn}; use std::fs::File; use std::path::PathBuf; -use crate::arithmetic::Group; const GPU_LOCK_NAME: &str = "bellman.gpu.lock"; const PRIORITY_LOCK_NAME: &str = "bellman.priority.lock"; @@ -160,7 +160,7 @@ macro_rules! 
locked_kernel { ); } } - + /// Gpu kernel config pub fn with(&mut self, mut f: F) -> GPUResult where @@ -201,4 +201,4 @@ locked_kernel!( MultiFFTKernel, create_fft_kernel, "FFT" -); \ No newline at end of file +); diff --git a/halo2_proofs/src/gpu/sources.rs b/halo2_proofs/src/gpu/sources.rs index 74da5317eb..11dc3144a2 100644 --- a/halo2_proofs/src/gpu/sources.rs +++ b/halo2_proofs/src/gpu/sources.rs @@ -1,6 +1,6 @@ use crate::arithmetic::Group; use ff_cl_gen as ffgen; -use pairing::bn256::{Fr, Fq}; +use pairing::bn256::{Fq, Fr}; // Instead of having a very large OpenCL program written for a specific curve, with a lot of // rudandant codes (As OpenCL doesn't have generic types or templates), this module will dynamically diff --git a/halo2_proofs/src/multicore.rs b/halo2_proofs/src/multicore.rs index e2cf25071a..8654191021 100644 --- a/halo2_proofs/src/multicore.rs +++ b/halo2_proofs/src/multicore.rs @@ -58,7 +58,7 @@ impl Worker { log2_floor(*NUM_CPUS) } - /// Compute + /// Compute pub fn compute(&self, f: F) -> Waiter where F: FnOnce() -> R + Send + 'static, diff --git a/halo2_proofs/src/poly/domain.rs b/halo2_proofs/src/poly/domain.rs index 93df085c82..3018b4614a 100644 --- a/halo2_proofs/src/poly/domain.rs +++ b/halo2_proofs/src/poly/domain.rs @@ -549,7 +549,7 @@ fn test_best_fft_multiple_gpu() { use crate::pairing::bn256::Fr; use pairing::bn256::Bn256; - use crate::poly::{EvaluationDomain}; + use crate::poly::EvaluationDomain; use ark_std::{end_timer, start_timer}; use rand_core::OsRng; @@ -594,7 +594,7 @@ fn test_best_fft_multiple_gpu() { #[test] fn test_fft() { - use crate::poly::{EvaluationDomain}; + use crate::poly::EvaluationDomain; use ark_std::{end_timer, start_timer}; use pairing::bn256::Fr; use rand_core::OsRng; From 407c46462851ce84a7a824d988c2f3a33e08bbd4 Mon Sep 17 00:00:00 2001 From: huyanlin Date: Mon, 27 Jun 2022 15:14:52 +0800 Subject: [PATCH 13/13] take params coeffs as measurement --- halo2_proofs/src/poly/domain.rs | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/halo2_proofs/src/poly/domain.rs b/halo2_proofs/src/poly/domain.rs index 3018b4614a..ca1e2e7ce0 100644 --- a/halo2_proofs/src/poly/domain.rs +++ b/halo2_proofs/src/poly/domain.rs @@ -562,19 +562,18 @@ fn test_best_fft_multiple_gpu() { // evaluation domain let domain: EvaluationDomain = EvaluationDomain::new(1, k); - let mut prev_fft_coeffs = coeffs.clone(); - - let mut optimized_fft_coeffs = coeffs.clone(); - let message = format!("prev_fft degree {}", k); let start = start_timer!(|| message); + + let mut prev_fft_coeffs = coeffs.clone(); best_fft(&mut prev_fft_coeffs, domain.get_omega(), k); + end_timer!(start); - //println!("coeffs cpu {:?}\n",prev_fft_coeffs); let message = format!("gpu_fft degree {}", k); let start = start_timer!(|| message); + let mut optimized_fft_coeffs = coeffs.clone(); let mut fft_kern = Some(LockedMultiFFTKernel::::new(k as usize, false)); best_fft_multiple_gpu( @@ -587,7 +586,6 @@ fn test_best_fft_multiple_gpu() { end_timer!(start); - //println!("coeffs gpu {:?}\n",optimized_fft_coeffs); assert_eq!(prev_fft_coeffs, optimized_fft_coeffs); } } @@ -608,11 +606,12 @@ fn test_fft() { // evaluation domain let domain: EvaluationDomain = EvaluationDomain::new(1, k); - let mut prev_fft_coeffs = coeffs.clone(); - let message = format!("prev_fft degree {}", k); let start = start_timer!(|| message); + + let mut prev_fft_coeffs = coeffs.clone(); best_fft(&mut prev_fft_coeffs, domain.get_omega(), k); + end_timer!(start); } }
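
Illustrative usage sketch (not part of the patch series): with the `gpu` feature now opt-in (e.g. `halo2_proofs = { ..., features = ["gpu"] }` or `cargo test --features gpu`), the multi-GPU FFT entry point added above would be driven roughly as below. This is a minimal sketch written as if it lived inside poly/domain.rs, mirroring test_best_fft_multiple_gpu; the angle-bracketed type parameters, which the diff rendering above drops, are assumed to be the BN256 scalar field Fr.

#[cfg(feature = "gpu")]
fn gpu_fft_sketch() {
    use crate::gpu::LockedMultiFFTKernel;
    use crate::poly::EvaluationDomain;
    use group::ff::Field;
    use pairing::bn256::Fr;
    use rand_core::OsRng;

    let k: u32 = 20;
    // 2^k random coefficients to transform in place.
    let mut coeffs: Vec<Fr> = (0..(1 << k)).map(|_| Fr::random(OsRng)).collect();
    let domain: EvaluationDomain<Fr> = EvaluationDomain::new(1, k);

    // Lazily acquire the file-locked multi-GPU kernel (priority = false).
    let mut fft_kern = Some(LockedMultiFFTKernel::<Fr>::new(k as usize, false));

    // Batch FFT over every polynomial slice passed in; here just one.
    best_fft_multiple_gpu(&mut fft_kern, &mut [coeffs.as_mut_slice()], &domain.get_omega(), k)
        .unwrap();
}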