From 885188fae6f911d9a6c2f52c3ec223b85c7644c1 Mon Sep 17 00:00:00 2001 From: huyanlin Date: Tue, 14 Jun 2022 18:00:31 +0800 Subject: [PATCH 01/13] gpu acceleration for fft --- halo2_proofs/Cargo.toml | 18 ++- halo2_proofs/src/lib.rs | 10 +- halo2_proofs/src/multicore.rs | 205 ++++++++++++++++++++++++++++++++ halo2_proofs/src/poly.rs | 2 +- halo2_proofs/src/poly/domain.rs | 128 ++++++++++++++++++++ 5 files changed, 359 insertions(+), 4 deletions(-) diff --git a/halo2_proofs/Cargo.toml b/halo2_proofs/Cargo.toml index a0ea8a1cb0..80be358855 100644 --- a/halo2_proofs/Cargo.toml +++ b/halo2_proofs/Cargo.toml @@ -39,6 +39,21 @@ blake2b_simd = "1" pairing = { git = 'https://github.com/appliedzkp/pairing', package = "pairing_bn256", "tag" = "v0.1.1"} subtle = "2.3" cfg-if = "0.1" +ark-std = { version = "0.3", features = ["print-trace"] } + +log = "0.4.8" +thiserror = "1.0.10" +crossbeam-channel = "0.5.0" +crossbeam = {version = "0.7", optional = true} +lazy_static = {version = "1", optional = true} +futures = {package = "futures", version = "0.3", default_features = false, features = ["executor"]} +num_cpus = "1" +env_logger = "0.8.1" + +# gpu feature +rust-gpu-tools = { version = "0.3.0", optional = true } +ff-cl-gen = { path = "../../ff-cl-gen", version = "0.2.0", optional = true } +fs2 = { version = "0.4.3", optional = true } # Developer tooling dependencies plotters = { version = "0.3.0", optional = true } @@ -62,11 +77,12 @@ rand_core = { version = "0.6", default-features = false, features = ["getrandom" getrandom = { version = "0.2", features = ["js"] } [features] -default = ["shplonk"] +default = ["shplonk", "gpu"] dev-graph = ["plotters", "tabbycat"] gadget-traces = ["backtrace"] sanity-checks = [] shplonk = [] +gpu = ["rust-gpu-tools", "ff-cl-gen", "fs2", "lazy_static", "crossbeam", "futures/thread-pool"] gwc = [] [lib] diff --git a/halo2_proofs/src/lib.rs b/halo2_proofs/src/lib.rs index 2623a93833..920a3c38fd 100644 --- a/halo2_proofs/src/lib.rs +++ b/halo2_proofs/src/lib.rs @@ -18,8 +18,8 @@ clippy::upper_case_acronyms )] #![deny(broken_intra_doc_links)] -#![deny(missing_debug_implementations)] -#![deny(missing_docs)] +//#![deny(missing_debug_implementations)] +//#![deny(missing_docs)] #![deny(unsafe_code)] // Remove this once we update pasta_curves #![allow(unused_imports)] @@ -28,9 +28,15 @@ pub mod arithmetic; pub mod circuit; pub use pairing; mod multicore; +pub mod worker { + pub use super::multicore::*; +} pub mod plonk; +pub mod gpu; pub mod poly; pub mod transcript; pub mod dev; mod helpers; +#[macro_use] +extern crate lazy_static; diff --git a/halo2_proofs/src/multicore.rs b/halo2_proofs/src/multicore.rs index a22eac9e79..1e379506d0 100644 --- a/halo2_proofs/src/multicore.rs +++ b/halo2_proofs/src/multicore.rs @@ -3,3 +3,208 @@ //! be extended in the future to allow for various parallelism strategies. pub use rayon::{current_num_threads, scope, Scope}; + +use crossbeam_channel::{bounded, Receiver}; +use lazy_static::lazy_static; +use log::{error, trace}; +use std::env; + +use std::sync::atomic::{AtomicUsize, Ordering}; + +static WORKER_SPAWN_COUNTER: AtomicUsize = AtomicUsize::new(0); + +#[deny(missing_docs)] +lazy_static! { + static ref NUM_CPUS: usize = if let Ok(num) = env::var("BELLMAN_NUM_CPUS") { + if let Ok(num) = num.parse() { + num + } else { + num_cpus::get() + } + } else { + num_cpus::get() + }; + // See Worker::compute below for a description of this. 
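+    // WORKER_SPAWN_MAX_COUNT caps how many closures may be queued through `compute`
+    // before new work is executed inline via `THREAD_POOL.install` instead of `spawn`.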
+ static ref WORKER_SPAWN_MAX_COUNT: usize = *NUM_CPUS * 4; + pub static ref THREAD_POOL: rayon::ThreadPool = rayon::ThreadPoolBuilder::new() + .num_threads(*NUM_CPUS) + .build() + .unwrap(); +} + +#[derive(Clone)] +pub struct Worker {} + +impl Worker { + pub fn new() -> Worker { + Worker {} + } + + pub fn get_num_cpus(&self) -> usize { + *NUM_CPUS + } + + pub fn log_num_cpus(&self) -> u32 { + log2_floor(*NUM_CPUS) + } + + pub fn compute(&self, f: F) -> Waiter + where + F: FnOnce() -> R + Send + 'static, + R: Send + 'static, + { + let (sender, receiver) = bounded(1); + + let thread_index = if THREAD_POOL.current_thread_index().is_some() { + THREAD_POOL.current_thread_index().unwrap() + } else { + 0 + }; + + // We keep track here of how many times spawn has been called. + // It can be called without limit, each time, putting a + // request for a new thread to execute a method on the + // ThreadPool. However, if we allow it to be called without + // limits, we run the risk of memory exhaustion due to limited + // stack space consumed by all of the pending closures to be + // executed. + let previous_count = WORKER_SPAWN_COUNTER.fetch_add(1, Ordering::SeqCst); + + // If the number of spawns requested has exceeded the number + // of cores available for processing by some factor (the + // default being 4), instead of requesting that we spawn a new + // thread, we instead execute the closure in the context of an + // install call to help clear the growing work queue and + // minimize the chances of memory exhaustion. + if previous_count > *WORKER_SPAWN_MAX_COUNT { + THREAD_POOL.install(move || { + trace!("[{}] switching to install to help clear backlog[current threads {}, threads requested {}]", + thread_index, + THREAD_POOL.current_num_threads(), + WORKER_SPAWN_COUNTER.load(Ordering::SeqCst)); + let res = f(); + sender.send(res).unwrap(); + WORKER_SPAWN_COUNTER.fetch_sub(1, Ordering::SeqCst); + }); + } else { + THREAD_POOL.spawn(move || { + let res = f(); + sender.send(res).unwrap(); + WORKER_SPAWN_COUNTER.fetch_sub(1, Ordering::SeqCst); + }); + } + + Waiter { receiver } + } + + pub fn scope<'a, F, R>(&self, elements: usize, f: F) -> R + where + F: FnOnce(&rayon::Scope<'a>, usize) -> R + Send, + R: Send, + { + let chunk_size = self.get_chunk_size(elements); + + THREAD_POOL.scope(|scope| f(scope, chunk_size)) + } + + pub fn in_place_scope<'a, F, R>(&self, elements: usize, f: F) -> R + where + F: FnOnce(&rayon::Scope<'a>, usize) -> R, + { + let chunk_size = self.get_chunk_size(elements); + + THREAD_POOL.in_place_scope(|scope| f(scope, chunk_size)) + } + + pub fn get_chunk_size(&self, elements: usize) -> usize { + let chunk_size = if elements <= *NUM_CPUS { + 1 + } else { + Self::chunk_size_for_num_spawned_threads(elements, *NUM_CPUS) + }; + + chunk_size + } + // TODO: check +1? 
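+    // The `+ 1` below is a ceiling division: chunk_size = ceil(elements / num_threads),
+    // so the final (possibly shorter) chunk still covers the remaining elements.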
+ pub fn chunk_size_for_num_spawned_threads(elements: usize, num_threads: usize) -> usize { + assert!( + elements >= num_threads, + "received {} elements to spawn {} threads", + elements, + num_threads + ); + if elements % num_threads == 0 { + elements / num_threads + } else { + elements / num_threads + 1 + } + } + + pub fn get_num_spawned_threads(&self, elements: usize) -> usize { + let num_spawned = if elements <= *NUM_CPUS { + elements + } else { + let chunk = self.get_chunk_size(elements); + let mut spawned = elements / chunk; + if spawned * chunk < elements { + spawned += 1; + } + assert!(spawned <= 2 * *NUM_CPUS); + + spawned + }; + + num_spawned + } +} + +pub struct Waiter { + receiver: Receiver, +} + +impl Waiter { + /// Wait for the result. + pub fn wait(&self) -> T { + if THREAD_POOL.current_thread_index().is_some() { + // Calling `wait()` from within the worker thread pool can lead to dead logs + error!("The wait call should never be done inside the worker thread pool"); + debug_assert!(false); + } + self.receiver.recv().unwrap() + } + + /// One off sending. + pub fn done(val: T) -> Self { + let (sender, receiver) = bounded(1); + sender.send(val).unwrap(); + + Waiter { receiver } + } +} + +fn log2_floor(num: usize) -> u32 { + assert!(num > 0); + + let mut pow = 0; + + while (1 << (pow + 1)) <= num { + pow += 1; + } + + pow +} + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_log2_floor() { + assert_eq!(log2_floor(1), 0); + assert_eq!(log2_floor(3), 1); + assert_eq!(log2_floor(4), 2); + assert_eq!(log2_floor(5), 2); + assert_eq!(log2_floor(6), 2); + assert_eq!(log2_floor(7), 2); + assert_eq!(log2_floor(8), 3); + } +} diff --git a/halo2_proofs/src/poly.rs b/halo2_proofs/src/poly.rs index 2d913e25bf..16feb6f3eb 100644 --- a/halo2_proofs/src/poly.rs +++ b/halo2_proofs/src/poly.rs @@ -12,7 +12,7 @@ use std::marker::PhantomData; use std::ops::{Add, Deref, DerefMut, Index, IndexMut, Mul, RangeFrom, RangeFull, Sub}; pub mod commitment; -mod domain; +pub mod domain; mod msm; pub mod multiopen; diff --git a/halo2_proofs/src/poly/domain.rs b/halo2_proofs/src/poly/domain.rs index 5204ebef26..de7a74f320 100644 --- a/halo2_proofs/src/poly/domain.rs +++ b/halo2_proofs/src/poly/domain.rs @@ -10,8 +10,13 @@ use super::{Coeff, ExtendedLagrangeCoeff, LagrangeCoeff, Polynomial, Rotation}; use group::ff::{BatchInvert, Field, PrimeField}; +use group::Group as CryptGroup; + use std::marker::PhantomData; +use crate::gpu; +use log::{info, warn}; + /// This structure contains precomputed constants and other details needed for /// performing operations on an evaluation domain of size $2^k$ and an extended /// domain of size $2^{k} * j$ with $j \neq 0$. @@ -485,6 +490,129 @@ pub struct PinnedEvaluationDomain<'a, G: Group> { omega: &'a G::Scalar, } +pub fn create_fft_kernel(_log_d: usize, priority: bool) -> Option> +where + G: Group, +{ + match gpu::MultiFFTKernel::create(priority) { + Ok(k) => { + info!("GPU FFT kernel instantiated!"); + Some(k) + } + Err(e) => { + warn!("Cannot instantiate GPU FFT kernel! 
Error: {}", e); + None + } + } +} + +use crate::worker::Worker; +pub fn best_fft_multiple_gpu( + kern: &mut Option>, + polys: &mut [&mut [G::Scalar]], + worker: &Worker, + omega: &G::Scalar, + log_n: u32, +) -> gpu::GPUResult<()> { + if let Some(ref mut kern) = kern { + if kern + .with(|k: &mut gpu::MultiFFTKernel| gpu_fft_multiple(k, polys, omega, log_n)) + .is_ok() + { + println!("use multiple GPUs"); + return Ok(()); + } + } + Ok(()) +} + +pub fn gpu_fft_multiple( + kern: &mut gpu::MultiFFTKernel, + polys: &mut [&mut [G::Scalar]], + omega: &G::Scalar, + log_n: u32, +) -> gpu::GPUResult<()> { + kern.fft_multiple(polys, omega, log_n)?; + + Ok(()) +} + +#[test] +fn test_best_fft_multiple_gpu() { + use crate::gpu::LockedMultiFFTKernel; + use crate::pairing::bn256::Fr; + use pairing::bn256::Bn256; + use crate::worker::Worker; + + let worker = Worker::new(); + + use crate::poly::{EvaluationDomain}; + use ark_std::{end_timer, start_timer}; + use rand_core::OsRng; + + for k in 1..20 { + let rng = OsRng; + // polynomial degree n = 2^k + let n = 1u64 << k; + // polynomial coeffs + let coeffs: Vec<_> = (0..n).map(|_| Fr::random(rng)).collect(); + // evaluation domain + let domain: EvaluationDomain = EvaluationDomain::new(1, k); + + let mut prev_fft_coeffs = coeffs.clone(); + + let mut optimized_fft_coeffs = coeffs.clone(); + + let message = format!("prev_fft degree {}", k); + let start = start_timer!(|| message); + best_fft(&mut prev_fft_coeffs, domain.get_omega(), k); + end_timer!(start); + + let message = format!("gpu_fft degree {}", k); + let start = start_timer!(|| message); + + let mut fft_kern = Some(LockedMultiFFTKernel::::new(k as usize, false)); + + best_fft_multiple_gpu( + &mut fft_kern, + &mut [&mut optimized_fft_coeffs], + &worker, + &domain.get_omega_inv(), + k as u32, + ) + .unwrap(); + + end_timer!(start); + + assert_eq!(prev_fft_coeffs, optimized_fft_coeffs); + } +} + +#[test] +fn test_fft() { + use crate::poly::{EvaluationDomain}; + use ark_std::{end_timer, start_timer}; + use pairing::bn256::Fr; + use rand_core::OsRng; + + for k in 1..20 { + let rng = OsRng; + // polynomial degree n = 2^k + let n = 1u64 << k; + // polynomial coeffs + let coeffs: Vec<_> = (0..n).map(|_| Fr::random(rng)).collect(); + // evaluation domain + let domain: EvaluationDomain = EvaluationDomain::new(1, k); + + let mut prev_fft_coeffs = coeffs.clone(); + + let message = format!("prev_fft degree {}", k); + let start = start_timer!(|| message); + best_fft(&mut prev_fft_coeffs, domain.get_omega(), k); + end_timer!(start); + } +} + #[test] fn test_rotate() { use rand_core::OsRng; From 94e848fc6fb23786563a0ed1dfcc267483532112 Mon Sep 17 00:00:00 2001 From: huyanlin Date: Tue, 14 Jun 2022 21:46:55 +0800 Subject: [PATCH 02/13] upload gpu fft files --- halo2_proofs/src/gpu/error.rs | 31 ++++ halo2_proofs/src/gpu/fft.rs | 205 ++++++++++++++++++++++ halo2_proofs/src/gpu/fft/fft.cl | 76 ++++++++ halo2_proofs/src/gpu/locks.rs | 205 ++++++++++++++++++++++ halo2_proofs/src/gpu/mod.rs | 39 ++++ halo2_proofs/src/gpu/multiexp/ec.cl | 134 ++++++++++++++ halo2_proofs/src/gpu/multiexp/field2.cl | 61 +++++++ halo2_proofs/src/gpu/multiexp/multiexp.cl | 70 ++++++++ halo2_proofs/src/gpu/nogpu.rs | 92 ++++++++++ halo2_proofs/src/gpu/sources.rs | 66 +++++++ halo2_proofs/src/gpu/utils.rs | 88 ++++++++++ 11 files changed, 1067 insertions(+) create mode 100644 halo2_proofs/src/gpu/error.rs create mode 100644 halo2_proofs/src/gpu/fft.rs create mode 100644 halo2_proofs/src/gpu/fft/fft.cl create mode 100644 halo2_proofs/src/gpu/locks.rs 
create mode 100644 halo2_proofs/src/gpu/mod.rs create mode 100644 halo2_proofs/src/gpu/multiexp/ec.cl create mode 100644 halo2_proofs/src/gpu/multiexp/field2.cl create mode 100644 halo2_proofs/src/gpu/multiexp/multiexp.cl create mode 100644 halo2_proofs/src/gpu/nogpu.rs create mode 100644 halo2_proofs/src/gpu/sources.rs create mode 100644 halo2_proofs/src/gpu/utils.rs diff --git a/halo2_proofs/src/gpu/error.rs b/halo2_proofs/src/gpu/error.rs new file mode 100644 index 0000000000..1d6d38636e --- /dev/null +++ b/halo2_proofs/src/gpu/error.rs @@ -0,0 +1,31 @@ +#[cfg(feature = "gpu")] +use rust_gpu_tools::opencl; + +#[derive(thiserror::Error, Debug)] +pub enum GPUError { + #[error("GPUError: {0}")] + Simple(&'static str), + #[cfg(feature = "gpu")] + #[error("OpenCL Error: {0}")] + OpenCL(#[from] opencl::GPUError), + #[cfg(feature = "gpu")] + #[error("GPU taken by a high priority process!")] + GPUTaken, + #[cfg(feature = "gpu")] + #[error("No kernel is initialized!")] + KernelUninitialized, + #[error("GPU accelerator is disabled!")] + GPUDisabled, +} + +pub type GPUResult = std::result::Result; + +#[cfg(feature = "gpu")] +impl From> for GPUError { + fn from(e: std::boxed::Box) -> Self { + match e.downcast::() { + Ok(err) => *err, + Err(_) => GPUError::Simple("An unknown GPU error happened!"), + } + } +} diff --git a/halo2_proofs/src/gpu/fft.rs b/halo2_proofs/src/gpu/fft.rs new file mode 100644 index 0000000000..d35d9be04a --- /dev/null +++ b/halo2_proofs/src/gpu/fft.rs @@ -0,0 +1,205 @@ +use crate::gpu::{ + error::{GPUError, GPUResult}, + get_lock_name_and_gpu_range, locks, sources, +}; +use group::ff::Field; +//use group::Group; +use crate::arithmetic::Group; +use crate::worker::THREAD_POOL; +use log::{error, info}; +use rayon::join; +use rust_gpu_tools::*; +use std::cmp::min; +use std::{cmp, env}; +use std::ops::MulAssign; + +const LOG2_MAX_ELEMENTS: usize = 32; // At most 2^32 elements is supported. +const MAX_LOG2_RADIX: u32 = 9; // Radix512 +const MAX_LOG2_LOCAL_WORK_SIZE: u32 = 8; // 256 + +pub struct SingleFFTKernel +where + G: Group, +{ + program: opencl::Program, + pq_buffer: opencl::Buffer, + omegas_buffer: opencl::Buffer, + priority: bool, +} + +impl SingleFFTKernel +where + G: Group, +{ + pub fn create(device: opencl::Device, priority: bool) -> GPUResult> { + let src = sources::kernel::(device.brand() == opencl::Brand::Nvidia); + + let program = opencl::Program::from_opencl(device, &src)?; + let pq_buffer = program.create_buffer::(1 << MAX_LOG2_RADIX >> 1)?; + let omegas_buffer = program.create_buffer::(LOG2_MAX_ELEMENTS)?; + + info!("FFT: Device: {}", program.device().name()); + + Ok(SingleFFTKernel { + program, + pq_buffer, + omegas_buffer, + priority, + }) + } + + /// Peforms a FFT round + /// * `log_n` - Specifies log2 of number of elements + /// * `log_p` - Specifies log2 of `p`, (http://www.bealto.com/gpu-fft_group-1.html) + /// * `deg` - 1=>radix2, 2=>radix4, 3=>radix8, ... 
+ /// * `max_deg` - The precalculated values pq` and `omegas` are valid for radix degrees up to `max_deg` + fn radix_fft_round( + &mut self, + src_buffer: &opencl::Buffer, + dst_buffer: &opencl::Buffer, + log_n: u32, + log_p: u32, + deg: u32, + max_deg: u32, + ) -> GPUResult<()> { + // if locks::PriorityLock::should_break(self.priority) { + // return Err(GPUError::GPUTaken); + // } + + let n = 1u32 << log_n; + let local_work_size = 1 << cmp::min(deg - 1, MAX_LOG2_LOCAL_WORK_SIZE); + let global_work_size = (n >> deg) * local_work_size; + let kernel = self.program.create_kernel( + "radix_fft", + global_work_size as usize, + Some(local_work_size as usize), + ); + kernel + .arg(src_buffer) + .arg(dst_buffer) + .arg(&self.pq_buffer) + .arg(&self.omegas_buffer) + .arg(opencl::LocalBuffer::::new(1 << deg)) + .arg(n) + .arg(log_p) + .arg(deg) + .arg(max_deg) + .run()?; + Ok(()) + } + + /// Share some precalculated values between threads to boost the performance + fn setup_pq_omegas(&mut self, omega: &G::Scalar, n: usize, max_deg: u32) -> GPUResult<()> { + // Precalculate: + // [omega^(0/(2^(deg-1))), omega^(1/(2^(deg-1))), ..., omega^((2^(deg-1)-1)/(2^(deg-1)))] + let mut pq = vec![G::Scalar::zero(); 1 << max_deg >> 1]; + let twiddle = omega.pow_vartime([(n >> max_deg) as u64]); + pq[0] = G::Scalar::one(); + if max_deg > 1 { + pq[1] = twiddle; + for i in 2..(1 << max_deg >> 1) { + pq[i] = pq[i - 1]; + pq[i].mul_assign(&twiddle); + } + } + self.pq_buffer.write_from(0, &pq)?; + + // Precalculate [omega, omega^2, omega^4, omega^8, ..., omega^(2^31)] + let mut omegas = vec![G::Scalar::zero(); 32]; + omegas[0] = *omega; + for i in 1..LOG2_MAX_ELEMENTS { + omegas[i] = omegas[i - 1].pow_vartime([2u64]); + } + self.omegas_buffer.write_from(0, &omegas)?; + + Ok(()) + } + + /// Performs FFT on `a` + /// * `omega` - Special value `omega` is used for FFT over finite-fields + /// * `log_n` - Specifies log2 of number of elements + pub fn radix_fft(&mut self, a: &mut [G::Scalar], omega: &G::Scalar, log_n: u32) -> GPUResult<()> { + let n = 1 << log_n; + let mut src_buffer = self.program.create_buffer::(n)?; + let mut dst_buffer = self.program.create_buffer::(n)?; + + let max_deg = cmp::min(MAX_LOG2_RADIX, log_n); + self.setup_pq_omegas(omega, n, max_deg)?; + + src_buffer.write_from(0, &*a)?; + let mut log_p = 0u32; + while log_p < log_n { + let deg = cmp::min(max_deg, log_n - log_p); + self.radix_fft_round(&src_buffer, &dst_buffer, log_n, log_p, deg, max_deg)?; + log_p += deg; + std::mem::swap(&mut src_buffer, &mut dst_buffer); + } + + src_buffer.read_into(0, a)?; + + Ok(()) + } +} + +pub struct MultiFFTKernel +where + G: Group, +{ + kernels: Vec>, + _lock: locks::GPULock, +} + +impl MultiFFTKernel +where + G: Group, +{ + pub fn create(priority: bool) -> GPUResult> { + let mut all_devices = opencl::Device::all(); + let all_num = all_devices.len(); + let (lock_index, gpu_range) = get_lock_name_and_gpu_range(all_num); + + let lock = locks::GPULock::lock(lock_index); + + let devices: Vec<&opencl::Device> = all_devices.drain(gpu_range).collect(); + + // use all of the GPUs + let kernels: Vec<_> = devices + .into_iter() + .map(|d| (d, SingleFFTKernel::::create(d.clone(), priority))) + .filter_map(|(device, res)| { + if let Err(ref e) = res { + error!( + "Cannot initialize kernel for device '{}'! 
Error: {}", + device.name(), + e + ); + } + res.ok() + }) + .collect(); + + Ok(MultiFFTKernel { + kernels, + _lock: lock, + }) + } + + pub fn fft_multiple( + &mut self, + polys: &mut [&mut [G::Scalar]], + omega: &G::Scalar, + log_n: u32, + ) -> GPUResult<()> { + use rayon::prelude::*; + + for poly in polys.chunks_mut(self.kernels.len()) { + crate::worker::THREAD_POOL.install(|| { + poly.par_iter_mut() + .zip(self.kernels.par_iter_mut()) + .for_each(|(p, kern)| kern.radix_fft(p, omega, log_n).unwrap()) + }); + } + + Ok(()) + } +} diff --git a/halo2_proofs/src/gpu/fft/fft.cl b/halo2_proofs/src/gpu/fft/fft.cl new file mode 100644 index 0000000000..b340a9d42a --- /dev/null +++ b/halo2_proofs/src/gpu/fft/fft.cl @@ -0,0 +1,76 @@ +uint bitreverse(uint n, uint bits) { + uint r = 0; + for(int i = 0; i < bits; i++) { + r = (r << 1) | (n & 1); + n >>= 1; + } + return r; +} + +/* + * FFT algorithm is inspired from: http://www.bealto.com/gpu-fft_group-1.html + */ +__kernel void radix_fft(__global FIELD* x, // Source buffer + __global FIELD* y, // Destination buffer + __global FIELD* pq, // Precalculated twiddle factors + __global FIELD* omegas, // [omega, omega^2, omega^4, ...] + __local FIELD* u, // Local buffer to store intermediary values + uint n, // Number of elements + uint lgp, // Log2 of `p` (Read more in the link above) + uint deg, // 1=>radix2, 2=>radix4, 3=>radix8, ... + uint max_deg) // Maximum degree supported, according to `pq` and `omegas` +{ + uint lid = get_local_id(0); + uint lsize = get_local_size(0); + uint index = get_group_id(0); + uint t = n >> deg; + uint p = 1 << lgp; + uint k = index & (p - 1); + + x += index; + y += ((index - k) << deg) + k; + + uint count = 1 << deg; // 2^deg + uint counth = count >> 1; // Half of count + + uint counts = count / lsize * lid; + uint counte = counts + count / lsize; + + // Compute powers of twiddle + const FIELD twiddle = FIELD_pow_lookup(omegas, (n >> lgp >> deg) * k); + FIELD tmp = FIELD_pow(twiddle, counts); + for(uint i = counts; i < counte; i++) { + u[i] = FIELD_mul(tmp, x[i*t]); + tmp = FIELD_mul(tmp, twiddle); + } + barrier(CLK_LOCAL_MEM_FENCE); + + const uint pqshift = max_deg - deg; + for(uint rnd = 0; rnd < deg; rnd++) { + const uint bit = counth >> rnd; + for(uint i = counts >> 1; i < counte >> 1; i++) { + const uint di = i & (bit - 1); + const uint i0 = (i << 1) - di; + const uint i1 = i0 + bit; + tmp = u[i0]; + u[i0] = FIELD_add(u[i0], u[i1]); + u[i1] = FIELD_sub(tmp, u[i1]); + if(di != 0) u[i1] = FIELD_mul(pq[di << rnd << pqshift], u[i1]); + } + + barrier(CLK_LOCAL_MEM_FENCE); + } + + for(uint i = counts >> 1; i < counte >> 1; i++) { + y[i*p] = u[bitreverse(i, deg)]; + y[(i+counth)*p] = u[bitreverse(i + counth, deg)]; + } +} + +/// Multiplies all of the elements by `field` +__kernel void mul_by_field(__global FIELD* elements, + uint n, + FIELD field) { + const uint gid = get_global_id(0); + elements[gid] = FIELD_mul(elements[gid], field); +} diff --git a/halo2_proofs/src/gpu/locks.rs b/halo2_proofs/src/gpu/locks.rs new file mode 100644 index 0000000000..8d36662453 --- /dev/null +++ b/halo2_proofs/src/gpu/locks.rs @@ -0,0 +1,205 @@ +use fs2::FileExt; +use log::{debug, info, warn}; +use std::fs::File; +use std::path::PathBuf; +//use group::Group; +use crate::arithmetic::Group; + +const GPU_LOCK_NAME: &str = "bellman.gpu.lock"; +const PRIORITY_LOCK_NAME: &str = "bellman.priority.lock"; +fn tmp_path(filename: &str) -> PathBuf { + let mut p = std::env::temp_dir(); + p.push(filename); + p +} + +pub fn 
get_lock_name_and_gpu_range(all_gpus: usize) -> (String, Range) { + let mut num_gpus_per_group = all_gpus; + let mut group_index = 0usize; + + let maybe_num = env::var("BELLMAN_NUM_GPUS_PER_GROUP"); + let maybe_index = env::var("BELLMAN_GPU_GROUP_INDEX"); + + if maybe_num.is_ok() && maybe_index.is_ok() { + let maybe_num = maybe_num.unwrap().parse(); + let maybe_index = maybe_index.unwrap().parse(); + if maybe_num.is_ok() && maybe_index.is_ok() { + num_gpus_per_group = maybe_num.unwrap(); + group_index = maybe_index.unwrap(); + } + } + + assert!( + (group_index + 1) * num_gpus_per_group <= all_gpus, + "BELLMAN_GPU_GROUP_INDEX and BELLMAN_NUM_GPUS_PER_GROUP error" + ); + + ( + group_index.to_string(), + Range { + start: group_index * num_gpus_per_group, + end: (group_index + 1) * num_gpus_per_group, + }, + ) +} + +/// `GPULock` prevents two kernel objects to be instantiated simultaneously. +#[derive(Debug)] +pub struct GPULock(File); +impl GPULock { + pub fn lock(index: String) -> GPULock { + let filename = GPU_LOCK_NAME.to_string() + &index; + let gpu_lock_file = tmp_path(&filename); + debug!("Acquiring GPU lock at {:?} ...", &gpu_lock_file); + let f = File::create(&gpu_lock_file) + .unwrap_or_else(|_| panic!("Cannot create GPU lock file at {:?}", &gpu_lock_file)); + f.lock_exclusive().unwrap(); + debug!("GPU lock acquired!"); + GPULock(f) + } +} +impl Drop for GPULock { + fn drop(&mut self) { + self.0.unlock().unwrap(); + debug!("GPU lock released!"); + } +} + +/// `PrioriyLock` is like a flag. When acquired, it means a high-priority process +/// needs to acquire the GPU really soon. Acquiring the `PriorityLock` is like +/// signaling all other processes to release their `GPULock`s. +/// Only one process can have the `PriorityLock` at a time. +#[derive(Debug)] +pub struct PriorityLock(File); +impl PriorityLock { + pub fn lock() -> PriorityLock { + let priority_lock_file = tmp_path(PRIORITY_LOCK_NAME); + debug!("Acquiring priority lock at {:?} ...", &priority_lock_file); + let f = File::create(&priority_lock_file).unwrap_or_else(|_| { + panic!( + "Cannot create priority lock file at {:?}", + &priority_lock_file + ) + }); + f.lock_exclusive().unwrap(); + debug!("Priority lock acquired!"); + PriorityLock(f) + } + pub fn wait(priority: bool) { + if !priority { + File::create(tmp_path(PRIORITY_LOCK_NAME)) + .unwrap() + .lock_exclusive() + .unwrap(); + } + } + pub fn should_break(priority: bool) -> bool { + !priority + && File::create(tmp_path(PRIORITY_LOCK_NAME)) + .unwrap() + .try_lock_exclusive() + .is_err() + } +} +impl Drop for PriorityLock { + fn drop(&mut self) { + self.0.unlock().unwrap(); + debug!("Priority lock released!"); + } +} + +use super::error::{GPUError, GPUResult}; +use super::fft::MultiFFTKernel; +//use super::multiexp::MultiexpKernel; +use crate::poly::domain::create_fft_kernel; +//use crate::multiexp::create_multiexp_kernel; +//use crate::pairing::Engine; +use std::env; +use std::ops::Range; + +macro_rules! 
locked_kernel { + ($class:ident, $kern:ident, $func:ident, $name:expr) => { + pub struct $class + where + G: Group, + { + log_d: usize, + priority: bool, + kernel: Option<$kern>, + } + + impl $class + where + G: Group, + { + pub fn new(log_d: usize, priority: bool) -> $class { + $class:: { + log_d, + priority, + kernel: None, + } + } + + fn init(&mut self) { + if self.kernel.is_none() { + PriorityLock::wait(self.priority); + info!("GPU is available for {}!", $name); + self.kernel = $func::(self.log_d, self.priority); + } + } + + fn free(&mut self) { + if let Some(_kernel) = self.kernel.take() { + warn!( + "GPU acquired by a high priority process! Freeing up {} kernels...", + $name + ); + } + } + + pub fn with(&mut self, mut f: F) -> GPUResult + where + F: FnMut(&mut $kern) -> GPUResult, + { + if let Ok(flag) = std::env::var("BELLMAN_USE_CPU") { + if flag == "1" { + return Err(GPUError::GPUDisabled); + } + } + + self.init(); + + loop { + if let Some(ref mut k) = self.kernel { + match f(k) { + Err(GPUError::GPUTaken) => { + self.free(); + self.init(); + } + Err(e) => { + warn!("GPU {} failed! Falling back to CPU... Error: {}", $name, e); + return Err(e); + } + Ok(v) => return Ok(v), + } + } else { + return Err(GPUError::KernelUninitialized); + } + } + } + } + }; +} + +locked_kernel!( + LockedMultiFFTKernel, + MultiFFTKernel, + create_fft_kernel, + "FFT" +); +/*locked_kernel!( + LockedMultiexpKernel, + MultiexpKernel, + create_multiexp_kernel, + "Multiexp" +);*/ diff --git a/halo2_proofs/src/gpu/mod.rs b/halo2_proofs/src/gpu/mod.rs new file mode 100644 index 0000000000..21cecfea40 --- /dev/null +++ b/halo2_proofs/src/gpu/mod.rs @@ -0,0 +1,39 @@ +mod error; + +pub use self::error::*; + +#[cfg(feature = "gpu")] +mod locks; + +#[cfg(feature = "gpu")] +pub use self::locks::*; + +#[cfg(feature = "gpu")] +mod sources; + +#[cfg(feature = "gpu")] +pub use self::sources::*; + +#[cfg(feature = "gpu")] +mod utils; + +#[cfg(feature = "gpu")] +pub use self::utils::*; + +#[cfg(feature = "gpu")] +mod fft; + +#[cfg(feature = "gpu")] +pub use self::fft::*; + +//#[cfg(feature = "gpu")] +//mod multiexp; + +//#[cfg(feature = "gpu")] +//pub use self::multiexp::*; + +#[cfg(not(feature = "gpu"))] +mod nogpu; + +#[cfg(not(feature = "gpu"))] +pub use self::nogpu::*; diff --git a/halo2_proofs/src/gpu/multiexp/ec.cl b/halo2_proofs/src/gpu/multiexp/ec.cl new file mode 100644 index 0000000000..26f617ad75 --- /dev/null +++ b/halo2_proofs/src/gpu/multiexp/ec.cl @@ -0,0 +1,134 @@ +// Elliptic curve operations (Short Weierstrass Jacobian form) + +#define POINT_ZERO ((POINT_projective){FIELD_ZERO, FIELD_ONE, FIELD_ZERO}) + +__BLSTRS__ // Affine points in `blstrs` library do not have `inf` field. 
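+// `__BLSTRS__` is a placeholder that sources.rs replaces with either `#define BLSTRS`
+// or an empty string, selecting whether the affine layout below carries an `inf` flag.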
+ +typedef struct { + FIELD x; + FIELD y; + #ifndef BLSTRS + bool inf; + #endif + #if FIELD_LIMB_BITS == 32 + uint _padding; + #endif +} POINT_affine; + +typedef struct { + FIELD x; + FIELD y; + FIELD z; +} POINT_projective; + +// http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l +POINT_projective POINT_double(POINT_projective inp) { + const FIELD local_zero = FIELD_ZERO; + if(FIELD_eq(inp.z, local_zero)) { + return inp; + } + + const FIELD a = FIELD_sqr(inp.x); // A = X1^2 + const FIELD b = FIELD_sqr(inp.y); // B = Y1^2 + FIELD c = FIELD_sqr(b); // C = B^2 + + // D = 2*((X1+B)2-A-C) + FIELD d = FIELD_add(inp.x, b); + d = FIELD_sqr(d); d = FIELD_sub(FIELD_sub(d, a), c); d = FIELD_double(d); + + const FIELD e = FIELD_add(FIELD_double(a), a); // E = 3*A + const FIELD f = FIELD_sqr(e); + + inp.z = FIELD_mul(inp.y, inp.z); inp.z = FIELD_double(inp.z); // Z3 = 2*Y1*Z1 + inp.x = FIELD_sub(FIELD_sub(f, d), d); // X3 = F-2*D + + // Y3 = E*(D-X3)-8*C + c = FIELD_double(c); c = FIELD_double(c); c = FIELD_double(c); + inp.y = FIELD_sub(FIELD_mul(FIELD_sub(d, inp.x), e), c); + + return inp; +} + +// http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-madd-2007-bl +POINT_projective POINT_add_mixed(POINT_projective a, POINT_affine b) { + #ifndef BLSTRS + if(b.inf) { + return a; + } + #endif + + const FIELD local_zero = FIELD_ZERO; + if(FIELD_eq(a.z, local_zero)) { + const FIELD local_one = FIELD_ONE; + a.x = b.x; + a.y = b.y; + a.z = local_one; + return a; + } + + const FIELD z1z1 = FIELD_sqr(a.z); + const FIELD u2 = FIELD_mul(b.x, z1z1); + const FIELD s2 = FIELD_mul(FIELD_mul(b.y, a.z), z1z1); + + if(FIELD_eq(a.x, u2) && FIELD_eq(a.y, s2)) { + return POINT_double(a); + } + + const FIELD h = FIELD_sub(u2, a.x); // H = U2-X1 + const FIELD hh = FIELD_sqr(h); // HH = H^2 + FIELD i = FIELD_double(hh); i = FIELD_double(i); // I = 4*HH + FIELD j = FIELD_mul(h, i); // J = H*I + FIELD r = FIELD_sub(s2, a.y); r = FIELD_double(r); // r = 2*(S2-Y1) + const FIELD v = FIELD_mul(a.x, i); + + POINT_projective ret; + + // X3 = r^2 - J - 2*V + ret.x = FIELD_sub(FIELD_sub(FIELD_sqr(r), j), FIELD_double(v)); + + // Y3 = r*(V-X3)-2*Y1*J + j = FIELD_mul(a.y, j); j = FIELD_double(j); + ret.y = FIELD_sub(FIELD_mul(FIELD_sub(v, ret.x), r), j); + + // Z3 = (Z1+H)^2-Z1Z1-HH + ret.z = FIELD_add(a.z, h); ret.z = FIELD_sub(FIELD_sub(FIELD_sqr(ret.z), z1z1), hh); + return ret; +} + +// http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl +POINT_projective POINT_add(POINT_projective a, POINT_projective b) { + + const FIELD local_zero = FIELD_ZERO; + if(FIELD_eq(a.z, local_zero)) return b; + if(FIELD_eq(b.z, local_zero)) return a; + + const FIELD z1z1 = FIELD_sqr(a.z); // Z1Z1 = Z1^2 + const FIELD z2z2 = FIELD_sqr(b.z); // Z2Z2 = Z2^2 + const FIELD u1 = FIELD_mul(a.x, z2z2); // U1 = X1*Z2Z2 + const FIELD u2 = FIELD_mul(b.x, z1z1); // U2 = X2*Z1Z1 + FIELD s1 = FIELD_mul(FIELD_mul(a.y, b.z), z2z2); // S1 = Y1*Z2*Z2Z2 + const FIELD s2 = FIELD_mul(FIELD_mul(b.y, a.z), z1z1); // S2 = Y2*Z1*Z1Z1 + + if(FIELD_eq(u1, u2) && FIELD_eq(s1, s2)) + return POINT_double(a); + else { + const FIELD h = FIELD_sub(u2, u1); // H = U2-U1 + FIELD i = FIELD_double(h); i = FIELD_sqr(i); // I = (2*H)^2 + const FIELD j = FIELD_mul(h, i); // J = H*I + FIELD r = FIELD_sub(s2, s1); r = FIELD_double(r); // r = 2*(S2-S1) + const FIELD v = FIELD_mul(u1, i); // V = U1*I + a.x = FIELD_sub(FIELD_sub(FIELD_sub(FIELD_sqr(r), j), v), v); // X3 = r^2 - J - 2*V + + // Y3 = r*(V - X3) - 
2*S1*J + a.y = FIELD_mul(FIELD_sub(v, a.x), r); + s1 = FIELD_mul(s1, j); s1 = FIELD_double(s1); // S1 = S1 * J * 2 + a.y = FIELD_sub(a.y, s1); + + // Z3 = ((Z1+Z2)^2 - Z1Z1 - Z2Z2)*H + a.z = FIELD_add(a.z, b.z); a.z = FIELD_sqr(a.z); + a.z = FIELD_sub(FIELD_sub(a.z, z1z1), z2z2); + a.z = FIELD_mul(a.z, h); + + return a; + } +} diff --git a/halo2_proofs/src/gpu/multiexp/field2.cl b/halo2_proofs/src/gpu/multiexp/field2.cl new file mode 100644 index 0000000000..c520e33e79 --- /dev/null +++ b/halo2_proofs/src/gpu/multiexp/field2.cl @@ -0,0 +1,61 @@ +// Fp2 Extension Field where u^2 + 1 = 0 + +#define FIELD2_LIMB_BITS FIELD_LIMB_BITS +#define FIELD2_ZERO ((FIELD2){FIELD_ZERO, FIELD_ZERO}) +#define FIELD2_ONE ((FIELD2){FIELD_ONE, FIELD_ZERO}) + +typedef struct { + FIELD c0; + FIELD c1; +} FIELD2; // Represents: c0 + u * c1 + +bool FIELD2_eq(FIELD2 a, FIELD2 b) { + return FIELD_eq(a.c0, b.c0) && FIELD_eq(a.c1, b.c1); +} +FIELD2 FIELD2_sub(FIELD2 a, FIELD2 b) { + a.c0 = FIELD_sub(a.c0, b.c0); + a.c1 = FIELD_sub(a.c1, b.c1); + return a; +} +FIELD2 FIELD2_add(FIELD2 a, FIELD2 b) { + a.c0 = FIELD_add(a.c0, b.c0); + a.c1 = FIELD_add(a.c1, b.c1); + return a; +} +FIELD2 FIELD2_double(FIELD2 a) { + a.c0 = FIELD_double(a.c0); + a.c1 = FIELD_double(a.c1); + return a; +} + +/* + * (a_0 + u * a_1)(b_0 + u * b_1) = a_0 * b_0 - a_1 * b_1 + u * (a_0 * b_1 + a_1 * b_0) + * Therefore: + * c_0 = a_0 * b_0 - a_1 * b_1 + * c_1 = (a_0 * b_1 + a_1 * b_0) = (a_0 + a_1) * (b_0 + b_1) - a_0 * b_0 - a_1 * b_1 + */ +FIELD2 FIELD2_mul(FIELD2 a, FIELD2 b) { + const FIELD aa = FIELD_mul(a.c0, b.c0); + const FIELD bb = FIELD_mul(a.c1, b.c1); + const FIELD o = FIELD_add(b.c0, b.c1); + a.c1 = FIELD_add(a.c1, a.c0); + a.c1 = FIELD_mul(a.c1, o); + a.c1 = FIELD_sub(a.c1, aa); + a.c1 = FIELD_sub(a.c1, bb); + a.c0 = FIELD_sub(aa, bb); + return a; +} + +/* + * (a_0 + u * a_1)(a_0 + u * a_1) = a_0 ^ 2 - a_1 ^ 2 + u * 2 * a_0 * a_1 + * Therefore: + * c_0 = (a_0 * a_0 - a_1 * a_1) = (a_0 + a_1)(a_0 - a_1) + * c_1 = 2 * a_0 * a_1 + */ +FIELD2 FIELD2_sqr(FIELD2 a) { + const FIELD ab = FIELD_mul(a.c0, a.c1); + const FIELD c0c1 = FIELD_add(a.c0, a.c1); + a.c0 = FIELD_mul(FIELD_sub(a.c0, a.c1), c0c1); + a.c1 = FIELD_double(ab); + return a; +} diff --git a/halo2_proofs/src/gpu/multiexp/multiexp.cl b/halo2_proofs/src/gpu/multiexp/multiexp.cl new file mode 100644 index 0000000000..4e0d26e2e2 --- /dev/null +++ b/halo2_proofs/src/gpu/multiexp/multiexp.cl @@ -0,0 +1,70 @@ +/* + * Same multiexp algorithm used in Bellman, with some modifications. + * https://github.com/zkcrypto/bellman/blob/10c5010fd9c2ca69442dc9775ea271e286e776d8/src/multiexp.rs#L174 + * The CPU version of multiexp parallelism is done by dividing the exponent + * values into smaller windows, and then applying a sequence of rounds to each + * window. The GPU kernel not only assigns a thread to each window but also + * divides the bases into several groups which highly increases the number of + * threads running in parallel for calculating a multiexp instance. + */ + +__kernel void POINT_bellman_multiexp( + __global POINT_affine *bases, + __global POINT_projective *buckets, + __global POINT_projective *results, + __global EXPONENT *exps, + uint n, + uint num_groups, + uint num_windows, + uint window_size) { + + // We have `num_windows` * `num_groups` threads per multiexp. + const uint gid = get_global_id(0); + if(gid >= num_windows * num_groups) return; + + // We have (2^window_size - 1) buckets. 
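+  // Bucket i collects bases whose current window value equals i + 1; a zero window
+  // contributes nothing, so no bucket is reserved for it.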
+ const uint bucket_len = ((1 << window_size) - 1); + + // Each thread has its own set of buckets in global memory. + buckets += bucket_len * gid; + + const POINT_projective local_zero = POINT_ZERO; + for(uint i = 0; i < bucket_len; i++) buckets[i] = local_zero; + + const uint len = (uint)ceil(n / (float)num_groups); // Num of elements in each group + + // This thread runs the multiexp algorithm on elements from `nstart` to `nened` + // on the window [`bits`, `bits` + `w`) + const uint nstart = len * (gid / num_windows); + const uint nend = min(nstart + len, n); + const uint bits = (gid % num_windows) * window_size; + const ushort w = min((ushort)window_size, (ushort)(EXPONENT_BITS - bits)); + + POINT_projective res = POINT_ZERO; + for(uint i = nstart; i < nend; i++) { + uint ind = EXPONENT_get_bits(exps[i], bits, w); + + #ifdef NVIDIA + // O_o, weird optimization, having a single special case makes it + // tremendously faster! + // 511 is chosen because it's half of the maximum bucket len, but + // any other number works... Bigger indices seems to be better... + if(ind == 511) buckets[510] = POINT_add_mixed(buckets[510], bases[i]); + else if(ind--) buckets[ind] = POINT_add_mixed(buckets[ind], bases[i]); + #else + if(ind--) buckets[ind] = POINT_add_mixed(buckets[ind], bases[i]); + #endif + } + + // Summation by parts + // e.g. 3a + 2b + 1c = a + + // (a) + b + + // ((a) + b) + c + POINT_projective acc = POINT_ZERO; + for(int j = bucket_len - 1; j >= 0; j--) { + acc = POINT_add(acc, buckets[j]); + res = POINT_add(res, acc); + } + + results[gid] = res; +} diff --git a/halo2_proofs/src/gpu/nogpu.rs b/halo2_proofs/src/gpu/nogpu.rs new file mode 100644 index 0000000000..de543340d8 --- /dev/null +++ b/halo2_proofs/src/gpu/nogpu.rs @@ -0,0 +1,92 @@ +use super::error::{GPUError, GPUResult}; +use crate::pairing::ff::{PrimeField, ScalarEngine}; +use crate::pairing::CurveAffine; +use crate::worker::Worker; +use std::marker::PhantomData; +use std::sync::Arc; + +// This module is compiled instead of `fft.rs` and `multiexp.rs` if `gpu` feature is disabled. + +pub struct MultiFFTKernel(PhantomData) +where + E: ScalarEngine; + +impl MultiFFTKernel +where + E: ScalarEngine, +{ + pub fn create(_: bool) -> GPUResult> { + return Err(GPUError::GPUDisabled); + } + + pub fn radix_fft(&mut self, _: &mut [E::Fr], _: &E::Fr, _: u32) -> GPUResult<()> { + return Err(GPUError::GPUDisabled); + } +} + +pub struct MultiexpKernel(PhantomData) +where + E: ScalarEngine; + +impl MultiexpKernel +where + E: ScalarEngine, +{ + pub fn create(_: bool) -> GPUResult> { + return Err(GPUError::GPUDisabled); + } + + pub fn multiexp( + &mut self, + _: &Worker, + _: Arc>, + _: Arc::Fr as PrimeField>::Repr>>, + _: usize, + _: usize, + ) -> GPUResult<::Projective> + where + G: CurveAffine, + { + return Err(GPUError::GPUDisabled); + } + + pub fn dense_multiexp( + &mut self, + _: &Worker, + _: Arc>, + _: Arc::Fr as PrimeField>::Repr>>, + _: usize, + ) -> GPUResult<::Projective> + where + G::Engine: crate::pairing::Engine, + { + return Err(GPUError::GPUDisabled); + } +} + +use crate::pairing::Engine; + +macro_rules! 
locked_kernel { + ($class:ident) => { + pub struct $class(PhantomData); + + impl $class + where + E: Engine, + { + pub fn new(_: usize, _: bool) -> $class { + $class::(PhantomData) + } + + pub fn with(&mut self, _: F) -> GPUResult + where + F: FnMut(&mut K) -> GPUResult, + { + return Err(GPUError::GPUDisabled); + } + } + }; +} + +locked_kernel!(LockedMultiFFTKernel); +locked_kernel!(LockedMultiexpKernel); diff --git a/halo2_proofs/src/gpu/sources.rs b/halo2_proofs/src/gpu/sources.rs new file mode 100644 index 0000000000..b14146ac1c --- /dev/null +++ b/halo2_proofs/src/gpu/sources.rs @@ -0,0 +1,66 @@ +//use group::Group; +use crate::arithmetic::Group; +use ff_cl_gen as ffgen; + +// Instead of having a very large OpenCL program written for a specific curve, with a lot of +// rudandant codes (As OpenCL doesn't have generic types or templates), this module will dynamically +// generate OpenCL codes given different PrimeFields and curves. + +static FFT_SRC: &str = include_str!("fft/fft.cl"); +static FIELD2_SRC: &str = include_str!("multiexp/field2.cl"); +static EC_SRC: &str = include_str!("multiexp/ec.cl"); +static MULTIEXP_SRC: &str = include_str!("multiexp/multiexp.cl"); + +fn field2(field2: &str, field: &str) -> String { + String::from(FIELD2_SRC) + .replace("FIELD2", field2) + .replace("FIELD", field) +} + +fn fft(field: &str) -> String { + String::from(FFT_SRC).replace("FIELD", field) +} + +#[cfg(not(feature = "blstrs"))] +const BLSTRS_DEF: &str = ""; +#[cfg(feature = "blstrs")] +const BLSTRS_DEF: &str = "#define BLSTRS"; + +fn ec(field: &str, point: &str) -> String { + String::from(EC_SRC) + .replace("FIELD", field) + .replace("POINT", point) + .replace("__BLSTRS__", BLSTRS_DEF) +} + +fn multiexp(point: &str, exp: &str) -> String { + String::from(MULTIEXP_SRC) + .replace("POINT", point) + .replace("EXPONENT", exp) +} + +// WARNING: This function works only with Short Weierstrass Jacobian curves with Fq2 extension field. +pub fn kernel(limb64: bool) -> String +where + G: Group, +{ + vec![ + if limb64 { + ffgen::field::("Fr") + } else { + ffgen::field::("Fr") + }, + fft("Fr"), + if limb64 { + ffgen::field::("Fq") + } else { + ffgen::field::("Fq") + }, + ec("Fq", "G1"), + multiexp("G1", "Fr"), + field2("Fq2", "Fq"), + ec("Fq2", "G2"), + multiexp("G2", "Fr"), + ] + .join("\n\n") +} diff --git a/halo2_proofs/src/gpu/utils.rs b/halo2_proofs/src/gpu/utils.rs new file mode 100644 index 0000000000..b0e4bf83e5 --- /dev/null +++ b/halo2_proofs/src/gpu/utils.rs @@ -0,0 +1,88 @@ +use log::{info, warn}; +use rust_gpu_tools::*; +use std::collections::HashMap; +use std::env; + +lazy_static::lazy_static! { + static ref CORE_COUNTS: HashMap = { + let mut core_counts : HashMap = vec![ + // AMD + ("gfx1010".to_string(), 2560), + // This value was chosen to give (approximately) empirically best performance for a Radeon Pro VII. 
+ ("gfx906".to_string(), 7400), + + // NVIDIA + ("Quadro RTX 6000".to_string(), 4608), + ("Quadro RTX A6000".to_string(), 10752), + + ("TITAN RTX".to_string(), 4608), + + ("Tesla V100".to_string(), 5120), + ("Tesla P100".to_string(), 3584), + ("Tesla T4".to_string(), 2560), + ("Quadro M5000".to_string(), 2048), + + ("GeForce RTX 3090".to_string(), 10496), + ("GeForce RTX 3080".to_string(), 8704), + ("GeForce RTX 3070".to_string(), 5888), + + ("GeForce RTX 2080 Ti".to_string(), 4352), + ("GeForce RTX 2080 SUPER".to_string(), 3072), + ("GeForce RTX 2080".to_string(), 2944), + ("GeForce RTX 2070 SUPER".to_string(), 2560), + + ("GeForce GTX 1080 Ti".to_string(), 3584), + ("GeForce GTX 1080".to_string(), 2560), + ("GeForce GTX 2060".to_string(), 1920), + ("GeForce GTX 1660 Ti".to_string(), 1536), + ("GeForce GTX 1060".to_string(), 1280), + ("GeForce GTX 1650 SUPER".to_string(), 1280), + ("GeForce GTX 1650".to_string(), 896), + ].into_iter().collect(); + + match env::var("BELLMAN_CUSTOM_GPU").and_then(|var| { + for card in var.split(",") { + let splitted = card.split(":").collect::>(); + if splitted.len() != 2 { panic!("Invalid BELLMAN_CUSTOM_GPU!"); } + let name = splitted[0].trim().to_string(); + let cores : usize = splitted[1].trim().parse().expect("Invalid BELLMAN_CUSTOM_GPU!"); + info!("Adding \"{}\" to GPU list with {} CUDA cores.", name, cores); + core_counts.insert(name, cores); + } + Ok(()) + }) { Err(_) => { }, Ok(_) => { } } + + core_counts + }; +} + +const DEFAULT_CORE_COUNT: usize = 2560; +pub fn get_core_count(d: &opencl::Device) -> usize { + let name = d.name(); + match CORE_COUNTS.get(&name[..]) { + Some(&cores) => cores, + None => { + warn!( + "Number of CUDA cores for your device ({}) is unknown! Best performance is \ + only achieved when the number of CUDA cores is known! 
You can find the \ + instructions on how to support custom GPUs here: \ + https://lotu.sh/en+hardware-mining", + name + ); + DEFAULT_CORE_COUNT + } + } +} + +pub fn dump_device_list() { + for d in opencl::Device::all() { + info!("Device: {:?}", d); + } +} + +#[cfg(feature = "gpu")] +#[test] +pub fn test_list_devices() { + let _ = env_logger::try_init(); + dump_device_list(); +} From 9c41501029189bd8e75967e4894878c3144dced8 Mon Sep 17 00:00:00 2001 From: huyanlin Date: Thu, 16 Jun 2022 20:49:23 +0800 Subject: [PATCH 03/13] fix fr --- halo2_proofs/src/gpu/sources.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/halo2_proofs/src/gpu/sources.rs b/halo2_proofs/src/gpu/sources.rs index b14146ac1c..adef02d193 100644 --- a/halo2_proofs/src/gpu/sources.rs +++ b/halo2_proofs/src/gpu/sources.rs @@ -1,6 +1,7 @@ //use group::Group; use crate::arithmetic::Group; use ff_cl_gen as ffgen; +use pairing::bn256::{Fr, Fq}; // Instead of having a very large OpenCL program written for a specific curve, with a lot of // rudandant codes (As OpenCL doesn't have generic types or templates), this module will dynamically @@ -46,15 +47,15 @@ where { vec![ if limb64 { - ffgen::field::("Fr") + ffgen::field::("Fr") } else { - ffgen::field::("Fr") + ffgen::field::("Fr") }, fft("Fr"), if limb64 { - ffgen::field::("Fq") + ffgen::field::("Fq") } else { - ffgen::field::("Fq") + ffgen::field::("Fq") }, ec("Fq", "G1"), multiexp("G1", "Fr"), From c9e2bf34523864a5ca7d8c88119980d89decc8d9 Mon Sep 17 00:00:00 2001 From: huyanlin Date: Sun, 19 Jun 2022 21:18:08 +0800 Subject: [PATCH 04/13] update fr --- halo2_proofs/Cargo.toml | 1 + halo2_proofs/src/gpu/sources.rs | 8 ++++---- halo2_proofs/src/poly/domain.rs | 4 ++++ 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/halo2_proofs/Cargo.toml b/halo2_proofs/Cargo.toml index 80be358855..7bc5a13861 100644 --- a/halo2_proofs/Cargo.toml +++ b/halo2_proofs/Cargo.toml @@ -37,6 +37,7 @@ rand = "0.8" rand_core = { version = "0.6", default-features = false } blake2b_simd = "1" pairing = { git = 'https://github.com/appliedzkp/pairing', package = "pairing_bn256", "tag" = "v0.1.1"} +pairing-bn256 = {package = "pairing_ce", version = "0.24.*" } subtle = "2.3" cfg-if = "0.1" ark-std = { version = "0.3", features = ["print-trace"] } diff --git a/halo2_proofs/src/gpu/sources.rs b/halo2_proofs/src/gpu/sources.rs index adef02d193..eebdf69af4 100644 --- a/halo2_proofs/src/gpu/sources.rs +++ b/halo2_proofs/src/gpu/sources.rs @@ -47,15 +47,15 @@ where { vec![ if limb64 { - ffgen::field::("Fr") + ffgen::field_fr::("Fr") } else { - ffgen::field::("Fr") + ffgen::field_fr::("Fr") }, fft("Fr"), if limb64 { - ffgen::field::("Fq") + ffgen::field_fq::("Fq") } else { - ffgen::field::("Fq") + ffgen::field_fq::("Fq") }, ec("Fq", "G1"), multiexp("G1", "Fr"), diff --git a/halo2_proofs/src/poly/domain.rs b/halo2_proofs/src/poly/domain.rs index de7a74f320..791f94b07d 100644 --- a/halo2_proofs/src/poly/domain.rs +++ b/halo2_proofs/src/poly/domain.rs @@ -567,6 +567,7 @@ fn test_best_fft_multiple_gpu() { let start = start_timer!(|| message); best_fft(&mut prev_fft_coeffs, domain.get_omega(), k); end_timer!(start); + println!("coeffs cpu {:?}\n",prev_fft_coeffs); let message = format!("gpu_fft degree {}", k); let start = start_timer!(|| message); @@ -584,6 +585,7 @@ fn test_best_fft_multiple_gpu() { end_timer!(start); + println!("coeffs gpu {:?}\n",optimized_fft_coeffs); assert_eq!(prev_fft_coeffs, optimized_fft_coeffs); } } @@ -610,6 +612,8 @@ fn test_fft() { let start = 
start_timer!(|| message); best_fft(&mut prev_fft_coeffs, domain.get_omega(), k); end_timer!(start); + + assert_eq!(prev_fft_coeffs, coeffs); } } From ec7a95d19517411abb29e2cc88f1f9eee695a3c7 Mon Sep 17 00:00:00 2001 From: huyanlin Date: Thu, 23 Jun 2022 02:40:49 +0800 Subject: [PATCH 05/13] update dep and license --- halo2_proofs/Cargo.toml | 7 +- halo2_proofs/src/gpu/COPYRIGHT | 14 ++ halo2_proofs/src/gpu/LICENSE-APACHE | 201 ++++++++++++++++++++++++++++ halo2_proofs/src/gpu/LICENSE-MIT | 23 ++++ halo2_proofs/src/gpu/sources.rs | 8 +- halo2_proofs/src/poly/domain.rs | 8 +- 6 files changed, 248 insertions(+), 13 deletions(-) create mode 100644 halo2_proofs/src/gpu/COPYRIGHT create mode 100644 halo2_proofs/src/gpu/LICENSE-APACHE create mode 100644 halo2_proofs/src/gpu/LICENSE-MIT diff --git a/halo2_proofs/Cargo.toml b/halo2_proofs/Cargo.toml index 7bc5a13861..0e703027c5 100644 --- a/halo2_proofs/Cargo.toml +++ b/halo2_proofs/Cargo.toml @@ -31,13 +31,12 @@ harness = false [dependencies] backtrace = { version = "0.3", optional = true } rayon = "1.5.1" -ff = "0.11" -group = "0.11" +ff = { git = 'https://github.com/starslabhq/ff', branch = "v0.11.1Char" } +group = { git = "https://github.com/starslabhq/group", branch = "v0.11.0ff" } rand = "0.8" rand_core = { version = "0.6", default-features = false } blake2b_simd = "1" -pairing = { git = 'https://github.com/appliedzkp/pairing', package = "pairing_bn256", "tag" = "v0.1.1"} -pairing-bn256 = {package = "pairing_ce", version = "0.24.*" } +pairing = { git = 'https://github.com/starslabhq/pairing', package = "pairing_bn256", branch = "primeFieldChar" } subtle = "2.3" cfg-if = "0.1" ark-std = { version = "0.3", features = ["print-trace"] } diff --git a/halo2_proofs/src/gpu/COPYRIGHT b/halo2_proofs/src/gpu/COPYRIGHT new file mode 100644 index 0000000000..9e3a9f6f90 --- /dev/null +++ b/halo2_proofs/src/gpu/COPYRIGHT @@ -0,0 +1,14 @@ +Copyrights in the "ff-cl-gen" library are retained by their contributors. No +copyright assignment is required to contribute to the "ff-cl-gen" library. + +The "ff-cl-gen" library is licensed under either of + + * Apache License, Version 2.0, (see ./LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0) + * MIT license (see ./LICENSE-MIT or http://opensource.org/licenses/MIT) + +at your option. + +Unless you explicitly state otherwise, any contribution intentionally +submitted for inclusion in the work by you, as defined in the Apache-2.0 +license, shall be dual licensed as above, without any additional terms or +conditions. diff --git a/halo2_proofs/src/gpu/LICENSE-APACHE b/halo2_proofs/src/gpu/LICENSE-APACHE new file mode 100644 index 0000000000..16fe87b06e --- /dev/null +++ b/halo2_proofs/src/gpu/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/halo2_proofs/src/gpu/LICENSE-MIT b/halo2_proofs/src/gpu/LICENSE-MIT new file mode 100644 index 0000000000..31aa79387f --- /dev/null +++ b/halo2_proofs/src/gpu/LICENSE-MIT @@ -0,0 +1,23 @@ +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/halo2_proofs/src/gpu/sources.rs b/halo2_proofs/src/gpu/sources.rs index eebdf69af4..adef02d193 100644 --- a/halo2_proofs/src/gpu/sources.rs +++ b/halo2_proofs/src/gpu/sources.rs @@ -47,15 +47,15 @@ where { vec![ if limb64 { - ffgen::field_fr::("Fr") + ffgen::field::("Fr") } else { - ffgen::field_fr::("Fr") + ffgen::field::("Fr") }, fft("Fr"), if limb64 { - ffgen::field_fq::("Fq") + ffgen::field::("Fq") } else { - ffgen::field_fq::("Fq") + ffgen::field::("Fq") }, ec("Fq", "G1"), multiexp("G1", "Fr"), diff --git a/halo2_proofs/src/poly/domain.rs b/halo2_proofs/src/poly/domain.rs index 791f94b07d..134d10d1d6 100644 --- a/halo2_proofs/src/poly/domain.rs +++ b/halo2_proofs/src/poly/domain.rs @@ -567,7 +567,7 @@ fn test_best_fft_multiple_gpu() { let start = start_timer!(|| message); best_fft(&mut prev_fft_coeffs, domain.get_omega(), k); end_timer!(start); - println!("coeffs cpu {:?}\n",prev_fft_coeffs); + //println!("coeffs cpu {:?}\n",prev_fft_coeffs); let message = format!("gpu_fft degree {}", k); let start = start_timer!(|| message); @@ -578,14 +578,14 @@ fn test_best_fft_multiple_gpu() { &mut fft_kern, &mut [&mut optimized_fft_coeffs], &worker, - &domain.get_omega_inv(), + &domain.get_omega(), k as u32, ) .unwrap(); end_timer!(start); - println!("coeffs gpu {:?}\n",optimized_fft_coeffs); + //println!("coeffs gpu {:?}\n",optimized_fft_coeffs); assert_eq!(prev_fft_coeffs, optimized_fft_coeffs); } } @@ -612,8 +612,6 @@ fn test_fft() { let start = start_timer!(|| message); best_fft(&mut prev_fft_coeffs, domain.get_omega(), k); end_timer!(start); - - assert_eq!(prev_fft_coeffs, coeffs); } } From a2b2f11eb825d9c9a0c3fd1cbc29f833424835fc Mon Sep 17 00:00:00 2001 From: huyanlin Date: Thu, 23 Jun 2022 02:55:00 +0800 Subject: [PATCH 06/13] fix path --- halo2_proofs/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/halo2_proofs/Cargo.toml b/halo2_proofs/Cargo.toml index 0e703027c5..40934c3db5 100644 --- a/halo2_proofs/Cargo.toml +++ b/halo2_proofs/Cargo.toml @@ -52,7 +52,7 @@ env_logger = "0.8.1" # gpu feature rust-gpu-tools = { version = "0.3.0", optional = true } -ff-cl-gen = { path = "../../ff-cl-gen", version = "0.2.0", optional = true } +ff-cl-gen = { git = 'https://github.com/starslabhq/ff-cl-gen', optional = true, branch = "gpuFft" } 
fs2 = { version = "0.4.3", optional = true } # Developer tooling dependencies From 70c53e5295b846f6eee3b65a97aec5422f279c16 Mon Sep 17 00:00:00 2001 From: huyanlin Date: Thu, 23 Jun 2022 11:32:48 +0800 Subject: [PATCH 07/13] small clean code --- halo2_proofs/Cargo.toml | 1 + halo2_proofs/src/gpu/error.rs | 9 ++++----- halo2_proofs/src/gpu/fft.rs | 1 - halo2_proofs/src/gpu/locks.rs | 12 +----------- halo2_proofs/src/gpu/mod.rs | 14 -------------- halo2_proofs/src/gpu/sources.rs | 1 - halo2_proofs/src/gpu/utils.rs | 1 - halo2_proofs/src/lib.rs | 2 ++ 8 files changed, 8 insertions(+), 33 deletions(-) diff --git a/halo2_proofs/Cargo.toml b/halo2_proofs/Cargo.toml index 40934c3db5..15770eb30e 100644 --- a/halo2_proofs/Cargo.toml +++ b/halo2_proofs/Cargo.toml @@ -77,6 +77,7 @@ rand_core = { version = "0.6", default-features = false, features = ["getrandom" getrandom = { version = "0.2", features = ["js"] } [features] +# todo: set gpu as optional feature default = ["shplonk", "gpu"] dev-graph = ["plotters", "tabbycat"] gadget-traces = ["backtrace"] diff --git a/halo2_proofs/src/gpu/error.rs b/halo2_proofs/src/gpu/error.rs index 1d6d38636e..50a4316bea 100644 --- a/halo2_proofs/src/gpu/error.rs +++ b/halo2_proofs/src/gpu/error.rs @@ -1,26 +1,25 @@ -#[cfg(feature = "gpu")] use rust_gpu_tools::opencl; #[derive(thiserror::Error, Debug)] pub enum GPUError { #[error("GPUError: {0}")] Simple(&'static str), - #[cfg(feature = "gpu")] + #[error("OpenCL Error: {0}")] OpenCL(#[from] opencl::GPUError), - #[cfg(feature = "gpu")] + #[error("GPU taken by a high priority process!")] GPUTaken, - #[cfg(feature = "gpu")] + #[error("No kernel is initialized!")] KernelUninitialized, + #[error("GPU accelerator is disabled!")] GPUDisabled, } pub type GPUResult = std::result::Result; -#[cfg(feature = "gpu")] impl From> for GPUError { fn from(e: std::boxed::Box) -> Self { match e.downcast::() { diff --git a/halo2_proofs/src/gpu/fft.rs b/halo2_proofs/src/gpu/fft.rs index d35d9be04a..25d3f80921 100644 --- a/halo2_proofs/src/gpu/fft.rs +++ b/halo2_proofs/src/gpu/fft.rs @@ -3,7 +3,6 @@ use crate::gpu::{ get_lock_name_and_gpu_range, locks, sources, }; use group::ff::Field; -//use group::Group; use crate::arithmetic::Group; use crate::worker::THREAD_POOL; use log::{error, info}; diff --git a/halo2_proofs/src/gpu/locks.rs b/halo2_proofs/src/gpu/locks.rs index 8d36662453..0f460136f7 100644 --- a/halo2_proofs/src/gpu/locks.rs +++ b/halo2_proofs/src/gpu/locks.rs @@ -2,7 +2,6 @@ use fs2::FileExt; use log::{debug, info, warn}; use std::fs::File; use std::path::PathBuf; -//use group::Group; use crate::arithmetic::Group; const GPU_LOCK_NAME: &str = "bellman.gpu.lock"; @@ -110,10 +109,7 @@ impl Drop for PriorityLock { use super::error::{GPUError, GPUResult}; use super::fft::MultiFFTKernel; -//use super::multiexp::MultiexpKernel; use crate::poly::domain::create_fft_kernel; -//use crate::multiexp::create_multiexp_kernel; -//use crate::pairing::Engine; use std::env; use std::ops::Range; @@ -196,10 +192,4 @@ locked_kernel!( MultiFFTKernel, create_fft_kernel, "FFT" -); -/*locked_kernel!( - LockedMultiexpKernel, - MultiexpKernel, - create_multiexp_kernel, - "Multiexp" -);*/ +); \ No newline at end of file diff --git a/halo2_proofs/src/gpu/mod.rs b/halo2_proofs/src/gpu/mod.rs index 21cecfea40..2160946dd9 100644 --- a/halo2_proofs/src/gpu/mod.rs +++ b/halo2_proofs/src/gpu/mod.rs @@ -2,36 +2,22 @@ mod error; pub use self::error::*; -#[cfg(feature = "gpu")] mod locks; -#[cfg(feature = "gpu")] pub use self::locks::*; -#[cfg(feature = "gpu")] mod 
sources; -#[cfg(feature = "gpu")] pub use self::sources::*; -#[cfg(feature = "gpu")] mod utils; -#[cfg(feature = "gpu")] pub use self::utils::*; -#[cfg(feature = "gpu")] mod fft; -#[cfg(feature = "gpu")] pub use self::fft::*; -//#[cfg(feature = "gpu")] -//mod multiexp; - -//#[cfg(feature = "gpu")] -//pub use self::multiexp::*; - #[cfg(not(feature = "gpu"))] mod nogpu; diff --git a/halo2_proofs/src/gpu/sources.rs b/halo2_proofs/src/gpu/sources.rs index adef02d193..062362580b 100644 --- a/halo2_proofs/src/gpu/sources.rs +++ b/halo2_proofs/src/gpu/sources.rs @@ -1,4 +1,3 @@ -//use group::Group; use crate::arithmetic::Group; use ff_cl_gen as ffgen; use pairing::bn256::{Fr, Fq}; diff --git a/halo2_proofs/src/gpu/utils.rs b/halo2_proofs/src/gpu/utils.rs index b0e4bf83e5..7183d85a46 100644 --- a/halo2_proofs/src/gpu/utils.rs +++ b/halo2_proofs/src/gpu/utils.rs @@ -80,7 +80,6 @@ pub fn dump_device_list() { } } -#[cfg(feature = "gpu")] #[test] pub fn test_list_devices() { let _ = env_logger::try_init(); diff --git a/halo2_proofs/src/lib.rs b/halo2_proofs/src/lib.rs index 920a3c38fd..6c00c0ec91 100644 --- a/halo2_proofs/src/lib.rs +++ b/halo2_proofs/src/lib.rs @@ -32,11 +32,13 @@ pub mod worker { pub use super::multicore::*; } pub mod plonk; +#[cfg(feature = "gpu")] pub mod gpu; pub mod poly; pub mod transcript; pub mod dev; mod helpers; +#[cfg(feature = "gpu")] #[macro_use] extern crate lazy_static; From 135f64c69f9a34b2291670eb1bc24ef32ab71d00 Mon Sep 17 00:00:00 2001 From: huyanlin Date: Thu, 23 Jun 2022 15:19:36 +0800 Subject: [PATCH 08/13] add some doc comments --- halo2_proofs/src/gpu/error.rs | 7 +++++++ halo2_proofs/src/gpu/fft.rs | 5 +++++ halo2_proofs/src/gpu/locks.rs | 10 +++++++++- halo2_proofs/src/gpu/sources.rs | 2 +- halo2_proofs/src/gpu/utils.rs | 2 ++ halo2_proofs/src/lib.rs | 14 +++++++++----- halo2_proofs/src/multicore.rs | 21 +++++++++++++++++++-- halo2_proofs/src/poly/domain.rs | 13 +++++++------ 8 files changed, 59 insertions(+), 15 deletions(-) diff --git a/halo2_proofs/src/gpu/error.rs b/halo2_proofs/src/gpu/error.rs index 50a4316bea..44a6c9e79e 100644 --- a/halo2_proofs/src/gpu/error.rs +++ b/halo2_proofs/src/gpu/error.rs @@ -1,23 +1,30 @@ use rust_gpu_tools::opencl; +/// Gpu err #[derive(thiserror::Error, Debug)] pub enum GPUError { + /// Simple #[error("GPUError: {0}")] Simple(&'static str), + /// OpenCL #[error("OpenCL Error: {0}")] OpenCL(#[from] opencl::GPUError), + /// GPUTaken #[error("GPU taken by a high priority process!")] GPUTaken, + /// KernelUninitialized #[error("No kernel is initialized!")] KernelUninitialized, + /// GPUDisabled #[error("GPU accelerator is disabled!")] GPUDisabled, } +/// Type GPUResult pub type GPUResult = std::result::Result; impl From> for GPUError { diff --git a/halo2_proofs/src/gpu/fft.rs b/halo2_proofs/src/gpu/fft.rs index 25d3f80921..d8c60945db 100644 --- a/halo2_proofs/src/gpu/fft.rs +++ b/halo2_proofs/src/gpu/fft.rs @@ -16,6 +16,7 @@ const LOG2_MAX_ELEMENTS: usize = 32; // At most 2^32 elements is supported. 
const MAX_LOG2_RADIX: u32 = 9; // Radix512 const MAX_LOG2_LOCAL_WORK_SIZE: u32 = 8; // 256 +/// SingleFFTKernel pub struct SingleFFTKernel where G: Group, @@ -30,6 +31,7 @@ impl SingleFFTKernel where G: Group, { + /// New gpu fft kernel device pub fn create(device: opencl::Device, priority: bool) -> GPUResult> { let src = sources::kernel::(device.brand() == opencl::Brand::Nvidia); @@ -140,6 +142,7 @@ where } } +/// Gpu fft kernel vec pub struct MultiFFTKernel where G: Group, @@ -152,6 +155,7 @@ impl MultiFFTKernel where G: Group, { + /// New gpu kernel device pub fn create(priority: bool) -> GPUResult> { let mut all_devices = opencl::Device::all(); let all_num = all_devices.len(); @@ -183,6 +187,7 @@ where }) } + /// fft_multiple call for kernel radix_fft pub fn fft_multiple( &mut self, polys: &mut [&mut [G::Scalar]], diff --git a/halo2_proofs/src/gpu/locks.rs b/halo2_proofs/src/gpu/locks.rs index 0f460136f7..9e6a3cd780 100644 --- a/halo2_proofs/src/gpu/locks.rs +++ b/halo2_proofs/src/gpu/locks.rs @@ -12,6 +12,7 @@ fn tmp_path(filename: &str) -> PathBuf { p } +/// Get_lock_name_and_gpu_range pub fn get_lock_name_and_gpu_range(all_gpus: usize) -> (String, Range) { let mut num_gpus_per_group = all_gpus; let mut group_index = 0usize; @@ -46,6 +47,7 @@ pub fn get_lock_name_and_gpu_range(all_gpus: usize) -> (String, Range) { #[derive(Debug)] pub struct GPULock(File); impl GPULock { + /// GPULock pub fn lock(index: String) -> GPULock { let filename = GPU_LOCK_NAME.to_string() + &index; let gpu_lock_file = tmp_path(&filename); @@ -71,6 +73,7 @@ impl Drop for GPULock { #[derive(Debug)] pub struct PriorityLock(File); impl PriorityLock { + /// PriorityLock pub fn lock() -> PriorityLock { let priority_lock_file = tmp_path(PRIORITY_LOCK_NAME); debug!("Acquiring priority lock at {:?} ...", &priority_lock_file); @@ -84,6 +87,7 @@ impl PriorityLock { debug!("Priority lock acquired!"); PriorityLock(f) } + /// PriorityLock wait pub fn wait(priority: bool) { if !priority { File::create(tmp_path(PRIORITY_LOCK_NAME)) @@ -92,6 +96,7 @@ impl PriorityLock { .unwrap(); } } + /// PriorityLock break pub fn should_break(priority: bool) -> bool { !priority && File::create(tmp_path(PRIORITY_LOCK_NAME)) @@ -115,6 +120,7 @@ use std::ops::Range; macro_rules! locked_kernel { ($class:ident, $kern:ident, $func:ident, $name:expr) => { + /// gpu fft locked kernel pub struct $class where G: Group, @@ -128,6 +134,7 @@ macro_rules! locked_kernel { where G: Group, { + /// New gpu kernel macro pub fn new(log_d: usize, priority: bool) -> $class { $class:: { log_d, @@ -152,7 +159,8 @@ macro_rules! locked_kernel { ); } } - + + /// Gpu kernel config pub fn with(&mut self, mut f: F) -> GPUResult where F: FnMut(&mut $kern) -> GPUResult, diff --git a/halo2_proofs/src/gpu/sources.rs b/halo2_proofs/src/gpu/sources.rs index 062362580b..74da5317eb 100644 --- a/halo2_proofs/src/gpu/sources.rs +++ b/halo2_proofs/src/gpu/sources.rs @@ -39,7 +39,7 @@ fn multiexp(point: &str, exp: &str) -> String { .replace("EXPONENT", exp) } -// WARNING: This function works only with Short Weierstrass Jacobian curves with Fq2 extension field. +/// WARNING: This function works only with Short Weierstrass Jacobian curves with Fq2 extension field. pub fn kernel(limb64: bool) -> String where G: Group, diff --git a/halo2_proofs/src/gpu/utils.rs b/halo2_proofs/src/gpu/utils.rs index 7183d85a46..701f252f53 100644 --- a/halo2_proofs/src/gpu/utils.rs +++ b/halo2_proofs/src/gpu/utils.rs @@ -57,6 +57,7 @@ lazy_static::lazy_static! 
{ } const DEFAULT_CORE_COUNT: usize = 2560; +/// Get core number pub fn get_core_count(d: &opencl::Device) -> usize { let name = d.name(); match CORE_COUNTS.get(&name[..]) { @@ -74,6 +75,7 @@ pub fn get_core_count(d: &opencl::Device) -> usize { } } +/// Dump gpu device pub fn dump_device_list() { for d in opencl::Device::all() { info!("Device: {:?}", d); diff --git a/halo2_proofs/src/lib.rs b/halo2_proofs/src/lib.rs index 6c00c0ec91..088221dcc0 100644 --- a/halo2_proofs/src/lib.rs +++ b/halo2_proofs/src/lib.rs @@ -18,8 +18,8 @@ clippy::upper_case_acronyms )] #![deny(broken_intra_doc_links)] -//#![deny(missing_debug_implementations)] -//#![deny(missing_docs)] +// #![deny(missing_debug_implementations)] +// #![deny(missing_docs)] #![deny(unsafe_code)] // Remove this once we update pasta_curves #![allow(unused_imports)] @@ -28,12 +28,16 @@ pub mod arithmetic; pub mod circuit; pub use pairing; mod multicore; -pub mod worker { - pub use super::multicore::*; -} pub mod plonk; + +/// Gpu fft #[cfg(feature = "gpu")] pub mod gpu; +/// Gpu worker thread +pub mod worker { + pub use super::multicore::*; +} + pub mod poly; pub mod transcript; diff --git a/halo2_proofs/src/multicore.rs b/halo2_proofs/src/multicore.rs index 1e379506d0..27eb3c8842 100644 --- a/halo2_proofs/src/multicore.rs +++ b/halo2_proofs/src/multicore.rs @@ -13,8 +13,12 @@ use std::sync::atomic::{AtomicUsize, Ordering}; static WORKER_SPAWN_COUNTER: AtomicUsize = AtomicUsize::new(0); -#[deny(missing_docs)] +/// Threadpool and number_cpus +#[cfg(feature = "gpu")] +#[allow(unused_doc_comments)] +#[allow(missing_docs)] lazy_static! { + // THREAD_POOL and NUM_CPUS static ref NUM_CPUS: usize = if let Ok(num) = env::var("BELLMAN_NUM_CPUS") { if let Ok(num) = num.parse() { num @@ -26,28 +30,35 @@ lazy_static! { }; // See Worker::compute below for a description of this. static ref WORKER_SPAWN_MAX_COUNT: usize = *NUM_CPUS * 4; + // THREAD_POOL pub static ref THREAD_POOL: rayon::ThreadPool = rayon::ThreadPoolBuilder::new() .num_threads(*NUM_CPUS) .build() .unwrap(); } +/// Thread pool #[derive(Clone)] +#[derive(Debug)] pub struct Worker {} impl Worker { + /// New worker pub fn new() -> Worker { Worker {} } + /// Get_num_cpus pub fn get_num_cpus(&self) -> usize { *NUM_CPUS } + /// Log_num_cpus pub fn log_num_cpus(&self) -> u32 { log2_floor(*NUM_CPUS) } + /// Compute pub fn compute(&self, f: F) -> Waiter where F: FnOnce() -> R + Send + 'static, @@ -97,6 +108,7 @@ impl Worker { Waiter { receiver } } + /// Scope pub fn scope<'a, F, R>(&self, elements: usize, f: F) -> R where F: FnOnce(&rayon::Scope<'a>, usize) -> R + Send, @@ -107,6 +119,7 @@ impl Worker { THREAD_POOL.scope(|scope| f(scope, chunk_size)) } + /// In_place_scope pub fn in_place_scope<'a, F, R>(&self, elements: usize, f: F) -> R where F: FnOnce(&rayon::Scope<'a>, usize) -> R, @@ -116,6 +129,7 @@ impl Worker { THREAD_POOL.in_place_scope(|scope| f(scope, chunk_size)) } + /// Get_chunk_size pub fn get_chunk_size(&self, elements: usize) -> usize { let chunk_size = if elements <= *NUM_CPUS { 1 @@ -125,7 +139,7 @@ impl Worker { chunk_size } - // TODO: check +1? 
+ /// Chunk_size_for_num_spawned_threads pub fn chunk_size_for_num_spawned_threads(elements: usize, num_threads: usize) -> usize { assert!( elements >= num_threads, @@ -140,6 +154,7 @@ impl Worker { } } + /// Get_num_spawned_threads pub fn get_num_spawned_threads(&self, elements: usize) -> usize { let num_spawned = if elements <= *NUM_CPUS { elements @@ -158,6 +173,8 @@ impl Worker { } } +/// Waiter for receiving +#[derive(Debug)] pub struct Waiter { receiver: Receiver, } diff --git a/halo2_proofs/src/poly/domain.rs b/halo2_proofs/src/poly/domain.rs index 134d10d1d6..ba906aa7a4 100644 --- a/halo2_proofs/src/poly/domain.rs +++ b/halo2_proofs/src/poly/domain.rs @@ -490,6 +490,8 @@ pub struct PinnedEvaluationDomain<'a, G: Group> { omega: &'a G::Scalar, } +/// Config gpu fft kernel +#[cfg(feature = "gpu")] pub fn create_fft_kernel(_log_d: usize, priority: bool) -> Option> where G: Group, @@ -506,11 +508,11 @@ where } } -use crate::worker::Worker; +/// Wrap `gpu_fft_multiple` +#[cfg(feature = "gpu")] pub fn best_fft_multiple_gpu( kern: &mut Option>, polys: &mut [&mut [G::Scalar]], - worker: &Worker, omega: &G::Scalar, log_n: u32, ) -> gpu::GPUResult<()> { @@ -526,6 +528,8 @@ pub fn best_fft_multiple_gpu( Ok(()) } +/// Use multiple gpu fft +#[cfg(feature = "gpu")] pub fn gpu_fft_multiple( kern: &mut gpu::MultiFFTKernel, polys: &mut [&mut [G::Scalar]], @@ -537,14 +541,12 @@ pub fn gpu_fft_multiple( Ok(()) } +#[cfg(feature = "gpu")] #[test] fn test_best_fft_multiple_gpu() { use crate::gpu::LockedMultiFFTKernel; use crate::pairing::bn256::Fr; use pairing::bn256::Bn256; - use crate::worker::Worker; - - let worker = Worker::new(); use crate::poly::{EvaluationDomain}; use ark_std::{end_timer, start_timer}; @@ -577,7 +579,6 @@ fn test_best_fft_multiple_gpu() { best_fft_multiple_gpu( &mut fft_kern, &mut [&mut optimized_fft_coeffs], - &worker, &domain.get_omega(), k as u32, ) From f5931aca110fb8d9a82ec9777395bf154d2ccfff Mon Sep 17 00:00:00 2001 From: huyanlin Date: Thu, 23 Jun 2022 15:45:38 +0800 Subject: [PATCH 09/13] fix doc comment typo --- halo2_proofs/src/lib.rs | 2 +- halo2_proofs/src/multicore.rs | 12 ++++-------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/halo2_proofs/src/lib.rs b/halo2_proofs/src/lib.rs index 088221dcc0..eb9412154f 100644 --- a/halo2_proofs/src/lib.rs +++ b/halo2_proofs/src/lib.rs @@ -19,7 +19,7 @@ )] #![deny(broken_intra_doc_links)] // #![deny(missing_debug_implementations)] -// #![deny(missing_docs)] +#![deny(missing_docs)] #![deny(unsafe_code)] // Remove this once we update pasta_curves #![allow(unused_imports)] diff --git a/halo2_proofs/src/multicore.rs b/halo2_proofs/src/multicore.rs index 27eb3c8842..ab4b4b3120 100644 --- a/halo2_proofs/src/multicore.rs +++ b/halo2_proofs/src/multicore.rs @@ -13,12 +13,9 @@ use std::sync::atomic::{AtomicUsize, Ordering}; static WORKER_SPAWN_COUNTER: AtomicUsize = AtomicUsize::new(0); -/// Threadpool and number_cpus #[cfg(feature = "gpu")] -#[allow(unused_doc_comments)] -#[allow(missing_docs)] lazy_static! { - // THREAD_POOL and NUM_CPUS + /// THREAD_POOL and NUM_CPUS static ref NUM_CPUS: usize = if let Ok(num) = env::var("BELLMAN_NUM_CPUS") { if let Ok(num) = num.parse() { num @@ -28,9 +25,9 @@ lazy_static! { } else { num_cpus::get() }; - // See Worker::compute below for a description of this. + /// See Worker::compute below for a description of this. 
static ref WORKER_SPAWN_MAX_COUNT: usize = *NUM_CPUS * 4; - // THREAD_POOL + /// THREAD_POOL pub static ref THREAD_POOL: rayon::ThreadPool = rayon::ThreadPoolBuilder::new() .num_threads(*NUM_CPUS) .build() @@ -38,8 +35,7 @@ lazy_static! { } /// Thread pool -#[derive(Clone)] -#[derive(Debug)] +#[derive(Clone, Debug)] pub struct Worker {} impl Worker { From 8c37bf9d7d32ade5b9b494d569349b8f8d526262 Mon Sep 17 00:00:00 2001 From: huyanlin Date: Thu, 23 Jun 2022 20:08:31 +0800 Subject: [PATCH 10/13] open lint check --- halo2_proofs/src/gpu/fft.rs | 3 +++ halo2_proofs/src/gpu/locks.rs | 1 + halo2_proofs/src/lib.rs | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/halo2_proofs/src/gpu/fft.rs b/halo2_proofs/src/gpu/fft.rs index d8c60945db..21b86502f4 100644 --- a/halo2_proofs/src/gpu/fft.rs +++ b/halo2_proofs/src/gpu/fft.rs @@ -17,6 +17,7 @@ const MAX_LOG2_RADIX: u32 = 9; // Radix512 const MAX_LOG2_LOCAL_WORK_SIZE: u32 = 8; // 256 /// SingleFFTKernel +#[allow(missing_debug_implementations)] pub struct SingleFFTKernel where G: Group, @@ -24,6 +25,7 @@ where program: opencl::Program, pq_buffer: opencl::Buffer, omegas_buffer: opencl::Buffer, + #[allow(dead_code)] priority: bool, } @@ -143,6 +145,7 @@ where } /// Gpu fft kernel vec +#[allow(missing_debug_implementations)] pub struct MultiFFTKernel where G: Group, diff --git a/halo2_proofs/src/gpu/locks.rs b/halo2_proofs/src/gpu/locks.rs index 9e6a3cd780..8a60c3b63f 100644 --- a/halo2_proofs/src/gpu/locks.rs +++ b/halo2_proofs/src/gpu/locks.rs @@ -121,6 +121,7 @@ use std::ops::Range; macro_rules! locked_kernel { ($class:ident, $kern:ident, $func:ident, $name:expr) => { /// gpu fft locked kernel + #[allow(missing_debug_implementations)] pub struct $class where G: Group, diff --git a/halo2_proofs/src/lib.rs b/halo2_proofs/src/lib.rs index eb9412154f..7ac27bd758 100644 --- a/halo2_proofs/src/lib.rs +++ b/halo2_proofs/src/lib.rs @@ -18,7 +18,7 @@ clippy::upper_case_acronyms )] #![deny(broken_intra_doc_links)] -// #![deny(missing_debug_implementations)] +#![deny(missing_debug_implementations)] #![deny(missing_docs)] #![deny(unsafe_code)] // Remove this once we update pasta_curves From 2125c6c13e0f515605b4f85fb74455010f76c056 Mon Sep 17 00:00:00 2001 From: huyanlin Date: Thu, 23 Jun 2022 22:51:02 +0800 Subject: [PATCH 11/13] set gpu acceleration as optional feature --- halo2_proofs/Cargo.toml | 4 ++-- halo2_proofs/src/multicore.rs | 8 ++++++++ halo2_proofs/src/poly/domain.rs | 1 + 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/halo2_proofs/Cargo.toml b/halo2_proofs/Cargo.toml index 15770eb30e..902f100588 100644 --- a/halo2_proofs/Cargo.toml +++ b/halo2_proofs/Cargo.toml @@ -77,12 +77,12 @@ rand_core = { version = "0.6", default-features = false, features = ["getrandom" getrandom = { version = "0.2", features = ["js"] } [features] -# todo: set gpu as optional feature -default = ["shplonk", "gpu"] +default = ["shplonk"] dev-graph = ["plotters", "tabbycat"] gadget-traces = ["backtrace"] sanity-checks = [] shplonk = [] +# set gpu acceleration as optional feature gpu = ["rust-gpu-tools", "ff-cl-gen", "fs2", "lazy_static", "crossbeam", "futures/thread-pool"] gwc = [] diff --git a/halo2_proofs/src/multicore.rs b/halo2_proofs/src/multicore.rs index ab4b4b3120..e2cf25071a 100644 --- a/halo2_proofs/src/multicore.rs +++ b/halo2_proofs/src/multicore.rs @@ -5,12 +5,14 @@ pub use rayon::{current_num_threads, scope, Scope}; use crossbeam_channel::{bounded, Receiver}; +#[cfg(feature = "gpu")] use lazy_static::lazy_static; use log::{error, 
trace}; use std::env; use std::sync::atomic::{AtomicUsize, Ordering}; +#[cfg(feature = "gpu")] static WORKER_SPAWN_COUNTER: AtomicUsize = AtomicUsize::new(0); #[cfg(feature = "gpu")] @@ -35,9 +37,11 @@ lazy_static! { } /// Thread pool +#[cfg(feature = "gpu")] #[derive(Clone, Debug)] pub struct Worker {} +#[cfg(feature = "gpu")] impl Worker { /// New worker pub fn new() -> Worker { @@ -170,11 +174,13 @@ impl Worker { } /// Waiter for receiving +#[cfg(feature = "gpu")] #[derive(Debug)] pub struct Waiter { receiver: Receiver, } +#[cfg(feature = "gpu")] impl Waiter { /// Wait for the result. pub fn wait(&self) -> T { @@ -195,6 +201,7 @@ impl Waiter { } } +#[cfg(feature = "gpu")] fn log2_floor(num: usize) -> u32 { assert!(num > 0); @@ -207,6 +214,7 @@ fn log2_floor(num: usize) -> u32 { pow } +#[cfg(feature = "gpu")] #[cfg(test)] mod tests { use super::*; diff --git a/halo2_proofs/src/poly/domain.rs b/halo2_proofs/src/poly/domain.rs index ba906aa7a4..93df085c82 100644 --- a/halo2_proofs/src/poly/domain.rs +++ b/halo2_proofs/src/poly/domain.rs @@ -14,6 +14,7 @@ use group::Group as CryptGroup; use std::marker::PhantomData; +#[cfg(feature = "gpu")] use crate::gpu; use log::{info, warn}; From 6c1bc46a4101a91701e1a60d7f64ec2ecc6a41f5 Mon Sep 17 00:00:00 2001 From: huyanlin Date: Thu, 23 Jun 2022 23:32:33 +0800 Subject: [PATCH 12/13] code fmt --- halo2_proofs/src/gpu/fft.rs | 13 +++++++++---- halo2_proofs/src/gpu/locks.rs | 6 +++--- halo2_proofs/src/gpu/sources.rs | 2 +- halo2_proofs/src/multicore.rs | 2 +- halo2_proofs/src/poly/domain.rs | 4 ++-- 5 files changed, 16 insertions(+), 11 deletions(-) diff --git a/halo2_proofs/src/gpu/fft.rs b/halo2_proofs/src/gpu/fft.rs index 21b86502f4..09feff1b9f 100644 --- a/halo2_proofs/src/gpu/fft.rs +++ b/halo2_proofs/src/gpu/fft.rs @@ -1,16 +1,16 @@ +use crate::arithmetic::Group; use crate::gpu::{ error::{GPUError, GPUResult}, get_lock_name_and_gpu_range, locks, sources, }; -use group::ff::Field; -use crate::arithmetic::Group; use crate::worker::THREAD_POOL; +use group::ff::Field; use log::{error, info}; use rayon::join; use rust_gpu_tools::*; use std::cmp::min; -use std::{cmp, env}; use std::ops::MulAssign; +use std::{cmp, env}; const LOG2_MAX_ELEMENTS: usize = 32; // At most 2^32 elements is supported. const MAX_LOG2_RADIX: u32 = 9; // Radix512 @@ -121,7 +121,12 @@ where /// Performs FFT on `a` /// * `omega` - Special value `omega` is used for FFT over finite-fields /// * `log_n` - Specifies log2 of number of elements - pub fn radix_fft(&mut self, a: &mut [G::Scalar], omega: &G::Scalar, log_n: u32) -> GPUResult<()> { + pub fn radix_fft( + &mut self, + a: &mut [G::Scalar], + omega: &G::Scalar, + log_n: u32, + ) -> GPUResult<()> { let n = 1 << log_n; let mut src_buffer = self.program.create_buffer::(n)?; let mut dst_buffer = self.program.create_buffer::(n)?; diff --git a/halo2_proofs/src/gpu/locks.rs b/halo2_proofs/src/gpu/locks.rs index 8a60c3b63f..77a8d46ee2 100644 --- a/halo2_proofs/src/gpu/locks.rs +++ b/halo2_proofs/src/gpu/locks.rs @@ -1,8 +1,8 @@ +use crate::arithmetic::Group; use fs2::FileExt; use log::{debug, info, warn}; use std::fs::File; use std::path::PathBuf; -use crate::arithmetic::Group; const GPU_LOCK_NAME: &str = "bellman.gpu.lock"; const PRIORITY_LOCK_NAME: &str = "bellman.priority.lock"; @@ -160,7 +160,7 @@ macro_rules! 
locked_kernel { ); } } - + /// Gpu kernel config pub fn with(&mut self, mut f: F) -> GPUResult where @@ -201,4 +201,4 @@ locked_kernel!( MultiFFTKernel, create_fft_kernel, "FFT" -); \ No newline at end of file +); diff --git a/halo2_proofs/src/gpu/sources.rs b/halo2_proofs/src/gpu/sources.rs index 74da5317eb..11dc3144a2 100644 --- a/halo2_proofs/src/gpu/sources.rs +++ b/halo2_proofs/src/gpu/sources.rs @@ -1,6 +1,6 @@ use crate::arithmetic::Group; use ff_cl_gen as ffgen; -use pairing::bn256::{Fr, Fq}; +use pairing::bn256::{Fq, Fr}; // Instead of having a very large OpenCL program written for a specific curve, with a lot of // rudandant codes (As OpenCL doesn't have generic types or templates), this module will dynamically diff --git a/halo2_proofs/src/multicore.rs b/halo2_proofs/src/multicore.rs index e2cf25071a..8654191021 100644 --- a/halo2_proofs/src/multicore.rs +++ b/halo2_proofs/src/multicore.rs @@ -58,7 +58,7 @@ impl Worker { log2_floor(*NUM_CPUS) } - /// Compute + /// Compute pub fn compute(&self, f: F) -> Waiter where F: FnOnce() -> R + Send + 'static, diff --git a/halo2_proofs/src/poly/domain.rs b/halo2_proofs/src/poly/domain.rs index 93df085c82..3018b4614a 100644 --- a/halo2_proofs/src/poly/domain.rs +++ b/halo2_proofs/src/poly/domain.rs @@ -549,7 +549,7 @@ fn test_best_fft_multiple_gpu() { use crate::pairing::bn256::Fr; use pairing::bn256::Bn256; - use crate::poly::{EvaluationDomain}; + use crate::poly::EvaluationDomain; use ark_std::{end_timer, start_timer}; use rand_core::OsRng; @@ -594,7 +594,7 @@ fn test_best_fft_multiple_gpu() { #[test] fn test_fft() { - use crate::poly::{EvaluationDomain}; + use crate::poly::EvaluationDomain; use ark_std::{end_timer, start_timer}; use pairing::bn256::Fr; use rand_core::OsRng; From 407c46462851ce84a7a824d988c2f3a33e08bbd4 Mon Sep 17 00:00:00 2001 From: huyanlin Date: Mon, 27 Jun 2022 15:14:52 +0800 Subject: [PATCH 13/13] take params coeffs as measurement --- halo2_proofs/src/poly/domain.rs | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/halo2_proofs/src/poly/domain.rs b/halo2_proofs/src/poly/domain.rs index 3018b4614a..ca1e2e7ce0 100644 --- a/halo2_proofs/src/poly/domain.rs +++ b/halo2_proofs/src/poly/domain.rs @@ -562,19 +562,18 @@ fn test_best_fft_multiple_gpu() { // evaluation domain let domain: EvaluationDomain = EvaluationDomain::new(1, k); - let mut prev_fft_coeffs = coeffs.clone(); - - let mut optimized_fft_coeffs = coeffs.clone(); - let message = format!("prev_fft degree {}", k); let start = start_timer!(|| message); + + let mut prev_fft_coeffs = coeffs.clone(); best_fft(&mut prev_fft_coeffs, domain.get_omega(), k); + end_timer!(start); - //println!("coeffs cpu {:?}\n",prev_fft_coeffs); let message = format!("gpu_fft degree {}", k); let start = start_timer!(|| message); + let mut optimized_fft_coeffs = coeffs.clone(); let mut fft_kern = Some(LockedMultiFFTKernel::::new(k as usize, false)); best_fft_multiple_gpu( @@ -587,7 +586,6 @@ fn test_best_fft_multiple_gpu() { end_timer!(start); - //println!("coeffs gpu {:?}\n",optimized_fft_coeffs); assert_eq!(prev_fft_coeffs, optimized_fft_coeffs); } } @@ -608,11 +606,12 @@ fn test_fft() { // evaluation domain let domain: EvaluationDomain = EvaluationDomain::new(1, k); - let mut prev_fft_coeffs = coeffs.clone(); - let message = format!("prev_fft degree {}", k); let start = start_timer!(|| message); + + let mut prev_fft_coeffs = coeffs.clone(); best_fft(&mut prev_fft_coeffs, domain.get_omega(), k); + end_timer!(start); } }
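
Illustrative usage sketch (not part of the patch series): with the `gpu` feature now opt-in (e.g. `halo2_proofs = { ..., features = ["gpu"] }` or `cargo test --features gpu`), the multi-GPU FFT entry point added above would be driven roughly as below. This is a minimal sketch written as if it lived inside poly/domain.rs, mirroring test_best_fft_multiple_gpu; the angle-bracketed type parameters, which the diff rendering above drops, are assumed to be the BN256 scalar field Fr.

#[cfg(feature = "gpu")]
fn gpu_fft_sketch() {
    use crate::gpu::LockedMultiFFTKernel;
    use crate::poly::EvaluationDomain;
    use group::ff::Field;
    use pairing::bn256::Fr;
    use rand_core::OsRng;

    let k: u32 = 20;
    // 2^k random coefficients to transform in place.
    let mut coeffs: Vec<Fr> = (0..(1 << k)).map(|_| Fr::random(OsRng)).collect();
    let domain: EvaluationDomain<Fr> = EvaluationDomain::new(1, k);

    // Lazily acquire the file-locked multi-GPU kernel (priority = false).
    let mut fft_kern = Some(LockedMultiFFTKernel::<Fr>::new(k as usize, false));

    // Batch FFT over every polynomial slice passed in; here just one.
    best_fft_multiple_gpu(&mut fft_kern, &mut [coeffs.as_mut_slice()], &domain.get_omega(), k)
        .unwrap();
}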