From c383a1d452d538bb47183b548c3c826bd77ade63 Mon Sep 17 00:00:00 2001
From: Jia Liu <lovelykitoun@gmail.com>
Date: Thu, 13 Jul 2023 11:27:42 +0100
Subject: [PATCH 1/3] fix bug in div_unsafe: use assigned_to_sub for term a

---
 maingate/src/instructions.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/maingate/src/instructions.rs b/maingate/src/instructions.rs
index fd501478..514b3d41 100644
--- a/maingate/src/instructions.rs
+++ b/maingate/src/instructions.rs
@@ -400,7 +400,7 @@ pub trait MainGateInstructions<F: PrimeField, const WIDTH: usize>: Chip<F> {
                 [
                     Term::assigned_to_mul(b),
                     Term::unassigned_to_mul(c),
-                    Term::assigned_to_add(a),
+                    Term::assigned_to_sub(a),
                 ],
                 F::ZERO,
                 CombinationOptionCommon::OneLinerMul.into(),

From 7357aefde4a60264a139b1916d0988d2bffc412f Mon Sep 17 00:00:00 2001
From: Jia Liu <lovelykitoun@gmail.com>
Date: Mon, 21 Aug 2023 17:17:29 +0100
Subject: [PATCH 2/3] windowed scalar mul with aux for base field and general
 chip

---
 ecc/Cargo.toml                |   3 +-
 ecc/src/base_field_ecc.rs     | 108 +++++++++++++------
 ecc/src/base_field_ecc/mul.rs | 164 ++++++++++++++++-------------
 ecc/src/general_ecc.rs        | 102 ++++++++++++------
 ecc/src/general_ecc/mul.rs    | 189 +++++++++++++++++++---------------
 ecdsa/src/ecdsa.rs            |   2 +-
 6 files changed, 349 insertions(+), 219 deletions(-)

diff --git a/ecc/Cargo.toml b/ecc/Cargo.toml
index 6aa13e59..615c673a 100644
--- a/ecc/Cargo.toml
+++ b/ecc/Cargo.toml
@@ -15,7 +15,8 @@ subtle = { version = "2.3", default-features = false }
 [dev-dependencies]
 rand_core = { version = "0.6", default-features = false }
 paste = "1.0.7"
+rand_chacha = "0.3.1"
 
 [features]
 default = []
-circuit-params = ["integer/circuit-params"]
+circuit-params = ["integer/circuit-params"]
\ No newline at end of file
diff --git a/ecc/src/base_field_ecc.rs b/ecc/src/base_field_ecc.rs
index c38e8855..92f30cbd 100644
--- a/ecc/src/base_field_ecc.rs
+++ b/ecc/src/base_field_ecc.rs
@@ -1,4 +1,6 @@
-use super::{make_mul_aux, AssignedPoint, EccConfig, MulAux, Point};
+use super::{AssignedPoint, EccConfig, MulAux, Point};
+use crate::halo2::arithmetic::Field;
+use crate::halo2::halo2curves::ff::PrimeField;
 use crate::integer::chip::IntegerChip;
 use crate::integer::rns::{Integer, Rns};
 use crate::{halo2, maingate};
@@ -29,10 +31,14 @@ pub struct BaseFieldEccChip<C: CurveAffine, const NUMBER_OF_LIMBS: usize, const
         AssignedPoint<C::Base, C::Scalar, NUMBER_OF_LIMBS, BIT_LEN_LIMB>,
         Value<C>,
     )>,
-    /// Auxiliary points for optimized multiplication for each (window_size,
-    /// n_pairs) pairs
-    aux_registry:
-        BTreeMap<(usize, usize), AssignedPoint<C::Base, C::Scalar, NUMBER_OF_LIMBS, BIT_LEN_LIMB>>,
+    /// Scalar correction and auxiliary point for optimized multiplication, keyed by window_size
+    aux_registry: BTreeMap<
+        usize,
+        (
+            C::Scalar,
+            AssignedPoint<C::Base, C::Scalar, NUMBER_OF_LIMBS, BIT_LEN_LIMB>,
+        ),
+    >,
 }
 
 impl<C: CurveAffine, const NUMBER_OF_LIMBS: usize, const BIT_LEN_LIMB: usize>
@@ -84,22 +90,26 @@ impl<C: CurveAffine, const NUMBER_OF_LIMBS: usize, const BIT_LEN_LIMB: usize>
     }
 
     /// Auxilary point for optimized multiplication algorithm
-    fn get_mul_aux(
+    fn get_mul_correction(
         &self,
         window_size: usize,
-        number_of_pairs: usize,
-    ) -> Result<MulAux<C::Base, C::Scalar, NUMBER_OF_LIMBS, BIT_LEN_LIMB>, Error> {
+    ) -> Result<
+        (
+            C::Scalar,
+            MulAux<C::Base, C::Scalar, NUMBER_OF_LIMBS, BIT_LEN_LIMB>,
+        ),
+        Error,
+    > {
         let to_add = match self.aux_generator.clone() {
             Some((assigned, _)) => Ok(assigned),
             None => Err(Error::Synthesis),
         }?;
-        let to_sub = match self.aux_registry.get(&(window_size, number_of_pairs)) {
+        let (scalar_correction, to_sub) = match self.aux_registry.get(&window_size) {
             Some(aux) => Ok(aux.clone()),
             None => Err(Error::Synthesis),
         }?;
-        // to_add the equivalent of AuxInit and to_sub AuxFin
-        // see https://hackmd.io/ncuKqRXzR-Cw-Au2fGzsMg?view
-        Ok(MulAux::new(to_add, to_sub))
+
+        Ok((scalar_correction, MulAux::new(to_add, to_sub)))
     }
 }
 
@@ -175,20 +185,22 @@ impl<C: CurveAffine, const NUMBER_OF_LIMBS: usize, const BIT_LEN_LIMB: usize>
         Ok(())
     }
 
-    /// Assigns multiplication auxiliary point for a pair of (window_size,
-    /// n_pairs)
-    pub fn assign_aux(
+    /// Assigns scalar correction and multiplication auxiliary point for window_size
+    pub fn assign_correction(
         &mut self,
         ctx: &mut RegionCtx<'_, C::Scalar>,
         window_size: usize,
-        number_of_pairs: usize,
     ) -> Result<(), Error> {
-        match self.aux_generator {
-            Some((_, point)) => {
-                let aux = point.map(|point| make_mul_aux(point, window_size, number_of_pairs));
-                let aux = self.assign_point(ctx, aux)?;
+        let scalar_correction = self.correct_scalar(window_size);
+
+        match &self.aux_generator {
+            Some((point, _)) => {
+                // compute correction point -2^w aux
+                let mut point_correction = self.double_n(ctx, point, window_size)?;
+                point_correction = self.neg(ctx, &point_correction)?;
+
                 self.aux_registry
-                    .insert((window_size, number_of_pairs), aux);
+                    .insert(window_size, (scalar_correction, point_correction));
                 Ok(())
             }
             // aux generator is not assigned yet
@@ -196,6 +208,25 @@ impl<C: CurveAffine, const NUMBER_OF_LIMBS: usize, const BIT_LEN_LIMB: usize>
         }
     }
 
+    /// Scalar correction added before mul: -2(1 + 2^w + ... + 2^{w(n-1)}), n = number_of_windows
+    fn correct_scalar(&mut self, window_size: usize) -> C::Scalar {
+        let window: usize = 1 << window_size;
+        let window_scalar = C::Scalar::from(window as u64);
+
+        let num_bits = C::Scalar::NUM_BITS as usize;
+        let number_of_windows = (num_bits + window_size - 1) / window_size;
+
+        let mut correction = C::Scalar::ONE;
+        let mut power = window_scalar;
+        for _ in 0..number_of_windows - 1 {
+            correction += power;
+            power *= window_scalar;
+        }
+        correction += correction;
+
+        -correction
+    }
+
     /// Constraints to ensure `AssignedPoint` is on curve
     pub fn assert_is_on_curve(
         &self,
@@ -351,6 +382,7 @@ mod tests {
     use crate::integer::rns::Rns;
     use crate::integer::NUMBER_OF_LOOKUP_LIMBS;
     use crate::maingate;
+    use crate::maingate::DimensionMeasurement;
     use halo2::arithmetic::CurveAffine;
     use halo2::circuit::{Layouter, SimpleFloorPlanner, Value};
     use halo2::halo2curves::{
@@ -365,7 +397,8 @@ mod tests {
         RangeInstructions,
     };
     use paste::paste;
-    use rand_core::OsRng;
+    use rand_chacha::ChaCha20Rng;
+    use rand_core::{OsRng, SeedableRng};
 
     const NUMBER_OF_LIMBS: usize = 4;
     const BIT_LEN_LIMB: usize = 68;
@@ -659,8 +692,7 @@ mod tests {
                     let offset = 0;
                     let ctx = &mut RegionCtx::new(region, offset);
                     ecc_chip.assign_aux_generator(ctx, Value::known(self.aux_generator))?;
-                    ecc_chip.assign_aux(ctx, self.window_size, 1)?;
-                    ecc_chip.get_mul_aux(self.window_size, 1)?;
+                    ecc_chip.assign_correction(ctx, self.window_size)?;
                     Ok(())
                 },
             )?;
@@ -671,8 +703,11 @@ mod tests {
                     let offset = 0;
                     let ctx = &mut RegionCtx::new(region, offset);
 
-                    let base = C::CurveExt::random(OsRng);
-                    let s = C::Scalar::random(OsRng);
+                    // let mut rng = ChaCha20Rng::seed_from_u64(80);
+                    let mut rng = OsRng;
+
+                    let base = C::CurveExt::random(&mut rng);
+                    let s = C::Scalar::random(&mut rng);
                     let result = base * s;
 
                     let base = ecc_chip.assign_point(ctx, Value::known(base.into()))?;
@@ -698,8 +733,11 @@ mod tests {
         where
             C::Scalar: FromUniformBytes<64>,
         {
-            for window_size in 1..5 {
-                let aux_generator = <C as CurveAffine>::CurveExt::random(OsRng).to_affine();
+            //let mut rng = ChaCha20Rng::seed_from_u64(42);
+            let mut rng = OsRng;
+
+            for window_size in 2..5 {
+                let aux_generator = <C as CurveAffine>::CurveExt::random(&mut rng).to_affine();
 
                 let circuit = TestEccMul {
                     aux_generator,
@@ -707,6 +745,11 @@ mod tests {
                 };
                 let instance = vec![vec![]];
                 mock_prover_verify(&circuit, instance);
+                let dimension = DimensionMeasurement::measure(&circuit).unwrap();
+                println!(
+                    "window_size = {:?}, dimention: {:?}",
+                    window_size, dimension
+                );
             }
         }
         run::<Bn256>();
@@ -752,8 +795,7 @@ mod tests {
                     let offset = 0;
                     let ctx = &mut RegionCtx::new(region, offset);
                     ecc_chip.assign_aux_generator(ctx, Value::known(self.aux_generator))?;
-                    ecc_chip.assign_aux(ctx, self.window_size, self.number_of_pairs)?;
-                    ecc_chip.get_mul_aux(self.window_size, self.number_of_pairs)?;
+                    ecc_chip.assign_correction(ctx, self.window_size)?;
                     Ok(())
                 },
             )?;
@@ -799,8 +841,8 @@ mod tests {
             paste! {
                 #[test]
                 fn [<test_base_field_ecc_mul_batch_circuit_ $C:lower>]() {
-                    for number_of_pairs in 5..7 {
-                        for window_size in 1..3 {
+                    for number_of_pairs in 2..7 {
+                        for window_size in 2..4 {
                             let aux_generator = <$C as CurveAffine>::CurveExt::random(OsRng).to_affine();
 
                             let circuit = TestEccBatchMul {
@@ -810,6 +852,8 @@ mod tests {
                             };
                             let instance = vec![vec![]];
                             mock_prover_verify(&circuit, instance);
+                            let dimension = DimensionMeasurement::measure(&circuit).unwrap();
+                            println!("(number of pairs,  window_size) = ({:?}, {:?}), dimention: {:?}", number_of_pairs, window_size, dimension);
                         }
                     }
                 }
diff --git a/ecc/src/base_field_ecc/mul.rs b/ecc/src/base_field_ecc/mul.rs
index 6c9ccf9f..7fab2523 100644
--- a/ecc/src/base_field_ecc/mul.rs
+++ b/ecc/src/base_field_ecc/mul.rs
@@ -9,56 +9,41 @@ use integer::maingate::RegionCtx;
 impl<C: CurveAffine, const NUMBER_OF_LIMBS: usize, const BIT_LEN_LIMB: usize>
     BaseFieldEccChip<C, NUMBER_OF_LIMBS, BIT_LEN_LIMB>
 {
-    /// Pads scalar up to the next window_size mul
-    fn pad(
-        &self,
-        ctx: &mut RegionCtx<'_, C::Scalar>,
-        bits: &mut Vec<AssignedCondition<C::Scalar>>,
-        window_size: usize,
-    ) -> Result<(), Error> {
-        assert_eq!(bits.len(), C::Scalar::NUM_BITS as usize);
-
-        // TODO: This is a tmp workaround. Instead of padding with zeros we can use a
-        // shorter ending window.
-        let padding_offset = (window_size - (bits.len() % window_size)) % window_size;
-        let zeros: Vec<AssignedCondition<C::Scalar>> = (0..padding_offset)
-            .map(|_| self.main_gate().assign_constant(ctx, C::Scalar::ZERO))
-            .collect::<Result<_, Error>>()?;
-        bits.extend(zeros);
-        bits.reverse();
-
-        Ok(())
-    }
-
     /// Splits the bit representation of a scalar into windows
     fn window(bits: Vec<AssignedCondition<C::Scalar>>, window_size: usize) -> Windowed<C::Scalar> {
-        assert_eq!(bits.len() % window_size, 0);
-        let number_of_windows = bits.len() / window_size;
-        Windowed(
-            (0..number_of_windows)
-                .map(|i| {
-                    let mut selector: Vec<AssignedCondition<C::Scalar>> = (0..window_size)
-                        .map(|j| bits[i * window_size + j].clone())
-                        .collect();
-                    selector.reverse();
-                    Selector(selector)
-                })
-                .collect(),
-        )
+        let last = bits.len() % window_size;
+        let num = bits.len() / window_size;
+
+        let mut windows: Vec<_> = (0..num)
+            .map(|i| {
+                let k = i * window_size;
+                Selector(bits[k..k + window_size].to_vec())
+            })
+            .collect();
+
+        if last != 0 {
+            let last_start = bits.len() - last;
+            windows.push(Selector(bits[last_start..].to_vec()));
+        }
+
+        windows.reverse();
+
+        Windowed(windows)
     }
 
     /// Constructs table for efficient multiplication algorithm
     /// The table contains precomputed point values that allow to trade
     /// additions for selections
+    /// [2]P, [3]P, ..., [2^w + 1]P
     fn make_incremental_table(
         &self,
         ctx: &mut RegionCtx<'_, C::Scalar>,
-        aux: &AssignedPoint<C::Base, C::Scalar, NUMBER_OF_LIMBS, BIT_LEN_LIMB>,
         point: &AssignedPoint<C::Base, C::Scalar, NUMBER_OF_LIMBS, BIT_LEN_LIMB>,
         window_size: usize,
     ) -> Result<Table<C::Base, C::Scalar, NUMBER_OF_LIMBS, BIT_LEN_LIMB>, Error> {
         let table_size = 1 << window_size;
-        let mut table = vec![aux.clone()];
+        let double = self.double(ctx, point)?;
+        let mut table = vec![double];
         for i in 0..(table_size - 1) {
             table.push(self.add(ctx, &table[i], point)?);
         }
@@ -96,28 +81,48 @@ impl<C: CurveAffine, const NUMBER_OF_LIMBS: usize, const BIT_LEN_LIMB: usize>
         scalar: &AssignedValue<C::Scalar>,
         window_size: usize,
     ) -> Result<AssignedPoint<C::Base, C::Scalar, NUMBER_OF_LIMBS, BIT_LEN_LIMB>, Error> {
-        assert!(window_size > 0);
-        let aux = self.get_mul_aux(window_size, 1)?;
+        assert!(window_size > 1);
+        let num_bits = C::Scalar::NUM_BITS as usize;
+        let number_of_windows = (num_bits + window_size - 1) / window_size;
+        let mut last = num_bits % window_size;
+        if last == 0 {
+            last = window_size;
+        }
+        let window_last: usize = 1 << last;
 
+        let (scalar_correction, aux) = self.get_mul_correction(window_size)?;
         let main_gate = self.main_gate();
-        let decomposed = &mut main_gate.to_bits(ctx, scalar, C::Scalar::NUM_BITS as usize)?;
+        let scalar_adjusted = &main_gate.add_constant(ctx, scalar, scalar_correction)?;
 
-        self.pad(ctx, decomposed, window_size)?;
-        let windowed = Self::window(decomposed.to_vec(), window_size);
-        let table = &self.make_incremental_table(ctx, &aux.to_add, point, window_size)?;
+        let decomposed = main_gate.to_bits(ctx, &scalar_adjusted, num_bits)?;
+        let windowed = Self::window(decomposed, window_size);
 
-        let mut acc = self.select_multi(ctx, &windowed.0[0], table)?;
-        acc = self.double_n(ctx, &acc, window_size)?;
+        let table = &self.make_incremental_table(ctx, point, window_size)?;
+        let last_table = &Table(table.0[0..window_last].to_vec());
 
-        let to_add = self.select_multi(ctx, &windowed.0[1], table)?;
-        acc = self.add(ctx, &acc, &to_add)?;
+        let mut acc = self.select_multi(ctx, &windowed.0[0], last_table)?;
+        acc = self.double_n(ctx, &acc, window_size)?;
+        let q = self.select_multi(ctx, &windowed.0[1], table)?;
+        acc = self._add_incomplete_unsafe(ctx, &acc, &q)?;
 
-        for selector in windowed.0.iter().skip(2) {
+        for i in 2..number_of_windows - 2 {
             acc = self.double_n(ctx, &acc, window_size - 1)?;
-            let to_add = self.select_multi(ctx, selector, table)?;
-            acc = self.ladder(ctx, &acc, &to_add)?;
+            let q = self.select_multi(ctx, &windowed.0[i], table)?;
+            acc = self._ladder_incomplete(ctx, &acc, &q)?;
         }
 
+        // The last two windows use the auxiliary generator
+        // acc_1 = (2^w acc_2 + aux_generator) + Q_1
+        // acc_0 = 2^w acc_1 + Q_0 - 2^w aux_generator
+        acc = self.double_n(ctx, &acc, window_size)?;
+        acc = self.add(ctx, &acc, &aux.to_add)?;
+        let q1 = self.select_multi(ctx, &windowed.0[number_of_windows - 2], table)?;
+        acc = self.add(ctx, &acc, &q1)?;
+
+        acc = self.double_n(ctx, &acc, window_size)?;
+        let q0 = self.select_multi(ctx, &windowed.0[number_of_windows - 1], table)?;
+        acc = self.add(ctx, &acc, &q0)?;
+
         self.add(ctx, &acc, &aux.to_sub)
     }
 
@@ -137,59 +142,74 @@ impl<C: CurveAffine, const NUMBER_OF_LIMBS: usize, const BIT_LEN_LIMB: usize>
         )>,
         window_size: usize,
     ) -> Result<AssignedPoint<C::Base, C::Scalar, NUMBER_OF_LIMBS, BIT_LEN_LIMB>, Error> {
-        assert!(window_size > 0);
+        assert!(window_size > 1);
         assert!(!pairs.is_empty());
-        let aux = self.get_mul_aux(window_size, pairs.len())?;
+
+        let num_bits = C::Scalar::NUM_BITS as usize;
+        let mut last = num_bits % window_size;
+        if last == 0 {
+            last = window_size;
+        }
+        let window_last: usize = 1 << last;
 
         let main_gate = self.main_gate();
 
-        let mut decomposed_scalars: Vec<Vec<AssignedCondition<C::Scalar>>> = pairs
+        let (scalar_correction, aux) = self.get_mul_correction(window_size)?;
+        let decomposed_scalars: Vec<Vec<AssignedCondition<C::Scalar>>> = pairs
             .iter()
-            .map(|(_, scalar)| main_gate.to_bits(ctx, scalar, C::Scalar::NUM_BITS as usize))
+            .map(|(_, scalar)| {
+                let scalar_adjusted = main_gate.add_constant(ctx, scalar, scalar_correction)?;
+                main_gate.to_bits(ctx, &scalar_adjusted, C::Scalar::NUM_BITS as usize)
+            })
             .collect::<Result<_, Error>>()?;
 
-        for decomposed in decomposed_scalars.iter_mut() {
-            self.pad(ctx, decomposed, window_size)?;
-        }
-
         let windowed_scalars: Vec<Windowed<C::Scalar>> = decomposed_scalars
             .iter()
             .map(|decomposed| Self::window(decomposed.to_vec(), window_size))
             .collect();
         let number_of_windows = windowed_scalars[0].0.len();
 
-        let mut binary_aux = aux.to_add.clone();
         let tables: Vec<Table<C::Base, C::Scalar, NUMBER_OF_LIMBS, BIT_LEN_LIMB>> = pairs
             .iter()
-            .enumerate()
-            .map(|(i, (point, _))| {
-                let table = self.make_incremental_table(ctx, &binary_aux, point, window_size);
-                if i != pairs.len() - 1 {
-                    binary_aux = self.double(ctx, &binary_aux)?;
-                }
-                table
-            })
+            .map(|(point, _)| self.make_incremental_table(ctx, point, window_size))
             .collect::<Result<_, Error>>()?;
 
         // preparation for the first round
         // initialize accumulator
-        let mut acc = self.select_multi(ctx, &windowed_scalars[0].0[0], &tables[0])?;
+        let last_table = &Table(tables[0].0[0..window_last].to_vec());
+        let mut acc = self.select_multi(ctx, &windowed_scalars[0].0[0], last_table)?;
         // add first contributions other point scalar
         for (table, windowed) in tables.iter().skip(1).zip(windowed_scalars.iter().skip(1)) {
+            let last_table = &Table(table.0[0..window_last].to_vec());
             let selector = &windowed.0[0];
-            let to_add = self.select_multi(ctx, selector, table)?;
-            acc = self.add(ctx, &acc, &to_add)?;
+            let q = self.select_multi(ctx, selector, last_table)?;
+            acc = self.add(ctx, &acc, &q)?;
         }
 
-        for i in 1..number_of_windows {
+        for i in 1..number_of_windows - 2 {
             acc = self.double_n(ctx, &acc, window_size)?;
             for (table, windowed) in tables.iter().zip(windowed_scalars.iter()) {
                 let selector = &windowed.0[i];
-                let to_add = self.select_multi(ctx, selector, table)?;
-                acc = self.add(ctx, &acc, &to_add)?;
+                let q = self.select_multi(ctx, selector, table)?;
+                acc = self.add(ctx, &acc, &q)?;
             }
         }
 
+        acc = self.double_n(ctx, &acc, window_size)?;
+        acc = self.add(ctx, &acc, &aux.to_add)?;
+        for (table, windowed) in tables.iter().zip(windowed_scalars.iter()) {
+            let selector = &windowed.0[number_of_windows - 2];
+            let q = self.select_multi(ctx, selector, table)?;
+            acc = self.add(ctx, &acc, &q)?;
+        }
+
+        acc = self.double_n(ctx, &acc, window_size)?;
+        for (table, windowed) in tables.iter().zip(windowed_scalars.iter()) {
+            let selector = &windowed.0[number_of_windows - 1];
+            let q = self.select_multi(ctx, selector, table)?;
+            acc = self.add(ctx, &acc, &q)?;
+        }
+
         self.add(ctx, &acc, &aux.to_sub)
     }
 }
diff --git a/ecc/src/general_ecc.rs b/ecc/src/general_ecc.rs
index 89fd5ee6..0845814e 100644
--- a/ecc/src/general_ecc.rs
+++ b/ecc/src/general_ecc.rs
@@ -1,5 +1,6 @@
-use super::{make_mul_aux, AssignedPoint, EccConfig, MulAux, Point};
+use super::{AssignedPoint, EccConfig, MulAux, Point};
 use crate::halo2;
+use crate::halo2::arithmetic::Field;
 use crate::integer::rns::{Integer, Rns};
 use crate::integer::{IntegerChip, IntegerInstructions, Range, UnassignedInteger};
 use crate::maingate;
@@ -36,8 +37,13 @@ pub struct GeneralEccChip<
     )>,
     /// Auxiliary points for optimized multiplication for each (window_size,
     /// n_pairs) pairs
-    aux_registry:
-        BTreeMap<(usize, usize), AssignedPoint<Emulated::Base, N, NUMBER_OF_LIMBS, BIT_LEN_LIMB>>,
+    aux_registry: BTreeMap<
+        usize,
+        (
+            Integer<Emulated::Scalar, N, NUMBER_OF_LIMBS, BIT_LEN_LIMB>,
+            AssignedPoint<Emulated::Base, N, NUMBER_OF_LIMBS, BIT_LEN_LIMB>,
+        ),
+    >,
 }
 
 impl<
@@ -137,23 +143,27 @@ impl<
     }
 
     /// Auxilary point for optimized multiplication algorithm
-    fn get_mul_aux(
+    fn get_mul_correction(
         &self,
         window_size: usize,
-        number_of_pairs: usize,
-    ) -> Result<MulAux<Emulated::Base, N, NUMBER_OF_LIMBS, BIT_LEN_LIMB>, Error> {
+    ) -> Result<
+        (
+            Integer<Emulated::Scalar, N, NUMBER_OF_LIMBS, BIT_LEN_LIMB>,
+            MulAux<Emulated::Base, N, NUMBER_OF_LIMBS, BIT_LEN_LIMB>,
+        ),
+        Error,
+    > {
         // Gets chips' aux generator
         let to_add = match self.aux_generator.clone() {
             Some((assigned, _)) => Ok(assigned),
             None => Err(Error::Synthesis),
         }?;
-        let to_sub = match self.aux_registry.get(&(window_size, number_of_pairs)) {
+        let (scalar_correction, to_sub) = match self.aux_registry.get(&window_size) {
             Some(aux) => Ok(aux.clone()),
             None => Err(Error::Synthesis),
         }?;
-        // to_add the equivalent of AuxInit and to_sub AuxFin
-        // see https://hackmd.io/ncuKqRXzR-Cw-Au2fGzsMg?view
-        Ok(MulAux::new(to_add, to_sub))
+
+        Ok((scalar_correction, MulAux::new(to_add, to_sub)))
     }
 }
 
@@ -235,20 +245,22 @@ impl<
         Ok(())
     }
 
-    /// Assigns multiplication auxiliary point for a pair of (window_size,
-    /// n_pairs)
-    pub fn assign_aux(
+    /// Assigns scalar correction and multiplication auxiliary point for window_size
+    pub fn assign_correction(
         &mut self,
         ctx: &mut RegionCtx<'_, N>,
         window_size: usize,
-        number_of_pairs: usize,
     ) -> Result<(), Error> {
-        match self.aux_generator {
-            Some((_, point)) => {
-                let aux = point.map(|point| make_mul_aux(point, window_size, number_of_pairs));
-                let aux = self.assign_point(ctx, aux)?;
+        let scalar_correction = self.correct_scalar(window_size);
+
+        match &self.aux_generator {
+            Some((point, _)) => {
+                // compute correction point -2^w aux
+                let mut point_correction = self.double_n(ctx, point, window_size)?;
+                point_correction = self.neg(ctx, &point_correction)?;
+
                 self.aux_registry
-                    .insert((window_size, number_of_pairs), aux);
+                    .insert(window_size, (scalar_correction, point_correction));
                 Ok(())
             }
             // aux generator is not assigned yet
@@ -256,6 +268,28 @@ impl<
         }
     }
 
+    /// Scalar correction added before mul: -2(1 + 2^w + ... + 2^{w(n-1)}), n = number_of_windows
+    fn correct_scalar(
+        &mut self,
+        window_size: usize,
+    ) -> Integer<Emulated::Scalar, N, NUMBER_OF_LIMBS, BIT_LEN_LIMB> {
+        let window: usize = 1 << window_size;
+        let window_scalar = Emulated::Scalar::from(window as u64);
+
+        let num_bits = Emulated::Scalar::NUM_BITS as usize;
+        let number_of_windows = (num_bits + window_size - 1) / window_size;
+
+        let mut correction = Emulated::Scalar::ONE;
+        let mut power = window_scalar;
+        for _ in 0..number_of_windows - 1 {
+            correction += power;
+            power *= window_scalar;
+        }
+        correction += correction;
+
+        Integer::from_fe(-correction, self.rns_scalar())
+    }
+
     /// Constraints to ensure `AssignedPoint` is on curve
     pub fn assert_is_on_curve(
         &self,
@@ -415,13 +449,15 @@ mod tests {
         MainGate, MainGateConfig, RangeChip, RangeConfig, RangeInstructions, RegionCtx,
     };
     use paste::paste;
-    use rand_core::OsRng;
+    use rand_chacha::ChaCha20Rng;
+    use rand_core::{OsRng, SeedableRng};
 
     use crate::curves::bn256::{Fr as BnScalar, G1Affine as Bn256};
     use crate::curves::pasta::{
         EpAffine as Pallas, EqAffine as Vesta, Fp as PastaFp, Fq as PastaFq,
     };
     use crate::curves::secp256k1::Secp256k1Affine as Secp256k1;
+    use crate::maingate::DimensionMeasurement;
 
     const NUMBER_OF_LIMBS: usize = 4;
     const BIT_LEN_LIMB: usize = 68;
@@ -794,8 +830,7 @@ mod tests {
                     let offset = 0;
                     let ctx = &mut RegionCtx::new(region, offset);
                     ecc_chip.assign_aux_generator(ctx, Value::known(self.aux_generator))?;
-                    ecc_chip.assign_aux(ctx, self.window_size, 1)?;
-                    ecc_chip.get_mul_aux(self.window_size, 1)?;
+                    ecc_chip.assign_correction(ctx, self.window_size)?;
                     Ok(())
                 },
             )?;
@@ -808,8 +843,11 @@ mod tests {
                     let offset = 0;
                     let ctx = &mut RegionCtx::new(region, offset);
 
-                    let base = C::Curve::random(OsRng);
-                    let s = C::Scalar::random(OsRng);
+                    // let mut rng = ChaCha20Rng::seed_from_u64(80);
+                    let mut rng = OsRng;
+
+                    let base = C::Curve::random(&mut rng);
+                    let s = C::Scalar::random(&mut rng);
                     let result = base * s;
 
                     let s = Integer::from_fe(s, ecc_chip.rns_scalar());
@@ -842,8 +880,10 @@ mod tests {
             const NUMBER_OF_LIMBS: usize,
             const BIT_LEN_LIMB: usize,
         >() {
-            for window_size in 1..5 {
-                let aux_generator = C::Curve::random(OsRng).to_affine();
+            for window_size in 2..5 {
+                //   let mut rng = ChaCha20Rng::seed_from_u64(42);
+                let mut rng = OsRng;
+                let aux_generator = C::Curve::random(&mut rng).to_affine();
 
                 let circuit = TestEccMul::<C, N, NUMBER_OF_LIMBS, BIT_LEN_LIMB> {
                     aux_generator,
@@ -852,6 +892,11 @@ mod tests {
                 };
                 let instance = vec![vec![]];
                 mock_prover_verify(&circuit, instance);
+                let dimension = DimensionMeasurement::measure(&circuit).unwrap();
+                println!(
+                    "window_size = {:?}, dimention: {:?}",
+                    window_size, dimension
+                );
             }
         }
 
@@ -921,8 +966,7 @@ mod tests {
                     let offset = 0;
                     let ctx = &mut RegionCtx::new(region, offset);
                     ecc_chip.assign_aux_generator(ctx, Value::known(self.aux_generator))?;
-                    ecc_chip.assign_aux(ctx, self.window_size, self.number_of_pairs)?;
-                    ecc_chip.get_mul_aux(self.window_size, self.number_of_pairs)?;
+                    ecc_chip.assign_correction(ctx, self.window_size)?;
                     Ok(())
                 },
             )?;
@@ -976,7 +1020,7 @@ mod tests {
                 #[test]
                 fn [<test_general_ecc_mul_batch_circuit_ $C:lower _ $N:lower>]() {
                     for number_of_pairs in 5..7 {
-                        for window_size in 1..3 {
+                        for window_size in 2..4 {
                             let aux_generator = <$C as PrimeCurveAffine>::Curve::random(OsRng).to_affine();
 
                             let circuit = TestEccBatchMul::<$C, $N, $NUMBER_OF_LIMBS, $BIT_LEN_LIMB> {
diff --git a/ecc/src/general_ecc/mul.rs b/ecc/src/general_ecc/mul.rs
index 7b5af700..e573cf99 100644
--- a/ecc/src/general_ecc/mul.rs
+++ b/ecc/src/general_ecc/mul.rs
@@ -14,58 +14,43 @@ impl<
         const BIT_LEN_LIMB: usize,
     > GeneralEccChip<Emulated, N, NUMBER_OF_LIMBS, BIT_LEN_LIMB>
 {
-    /// Pads scalar up to the next window_size mul
-    fn pad(
-        &self,
-        region: &mut RegionCtx<'_, N>,
-        bits: &mut Vec<AssignedCondition<N>>,
-        window_size: usize,
-    ) -> Result<(), Error> {
-        assert_eq!(bits.len(), Emulated::ScalarExt::NUM_BITS as usize);
-
-        // TODO: This is a tmp workaround. Instead of padding with zeros we can use a
-        // shorter ending window.
-        let padding_offset = (window_size - (bits.len() % window_size)) % window_size;
-        let zeros: Vec<AssignedCondition<N>> = (0..padding_offset)
-            .map(|_| self.main_gate().assign_constant(region, N::ZERO))
-            .collect::<Result<_, Error>>()?;
-        bits.extend(zeros);
-        bits.reverse();
-
-        Ok(())
-    }
-
     /// Splits the bit representation of a scalar into windows
     fn window(bits: Vec<AssignedCondition<N>>, window_size: usize) -> Windowed<N> {
-        assert_eq!(bits.len() % window_size, 0);
-        let number_of_windows = bits.len() / window_size;
-        Windowed(
-            (0..number_of_windows)
-                .map(|i| {
-                    let mut selector: Vec<AssignedCondition<N>> = (0..window_size)
-                        .map(|j| bits[i * window_size + j].clone())
-                        .collect();
-                    selector.reverse();
-                    Selector(selector)
-                })
-                .collect(),
-        )
+        let last = bits.len() % window_size;
+        let num = bits.len() / window_size;
+
+        let mut windows: Vec<_> = (0..num)
+            .map(|i| {
+                let k = i * window_size;
+                Selector(bits[k..k + window_size].to_vec())
+            })
+            .collect();
+
+        if last != 0 {
+            let last_start = bits.len() - last;
+            windows.push(Selector(bits[last_start..].to_vec()));
+        }
+
+        windows.reverse();
+
+        Windowed(windows)
     }
 
     /// Constructs table for efficient multiplication algorithm
     /// The table contains precomputed point values that allow to trade
     /// additions for selections
+    /// [2]P, [3]P, ..., [2^w + 1]P
     fn make_incremental_table(
         &self,
-        region: &mut RegionCtx<'_, N>,
-        aux: &AssignedPoint<Emulated::Base, N, NUMBER_OF_LIMBS, BIT_LEN_LIMB>,
+        ctx: &mut RegionCtx<'_, N>,
         point: &AssignedPoint<Emulated::Base, N, NUMBER_OF_LIMBS, BIT_LEN_LIMB>,
         window_size: usize,
     ) -> Result<Table<Emulated::Base, N, NUMBER_OF_LIMBS, BIT_LEN_LIMB>, Error> {
         let table_size = 1 << window_size;
-        let mut table = vec![aux.clone()];
+        let double = self.double(ctx, point)?;
+        let mut table = vec![double];
         for i in 0..(table_size - 1) {
-            table.push(self.add(region, &table[i], point)?);
+            table.push(self.add(ctx, &table[i], point)?);
         }
         Ok(Table(table))
     }
@@ -96,33 +81,55 @@ impl<
     /// Performed with the sliding-window algorithm
     pub fn mul(
         &self,
-        region: &mut RegionCtx<'_, N>,
+        ctx: &mut RegionCtx<'_, N>,
         point: &AssignedPoint<Emulated::Base, N, NUMBER_OF_LIMBS, BIT_LEN_LIMB>,
         scalar: &AssignedInteger<Emulated::Scalar, N, NUMBER_OF_LIMBS, BIT_LEN_LIMB>,
         window_size: usize,
     ) -> Result<AssignedPoint<Emulated::Base, N, NUMBER_OF_LIMBS, BIT_LEN_LIMB>, Error> {
-        assert!(window_size > 0);
-        let aux = self.get_mul_aux(window_size, 1)?;
+        assert!(window_size > 1);
+        let num_bits = Emulated::Scalar::NUM_BITS as usize;
+        let number_of_windows = (num_bits + window_size - 1) / window_size;
+        let mut last = num_bits % window_size;
+        if last == 0 {
+            last = window_size;
+        }
+        let window_last: usize = 1 << last;
 
         let scalar_chip = self.scalar_field_chip();
-        let decomposed = &mut scalar_chip.decompose(region, scalar)?;
-        self.pad(region, decomposed, window_size)?;
-        let windowed = Self::window(decomposed.to_vec(), window_size);
-        let table = &self.make_incremental_table(region, &aux.to_add, point, window_size)?;
 
-        let mut acc = self.select_multi(region, &windowed.0[0], table)?;
-        acc = self.double_n(region, &acc, window_size)?;
+        let (scalar_correction, aux) = self.get_mul_correction(window_size)?;
+        let scalar_adjusted = &scalar_chip.add_constant(ctx, scalar, &scalar_correction)?;
+        let scalar_reduced = &scalar_chip.reduce(ctx, scalar_adjusted)?;
+        let decomposed = scalar_chip.decompose(ctx, scalar_reduced)?;
+        let windowed = Self::window(decomposed, window_size);
+
+        let table = &self.make_incremental_table(ctx, point, window_size)?;
+        let last_table = &Table(table.0[0..window_last].to_vec());
+        let mut acc = self.select_multi(ctx, &windowed.0[0], last_table)?;
 
-        let to_add = self.select_multi(region, &windowed.0[1], table)?;
-        acc = self.add(region, &acc, &to_add)?;
+        acc = self.double_n(ctx, &acc, window_size)?;
+        let q = self.select_multi(ctx, &windowed.0[1], table)?;
+        acc = self._add_incomplete_unsafe(ctx, &acc, &q)?;
 
-        for selector in windowed.0.iter().skip(2) {
-            acc = self.double_n(region, &acc, window_size - 1)?;
-            let to_add = self.select_multi(region, selector, table)?;
-            acc = self.ladder(region, &acc, &to_add)?;
+        for i in 2..number_of_windows - 2 {
+            acc = self.double_n(ctx, &acc, window_size - 1)?;
+            let q = self.select_multi(ctx, &windowed.0[i], table)?;
+            acc = self._ladder_incomplete(ctx, &acc, &q)?;
         }
 
-        self.add(region, &acc, &aux.to_sub)
+        // The last two rows use the auxiliary generator
+        // acc_1 = (2^w acc_2 + aux_generator) + Q_1
+        // acc_0 = 2^w acc_1 + Q_0 - 2^w aux_generator
+        acc = self.double_n(ctx, &acc, window_size)?;
+        acc = self.add(ctx, &acc, &aux.to_add)?;
+        let q1 = self.select_multi(ctx, &windowed.0[number_of_windows - 2], table)?;
+        acc = self.add(ctx, &acc, &q1)?;
+
+        acc = self.double_n(ctx, &acc, window_size)?;
+        let q0 = self.select_multi(ctx, &windowed.0[number_of_windows - 1], table)?;
+        acc = self.add(ctx, &acc, &q0)?;
+
+        self.add(ctx, &acc, &aux.to_sub)
     }
 
     /// Computes multi-product
@@ -134,68 +141,82 @@ impl<
     #[allow(clippy::type_complexity)]
     pub fn mul_batch_1d_horizontal(
         &self,
-        region: &mut RegionCtx<'_, N>,
+        ctx: &mut RegionCtx<'_, N>,
         pairs: Vec<(
             AssignedPoint<Emulated::Base, N, NUMBER_OF_LIMBS, BIT_LEN_LIMB>,
             AssignedInteger<Emulated::Scalar, N, NUMBER_OF_LIMBS, BIT_LEN_LIMB>,
         )>,
         window_size: usize,
     ) -> Result<AssignedPoint<Emulated::Base, N, NUMBER_OF_LIMBS, BIT_LEN_LIMB>, Error> {
-        assert!(window_size > 0);
+        assert!(window_size > 1);
         assert!(!pairs.is_empty());
-        let aux = self.get_mul_aux(window_size, pairs.len())?;
+
+        let num_bits = Emulated::Scalar::NUM_BITS as usize;
+        let mut last = num_bits % window_size;
+        if last == 0 {
+            last = window_size;
+        }
+        let window_last: usize = 1 << last;
 
         let scalar_chip = self.scalar_field_chip();
+        let (scalar_correction, aux) = self.get_mul_correction(window_size)?;
+
         // 1. Decompose scalars in bits
-        let mut decomposed_scalars: Vec<Vec<AssignedCondition<N>>> = pairs
+        let decomposed_scalars: Vec<Vec<AssignedCondition<N>>> = pairs
             .iter()
-            .map(|(_, scalar)| scalar_chip.decompose(region, scalar))
+            .map(|(_, scalar)| {
+                let scalar_adjusted = &scalar_chip.add_constant(ctx, scalar, &scalar_correction)?;
+                let scalar_reduced = &scalar_chip.reduce(ctx, scalar_adjusted)?;
+                scalar_chip.decompose(ctx, scalar_reduced)
+            })
             .collect::<Result<_, Error>>()?;
 
-        // 2. Pad scalars bit representations
-        for decomposed in decomposed_scalars.iter_mut() {
-            self.pad(region, decomposed, window_size)?;
-        }
-
-        // 3. Split scalar bits into windows
+        // 2. Split scalar bits into windows
         let windowed_scalars: Vec<Windowed<N>> = decomposed_scalars
             .into_iter()
             .map(|decomposed| Self::window(decomposed, window_size))
             .collect();
         let number_of_windows = windowed_scalars[0].0.len();
 
-        let mut binary_aux = aux.to_add.clone();
         let tables: Vec<Table<Emulated::Base, N, NUMBER_OF_LIMBS, BIT_LEN_LIMB>> = pairs
             .iter()
-            .enumerate()
-            .map(|(i, (point, _))| {
-                let table = self.make_incremental_table(region, &binary_aux, point, window_size);
-                if i != pairs.len() - 1 {
-                    binary_aux = self.double(region, &binary_aux)?;
-                }
-                table
-            })
+            .map(|(point, _)| self.make_incremental_table(ctx, point, window_size))
             .collect::<Result<_, Error>>()?;
 
-        // preparation for the first round
-        // initialize accumulator
-        let mut acc = self.select_multi(region, &windowed_scalars[0].0[0], &tables[0])?;
+        let last_table = &Table(tables[0].0[0..window_last].to_vec());
+        let mut acc = self.select_multi(ctx, &windowed_scalars[0].0[0], last_table)?;
         // add first contributions other point scalar
         for (table, windowed) in tables.iter().skip(1).zip(windowed_scalars.iter().skip(1)) {
+            let last_table = &Table(table.0[0..window_last].to_vec());
             let selector = &windowed.0[0];
-            let to_add = self.select_multi(region, selector, table)?;
-            acc = self.add(region, &acc, &to_add)?;
+            let q = self.select_multi(ctx, selector, last_table)?;
+            acc = self.add(ctx, &acc, &q)?;
         }
 
-        for i in 1..number_of_windows {
-            acc = self.double_n(region, &acc, window_size)?;
+        for i in 1..number_of_windows - 2 {
+            acc = self.double_n(ctx, &acc, window_size)?;
             for (table, windowed) in tables.iter().zip(windowed_scalars.iter()) {
                 let selector = &windowed.0[i];
-                let to_add = self.select_multi(region, selector, table)?;
-                acc = self.add(region, &acc, &to_add)?;
+                let q = self.select_multi(ctx, selector, table)?;
+                acc = self.add(ctx, &acc, &q)?;
             }
         }
 
-        self.add(region, &acc, &aux.to_sub)
+        acc = self.double_n(ctx, &acc, window_size)?;
+        acc = self.add(ctx, &acc, &aux.to_add)?;
+        for (table, windowed) in tables.iter().zip(windowed_scalars.iter()) {
+            let selector = &windowed.0[number_of_windows - 2];
+            let q = self.select_multi(ctx, selector, table)?;
+            acc = self.add(ctx, &acc, &q)?;
+        }
+
+        acc = self.double_n(ctx, &acc, window_size)?;
+        for (table, windowed) in tables.iter().zip(windowed_scalars.iter()) {
+            let selector = &windowed.0[number_of_windows - 1];
+            let q = self.select_multi(ctx, selector, table)?;
+            acc = self.add(ctx, &acc, &q)?;
+        }
+
+        self.add(ctx, &acc, &aux.to_sub)
     }
 }
diff --git a/ecdsa/src/ecdsa.rs b/ecdsa/src/ecdsa.rs
index eac14d62..d7b7891f 100644
--- a/ecdsa/src/ecdsa.rs
+++ b/ecdsa/src/ecdsa.rs
@@ -248,7 +248,7 @@ mod tests {
                     let ctx = &mut RegionCtx::new(region, offset);
 
                     ecc_chip.assign_aux_generator(ctx, Value::known(self.aux_generator))?;
-                    ecc_chip.assign_aux(ctx, self.window_size, 2)?;
+                    ecc_chip.assign_correction(ctx, self.window_size)?;
                     Ok(())
                 },
             )?;

From c75195be1f6499d6e7f71bbff15419acbbdb0079 Mon Sep 17 00:00:00 2001
From: Jia Liu <58184672+kitounliu@users.noreply.github.com>
Date: Tue, 22 Aug 2023 11:39:21 +0100
Subject: [PATCH 3/3] Create README.md

---
 ecc/README.md | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 ecc/README.md

diff --git a/ecc/README.md b/ecc/README.md
new file mode 100644
index 00000000..0a2498c3
--- /dev/null
+++ b/ecc/README.md
@@ -0,0 +1,45 @@
+### Windowed scalar mul using auxiliary generator
+
+$$
+\begin{align}
+0 &~~~~~~~ 1: ~~~~~~~~~~ [2]P ~~ [3]P ~~ \cdots~\cdots ~~~ [2^w +1]P   \\ 
+1 &~~~~~~ 2^w: ~~~~~~~~~ [2]P ~~ [3]P ~~ \cdots~\cdots ~~~ [2^w +1]P  \\
+2  &~~~~ (2^w)^2: ~~~~~~ [2]P ~~ [3]P ~~ \cdots~\cdots ~~~ [2^w +1]P  \\
+\cdots & \cdots \\ 
+n-3 & ~~~~ (2^w)^{n-3}: ~~ [2]P ~~ [3]P ~~ \cdots~\cdots ~~~ [2^w +1]P \\ 
+n-2 & ~~~~ (2^w)^{n-2}: ~~ [2]P ~~ [3]P ~~ \cdots~\cdots ~~~ [2^w +1]P  \\ 
+n-1  & ~~~~ (2^w)^{n-1}: ~~ [2]P ~~ [3]P ~~ \cdots ~~~ [2^\ell +1]P \\
+\end{align}
+$$
+
+where  window_size $w>1$ and scalar_size $= w(n-1) + \ell$ with $1\leq \ell \leq w$.
+
+The scalar $k\in F_r$ can be adjusted upfront to $k' = k - 2\sum_{0\leq j\leq n-1} 2^{wj} \mod r$ to avoid computing the
+correction point $[2\sum_{0\leq j\leq n-1} 2^{wj}]P$. This works for both base_field_chip and general_ecc_chip.
+
+The accumulation $acc_i$ is computed from the bottom up: 
+
+$$
+\begin{align}
+acc_{n-1} & = Q_{n-1} \\ 
+acc_{n-2} & = 2^w acc_{n-1} + Q_{n-2} \\ 
+acc_i & = 2^w acc_{i+1} + Q_i \\
+& = 2(2^{w-1} acc_{i+1}) + Q_i \text{ for } i = n-3,...,2
+\end{align}
+$$
+
+The scalar in $acc_{n-1},\dots, acc_2$ increases monotonically, and $acc_{n-3},\dots,acc_2$ can be computed using `_ladder_incomplete`.
+The last two steps $acc_{1}, acc_0$ might overflow (when $\ell = 1$ and $w = 2$)
+and need to use the auxiliary generator and addition with assertions to ensure the x-coordinates are not the same:
+
+$$
+\begin{align}
+acc_1 & = (2^{w} acc_2 + aux) + Q_1\\ 
+acc_0 & = (2^w acc_1 + Q_0) - 2^w aux
+\end{align}
+$$
+
+### mul_batch_1d_horizontal
+This algorithm uses addition with assertions in all steps and the auxiliary generator in the last two steps. It is only suitable for computing
+$e_1 P_1 + e_2 P_2 + \cdots + e_n P_n$, where $P_1, \dots, P_n$ are randomly chosen, 
+i.e., their discrete logarithms are unknown. The algorithm is not suitable for computing things like $eP + sP$.