From df27dfa0eae8a14d69e5006334bb06d84ba050b7 Mon Sep 17 00:00:00 2001
From: Mikhail Zabaluev <mikhail.zabaluev@gmail.com>
Date: Fri, 22 Mar 2024 16:50:48 +0200
Subject: [PATCH 1/4] Optimize integer pow by removing exit branch

The branch at the end of the `pow` implementations is redundant
with multiplication code already present in the loop. By rotating
the exit check into the loop body, this branch can be largely removed,
reducing code size and improving instruction cache locality.
---
 library/core/src/num/int_macros.rs  | 61 ++++++++++++---------------
 library/core/src/num/uint_macros.rs | 64 ++++++++++++-----------------
 2 files changed, 52 insertions(+), 73 deletions(-)

diff --git a/library/core/src/num/int_macros.rs b/library/core/src/num/int_macros.rs
index d40e02352a1d0..6ed0eb07e48d8 100644
--- a/library/core/src/num/int_macros.rs
+++ b/library/core/src/num/int_macros.rs
@@ -1495,18 +1495,17 @@ macro_rules! int_impl {
             let mut base = self;
             let mut acc: Self = 1;
 
-            while exp > 1 {
+            loop {
                 if (exp & 1) == 1 {
                     acc = try_opt!(acc.checked_mul(base));
+                    // since exp!=0, finally the exp must be 1.
+                    if exp == 1 {
+                        return Some(acc);
+                    }
                 }
                 exp /= 2;
                 base = try_opt!(base.checked_mul(base));
             }
-            // since exp!=0, finally the exp must be 1.
-            // Deal with the final bit of the exponent separately, since
-            // squaring the base afterwards is not necessary and may cause a
-            // needless overflow.
-            acc.checked_mul(base)
         }
 
         /// Strict exponentiation. Computes `self.pow(exp)`, panicking if
@@ -1546,18 +1545,17 @@ macro_rules! int_impl {
             let mut base = self;
             let mut acc: Self = 1;
 
-            while exp > 1 {
+            loop {
                 if (exp & 1) == 1 {
                     acc = acc.strict_mul(base);
+                    // since exp!=0, finally the exp must be 1.
+                    if exp == 1 {
+                        return acc;
+                    }
                 }
                 exp /= 2;
                 base = base.strict_mul(base);
             }
-            // since exp!=0, finally the exp must be 1.
-            // Deal with the final bit of the exponent separately, since
-            // squaring the base afterwards is not necessary and may cause a
-            // needless overflow.
-            acc.strict_mul(base)
         }
 
         /// Returns the square root of the number, rounded down.
@@ -2181,19 +2179,17 @@ macro_rules! int_impl {
             let mut base = self;
             let mut acc: Self = 1;
 
-            while exp > 1 {
+            loop {
                 if (exp & 1) == 1 {
                     acc = acc.wrapping_mul(base);
+                    // since exp!=0, finally the exp must be 1.
+                    if exp == 1 {
+                        return acc;
+                    }
                 }
                 exp /= 2;
                 base = base.wrapping_mul(base);
             }
-
-            // since exp!=0, finally the exp must be 1.
-            // Deal with the final bit of the exponent separately, since
-            // squaring the base afterwards is not necessary and may cause a
-            // needless overflow.
-            acc.wrapping_mul(base)
         }
 
         /// Calculates `self` + `rhs`
@@ -2687,9 +2683,14 @@ macro_rules! int_impl {
             // Scratch space for storing results of overflowing_mul.
             let mut r;
 
-            while exp > 1 {
+            loop {
                 if (exp & 1) == 1 {
                     r = acc.overflowing_mul(base);
+                    // since exp!=0, finally the exp must be 1.
+                    if exp == 1 {
+                        r.1 |= overflown;
+                        return r;
+                    }
                     acc = r.0;
                     overflown |= r.1;
                 }
@@ -2698,14 +2699,6 @@ macro_rules! int_impl {
                 base = r.0;
                 overflown |= r.1;
             }
-
-            // since exp!=0, finally the exp must be 1.
-            // Deal with the final bit of the exponent separately, since
-            // squaring the base afterwards is not necessary and may cause a
-            // needless overflow.
-            r = acc.overflowing_mul(base);
-            r.1 |= overflown;
-            r
         }
 
         /// Raises self to the power of `exp`, using exponentiation by squaring.
@@ -2732,19 +2725,17 @@ macro_rules! int_impl {
             let mut base = self;
             let mut acc = 1;
 
-            while exp > 1 {
+            loop {
                 if (exp & 1) == 1 {
                     acc = acc * base;
+                    // since exp!=0, finally the exp must be 1.
+                    if exp == 1 {
+                        return acc;
+                    }
                 }
                 exp /= 2;
                 base = base * base;
             }
-
-            // since exp!=0, finally the exp must be 1.
-            // Deal with the final bit of the exponent separately, since
-            // squaring the base afterwards is not necessary and may cause a
-            // needless overflow.
-            acc * base
         }
 
         /// Returns the square root of the number, rounded down.
diff --git a/library/core/src/num/uint_macros.rs b/library/core/src/num/uint_macros.rs
index ad72c29758bd7..b272a9d901bf4 100644
--- a/library/core/src/num/uint_macros.rs
+++ b/library/core/src/num/uint_macros.rs
@@ -1534,20 +1534,17 @@ macro_rules! uint_impl {
             let mut base = self;
             let mut acc: Self = 1;
 
-            while exp > 1 {
+            loop {
                 if (exp & 1) == 1 {
                     acc = try_opt!(acc.checked_mul(base));
+                    // since exp!=0, finally the exp must be 1.
+                    if exp == 1 {
+                        return Some(acc);
+                    }
                 }
                 exp /= 2;
                 base = try_opt!(base.checked_mul(base));
             }
-
-            // since exp!=0, finally the exp must be 1.
-            // Deal with the final bit of the exponent separately, since
-            // squaring the base afterwards is not necessary and may cause a
-            // needless overflow.
-
-            acc.checked_mul(base)
         }
 
         /// Strict exponentiation. Computes `self.pow(exp)`, panicking if
@@ -1587,18 +1584,17 @@ macro_rules! uint_impl {
             let mut base = self;
             let mut acc: Self = 1;
 
-            while exp > 1 {
+            loop {
                 if (exp & 1) == 1 {
                     acc = acc.strict_mul(base);
+                    // since exp!=0, finally the exp must be 1.
+                    if exp == 1 {
+                        return acc;
+                    }
                 }
                 exp /= 2;
                 base = base.strict_mul(base);
             }
-            // since exp!=0, finally the exp must be 1.
-            // Deal with the final bit of the exponent separately, since
-            // squaring the base afterwards is not necessary and may cause a
-            // needless overflow.
-            acc.strict_mul(base)
         }
 
         /// Saturating integer addition. Computes `self + rhs`, saturating at
@@ -2059,19 +2055,17 @@ macro_rules! uint_impl {
             let mut base = self;
             let mut acc: Self = 1;
 
-            while exp > 1 {
+            loop {
                 if (exp & 1) == 1 {
                     acc = acc.wrapping_mul(base);
+                    // since exp!=0, finally the exp must be 1.
+                    if exp == 1 {
+                        return acc;
+                    }
                 }
                 exp /= 2;
                 base = base.wrapping_mul(base);
             }
-
-            // since exp!=0, finally the exp must be 1.
-            // Deal with the final bit of the exponent separately, since
-            // squaring the base afterwards is not necessary and may cause a
-            // needless overflow.
-            acc.wrapping_mul(base)
         }
 
         /// Calculates `self` + `rhs`
@@ -2516,9 +2510,14 @@ macro_rules! uint_impl {
             // Scratch space for storing results of overflowing_mul.
             let mut r;
 
-            while exp > 1 {
+            loop {
                 if (exp & 1) == 1 {
                     r = acc.overflowing_mul(base);
+                    // since exp!=0, finally the exp must be 1.
+                    if exp == 1 {
+                        r.1 |= overflown;
+                        return r;
+                    }
                     acc = r.0;
                     overflown |= r.1;
                 }
@@ -2527,15 +2526,6 @@ macro_rules! uint_impl {
                 base = r.0;
                 overflown |= r.1;
             }
-
-            // since exp!=0, finally the exp must be 1.
-            // Deal with the final bit of the exponent separately, since
-            // squaring the base afterwards is not necessary and may cause a
-            // needless overflow.
-            r = acc.overflowing_mul(base);
-            r.1 |= overflown;
-
-            r
         }
 
         /// Raises self to the power of `exp`, using exponentiation by squaring.
@@ -2560,19 +2550,17 @@ macro_rules! uint_impl {
             let mut base = self;
             let mut acc = 1;
 
-            while exp > 1 {
+            loop {
                 if (exp & 1) == 1 {
                     acc = acc * base;
+                    // since exp!=0, finally the exp must be 1.
+                    if exp == 1 {
+                        return acc;
+                    }
                 }
                 exp /= 2;
                 base = base * base;
             }
-
-            // since exp!=0, finally the exp must be 1.
-            // Deal with the final bit of the exponent separately, since
-            // squaring the base afterwards is not necessary and may cause a
-            // needless overflow.
-            acc * base
         }
 
         /// Returns the square root of the number, rounded down.

From 1faa1018c7ddd2f505904b0f71255afa5ae1bbe7 Mon Sep 17 00:00:00 2001
From: Mikhail Zabaluev <mikhail.zabaluev@gmail.com>
Date: Fri, 12 Jul 2024 00:54:26 +0300
Subject: [PATCH 2/4] Explicitly unroll integer pow for small exponents

The newly optimized loop has introduced a regression in the case
when pow is called with a small constant exponent. LLVM is no longer
able to unroll the loop and the generated code is larger and slower
than what's expected in tests.

Match and handle small exponent values separately by branching out
to an explicit multiplication sequence for that exponent.
Powers larger than 6 need more than three multiplications, so these
cases are less likely to benefit from this optimization. Such
constant exponents are also less likely to be used in practice.
For uses with a non-constant exponent, this might also provide
a performance benefit if the exponent is small and does not vary
between successive calls, so the same match arm tends to be taken as
a predicted branch.
---
 library/core/src/num/int_macros.rs  | 62 ++++++++++++++++++++++++++---
 library/core/src/num/uint_macros.rs | 62 ++++++++++++++++++++++++++---
 2 files changed, 112 insertions(+), 12 deletions(-)

diff --git a/library/core/src/num/int_macros.rs b/library/core/src/num/int_macros.rs
index 6ed0eb07e48d8..d1bb5a6ef478a 100644
--- a/library/core/src/num/int_macros.rs
+++ b/library/core/src/num/int_macros.rs
@@ -2173,10 +2173,35 @@ macro_rules! int_impl {
                       without modifying the original"]
         #[inline]
         pub const fn wrapping_pow(self, mut exp: u32) -> Self {
-            if exp == 0 {
-                return 1;
-            }
             let mut base = self;
+
+            // Unroll multiplications for small exponent values.
+            // This gives the optimizer a way to efficiently inline call sites
+            // for the most common use cases with constant exponents.
+            // Currently, LLVM is unable to unroll the loop below.
+            match exp {
+                0 => return 1,
+                1 => return base,
+                2 => return base.wrapping_mul(base),
+                3 => {
+                    let squared = base.wrapping_mul(base);
+                    return squared.wrapping_mul(base);
+                }
+                4 => {
+                    let squared = base.wrapping_mul(base);
+                    return squared.wrapping_mul(squared);
+                }
+                5 => {
+                    let squared = base.wrapping_mul(base);
+                    return squared.wrapping_mul(squared).wrapping_mul(base);
+                }
+                6 => {
+                    let cubed = base.wrapping_mul(base).wrapping_mul(base);
+                    return cubed.wrapping_mul(cubed);
+                }
+                _ => {}
+            }
+
             let mut acc: Self = 1;
 
             loop {
@@ -2719,10 +2744,35 @@ macro_rules! int_impl {
         #[inline]
         #[rustc_inherit_overflow_checks]
         pub const fn pow(self, mut exp: u32) -> Self {
-            if exp == 0 {
-                return 1;
-            }
             let mut base = self;
+
+            // Unroll multiplications for small exponent values.
+            // This gives the optimizer a way to efficiently inline call sites
+            // for the most common use cases with constant exponents.
+            // Currently, LLVM is unable to unroll the loop below.
+            match exp {
+                0 => return 1,
+                1 => return base,
+                2 => return base * base,
+                3 => {
+                    let squared = base * base;
+                    return squared * base;
+                }
+                4 => {
+                    let squared = base * base;
+                    return squared * squared;
+                }
+                5 => {
+                    let squared = base * base;
+                    return squared * squared * base;
+                }
+                6 => {
+                    let cubed = base * base * base;
+                    return cubed * cubed;
+                }
+                _ => {}
+            }
+
             let mut acc = 1;
 
             loop {
diff --git a/library/core/src/num/uint_macros.rs b/library/core/src/num/uint_macros.rs
index b272a9d901bf4..6e5d37f8163ea 100644
--- a/library/core/src/num/uint_macros.rs
+++ b/library/core/src/num/uint_macros.rs
@@ -2049,10 +2049,35 @@ macro_rules! uint_impl {
                       without modifying the original"]
         #[inline]
         pub const fn wrapping_pow(self, mut exp: u32) -> Self {
-            if exp == 0 {
-                return 1;
-            }
             let mut base = self;
+
+            // Unroll multiplications for small exponent values.
+            // This gives the optimizer a way to efficiently inline call sites
+            // for the most common use cases with constant exponents.
+            // Currently, LLVM is unable to unroll the loop below.
+            match exp {
+                0 => return 1,
+                1 => return base,
+                2 => return base.wrapping_mul(base),
+                3 => {
+                    let squared = base.wrapping_mul(base);
+                    return squared.wrapping_mul(base);
+                }
+                4 => {
+                    let squared = base.wrapping_mul(base);
+                    return squared.wrapping_mul(squared);
+                }
+                5 => {
+                    let squared = base.wrapping_mul(base);
+                    return squared.wrapping_mul(squared).wrapping_mul(base);
+                }
+                6 => {
+                    let cubed = base.wrapping_mul(base).wrapping_mul(base);
+                    return cubed.wrapping_mul(cubed);
+                }
+                _ => {}
+            }
+
             let mut acc: Self = 1;
 
             loop {
@@ -2544,10 +2569,35 @@ macro_rules! uint_impl {
         #[inline]
         #[rustc_inherit_overflow_checks]
         pub const fn pow(self, mut exp: u32) -> Self {
-            if exp == 0 {
-                return 1;
-            }
             let mut base = self;
+
+            // Unroll multiplications for small exponent values.
+            // This gives the optimizer a way to efficiently inline call sites
+            // for the most common use cases with constant exponents.
+            // Currently, LLVM is unable to unroll the loop below.
+            match exp {
+                0 => return 1,
+                1 => return base,
+                2 => return base * base,
+                3 => {
+                    let squared = base * base;
+                    return squared * base;
+                }
+                4 => {
+                    let squared = base * base;
+                    return squared * squared;
+                }
+                5 => {
+                    let squared = base * base;
+                    return squared * squared * base;
+                }
+                6 => {
+                    let cubed = base * base * base;
+                    return cubed * cubed;
+                }
+                _ => {}
+            }
+
             let mut acc = 1;
 
             loop {

From 2f235343529c39bdab47704ec9620d6784eeeb6d Mon Sep 17 00:00:00 2001
From: Mikhail Zabaluev <mikhail.zabaluev@gmail.com>
Date: Fri, 12 Jul 2024 22:54:08 +0300
Subject: [PATCH 3/4] Use is_val_statically_known to optimize pow

In the dynamic exponent case, it is preferable not to increase code
size, so use only the loop-based implementation there.
This shows about a 4% penalty in the variable-exponent benchmarks
on x86_64.
---
 library/core/src/lib.rs             |   1 +
 library/core/src/num/int_macros.rs  | 108 ++++++++++++++++------------
 library/core/src/num/uint_macros.rs | 108 ++++++++++++++++------------
 3 files changed, 125 insertions(+), 92 deletions(-)

diff --git a/library/core/src/lib.rs b/library/core/src/lib.rs
index 0f82f01e57a71..ede95e3b2ca9b 100644
--- a/library/core/src/lib.rs
+++ b/library/core/src/lib.rs
@@ -170,6 +170,7 @@
 #![feature(internal_impls_macro)]
 #![feature(ip)]
 #![feature(is_ascii_octdigit)]
+#![feature(is_val_statically_known)]
 #![feature(isqrt)]
 #![feature(link_cfg)]
 #![feature(offset_of_enum)]
diff --git a/library/core/src/num/int_macros.rs b/library/core/src/num/int_macros.rs
index d1bb5a6ef478a..be0e6a2a03b70 100644
--- a/library/core/src/num/int_macros.rs
+++ b/library/core/src/num/int_macros.rs
@@ -2172,35 +2172,43 @@ macro_rules! int_impl {
         #[must_use = "this returns the result of the operation, \
                       without modifying the original"]
         #[inline]
+        #[rustc_allow_const_fn_unstable(is_val_statically_known)]
         pub const fn wrapping_pow(self, mut exp: u32) -> Self {
             let mut base = self;
 
-            // Unroll multiplications for small exponent values.
-            // This gives the optimizer a way to efficiently inline call sites
-            // for the most common use cases with constant exponents.
-            // Currently, LLVM is unable to unroll the loop below.
-            match exp {
-                0 => return 1,
-                1 => return base,
-                2 => return base.wrapping_mul(base),
-                3 => {
-                    let squared = base.wrapping_mul(base);
-                    return squared.wrapping_mul(base);
-                }
-                4 => {
-                    let squared = base.wrapping_mul(base);
-                    return squared.wrapping_mul(squared);
-                }
-                5 => {
-                    let squared = base.wrapping_mul(base);
-                    return squared.wrapping_mul(squared).wrapping_mul(base);
+            if intrinsics::is_val_statically_known(exp) {
+                // Unroll multiplications for small exponent values.
+                // This gives the optimizer a way to efficiently inline call sites
+                // for the most common use cases with constant exponents.
+                // Currently, LLVM is unable to unroll the loop below.
+                match exp {
+                    0 => return 1,
+                    1 => return base,
+                    2 => return base.wrapping_mul(base),
+                    3 => {
+                        let squared = base.wrapping_mul(base);
+                        return squared.wrapping_mul(base);
+                    }
+                    4 => {
+                        let squared = base.wrapping_mul(base);
+                        return squared.wrapping_mul(squared);
+                    }
+                    5 => {
+                        let squared = base.wrapping_mul(base);
+                        return squared.wrapping_mul(squared).wrapping_mul(base);
+                    }
+                    6 => {
+                        let cubed = base.wrapping_mul(base).wrapping_mul(base);
+                        return cubed.wrapping_mul(cubed);
+                    }
+                    _ => {}
                 }
-                6 => {
-                    let cubed = base.wrapping_mul(base).wrapping_mul(base);
-                    return cubed.wrapping_mul(cubed);
+            } else {
+                if exp == 0 {
+                    return 1;
                 }
-                _ => {}
             }
+            debug_assert!(exp != 0);
 
             let mut acc: Self = 1;
 
@@ -2743,35 +2751,43 @@ macro_rules! int_impl {
                       without modifying the original"]
         #[inline]
         #[rustc_inherit_overflow_checks]
+        #[rustc_allow_const_fn_unstable(is_val_statically_known)]
         pub const fn pow(self, mut exp: u32) -> Self {
             let mut base = self;
 
-            // Unroll multiplications for small exponent values.
-            // This gives the optimizer a way to efficiently inline call sites
-            // for the most common use cases with constant exponents.
-            // Currently, LLVM is unable to unroll the loop below.
-            match exp {
-                0 => return 1,
-                1 => return base,
-                2 => return base * base,
-                3 => {
-                    let squared = base * base;
-                    return squared * base;
-                }
-                4 => {
-                    let squared = base * base;
-                    return squared * squared;
-                }
-                5 => {
-                    let squared = base * base;
-                    return squared * squared * base;
+            if intrinsics::is_val_statically_known(exp) {
+                // Unroll multiplications for small exponent values.
+                // This gives the optimizer a way to efficiently inline call sites
+                // for the most common use cases with constant exponents.
+                // Currently, LLVM is unable to unroll the loop below.
+                match exp {
+                    0 => return 1,
+                    1 => return base,
+                    2 => return base * base,
+                    3 => {
+                        let squared = base * base;
+                        return squared * base;
+                    }
+                    4 => {
+                        let squared = base * base;
+                        return squared * squared;
+                    }
+                    5 => {
+                        let squared = base * base;
+                        return squared * squared * base;
+                    }
+                    6 => {
+                        let cubed = base * base * base;
+                        return cubed * cubed;
+                    }
+                    _ => {}
                 }
-                6 => {
-                    let cubed = base * base * base;
-                    return cubed * cubed;
+            } else {
+                if exp == 0 {
+                    return 1;
                 }
-                _ => {}
             }
+            debug_assert!(exp != 0);
 
             let mut acc = 1;
 
diff --git a/library/core/src/num/uint_macros.rs b/library/core/src/num/uint_macros.rs
index 6e5d37f8163ea..24352593fca3d 100644
--- a/library/core/src/num/uint_macros.rs
+++ b/library/core/src/num/uint_macros.rs
@@ -2048,35 +2048,43 @@ macro_rules! uint_impl {
         #[must_use = "this returns the result of the operation, \
                       without modifying the original"]
         #[inline]
+        #[rustc_allow_const_fn_unstable(is_val_statically_known)]
         pub const fn wrapping_pow(self, mut exp: u32) -> Self {
             let mut base = self;
 
-            // Unroll multiplications for small exponent values.
-            // This gives the optimizer a way to efficiently inline call sites
-            // for the most common use cases with constant exponents.
-            // Currently, LLVM is unable to unroll the loop below.
-            match exp {
-                0 => return 1,
-                1 => return base,
-                2 => return base.wrapping_mul(base),
-                3 => {
-                    let squared = base.wrapping_mul(base);
-                    return squared.wrapping_mul(base);
-                }
-                4 => {
-                    let squared = base.wrapping_mul(base);
-                    return squared.wrapping_mul(squared);
-                }
-                5 => {
-                    let squared = base.wrapping_mul(base);
-                    return squared.wrapping_mul(squared).wrapping_mul(base);
+            if intrinsics::is_val_statically_known(exp) {
+                // Unroll multiplications for small exponent values.
+                // This gives the optimizer a way to efficiently inline call sites
+                // for the most common use cases with constant exponents.
+                // Currently, LLVM is unable to unroll the loop below.
+                match exp {
+                    0 => return 1,
+                    1 => return base,
+                    2 => return base.wrapping_mul(base),
+                    3 => {
+                        let squared = base.wrapping_mul(base);
+                        return squared.wrapping_mul(base);
+                    }
+                    4 => {
+                        let squared = base.wrapping_mul(base);
+                        return squared.wrapping_mul(squared);
+                    }
+                    5 => {
+                        let squared = base.wrapping_mul(base);
+                        return squared.wrapping_mul(squared).wrapping_mul(base);
+                    }
+                    6 => {
+                        let cubed = base.wrapping_mul(base).wrapping_mul(base);
+                        return cubed.wrapping_mul(cubed);
+                    }
+                    _ => {}
                 }
-                6 => {
-                    let cubed = base.wrapping_mul(base).wrapping_mul(base);
-                    return cubed.wrapping_mul(cubed);
+            } else {
+                if exp == 0 {
+                    return 1;
                 }
-                _ => {}
             }
+            debug_assert!(exp != 0);
 
             let mut acc: Self = 1;
 
@@ -2568,35 +2576,43 @@ macro_rules! uint_impl {
                       without modifying the original"]
         #[inline]
         #[rustc_inherit_overflow_checks]
+        #[rustc_allow_const_fn_unstable(is_val_statically_known)]
         pub const fn pow(self, mut exp: u32) -> Self {
             let mut base = self;
 
-            // Unroll multiplications for small exponent values.
-            // This gives the optimizer a way to efficiently inline call sites
-            // for the most common use cases with constant exponents.
-            // Currently, LLVM is unable to unroll the loop below.
-            match exp {
-                0 => return 1,
-                1 => return base,
-                2 => return base * base,
-                3 => {
-                    let squared = base * base;
-                    return squared * base;
-                }
-                4 => {
-                    let squared = base * base;
-                    return squared * squared;
-                }
-                5 => {
-                    let squared = base * base;
-                    return squared * squared * base;
+            if intrinsics::is_val_statically_known(exp) {
+                // Unroll multiplications for small exponent values.
+                // This gives the optimizer a way to efficiently inline call sites
+                // for the most common use cases with constant exponents.
+                // Currently, LLVM is unable to unroll the loop below.
+                match exp {
+                    0 => return 1,
+                    1 => return base,
+                    2 => return base * base,
+                    3 => {
+                        let squared = base * base;
+                        return squared * base;
+                    }
+                    4 => {
+                        let squared = base * base;
+                        return squared * squared;
+                    }
+                    5 => {
+                        let squared = base * base;
+                        return squared * squared * base;
+                    }
+                    6 => {
+                        let cubed = base * base * base;
+                        return cubed * cubed;
+                    }
+                    _ => {}
                 }
-                6 => {
-                    let cubed = base * base * base;
-                    return cubed * cubed;
+            } else {
+                if exp == 0 {
+                    return 1;
                 }
-                _ => {}
             }
+            debug_assert!(exp != 0);
 
             let mut acc = 1;
 

From ac88b330b875e8058589b1804ac5d95fcd40905d Mon Sep 17 00:00:00 2001
From: Mikhail Zabaluev <mikhail.zabaluev@gmail.com>
Date: Tue, 13 Aug 2024 08:21:16 +0300
Subject: [PATCH 4/4] Revert to original loop for const pow exponents

Give LLVM the original, optimizable loop in the pow and wrapping_pow
functions in the case when the exponent is statically known.
---
 library/core/src/num/int_macros.rs  | 135 ++++++++++++----------------
 library/core/src/num/uint_macros.rs | 135 ++++++++++++----------------
 2 files changed, 110 insertions(+), 160 deletions(-)

diff --git a/library/core/src/num/int_macros.rs b/library/core/src/num/int_macros.rs
index be0e6a2a03b70..d8ef36f21ac63 100644
--- a/library/core/src/num/int_macros.rs
+++ b/library/core/src/num/int_macros.rs
@@ -2174,54 +2174,41 @@ macro_rules! int_impl {
         #[inline]
         #[rustc_allow_const_fn_unstable(is_val_statically_known)]
         pub const fn wrapping_pow(self, mut exp: u32) -> Self {
+            if exp == 0 {
+                return 1;
+            }
             let mut base = self;
+            let mut acc: Self = 1;
 
             if intrinsics::is_val_statically_known(exp) {
-                // Unroll multiplications for small exponent values.
-                // This gives the optimizer a way to efficiently inline call sites
-                // for the most common use cases with constant exponents.
-                // Currently, LLVM is unable to unroll the loop below.
-                match exp {
-                    0 => return 1,
-                    1 => return base,
-                    2 => return base.wrapping_mul(base),
-                    3 => {
-                        let squared = base.wrapping_mul(base);
-                        return squared.wrapping_mul(base);
-                    }
-                    4 => {
-                        let squared = base.wrapping_mul(base);
-                        return squared.wrapping_mul(squared);
+                while exp > 1 {
+                    if (exp & 1) == 1 {
+                        acc = acc.wrapping_mul(base);
                     }
-                    5 => {
-                        let squared = base.wrapping_mul(base);
-                        return squared.wrapping_mul(squared).wrapping_mul(base);
-                    }
-                    6 => {
-                        let cubed = base.wrapping_mul(base).wrapping_mul(base);
-                        return cubed.wrapping_mul(cubed);
-                    }
-                    _ => {}
+                    exp /= 2;
+                    base = base.wrapping_mul(base);
                 }
-            } else {
-                if exp == 0 {
-                    return 1;
-                }
-            }
-            debug_assert!(exp != 0);
 
-            let mut acc: Self = 1;
-
-            loop {
-                if (exp & 1) == 1 {
-                    acc = acc.wrapping_mul(base);
-                    // since exp!=0, finally the exp must be 1.
-                    if exp == 1 {
-                        return acc;
+                // since exp!=0, finally the exp must be 1.
+                // Deal with the final bit of the exponent separately, since
+                // squaring the base afterwards is not necessary.
+                acc.wrapping_mul(base)
+            } else {
+                // This is faster than the above when the exponent is not known
+                // at compile time. We can't use the same code for the constant
+                // exponent case because LLVM is currently unable to unroll
+                // this loop.
+                loop {
+                    if (exp & 1) == 1 {
+                        acc = acc.wrapping_mul(base);
+                        // since exp!=0, finally the exp must be 1.
+                        if exp == 1 {
+                            return acc;
+                        }
                     }
+                    exp /= 2;
+                    base = base.wrapping_mul(base);
                 }
-                exp /= 2;
-                base = base.wrapping_mul(base);
             }
         }
 
@@ -2753,54 +2740,42 @@ macro_rules! int_impl {
         #[rustc_inherit_overflow_checks]
         #[rustc_allow_const_fn_unstable(is_val_statically_known)]
         pub const fn pow(self, mut exp: u32) -> Self {
+            if exp == 0 {
+                return 1;
+            }
             let mut base = self;
+            let mut acc = 1;
 
             if intrinsics::is_val_statically_known(exp) {
-                // Unroll multiplications for small exponent values.
-                // This gives the optimizer a way to efficiently inline call sites
-                // for the most common use cases with constant exponents.
-                // Currently, LLVM is unable to unroll the loop below.
-                match exp {
-                    0 => return 1,
-                    1 => return base,
-                    2 => return base * base,
-                    3 => {
-                        let squared = base * base;
-                        return squared * base;
-                    }
-                    4 => {
-                        let squared = base * base;
-                        return squared * squared;
+                while exp > 1 {
+                    if (exp & 1) == 1 {
+                        acc = acc * base;
                     }
-                    5 => {
-                        let squared = base * base;
-                        return squared * squared * base;
-                    }
-                    6 => {
-                        let cubed = base * base * base;
-                        return cubed * cubed;
-                    }
-                    _ => {}
+                    exp /= 2;
+                    base = base * base;
                 }
-            } else {
-                if exp == 0 {
-                    return 1;
-                }
-            }
-            debug_assert!(exp != 0);
 
-            let mut acc = 1;
-
-            loop {
-                if (exp & 1) == 1 {
-                    acc = acc * base;
-                    // since exp!=0, finally the exp must be 1.
-                    if exp == 1 {
-                        return acc;
+                // since exp!=0, finally the exp must be 1.
+                // Deal with the final bit of the exponent separately, since
+                // squaring the base afterwards is not necessary and may cause a
+                // needless overflow.
+                acc * base
+            } else {
+                // This is faster than the above when the exponent is not known
+                // at compile time. We can't use the same code for the constant
+                // exponent case because LLVM is currently unable to unroll
+                // this loop.
+                loop {
+                    if (exp & 1) == 1 {
+                        acc = acc * base;
+                        // since exp!=0, finally the exp must be 1.
+                        if exp == 1 {
+                            return acc;
+                        }
                     }
+                    exp /= 2;
+                    base = base * base;
                 }
-                exp /= 2;
-                base = base * base;
             }
         }
 
diff --git a/library/core/src/num/uint_macros.rs b/library/core/src/num/uint_macros.rs
index 24352593fca3d..5b3ef78d39a3d 100644
--- a/library/core/src/num/uint_macros.rs
+++ b/library/core/src/num/uint_macros.rs
@@ -2050,54 +2050,41 @@ macro_rules! uint_impl {
         #[inline]
         #[rustc_allow_const_fn_unstable(is_val_statically_known)]
         pub const fn wrapping_pow(self, mut exp: u32) -> Self {
+            if exp == 0 {
+                return 1;
+            }
             let mut base = self;
+            let mut acc: Self = 1;
 
             if intrinsics::is_val_statically_known(exp) {
-                // Unroll multiplications for small exponent values.
-                // This gives the optimizer a way to efficiently inline call sites
-                // for the most common use cases with constant exponents.
-                // Currently, LLVM is unable to unroll the loop below.
-                match exp {
-                    0 => return 1,
-                    1 => return base,
-                    2 => return base.wrapping_mul(base),
-                    3 => {
-                        let squared = base.wrapping_mul(base);
-                        return squared.wrapping_mul(base);
-                    }
-                    4 => {
-                        let squared = base.wrapping_mul(base);
-                        return squared.wrapping_mul(squared);
+                while exp > 1 {
+                    if (exp & 1) == 1 {
+                        acc = acc.wrapping_mul(base);
                     }
-                    5 => {
-                        let squared = base.wrapping_mul(base);
-                        return squared.wrapping_mul(squared).wrapping_mul(base);
-                    }
-                    6 => {
-                        let cubed = base.wrapping_mul(base).wrapping_mul(base);
-                        return cubed.wrapping_mul(cubed);
-                    }
-                    _ => {}
+                    exp /= 2;
+                    base = base.wrapping_mul(base);
                 }
-            } else {
-                if exp == 0 {
-                    return 1;
-                }
-            }
-            debug_assert!(exp != 0);
 
-            let mut acc: Self = 1;
-
-            loop {
-                if (exp & 1) == 1 {
-                    acc = acc.wrapping_mul(base);
-                    // since exp!=0, finally the exp must be 1.
-                    if exp == 1 {
-                        return acc;
+                // since exp!=0, finally the exp must be 1.
+                // Deal with the final bit of the exponent separately, since
+                // squaring the base afterwards is not necessary.
+                acc.wrapping_mul(base)
+            } else {
+                // This is faster than the above when the exponent is not known
+                // at compile time. We can't use the same code for the constant
+                // exponent case because LLVM is currently unable to unroll
+                // this loop.
+                loop {
+                    if (exp & 1) == 1 {
+                        acc = acc.wrapping_mul(base);
+                        // since exp!=0, finally the exp must be 1.
+                        if exp == 1 {
+                            return acc;
+                        }
                     }
+                    exp /= 2;
+                    base = base.wrapping_mul(base);
                 }
-                exp /= 2;
-                base = base.wrapping_mul(base);
             }
         }
 
@@ -2578,54 +2565,42 @@ macro_rules! uint_impl {
         #[rustc_inherit_overflow_checks]
         #[rustc_allow_const_fn_unstable(is_val_statically_known)]
         pub const fn pow(self, mut exp: u32) -> Self {
+            if exp == 0 {
+                return 1;
+            }
             let mut base = self;
+            let mut acc = 1;
 
             if intrinsics::is_val_statically_known(exp) {
-                // Unroll multiplications for small exponent values.
-                // This gives the optimizer a way to efficiently inline call sites
-                // for the most common use cases with constant exponents.
-                // Currently, LLVM is unable to unroll the loop below.
-                match exp {
-                    0 => return 1,
-                    1 => return base,
-                    2 => return base * base,
-                    3 => {
-                        let squared = base * base;
-                        return squared * base;
-                    }
-                    4 => {
-                        let squared = base * base;
-                        return squared * squared;
+                while exp > 1 {
+                    if (exp & 1) == 1 {
+                        acc = acc * base;
                     }
-                    5 => {
-                        let squared = base * base;
-                        return squared * squared * base;
-                    }
-                    6 => {
-                        let cubed = base * base * base;
-                        return cubed * cubed;
-                    }
-                    _ => {}
+                    exp /= 2;
+                    base = base * base;
                 }
-            } else {
-                if exp == 0 {
-                    return 1;
-                }
-            }
-            debug_assert!(exp != 0);
 
-            let mut acc = 1;
-
-            loop {
-                if (exp & 1) == 1 {
-                    acc = acc * base;
-                    // since exp!=0, finally the exp must be 1.
-                    if exp == 1 {
-                        return acc;
+                // since exp!=0, finally the exp must be 1.
+                // Deal with the final bit of the exponent separately, since
+                // squaring the base afterwards is not necessary and may cause a
+                // needless overflow.
+                acc * base
+            } else {
+                // This is faster than the above when the exponent is not known
+                // at compile time. We can't use the same code for the constant
+                // exponent case because LLVM is currently unable to unroll
+                // this loop.
+                loop {
+                    if (exp & 1) == 1 {
+                        acc = acc * base;
+                        // since exp!=0, finally the exp must be 1.
+                        if exp == 1 {
+                            return acc;
+                        }
                     }
+                    exp /= 2;
+                    base = base * base;
                 }
-                exp /= 2;
-                base = base * base;
             }
         }