Add jeaiii optimizations for u128.

Uses a similar strategy to `u64`, which heavily optimizes smaller numbers (`<= u32::MAX`) without much performance impact for larger numbers (`> u64::MAX`). Closes #163.
Alexhuszagh · Dec 7, 2024 · edceaca · edceaca
1 parent 07ca225
commit edceaca
Show file tree

Hide file tree

Showing 6 changed files with 151 additions and 68 deletions.
diff --git a/lexical-util/etc/div128.py b/lexical-util/etc/div128.py
@@ -107,38 +107,60 @@ def print_pow2(radix):
     print('')
 
 
-def print_fast(radix, divisor, fast_shr, factor, factor_shr):
+def print_fast(radix, divisor, fast_shr, factor, factor_shr, suffix):
     '''Print the function for the fastest division algorithm.'''
 
     fast = 1 << (64 + fast_shr)
     print('#[inline(always)]')
-    print(f'fn u128_divrem_{radix}(n: u128) -> (u128, u64) {{')
+    print(f'fn u128_divrem_{radix}{suffix}(n: u128) -> (u128, u64) {{')
     print(f'    fast_u128_divrem(n, {divisor}, {fast}, {fast_shr}, {factor}, {factor_shr})')
     print('}')
     print('')
 
 
-def print_moderate(radix, divisor, factor, factor_shr):
+def print_moderate(radix, divisor, factor, factor_shr, suffix):
     '''Print the function for the moderate division algorithm.'''
 
     print('#[inline(always)]')
-    print(f'const fn u128_divrem_{radix}(n: u128) -> (u128, u64) {{')
+    print(f'const fn u128_divrem_{radix}{suffix}(n: u128) -> (u128, u64) {{')
     print(f'    moderate_u128_divrem(n, {divisor}, {factor}, {factor_shr})')
     print('}')
     print('')
 
 
-def print_slow(radix, divisor):
+def print_slow(radix, divisor, suffix):
     '''Print the function for the slow division algorithm.'''
 
     ctlz = 66 - len(bin(divisor))
     print('#[inline(always)]')
-    print(f'fn u128_divrem_{radix}(n: u128) -> (u128, u64) {{')
+    print(f'fn u128_divrem_{radix}{suffix}(n: u128) -> (u128, u64) {{')
     print(f'    slow_u128_divrem(n, {divisor}, {ctlz})')
     print('}')
     print('')
 
 
+def print_radix(radix, digits=None):
+    '''Print the divisor constant for a single radix.'''
+
+    # Not a power of two, must be slower.
+    if digits is None:
+        digits = find_power(radix)
+        suffix = ''
+    else:
+        suffix = f'_{radix}pow{digits}'
+    divisor = radix**digits
+    fast_shr = fast_shift(divisor)
+    factor, factor_shr, _ = choose_multiplier(divisor, 128)
+
+    if factor >= 2**128:
+        # Cannot fit in a u128, must revert to the slow algorithm.
+        print_slow(radix, divisor, suffix)
+    elif fast_shr != 0:
+        print_fast(radix, divisor, fast_shr, factor, factor_shr, suffix)
+    else:
+        print_moderate(radix, divisor, factor, factor_shr, suffix)
+
+
 def divisor_constants():
     '''Generate all the divisor constants for all radices.'''
 
@@ -148,20 +170,10 @@ def divisor_constants():
         if is_pow2(radix):
             print_pow2(radix)
             continue
+        print_radix(radix)
 
-        # Not a power of two, must be slower.
-        digits = find_power(radix)
-        divisor = radix**digits
-        fast_shr = fast_shift(divisor)
-        factor, factor_shr, _ = choose_multiplier(divisor, 128)
-
-        if factor >= 2**128:
-            # Cannot fit in a u128, must revert to the slow algorithm.
-            print_slow(radix, divisor)
-        elif fast_shr != 0:
-            print_fast(radix, divisor, fast_shr, factor, factor_shr)
-        else:
-            print_moderate(radix, divisor, factor, factor_shr)
+    # print a special case for 1e10
+    print_radix(10, 10)
 
 # PYTHON LOGIC
 # This is the approach, in Python, for how to do this.

diff --git a/lexical-write-integer/src/algorithm.rs b/lexical-write-integer/src/algorithm.rs
@@ -12,6 +12,7 @@
 //! recent benchmark data.
 
 #![cfg(not(feature = "compact"))]
+#![cfg(feature = "power-of-two")]
 
 use lexical_util::assert::debug_assert_radix;
 use lexical_util::digit::digit_to_char;
@@ -22,6 +23,25 @@ use lexical_util::step::u64_step;
 
 use crate::digit_count::DigitCount;
 
+/// Index a buffer and get a mutable reference, without bounds checking.
+/// The `($x:ident[$i:expr] = $y:ident[$j:expr])` is not used with `compact`.
+/// The newer version of the lint is `unused_macro_rules`, but this isn't
+/// supported until nightly-2022-05-12.
+///
+/// By default, writers tend to be safe, due to Miri, Valgrind,
+/// and other tests and careful validation against a wide range
+/// of randomized input. Parsers are much trickier to validate.
+#[allow(unknown_lints, unused_macro_rules)]
+macro_rules! i {
+    ($x:ident[$i:expr]) => {
+        *$x.get_unchecked_mut($i)
+    };
+
+    ($x:ident[$i:expr] = $y:ident[$j:expr]) => {
+        *$x.get_unchecked_mut($i) = *$y.get_unchecked($j)
+    };
+}
+
 /// Write 2 digits to buffer.
 ///
 /// # Safety
@@ -34,9 +54,9 @@ macro_rules! write_digits {
         debug_assert!($bytes.len() >= 2);
         debug_assert!($r + 1 < $table.len());
         $index -= 1;
-        unsafe { index_unchecked_mut!($bytes[$index] = $table[$r + 1]) };
+        unsafe { i!($bytes[$index] = $table[$r + 1]) };
         $index -= 1;
-        unsafe { index_unchecked_mut!($bytes[$index] = $table[$r]) };
+        unsafe { i!($bytes[$index] = $table[$r]) };
     }};
 }
 
@@ -53,7 +73,7 @@ macro_rules! write_digit {
         debug_assert!($bytes.len() >= 1);
         debug_assert!($r < 36);
         $index -= 1;
-        unsafe { index_unchecked_mut!($bytes[$index]) = digit_to_char($r) };
+        unsafe { i!($bytes[$index]) = digit_to_char($r) };
     }};
 }
 
@@ -182,7 +202,7 @@ unsafe fn write_step_digits<T: UnsignedInteger>(
     // Write the remaining 0 bytes.
     let end = start.saturating_sub(step);
     // SAFETY: this is always safe since `end < index && index < start`.
-    let zeros = unsafe { &mut index_unchecked_mut!(buffer[end..index]) };
+    let zeros = unsafe { &mut i!(buffer[end..index]) };
     zeros.fill(b'0');
 
     end

diff --git a/lexical-write-integer/src/decimal.rs b/lexical-write-integer/src/decimal.rs
@@ -13,13 +13,10 @@
 #![cfg(not(feature = "compact"))]
 #![doc(hidden)]
 
-use lexical_util::format::{RADIX, RADIX_SHIFT, STANDARD};
 use lexical_util::num::UnsignedInteger;
 
-use crate::algorithm::algorithm_u128;
 use crate::digit_count::fast_log2;
 use crate::jeaiii;
-use crate::table::DIGIT_TO_BASE10_SQUARED;
 
 /// Calculate the fast, integral log10 of a value.
 ///
@@ -269,17 +266,7 @@ decimal_impl! {
     u16; from_u16
     u32; from_u32
     u64; from_u64
-}
-
-impl Decimal for u128 {
-    #[inline(always)]
-    fn decimal(self, buffer: &mut [u8]) -> usize {
-        algorithm_u128::<{ STANDARD }, { RADIX }, { RADIX_SHIFT }>(
-            self,
-            &DIGIT_TO_BASE10_SQUARED,
-            buffer,
-        )
-    }
+    u128; from_u128
 }
 
 impl Decimal for usize {

diff --git a/lexical-write-integer/src/index.rs b/lexical-write-integer/src/index.rs
diff --git a/lexical-write-integer/src/jeaiii.rs b/lexical-write-integer/src/jeaiii.rs
@@ -25,6 +25,7 @@
 #![doc(hidden)]
 
 use lexical_util::digit::digit_to_char_const;
+use lexical_util::div128::fast_u128_divrem;
 
 use crate::table::DIGIT_TO_BASE10_SQUARED;
 
@@ -38,6 +39,28 @@ fn next2(prod: &mut u64) -> u32 {
     (*prod >> 32) as u32
 }
 
+/// Quickly calculate `n / 1e10` and `n % 1e10`.
+#[inline(always)]
+fn u128_divrem_10_10pow10(n: u128) -> (u128, u64) {
+    fast_u128_divrem(
+        n,
+        10000000000,
+        18889465931478580854784,
+        10,
+        73075081866545145910184241635814150983,
+        31,
+    )
+}
+
+/// Quickly calculate `n / 1e10` and `n % 1e10`.
+///
+/// We use this for quickly breaking our integer into
+/// chunks of 10 digits for fast u128 formatting.
+#[inline(always)]
+fn div128_rem_1e10(n: u128) -> (u128, u64) {
+    u128_divrem_10_10pow10(n)
+}
+
 // Index a value from a buffer without bounds checking.
 macro_rules! i {
     ($array:ident[$index:expr]) => {
@@ -288,7 +311,7 @@ pub fn from_u64(n: u64, buffer: &mut [u8]) -> usize {
         } else {
             write_digits!(@1 buffer, n)
         }
-    } else if n < 100_0000_0000 {
+    } else if n < FACTOR {
         // 5 to 10 digits
         if n >= 10_0000_0000 {
             // NOTE: We DO NOT know if this is >= u32::MAX,
@@ -304,7 +327,7 @@ pub fn from_u64(n: u64, buffer: &mut [u8]) -> usize {
         }
     } else {
         // 11-20 digits, can do in 2 steps
-        // NOTE: `hi` has to be in [0, 2^31], while `lo` is in `[0, 10^11)`
+        // NOTE: `hi` has to be in `[0, 2^31)`, while `lo` is in `[0, 10^10)`
         // So, we can use our `from_u64_small` for hi. For our `lo`, we always
         // need to write 10 digits. However, the `jeaiii` algorithm is too
         // slow, so we use a modified variant of our 2-digit unfolding for
@@ -317,5 +340,73 @@ pub fn from_u64(n: u64, buffer: &mut [u8]) -> usize {
     }
 }
 
-// TODO: Implement for:
-//  from_u128
+/// Optimized jeaiii algorithm for u128.
+#[inline(always)]
+#[allow(clippy::collapsible_else_if)] // reason = "branching is fine-tuned for performance"
+pub fn from_u128(n: u128, buffer: &mut [u8]) -> usize {
+    // NOTE: Like before, this optimizes better for large and small
+    // values if there's a flat comparison with larger values first.
+    let buffer = &mut buffer[..39];
+    if n < 1_0000 {
+        // 1 to 4 digits
+        if n >= 100 {
+            write_digits!(@3-4 buffer, n)
+        } else if n >= 10 {
+            write_digits!(@2 buffer, n)
+        } else {
+            write_digits!(@1 buffer, n)
+        }
+    } else if n < 100_0000_0000 {
+        // 5 to 10 digits
+        if n >= 10_0000_0000 {
+            // NOTE: We DO NOT know if this is >= u32::MAX,
+            // and the `write_digits!(@10)` is only accurate
+            // if `n <= 5.5e9`, which we cannot guarantee.
+            write_digits!(@10u64 buffer, n)
+        } else if n >= 1_0000_0000 {
+            write_digits!(@9 buffer, n)
+        } else if n >= 100_0000 {
+            write_digits!(@7-8 buffer, n)
+        } else {
+            write_digits!(@5-6 buffer, n)
+        }
+    } else {
+        // 11-39 digits, can do in 2-4 steps
+
+        // NOTE: We need to use fast division (`div128_rem_1e10`) for this, which
+        // we can do in 2-4 steps (`2^128 - 1 == ~3.4e38`). So, we first pick the
+        // number of steps to avoid shifting into place, then write the `hi` digits
+        // followed by 1-3 `lo` chunks of 10 digits each. `hi` always fits in a
+        // `u64`, and with 4 steps it is in the range `[0, 2^29)`. Our `jeaiii`
+        // algorithm is too slow, so we use a modified variant of our 2-digit
+        // unfolding for exactly 10 digits to write each `lo` chunk. We can
+        // optimize this in 2x 4 digits and 1x 2 digits.
+        if n >= 100_0000_0000_0000_0000_0000_0000_0000 {
+            // 4 steps
+            let (mid, d) = div128_rem_1e10(n);
+            let (mid, c) = div128_rem_1e10(mid);
+            let (hi, b) = div128_rem_1e10(mid);
+            // NOTE: `2^128 == ~3.4e38` and we divided by `1e30`, so `a`
+            // must be in the range `[0, 2^29)`.
+            let a = hi as u32;
+            let mut offset = from_u32(a, buffer);
+            offset = write_digits!(@10alex buffer, b, offset);
+            offset = write_digits!(@10alex buffer, c, offset);
+            write_digits!(@10alex buffer, d, offset)
+        } else if n >= 1_0000_0000_0000_0000_0000 {
+            // 3 steps
+            let (mid, lo) = div128_rem_1e10(n);
+            let (hi, mid) = div128_rem_1e10(mid);
+            let hi = hi as u64;
+            let mut offset = from_u64(hi, buffer);
+            offset = write_digits!(@10alex buffer, mid, offset);
+            write_digits!(@10alex buffer, lo, offset)
+        } else {
+            // 2 steps
+            let (hi, lo) = div128_rem_1e10(n);
+            let hi = hi as u64;
+            let offset = from_u64(hi, buffer);
+            write_digits!(@10alex buffer, lo, offset)
+        }
+    }
+}
diff --git a/lexical-write-integer/src/lib.rs b/lexical-write-integer/src/lib.rs
@@ -146,9 +146,6 @@
     clippy::semicolon_inside_block,
 )]
 
-#[macro_use]
-mod index;
-
 pub mod algorithm;
 pub mod compact;
 pub mod decimal;