Skip to content

Commit

Permalink
Add jeaiii optimizations for u128.
Browse files Browse the repository at this point in the history
Uses a similar strategy to `u64`, which heavily optimizes smaller numbers (`<= u32::MAX`) without much performance impact for larger numbers (`> u64::MAX`).

Closes #163.
  • Loading branch information
Alexhuszagh committed Dec 7, 2024
1 parent 07ca225 commit edceaca
Show file tree
Hide file tree
Showing 6 changed files with 151 additions and 68 deletions.
50 changes: 31 additions & 19 deletions lexical-util/etc/div128.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,38 +107,60 @@ def print_pow2(radix):
print('')


def print_fast(radix, divisor, fast_shr, factor, factor_shr):
def print_fast(radix, divisor, fast_shr, factor, factor_shr, suffix):
'''Print the function for the fastest division algorithm.'''

fast = 1 << (64 + fast_shr)
print('#[inline(always)]')
print(f'fn u128_divrem_{radix}(n: u128) -> (u128, u64) {{')
print(f'fn u128_divrem_{radix}{suffix}(n: u128) -> (u128, u64) {{')
print(f' fast_u128_divrem(n, {divisor}, {fast}, {fast_shr}, {factor}, {factor_shr})')
print('}')
print('')


def print_moderate(radix, divisor, factor, factor_shr):
def print_moderate(radix, divisor, factor, factor_shr, suffix):
'''Print the function for the moderate division algorithm.'''

print('#[inline(always)]')
print(f'const fn u128_divrem_{radix}(n: u128) -> (u128, u64) {{')
print(f'const fn u128_divrem_{radix}{suffix}(n: u128) -> (u128, u64) {{')
print(f' moderate_u128_divrem(n, {divisor}, {factor}, {factor_shr})')
print('}')
print('')


def print_slow(radix, divisor):
def print_slow(radix, divisor, suffix):
'''Print the function for the slow division algorithm.'''

ctlz = 66 - len(bin(divisor))
print('#[inline(always)]')
print(f'fn u128_divrem_{radix}(n: u128) -> (u128, u64) {{')
print(f'fn u128_divrem_{radix}{suffix}(n: u128) -> (u128, u64) {{')
print(f' slow_u128_divrem(n, {divisor}, {ctlz})')
print('}')
print('')


def print_radix(radix, digits=None):
    '''
    Print the `u128_divrem_*` function for a single radix.

    When `digits` is omitted, the exponent is taken from
    `find_power(radix)` and the generated function name has no suffix.
    When `digits` is given, the divisor is `radix**digits` and the
    function name gains a `_{radix}pow{digits}` suffix.
    '''

    explicit = digits is not None
    suffix = f'_{radix}pow{digits}' if explicit else ''
    if not explicit:
        digits = find_power(radix)

    divisor = radix**digits
    fast_shr = fast_shift(divisor)
    factor, factor_shr, _ = choose_multiplier(divisor, 128)

    # The multiplier has to fit in a u128; when it does not, only the
    # slow algorithm can be emitted.
    if factor >= 2**128:
        print_slow(radix, divisor, suffix)
        return

    # A non-zero fast shift selects the fast algorithm, everything else
    # falls through to the moderate one.
    if fast_shr != 0:
        print_fast(radix, divisor, fast_shr, factor, factor_shr, suffix)
        return
    print_moderate(radix, divisor, factor, factor_shr, suffix)


def divisor_constants():
'''Generate all the divisor constants for all radices.'''

Expand All @@ -148,20 +170,10 @@ def divisor_constants():
if is_pow2(radix):
print_pow2(radix)
continue
print_radix(radix)

# Not a power of two, must be slower.
digits = find_power(radix)
divisor = radix**digits
fast_shr = fast_shift(divisor)
factor, factor_shr, _ = choose_multiplier(divisor, 128)

if factor >= 2**128:
# Cannot fit in a u128, must revert to the slow algorithm.
print_slow(radix, divisor)
elif fast_shr != 0:
print_fast(radix, divisor, fast_shr, factor, factor_shr)
else:
print_moderate(radix, divisor, factor, factor_shr)
# print a special case for 1e10
print_radix(10, 10)

# PYTHON LOGIC
# This is the approach, in Python, for how to do this.
Expand Down
28 changes: 24 additions & 4 deletions lexical-write-integer/src/algorithm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
//! recent benchmark data.
#![cfg(not(feature = "compact"))]
#![cfg(feature = "power-of-two")]

use lexical_util::assert::debug_assert_radix;
use lexical_util::digit::digit_to_char;
Expand All @@ -22,6 +23,25 @@ use lexical_util::step::u64_step;

use crate::digit_count::DigitCount;

/// Unchecked indexing into a buffer.
///
/// `i!(buf[idx])` expands to a dereferenced `get_unchecked_mut` place,
/// while `i!(dst[a] = src[b])` assigns the element read from `src` via
/// `get_unchecked` to that place in `dst`. Neither form does any bounds
/// checking, so every use has to sit inside an `unsafe` block where the
/// indices are known to be in range.
///
/// Not every rule is necessarily used in every build, hence the allow
/// for `unused_macro_rules`. That lint is the newer spelling and is not
/// supported until nightly-2022-05-12, which is why `unknown_lints` is
/// allowed alongside it.
///
/// Unchecked access is tolerated here because writers tend to be safe:
/// they are validated with Miri, Valgrind, and other tests against a
/// wide range of randomized input. Parsers are much trickier to validate.
#[allow(unknown_lints, unused_macro_rules)]
macro_rules! i {
    ($buf:ident[$at:expr]) => {
        *$buf.get_unchecked_mut($at)
    };

    ($dst:ident[$at:expr] = $src:ident[$from:expr]) => {
        *$dst.get_unchecked_mut($at) = *$src.get_unchecked($from)
    };
}

/// Write 2 digits to buffer.
///
/// # Safety
Expand All @@ -34,9 +54,9 @@ macro_rules! write_digits {
debug_assert!($bytes.len() >= 2);
debug_assert!($r + 1 < $table.len());
$index -= 1;
unsafe { index_unchecked_mut!($bytes[$index] = $table[$r + 1]) };
unsafe { i!($bytes[$index] = $table[$r + 1]) };
$index -= 1;
unsafe { index_unchecked_mut!($bytes[$index] = $table[$r]) };
unsafe { i!($bytes[$index] = $table[$r]) };
}};
}

Expand All @@ -53,7 +73,7 @@ macro_rules! write_digit {
debug_assert!($bytes.len() >= 1);
debug_assert!($r < 36);
$index -= 1;
unsafe { index_unchecked_mut!($bytes[$index]) = digit_to_char($r) };
unsafe { i!($bytes[$index]) = digit_to_char($r) };
}};
}

Expand Down Expand Up @@ -182,7 +202,7 @@ unsafe fn write_step_digits<T: UnsignedInteger>(
// Write the remaining 0 bytes.
let end = start.saturating_sub(step);
// SAFETY: this is always safe since `end < index && index < start`.
let zeros = unsafe { &mut index_unchecked_mut!(buffer[end..index]) };
let zeros = unsafe { &mut i!(buffer[end..index]) };
zeros.fill(b'0');

end
Expand Down
15 changes: 1 addition & 14 deletions lexical-write-integer/src/decimal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,10 @@
#![cfg(not(feature = "compact"))]
#![doc(hidden)]

use lexical_util::format::{RADIX, RADIX_SHIFT, STANDARD};
use lexical_util::num::UnsignedInteger;

use crate::algorithm::algorithm_u128;
use crate::digit_count::fast_log2;
use crate::jeaiii;
use crate::table::DIGIT_TO_BASE10_SQUARED;

/// Calculate the fast, integral log10 of a value.
///
Expand Down Expand Up @@ -269,17 +266,7 @@ decimal_impl! {
u16; from_u16
u32; from_u32
u64; from_u64
}

impl Decimal for u128 {
#[inline(always)]
fn decimal(self, buffer: &mut [u8]) -> usize {
algorithm_u128::<{ STANDARD }, { RADIX }, { RADIX_SHIFT }>(
self,
&DIGIT_TO_BASE10_SQUARED,
buffer,
)
}
u128; from_u128
}

impl Decimal for usize {
Expand Down
24 changes: 0 additions & 24 deletions lexical-write-integer/src/index.rs

This file was deleted.

99 changes: 95 additions & 4 deletions lexical-write-integer/src/jeaiii.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#![doc(hidden)]

use lexical_util::digit::digit_to_char_const;
use lexical_util::div128::fast_u128_divrem;

use crate::table::DIGIT_TO_BASE10_SQUARED;

Expand All @@ -38,6 +39,28 @@ fn next2(prod: &mut u64) -> u32 {
(*prod >> 32) as u32
}

/// Quickly calculate `n / 1e10` and `n % 1e10`.
///
/// Returns the `(quotient, remainder)` pair produced by
/// `fast_u128_divrem`. The argument layout mirrors what
/// `print_radix(10, 10)` emits in `lexical-util/etc/div128.py`, so the
/// constants should be regenerated there rather than edited by hand.
#[inline(always)]
fn u128_divrem_10_10pow10(n: u128) -> (u128, u64) {
fast_u128_divrem(
n,
// divisor: `10^10`
10000000000,
// fast: `1 << (64 + fast_shr)`, which is `2^74`
18889465931478580854784,
// fast_shr: the generator's `fast_shift(divisor)`
10,
// factor: the generator's `choose_multiplier(divisor, 128)` multiplier
73075081866545145910184241635814150983,
// factor_shr: the shift paired with `factor`
31,
)
}

/// Quickly calculate `n / 1e10` and `n % 1e10`.
///
/// We use this for quickly breaking our integer into
/// chunks of 10 digits for fast u128 formatting.
///
/// Returns `(n / 1e10, n % 1e10)`; the remainder is always below
/// `1e10`, so it holds at most 10 decimal digits.
#[inline(always)]
fn div128_rem_1e10(n: u128) -> (u128, u64) {
// Thin, readable alias over the generated division routine.
u128_divrem_10_10pow10(n)
}

// Index a value from a buffer without bounds checking.
macro_rules! i {
($array:ident[$index:expr]) => {
Expand Down Expand Up @@ -288,7 +311,7 @@ pub fn from_u64(n: u64, buffer: &mut [u8]) -> usize {
} else {
write_digits!(@1 buffer, n)
}
} else if n < 100_0000_0000 {
} else if n < FACTOR {
// 5 to 10 digits
if n >= 10_0000_0000 {
// NOTE: We DO NOT know if this is >= u32::MAX,
Expand All @@ -304,7 +327,7 @@ pub fn from_u64(n: u64, buffer: &mut [u8]) -> usize {
}
} else {
// 11-20 digits, can do in 2 steps
// NOTE: `hi` has to be in [0, 2^31], while `lo` is in `[0, 10^11)`
// NOTE: `hi` has to be in `[0, 2^31)`, while `lo` is in `[0, 10^10)`
// So, we can use our `from_u64_small` for hi. For our `lo`, we always
// need to write 10 digits. However, the `jeaiii` algorithm is too
// slow, so we use a modified variant of our 2-digit unfolding for
Expand All @@ -317,5 +340,73 @@ pub fn from_u64(n: u64, buffer: &mut [u8]) -> usize {
}
}

// TODO: Implement for:
// from_u128
/// Optimized jeaiii algorithm for u128.
///
/// Values with at most 10 digits are written directly. Anything larger
/// is split into 10-digit chunks with `div128_rem_1e10`: the leading
/// chunk is written by `from_u32`/`from_u64`, and every trailing chunk
/// is written as exactly 10 digits.
///
/// Returns the value produced by the final writer call, presumably the
/// number of bytes written to `buffer` (TODO confirm against
/// `write_digits!`).
///
/// # Panics
///
/// Panics if `buffer` is shorter than 39 bytes, since it is re-sliced to
/// `buffer[..39]` up front.
#[inline(always)]
#[allow(clippy::collapsible_else_if)] // reason = "branching is fine-tuned for performance"
pub fn from_u128(n: u128, buffer: &mut [u8]) -> usize {
// NOTE: Like before, this optimizes better for large and small
// values if there's a flat comparison with larger values first.
// A `u128` has at most 39 decimal digits (`2^128 - 1 == ~3.4e38`).
let buffer = &mut buffer[..39];
if n < 1_0000 {
// 1 to 4 digits
if n >= 100 {
write_digits!(@3-4 buffer, n)
} else if n >= 10 {
write_digits!(@2 buffer, n)
} else {
write_digits!(@1 buffer, n)
}
} else if n < 100_0000_0000 {
// 5 to 10 digits
if n >= 10_0000_0000 {
// NOTE: We DO NOT know if this is >= u32::MAX,
// and the `write_digits!(@10)` is only accurate
// if `n <= 5.5e9`, which we cannot guarantee.
write_digits!(@10u64 buffer, n)
} else if n >= 1_0000_0000 {
write_digits!(@9 buffer, n)
} else if n >= 100_0000 {
write_digits!(@7-8 buffer, n)
} else {
write_digits!(@5-6 buffer, n)
}
} else {
// 11-39 digits, can do in 2-4 steps

// NOTE: We need to use fast division (`u128_divrem`) for this, which
// we can do in 2-4 steps (`2^128 - 1 == ~3.4e38`). So, we need to
// calculate the number of digits to avoid shifting into place, then
// once we do, we can write the `hi` digits followed by 1-3 `lo`
// chunks of exactly 10 digits each. In the 4-step case `hi` must be
// in the range `[0, 2^29)`. Our `jeaiii` algorithm is too slow for
// the 10-digit chunks, so we use a modified variant of our 2-digit
// unfolding for exactly 10 digits to write our values. We can
// optimize this in 2x 4 digits and 1x 2 digits.
if n >= 100_0000_0000_0000_0000_0000_0000_0000 {
// 4 steps: `n >= 1e30`, so 31-39 digits
let (mid, d) = div128_rem_1e10(n);
let (mid, c) = div128_rem_1e10(mid);
let (hi, b) = div128_rem_1e10(mid);
// NOTE: `2^128 == ~3.4e38`, so `a == n / 1e30` must be in the
// range `[0, 2^29)` and the cast to `u32` is lossless.
let a = hi as u32;
let mut offset = from_u32(a, buffer);
offset = write_digits!(@10alex buffer, b, offset);
offset = write_digits!(@10alex buffer, c, offset);
write_digits!(@10alex buffer, d, offset)
} else if n >= 1_0000_0000_0000_0000_0000 {
// 3 steps: `1e20 <= n < 1e30`, so 21-30 digits
let (mid, lo) = div128_rem_1e10(n);
let (hi, mid) = div128_rem_1e10(mid);
// `hi == n / 1e20 < 1e10`, so it fits in a `u64`.
let hi = hi as u64;
let mut offset = from_u64(hi, buffer);
offset = write_digits!(@10alex buffer, mid, offset);
write_digits!(@10alex buffer, lo, offset)
} else {
// 2 steps: `1e10 <= n < 1e20`, so 11-20 digits
let (hi, lo) = div128_rem_1e10(n);
// `hi == n / 1e10 < 1e10`, so it fits in a `u64`.
let hi = hi as u64;
let offset = from_u64(hi, buffer);
write_digits!(@10alex buffer, lo, offset)
}
}
}
3 changes: 0 additions & 3 deletions lexical-write-integer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,9 +146,6 @@
clippy::semicolon_inside_block,
)]

#[macro_use]
mod index;

pub mod algorithm;
pub mod compact;
pub mod decimal;
Expand Down

0 comments on commit edceaca

Please sign in to comment.