diff --git a/CHANGELOG b/CHANGELOG index 0262203e..0dda1435 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Fixes a correctness regression + ## [1.0.0] 2024-09-14 ### Added diff --git a/lexical-parse-float/src/binary.rs b/lexical-parse-float/src/binary.rs index f3fcf5ef..1c92e7c1 100644 --- a/lexical-parse-float/src/binary.rs +++ b/lexical-parse-float/src/binary.rs @@ -10,7 +10,7 @@ use lexical_parse_integer::algorithm; use lexical_util::digit::char_to_valid_digit_const; use lexical_util::format::NumberFormat; -use lexical_util::iterator::{AsBytes, BytesIter}; +use lexical_util::iterator::{AsBytes, DigitsIter}; use lexical_util::step::u64_step; use crate::float::{ExtendedFloat80, RawFloat}; @@ -104,7 +104,7 @@ pub fn parse_u64_digits<'a, Iter, const FORMAT: u128>( overflowed: &mut bool, zero: &mut bool, ) where - Iter: BytesIter<'a>, + Iter: DigitsIter<'a>, { let format = NumberFormat::<{ FORMAT }> {}; let radix = format.radix() as u64; diff --git a/lexical-parse-float/src/lib.rs b/lexical-parse-float/src/lib.rs index 035bbddf..7f0e8049 100644 --- a/lexical-parse-float/src/lib.rs +++ b/lexical-parse-float/src/lib.rs @@ -126,6 +126,9 @@ mod table_lemire; mod table_radix; mod table_small; +#[macro_use(parse_sign)] +extern crate lexical_parse_integer; + // Re-exports #[cfg(feature = "f16")] pub use lexical_util::bf16::bf16; diff --git a/lexical-parse-float/src/parse.rs b/lexical-parse-float/src/parse.rs index b5cb92bb..21dc309d 100644 --- a/lexical-parse-float/src/parse.rs +++ b/lexical-parse-float/src/parse.rs @@ -13,13 +13,12 @@ use lexical_parse_integer::algorithm; #[cfg(feature = "f16")] use lexical_util::bf16::bf16; -use lexical_util::buffer::Buffer; use lexical_util::digit::{char_to_digit_const, char_to_valid_digit_const}; use lexical_util::error::Error; #[cfg(feature = "f16")] use lexical_util::f16::f16; use lexical_util::format::NumberFormat; -use lexical_util::iterator::{AsBytes, Bytes, BytesIter}; +use lexical_util::iterator::{AsBytes, Bytes, DigitsIter, Iter}; use lexical_util::result::Result; use lexical_util::step::u64_step; @@ -166,91 +165,37 @@ parse_float_as_f32! { bf16 f16 } // code is only like 30 lines. /// Parse the sign from the leading digits. -/// -/// This routine does the following: -/// -/// 1. Parses the sign digit. -/// 2. Handles if positive signs before integers are not allowed. -/// 3. Handles negative signs if the type is unsigned. -/// 4. Handles if the sign is required, but missing. -/// 5. Handles if the iterator is empty, before or after parsing the sign. -/// 6. Handles if the iterator has invalid, leading zeros. -/// -/// Returns if the value is negative, or any values detected when -/// validating the input. #[cfg_attr(not(feature = "compact"), inline(always))] pub fn parse_mantissa_sign(byte: &mut Bytes<'_, FORMAT>) -> Result { let format = NumberFormat::<{ FORMAT }> {}; - - // NOTE: Using `read_if` with a predicate compiles badly and is very slow. - // Also, it's better to do the step_unchecked inside rather than get the step - // count and do `step_by_unchecked`. The compiler knows what to do better. - match byte.integer_iter().peek() { - Some(&b'+') if !format.no_positive_mantissa_sign() => { - // SAFETY: Safe, we peeked 1 byte. - unsafe { byte.step_unchecked() }; - Ok(false) - }, - Some(&b'+') if format.no_positive_mantissa_sign() => { - Err(Error::InvalidPositiveSign(byte.cursor())) - }, - Some(&b'-') => { - // SAFETY: Safe, we peeked 1 byte. - unsafe { byte.step_unchecked() }; - Ok(true) - }, - Some(_) if format.required_mantissa_sign() => Err(Error::MissingSign(byte.cursor())), - _ if format.required_mantissa_sign() => Err(Error::MissingSign(byte.cursor())), - _ => Ok(false), - } + parse_sign!( + byte, + true, + format.no_positive_mantissa_sign(), + format.required_mantissa_sign(), + InvalidPositiveSign, + MissingSign + ) } /// Parse the sign from the leading digits. -/// -/// This routine does the following: -/// -/// 1. Parses the sign digit. -/// 2. Handles if positive signs before integers are not allowed. -/// 3. Handles negative signs if the type is unsigned. -/// 4. Handles if the sign is required, but missing. -/// 5. Handles if the iterator is empty, before or after parsing the sign. -/// 6. Handles if the iterator has invalid, leading zeros. -/// -/// Returns if the value is negative, or any values detected when -/// validating the input. #[cfg_attr(not(feature = "compact"), inline(always))] pub fn parse_exponent_sign(byte: &mut Bytes<'_, FORMAT>) -> Result { let format = NumberFormat::<{ FORMAT }> {}; - - // NOTE: Using `read_if` with a predicate compiles badly and is very slow. - // Also, it's better to do the step_unchecked inside rather than get the step - // count and do `step_by_unchecked`. The compiler knows what to do better. - match byte.integer_iter().peek() { - Some(&b'+') if !format.no_positive_exponent_sign() => { - // SAFETY: Safe, we peeked 1 byte. - unsafe { byte.step_unchecked() }; - Ok(false) - }, - Some(&b'+') if format.no_positive_exponent_sign() => { - Err(Error::InvalidPositiveExponentSign(byte.cursor())) - }, - Some(&b'-') => { - // SAFETY: Safe, we peeked 1 byte. - unsafe { byte.step_unchecked() }; - Ok(true) - }, - Some(_) if format.required_exponent_sign() => { - Err(Error::MissingExponentSign(byte.cursor())) - }, - _ if format.required_exponent_sign() => Err(Error::MissingExponentSign(byte.cursor())), - _ => Ok(false), - } + parse_sign!( + byte, + true, + format.no_positive_exponent_sign(), + format.required_exponent_sign(), + InvalidPositiveExponentSign, + MissingExponentSign + ) } /// Utility to extract the result and handle any errors from parsing a `Number`. /// /// - `format` - The numberical format as a packed integer -/// - `byte` - The BytesIter iterator +/// - `byte` - The DigitsIter iterator /// - `is_negative` - If the final value is negative /// - `parse_normal` - The function to parse non-special numbers with /// - `parse_special` - The function to parse special numbers with @@ -547,31 +492,20 @@ pub fn parse_partial_number<'a, const FORMAT: u128>( // INTEGER // Check to see if we have a valid base prefix. - // NOTE: `read_if` compiles poorly so use `peek` and then `step_unchecked`. #[allow(unused_variables)] let mut is_prefix = false; #[cfg(feature = "format")] { let base_prefix = format.base_prefix(); let mut iter = byte.integer_iter(); - if base_prefix != 0 && iter.peek() == Some(&b'0') { - // SAFETY: safe since `byte.len() >= 1`. - unsafe { iter.step_unchecked() }; + if base_prefix != 0 && iter.read_if_value_cased(b'0').is_some() { // Check to see if the next character is the base prefix. // We must have a format like `0x`, `0d`, `0o`. Note: - if let Some(&c) = iter.peek() { - is_prefix = if format.case_sensitive_base_prefix() { - c == base_prefix - } else { - c.to_ascii_lowercase() == base_prefix.to_ascii_lowercase() - }; - if is_prefix { - // SAFETY: safe since `byte.len() >= 1`. - unsafe { iter.step_unchecked() }; - if iter.is_done() { - return Err(Error::Empty(iter.cursor())); - } - } + is_prefix = true; + if iter.read_if_value(base_prefix, format.case_sensitive_base_prefix()).is_some() + && iter.is_done() + { + return Err(Error::Empty(iter.cursor())); } } } @@ -619,7 +553,8 @@ pub fn parse_partial_number<'a, const FORMAT: u128>( let mut implicit_exponent: i64; let int_end = n_digits as i64; let mut fraction_digits = None; - if byte.first_is(decimal_point) { + // TODO: Change this to something different from read_if_value but same idea + if byte.first_is_cased(decimal_point) { // SAFETY: byte cannot be empty due to first_is unsafe { byte.step_unchecked() }; let before = byte.clone(); @@ -658,11 +593,8 @@ pub fn parse_partial_number<'a, const FORMAT: u128>( // Handle scientific notation. let mut explicit_exponent = 0_i64; - let is_exponent = if cfg!(feature = "format") && format.case_sensitive_exponent() { - byte.first_is(exponent_character) - } else { - byte.case_insensitive_first_is(exponent_character) - }; + let is_exponent = byte + .first_is(exponent_character, format.case_sensitive_exponent() && cfg!(feature = "format")); if is_exponent { // Check float format syntax checks. #[cfg(feature = "format")] @@ -677,6 +609,7 @@ pub fn parse_partial_number<'a, const FORMAT: u128>( } // SAFETY: byte cannot be empty due to `first_is` from `is_exponent`.` + // TODO: Fix: we need a read_if for bytes themselves? unsafe { byte.step_unchecked() }; let is_negative = parse_exponent_sign(&mut byte)?; @@ -708,12 +641,7 @@ pub fn parse_partial_number<'a, const FORMAT: u128>( let base_suffix = format.base_suffix(); #[cfg(feature = "format")] if base_suffix != 0 { - let is_suffix: bool = if format.case_sensitive_base_suffix() { - byte.first_is(base_suffix) - } else { - byte.case_insensitive_first_is(base_suffix) - }; - if is_suffix { + if byte.first_is(base_suffix, format.case_sensitive_base_suffix()) { // SAFETY: safe since `byte.len() >= 1`. unsafe { byte.step_unchecked() }; } @@ -744,24 +672,20 @@ pub fn parse_partial_number<'a, const FORMAT: u128>( } // Check for leading zeros, and to see if we had a false overflow. - // NOTE: Once again, `read_if` is slow: do peek and step n_digits -= step; let mut zeros = start.clone(); let mut zeros_integer = zeros.integer_iter(); - while zeros_integer.peek_is(b'0') { + while zeros_integer.read_if_value_cased(b'0').is_some() { n_digits = n_digits.saturating_sub(1); - // SAFETY: safe since zeros cannot be empty due to peek_is - unsafe { zeros_integer.step_unchecked() }; } - if zeros.first_is(decimal_point) { + if zeros.first_is_cased(decimal_point) { + // TODO: Fix with some read_if like logic // SAFETY: safe since zeros cannot be empty due to first_is unsafe { zeros.step_unchecked() }; } let mut zeros_fraction = zeros.fraction_iter(); - while zeros_fraction.peek_is(b'0') { + while zeros_fraction.read_if_value_cased(b'0').is_some() { n_digits = n_digits.saturating_sub(1); - // SAFETY: safe since zeros cannot be empty due to peek_is - unsafe { zeros_fraction.step_unchecked() }; } // OVERFLOW @@ -824,7 +748,7 @@ pub fn parse_number<'a, const FORMAT: u128>( is_negative: bool, options: &Options, ) -> Result> { - let length = byte.length(); + let length = byte.buffer_length(); let (float, count) = parse_partial_number::(byte, is_negative, options)?; if count == length { Ok(float) @@ -840,7 +764,7 @@ pub fn parse_number<'a, const FORMAT: u128>( #[inline(always)] pub fn parse_digits<'a, Iter, Cb, const FORMAT: u128>(mut iter: Iter, mut cb: Cb) where - Iter: BytesIter<'a>, + Iter: DigitsIter<'a>, Cb: FnMut(u32), { let format = NumberFormat::<{ FORMAT }> {}; @@ -851,6 +775,8 @@ where None => break, } // SAFETY: iter cannot be empty due to `iter.peek()`. + // NOTE: Because of the match statement, this would optimize poorly with + // read_if. unsafe { iter.step_unchecked() }; } } @@ -860,7 +786,7 @@ where #[cfg(not(feature = "compact"))] pub fn parse_8digits<'a, Iter, const FORMAT: u128>(mut iter: Iter, mantissa: &mut u64) where - Iter: BytesIter<'a>, + Iter: DigitsIter<'a>, { let format = NumberFormat::<{ FORMAT }> {}; let radix: u64 = format.radix() as u64; @@ -886,7 +812,7 @@ pub fn parse_u64_digits<'a, Iter, const FORMAT: u128>( mantissa: &mut u64, step: &mut usize, ) where - Iter: BytesIter<'a>, + Iter: DigitsIter<'a>, { let format = NumberFormat::<{ FORMAT }> {}; let radix = format.radix() as u64; @@ -934,7 +860,7 @@ pub fn is_special_eq(mut byte: Bytes, string: &'stat byte.special_iter().peek(); return byte.cursor(); } - } else if shared::case_insensitive_starts_with(byte.special_iter(), string.iter()) { + } else if shared::starts_with_uncased(byte.special_iter(), string.iter()) { // Trim the iterator afterwards. byte.special_iter().peek(); return byte.cursor(); @@ -957,7 +883,7 @@ where } let cursor = byte.cursor(); - let length = byte.length() - cursor; + let length = byte.buffer_length() - cursor; if let Some(nan_string) = options.nan_string() { if length >= nan_string.len() { let count = is_special_eq::(byte.clone(), nan_string); @@ -1013,7 +939,7 @@ pub fn parse_special( where F: LemireFloat, { - let length = byte.length(); + let length = byte.buffer_length(); if let Some((float, count)) = parse_partial_special::(byte, is_negative, options) { if count == length { return Some(float); diff --git a/lexical-parse-float/src/shared.rs b/lexical-parse-float/src/shared.rs index 657adc51..9933c832 100644 --- a/lexical-parse-float/src/shared.rs +++ b/lexical-parse-float/src/shared.rs @@ -114,7 +114,7 @@ where /// This optimizes decently well, to the following ASM for pure slices: /// /// ```text -/// case_insensitive_starts_with_slc: +/// starts_with_uncased: /// xor eax, eax /// .LBB1_1: /// cmp rcx, rax @@ -134,7 +134,7 @@ where /// ret /// ``` #[cfg_attr(not(feature = "compact"), inline(always))] -pub fn case_insensitive_starts_with<'a, 'b, Iter1, Iter2>(mut x: Iter1, mut y: Iter2) -> bool +pub fn starts_with_uncased<'a, 'b, Iter1, Iter2>(mut x: Iter1, mut y: Iter2) -> bool where Iter1: Iterator, Iter2: Iterator, diff --git a/lexical-parse-float/src/slow.rs b/lexical-parse-float/src/slow.rs index e2c8941b..2822b309 100644 --- a/lexical-parse-float/src/slow.rs +++ b/lexical-parse-float/src/slow.rs @@ -10,12 +10,11 @@ use core::cmp; #[cfg(not(feature = "compact"))] use lexical_parse_integer::algorithm; -use lexical_util::buffer::Buffer; use lexical_util::digit::char_to_valid_digit_const; #[cfg(feature = "radix")] use lexical_util::digit::digit_to_char_const; use lexical_util::format::NumberFormat; -use lexical_util::iterator::{AsBytes, BytesIter}; +use lexical_util::iterator::{AsBytes, DigitsIter, Iter}; use lexical_util::num::{AsPrimitive, Integer}; #[cfg(feature = "radix")] @@ -358,12 +357,12 @@ macro_rules! round_up_truncated { /// - `count` - The total number of parsed digits macro_rules! round_up_nonzero { ($format:ident, $iter:expr, $result:ident, $count:ident) => {{ - // NOTE: All digits must be valid. + // NOTE: All digits must already be valid. let mut iter = $iter; // First try reading 8-digits at a time. if iter.is_contiguous() { - while let Some(value) = iter.read_u64() { + while let Some(value) = iter.peek_u64() { // SAFETY: safe since we have at least 8 bytes in the buffer. unsafe { iter.step_by_unchecked(8) }; if value != 0x3030_3030_3030_3030 { diff --git a/lexical-parse-float/tests/shared_tests.rs b/lexical-parse-float/tests/shared_tests.rs index 00e7ad14..9fbcf51b 100644 --- a/lexical-parse-float/tests/shared_tests.rs +++ b/lexical-parse-float/tests/shared_tests.rs @@ -40,11 +40,11 @@ fn starts_with_test() { } #[test] -fn case_insensitive_starts_with_test() { - assert_eq!(shared::case_insensitive_starts_with(b"NaN".iter(), b"nAN".iter()), true); - assert_eq!(shared::case_insensitive_starts_with(b"nAN".iter(), b"nAN".iter()), true); - assert_eq!(shared::case_insensitive_starts_with(b"nAN1".iter(), b"nAN".iter()), true); - assert_eq!(shared::case_insensitive_starts_with(b"nAN1".iter(), b"nAN12".iter()), false); +fn starts_with_uncased_test() { + assert_eq!(shared::starts_with_uncased(b"NaN".iter(), b"nAN".iter()), true); + assert_eq!(shared::starts_with_uncased(b"nAN".iter(), b"nAN".iter()), true); + assert_eq!(shared::starts_with_uncased(b"nAN1".iter(), b"nAN".iter()), true); + assert_eq!(shared::starts_with_uncased(b"nAN1".iter(), b"nAN12".iter()), false); } #[test] diff --git a/lexical-parse-integer/src/algorithm.rs b/lexical-parse-integer/src/algorithm.rs index 1ee2dafa..c41943a3 100644 --- a/lexical-parse-integer/src/algorithm.rs +++ b/lexical-parse-integer/src/algorithm.rs @@ -32,11 +32,10 @@ #![doc(hidden)] -use lexical_util::buffer::Buffer; use lexical_util::digit::char_to_digit_const; use lexical_util::error::Error; use lexical_util::format::NumberFormat; -use lexical_util::iterator::{AsBytes, Bytes, BytesIter}; +use lexical_util::iterator::{AsBytes, Bytes, DigitsIter, Iter}; use lexical_util::num::{as_cast, Integer}; use lexical_util::result::Result; @@ -45,7 +44,7 @@ use crate::Options; // HELPERS /// Check if we should do multi-digit optimizations -const fn can_try_parse_multidigits<'a, Iter: BytesIter<'a>, const FORMAT: u128>(_: &Iter) -> bool { +const fn can_try_parse_multidigits<'a, Iter: DigitsIter<'a>, const FORMAT: u128>(_: &Iter) -> bool { let format = NumberFormat:: {}; Iter::IS_CONTIGUOUS && (cfg!(not(feature = "power-of-two")) || format.mantissa_radix() <= 10) } @@ -125,6 +124,8 @@ macro_rules! fmt_invalid_digit { // invalid_digit!. Need to ensure we include the // base suffix in that. + // TODO: This isn't localized and needs to be fixed + // SAFETY: safe since the iterator is not empty, as checked // in `$iter.is_done()` and `$is_end`. If `$is_end` is false, // then we have more elements in our @@ -159,25 +160,48 @@ macro_rules! fmt_invalid_digit { /// /// Returns if the value is negative, or any values detected when /// validating the input. +#[macro_export] +macro_rules! parse_sign { + ( + $byte:ident, + $is_signed:expr, + $no_positive:expr, + $required:expr, + $invalid_positive:ident, + $missing:ident + ) => { + // NOTE: read_if optimizes poorly since we then match after + match $byte.integer_iter().peek() { + Some(&b'+') if !$no_positive => { + // SAFETY: We have at least 1 item left since we peaked a value + unsafe { $byte.step_unchecked() }; + Ok(false) + }, + Some(&b'+') if $no_positive => Err(Error::$invalid_positive($byte.cursor())), + Some(&b'-') if $is_signed => { + // SAFETY: We have at least 1 item left since we peaked a value + unsafe { $byte.step_unchecked() }; + Ok(true) + }, + Some(_) if $required => Err(Error::$missing($byte.cursor())), + _ if $required => Err(Error::$missing($byte.cursor())), + _ => Ok(false), + } + }; +} + +/// Parse the sign from the leading digits. #[cfg_attr(not(feature = "compact"), inline(always))] pub fn parse_sign(byte: &mut Bytes<'_, FORMAT>) -> Result { let format = NumberFormat:: {}; - match byte.integer_iter().peek() { - Some(&b'+') if !format.no_positive_mantissa_sign() => { - unsafe { byte.step_unchecked() }; - Ok(false) - }, - Some(&b'+') if format.no_positive_mantissa_sign() => { - Err(Error::InvalidPositiveSign(byte.cursor())) - }, - Some(&b'-') if T::IS_SIGNED => { - unsafe { byte.step_unchecked() }; - Ok(true) - }, - Some(_) if format.required_mantissa_sign() => Err(Error::MissingSign(byte.cursor())), - _ if format.required_mantissa_sign() => Err(Error::MissingSign(byte.cursor())), - _ => Ok(false), - } + parse_sign!( + byte, + T::IS_SIGNED, + format.no_positive_mantissa_sign(), + format.required_mantissa_sign(), + InvalidPositiveSign, + MissingSign + ) } // FOUR DIGITS @@ -230,7 +254,7 @@ pub fn parse_4digits(mut v: u32) -> u32 { pub fn try_parse_4digits<'a, T, Iter, const FORMAT: u128>(iter: &mut Iter) -> Option where T: Integer, - Iter: BytesIter<'a>, + Iter: DigitsIter<'a>, { // Can't do fast optimizations with radixes larger than 10, since // we no longer have a contiguous ASCII block. Likewise, cannot @@ -239,7 +263,7 @@ where debug_assert!(Iter::IS_CONTIGUOUS); // Read our digits, validate the input, and check from there. - let bytes = u32::from_le(iter.read_u32()?); + let bytes = u32::from_le(iter.peek_u32()?); if is_4digits::(bytes) { // SAFETY: safe since we have at least 4 bytes in the buffer. unsafe { iter.step_by_unchecked(4) }; @@ -304,7 +328,7 @@ pub fn parse_8digits(mut v: u64) -> u64 { pub fn try_parse_8digits<'a, T, Iter, const FORMAT: u128>(iter: &mut Iter) -> Option where T: Integer, - Iter: BytesIter<'a>, + Iter: DigitsIter<'a>, { // Can't do fast optimizations with radixes larger than 10, since // we no longer have a contiguous ASCII block. Likewise, cannot @@ -313,7 +337,7 @@ where debug_assert!(Iter::IS_CONTIGUOUS); // Read our digits, validate the input, and check from there. - let bytes = u64::from_le(iter.read_u64()?); + let bytes = u64::from_le(iter.peek_u64()?); if is_8digits::(bytes) { // SAFETY: safe since we have at least 8 bytes in the buffer. unsafe { iter.step_by_unchecked(8) }; @@ -448,13 +472,13 @@ macro_rules! parse_digits_unchecked { // `try_parse_4digits` it will be optimized out and the overflow won't // matter. let format = NumberFormat:: {}; - if use_multi && T::BITS >= 64 && $iter.length() >= 8 { + if use_multi && T::BITS >= 64 && $iter.buffer_length() >= 8 { // Try our fast, 8-digit at a time optimizations. let radix8 = T::from_u32(format.radix8()); while let Some(value) = try_parse_8digits::(&mut $iter) { $value = $value.wrapping_mul(radix8).$add_op(value); } - } else if use_multi && T::BITS == 32 && $iter.length() >= 4 { + } else if use_multi && T::BITS == 32 && $iter.buffer_length() >= 4 { // Try our fast, 8-digit at a time optimizations. let radix4 = T::from_u32(format.radix4()); while let Some(value) = try_parse_4digits::(&mut $iter) { @@ -577,20 +601,12 @@ macro_rules! algorithm { if base_prefix != 0 && zeros == 1 { // Check to see if the next character is the base prefix. // We must have a format like `0x`, `0d`, `0o`. Note: - if let Some(&c) = iter.peek() { - is_prefix = if format.case_sensitive_base_prefix() { - c == base_prefix + if iter.read_if_value(base_prefix, format.case_sensitive_base_prefix()).is_some() { + is_prefix = true; + if iter.is_done() { + return into_error!(Empty, iter.cursor()); } else { - c.to_ascii_lowercase() == base_prefix.to_ascii_lowercase() - }; - if is_prefix { - // SAFETY: safe since we `byte.len() >= 1` (we got an item from peek). - unsafe { iter.step_unchecked() }; - if iter.is_done() { - return into_error!(Empty, iter.cursor()); - } else { - start_index += 1; - } + start_index += 1; } } } @@ -635,7 +651,7 @@ macro_rules! algorithm { parse_digits_checked!(value, iter, checked_add, wrapping_add, start_index, $invalid_digit, Overflow, $no_multi_digit, overflow_digits); } - $into_ok!(value, iter.length()) + $into_ok!(value, iter.buffer_length()) }}; } diff --git a/lexical-size/bin/write.rs b/lexical-size/bin/write.rs index 1480314f..97e1ed6b 100644 --- a/lexical-size/bin/write.rs +++ b/lexical-size/bin/write.rs @@ -1,8 +1,6 @@ #[allow(unused_macros)] macro_rules! integer_module { ($t:ty) => { - #[cfg(feature = "lexical")] - use core::mem; use std::io::BufRead; #[cfg(not(feature = "lexical"))] use std::io::Write; @@ -27,10 +25,7 @@ macro_rules! integer_module { #[cfg(feature = "lexical")] { - // FIXME: This is UB but it doesn't affect code integrity here - // Undo when we implement #92. - let buffer: mem::MaybeUninit<[u8; 128]> = mem::MaybeUninit::uninit(); - let mut buffer = unsafe { buffer.assume_init() }; + let mut buffer: [u8; 128] = [0u8; 128]; println!("{}", value.to_lexical(&mut buffer).len()); } @@ -47,8 +42,6 @@ macro_rules! integer_module { #[allow(unused_macros)] macro_rules! float_module { ($t:ty) => { - #[cfg(feature = "lexical")] - use core::mem; use std::io::BufRead; #[cfg(not(feature = "lexical"))] use std::io::Write; @@ -73,10 +66,7 @@ macro_rules! float_module { #[cfg(feature = "lexical")] { - // FIXME: This is UB but it doesn't affect code integrity here - // Undo when we implement #92. - let buffer: mem::MaybeUninit<[u8; 128]> = mem::MaybeUninit::uninit(); - let mut buffer = unsafe { buffer.assume_init() }; + let mut buffer = [0u8; 128]; println!("{}", value.to_lexical(&mut buffer).len()); } diff --git a/lexical-util/src/buffer.rs b/lexical-util/src/buffer.rs deleted file mode 100644 index bbfeea8a..00000000 --- a/lexical-util/src/buffer.rs +++ /dev/null @@ -1,96 +0,0 @@ -//! Specialized buffer traits. -//! -//! The traits are for iterables containing bytes, and provide optimizations -//! which then can be used for contiguous or non-contiguous iterables, -//! including containers or iterators of any kind. - -/// A trait for working with iterables of bytes. -/// -/// These buffers can either be contiguous or not contiguous and provide -/// methods for reading data and accessing underlying data. The readers -/// can either be contiguous or non-contiguous, although performance and -/// some API methods may not be available for both. -/// -/// # Safety -/// -/// This trait is effectively safe but the implementor must guarantee that -/// `is_empty` is implemented correctly. For most implementations, this can -/// be `self.as_slice().is_empty()`, where `as_slice` is implemented as -/// `&self.bytes[self.index..]`. -#[cfg(feature = "parse")] -pub unsafe trait Buffer<'a> { - /// Determine if the buffer is contiguous in memory. - const IS_CONTIGUOUS: bool; - - /// Get a ptr to the current start of the buffer. - fn as_ptr(&self) -> *const u8; - - /// Get a slice to the current start of the buffer. - fn as_slice(&self) -> &'a [u8]; - - /// Get if no bytes are available in the buffer. - /// - /// This operators on the underlying buffer: that is, - /// it returns if [as_slice] would return an empty slice. - /// - /// [as_slice]: Buffer::as_slice - #[inline(always)] - fn is_empty(&self) -> bool { - self.as_slice().is_empty() - } - - /// Determine if the buffer is contiguous. - #[inline(always)] - fn is_contiguous(&self) -> bool { - Self::IS_CONTIGUOUS - } - - /// Peek the next value of the buffer, without checking bounds. - /// - /// # Safety - /// - /// Safe as long as there is at least a single valid value left in - /// the buffer. Note that the behavior of this may lead to out-of-bounds - /// access (for contiguous buffers) or panics (for non-contiguous - /// buffers). - unsafe fn first_unchecked(&self) -> &'a u8; - - /// Get the next value available without consuming it. - /// - /// # Safety - /// - /// An implementor must implement `is_empty` correctly in - /// order to guarantee the traitt is safe: `is_empty` **MUST** - /// ensure that one value remains, if the iterator is non- - /// contiguous this means advancing the iterator to the next - /// position. - #[inline(always)] - fn first(&self) -> Option<&'a u8> { - if !self.is_empty() { - // SAFETY: safe since the buffer cannot be empty as validated before. - unsafe { Some(self.first_unchecked()) } - } else { - None - } - } - - /// Check if the next element is a given value. - #[inline(always)] - fn first_is(&self, value: u8) -> bool { - if let Some(&c) = self.first() { - c == value - } else { - false - } - } - - /// Check if the next element is a given value without case sensitivity. - #[inline(always)] - fn case_insensitive_first_is(&self, value: u8) -> bool { - if let Some(&c) = self.first() { - c.to_ascii_lowercase() == value.to_ascii_lowercase() - } else { - false - } - } -} diff --git a/lexical-util/src/iterator.rs b/lexical-util/src/iterator.rs index cd4a6ced..d14c5aa5 100644 --- a/lexical-util/src/iterator.rs +++ b/lexical-util/src/iterator.rs @@ -1,35 +1,63 @@ //! Specialized iterator traits. //! -//! The traits are iterable, and provide optimizations for contiguous -//! iterators, while still working for non-contiguous data. +//! The traits are for iterables containing bytes, and provide optimizations +//! which then can be used for contiguous or non-contiguous iterables, +//! including containers or iterators of any kind. #![cfg(feature = "parse")] -pub use crate::buffer::Buffer; +use core::mem; + // Re-export our digit iterators. #[cfg(not(feature = "format"))] pub use crate::noskip::{AsBytes, Bytes}; #[cfg(feature = "format")] pub use crate::skip::{AsBytes, Bytes}; -/// Iterator over a contiguous block of bytes. +/// A trait for working with iterables of bytes. /// -/// This allows us to convert to-and-from-slices, raw pointers, and -/// peek/query the data from either end cheaply. -/// -/// A default implementation is provided for slice iterators. -/// This trait **should never** return `null` from `as_ptr`, or be -/// implemented for non-contiguous data. +/// These iterators can either be contiguous or not contiguous and provide +/// methods for reading data and accessing underlying data. The readers +/// can either be contiguous or non-contiguous, although performance and +/// some API methods may not be available for both. /// /// # Safety /// -/// The safe methods are sound as long as the caller ensures that -/// the methods for `read_32`, `read_64`, etc. check the bounds -/// of the underlying contiguous buffer and is only called on -/// contiguous buffers. -pub unsafe trait BytesIter<'a>: Iterator + Buffer<'a> { - /// Get the total number of elements in the underlying slice. - fn length(&self) -> usize; +/// // TODO: FIX CORRECTNESS DOCUMENTATION +/// This trait is effectively safe but the implementor must guarantee that +/// `is_empty` is implemented correctly. For most implementations, this can +/// be `self.as_slice().is_empty()`, where `as_slice` is implemented as +/// `&self.bytes[self.index..]`. +#[cfg(feature = "parse")] +pub unsafe trait Iter<'a> { + /// Determine if the buffer is contiguous in memory. + const IS_CONTIGUOUS: bool; + + // CURSORS + // ------- + + /// Get a ptr to the current start of the buffer. + #[inline(always)] + fn as_ptr(&self) -> *const u8 { + self.as_slice().as_ptr() + } + + /// Get a slice to the current start of the buffer. + #[inline(always)] + fn as_slice(&self) -> &'a [u8] { + debug_assert!(self.cursor() <= self.buffer_length()); + // SAFETY: safe since index must be in range. + unsafe { self.get_buffer().get_unchecked(self.cursor()..) } + } + + /// Get a slice to the full underlying contiguous buffer, + fn get_buffer(&self) -> &'a [u8]; + + /// Get the total number of elements in the underlying buffer. + #[inline(always)] + fn buffer_length(&self) -> usize { + self.get_buffer().len() + } /// Get the current index of the iterator in the slice. fn cursor(&self) -> usize; @@ -44,18 +72,193 @@ pub unsafe trait BytesIter<'a>: Iterator + Buffer<'a> { /// /// # Safety /// - /// Safe if `index <= self.length()`. Although this won't - /// affect safety, the caller also should be careful it - /// does not set the cursor within skipped characters. + /// Safe if `index <= self.buffer_length()`. Although this + /// won't affect safety, the caller also should be careful it + /// does not set the cursor within skipped characters + /// since this could affect correctness: an iterator that + /// only accepts non-consecutive digit separators would + /// pass if the cursor was set between the two. unsafe fn set_cursor(&mut self, index: usize); - /// Get a slice to the full buffer, which may or may not be the same as - /// `as_slice`. - fn as_full_slice(&self) -> &'a [u8]; - /// Get the current number of values returned by the iterator. + /// + /// For contiguous iterators, this is always the cursor, for + /// non-contiguous iterators this can be smaller. fn current_count(&self) -> usize; + // TODO: DOCUMENT + // PROPERTIES + + // TODO: Fix this naming convention + /// Get if no bytes are available in the buffer. + /// + /// This operators on the underlying buffer: that is, + /// it returns if [as_slice] would return an empty slice. + /// + /// [as_slice]: Iter::as_slice + #[inline(always)] + fn is_empty(&self) -> bool { + self.as_slice().is_empty() + } + + /// Determine if the buffer is contiguous. + #[inline(always)] + fn is_contiguous(&self) -> bool { + Self::IS_CONTIGUOUS + } + + // TODO: Ensure we get this **RIGHT** + + /// Get the next value available without consuming it. + /// + /// This does **NOT** skip digits, and directly fetches the item + /// from the underlying buffer. + /// + /// # Safety + /// + /// An implementor must implement `is_empty` correctly in + /// order to guarantee the traitt is safe: `is_empty` **MUST** + /// ensure that one value remains, if the iterator is non- + /// contiguous this means advancing the iterator to the next + /// position. + fn first(&self) -> Option<&'a u8>; + + /// Check if the next element is a given value. + #[inline(always)] + fn first_is_cased(&self, value: u8) -> bool { + Some(&value) == self.first() + } + + /// Check if the next element is a given value without case sensitivity. + #[inline(always)] + fn first_is_uncased(&self, value: u8) -> bool { + if let Some(&c) = self.first() { + c.to_ascii_lowercase() == value.to_ascii_lowercase() + } else { + false + } + } + + /// Check if the next item in buffer is a given value with optional case + /// sensitivity. + #[inline(always)] + fn first_is(&mut self, value: u8, is_cased: bool) -> bool { + if is_cased { + self.first_is_cased(value) + } else { + self.first_is_uncased(value) + } + } + + // STEP BY + // ------- + + /// Advance the internal slice by `N` elements. + /// + /// This does not advance the iterator by `N` elements for + /// non-contiguous iterators: this just advances the internal, + /// underlying buffer. This is useful for multi-digit optimizations + /// for contiguous iterators. + /// + /// # Panics + /// + /// This will panic if the buffer advances for non-contiguous + /// iterators if the current byte is a digit separator, or if the + /// count is more than 1. + /// + /// # Safety + /// + /// As long as the iterator is at least `N` elements, this + /// is safe. + unsafe fn step_by_unchecked(&mut self, count: usize); + + /// Advance the internal slice by 1 element. + /// + /// # Panics + /// + /// This will panic if the buffer advances for non-contiguous + /// iterators if the current byte is a digit separator. + /// + /// # Safety + /// + /// Safe as long as the iterator is not empty. + #[inline(always)] + unsafe fn step_unchecked(&mut self) { + debug_assert!(!self.as_slice().is_empty()); + // SAFETY: safe if `self.index < self.buffer_length()`. + unsafe { self.step_by_unchecked(1) }; + } + + // READ + // ---- + + /// Read a value of a difference type from the iterator. + /// + /// This does **not** advance the internal state of the iterator. + /// This can only be implemented for contiguous iterators: non- + /// contiguous iterators **MUST** panic. + /// + /// # Panics + /// + /// If the iterator is a non-contiguous iterator. + /// + /// # Safety + /// + /// Safe as long as the number of the buffer is contains as least as + /// many bytes as the size of V. This must be unimplemented for + /// non-contiguous iterators. + #[inline(always)] + unsafe fn peek_many_unchecked(&self) -> V { + unimplemented!(); + } + + /// Try to read a the next four bytes as a u32. + /// + /// This does not advance the internal state of the iterator. + #[inline(always)] + fn peek_u32(&self) -> Option { + if Self::IS_CONTIGUOUS && self.as_slice().len() >= mem::size_of::() { + // SAFETY: safe since we've guaranteed the buffer is greater than + // the number of elements read. u32 is valid for all bit patterns + unsafe { Some(self.peek_many_unchecked()) } + } else { + None + } + } + + /// Try to read the next eight bytes as a u64. + /// + /// This does not advance the internal state of the iterator. + #[inline(always)] + fn peek_u64(&self) -> Option { + if Self::IS_CONTIGUOUS && self.as_slice().len() >= mem::size_of::() { + // SAFETY: safe since we've guaranteed the buffer is greater than + // the number of elements read. u64 is valid for all bit patterns + unsafe { Some(self.peek_many_unchecked()) } + } else { + None + } + } +} + +/// Iterator over a contiguous block of bytes. +/// +/// This allows us to convert to-and-from-slices, raw pointers, and +/// peek/query the data from either end cheaply. +/// +/// A default implementation is provided for slice iterators. +/// This trait **should never** return `null` from `as_ptr`, or be +/// implemented for non-contiguous data. +/// +/// # Safety +/// +/// TODO: Fix the safety documentation here... +/// The safe methods are sound as long as the caller ensures that +/// the methods for `read_32`, `read_64`, etc. check the bounds +/// of the underlying contiguous buffer and is only called on +/// contiguous buffers. +pub trait DigitsIter<'a>: Iterator + Iter<'a> { + // TODO: Fix the documentation /// Get if the iterator cannot return any more elements. /// /// This may advance the internal iterator state, but not @@ -77,9 +280,9 @@ pub unsafe trait BytesIter<'a>: Iterator + Buffer<'a> { /// If you would like to see if the cursor is at the end of the buffer, /// see [is_done] or [is_empty] instead. /// - /// [is_done]: BytesIter::is_done - /// [is_empty]: Buffer::is_empty - /// [peek]: BytesIter::peek + /// [is_done]: DigitsIter::is_done + /// [is_empty]: Iter::is_empty + /// [peek]: DigitsIter::peek #[inline(always)] #[allow(clippy::wrong_self_convention)] fn is_consumed(&mut self) -> bool { @@ -94,35 +297,14 @@ pub unsafe trait BytesIter<'a>: Iterator + Buffer<'a> { /// iterator has an empty slice. It is effectively a cheaper, /// but weaker variant of [is_consumed]. /// - /// [is_consumed]: BytesIter::is_consumed + /// [is_consumed]: DigitsIter::is_consumed fn is_done(&self) -> bool; - /// Peek the next value of the iterator, without checking bounds. - /// - /// Note that this can modify the internal state, by skipping digits - /// for iterators that find the first non-zero value, etc. - /// - /// # Safety - /// - /// Safe as long as there is at least a single valid value left in - /// the iterator. Note that the behavior of this may lead to out-of-bounds - /// access (for contiguous iterators) or panics (for non-contiguous - /// iterators). - unsafe fn peek_unchecked(&mut self) -> Self::Item; - /// Peek the next value of the iterator, without consuming it. /// /// Note that this can modify the internal state, by skipping digits /// for iterators that find the first non-zero value, etc. - #[inline(always)] - fn peek(&mut self) -> Option { - if !self.is_empty() { - // SAFETY: safe since the buffer cannot be empty - unsafe { Some(self.peek_unchecked()) } - } else { - None - } - } + fn peek(&mut self) -> Option; /// Peek the next value of the iterator, and step only if it exists. #[inline(always)] @@ -138,22 +320,39 @@ pub unsafe trait BytesIter<'a>: Iterator + Buffer<'a> { /// Check if the next element is a given value. #[inline(always)] - fn peek_is(&mut self, value: u8) -> bool { + fn peek_is_cased(&mut self, value: u8) -> bool { + Some(&value) == self.peek() + } + + /// Check if the next element is a given value without case sensitivity. + #[inline(always)] + fn peek_is_uncased(&mut self, value: u8) -> bool { if let Some(&c) = self.peek() { - c == value + c.to_ascii_lowercase() == value.to_ascii_lowercase() } else { false } } + /// Check if the next element is a given value with optional case + /// sensitivity. + #[inline(always)] + fn peek_is(&mut self, value: u8, is_cased: bool) -> bool { + if is_cased { + self.peek_is_cased(value) + } else { + self.peek_is_uncased(value) + } + } + /// Peek the next value and consume it if the read value matches the /// expected one. #[inline(always)] - fn read_if bool>(&mut self, pred: Pred) -> Option { + fn read_if bool>(&mut self, pred: Pred) -> Option { // NOTE: This was implemented to remove usage of unsafe throughout to code // base, however, performance was really not up to scratch. I'm not sure // the cause of this. - if let Some(peeked) = self.peek() { + if let Some(&peeked) = self.peek() { if pred(peeked) { // SAFETY: the slice cannot be empty because we peeked a value. unsafe { self.step_unchecked() }; @@ -166,70 +365,43 @@ pub unsafe trait BytesIter<'a>: Iterator + Buffer<'a> { } } - /// Check if the next element is a given value without case sensitivity. + /// Read a value if the value matches the provided one. #[inline(always)] - fn case_insensitive_peek_is(&mut self, value: u8) -> bool { - if let Some(&c) = self.peek() { - c.to_ascii_lowercase() == value.to_ascii_lowercase() + fn read_if_value_cased(&mut self, value: u8) -> Option { + if self.peek() == Some(&value) { + // SAFETY: the slice cannot be empty because we peeked a value. + unsafe { self.step_unchecked() }; + Some(value) } else { - false + None } } - /// Skip zeros from the start of the iterator + /// Read a value if the value matches the provided one without case + /// sensitivity. #[inline(always)] - fn skip_zeros(&mut self) -> usize { - let start = self.cursor(); - while let Some(&b'0') = self.peek() { - self.next(); - } - self.cursor() - start + fn read_if_value_uncased(&mut self, value: u8) -> Option { + self.read_if(|x| x.to_ascii_lowercase() == value.to_ascii_lowercase()) } - /// Read a value of a difference type from the iterator. - /// - /// This advances the internal state of the iterator. This - /// can only be implemented for contiguous iterators: non- - /// contiguous iterators **MUST** panic. - /// - /// # Safety - /// - /// Safe as long as the number of the buffer is contains as least as - /// many bytes as the size of V. This must be unimplemented for - /// non-contiguous iterators. + /// Read a value if the value matches the provided one. #[inline(always)] - unsafe fn read_unchecked(&self) -> V { - unimplemented!(); + fn read_if_value(&mut self, value: u8, is_cased: bool) -> Option { + if is_cased { + self.read_if_value_cased(value) + } else { + self.read_if_value_uncased(value) + } } - /// Try to read a the next four bytes as a u32. - /// This advances the internal state of the iterator. - fn read_u32(&self) -> Option; - - /// Try to read the next eight bytes as a u64 - /// This advances the internal state of the iterator. - fn read_u64(&self) -> Option; - - /// Advance the internal slice by `N` elements. - /// - /// This does not advance the iterator by `N` elements for - /// non-contiguous iterators: this just advances the internal, - /// underlying buffer. This is useful for multi-digit optimizations - /// for contiguous iterators. - /// - /// # Safety - /// - /// As long as the iterator is at least `N` elements, this - /// is safe. - unsafe fn step_by_unchecked(&mut self, count: usize); - - /// Advance the internal slice by 1 element. - /// - /// # Safety - /// - /// Safe as long as the iterator is not empty. + /// Skip zeros from the start of the iterator #[inline(always)] - unsafe fn step_unchecked(&mut self) { - unsafe { self.step_by_unchecked(1) }; + fn skip_zeros(&mut self) -> usize { + let start = self.cursor(); + while self.read_if_value_cased(b'0').is_some() {} + self.cursor() - start } + + /// Determine if the character is a digit. + fn is_digit(&self, value: u8) -> bool; } diff --git a/lexical-util/src/lib.rs b/lexical-util/src/lib.rs index 2357bf43..e3eab8f5 100644 --- a/lexical-util/src/lib.rs +++ b/lexical-util/src/lib.rs @@ -42,10 +42,10 @@ //! traits to clearly define safety invariants and localize any unsafety to //! 1 or 2 lines of code. //! -//! The core, unsafe trait is `BytesIter` and `Buffer`, both which expect +//! The core, unsafe trait is `DigitsIter` and `Iter`, both which expect //! to be backed by a contiguous block of memory (a slice) but may skip //! bytes internally. To guarantee safety, for non-skip iterators you -//! must implement [BytesIter::is_consumed][is_consumed] correctly. +//! must implement [DigitsIter::is_consumed][is_consumed] correctly. //! //! This must correctly determine if there are any elements left in the //! iterator. If the buffer is contiguous, this can just be `index == @@ -55,21 +55,18 @@ //! correctly. //! //! To see if the cursor is at the end of the buffer, use -//! [BytesIter::is_done][is_done]. +//! [DigitsIter::is_done][is_done]. //! //! Any iterators must be peekable: you must be able to read and return the next //! value without advancing the iterator past that point. For iterators that //! skip bytes, this means advancing to the next element to be returned and -//! returning that value. Therefore, [peek_unchecked] for skip iterators must be -//! implemented in terms of [peek]: you must peek the next element and -//! [peek_unchecked] will merely unwrap this value. Contiguous iterators can use -//! raw indexing instead. +//! returning that value. //! //! For examples of how to safely implement skip iterators, you can do something //! like: //! //! ```rust,ignore -//! unsafe impl<_> BytesIter<_> for MyIter { +//! unsafe impl<_> DigitsIter<_> for MyIter { //! fn peek(&mut self) -> Option { //! loop { //! let value = self.bytes.get(self.index)?; @@ -98,10 +95,9 @@ //! } //! ``` //! -//! [is_done]: iterator::BytesIter::is_done -//! [is_consumed]: iterator::BytesIter::is_consumed -//! [peek]: iterator::BytesIter::peek -//! [peek_unchecked]: iterator::BytesIter::peek_unchecked +//! [is_done]: iterator::DigitsIter::is_done +//! [is_consumed]: iterator::DigitsIter::is_consumed +//! [peek]: iterator::DigitsIter::peek // We want to have the same safety guarantees as Rust core, // so we allow unused unsafe to clearly document safety guarantees. @@ -113,7 +109,6 @@ pub mod algorithm; pub mod ascii; pub mod assert; pub mod bf16; -pub mod buffer; pub mod constants; pub mod digit; pub mod div128; diff --git a/lexical-util/src/noskip.rs b/lexical-util/src/noskip.rs index 7fc3d2f2..7fdbb074 100644 --- a/lexical-util/src/noskip.rs +++ b/lexical-util/src/noskip.rs @@ -7,7 +7,9 @@ use core::{mem, ptr}; -use crate::{buffer::Buffer, iterator::BytesIter}; +use crate::digit::char_is_digit_const; +use crate::format::NumberFormat; +use crate::iterator::{DigitsIter, Iter}; // AS DIGITS // --------- @@ -47,12 +49,13 @@ impl<'a, const __: u128> Bytes<'a, __> { } } + // TODO: Move to `Iter` as a trait along with `new` as well` /// Initialize the slice from raw parts. /// /// # Safety /// This is safe if and only if the index is <= slc.len(). /// For this reason, since it's easy to get wrong, we only - /// expose it to `BytesIterator` and nothing else. + /// expose it to `DigitsIterator` and nothing else. #[inline(always)] #[allow(clippy::assertions_on_constants)] const unsafe fn from_parts(slc: &'a [u8], index: usize) -> Self { @@ -64,35 +67,6 @@ impl<'a, const __: u128> Bytes<'a, __> { } } - /// Get the total number of elements in the underlying slice. - #[inline(always)] - pub fn length(&self) -> usize { - self.slc.len() - } - - /// Get the current index of the iterator in the slice. - #[inline(always)] - pub fn cursor(&self) -> usize { - self.index - } - - /// Set the current index of the iterator in the slice. - /// - /// # Safety - /// - /// Safe if `index <= self.length()`. - #[inline(always)] - pub unsafe fn set_cursor(&mut self, index: usize) { - debug_assert!(index <= self.length()); - self.index = index - } - - /// Get the current number of values returned by the iterator. - #[inline(always)] - pub fn current_count(&self) -> usize { - self.index - } - /// Get if the buffer underlying the iterator is empty. /// Same as `is_consumed`. #[inline(always)] @@ -100,146 +74,97 @@ impl<'a, const __: u128> Bytes<'a, __> { self.index >= self.slc.len() } - /// Read a value of a difference type from the iterator. - /// This advances the internal state of the iterator. - /// - /// # Safety - /// - /// Safe as long as the number of the buffer is contains as least as - /// many bytes as the size of V, and V is valid for all bit patterns. - #[inline(always)] - #[allow(clippy::assertions_on_constants)] - pub unsafe fn read_unchecked(&self) -> V { - debug_assert!(Self::IS_CONTIGUOUS); - debug_assert!(self.as_slice().len() >= mem::size_of::()); - - let slc = self.as_slice(); - // SAFETY: safe as long as the slice has at least count elements. - unsafe { ptr::read_unaligned::(slc.as_ptr() as *const _) } - } - - /// Try to read a the next four bytes as a u32. - /// This advances the internal state of the iterator. - #[inline(always)] - pub fn read_u32(&self) -> Option { - if Self::IS_CONTIGUOUS && self.as_slice().len() >= mem::size_of::() { - // SAFETY: safe since we've guaranteed the buffer is greater than - // the number of elements read. u32 is valid for all bit patterns - unsafe { Some(self.read_unchecked()) } - } else { - None - } - } - - /// Try to read the next eight bytes as a u64 - /// This advances the internal state of the iterator. - #[inline(always)] - pub fn read_u64(&self) -> Option { - if Self::IS_CONTIGUOUS && self.as_slice().len() >= mem::size_of::() { - // SAFETY: safe since we've guaranteed the buffer is greater than - // the number of elements read. u64 is valid for all bit patterns - unsafe { Some(self.read_unchecked()) } - } else { - None - } - } - - /// Check if the next element is a given value. - #[inline(always)] - pub fn peek_is(&mut self, value: u8) -> bool { - self.first_is(value) - } - - /// Check if the next element is a given value without case sensitivity. - #[inline(always)] - pub fn case_insensitive_peek_is(&mut self, value: u8) -> bool { - self.case_insensitive_first_is(value) - } - /// Get iterator over integer digits. #[inline(always)] - pub fn integer_iter<'b>(&'b mut self) -> BytesIterator<'a, 'b, __> { - BytesIterator { + pub fn integer_iter<'b>(&'b mut self) -> DigitsIterator<'a, 'b, __> { + DigitsIterator { byte: self, } } /// Get iterator over fraction digits. #[inline(always)] - pub fn fraction_iter<'b>(&'b mut self) -> BytesIterator<'a, 'b, __> { - BytesIterator { + pub fn fraction_iter<'b>(&'b mut self) -> DigitsIterator<'a, 'b, __> { + DigitsIterator { byte: self, } } /// Get iterator over exponent digits. #[inline(always)] - pub fn exponent_iter<'b>(&'b mut self) -> BytesIterator<'a, 'b, __> { - BytesIterator { + pub fn exponent_iter<'b>(&'b mut self) -> DigitsIterator<'a, 'b, __> { + DigitsIterator { byte: self, } } /// Get iterator over special floating point values. #[inline(always)] - pub fn special_iter<'b>(&'b mut self) -> BytesIterator<'a, 'b, __> { - BytesIterator { + pub fn special_iter<'b>(&'b mut self) -> DigitsIterator<'a, 'b, __> { + DigitsIterator { byte: self, } } +} + +unsafe impl<'a, const __: u128> Iter<'a> for Bytes<'a, __> { + const IS_CONTIGUOUS: bool = true; - /// Advance the byte by `N` elements. - /// - /// # Safety - /// - /// As long as the iterator is at least `N` elements, this - /// is safe. #[inline(always)] - #[allow(clippy::assertions_on_constants)] - pub unsafe fn step_by_unchecked(&mut self, count: usize) { - debug_assert!(Self::IS_CONTIGUOUS); - debug_assert!(self.as_slice().len() >= count); - self.index += count; + fn get_buffer(&self) -> &'a [u8] { + self.slc + } + + /// Get the current index of the iterator in the slice. + #[inline(always)] + fn cursor(&self) -> usize { + self.index } - /// Advance the byte by 1 element. + /// Set the current index of the iterator in the slice. /// /// # Safety /// - /// Safe as long as the iterator is not empty. + /// Safe if `index <= self.buffer_length()`. #[inline(always)] - #[allow(clippy::assertions_on_constants)] - pub unsafe fn step_unchecked(&mut self) { - debug_assert!(Self::IS_CONTIGUOUS); - debug_assert!(!self.as_slice().is_empty()); - self.index += 1; + unsafe fn set_cursor(&mut self, index: usize) { + debug_assert!(index <= self.buffer_length()); + self.index = index } -} -unsafe impl<'a, const __: u128> Buffer<'a> for Bytes<'a, __> { - const IS_CONTIGUOUS: bool = true; + /// Get the current number of values returned by the iterator. + #[inline(always)] + fn current_count(&self) -> usize { + self.index + } + // TODO: Rename #[inline(always)] - fn as_ptr(&self) -> *const u8 { - self.as_slice().as_ptr() + fn is_empty(&self) -> bool { + self.as_slice().is_empty() } #[inline(always)] - fn as_slice(&self) -> &'a [u8] { - debug_assert!(self.index <= self.length()); - // SAFETY: safe since index must be in range. - unsafe { self.slc.get_unchecked(self.index..) } + fn first(&self) -> Option<&'a u8> { + self.slc.get(self.index) } #[inline(always)] - fn is_empty(&self) -> bool { - self.as_slice().is_empty() + #[allow(clippy::assertions_on_constants)] + unsafe fn step_by_unchecked(&mut self, count: usize) { + debug_assert!(Self::IS_CONTIGUOUS); + debug_assert!(self.as_slice().len() >= count); + self.index += count; } #[inline(always)] - unsafe fn first_unchecked(&self) -> &'a u8 { - // SAFETY: safe if there's at least 1 item in the buffer - unsafe { self.as_slice().get_unchecked(0) } + #[allow(clippy::assertions_on_constants)] + unsafe fn peek_many_unchecked(&self) -> V { + debug_assert!(Self::IS_CONTIGUOUS); + debug_assert!(self.as_slice().len() >= mem::size_of::()); + + // SAFETY: safe as long as the slice has at least count elements. + unsafe { ptr::read_unaligned::(self.as_ptr() as *const _) } } } @@ -247,12 +172,22 @@ unsafe impl<'a, const __: u128> Buffer<'a> for Bytes<'a, __> { // --------------- /// Slice iterator that stores the original length of the slice. -pub struct BytesIterator<'a: 'b, 'b, const __: u128> { +pub struct DigitsIterator<'a: 'b, 'b, const __: u128> { /// The internal byte object for the noskip iterator. byte: &'b mut Bytes<'a, __>, } -impl<'a: 'b, 'b, const __: u128> BytesIterator<'a, 'b, __> { +impl<'a: 'b, 'b, const __: u128> DigitsIterator<'a, 'b, __> { + /// Create a new digits iterator from the bytes underlying item. + #[inline(always)] + pub fn new(byte: &'b mut Bytes<'a, __>) -> Self { + Self { + byte, + } + } + + // TODO: Move as a trait + /// Take the first N digits from the iterator. /// /// This only takes the digits if we have a contiguous iterator. @@ -277,35 +212,12 @@ impl<'a: 'b, 'b, const __: u128> BytesIterator<'a, 'b, __> { } } -unsafe impl<'a: 'b, 'b, const __: u128> Buffer<'a> for BytesIterator<'a, 'b, __> { +unsafe impl<'a: 'b, 'b, const __: u128> Iter<'a> for DigitsIterator<'a, 'b, __> { const IS_CONTIGUOUS: bool = Bytes::<'a, __>::IS_CONTIGUOUS; #[inline(always)] - fn as_ptr(&self) -> *const u8 { - self.byte.as_ptr() - } - - #[inline(always)] - fn as_slice(&self) -> &'a [u8] { - self.byte.as_slice() - } - - #[inline(always)] - fn is_empty(&self) -> bool { - self.byte.is_done() - } - - #[inline(always)] - unsafe fn first_unchecked(&self) -> ::Item { - // SAFETY: safe if `self.cursor() < self.length()`. - unsafe { self.byte.slc.get_unchecked(self.byte.index) } - } -} - -unsafe impl<'a: 'b, 'b, const __: u128> BytesIter<'a> for BytesIterator<'a, 'b, __> { - #[inline(always)] - fn length(&self) -> usize { - self.byte.length() + fn get_buffer(&self) -> &'a [u8] { + self.byte.get_buffer() } #[inline(always)] @@ -315,75 +227,66 @@ unsafe impl<'a: 'b, 'b, const __: u128> BytesIter<'a> for BytesIterator<'a, 'b, #[inline(always)] unsafe fn set_cursor(&mut self, index: usize) { - debug_assert!(index <= self.length()); - // SAFETY: safe if `index <= self.length()`. + debug_assert!(index <= self.buffer_length()); + // SAFETY: safe if `index <= self.buffer_length()`. unsafe { self.byte.set_cursor(index) }; } - #[inline(always)] - fn as_full_slice(&self) -> &'a [u8] { - self.byte.slc - } - #[inline(always)] fn current_count(&self) -> usize { self.byte.current_count() } #[inline(always)] - fn is_consumed(&mut self) -> bool { - Self::is_done(self) - } - - #[inline(always)] - fn is_done(&self) -> bool { + fn is_empty(&self) -> bool { self.byte.is_done() } #[inline(always)] - unsafe fn peek_unchecked(&mut self) -> ::Item { - // SAFETY: safe if `self.cursor() < self.length()`. - unsafe { self.first_unchecked() } + fn first(&self) -> Option<&'a u8> { + self.byte.first() } #[inline(always)] - fn peek(&mut self) -> Option<::Item> { - self.first() + unsafe fn step_by_unchecked(&mut self, count: usize) { + debug_assert!(self.as_slice().len() >= count); + // SAFETY: safe as long as `slc.len() >= count`. + unsafe { self.byte.step_by_unchecked(count) } } #[inline(always)] - unsafe fn read_unchecked(&self) -> V { + unsafe fn peek_many_unchecked(&self) -> V { debug_assert!(self.as_slice().len() >= mem::size_of::()); // SAFETY: safe as long as the slice has at least count elements. - unsafe { self.byte.read_unchecked() } + unsafe { self.byte.peek_many_unchecked() } } +} +impl<'a: 'b, 'b, const FORMAT: u128> DigitsIter<'a> for DigitsIterator<'a, 'b, FORMAT> { #[inline(always)] - fn read_u32(&self) -> Option { - self.byte.read_u32() + fn is_consumed(&mut self) -> bool { + Self::is_done(self) } #[inline(always)] - fn read_u64(&self) -> Option { - self.byte.read_u64() + fn is_done(&self) -> bool { + self.byte.is_done() } #[inline(always)] - unsafe fn step_by_unchecked(&mut self, count: usize) { - debug_assert!(self.as_slice().len() >= count); - // SAFETY: safe as long as `slc.len() >= count`. - unsafe { self.byte.step_by_unchecked(count) } + fn peek(&mut self) -> Option<::Item> { + self.byte.slc.get(self.byte.index) } + /// Determine if the character is a digit. #[inline(always)] - unsafe fn step_unchecked(&mut self) { - debug_assert!(!self.as_slice().is_empty()); - // SAFETY: safe as long as `slc.len() >= 1`. - unsafe { self.byte.step_unchecked() } + fn is_digit(&self, value: u8) -> bool { + let format = NumberFormat::<{ FORMAT }> {}; + char_is_digit_const(value, format.mantissa_radix()) } } -impl<'a: 'b, 'b, const __: u128> Iterator for BytesIterator<'a, 'b, __> { +impl<'a: 'b, 'b, const __: u128> Iterator for DigitsIterator<'a, 'b, __> { type Item = &'a u8; #[inline(always)] @@ -394,9 +297,9 @@ impl<'a: 'b, 'b, const __: u128> Iterator for BytesIterator<'a, 'b, __> { } } -impl<'a: 'b, 'b, const __: u128> ExactSizeIterator for BytesIterator<'a, 'b, __> { +impl<'a: 'b, 'b, const __: u128> ExactSizeIterator for DigitsIterator<'a, 'b, __> { #[inline(always)] fn len(&self) -> usize { - self.length() - self.cursor() + self.buffer_length() - self.cursor() } } diff --git a/lexical-util/src/skip.rs b/lexical-util/src/skip.rs index 5b208702..2b45dafe 100644 --- a/lexical-util/src/skip.rs +++ b/lexical-util/src/skip.rs @@ -43,11 +43,10 @@ use core::{mem, ptr}; -use crate::buffer::Buffer; use crate::digit::char_is_digit_const; use crate::format::NumberFormat; use crate::format_flags as flags; -use crate::iterator::BytesIter; +use crate::iterator::{DigitsIter, Iter}; // PEEK // ---- @@ -150,7 +149,7 @@ macro_rules! peek_1 { // a non-digit separator character. Don't need any complex checks // here, since we've already done them above. let mut index = $self.byte.index + 1; - while index < $self.length() + while index < $self.buffer_length() && $self.byte.slc.get(index).map_or(false, |&x| $self.is_digit_separator(x)) { index += 1; @@ -370,7 +369,7 @@ impl<'a, const FORMAT: u128> Bytes<'a, FORMAT> { /// # Safety /// This is safe if and only if the index is <= slc.len(). /// For this reason, since it's easy to get wrong, we only - /// expose it to our `BytesIterator`s and nothing else. + /// expose it to our `DigitsIterator`s and nothing else. /// /// This is only ever used for contiguous arrays. #[inline(always)] @@ -384,40 +383,7 @@ impl<'a, const FORMAT: u128> Bytes<'a, FORMAT> { } } - /// Get the total number of elements in the underlying slice. - #[inline(always)] - pub fn length(&self) -> usize { - self.slc.len() - } - - /// Get the current index of the iterator in the slice. - #[inline(always)] - pub fn cursor(&self) -> usize { - self.index - } - - /// Set the current index of the iterator in the slice. - /// - /// # Safety - /// - /// Safe if `index <= self.length()`. - #[inline(always)] - pub unsafe fn set_cursor(&mut self, index: usize) { - debug_assert!(index <= self.length()); - self.index = index - } - - /// Get the current number of values returned by the iterator. - #[inline(always)] - pub fn current_count(&self) -> usize { - // If the buffer is contiguous, then we don't need to track the - // number of values: the current index is enough. - if Self::IS_CONTIGUOUS { - self.index - } else { - self.count - } - } + // TODO: Move this to our iter trait /// Get if the buffer underlying the iterator is empty. /// @@ -431,57 +397,10 @@ impl<'a, const FORMAT: u128> Bytes<'a, FORMAT> { self.index >= self.slc.len() } - // Determine if the abstraction is contiguous. - #[inline(always)] - pub fn is_contiguous(&self) -> bool { - Self::IS_CONTIGUOUS - } - - /// Read a value of a difference type from the iterator. - /// This advances the internal state of the iterator. - /// - /// # Safety - /// - /// Safe as long as the number of the buffer is contains as least as - /// many bytes as the size of V, and V is valid for all bit patterns. - #[inline(always)] - pub unsafe fn read_unchecked(&self) -> V { - debug_assert!(Self::IS_CONTIGUOUS); - debug_assert!(self.as_slice().len() >= mem::size_of::()); - - let slc = self.as_slice(); - // SAFETY: safe as long as the slice has at least count elements. - unsafe { ptr::read_unaligned::(slc.as_ptr() as *const _) } - } - - /// Try to read a the next four bytes as a u32. - /// This advances the internal state of the iterator. - #[inline(always)] - fn read_u32(&self) -> Option { - if Self::IS_CONTIGUOUS && self.as_slice().len() >= mem::size_of::() { - // SAFETY: safe since we've guaranteed the buffer is greater than - // the number of elements read. u32 is valid for all bit patterns - unsafe { Some(self.read_unchecked()) } - } else { - None - } - } - /// Try to read a the next four bytes as a u32. - /// This advances the internal state of the iterator. - #[inline(always)] - fn read_u64(&self) -> Option { - if Self::IS_CONTIGUOUS && self.as_slice().len() >= mem::size_of::() { - // SAFETY: safe since we've guaranteed the buffer is greater than - // the number of elements read. u64 is valid for all bit patterns - unsafe { Some(self.read_unchecked()) } - } else { - None - } - } - /// Check if the next element is a given value. + // TODO: Remove the peek methods, these shouldn't be on `bytes`. #[inline(always)] - pub fn peek_is(&mut self, value: u8) -> bool { + pub fn peek_is_cased(&mut self, value: u8) -> bool { // Don't assert not a digit separator, since this can occur when // a different component does not allow digit separators there. if let Some(&c) = self.first() { @@ -491,58 +410,90 @@ impl<'a, const FORMAT: u128> Bytes<'a, FORMAT> { } } - /// Check if the next element is a given value without case sensitivity. - #[inline(always)] - pub fn case_insensitive_peek_is(&mut self, value: u8) -> bool { - // Don't assert not a digit separator, since this can occur when - // a different component does not allow digit separators there. - if let Some(&c) = self.first() { - c.to_ascii_lowercase() == value.to_ascii_lowercase() - } else { - false - } - } - /// Get iterator over integer digits. #[inline(always)] - pub fn integer_iter<'b>(&'b mut self) -> IntegerBytesIterator<'a, 'b, FORMAT> { - IntegerBytesIterator { + pub fn integer_iter<'b>(&'b mut self) -> IntegerDigitsIterator<'a, 'b, FORMAT> { + IntegerDigitsIterator { byte: self, } } /// Get iterator over fraction digits. #[inline(always)] - pub fn fraction_iter<'b>(&'b mut self) -> FractionBytesIterator<'a, 'b, FORMAT> { - FractionBytesIterator { + pub fn fraction_iter<'b>(&'b mut self) -> FractionDigitsIterator<'a, 'b, FORMAT> { + FractionDigitsIterator { byte: self, } } /// Get iterator over exponent digits. #[inline(always)] - pub fn exponent_iter<'b>(&'b mut self) -> ExponentBytesIterator<'a, 'b, FORMAT> { - ExponentBytesIterator { + pub fn exponent_iter<'b>(&'b mut self) -> ExponentDigitsIterator<'a, 'b, FORMAT> { + ExponentDigitsIterator { byte: self, } } /// Get iterator over special floating point values. #[inline(always)] - pub fn special_iter<'b>(&'b mut self) -> SpecialBytesIterator<'a, 'b, FORMAT> { - SpecialBytesIterator { + pub fn special_iter<'b>(&'b mut self) -> SpecialDigitsIterator<'a, 'b, FORMAT> { + SpecialDigitsIterator { byte: self, } } +} + +unsafe impl<'a, const FORMAT: u128> Iter<'a> for Bytes<'a, FORMAT> { + /// If each yielded value is adjacent in memory. + const IS_CONTIGUOUS: bool = NumberFormat::<{ FORMAT }>::DIGIT_SEPARATOR == 0; - /// Advance the byte by `N` elements. + #[inline(always)] + fn get_buffer(&self) -> &'a [u8] { + self.slc + } + + /// Get the current index of the iterator in the slice. + #[inline(always)] + fn cursor(&self) -> usize { + self.index + } + + /// Set the current index of the iterator in the slice. /// /// # Safety /// - /// As long as the iterator is at least `N` elements, this - /// is safe. + /// Safe if `index <= self.buffer_length()`. #[inline(always)] - pub unsafe fn step_by_unchecked(&mut self, count: usize) { + unsafe fn set_cursor(&mut self, index: usize) { + debug_assert!(index <= self.buffer_length()); + self.index = index + } + + /// Get the current number of values returned by the iterator. + #[inline(always)] + fn current_count(&self) -> usize { + // If the buffer is contiguous, then we don't need to track the + // number of values: the current index is enough. + if Self::IS_CONTIGUOUS { + self.index + } else { + self.count + } + } + + // TODO: Rename + #[inline(always)] + fn is_empty(&self) -> bool { + self.index >= self.slc.len() + } + + #[inline(always)] + fn first(&self) -> Option<&'a u8> { + self.slc.get(self.index) + } + + #[inline(always)] + unsafe fn step_by_unchecked(&mut self, count: usize) { if Self::IS_CONTIGUOUS { // Contiguous, can skip most of these checks. debug_assert!(self.as_slice().len() >= count); @@ -566,44 +517,14 @@ impl<'a, const FORMAT: u128> Bytes<'a, FORMAT> { } } - /// Advance the byte by 1 element. - /// - /// # Safety - /// - /// Safe as long as the iterator is not empty. #[inline(always)] - pub unsafe fn step_unchecked(&mut self) { - debug_assert!(!self.as_slice().is_empty()); - // SAFETY: safe if `self.index < self.length()`. - unsafe { self.step_by_unchecked(1) }; - } -} - -unsafe impl<'a, const FORMAT: u128> Buffer<'a> for Bytes<'a, FORMAT> { - /// If each yielded value is adjacent in memory. - const IS_CONTIGUOUS: bool = NumberFormat::<{ FORMAT }>::DIGIT_SEPARATOR == 0; - - #[inline(always)] - fn as_ptr(&self) -> *const u8 { - self.as_slice().as_ptr() - } - - #[inline(always)] - fn as_slice(&self) -> &'a [u8] { - debug_assert!(self.index <= self.length()); - // SAFETY: safe since index must be in range. - unsafe { self.slc.get_unchecked(self.index..) } - } - - #[inline(always)] - fn is_empty(&self) -> bool { - self.index >= self.slc.len() - } + unsafe fn peek_many_unchecked(&self) -> V { + debug_assert!(Self::IS_CONTIGUOUS); + debug_assert!(self.as_slice().len() >= mem::size_of::()); - #[inline(always)] - unsafe fn first_unchecked(&self) -> &'a u8 { - // SAFETY: safe if there's at least 1 item in the buffer - unsafe { self.as_slice().get_unchecked(0) } + let slc = self.as_slice(); + // SAFETY: safe as long as the slice has at least count elements. + unsafe { ptr::read_unaligned::(slc.as_ptr() as *const _) } } } @@ -644,11 +565,11 @@ macro_rules! skip_iterator_impl { impl<'a: 'b, 'b, const FORMAT: u128> $iterator<'a, 'b, FORMAT> { is_digit_separator!(FORMAT); - /// Determine if the character is a digit. - #[inline(always)] - pub const fn is_digit(&self, value: u8) -> bool { - let format = NumberFormat::<{ FORMAT }> {}; - char_is_digit_const(value, format.$radix_cb()) + /// Create a new digits iterator from the bytes underlying item. + pub fn new(byte: &'b mut Bytes<'a, FORMAT>) -> Self { + Self { + byte, + } } /// Take the first N digits from the iterator. @@ -656,7 +577,7 @@ macro_rules! skip_iterator_impl { /// This only takes the digits if we have a contiguous iterator. /// It takes the digits, validating the bounds, and then advanced /// the iterators state. It does not support non-contiguous iterators - /// since we would lost information on the count. + /// since we would lose information on the count. #[cfg_attr(not(feature = "compact"), inline(always))] #[allow(clippy::assertions_on_constants)] pub fn take_n(&mut self, n: usize) -> Option> { @@ -704,8 +625,8 @@ macro_rules! skip_iterator_iterator_impl { }; } -/// Create base methods for the Buffer block of a skip iterator. -macro_rules! skip_iterator_buffer_base { +/// Create base methods for the Iter block of a skip iterator. +macro_rules! skip_iterator_iter_base { ($format:ident, $mask:ident) => { // It's contiguous if we don't skip over any values. // IE, the digit separator flags for the iterator over @@ -713,34 +634,8 @@ macro_rules! skip_iterator_buffer_base { const IS_CONTIGUOUS: bool = $format & flags::$mask == 0; #[inline(always)] - fn as_ptr(&self) -> *const u8 { - self.byte.as_ptr() - } - - #[inline(always)] - fn as_slice(&self) -> &'a [u8] { - self.byte.as_slice() - } - - #[inline(always)] - fn is_empty(&self) -> bool { - self.byte.is_done() - } - - #[inline(always)] - unsafe fn first_unchecked(&self) -> ::Item { - // SAFETY: safe if `self.cursor() < self.length()`. - unsafe { self.byte.slc.get_unchecked(self.byte.index) } - } - }; -} - -/// Create base methods for the BytesIter block of a skip iterator. -macro_rules! skip_iterator_bytesiter_base { - () => { - #[inline(always)] - fn length(&self) -> usize { - self.byte.length() + fn get_buffer(&self) -> &'a [u8] { + self.byte.get_buffer() } #[inline(always)] @@ -750,58 +645,53 @@ macro_rules! skip_iterator_bytesiter_base { #[inline(always)] unsafe fn set_cursor(&mut self, index: usize) { - debug_assert!(index <= self.length()); - // SAFETY: safe if `index <= self.length()`. + debug_assert!(index <= self.buffer_length()); + // SAFETY: safe if `index <= self.buffer_length()`. unsafe { self.byte.set_cursor(index) }; } - #[inline(always)] - fn as_full_slice(&self) -> &'a [u8] { - self.byte.slc - } - #[inline(always)] fn current_count(&self) -> usize { self.byte.current_count() } #[inline(always)] - fn is_consumed(&mut self) -> bool { - self.peek().is_none() + fn is_empty(&self) -> bool { + self.byte.is_done() } #[inline(always)] - fn is_done(&self) -> bool { - self.byte.is_done() + fn first(&self) -> Option<&'a u8> { + self.byte.first() } #[inline(always)] - unsafe fn peek_unchecked(&mut self) -> ::Item { - self.peek().unwrap() + unsafe fn step_by_unchecked(&mut self, count: usize) { + debug_assert!(self.as_slice().len() >= count); + // SAFETY: safe as long as `slc.len() >= count`. + unsafe { self.byte.step_by_unchecked(count) } } #[inline(always)] - unsafe fn read_unchecked(&self) -> V { + unsafe fn peek_many_unchecked(&self) -> V { debug_assert!(self.as_slice().len() >= mem::size_of::()); // SAFETY: safe as long as the slice has at least count elements. - unsafe { self.byte.read_unchecked() } - } - - #[inline(always)] - fn read_u32(&self) -> Option { - self.byte.read_u32() + unsafe { self.byte.peek_many_unchecked() } } + }; +} +/// Create base methods for the DigitsIter block of a skip iterator. +macro_rules! skip_iterator_digits_iter_base { + () => { #[inline(always)] - fn read_u64(&self) -> Option { - self.byte.read_u64() + fn is_consumed(&mut self) -> bool { + self.peek().is_none() } #[inline(always)] - unsafe fn step_by_unchecked(&mut self, count: usize) { - debug_assert!(self.as_slice().len() >= count); - // SAFETY: safe as long as `slc.len() >= count`. - unsafe { self.byte.step_by_unchecked(count) } + fn is_done(&self) -> bool { + self.byte.is_done() } }; } @@ -809,12 +699,12 @@ macro_rules! skip_iterator_bytesiter_base { /// Create impl ByteIter block for skip iterator. macro_rules! skip_iterator_bytesiter_impl { ($iterator:ident, $mask:ident, $i:ident, $l:ident, $t:ident, $c:ident) => { - unsafe impl<'a: 'b, 'b, const FORMAT: u128> Buffer<'a> for $iterator<'a, 'b, FORMAT> { - skip_iterator_buffer_base!(FORMAT, $mask); + unsafe impl<'a: 'b, 'b, const FORMAT: u128> Iter<'a> for $iterator<'a, 'b, FORMAT> { + skip_iterator_iter_base!(FORMAT, $mask); } - unsafe impl<'a: 'b, 'b, const FORMAT: u128> BytesIter<'a> for $iterator<'a, 'b, FORMAT> { - skip_iterator_bytesiter_base!(); + impl<'a: 'b, 'b, const FORMAT: u128> DigitsIter<'a> for $iterator<'a, 'b, FORMAT> { + skip_iterator_digits_iter_base!(); /// Peek the next value of the iterator, without consuming it. #[inline(always)] @@ -851,6 +741,13 @@ macro_rules! skip_iterator_bytesiter_impl { _ => unreachable!(), } } + + /// Determine if the character is a digit. + #[inline(always)] + fn is_digit(&self, value: u8) -> bool { + let format = NumberFormat::<{ FORMAT }> {}; + char_is_digit_const(value, format.mantissa_radix()) + } } }; } @@ -858,11 +755,11 @@ macro_rules! skip_iterator_bytesiter_impl { // INTEGER DIGITS ITERATOR // ----------------------- -skip_iterator!(IntegerBytesIterator, "Iterator that skips over digit separators in the integer."); -skip_iterator_impl!(IntegerBytesIterator, mantissa_radix); -skip_iterator_iterator_impl!(IntegerBytesIterator); +skip_iterator!(IntegerDigitsIterator, "Iterator that skips over digit separators in the integer."); +skip_iterator_impl!(IntegerDigitsIterator, mantissa_radix); +skip_iterator_iterator_impl!(IntegerDigitsIterator); skip_iterator_bytesiter_impl!( - IntegerBytesIterator, + IntegerDigitsIterator, INTEGER_DIGIT_SEPARATOR_FLAG_MASK, INTEGER_INTERNAL_DIGIT_SEPARATOR, INTEGER_LEADING_DIGIT_SEPARATOR, @@ -873,11 +770,14 @@ skip_iterator_bytesiter_impl!( // FRACTION DIGITS ITERATOR // ------------------------ -skip_iterator!(FractionBytesIterator, "Iterator that skips over digit separators in the fraction."); -skip_iterator_impl!(FractionBytesIterator, mantissa_radix); -skip_iterator_iterator_impl!(FractionBytesIterator); +skip_iterator!( + FractionDigitsIterator, + "Iterator that skips over digit separators in the fraction." +); +skip_iterator_impl!(FractionDigitsIterator, mantissa_radix); +skip_iterator_iterator_impl!(FractionDigitsIterator); skip_iterator_bytesiter_impl!( - FractionBytesIterator, + FractionDigitsIterator, FRACTION_DIGIT_SEPARATOR_FLAG_MASK, FRACTION_INTERNAL_DIGIT_SEPARATOR, FRACTION_LEADING_DIGIT_SEPARATOR, @@ -888,11 +788,14 @@ skip_iterator_bytesiter_impl!( // EXPONENT DIGITS ITERATOR // ------------------------ -skip_iterator!(ExponentBytesIterator, "Iterator that skips over digit separators in the exponent."); -skip_iterator_impl!(ExponentBytesIterator, exponent_radix); -skip_iterator_iterator_impl!(ExponentBytesIterator); +skip_iterator!( + ExponentDigitsIterator, + "Iterator that skips over digit separators in the exponent." +); +skip_iterator_impl!(ExponentDigitsIterator, exponent_radix); +skip_iterator_iterator_impl!(ExponentDigitsIterator); skip_iterator_bytesiter_impl!( - ExponentBytesIterator, + ExponentDigitsIterator, EXPONENT_DIGIT_SEPARATOR_FLAG_MASK, EXPONENT_INTERNAL_DIGIT_SEPARATOR, EXPONENT_LEADING_DIGIT_SEPARATOR, @@ -904,21 +807,21 @@ skip_iterator_bytesiter_impl!( // ----------------------- skip_iterator!( - SpecialBytesIterator, + SpecialDigitsIterator, "Iterator that skips over digit separators in special floats." ); -skip_iterator_iterator_impl!(SpecialBytesIterator); +skip_iterator_iterator_impl!(SpecialDigitsIterator); -impl<'a: 'b, 'b, const FORMAT: u128> SpecialBytesIterator<'a, 'b, FORMAT> { +impl<'a: 'b, 'b, const FORMAT: u128> SpecialDigitsIterator<'a, 'b, FORMAT> { is_digit_separator!(FORMAT); } -unsafe impl<'a: 'b, 'b, const FORMAT: u128> Buffer<'a> for SpecialBytesIterator<'a, 'b, FORMAT> { - skip_iterator_buffer_base!(FORMAT, SPECIAL_DIGIT_SEPARATOR); +unsafe impl<'a: 'b, 'b, const FORMAT: u128> Iter<'a> for SpecialDigitsIterator<'a, 'b, FORMAT> { + skip_iterator_iter_base!(FORMAT, SPECIAL_DIGIT_SEPARATOR); } -unsafe impl<'a: 'b, 'b, const FORMAT: u128> BytesIter<'a> for SpecialBytesIterator<'a, 'b, FORMAT> { - skip_iterator_bytesiter_base!(); +impl<'a: 'b, 'b, const FORMAT: u128> DigitsIter<'a> for SpecialDigitsIterator<'a, 'b, FORMAT> { + skip_iterator_digits_iter_base!(); /// Peek the next value of the iterator, without consuming it. #[inline(always)] @@ -930,4 +833,11 @@ unsafe impl<'a: 'b, 'b, const FORMAT: u128> BytesIter<'a> for SpecialBytesIterat peek_noskip!(self) } } + + /// Determine if the character is a digit. + #[inline(always)] + fn is_digit(&self, value: u8) -> bool { + let format = NumberFormat::<{ FORMAT }> {}; + char_is_digit_const(value, format.mantissa_radix()) + } } diff --git a/lexical-util/tests/iterator_tests.rs b/lexical-util/tests/iterator_tests.rs index 5d110ac9..44ca4a61 100644 --- a/lexical-util/tests/iterator_tests.rs +++ b/lexical-util/tests/iterator_tests.rs @@ -1,7 +1,6 @@ #![cfg(feature = "parse")] -use lexical_util::buffer::Buffer; -use lexical_util::iterator::{AsBytes, Bytes, BytesIter}; +use lexical_util::iterator::{AsBytes, Bytes, DigitsIter, Iter}; #[test] #[cfg(not(feature = "format"))] @@ -22,17 +21,17 @@ fn digits_iterator_test() { assert_eq!(iter.as_ptr(), digits.as_ptr()); assert_eq!(iter.is_consumed(), false); assert_eq!(iter.is_done(), false); - assert_eq!(u32::from_le(iter.read_u32().unwrap()), 0x34333231); - assert_eq!(iter.length(), 5); + assert_eq!(u32::from_le(iter.peek_u32().unwrap()), 0x34333231); + assert_eq!(iter.buffer_length(), 5); assert_eq!(iter.cursor(), 0); assert_eq!(iter.current_count(), 0); unsafe { iter.step_by_unchecked(4); } - assert_eq!(iter.length(), 5); + assert_eq!(iter.buffer_length(), 5); assert_eq!(iter.cursor(), 4); assert_eq!(iter.current_count(), 4); - assert_eq!(unsafe { iter.peek_unchecked() }, &b'5'); + assert_eq!(iter.peek(), Some(&b'5')); assert_eq!(iter.peek(), Some(&b'5')); assert_eq!(iter.next(), Some(&b'5')); assert_eq!(iter.peek(), None); @@ -40,7 +39,7 @@ fn digits_iterator_test() { let mut byte = digits.bytes::<{ STANDARD }>(); let mut iter = byte.integer_iter(); - assert_eq!(iter.read_u64(), None); + assert_eq!(iter.peek_u64(), None); assert_eq!(iter.nth(4).unwrap(), &b'5'); assert_eq!(iter.as_slice(), &digits[digits.len()..]); assert_eq!(iter.as_ptr(), digits[digits.len()..].as_ptr()); @@ -85,7 +84,7 @@ fn skip_iterator_test() { assert_eq!(iter.as_ptr(), digits.as_ptr()); assert_eq!(iter.is_consumed(), false); assert_eq!(iter.is_done(), false); - assert_eq!(iter.length(), 6); + assert_eq!(iter.buffer_length(), 6); assert_eq!(iter.cursor(), 0); assert_eq!(iter.current_count(), 0); unsafe { iter.step_unchecked() }; @@ -97,7 +96,7 @@ fn skip_iterator_test() { let mut byte = digits.bytes::<{ FORMAT }>(); let mut iter = byte.integer_iter(); - assert_eq!(unsafe { iter.peek_unchecked() }, &b'1'); + assert_eq!(iter.peek(), Some(&b'1')); assert_eq!(iter.peek(), Some(&b'1')); assert_eq!(iter.next(), Some(&b'1')); assert_eq!(iter.next(), Some(&b'2')); diff --git a/lexical-write-integer/src/algorithm.rs b/lexical-write-integer/src/algorithm.rs index bf4d07aa..9c8f88cf 100644 --- a/lexical-write-integer/src/algorithm.rs +++ b/lexical-write-integer/src/algorithm.rs @@ -16,6 +16,8 @@ use lexical_util::format::{radix_from_flags, NumberFormat}; use lexical_util::num::{AsCast, UnsignedInteger}; use lexical_util::step::u64_step; +use crate::decimal::DigitCount; + /// Write 2 digits to buffer. /// /// # Safety @@ -78,7 +80,7 @@ macro_rules! write_digit { /// See [algorithm] and the [crate] documentation for more detailed /// information on the safety considerations. #[inline(always)] -unsafe fn write_digits( +unsafe fn write_digits( mut value: T, radix: u32, table: &[u8], @@ -86,6 +88,10 @@ unsafe fn write_digits( mut index: usize, ) -> usize { debug_assert_radix(radix); + debug_assert!( + buffer.len() >= value.digit_count(), + "buffer must at least be as the digit count" + ); // Pre-compute our powers of radix. let radix = T::from_u32(radix); @@ -147,7 +153,7 @@ unsafe fn write_digits( /// This is safe as long as the buffer is large enough to hold `T::MAX` /// digits in radix `N`. See [algorithm] for more safety considerations. #[inline(always)] -unsafe fn write_step_digits( +unsafe fn write_step_digits( value: T, radix: u32, table: &[u8], @@ -163,10 +169,8 @@ unsafe fn write_step_digits( // Write the remaining 0 bytes. // SAFETY: this is always safe since `end < index && index < start`. let end = start.saturating_sub(step); - unsafe { - let zeros = &mut index_unchecked_mut!(buffer[end..index]); - slice_fill_unchecked!(zeros, b'0'); - } + let zeros = unsafe { &mut index_unchecked_mut!(buffer[end..index]) }; + zeros.fill(b'0'); end } @@ -182,25 +186,28 @@ unsafe fn write_step_digits( /// /// See the crate [crate] documentation for more security considerations. /// -/// `write_digits` internally -/// /// [digit_count]: crate::decimal::DigitCount #[inline(always)] -pub unsafe fn algorithm(value: T, radix: u32, table: &[u8], buffer: &mut [u8]) -> usize +pub fn algorithm(value: T, radix: u32, table: &[u8], buffer: &mut [u8]) -> usize where - T: UnsignedInteger, + T: UnsignedInteger + DigitCount, { // This is so that radix^4 does not overflow, since 36^4 overflows a u16. assert!(T::BITS >= 32, "Must have at least 32 bits in the input."); assert!(radix <= 36, "radix must be <= 36"); assert!(table.len() >= (radix * radix * 2) as usize, "table must be 2 * radix^2 long"); + let count = value.digit_count(); + assert!(count <= buffer.len()); + let buffer = &mut buffer[..count]; + // SAFETY: Both forms of unchecked indexing cannot overflow. // The table always has 2*radix^2 elements, so it must be a legal index. // The buffer is ensured to have at least `FORMATTED_SIZE` or // `FORMATTED_SIZE_DECIMAL` characters, which is the maximum number of // digits an integer of that size may write. - unsafe { write_digits(value, radix, table, buffer, buffer.len()) } + unsafe { write_digits(value, radix, table, buffer, buffer.len()) }; + count } /// Optimized implementation for radix-N 128-bit numbers. @@ -216,7 +223,7 @@ where /// /// [digit_count]: crate::decimal::DigitCount #[inline(always)] -pub unsafe fn algorithm_u128( +pub fn algorithm_u128( value: u128, table: &[u8], buffer: &mut [u8], @@ -225,6 +232,10 @@ pub unsafe fn algorithm_u128 {}.is_valid()); + let count = value.digit_count(); + assert!(count <= buffer.len()); + let buffer = &mut buffer[..count]; + // Quick approximations to make the algorithm **a lot** faster. // If the value can be represented in a 64-bit integer, we can // do this as a native integer. @@ -254,15 +265,16 @@ pub unsafe fn algorithm_u128 usize { - // SAFETY: safe since we've guaranteed the buffer is large enough to hold the max value. - let count = self.digit_count(); - assert!(count <= buffer.len()); - unsafe { - algorithm(self, 10, &DIGIT_TO_BASE10_SQUARED, &mut buffer[..count]); - count - } + algorithm(self, 10, &DIGIT_TO_BASE10_SQUARED, buffer) } } )*); @@ -276,17 +270,10 @@ decimal_impl! { u32 u64 } impl Decimal for u128 { #[inline(always)] fn decimal(self, buffer: &mut [u8]) -> usize { - // SAFETY: safe since we've guaranteed the buffer is large enough to hold the - // max value. - let count = self.digit_count(); - assert!(count <= buffer.len()); - unsafe { - algorithm_u128::<{ STANDARD }, { RADIX }, { RADIX_SHIFT }>( - self, - &DIGIT_TO_BASE10_SQUARED, - &mut buffer[..count], - ); - count - } + algorithm_u128::<{ STANDARD }, { RADIX }, { RADIX_SHIFT }>( + self, + &DIGIT_TO_BASE10_SQUARED, + buffer, + ) } } diff --git a/lexical-write-integer/src/index.rs b/lexical-write-integer/src/index.rs index 52bcd48d..cb30e63f 100644 --- a/lexical-write-integer/src/index.rs +++ b/lexical-write-integer/src/index.rs @@ -22,13 +22,3 @@ macro_rules! index_unchecked_mut { *$x.get_unchecked_mut($i) = *$y.get_unchecked($j) }; } - -/// Fill a slice with a value, without bounds checking. -macro_rules! slice_fill_unchecked { - ($slc:expr, $value:expr) => { - // Get the length first to avoid stacked borrows, since we might - // have a complex expression for the slice calculation. - let len = $slc.len(); - core::ptr::write_bytes($slc.as_mut_ptr(), $value, len) - }; -}