diff --git a/README.md b/README.md index 00d547f1..ccfa96b0 100644 --- a/README.md +++ b/README.md @@ -232,9 +232,9 @@ These builtins are needed to support 128-bit integers. These builtins are needed to support `f16` and `f128`, which are in the process of being added to Rust. -- [ ] addtf3.c -- [ ] comparetf2.c -- [ ] divtf3.c +- [x] addtf3.c +- [x] comparetf2.c +- [x] divtf3.c - [x] extenddftf2.c - [x] extendhfsf2.c - [x] extendhftf2.c @@ -249,13 +249,13 @@ These builtins are needed to support `f16` and `f128`, which are in the process - [ ] floatsitf.c - [ ] floatunditf.c - [ ] floatunsitf.c -- [ ] multf3.c +- [x] multf3.c - [ ] powitf2.c - [ ] ppc/fixtfdi.c - [ ] ppc/fixunstfdi.c - [ ] ppc/floatditf.c - [ ] ppc/floatunditf.c -- [ ] subtf3.c +- [x] subtf3.c - [x] truncdfhf2.c - [x] truncsfhf2.c - [x] trunctfdf2.c diff --git a/build.rs b/build.rs index bafbf75d..1229fb2a 100644 --- a/build.rs +++ b/build.rs @@ -479,10 +479,6 @@ mod c { ("__floatsitf", "floatsitf.c"), ("__floatunditf", "floatunditf.c"), ("__floatunsitf", "floatunsitf.c"), - ("__addtf3", "addtf3.c"), - ("__multf3", "multf3.c"), - ("__subtf3", "subtf3.c"), - ("__divtf3", "divtf3.c"), ("__powitf2", "powitf2.c"), ("__fe_getround", "fp_mode.c"), ("__fe_raise_inexact", "fp_mode.c"), @@ -500,30 +496,22 @@ mod c { if target_arch == "mips64" { sources.extend(&[ ("__netf2", "comparetf2.c"), - ("__addtf3", "addtf3.c"), - ("__multf3", "multf3.c"), - ("__subtf3", "subtf3.c"), ("__fixtfsi", "fixtfsi.c"), ("__floatsitf", "floatsitf.c"), ("__fixunstfsi", "fixunstfsi.c"), ("__floatunsitf", "floatunsitf.c"), ("__fe_getround", "fp_mode.c"), - ("__divtf3", "divtf3.c"), ]); } if target_arch == "loongarch64" { sources.extend(&[ ("__netf2", "comparetf2.c"), - ("__addtf3", "addtf3.c"), - ("__multf3", "multf3.c"), - ("__subtf3", "subtf3.c"), ("__fixtfsi", "fixtfsi.c"), ("__floatsitf", "floatsitf.c"), ("__fixunstfsi", "fixunstfsi.c"), ("__floatunsitf", "floatunsitf.c"), ("__fe_getround", "fp_mode.c"), - ("__divtf3", "divtf3.c"), ]); } diff --git a/src/float/add.rs b/src/float/add.rs index 97f73e2f..cf1d4136 100644 --- a/src/float/add.rs +++ b/src/float/add.rs @@ -1,5 +1,5 @@ use crate::float::Float; -use crate::int::{CastInto, Int}; +use crate::int::{CastInto, Int, MinInt}; /// Returns `a + b` fn add(a: F, b: F) -> F @@ -57,9 +57,9 @@ where } // zero + anything = anything - if a_abs == Int::ZERO { + if a_abs == MinInt::ZERO { // but we need to get the sign right for zero + zero - if b_abs == Int::ZERO { + if b_abs == MinInt::ZERO { return F::from_repr(a.repr() & b.repr()); } else { return b; @@ -67,7 +67,7 @@ where } // anything + zero = anything - if b_abs == Int::ZERO { + if b_abs == MinInt::ZERO { return a; } } @@ -113,10 +113,10 @@ where // Shift the significand of b by the difference in exponents, with a sticky // bottom bit to get rounding correct. let align = a_exponent.wrapping_sub(b_exponent).cast(); - if align != Int::ZERO { + if align != MinInt::ZERO { if align < bits { let sticky = - F::Int::from_bool(b_significand << bits.wrapping_sub(align).cast() != Int::ZERO); + F::Int::from_bool(b_significand << bits.wrapping_sub(align).cast() != MinInt::ZERO); b_significand = (b_significand >> align.cast()) | sticky; } else { b_significand = one; // sticky; b is known to be non-zero. @@ -125,8 +125,8 @@ where if subtraction { a_significand = a_significand.wrapping_sub(b_significand); // If a == -b, return +zero. 
- if a_significand == Int::ZERO { - return F::from_repr(Int::ZERO); + if a_significand == MinInt::ZERO { + return F::from_repr(MinInt::ZERO); } // If partial cancellation occured, we need to left-shift the result @@ -143,8 +143,8 @@ where // If the addition carried up, we need to right-shift the result and // adjust the exponent: - if a_significand & implicit_bit << 4 != Int::ZERO { - let sticky = F::Int::from_bool(a_significand & one != Int::ZERO); + if a_significand & implicit_bit << 4 != MinInt::ZERO { + let sticky = F::Int::from_bool(a_significand & one != MinInt::ZERO); a_significand = a_significand >> 1 | sticky; a_exponent += 1; } @@ -160,7 +160,7 @@ where // need to shift the significand. let shift = (1 - a_exponent).cast(); let sticky = - F::Int::from_bool((a_significand << bits.wrapping_sub(shift).cast()) != Int::ZERO); + F::Int::from_bool((a_significand << bits.wrapping_sub(shift).cast()) != MinInt::ZERO); a_significand = a_significand >> shift.cast() | sticky; a_exponent = 0; } @@ -203,6 +203,11 @@ intrinsics! { add(a, b) } + #[cfg(not(feature = "no-f16-f128"))] + pub extern "C" fn __addtf3(a: f128, b: f128) -> f128 { + add(a, b) + } + #[cfg(target_arch = "arm")] pub extern "C" fn __addsf3vfp(a: f32, b: f32) -> f32 { a + b diff --git a/src/float/cmp.rs b/src/float/cmp.rs index 1c8917af..36086969 100644 --- a/src/float/cmp.rs +++ b/src/float/cmp.rs @@ -1,7 +1,7 @@ #![allow(unreachable_code)] use crate::float::Float; -use crate::int::Int; +use crate::int::MinInt; #[derive(Clone, Copy)] enum Result { @@ -172,6 +172,44 @@ intrinsics! { } } +#[cfg(not(feature = "no-f16-f128"))] +intrinsics! { + #[avr_skip] + pub extern "C" fn __letf2(a: f128, b: f128) -> i32 { + cmp(a, b).to_le_abi() + } + + #[avr_skip] + pub extern "C" fn __getf2(a: f128, b: f128) -> i32 { + cmp(a, b).to_ge_abi() + } + + #[avr_skip] + pub extern "C" fn __unordtf2(a: f128, b: f128) -> i32 { + unord(a, b) as i32 + } + + #[avr_skip] + pub extern "C" fn __eqtf2(a: f128, b: f128) -> i32 { + cmp(a, b).to_le_abi() + } + + #[avr_skip] + pub extern "C" fn __lttf2(a: f128, b: f128) -> i32 { + cmp(a, b).to_le_abi() + } + + #[avr_skip] + pub extern "C" fn __netf2(a: f128, b: f128) -> i32 { + cmp(a, b).to_le_abi() + } + + #[avr_skip] + pub extern "C" fn __gttf2(a: f128, b: f128) -> i32 { + cmp(a, b).to_ge_abi() + } +} + #[cfg(target_arch = "arm")] intrinsics! 
{ pub extern "aapcs" fn __aeabi_fcmple(a: f32, b: f32) -> i32 { diff --git a/src/float/div.rs b/src/float/div.rs index d587fe4f..75af759f 100644 --- a/src/float/div.rs +++ b/src/float/div.rs @@ -3,7 +3,65 @@ #![allow(clippy::needless_return)] use crate::float::Float; -use crate::int::{CastInto, DInt, HInt, Int}; +use crate::int::{CastInto, DInt, HInt, Int, MinInt}; + +use super::HalfRep; + +/// Configuration for division on the 64-bit implementation +trait DivIterations64: Float +where + u16: CastInto, +{ + const NUMBER_OF_HALF_ITERATIONS: usize; + const NUMBER_OF_FULL_ITERATIONS: usize; + const USE_NATIVE_FULL_ITERATIONS: bool = false; + + fn reciprocal_precision() -> Self::Int { + let precision: u16 = const { + if Self::BITS == 32 + && Self::NUMBER_OF_HALF_ITERATIONS == 2 + && Self::NUMBER_OF_FULL_ITERATIONS == 1 + { + 74 + } else if Self::BITS == 32 + && Self::NUMBER_OF_HALF_ITERATIONS == 0 + && Self::NUMBER_OF_FULL_ITERATIONS == 3 + { + 10 + } else if Self::BITS == 64 + && Self::NUMBER_OF_HALF_ITERATIONS == 3 + && Self::NUMBER_OF_FULL_ITERATIONS == 1 + { + 220 + } else if Self::BITS == 128 + && Self::NUMBER_OF_HALF_ITERATIONS == 4 + && Self::NUMBER_OF_FULL_ITERATIONS == 1 + { + 13922 + } else { + panic!("invalid iterations for the specified bits"); + } + }; + + precision.cast() + } +} + +impl DivIterations64 for f32 { + const NUMBER_OF_HALF_ITERATIONS: usize = 0; + const NUMBER_OF_FULL_ITERATIONS: usize = 3; +} + +impl DivIterations64 for f64 { + const NUMBER_OF_HALF_ITERATIONS: usize = 3; + const NUMBER_OF_FULL_ITERATIONS: usize = 1; +} + +#[cfg(not(feature = "no-f16-f128"))] +impl DivIterations64 for f128 { + const NUMBER_OF_HALF_ITERATIONS: usize = 4; + const NUMBER_OF_FULL_ITERATIONS: usize = 1; +} fn div32(a: F, b: F) -> F where @@ -37,6 +95,11 @@ where let quiet_bit = implicit_bit >> 1; let qnan_rep = exponent_mask | quiet_bit; + // #[inline(always)] + // fn negate(a: T) -> T { + // T::wrapping_neg(a.signe) + // } + #[inline(always)] fn negate_u32(a: u32) -> u32 { (::wrapping_neg(a as i32)) as u32 @@ -454,24 +517,26 @@ where fn div64(a: F, b: F) -> F where - u32: CastInto, + F: DivIterations64, F::Int: CastInto, - i32: CastInto, F::Int: CastInto, - u64: CastInto, + F::Int: CastInto>, + F::Int: From>, + F::Int: From, F::Int: CastInto, - i64: CastInto, F::Int: CastInto, - F::Int: HInt, + F::Int: HInt + DInt, + u16: CastInto, + i32: CastInto, + i64: CastInto, + u32: CastInto, + u64: CastInto, + u64: CastInto>, { - const NUMBER_OF_HALF_ITERATIONS: usize = 3; - const NUMBER_OF_FULL_ITERATIONS: usize = 1; - const USE_NATIVE_FULL_ITERATIONS: bool = false; - let one = F::Int::ONE; let zero = F::Int::ZERO; let hw = F::BITS / 2; - let lo_mask = u64::MAX >> hw; + let lo_mask = F::Int::MAX >> hw; let significand_bits = F::SIGNIFICAND_BITS; let max_exponent = F::EXPONENT_MAX; @@ -614,10 +679,11 @@ where // rounding, so error estimations have to be computed taking the selected // mode into account! - let mut x_uq0 = if NUMBER_OF_HALF_ITERATIONS > 0 { + let mut x_uq0 = if F::NUMBER_OF_HALF_ITERATIONS > 0 { // Starting with (n-1) half-width iterations - let b_uq1_hw: u32 = - (CastInto::::cast(b_significand) >> (significand_bits + 1 - hw)) as u32; + let b_uq1_hw: HalfRep = CastInto::>::cast( + CastInto::::cast(b_significand) >> (significand_bits + 1 - hw), + ); // C is (3/4 + 1/sqrt(2)) - 1 truncated to W0 fractional bits as UQ0.HW // with W0 being either 16 or 32 and W0 <= HW. @@ -625,12 +691,13 @@ where // b/2 is subtracted to obtain x0) wrapped to [0, 1) range. // HW is at least 32. 
Shifting into the highest bits if needed. - let c_hw = (0x7504F333_u64 as u32).wrapping_shl(hw.wrapping_sub(32)); + let c_hw = (CastInto::>::cast(0x7504F333_u64)).wrapping_shl(hw.wrapping_sub(32)); // b >= 1, thus an upper bound for 3/4 + 1/sqrt(2) - b/2 is about 0.9572, // so x0 fits to UQ0.HW without wrapping. - let x_uq0_hw: u32 = { - let mut x_uq0_hw: u32 = c_hw.wrapping_sub(b_uq1_hw /* exact b_hw/2 as UQ0.HW */); + let x_uq0_hw: HalfRep = { + let mut x_uq0_hw: HalfRep = + c_hw.wrapping_sub(b_uq1_hw /* exact b_hw/2 as UQ0.HW */); // dbg!(x_uq0_hw); // An e_0 error is comprised of errors due to // * x0 being an inherently imprecise first approximation of 1/b_hw @@ -653,7 +720,7 @@ where // On (1/b, 1], g(x) > 0 <=> f(x) < x // // For half-width iterations, b_hw is used instead of b. - for _ in 0..NUMBER_OF_HALF_ITERATIONS { + for _ in 0..F::NUMBER_OF_HALF_ITERATIONS { // corr_UQ1_hw can be **larger** than 2 - b_hw*x by at most 1*Ulp // of corr_UQ1_hw. // "0.0 - (...)" is equivalent to "2.0 - (...)" in UQ1.(HW-1). @@ -661,8 +728,9 @@ where // no overflow occurred earlier: ((rep_t)x_UQ0_hw * b_UQ1_hw >> HW) is // expected to be strictly positive because b_UQ1_hw has its highest bit set // and x_UQ0_hw should be rather large (it converges to 1/2 < 1/b_hw <= 1). - let corr_uq1_hw: u32 = - 0.wrapping_sub(((x_uq0_hw as u64).wrapping_mul(b_uq1_hw as u64)) >> hw) as u32; + let corr_uq1_hw: HalfRep = CastInto::>::cast(zero.wrapping_sub( + ((F::Int::from(x_uq0_hw)).wrapping_mul(F::Int::from(b_uq1_hw))) >> hw, + )); // dbg!(corr_uq1_hw); // Now, we should multiply UQ0.HW and UQ1.(HW-1) numbers, naturally @@ -677,7 +745,9 @@ where // The fact corr_UQ1_hw was virtually round up (due to result of // multiplication being **first** truncated, then negated - to improve // error estimations) can increase x_UQ0_hw by up to 2*Ulp of x_UQ0_hw. - x_uq0_hw = ((x_uq0_hw as u64).wrapping_mul(corr_uq1_hw as u64) >> (hw - 1)) as u32; + x_uq0_hw = ((F::Int::from(x_uq0_hw)).wrapping_mul(F::Int::from(corr_uq1_hw)) + >> (hw - 1)) + .cast(); // dbg!(x_uq0_hw); // Now, either no overflow occurred or x_UQ0_hw is 0 or 1 in its half_rep_t // representation. In the latter case, x_UQ0_hw will be either 0 or 1 after @@ -707,7 +777,7 @@ where // be not below that value (see g(x) above), so it is safe to decrement just // once after the final iteration. On the other hand, an effective value of // divisor changes after this point (from b_hw to b), so adjust here. - x_uq0_hw.wrapping_sub(1_u32) + x_uq0_hw.wrapping_sub(HalfRep::::ONE) }; // Error estimations for full-precision iterations are calculated just @@ -717,7 +787,7 @@ where // Simulating operations on a twice_rep_t to perform a single final full-width // iteration. Using ad-hoc multiplication implementations to take advantage // of particular structure of operands. - let blo: u64 = (CastInto::::cast(b_uq1)) & lo_mask; + let blo: F::Int = b_uq1 & lo_mask; // x_UQ0 = x_UQ0_hw * 2^HW - 1 // x_UQ0 * b_UQ1 = (x_UQ0_hw * 2^HW) * (b_UQ1_hw * 2^HW + blo) - b_UQ1 // @@ -726,19 +796,20 @@ where // + [ x_UQ0_hw * blo ] // - [ b_UQ1 ] // = [ result ][.... discarded ...] 
- let corr_uq1 = negate_u64( - (x_uq0_hw as u64) * (b_uq1_hw as u64) + (((x_uq0_hw as u64) * (blo)) >> hw) - 1, - ); // account for *possible* carry - let lo_corr = corr_uq1 & lo_mask; - let hi_corr = corr_uq1 >> hw; + let corr_uq1: F::Int = (F::Int::from(x_uq0_hw) * F::Int::from(b_uq1_hw) + + ((F::Int::from(x_uq0_hw) * blo) >> hw)) + .wrapping_sub(one) + .wrapping_neg(); // account for *possible* carry + let lo_corr: F::Int = corr_uq1 & lo_mask; + let hi_corr: F::Int = corr_uq1 >> hw; // x_UQ0 * corr_UQ1 = (x_UQ0_hw * 2^HW) * (hi_corr * 2^HW + lo_corr) - corr_UQ1 - let mut x_uq0: ::Int = ((((x_uq0_hw as u64) * hi_corr) << 1) - .wrapping_add(((x_uq0_hw as u64) * lo_corr) >> (hw - 1)) - .wrapping_sub(2)) - .cast(); // 1 to account for the highest bit of corr_UQ1 can be 1 - // 1 to account for possible carry - // Just like the case of half-width iterations but with possibility - // of overflowing by one extra Ulp of x_UQ0. + let mut x_uq0: F::Int = ((F::Int::from(x_uq0_hw) * hi_corr) << 1) + .wrapping_add((F::Int::from(x_uq0_hw) * lo_corr) >> (hw - 1)) + .wrapping_sub(F::Int::from(2u8)); + // 1 to account for the highest bit of corr_UQ1 can be 1 + // 1 to account for possible carry + // Just like the case of half-width iterations but with possibility + // of overflowing by one extra Ulp of x_UQ0. x_uq0 -= one; // ... and then traditional fixup by 2 should work @@ -755,14 +826,14 @@ where x_uq0 } else { // C is (3/4 + 1/sqrt(2)) - 1 truncated to 64 fractional bits as UQ0.n - let c: ::Int = (0x7504F333 << (F::BITS - 32)).cast(); - let x_uq0: ::Int = c.wrapping_sub(b_uq1); + let c: F::Int = (0x7504F333 << (F::BITS - 32)).cast(); + let x_uq0: F::Int = c.wrapping_sub(b_uq1); // E_0 <= 3/4 - 1/sqrt(2) + 2 * 2^-64 x_uq0 }; - let mut x_uq0 = if USE_NATIVE_FULL_ITERATIONS { - for _ in 0..NUMBER_OF_FULL_ITERATIONS { + let mut x_uq0 = if F::USE_NATIVE_FULL_ITERATIONS { + for _ in 0..F::NUMBER_OF_FULL_ITERATIONS { let corr_uq1: u64 = 0.wrapping_sub( (CastInto::::cast(x_uq0) * (CastInto::::cast(b_uq1))) >> F::BITS, ); @@ -799,14 +870,14 @@ where // Add 2 to U_N due to final decrement. - let reciprocal_precision: ::Int = 220.cast(); + let reciprocal_precision: F::Int = F::reciprocal_precision(); // Suppose 1/b - P * 2^-W < x < 1/b + P * 2^-W let x_uq0 = x_uq0 - reciprocal_precision; // Now 1/b - (2*P) * 2^-W < x < 1/b // FIXME Is x_UQ0 still >= 0.5? - let mut quotient: ::Int = x_uq0.widen_mul(a_significand << 1).hi(); + let mut quotient: F::Int = x_uq0.widen_mul(a_significand << 1).hi(); // Now, a/b - 4*P * 2^-W < q < a/b for q= in UQ1.(SB+1+W). // quotient_UQ1 is in [0.5, 2.0) as UQ1.(SB+1), @@ -868,7 +939,7 @@ where // r = a - b * q let abs_result = if written_exponent > 0 { let mut ret = quotient & significand_mask; - ret |= ((written_exponent as u64) << significand_bits).cast(); + ret |= written_exponent.cast() << significand_bits; residual <<= 1; ret } else { @@ -914,6 +985,11 @@ intrinsics! 
{ div64(a, b) } + #[cfg(not(feature = "no-f16-f128"))] + pub extern "C" fn __divtf3(a: f128, b: f128) -> f128 { + div64(a, b) + } + #[cfg(target_arch = "arm")] pub extern "C" fn __divsf3vfp(a: f32, b: f32) -> f32 { a / b diff --git a/src/float/extend.rs b/src/float/extend.rs index 7c244660..5b0c0d97 100644 --- a/src/float/extend.rs +++ b/src/float/extend.rs @@ -1,5 +1,5 @@ use crate::float::Float; -use crate::int::{CastInto, Int}; +use crate::int::{CastInto, Int, MinInt}; /// Generic conversion from a narrower to a wider IEEE-754 floating-point type fn extend(a: F) -> R diff --git a/src/float/mod.rs b/src/float/mod.rs index a82dd7d2..6e169d23 100644 --- a/src/float/mod.rs +++ b/src/float/mod.rs @@ -1,6 +1,6 @@ use core::ops; -use super::int::Int; +use crate::int::{DInt, Int, MinInt}; pub mod add; pub mod cmp; @@ -12,6 +12,9 @@ pub mod pow; pub mod sub; pub mod trunc; +/// Wrapper to extract the integer type half of the float's size +pub(crate) type HalfRep = <::Int as DInt>::H; + public_test_dep! { /// Trait for some basic operations on floats pub(crate) trait Float: @@ -59,7 +62,7 @@ pub(crate) trait Float: /// A mask for the significand const SIGNIFICAND_MASK: Self::Int; - // The implicit bit of the float format + /// The implicit bit of the float format const IMPLICIT_BIT: Self::Int; /// A mask for the exponent diff --git a/src/float/mul.rs b/src/float/mul.rs index 378fa970..fcc593d6 100644 --- a/src/float/mul.rs +++ b/src/float/mul.rs @@ -1,5 +1,5 @@ use crate::float::Float; -use crate::int::{CastInto, DInt, HInt, Int}; +use crate::int::{CastInto, DInt, HInt, Int, MinInt}; fn mul(a: F, b: F) -> F where @@ -199,6 +199,11 @@ intrinsics! { mul(a, b) } + #[cfg(not(feature = "no-f16-f128"))] + pub extern "C" fn __multf3(a: f128, b: f128) -> f128 { + mul(a, b) + } + #[cfg(target_arch = "arm")] pub extern "C" fn __mulsf3vfp(a: f32, b: f32) -> f32 { a * b diff --git a/src/float/sub.rs b/src/float/sub.rs index 64653ee2..477019f7 100644 --- a/src/float/sub.rs +++ b/src/float/sub.rs @@ -15,6 +15,12 @@ intrinsics! 
{ __adddf3(a, f64::from_repr(b.repr() ^ f64::SIGN_MASK)) } + #[cfg(not(feature = "no-f16-f128"))] + pub extern "C" fn __subtf3(a: f128, b: f128) -> f128 { + use crate::float::add::__addtf3; + __addtf3(a, f128::from_repr(b.repr() ^ f128::SIGN_MASK)) + } + #[cfg(target_arch = "arm")] pub extern "C" fn __subsf3vfp(a: f32, b: f32) -> f32 { a - b diff --git a/src/float/trunc.rs b/src/float/trunc.rs index 6de446c1..b607a654 100644 --- a/src/float/trunc.rs +++ b/src/float/trunc.rs @@ -1,5 +1,5 @@ use crate::float::Float; -use crate::int::{CastInto, Int}; +use crate::int::{CastInto, Int, MinInt}; fn trunc(a: F) -> R where diff --git a/src/int/addsub.rs b/src/int/addsub.rs index f31eff4b..e95590d8 100644 --- a/src/int/addsub.rs +++ b/src/int/addsub.rs @@ -1,6 +1,6 @@ -use crate::int::{DInt, Int}; +use crate::int::{DInt, Int, MinInt}; -trait UAddSub: DInt { +trait UAddSub: DInt + Int { fn uadd(self, other: Self) -> Self { let (lo, carry) = self.lo().overflowing_add(other.lo()); let hi = self.hi().wrapping_add(other.hi()); @@ -22,7 +22,7 @@ impl UAddSub for u128 {} trait AddSub: Int where - ::UnsignedInt: UAddSub, + ::UnsignedInt: UAddSub, { fn add(self, other: Self) -> Self { Self::from_unsigned(self.unsigned().uadd(other.unsigned())) @@ -37,7 +37,7 @@ impl AddSub for i128 {} trait Addo: AddSub where - ::UnsignedInt: UAddSub, + ::UnsignedInt: UAddSub, { fn addo(self, other: Self) -> (Self, bool) { let sum = AddSub::add(self, other); @@ -50,7 +50,7 @@ impl Addo for u128 {} trait Subo: AddSub where - ::UnsignedInt: UAddSub, + ::UnsignedInt: UAddSub, { fn subo(self, other: Self) -> (Self, bool) { let sum = AddSub::sub(self, other); diff --git a/src/int/big.rs b/src/int/big.rs new file mode 100644 index 00000000..026c6463 --- /dev/null +++ b/src/int/big.rs @@ -0,0 +1,365 @@ +//! Integers used for wide operations, larger than `u128`. + +#![allow(unused)] + +use crate::int::{DInt, HInt, Int, MinInt}; +use core::{fmt, ops}; + +const WORD_LO_MASK: u64 = 0x00000000ffffffff; +const WORD_HI_MASK: u64 = 0xffffffff00000000; +const WORD_FULL_MASK: u64 = 0xffffffffffffffff; +const U128_LO_MASK: u128 = u64::MAX as u128; +const U128_HI_MASK: u128 = (u64::MAX as u128) << 64; + +/// A 256-bit unsigned integer represented as 4 64-bit limbs. +/// +/// Each limb is a native-endian number, but the array is little-limb-endian. +#[allow(non_camel_case_types)] +#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)] +pub struct u256(pub [u64; 4]); + +impl u256 { + pub const MAX: Self = Self([u64::MAX, u64::MAX, u64::MAX, u64::MAX]); + + /// Reinterpret as a signed integer + pub fn signed(self) -> i256 { + i256(self.0) + } +} + +/// A 256-bit signed integer represented as 4 64-bit limbs. +/// +/// Each limb is a native-endian number, but the array is little-limb-endian. 
+#[allow(non_camel_case_types)]
+#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
+pub struct i256(pub [u64; 4]);
+
+impl i256 {
+    /// Reinterpret as an unsigned integer
+    pub fn unsigned(self) -> u256 {
+        u256(self.0)
+    }
+}
+
+impl MinInt for u256 {
+    type OtherSign = i256;
+
+    type UnsignedInt = u256;
+
+    const SIGNED: bool = false;
+    const BITS: u32 = 256;
+    const ZERO: Self = Self([0u64; 4]);
+    const ONE: Self = Self([1, 0, 0, 0]);
+    const MIN: Self = Self([0u64; 4]);
+    const MAX: Self = Self([u64::MAX; 4]);
+}
+
+impl MinInt for i256 {
+    type OtherSign = u256;
+
+    type UnsignedInt = u256;
+
+    const SIGNED: bool = true;
+    const BITS: u32 = 256;
+    const ZERO: Self = Self([0u64; 4]);
+    const ONE: Self = Self([1, 0, 0, 0]);
+    const MIN: Self = Self([0, 0, 0, 1 << 63]);
+    const MAX: Self = Self([u64::MAX, u64::MAX, u64::MAX, u64::MAX >> 1]);
+}
+
+// impl Int for i256 {
+//     fn is_zero(self) -> bool {
+//         self == Self::ZERO
+//     }
+
+//     fn wrapping_neg(self) -> Self {
+//         Self::ZERO.wrapping_sub(self)
+//     }
+
+//     fn wrapping_add(self, other: Self) -> Self {
+//         self.overflowing_add(other).0
+//     }
+//
+//     fn overflowing_add(self, other: Self) -> (Self, bool) {
+//         let x0 = (u128::from(self.0[0])).wrapping_add(u128::from(other.0[0]));
+//         let v0 = x0 as u64;
+//         let c0 = x0 >> 64;
+
+//         let x1 = (u128::from(self.0[1]))
+//             .wrapping_add(u128::from(other.0[1]))
+//             .wrapping_add(c0);
+//         let v1 = x1 as u64;
+//         let c1 = x1 >> 64;
+
+//         let x2 = (u128::from(self.0[2]))
+//             .wrapping_add(u128::from(other.0[2]))
+//             .wrapping_add(c1);
+//         let v2 = x2 as u64;
+//         let c2 = x2 >> 64;
+
+//         let x3 = (u128::from(self.0[3]))
+//             .wrapping_add(u128::from(other.0[3]))
+//             .wrapping_add(c2);
+//         let v3 = x3 as u64;
+//         let c3 = x3 >> 64;
+
+//         (Self([v0, v1, v2, v3]), c3 > 0)
+//     }
+// }
+
+macro_rules!
impl_common { + ($ty:ty) => { + // impl ops::Add for $ty { + // type Output = Self; + + // fn add(self, rhs: Self) -> Self::Output { + // let (val, wrapped) = self.overflowing_add(rhs); + // debug_assert!(!wrapped, "attempted to add with overflow"); + // val + // } + // } + + // impl ops::AddAssign for $ty { + // fn add_assign(&mut self, rhs: Self) { + // *self = *self + rhs + // } + // } + + // impl ops::BitAnd for $ty { + // type Output = Self; + + // fn bitand(self, rhs: Self) -> Self::Output { + // Self([ + // self.0[0] & rhs.0[0], + // self.0[1] & rhs.0[1], + // self.0[2] & rhs.0[2], + // self.0[3] & rhs.0[3], + // ]) + // } + // } + + // impl ops::BitAndAssign for $ty { + // fn bitand_assign(&mut self, rhs: Self) { + // *self = *self & rhs + // } + // } + + impl ops::BitOr for $ty { + type Output = Self; + + fn bitor(mut self, rhs: Self) -> Self::Output { + self.0[0] |= rhs.0[0]; + self.0[1] |= rhs.0[1]; + self.0[2] |= rhs.0[2]; + self.0[3] |= rhs.0[3]; + self + } + } + + // impl ops::BitOrAssign for $ty { + // fn bitor_assign(&mut self, rhs: Self) { + // *self = *self | rhs + // } + // } + + // impl ops::BitXor for $ty { + // type Output = Self; + + // fn bitxor(self, rhs: Self) -> Self::Output { + // Self([ + // self.0[0] ^ rhs.0[0], + // self.0[1] ^ rhs.0[1], + // self.0[2] ^ rhs.0[2], + // self.0[3] ^ rhs.0[3], + // ]) + // } + // } + + // impl ops::BitXorAssign for $ty { + // fn bitxor_assign(&mut self, rhs: Self) { + // *self = *self ^ rhs + // } + // } + + impl ops::Not for $ty { + type Output = Self; + + fn not(self) -> Self::Output { + Self([!self.0[0], !self.0[1], !self.0[2], !self.0[3]]) + } + } + + impl ops::Shl for $ty { + type Output = Self; + + fn shl(self, rhs: u32) -> Self::Output { + todo!() + } + } + }; +} + +impl_common!(i256); +impl_common!(u256); + +macro_rules! 
word { + (1, $val:expr) => { + (($val >> (32 * 3)) & Self::from(WORD_LO_MASK)) as u64 + }; + (2, $val:expr) => { + (($val >> (32 * 2)) & Self::from(WORD_LO_MASK)) as u64 + }; + (3, $val:expr) => { + (($val >> (32 * 1)) & Self::from(WORD_LO_MASK)) as u64 + }; + (4, $val:expr) => { + (($val >> (32 * 0)) & Self::from(WORD_LO_MASK)) as u64 + }; +} + +impl HInt for u128 { + type D = u256; + + fn widen(self) -> Self::D { + let w0 = self & u128::from(u64::MAX); + let w1 = (self >> u64::BITS) & u128::from(u64::MAX); + u256([w0 as u64, w1 as u64, 0, 0]) + } + + fn zero_widen(self) -> Self::D { + self.widen() + } + + fn zero_widen_mul(self, rhs: Self) -> Self::D { + let product11: u64 = word!(1, self) * word!(1, rhs); + let product12: u64 = word!(1, self) * word!(2, rhs); + let product13: u64 = word!(1, self) * word!(3, rhs); + let product14: u64 = word!(1, self) * word!(4, rhs); + let product21: u64 = word!(2, self) * word!(1, rhs); + let product22: u64 = word!(2, self) * word!(2, rhs); + let product23: u64 = word!(2, self) * word!(3, rhs); + let product24: u64 = word!(2, self) * word!(4, rhs); + let product31: u64 = word!(3, self) * word!(1, rhs); + let product32: u64 = word!(3, self) * word!(2, rhs); + let product33: u64 = word!(3, self) * word!(3, rhs); + let product34: u64 = word!(3, self) * word!(4, rhs); + let product41: u64 = word!(4, self) * word!(1, rhs); + let product42: u64 = word!(4, self) * word!(2, rhs); + let product43: u64 = word!(4, self) * word!(3, rhs); + let product44: u64 = word!(4, self) * word!(4, rhs); + + let sum0: u128 = u128::from(product44); + let sum1: u128 = u128::from(product34) + u128::from(product43); + let sum2: u128 = u128::from(product24) + u128::from(product33) + u128::from(product42); + let sum3: u128 = u128::from(product14) + + u128::from(product23) + + u128::from(product32) + + u128::from(product41); + let sum4: u128 = u128::from(product13) + u128::from(product22) + u128::from(product31); + let sum5: u128 = u128::from(product12) + u128::from(product21); + let sum6: u128 = u128::from(product11); + + let r0: u128 = + (sum0 & u128::from(WORD_FULL_MASK)) + ((sum1 & u128::from(WORD_LO_MASK)) << 32); + let r1: u128 = (sum0 >> 64) + + ((sum1 >> 32) & u128::from(WORD_FULL_MASK)) + + (sum2 & u128::from(WORD_FULL_MASK)) + + ((sum3 << 32) & u128::from(WORD_HI_MASK)); + + let (lo, carry) = r0.overflowing_add(r1 << 64); + let hi = (r1 >> 64) + + (sum1 >> 96) + + (sum2 >> 64) + + (sum3 >> 32) + + sum4 + + (sum5 << 32) + + (sum6 << 64) + + u128::from(carry); + + u256([ + (lo & U128_LO_MASK) as u64, + ((lo >> 64) & U128_LO_MASK) as u64, + (hi & U128_LO_MASK) as u64, + ((hi >> 64) & U128_LO_MASK) as u64, + ]) + } + + fn widen_mul(self, rhs: Self) -> Self::D { + self.zero_widen_mul(rhs) + } +} + +impl HInt for i128 { + type D = i256; + + fn widen(self) -> Self::D { + let mut ret = self.unsigned().zero_widen().signed(); + if self.is_negative() { + ret.0[2] = u64::MAX; + ret.0[3] = u64::MAX; + } + ret + } + + fn zero_widen(self) -> Self::D { + self.unsigned().zero_widen().signed() + } + + fn zero_widen_mul(self, rhs: Self) -> Self::D { + self.unsigned().zero_widen_mul(rhs.unsigned()).signed() + } + + fn widen_mul(self, rhs: Self) -> Self::D { + unimplemented!() + // let mut res = self.zero_widen_mul(rhs); + // if self.is_negative() ^ rhs.is_negative() { + // // Sign extend as needed + // // for word in res.0.iter_mut().rev() { + // // let zeroes = word.leading_zeros(); + // // let leading = u64::MAX << (64 - zeroes); + // // *word |= leading; + // // if zeroes != 64 { + // // 
break; + // // } + // // } + // } + + // res + } +} + +impl DInt for u256 { + type H = u128; + + fn lo(self) -> Self::H { + let mut tmp = [0u8; 16]; + tmp[..8].copy_from_slice(&self.0[0].to_le_bytes()); + tmp[8..].copy_from_slice(&self.0[1].to_le_bytes()); + u128::from_le_bytes(tmp) + } + + fn hi(self) -> Self::H { + let mut tmp = [0u8; 16]; + tmp[..8].copy_from_slice(&self.0[2].to_le_bytes()); + tmp[8..].copy_from_slice(&self.0[3].to_le_bytes()); + u128::from_le_bytes(tmp) + } +} + +impl DInt for i256 { + type H = i128; + + fn lo(self) -> Self::H { + let mut tmp = [0u8; 16]; + tmp[..8].copy_from_slice(&self.0[0].to_le_bytes()); + tmp[8..].copy_from_slice(&self.0[1].to_le_bytes()); + i128::from_le_bytes(tmp) + } + + fn hi(self) -> Self::H { + let mut tmp = [0u8; 16]; + tmp[..8].copy_from_slice(&self.0[2].to_le_bytes()); + tmp[8..].copy_from_slice(&self.0[3].to_le_bytes()); + i128::from_le_bytes(tmp) + } +} diff --git a/src/int/mod.rs b/src/int/mod.rs index 509f9fda..bb343d79 100644 --- a/src/int/mod.rs +++ b/src/int/mod.rs @@ -3,42 +3,29 @@ use core::ops; mod specialized_div_rem; pub mod addsub; +mod big; pub mod leading_zeros; pub mod mul; pub mod sdiv; pub mod shift; pub mod udiv; -pub use self::leading_zeros::__clzsi2; +pub use big::{i256, u256}; +pub use leading_zeros::__clzsi2; public_test_dep! { -/// Trait for some basic operations on integers -pub(crate) trait Int: - Copy +/// Minimal integer implementations needed on all integer types, including wide integers. +pub(crate) trait MinInt: Copy + core::fmt::Debug - + PartialEq - + PartialOrd - + ops::AddAssign - + ops::SubAssign - + ops::BitAndAssign - + ops::BitOrAssign - + ops::BitXorAssign - + ops::ShlAssign - + ops::ShrAssign - + ops::Add - + ops::Sub - + ops::Div - + ops::Shl - + ops::Shr + ops::BitOr - + ops::BitXor - + ops::BitAnd + ops::Not + + ops::Shl { + /// Type with the same width but other signedness - type OtherSign: Int; + type OtherSign: MinInt; /// Unsigned version of Self - type UnsignedInt: Int; + type UnsignedInt: MinInt; /// If `Self` is a signed integer const SIGNED: bool; @@ -50,13 +37,46 @@ pub(crate) trait Int: const ONE: Self; const MIN: Self; const MAX: Self; +} +} +public_test_dep! { +/// Trait for some basic operations on integers +pub(crate) trait Int: MinInt + + PartialEq + + PartialOrd + + ops::AddAssign + + ops::SubAssign + + ops::BitAndAssign + + ops::BitOrAssign + + ops::BitXorAssign + + ops::ShlAssign + + ops::ShrAssign + + ops::Add + + ops::Sub + + ops::Mul + + ops::Div + + ops::Shr + + ops::BitXor + + ops::BitAnd +{ /// LUT used for maximizing the space covered and minimizing the computational cost of fuzzing /// in `testcrate`. For example, Self = u128 produces [0,1,2,7,8,15,16,31,32,63,64,95,96,111, /// 112,119,120,125,126,127]. - const FUZZ_LENGTHS: [u8; 20]; + const FUZZ_LENGTHS: [u8; 20] = make_fuzz_lengths(::BITS); + /// The number of entries of `FUZZ_LENGTHS` actually used. The maximum is 20 for u128. - const FUZZ_NUM: usize; + const FUZZ_NUM: usize = { + let log2 = (::BITS - 1).count_ones() as usize; + if log2 == 3 { + // case for u8 + 6 + } else { + // 3 entries on each extreme, 2 in the middle, and 4 for each scale of intermediate + // boundaries. 
+ 8 + (4 * (log2 - 4)) + } + }; fn unsigned(self) -> Self::UnsignedInt; fn from_unsigned(unsigned: Self::UnsignedInt) -> Self; @@ -83,74 +103,54 @@ pub(crate) trait Int: } } +pub(crate) const fn make_fuzz_lengths(bits: u32) -> [u8; 20] { + let mut v = [0u8; 20]; + v[0] = 0; + v[1] = 1; + v[2] = 2; // important for parity and the iX::MIN case when reversed + let mut i = 3; + + // No need for any more until the byte boundary, because there should be no algorithms + // that are sensitive to anything not next to byte boundaries after 2. We also scale + // in powers of two, which is important to prevent u128 corner tests from getting too + // big. + let mut l = 8; + loop { + if l >= ((bits / 2) as u8) { + break; + } + // get both sides of the byte boundary + v[i] = l - 1; + i += 1; + v[i] = l; + i += 1; + l *= 2; + } + + if bits != 8 { + // add the lower side of the middle boundary + v[i] = ((bits / 2) - 1) as u8; + i += 1; + } + + // We do not want to jump directly from the Self::BITS/2 boundary to the Self::BITS + // boundary because of algorithms that split the high part up. We reverse the scaling + // as we go to Self::BITS. + let mid = i; + let mut j = 1; + loop { + v[i] = (bits as u8) - (v[mid - j]) - 1; + if j == mid { + break; + } + i += 1; + j += 1; + } + v +} + macro_rules! int_impl_common { ($ty:ty) => { - const BITS: u32 = ::ZERO.count_zeros(); - const SIGNED: bool = Self::MIN != Self::ZERO; - - const ZERO: Self = 0; - const ONE: Self = 1; - const MIN: Self = ::MIN; - const MAX: Self = ::MAX; - - const FUZZ_LENGTHS: [u8; 20] = { - let bits = ::BITS; - let mut v = [0u8; 20]; - v[0] = 0; - v[1] = 1; - v[2] = 2; // important for parity and the iX::MIN case when reversed - let mut i = 3; - // No need for any more until the byte boundary, because there should be no algorithms - // that are sensitive to anything not next to byte boundaries after 2. We also scale - // in powers of two, which is important to prevent u128 corner tests from getting too - // big. - let mut l = 8; - loop { - if l >= ((bits / 2) as u8) { - break; - } - // get both sides of the byte boundary - v[i] = l - 1; - i += 1; - v[i] = l; - i += 1; - l *= 2; - } - - if bits != 8 { - // add the lower side of the middle boundary - v[i] = ((bits / 2) - 1) as u8; - i += 1; - } - - // We do not want to jump directly from the Self::BITS/2 boundary to the Self::BITS - // boundary because of algorithms that split the high part up. We reverse the scaling - // as we go to Self::BITS. - let mid = i; - let mut j = 1; - loop { - v[i] = (bits as u8) - (v[mid - j]) - 1; - if j == mid { - break; - } - i += 1; - j += 1; - } - v - }; - - const FUZZ_NUM: usize = { - let log2 = (::BITS - 1).count_ones() as usize; - if log2 == 3 { - // case for u8 - 6 - } else { - // 3 entries on each extreme, 2 in the middle, and 4 for each scale of intermediate - // boundaries. - 8 + (4 * (log2 - 4)) - } - }; - fn from_bool(b: bool) -> Self { b as $ty } @@ -203,10 +203,20 @@ macro_rules! int_impl_common { macro_rules! int_impl { ($ity:ty, $uty:ty) => { - impl Int for $uty { + impl MinInt for $uty { type OtherSign = $ity; type UnsignedInt = $uty; + const BITS: u32 = ::ZERO.count_zeros(); + const SIGNED: bool = Self::MIN != Self::ZERO; + + const ZERO: Self = 0; + const ONE: Self = 1; + const MIN: Self = ::MIN; + const MAX: Self = ::MAX; + } + + impl Int for $uty { fn unsigned(self) -> $uty { self } @@ -228,10 +238,20 @@ macro_rules! 
int_impl { int_impl_common!($uty); } - impl Int for $ity { + impl MinInt for $ity { type OtherSign = $uty; type UnsignedInt = $uty; + const BITS: u32 = ::ZERO.count_zeros(); + const SIGNED: bool = Self::MIN != Self::ZERO; + + const ZERO: Self = 0; + const ONE: Self = 1; + const MIN: Self = ::MIN; + const MAX: Self = ::MAX; + } + + impl Int for $ity { fn unsigned(self) -> $uty { self as $uty } @@ -259,18 +279,22 @@ int_impl!(i128, u128); public_test_dep! { /// Trait for integers twice the bit width of another integer. This is implemented for all /// primitives except for `u8`, because there is not a smaller primitive. -pub(crate) trait DInt: Int { +pub(crate) trait DInt: MinInt { /// Integer that is half the bit width of the integer this trait is implemented for - type H: HInt + Int; + type H: HInt; /// Returns the low half of `self` fn lo(self) -> Self::H; /// Returns the high half of `self` fn hi(self) -> Self::H; /// Returns the low and high halves of `self` as a tuple - fn lo_hi(self) -> (Self::H, Self::H); + fn lo_hi(self) -> (Self::H, Self::H) { + (self.lo(), self.hi()) + } /// Constructs an integer using lower and higher half parts - fn from_lo_hi(lo: Self::H, hi: Self::H) -> Self; + fn from_lo_hi(lo: Self::H, hi: Self::H) -> Self { + lo.zero_widen() | hi.widen_hi() + } } } @@ -279,7 +303,7 @@ public_test_dep! { /// primitives except for `u128`, because it there is not a larger primitive. pub(crate) trait HInt: Int { /// Integer that is double the bit width of the integer this trait is implemented for - type D: DInt + Int; + type D: DInt + MinInt; /// Widens (using default extension) the integer to have double bit width fn widen(self) -> Self::D; @@ -287,7 +311,9 @@ pub(crate) trait HInt: Int { /// around problems with associated type bounds (such as `Int`) being unstable fn zero_widen(self) -> Self::D; /// Widens the integer to have double bit width and shifts the integer into the higher bits - fn widen_hi(self) -> Self::D; + fn widen_hi(self) -> Self::D { + self.widen() << ::BITS + } /// Widening multiplication with zero widening. This cannot overflow. fn zero_widen_mul(self, rhs: Self) -> Self::D; /// Widening multiplication. This cannot overflow. @@ -305,13 +331,7 @@ macro_rules! impl_d_int { self as $X } fn hi(self) -> Self::H { - (self >> <$X as Int>::BITS) as $X - } - fn lo_hi(self) -> (Self::H, Self::H) { - (self.lo(), self.hi()) - } - fn from_lo_hi(lo: Self::H, hi: Self::H) -> Self { - lo.zero_widen() | hi.widen_hi() + (self >> <$X as MinInt>::BITS) as $X } } )* @@ -330,9 +350,6 @@ macro_rules! 
impl_h_int { fn zero_widen(self) -> Self::D { (self as $uH) as $X } - fn widen_hi(self) -> Self::D { - (self as $X) << <$H as Int>::BITS - } fn zero_widen_mul(self, rhs: Self) -> Self::D { self.zero_widen().wrapping_mul(rhs.zero_widen()) } diff --git a/src/int/mul.rs b/src/int/mul.rs index 2538e2f4..e0093a72 100644 --- a/src/int/mul.rs +++ b/src/int/mul.rs @@ -1,6 +1,6 @@ use crate::int::{DInt, HInt, Int}; -trait Mul: DInt +trait Mul: DInt + Int where Self::H: DInt, { @@ -30,7 +30,7 @@ where impl Mul for u64 {} impl Mul for i128 {} -pub(crate) trait UMulo: Int + DInt { +pub(crate) trait UMulo: DInt + Int { fn mulo(self, rhs: Self) -> (Self, bool) { match (self.hi().is_zero(), rhs.hi().is_zero()) { // overflow is guaranteed diff --git a/src/int/shift.rs b/src/int/shift.rs index dbd04018..31727298 100644 --- a/src/int/shift.rs +++ b/src/int/shift.rs @@ -1,4 +1,4 @@ -use crate::int::{DInt, HInt, Int}; +use crate::int::{DInt, HInt, Int, MinInt}; trait Ashl: DInt { /// Returns `a << b`, requires `b < Self::BITS` diff --git a/src/lib.rs b/src/lib.rs index 40564178..e468ea81 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,6 +2,7 @@ #![cfg_attr(not(feature = "no-asm"), feature(asm))] #![feature(abi_unadjusted)] #![feature(asm_experimental_arch)] +#![feature(c_unwind)] #![cfg_attr(not(feature = "no-asm"), feature(global_asm))] #![feature(cfg_target_has_atomic)] #![feature(compiler_builtins)] @@ -12,7 +13,6 @@ #![feature(linkage)] #![feature(naked_functions)] #![feature(repr_simd)] -#![feature(c_unwind)] #![cfg_attr(not(feature = "no-f16-f128"), feature(f16))] #![cfg_attr(not(feature = "no-f16-f128"), feature(f128))] #![no_builtins] @@ -43,6 +43,21 @@ extern crate core; #[macro_use] mod macros; +macro_rules! vdbg { + ($val:expr $(,)?) => { + // Use of `match` here is intentional because it affects the lifetimes + // of temporaries - https://stackoverflow.com/a/48732525/1063961 + match $val { + tmp => { + $crate::write_val( + tmp, + concat!("[", file!(), ":", line!(), "] ", stringify!($val), " = "), + ); + tmp + } + } + }; +} pub mod float; pub mod int; @@ -79,3 +94,45 @@ pub mod x86; pub mod x86_64; pub mod probestack; + +// Hacky way to print values since we don't have `std` for the crate +mod val_print { + extern "C" { + fn print_callback(val_ptr: *const u8, val_sz: usize, name_ptr: *const u8, name_len: usize); + } + + pub fn write_val(val: T, name: &str) { + unsafe { + print_callback( + core::ptr::addr_of!(val).cast(), + core::mem::size_of::(), + name.as_ptr(), + name.len(), + ) + }; + } +} + +pub use val_print::write_val; + +#[macro_export] +macro_rules! 
set_val_callback { + () => { + #[no_mangle] + unsafe extern "C" fn print_callback( + val_ptr: *const u8, + val_sz: usize, + name_ptr: *const u8, + name_len: usize, + ) { + let val = unsafe { core::slice::from_raw_parts(val_ptr, val_sz) }; + let name_slice = unsafe { core::slice::from_raw_parts(name_ptr, name_len) }; + let name = core::str::from_utf8(name_slice).unwrap(); + print!("{}: 0x", name); + for byte in val.iter().rev() { + print!("{:02x}", byte); + } + println!(); + } + }; +} diff --git a/testcrate/Cargo.toml b/testcrate/Cargo.toml index 6ff3fde1..6f771181 100644 --- a/testcrate/Cargo.toml +++ b/testcrate/Cargo.toml @@ -33,3 +33,5 @@ no-asm = ["compiler_builtins/no-asm"] no-f16-f128 = ["compiler_builtins/no-f16-f128"] mem = ["compiler_builtins/mem"] mangled-names = ["compiler_builtins/mangled-names"] +# Skip tests that rely on f128 symbols being available on the system +no-sys-f128 = [] diff --git a/testcrate/benches/float.rs b/testcrate/benches/float.rs new file mode 100644 index 00000000..a12300b3 --- /dev/null +++ b/testcrate/benches/float.rs @@ -0,0 +1,90 @@ +#![feature(test, f16, f128)] + +extern crate test; +use core::hint::black_box; +use test::Bencher; + +extern crate compiler_builtins; + +macro_rules! test_values { + ($ty:ty) => { + &[ + <$ty>::MIN, + <$ty>::MAX, + <$ty>::NAN, + <$ty>::INFINITY, + <$ty>::NEG_INFINITY, + <$ty>::MIN_POSITIVE, + 0.0, + 1.0, + -1.0, + ] + }; +} + +fn combine2(vals: &[T]) -> Vec<(T, T)> { + let mut ret = Vec::new(); + for x in vals.iter().copied() { + for y in vals.iter().copied() { + ret.push((x, y)); + } + } + ret +} + +macro_rules! test_iter { + ($b:ident, $ty:ty, $fn:path) => {{ + let vals = combine2(test_values!($ty)); + let iter_loop = || { + for (a, b) in vals.iter().copied() { + black_box($fn(black_box(a), black_box(b))); + } + }; + + // Warmup + for _ in 0..1000 { + iter_loop(); + } + + $b.iter(iter_loop); + }}; +} + +macro_rules! foobar { + ($($ty:ty, $rust_fn:ident, $builtin_fn:ident, $mod:ident::$sym:ident);* $(;)?) => { + $( + #[bench] + fn $rust_fn(b: &mut Bencher) { + // Equalize with the builtin function which is called separately + #[inline(never)] + fn inline_wrapper(a: $ty, b: $ty) -> $ty { + compiler_builtins::float::$mod::$sym(black_box(a), black_box(b)) + } + + test_iter!(b, $ty, inline_wrapper); + } + + #[bench] + fn $builtin_fn(b: &mut Bencher) { + extern "C" { + fn $sym(a: $ty, b: $ty) -> $ty; + } + + unsafe { + test_iter!(b, $ty, $sym); + } + } + )* + }; +} + +foobar! { + f32, addsf3_rust, addsf3_builtin, add::__addsf3; + f32, subsf3_rust, subsf3_builtin, sub::__subsf3; + f32, mulsf3_rust, mulsf3_builtin, mul::__mulsf3; + f32, divsf3_rust, divsf3_builtin, div::__divsf3; + f64, adddf3_rust, adddf3_builtin, add::__adddf3; + f64, subdf3_rust, subdf3_builtin, sub::__subdf3; + f64, muldf3_rust, muldf3_builtin, mul::__muldf3; + f64, divdf3_rust, divdf3_builtin, div::__divdf3; +} diff --git a/testcrate/build.rs b/testcrate/build.rs new file mode 100644 index 00000000..f279a363 --- /dev/null +++ b/testcrate/build.rs @@ -0,0 +1,14 @@ +use std::env; + +fn main() { + let target = env::var("TARGET").unwrap(); + + // These platforms do not have f128 symbols available in their system libraries, so + // skip related tests. 
+ if target.starts_with("arm-") + || target.contains("apple-darwin") + || target.contains("windows-msvc") + { + println!("cargo:rustc-cfg=feature=\"no-sys-f128\""); + } +} diff --git a/testcrate/src/lib.rs b/testcrate/src/lib.rs index 9bd155f6..f9606d47 100644 --- a/testcrate/src/lib.rs +++ b/testcrate/src/lib.rs @@ -15,7 +15,7 @@ #![no_std] use compiler_builtins::float::Float; -use compiler_builtins::int::Int; +use compiler_builtins::int::{Int, MinInt}; use rand_xoshiro::rand_core::{RngCore, SeedableRng}; use rand_xoshiro::Xoshiro128StarStar; @@ -101,7 +101,10 @@ macro_rules! edge_cases { /// Feeds a series of fuzzing inputs to `f`. The fuzzer first uses an algorithm designed to find /// edge cases, followed by a more random fuzzer that runs `n` times. -pub fn fuzz(n: u32, mut f: F) { +pub fn fuzz(n: u32, mut f: F) +where + ::UnsignedInt: Int, +{ // edge case tester. Calls `f` 210 times for u128. // zero gets skipped by the loop f(I::ZERO); @@ -111,7 +114,7 @@ pub fn fuzz(n: u32, mut f: F) { // random fuzzer let mut rng = Xoshiro128StarStar::seed_from_u64(0); - let mut x: I = Int::ZERO; + let mut x: I = MinInt::ZERO; for _ in 0..n { fuzz_step(&mut rng, &mut x); f(x) @@ -119,7 +122,10 @@ pub fn fuzz(n: u32, mut f: F) { } /// The same as `fuzz`, except `f` has two inputs. -pub fn fuzz_2(n: u32, f: F) { +pub fn fuzz_2(n: u32, f: F) +where + ::UnsignedInt: Int, +{ // Check cases where the first and second inputs are zero. Both call `f` 210 times for `u128`. edge_cases!(I, case, { f(I::ZERO, case); @@ -150,10 +156,10 @@ pub fn fuzz_shift(f: F) { // Shift functions are very simple and do not need anything other than shifting a small // set of random patterns for every fuzz length. let mut rng = Xoshiro128StarStar::seed_from_u64(0); - let mut x: I = Int::ZERO; + let mut x: I = MinInt::ZERO; for i in 0..I::FUZZ_NUM { fuzz_step(&mut rng, &mut x); - f(x, Int::ZERO); + f(x, MinInt::ZERO); f(x, I::FUZZ_LENGTHS[i] as u32); } } @@ -257,3 +263,49 @@ pub fn fuzz_float_2(n: u32, f: E) { f(x, y) } } + +/// Use the builtin operation if avialable, fallback to apfloat if not +#[macro_export] +macro_rules! apfloat_fallback { + ( + $float_ty:ty, + $apfloat_ty:ident, + $sys_available:meta, + $op:expr $(=> $convert:ident)?, + $($arg:expr),+ + $(,)? + ) => {{ + #[cfg($sys_available)] + let ret = { + type FloatTy = $float_ty; + $op( $($arg),+ ) + }; + + #[cfg(not($sys_available))] + let ret = { + use rustc_apfloat::Float; + type FloatTy = rustc_apfloat::ieee::$apfloat_ty; + + let op_res = $op( $(FloatTy::from_bits($arg.to_bits().into())),+ ); + + apfloat_fallback!(@convert $float_ty, op_res $(,$convert)?) + }; + + ret + }}; + + // Operations that do not need converting back to a float + (@convert $float_ty:ty, $val:expr, no_convert) => { + $val + }; + + // Some apfloat operations return a `StatusAnd` that we need to extract the value from. This + // is the default. + (@convert $float_ty:ty, $val:expr) => {{ + // ignore the status, just get the value + let unwrapped = $val.value; + + <$float_ty>::from_bits(FloatTy::to_bits(unwrapped).try_into().unwrap()) + }}; + +} diff --git a/testcrate/tests/addsub.rs b/testcrate/tests/addsub.rs index da7684ec..20503003 100644 --- a/testcrate/tests/addsub.rs +++ b/testcrate/tests/addsub.rs @@ -1,5 +1,8 @@ #![allow(unused_macros)] +#![feature(f128)] +#![feature(f16)] +use core::ops::{Add, Sub}; use testcrate::*; macro_rules! sum { @@ -71,28 +74,28 @@ fn addsub() { } macro_rules! 
float_sum { - ($($f:ty, $fn_add:ident, $fn_sub:ident);*;) => { + ($($f:ty, $fn_add:ident, $fn_sub:ident, $apfloat_ty:ident, $sys_available:meta);*;) => { $( fuzz_float_2(N, |x: $f, y: $f| { - let add0 = x + y; - let sub0 = x - y; + let add0 = apfloat_fallback!($f, $apfloat_ty, $sys_available, Add::add, x, y); + let sub0 = apfloat_fallback!($f, $apfloat_ty, $sys_available, Sub::sub, x, y); let add1: $f = $fn_add(x, y); let sub1: $f = $fn_sub(x, y); if !Float::eq_repr(add0, add1) { panic!( - "{}({}, {}): std: {}, builtins: {}", + "{}({:?}, {:?}): std: {:?}, builtins: {:?}", stringify!($fn_add), x, y, add0, add1 ); } if !Float::eq_repr(sub0, sub1) { panic!( - "{}({}, {}): std: {}, builtins: {}", + "{}({:?}, {:?}): std: {:?}, builtins: {:?}", stringify!($fn_sub), x, y, sub0, sub1 ); } }); )* - }; + } } #[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))] @@ -105,9 +108,18 @@ fn float_addsub() { }; float_sum!( - f32, __addsf3, __subsf3; - f64, __adddf3, __subdf3; + f32, __addsf3, __subsf3, Single, all(); + f64, __adddf3, __subdf3, Double, all(); ); + + #[cfg(not(feature = "no-f16-f128"))] + { + use compiler_builtins::float::{add::__addtf3, sub::__subtf3, Float}; + + float_sum!( + f128, __addtf3, __subtf3, Quad, not(feature = "no-sys-f128"); + ); + } } #[cfg(target_arch = "arm")] @@ -120,7 +132,7 @@ fn float_addsub_arm() { }; float_sum!( - f32, __addsf3vfp, __subsf3vfp; - f64, __adddf3vfp, __subdf3vfp; + f32, __addsf3vfp, __subsf3vfp, Single, all(); + f64, __adddf3vfp, __subdf3vfp, Double, all(); ); } diff --git a/testcrate/tests/big.rs b/testcrate/tests/big.rs new file mode 100644 index 00000000..84320bc6 --- /dev/null +++ b/testcrate/tests/big.rs @@ -0,0 +1,108 @@ +use compiler_builtins::int::{i256, u256, HInt, MinInt}; + +const LOHI_SPLIT: u128 = 0xaaaaaaaaaaaaaaaaffffffffffffffff; + +/// Print a `u256` as hex since we can't add format implementations +fn hexu(v: u256) -> String { + format!( + "0x{:016x}{:016x}{:016x}{:016x}", + v.0[3], v.0[2], v.0[1], v.0[0] + ) +} + +fn hexi(v: i256) -> String { + hexu(v.unsigned()) +} + +#[test] +fn widen_u128() { + assert_eq!(u128::MAX.widen(), u256([u64::MAX, u64::MAX, 0, 0])); + assert_eq!( + LOHI_SPLIT.widen(), + u256([u64::MAX, 0xaaaaaaaaaaaaaaaa, 0, 0]) + ); +} + +#[test] +fn widen_i128() { + assert_eq!((-1i128).widen(), u256::MAX.signed()); + assert_eq!( + (LOHI_SPLIT as i128).widen(), + i256([u64::MAX, 0xaaaaaaaaaaaaaaaa, u64::MAX, u64::MAX]) + ); + assert_eq!((-1i128).zero_widen().unsigned(), (u128::MAX).widen()); +} + +#[test] +fn widen_mul_u128() { + let tests = [ + (u128::MAX / 2, 2_u128, u256([u64::MAX - 1, u64::MAX, 0, 0])), + (u128::MAX, 2_u128, u256([u64::MAX - 1, u64::MAX, 1, 0])), + (u128::MAX, u128::MAX, u256([1, 0, u64::MAX - 1, u64::MAX])), + (u128::MIN, u128::MIN, u256::ZERO), + (1234, 0, u256::ZERO), + (0, 1234, u256::ZERO), + ]; + + let mut errors = Vec::new(); + for (i, (a, b, exp)) in tests.iter().copied().enumerate() { + let res = a.widen_mul(b); + let res_z = a.zero_widen_mul(b); + assert_eq!(res, res_z); + if res != exp { + errors.push((i, a, b, exp, res)); + } + } + + for (i, a, b, exp, res) in &errors { + eprintln!( + "FAILURE ({i}): {a:#034x} * {b:#034x} = {} got {}", + hexu(*exp), + hexu(*res) + ); + } + assert!(errors.is_empty()); +} + +// unneeded? 
+// #[test] +// fn widen_mul_i128() { +// let tests = [ +// ( +// i128::MAX / 2, +// 2_i128, +// i256([u64::MAX - 1, u64::MAX >> 1, 0, 0]), +// ), +// (i128::MAX, 2_i128, i256([u64::MAX - 1, u64::MAX, 0, 0])), +// (i128::MIN, 2_i128, i256([0, 0, u64::MAX, u64::MAX])), +// ( +// i128::MAX, +// i128::MAX, +// i256([1, 0, u64::MAX - 1, u64::MAX >> 2]), +// ), +// (i128::MAX, i128::MIN, i256([0, 0, 0, 0b11 << 62])), +// (i128::MIN, i128::MIN, i256([0, 0, 0, 0])), +// (1234, 0, i256::ZERO), +// (0, 1234, i256::ZERO), +// (-1234, 0, i256::ZERO), +// (0, -1234, i256::ZERO), +// ]; + +// let mut errors = Vec::new(); +// for (i, (a, b, exp)) in tests.iter().copied().enumerate() { +// let res = a.widen_mul(b); +// // TODO check zero widen mul +// if res != exp { +// errors.push((i, a, b, exp, res)); +// } +// } + +// for (i, a, b, exp, res) in &errors { +// eprintln!( +// "FAILURE ({i}): {a:#034x} * {b:#034x} = {} got {}", +// hexi(*exp), +// hexi(*res) +// ); +// } +// assert!(errors.is_empty()); +// } diff --git a/testcrate/tests/cmp.rs b/testcrate/tests/cmp.rs index 5c10a560..45a6ee90 100644 --- a/testcrate/tests/cmp.rs +++ b/testcrate/tests/cmp.rs @@ -1,22 +1,51 @@ #![allow(unused_macros)] +#![allow(unreachable_code)] +#![feature(f128)] +#![feature(f16)] use testcrate::*; macro_rules! cmp { - ($x:ident, $y:ident, $($unordered_val:expr, $fn:ident);*;) => { + ( + $f:ty, $x:ident, $y:ident, $apfloat_ty:ident, $sys_available:meta, + $($unordered_val:expr, $fn:ident);*; + ) => { $( - let cmp0 = if $x.is_nan() || $y.is_nan() { + println!("a"); + let cmp0 = if apfloat_fallback!( + $f, $apfloat_ty, $sys_available, + |x: FloatTy| x.is_nan() => no_convert, + $x + ) || apfloat_fallback!( + $f, $apfloat_ty, $sys_available, + |y: FloatTy| y.is_nan() => no_convert, + $y + ) + { $unordered_val - } else if $x < $y { + } else if apfloat_fallback!( + $f, $apfloat_ty, $sys_available, + |x, y| x < y => no_convert, + $x, $y + ) { -1 - } else if $x == $y { + } else if apfloat_fallback!( + $f, $apfloat_ty, $sys_available, + |x, y| x == y => no_convert, + $x, $y + ) { 0 } else { 1 }; + println!("b"); + let cmp1 = $fn($x, $y); if cmp0 != cmp1 { - panic!("{}({}, {}): std: {}, builtins: {}", stringify!($fn_builtins), $x, $y, cmp0, cmp1); + panic!( + "{}({:?}, {:?}): std: {:?}, builtins: {:?}", + stringify!($fn), $x, $y, cmp0, cmp1 + ); } )* }; @@ -33,7 +62,7 @@ fn float_comparisons() { fuzz_float_2(N, |x: f32, y: f32| { assert_eq!(__unordsf2(x, y) != 0, x.is_nan() || y.is_nan()); - cmp!(x, y, + cmp!(f32, x, y, Single, all(), 1, __ltsf2; 1, __lesf2; 1, __eqsf2; @@ -44,7 +73,7 @@ fn float_comparisons() { }); fuzz_float_2(N, |x: f64, y: f64| { assert_eq!(__unorddf2(x, y) != 0, x.is_nan() || y.is_nan()); - cmp!(x, y, + cmp!(f64, x, y, Double, all(), 1, __ltdf2; 1, __ledf2; 1, __eqdf2; @@ -53,6 +82,37 @@ fn float_comparisons() { 1, __nedf2; ); }); + + #[cfg(not(feature = "no-f16-f128"))] + { + use compiler_builtins::float::cmp::{ + __eqtf2, __getf2, __gttf2, __letf2, __lttf2, __netf2, __unordtf2, + }; + + fuzz_float_2(N, |x: f128, y: f128| { + let x_is_nan = apfloat_fallback!( + f128, Quad, not(feature = "no-sys-f128"), + |x: FloatTy| x.is_nan() => no_convert, + x + ); + let y_is_nan = apfloat_fallback!( + f128, Quad, not(feature = "no-sys-f128"), + |x: FloatTy| x.is_nan() => no_convert, + y + ); + + assert_eq!(__unordtf2(x, y) != 0, x_is_nan || y_is_nan); + + cmp!(f128, x, y, Quad, not(feature = "no-sys-f128"), + 1, __lttf2; + 1, __letf2; + 1, __eqtf2; + -1, __getf2; + -1, __gttf2; + 1, __netf2; + ); + }); + } } macro_rules! 
cmp2 { diff --git a/testcrate/tests/div_rem.rs b/testcrate/tests/div_rem.rs index de3bd9be..4ccc5660 100644 --- a/testcrate/tests/div_rem.rs +++ b/testcrate/tests/div_rem.rs @@ -1,9 +1,16 @@ #![allow(unused_macros)] +#![feature(f128)] +#![feature(f16)] + +use core::ops::Div; use compiler_builtins::int::sdiv::{__divmoddi4, __divmodsi4, __divmodti4}; use compiler_builtins::int::udiv::{__udivmoddi4, __udivmodsi4, __udivmodti4, u128_divide_sparc}; + use testcrate::*; +compiler_builtins::set_val_callback!(); + // Division algorithms have by far the nastiest and largest number of edge cases, and experience shows // that sometimes 100_000 iterations of the random fuzzer is needed. @@ -104,16 +111,20 @@ fn divide_sparc() { } macro_rules! float { - ($($i:ty, $fn:ident);*;) => { + ($($f:ty, $fn:ident, $apfloat_ty:ident, $sys_available:meta);*;) => { $( - fuzz_float_2(N, |x: $i, y: $i| { - let quo0 = x / y; - let quo1: $i = $fn(x, y); + fuzz_float_2(N, |x: $f, y: $f| { + let quo0: $f = apfloat_fallback!($f, $apfloat_ty, $sys_available, Div::div, x, y); + let quo1: $f = $fn(x, y); #[cfg(not(target_arch = "arm"))] if !Float::eq_repr(quo0, quo1) { panic!( - "{}({}, {}): std: {}, builtins: {}", - stringify!($fn), x, y, quo0, quo1 + "{}({:?}, {:?}): std: {:?}, builtins: {:?}", + stringify!($fn), + x, + y, + quo0, + quo1 ); } @@ -122,8 +133,12 @@ macro_rules! float { if !(Float::is_subnormal(quo0) || Float::is_subnormal(quo1)) { if !Float::eq_repr(quo0, quo1) { panic!( - "{}({}, {}): std: {}, builtins: {}", - stringify!($fn), x, y, quo0, quo1 + "{}({:?}, {:?}): std: {:?}, builtins: {:?}", + stringify!($fn), + x, + y, + quo0, + quo1 ); } } @@ -141,9 +156,27 @@ fn float_div() { }; float!( - f32, __divsf3; - f64, __divdf3; + f32, __divsf3, Single, all(); + f64, __divdf3, Double, all(); ); + + #[cfg(not(feature = "no-f16-f128"))] + { + use compiler_builtins::float::div::__divtf3; + + float!( + f128, __divtf3, Quad, not(feature = "no-sys-f128"); + ); + } +} + +#[cfg(not(any(feature = "no-sys-f128", feature = "no-f16-f128")))] +#[test] +fn div_failures() { + use compiler_builtins::float::{div::__divtf3, Float}; + let a = f128::from_bits(0x1); + let b = f128::from_bits(0x1); + dbg!(__divtf3(a, b)); } #[cfg(target_arch = "arm")] @@ -155,7 +188,7 @@ fn float_div_arm() { }; float!( - f32, __divsf3vfp; - f64, __divdf3vfp; + f32, __divsf3vfp, Single, all(); + f64, __divdf3vfp, Double, all(); ); } diff --git a/testcrate/tests/mul.rs b/testcrate/tests/mul.rs index 819f06ca..39a71ac5 100644 --- a/testcrate/tests/mul.rs +++ b/testcrate/tests/mul.rs @@ -1,5 +1,8 @@ #![allow(unused_macros)] +#![feature(f128)] +#![feature(f16)] +use core::ops::Mul; use testcrate::*; macro_rules! mul { @@ -82,16 +85,16 @@ fn overflowing_mul() { } macro_rules! 
float_mul { - ($($f:ty, $fn:ident);*;) => { + ($($f:ty, $fn:ident, $apfloat_ty:ident, $sys_available:meta);*;) => { $( fuzz_float_2(N, |x: $f, y: $f| { - let mul0 = x * y; + let mul0 = apfloat_fallback!($f, $apfloat_ty, $sys_available, Mul::mul, x, y); let mul1: $f = $fn(x, y); // multiplication of subnormals is not currently handled if !(Float::is_subnormal(mul0) || Float::is_subnormal(mul1)) { if !Float::eq_repr(mul0, mul1) { panic!( - "{}({}, {}): std: {}, builtins: {}", + "{}({:?}, {:?}): std: {:?}, builtins: {:?}", stringify!($fn), x, y, mul0, mul1 ); } @@ -110,9 +113,18 @@ fn float_mul() { }; float_mul!( - f32, __mulsf3; - f64, __muldf3; + f32, __mulsf3, Single, all(); + f64, __muldf3, Double, all(); ); + + #[cfg(not(feature = "no-f16-f128"))] + { + use compiler_builtins::float::mul::__multf3; + + float_mul!( + f128, __multf3, Quad, not(feature = "no-sys-f128"); + ); + } } #[cfg(target_arch = "arm")] @@ -124,7 +136,7 @@ fn float_mul_arm() { }; float_mul!( - f32, __mulsf3vfp; - f64, __muldf3vfp; + f32, __mulsf3vfp, Single, all(); + f64, __muldf3vfp, Double, all(); ); }
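
Illustrative usage (not part of the patch above): a minimal sanity check for the new `f128` intrinsics added in `src/float/{add,sub,mul,div}.rs`. This is only a sketch, assuming a nightly toolchain with the `f128` feature gate, `compiler_builtins` built without the `no-f16-f128` feature, and a target with native `f128` support (i.e. not one of the `no-sys-f128` targets listed in `testcrate/build.rs`); the standalone `main` program and the literal test values are hypothetical, while the intrinsic paths and signatures come from the patch itself.

#![feature(f128)]

use compiler_builtins::float::{add::__addtf3, div::__divtf3, mul::__multf3, sub::__subtf3};

fn main() {
    let a: f128 = 3.5;
    let b: f128 = 1.25;

    // Compare bit patterns rather than using `==` so that signed zeros are distinguished.
    assert_eq!(__addtf3(a, b).to_bits(), (a + b).to_bits());
    assert_eq!(__subtf3(a, b).to_bits(), (a - b).to_bits());
    assert_eq!(__multf3(a, b).to_bits(), (a * b).to_bits());
    assert_eq!(__divtf3(a, b).to_bits(), (a / b).to_bits());
}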