diff --git a/src/arithmetic.rs b/src/arithmetic.rs index 9762c67..7b1051c 100644 --- a/src/arithmetic.rs +++ b/src/arithmetic.rs @@ -4,7 +4,7 @@ extern crate alloc; use crate::bigint::BigInt; use super::bigint::LossFraction; -use super::float::{Category, Float, RoundingMode}; +use super::float::{Category, Float, RoundingMode, Status}; use core::cmp::Ordering; use core::ops::{ Add, AddAssign, Div, DivAssign, Mul, MulAssign, Sub, SubAssign, @@ -94,14 +94,25 @@ impl Float { /// Computes a+b using the rounding mode `rm`. pub fn add_with_rm(a: &Self, b: &Self, rm: RoundingMode) -> Self { - Self::add_sub(a, b, false, rm) + Self::add_with_status(a, b, rm).0 } /// Computes a-b using the rounding mode `rm`. pub fn sub_with_rm(a: &Self, b: &Self, rm: RoundingMode) -> Self { - Self::add_sub(a, b, true, rm) + Self::sub_with_status(a, b, rm).0 } - fn add_sub(a: &Self, b: &Self, subtract: bool, rm: RoundingMode) -> Self { + /// Computes a+b using the rounding mode `rm`, returning the result + /// and IEEE 754 exception status flags. + pub fn add_with_status(a: &Self, b: &Self, rm: RoundingMode) -> (Self, Status) { + Self::add_sub_with_status(a, b, false, rm) + } + /// Computes a-b using the rounding mode `rm`, returning the result + /// and IEEE 754 exception status flags. + pub fn sub_with_status(a: &Self, b: &Self, rm: RoundingMode) -> (Self, Status) { + Self::add_sub_with_status(a, b, true, rm) + } + + fn add_sub_with_status(a: &Self, b: &Self, subtract: bool, rm: RoundingMode) -> (Self, Status) { let sem = a.get_semantics(); // Table 8.2: Specification of addition for positive floating-point // data. Pg 247. @@ -112,35 +123,36 @@ impl Float { | (Category::NaN, Category::Zero) | (Category::Normal, Category::Zero) | (Category::Infinity, Category::Normal) - | (Category::Infinity, Category::Zero) => a.clone(), + | (Category::Infinity, Category::Zero) => (a.clone(), Status::default()), (Category::Zero, Category::NaN) | (Category::Normal, Category::NaN) | (Category::Infinity, Category::NaN) => { - Self::nan(sem, b.get_sign()) + (Self::nan(sem, b.get_sign()), Status::default()) } (Category::Normal, Category::Infinity) | (Category::Zero, Category::Infinity) => { - Self::inf(sem, b.get_sign() ^ subtract) + (Self::inf(sem, b.get_sign() ^ subtract), Status::default()) } - (Category::Zero, Category::Normal) => Self::from_parts( + (Category::Zero, Category::Normal) => (Self::from_parts( sem, b.get_sign() ^ subtract, b.get_exp(), b.get_mantissa(), - ), + ), Status::default()), (Category::Zero, Category::Zero) => { - Self::zero(sem, a.get_sign() && b.get_sign()) + (Self::zero(sem, a.get_sign() && b.get_sign()), Status::default()) } (Category::Infinity, Category::Infinity) => { if a.get_sign() ^ b.get_sign() ^ subtract { - return Self::nan(sem, a.get_sign() ^ b.get_sign()); + return (Self::nan(sem, a.get_sign() ^ b.get_sign()), + Status { invalid: true, ..Default::default() }); } - Self::inf(sem, a.get_sign()) + (Self::inf(sem, a.get_sign()), Status::default()) } (Category::Normal, Category::Normal) => { @@ -151,12 +163,12 @@ impl Float { let same_absolute_number = a.same_absolute_value(b); if cancellation && same_absolute_number { let is_negative = RoundingMode::Negative == rm; - return Self::zero(sem, is_negative); + return (Self::zero(sem, is_negative), Status::default()); } let mut res = Self::add_or_sub_normals(a, b, subtract); - res.0.normalize(rm, res.1); - res.0 + let status = res.0.normalize(rm, res.1); + (res.0, status) } } } @@ -353,6 +365,12 @@ fn test_add_random_vals() { impl Float { /// Compute a*b using the rounding mode `rm`. pub fn mul_with_rm(a: &Self, b: &Self, rm: RoundingMode) -> Self { + Self::mul_with_status(a, b, rm).0 + } + + /// Compute a*b using the rounding mode `rm`, returning the result + /// and IEEE 754 exception status flags. + pub fn mul_with_status(a: &Self, b: &Self, rm: RoundingMode) -> (Self, Status) { let sem = a.get_semantics(); let sign = a.get_sign() ^ b.get_sign(); @@ -362,26 +380,28 @@ impl Float { (Category::Zero, Category::NaN) | (Category::Normal, Category::NaN) | (Category::Infinity, Category::NaN) => { - Self::nan(sem, b.get_sign()) + (Self::nan(sem, b.get_sign()), Status::default()) } (Category::NaN, Category::Infinity) | (Category::NaN, Category::NaN) | (Category::NaN, Category::Normal) - | (Category::NaN, Category::Zero) => Self::nan(sem, a.get_sign()), + | (Category::NaN, Category::Zero) => (Self::nan(sem, a.get_sign()), Status::default()), (Category::Normal, Category::Infinity) | (Category::Infinity, Category::Normal) - | (Category::Infinity, Category::Infinity) => Self::inf(sem, sign), + | (Category::Infinity, Category::Infinity) => (Self::inf(sem, sign), Status::default()), (Category::Normal, Category::Zero) | (Category::Zero, Category::Normal) - | (Category::Zero, Category::Zero) => Self::zero(sem, sign), + | (Category::Zero, Category::Zero) => (Self::zero(sem, sign), Status::default()), (Category::Zero, Category::Infinity) - | (Category::Infinity, Category::Zero) => Self::nan(sem, sign), + | (Category::Infinity, Category::Zero) => { + (Self::nan(sem, sign), Status { invalid: true, ..Default::default() }) + } (Category::Normal, Category::Normal) => { let (mut res, loss) = Self::mul_normals(a, b, sign); - res.normalize(rm, loss); - res + let status = res.normalize(rm, loss); + (res, status) } } } @@ -510,23 +530,34 @@ fn test_mul_random_vals() { impl Float { /// Compute a/b, with the rounding mode `rm`. pub fn div_with_rm(a: &Self, b: &Self, rm: RoundingMode) -> Self { + Self::div_with_status(a, b, rm).0 + } + + /// Compute a/b with the rounding mode `rm`, returning the result + /// and IEEE 754 exception status flags. + pub fn div_with_status(a: &Self, b: &Self, rm: RoundingMode) -> (Self, Status) { let sem = a.get_semantics(); let sign = a.get_sign() ^ b.get_sign(); // Table 8.5: Special values for x/y - Page 263. match (a.get_category(), b.get_category()) { (Category::NaN, _) - | (_, Category::NaN) - | (Category::Zero, Category::Zero) - | (Category::Infinity, Category::Infinity) => Self::nan(sem, sign), - - (_, Category::Infinity) => Self::zero(sem, sign), - (Category::Zero, _) => Self::zero(sem, sign), - (_, Category::Zero) => Self::inf(sem, sign), - (Category::Infinity, _) => Self::inf(sem, sign), + | (_, Category::NaN) => (Self::nan(sem, sign), Status::default()), + + (Category::Zero, Category::Zero) + | (Category::Infinity, Category::Infinity) => { + (Self::nan(sem, sign), Status { invalid: true, ..Default::default() }) + } + + (_, Category::Infinity) => (Self::zero(sem, sign), Status::default()), + (Category::Zero, _) => (Self::zero(sem, sign), Status::default()), + (_, Category::Zero) => { + (Self::inf(sem, sign), Status { divide_by_zero: true, ..Default::default() }) + } + (Category::Infinity, _) => (Self::inf(sem, sign), Status::default()), (Category::Normal, Category::Normal) => { let (mut res, loss) = Self::div_normals(a, b); - res.normalize(rm, loss); - res + let status = res.normalize(rm, loss); + (res, status) } } } @@ -818,7 +849,7 @@ impl Float { ) -> Self { if a.is_normal() && b.is_normal() && c.is_normal() { let (mut res, loss) = Self::fused_mul_add_normals(a, b, c); - res.normalize(rm, loss); // Finally, round the result. + let _ = res.normalize(rm, loss); // Finally, round the result. res } else { // Perform two operations. First, handle non-normal values. @@ -942,3 +973,129 @@ fn test_fma_random_vals() { assert!(r1.is_nan() || r0_bits == r1_bits); } } + +#[test] +fn test_status_exact_ops() { + use super::float::Status; + let rm = RoundingMode::NearestTiesToEven; + // 1.0 + 1.0 = 2.0 exactly. + let one = Float::from_f64(1.0); + let (res, st) = Float::add_with_status(&one, &one, rm); + assert_eq!(res.as_f64(), 2.0); + assert_eq!(st, Status::default()); + // 2.0 * 3.0 = 6.0 exactly. + let two = Float::from_f64(2.0); + let three = Float::from_f64(3.0); + let (res, st) = Float::mul_with_status(&two, &three, rm); + assert_eq!(res.as_f64(), 6.0); + assert_eq!(st, Status::default()); + // 6.0 / 2.0 = 3.0 exactly. + let six = Float::from_f64(6.0); + let (res, st) = Float::div_with_status(&six, &two, rm); + assert_eq!(res.as_f64(), 3.0); + assert_eq!(st, Status::default()); +} + +#[test] +fn test_status_inexact() { + let rm = RoundingMode::NearestTiesToEven; + let one = Float::from_f64(1.0); + let three = Float::from_f64(3.0); + let (_, st) = Float::div_with_status(&one, &three, rm); + assert!(st.inexact); + assert!(!st.overflow); + assert!(!st.invalid); + assert!(!st.divide_by_zero); +} + +#[test] +fn test_status_overflow() { + use super::float::FP16; + let rm = RoundingMode::NearestTiesToEven; + // FP16 max is 65504. Multiplying two large values should overflow. + let big = Float::from_u64(FP16, 60000); + let two = Float::from_u64(FP16, 2); + let (res, st) = Float::mul_with_status(&big, &two, rm); + assert!(res.is_inf()); + assert!(st.overflow); + assert!(st.inexact); +} + +#[test] +fn test_status_invalid() { + use super::float::FP64; + let rm = RoundingMode::NearestTiesToEven; + let sem = FP64; + + // inf + (-inf) = NaN, invalid. + let pos_inf = Float::inf(sem, false); + let neg_inf = Float::inf(sem, true); + let (res, st) = Float::add_with_status(&pos_inf, &neg_inf, rm); + assert!(res.is_nan()); + assert!(st.invalid); + + // 0 * inf = NaN, invalid. + let zero = Float::zero(sem, false); + let (res, st) = Float::mul_with_status(&zero, &pos_inf, rm); + assert!(res.is_nan()); + assert!(st.invalid); + + // 0 / 0 = NaN, invalid. + let zero2 = Float::zero(sem, false); + let (res, st) = Float::div_with_status(&zero, &zero2, rm); + assert!(res.is_nan()); + assert!(st.invalid); + + // inf / inf = NaN, invalid. + let (res, st) = Float::div_with_status(&pos_inf, &pos_inf, rm); + assert!(res.is_nan()); + assert!(st.invalid); +} + +#[test] +fn test_status_divide_by_zero() { + use super::float::FP64; + let rm = RoundingMode::NearestTiesToEven; + let one = Float::from_f64(1.0); + let zero = Float::zero(FP64, false); + let (res, st) = Float::div_with_status(&one, &zero, rm); + assert!(res.is_inf()); + assert!(st.divide_by_zero); + assert!(!st.inexact); + assert!(!st.invalid); +} + +#[test] +fn test_status_bitor() { + use super::float::Status; + let a = Status { invalid: true, ..Default::default() }; + let b = Status { inexact: true, ..Default::default() }; + let c = a | b; + assert!(c.invalid); + assert!(c.inexact); + assert!(!c.overflow); + + let mut d = Status::default(); + d |= a; + d |= b; + assert_eq!(c, d); +} + +#[test] +fn test_status_wrapper_equivalence() { + use super::float::FP64; + let rm = RoundingMode::NearestTiesToEven; + let a = Float::from_f64(1.234); + let b = Float::from_f64(5.678); + + assert_eq!(Float::add_with_rm(&a, &b, rm).as_f64(), + Float::add_with_status(&a, &b, rm).0.as_f64()); + assert_eq!(Float::sub_with_rm(&a, &b, rm).as_f64(), + Float::sub_with_status(&a, &b, rm).0.as_f64()); + assert_eq!(Float::mul_with_rm(&a, &b, rm).as_f64(), + Float::mul_with_status(&a, &b, rm).0.as_f64()); + assert_eq!(Float::div_with_rm(&a, &b, rm).as_f64(), + Float::div_with_status(&a, &b, rm).0.as_f64()); + assert_eq!(a.cast_with_rm(FP64, rm).as_f64(), + a.cast_with_status(FP64, rm).0.as_f64()); +} diff --git a/src/cast.rs b/src/cast.rs index 6774b39..223f9ba 100644 --- a/src/cast.rs +++ b/src/cast.rs @@ -6,7 +6,7 @@ use crate::FP128; use super::bigint::BigInt; use super::bigint::LossFraction; use super::float::{self, Category}; -use super::float::{Float, RoundingMode, FP32, FP64}; +use super::float::{Float, RoundingMode, Status, FP32, FP64}; use super::utils; use super::utils::mask; @@ -22,7 +22,7 @@ impl Float { pub fn from_bigint(sem: Semantics, val: BigInt) -> Self { let mut a = Self::from_parts(sem, false, sem.get_mantissa_len() as i64, val); - a.normalize(sem.get_rounding_mode(), LossFraction::ExactlyZero); + let _ = a.normalize(sem.get_rounding_mode(), LossFraction::ExactlyZero); a } @@ -194,6 +194,12 @@ impl Float { /// Cast to another float using the non-default rounding mode `rm`. pub fn cast_with_rm(&self, to: Semantics, rm: RoundingMode) -> Float { + self.cast_with_status(to, rm).0 + } + + /// Cast to another float using the rounding mode `rm`, returning the + /// result and IEEE 754 exception status flags. + pub fn cast_with_status(&self, to: Semantics, rm: RoundingMode) -> (Float, Status) { let mut loss = LossFraction::ExactlyZero; let exp_delta = self.get_mantissa_len() as i64 - to.get_mantissa_len() as i64; @@ -213,12 +219,14 @@ impl Float { temp.get_category(), ); // Don't normalize if this is a nop conversion. - if to.get_exponent_len() != self.get_exponent_len() + let status = if to.get_exponent_len() != self.get_exponent_len() || to.get_mantissa_len() != self.get_mantissa_len() { - x.normalize(rm, loss); - } - x + x.normalize(rm, loss) + } else { + Status::default() + }; + (x, status) } /// Convert from one float format to another. pub fn cast(&self, to: Semantics) -> Float { diff --git a/src/float.rs b/src/float.rs index c0d8230..575decf 100644 --- a/src/float.rs +++ b/src/float.rs @@ -31,6 +31,35 @@ impl RoundingMode { } } +/// IEEE 754 exception status flags. +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] +pub struct Status { + pub inexact: bool, + pub underflow: bool, + pub overflow: bool, + pub invalid: bool, + pub divide_by_zero: bool, +} + +impl core::ops::BitOr for Status { + type Output = Self; + fn bitor(self, rhs: Self) -> Self { + Status { + inexact: self.inexact || rhs.inexact, + underflow: self.underflow || rhs.underflow, + overflow: self.overflow || rhs.overflow, + invalid: self.invalid || rhs.invalid, + divide_by_zero: self.divide_by_zero || rhs.divide_by_zero, + } + } +} + +impl core::ops::BitOrAssign for Status { + fn bitor_assign(&mut self, rhs: Self) { + *self = *self | rhs; + } +} + /// Controls the semantics of a floating point number with: /// 'precision', that determines the number of bits, 'exponent' that controls /// the dynamic range of the number, and rounding mode that controls how @@ -542,12 +571,13 @@ impl Float { /// Normalize the number by adjusting the exponent to the legal range, shift /// the mantissa to the msb, and round the number if bits are lost. This is /// based on Neil Booth' implementation in APFloat. - pub(crate) fn normalize(&mut self, rm: RoundingMode, loss: LossFraction) { + pub(crate) fn normalize(&mut self, rm: RoundingMode, loss: LossFraction) -> Status { if !self.is_normal() { - return; + return Status::default(); } let mut loss = loss; let bounds = self.get_exp_bounds(); + let mut is_underflow = false; let nmsb = self.mantissa.msb_index() as i64; @@ -560,12 +590,13 @@ impl Float { if self.exp + exp_change > bounds.1 { self.overflow(rm); self.check_bounds(); - return; + return Status { overflow: true, inexact: true, ..Default::default() }; } // Handle underflowing low exponents. Don't allow to go below the // legal exponent range. if self.exp + exp_change < bounds.0 { + is_underflow = true; exp_change = bounds.0 - self.exp; } @@ -573,7 +604,7 @@ impl Float { // Handle reducing the exponent. debug_assert!(loss.is_exactly_zero(), "losing information"); self.shift_significand_left(-exp_change as u64); - return; + return Status::default(); } if exp_change > 0 { @@ -590,9 +621,9 @@ impl Float { // Canonicalize to zero. if self.mantissa.is_zero() { *self = Self::zero(self.sem, self.sign); - return; + return Status::default(); } - return; + return Status::default(); } // Check if we need to round away from zero. @@ -612,7 +643,7 @@ impl Float { self.shift_significand_right(1); } else { *self = Self::inf(self.sem, self.sign); - return; + return Status { overflow: true, inexact: true, ..Default::default() }; } } } @@ -621,6 +652,8 @@ impl Float { if self.mantissa.is_zero() { *self = Self::zero(self.sem, self.sign); } + + Status { inexact: true, underflow: is_underflow, ..Default::default() } } // round. } diff --git a/src/lib.rs b/src/lib.rs index 98ed239..1a83ad0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -132,7 +132,7 @@ mod string; mod utils; pub use self::bigint::BigInt; -pub use self::float::Float; +pub use self::float::{Float, Status}; pub use self::float::RoundingMode; pub use self::float::Semantics; pub use self::float::{BF16, FP128, FP16, FP256, FP32, FP64}; diff --git a/src/operations/functions.rs b/src/operations/functions.rs index 8816722..ee2c88e 100644 --- a/src/operations/functions.rs +++ b/src/operations/functions.rs @@ -218,7 +218,7 @@ impl Float { self.get_exp() + scale, self.get_mantissa(), ); - r.normalize(rm, LossFraction::ExactlyZero); + let _ = r.normalize(rm, LossFraction::ExactlyZero); r }