1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
//! The various algorithms from the paper.

use crate::cmp::min;
use crate::cmp::Ordering::{Less, Equal, Greater};
use crate::num::diy_float::Fp;
use crate::num::dec2flt::table;
use crate::num::dec2flt::rawfp::{self, Unpacked, RawFloat, fp_to_float, next_float, prev_float};
use crate::num::dec2flt::num::{self, Big};

/// Number of significand bits in Fp
const P: u32 = 64;

// We simply store the best approximation for *all* exponents, so the variable "h" and the
// associated conditions can be omitted. This trades performance for a couple kilobytes of space.

fn power_of_ten(e: i16) -> Fp {
    assert!(e >= table::MIN_E);
    let i = e - table::MIN_E;
    let sig = table::POWERS.0[i as usize];
    let exp = table::POWERS.1[i as usize];
    Fp { f: sig, e: exp }
}

// In most architectures, floating point operations have an explicit bit size, therefore the
// precision of the computation is determined on a per-operation basis.
#[cfg(any(not(target_arch="x86"), target_feature="sse2"))]
mod fpu_precision {
    pub fn set_precision<T>() { }
}

// On x86, the x87 FPU is used for float operations if the SSE/SSE2 extensions are not available.
// The x87 FPU operates with 80 bits of precision by default, which means that operations will
// round to 80 bits causing double rounding to happen when values are eventually represented as
// 32/64 bit float values. To overcome this, the FPU control word can be set so that the
// computations are performed in the desired precision.
#[cfg(all(target_arch="x86", not(target_feature="sse2")))]
mod fpu_precision {
    use crate::mem::size_of;

    /// A structure used to preserve the original value of the FPU control word, so that it can be
    /// restored when the structure is dropped.
    ///
    /// The x87 FPU is a 16-bits register whose fields are as follows:
    ///
    /// | 12-15 | 10-11 | 8-9 | 6-7 |  5 |  4 |  3 |  2 |  1 |  0 |
    /// |------:|------:|----:|----:|---:|---:|---:|---:|---:|---:|
    /// |       | RC    | PC  |     | PM | UM | OM | ZM | DM | IM |
    ///
    /// The documentation for all of the fields is available in the IA-32 Architectures Software
    /// Developer's Manual (Volume 1).
    ///
    /// The only field which is relevant for the following code is PC, Precision Control. This
    /// field determines the precision of the operations performed by the  FPU. It can be set to:
    ///  - 0b00, single precision i.e., 32-bits
    ///  - 0b10, double precision i.e., 64-bits
    ///  - 0b11, double extended precision i.e., 80-bits (default state)
    /// The 0b01 value is reserved and should not be used.
    pub struct FPUControlWord(u16);

    fn set_cw(cw: u16) {
        unsafe { asm!("fldcw $0" :: "m" (cw) :: "volatile") }
    }

    /// Sets the precision field of the FPU to `T` and returns a `FPUControlWord`.
    pub fn set_precision<T>() -> FPUControlWord {
        let cw = 0u16;

        // Compute the value for the Precision Control field that is appropriate for `T`.
        let cw_precision = match size_of::<T>() {
            4 => 0x0000, // 32 bits
            8 => 0x0200, // 64 bits
            _ => 0x0300, // default, 80 bits
        };

        // Get the original value of the control word to restore it later, when the
        // `FPUControlWord` structure is dropped
        unsafe { asm!("fnstcw $0" : "=*m" (&cw) ::: "volatile") }

        // Set the control word to the desired precision. This is achieved by masking away the old
        // precision (bits 8 and 9, 0x300) and replacing it with the precision flag computed above.
        set_cw((cw & 0xFCFF) | cw_precision);

        FPUControlWord(cw)
    }

    impl Drop for FPUControlWord {
        fn drop(&mut self) {
            set_cw(self.0)
        }
    }
}

/// The fast path of Bellerophon using machine-sized integers and floats.
///
/// This is extracted into a separate function so that it can be attempted before constructing
/// a bignum.
pub fn fast_path<T: RawFloat>(integral: &[u8], fractional: &[u8], e: i64) -> Option<T> {
    let num_digits = integral.len() + fractional.len();
    // log_10(f64::MAX_SIG) ~ 15.95. We compare the exact value to MAX_SIG near the end,
    // this is just a quick, cheap rejection (and also frees the rest of the code from
    // worrying about underflow).
    if num_digits > 16 {
        return None;
    }
    if e.abs() >= T::CEIL_LOG5_OF_MAX_SIG as i64 {
        return None;
    }
    let f = num::from_str_unchecked(integral.iter().chain(fractional.iter()));
    if f > T::MAX_SIG {
        return None;
    }

    // The fast path crucially depends on arithmetic being rounded to the correct number of bits
    // without any intermediate rounding. On x86 (without SSE or SSE2) this requires the precision
    // of the x87 FPU stack to be changed so that it directly rounds to 64/32 bit.
    // The `set_precision` function takes care of setting the precision on architectures which
    // require setting it by changing the global state (like the control word of the x87 FPU).
    let _cw = fpu_precision::set_precision::<T>();

    // The case e < 0 cannot be folded into the other branch. Negative powers result in
    // a repeating fractional part in binary, which are rounded, which causes real
    // (and occasionally quite significant!) errors in the final result.
    if e >= 0 {
        Some(T::from_int(f) * T::short_fast_pow10(e as usize))
    } else {
        Some(T::from_int(f) / T::short_fast_pow10(e.abs() as usize))
    }
}

/// Algorithm Bellerophon is trivial code justified by non-trivial numeric analysis.
///
/// It rounds ``f`` to a float with 64 bit significand and multiplies it by the best approximation
/// of `10^e` (in the same floating point format). This is often enough to get the correct result.
/// However, when the result is close to halfway between two adjacent (ordinary) floats, the
/// compound rounding error from multiplying two approximation means the result may be off by a
/// few bits. When this happens, the iterative Algorithm R fixes things up.
///
/// The hand-wavy "close to halfway" is made precise by the numeric analysis in the paper.
/// In the words of Clinger:
///
/// > Slop, expressed in units of the least significant bit, is an inclusive bound for the error
/// > accumulated during the floating point calculation of the approximation to f * 10^e. (Slop is
/// > not a bound for the true error, but bounds the difference between the approximation z and
/// > the best possible approximation that uses p bits of significand.)
pub fn bellerophon<T: RawFloat>(f: &Big, e: i16) -> T {
    let slop;
    if f <= &Big::from_u64(T::MAX_SIG) {
        // The cases abs(e) < log5(2^N) are in fast_path()
        slop = if e >= 0 { 0 } else { 3 };
    } else {
        slop = if e >= 0 { 1 } else { 4 };
    }
    let z = rawfp::big_to_fp(f).mul(&power_of_ten(e)).normalize();
    let exp_p_n = 1 << (P - T::SIG_BITS as u32);
    let lowbits: i64 = (z.f % exp_p_n) as i64;
    // Is the slop large enough to make a difference when
    // rounding to n bits?
    if (lowbits - exp_p_n as i64 / 2).abs() <= slop {
        algorithm_r(f, e, fp_to_float(z))
    } else {
        fp_to_float(z)
    }
}

/// An iterative algorithm that improves a floating point approximation of `f * 10^e`.
///
/// Each iteration gets one unit in the last place closer, which of course takes terribly long to
/// converge if `z0` is even mildly off. Luckily, when used as fallback for Bellerophon, the
/// starting approximation is off by at most one ULP.
fn algorithm_r<T: RawFloat>(f: &Big, e: i16, z0: T) -> T {
    let mut z = z0;
    loop {
        let raw = z.unpack();
        let (m, k) = (raw.sig, raw.k);
        let mut x = f.clone();
        let mut y = Big::from_u64(m);

        // Find positive integers `x`, `y` such that `x / y` is exactly `(f * 10^e) / (m * 2^k)`.
        // This not only avoids dealing with the signs of `e` and `k`, we also eliminate the
        // power of two common to `10^e` and `2^k` to make the numbers smaller.
        make_ratio(&mut x, &mut y, e, k);

        let m_digits = [(m & 0xFF_FF_FF_FF) as u32, (m >> 32) as u32];
        // This is written a bit awkwardly because our bignums don't support
        // negative numbers, so we use the absolute value + sign information.
        // The multiplication with m_digits can't overflow. If `x` or `y` are large enough that
        // we need to worry about overflow, then they are also large enough that `make_ratio` has
        // reduced the fraction by a factor of 2^64 or more.
        let (d2, d_negative) = if x >= y {
            // Don't need x any more, save a clone().
            x.sub(&y).mul_pow2(1).mul_digits(&m_digits);
            (x, false)
        } else {
            // Still need y - make a copy.
            let mut y = y.clone();
            y.sub(&x).mul_pow2(1).mul_digits(&m_digits);
            (y, true)
        };

        if d2 < y {
            let mut d2_double = d2;
            d2_double.mul_pow2(1);
            if m == T::MIN_SIG && d_negative && d2_double > y {
                z = prev_float(z);
            } else {
                return z;
            }
        } else if d2 == y {
            if m % 2 == 0 {
                if m == T::MIN_SIG && d_negative {
                    z = prev_float(z);
                } else {
                    return z;
                }
            } else if d_negative {
                z = prev_float(z);
            } else {
                z = next_float(z);
            }
        } else if d_negative {
            z = prev_float(z);
        } else {
            z = next_float(z);
        }
    }
}

/// Given `x = f` and `y = m` where `f` represent input decimal digits as usual and `m` is the
/// significand of a floating point approximation, make the ratio `x / y` equal to
/// `(f * 10^e) / (m * 2^k)`, possibly reduced by a power of two both have in common.
fn make_ratio(x: &mut Big, y: &mut Big, e: i16, k: i16) {
    let (e_abs, k_abs) = (e.abs() as usize, k.abs() as usize);
    if e >= 0 {
        if k >= 0 {
            // x = f * 10^e, y = m * 2^k, except that we reduce the fraction by some power of two.
            let common = min(e_abs, k_abs);
            x.mul_pow5(e_abs).mul_pow2(e_abs - common);
            y.mul_pow2(k_abs - common);
        } else {
            // x = f * 10^e * 2^abs(k), y = m
            // This can't overflow because it requires positive `e` and negative `k`, which can
            // only happen for values extremely close to 1, which means that `e` and `k` will be
            // comparatively tiny.
            x.mul_pow5(e_abs).mul_pow2(e_abs + k_abs);
        }
    } else {
        if k >= 0 {
            // x = f, y = m * 10^abs(e) * 2^k
            // This can't overflow either, see above.
            y.mul_pow5(e_abs).mul_pow2(k_abs + e_abs);
        } else {
            // x = f * 2^abs(k), y = m * 10^abs(e), again reducing by a common power of two.
            let common = min(e_abs, k_abs);
            x.mul_pow2(k_abs - common);
            y.mul_pow5(e_abs).mul_pow2(e_abs - common);
        }
    }
}

/// Conceptually, Algorithm M is the simplest way to convert a decimal to a float.
///
/// We form a ratio that is equal to `f * 10^e`, then throwing in powers of two until it gives
/// a valid float significand. The binary exponent `k` is the number of times we multiplied
/// numerator or denominator by two, i.e., at all times `f * 10^e` equals `(u / v) * 2^k`.
/// When we have found out significand, we only need to round by inspecting the remainder of the
/// division, which is done in helper functions further below.
///
/// This algorithm is super slow, even with the optimization described in `quick_start()`.
/// However, it's the simplest of the algorithms to adapt for overflow, underflow, and subnormal
/// results. This implementation takes over when Bellerophon and Algorithm R are overwhelmed.
/// Detecting underflow and overflow is easy: The ratio still isn't an in-range significand,
/// yet the minimum/maximum exponent has been reached. In the case of overflow, we simply return
/// infinity.
///
/// Handling underflow and subnormals is trickier. One big problem is that, with the minimum
/// exponent, the ratio might still be too large for a significand. See underflow() for details.
pub fn algorithm_m<T: RawFloat>(f: &Big, e: i16) -> T {
    let mut u;
    let mut v;
    let e_abs = e.abs() as usize;
    let mut k = 0;
    if e < 0 {
        u = f.clone();
        v = Big::from_small(1);
        v.mul_pow5(e_abs).mul_pow2(e_abs);
    } else {
        // FIXME possible optimization: generalize big_to_fp so that we can do the equivalent of
        // fp_to_float(big_to_fp(u)) here, only without the double rounding.
        u = f.clone();
        u.mul_pow5(e_abs).mul_pow2(e_abs);
        v = Big::from_small(1);
    }
    quick_start::<T>(&mut u, &mut v, &mut k);
    let mut rem = Big::from_small(0);
    let mut x = Big::from_small(0);
    let min_sig = Big::from_u64(T::MIN_SIG);
    let max_sig = Big::from_u64(T::MAX_SIG);
    loop {
        u.div_rem(&v, &mut x, &mut rem);
        if k == T::MIN_EXP_INT {
            // We have to stop at the minimum exponent, if we wait until `k < T::MIN_EXP_INT`,
            // then we'd be off by a factor of two. Unfortunately this means we have to special-
            // case normal numbers with the minimum exponent.
            // FIXME find a more elegant formulation, but run the `tiny-pow10` test to make sure
            // that it's actually correct!
            if x >= min_sig && x <= max_sig {
                break;
            }
            return underflow(x, v, rem);
        }
        if k > T::MAX_EXP_INT {
            return T::INFINITY;
        }
        if x < min_sig {
            u.mul_pow2(1);
            k -= 1;
        } else if x > max_sig {
            v.mul_pow2(1);
            k += 1;
        } else {
            break;
        }
    }
    let q = num::to_u64(&x);
    let z = rawfp::encode_normal(Unpacked::new(q, k));
    round_by_remainder(v, rem, q, z)
}

/// Skips over most Algorithm M iterations by checking the bit length.
fn quick_start<T: RawFloat>(u: &mut Big, v: &mut Big, k: &mut i16) {
    // The bit length is an estimate of the base two logarithm, and log(u / v) = log(u) - log(v).
    // The estimate is off by at most 1, but always an under-estimate, so the error on log(u)
    // and log(v) are of the same sign and cancel out (if both are large). Therefore the error
    // for log(u / v) is at most one as well.
    // The target ratio is one where u/v is in an in-range significand. Thus our termination
    // condition is log2(u / v) being the significand bits, plus/minus one.
    // FIXME Looking at the second bit could improve the estimate and avoid some more divisions.
    let target_ratio = T::SIG_BITS as i16;
    let log2_u = u.bit_length() as i16;
    let log2_v = v.bit_length() as i16;
    let mut u_shift: i16 = 0;
    let mut v_shift: i16 = 0;
    assert!(*k == 0);
    loop {
        if *k == T::MIN_EXP_INT {
            // Underflow or subnormal. Leave it to the main function.
            break;
        }
        if *k == T::MAX_EXP_INT {
            // Overflow. Leave it to the main function.
            break;
        }
        let log2_ratio = (log2_u + u_shift) - (log2_v + v_shift);
        if log2_ratio < target_ratio - 1 {
            u_shift += 1;
            *k -= 1;
        } else if log2_ratio > target_ratio + 1 {
            v_shift += 1;
            *k += 1;
        } else {
            break;
        }
    }
    u.mul_pow2(u_shift as usize);
    v.mul_pow2(v_shift as usize);
}

fn underflow<T: RawFloat>(x: Big, v: Big, rem: Big) -> T {
    if x < Big::from_u64(T::MIN_SIG) {
        let q = num::to_u64(&x);
        let z = rawfp::encode_subnormal(q);
        return round_by_remainder(v, rem, q, z);
    }
    // Ratio isn't an in-range significand with the minimum exponent, so we need to round off
    // excess bits and adjust the exponent accordingly. The real value now looks like this:
    //
    //        x        lsb
    // /--------------\/
    // 1010101010101010.10101010101010 * 2^k
    // \-----/\-------/ \------------/
    //    q     trunc.    (represented by rem)
    //
    // Therefore, when the rounded-off bits are != 0.5 ULP, they decide the rounding
    // on their own. When they are equal and the remainder is non-zero, the value still
    // needs to be rounded up. Only when the rounded off bits are 1/2 and the remainder
    // is zero, we have a half-to-even situation.
    let bits = x.bit_length();
    let lsb = bits - T::SIG_BITS as usize;
    let q = num::get_bits(&x, lsb, bits);
    let k = T::MIN_EXP_INT + lsb as i16;
    let z = rawfp::encode_normal(Unpacked::new(q, k));
    let q_even = q % 2 == 0;
    match num::compare_with_half_ulp(&x, lsb) {
        Greater => next_float(z),
        Less => z,
        Equal if rem.is_zero() && q_even => z,
        Equal => next_float(z),
    }
}

/// Ordinary round-to-even, obfuscated by having to round based on the remainder of a division.
fn round_by_remainder<T: RawFloat>(v: Big, r: Big, q: u64, z: T) -> T {
    let mut v_minus_r = v;
    v_minus_r.sub(&r);
    if r < v_minus_r {
        z
    } else if r > v_minus_r {
        next_float(z)
    } else if q % 2 == 0 {
        z
    } else {
        next_float(z)
    }
}