1 // Copyright 2014 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 5 // This file implements multi-precision floating-point numbers.
6 // Like in the GNU MPFR library (https://www.mpfr.org/), operands
7 // can be of mixed precision. Unlike MPFR, the rounding mode is
8 // not specified with each operation, but with each operand. The
9 // rounding mode of the result operand determines the rounding
10 // mode of an operation. This is a from-scratch implementation.
11 12 package big
13 14 import (
15 "fmt"
16 "math"
17 "math/bits"
18 )
19 20 const debugFloat = false // enable for debugging
21 22 // A nonzero finite Float represents a multi-precision floating point number
23 //
24 // sign × mantissa × 2**exponent
25 //
26 // with 0.5 <= mantissa < 1.0, and MinExp <= exponent <= MaxExp.
27 // A Float may also be zero (+0, -0) or infinite (+Inf, -Inf).
28 // All Floats are ordered, and the ordering of two Floats x and y
29 // is defined by x.Cmp(y).
30 //
31 // Each Float value also has a precision, rounding mode, and accuracy.
32 // The precision is the maximum number of mantissa bits available to
33 // represent the value. The rounding mode specifies how a result should
34 // be rounded to fit into the mantissa bits, and accuracy describes the
35 // rounding error with respect to the exact result.
36 //
37 // Unless specified otherwise, all operations (including setters) that
38 // specify a *Float variable for the result (usually via the receiver
39 // with the exception of [Float.MantExp]), round the numeric result according
40 // to the precision and rounding mode of the result variable.
41 //
42 // If the provided result precision is 0 (see below), it is set to the
43 // precision of the argument with the largest precision value before any
44 // rounding takes place, and the rounding mode remains unchanged. Thus,
45 // uninitialized Floats provided as result arguments will have their
46 // precision set to a reasonable value determined by the operands, and
47 // their mode is the zero value for RoundingMode (ToNearestEven).
48 //
49 // By setting the desired precision to 24 or 53 and using matching rounding
50 // mode (typically [ToNearestEven]), Float operations produce the same results
51 // as the corresponding float32 or float64 IEEE 754 arithmetic for operands
52 // that correspond to normal (i.e., not denormal) float32 or float64 numbers.
53 // Exponent underflow and overflow lead to a 0 or an Infinity for different
54 // values than IEEE 754 because Float exponents have a much larger range.
55 //
56 // The zero (uninitialized) value for a Float is ready to use and represents
57 // the number +0.0 exactly, with precision 0 and rounding mode [ToNearestEven].
58 //
59 // Operations always take pointer arguments (*Float) rather
60 // than Float values, and each unique Float value requires
61 // its own unique *Float pointer. To "copy" a Float value,
62 // an existing (or newly allocated) Float must be set to
63 // a new value using the [Float.Set] method; shallow copies
64 // of Floats are not supported and may lead to errors.
65 type Float struct {
66 prec uint32
67 mode RoundingMode
68 acc Accuracy
69 form form
70 neg bool
71 mant nat
72 exp int32
73 }
74 75 // An ErrNaN panic is raised by a [Float] operation that would lead to
76 // a NaN under IEEE 754 rules. An ErrNaN implements the error interface.
77 type ErrNaN struct {
78 msg []byte
79 }
80 81 func (err ErrNaN) Error() string {
82 return err.msg
83 }
84 85 // NewFloat allocates and returns a new [Float] set to x,
86 // with precision 53 and rounding mode [ToNearestEven].
87 // NewFloat panics with [ErrNaN] if x is a NaN.
88 func NewFloat(x float64) *Float {
89 if math.IsNaN(x) {
90 panic(ErrNaN{"NewFloat(NaN)"})
91 }
92 return (&Float{}).SetFloat64(x)
93 }
94 95 // Exponent and precision limits.
96 const (
97 MaxExp = math.MaxInt32 // largest supported exponent
98 MinExp = math.MinInt32 // smallest supported exponent
99 MaxPrec = math.MaxUint32 // largest (theoretically) supported precision; likely memory-limited
100 )
101 102 // Internal representation: The mantissa bits x.mant of a nonzero finite
103 // Float x are stored in a nat slice long enough to hold up to x.prec bits;
104 // the slice may (but doesn't have to) be shorter if the mantissa contains
105 // trailing 0 bits. x.mant is normalized if the msb of x.mant == 1 (i.e.,
106 // the msb is shifted all the way "to the left"). Thus, if the mantissa has
107 // trailing 0 bits or x.prec is not a multiple of the Word size _W,
108 // x.mant[0] has trailing zero bits. The msb of the mantissa corresponds
109 // to the value 0.5; the exponent x.exp shifts the binary point as needed.
110 //
111 // A zero or non-finite Float x ignores x.mant and x.exp.
112 //
113 // x form neg mant exp
114 // ----------------------------------------------------------
115 // ±0 zero sign - -
116 // 0 < |x| < +Inf finite sign mantissa exponent
117 // ±Inf inf sign - -
118 119 // A form value describes the internal representation.
120 type form byte
121 122 // The form value order is relevant - do not change!
123 const (
124 zero form = iota
125 finite
126 inf
127 )
128 129 // RoundingMode determines how a [Float] value is rounded to the
130 // desired precision. Rounding may change the [Float] value; the
131 // rounding error is described by the [Float]'s [Accuracy].
132 type RoundingMode byte
133 134 // These constants define supported rounding modes.
135 const (
136 ToNearestEven RoundingMode = iota // == IEEE 754-2008 roundTiesToEven
137 ToNearestAway // == IEEE 754-2008 roundTiesToAway
138 ToZero // == IEEE 754-2008 roundTowardZero
139 AwayFromZero // no IEEE 754-2008 equivalent
140 ToNegativeInf // == IEEE 754-2008 roundTowardNegative
141 ToPositiveInf // == IEEE 754-2008 roundTowardPositive
142 )
143 144 //go:generate stringer -type=RoundingMode
145 146 // Accuracy describes the rounding error produced by the most recent
147 // operation that generated a [Float] value, relative to the exact value.
148 type Accuracy int8
149 150 // Constants describing the [Accuracy] of a [Float].
151 const (
152 Below Accuracy = -1
153 Exact Accuracy = 0
154 Above Accuracy = +1
155 )
156 157 //go:generate stringer -type=Accuracy
158 159 // SetPrec sets z's precision to prec and returns the (possibly) rounded
160 // value of z. Rounding occurs according to z's rounding mode if the mantissa
161 // cannot be represented in prec bits without loss of precision.
162 // SetPrec(0) maps all finite values to ±0; infinite values remain unchanged.
163 // If prec > [MaxPrec], it is set to [MaxPrec].
164 func (z *Float) SetPrec(prec uint) *Float {
165 z.acc = Exact // optimistically assume no rounding is needed
166 167 // special case
168 if prec == 0 {
169 z.prec = 0
170 if z.form == finite {
171 // truncate z to 0
172 z.acc = makeAcc(z.neg)
173 z.form = zero
174 }
175 return z
176 }
177 178 // general case
179 if prec > MaxPrec {
180 prec = MaxPrec
181 }
182 old := z.prec
183 z.prec = uint32(prec)
184 if z.prec < old {
185 z.round(0)
186 }
187 return z
188 }
189 190 func makeAcc(above bool) Accuracy {
191 if above {
192 return Above
193 }
194 return Below
195 }
196 197 // SetMode sets z's rounding mode to mode and returns an exact z.
198 // z remains unchanged otherwise.
199 // z.SetMode(z.Mode()) is a cheap way to set z's accuracy to [Exact].
200 func (z *Float) SetMode(mode RoundingMode) *Float {
201 z.mode = mode
202 z.acc = Exact
203 return z
204 }
205 206 // Prec returns the mantissa precision of x in bits.
207 // The result may be 0 for |x| == 0 and |x| == Inf.
208 func (x *Float) Prec() uint {
209 return uint(x.prec)
210 }
211 212 // MinPrec returns the minimum precision required to represent x exactly
213 // (i.e., the smallest prec before x.SetPrec(prec) would start rounding x).
214 // The result is 0 for |x| == 0 and |x| == Inf.
215 func (x *Float) MinPrec() uint {
216 if x.form != finite {
217 return 0
218 }
219 return uint(len(x.mant))*_W - x.mant.trailingZeroBits()
220 }
221 222 // Mode returns the rounding mode of x.
223 func (x *Float) Mode() RoundingMode {
224 return x.mode
225 }
226 227 // Acc returns the accuracy of x produced by the most recent
228 // operation, unless explicitly documented otherwise by that
229 // operation.
230 func (x *Float) Acc() Accuracy {
231 return x.acc
232 }
233 234 // Sign returns:
235 // - -1 if x < 0;
236 // - 0 if x is ±0;
237 // - +1 if x > 0.
238 func (x *Float) Sign() int {
239 if debugFloat {
240 x.validate()
241 }
242 if x.form == zero {
243 return 0
244 }
245 if x.neg {
246 return -1
247 }
248 return 1
249 }
250 251 // MantExp breaks x into its mantissa and exponent components
252 // and returns the exponent. If a non-nil mant argument is
253 // provided its value is set to the mantissa of x, with the
254 // same precision and rounding mode as x. The components
255 // satisfy x == mant × 2**exp, with 0.5 <= |mant| < 1.0.
256 // Calling MantExp with a nil argument is an efficient way to
257 // get the exponent of the receiver.
258 //
259 // Special cases are:
260 //
261 // ( ±0).MantExp(mant) = 0, with mant set to ±0
262 // (±Inf).MantExp(mant) = 0, with mant set to ±Inf
263 //
264 // x and mant may be the same in which case x is set to its
265 // mantissa value.
266 func (x *Float) MantExp(mant *Float) (exp int) {
267 if debugFloat {
268 x.validate()
269 }
270 if x.form == finite {
271 exp = int(x.exp)
272 }
273 if mant != nil {
274 mant.Copy(x)
275 if mant.form == finite {
276 mant.exp = 0
277 }
278 }
279 return
280 }
281 282 func (z *Float) setExpAndRound(exp int64, sbit uint) {
283 if exp < MinExp {
284 // underflow
285 z.acc = makeAcc(z.neg)
286 z.form = zero
287 return
288 }
289 290 if exp > MaxExp {
291 // overflow
292 z.acc = makeAcc(!z.neg)
293 z.form = inf
294 return
295 }
296 297 z.form = finite
298 z.exp = int32(exp)
299 z.round(sbit)
300 }
301 302 // SetMantExp sets z to mant × 2**exp and returns z.
303 // The result z has the same precision and rounding mode
304 // as mant. SetMantExp is an inverse of [Float.MantExp] but does
305 // not require 0.5 <= |mant| < 1.0. Specifically, for a
306 // given x of type *[Float], SetMantExp relates to [Float.MantExp]
307 // as follows:
308 //
309 // mant := new(Float)
310 // new(Float).SetMantExp(mant, x.MantExp(mant)).Cmp(x) == 0
311 //
312 // Special cases are:
313 //
314 // z.SetMantExp( ±0, exp) = ±0
315 // z.SetMantExp(±Inf, exp) = ±Inf
316 //
317 // z and mant may be the same in which case z's exponent
318 // is set to exp.
319 func (z *Float) SetMantExp(mant *Float, exp int) *Float {
320 if debugFloat {
321 z.validate()
322 mant.validate()
323 }
324 z.Copy(mant)
325 326 if z.form == finite {
327 // 0 < |mant| < +Inf
328 z.setExpAndRound(int64(z.exp)+int64(exp), 0)
329 }
330 return z
331 }
332 333 // Signbit reports whether x is negative or negative zero.
334 func (x *Float) Signbit() bool {
335 return x.neg
336 }
337 338 // IsInf reports whether x is +Inf or -Inf.
339 func (x *Float) IsInf() bool {
340 return x.form == inf
341 }
342 343 // IsInt reports whether x is an integer.
344 // ±Inf values are not integers.
345 func (x *Float) IsInt() bool {
346 if debugFloat {
347 x.validate()
348 }
349 // special cases
350 if x.form != finite {
351 return x.form == zero
352 }
353 // x.form == finite
354 if x.exp <= 0 {
355 return false
356 }
357 // x.exp > 0
358 return x.prec <= uint32(x.exp) || x.MinPrec() <= uint(x.exp) // not enough bits for fractional mantissa
359 }
360 361 // debugging support
362 func (x *Float) validate() {
363 if !debugFloat {
364 // avoid performance bugs
365 panic("validate called but debugFloat is not set")
366 }
367 if msg := x.validate0(); msg != "" {
368 panic(msg)
369 }
370 }
371 372 func (x *Float) validate0() []byte {
373 if x.form != finite {
374 return ""
375 }
376 m := len(x.mant)
377 if m == 0 {
378 return "nonzero finite number with empty mantissa"
379 }
380 const msb = 1 << (_W - 1)
381 if x.mant[m-1]&msb == 0 {
382 return fmt.Sprintf("msb not set in last word %#x of %s", x.mant[m-1], x.Text('p', 0))
383 }
384 if x.prec == 0 {
385 return "zero precision finite number"
386 }
387 return ""
388 }
389 390 // round rounds z according to z.mode to z.prec bits and sets z.acc accordingly.
391 // sbit must be 0 or 1 and summarizes any "sticky bit" information one might
392 // have before calling round. z's mantissa must be normalized (with the msb set)
393 // or empty.
394 //
395 // CAUTION: The rounding modes [ToNegativeInf], [ToPositiveInf] are affected by the
396 // sign of z. For correct rounding, the sign of z must be set correctly before
397 // calling round.
398 func (z *Float) round(sbit uint) {
399 if debugFloat {
400 z.validate()
401 }
402 403 z.acc = Exact
404 if z.form != finite {
405 // ±0 or ±Inf => nothing left to do
406 return
407 }
408 // z.form == finite && len(z.mant) > 0
409 // m > 0 implies z.prec > 0 (checked by validate)
410 411 m := uint32(len(z.mant)) // present mantissa length in words
412 bits := m * _W // present mantissa bits; bits > 0
413 if bits <= z.prec {
414 // mantissa fits => nothing to do
415 return
416 }
417 // bits > z.prec
418 419 // Rounding is based on two bits: the rounding bit (rbit) and the
420 // sticky bit (sbit). The rbit is the bit immediately before the
421 // z.prec leading mantissa bits (the "0.5"). The sbit is set if any
422 // of the bits before the rbit are set (the "0.25", "0.125", etc.):
423 //
424 // rbit sbit => "fractional part"
425 //
426 // 0 0 == 0
427 // 0 1 > 0 , < 0.5
428 // 1 0 == 0.5
429 // 1 1 > 0.5, < 1.0
430 431 // bits > z.prec: mantissa too large => round
432 r := uint(bits - z.prec - 1) // rounding bit position; r >= 0
433 rbit := z.mant.bit(r) & 1 // rounding bit; be safe and ensure it's a single bit
434 // The sticky bit is only needed for rounding ToNearestEven
435 // or when the rounding bit is zero. Avoid computation otherwise.
436 if sbit == 0 && (rbit == 0 || z.mode == ToNearestEven) {
437 sbit = z.mant.sticky(r)
438 }
439 sbit &= 1 // be safe and ensure it's a single bit
440 441 // cut off extra words
442 n := (z.prec + (_W - 1)) / _W // mantissa length in words for desired precision
443 if m > n {
444 copy(z.mant, z.mant[m-n:]) // move n last words to front
445 z.mant = z.mant[:n]
446 }
447 448 // determine number of trailing zero bits (ntz) and compute lsb mask of mantissa's least-significant word
449 ntz := n*_W - z.prec // 0 <= ntz < _W
450 lsb := Word(1) << ntz
451 452 // round if result is inexact
453 if rbit|sbit != 0 {
454 // Make rounding decision: The result mantissa is truncated ("rounded down")
455 // by default. Decide if we need to increment, or "round up", the (unsigned)
456 // mantissa.
457 inc := false
458 switch z.mode {
459 case ToNegativeInf:
460 inc = z.neg
461 case ToZero:
462 // nothing to do
463 case ToNearestEven:
464 inc = rbit != 0 && (sbit != 0 || z.mant[0]&lsb != 0)
465 case ToNearestAway:
466 inc = rbit != 0
467 case AwayFromZero:
468 inc = true
469 case ToPositiveInf:
470 inc = !z.neg
471 default:
472 panic("unreachable")
473 }
474 475 // A positive result (!z.neg) is Above the exact result if we increment,
476 // and it's Below if we truncate (Exact results require no rounding).
477 // For a negative result (z.neg) it is exactly the opposite.
478 z.acc = makeAcc(inc != z.neg)
479 480 if inc {
481 // add 1 to mantissa
482 if addVW(z.mant, z.mant, lsb) != 0 {
483 // mantissa overflow => adjust exponent
484 if z.exp >= MaxExp {
485 // exponent overflow
486 z.form = inf
487 return
488 }
489 z.exp++
490 // adjust mantissa: divide by 2 to compensate for exponent adjustment
491 rshVU(z.mant, z.mant, 1)
492 // set msb == carry == 1 from the mantissa overflow above
493 const msb = 1 << (_W - 1)
494 z.mant[n-1] |= msb
495 }
496 }
497 }
498 499 // zero out trailing bits in least-significant word
500 z.mant[0] &^= lsb - 1
501 502 if debugFloat {
503 z.validate()
504 }
505 }
506 507 func (z *Float) setBits64(neg bool, x uint64) *Float {
508 if z.prec == 0 {
509 z.prec = 64
510 }
511 z.acc = Exact
512 z.neg = neg
513 if x == 0 {
514 z.form = zero
515 return z
516 }
517 // x != 0
518 z.form = finite
519 s := bits.LeadingZeros64(x)
520 z.mant = z.mant.setUint64(x << uint(s))
521 z.exp = int32(64 - s) // always fits
522 if z.prec < 64 {
523 z.round(0)
524 }
525 return z
526 }
527 528 // SetUint64 sets z to the (possibly rounded) value of x and returns z.
529 // If z's precision is 0, it is changed to 64 (and rounding will have
530 // no effect).
531 func (z *Float) SetUint64(x uint64) *Float {
532 return z.setBits64(false, x)
533 }
534 535 // SetInt64 sets z to the (possibly rounded) value of x and returns z.
536 // If z's precision is 0, it is changed to 64 (and rounding will have
537 // no effect).
538 func (z *Float) SetInt64(x int64) *Float {
539 u := x
540 if u < 0 {
541 u = -u
542 }
543 // We cannot simply call z.SetUint64(uint64(u)) and change
544 // the sign afterwards because the sign affects rounding.
545 return z.setBits64(x < 0, uint64(u))
546 }
547 548 // SetFloat64 sets z to the (possibly rounded) value of x and returns z.
549 // If z's precision is 0, it is changed to 53 (and rounding will have
550 // no effect). SetFloat64 panics with [ErrNaN] if x is a NaN.
551 func (z *Float) SetFloat64(x float64) *Float {
552 if z.prec == 0 {
553 z.prec = 53
554 }
555 if math.IsNaN(x) {
556 panic(ErrNaN{"Float.SetFloat64(NaN)"})
557 }
558 z.acc = Exact
559 z.neg = math.Signbit(x) // handle -0, -Inf correctly
560 if x == 0 {
561 z.form = zero
562 return z
563 }
564 if math.IsInf(x, 0) {
565 z.form = inf
566 return z
567 }
568 // normalized x != 0
569 z.form = finite
570 fmant, exp := math.Frexp(x) // get normalized mantissa
571 z.mant = z.mant.setUint64(1<<63 | math.Float64bits(fmant)<<11)
572 z.exp = int32(exp) // always fits
573 if z.prec < 53 {
574 z.round(0)
575 }
576 return z
577 }
578 579 // fnorm normalizes mantissa m by shifting it to the left
580 // such that the msb of the most-significant word (msw) is 1.
581 // It returns the shift amount. It assumes that len(m) != 0.
582 func fnorm(m nat) int64 {
583 if debugFloat && (len(m) == 0 || m[len(m)-1] == 0) {
584 panic("msw of mantissa is 0")
585 }
586 s := nlz(m[len(m)-1])
587 if s > 0 {
588 c := lshVU(m, m, s)
589 if debugFloat && c != 0 {
590 panic("nlz or lshVU incorrect")
591 }
592 }
593 return int64(s)
594 }
595 596 // SetInt sets z to the (possibly rounded) value of x and returns z.
597 // If z's precision is 0, it is changed to the larger of x.BitLen()
598 // or 64 (and rounding will have no effect).
599 func (z *Float) SetInt(x *Int) *Float {
600 // TODO(gri) can be more efficient if z.prec > 0
601 // but small compared to the size of x, or if there
602 // are many trailing 0's.
603 bits := uint32(x.BitLen())
604 if z.prec == 0 {
605 z.prec = max(bits, 64)
606 }
607 z.acc = Exact
608 z.neg = x.neg
609 if len(x.abs) == 0 {
610 z.form = zero
611 return z
612 }
613 // x != 0
614 z.mant = z.mant.set(x.abs)
615 fnorm(z.mant)
616 z.setExpAndRound(int64(bits), 0)
617 return z
618 }
619 620 // SetRat sets z to the (possibly rounded) value of x and returns z.
621 // If z's precision is 0, it is changed to the largest of a.BitLen(),
622 // b.BitLen(), or 64; with x = a/b.
623 func (z *Float) SetRat(x *Rat) *Float {
624 if x.IsInt() {
625 return z.SetInt(x.Num())
626 }
627 var a, b Float
628 a.SetInt(x.Num())
629 b.SetInt(x.Denom())
630 if z.prec == 0 {
631 z.prec = max(a.prec, b.prec)
632 }
633 return z.Quo(&a, &b)
634 }
635 636 // SetInf sets z to the infinite Float -Inf if signbit is
637 // set, or +Inf if signbit is not set, and returns z. The
638 // precision of z is unchanged and the result is always
639 // [Exact].
640 func (z *Float) SetInf(signbit bool) *Float {
641 z.acc = Exact
642 z.form = inf
643 z.neg = signbit
644 return z
645 }
646 647 // Set sets z to the (possibly rounded) value of x and returns z.
648 // If z's precision is 0, it is changed to the precision of x
649 // before setting z (and rounding will have no effect).
650 // Rounding is performed according to z's precision and rounding
651 // mode; and z's accuracy reports the result error relative to the
652 // exact (not rounded) result.
653 func (z *Float) Set(x *Float) *Float {
654 if debugFloat {
655 x.validate()
656 }
657 z.acc = Exact
658 if z != x {
659 z.form = x.form
660 z.neg = x.neg
661 if x.form == finite {
662 z.exp = x.exp
663 z.mant = z.mant.set(x.mant)
664 }
665 if z.prec == 0 {
666 z.prec = x.prec
667 } else if z.prec < x.prec {
668 z.round(0)
669 }
670 }
671 return z
672 }
673 674 // Copy sets z to x, with the same precision, rounding mode, and accuracy as x.
675 // Copy returns z. If x and z are identical, Copy is a no-op.
676 func (z *Float) Copy(x *Float) *Float {
677 if debugFloat {
678 x.validate()
679 }
680 if z != x {
681 z.prec = x.prec
682 z.mode = x.mode
683 z.acc = x.acc
684 z.form = x.form
685 z.neg = x.neg
686 if z.form == finite {
687 z.mant = z.mant.set(x.mant)
688 z.exp = x.exp
689 }
690 }
691 return z
692 }
693 694 // msb32 returns the 32 most significant bits of x.
695 func msb32(x nat) uint32 {
696 i := len(x) - 1
697 if i < 0 {
698 return 0
699 }
700 if debugFloat && x[i]&(1<<(_W-1)) == 0 {
701 panic("x not normalized")
702 }
703 switch _W {
704 case 32:
705 return uint32(x[i])
706 case 64:
707 return uint32(x[i] >> 32)
708 }
709 panic("unreachable")
710 }
711 712 // msb64 returns the 64 most significant bits of x.
713 func msb64(x nat) uint64 {
714 i := len(x) - 1
715 if i < 0 {
716 return 0
717 }
718 if debugFloat && x[i]&(1<<(_W-1)) == 0 {
719 panic("x not normalized")
720 }
721 switch _W {
722 case 32:
723 v := uint64(x[i]) << 32
724 if i > 0 {
725 v |= uint64(x[i-1])
726 }
727 return v
728 case 64:
729 return uint64(x[i])
730 }
731 panic("unreachable")
732 }
733 734 // Uint64 returns the unsigned integer resulting from truncating x
735 // towards zero. If 0 <= x <= [math.MaxUint64], the result is [Exact]
736 // if x is an integer and [Below] otherwise.
737 // The result is (0, [Above]) for x < 0, and ([math.MaxUint64], [Below])
738 // for x > [math.MaxUint64].
739 func (x *Float) Uint64() (uint64, Accuracy) {
740 if debugFloat {
741 x.validate()
742 }
743 744 switch x.form {
745 case finite:
746 if x.neg {
747 return 0, Above
748 }
749 // 0 < x < +Inf
750 if x.exp <= 0 {
751 // 0 < x < 1
752 return 0, Below
753 }
754 // 1 <= x < Inf
755 if x.exp <= 64 {
756 // u = trunc(x) fits into a uint64
757 u := msb64(x.mant) >> (64 - uint32(x.exp))
758 if x.MinPrec() <= 64 {
759 return u, Exact
760 }
761 return u, Below // x truncated
762 }
763 // x too large
764 return math.MaxUint64, Below
765 766 case zero:
767 return 0, Exact
768 769 case inf:
770 if x.neg {
771 return 0, Above
772 }
773 return math.MaxUint64, Below
774 }
775 776 panic("unreachable")
777 }
778 779 // Int64 returns the integer resulting from truncating x towards zero.
780 // If [math.MinInt64] <= x <= [math.MaxInt64], the result is [Exact] if x is
781 // an integer, and [Above] (x < 0) or [Below] (x > 0) otherwise.
782 // The result is ([math.MinInt64], [Above]) for x < [math.MinInt64],
783 // and ([math.MaxInt64], [Below]) for x > [math.MaxInt64].
784 func (x *Float) Int64() (int64, Accuracy) {
785 if debugFloat {
786 x.validate()
787 }
788 789 switch x.form {
790 case finite:
791 // 0 < |x| < +Inf
792 acc := makeAcc(x.neg)
793 if x.exp <= 0 {
794 // 0 < |x| < 1
795 return 0, acc
796 }
797 // x.exp > 0
798 799 // 1 <= |x| < +Inf
800 if x.exp <= 63 {
801 // i = trunc(x) fits into an int64 (excluding math.MinInt64)
802 i := int64(msb64(x.mant) >> (64 - uint32(x.exp)))
803 if x.neg {
804 i = -i
805 }
806 if x.MinPrec() <= uint(x.exp) {
807 return i, Exact
808 }
809 return i, acc // x truncated
810 }
811 if x.neg {
812 // check for special case x == math.MinInt64 (i.e., x == -(0.5 << 64))
813 if x.exp == 64 && x.MinPrec() == 1 {
814 acc = Exact
815 }
816 return math.MinInt64, acc
817 }
818 // x too large
819 return math.MaxInt64, Below
820 821 case zero:
822 return 0, Exact
823 824 case inf:
825 if x.neg {
826 return math.MinInt64, Above
827 }
828 return math.MaxInt64, Below
829 }
830 831 panic("unreachable")
832 }
833 834 // Float32 returns the float32 value nearest to x. If x is too small to be
835 // represented by a float32 (|x| < [math.SmallestNonzeroFloat32]), the result
836 // is (0, [Below]) or (-0, [Above]), respectively, depending on the sign of x.
837 // If x is too large to be represented by a float32 (|x| > [math.MaxFloat32]),
838 // the result is (+Inf, [Above]) or (-Inf, [Below]), depending on the sign of x.
839 func (x *Float) Float32() (float32, Accuracy) {
840 if debugFloat {
841 x.validate()
842 }
843 844 switch x.form {
845 case finite:
846 // 0 < |x| < +Inf
847 848 const (
849 fbits = 32 // float size
850 mbits = 23 // mantissa size (excluding implicit msb)
851 ebits = fbits - mbits - 1 // 8 exponent size
852 bias = 1<<(ebits-1) - 1 // 127 exponent bias
853 dmin = 1 - bias - mbits // -149 smallest unbiased exponent (denormal)
854 emin = 1 - bias // -126 smallest unbiased exponent (normal)
855 emax = bias // 127 largest unbiased exponent (normal)
856 )
857 858 // Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float32 mantissa.
859 e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0
860 861 // Compute precision p for float32 mantissa.
862 // If the exponent is too small, we have a denormal number before
863 // rounding and fewer than p mantissa bits of precision available
864 // (the exponent remains fixed but the mantissa gets shifted right).
865 p := mbits + 1 // precision of normal float
866 if e < emin {
867 // recompute precision
868 p = mbits + 1 - emin + int(e)
869 // If p == 0, the mantissa of x is shifted so much to the right
870 // that its msb falls immediately to the right of the float32
871 // mantissa space. In other words, if the smallest denormal is
872 // considered "1.0", for p == 0, the mantissa value m is >= 0.5.
873 // If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal.
874 // If m == 0.5, it is rounded down to even, i.e., 0.0.
875 // If p < 0, the mantissa value m is <= "0.25" which is never rounded up.
876 if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ {
877 // underflow to ±0
878 if x.neg {
879 var z float32
880 return -z, Above
881 }
882 return 0.0, Below
883 }
884 // otherwise, round up
885 // We handle p == 0 explicitly because it's easy and because
886 // Float.round doesn't support rounding to 0 bits of precision.
887 if p == 0 {
888 if x.neg {
889 return -math.SmallestNonzeroFloat32, Below
890 }
891 return math.SmallestNonzeroFloat32, Above
892 }
893 }
894 // p > 0
895 896 // round
897 var r Float
898 r.prec = uint32(p)
899 r.Set(x)
900 e = r.exp - 1
901 902 // Rounding may have caused r to overflow to ±Inf
903 // (rounding never causes underflows to 0).
904 // If the exponent is too large, also overflow to ±Inf.
905 if r.form == inf || e > emax {
906 // overflow
907 if x.neg {
908 return float32(math.Inf(-1)), Below
909 }
910 return float32(math.Inf(+1)), Above
911 }
912 // e <= emax
913 914 // Determine sign, biased exponent, and mantissa.
915 var sign, bexp, mant uint32
916 if x.neg {
917 sign = 1 << (fbits - 1)
918 }
919 920 // Rounding may have caused a denormal number to
921 // become normal. Check again.
922 if e < emin {
923 // denormal number: recompute precision
924 // Since rounding may have at best increased precision
925 // and we have eliminated p <= 0 early, we know p > 0.
926 // bexp == 0 for denormals
927 p = mbits + 1 - emin + int(e)
928 mant = msb32(r.mant) >> uint(fbits-p)
929 } else {
930 // normal number: emin <= e <= emax
931 bexp = uint32(e+bias) << mbits
932 mant = msb32(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
933 }
934 935 return math.Float32frombits(sign | bexp | mant), r.acc
936 937 case zero:
938 if x.neg {
939 var z float32
940 return -z, Exact
941 }
942 return 0.0, Exact
943 944 case inf:
945 if x.neg {
946 return float32(math.Inf(-1)), Exact
947 }
948 return float32(math.Inf(+1)), Exact
949 }
950 951 panic("unreachable")
952 }
953 954 // Float64 returns the float64 value nearest to x. If x is too small to be
955 // represented by a float64 (|x| < [math.SmallestNonzeroFloat64]), the result
956 // is (0, [Below]) or (-0, [Above]), respectively, depending on the sign of x.
957 // If x is too large to be represented by a float64 (|x| > [math.MaxFloat64]),
958 // the result is (+Inf, [Above]) or (-Inf, [Below]), depending on the sign of x.
959 func (x *Float) Float64() (float64, Accuracy) {
960 if debugFloat {
961 x.validate()
962 }
963 964 switch x.form {
965 case finite:
966 // 0 < |x| < +Inf
967 968 const (
969 fbits = 64 // float size
970 mbits = 52 // mantissa size (excluding implicit msb)
971 ebits = fbits - mbits - 1 // 11 exponent size
972 bias = 1<<(ebits-1) - 1 // 1023 exponent bias
973 dmin = 1 - bias - mbits // -1074 smallest unbiased exponent (denormal)
974 emin = 1 - bias // -1022 smallest unbiased exponent (normal)
975 emax = bias // 1023 largest unbiased exponent (normal)
976 )
977 978 // Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float64 mantissa.
979 e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0
980 981 // Compute precision p for float64 mantissa.
982 // If the exponent is too small, we have a denormal number before
983 // rounding and fewer than p mantissa bits of precision available
984 // (the exponent remains fixed but the mantissa gets shifted right).
985 p := mbits + 1 // precision of normal float
986 if e < emin {
987 // recompute precision
988 p = mbits + 1 - emin + int(e)
989 // If p == 0, the mantissa of x is shifted so much to the right
990 // that its msb falls immediately to the right of the float64
991 // mantissa space. In other words, if the smallest denormal is
992 // considered "1.0", for p == 0, the mantissa value m is >= 0.5.
993 // If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal.
994 // If m == 0.5, it is rounded down to even, i.e., 0.0.
995 // If p < 0, the mantissa value m is <= "0.25" which is never rounded up.
996 if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ {
997 // underflow to ±0
998 if x.neg {
999 var z float64
1000 return -z, Above
1001 }
1002 return 0.0, Below
1003 }
1004 // otherwise, round up
1005 // We handle p == 0 explicitly because it's easy and because
1006 // Float.round doesn't support rounding to 0 bits of precision.
1007 if p == 0 {
1008 if x.neg {
1009 return -math.SmallestNonzeroFloat64, Below
1010 }
1011 return math.SmallestNonzeroFloat64, Above
1012 }
1013 }
1014 // p > 0
1015 1016 // round
1017 var r Float
1018 r.prec = uint32(p)
1019 r.Set(x)
1020 e = r.exp - 1
1021 1022 // Rounding may have caused r to overflow to ±Inf
1023 // (rounding never causes underflows to 0).
1024 // If the exponent is too large, also overflow to ±Inf.
1025 if r.form == inf || e > emax {
1026 // overflow
1027 if x.neg {
1028 return math.Inf(-1), Below
1029 }
1030 return math.Inf(+1), Above
1031 }
1032 // e <= emax
1033 1034 // Determine sign, biased exponent, and mantissa.
1035 var sign, bexp, mant uint64
1036 if x.neg {
1037 sign = 1 << (fbits - 1)
1038 }
1039 1040 // Rounding may have caused a denormal number to
1041 // become normal. Check again.
1042 if e < emin {
1043 // denormal number: recompute precision
1044 // Since rounding may have at best increased precision
1045 // and we have eliminated p <= 0 early, we know p > 0.
1046 // bexp == 0 for denormals
1047 p = mbits + 1 - emin + int(e)
1048 mant = msb64(r.mant) >> uint(fbits-p)
1049 } else {
1050 // normal number: emin <= e <= emax
1051 bexp = uint64(e+bias) << mbits
1052 mant = msb64(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
1053 }
1054 1055 return math.Float64frombits(sign | bexp | mant), r.acc
1056 1057 case zero:
1058 if x.neg {
1059 var z float64
1060 return -z, Exact
1061 }
1062 return 0.0, Exact
1063 1064 case inf:
1065 if x.neg {
1066 return math.Inf(-1), Exact
1067 }
1068 return math.Inf(+1), Exact
1069 }
1070 1071 panic("unreachable")
1072 }
1073 1074 // Int returns the result of truncating x towards zero;
1075 // or nil if x is an infinity.
1076 // The result is [Exact] if x.IsInt(); otherwise it is [Below]
1077 // for x > 0, and [Above] for x < 0.
1078 // If a non-nil *[Int] argument z is provided, [Int] stores
1079 // the result in z instead of allocating a new [Int].
1080 func (x *Float) Int(z *Int) (*Int, Accuracy) {
1081 if debugFloat {
1082 x.validate()
1083 }
1084 1085 if z == nil && x.form <= finite {
1086 z = &Int{}
1087 }
1088 1089 switch x.form {
1090 case finite:
1091 // 0 < |x| < +Inf
1092 acc := makeAcc(x.neg)
1093 if x.exp <= 0 {
1094 // 0 < |x| < 1
1095 return z.SetInt64(0), acc
1096 }
1097 // x.exp > 0
1098 1099 // 1 <= |x| < +Inf
1100 // determine minimum required precision for x
1101 allBits := uint(len(x.mant)) * _W
1102 exp := uint(x.exp)
1103 if x.MinPrec() <= exp {
1104 acc = Exact
1105 }
1106 // shift mantissa as needed
1107 if z == nil {
1108 z = &Int{}
1109 }
1110 z.neg = x.neg
1111 switch {
1112 case exp > allBits:
1113 z.abs = z.abs.lsh(x.mant, exp-allBits)
1114 default:
1115 z.abs = z.abs.set(x.mant)
1116 case exp < allBits:
1117 z.abs = z.abs.rsh(x.mant, allBits-exp)
1118 }
1119 return z, acc
1120 1121 case zero:
1122 return z.SetInt64(0), Exact
1123 1124 case inf:
1125 return nil, makeAcc(x.neg)
1126 }
1127 1128 panic("unreachable")
1129 }
1130 1131 // Rat returns the rational number corresponding to x;
1132 // or nil if x is an infinity.
1133 // The result is [Exact] if x is not an Inf.
1134 // If a non-nil *[Rat] argument z is provided, [Rat] stores
1135 // the result in z instead of allocating a new [Rat].
1136 func (x *Float) Rat(z *Rat) (*Rat, Accuracy) {
1137 if debugFloat {
1138 x.validate()
1139 }
1140 1141 if z == nil && x.form <= finite {
1142 z = &Rat{}
1143 }
1144 1145 switch x.form {
1146 case finite:
1147 // 0 < |x| < +Inf
1148 allBits := int32(len(x.mant)) * _W
1149 // build up numerator and denominator
1150 z.a.neg = x.neg
1151 switch {
1152 case x.exp > allBits:
1153 z.a.abs = z.a.abs.lsh(x.mant, uint(x.exp-allBits))
1154 z.b.abs = z.b.abs[:0] // == 1 (see Rat)
1155 // z already in normal form
1156 default:
1157 z.a.abs = z.a.abs.set(x.mant)
1158 z.b.abs = z.b.abs[:0] // == 1 (see Rat)
1159 // z already in normal form
1160 case x.exp < allBits:
1161 z.a.abs = z.a.abs.set(x.mant)
1162 t := z.b.abs.setUint64(1)
1163 z.b.abs = t.lsh(t, uint(allBits-x.exp))
1164 z.norm()
1165 }
1166 return z, Exact
1167 1168 case zero:
1169 return z.SetInt64(0), Exact
1170 1171 case inf:
1172 return nil, makeAcc(x.neg)
1173 }
1174 1175 panic("unreachable")
1176 }
1177 1178 // Abs sets z to the (possibly rounded) value |x| (the absolute value of x)
1179 // and returns z.
1180 func (z *Float) Abs(x *Float) *Float {
1181 z.Set(x)
1182 z.neg = false
1183 return z
1184 }
1185 1186 // Neg sets z to the (possibly rounded) value of x with its sign negated,
1187 // and returns z.
1188 func (z *Float) Neg(x *Float) *Float {
1189 z.Set(x)
1190 z.neg = !z.neg
1191 return z
1192 }
1193 1194 func validateBinaryOperands(x, y *Float) {
1195 if !debugFloat {
1196 // avoid performance bugs
1197 panic("validateBinaryOperands called but debugFloat is not set")
1198 }
1199 if len(x.mant) == 0 {
1200 panic("empty mantissa for x")
1201 }
1202 if len(y.mant) == 0 {
1203 panic("empty mantissa for y")
1204 }
1205 }
1206 1207 // z = x + y, ignoring signs of x and y for the addition
1208 // but using the sign of z for rounding the result.
1209 // x and y must have a non-empty mantissa and valid exponent.
1210 func (z *Float) uadd(x, y *Float) {
1211 // Note: This implementation requires 2 shifts most of the
1212 // time. It is also inefficient if exponents or precisions
1213 // differ by wide margins. The following article describes
1214 // an efficient (but much more complicated) implementation
1215 // compatible with the internal representation used here:
1216 //
1217 // Vincent Lefèvre: "The Generic Multiple-Precision Floating-
1218 // Point Addition With Exact Rounding (as in the MPFR Library)"
1219 // http://www.vinc17.net/research/papers/rnc6.pdf
1220 1221 if debugFloat {
1222 validateBinaryOperands(x, y)
1223 }
1224 1225 // compute exponents ex, ey for mantissa with "binary point"
1226 // on the right (mantissa.0) - use int64 to avoid overflow
1227 ex := int64(x.exp) - int64(len(x.mant))*_W
1228 ey := int64(y.exp) - int64(len(y.mant))*_W
1229 1230 al := alias(z.mant, x.mant) || alias(z.mant, y.mant)
1231 1232 // TODO(gri) having a combined add-and-shift primitive
1233 // could make this code significantly faster
1234 switch {
1235 case ex < ey:
1236 if al {
1237 t := nat(nil).lsh(y.mant, uint(ey-ex))
1238 z.mant = z.mant.add(x.mant, t)
1239 } else {
1240 z.mant = z.mant.lsh(y.mant, uint(ey-ex))
1241 z.mant = z.mant.add(x.mant, z.mant)
1242 }
1243 default:
1244 // ex == ey, no shift needed
1245 z.mant = z.mant.add(x.mant, y.mant)
1246 case ex > ey:
1247 if al {
1248 t := nat(nil).lsh(x.mant, uint(ex-ey))
1249 z.mant = z.mant.add(t, y.mant)
1250 } else {
1251 z.mant = z.mant.lsh(x.mant, uint(ex-ey))
1252 z.mant = z.mant.add(z.mant, y.mant)
1253 }
1254 ex = ey
1255 }
1256 // len(z.mant) > 0
1257 1258 z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0)
1259 }
1260 1261 // z = x - y for |x| > |y|, ignoring signs of x and y for the subtraction
1262 // but using the sign of z for rounding the result.
1263 // x and y must have a non-empty mantissa and valid exponent.
1264 func (z *Float) usub(x, y *Float) {
1265 // This code is symmetric to uadd.
1266 // We have not factored the common code out because
1267 // eventually uadd (and usub) should be optimized
1268 // by special-casing, and the code will diverge.
1269 1270 if debugFloat {
1271 validateBinaryOperands(x, y)
1272 }
1273 1274 ex := int64(x.exp) - int64(len(x.mant))*_W
1275 ey := int64(y.exp) - int64(len(y.mant))*_W
1276 1277 al := alias(z.mant, x.mant) || alias(z.mant, y.mant)
1278 1279 switch {
1280 case ex < ey:
1281 if al {
1282 t := nat(nil).lsh(y.mant, uint(ey-ex))
1283 z.mant = t.sub(x.mant, t)
1284 } else {
1285 z.mant = z.mant.lsh(y.mant, uint(ey-ex))
1286 z.mant = z.mant.sub(x.mant, z.mant)
1287 }
1288 default:
1289 // ex == ey, no shift needed
1290 z.mant = z.mant.sub(x.mant, y.mant)
1291 case ex > ey:
1292 if al {
1293 t := nat(nil).lsh(x.mant, uint(ex-ey))
1294 z.mant = t.sub(t, y.mant)
1295 } else {
1296 z.mant = z.mant.lsh(x.mant, uint(ex-ey))
1297 z.mant = z.mant.sub(z.mant, y.mant)
1298 }
1299 ex = ey
1300 }
1301 1302 // operands may have canceled each other out
1303 if len(z.mant) == 0 {
1304 z.acc = Exact
1305 z.form = zero
1306 z.neg = false
1307 return
1308 }
1309 // len(z.mant) > 0
1310 1311 z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0)
1312 }
1313 1314 // z = x * y, ignoring signs of x and y for the multiplication
1315 // but using the sign of z for rounding the result.
1316 // x and y must have a non-empty mantissa and valid exponent.
1317 func (z *Float) umul(x, y *Float) {
1318 if debugFloat {
1319 validateBinaryOperands(x, y)
1320 }
1321 1322 // Note: This is doing too much work if the precision
1323 // of z is less than the sum of the precisions of x
1324 // and y which is often the case (e.g., if all floats
1325 // have the same precision).
1326 // TODO(gri) Optimize this for the common case.
1327 1328 e := int64(x.exp) + int64(y.exp)
1329 if x == y {
1330 z.mant = z.mant.sqr(nil, x.mant)
1331 } else {
1332 z.mant = z.mant.mul(nil, x.mant, y.mant)
1333 }
1334 z.setExpAndRound(e-fnorm(z.mant), 0)
1335 }
1336 1337 // z = x / y, ignoring signs of x and y for the division
1338 // but using the sign of z for rounding the result.
1339 // x and y must have a non-empty mantissa and valid exponent.
1340 func (z *Float) uquo(x, y *Float) {
1341 if debugFloat {
1342 validateBinaryOperands(x, y)
1343 }
1344 1345 // mantissa length in words for desired result precision + 1
1346 // (at least one extra bit so we get the rounding bit after
1347 // the division)
1348 n := int(z.prec/_W) + 1
1349 1350 // compute adjusted x.mant such that we get enough result precision
1351 xadj := x.mant
1352 if d := n - len(x.mant) + len(y.mant); d > 0 {
1353 // d extra words needed => add d "0 digits" to x
1354 xadj = make(nat, len(x.mant)+d)
1355 copy(xadj[d:], x.mant)
1356 }
1357 // TODO(gri): If we have too many digits (d < 0), we should be able
1358 // to shorten x for faster division. But we must be extra careful
1359 // with rounding in that case.
1360 1361 // Compute d before division since there may be aliasing of x.mant
1362 // (via xadj) or y.mant with z.mant.
1363 d := len(xadj) - len(y.mant)
1364 1365 // divide
1366 stk := getStack()
1367 defer stk.free()
1368 var r nat
1369 z.mant, r = z.mant.div(stk, nil, xadj, y.mant)
1370 e := int64(x.exp) - int64(y.exp) - int64(d-len(z.mant))*_W
1371 1372 // The result is long enough to include (at least) the rounding bit.
1373 // If there's a non-zero remainder, the corresponding fractional part
1374 // (if it were computed), would have a non-zero sticky bit (if it were
1375 // zero, it couldn't have a non-zero remainder).
1376 var sbit uint
1377 if len(r) > 0 {
1378 sbit = 1
1379 }
1380 1381 z.setExpAndRound(e-fnorm(z.mant), sbit)
1382 }
1383 1384 // ucmp returns -1, 0, or +1, depending on whether
1385 // |x| < |y|, |x| == |y|, or |x| > |y|.
1386 // x and y must have a non-empty mantissa and valid exponent.
1387 func (x *Float) ucmp(y *Float) int {
1388 if debugFloat {
1389 validateBinaryOperands(x, y)
1390 }
1391 1392 switch {
1393 case x.exp < y.exp:
1394 return -1
1395 case x.exp > y.exp:
1396 return +1
1397 }
1398 // x.exp == y.exp
1399 1400 // compare mantissas
1401 i := len(x.mant)
1402 j := len(y.mant)
1403 for i > 0 || j > 0 {
1404 var xm, ym Word
1405 if i > 0 {
1406 i--
1407 xm = x.mant[i]
1408 }
1409 if j > 0 {
1410 j--
1411 ym = y.mant[j]
1412 }
1413 switch {
1414 case xm < ym:
1415 return -1
1416 case xm > ym:
1417 return +1
1418 }
1419 }
1420 1421 return 0
1422 }
1423 1424 // Handling of sign bit as defined by IEEE 754-2008, section 6.3:
1425 //
1426 // When neither the inputs nor result are NaN, the sign of a product or
1427 // quotient is the exclusive OR of the operands’ signs; the sign of a sum,
1428 // or of a difference x−y regarded as a sum x+(−y), differs from at most
1429 // one of the addends’ signs; and the sign of the result of conversions,
1430 // the quantize operation, the roundToIntegral operations, and the
1431 // roundToIntegralExact (see 5.3.1) is the sign of the first or only operand.
1432 // These rules shall apply even when operands or results are zero or infinite.
1433 //
1434 // When the sum of two operands with opposite signs (or the difference of
1435 // two operands with like signs) is exactly zero, the sign of that sum (or
1436 // difference) shall be +0 in all rounding-direction attributes except
1437 // roundTowardNegative; under that attribute, the sign of an exact zero
1438 // sum (or difference) shall be −0. However, x+x = x−(−x) retains the same
1439 // sign as x even when x is zero.
1440 //
1441 // See also: https://play.golang.org/p/RtH3UCt5IH
1442 1443 // Add sets z to the rounded sum x+y and returns z. If z's precision is 0,
1444 // it is changed to the larger of x's or y's precision before the operation.
1445 // Rounding is performed according to z's precision and rounding mode; and
1446 // z's accuracy reports the result error relative to the exact (not rounded)
1447 // result. Add panics with [ErrNaN] if x and y are infinities with opposite
1448 // signs. The value of z is undefined in that case.
1449 func (z *Float) Add(x, y *Float) *Float {
1450 if debugFloat {
1451 x.validate()
1452 y.validate()
1453 }
1454 1455 if z.prec == 0 {
1456 z.prec = max(x.prec, y.prec)
1457 }
1458 1459 if x.form == finite && y.form == finite {
1460 // x + y (common case)
1461 1462 // Below we set z.neg = x.neg, and when z aliases y this will
1463 // change the y operand's sign. This is fine, because if an
1464 // operand aliases the receiver it'll be overwritten, but we still
1465 // want the original x.neg and y.neg values when we evaluate
1466 // x.neg != y.neg, so we need to save y.neg before setting z.neg.
1467 yneg := y.neg
1468 1469 z.neg = x.neg
1470 if x.neg == yneg {
1471 // x + y == x + y
1472 // (-x) + (-y) == -(x + y)
1473 z.uadd(x, y)
1474 } else {
1475 // x + (-y) == x - y == -(y - x)
1476 // (-x) + y == y - x == -(x - y)
1477 if x.ucmp(y) > 0 {
1478 z.usub(x, y)
1479 } else {
1480 z.neg = !z.neg
1481 z.usub(y, x)
1482 }
1483 }
1484 if z.form == zero && z.mode == ToNegativeInf && z.acc == Exact {
1485 z.neg = true
1486 }
1487 return z
1488 }
1489 1490 if x.form == inf && y.form == inf && x.neg != y.neg {
1491 // +Inf + -Inf
1492 // -Inf + +Inf
1493 // value of z is undefined but make sure it's valid
1494 z.acc = Exact
1495 z.form = zero
1496 z.neg = false
1497 panic(ErrNaN{"addition of infinities with opposite signs"})
1498 }
1499 1500 if x.form == zero && y.form == zero {
1501 // ±0 + ±0
1502 z.acc = Exact
1503 z.form = zero
1504 z.neg = x.neg && y.neg // -0 + -0 == -0
1505 return z
1506 }
1507 1508 if x.form == inf || y.form == zero {
1509 // ±Inf + y
1510 // x + ±0
1511 return z.Set(x)
1512 }
1513 1514 // ±0 + y
1515 // x + ±Inf
1516 return z.Set(y)
1517 }
1518 1519 // Sub sets z to the rounded difference x-y and returns z.
1520 // Precision, rounding, and accuracy reporting are as for [Float.Add].
1521 // Sub panics with [ErrNaN] if x and y are infinities with equal
1522 // signs. The value of z is undefined in that case.
1523 func (z *Float) Sub(x, y *Float) *Float {
1524 if debugFloat {
1525 x.validate()
1526 y.validate()
1527 }
1528 1529 if z.prec == 0 {
1530 z.prec = max(x.prec, y.prec)
1531 }
1532 1533 if x.form == finite && y.form == finite {
1534 // x - y (common case)
1535 yneg := y.neg
1536 z.neg = x.neg
1537 if x.neg != yneg {
1538 // x - (-y) == x + y
1539 // (-x) - y == -(x + y)
1540 z.uadd(x, y)
1541 } else {
1542 // x - y == x - y == -(y - x)
1543 // (-x) - (-y) == y - x == -(x - y)
1544 if x.ucmp(y) > 0 {
1545 z.usub(x, y)
1546 } else {
1547 z.neg = !z.neg
1548 z.usub(y, x)
1549 }
1550 }
1551 if z.form == zero && z.mode == ToNegativeInf && z.acc == Exact {
1552 z.neg = true
1553 }
1554 return z
1555 }
1556 1557 if x.form == inf && y.form == inf && x.neg == y.neg {
1558 // +Inf - +Inf
1559 // -Inf - -Inf
1560 // value of z is undefined but make sure it's valid
1561 z.acc = Exact
1562 z.form = zero
1563 z.neg = false
1564 panic(ErrNaN{"subtraction of infinities with equal signs"})
1565 }
1566 1567 if x.form == zero && y.form == zero {
1568 // ±0 - ±0
1569 z.acc = Exact
1570 z.form = zero
1571 z.neg = x.neg && !y.neg // -0 - +0 == -0
1572 return z
1573 }
1574 1575 if x.form == inf || y.form == zero {
1576 // ±Inf - y
1577 // x - ±0
1578 return z.Set(x)
1579 }
1580 1581 // ±0 - y
1582 // x - ±Inf
1583 return z.Neg(y)
1584 }
1585 1586 // Mul sets z to the rounded product x*y and returns z.
1587 // Precision, rounding, and accuracy reporting are as for [Float.Add].
1588 // Mul panics with [ErrNaN] if one operand is zero and the other
1589 // operand an infinity. The value of z is undefined in that case.
1590 func (z *Float) Mul(x, y *Float) *Float {
1591 if debugFloat {
1592 x.validate()
1593 y.validate()
1594 }
1595 1596 if z.prec == 0 {
1597 z.prec = max(x.prec, y.prec)
1598 }
1599 1600 z.neg = x.neg != y.neg
1601 1602 if x.form == finite && y.form == finite {
1603 // x * y (common case)
1604 z.umul(x, y)
1605 return z
1606 }
1607 1608 z.acc = Exact
1609 if x.form == zero && y.form == inf || x.form == inf && y.form == zero {
1610 // ±0 * ±Inf
1611 // ±Inf * ±0
1612 // value of z is undefined but make sure it's valid
1613 z.form = zero
1614 z.neg = false
1615 panic(ErrNaN{"multiplication of zero with infinity"})
1616 }
1617 1618 if x.form == inf || y.form == inf {
1619 // ±Inf * y
1620 // x * ±Inf
1621 z.form = inf
1622 return z
1623 }
1624 1625 // ±0 * y
1626 // x * ±0
1627 z.form = zero
1628 return z
1629 }
1630 1631 // Quo sets z to the rounded quotient x/y and returns z.
1632 // Precision, rounding, and accuracy reporting are as for [Float.Add].
1633 // Quo panics with [ErrNaN] if both operands are zero or infinities.
1634 // The value of z is undefined in that case.
1635 func (z *Float) Quo(x, y *Float) *Float {
1636 if debugFloat {
1637 x.validate()
1638 y.validate()
1639 }
1640 1641 if z.prec == 0 {
1642 z.prec = max(x.prec, y.prec)
1643 }
1644 1645 z.neg = x.neg != y.neg
1646 1647 if x.form == finite && y.form == finite {
1648 // x / y (common case)
1649 z.uquo(x, y)
1650 return z
1651 }
1652 1653 z.acc = Exact
1654 if x.form == zero && y.form == zero || x.form == inf && y.form == inf {
1655 // ±0 / ±0
1656 // ±Inf / ±Inf
1657 // value of z is undefined but make sure it's valid
1658 z.form = zero
1659 z.neg = false
1660 panic(ErrNaN{"division of zero by zero or infinity by infinity"})
1661 }
1662 1663 if x.form == zero || y.form == inf {
1664 // ±0 / y
1665 // x / ±Inf
1666 z.form = zero
1667 return z
1668 }
1669 1670 // x / ±0
1671 // ±Inf / y
1672 z.form = inf
1673 return z
1674 }
1675 1676 // Cmp compares x and y and returns:
1677 // - -1 if x < y;
1678 // - 0 if x == y (incl. -0 == 0, -Inf == -Inf, and +Inf == +Inf);
1679 // - +1 if x > y.
1680 func (x *Float) Cmp(y *Float) int {
1681 if debugFloat {
1682 x.validate()
1683 y.validate()
1684 }
1685 1686 mx := x.ord()
1687 my := y.ord()
1688 switch {
1689 case mx < my:
1690 return -1
1691 case mx > my:
1692 return +1
1693 }
1694 // mx == my
1695 1696 // only if |mx| == 1 we have to compare the mantissae
1697 switch mx {
1698 case -1:
1699 return y.ucmp(x)
1700 case +1:
1701 return x.ucmp(y)
1702 }
1703 1704 return 0
1705 }
1706 1707 // ord classifies x and returns:
1708 //
1709 // -2 if -Inf == x
1710 // -1 if -Inf < x < 0
1711 // 0 if x == 0 (signed or unsigned)
1712 // +1 if 0 < x < +Inf
1713 // +2 if x == +Inf
1714 func (x *Float) ord() int {
1715 var m int
1716 switch x.form {
1717 case finite:
1718 m = 1
1719 case zero:
1720 return 0
1721 case inf:
1722 m = 2
1723 }
1724 if x.neg {
1725 m = -m
1726 }
1727 return m
1728 }
1729