ghc/runtime/gmp/mpn_sqrt.c

   1 /* mpn_sqrt(root_ptr, rem_ptr, op_ptr, op_size)
   2
   3    Write the square root of {OP_PTR, OP_SIZE} at ROOT_PTR.
   4    Write the remainder at REM_PTR, if REM_PTR != NULL.
   5    Return the size of the remainder.
   6    (The size of the root is always half of the size of the operand.)
   7
   8    OP_PTR and ROOT_PTR may not point to the same object.
   9    OP_PTR and REM_PTR may point to the same object.
  10
  11    If REM_PTR is NULL, only the root is computed and the return value of
  12    the function is 0 if OP is a perfect square, and *any* non-zero number
  13    otherwise.
  14
  15 Copyright (C) 1991, 1993 Free Software Foundation, Inc.
  16
  17 This file is part of the GNU MP Library.
  18
  19 The GNU MP Library is free software; you can redistribute it and/or modify
  20 it under the terms of the GNU General Public License as published by
  21 the Free Software Foundation; either version 2, or (at your option)
  22 any later version.
  23
  24 The GNU MP Library is distributed in the hope that it will be useful,
  25 but WITHOUT ANY WARRANTY; without even the implied warranty of
  26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  27 GNU General Public License for more details.
  28
  29 You should have received a copy of the GNU General Public License
  30 along with the GNU MP Library; see the file COPYING.  If not, write to
  31 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.  */
  32
  33 /* This code is just correct if "unsigned char" has at least 8 bits.  It
  34    doesn't help to use CHAR_BIT from limits.h, as the real problem is
  35    the static arrays.  */
  36
  37 #include "gmp.h"
  38 #include "gmp-impl.h"
  39 #include "longlong.h"
  40
  41 /* Square root algorithm:
  42
  43    1. Shift OP (the input) to the left an even number of bits s.t. there
  44       are an even number of words and either (or both) of the most
  45       significant bits are set.  This way, sqrt(OP) has exactly half as
  46       many words as OP, and has its most significant bit set.
  47
  48    2. Get a 9-bit approximation to sqrt(OP) using the pre-computed tables.
  49       This approximation is used for the first single-precision
  50       iterations of Newton's method, yielding a full-word approximation
  51       to sqrt(OP).
  52
  53    3. Perform multiple-precision Newton iteration until we have the
  54       exact result.  Only about half of the input operand is used in
  55       this calculation, as the square root is perfectly determinable
  56       from just the higher half of a number.  */
  57 \f
  58 /* Define this macro for IEEE P854 machines with a fast sqrt instruction.  */
  59 #if defined __GNUC__
  60
  61 #if defined __sparc__
  62 #define SQRT(a) \
  63   ({                                                                    \
  64     double __sqrt_res;                                                  \
  65     asm ("fsqrtd %1,%0" : "=f" (__sqrt_res) : "f" (a));                 \
  66     __sqrt_res;                                                         \
  67   })
  68 #endif
  69
  70 #if defined __HAVE_68881__
  71 #define SQRT(a) \
  72   ({                                                                    \
  73     double __sqrt_res;                                                  \
  74     asm ("fsqrtx %1,%0" : "=f" (__sqrt_res) : "f" (a));                 \
  75     __sqrt_res;                                                         \
  76   })
  77 #endif
  78
  79 #if defined __hppa
  80 #define SQRT(a) \
  81   ({                                                                    \
  82     double __sqrt_res;                                                  \
  83     asm ("fsqrt,dbl %1,%0" : "=fx" (__sqrt_res) : "fx" (a));            \
  84     __sqrt_res;                                                         \
  85   })
  86 #endif
  87
  88 #endif
  89
  90 #ifndef SQRT
  91
  92 /* Tables for initial approximation of the square root.  These are
  93    indexed with bits 1-8 of the operand for which the square root is
  94    calculated, where bit 0 is the most significant non-zero bit.  I.e.
  95    the most significant one-bit is not used, since that per definition
  96    is one.  Likewise, the tables don't return the highest bit of the
  97    result.  That bit must be inserted by or:ing the returned value with
  98    0x100.  This way, we get a 9-bit approximation from 8-bit tables!  */
  99
 100 /* Table to be used for operands with an even total number of bits.
 101    (Exactly as in the decimal system there are similarities between the
 102    square root of numbers with the same initial digits and an even
 103    difference in the total number of digits.  Consider the square root
 104    of 1, 10, 100, 1000, ...)  */
 105 static unsigned char even_approx_tab[256] =
 106 {
 107   0x6a, 0x6a, 0x6b, 0x6c, 0x6c, 0x6d, 0x6e, 0x6e,
 108   0x6f, 0x70, 0x71, 0x71, 0x72, 0x73, 0x73, 0x74,
 109   0x75, 0x75, 0x76, 0x77, 0x77, 0x78, 0x79, 0x79,
 110   0x7a, 0x7b, 0x7b, 0x7c, 0x7d, 0x7d, 0x7e, 0x7f,
 111   0x80, 0x80, 0x81, 0x81, 0x82, 0x83, 0x83, 0x84,
 112   0x85, 0x85, 0x86, 0x87, 0x87, 0x88, 0x89, 0x89,
 113   0x8a, 0x8b, 0x8b, 0x8c, 0x8d, 0x8d, 0x8e, 0x8f,
 114   0x8f, 0x90, 0x90, 0x91, 0x92, 0x92, 0x93, 0x94,
 115   0x94, 0x95, 0x96, 0x96, 0x97, 0x97, 0x98, 0x99,
 116   0x99, 0x9a, 0x9b, 0x9b, 0x9c, 0x9c, 0x9d, 0x9e,
 117   0x9e, 0x9f, 0xa0, 0xa0, 0xa1, 0xa1, 0xa2, 0xa3,
 118   0xa3, 0xa4, 0xa4, 0xa5, 0xa6, 0xa6, 0xa7, 0xa7,
 119   0xa8, 0xa9, 0xa9, 0xaa, 0xaa, 0xab, 0xac, 0xac,
 120   0xad, 0xad, 0xae, 0xaf, 0xaf, 0xb0, 0xb0, 0xb1,
 121   0xb2, 0xb2, 0xb3, 0xb3, 0xb4, 0xb5, 0xb5, 0xb6,
 122   0xb6, 0xb7, 0xb7, 0xb8, 0xb9, 0xb9, 0xba, 0xba,
 123   0xbb, 0xbb, 0xbc, 0xbd, 0xbd, 0xbe, 0xbe, 0xbf,
 124   0xc0, 0xc0, 0xc1, 0xc1, 0xc2, 0xc2, 0xc3, 0xc3,
 125   0xc4, 0xc5, 0xc5, 0xc6, 0xc6, 0xc7, 0xc7, 0xc8,
 126   0xc9, 0xc9, 0xca, 0xca, 0xcb, 0xcb, 0xcc, 0xcc,
 127   0xcd, 0xce, 0xce, 0xcf, 0xcf, 0xd0, 0xd0, 0xd1,
 128   0xd1, 0xd2, 0xd3, 0xd3, 0xd4, 0xd4, 0xd5, 0xd5,
 129   0xd6, 0xd6, 0xd7, 0xd7, 0xd8, 0xd9, 0xd9, 0xda,
 130   0xda, 0xdb, 0xdb, 0xdc, 0xdc, 0xdd, 0xdd, 0xde,
 131   0xde, 0xdf, 0xe0, 0xe0, 0xe1, 0xe1, 0xe2, 0xe2,
 132   0xe3, 0xe3, 0xe4, 0xe4, 0xe5, 0xe5, 0xe6, 0xe6,
 133   0xe7, 0xe7, 0xe8, 0xe8, 0xe9, 0xea, 0xea, 0xeb,
 134   0xeb, 0xec, 0xec, 0xed, 0xed, 0xee, 0xee, 0xef,
 135   0xef, 0xf0, 0xf0, 0xf1, 0xf1, 0xf2, 0xf2, 0xf3,
 136   0xf3, 0xf4, 0xf4, 0xf5, 0xf5, 0xf6, 0xf6, 0xf7,
 137   0xf7, 0xf8, 0xf8, 0xf9, 0xf9, 0xfa, 0xfa, 0xfb,
 138   0xfb, 0xfc, 0xfc, 0xfd, 0xfd, 0xfe, 0xfe, 0xff,
 139 };
 140
 141 /* Table to be used for operands with an odd total number of bits.
 142    (Further comments before previous table.)  */
 143 static unsigned char odd_approx_tab[256] =
 144 {
 145   0x00, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
 146   0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07,
 147   0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b,
 148   0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f,
 149   0x0f, 0x10, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12,
 150   0x13, 0x13, 0x14, 0x14, 0x15, 0x15, 0x16, 0x16,
 151   0x16, 0x17, 0x17, 0x18, 0x18, 0x19, 0x19, 0x1a,
 152   0x1a, 0x1b, 0x1b, 0x1b, 0x1c, 0x1c, 0x1d, 0x1d,
 153   0x1e, 0x1e, 0x1f, 0x1f, 0x20, 0x20, 0x20, 0x21,
 154   0x21, 0x22, 0x22, 0x23, 0x23, 0x23, 0x24, 0x24,
 155   0x25, 0x25, 0x26, 0x26, 0x27, 0x27, 0x27, 0x28,
 156   0x28, 0x29, 0x29, 0x2a, 0x2a, 0x2a, 0x2b, 0x2b,
 157   0x2c, 0x2c, 0x2d, 0x2d, 0x2d, 0x2e, 0x2e, 0x2f,
 158   0x2f, 0x30, 0x30, 0x30, 0x31, 0x31, 0x32, 0x32,
 159   0x32, 0x33, 0x33, 0x34, 0x34, 0x35, 0x35, 0x35,
 160   0x36, 0x36, 0x37, 0x37, 0x37, 0x38, 0x38, 0x39,
 161   0x39, 0x39, 0x3a, 0x3a, 0x3b, 0x3b, 0x3b, 0x3c,
 162   0x3c, 0x3d, 0x3d, 0x3d, 0x3e, 0x3e, 0x3f, 0x3f,
 163   0x40, 0x40, 0x40, 0x41, 0x41, 0x41, 0x42, 0x42,
 164   0x43, 0x43, 0x43, 0x44, 0x44, 0x45, 0x45, 0x45,
 165   0x46, 0x46, 0x47, 0x47, 0x47, 0x48, 0x48, 0x49,
 166   0x49, 0x49, 0x4a, 0x4a, 0x4b, 0x4b, 0x4b, 0x4c,
 167   0x4c, 0x4c, 0x4d, 0x4d, 0x4e, 0x4e, 0x4e, 0x4f,
 168   0x4f, 0x50, 0x50, 0x50, 0x51, 0x51, 0x51, 0x52,
 169   0x52, 0x53, 0x53, 0x53, 0x54, 0x54, 0x54, 0x55,
 170   0x55, 0x56, 0x56, 0x56, 0x57, 0x57, 0x57, 0x58,
 171   0x58, 0x59, 0x59, 0x59, 0x5a, 0x5a, 0x5a, 0x5b,
 172   0x5b, 0x5b, 0x5c, 0x5c, 0x5d, 0x5d, 0x5d, 0x5e,
 173   0x5e, 0x5e, 0x5f, 0x5f, 0x60, 0x60, 0x60, 0x61,
 174   0x61, 0x61, 0x62, 0x62, 0x62, 0x63, 0x63, 0x63,
 175   0x64, 0x64, 0x65, 0x65, 0x65, 0x66, 0x66, 0x66,
 176   0x67, 0x67, 0x67, 0x68, 0x68, 0x68, 0x69, 0x69,
 177 };
 178 #endif
 179
 180 \f
 181 mp_size
 182 #ifdef __STDC__
 183 mpn_sqrt (mp_ptr root_ptr, mp_ptr rem_ptr, mp_srcptr op_ptr, mp_size op_size)
 184 #else
 185 mpn_sqrt (root_ptr, rem_ptr, op_ptr, op_size)
 186      mp_ptr root_ptr;
 187      mp_ptr rem_ptr;
 188      mp_srcptr op_ptr;
 189      mp_size op_size;
 190 #endif
 191 {
 192   /* R (root result) */
 193   mp_ptr rp;                    /* Pointer to least significant word */
 194   mp_size rsize;                /* The size in words */
 195
 196   /* T (OP shifted to the left a.k.a. normalized) */
 197   mp_ptr tp;                    /* Pointer to least significant word */
 198   mp_size tsize;                /* The size in words */
 199   mp_ptr t_end_ptr;             /* Pointer right beyond most sign. word */
 200   mp_limb t_high0, t_high1;     /* The two most significant words */
 201
 202   /* TT (temporary for numerator/remainder) */
 203   mp_ptr ttp;                   /* Pointer to least significant word */
 204
 205   /* X (temporary for quotient in main loop) */
 206   mp_ptr xp;                    /* Pointer to least significant word */
 207   mp_size xsize;                /* The size in words */
 208
 209   unsigned cnt;
 210   mp_limb initial_approx;       /* Initially made approximation */
 211   mp_size tsizes[BITS_PER_MP_LIMB];     /* Successive calculation precisions */
 212   mp_size tmp;
 213   mp_size i;
 214
 215   /* If OP is zero, both results are zero.  */
 216   if (op_size == 0)
 217     return 0;
 218
 219   count_leading_zeros (cnt, op_ptr[op_size - 1]);
 220   tsize = op_size;
 221   if ((tsize & 1) != 0)
 222     {
 223       cnt += BITS_PER_MP_LIMB;
 224       tsize++;
 225     }
 226
 227   rsize = tsize / 2;
 228   rp = root_ptr;
 229
 230   /* Shift OP an even number of bits into T, such that either the most or
 231      the second most significant bit is set, and such that the number of
 232      words in T becomes even.  This way, the number of words in R=sqrt(OP)
 233      is exactly half as many as in OP, and the most significant bit of R
 234      is set.
 235
 236      Also, the initial approximation is simplified by this up-shifted OP.
 237
 238      Finally, the Newtonian iteration which is the main part of this
 239      program performs division by R.  The fast division routine expects
 240      the divisor to be "normalized" in exactly the sense of having the
 241      most significant bit set.  */
 242
 243   tp = (mp_ptr) alloca (tsize * BYTES_PER_MP_LIMB);
 244
 245   t_high0 = mpn_lshift (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size,
 246                         (cnt & ~1) % BITS_PER_MP_LIMB);
 247   if (cnt >= BITS_PER_MP_LIMB)
 248     tp[0] = 0;
 249
 250   t_high0 = tp[tsize - 1];
 251   t_high1 = tp[tsize - 2];      /* Never stray.  TSIZE is >= 2.  */
 252
 253 /* Is there a fast sqrt instruction defined for this machine?  */
 254 #ifdef SQRT
 255   {
 256     initial_approx = SQRT (t_high0 * 2.0
 257                            * ((mp_limb) 1 << (BITS_PER_MP_LIMB - 1))
 258                            + t_high1);
 259     /* If t_high0,,t_high1 is big, the result in INITIAL_APPROX might have
 260        become incorrect due to overflow in the conversion from double to
 261        mp_limb above.  It will typically be zero in that case, but might be
 262        a small number on some machines.  The most significant bit of
 263        INITIAL_APPROX should be set, so that bit is a good overflow
 264        indication.  */
 265     if ((mp_limb_signed) initial_approx >= 0)
 266       initial_approx = ~0;
 267   }
 268 #else
 269   /* Get a 9 bit approximation from the tables.  The tables expect to
 270      be indexed with the 8 high bits right below the highest bit.
 271      Also, the highest result bit is not returned by the tables, and
 272      must be or:ed into the result.  The scheme gives 9 bits of start
 273      approximation with just 256-entry 8 bit tables.  */
 274
 275   if ((cnt & 1) == 0)
 276     {
 277       /* The most sign bit of t_high0 is set.  */
 278       initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 1);
 279       initial_approx &= 0xff;
 280       initial_approx = even_approx_tab[initial_approx];
 281     }
 282   else
 283     {
 284       /* The most significant bit of T_HIGH0 is unset,
 285          the second most significant is set.  */
 286       initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 2);
 287       initial_approx &= 0xff;
 288       initial_approx = odd_approx_tab[initial_approx];
 289     }
 290   initial_approx |= 0x100;
 291   initial_approx <<= BITS_PER_MP_LIMB - 8 - 1;
 292
 293   /* Perform small precision Newtonian iterations to get a full word
 294      approximation.  For small operands, these iteration will make the
 295      entire job.  */
 296   if (t_high0 == ~0)
 297     initial_approx = t_high0;
 298   else
 299     {
 300       mp_limb quot;
 301
 302       if (t_high0 >= initial_approx)
 303         initial_approx = t_high0 + 1;
 304
 305       /* First get about 18 bits with pure C arithmetics.  */
 306       quot = t_high0 / (initial_approx >> BITS_PER_MP_LIMB/2) << BITS_PER_MP_LIMB/2;
 307       initial_approx = (initial_approx + quot) / 2;
 308       initial_approx |= (mp_limb) 1 << (BITS_PER_MP_LIMB - 1);
 309
 310       /* Now get a full word by one (or for > 36 bit machines) several
 311          iterations.  */
 312       for (i = 16; i < BITS_PER_MP_LIMB; i <<= 1)
 313         {
 314           mp_limb ignored_remainder;
 315
 316           udiv_qrnnd (quot, ignored_remainder,
 317                       t_high0, t_high1, initial_approx);
 318           initial_approx = (initial_approx + quot) / 2;
 319           initial_approx |= (mp_limb) 1 << (BITS_PER_MP_LIMB - 1);
 320         }
 321     }
 322 #endif
 323
 324   rp[0] = initial_approx;
 325   rsize = 1;
 326
 327   xp = (mp_ptr) alloca (tsize * BYTES_PER_MP_LIMB);
 328   ttp = (mp_ptr) alloca (tsize * BYTES_PER_MP_LIMB);
 329
 330   t_end_ptr = tp + tsize;
 331
 332 #ifdef GMP_DEBUG /* partain: was DEBUG */
 333           printf ("\n\nT = ");
 334           _mp_mout (tp, tsize);
 335 #endif
 336
 337   if (tsize > 2)
 338     {
 339       /* Determine the successive precisions to use in the iteration.  We
 340          minimize the precisions, beginning with the highest (i.e. last
 341          iteration) to the lowest (i.e. first iteration).  */
 342
 343       tmp = tsize / 2;
 344       for (i = 0;;i++)
 345         {
 346           tsize = (tmp + 1) / 2;
 347           if (tmp == tsize)
 348             break;
 349           tsizes[i] = tsize + tmp;
 350           tmp = tsize;
 351         }
 352
 353       /* Main Newton iteration loop.  For big arguments, most of the
 354          time is spent here.  */
 355
 356       /* It is possible to do a great optimization here.  The successive
 357          divisors in the mpn_div call below has more and more leading
 358          words equal to its predecessor.  Therefore the beginning of
 359          each division will repeat the same work as did the last
 360          division.  If we could guarantee that the leading words of two
 361          consecutive divisors are the same (i.e. in this case, a later
 362          divisor has just more digits at the end) it would be a simple
 363          matter of just using the old remainder of the last division in
 364          a subsequent division, to take care of this optimization.  This
 365          idea would surely make a difference even for small arguments.  */
 366
 367       /* Loop invariants:
 368
 369          R <= shiftdown_to_same_size(floor(sqrt(OP))) < R + 1.
 370          X - 1 < shiftdown_to_same_size(floor(sqrt(OP))) <= X.
 371          R <= shiftdown_to_same_size(X).  */
 372
 373       while (--i >= 0)
 374         {
 375           mp_limb cy;
 376 #ifdef GMP_DEBUG /* partain: was DEBUG */
 377           mp_limb old_least_sign_r = rp[0];
 378           mp_size old_rsize = rsize;
 379
 380           printf ("R = ");
 381           _mp_mout (rp, rsize);
 382 #endif
 383           tsize = tsizes[i];
 384
 385           /* Need to copy the numerator into temporary space, as
 386              mpn_div overwrites its numerator argument with the
 387              remainder (which we currently ignore).  */
 388           MPN_COPY (ttp, t_end_ptr - tsize, tsize);
 389           cy = mpn_div (xp, ttp, tsize, rp, rsize);
 390           xsize = tsize - rsize;
 391           cy = cy ? xp[xsize] : 0;
 392
 393 #ifdef GMP_DEBUG /* partain: was DEBUG */
 394           printf ("X =%d", cy);
 395           _mp_mout (xp, xsize);
 396 #endif
 397
 398           /* Add X and R with the most significant limbs aligned,
 399              temporarily ignoring at least one limb at the low end of X.  */
 400           tmp = xsize - rsize;
 401           cy += mpn_add (xp + tmp, rp, rsize, xp + tmp, rsize);
 402
 403           /* If T begins with more than 2 x BITS_PER_MP_LIMB of ones, we get
 404              intermediate roots that'd need an extra bit.  We don't want to
 405              handle that since it would make the subsequent divisor
 406              non-normalized, so round such roots down to be only ones in the
 407              current precision.  */
 408           if (cy == 2)
 409             {
 410               mp_size j;
 411               for (j = xsize; j >= 0; j--)
 412                 xp[j] = ~(mp_limb)0;
 413             }
 414
 415           /* Divide X by 2 and put the result in R.  This is the new
 416              approximation.  Shift in the carry from the addition.  */
 417           rsize = mpn_rshiftci (rp, xp, xsize, 1, (mp_limb) 1);
 418 #ifdef GMP_DEBUG /* partain: was DEBUG */
 419           if (old_least_sign_r != rp[rsize - old_rsize])
 420             printf (">>>>>>>> %d: %08x, %08x <<<<<<<<\n",
 421                     i, old_least_sign_r, rp[rsize - old_rsize]);
 422 #endif
 423         }
 424     }
 425
 426 #ifdef GMP_DEBUG /* partain: was DEBUG */
 427   printf ("(final) R = ");
 428   _mp_mout (rp, rsize);
 429 #endif
 430
 431   /* We computed the square root of OP * 2**(2*floor(cnt/2)).
 432      This has resulted in R being 2**floor(cnt/2) to large.
 433      Shift it down here to fix that.  */
 434   rsize = mpn_rshift (rp, rp, rsize, cnt/2);
 435
 436   /* Calculate the remainder.  */
 437   tsize = mpn_mul (tp, rp, rsize, rp, rsize);
 438   if (op_size < tsize
 439       || (op_size == tsize && mpn_cmp (op_ptr, tp, op_size) < 0))
 440     {
 441       /* R is too large.  Decrement it.  */
 442       mp_limb one = 1;
 443
 444       tsize = tsize + mpn_sub (tp, tp, tsize, rp, rsize);
 445       tsize = tsize + mpn_sub (tp, tp, tsize, rp, rsize);
 446       tsize = tsize + mpn_add (tp, tp, tsize, &one, 1);
 447
 448       (void) mpn_sub (rp, rp, rsize, &one, 1);
 449
 450 #ifdef GMP_DEBUG /* partain: was DEBUG */
 451       printf ("(adjusted) R = ");
 452       _mp_mout (rp, rsize);
 453 #endif
 454     }
 455
 456   if (rem_ptr != NULL)
 457     {
 458       mp_size retval = op_size + mpn_sub (rem_ptr, op_ptr, op_size, tp, tsize);
 459       alloca (0);
 460       return retval;
 461     }
 462   else
 463     {
 464       mp_size retval = (op_size != tsize || mpn_cmp (op_ptr, tp, op_size));
 465       alloca (0);
 466       return retval;
 467     }
 468 }
 469
 470 #ifdef GMP_DEBUG /* partain: was DEBUG */
 471 _mp_mout (mp_srcptr p, mp_size size)
 472 {
 473   mp_size ii;
 474   for (ii = size - 1; ii >= 0; ii--)
 475     printf ("%08X", p[ii]);
 476
 477   puts ("");
 478 }
 479 #endif