rts/gmp/mpn/generic/sqrtrem.c

   1 /* mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size)
   2
   3    Write the square root of {OP_PTR, OP_SIZE} at ROOT_PTR.
   4    Write the remainder at REM_PTR, if REM_PTR != NULL.
   5    Return the size of the remainder.
   6    (The size of the root is always half of the size of the operand.)
   7
   8    OP_PTR and ROOT_PTR may not point to the same object.
   9    OP_PTR and REM_PTR may point to the same object.
  10
  11    If REM_PTR is NULL, only the root is computed and the return value of
  12    the function is 0 if OP is a perfect square, and *any* non-zero number
  13    otherwise.
  14
  15 Copyright (C) 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software
  16 Foundation, Inc.
  17
  18 This file is part of the GNU MP Library.
  19
  20 The GNU MP Library is free software; you can redistribute it and/or modify
  21 it under the terms of the GNU Lesser General Public License as published by
  22 the Free Software Foundation; either version 2.1 of the License, or (at your
  23 option) any later version.
  24
  25 The GNU MP Library is distributed in the hope that it will be useful, but
  26 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  27 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  28 License for more details.
  29
  30 You should have received a copy of the GNU Lesser General Public License
  31 along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
  32 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  33 MA 02111-1307, USA. */
  34
  35 /* This code is just correct if "unsigned char" has at least 8 bits.  It
  36    doesn't help to use CHAR_BIT from limits.h, as the real problem is
  37    the static arrays.  */
  38
  39 #include <stdio.h> /* for NULL */
  40 #include "gmp.h"
  41 #include "gmp-impl.h"
  42 #include "longlong.h"
  43
  44 /* Square root algorithm:
  45
  46    1. Shift OP (the input) to the left an even number of bits s.t. there
  47       are an even number of words and either (or both) of the most
  48       significant bits are set.  This way, sqrt(OP) has exactly half as
  49       many words as OP, and has its most significant bit set.
  50
  51    2. Get a 9-bit approximation to sqrt(OP) using the pre-computed tables.
  52       This approximation is used for the first single-precision
  53       iterations of Newton's method, yielding a full-word approximation
  54       to sqrt(OP).
  55
  56    3. Perform multiple-precision Newton iteration until we have the
  57       exact result.  Only about half of the input operand is used in
  58       this calculation, as the square root is perfectly determinable
  59       from just the higher half of a number.  */
  60 \f
  61 /* Define this macro for IEEE P854 machines with a fast sqrt instruction.  */
  62 #if defined __GNUC__ && ! defined __SOFT_FLOAT
  63
  64 #if defined (__sparc__) && BITS_PER_MP_LIMB == 32
  65 #define SQRT(a) \
  66   ({                                                                    \
  67     double __sqrt_res;                                                  \
  68     asm ("fsqrtd %1,%0" : "=f" (__sqrt_res) : "f" (a));                 \
  69     __sqrt_res;                                                         \
  70   })
  71 #endif
  72
  73 #if defined (__HAVE_68881__)
  74 #define SQRT(a) \
  75   ({                                                                    \
  76     double __sqrt_res;                                                  \
  77     asm ("fsqrtx %1,%0" : "=f" (__sqrt_res) : "f" (a));                 \
  78     __sqrt_res;                                                         \
  79   })
  80 #endif
  81
  82 #if defined (__hppa) && BITS_PER_MP_LIMB == 32
  83 #define SQRT(a) \
  84   ({                                                                    \
  85     double __sqrt_res;                                                  \
  86     asm ("fsqrt,dbl %1,%0" : "=fx" (__sqrt_res) : "fx" (a));            \
  87     __sqrt_res;                                                         \
  88   })
  89 #endif
  90
  91 #if defined (_ARCH_PWR2) && BITS_PER_MP_LIMB == 32
  92 #define SQRT(a) \
  93   ({                                                                    \
  94     double __sqrt_res;                                                  \
  95     asm ("fsqrt %0,%1" : "=f" (__sqrt_res) : "f" (a));                  \
  96     __sqrt_res;                                                         \
  97   })
  98 #endif
  99
 100 #if 0
 101 #if defined (__i386__) || defined (__i486__)
 102 #define SQRT(a) \
 103   ({                                                                    \
 104     double __sqrt_res;                                                  \
 105     asm ("fsqrt" : "=t" (__sqrt_res) : "0" (a));                        \
 106     __sqrt_res;                                                         \
 107   })
 108 #endif
 109 #endif
 110
 111 #endif
 112
 113 #ifndef SQRT
 114
 115 /* Tables for initial approximation of the square root.  These are
 116    indexed with bits 1-8 of the operand for which the square root is
 117    calculated, where bit 0 is the most significant non-zero bit.  I.e.
 118    the most significant one-bit is not used, since that per definition
 119    is one.  Likewise, the tables don't return the highest bit of the
 120    result.  That bit must be inserted by or:ing the returned value with
 121    0x100.  This way, we get a 9-bit approximation from 8-bit tables!  */
 122
 123 /* Table to be used for operands with an even total number of bits.
 124    (Exactly as in the decimal system there are similarities between the
 125    square root of numbers with the same initial digits and an even
 126    difference in the total number of digits.  Consider the square root
 127    of 1, 10, 100, 1000, ...)  */
 128 static const unsigned char even_approx_tab[256] =
 129 {
 130   0x6a, 0x6a, 0x6b, 0x6c, 0x6c, 0x6d, 0x6e, 0x6e,
 131   0x6f, 0x70, 0x71, 0x71, 0x72, 0x73, 0x73, 0x74,
 132   0x75, 0x75, 0x76, 0x77, 0x77, 0x78, 0x79, 0x79,
 133   0x7a, 0x7b, 0x7b, 0x7c, 0x7d, 0x7d, 0x7e, 0x7f,
 134   0x80, 0x80, 0x81, 0x81, 0x82, 0x83, 0x83, 0x84,
 135   0x85, 0x85, 0x86, 0x87, 0x87, 0x88, 0x89, 0x89,
 136   0x8a, 0x8b, 0x8b, 0x8c, 0x8d, 0x8d, 0x8e, 0x8f,
 137   0x8f, 0x90, 0x90, 0x91, 0x92, 0x92, 0x93, 0x94,
 138   0x94, 0x95, 0x96, 0x96, 0x97, 0x97, 0x98, 0x99,
 139   0x99, 0x9a, 0x9b, 0x9b, 0x9c, 0x9c, 0x9d, 0x9e,
 140   0x9e, 0x9f, 0xa0, 0xa0, 0xa1, 0xa1, 0xa2, 0xa3,
 141   0xa3, 0xa4, 0xa4, 0xa5, 0xa6, 0xa6, 0xa7, 0xa7,
 142   0xa8, 0xa9, 0xa9, 0xaa, 0xaa, 0xab, 0xac, 0xac,
 143   0xad, 0xad, 0xae, 0xaf, 0xaf, 0xb0, 0xb0, 0xb1,
 144   0xb2, 0xb2, 0xb3, 0xb3, 0xb4, 0xb5, 0xb5, 0xb6,
 145   0xb6, 0xb7, 0xb7, 0xb8, 0xb9, 0xb9, 0xba, 0xba,
 146   0xbb, 0xbb, 0xbc, 0xbd, 0xbd, 0xbe, 0xbe, 0xbf,
 147   0xc0, 0xc0, 0xc1, 0xc1, 0xc2, 0xc2, 0xc3, 0xc3,
 148   0xc4, 0xc5, 0xc5, 0xc6, 0xc6, 0xc7, 0xc7, 0xc8,
 149   0xc9, 0xc9, 0xca, 0xca, 0xcb, 0xcb, 0xcc, 0xcc,
 150   0xcd, 0xce, 0xce, 0xcf, 0xcf, 0xd0, 0xd0, 0xd1,
 151   0xd1, 0xd2, 0xd3, 0xd3, 0xd4, 0xd4, 0xd5, 0xd5,
 152   0xd6, 0xd6, 0xd7, 0xd7, 0xd8, 0xd9, 0xd9, 0xda,
 153   0xda, 0xdb, 0xdb, 0xdc, 0xdc, 0xdd, 0xdd, 0xde,
 154   0xde, 0xdf, 0xe0, 0xe0, 0xe1, 0xe1, 0xe2, 0xe2,
 155   0xe3, 0xe3, 0xe4, 0xe4, 0xe5, 0xe5, 0xe6, 0xe6,
 156   0xe7, 0xe7, 0xe8, 0xe8, 0xe9, 0xea, 0xea, 0xeb,
 157   0xeb, 0xec, 0xec, 0xed, 0xed, 0xee, 0xee, 0xef,
 158   0xef, 0xf0, 0xf0, 0xf1, 0xf1, 0xf2, 0xf2, 0xf3,
 159   0xf3, 0xf4, 0xf4, 0xf5, 0xf5, 0xf6, 0xf6, 0xf7,
 160   0xf7, 0xf8, 0xf8, 0xf9, 0xf9, 0xfa, 0xfa, 0xfb,
 161   0xfb, 0xfc, 0xfc, 0xfd, 0xfd, 0xfe, 0xfe, 0xff,
 162 };
 163
 164 /* Table to be used for operands with an odd total number of bits.
 165    (Further comments before previous table.)  */
 166 static const unsigned char odd_approx_tab[256] =
 167 {
 168   0x00, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
 169   0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07,
 170   0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b,
 171   0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f,
 172   0x0f, 0x10, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12,
 173   0x13, 0x13, 0x14, 0x14, 0x15, 0x15, 0x16, 0x16,
 174   0x16, 0x17, 0x17, 0x18, 0x18, 0x19, 0x19, 0x1a,
 175   0x1a, 0x1b, 0x1b, 0x1b, 0x1c, 0x1c, 0x1d, 0x1d,
 176   0x1e, 0x1e, 0x1f, 0x1f, 0x20, 0x20, 0x20, 0x21,
 177   0x21, 0x22, 0x22, 0x23, 0x23, 0x23, 0x24, 0x24,
 178   0x25, 0x25, 0x26, 0x26, 0x27, 0x27, 0x27, 0x28,
 179   0x28, 0x29, 0x29, 0x2a, 0x2a, 0x2a, 0x2b, 0x2b,
 180   0x2c, 0x2c, 0x2d, 0x2d, 0x2d, 0x2e, 0x2e, 0x2f,
 181   0x2f, 0x30, 0x30, 0x30, 0x31, 0x31, 0x32, 0x32,
 182   0x32, 0x33, 0x33, 0x34, 0x34, 0x35, 0x35, 0x35,
 183   0x36, 0x36, 0x37, 0x37, 0x37, 0x38, 0x38, 0x39,
 184   0x39, 0x39, 0x3a, 0x3a, 0x3b, 0x3b, 0x3b, 0x3c,
 185   0x3c, 0x3d, 0x3d, 0x3d, 0x3e, 0x3e, 0x3f, 0x3f,
 186   0x40, 0x40, 0x40, 0x41, 0x41, 0x41, 0x42, 0x42,
 187   0x43, 0x43, 0x43, 0x44, 0x44, 0x45, 0x45, 0x45,
 188   0x46, 0x46, 0x47, 0x47, 0x47, 0x48, 0x48, 0x49,
 189   0x49, 0x49, 0x4a, 0x4a, 0x4b, 0x4b, 0x4b, 0x4c,
 190   0x4c, 0x4c, 0x4d, 0x4d, 0x4e, 0x4e, 0x4e, 0x4f,
 191   0x4f, 0x50, 0x50, 0x50, 0x51, 0x51, 0x51, 0x52,
 192   0x52, 0x53, 0x53, 0x53, 0x54, 0x54, 0x54, 0x55,
 193   0x55, 0x56, 0x56, 0x56, 0x57, 0x57, 0x57, 0x58,
 194   0x58, 0x59, 0x59, 0x59, 0x5a, 0x5a, 0x5a, 0x5b,
 195   0x5b, 0x5b, 0x5c, 0x5c, 0x5d, 0x5d, 0x5d, 0x5e,
 196   0x5e, 0x5e, 0x5f, 0x5f, 0x60, 0x60, 0x60, 0x61,
 197   0x61, 0x61, 0x62, 0x62, 0x62, 0x63, 0x63, 0x63,
 198   0x64, 0x64, 0x65, 0x65, 0x65, 0x66, 0x66, 0x66,
 199   0x67, 0x67, 0x67, 0x68, 0x68, 0x68, 0x69, 0x69,
 200 };
 201 #endif
 202
 203 \f
 204 mp_size_t
 205 #if __STDC__
 206 mpn_sqrtrem (mp_ptr root_ptr, mp_ptr rem_ptr, mp_srcptr op_ptr, mp_size_t op_size)
 207 #else
 208 mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size)
 209      mp_ptr root_ptr;
 210      mp_ptr rem_ptr;
 211      mp_srcptr op_ptr;
 212      mp_size_t op_size;
 213 #endif
 214 {
 215   /* R (root result) */
 216   mp_ptr rp;                    /* Pointer to least significant word */
 217   mp_size_t rsize;              /* The size in words */
 218
 219   /* T (OP shifted to the left a.k.a. normalized) */
 220   mp_ptr tp;                    /* Pointer to least significant word */
 221   mp_size_t tsize;              /* The size in words */
 222   mp_ptr t_end_ptr;             /* Pointer right beyond most sign. word */
 223   mp_limb_t t_high0, t_high1;   /* The two most significant words */
 224
 225   /* TT (temporary for numerator/remainder) */
 226   mp_ptr ttp;                   /* Pointer to least significant word */
 227
 228   /* X (temporary for quotient in main loop) */
 229   mp_ptr xp;                    /* Pointer to least significant word */
 230   mp_size_t xsize;              /* The size in words */
 231
 232   unsigned cnt;
 233   mp_limb_t initial_approx;     /* Initially made approximation */
 234   mp_size_t tsizes[BITS_PER_MP_LIMB];   /* Successive calculation precisions */
 235   mp_size_t tmp;
 236   mp_size_t i;
 237
 238   mp_limb_t cy_limb;
 239   TMP_DECL (marker);
 240
 241   /* If OP is zero, both results are zero.  */
 242   if (op_size == 0)
 243     return 0;
 244
 245   count_leading_zeros (cnt, op_ptr[op_size - 1]);
 246   tsize = op_size;
 247   if ((tsize & 1) != 0)
 248     {
 249       cnt += BITS_PER_MP_LIMB;
 250       tsize++;
 251     }
 252
 253   rsize = tsize / 2;
 254   rp = root_ptr;
 255
 256   TMP_MARK (marker);
 257
 258   /* Shift OP an even number of bits into T, such that either the most or
 259      the second most significant bit is set, and such that the number of
 260      words in T becomes even.  This way, the number of words in R=sqrt(OP)
 261      is exactly half as many as in OP, and the most significant bit of R
 262      is set.
 263
 264      Also, the initial approximation is simplified by this up-shifted OP.
 265
 266      Finally, the Newtonian iteration which is the main part of this
 267      program performs division by R.  The fast division routine expects
 268      the divisor to be "normalized" in exactly the sense of having the
 269      most significant bit set.  */
 270
 271   tp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
 272
 273   if ((cnt & ~1) % BITS_PER_MP_LIMB != 0)
 274     t_high0 = mpn_lshift (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size,
 275                           (cnt & ~1) % BITS_PER_MP_LIMB);
 276   else
 277     MPN_COPY (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size);
 278
 279   if (cnt >= BITS_PER_MP_LIMB)
 280     tp[0] = 0;
 281
 282   t_high0 = tp[tsize - 1];
 283   t_high1 = tp[tsize - 2];      /* Never stray.  TSIZE is >= 2.  */
 284
 285 /* Is there a fast sqrt instruction defined for this machine?  */
 286 #ifdef SQRT
 287   {
 288     initial_approx = SQRT (t_high0 * MP_BASE_AS_DOUBLE + t_high1);
 289     /* If t_high0,,t_high1 is big, the result in INITIAL_APPROX might have
 290        become incorrect due to overflow in the conversion from double to
 291        mp_limb_t above.  It will typically be zero in that case, but might be
 292        a small number on some machines.  The most significant bit of
 293        INITIAL_APPROX should be set, so that bit is a good overflow
 294        indication.  */
 295     if ((mp_limb_signed_t) initial_approx >= 0)
 296       initial_approx = ~(mp_limb_t)0;
 297   }
 298 #else
 299   /* Get a 9 bit approximation from the tables.  The tables expect to
 300      be indexed with the 8 high bits right below the highest bit.
 301      Also, the highest result bit is not returned by the tables, and
 302      must be or:ed into the result.  The scheme gives 9 bits of start
 303      approximation with just 256-entry 8 bit tables.  */
 304
 305   if ((cnt & 1) == 0)
 306     {
 307       /* The most significant bit of t_high0 is set.  */
 308       initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 1);
 309       initial_approx &= 0xff;
 310       initial_approx = even_approx_tab[initial_approx];
 311     }
 312   else
 313     {
 314       /* The most significant bit of t_high0 is unset,
 315          the second most significant is set.  */
 316       initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 2);
 317       initial_approx &= 0xff;
 318       initial_approx = odd_approx_tab[initial_approx];
 319     }
 320   initial_approx |= 0x100;
 321   initial_approx <<= BITS_PER_MP_LIMB - 8 - 1;
 322
 323   /* Perform small precision Newtonian iterations to get a full word
 324      approximation.  For small operands, these iterations will do the
 325      entire job.  */
 326   if (t_high0 == ~(mp_limb_t)0)
 327     initial_approx = t_high0;
 328   else
 329     {
 330       mp_limb_t quot;
 331
 332       if (t_high0 >= initial_approx)
 333         initial_approx = t_high0 + 1;
 334
 335       /* First get about 18 bits with pure C arithmetics.  */
 336       quot = t_high0 / (initial_approx >> BITS_PER_MP_LIMB/2) << BITS_PER_MP_LIMB/2;
 337       initial_approx = (initial_approx + quot) / 2;
 338       initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);
 339
 340       /* Now get a full word by one (or for > 36 bit machines) several
 341          iterations.  */
 342       for (i = 18; i < BITS_PER_MP_LIMB; i <<= 1)
 343         {
 344           mp_limb_t ignored_remainder;
 345
 346           udiv_qrnnd (quot, ignored_remainder,
 347                       t_high0, t_high1, initial_approx);
 348           initial_approx = (initial_approx + quot) / 2;
 349           initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);
 350         }
 351     }
 352 #endif
 353
 354   rp[0] = initial_approx;
 355   rsize = 1;
 356
 357 #ifdef SQRT_DEBUG
 358           printf ("\n\nT = ");
 359           mpn_dump (tp, tsize);
 360 #endif
 361
 362   if (tsize > 2)
 363     {
 364       /* Determine the successive precisions to use in the iteration.  We
 365          minimize the precisions, beginning with the highest (i.e. last
 366          iteration) to the lowest (i.e. first iteration).  */
 367
 368       xp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
 369       ttp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
 370
 371       t_end_ptr = tp + tsize;
 372
 373       tmp = tsize / 2;
 374       for (i = 0;; i++)
 375         {
 376           tsize = (tmp + 1) / 2;
 377           if (tmp == tsize)
 378             break;
 379           tsizes[i] = tsize + tmp;
 380           tmp = tsize;
 381         }
 382
 383       /* Main Newton iteration loop.  For big arguments, most of the
 384          time is spent here.  */
 385
 386       /* It is possible to do a great optimization here.  The successive
 387          divisors in the mpn_divmod call below have more and more leading
 388          words equal to its predecessor.  Therefore the beginning of
 389          each division will repeat the same work as did the last
 390          division.  If we could guarantee that the leading words of two
 391          consecutive divisors are the same (i.e. in this case, a later
 392          divisor has just more digits at the end) it would be a simple
 393          matter of just using the old remainder of the last division in
 394          a subsequent division, to take care of this optimization.  This
 395          idea would surely make a difference even for small arguments.  */
 396
 397       /* Loop invariants:
 398
 399          R <= shiftdown_to_same_size(floor(sqrt(OP))) < R + 1.
 400          X - 1 < shiftdown_to_same_size(floor(sqrt(OP))) <= X.
 401          R <= shiftdown_to_same_size(X).  */
 402
 403       while (--i >= 0)
 404         {
 405           mp_limb_t cy;
 406 #ifdef SQRT_DEBUG
 407           mp_limb_t old_least_sign_r = rp[0];
 408           mp_size_t old_rsize = rsize;
 409
 410           printf ("R = ");
 411           mpn_dump (rp, rsize);
 412 #endif
 413           tsize = tsizes[i];
 414
 415           /* Need to copy the numerator into temporary space, as
 416              mpn_divmod overwrites its numerator argument with the
 417              remainder (which we currently ignore).  */
 418           MPN_COPY (ttp, t_end_ptr - tsize, tsize);
 419           cy = mpn_divmod (xp, ttp, tsize, rp, rsize);
 420           xsize = tsize - rsize;
 421
 422 #ifdef SQRT_DEBUG
 423           printf ("X =%d ", cy);
 424           mpn_dump (xp, xsize);
 425 #endif
 426
 427           /* Add X and R with the most significant limbs aligned,
 428              temporarily ignoring at least one limb at the low end of X.  */
 429           tmp = xsize - rsize;
 430           cy += mpn_add_n (xp + tmp, rp, xp + tmp, rsize);
 431
 432           /* If T begins with more than 2 x BITS_PER_MP_LIMB of ones, we get
 433              intermediate roots that'd need an extra bit.  We don't want to
 434              handle that since it would make the subsequent divisor
 435              non-normalized, so round such roots down to be only ones in the
 436              current precision.  */
 437           if (cy == 2)
 438             {
 439               mp_size_t j;
 440               for (j = xsize; j >= 0; j--)
 441                 xp[j] = ~(mp_limb_t)0;
 442             }
 443
 444           /* Divide X by 2 and put the result in R.  This is the new
 445              approximation.  Shift in the carry from the addition.  */
 446           mpn_rshift (rp, xp, xsize, 1);
 447           rp[xsize - 1] |= ((mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1));
 448           rsize = xsize;
 449 #ifdef SQRT_DEBUG
 450           if (old_least_sign_r != rp[rsize - old_rsize])
 451             printf (">>>>>>>> %d: %0*lX, %0*lX <<<<<<<<\n",
 452                     i, 2 * BYTES_PER_MP_LIMB, old_least_sign_r,
 453                     2 * BYTES_PER_MP_LIMB, rp[rsize - old_rsize]);
 454 #endif
 455         }
 456     }
 457
 458 #ifdef SQRT_DEBUG
 459   printf ("(final) R = ");
 460   mpn_dump (rp, rsize);
 461 #endif
 462
 463   /* We computed the square root of OP * 2**(2*floor(cnt/2)).
 464      This has resulted in R being 2**floor(cnt/2) to large.
 465      Shift it down here to fix that.  */
 466   if (cnt / 2 != 0)
 467     {
 468       mpn_rshift (rp, rp, rsize, cnt/2);
 469       rsize -= rp[rsize - 1] == 0;
 470     }
 471
 472   /* Calculate the remainder.  */
 473   mpn_mul_n (tp, rp, rp, rsize);
 474   tsize = rsize + rsize;
 475   tsize -= tp[tsize - 1] == 0;
 476   if (op_size < tsize
 477       || (op_size == tsize && mpn_cmp (op_ptr, tp, op_size) < 0))
 478     {
 479       /* R is too large.  Decrement it.  */
 480
 481       /* These operations can't overflow.  */
 482       cy_limb  = mpn_sub_n (tp, tp, rp, rsize);
 483       cy_limb += mpn_sub_n (tp, tp, rp, rsize);
 484       mpn_decr_u (tp + rsize, cy_limb);
 485       mpn_incr_u (tp, (mp_limb_t) 1);
 486
 487       mpn_decr_u (rp, (mp_limb_t) 1);
 488
 489 #ifdef SQRT_DEBUG
 490       printf ("(adjusted) R = ");
 491       mpn_dump (rp, rsize);
 492 #endif
 493     }
 494
 495   if (rem_ptr != NULL)
 496     {
 497       cy_limb = mpn_sub (rem_ptr, op_ptr, op_size, tp, tsize);
 498       MPN_NORMALIZE (rem_ptr, op_size);
 499       TMP_FREE (marker);
 500       return op_size;
 501     }
 502   else
 503     {
 504       int res;
 505       res = op_size != tsize || mpn_cmp (op_ptr, tp, op_size);
 506       TMP_FREE (marker);
 507       return res;
 508     }
 509 }