1 /* mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size)
3 Write the square root of {OP_PTR, OP_SIZE} at ROOT_PTR.
4 Write the remainder at REM_PTR, if REM_PTR != NULL.
5 Return the size of the remainder.
6 (The size of the root is always half of the size of the operand.)
8 OP_PTR and ROOT_PTR may not point to the same object.
9 OP_PTR and REM_PTR may point to the same object.
11 If REM_PTR is NULL, only the root is computed and the return value of
12 the function is 0 if OP is a perfect square, and *any* non-zero number
15 Copyright (C) 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software
18 This file is part of the GNU MP Library.
20 The GNU MP Library is free software; you can redistribute it and/or modify
21 it under the terms of the GNU Lesser General Public License as published by
22 the Free Software Foundation; either version 2.1 of the License, or (at your
23 option) any later version.
25 The GNU MP Library is distributed in the hope that it will be useful, but
26 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
27 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
28 License for more details.
30 You should have received a copy of the GNU Lesser General Public License
31 along with the GNU MP Library; see the file COPYING.LIB. If not, write to
32 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
33 MA 02111-1307, USA. */
35 /* This code is just correct if "unsigned char" has at least 8 bits. It
36 doesn't help to use CHAR_BIT from limits.h, as the real problem is
39 #include <stdio.h> /* for NULL */
44 /* Square root algorithm:
46 1. Shift OP (the input) to the left an even number of bits s.t. there
47 are an even number of words and either (or both) of the most
48 significant bits are set. This way, sqrt(OP) has exactly half as
49 many words as OP, and has its most significant bit set.
51 2. Get a 9-bit approximation to sqrt(OP) using the pre-computed tables.
52 This approximation is used for the first single-precision
53 iterations of Newton's method, yielding a full-word approximation
56 3. Perform multiple-precision Newton iteration until we have the
57 exact result. Only about half of the input operand is used in
58 this calculation, as the square root is perfectly determinable
59 from just the higher half of a number. */
61 /* Define this macro for IEEE P854 machines with a fast sqrt instruction. */
62 #if defined __GNUC__ && ! defined __SOFT_FLOAT
64 #if defined (__sparc__) && BITS_PER_MP_LIMB == 32
68 asm ("fsqrtd %1,%0" : "=f" (__sqrt_res) : "f" (a)); \
73 #if defined (__HAVE_68881__)
77 asm ("fsqrtx %1,%0" : "=f" (__sqrt_res) : "f" (a)); \
82 #if defined (__hppa) && BITS_PER_MP_LIMB == 32
86 asm ("fsqrt,dbl %1,%0" : "=fx" (__sqrt_res) : "fx" (a)); \
91 #if defined (_ARCH_PWR2) && BITS_PER_MP_LIMB == 32
95 asm ("fsqrt %0,%1" : "=f" (__sqrt_res) : "f" (a)); \
101 #if defined (__i386__) || defined (__i486__)
105 asm ("fsqrt" : "=t" (__sqrt_res) : "0" (a)); \
115 /* Tables for initial approximation of the square root. These are
116 indexed with bits 1-8 of the operand for which the square root is
117 calculated, where bit 0 is the most significant non-zero bit. I.e.
118 the most significant one-bit is not used, since that per definition
119 is one. Likewise, the tables don't return the highest bit of the
120 result. That bit must be inserted by or:ing the returned value with
121 0x100. This way, we get a 9-bit approximation from 8-bit tables! */
123 /* Table to be used for operands with an even total number of bits.
124 (Exactly as in the decimal system there are similarities between the
125 square root of numbers with the same initial digits and an even
126 difference in the total number of digits. Consider the square root
127 of 1, 10, 100, 1000, ...) */
128 static const unsigned char even_approx_tab[256] =
130 0x6a, 0x6a, 0x6b, 0x6c, 0x6c, 0x6d, 0x6e, 0x6e,
131 0x6f, 0x70, 0x71, 0x71, 0x72, 0x73, 0x73, 0x74,
132 0x75, 0x75, 0x76, 0x77, 0x77, 0x78, 0x79, 0x79,
133 0x7a, 0x7b, 0x7b, 0x7c, 0x7d, 0x7d, 0x7e, 0x7f,
134 0x80, 0x80, 0x81, 0x81, 0x82, 0x83, 0x83, 0x84,
135 0x85, 0x85, 0x86, 0x87, 0x87, 0x88, 0x89, 0x89,
136 0x8a, 0x8b, 0x8b, 0x8c, 0x8d, 0x8d, 0x8e, 0x8f,
137 0x8f, 0x90, 0x90, 0x91, 0x92, 0x92, 0x93, 0x94,
138 0x94, 0x95, 0x96, 0x96, 0x97, 0x97, 0x98, 0x99,
139 0x99, 0x9a, 0x9b, 0x9b, 0x9c, 0x9c, 0x9d, 0x9e,
140 0x9e, 0x9f, 0xa0, 0xa0, 0xa1, 0xa1, 0xa2, 0xa3,
141 0xa3, 0xa4, 0xa4, 0xa5, 0xa6, 0xa6, 0xa7, 0xa7,
142 0xa8, 0xa9, 0xa9, 0xaa, 0xaa, 0xab, 0xac, 0xac,
143 0xad, 0xad, 0xae, 0xaf, 0xaf, 0xb0, 0xb0, 0xb1,
144 0xb2, 0xb2, 0xb3, 0xb3, 0xb4, 0xb5, 0xb5, 0xb6,
145 0xb6, 0xb7, 0xb7, 0xb8, 0xb9, 0xb9, 0xba, 0xba,
146 0xbb, 0xbb, 0xbc, 0xbd, 0xbd, 0xbe, 0xbe, 0xbf,
147 0xc0, 0xc0, 0xc1, 0xc1, 0xc2, 0xc2, 0xc3, 0xc3,
148 0xc4, 0xc5, 0xc5, 0xc6, 0xc6, 0xc7, 0xc7, 0xc8,
149 0xc9, 0xc9, 0xca, 0xca, 0xcb, 0xcb, 0xcc, 0xcc,
150 0xcd, 0xce, 0xce, 0xcf, 0xcf, 0xd0, 0xd0, 0xd1,
151 0xd1, 0xd2, 0xd3, 0xd3, 0xd4, 0xd4, 0xd5, 0xd5,
152 0xd6, 0xd6, 0xd7, 0xd7, 0xd8, 0xd9, 0xd9, 0xda,
153 0xda, 0xdb, 0xdb, 0xdc, 0xdc, 0xdd, 0xdd, 0xde,
154 0xde, 0xdf, 0xe0, 0xe0, 0xe1, 0xe1, 0xe2, 0xe2,
155 0xe3, 0xe3, 0xe4, 0xe4, 0xe5, 0xe5, 0xe6, 0xe6,
156 0xe7, 0xe7, 0xe8, 0xe8, 0xe9, 0xea, 0xea, 0xeb,
157 0xeb, 0xec, 0xec, 0xed, 0xed, 0xee, 0xee, 0xef,
158 0xef, 0xf0, 0xf0, 0xf1, 0xf1, 0xf2, 0xf2, 0xf3,
159 0xf3, 0xf4, 0xf4, 0xf5, 0xf5, 0xf6, 0xf6, 0xf7,
160 0xf7, 0xf8, 0xf8, 0xf9, 0xf9, 0xfa, 0xfa, 0xfb,
161 0xfb, 0xfc, 0xfc, 0xfd, 0xfd, 0xfe, 0xfe, 0xff,
164 /* Table to be used for operands with an odd total number of bits.
165 (Further comments before previous table.) */
166 static const unsigned char odd_approx_tab[256] =
168 0x00, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
169 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07,
170 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b,
171 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f,
172 0x0f, 0x10, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12,
173 0x13, 0x13, 0x14, 0x14, 0x15, 0x15, 0x16, 0x16,
174 0x16, 0x17, 0x17, 0x18, 0x18, 0x19, 0x19, 0x1a,
175 0x1a, 0x1b, 0x1b, 0x1b, 0x1c, 0x1c, 0x1d, 0x1d,
176 0x1e, 0x1e, 0x1f, 0x1f, 0x20, 0x20, 0x20, 0x21,
177 0x21, 0x22, 0x22, 0x23, 0x23, 0x23, 0x24, 0x24,
178 0x25, 0x25, 0x26, 0x26, 0x27, 0x27, 0x27, 0x28,
179 0x28, 0x29, 0x29, 0x2a, 0x2a, 0x2a, 0x2b, 0x2b,
180 0x2c, 0x2c, 0x2d, 0x2d, 0x2d, 0x2e, 0x2e, 0x2f,
181 0x2f, 0x30, 0x30, 0x30, 0x31, 0x31, 0x32, 0x32,
182 0x32, 0x33, 0x33, 0x34, 0x34, 0x35, 0x35, 0x35,
183 0x36, 0x36, 0x37, 0x37, 0x37, 0x38, 0x38, 0x39,
184 0x39, 0x39, 0x3a, 0x3a, 0x3b, 0x3b, 0x3b, 0x3c,
185 0x3c, 0x3d, 0x3d, 0x3d, 0x3e, 0x3e, 0x3f, 0x3f,
186 0x40, 0x40, 0x40, 0x41, 0x41, 0x41, 0x42, 0x42,
187 0x43, 0x43, 0x43, 0x44, 0x44, 0x45, 0x45, 0x45,
188 0x46, 0x46, 0x47, 0x47, 0x47, 0x48, 0x48, 0x49,
189 0x49, 0x49, 0x4a, 0x4a, 0x4b, 0x4b, 0x4b, 0x4c,
190 0x4c, 0x4c, 0x4d, 0x4d, 0x4e, 0x4e, 0x4e, 0x4f,
191 0x4f, 0x50, 0x50, 0x50, 0x51, 0x51, 0x51, 0x52,
192 0x52, 0x53, 0x53, 0x53, 0x54, 0x54, 0x54, 0x55,
193 0x55, 0x56, 0x56, 0x56, 0x57, 0x57, 0x57, 0x58,
194 0x58, 0x59, 0x59, 0x59, 0x5a, 0x5a, 0x5a, 0x5b,
195 0x5b, 0x5b, 0x5c, 0x5c, 0x5d, 0x5d, 0x5d, 0x5e,
196 0x5e, 0x5e, 0x5f, 0x5f, 0x60, 0x60, 0x60, 0x61,
197 0x61, 0x61, 0x62, 0x62, 0x62, 0x63, 0x63, 0x63,
198 0x64, 0x64, 0x65, 0x65, 0x65, 0x66, 0x66, 0x66,
199 0x67, 0x67, 0x67, 0x68, 0x68, 0x68, 0x69, 0x69,
206 mpn_sqrtrem (mp_ptr root_ptr, mp_ptr rem_ptr, mp_srcptr op_ptr, mp_size_t op_size)
208 mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size)
215 /* R (root result) */
216 mp_ptr rp; /* Pointer to least significant word */
217 mp_size_t rsize; /* The size in words */
219 /* T (OP shifted to the left a.k.a. normalized) */
220 mp_ptr tp; /* Pointer to least significant word */
221 mp_size_t tsize; /* The size in words */
222 mp_ptr t_end_ptr; /* Pointer right beyond most sign. word */
223 mp_limb_t t_high0, t_high1; /* The two most significant words */
225 /* TT (temporary for numerator/remainder) */
226 mp_ptr ttp; /* Pointer to least significant word */
228 /* X (temporary for quotient in main loop) */
229 mp_ptr xp; /* Pointer to least significant word */
230 mp_size_t xsize; /* The size in words */
233 mp_limb_t initial_approx; /* Initially made approximation */
234 mp_size_t tsizes[BITS_PER_MP_LIMB]; /* Successive calculation precisions */
241 /* If OP is zero, both results are zero. */
245 count_leading_zeros (cnt, op_ptr[op_size - 1]);
247 if ((tsize & 1) != 0)
249 cnt += BITS_PER_MP_LIMB;
258 /* Shift OP an even number of bits into T, such that either the most or
259 the second most significant bit is set, and such that the number of
260 words in T becomes even. This way, the number of words in R=sqrt(OP)
261 is exactly half as many as in OP, and the most significant bit of R
264 Also, the initial approximation is simplified by this up-shifted OP.
266 Finally, the Newtonian iteration which is the main part of this
267 program performs division by R. The fast division routine expects
268 the divisor to be "normalized" in exactly the sense of having the
269 most significant bit set. */
271 tp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
273 if ((cnt & ~1) % BITS_PER_MP_LIMB != 0)
274 t_high0 = mpn_lshift (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size,
275 (cnt & ~1) % BITS_PER_MP_LIMB);
277 MPN_COPY (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size);
279 if (cnt >= BITS_PER_MP_LIMB)
282 t_high0 = tp[tsize - 1];
283 t_high1 = tp[tsize - 2]; /* Never stray. TSIZE is >= 2. */
285 /* Is there a fast sqrt instruction defined for this machine? */
288 initial_approx = SQRT (t_high0 * MP_BASE_AS_DOUBLE + t_high1);
289 /* If t_high0,,t_high1 is big, the result in INITIAL_APPROX might have
290 become incorrect due to overflow in the conversion from double to
291 mp_limb_t above. It will typically be zero in that case, but might be
292 a small number on some machines. The most significant bit of
293 INITIAL_APPROX should be set, so that bit is a good overflow
295 if ((mp_limb_signed_t) initial_approx >= 0)
296 initial_approx = ~(mp_limb_t)0;
299 /* Get a 9 bit approximation from the tables. The tables expect to
300 be indexed with the 8 high bits right below the highest bit.
301 Also, the highest result bit is not returned by the tables, and
302 must be or:ed into the result. The scheme gives 9 bits of start
303 approximation with just 256-entry 8 bit tables. */
307 /* The most significant bit of t_high0 is set. */
308 initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 1);
309 initial_approx &= 0xff;
310 initial_approx = even_approx_tab[initial_approx];
314 /* The most significant bit of t_high0 is unset,
315 the second most significant is set. */
316 initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 2);
317 initial_approx &= 0xff;
318 initial_approx = odd_approx_tab[initial_approx];
320 initial_approx |= 0x100;
321 initial_approx <<= BITS_PER_MP_LIMB - 8 - 1;
323 /* Perform small precision Newtonian iterations to get a full word
324 approximation. For small operands, these iterations will do the
326 if (t_high0 == ~(mp_limb_t)0)
327 initial_approx = t_high0;
332 if (t_high0 >= initial_approx)
333 initial_approx = t_high0 + 1;
335 /* First get about 18 bits with pure C arithmetics. */
336 quot = t_high0 / (initial_approx >> BITS_PER_MP_LIMB/2) << BITS_PER_MP_LIMB/2;
337 initial_approx = (initial_approx + quot) / 2;
338 initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);
340 /* Now get a full word by one (or for > 36 bit machines) several
342 for (i = 18; i < BITS_PER_MP_LIMB; i <<= 1)
344 mp_limb_t ignored_remainder;
346 udiv_qrnnd (quot, ignored_remainder,
347 t_high0, t_high1, initial_approx);
348 initial_approx = (initial_approx + quot) / 2;
349 initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);
354 rp[0] = initial_approx;
359 mpn_dump (tp, tsize);
364 /* Determine the successive precisions to use in the iteration. We
365 minimize the precisions, beginning with the highest (i.e. last
366 iteration) to the lowest (i.e. first iteration). */
368 xp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
369 ttp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
371 t_end_ptr = tp + tsize;
376 tsize = (tmp + 1) / 2;
379 tsizes[i] = tsize + tmp;
383 /* Main Newton iteration loop. For big arguments, most of the
384 time is spent here. */
386 /* It is possible to do a great optimization here. The successive
387 divisors in the mpn_divmod call below have more and more leading
388 words equal to its predecessor. Therefore the beginning of
389 each division will repeat the same work as did the last
390 division. If we could guarantee that the leading words of two
391 consecutive divisors are the same (i.e. in this case, a later
392 divisor has just more digits at the end) it would be a simple
393 matter of just using the old remainder of the last division in
394 a subsequent division, to take care of this optimization. This
395 idea would surely make a difference even for small arguments. */
399 R <= shiftdown_to_same_size(floor(sqrt(OP))) < R + 1.
400 X - 1 < shiftdown_to_same_size(floor(sqrt(OP))) <= X.
401 R <= shiftdown_to_same_size(X). */
407 mp_limb_t old_least_sign_r = rp[0];
408 mp_size_t old_rsize = rsize;
411 mpn_dump (rp, rsize);
415 /* Need to copy the numerator into temporary space, as
416 mpn_divmod overwrites its numerator argument with the
417 remainder (which we currently ignore). */
418 MPN_COPY (ttp, t_end_ptr - tsize, tsize);
419 cy = mpn_divmod (xp, ttp, tsize, rp, rsize);
420 xsize = tsize - rsize;
423 printf ("X =%d ", cy);
424 mpn_dump (xp, xsize);
427 /* Add X and R with the most significant limbs aligned,
428 temporarily ignoring at least one limb at the low end of X. */
430 cy += mpn_add_n (xp + tmp, rp, xp + tmp, rsize);
432 /* If T begins with more than 2 x BITS_PER_MP_LIMB of ones, we get
433 intermediate roots that'd need an extra bit. We don't want to
434 handle that since it would make the subsequent divisor
435 non-normalized, so round such roots down to be only ones in the
436 current precision. */
440 for (j = xsize; j >= 0; j--)
441 xp[j] = ~(mp_limb_t)0;
444 /* Divide X by 2 and put the result in R. This is the new
445 approximation. Shift in the carry from the addition. */
446 mpn_rshift (rp, xp, xsize, 1);
447 rp[xsize - 1] |= ((mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1));
450 if (old_least_sign_r != rp[rsize - old_rsize])
451 printf (">>>>>>>> %d: %0*lX, %0*lX <<<<<<<<\n",
452 i, 2 * BYTES_PER_MP_LIMB, old_least_sign_r,
453 2 * BYTES_PER_MP_LIMB, rp[rsize - old_rsize]);
459 printf ("(final) R = ");
460 mpn_dump (rp, rsize);
463 /* We computed the square root of OP * 2**(2*floor(cnt/2)).
464 This has resulted in R being 2**floor(cnt/2) to large.
465 Shift it down here to fix that. */
468 mpn_rshift (rp, rp, rsize, cnt/2);
469 rsize -= rp[rsize - 1] == 0;
472 /* Calculate the remainder. */
473 mpn_mul_n (tp, rp, rp, rsize);
474 tsize = rsize + rsize;
475 tsize -= tp[tsize - 1] == 0;
477 || (op_size == tsize && mpn_cmp (op_ptr, tp, op_size) < 0))
479 /* R is too large. Decrement it. */
481 /* These operations can't overflow. */
482 cy_limb = mpn_sub_n (tp, tp, rp, rsize);
483 cy_limb += mpn_sub_n (tp, tp, rp, rsize);
484 mpn_decr_u (tp + rsize, cy_limb);
485 mpn_incr_u (tp, (mp_limb_t) 1);
487 mpn_decr_u (rp, (mp_limb_t) 1);
490 printf ("(adjusted) R = ");
491 mpn_dump (rp, rsize);
497 cy_limb = mpn_sub (rem_ptr, op_ptr, op_size, tp, tsize);
498 MPN_NORMALIZE (rem_ptr, op_size);
505 res = op_size != tsize || mpn_cmp (op_ptr, tp, op_size);