1 /* mpn_sqrt(root_ptr, rem_ptr, op_ptr, op_size)
3 Write the square root of {OP_PTR, OP_SIZE} at ROOT_PTR.
4 Write the remainder at REM_PTR, if REM_PTR != NULL.
5 Return the size of the remainder.
6 (The size of the root is always half of the size of the operand.)
8 OP_PTR and ROOT_PTR may not point to the same object.
9 OP_PTR and REM_PTR may point to the same object.
11 If REM_PTR is NULL, only the root is computed and the return value of
12 the function is 0 if OP is a perfect square, and *any* non-zero number
15 Copyright (C) 1991, 1993 Free Software Foundation, Inc.
17 This file is part of the GNU MP Library.
19 The GNU MP Library is free software; you can redistribute it and/or modify
20 it under the terms of the GNU General Public License as published by
21 the Free Software Foundation; either version 2, or (at your option)
24 The GNU MP Library is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 GNU General Public License for more details.
29 You should have received a copy of the GNU General Public License
30 along with the GNU MP Library; see the file COPYING. If not, write to
31 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */
33 /* This code is just correct if "unsigned char" has at least 8 bits. It
34 doesn't help to use CHAR_BIT from limits.h, as the real problem is
41 /* Square root algorithm:
43 1. Shift OP (the input) to the left an even number of bits s.t. there
44 are an even number of words and either (or both) of the most
45 significant bits are set. This way, sqrt(OP) has exactly half as
46 many words as OP, and has its most significant bit set.
48 2. Get a 9-bit approximation to sqrt(OP) using the pre-computed tables.
49 This approximation is used for the first single-precision
50 iterations of Newton's method, yielding a full-word approximation
53 3. Perform multiple-precision Newton iteration until we have the
54 exact result. Only about half of the input operand is used in
55 this calculation, as the square root is perfectly determinable
56 from just the higher half of a number. */
58 /* Define this macro for IEEE P854 machines with a fast sqrt instruction. */
65 asm ("fsqrtd %1,%0" : "=f" (__sqrt_res) : "f" (a)); \
70 #if defined __HAVE_68881__
74 asm ("fsqrtx %1,%0" : "=f" (__sqrt_res) : "f" (a)); \
83 asm ("fsqrt,dbl %1,%0" : "=fx" (__sqrt_res) : "fx" (a)); \
92 /* Tables for initial approximation of the square root. These are
93 indexed with bits 1-8 of the operand for which the square root is
94 calculated, where bit 0 is the most significant non-zero bit. I.e.
95 the most significant one-bit is not used, since that per definition
96 is one. Likewise, the tables don't return the highest bit of the
97 result. That bit must be inserted by or:ing the returned value with
98 0x100. This way, we get a 9-bit approximation from 8-bit tables! */
100 /* Table to be used for operands with an even total number of bits.
101 (Exactly as in the decimal system there are similarities between the
102 square root of numbers with the same initial digits and an even
103 difference in the total number of digits. Consider the square root
104 of 1, 10, 100, 1000, ...) */
105 static unsigned char even_approx_tab[256] =
107 0x6a, 0x6a, 0x6b, 0x6c, 0x6c, 0x6d, 0x6e, 0x6e,
108 0x6f, 0x70, 0x71, 0x71, 0x72, 0x73, 0x73, 0x74,
109 0x75, 0x75, 0x76, 0x77, 0x77, 0x78, 0x79, 0x79,
110 0x7a, 0x7b, 0x7b, 0x7c, 0x7d, 0x7d, 0x7e, 0x7f,
111 0x80, 0x80, 0x81, 0x81, 0x82, 0x83, 0x83, 0x84,
112 0x85, 0x85, 0x86, 0x87, 0x87, 0x88, 0x89, 0x89,
113 0x8a, 0x8b, 0x8b, 0x8c, 0x8d, 0x8d, 0x8e, 0x8f,
114 0x8f, 0x90, 0x90, 0x91, 0x92, 0x92, 0x93, 0x94,
115 0x94, 0x95, 0x96, 0x96, 0x97, 0x97, 0x98, 0x99,
116 0x99, 0x9a, 0x9b, 0x9b, 0x9c, 0x9c, 0x9d, 0x9e,
117 0x9e, 0x9f, 0xa0, 0xa0, 0xa1, 0xa1, 0xa2, 0xa3,
118 0xa3, 0xa4, 0xa4, 0xa5, 0xa6, 0xa6, 0xa7, 0xa7,
119 0xa8, 0xa9, 0xa9, 0xaa, 0xaa, 0xab, 0xac, 0xac,
120 0xad, 0xad, 0xae, 0xaf, 0xaf, 0xb0, 0xb0, 0xb1,
121 0xb2, 0xb2, 0xb3, 0xb3, 0xb4, 0xb5, 0xb5, 0xb6,
122 0xb6, 0xb7, 0xb7, 0xb8, 0xb9, 0xb9, 0xba, 0xba,
123 0xbb, 0xbb, 0xbc, 0xbd, 0xbd, 0xbe, 0xbe, 0xbf,
124 0xc0, 0xc0, 0xc1, 0xc1, 0xc2, 0xc2, 0xc3, 0xc3,
125 0xc4, 0xc5, 0xc5, 0xc6, 0xc6, 0xc7, 0xc7, 0xc8,
126 0xc9, 0xc9, 0xca, 0xca, 0xcb, 0xcb, 0xcc, 0xcc,
127 0xcd, 0xce, 0xce, 0xcf, 0xcf, 0xd0, 0xd0, 0xd1,
128 0xd1, 0xd2, 0xd3, 0xd3, 0xd4, 0xd4, 0xd5, 0xd5,
129 0xd6, 0xd6, 0xd7, 0xd7, 0xd8, 0xd9, 0xd9, 0xda,
130 0xda, 0xdb, 0xdb, 0xdc, 0xdc, 0xdd, 0xdd, 0xde,
131 0xde, 0xdf, 0xe0, 0xe0, 0xe1, 0xe1, 0xe2, 0xe2,
132 0xe3, 0xe3, 0xe4, 0xe4, 0xe5, 0xe5, 0xe6, 0xe6,
133 0xe7, 0xe7, 0xe8, 0xe8, 0xe9, 0xea, 0xea, 0xeb,
134 0xeb, 0xec, 0xec, 0xed, 0xed, 0xee, 0xee, 0xef,
135 0xef, 0xf0, 0xf0, 0xf1, 0xf1, 0xf2, 0xf2, 0xf3,
136 0xf3, 0xf4, 0xf4, 0xf5, 0xf5, 0xf6, 0xf6, 0xf7,
137 0xf7, 0xf8, 0xf8, 0xf9, 0xf9, 0xfa, 0xfa, 0xfb,
138 0xfb, 0xfc, 0xfc, 0xfd, 0xfd, 0xfe, 0xfe, 0xff,
141 /* Table to be used for operands with an odd total number of bits.
142 (Further comments before previous table.) */
143 static unsigned char odd_approx_tab[256] =
145 0x00, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
146 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07,
147 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b,
148 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f,
149 0x0f, 0x10, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12,
150 0x13, 0x13, 0x14, 0x14, 0x15, 0x15, 0x16, 0x16,
151 0x16, 0x17, 0x17, 0x18, 0x18, 0x19, 0x19, 0x1a,
152 0x1a, 0x1b, 0x1b, 0x1b, 0x1c, 0x1c, 0x1d, 0x1d,
153 0x1e, 0x1e, 0x1f, 0x1f, 0x20, 0x20, 0x20, 0x21,
154 0x21, 0x22, 0x22, 0x23, 0x23, 0x23, 0x24, 0x24,
155 0x25, 0x25, 0x26, 0x26, 0x27, 0x27, 0x27, 0x28,
156 0x28, 0x29, 0x29, 0x2a, 0x2a, 0x2a, 0x2b, 0x2b,
157 0x2c, 0x2c, 0x2d, 0x2d, 0x2d, 0x2e, 0x2e, 0x2f,
158 0x2f, 0x30, 0x30, 0x30, 0x31, 0x31, 0x32, 0x32,
159 0x32, 0x33, 0x33, 0x34, 0x34, 0x35, 0x35, 0x35,
160 0x36, 0x36, 0x37, 0x37, 0x37, 0x38, 0x38, 0x39,
161 0x39, 0x39, 0x3a, 0x3a, 0x3b, 0x3b, 0x3b, 0x3c,
162 0x3c, 0x3d, 0x3d, 0x3d, 0x3e, 0x3e, 0x3f, 0x3f,
163 0x40, 0x40, 0x40, 0x41, 0x41, 0x41, 0x42, 0x42,
164 0x43, 0x43, 0x43, 0x44, 0x44, 0x45, 0x45, 0x45,
165 0x46, 0x46, 0x47, 0x47, 0x47, 0x48, 0x48, 0x49,
166 0x49, 0x49, 0x4a, 0x4a, 0x4b, 0x4b, 0x4b, 0x4c,
167 0x4c, 0x4c, 0x4d, 0x4d, 0x4e, 0x4e, 0x4e, 0x4f,
168 0x4f, 0x50, 0x50, 0x50, 0x51, 0x51, 0x51, 0x52,
169 0x52, 0x53, 0x53, 0x53, 0x54, 0x54, 0x54, 0x55,
170 0x55, 0x56, 0x56, 0x56, 0x57, 0x57, 0x57, 0x58,
171 0x58, 0x59, 0x59, 0x59, 0x5a, 0x5a, 0x5a, 0x5b,
172 0x5b, 0x5b, 0x5c, 0x5c, 0x5d, 0x5d, 0x5d, 0x5e,
173 0x5e, 0x5e, 0x5f, 0x5f, 0x60, 0x60, 0x60, 0x61,
174 0x61, 0x61, 0x62, 0x62, 0x62, 0x63, 0x63, 0x63,
175 0x64, 0x64, 0x65, 0x65, 0x65, 0x66, 0x66, 0x66,
176 0x67, 0x67, 0x67, 0x68, 0x68, 0x68, 0x69, 0x69,
183 mpn_sqrt (mp_ptr root_ptr, mp_ptr rem_ptr, mp_srcptr op_ptr, mp_size op_size)
185 mpn_sqrt (root_ptr, rem_ptr, op_ptr, op_size)
192 /* R (root result) */
193 mp_ptr rp; /* Pointer to least significant word */
194 mp_size rsize; /* The size in words */
196 /* T (OP shifted to the left a.k.a. normalized) */
197 mp_ptr tp; /* Pointer to least significant word */
198 mp_size tsize; /* The size in words */
199 mp_ptr t_end_ptr; /* Pointer right beyond most sign. word */
200 mp_limb t_high0, t_high1; /* The two most significant words */
202 /* TT (temporary for numerator/remainder) */
203 mp_ptr ttp; /* Pointer to least significant word */
205 /* X (temporary for quotient in main loop) */
206 mp_ptr xp; /* Pointer to least significant word */
207 mp_size xsize; /* The size in words */
210 mp_limb initial_approx; /* Initially made approximation */
211 mp_size tsizes[BITS_PER_MP_LIMB]; /* Successive calculation precisions */
215 /* If OP is zero, both results are zero. */
219 count_leading_zeros (cnt, op_ptr[op_size - 1]);
221 if ((tsize & 1) != 0)
223 cnt += BITS_PER_MP_LIMB;
230 /* Shift OP an even number of bits into T, such that either the most or
231 the second most significant bit is set, and such that the number of
232 words in T becomes even. This way, the number of words in R=sqrt(OP)
233 is exactly half as many as in OP, and the most significant bit of R
236 Also, the initial approximation is simplified by this up-shifted OP.
238 Finally, the Newtonian iteration which is the main part of this
239 program performs division by R. The fast division routine expects
240 the divisor to be "normalized" in exactly the sense of having the
241 most significant bit set. */
243 tp = (mp_ptr) alloca (tsize * BYTES_PER_MP_LIMB);
245 t_high0 = mpn_lshift (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size,
246 (cnt & ~1) % BITS_PER_MP_LIMB);
247 if (cnt >= BITS_PER_MP_LIMB)
250 t_high0 = tp[tsize - 1];
251 t_high1 = tp[tsize - 2]; /* Never stray. TSIZE is >= 2. */
253 /* Is there a fast sqrt instruction defined for this machine? */
256 initial_approx = SQRT (t_high0 * 2.0
257 * ((mp_limb) 1 << (BITS_PER_MP_LIMB - 1))
259 /* If t_high0,,t_high1 is big, the result in INITIAL_APPROX might have
260 become incorrect due to overflow in the conversion from double to
261 mp_limb above. It will typically be zero in that case, but might be
262 a small number on some machines. The most significant bit of
263 INITIAL_APPROX should be set, so that bit is a good overflow
265 if ((mp_limb_signed) initial_approx >= 0)
269 /* Get a 9 bit approximation from the tables. The tables expect to
270 be indexed with the 8 high bits right below the highest bit.
271 Also, the highest result bit is not returned by the tables, and
272 must be or:ed into the result. The scheme gives 9 bits of start
273 approximation with just 256-entry 8 bit tables. */
277 /* The most sign bit of t_high0 is set. */
278 initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 1);
279 initial_approx &= 0xff;
280 initial_approx = even_approx_tab[initial_approx];
284 /* The most significant bit of T_HIGH0 is unset,
285 the second most significant is set. */
286 initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 2);
287 initial_approx &= 0xff;
288 initial_approx = odd_approx_tab[initial_approx];
290 initial_approx |= 0x100;
291 initial_approx <<= BITS_PER_MP_LIMB - 8 - 1;
293 /* Perform small precision Newtonian iterations to get a full word
294 approximation. For small operands, these iteration will make the
297 initial_approx = t_high0;
302 if (t_high0 >= initial_approx)
303 initial_approx = t_high0 + 1;
305 /* First get about 18 bits with pure C arithmetics. */
306 quot = t_high0 / (initial_approx >> BITS_PER_MP_LIMB/2) << BITS_PER_MP_LIMB/2;
307 initial_approx = (initial_approx + quot) / 2;
308 initial_approx |= (mp_limb) 1 << (BITS_PER_MP_LIMB - 1);
310 /* Now get a full word by one (or for > 36 bit machines) several
312 for (i = 16; i < BITS_PER_MP_LIMB; i <<= 1)
314 mp_limb ignored_remainder;
316 udiv_qrnnd (quot, ignored_remainder,
317 t_high0, t_high1, initial_approx);
318 initial_approx = (initial_approx + quot) / 2;
319 initial_approx |= (mp_limb) 1 << (BITS_PER_MP_LIMB - 1);
324 rp[0] = initial_approx;
327 xp = (mp_ptr) alloca (tsize * BYTES_PER_MP_LIMB);
328 ttp = (mp_ptr) alloca (tsize * BYTES_PER_MP_LIMB);
330 t_end_ptr = tp + tsize;
332 #ifdef GMP_DEBUG /* partain: was DEBUG */
334 _mp_mout (tp, tsize);
339 /* Determine the successive precisions to use in the iteration. We
340 minimize the precisions, beginning with the highest (i.e. last
341 iteration) to the lowest (i.e. first iteration). */
346 tsize = (tmp + 1) / 2;
349 tsizes[i] = tsize + tmp;
353 /* Main Newton iteration loop. For big arguments, most of the
354 time is spent here. */
356 /* It is possible to do a great optimization here. The successive
357 divisors in the mpn_div call below has more and more leading
358 words equal to its predecessor. Therefore the beginning of
359 each division will repeat the same work as did the last
360 division. If we could guarantee that the leading words of two
361 consecutive divisors are the same (i.e. in this case, a later
362 divisor has just more digits at the end) it would be a simple
363 matter of just using the old remainder of the last division in
364 a subsequent division, to take care of this optimization. This
365 idea would surely make a difference even for small arguments. */
369 R <= shiftdown_to_same_size(floor(sqrt(OP))) < R + 1.
370 X - 1 < shiftdown_to_same_size(floor(sqrt(OP))) <= X.
371 R <= shiftdown_to_same_size(X). */
376 #ifdef GMP_DEBUG /* partain: was DEBUG */
377 mp_limb old_least_sign_r = rp[0];
378 mp_size old_rsize = rsize;
381 _mp_mout (rp, rsize);
385 /* Need to copy the numerator into temporary space, as
386 mpn_div overwrites its numerator argument with the
387 remainder (which we currently ignore). */
388 MPN_COPY (ttp, t_end_ptr - tsize, tsize);
389 cy = mpn_div (xp, ttp, tsize, rp, rsize);
390 xsize = tsize - rsize;
391 cy = cy ? xp[xsize] : 0;
393 #ifdef GMP_DEBUG /* partain: was DEBUG */
394 printf ("X =%d", cy);
395 _mp_mout (xp, xsize);
398 /* Add X and R with the most significant limbs aligned,
399 temporarily ignoring at least one limb at the low end of X. */
401 cy += mpn_add (xp + tmp, rp, rsize, xp + tmp, rsize);
403 /* If T begins with more than 2 x BITS_PER_MP_LIMB of ones, we get
404 intermediate roots that'd need an extra bit. We don't want to
405 handle that since it would make the subsequent divisor
406 non-normalized, so round such roots down to be only ones in the
407 current precision. */
411 for (j = xsize; j >= 0; j--)
415 /* Divide X by 2 and put the result in R. This is the new
416 approximation. Shift in the carry from the addition. */
417 rsize = mpn_rshiftci (rp, xp, xsize, 1, (mp_limb) 1);
418 #ifdef GMP_DEBUG /* partain: was DEBUG */
419 if (old_least_sign_r != rp[rsize - old_rsize])
420 printf (">>>>>>>> %d: %08x, %08x <<<<<<<<\n",
421 i, old_least_sign_r, rp[rsize - old_rsize]);
426 #ifdef GMP_DEBUG /* partain: was DEBUG */
427 printf ("(final) R = ");
428 _mp_mout (rp, rsize);
431 /* We computed the square root of OP * 2**(2*floor(cnt/2)).
432 This has resulted in R being 2**floor(cnt/2) to large.
433 Shift it down here to fix that. */
434 rsize = mpn_rshift (rp, rp, rsize, cnt/2);
436 /* Calculate the remainder. */
437 tsize = mpn_mul (tp, rp, rsize, rp, rsize);
439 || (op_size == tsize && mpn_cmp (op_ptr, tp, op_size) < 0))
441 /* R is too large. Decrement it. */
444 tsize = tsize + mpn_sub (tp, tp, tsize, rp, rsize);
445 tsize = tsize + mpn_sub (tp, tp, tsize, rp, rsize);
446 tsize = tsize + mpn_add (tp, tp, tsize, &one, 1);
448 (void) mpn_sub (rp, rp, rsize, &one, 1);
450 #ifdef GMP_DEBUG /* partain: was DEBUG */
451 printf ("(adjusted) R = ");
452 _mp_mout (rp, rsize);
458 mp_size retval = op_size + mpn_sub (rem_ptr, op_ptr, op_size, tp, tsize);
464 mp_size retval = (op_size != tsize || mpn_cmp (op_ptr, tp, op_size));
470 #ifdef GMP_DEBUG /* partain: was DEBUG */
471 _mp_mout (mp_srcptr p, mp_size size)
474 for (ii = size - 1; ii >= 0; ii--)
475 printf ("%08X", p[ii]);