1 /* mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size)
3 Write the square root of {OP_PTR, OP_SIZE} at ROOT_PTR.
4 Write the remainder at REM_PTR, if REM_PTR != NULL.
5 Return the size of the remainder.
6 (The size of the root is always half of the size of the operand.)
8 OP_PTR and ROOT_PTR may not point to the same object.
9 OP_PTR and REM_PTR may point to the same object.
11 If REM_PTR is NULL, only the root is computed and the return value of
12 the function is 0 if OP is a perfect square, and *any* non-zero number
15 Copyright (C) 1993, 1994, 1996 Free Software Foundation, Inc.
17 This file is part of the GNU MP Library.
19 The GNU MP Library is free software; you can redistribute it and/or modify
20 it under the terms of the GNU Library General Public License as published by
21 the Free Software Foundation; either version 2 of the License, or (at your
22 option) any later version.
24 The GNU MP Library is distributed in the hope that it will be useful, but
25 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
27 License for more details.
29 You should have received a copy of the GNU Library General Public License
30 along with the GNU MP Library; see the file COPYING.LIB. If not, write to
31 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
32 MA 02111-1307, USA. */
34 /* This code is just correct if "unsigned char" has at least 8 bits. It
35 doesn't help to use CHAR_BIT from limits.h, as the real problem is
42 /* Square root algorithm:
44 1. Shift OP (the input) to the left an even number of bits s.t. there
45 are an even number of words and either (or both) of the most
46 significant bits are set. This way, sqrt(OP) has exactly half as
47 many words as OP, and has its most significant bit set.
49 2. Get a 9-bit approximation to sqrt(OP) using the pre-computed tables.
50 This approximation is used for the first single-precision
51 iterations of Newton's method, yielding a full-word approximation
54 3. Perform multiple-precision Newton iteration until we have the
55 exact result. Only about half of the input operand is used in
56 this calculation, as the square root is perfectly determinable
57 from just the higher half of a number. */
59 /* Define this macro for IEEE P854 machines with a fast sqrt instruction. */
60 #if defined __GNUC__ && ! defined __SOFT_FLOAT
66 asm ("fsqrtd %1,%0" : "=f" (__sqrt_res) : "f" (a)); \
71 #if defined __HAVE_68881__
75 asm ("fsqrtx %1,%0" : "=f" (__sqrt_res) : "f" (a)); \
84 asm ("fsqrt,dbl %1,%0" : "=fx" (__sqrt_res) : "fx" (a)); \
89 #if defined _ARCH_PWR2
93 asm ("fsqrt %0,%1" : "=f" (__sqrt_res) : "f" (a)); \
102 /* Tables for initial approximation of the square root. These are
103 indexed with bits 1-8 of the operand for which the square root is
104 calculated, where bit 0 is the most significant non-zero bit. I.e.
105 the most significant one-bit is not used, since that per definition
106 is one. Likewise, the tables don't return the highest bit of the
107 result. That bit must be inserted by or:ing the returned value with
108 0x100. This way, we get a 9-bit approximation from 8-bit tables! */
110 /* Table to be used for operands with an even total number of bits.
111 (Exactly as in the decimal system there are similarities between the
112 square root of numbers with the same initial digits and an even
113 difference in the total number of digits. Consider the square root
114 of 1, 10, 100, 1000, ...) */
115 static unsigned char even_approx_tab[256] =
117 0x6a, 0x6a, 0x6b, 0x6c, 0x6c, 0x6d, 0x6e, 0x6e,
118 0x6f, 0x70, 0x71, 0x71, 0x72, 0x73, 0x73, 0x74,
119 0x75, 0x75, 0x76, 0x77, 0x77, 0x78, 0x79, 0x79,
120 0x7a, 0x7b, 0x7b, 0x7c, 0x7d, 0x7d, 0x7e, 0x7f,
121 0x80, 0x80, 0x81, 0x81, 0x82, 0x83, 0x83, 0x84,
122 0x85, 0x85, 0x86, 0x87, 0x87, 0x88, 0x89, 0x89,
123 0x8a, 0x8b, 0x8b, 0x8c, 0x8d, 0x8d, 0x8e, 0x8f,
124 0x8f, 0x90, 0x90, 0x91, 0x92, 0x92, 0x93, 0x94,
125 0x94, 0x95, 0x96, 0x96, 0x97, 0x97, 0x98, 0x99,
126 0x99, 0x9a, 0x9b, 0x9b, 0x9c, 0x9c, 0x9d, 0x9e,
127 0x9e, 0x9f, 0xa0, 0xa0, 0xa1, 0xa1, 0xa2, 0xa3,
128 0xa3, 0xa4, 0xa4, 0xa5, 0xa6, 0xa6, 0xa7, 0xa7,
129 0xa8, 0xa9, 0xa9, 0xaa, 0xaa, 0xab, 0xac, 0xac,
130 0xad, 0xad, 0xae, 0xaf, 0xaf, 0xb0, 0xb0, 0xb1,
131 0xb2, 0xb2, 0xb3, 0xb3, 0xb4, 0xb5, 0xb5, 0xb6,
132 0xb6, 0xb7, 0xb7, 0xb8, 0xb9, 0xb9, 0xba, 0xba,
133 0xbb, 0xbb, 0xbc, 0xbd, 0xbd, 0xbe, 0xbe, 0xbf,
134 0xc0, 0xc0, 0xc1, 0xc1, 0xc2, 0xc2, 0xc3, 0xc3,
135 0xc4, 0xc5, 0xc5, 0xc6, 0xc6, 0xc7, 0xc7, 0xc8,
136 0xc9, 0xc9, 0xca, 0xca, 0xcb, 0xcb, 0xcc, 0xcc,
137 0xcd, 0xce, 0xce, 0xcf, 0xcf, 0xd0, 0xd0, 0xd1,
138 0xd1, 0xd2, 0xd3, 0xd3, 0xd4, 0xd4, 0xd5, 0xd5,
139 0xd6, 0xd6, 0xd7, 0xd7, 0xd8, 0xd9, 0xd9, 0xda,
140 0xda, 0xdb, 0xdb, 0xdc, 0xdc, 0xdd, 0xdd, 0xde,
141 0xde, 0xdf, 0xe0, 0xe0, 0xe1, 0xe1, 0xe2, 0xe2,
142 0xe3, 0xe3, 0xe4, 0xe4, 0xe5, 0xe5, 0xe6, 0xe6,
143 0xe7, 0xe7, 0xe8, 0xe8, 0xe9, 0xea, 0xea, 0xeb,
144 0xeb, 0xec, 0xec, 0xed, 0xed, 0xee, 0xee, 0xef,
145 0xef, 0xf0, 0xf0, 0xf1, 0xf1, 0xf2, 0xf2, 0xf3,
146 0xf3, 0xf4, 0xf4, 0xf5, 0xf5, 0xf6, 0xf6, 0xf7,
147 0xf7, 0xf8, 0xf8, 0xf9, 0xf9, 0xfa, 0xfa, 0xfb,
148 0xfb, 0xfc, 0xfc, 0xfd, 0xfd, 0xfe, 0xfe, 0xff,
151 /* Table to be used for operands with an odd total number of bits.
152 (Further comments before previous table.) */
153 static unsigned char odd_approx_tab[256] =
155 0x00, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
156 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07,
157 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b,
158 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f,
159 0x0f, 0x10, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12,
160 0x13, 0x13, 0x14, 0x14, 0x15, 0x15, 0x16, 0x16,
161 0x16, 0x17, 0x17, 0x18, 0x18, 0x19, 0x19, 0x1a,
162 0x1a, 0x1b, 0x1b, 0x1b, 0x1c, 0x1c, 0x1d, 0x1d,
163 0x1e, 0x1e, 0x1f, 0x1f, 0x20, 0x20, 0x20, 0x21,
164 0x21, 0x22, 0x22, 0x23, 0x23, 0x23, 0x24, 0x24,
165 0x25, 0x25, 0x26, 0x26, 0x27, 0x27, 0x27, 0x28,
166 0x28, 0x29, 0x29, 0x2a, 0x2a, 0x2a, 0x2b, 0x2b,
167 0x2c, 0x2c, 0x2d, 0x2d, 0x2d, 0x2e, 0x2e, 0x2f,
168 0x2f, 0x30, 0x30, 0x30, 0x31, 0x31, 0x32, 0x32,
169 0x32, 0x33, 0x33, 0x34, 0x34, 0x35, 0x35, 0x35,
170 0x36, 0x36, 0x37, 0x37, 0x37, 0x38, 0x38, 0x39,
171 0x39, 0x39, 0x3a, 0x3a, 0x3b, 0x3b, 0x3b, 0x3c,
172 0x3c, 0x3d, 0x3d, 0x3d, 0x3e, 0x3e, 0x3f, 0x3f,
173 0x40, 0x40, 0x40, 0x41, 0x41, 0x41, 0x42, 0x42,
174 0x43, 0x43, 0x43, 0x44, 0x44, 0x45, 0x45, 0x45,
175 0x46, 0x46, 0x47, 0x47, 0x47, 0x48, 0x48, 0x49,
176 0x49, 0x49, 0x4a, 0x4a, 0x4b, 0x4b, 0x4b, 0x4c,
177 0x4c, 0x4c, 0x4d, 0x4d, 0x4e, 0x4e, 0x4e, 0x4f,
178 0x4f, 0x50, 0x50, 0x50, 0x51, 0x51, 0x51, 0x52,
179 0x52, 0x53, 0x53, 0x53, 0x54, 0x54, 0x54, 0x55,
180 0x55, 0x56, 0x56, 0x56, 0x57, 0x57, 0x57, 0x58,
181 0x58, 0x59, 0x59, 0x59, 0x5a, 0x5a, 0x5a, 0x5b,
182 0x5b, 0x5b, 0x5c, 0x5c, 0x5d, 0x5d, 0x5d, 0x5e,
183 0x5e, 0x5e, 0x5f, 0x5f, 0x60, 0x60, 0x60, 0x61,
184 0x61, 0x61, 0x62, 0x62, 0x62, 0x63, 0x63, 0x63,
185 0x64, 0x64, 0x65, 0x65, 0x65, 0x66, 0x66, 0x66,
186 0x67, 0x67, 0x67, 0x68, 0x68, 0x68, 0x69, 0x69,
193 mpn_sqrtrem (mp_ptr root_ptr, mp_ptr rem_ptr, mp_srcptr op_ptr, mp_size_t op_size)
195 mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size)
202 /* R (root result) */
203 mp_ptr rp; /* Pointer to least significant word */
204 mp_size_t rsize; /* The size in words */
206 /* T (OP shifted to the left a.k.a. normalized) */
207 mp_ptr tp; /* Pointer to least significant word */
208 mp_size_t tsize; /* The size in words */
209 mp_ptr t_end_ptr; /* Pointer right beyond most sign. word */
210 mp_limb_t t_high0, t_high1; /* The two most significant words */
212 /* TT (temporary for numerator/remainder) */
213 mp_ptr ttp; /* Pointer to least significant word */
215 /* X (temporary for quotient in main loop) */
216 mp_ptr xp; /* Pointer to least significant word */
217 mp_size_t xsize; /* The size in words */
220 mp_limb_t initial_approx; /* Initially made approximation */
221 mp_size_t tsizes[BITS_PER_MP_LIMB]; /* Successive calculation precisions */
228 /* If OP is zero, both results are zero. */
232 count_leading_zeros (cnt, op_ptr[op_size - 1]);
234 if ((tsize & 1) != 0)
236 cnt += BITS_PER_MP_LIMB;
245 /* Shift OP an even number of bits into T, such that either the most or
246 the second most significant bit is set, and such that the number of
247 words in T becomes even. This way, the number of words in R=sqrt(OP)
248 is exactly half as many as in OP, and the most significant bit of R
251 Also, the initial approximation is simplified by this up-shifted OP.
253 Finally, the Newtonian iteration which is the main part of this
254 program performs division by R. The fast division routine expects
255 the divisor to be "normalized" in exactly the sense of having the
256 most significant bit set. */
258 tp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
260 if ((cnt & ~1) % BITS_PER_MP_LIMB != 0)
261 t_high0 = mpn_lshift (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size,
262 (cnt & ~1) % BITS_PER_MP_LIMB);
264 MPN_COPY (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size);
266 if (cnt >= BITS_PER_MP_LIMB)
269 t_high0 = tp[tsize - 1];
270 t_high1 = tp[tsize - 2]; /* Never stray. TSIZE is >= 2. */
272 /* Is there a fast sqrt instruction defined for this machine? */
275 initial_approx = SQRT (t_high0 * 2.0
276 * ((mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1))
278 /* If t_high0,,t_high1 is big, the result in INITIAL_APPROX might have
279 become incorrect due to overflow in the conversion from double to
280 mp_limb_t above. It will typically be zero in that case, but might be
281 a small number on some machines. The most significant bit of
282 INITIAL_APPROX should be set, so that bit is a good overflow
284 if ((mp_limb_signed_t) initial_approx >= 0)
285 initial_approx = ~(mp_limb_t)0;
288 /* Get a 9 bit approximation from the tables. The tables expect to
289 be indexed with the 8 high bits right below the highest bit.
290 Also, the highest result bit is not returned by the tables, and
291 must be or:ed into the result. The scheme gives 9 bits of start
292 approximation with just 256-entry 8 bit tables. */
296 /* The most sign bit of t_high0 is set. */
297 initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 1);
298 initial_approx &= 0xff;
299 initial_approx = even_approx_tab[initial_approx];
303 /* The most significant bit of T_HIGH0 is unset,
304 the second most significant is set. */
305 initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 2);
306 initial_approx &= 0xff;
307 initial_approx = odd_approx_tab[initial_approx];
309 initial_approx |= 0x100;
310 initial_approx <<= BITS_PER_MP_LIMB - 8 - 1;
312 /* Perform small precision Newtonian iterations to get a full word
313 approximation. For small operands, these iteration will make the
315 if (t_high0 == ~(mp_limb_t)0)
316 initial_approx = t_high0;
321 if (t_high0 >= initial_approx)
322 initial_approx = t_high0 + 1;
324 /* First get about 18 bits with pure C arithmetics. */
325 quot = t_high0 / (initial_approx >> BITS_PER_MP_LIMB/2) << BITS_PER_MP_LIMB/2;
326 initial_approx = (initial_approx + quot) / 2;
327 initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);
329 /* Now get a full word by one (or for > 36 bit machines) several
331 for (i = 16; i < BITS_PER_MP_LIMB; i <<= 1)
333 mp_limb_t ignored_remainder;
335 udiv_qrnnd (quot, ignored_remainder,
336 t_high0, t_high1, initial_approx);
337 initial_approx = (initial_approx + quot) / 2;
338 initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);
343 rp[0] = initial_approx;
348 mpn_dump (tp, tsize);
353 /* Determine the successive precisions to use in the iteration. We
354 minimize the precisions, beginning with the highest (i.e. last
355 iteration) to the lowest (i.e. first iteration). */
357 xp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
358 ttp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
360 t_end_ptr = tp + tsize;
365 tsize = (tmp + 1) / 2;
368 tsizes[i] = tsize + tmp;
372 /* Main Newton iteration loop. For big arguments, most of the
373 time is spent here. */
375 /* It is possible to do a great optimization here. The successive
376 divisors in the mpn_divmod call below has more and more leading
377 words equal to its predecessor. Therefore the beginning of
378 each division will repeat the same work as did the last
379 division. If we could guarantee that the leading words of two
380 consecutive divisors are the same (i.e. in this case, a later
381 divisor has just more digits at the end) it would be a simple
382 matter of just using the old remainder of the last division in
383 a subsequent division, to take care of this optimization. This
384 idea would surely make a difference even for small arguments. */
388 R <= shiftdown_to_same_size(floor(sqrt(OP))) < R + 1.
389 X - 1 < shiftdown_to_same_size(floor(sqrt(OP))) <= X.
390 R <= shiftdown_to_same_size(X). */
396 mp_limb_t old_least_sign_r = rp[0];
397 mp_size_t old_rsize = rsize;
400 mpn_dump (rp, rsize);
404 /* Need to copy the numerator into temporary space, as
405 mpn_divmod overwrites its numerator argument with the
406 remainder (which we currently ignore). */
407 MPN_COPY (ttp, t_end_ptr - tsize, tsize);
408 cy = mpn_divmod (xp, ttp, tsize, rp, rsize);
409 xsize = tsize - rsize;
412 printf ("X =%d ", cy);
413 mpn_dump (xp, xsize);
416 /* Add X and R with the most significant limbs aligned,
417 temporarily ignoring at least one limb at the low end of X. */
419 cy += mpn_add_n (xp + tmp, rp, xp + tmp, rsize);
421 /* If T begins with more than 2 x BITS_PER_MP_LIMB of ones, we get
422 intermediate roots that'd need an extra bit. We don't want to
423 handle that since it would make the subsequent divisor
424 non-normalized, so round such roots down to be only ones in the
425 current precision. */
429 for (j = xsize; j >= 0; j--)
430 xp[j] = ~(mp_limb_t)0;
433 /* Divide X by 2 and put the result in R. This is the new
434 approximation. Shift in the carry from the addition. */
435 mpn_rshift (rp, xp, xsize, 1);
436 rp[xsize - 1] |= ((mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1));
439 if (old_least_sign_r != rp[rsize - old_rsize])
440 printf (">>>>>>>> %d: %0*lX, %0*lX <<<<<<<<\n",
441 i, 2 * BYTES_PER_MP_LIMB, old_least_sign_r,
442 2 * BYTES_PER_MP_LIMB, rp[rsize - old_rsize]);
448 printf ("(final) R = ");
449 mpn_dump (rp, rsize);
452 /* We computed the square root of OP * 2**(2*floor(cnt/2)).
453 This has resulted in R being 2**floor(cnt/2) to large.
454 Shift it down here to fix that. */
457 mpn_rshift (rp, rp, rsize, cnt/2);
458 rsize -= rp[rsize - 1] == 0;
461 /* Calculate the remainder. */
462 mpn_mul_n (tp, rp, rp, rsize);
463 tsize = rsize + rsize;
464 tsize -= tp[tsize - 1] == 0;
466 || (op_size == tsize && mpn_cmp (op_ptr, tp, op_size) < 0))
468 /* R is too large. Decrement it. */
470 /* These operations can't overflow. */
471 cy_limb = mpn_sub_n (tp, tp, rp, rsize);
472 cy_limb += mpn_sub_n (tp, tp, rp, rsize);
473 mpn_sub_1 (tp + rsize, tp + rsize, tsize - rsize, cy_limb);
474 mpn_add_1 (tp, tp, tsize, (mp_limb_t) 1);
476 mpn_sub_1 (rp, rp, rsize, (mp_limb_t) 1);
479 printf ("(adjusted) R = ");
480 mpn_dump (rp, rsize);
486 cy_limb = mpn_sub (rem_ptr, op_ptr, op_size, tp, tsize);
487 MPN_NORMALIZE (rem_ptr, op_size);
494 res = op_size != tsize || mpn_cmp (op_ptr, tp, op_size);