1 /* mpn/gcd.c: mpn_gcd for gcd of two odd integers.
3 Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 1998, 2000 Free Software
6 This file is part of the GNU MP Library.
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or (at your
11 option) any later version.
13 The GNU MP Library is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with the GNU MP Library; see the file COPYING.LIB. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21 MA 02111-1307, USA. */
23 /* Integer greatest common divisor of two unsigned integers, using
24 the accelerated algorithm (see reference below).
26 mp_size_t mpn_gcd (up, usize, vp, vsize).
28 Preconditions [U = (up, usize) and V = (vp, vsize)]:
31 2. numbits(U) >= numbits(V).
33 Both U and V are destroyed by the operation. The result is left at vp,
34 and its size is returned.
36 Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu)
38 Funding for this work has been partially provided by Conselho Nacional
39 de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do Brazil, Grant
40 301314194-2, and was done while I was a visiting researcher in the Instituto
41 de Matema'tica at Universidade Federal do Rio Grande do Sul (UFRGS).
44 K. Weber, The accelerated integer GCD algorithm, ACM Transactions on
45 Mathematical Software, v. 21 (March), 1995, pp. 111-122. */
51 /* If MIN (usize, vsize) >= GCD_ACCEL_THRESHOLD, then the accelerated
52 algorithm is used, otherwise the binary algorithm is used. This may be
53 adjusted for different architectures. */
/* Tuning parameters for mpn_gcd.
   NOTE(review): this chunk was extracted with lines dropped -- the matching
   #endif for the #ifndef below, and the enum (or #define) wrapper that the
   original file placed around BMOD_THRESHOLD, are not visible here.  */
54 #ifndef GCD_ACCEL_THRESHOLD
55 #define GCD_ACCEL_THRESHOLD 5
58 /* When U and V differ in size by more than BMOD_THRESHOLD, the accelerated
59 algorithm reduces using the bmod operation. Otherwise, the k-ary reduction
60 is used. 0 <= BMOD_THRESHOLD < BITS_PER_MP_LIMB. */
/* Cutoff, in bits of bit-length difference between U and V, at which the
   accelerated loop switches from k-ary reduction to bmod reduction.  */
63 BMOD_THRESHOLD = BITS_PER_MP_LIMB/2
67 /* Use binary algorithm to compute V <-- GCD (V, U) for usize, vsize == 2.
68 Both U and V must be odd. */
/* gcd_2 -- binary GCD of two 2-limb odd operands.
   V <-- GCD (V, U) for usize == vsize == 2; both U and V must be odd
   (see the comment above in the original file).  Returns the limb size
   of the result stored at vp.
   NOTE(review): extraction dropped physical lines here (the embedded
   original line numbers jump 83->88, 90->95, 97->102) -- the do/while or
   if/else framing of the two subtract-and-shift arms, the opening/closing
   braces, and the declarations of `r' and `vsize' are not visible.  The
   code below is kept byte-identical; do not treat it as compilable.  */
69 static __gmp_inline mp_size_t
71 gcd_2 (mp_ptr vp, mp_srcptr up)
78   mp_limb_t u0, u1, v0, v1;
81   u0 = up[0], u1 = up[1], v0 = vp[0], v1 = vp[1];
/* Classic binary-GCD loop on double-limb values: while the operands differ
   in both limbs, subtract the smaller from the larger and strip the trailing
   zeros the subtraction introduced (the difference of two odd numbers is
   even).  One arm handles U > V, the other V > U.  */
83   while (u1 != v1 && u0 != v0)
/* U <-- (U - V) >> r, with the double-limb borrow carried via (u0 < v0).  */
88 	  u1 -= v1 + (u0 < v0), u0 -= v0;
89 	  count_trailing_zeros (r, u0);
90 	  u0 = u1 << (BITS_PER_MP_LIMB - r) | u0 >> r;
/* Symmetric arm: V <-- (V - U) >> r.  */
95 	  v1 -= u1 + (v0 < u0), v0 -= u0;
96 	  count_trailing_zeros (r, v0);
97 	  v0 = v1 << (BITS_PER_MP_LIMB - r) | v0 >> r;
/* Store V back and normalize its size to 1 or 2 limbs.  */
102   vp[0] = v0, vp[1] = v1, vsize = 1 + (v1 != 0);
104   /* If U == V == GCD, done. Otherwise, compute GCD (V, |U - V|). */
105   if (u1 == v1 && u0 == v0)
/* The loop exited with exactly one limb pair equal; the remaining
   difference fits in a single limb, so finish with mpn_gcd_1.  */
108   v0 = (u0 == v0) ? (u1 > v1) ? u1-v1 : v1-u1 : (u0 > v0) ? u0-v0 : v0-u0;
109   vp[0] = mpn_gcd_1 (vp, vsize, v0);
114 /* The function find_a finds 0 < N < 2^BITS_PER_MP_LIMB such that there exists
115 0 < |D| < 2^BITS_PER_MP_LIMB, and N == D * C mod 2^(2*BITS_PER_MP_LIMB).
116 In the reference article, D was computed along with N, but it is better to
117 compute D separately as D <-- N / C mod 2^(BITS_PER_MP_LIMB + 1), treating
118 the result as a twos' complement signed integer.
120 Initialize N1 to C mod 2^(2*BITS_PER_MP_LIMB). According to the reference
121 article, N2 should be initialized to 2^(2*BITS_PER_MP_LIMB), but we use
122 2^(2*BITS_PER_MP_LIMB) - N1 to start the calculations within double
123 precision. If N2 > N1 initially, the first iteration of the while loop
124 will swap them. In all other situations, N1 >= N2 is maintained. */
/* find_a -- see the long comment above: finds 0 < N < 2^BITS_PER_MP_LIMB
   such that N == D * C mod 2^(2*BITS_PER_MP_LIMB) for some small D, by
   running a double-precision Euclid-style remainder loop on
   N1 = C mod 2^(2*BITS_PER_MP_LIMB) and N2 = 2^(2*BITS_PER_MP_LIMB) - N1.
   NOTE(review): extraction dropped lines here -- the function's return
   type (one line above `find_a (mp_srcptr cp)'), the `static' qualifier,
   braces, the declaration of `i', the #else/#endif of the __i386__
   conditional, and the final return are not visible.  Code kept
   byte-identical.  */
127 #if ! defined (__i386__)
128 __gmp_inline			/* don't inline this for the x86 */
132 find_a (mp_srcptr cp)
138   unsigned long int leading_zero_bits = 0;
140   mp_limb_t n1_l = cp[0];	/* N1 == n1_h * 2^BITS_PER_MP_LIMB + n1_l. */
141   mp_limb_t n1_h = cp[1];
143   mp_limb_t n2_l = -n1_l;	/* N2 == n2_h * 2^BITS_PER_MP_LIMB + n2_l. */
144   mp_limb_t n2_h = ~n1_h;
/* Reduce until N2 fits in a single limb (n2_h == 0), maintaining
   N1 >= N2 after the first iteration (see the comment above).  */
147   while (n2_h)			/* While N2 >= 2^BITS_PER_MP_LIMB. */
149       /* N1 <-- N1 % N2. */
/* Align N2's leading bit with the running bit position tracked by
   leading_zero_bits, shifting N2 left so subtraction can reduce N1.  */
150       if ((MP_LIMB_T_HIGHBIT >> leading_zero_bits & n2_h) == 0)
153 	  count_leading_zeros (i, n2_h);
154 	  i -= leading_zero_bits, leading_zero_bits += i;
155 	  n2_h = n2_h<<i | n2_l>>(BITS_PER_MP_LIMB - i), n2_l <<= i;
/* Shift-and-subtract division step: subtract the aligned N2 when it is
   not larger than N1, then shift N2 right one bit.
   NOTE(review): the do/while framing of this inner loop was dropped by
   extraction.  */
158 	  if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l))
159 	    n1_h -= n2_h + (n1_l < n2_l), n1_l -= n2_l;
160 	  n2_l = n2_l>>1 | n2_h<<(BITS_PER_MP_LIMB - 1), n2_h >>= 1;
/* Final conditional subtraction, then swap so N1 >= N2 again.  */
165       if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l))
166 	n1_h -= n2_h + (n1_l < n2_l), n1_l -= n2_l;
168       MP_LIMB_T_SWAP (n1_h, n2_h);
169       MP_LIMB_T_SWAP (n1_l, n2_l);
/* mpn_gcd -- GCD of U = (up, usize) and V = (vp, vsize), result stored at
   gp, limb size returned.  Preconditions (from the file header): V is odd
   and numbits(U) >= numbits(V); U and V are destroyed.
   Strategy: when vsize >= GCD_ACCEL_THRESHOLD, first run Weber's
   accelerated reduction (bmod / k-ary steps), then finish with the binary
   algorithm; otherwise go straight to binary.
   NOTE(review): extraction dropped many physical lines throughout this
   function -- braces, `do'/`while' keywords, the K&R parameter
   declarations after line 179, declarations of `r', `zeros', `i',
   `orig_up'/`orig_vp', TMP_DECL/TMP_MARK/TMP_FREE, `goto done' targets and
   the final `return vsize;'.  The visible code is kept byte-identical; it
   is not compilable as shown.  */
177 mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize)
/* K&R-style alternative definition header (pre-ANSI builds).  */
179 mpn_gcd (gp, up, usize, vp, vsize)
188   mp_size_t orig_vsize = vsize;
189   int binary_gcd_ctr;		/* Number of times binary gcd will execute. */
194   /* Use accelerated algorithm if vsize is over GCD_ACCEL_THRESHOLD.
195      Two EXTRA limbs for U and V are required for kary reduction. */
196   if (vsize >= GCD_ACCEL_THRESHOLD)
198       unsigned long int vbitsize, d;
200       mp_size_t orig_usize = usize;
/* Working copy of U with 2 extra limbs of headroom for k-ary reduction.  */
201       mp_ptr anchor_up = (mp_ptr) TMP_ALLOC ((usize + 2) * BYTES_PER_MP_LIMB);
203       MPN_COPY (anchor_up, orig_up, usize);
/* d <-- numbits(U) - numbits(V) + 1, the reduction amount in bits.  */
206       count_leading_zeros (d, up[usize-1]);
207       d = usize * BITS_PER_MP_LIMB - d;
208       count_leading_zeros (vbitsize, vp[vsize-1]);
209       vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize;
210       d = d - vbitsize + 1;
212       /* Use bmod reduction to quickly discover whether V divides U. */
213       up[usize++] = 0;		/* Insert leading zero. */
214       mpn_bdivmod (up, up, usize, vp, vsize, d);
216       /* Now skip U/V mod 2^d and any low zero limbs. */
217       d /= BITS_PER_MP_LIMB, up += d, usize -= d;
218       while (usize != 0 && up[0] == 0)
/* If the bmod residue is zero, V divides U, so GCD == ORIG_V.  */
221       if (usize == 0)		/* GCD == ORIG_V. */
/* Working copy of V, also with 2 extra limbs of headroom.  */
224       vp = (mp_ptr) TMP_ALLOC ((vsize + 2) * BYTES_PER_MP_LIMB);
225       MPN_COPY (vp, orig_vp, vsize);
/* Main accelerated reduction loop.
   NOTE(review): the do { ... } framing was dropped by extraction.  */
229 	  /* mpn_com_n can't be used here because anchor_up and up may
/* bmod/k-ary can leave a negative (twos'-complement) residue; negate it
   limb by limb into anchor_up.  */
231 	  if (up[usize-1] & MP_LIMB_T_HIGHBIT) /* U < 0; take twos' compl. */
234 	      anchor_up[0] = -up[0];
235 	      for (i = 1; i < usize; i++)
236 		anchor_up[i] = ~up[i];
240 	  MPN_NORMALIZE_NOT_ZERO (up, usize);
/* Keep the working value odd: strip factors of two (they cannot be part
   of the GCD since V is odd).  */
242 	  if ((up[0] & 1) == 0)	/* Result even; remove twos. */
245 	      count_trailing_zeros (r, up[0]);
246 	      mpn_rshift (anchor_up, up, usize, r);
247 	      usize -= (anchor_up[usize-1] == 0);
249 	  else if (anchor_up != up)
250 	    MPN_COPY_INCR (anchor_up, up, usize);
/* Swap roles so the larger operand is reduced next iteration.  */
252 	  MPN_PTR_SWAP (anchor_up,usize, vp,vsize);
255 	  if (vsize <= 2)	/* Kary can't handle < 2 limbs and  */
256 	    break;		/* isn't efficient for == 2 limbs.  */
/* Recompute the bit-length gap d for the next reduction step.  */
259 	  count_leading_zeros (vbitsize, vp[vsize-1]);
260 	  vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize;
261 	  d = d - vbitsize + 1;
/* Large gap: bmod reduction, U <-- U/V mod 2^d, then drop low zeros.  */
263 	  if (d > BMOD_THRESHOLD)	/* Bmod reduction. */
266 	      mpn_bdivmod (up, up, usize, vp, vsize, d);
267 	      d /= BITS_PER_MP_LIMB, up += d, usize -= d;
/* Small gap: Weber's k-ary reduction (see the referenced TOMS article).  */
269 	  else				/* Kary reduction. */
271 	      mp_limb_t bp[2], cp[2];
273 	      /* C <-- V/U mod 2^(2*BITS_PER_MP_LIMB). */
/* Double-limb modular division via modlimb_invert + umul_ppmm.  */
275 		mp_limb_t u_inv, hi, lo;
276 		modlimb_invert (u_inv, up[0]);
277 		cp[0] = vp[0] * u_inv;
278 		umul_ppmm (hi, lo, cp[0], up[0]);
279 		cp[1] = (vp[1] - hi - cp[0] * up[1]) * u_inv;
282 	      /* U <-- find_a (C) * U. */
283 	      up[usize] = mpn_mul_1 (up, up, usize, find_a (cp));
286 	      /* B <-- A/C == U/V mod 2^(BITS_PER_MP_LIMB + 1).
287 		 bp[0] <-- U/V mod 2^BITS_PER_MP_LIMB and
288 		 bp[1] <-- ( (U - bp[0] * V)/2^BITS_PER_MP_LIMB ) / V mod 2
290 		 Like V/U above, but simplified because only the low bit of
293 		mp_limb_t v_inv, hi, lo;
294 		modlimb_invert (v_inv, vp[0]);
295 		bp[0] = up[0] * v_inv;
296 		umul_ppmm (hi, lo, bp[0], vp[0]);
297 		bp[1] = (up[1] + hi + (bp[0]&vp[1])) & 1;
/* Apply U <-- U -+ B*V; bp[1] is B's sign bit in the (limb+1)-bit ring.  */
301 	      if (bp[1])	/* B < 0: U <-- U + (-B) * V. */
303 		  mp_limb_t c = mpn_addmul_1 (up, vp, vsize, -bp[0]);
304 		  mpn_add_1 (up + vsize, up + vsize, usize - vsize, c);
306 	      else		/* B >= 0: U <-- U - B * V. */
308 		  mp_limb_t b = mpn_submul_1 (up, vp, vsize, bp[0]);
309 		  mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b);
312 	      up += 2, usize -= 2;	/* At least two low limbs are zero. */
315 	  /* Must remove low zero limbs before complementing. */
316 	  while (usize != 0 && up[0] == 0)
/* Accelerated phase done; restore the original U so the binary phase can
   compute GCD (ORIG_V, GCD (ORIG_U, V)) in two passes.  */
321       /* Compute GCD (ORIG_V, GCD (ORIG_U, V)). Binary will execute twice. */
322       up = orig_up, usize = orig_usize;
/* Fall-through: binary phase.  binary_gcd_ctr is 1 when the accelerated
   phase was skipped, 2 when it ran (assignments not visible here).  */
328   /* Finish up with the binary algorithm. Executes once or twice. */
329   for ( ; binary_gcd_ctr--; up = orig_vp, usize = orig_vsize)
331       if (usize > 2)		/* First make U close to V in size. */
333 	  unsigned long int vbitsize, d;
334 	  count_leading_zeros (d, up[usize-1]);
335 	  d = usize * BITS_PER_MP_LIMB - d;
336 	  count_leading_zeros (vbitsize, vp[vsize-1]);
337 	  vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize;
338 	  d = d - vbitsize - 1;
/* d is unsigned: -1 encodes "U shorter than V by a bit"; only reduce when
   the gap is worthwhile (> 2 bits).  */
339 	  if (d != -(unsigned long int)1 && d > 2)
341 	      mpn_bdivmod (up, up, usize, vp, vsize, d); /* Result > 0. */
342 	      d /= (unsigned long int)BITS_PER_MP_LIMB, up += d, usize -= d;
346       /* Start binary GCD. */
/* NOTE(review): the do { ... } opening of the binary loop (matching the
   `while (usize)' at original line 406) was dropped by extraction.  */
351 	  /* Make sure U is odd. */
352 	  MPN_NORMALIZE (up, usize);
355 	  if ((up[0] & 1) == 0)
358 	      count_trailing_zeros (r, up[0]);
359 	      mpn_rshift (up, up, usize, r);
360 	      usize -= (up[usize-1] == 0);
363 	  /* Keep usize >= vsize. */
365 	    MPN_PTR_SWAP (up, usize, vp, vsize);
/* Small operands: hand off to the single- and double-limb routines.  */
367 	  if (usize <= 2)				/* Double precision. */
370 		vp[0] = mpn_gcd_1 (up, usize, vp[0]);
372 		vsize = gcd_2 (vp, up);
373 	      break;					/* Binary GCD done. */
376 	  /* Count number of low zero limbs of U - V. */
377 	  for (zeros = 0; up[zeros] == vp[zeros] && ++zeros != vsize; )
380 	  /* If U < V, swap U and V; in any case, subtract V from U. */
381 	  if (zeros == vsize)				/* Subtract done. */
382 	    up += zeros, usize -= zeros;
383 	  else if (usize == vsize)
/* Same length: find the first differing high limb to decide which operand
   is larger before subtracting.
   NOTE(review): the loop header for `while (up[size] == vp[size])' at
   original line 388 lost its initializer/decrement lines to extraction.  */
385 	      mp_size_t size = vsize;
388 	      while (up[size] == vp[size]);
389 	      if (up[size] < vp[size])			/* usize == vsize. */
390 		MP_PTR_SWAP (up, vp);
391 	      up += zeros, usize = size + 1 - zeros;
392 	      mpn_sub_n (up, up, vp + zeros, usize);
/* U longer than V: subtract the overlapping limbs and propagate the
   borrow through U's high limbs.  */
396 	      mp_size_t size = vsize - zeros;
397 	      up += zeros, usize -= zeros;
398 	      if (mpn_sub_n (up, up, vp + zeros, size))
400 		  while (up[size] == 0)			/* Propagate borrow. */
401 		    up[size++] = -(mp_limb_t)1;
406       while (usize);				/* End binary GCD. */
/* Copy the result (left in V by the loops above) to the caller's buffer.  */
411   MPN_COPY (gp, vp, vsize);