1 /* mpn_gcdext -- Extended Greatest Common Divisor.
3 Copyright (C) 1996, 1998, 2000 Free Software Foundation, Inc.
5 This file is part of the GNU MP Library.
7 The GNU MP Library is free software; you can redistribute it and/or modify
8 it under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or (at your
10 option) any later version.
12 The GNU MP Library is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 License for more details.
17 You should have received a copy of the GNU Lesser General Public License
18 along with the GNU MP Library; see the file COPYING.LIB. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
20 MA 02111-1307, USA. */
/* Operand-size cutoff (in limbs): below this the single-limb Lehmer code
   path is used, at or above it the double-limb path (see use_double_flag
   below).  Guarded so a tuned, CPU-specific value defined elsewhere can
   override this default. */
26 #ifndef GCDEXT_THRESHOLD
27 #define GCDEXT_THRESHOLD 17
/* Histogram over bit-lengths of the cofactor limbs A|B|C|D, filled in the
   main loop below and dumped with printf near the end.  NOTE(review):
   presumably only compiled under a statistics/debug #if in the full file --
   confirm; a file-scope mutable global would otherwise be questionable. */
35 int arr[BITS_PER_MP_LIMB];
39 /* mpn_gcdext (GP, SP, SSIZE, UP, USIZE, VP, VSIZE)
41 Compute the extended GCD of {UP,USIZE} and {VP,VSIZE} and store the
42 greatest common divisor at GP (unless it is 0), and the first cofactor at
43 SP. Write the size of the cofactor through the pointer SSIZE. Return the
44 size of the value at GP. Note that the cofactor stored at SP might be
45 negative; this is denoted by storing the negative of its size through SSIZE.
47 {UP,USIZE} and {VP,VSIZE} are both clobbered.
49 The space allocation for all four areas needs to be USIZE+1.
51 Preconditions: 1) U >= V.
54 /* We use Lehmer's algorithm. The idea is to extract the most significant
55 bits of the operands, and compute the continued fraction for them. We then
56 apply the gathered cofactors to the full operands.
58 Idea 1: After we have performed a full division, don't shift operands back,
59 but instead account for the extra factors-of-2 thus introduced.
60 Idea 2: Simple generalization to use divide-and-conquer would give us an
61 algorithm that runs faster than O(n^2).
62 Idea 3: The input numbers need less space as the computation progresses,
63 while the s0 and s1 variables need more space. To save memory, we
64 could make them share space, and have the latter variables grow
66 Idea 4: We should not do double-limb arithmetic from the start. Instead,
67 do things in single-limb arithmetic until the quotients differ,
68 and then switch to double-limb arithmetic. */
71 /* Division optimized for small quotients. If the quotient is more than one limb,
72 store 1 in *qh and return 0. */
/* div2 -- divide the two-limb number {n1,n0} by the two-limb divisor
   {d1,d0}, returning the (single-limb) quotient.  Per the header comment
   above: if the quotient needs more than one limb, store 1 through QH and
   return 0.  Implemented by shift-and-subtract, which is fast for the small
   quotients typical of continued-fraction steps in Lehmer's algorithm.
   NOTE(review): only a sampled subset of the body is visible here; comments
   below are hedged accordingly. */
75 div2 (mp_limb_t *qh, mp_limb_t n1, mp_limb_t n0, mp_limb_t d1, mp_limb_t d0)
/* K&R-style parameter list -- presumably the #else branch of an __STDC__
   test paired with the prototype form above; confirm against full file. */
77 div2 (qh, n1, n0, d1, d0)
/* Numerator's top bit set: the divisor cannot be normalized past it, so a
   separate branch handles this case. */
91 if ((mp_limb_signed_t) n1 < 0)
/* Left-align the divisor (stop when its top bit is set), counting shifts;
   cnt bounds the number of quotient bits to produce. */
95 for (cnt = 1; (mp_limb_signed_t) d1 >= 0; cnt++)
97 d1 = (d1 << 1) | (d0 >> (BITS_PER_MP_LIMB - 1));
/* Two-limb unsigned compare {n1,n0} >= {d1,d0}; subtract to set this
   quotient bit. */
105 if (n1 > d1 || (n1 == d1 && n0 >= d0))
107 sub_ddmmss (n1, n0, n1, n0, d1, d0);
/* Shift the divisor back down one bit for the next quotient bit. */
110 d0 = (d1 << (BITS_PER_MP_LIMB - 1)) | (d0 >> 1);
/* General case: normalize divisor up to (not past) the numerator, one
   quotient bit per shift. */
122 for (cnt = 0; n1 > d1 || (n1 == d1 && n0 >= d0); cnt++)
124 d1 = (d1 << 1) | (d0 >> (BITS_PER_MP_LIMB - 1));
131 d0 = (d1 << (BITS_PER_MP_LIMB - 1)) | (d0 >> 1);
/* Final conditional subtraction for the last quotient bit. */
134 if (n1 > d1 || (n1 == d1 && n0 >= d0))
136 sub_ddmmss (n1, n0, n1, n0, d1, d0);
/* mpn_gcdext -- see the contract comment above: computes gcd({up,size},
   {vp,vsize}), stores it at gp, stores the first cofactor at s0p and its
   (possibly negated) size through s0size; returns the gcd's size.
   NOTE(review): this extract is heavily sampled -- many original lines are
   missing between the visible ones (the alternate prototypes and the
   mpn_gcd entry below are presumably selected by #if EXTEND / __STDC__
   conditionals in the full file).  All commentary is hedged accordingly. */
150 mpn_gcdext (mp_ptr gp, mp_ptr s0p, mp_size_t *s0size,
151 mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize)
/* K&R-style definition variant of the same entry point. */
153 mpn_gcdext (gp, s0p, s0size, up, size, vp, vsize)
165 mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize)
/* Plain-gcd entry (no cofactor) built from the same body -- presumably the
   !EXTEND compilation of this file; confirm against full file. */
167 mpn_gcd (gp, up, size, vp, vsize)
/* A,B,C,D: the 2x2 cofactor matrix accumulated from the continued-fraction
   quotients of the leading limbs (Lehmer's algorithm). */
176 mp_limb_t A, B, C, D;
/* Remember the caller's cofactor area: s0p is swapped among temporaries
   below, so the result is copied back to orig_s0p at the end. */
184 mp_ptr orig_s0p = s0p;
/* Choose single- vs double-limb leading-digit arithmetic by operand size. */
193 use_double_flag = (size >= GCDEXT_THRESHOLD);
/* Scratch areas; per the header comment, all areas need size+1 limbs. */
195 tp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB);
196 wp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB);
198 s1p = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB);
/* Cofactor accumulators start at 0 (s0) and, presumably, 1 (s1) -- the
   initializing store of 1 is not visible in this extract. */
200 MPN_ZERO (s0p, size);
201 MPN_ZERO (s1p, size);
210 /* Normalize V (and shift up U the same amount). */
211 count_leading_zeros (cnt, vp[vsize - 1]);
215 mpn_lshift (vp, vp, vsize, cnt);
216 cy = mpn_lshift (up, up, size, cnt);
/* Initial reduction: U mod V via full division (quotient lands above the
   remainder at up+vsize). */
221 mpn_divmod (up + vsize, up, size, vp, vsize);
223 /* This is really what it boils down to in this case... */
/* Undo the normalization shift on both operands, then swap so the larger
   operand is again in U. */
231 mpn_rshift (up, up, size, cnt);
232 mpn_rshift (vp, vp, size, cnt);
234 MP_PTR_SWAP (up, vp);
240 /* Figure out exact size of V. */
242 MPN_NORMALIZE (vp, vsize);
/* ---- Double-limb path: extract two leading limbs of each operand. ---- */
248 mp_limb_t uh, vh, ul, vl;
249 /* Let UH,UL be the most significant limbs of U, and let VH,VL be
250 the corresponding bits from V. */
255 count_leading_zeros (cnt, uh);
258 uh = (uh << cnt) | (ul >> (BITS_PER_MP_LIMB - cnt));
259 vh = (vh << cnt) | (vl >> (BITS_PER_MP_LIMB - cnt));
/* Pull in bits from the third limb to fill the low ends after shifting. */
264 ul |= (up[size - 3] >> (BITS_PER_MP_LIMB - cnt));
265 vl |= (vp[size - 3] >> (BITS_PER_MP_LIMB - cnt));
278 mp_limb_t qh, q1, q2;
279 mp_limb_t nh, nl, dh, dl;
/* Lehmer step: compute candidate quotients q1,q2 from the leading digits
   biased both ways by the current cofactors; if they disagree (or a
   quotient overflows a limb, qh != 0), the approximation is no longer
   trustworthy and we break out to a full division. */
283 sub_ddmmss (dh, dl, vh, vl, 0, C);
286 add_ssaaaa (nh, nl, uh, ul, 0, A);
287 q1 = div2 (&qh, nh, nl, dh, dl);
289 break; /* could handle this */
291 add_ssaaaa (dh, dl, vh, vl, 0, D);
294 sub_ddmmss (nh, nl, uh, ul, 0, B);
295 q2 = div2 (&qh, nh, nl, dh, dl);
297 break; /* could handle this */
/* Quotients agree: update leading digits U -= q1*V (two-limb arithmetic)
   and, presumably, the cofactor matrix (updates not visible here). */
310 umul_ppmm (t1, t0, q1, vl);
312 sub_ddmmss (Th, Tl, uh, ul, t1, t0);
/* Mirror iteration with the roles/signs of the cofactors exchanged --
   the signs alternate between half-steps. */
316 add_ssaaaa (dh, dl, vh, vl, 0, C);
317 sub_ddmmss (nh, nl, uh, ul, 0, A);
318 q1 = div2 (&qh, nh, nl, dh, dl);
320 break; /* could handle this */
322 sub_ddmmss (dh, dl, vh, vl, 0, D);
325 add_ssaaaa (nh, nl, uh, ul, 0, B);
326 q2 = div2 (&qh, nh, nl, dh, dl);
328 break; /* could handle this */
341 umul_ppmm (t1, t0, q1, vl);
343 sub_ddmmss (Th, Tl, uh, ul, t1, t0);
352 else /* Same, but using single-limb calculations. */
355 /* Make UH be the most significant limb of U, and make VH be
356 corresponding bits from V. */
359 count_leading_zeros (cnt, uh);
362 uh = (uh << cnt) | (up[size - 2] >> (BITS_PER_MP_LIMB - cnt));
363 vh = (vh << cnt) | (vp[size - 2] >> (BITS_PER_MP_LIMB - cnt));
/* Guard against division by zero in the biased quotients below. */
375 if (vh - C == 0 || vh + D == 0)
/* Single-limb Lehmer test: accept the quotient only if both biased
   estimates agree (Lehmer's condition). */
378 q = (uh + A) / (vh - C);
379 if (q != (uh - B) / (vh + D))
/* Opposite-sign half-step of the same test. */
397 q = (uh - A) / (vh + C);
398 if (q != (uh + B) / (vh - D))
/* Statistics: track the largest cofactor seen (debug path, see printf
   below). */
420 max = MAX (A, max); max = MAX (B, max);
421 max = MAX (C, max); max = MAX (D, max);
428 /* This is quite rare. I.e., optimize something else! */
/* Fallback when the Lehmer step failed: one full multi-limb division,
   with the quotient applied to the cofactor sequence s1 = s1*q + s0. */
430 /* Normalize V (and shift up U the same amount). */
431 count_leading_zeros (cnt, vp[vsize - 1]);
435 mpn_lshift (vp, vp, vsize, cnt);
436 cy = mpn_lshift (up, up, size, cnt);
441 qh = mpn_divmod (up + vsize, up, size, vp, vsize);
443 MPN_COPY (tp, s0p, ssize);
447 qsize = size - vsize; /* size of stored quotient from division */
/* Accumulate tp = s0 + q * s1, looping over whichever of {s1, q} is the
   multiplier limb-by-limb (two symmetric cases on relative sizes). */
450 MPN_ZERO (tp + ssize, qsize - ssize);
451 MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */
452 for (i = 0; i < ssize; i++)
455 cy = mpn_addmul_1 (tp + i, up + vsize, qsize, s1p[i]);
461 cy = mpn_add_n (tp + qsize, tp + qsize, s1p, ssize);
468 MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */
469 for (i = 0; i < qsize; i++)
472 cy = mpn_addmul_1 (tp + i, s1p, ssize, up[vsize + i]);
478 cy = mpn_add_n (tp + qsize, tp + qsize, s1p, ssize);
481 tp[qsize + ssize] = cy;
482 s1p[qsize + ssize] = 0;
/* Trim a possible high zero limb of the new cofactor. */
488 ssize -= tp[ssize - 1] == 0;
/* Rotate the cofactor triple: (s0, s1) <- (s1, s0 + q*s1). */
492 MP_PTR_SWAP (s0p, s1p);
493 MP_PTR_SWAP (s1p, tp);
/* Undo normalization and swap operands so U >= V again. */
498 mpn_rshift (up, up, size, cnt);
499 mpn_rshift (vp, vp, size, cnt);
501 MP_PTR_SWAP (up, vp);
506 mp_size_t tsize, wsize;
/* Statistics: histogram the bit-length of the largest cofactor limb. */
514 { mp_limb_t x; x = A | B | C | D; count_leading_zeros (cnt, x);
515 arr[BITS_PER_MP_LIMB - cnt]++; }
/* Apply the accumulated 2x2 matrix to the full operands and cofactors.
   Special-cased when some entries are trivial (cheaper updates). */
519 /* B == 1 and C == 1 (D is arbitrary) */
521 MPN_COPY (tp, vp, size);
522 MPN_COPY (wp, up, size);
523 mpn_submul_1 (wp, vp, size, D);
524 MP_PTR_SWAP (tp, up);
525 MP_PTR_SWAP (wp, vp);
527 MPN_COPY (tp, s1p, ssize);
529 tp[ssize] = 0; /* must zero since wp might spill below */
530 MPN_COPY (wp, s0p, ssize);
531 cy = mpn_addmul_1 (wp, s1p, ssize, D);
533 wsize = ssize + (cy != 0);
534 MP_PTR_SWAP (tp, s0p);
535 MP_PTR_SWAP (wp, s1p);
536 ssize = MAX (wsize, tsize);
/* General case, one sign arrangement: (U,V) <- (B*V - A*U, C*U - D*V),
   cofactors updated with the transposed-sign combination. */
544 mpn_mul_1 (tp, vp, size, B);
545 mpn_submul_1 (tp, up, size, A);
546 mpn_mul_1 (wp, up, size, C);
547 mpn_submul_1 (wp, vp, size, D);
548 MP_PTR_SWAP (tp, up);
549 MP_PTR_SWAP (wp, vp);
551 cy = mpn_mul_1 (tp, s1p, ssize, B);
552 cy += mpn_addmul_1 (tp, s0p, ssize, A);
554 tsize = ssize + (cy != 0);
555 cy = mpn_mul_1 (wp, s0p, ssize, C);
556 cy += mpn_addmul_1 (wp, s1p, ssize, D);
558 wsize = ssize + (cy != 0);
559 MP_PTR_SWAP (tp, s0p);
560 MP_PTR_SWAP (wp, s1p);
561 ssize = MAX (wsize, tsize);
/* Same update with the opposite sign arrangement:
   (U,V) <- (A*U - B*V, D*V - C*U). */
567 mpn_mul_1 (tp, up, size, A);
568 mpn_submul_1 (tp, vp, size, B);
569 mpn_mul_1 (wp, vp, size, D);
570 mpn_submul_1 (wp, up, size, C);
571 MP_PTR_SWAP (tp, up);
572 MP_PTR_SWAP (wp, vp);
574 cy = mpn_mul_1 (tp, s0p, ssize, A);
575 cy += mpn_addmul_1 (tp, s1p, ssize, B);
577 tsize = ssize + (cy != 0);
578 cy = mpn_mul_1 (wp, s1p, ssize, D);
579 cy += mpn_addmul_1 (wp, s0p, ssize, C);
581 wsize = ssize + (cy != 0);
582 MP_PTR_SWAP (tp, s0p);
583 MP_PTR_SWAP (wp, s1p);
584 ssize = MAX (wsize, tsize);
/* Operands shrink by at most one limb per outer iteration. */
589 size -= up[size - 1] == 0;
/* Debug/statistics output -- presumably under the same #if as `arr` and
   `max` above; confirm against full file. */
594 printf ("max: %lx\n", max);
598 {int i; for (i = 0; i < BITS_PER_MP_LIMB; i++) printf ("%d:%d\n", i, arr[i]);}
/* V exhausted: U holds the gcd.  Copy out unless gp aliases up or is
   null (null gp is permitted per the "unless it is 0" contract above). */
603 if (gp != up && gp != 0)
604 MPN_COPY (gp, up, size);
/* Deliver the cofactor: normalize, copy to the caller's area, and encode
   its sign in the returned size. */
606 MPN_NORMALIZE (s0p, ssize);
608 MPN_COPY (orig_s0p, s0p, ssize);
609 *s0size = sign >= 0 ? ssize : -ssize;
/* ---- Endgame: V has shrunk to a single limb vl; finish with limb-size
   division (mpn_divmod_1 when cofactors are tracked, mpn_mod_1 when not,
   then single-limb Euclid below). ---- */
622 t = mpn_divmod_1 (wp, up, size, vl);
624 MPN_COPY (tp, s0p, ssize);
626 qsize = size - (wp[size - 1] == 0); /* size of quotient from division */
/* tp = s0 + q * s1 again, with the quotient now in wp (two symmetric
   cases on relative sizes, as before). */
629 MPN_ZERO (tp + ssize, qsize - ssize);
630 MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */
631 for (i = 0; i < ssize; i++)
634 cy = mpn_addmul_1 (tp + i, wp, qsize, s1p[i]);
640 MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */
641 for (i = 0; i < qsize; i++)
644 cy = mpn_addmul_1 (tp + i, s1p, ssize, wp[i]);
649 ssize -= tp[ssize - 1] == 0;
652 MP_PTR_SWAP (s0p, s1p);
653 MP_PTR_SWAP (s1p, tp);
/* Non-extended variant: only the remainder is needed. */
655 t = mpn_mod_1 (up, size, vl);
/* Single-limb Euclid loop: cofactor update s1 = s1*q + s0 with a limb
   quotient q. */
667 MPN_COPY (tp, s0p, ssize);
669 MPN_ZERO (s1p + ssize, 1); /* zero s1 too */
673 cy = mpn_addmul_1 (tp, s1p, ssize, q);
678 ssize -= tp[ssize - 1] == 0;
681 MP_PTR_SWAP (s0p, s1p);
682 MP_PTR_SWAP (s1p, tp);
/* Final cofactor delivery, as in the early-exit path above. */
692 MPN_NORMALIZE (s0p, ssize);
694 MPN_COPY (orig_s0p, s0p, ssize);
695 *s0size = sign >= 0 ? ssize : -ssize;