ghc/rts/gmp/mpn/generic/gcd.c

   1 /* mpn/gcd.c: mpn_gcd for gcd of two odd integers.
   2
   3 Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 1998, 2000 Free Software
   4 Foundation, Inc.
   5
   6 This file is part of the GNU MP Library.
   7
   8 The GNU MP Library is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU Lesser General Public License as published by
  10 the Free Software Foundation; either version 2.1 of the License, or (at your
  11 option) any later version.
  12
  13 The GNU MP Library is distributed in the hope that it will be useful, but
  14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  15 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  16 License for more details.
  17
  18 You should have received a copy of the GNU Lesser General Public License
  19 along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
  20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  21 MA 02111-1307, USA. */
  22
  23 /* Integer greatest common divisor of two unsigned integers, using
  24    the accelerated algorithm (see reference below).
  25
  26    mp_size_t mpn_gcd (up, usize, vp, vsize).
  27
  28    Preconditions [U = (up, usize) and V = (vp, vsize)]:
  29
  30    1.  V is odd.
  31    2.  numbits(U) >= numbits(V).
  32
  33    Both U and V are destroyed by the operation.  The result is left at vp,
  34    and its size is returned.
  35
  36    Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu)
  37
  38    Funding for this work has been partially provided by Conselho Nacional
  39    de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do Brazil, Grant
  40    301314194-2, and was done while I was a visiting reseacher in the Instituto
  41    de Matema'tica at Universidade Federal do Rio Grande do Sul (UFRGS).
  42
  43    Refer to
  44         K. Weber, The accelerated integer GCD algorithm, ACM Transactions on
  45         Mathematical Software, v. 21 (March), 1995, pp. 111-122.  */
  46
  47 #include "gmp.h"
  48 #include "gmp-impl.h"
  49 #include "longlong.h"
  50
  51 /* If MIN (usize, vsize) >= GCD_ACCEL_THRESHOLD, then the accelerated
  52    algorithm is used, otherwise the binary algorithm is used.  This may be
  53    adjusted for different architectures.  */
  54 #ifndef GCD_ACCEL_THRESHOLD
  55 #define GCD_ACCEL_THRESHOLD 5
  56 #endif
  57
  58 /* When U and V differ in size by more than BMOD_THRESHOLD, the accelerated
  59    algorithm reduces using the bmod operation.  Otherwise, the k-ary reduction
  60    is used.  0 <= BMOD_THRESHOLD < BITS_PER_MP_LIMB.  */
  61 enum
  62   {
  63     BMOD_THRESHOLD = BITS_PER_MP_LIMB/2
  64   };
  65
  66
  67 /* Use binary algorithm to compute V <-- GCD (V, U) for usize, vsize == 2.
  68    Both U and V must be odd.  */
  69 static __gmp_inline mp_size_t
  70 #if __STDC__
  71 gcd_2 (mp_ptr vp, mp_srcptr up)
  72 #else
  73 gcd_2 (vp, up)
  74      mp_ptr vp;
  75      mp_srcptr up;
  76 #endif
  77 {
  78   mp_limb_t u0, u1, v0, v1;
  79   mp_size_t vsize;
  80
  81   u0 = up[0], u1 = up[1], v0 = vp[0], v1 = vp[1];
  82
  83   while (u1 != v1 && u0 != v0)
  84     {
  85       unsigned long int r;
  86       if (u1 > v1)
  87         {
  88           u1 -= v1 + (u0 < v0), u0 -= v0;
  89           count_trailing_zeros (r, u0);
  90           u0 = u1 << (BITS_PER_MP_LIMB - r) | u0 >> r;
  91           u1 >>= r;
  92         }
  93       else  /* u1 < v1.  */
  94         {
  95           v1 -= u1 + (v0 < u0), v0 -= u0;
  96           count_trailing_zeros (r, v0);
  97           v0 = v1 << (BITS_PER_MP_LIMB - r) | v0 >> r;
  98           v1 >>= r;
  99         }
 100     }
 101
 102   vp[0] = v0, vp[1] = v1, vsize = 1 + (v1 != 0);
 103
 104   /* If U == V == GCD, done.  Otherwise, compute GCD (V, |U - V|).  */
 105   if (u1 == v1 && u0 == v0)
 106     return vsize;
 107
 108   v0 = (u0 == v0) ? (u1 > v1) ? u1-v1 : v1-u1 : (u0 > v0) ? u0-v0 : v0-u0;
 109   vp[0] = mpn_gcd_1 (vp, vsize, v0);
 110
 111   return 1;
 112 }
 113
 114 /* The function find_a finds 0 < N < 2^BITS_PER_MP_LIMB such that there exists
 115    0 < |D| < 2^BITS_PER_MP_LIMB, and N == D * C mod 2^(2*BITS_PER_MP_LIMB).
 116    In the reference article, D was computed along with N, but it is better to
 117    compute D separately as D <-- N / C mod 2^(BITS_PER_MP_LIMB + 1), treating
 118    the result as a twos' complement signed integer.
 119
 120    Initialize N1 to C mod 2^(2*BITS_PER_MP_LIMB).  According to the reference
 121    article, N2 should be initialized to 2^(2*BITS_PER_MP_LIMB), but we use
 122    2^(2*BITS_PER_MP_LIMB) - N1 to start the calculations within double
 123    precision.  If N2 > N1 initially, the first iteration of the while loop
 124    will swap them.  In all other situations, N1 >= N2 is maintained.  */
 125
 126 static
 127 #if ! defined (__i386__)
 128 __gmp_inline                    /* don't inline this for the x86 */
 129 #endif
 130 mp_limb_t
 131 #if __STDC__
 132 find_a (mp_srcptr cp)
 133 #else
 134 find_a (cp)
 135      mp_srcptr cp;
 136 #endif
 137 {
 138   unsigned long int leading_zero_bits = 0;
 139
 140   mp_limb_t n1_l = cp[0];       /* N1 == n1_h * 2^BITS_PER_MP_LIMB + n1_l.  */
 141   mp_limb_t n1_h = cp[1];
 142
 143   mp_limb_t n2_l = -n1_l;       /* N2 == n2_h * 2^BITS_PER_MP_LIMB + n2_l.  */
 144   mp_limb_t n2_h = ~n1_h;
 145
 146   /* Main loop.  */
 147   while (n2_h)                  /* While N2 >= 2^BITS_PER_MP_LIMB.  */
 148     {
 149       /* N1 <-- N1 % N2.  */
 150       if ((MP_LIMB_T_HIGHBIT >> leading_zero_bits & n2_h) == 0)
 151         {
 152           unsigned long int i;
 153           count_leading_zeros (i, n2_h);
 154           i -= leading_zero_bits, leading_zero_bits += i;
 155           n2_h = n2_h<<i | n2_l>>(BITS_PER_MP_LIMB - i), n2_l <<= i;
 156           do
 157             {
 158               if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l))
 159                 n1_h -= n2_h + (n1_l < n2_l), n1_l -= n2_l;
 160               n2_l = n2_l>>1 | n2_h<<(BITS_PER_MP_LIMB - 1), n2_h >>= 1;
 161               i -= 1;
 162             }
 163           while (i);
 164         }
 165       if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l))
 166         n1_h -= n2_h + (n1_l < n2_l), n1_l -= n2_l;
 167
 168       MP_LIMB_T_SWAP (n1_h, n2_h);
 169       MP_LIMB_T_SWAP (n1_l, n2_l);
 170     }
 171
 172   return n2_l;
 173 }
 174
 175 mp_size_t
 176 #if __STDC__
 177 mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize)
 178 #else
 179 mpn_gcd (gp, up, usize, vp, vsize)
 180      mp_ptr gp;
 181      mp_ptr up;
 182      mp_size_t usize;
 183      mp_ptr vp;
 184      mp_size_t vsize;
 185 #endif
 186 {
 187   mp_ptr orig_vp = vp;
 188   mp_size_t orig_vsize = vsize;
 189   int binary_gcd_ctr;           /* Number of times binary gcd will execute.  */
 190   TMP_DECL (marker);
 191
 192   TMP_MARK (marker);
 193
 194   /* Use accelerated algorithm if vsize is over GCD_ACCEL_THRESHOLD.
 195      Two EXTRA limbs for U and V are required for kary reduction.  */
 196   if (vsize >= GCD_ACCEL_THRESHOLD)
 197     {
 198       unsigned long int vbitsize, d;
 199       mp_ptr orig_up = up;
 200       mp_size_t orig_usize = usize;
 201       mp_ptr anchor_up = (mp_ptr) TMP_ALLOC ((usize + 2) * BYTES_PER_MP_LIMB);
 202
 203       MPN_COPY (anchor_up, orig_up, usize);
 204       up = anchor_up;
 205
 206       count_leading_zeros (d, up[usize-1]);
 207       d = usize * BITS_PER_MP_LIMB - d;
 208       count_leading_zeros (vbitsize, vp[vsize-1]);
 209       vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize;
 210       d = d - vbitsize + 1;
 211
 212       /* Use bmod reduction to quickly discover whether V divides U.  */
 213       up[usize++] = 0;                          /* Insert leading zero.  */
 214       mpn_bdivmod (up, up, usize, vp, vsize, d);
 215
 216       /* Now skip U/V mod 2^d and any low zero limbs.  */
 217       d /= BITS_PER_MP_LIMB, up += d, usize -= d;
 218       while (usize != 0 && up[0] == 0)
 219         up++, usize--;
 220
 221       if (usize == 0)                           /* GCD == ORIG_V.  */
 222         goto done;
 223
 224       vp = (mp_ptr) TMP_ALLOC ((vsize + 2) * BYTES_PER_MP_LIMB);
 225       MPN_COPY (vp, orig_vp, vsize);
 226
 227       do                                        /* Main loop.  */
 228         {
 229           /* mpn_com_n can't be used here because anchor_up and up may
 230              partially overlap */
 231           if (up[usize-1] & MP_LIMB_T_HIGHBIT)  /* U < 0; take twos' compl. */
 232             {
 233               mp_size_t i;
 234               anchor_up[0] = -up[0];
 235               for (i = 1; i < usize; i++)
 236                 anchor_up[i] = ~up[i];
 237               up = anchor_up;
 238             }
 239
 240           MPN_NORMALIZE_NOT_ZERO (up, usize);
 241
 242           if ((up[0] & 1) == 0)                 /* Result even; remove twos. */
 243             {
 244               unsigned int r;
 245               count_trailing_zeros (r, up[0]);
 246               mpn_rshift (anchor_up, up, usize, r);
 247               usize -= (anchor_up[usize-1] == 0);
 248             }
 249           else if (anchor_up != up)
 250             MPN_COPY_INCR (anchor_up, up, usize);
 251
 252           MPN_PTR_SWAP (anchor_up,usize, vp,vsize);
 253           up = anchor_up;
 254
 255           if (vsize <= 2)               /* Kary can't handle < 2 limbs and  */
 256             break;                      /* isn't efficient for == 2 limbs.  */
 257
 258           d = vbitsize;
 259           count_leading_zeros (vbitsize, vp[vsize-1]);
 260           vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize;
 261           d = d - vbitsize + 1;
 262
 263           if (d > BMOD_THRESHOLD)       /* Bmod reduction.  */
 264             {
 265               up[usize++] = 0;
 266               mpn_bdivmod (up, up, usize, vp, vsize, d);
 267               d /= BITS_PER_MP_LIMB, up += d, usize -= d;
 268             }
 269           else                          /* Kary reduction.  */
 270             {
 271               mp_limb_t bp[2], cp[2];
 272
 273               /* C <-- V/U mod 2^(2*BITS_PER_MP_LIMB).  */
 274               {
 275                 mp_limb_t u_inv, hi, lo;
 276                 modlimb_invert (u_inv, up[0]);
 277                 cp[0] = vp[0] * u_inv;
 278                 umul_ppmm (hi, lo, cp[0], up[0]);
 279                 cp[1] = (vp[1] - hi - cp[0] * up[1]) * u_inv;
 280               }
 281
 282               /* U <-- find_a (C)  *  U.  */
 283               up[usize] = mpn_mul_1 (up, up, usize, find_a (cp));
 284               usize++;
 285
 286               /* B <-- A/C == U/V mod 2^(BITS_PER_MP_LIMB + 1).
 287                   bp[0] <-- U/V mod 2^BITS_PER_MP_LIMB and
 288                   bp[1] <-- ( (U - bp[0] * V)/2^BITS_PER_MP_LIMB ) / V mod 2
 289
 290                  Like V/U above, but simplified because only the low bit of
 291                  bp[1] is wanted. */
 292               {
 293                 mp_limb_t  v_inv, hi, lo;
 294                 modlimb_invert (v_inv, vp[0]);
 295                 bp[0] = up[0] * v_inv;
 296                 umul_ppmm (hi, lo, bp[0], vp[0]);
 297                 bp[1] = (up[1] + hi + (bp[0]&vp[1])) & 1;
 298               }
 299
 300               up[usize++] = 0;
 301               if (bp[1])        /* B < 0: U <-- U + (-B)  * V.  */
 302                 {
 303                    mp_limb_t c = mpn_addmul_1 (up, vp, vsize, -bp[0]);
 304                    mpn_add_1 (up + vsize, up + vsize, usize - vsize, c);
 305                 }
 306               else              /* B >= 0:  U <-- U - B * V.  */
 307                 {
 308                   mp_limb_t b = mpn_submul_1 (up, vp, vsize, bp[0]);
 309                   mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b);
 310                 }
 311
 312               up += 2, usize -= 2;  /* At least two low limbs are zero.  */
 313             }
 314
 315           /* Must remove low zero limbs before complementing.  */
 316           while (usize != 0 && up[0] == 0)
 317             up++, usize--;
 318         }
 319       while (usize);
 320
 321       /* Compute GCD (ORIG_V, GCD (ORIG_U, V)).  Binary will execute twice.  */
 322       up = orig_up, usize = orig_usize;
 323       binary_gcd_ctr = 2;
 324     }
 325   else
 326     binary_gcd_ctr = 1;
 327
 328   /* Finish up with the binary algorithm.  Executes once or twice.  */
 329   for ( ; binary_gcd_ctr--; up = orig_vp, usize = orig_vsize)
 330     {
 331       if (usize > 2)            /* First make U close to V in size.  */
 332         {
 333           unsigned long int vbitsize, d;
 334           count_leading_zeros (d, up[usize-1]);
 335           d = usize * BITS_PER_MP_LIMB - d;
 336           count_leading_zeros (vbitsize, vp[vsize-1]);
 337           vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize;
 338           d = d - vbitsize - 1;
 339           if (d != -(unsigned long int)1 && d > 2)
 340             {
 341               mpn_bdivmod (up, up, usize, vp, vsize, d);  /* Result > 0.  */
 342               d /= (unsigned long int)BITS_PER_MP_LIMB, up += d, usize -= d;
 343             }
 344         }
 345
 346       /* Start binary GCD.  */
 347       do
 348         {
 349           mp_size_t zeros;
 350
 351           /* Make sure U is odd.  */
 352           MPN_NORMALIZE (up, usize);
 353           while (up[0] == 0)
 354             up += 1, usize -= 1;
 355           if ((up[0] & 1) == 0)
 356             {
 357               unsigned int r;
 358               count_trailing_zeros (r, up[0]);
 359               mpn_rshift (up, up, usize, r);
 360               usize -= (up[usize-1] == 0);
 361             }
 362
 363           /* Keep usize >= vsize.  */
 364           if (usize < vsize)
 365             MPN_PTR_SWAP (up, usize, vp, vsize);
 366
 367           if (usize <= 2)                               /* Double precision. */
 368             {
 369               if (vsize == 1)
 370                 vp[0] = mpn_gcd_1 (up, usize, vp[0]);
 371               else
 372                 vsize = gcd_2 (vp, up);
 373               break;                                    /* Binary GCD done.  */
 374             }
 375
 376           /* Count number of low zero limbs of U - V.  */
 377           for (zeros = 0; up[zeros] == vp[zeros] && ++zeros != vsize; )
 378             continue;
 379
 380           /* If U < V, swap U and V; in any case, subtract V from U.  */
 381           if (zeros == vsize)                           /* Subtract done.  */
 382             up += zeros, usize -= zeros;
 383           else if (usize == vsize)
 384             {
 385               mp_size_t size = vsize;
 386               do
 387                 size--;
 388               while (up[size] == vp[size]);
 389               if (up[size] < vp[size])                  /* usize == vsize.  */
 390                 MP_PTR_SWAP (up, vp);
 391               up += zeros, usize = size + 1 - zeros;
 392               mpn_sub_n (up, up, vp + zeros, usize);
 393             }
 394           else
 395             {
 396               mp_size_t size = vsize - zeros;
 397               up += zeros, usize -= zeros;
 398               if (mpn_sub_n (up, up, vp + zeros, size))
 399                 {
 400                   while (up[size] == 0)                 /* Propagate borrow. */
 401                     up[size++] = -(mp_limb_t)1;
 402                   up[size] -= 1;
 403                 }
 404             }
 405         }
 406       while (usize);                                    /* End binary GCD.  */
 407     }
 408
 409 done:
 410   if (vp != gp)
 411     MPN_COPY (gp, vp, vsize);
 412   TMP_FREE (marker);
 413   return vsize;
 414 }