From a796e51208fb0e56d66f223c01ee6a4fbcbf8e98 Mon Sep 17 00:00:00 2001 From: simonm Date: Fri, 5 Jun 1998 14:37:50 +0000 Subject: [PATCH] [project @ 1998-06-05 14:37:50 by simonm] Import GMP 2.0.2 --- ghc/rts/gmp/mpn/cray/gmp-mparam.h | 27 +++ ghc/rts/gmp/mpn/generic/add_n.c | 62 +++++ ghc/rts/gmp/mpn/generic/addmul_1.c | 65 ++++++ ghc/rts/gmp/mpn/generic/bdivmod.c | 129 +++++++++++ ghc/rts/gmp/mpn/generic/cmp.c | 56 +++++ ghc/rts/gmp/mpn/generic/divrem.c | 245 ++++++++++++++++++++ ghc/rts/gmp/mpn/generic/divrem_1.c | 58 +++++ ghc/rts/gmp/mpn/generic/dump.c | 20 ++ ghc/rts/gmp/mpn/generic/gcd.c | 402 ++++++++++++++++++++++++++++++++ ghc/rts/gmp/mpn/generic/gcd_1.c | 73 ++++++ ghc/rts/gmp/mpn/generic/gcdext.c | 441 ++++++++++++++++++++++++++++++++++++ 11 files changed, 1578 insertions(+) create mode 100644 ghc/rts/gmp/mpn/cray/gmp-mparam.h create mode 100644 ghc/rts/gmp/mpn/generic/add_n.c create mode 100644 ghc/rts/gmp/mpn/generic/addmul_1.c create mode 100644 ghc/rts/gmp/mpn/generic/bdivmod.c create mode 100644 ghc/rts/gmp/mpn/generic/cmp.c create mode 100644 ghc/rts/gmp/mpn/generic/divrem.c create mode 100644 ghc/rts/gmp/mpn/generic/divrem_1.c create mode 100644 ghc/rts/gmp/mpn/generic/dump.c create mode 100644 ghc/rts/gmp/mpn/generic/gcd.c create mode 100644 ghc/rts/gmp/mpn/generic/gcd_1.c create mode 100644 ghc/rts/gmp/mpn/generic/gcdext.c diff --git a/ghc/rts/gmp/mpn/cray/gmp-mparam.h b/ghc/rts/gmp/mpn/cray/gmp-mparam.h new file mode 100644 index 0000000..349c812 --- /dev/null +++ b/ghc/rts/gmp/mpn/cray/gmp-mparam.h @@ -0,0 +1,27 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 64 +#define BYTES_PER_MP_LIMB 8 +#define BITS_PER_LONGINT 64 +#define BITS_PER_INT 64 +#define BITS_PER_SHORTINT 32 +#define BITS_PER_CHAR 8 diff --git a/ghc/rts/gmp/mpn/generic/add_n.c b/ghc/rts/gmp/mpn/generic/add_n.c new file mode 100644 index 0000000..9d71df1 --- /dev/null +++ b/ghc/rts/gmp/mpn/generic/add_n.c @@ -0,0 +1,62 @@ +/* mpn_add_n -- Add two limb vectors of equal, non-zero length. + +Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" + +mp_limb_t +#if __STDC__ +mpn_add_n (mp_ptr res_ptr, mp_srcptr s1_ptr, mp_srcptr s2_ptr, mp_size_t size) +#else +mpn_add_n (res_ptr, s1_ptr, s2_ptr, size) + register mp_ptr res_ptr; + register mp_srcptr s1_ptr; + register mp_srcptr s2_ptr; + mp_size_t size; +#endif +{ + register mp_limb_t x, y, cy; + register mp_size_t j; + + /* The loop counter and index J goes from -SIZE to -1. This way + the loop becomes faster. */ + j = -size; + + /* Offset the base pointers to compensate for the negative indices. */ + s1_ptr -= j; + s2_ptr -= j; + res_ptr -= j; + + cy = 0; + do + { + y = s2_ptr[j]; + x = s1_ptr[j]; + y += cy; /* add previous carry to one addend */ + cy = (y < cy); /* get out carry from that addition */ + y = x + y; /* add other addend */ + cy = (y < x) + cy; /* get out carry from that add, combine */ + res_ptr[j] = y; + } + while (++j != 0); + + return cy; +} diff --git a/ghc/rts/gmp/mpn/generic/addmul_1.c b/ghc/rts/gmp/mpn/generic/addmul_1.c new file mode 100644 index 0000000..3a5e214 --- /dev/null +++ b/ghc/rts/gmp/mpn/generic/addmul_1.c @@ -0,0 +1,65 @@ +/* mpn_addmul_1 -- multiply the S1_SIZE long limb vector pointed to by S1_PTR + by S2_LIMB, add the S1_SIZE least significant limbs of the product to the + limb vector pointed to by RES_PTR. Return the most significant limb of + the product, adjusted for carry-out from the addition. + +Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +mpn_addmul_1 (res_ptr, s1_ptr, s1_size, s2_limb) + register mp_ptr res_ptr; + register mp_srcptr s1_ptr; + mp_size_t s1_size; + register mp_limb_t s2_limb; +{ + register mp_limb_t cy_limb; + register mp_size_t j; + register mp_limb_t prod_high, prod_low; + register mp_limb_t x; + + /* The loop counter and index J goes from -SIZE to -1. This way + the loop becomes faster. */ + j = -s1_size; + + /* Offset the base pointers to compensate for the negative indices. */ + res_ptr -= j; + s1_ptr -= j; + + cy_limb = 0; + do + { + umul_ppmm (prod_high, prod_low, s1_ptr[j], s2_limb); + + prod_low += cy_limb; + cy_limb = (prod_low < cy_limb) + prod_high; + + x = res_ptr[j]; + prod_low = x + prod_low; + cy_limb += (prod_low < x); + res_ptr[j] = prod_low; + } + while (++j != 0); + + return cy_limb; +} diff --git a/ghc/rts/gmp/mpn/generic/bdivmod.c b/ghc/rts/gmp/mpn/generic/bdivmod.c new file mode 100644 index 0000000..f095288 --- /dev/null +++ b/ghc/rts/gmp/mpn/generic/bdivmod.c @@ -0,0 +1,129 @@ +/* mpn/bdivmod.c: mpn_bdivmod for computing U/V mod 2^d. + +Copyright (C) 1991, 1993, 1994, 1995, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +/* q_high = mpn_bdivmod (qp, up, usize, vp, vsize, d). + + Puts the low d/BITS_PER_MP_LIMB limbs of Q = U / V mod 2^d at qp, and + returns the high d%BITS_PER_MP_LIMB bits of Q as the result. + + Also, U - Q * V mod 2^(usize*BITS_PER_MP_LIMB) is placed at up. Since the + low d/BITS_PER_MP_LIMB limbs of this difference are zero, the code allows + the limb vectors at qp to overwrite the low limbs at up, provided qp <= up. + + Preconditions: + 1. V is odd. + 2. usize * BITS_PER_MP_LIMB >= d. + 3. If Q and U overlap, qp <= up. + + Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu) + + Funding for this work has been partially provided by Conselho Nacional + de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do Brazil, Grant + 301314194-2, and was done while I was a visiting reseacher in the Instituto + de Matema'tica at Universidade Federal do Rio Grande do Sul (UFRGS). + + References: + T. Jebelean, An algorithm for exact division, Journal of Symbolic + Computation, v. 15, 1993, pp. 169-180. + + K. Weber, The accelerated integer GCD algorithm, ACM Transactions on + Mathematical Software, v. 21 (March), 1995, pp. 111-122. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +#if __STDC__ +mpn_bdivmod (mp_ptr qp, mp_ptr up, mp_size_t usize, + mp_srcptr vp, mp_size_t vsize, unsigned long int d) +#else +mpn_bdivmod (qp, up, usize, vp, vsize, d) + mp_ptr qp; + mp_ptr up; + mp_size_t usize; + mp_srcptr vp; + mp_size_t vsize; + unsigned long int d; +#endif +{ + /* Cache for v_inv is used to make mpn_accelgcd faster. */ + static mp_limb_t previous_low_vlimb = 0; + static mp_limb_t v_inv; /* 1/V mod 2^BITS_PER_MP_LIMB. */ + + if (vp[0] != previous_low_vlimb) /* Cache miss; compute v_inv. */ + { + mp_limb_t v = previous_low_vlimb = vp[0]; + mp_limb_t make_zero = 1; + mp_limb_t two_i = 1; + v_inv = 0; + do + { + while ((two_i & make_zero) == 0) + two_i <<= 1, v <<= 1; + v_inv += two_i; + make_zero -= v; + } + while (make_zero); + } + + /* Need faster computation for some common cases in mpn_accelgcd. */ + if (usize == 2 && vsize == 2 && + (d == BITS_PER_MP_LIMB || d == 2*BITS_PER_MP_LIMB)) + { + mp_limb_t hi, lo; + mp_limb_t q = up[0] * v_inv; + umul_ppmm (hi, lo, q, vp[0]); + up[0] = 0, up[1] -= hi + q*vp[1], qp[0] = q; + if (d == 2*BITS_PER_MP_LIMB) + q = up[1] * v_inv, up[1] = 0, qp[1] = q; + return 0; + } + + /* Main loop. */ + while (d >= BITS_PER_MP_LIMB) + { + mp_limb_t q = up[0] * v_inv; + mp_limb_t b = mpn_submul_1 (up, vp, MIN (usize, vsize), q); + if (usize > vsize) + mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b); + d -= BITS_PER_MP_LIMB; + up += 1, usize -= 1; + *qp++ = q; + } + + if (d) + { + mp_limb_t b; + mp_limb_t q = (up[0] * v_inv) & (((mp_limb_t)1< vsize) + mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b); + return q; + } + + return 0; +} diff --git a/ghc/rts/gmp/mpn/generic/cmp.c b/ghc/rts/gmp/mpn/generic/cmp.c new file mode 100644 index 0000000..4e9c60d --- /dev/null +++ b/ghc/rts/gmp/mpn/generic/cmp.c @@ -0,0 +1,56 @@ +/* mpn_cmp -- Compare two low-level natural-number integers. 
+ +Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +/* Compare OP1_PTR/OP1_SIZE with OP2_PTR/OP2_SIZE. + There are no restrictions on the relative sizes of + the two arguments. + Return 1 if OP1 > OP2, 0 if they are equal, and -1 if OP1 < OP2. */ + +int +#if __STDC__ +mpn_cmp (mp_srcptr op1_ptr, mp_srcptr op2_ptr, mp_size_t size) +#else +mpn_cmp (op1_ptr, op2_ptr, size) + mp_srcptr op1_ptr; + mp_srcptr op2_ptr; + mp_size_t size; +#endif +{ + mp_size_t i; + mp_limb_t op1_word, op2_word; + + for (i = size - 1; i >= 0; i--) + { + op1_word = op1_ptr[i]; + op2_word = op2_ptr[i]; + if (op1_word != op2_word) + goto diff; + } + return 0; + diff: + /* This can *not* be simplified to + op2_word - op2_word + since that expression might give signed overflow. */ + return (op1_word > op2_word) ? 1 : -1; +} diff --git a/ghc/rts/gmp/mpn/generic/divrem.c b/ghc/rts/gmp/mpn/generic/divrem.c new file mode 100644 index 0000000..1fe865a --- /dev/null +++ b/ghc/rts/gmp/mpn/generic/divrem.c @@ -0,0 +1,245 @@ +/* mpn_divrem -- Divide natural numbers, producing both remainder and + quotient. 
+ +Copyright (C) 1993, 1994, 1995, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Divide num (NP/NSIZE) by den (DP/DSIZE) and write + the NSIZE-DSIZE least significant quotient limbs at QP + and the DSIZE long remainder at NP. If QEXTRA_LIMBS is + non-zero, generate that many fraction bits and append them after the + other quotient limbs. + Return the most significant limb of the quotient, this is always 0 or 1. + + Preconditions: + 0. NSIZE >= DSIZE. + 1. The most significant bit of the divisor must be set. + 2. QP must either not overlap with the input operands at all, or + QP + DSIZE >= NP must hold true. (This means that it's + possible to put the quotient in the high part of NUM, right after the + remainder in NUM. + 3. NSIZE >= DSIZE, even if QEXTRA_LIMBS is non-zero. 
*/ + +mp_limb_t +#if __STDC__ +mpn_divrem (mp_ptr qp, mp_size_t qextra_limbs, + mp_ptr np, mp_size_t nsize, + mp_srcptr dp, mp_size_t dsize) +#else +mpn_divrem (qp, qextra_limbs, np, nsize, dp, dsize) + mp_ptr qp; + mp_size_t qextra_limbs; + mp_ptr np; + mp_size_t nsize; + mp_srcptr dp; + mp_size_t dsize; +#endif +{ + mp_limb_t most_significant_q_limb = 0; + + switch (dsize) + { + case 0: + /* We are asked to divide by zero, so go ahead and do it! (To make + the compiler not remove this statement, return the value.) */ + return 1 / dsize; + + case 1: + { + mp_size_t i; + mp_limb_t n1; + mp_limb_t d; + + d = dp[0]; + n1 = np[nsize - 1]; + + if (n1 >= d) + { + n1 -= d; + most_significant_q_limb = 1; + } + + qp += qextra_limbs; + for (i = nsize - 2; i >= 0; i--) + udiv_qrnnd (qp[i], n1, n1, np[i], d); + qp -= qextra_limbs; + + for (i = qextra_limbs - 1; i >= 0; i--) + udiv_qrnnd (qp[i], n1, n1, 0, d); + + np[0] = n1; + } + break; + + case 2: + { + mp_size_t i; + mp_limb_t n1, n0, n2; + mp_limb_t d1, d0; + + np += nsize - 2; + d1 = dp[1]; + d0 = dp[0]; + n1 = np[1]; + n0 = np[0]; + + if (n1 >= d1 && (n1 > d1 || n0 >= d0)) + { + sub_ddmmss (n1, n0, n1, n0, d1, d0); + most_significant_q_limb = 1; + } + + for (i = qextra_limbs + nsize - 2 - 1; i >= 0; i--) + { + mp_limb_t q; + mp_limb_t r; + + if (i >= qextra_limbs) + np--; + else + np[0] = 0; + + if (n1 == d1) + { + /* Q should be either 111..111 or 111..110. Need special + treatment of this rare case as normal division would + give overflow. */ + q = ~(mp_limb_t) 0; + + r = n0 + d1; + if (r < d1) /* Carry in the addition? */ + { + add_ssaaaa (n1, n0, r - d0, np[0], 0, d0); + qp[i] = q; + continue; + } + n1 = d0 - (d0 != 0); + n0 = -d0; + } + else + { + udiv_qrnnd (q, r, n1, n0, d1); + umul_ppmm (n1, n0, d0, q); + } + + n2 = np[0]; + q_test: + if (n1 > r || (n1 == r && n0 > n2)) + { + /* The estimated Q was too large. 
*/ + q--; + + sub_ddmmss (n1, n0, n1, n0, 0, d0); + r += d1; + if (r >= d1) /* If not carry, test Q again. */ + goto q_test; + } + + qp[i] = q; + sub_ddmmss (n1, n0, r, n2, n1, n0); + } + np[1] = n1; + np[0] = n0; + } + break; + + default: + { + mp_size_t i; + mp_limb_t dX, d1, n0; + + np += nsize - dsize; + dX = dp[dsize - 1]; + d1 = dp[dsize - 2]; + n0 = np[dsize - 1]; + + if (n0 >= dX) + { + if (n0 > dX || mpn_cmp (np, dp, dsize - 1) >= 0) + { + mpn_sub_n (np, np, dp, dsize); + n0 = np[dsize - 1]; + most_significant_q_limb = 1; + } + } + + for (i = qextra_limbs + nsize - dsize - 1; i >= 0; i--) + { + mp_limb_t q; + mp_limb_t n1, n2; + mp_limb_t cy_limb; + + if (i >= qextra_limbs) + { + np--; + n2 = np[dsize]; + } + else + { + n2 = np[dsize - 1]; + MPN_COPY_DECR (np + 1, np, dsize); + np[0] = 0; + } + + if (n0 == dX) + /* This might over-estimate q, but it's probably not worth + the extra code here to find out. */ + q = ~(mp_limb_t) 0; + else + { + mp_limb_t r; + + udiv_qrnnd (q, r, n0, np[dsize - 1], dX); + umul_ppmm (n1, n0, d1, q); + + while (n1 > r || (n1 == r && n0 > np[dsize - 2])) + { + q--; + r += dX; + if (r < dX) /* I.e. "carry in previous addition?" */ + break; + n1 -= n0 < d1; + n0 -= d1; + } + } + + /* Possible optimization: We already have (q * n0) and (1 * n1) + after the calculation of q. Taking advantage of that, we + could make this loop make two iterations less. */ + + cy_limb = mpn_submul_1 (np, dp, dsize, q); + + if (n2 != cy_limb) + { + mpn_add_n (np, np, dp, dsize); + q--; + } + + qp[i] = q; + n0 = np[dsize - 1]; + } + } + } + + return most_significant_q_limb; +} diff --git a/ghc/rts/gmp/mpn/generic/divrem_1.c b/ghc/rts/gmp/mpn/generic/divrem_1.c new file mode 100644 index 0000000..d213267 --- /dev/null +++ b/ghc/rts/gmp/mpn/generic/divrem_1.c @@ -0,0 +1,58 @@ +/* mpn_divrem_1(quot_ptr, qsize, dividend_ptr, dividend_size, divisor_limb) -- + Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB. 
+ Write DIVIDEND_SIZE limbs of quotient at QUOT_PTR. + Return the single-limb remainder. + There are no constraints on the value of the divisor. + + QUOT_PTR and DIVIDEND_PTR might point to the same limb. + +Copyright (C) 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +#if __STDC__ +mpn_divrem_1 (mp_ptr qp, mp_size_t qsize, + mp_srcptr dividend_ptr, mp_size_t dividend_size, + mp_limb_t divisor_limb) +#else +mpn_divrem_1 (qp, qsize, dividend_ptr, dividend_size, divisor_limb) + mp_ptr qp; + mp_size_t qsize; + mp_srcptr dividend_ptr; + mp_size_t dividend_size; + mp_limb_t divisor_limb; +#endif +{ + mp_limb_t rlimb; + long i; + + /* Develop integer part of quotient. 
*/ + rlimb = mpn_divmod_1 (qp + qsize, dividend_ptr, dividend_size, divisor_limb); + + if (qsize != 0) + { + for (i = qsize - 1; i >= 0; i--) + udiv_qrnnd (qp[i], rlimb, rlimb, 0, divisor_limb); + } + return rlimb; +} diff --git a/ghc/rts/gmp/mpn/generic/dump.c b/ghc/rts/gmp/mpn/generic/dump.c new file mode 100644 index 0000000..a5831c4 --- /dev/null +++ b/ghc/rts/gmp/mpn/generic/dump.c @@ -0,0 +1,20 @@ +#include +#include "gmp.h" +#include "gmp-impl.h" + +void +mpn_dump (ptr, size) + mp_srcptr ptr; + mp_size_t size; +{ + if (size == 0) + printf ("0\n"); + { + while (size) + { + size--; + printf ("%0*lX", (int) (2 * BYTES_PER_MP_LIMB), ptr[size]); + } + printf ("\n"); + } +} diff --git a/ghc/rts/gmp/mpn/generic/gcd.c b/ghc/rts/gmp/mpn/generic/gcd.c new file mode 100644 index 0000000..8c2bbf0 --- /dev/null +++ b/ghc/rts/gmp/mpn/generic/gcd.c @@ -0,0 +1,402 @@ +/* mpn/gcd.c: mpn_gcd for gcd of two odd integers. + +Copyright (C) 1991, 1993, 1994, 1995, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +/* Integer greatest common divisor of two unsigned integers, using + the accelerated algorithm (see reference below). + + mp_size_t mpn_gcd (vp, vsize, up, usize). 
+ + Preconditions [U = (up, usize) and V = (vp, vsize)]: + + 1. V is odd. + 2. numbits(U) >= numbits(V). + + Both U and V are destroyed by the operation. The result is left at vp, + and its size is returned. + + Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu) + + Funding for this work has been partially provided by Conselho Nacional + de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do Brazil, Grant + 301314194-2, and was done while I was a visiting reseacher in the Instituto + de Matema'tica at Universidade Federal do Rio Grande do Sul (UFRGS). + + Refer to + K. Weber, The accelerated integer GCD algorithm, ACM Transactions on + Mathematical Software, v. 21 (March), 1995, pp. 111-122. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* If MIN (usize, vsize) > ACCEL_THRESHOLD, then the accelerated algorithm is + used, otherwise the binary algorithm is used. This may be adjusted for + different architectures. */ +#ifndef ACCEL_THRESHOLD +#define ACCEL_THRESHOLD 4 +#endif + +/* When U and V differ in size by more than BMOD_THRESHOLD, the accelerated + algorithm reduces using the bmod operation. Otherwise, the k-ary reduction + is used. 0 <= BMOD_THRESHOLD < BITS_PER_MP_LIMB. */ +enum + { + BMOD_THRESHOLD = BITS_PER_MP_LIMB/2 + }; + +#define SIGN_BIT (~(~(mp_limb_t)0 >> 1)) + + +#define SWAP_LIMB(UL, VL) do{mp_limb_t __l=(UL);(UL)=(VL);(VL)=__l;}while(0) +#define SWAP_PTR(UP, VP) do{mp_ptr __p=(UP);(UP)=(VP);(VP)=__p;}while(0) +#define SWAP_SZ(US, VS) do{mp_size_t __s=(US);(US)=(VS);(VS)=__s;}while(0) +#define SWAP_MPN(UP, US, VP, VS) do{SWAP_PTR(UP,VP);SWAP_SZ(US,VS);}while(0) + +/* Use binary algorithm to compute V <-- GCD (V, U) for usize, vsize == 2. + Both U and V must be odd. 
*/ +static __gmp_inline mp_size_t +#if __STDC__ +gcd_2 (mp_ptr vp, mp_srcptr up) +#else +gcd_2 (vp, up) + mp_ptr vp; + mp_srcptr up; +#endif +{ + mp_limb_t u0, u1, v0, v1; + mp_size_t vsize; + + u0 = up[0], u1 = up[1], v0 = vp[0], v1 = vp[1]; + + while (u1 != v1 && u0 != v0) + { + unsigned long int r; + if (u1 > v1) + { + u1 -= v1 + (u0 < v0), u0 -= v0; + count_trailing_zeros (r, u0); + u0 = u1 << (BITS_PER_MP_LIMB - r) | u0 >> r; + u1 >>= r; + } + else /* u1 < v1. */ + { + v1 -= u1 + (v0 < u0), v0 -= u0; + count_trailing_zeros (r, v0); + v0 = v1 << (BITS_PER_MP_LIMB - r) | v0 >> r; + v1 >>= r; + } + } + + vp[0] = v0, vp[1] = v1, vsize = 1 + (v1 != 0); + + /* If U == V == GCD, done. Otherwise, compute GCD (V, |U - V|). */ + if (u1 == v1 && u0 == v0) + return vsize; + + v0 = (u0 == v0) ? (u1 > v1) ? u1-v1 : v1-u1 : (u0 > v0) ? u0-v0 : v0-u0; + vp[0] = mpn_gcd_1 (vp, vsize, v0); + + return 1; +} + +/* The function find_a finds 0 < N < 2^BITS_PER_MP_LIMB such that there exists + 0 < |D| < 2^BITS_PER_MP_LIMB, and N == D * C mod 2^(2*BITS_PER_MP_LIMB). + In the reference article, D was computed along with N, but it is better to + compute D separately as D <-- N / C mod 2^(BITS_PER_MP_LIMB + 1), treating + the result as a twos' complement signed integer. + + Initialize N1 to C mod 2^(2*BITS_PER_MP_LIMB). According to the reference + article, N2 should be initialized to 2^(2*BITS_PER_MP_LIMB), but we use + 2^(2*BITS_PER_MP_LIMB) - N1 to start the calculations within double + precision. If N2 > N1 initially, the first iteration of the while loop + will swap them. In all other situations, N1 >= N2 is maintained. */ + +static __gmp_inline mp_limb_t +#if __STDC__ +find_a (mp_srcptr cp) +#else +find_a (cp) + mp_srcptr cp; +#endif +{ + unsigned long int leading_zero_bits = 0; + + mp_limb_t n1_l = cp[0]; /* N1 == n1_h * 2^BITS_PER_MP_LIMB + n1_l. */ + mp_limb_t n1_h = cp[1]; + + mp_limb_t n2_l = -n1_l; /* N2 == n2_h * 2^BITS_PER_MP_LIMB + n2_l. 
*/ + mp_limb_t n2_h = ~n1_h; + + /* Main loop. */ + while (n2_h) /* While N2 >= 2^BITS_PER_MP_LIMB. */ + { + /* N1 <-- N1 % N2. */ + if ((SIGN_BIT >> leading_zero_bits & n2_h) == 0) + { + unsigned long int i; + count_leading_zeros (i, n2_h); + i -= leading_zero_bits, leading_zero_bits += i; + n2_h = n2_h<>(BITS_PER_MP_LIMB - i), n2_l <<= i; + do + { + if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l)) + n1_h -= n2_h + (n1_l < n2_l), n1_l -= n2_l; + n2_l = n2_l>>1 | n2_h<<(BITS_PER_MP_LIMB - 1), n2_h >>= 1; + i -= 1; + } + while (i); + } + if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l)) + n1_h -= n2_h + (n1_l < n2_l), n1_l -= n2_l; + + SWAP_LIMB (n1_h, n2_h); + SWAP_LIMB (n1_l, n2_l); + } + + return n2_l; +} + +mp_size_t +#if __STDC__ +mpn_gcd (mp_ptr gp, mp_ptr vp, mp_size_t vsize, mp_ptr up, mp_size_t usize) +#else +mpn_gcd (gp, vp, vsize, up, usize) + mp_ptr gp; + mp_ptr vp; + mp_size_t vsize; + mp_ptr up; + mp_size_t usize; +#endif +{ + mp_ptr orig_vp = vp; + mp_size_t orig_vsize = vsize; + int binary_gcd_ctr; /* Number of times binary gcd will execute. */ + TMP_DECL (marker); + + TMP_MARK (marker); + + /* Use accelerated algorithm if vsize is over ACCEL_THRESHOLD. + Two EXTRA limbs for U and V are required for kary reduction. */ + if (vsize > ACCEL_THRESHOLD) + { + unsigned long int vbitsize, d; + mp_ptr orig_up = up; + mp_size_t orig_usize = usize; + mp_ptr anchor_up = (mp_ptr) TMP_ALLOC ((usize + 2) * BYTES_PER_MP_LIMB); + + MPN_COPY (anchor_up, orig_up, usize); + up = anchor_up; + + count_leading_zeros (d, up[usize-1]); + d = usize * BITS_PER_MP_LIMB - d; + count_leading_zeros (vbitsize, vp[vsize-1]); + vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize; + d = d - vbitsize + 1; + + /* Use bmod reduction to quickly discover whether V divides U. */ + up[usize++] = 0; /* Insert leading zero. */ + mpn_bdivmod (up, up, usize, vp, vsize, d); + + /* Now skip U/V mod 2^d and any low zero limbs. 
*/ + d /= BITS_PER_MP_LIMB, up += d, usize -= d; + while (usize != 0 && up[0] == 0) + up++, usize--; + + if (usize == 0) /* GCD == ORIG_V. */ + goto done; + + vp = (mp_ptr) TMP_ALLOC ((vsize + 2) * BYTES_PER_MP_LIMB); + MPN_COPY (vp, orig_vp, vsize); + + do /* Main loop. */ + { + if (up[usize-1] & SIGN_BIT) /* U < 0; take twos' compl. */ + { + mp_size_t i; + anchor_up[0] = -up[0]; + for (i = 1; i < usize; i++) + anchor_up[i] = ~up[i]; + up = anchor_up; + } + + MPN_NORMALIZE_NOT_ZERO (up, usize); + + if ((up[0] & 1) == 0) /* Result even; remove twos. */ + { + unsigned long int r; + count_trailing_zeros (r, up[0]); + mpn_rshift (anchor_up, up, usize, r); + usize -= (anchor_up[usize-1] == 0); + } + else if (anchor_up != up) + MPN_COPY (anchor_up, up, usize); + + SWAP_MPN (anchor_up, usize, vp, vsize); + up = anchor_up; + + if (vsize <= 2) /* Kary can't handle < 2 limbs and */ + break; /* isn't efficient for == 2 limbs. */ + + d = vbitsize; + count_leading_zeros (vbitsize, vp[vsize-1]); + vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize; + d = d - vbitsize + 1; + + if (d > BMOD_THRESHOLD) /* Bmod reduction. */ + { + up[usize++] = 0; + mpn_bdivmod (up, up, usize, vp, vsize, d); + d /= BITS_PER_MP_LIMB, up += d, usize -= d; + } + else /* Kary reduction. */ + { + mp_limb_t bp[2], cp[2]; + + /* C <-- V/U mod 2^(2*BITS_PER_MP_LIMB). */ + cp[0] = vp[0], cp[1] = vp[1]; + mpn_bdivmod (cp, cp, 2, up, 2, 2*BITS_PER_MP_LIMB); + + /* U <-- find_a (C) * U. */ + up[usize] = mpn_mul_1 (up, up, usize, find_a (cp)); + usize++; + + /* B <-- A/C == U/V mod 2^(BITS_PER_MP_LIMB + 1). + bp[0] <-- U/V mod 2^BITS_PER_MP_LIMB and + bp[1] <-- ( (U - bp[0] * V)/2^BITS_PER_MP_LIMB ) / V mod 2 */ + bp[0] = up[0], bp[1] = up[1]; + mpn_bdivmod (bp, bp, 2, vp, 2, BITS_PER_MP_LIMB); + bp[1] &= 1; /* Since V is odd, division is unnecessary. */ + + up[usize++] = 0; + if (bp[1]) /* B < 0: U <-- U + (-B) * V. 
*/ + { + mp_limb_t c = mpn_addmul_1 (up, vp, vsize, -bp[0]); + mpn_add_1 (up + vsize, up + vsize, usize - vsize, c); + } + else /* B >= 0: U <-- U - B * V. */ + { + mp_limb_t b = mpn_submul_1 (up, vp, vsize, bp[0]); + mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b); + } + + up += 2, usize -= 2; /* At least two low limbs are zero. */ + } + + /* Must remove low zero limbs before complementing. */ + while (usize != 0 && up[0] == 0) + up++, usize--; + } + while (usize); + + /* Compute GCD (ORIG_V, GCD (ORIG_U, V)). Binary will execute twice. */ + up = orig_up, usize = orig_usize; + binary_gcd_ctr = 2; + } + else + binary_gcd_ctr = 1; + + /* Finish up with the binary algorithm. Executes once or twice. */ + for ( ; binary_gcd_ctr--; up = orig_vp, usize = orig_vsize) + { + if (usize > 2) /* First make U close to V in size. */ + { + unsigned long int vbitsize, d; + count_leading_zeros (d, up[usize-1]); + d = usize * BITS_PER_MP_LIMB - d; + count_leading_zeros (vbitsize, vp[vsize-1]); + vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize; + d = d - vbitsize - 1; + if (d != -(unsigned long int)1 && d > 2) + { + mpn_bdivmod (up, up, usize, vp, vsize, d); /* Result > 0. */ + d /= (unsigned long int)BITS_PER_MP_LIMB, up += d, usize -= d; + } + } + + /* Start binary GCD. */ + do + { + mp_size_t zeros; + + /* Make sure U is odd. */ + MPN_NORMALIZE (up, usize); + while (up[0] == 0) + up += 1, usize -= 1; + if ((up[0] & 1) == 0) + { + unsigned long int r; + count_trailing_zeros (r, up[0]); + mpn_rshift (up, up, usize, r); + usize -= (up[usize-1] == 0); + } + + /* Keep usize >= vsize. */ + if (usize < vsize) + SWAP_MPN (up, usize, vp, vsize); + + if (usize <= 2) /* Double precision. */ + { + if (vsize == 1) + vp[0] = mpn_gcd_1 (up, usize, vp[0]); + else + vsize = gcd_2 (vp, up); + break; /* Binary GCD done. */ + } + + /* Count number of low zero limbs of U - V. 
*/ + for (zeros = 0; up[zeros] == vp[zeros] && ++zeros != vsize; ) + continue; + + /* If U < V, swap U and V; in any case, subtract V from U. */ + if (zeros == vsize) /* Subtract done. */ + up += zeros, usize -= zeros; + else if (usize == vsize) + { + mp_size_t size = vsize; + do + size--; + while (up[size] == vp[size]); + if (up[size] < vp[size]) /* usize == vsize. */ + SWAP_PTR (up, vp); + up += zeros, usize = size + 1 - zeros; + mpn_sub_n (up, up, vp + zeros, usize); + } + else + { + mp_size_t size = vsize - zeros; + up += zeros, usize -= zeros; + if (mpn_sub_n (up, up, vp + zeros, size)) + { + while (up[size] == 0) /* Propagate borrow. */ + up[size++] = -(mp_limb_t)1; + up[size] -= 1; + } + } + } + while (usize); /* End binary GCD. */ + } + +done: + if (vp != gp) + MPN_COPY (gp, vp, vsize); + TMP_FREE (marker); + return vsize; +} diff --git a/ghc/rts/gmp/mpn/generic/gcd_1.c b/ghc/rts/gmp/mpn/generic/gcd_1.c new file mode 100644 index 0000000..ebcdfb5 --- /dev/null +++ b/ghc/rts/gmp/mpn/generic/gcd_1.c @@ -0,0 +1,73 @@ +/* mpn_gcd_1 -- + +Copyright (C) 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Does not work for U == 0 or V == 0.  It would be tough to make it work for
+   V == 0 since gcd(x,0) = x, and U does not generally fit in an mp_limb_t.  */
+
+mp_limb_t
+mpn_gcd_1 (up, size, vlimb)
+     mp_srcptr up;	/* operand U, SIZE limbs; must be nonzero (see above) */
+     mp_size_t size;	/* number of limbs in U, >= 1 */
+     mp_limb_t vlimb;	/* single-limb operand V; must be nonzero (see above) */
+{
+  mp_limb_t ulimb;
+  unsigned long int u_low_zero_bits, v_low_zero_bits;
+
+  if (size > 1)
+    {
+      ulimb = mpn_mod_1 (up, size, vlimb);	/* reduce U mod V to one limb */
+      if (ulimb == 0)
+	return vlimb;	/* V divides U, so gcd(U,V) == V */
+    }
+  else
+    ulimb = up[0];
+
+  /* Need to eliminate low zero bits.  */
+  count_trailing_zeros (u_low_zero_bits, ulimb);	/* ulimb != 0 here */
+  ulimb >>= u_low_zero_bits;
+
+  count_trailing_zeros (v_low_zero_bits, vlimb);	/* vlimb != 0 by contract */
+  vlimb >>= v_low_zero_bits;
+
+  while (ulimb != vlimb)	/* binary gcd: both operands are odd here */
+    {
+      if (ulimb > vlimb)
+	{
+	  ulimb -= vlimb;	/* odd - odd is even and nonzero... */
+	  do
+	    ulimb >>= 1;
+	  while ((ulimb & 1) == 0);	/* ...so strip its factors of 2 */
+	}
+      else /*  vlimb > ulimb.  */
+	{
+	  vlimb -= ulimb;
+	  do
+	    vlimb >>= 1;
+	  while ((vlimb & 1) == 0);
+	}
+    }
+
+  return ulimb << MIN (u_low_zero_bits, v_low_zero_bits);	/* restore shared 2s */
+}
diff --git a/ghc/rts/gmp/mpn/generic/gcdext.c b/ghc/rts/gmp/mpn/generic/gcdext.c
new file mode 100644
index 0000000..245e20a
--- /dev/null
+++ b/ghc/rts/gmp/mpn/generic/gcdext.c
@@ -0,0 +1,441 @@
+/* mpn_gcdext -- Extended Greatest Common Divisor.
+
+Copyright (C) 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef EXTEND
+#define EXTEND 1	/* EXTEND=1 compiles mpn_gcdext, EXTEND=0 plain mpn_gcd */
+#endif
+
+#if STAT
+int arr[BITS_PER_MP_LIMB];	/* debug histogram of matrix-entry bit sizes */
+#endif
+
+#define SGN(A) (((A) < 0) ? -1 : ((A) > 0))	/* sign of A: -1, 0 or +1 */
+
+/* Idea 1: After we have performed a full division, don't shift operands back,
+   but instead account for the extra factors-of-2 thus introduced.
+   Idea 2: Simple generalization to use divide-and-conquer would give us an
+   algorithm that runs faster than O(n^2).
+   Idea 3: The input numbers need less space as the computation progresses,
+   while the s0 and s1 variables need more space.  To save space, we
+   could make them share space, and have the latter variables grow
+   into the former.  */
+
+/* Precondition: U >= V.  */
+
+mp_size_t
+#if EXTEND
+#if __STDC__
+mpn_gcdext (mp_ptr gp, mp_ptr s0p,
+	    mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize)
+#else
+mpn_gcdext (gp, s0p, up, size, vp, vsize)
+     mp_ptr gp;		/* out: the gcd */
+     mp_ptr s0p;	/* out: cofactor of U -- presumably U*s0 == G mod V; confirm against callers */
+     mp_ptr up;		/* in: operand U (SIZE limbs); clobbered */
+     mp_size_t size;
+     mp_ptr vp;		/* in: operand V (VSIZE limbs); clobbered */
+     mp_size_t vsize;
+#endif
+#else
+#if __STDC__
+mpn_gcd (mp_ptr gp,
+	 mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize)
+#else
+mpn_gcd (gp, up, size, vp, vsize)
+     mp_ptr gp;
+     mp_ptr up;
+     mp_size_t size;
+     mp_ptr vp;
+     mp_size_t vsize;
+#endif
+#endif
+{
+  mp_limb_t uh, vh;		/* leading bits of U and V for Lehmer steps */
+  mp_limb_signed_t A, B, C, D;	/* 2x2 single-limb cofactor matrix */
+  int cnt;
+  mp_ptr tp, wp;	/* NOTE(review): allocated only under #if EXTEND below, yet used in both configurations -- EXTEND=0 build looks broken; confirm */
+#if RECORD
+  mp_limb_signed_t min = 0, max = 0;
+#endif
+#if EXTEND
+  mp_ptr s1p;
+  mp_ptr orig_s0p = s0p;
+  mp_size_t ssize, orig_size = size;
+  TMP_DECL (mark);	/* NOTE(review): declared only when EXTEND, but TMP_FREE below is unconditional; confirm EXTEND=0 compiles */
+
+  TMP_MARK (mark);
+
+  tp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB);	/* +1 limb for carries */
+  wp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB);
+  s1p = (mp_ptr) TMP_ALLOC (size * BYTES_PER_MP_LIMB);
+
+  MPN_ZERO (s0p, size);
+  MPN_ZERO (s1p, size);
+
+  s0p[0] = 1;	/* cofactor pair starts at the identity (1, 0) */
+  s1p[0] = 0;
+  ssize = 1;
+#endif
+
+  if (size > vsize)	/* U longer than V: one full division equalizes sizes */
+    {
+      /* Normalize V (and shift up U the same amount).  */
+      count_leading_zeros (cnt, vp[vsize - 1]);
+      if (cnt != 0)
+	{
+	  mp_limb_t cy;
+	  mpn_lshift (vp, vp, vsize, cnt);
+	  cy = mpn_lshift (up, up, size, cnt);
+	  up[size] = cy;
+	  size += cy != 0;
+	}
+
+      mpn_divmod (up + vsize, up, size, vp, vsize);	/* U <- U mod V (quotient discarded) */
+#if EXTEND
+      /* This is really what it boils down to in this case... */
+      s0p[0] = 0;
+      s1p[0] = 1;
+#endif
+      size = vsize;
+      if (cnt != 0)
+	{
+	  mpn_rshift (up, up, size, cnt);	/* undo the normalization shift */
+	  mpn_rshift (vp, vp, size, cnt);
+	}
+      {
+	mp_ptr xp;	/* swap roles so U >= V again */
+	xp = up; up = vp; vp = xp;
+      }
+    }
+
+  for (;;)	/* main Lehmer loop; U and V shrink each iteration */
+    {
+      /* Figure out exact size of V.  */
+      vsize = size;
+      MPN_NORMALIZE (vp, vsize);
+      if (vsize <= 1)
+	break;	/* V fits in one limb: finish single-precision below */
+
+      /* Make UH be the most significant limb of U, and make VH be
+	 corresponding bits from V.  */
+      uh = up[size - 1];
+      vh = vp[size - 1];
+      count_leading_zeros (cnt, uh);
+      if (cnt != 0)
+	{
+	  uh = (uh << cnt) | (up[size - 2] >> (BITS_PER_MP_LIMB - cnt));
+	  vh = (vh << cnt) | (vp[size - 2] >> (BITS_PER_MP_LIMB - cnt));
+	}
+
+#if 0
+      /* For now, only handle BITS_PER_MP_LIMB-1 bits.  This makes
+	 room for sign bit.  */
+      uh >>= 1;
+      vh >>= 1;
+#endif
+      A = 1;	/* matrix starts as the identity */
+      B = 0;
+      C = 0;
+      D = 1;
+
+      for (;;)	/* accumulate quotients while they are provably exact */
+	{
+	  mp_limb_signed_t q, T;
+	  if (vh + C == 0 || vh + D == 0)
+	    break;
+
+	  q = (uh + A) / (vh + C);
+	  if (q != (uh + B) / (vh + D))
+	    break;	/* bounds disagree: leading bits no longer decide q */
+
+	  T = A - q * C;
+	  A = C;
+	  C = T;
+	  T = B - q * D;
+	  B = D;
+	  D = T;
+	  T = uh - q * vh;
+	  uh = vh;
+	  vh = T;
+	}
+
+#if RECORD
+      min = MIN (A, min); min = MIN (B, min);
+      min = MIN (C, min); min = MIN (D, min);
+      max = MAX (A, max); max = MAX (B, max);
+      max = MAX (C, max); max = MAX (D, max);
+#endif
+
+      if (B == 0)	/* no single-limb progress: fall back to full division */
+	{
+	  mp_limb_t qh;
+	  mp_size_t i;
+
+	  /* This is quite rare.  I.e., optimize something else!  */
+
+	  /* Normalize V (and shift up U the same amount).  */
+	  count_leading_zeros (cnt, vp[vsize - 1]);
+	  if (cnt != 0)
+	    {
+	      mp_limb_t cy;
+	      mpn_lshift (vp, vp, vsize, cnt);
+	      cy = mpn_lshift (up, up, size, cnt);
+	      up[size] = cy;
+	      size += cy != 0;
+	    }
+
+	  qh = mpn_divmod (up + vsize, up, size, vp, vsize);
+#if EXTEND
+	  MPN_COPY (tp, s0p, ssize);
+	  for (i = 0; i < size - vsize; i++)	/* t = s0 + q * s1, limb by limb */
+	    {
+	      mp_limb_t cy;
+	      cy = mpn_addmul_1 (tp + i, s1p, ssize, up[vsize + i]);
+	      if (cy != 0)
+		tp[ssize++] = cy;
+	    }
+	  if (qh != 0)
+	    {
+	      mp_limb_t cy;
+	      abort ();	/* high quotient limb not handled -- dead code guard */
+	      /* XXX since qh == 1, mpn_addmul_1 is overkill */
+	      cy = mpn_addmul_1 (tp + size - vsize, s1p, ssize, qh);
+	      if (cy != 0)
+		tp[ssize++] = cy;
+	    }
+#if 0
+	  MPN_COPY (s0p, s1p, ssize); /* should be old ssize, kind of */
+	  MPN_COPY (s1p, tp, ssize);
+#else
+	  {
+	    mp_ptr xp;	/* rotate (s0, s1, t) by pointer swap instead of copying */
+	    xp = s0p; s0p = s1p; s1p = xp;
+	    xp = s1p; s1p = tp; tp = xp;
+	  }
+#endif
+#endif
+	  size = vsize;
+	  if (cnt != 0)
+	    {
+	      mpn_rshift (up, up, size, cnt);
+	      mpn_rshift (vp, vp, size, cnt);
+	    }
+
+	  {
+	    mp_ptr xp;	/* keep the invariant U >= V */
+	    xp = up; up = vp; vp = xp;
+	  }
+	  MPN_NORMALIZE (up, size);
+	}
+      else	/* apply the accumulated 2x2 matrix to U and V */
+	{
+	  /* T = U*A + V*B
+	     W = U*C + V*D
+	     U = T
+	     V = W	   */
+
+	  if (SGN(A) == SGN(B))	/* should be different sign */
+	    abort ();
+	  if (SGN(C) == SGN(D))	/* should be different sign */
+	    abort ();
+#if STAT
+	  { mp_limb_t x;
+	    x = ABS (A) | ABS (B) | ABS (C) | ABS (D);
+	    count_leading_zeros (cnt, x);
+	    arr[BITS_PER_MP_LIMB - cnt]++; }
+#endif
+	  if (A == 0)	/* matrix row is (0, 1): T is just V */
+	    {
+	      if (B != 1) abort ();
+	      MPN_COPY (tp, vp, size);
+	    }
+	  else
+	    {
+	      if (A < 0)	/* signs alternate, so one term is subtracted */
+		{
+		  mpn_mul_1 (tp, vp, size, B);
+		  mpn_submul_1 (tp, up, size, -A);
+		}
+	      else
+		{
+		  mpn_mul_1 (tp, up, size, A);
+		  mpn_submul_1 (tp, vp, size, -B);
+		}
+	    }
+	  if (C < 0)
+	    {
+	      mpn_mul_1 (wp, vp, size, D);
+	      mpn_submul_1 (wp, up, size, -C);
+	    }
+	  else
+	    {
+	      mpn_mul_1 (wp, up, size, C);
+	      mpn_submul_1 (wp, vp, size, -D);
+	    }
+
+	  {
+	    mp_ptr xp;	/* U <- T, V <- W by swapping buffers */
+	    xp = tp; tp = up; up = xp;
+	    xp = wp; wp = vp; vp = xp;
+	  }
+
+#if EXTEND
+	  { mp_limb_t cy;	/* update cofactors with the same matrix */
+	    MPN_ZERO (tp, orig_size);
+	    if (A == 0)
+	      {
+		if (B != 1) abort ();
+		MPN_COPY (tp, s1p, ssize);
+	      }
+	    else
+	      {
+		if (A < 0)
+		  {
+		    cy = mpn_mul_1 (tp, s1p, ssize, B);
+		    cy += mpn_addmul_1 (tp, s0p, ssize, -A);
+		  }
+		else
+		  {
+		    cy = mpn_mul_1 (tp, s0p, ssize, A);
+		    cy += mpn_addmul_1 (tp, s1p, ssize, -B);
+		  }
+		if (cy != 0)
+		  tp[ssize++] = cy;
+	      }
+	    MPN_ZERO (wp, orig_size);
+	    if (C < 0)
+	      {
+		cy = mpn_mul_1 (wp, s1p, ssize, D);
+		cy += mpn_addmul_1 (wp, s0p, ssize, -C);
+	      }
+	    else
+	      {
+		cy = mpn_mul_1 (wp, s0p, ssize, C);
+		cy += mpn_addmul_1 (wp, s1p, ssize, -D);
+	      }
+	    if (cy != 0)
+	      wp[ssize++] = cy;
+	  }
+	  {
+	    mp_ptr xp;	/* s0 <- t, s1 <- w by swapping buffers */
+	    xp = tp; tp = s0p; s0p = xp;
+	    xp = wp; wp = s1p; s1p = xp;
+	  }
+#endif
+#if 0	/* Is it a win to remove multiple zeros here? */
+	  MPN_NORMALIZE (up, size);
+#else
+	  if (up[size - 1] == 0)
+	    size--;
+#endif
+	}
+    }
+
+#if RECORD
+  printf ("min: %ld\n", min);
+  printf ("max: %ld\n", max);
+#endif
+
+  if (vsize == 0)	/* V exhausted: U holds the gcd */
+    {
+      if (gp != up)
+	MPN_COPY (gp, up, size);
+#if EXTEND
+      if (orig_s0p != s0p)	/* buffer swaps may have moved s0 off the caller's area */
+	MPN_COPY (orig_s0p, s0p, ssize);
+#endif
+      TMP_FREE (mark);
+      return size;
+    }
+  else	/* V is a single limb: finish with limb-sized Euclid */
+    {
+      mp_limb_t vl, ul, t;
+#if EXTEND
+      mp_limb_t cy;
+      mp_size_t i;
+#endif
+      vl = vp[0];
+#if EXTEND
+      t = mpn_divmod_1 (wp, up, size, vl);	/* wp = U / vl, t = U mod vl */
+      MPN_COPY (tp, s0p, ssize);
+      for (i = 0; i < size; i++)	/* t = s0 + (U/vl) * s1 */
+	{
+	  cy = mpn_addmul_1 (tp + i, s1p, ssize, wp[i]);
+	  if (cy != 0)
+	    tp[ssize++] = cy;
+	}
+#if 0
+      MPN_COPY (s0p, s1p, ssize);
+      MPN_COPY (s1p, tp, ssize);
+#else
+      {
+	mp_ptr xp;
+	xp = s0p; s0p = s1p; s1p = xp;
+	xp = s1p; s1p = tp; tp = xp;
+      }
+#endif
+#else
+      t = mpn_mod_1 (up, size, vl);
+#endif
+      ul = vl;
+      vl = t;
+      while (vl != 0)	/* plain Euclid on limbs, cofactors updated in step */
+	{
+	  mp_limb_t t;	/* shadows the outer t deliberately kept local here */
+#if EXTEND
+	  mp_limb_t q, cy;
+	  q = ul / vl;
+	  t = ul - q*vl;	/* remainder, avoiding a second division */
+
+	  MPN_COPY (tp, s0p, ssize);
+	  cy = mpn_addmul_1 (tp, s1p, ssize, q);
+	  if (cy != 0)
+	    tp[ssize++] = cy;
+#if 0
+	  MPN_COPY (s0p, s1p, ssize);
+	  MPN_COPY (s1p, tp, ssize);
+#else
+	  {
+	    mp_ptr xp;
+	    xp = s0p; s0p = s1p; s1p = xp;
+	    xp = s1p; s1p = tp; tp = xp;
+	  }
+#endif
+
+#else
+	  t = ul % vl;
+#endif
+	  ul = vl;
+	  vl = t;
+	}
+      gp[0] = ul;	/* single-limb gcd */
+#if EXTEND
+      if (orig_s0p != s0p)
+	MPN_COPY (orig_s0p, s0p, ssize);
+#endif
+      TMP_FREE (mark);
+      return 1;
+    }
+}
-- 
1.7.10.4