+++ /dev/null
-/* mpn_mul_n and helper function -- Multiply/square natural numbers.
-
- THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul_n)
- ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH
- THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED
- THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
-
-
-Copyright (C) 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software
-Foundation, Inc.
-
-This file is part of the GNU MP Library.
-
-The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of the GNU Lesser General Public License as published by
-the Free Software Foundation; either version 2.1 of the License, or (at your
-option) any later version.
-
-The GNU MP Library is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-License for more details.
-
-You should have received a copy of the GNU Lesser General Public License
-along with the GNU MP Library; see the file COPYING.LIB. If not, write to
-the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
-MA 02111-1307, USA. */
-
-#include "gmp.h"
-#include "gmp-impl.h"
-#include "longlong.h"
-
-
-/* Multiplicative inverse of 3, modulo 2^BITS_PER_MP_LIMB.
- 0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. */
-#define INVERSE_3 ((MP_LIMB_T_MAX / 3) * 2 + 1)
-
-#if !defined (__alpha) && !defined (__mips)
-/* For all other machines, we want to call mpn functions for the compund
- operations instead of open-coding them. */
-#define USE_MORE_MPN
-#endif
-
-/*== Function declarations =================================================*/
-
-static void evaluate3 _PROTO ((mp_ptr, mp_ptr, mp_ptr,
- mp_ptr, mp_ptr, mp_ptr,
- mp_srcptr, mp_srcptr, mp_srcptr,
- mp_size_t, mp_size_t));
-static void interpolate3 _PROTO ((mp_srcptr,
- mp_ptr, mp_ptr, mp_ptr,
- mp_srcptr,
- mp_ptr, mp_ptr, mp_ptr,
- mp_size_t, mp_size_t));
-static mp_limb_t add2Times _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
-
-
-/*-- mpn_kara_mul_n ---------------------------------------------------------------*/
-
-/* Multiplies using 3 half-sized mults and so on recursively.
- * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
- * No overlap of p[...] with a[...] or b[...].
- * ws is workspace.
- */
-
-void
-#if __STDC__
-mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
-#else
-mpn_kara_mul_n(p, a, b, n, ws)
- mp_ptr p;
- mp_srcptr a;
- mp_srcptr b;
- mp_size_t n;
- mp_ptr ws;
-#endif
-{
- mp_limb_t i, sign, w, w0, w1;
- mp_size_t n2;
- mp_srcptr x, y;
-
- n2 = n >> 1;
- ASSERT (n2 > 0);
-
- if (n & 1)
- {
- /* Odd length. */
- mp_size_t n1, n3, nm1;
-
- n3 = n - n2;
-
- sign = 0;
- w = a[n2];
- if (w != 0)
- w -= mpn_sub_n (p, a, a + n3, n2);
- else
- {
- i = n2;
- do
- {
- --i;
- w0 = a[i];
- w1 = a[n3+i];
- }
- while (w0 == w1 && i != 0);
- if (w0 < w1)
- {
- x = a + n3;
- y = a;
- sign = 1;
- }
- else
- {
- x = a;
- y = a + n3;
- }
- mpn_sub_n (p, x, y, n2);
- }
- p[n2] = w;
-
- w = b[n2];
- if (w != 0)
- w -= mpn_sub_n (p + n3, b, b + n3, n2);
- else
- {
- i = n2;
- do
- {
- --i;
- w0 = b[i];
- w1 = b[n3+i];
- }
- while (w0 == w1 && i != 0);
- if (w0 < w1)
- {
- x = b + n3;
- y = b;
- sign ^= 1;
- }
- else
- {
- x = b;
- y = b + n3;
- }
- mpn_sub_n (p + n3, x, y, n2);
- }
- p[n] = w;
-
- n1 = n + 1;
- if (n2 < KARATSUBA_MUL_THRESHOLD)
- {
- if (n3 < KARATSUBA_MUL_THRESHOLD)
- {
- mpn_mul_basecase (ws, p, n3, p + n3, n3);
- mpn_mul_basecase (p, a, n3, b, n3);
- }
- else
- {
- mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
- mpn_kara_mul_n (p, a, b, n3, ws + n1);
- }
- mpn_mul_basecase (p + n1, a + n3, n2, b + n3, n2);
- }
- else
- {
- mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
- mpn_kara_mul_n (p, a, b, n3, ws + n1);
- mpn_kara_mul_n (p + n1, a + n3, b + n3, n2, ws + n1);
- }
-
- if (sign)
- mpn_add_n (ws, p, ws, n1);
- else
- mpn_sub_n (ws, p, ws, n1);
-
- nm1 = n - 1;
- if (mpn_add_n (ws, p + n1, ws, nm1))
- {
- mp_limb_t x = ws[nm1] + 1;
- ws[nm1] = x;
- if (x == 0)
- ++ws[n];
- }
- if (mpn_add_n (p + n3, p + n3, ws, n1))
- {
- mp_limb_t x;
- i = n1 + n3;
- do
- {
- x = p[i] + 1;
- p[i] = x;
- ++i;
- } while (x == 0);
- }
- }
- else
- {
- /* Even length. */
- mp_limb_t t;
-
- i = n2;
- do
- {
- --i;
- w0 = a[i];
- w1 = a[n2+i];
- }
- while (w0 == w1 && i != 0);
- sign = 0;
- if (w0 < w1)
- {
- x = a + n2;
- y = a;
- sign = 1;
- }
- else
- {
- x = a;
- y = a + n2;
- }
- mpn_sub_n (p, x, y, n2);
-
- i = n2;
- do
- {
- --i;
- w0 = b[i];
- w1 = b[n2+i];
- }
- while (w0 == w1 && i != 0);
- if (w0 < w1)
- {
- x = b + n2;
- y = b;
- sign ^= 1;
- }
- else
- {
- x = b;
- y = b + n2;
- }
- mpn_sub_n (p + n2, x, y, n2);
-
- /* Pointwise products. */
- if (n2 < KARATSUBA_MUL_THRESHOLD)
- {
- mpn_mul_basecase (ws, p, n2, p + n2, n2);
- mpn_mul_basecase (p, a, n2, b, n2);
- mpn_mul_basecase (p + n, a + n2, n2, b + n2, n2);
- }
- else
- {
- mpn_kara_mul_n (ws, p, p + n2, n2, ws + n);
- mpn_kara_mul_n (p, a, b, n2, ws + n);
- mpn_kara_mul_n (p + n, a + n2, b + n2, n2, ws + n);
- }
-
- /* Interpolate. */
- if (sign)
- w = mpn_add_n (ws, p, ws, n);
- else
- w = -mpn_sub_n (ws, p, ws, n);
- w += mpn_add_n (ws, p + n, ws, n);
- w += mpn_add_n (p + n2, p + n2, ws, n);
- /* TO DO: could put "if (w) { ... }" here.
- * Less work but badly predicted branch.
- * No measurable difference in speed on Alpha.
- */
- i = n + n2;
- t = p[i] + w;
- p[i] = t;
- if (t < w)
- {
- do
- {
- ++i;
- w = p[i] + 1;
- p[i] = w;
- }
- while (w == 0);
- }
- }
-}
-
-void
-#if __STDC__
-mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
-#else
-mpn_kara_sqr_n (p, a, n, ws)
- mp_ptr p;
- mp_srcptr a;
- mp_size_t n;
- mp_ptr ws;
-#endif
-{
- mp_limb_t i, sign, w, w0, w1;
- mp_size_t n2;
- mp_srcptr x, y;
-
- n2 = n >> 1;
- ASSERT (n2 > 0);
-
- if (n & 1)
- {
- /* Odd length. */
- mp_size_t n1, n3, nm1;
-
- n3 = n - n2;
-
- sign = 0;
- w = a[n2];
- if (w != 0)
- w -= mpn_sub_n (p, a, a + n3, n2);
- else
- {
- i = n2;
- do
- {
- --i;
- w0 = a[i];
- w1 = a[n3+i];
- }
- while (w0 == w1 && i != 0);
- if (w0 < w1)
- {
- x = a + n3;
- y = a;
- sign = 1;
- }
- else
- {
- x = a;
- y = a + n3;
- }
- mpn_sub_n (p, x, y, n2);
- }
- p[n2] = w;
-
- w = a[n2];
- if (w != 0)
- w -= mpn_sub_n (p + n3, a, a + n3, n2);
- else
- {
- i = n2;
- do
- {
- --i;
- w0 = a[i];
- w1 = a[n3+i];
- }
- while (w0 == w1 && i != 0);
- if (w0 < w1)
- {
- x = a + n3;
- y = a;
- sign ^= 1;
- }
- else
- {
- x = a;
- y = a + n3;
- }
- mpn_sub_n (p + n3, x, y, n2);
- }
- p[n] = w;
-
- n1 = n + 1;
- if (n2 < KARATSUBA_SQR_THRESHOLD)
- {
- if (n3 < KARATSUBA_SQR_THRESHOLD)
- {
- mpn_sqr_basecase (ws, p, n3);
- mpn_sqr_basecase (p, a, n3);
- }
- else
- {
- mpn_kara_sqr_n (ws, p, n3, ws + n1);
- mpn_kara_sqr_n (p, a, n3, ws + n1);
- }
- mpn_sqr_basecase (p + n1, a + n3, n2);
- }
- else
- {
- mpn_kara_sqr_n (ws, p, n3, ws + n1);
- mpn_kara_sqr_n (p, a, n3, ws + n1);
- mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1);
- }
-
- if (sign)
- mpn_add_n (ws, p, ws, n1);
- else
- mpn_sub_n (ws, p, ws, n1);
-
- nm1 = n - 1;
- if (mpn_add_n (ws, p + n1, ws, nm1))
- {
- mp_limb_t x = ws[nm1] + 1;
- ws[nm1] = x;
- if (x == 0)
- ++ws[n];
- }
- if (mpn_add_n (p + n3, p + n3, ws, n1))
- {
- mp_limb_t x;
- i = n1 + n3;
- do
- {
- x = p[i] + 1;
- p[i] = x;
- ++i;
- } while (x == 0);
- }
- }
- else
- {
- /* Even length. */
- mp_limb_t t;
-
- i = n2;
- do
- {
- --i;
- w0 = a[i];
- w1 = a[n2+i];
- }
- while (w0 == w1 && i != 0);
- sign = 0;
- if (w0 < w1)
- {
- x = a + n2;
- y = a;
- sign = 1;
- }
- else
- {
- x = a;
- y = a + n2;
- }
- mpn_sub_n (p, x, y, n2);
-
- i = n2;
- do
- {
- --i;
- w0 = a[i];
- w1 = a[n2+i];
- }
- while (w0 == w1 && i != 0);
- if (w0 < w1)
- {
- x = a + n2;
- y = a;
- sign ^= 1;
- }
- else
- {
- x = a;
- y = a + n2;
- }
- mpn_sub_n (p + n2, x, y, n2);
-
- /* Pointwise products. */
- if (n2 < KARATSUBA_SQR_THRESHOLD)
- {
- mpn_sqr_basecase (ws, p, n2);
- mpn_sqr_basecase (p, a, n2);
- mpn_sqr_basecase (p + n, a + n2, n2);
- }
- else
- {
- mpn_kara_sqr_n (ws, p, n2, ws + n);
- mpn_kara_sqr_n (p, a, n2, ws + n);
- mpn_kara_sqr_n (p + n, a + n2, n2, ws + n);
- }
-
- /* Interpolate. */
- if (sign)
- w = mpn_add_n (ws, p, ws, n);
- else
- w = -mpn_sub_n (ws, p, ws, n);
- w += mpn_add_n (ws, p + n, ws, n);
- w += mpn_add_n (p + n2, p + n2, ws, n);
- /* TO DO: could put "if (w) { ... }" here.
- * Less work but badly predicted branch.
- * No measurable difference in speed on Alpha.
- */
- i = n + n2;
- t = p[i] + w;
- p[i] = t;
- if (t < w)
- {
- do
- {
- ++i;
- w = p[i] + 1;
- p[i] = w;
- }
- while (w == 0);
- }
- }
-}
-
-/*-- add2Times -------------------------------------------------------------*/
-
-/* z[] = x[] + 2 * y[]
- Note that z and x might point to the same vectors. */
-#ifdef USE_MORE_MPN
-static inline mp_limb_t
-#if __STDC__
-add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n)
-#else
-add2Times (z, x, y, n)
- mp_ptr z;
- mp_srcptr x;
- mp_srcptr y;
- mp_size_t n;
-#endif
-{
- mp_ptr t;
- mp_limb_t c;
- TMP_DECL (marker);
- TMP_MARK (marker);
- t = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB);
- c = mpn_lshift (t, y, n, 1);
- c += mpn_add_n (z, x, t, n);
- TMP_FREE (marker);
- return c;
-}
-#else
-
-static mp_limb_t
-#if __STDC__
-add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n)
-#else
-add2Times (z, x, y, n)
- mp_ptr z;
- mp_srcptr x;
- mp_srcptr y;
- mp_size_t n;
-#endif
-{
- mp_limb_t c, v, w;
-
- ASSERT (n > 0);
- v = *x; w = *y;
- c = w >> (BITS_PER_MP_LIMB - 1);
- w <<= 1;
- v += w;
- c += v < w;
- *z = v;
- ++x; ++y; ++z;
- while (--n)
- {
- v = *x;
- w = *y;
- v += c;
- c = v < c;
- c += w >> (BITS_PER_MP_LIMB - 1);
- w <<= 1;
- v += w;
- c += v < w;
- *z = v;
- ++x; ++y; ++z;
- }
-
- return c;
-}
-#endif
-
-/*-- evaluate3 -------------------------------------------------------------*/
-
-/* Evaluates:
- * ph := 4*A+2*B+C
- * p1 := A+B+C
- * p2 := A+2*B+4*C
- * where:
- * ph[], p1[], p2[], A[] and B[] all have length len,
- * C[] has length len2 with len-len2 = 0, 1 or 2.
- * Returns top words (overflow) at pth, pt1 and pt2 respectively.
- */
-#ifdef USE_MORE_MPN
-static void
-#if __STDC__
-evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2,
- mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t len, mp_size_t len2)
-#else
-evaluate3 (ph, p1, p2, pth, pt1, pt2,
- A, B, C, len, len2)
- mp_ptr ph;
- mp_ptr p1;
- mp_ptr p2;
- mp_ptr pth;
- mp_ptr pt1;
- mp_ptr pt2;
- mp_srcptr A;
- mp_srcptr B;
- mp_srcptr C;
- mp_size_t len;
- mp_size_t len2;
-#endif
-{
- mp_limb_t c, d, e;
-
- ASSERT (len - len2 <= 2);
-
- e = mpn_lshift (p1, B, len, 1);
-
- c = mpn_lshift (ph, A, len, 2);
- c += e + mpn_add_n (ph, ph, p1, len);
- d = mpn_add_n (ph, ph, C, len2);
- if (len2 == len) c += d; else c += mpn_add_1 (ph + len2, ph + len2, len-len2, d);
- ASSERT (c < 7);
- *pth = c;
-
- c = mpn_lshift (p2, C, len2, 2);
-#if 1
- if (len2 != len) { p2[len-1] = 0; p2[len2] = c; c = 0; }
- c += e + mpn_add_n (p2, p2, p1, len);
-#else
- d = mpn_add_n (p2, p2, p1, len2);
- c += d;
- if (len2 != len) c = mpn_add_1 (p2+len2, p1+len2, len-len2, c);
- c += e;
-#endif
- c += mpn_add_n (p2, p2, A, len);
- ASSERT (c < 7);
- *pt2 = c;
-
- c = mpn_add_n (p1, A, B, len);
- d = mpn_add_n (p1, p1, C, len2);
- if (len2 == len) c += d;
- else c += mpn_add_1 (p1+len2, p1+len2, len-len2, d);
- ASSERT (c < 3);
- *pt1 = c;
-
-}
-
-#else
-
-static void
-#if __STDC__
-evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2,
- mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t l, mp_size_t ls)
-#else
-evaluate3 (ph, p1, p2, pth, pt1, pt2,
- A, B, C, l, ls)
- mp_ptr ph;
- mp_ptr p1;
- mp_ptr p2;
- mp_ptr pth;
- mp_ptr pt1;
- mp_ptr pt2;
- mp_srcptr A;
- mp_srcptr B;
- mp_srcptr C;
- mp_size_t l;
- mp_size_t ls;
-#endif
-{
- mp_limb_t a,b,c, i, t, th,t1,t2, vh,v1,v2;
-
- ASSERT (l - ls <= 2);
-
- th = t1 = t2 = 0;
- for (i = 0; i < l; ++i)
- {
- a = *A;
- b = *B;
- c = i < ls ? *C : 0;
-
- /* TO DO: choose one of the following alternatives. */
-#if 0
- t = a << 2;
- vh = th + t;
- th = vh < t;
- th += a >> (BITS_PER_MP_LIMB - 2);
- t = b << 1;
- vh += t;
- th += vh < t;
- th += b >> (BITS_PER_MP_LIMB - 1);
- vh += c;
- th += vh < c;
-#else
- vh = th + c;
- th = vh < c;
- t = b << 1;
- vh += t;
- th += vh < t;
- th += b >> (BITS_PER_MP_LIMB - 1);
- t = a << 2;
- vh += t;
- th += vh < t;
- th += a >> (BITS_PER_MP_LIMB - 2);
-#endif
-
- v1 = t1 + a;
- t1 = v1 < a;
- v1 += b;
- t1 += v1 < b;
- v1 += c;
- t1 += v1 < c;
-
- v2 = t2 + a;
- t2 = v2 < a;
- t = b << 1;
- v2 += t;
- t2 += v2 < t;
- t2 += b >> (BITS_PER_MP_LIMB - 1);
- t = c << 2;
- v2 += t;
- t2 += v2 < t;
- t2 += c >> (BITS_PER_MP_LIMB - 2);
-
- *ph = vh;
- *p1 = v1;
- *p2 = v2;
-
- ++A; ++B; ++C;
- ++ph; ++p1; ++p2;
- }
-
- ASSERT (th < 7);
- ASSERT (t1 < 3);
- ASSERT (t2 < 7);
-
- *pth = th;
- *pt1 = t1;
- *pt2 = t2;
-}
-#endif
-
-
-/*-- interpolate3 ----------------------------------------------------------*/
-
-/* Interpolates B, C, D (in-place) from:
- * 16*A+8*B+4*C+2*D+E
- * A+B+C+D+E
- * A+2*B+4*C+8*D+16*E
- * where:
- * A[], B[], C[] and D[] all have length l,
- * E[] has length ls with l-ls = 0, 2 or 4.
- *
- * Reads top words (from earlier overflow) from ptb, ptc and ptd,
- * and returns new top words there.
- */
-
-#ifdef USE_MORE_MPN
-static void
-#if __STDC__
-interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E,
- mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t len, mp_size_t len2)
-#else
-interpolate3 (A, B, C, D, E,
- ptb, ptc, ptd, len, len2)
- mp_srcptr A;
- mp_ptr B;
- mp_ptr C;
- mp_ptr D;
- mp_srcptr E;
- mp_ptr ptb;
- mp_ptr ptc;
- mp_ptr ptd;
- mp_size_t len;
- mp_size_t len2;
-#endif
-{
- mp_ptr ws;
- mp_limb_t t, tb,tc,td;
- TMP_DECL (marker);
- TMP_MARK (marker);
-
- ASSERT (len - len2 == 0 || len - len2 == 2 || len - len2 == 4);
-
- /* Let x1, x2, x3 be the values to interpolate. We have:
- * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e
- * c = a + x1 + x2 + x3 + e
- * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e
- */
-
- ws = (mp_ptr) TMP_ALLOC (len * BYTES_PER_MP_LIMB);
-
- tb = *ptb; tc = *ptc; td = *ptd;
-
-
- /* b := b - 16*a - e
- * c := c - a - e
- * d := d - a - 16*e
- */
-
- t = mpn_lshift (ws, A, len, 4);
- tb -= t + mpn_sub_n (B, B, ws, len);
- t = mpn_sub_n (B, B, E, len2);
- if (len2 == len) tb -= t;
- else tb -= mpn_sub_1 (B+len2, B+len2, len-len2, t);
-
- tc -= mpn_sub_n (C, C, A, len);
- t = mpn_sub_n (C, C, E, len2);
- if (len2 == len) tc -= t;
- else tc -= mpn_sub_1 (C+len2, C+len2, len-len2, t);
-
- t = mpn_lshift (ws, E, len2, 4);
- t += mpn_add_n (ws, ws, A, len2);
-#if 1
- if (len2 != len) t = mpn_add_1 (ws+len2, A+len2, len-len2, t);
- td -= t + mpn_sub_n (D, D, ws, len);
-#else
- t += mpn_sub_n (D, D, ws, len2);
- if (len2 != len) {
- t = mpn_sub_1 (D+len2, D+len2, len-len2, t);
- t += mpn_sub_n (D+len2, D+len2, A+len2, len-len2);
- } /* end if/else */
- td -= t;
-#endif
-
-
- /* b, d := b + d, b - d */
-
-#ifdef HAVE_MPN_ADD_SUB_N
- /* #error TO DO ... */
-#else
- t = tb + td + mpn_add_n (ws, B, D, len);
- td = tb - td - mpn_sub_n (D, B, D, len);
- tb = t;
- MPN_COPY (B, ws, len);
-#endif
-
- /* b := b-8*c */
- t = 8 * tc + mpn_lshift (ws, C, len, 3);
- tb -= t + mpn_sub_n (B, B, ws, len);
-
- /* c := 2*c - b */
- tc = 2 * tc + mpn_lshift (C, C, len, 1);
- tc -= tb + mpn_sub_n (C, C, B, len);
-
- /* d := d/3 */
- td = (td - mpn_divexact_by3 (D, D, len)) * INVERSE_3;
-
- /* b, d := b + d, b - d */
-#ifdef HAVE_MPN_ADD_SUB_N
- /* #error TO DO ... */
-#else
- t = tb + td + mpn_add_n (ws, B, D, len);
- td = tb - td - mpn_sub_n (D, B, D, len);
- tb = t;
- MPN_COPY (B, ws, len);
-#endif
-
- /* Now:
- * b = 4*x1
- * c = 2*x2
- * d = 4*x3
- */
-
- ASSERT(!(*B & 3));
- mpn_rshift (B, B, len, 2);
- B[len-1] |= tb<<(BITS_PER_MP_LIMB-2);
- ASSERT((long)tb >= 0);
- tb >>= 2;
-
- ASSERT(!(*C & 1));
- mpn_rshift (C, C, len, 1);
- C[len-1] |= tc<<(BITS_PER_MP_LIMB-1);
- ASSERT((long)tc >= 0);
- tc >>= 1;
-
- ASSERT(!(*D & 3));
- mpn_rshift (D, D, len, 2);
- D[len-1] |= td<<(BITS_PER_MP_LIMB-2);
- ASSERT((long)td >= 0);
- td >>= 2;
-
-#if WANT_ASSERT
- ASSERT (tb < 2);
- if (len == len2)
- {
- ASSERT (tc < 3);
- ASSERT (td < 2);
- }
- else
- {
- ASSERT (tc < 2);
- ASSERT (!td);
- }
-#endif
-
- *ptb = tb;
- *ptc = tc;
- *ptd = td;
-
- TMP_FREE (marker);
-}
-
-#else
-
-static void
-#if __STDC__
-interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E,
- mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t l, mp_size_t ls)
-#else
-interpolate3 (A, B, C, D, E,
- ptb, ptc, ptd, l, ls)
- mp_srcptr A;
- mp_ptr B;
- mp_ptr C;
- mp_ptr D;
- mp_srcptr E;
- mp_ptr ptb;
- mp_ptr ptc;
- mp_ptr ptd;
- mp_size_t l;
- mp_size_t ls;
-#endif
-{
- mp_limb_t a,b,c,d,e,t, i, sb,sc,sd, ob,oc,od;
- const mp_limb_t maskOffHalf = (~(mp_limb_t) 0) << (BITS_PER_MP_LIMB >> 1);
-
-#if WANT_ASSERT
- t = l - ls;
- ASSERT (t == 0 || t == 2 || t == 4);
-#endif
-
- sb = sc = sd = 0;
- for (i = 0; i < l; ++i)
- {
- mp_limb_t tb, tc, td, tt;
-
- a = *A;
- b = *B;
- c = *C;
- d = *D;
- e = i < ls ? *E : 0;
-
- /* Let x1, x2, x3 be the values to interpolate. We have:
- * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e
- * c = a + x1 + x2 + x3 + e
- * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e
- */
-
- /* b := b - 16*a - e
- * c := c - a - e
- * d := d - a - 16*e
- */
- t = a << 4;
- tb = -(a >> (BITS_PER_MP_LIMB - 4)) - (b < t);
- b -= t;
- tb -= b < e;
- b -= e;
- tc = -(c < a);
- c -= a;
- tc -= c < e;
- c -= e;
- td = -(d < a);
- d -= a;
- t = e << 4;
- td = td - (e >> (BITS_PER_MP_LIMB - 4)) - (d < t);
- d -= t;
-
- /* b, d := b + d, b - d */
- t = b + d;
- tt = tb + td + (t < b);
- td = tb - td - (b < d);
- d = b - d;
- b = t;
- tb = tt;
-
- /* b := b-8*c */
- t = c << 3;
- tb = tb - (tc << 3) - (c >> (BITS_PER_MP_LIMB - 3)) - (b < t);
- b -= t;
-
- /* c := 2*c - b */
- t = c << 1;
- tc = (tc << 1) + (c >> (BITS_PER_MP_LIMB - 1)) - tb - (t < b);
- c = t - b;
-
- /* d := d/3 */
- d *= INVERSE_3;
- td = td - (d >> (BITS_PER_MP_LIMB - 1)) - (d*3 < d);
- td *= INVERSE_3;
-
- /* b, d := b + d, b - d */
- t = b + d;
- tt = tb + td + (t < b);
- td = tb - td - (b < d);
- d = b - d;
- b = t;
- tb = tt;
-
- /* Now:
- * b = 4*x1
- * c = 2*x2
- * d = 4*x3
- */
-
- /* sb has period 2. */
- b += sb;
- tb += b < sb;
- sb &= maskOffHalf;
- sb |= sb >> (BITS_PER_MP_LIMB >> 1);
- sb += tb;
-
- /* sc has period 1. */
- c += sc;
- tc += c < sc;
- /* TO DO: choose one of the following alternatives. */
-#if 1
- sc = (mp_limb_t)((long)sc >> (BITS_PER_MP_LIMB - 1));
- sc += tc;
-#else
- sc = tc - ((long)sc < 0L);
-#endif
-
- /* sd has period 2. */
- d += sd;
- td += d < sd;
- sd &= maskOffHalf;
- sd |= sd >> (BITS_PER_MP_LIMB >> 1);
- sd += td;
-
- if (i != 0)
- {
- B[-1] = ob | b << (BITS_PER_MP_LIMB - 2);
- C[-1] = oc | c << (BITS_PER_MP_LIMB - 1);
- D[-1] = od | d << (BITS_PER_MP_LIMB - 2);
- }
- ob = b >> 2;
- oc = c >> 1;
- od = d >> 2;
-
- ++A; ++B; ++C; ++D; ++E;
- }
-
- /* Handle top words. */
- b = *ptb;
- c = *ptc;
- d = *ptd;
-
- t = b + d;
- d = b - d;
- b = t;
- b -= c << 3;
- c = (c << 1) - b;
- d *= INVERSE_3;
- t = b + d;
- d = b - d;
- b = t;
-
- b += sb;
- c += sc;
- d += sd;
-
- B[-1] = ob | b << (BITS_PER_MP_LIMB - 2);
- C[-1] = oc | c << (BITS_PER_MP_LIMB - 1);
- D[-1] = od | d << (BITS_PER_MP_LIMB - 2);
-
- b >>= 2;
- c >>= 1;
- d >>= 2;
-
-#if WANT_ASSERT
- ASSERT (b < 2);
- if (l == ls)
- {
- ASSERT (c < 3);
- ASSERT (d < 2);
- }
- else
- {
- ASSERT (c < 2);
- ASSERT (!d);
- }
-#endif
-
- *ptb = b;
- *ptc = c;
- *ptd = d;
-}
-#endif
-
-
-/*-- mpn_toom3_mul_n --------------------------------------------------------------*/
-
-/* Multiplies using 5 mults of one third size and so on recursively.
- * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
- * No overlap of p[...] with a[...] or b[...].
- * ws is workspace.
- */
-
-/* TO DO: If TOOM3_MUL_THRESHOLD is much bigger than KARATSUBA_MUL_THRESHOLD then the
- * recursion in mpn_toom3_mul_n() will always bottom out with mpn_kara_mul_n()
- * because the "n < KARATSUBA_MUL_THRESHOLD" test here will always be false.
- */
-
-#define TOOM3_MUL_REC(p, a, b, n, ws) \
- do { \
- if (n < KARATSUBA_MUL_THRESHOLD) \
- mpn_mul_basecase (p, a, n, b, n); \
- else if (n < TOOM3_MUL_THRESHOLD) \
- mpn_kara_mul_n (p, a, b, n, ws); \
- else \
- mpn_toom3_mul_n (p, a, b, n, ws); \
- } while (0)
-
-void
-#if __STDC__
-mpn_toom3_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
-#else
-mpn_toom3_mul_n (p, a, b, n, ws)
- mp_ptr p;
- mp_srcptr a;
- mp_srcptr b;
- mp_size_t n;
- mp_ptr ws;
-#endif
-{
- mp_limb_t cB,cC,cD, dB,dC,dD, tB,tC,tD;
- mp_limb_t *A,*B,*C,*D,*E, *W;
- mp_size_t l,l2,l3,l4,l5,ls;
-
- /* Break n words into chunks of size l, l and ls.
- * n = 3*k => l = k, ls = k
- * n = 3*k+1 => l = k+1, ls = k-1
- * n = 3*k+2 => l = k+1, ls = k
- */
- {
- mp_limb_t m;
-
- ASSERT (n >= TOOM3_MUL_THRESHOLD);
- l = ls = n / 3;
- m = n - l * 3;
- if (m != 0)
- ++l;
- if (m == 1)
- --ls;
-
- l2 = l * 2;
- l3 = l * 3;
- l4 = l * 4;
- l5 = l * 5;
- A = p;
- B = ws;
- C = p + l2;
- D = ws + l2;
- E = p + l4;
- W = ws + l4;
- }
-
- /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/
- evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls);
- evaluate3 (A + l, B + l, C + l, &dB, &dC, &dD, b, b + l, b + l2, l, ls);
-
- /** Second stage: pointwise multiplies. **/
- TOOM3_MUL_REC(D, C, C + l, l, W);
- tD = cD*dD;
- if (cD) tD += mpn_addmul_1 (D + l, C + l, l, cD);
- if (dD) tD += mpn_addmul_1 (D + l, C, l, dD);
- ASSERT (tD < 49);
- TOOM3_MUL_REC(C, B, B + l, l, W);
- tC = cC*dC;
- /* TO DO: choose one of the following alternatives. */
-#if 0
- if (cC) tC += mpn_addmul_1 (C + l, B + l, l, cC);
- if (dC) tC += mpn_addmul_1 (C + l, B, l, dC);
-#else
- if (cC)
- {
- if (cC == 1) tC += mpn_add_n (C + l, C + l, B + l, l);
- else tC += add2Times (C + l, C + l, B + l, l);
- }
- if (dC)
- {
- if (dC == 1) tC += mpn_add_n (C + l, C + l, B, l);
- else tC += add2Times (C + l, C + l, B, l);
- }
-#endif
- ASSERT (tC < 9);
- TOOM3_MUL_REC(B, A, A + l, l, W);
- tB = cB*dB;
- if (cB) tB += mpn_addmul_1 (B + l, A + l, l, cB);
- if (dB) tB += mpn_addmul_1 (B + l, A, l, dB);
- ASSERT (tB < 49);
- TOOM3_MUL_REC(A, a, b, l, W);
- TOOM3_MUL_REC(E, a + l2, b + l2, ls, W);
-
- /** Third stage: interpolation. **/
- interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1);
-
- /** Final stage: add up the coefficients. **/
- {
- mp_limb_t i, x, y;
- tB += mpn_add_n (p + l, p + l, B, l2);
- tD += mpn_add_n (p + l3, p + l3, D, l2);
- mpn_incr_u (p + l3, tB);
- mpn_incr_u (p + l4, tC);
- mpn_incr_u (p + l5, tD);
- }
-}
-
-/*-- mpn_toom3_sqr_n --------------------------------------------------------------*/
-
-/* Like previous function but for squaring */
-
-#define TOOM3_SQR_REC(p, a, n, ws) \
- do { \
- if (n < KARATSUBA_SQR_THRESHOLD) \
- mpn_sqr_basecase (p, a, n); \
- else if (n < TOOM3_SQR_THRESHOLD) \
- mpn_kara_sqr_n (p, a, n, ws); \
- else \
- mpn_toom3_sqr_n (p, a, n, ws); \
- } while (0)
-
-void
-#if __STDC__
-mpn_toom3_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
-#else
-mpn_toom3_sqr_n (p, a, n, ws)
- mp_ptr p;
- mp_srcptr a;
- mp_size_t n;
- mp_ptr ws;
-#endif
-{
- mp_limb_t cB,cC,cD, tB,tC,tD;
- mp_limb_t *A,*B,*C,*D,*E, *W;
- mp_size_t l,l2,l3,l4,l5,ls;
-
- /* Break n words into chunks of size l, l and ls.
- * n = 3*k => l = k, ls = k
- * n = 3*k+1 => l = k+1, ls = k-1
- * n = 3*k+2 => l = k+1, ls = k
- */
- {
- mp_limb_t m;
-
- ASSERT (n >= TOOM3_MUL_THRESHOLD);
- l = ls = n / 3;
- m = n - l * 3;
- if (m != 0)
- ++l;
- if (m == 1)
- --ls;
-
- l2 = l * 2;
- l3 = l * 3;
- l4 = l * 4;
- l5 = l * 5;
- A = p;
- B = ws;
- C = p + l2;
- D = ws + l2;
- E = p + l4;
- W = ws + l4;
- }
-
- /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/
- evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls);
-
- /** Second stage: pointwise multiplies. **/
- TOOM3_SQR_REC(D, C, l, W);
- tD = cD*cD;
- if (cD) tD += mpn_addmul_1 (D + l, C, l, 2*cD);
- ASSERT (tD < 49);
- TOOM3_SQR_REC(C, B, l, W);
- tC = cC*cC;
- /* TO DO: choose one of the following alternatives. */
-#if 0
- if (cC) tC += mpn_addmul_1 (C + l, B, l, 2*cC);
-#else
- if (cC >= 1)
- {
- tC += add2Times (C + l, C + l, B, l);
- if (cC == 2)
- tC += add2Times (C + l, C + l, B, l);
- }
-#endif
- ASSERT (tC < 9);
- TOOM3_SQR_REC(B, A, l, W);
- tB = cB*cB;
- if (cB) tB += mpn_addmul_1 (B + l, A, l, 2*cB);
- ASSERT (tB < 49);
- TOOM3_SQR_REC(A, a, l, W);
- TOOM3_SQR_REC(E, a + l2, ls, W);
-
- /** Third stage: interpolation. **/
- interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1);
-
- /** Final stage: add up the coefficients. **/
- {
- mp_limb_t i, x, y;
- tB += mpn_add_n (p + l, p + l, B, l2);
- tD += mpn_add_n (p + l3, p + l3, D, l2);
- mpn_incr_u (p + l3, tB);
- mpn_incr_u (p + l4, tC);
- mpn_incr_u (p + l5, tD);
- }
-}
-
-void
-#if __STDC__
-mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n)
-#else
-mpn_mul_n (p, a, b, n)
- mp_ptr p;
- mp_srcptr a;
- mp_srcptr b;
- mp_size_t n;
-#endif
-{
- if (n < KARATSUBA_MUL_THRESHOLD)
- mpn_mul_basecase (p, a, n, b, n);
- else if (n < TOOM3_MUL_THRESHOLD)
- {
- /* Allocate workspace of fixed size on stack: fast! */
-#if TUNE_PROGRAM_BUILD
- mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD_LIMIT-1) + 2 * BITS_PER_MP_LIMB];
-#else
- mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD-1) + 2 * BITS_PER_MP_LIMB];
-#endif
- mpn_kara_mul_n (p, a, b, n, ws);
- }
-#if WANT_FFT || TUNE_PROGRAM_BUILD
- else if (n < FFT_MUL_THRESHOLD)
-#else
- else
-#endif
- {
- /* Use workspace of unknown size in heap, as stack space may
- * be limited. Since n is at least TOOM3_MUL_THRESHOLD, the
- * multiplication will take much longer than malloc()/free(). */
- mp_limb_t wsLen, *ws;
- wsLen = 2 * n + 3 * BITS_PER_MP_LIMB;
- ws = (mp_ptr) (*_mp_allocate_func) ((size_t) wsLen * sizeof (mp_limb_t));
- mpn_toom3_mul_n (p, a, b, n, ws);
- (*_mp_free_func) (ws, (size_t) wsLen * sizeof (mp_limb_t));
- }
-#if WANT_FFT || TUNE_PROGRAM_BUILD
- else
- {
- mpn_mul_fft_full (p, a, n, b, n);
- }
-#endif
-}