1 /* mpn_mul_n and helper function -- Multiply/square natural numbers.
3 THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul_n)
4 ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH
5 THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED
6 THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9 Copyright (C) 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software
12 This file is part of the GNU MP Library.
14 The GNU MP Library is free software; you can redistribute it and/or modify
15 it under the terms of the GNU Lesser General Public License as published by
16 the Free Software Foundation; either version 2.1 of the License, or (at your
17 option) any later version.
19 The GNU MP Library is distributed in the hope that it will be useful, but
20 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
21 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
22 License for more details.
24 You should have received a copy of the GNU Lesser General Public License
25 along with the GNU MP Library; see the file COPYING.LIB. If not, write to
26 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
27 MA 02111-1307, USA. */
34 /* Multiplicative inverse of 3, modulo 2^BITS_PER_MP_LIMB.
35 0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. */
36 #define INVERSE_3 ((MP_LIMB_T_MAX / 3) * 2 + 1)
38 #if !defined (__alpha) && !defined (__mips)
39 /* For all other machines, we want to call mpn functions for the compund
40 operations instead of open-coding them. */
44 /*== Function declarations =================================================*/
46 static void evaluate3 _PROTO ((mp_ptr, mp_ptr, mp_ptr,
47 mp_ptr, mp_ptr, mp_ptr,
48 mp_srcptr, mp_srcptr, mp_srcptr,
49 mp_size_t, mp_size_t));
50 static void interpolate3 _PROTO ((mp_srcptr,
51 mp_ptr, mp_ptr, mp_ptr,
53 mp_ptr, mp_ptr, mp_ptr,
54 mp_size_t, mp_size_t));
55 static mp_limb_t add2Times _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
58 /*-- mpn_kara_mul_n ---------------------------------------------------------------*/
60 /* Multiplies using 3 half-sized mults and so on recursively.
61 * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
62 * No overlap of p[...] with a[...] or b[...].
68 mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
70 mpn_kara_mul_n(p, a, b, n, ws)
78 mp_limb_t i, sign, w, w0, w1;
88 mp_size_t n1, n3, nm1;
95 w -= mpn_sub_n (p, a, a + n3, n2);
105 while (w0 == w1 && i != 0);
117 mpn_sub_n (p, x, y, n2);
123 w -= mpn_sub_n (p + n3, b, b + n3, n2);
133 while (w0 == w1 && i != 0);
145 mpn_sub_n (p + n3, x, y, n2);
150 if (n2 < KARATSUBA_MUL_THRESHOLD)
152 if (n3 < KARATSUBA_MUL_THRESHOLD)
154 mpn_mul_basecase (ws, p, n3, p + n3, n3);
155 mpn_mul_basecase (p, a, n3, b, n3);
159 mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
160 mpn_kara_mul_n (p, a, b, n3, ws + n1);
162 mpn_mul_basecase (p + n1, a + n3, n2, b + n3, n2);
166 mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
167 mpn_kara_mul_n (p, a, b, n3, ws + n1);
168 mpn_kara_mul_n (p + n1, a + n3, b + n3, n2, ws + n1);
172 mpn_add_n (ws, p, ws, n1);
174 mpn_sub_n (ws, p, ws, n1);
177 if (mpn_add_n (ws, p + n1, ws, nm1))
179 mp_limb_t x = ws[nm1] + 1;
184 if (mpn_add_n (p + n3, p + n3, ws, n1))
208 while (w0 == w1 && i != 0);
221 mpn_sub_n (p, x, y, n2);
230 while (w0 == w1 && i != 0);
242 mpn_sub_n (p + n2, x, y, n2);
244 /* Pointwise products. */
245 if (n2 < KARATSUBA_MUL_THRESHOLD)
247 mpn_mul_basecase (ws, p, n2, p + n2, n2);
248 mpn_mul_basecase (p, a, n2, b, n2);
249 mpn_mul_basecase (p + n, a + n2, n2, b + n2, n2);
253 mpn_kara_mul_n (ws, p, p + n2, n2, ws + n);
254 mpn_kara_mul_n (p, a, b, n2, ws + n);
255 mpn_kara_mul_n (p + n, a + n2, b + n2, n2, ws + n);
260 w = mpn_add_n (ws, p, ws, n);
262 w = -mpn_sub_n (ws, p, ws, n);
263 w += mpn_add_n (ws, p + n, ws, n);
264 w += mpn_add_n (p + n2, p + n2, ws, n);
265 /* TO DO: could put "if (w) { ... }" here.
266 * Less work but badly predicted branch.
267 * No measurable difference in speed on Alpha.
287 mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
289 mpn_kara_sqr_n (p, a, n, ws)
296 mp_limb_t i, sign, w, w0, w1;
306 mp_size_t n1, n3, nm1;
313 w -= mpn_sub_n (p, a, a + n3, n2);
323 while (w0 == w1 && i != 0);
335 mpn_sub_n (p, x, y, n2);
341 w -= mpn_sub_n (p + n3, a, a + n3, n2);
351 while (w0 == w1 && i != 0);
363 mpn_sub_n (p + n3, x, y, n2);
368 if (n2 < KARATSUBA_SQR_THRESHOLD)
370 if (n3 < KARATSUBA_SQR_THRESHOLD)
372 mpn_sqr_basecase (ws, p, n3);
373 mpn_sqr_basecase (p, a, n3);
377 mpn_kara_sqr_n (ws, p, n3, ws + n1);
378 mpn_kara_sqr_n (p, a, n3, ws + n1);
380 mpn_sqr_basecase (p + n1, a + n3, n2);
384 mpn_kara_sqr_n (ws, p, n3, ws + n1);
385 mpn_kara_sqr_n (p, a, n3, ws + n1);
386 mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1);
390 mpn_add_n (ws, p, ws, n1);
392 mpn_sub_n (ws, p, ws, n1);
395 if (mpn_add_n (ws, p + n1, ws, nm1))
397 mp_limb_t x = ws[nm1] + 1;
402 if (mpn_add_n (p + n3, p + n3, ws, n1))
426 while (w0 == w1 && i != 0);
439 mpn_sub_n (p, x, y, n2);
448 while (w0 == w1 && i != 0);
460 mpn_sub_n (p + n2, x, y, n2);
462 /* Pointwise products. */
463 if (n2 < KARATSUBA_SQR_THRESHOLD)
465 mpn_sqr_basecase (ws, p, n2);
466 mpn_sqr_basecase (p, a, n2);
467 mpn_sqr_basecase (p + n, a + n2, n2);
471 mpn_kara_sqr_n (ws, p, n2, ws + n);
472 mpn_kara_sqr_n (p, a, n2, ws + n);
473 mpn_kara_sqr_n (p + n, a + n2, n2, ws + n);
478 w = mpn_add_n (ws, p, ws, n);
480 w = -mpn_sub_n (ws, p, ws, n);
481 w += mpn_add_n (ws, p + n, ws, n);
482 w += mpn_add_n (p + n2, p + n2, ws, n);
483 /* TO DO: could put "if (w) { ... }" here.
484 * Less work but badly predicted branch.
485 * No measurable difference in speed on Alpha.
503 /*-- add2Times -------------------------------------------------------------*/
505 /* z[] = x[] + 2 * y[]
506 Note that z and x might point to the same vectors. */
508 static inline mp_limb_t
510 add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n)
512 add2Times (z, x, y, n)
523 t = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB);
524 c = mpn_lshift (t, y, n, 1);
525 c += mpn_add_n (z, x, t, n);
533 add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n)
535 add2Times (z, x, y, n)
546 c = w >> (BITS_PER_MP_LIMB - 1);
558 c += w >> (BITS_PER_MP_LIMB - 1);
570 /*-- evaluate3 -------------------------------------------------------------*/
577 * ph[], p1[], p2[], A[] and B[] all have length len,
578 * C[] has length len2 with len-len2 = 0, 1 or 2.
579 * Returns top words (overflow) at pth, pt1 and pt2 respectively.
584 evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2,
585 mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t len, mp_size_t len2)
587 evaluate3 (ph, p1, p2, pth, pt1, pt2,
604 ASSERT (len - len2 <= 2);
606 e = mpn_lshift (p1, B, len, 1);
608 c = mpn_lshift (ph, A, len, 2);
609 c += e + mpn_add_n (ph, ph, p1, len);
610 d = mpn_add_n (ph, ph, C, len2);
611 if (len2 == len) c += d; else c += mpn_add_1 (ph + len2, ph + len2, len-len2, d);
615 c = mpn_lshift (p2, C, len2, 2);
617 if (len2 != len) { p2[len-1] = 0; p2[len2] = c; c = 0; }
618 c += e + mpn_add_n (p2, p2, p1, len);
620 d = mpn_add_n (p2, p2, p1, len2);
622 if (len2 != len) c = mpn_add_1 (p2+len2, p1+len2, len-len2, c);
625 c += mpn_add_n (p2, p2, A, len);
629 c = mpn_add_n (p1, A, B, len);
630 d = mpn_add_n (p1, p1, C, len2);
631 if (len2 == len) c += d;
632 else c += mpn_add_1 (p1+len2, p1+len2, len-len2, d);
642 evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2,
643 mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t l, mp_size_t ls)
645 evaluate3 (ph, p1, p2, pth, pt1, pt2,
660 mp_limb_t a,b,c, i, t, th,t1,t2, vh,v1,v2;
662 ASSERT (l - ls <= 2);
665 for (i = 0; i < l; ++i)
671 /* TO DO: choose one of the following alternatives. */
676 th += a >> (BITS_PER_MP_LIMB - 2);
680 th += b >> (BITS_PER_MP_LIMB - 1);
689 th += b >> (BITS_PER_MP_LIMB - 1);
693 th += a >> (BITS_PER_MP_LIMB - 2);
708 t2 += b >> (BITS_PER_MP_LIMB - 1);
712 t2 += c >> (BITS_PER_MP_LIMB - 2);
733 /*-- interpolate3 ----------------------------------------------------------*/
735 /* Interpolates B, C, D (in-place) from:
740 * A[], B[], C[] and D[] all have length l,
741 * E[] has length ls with l-ls = 0, 2 or 4.
743 * Reads top words (from earlier overflow) from ptb, ptc and ptd,
744 * and returns new top words there.
750 interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E,
751 mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t len, mp_size_t len2)
753 interpolate3 (A, B, C, D, E,
754 ptb, ptc, ptd, len, len2)
768 mp_limb_t t, tb,tc,td;
772 ASSERT (len - len2 == 0 || len - len2 == 2 || len - len2 == 4);
774 /* Let x1, x2, x3 be the values to interpolate. We have:
775 * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e
776 * c = a + x1 + x2 + x3 + e
777 * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e
780 ws = (mp_ptr) TMP_ALLOC (len * BYTES_PER_MP_LIMB);
782 tb = *ptb; tc = *ptc; td = *ptd;
790 t = mpn_lshift (ws, A, len, 4);
791 tb -= t + mpn_sub_n (B, B, ws, len);
792 t = mpn_sub_n (B, B, E, len2);
793 if (len2 == len) tb -= t;
794 else tb -= mpn_sub_1 (B+len2, B+len2, len-len2, t);
796 tc -= mpn_sub_n (C, C, A, len);
797 t = mpn_sub_n (C, C, E, len2);
798 if (len2 == len) tc -= t;
799 else tc -= mpn_sub_1 (C+len2, C+len2, len-len2, t);
801 t = mpn_lshift (ws, E, len2, 4);
802 t += mpn_add_n (ws, ws, A, len2);
804 if (len2 != len) t = mpn_add_1 (ws+len2, A+len2, len-len2, t);
805 td -= t + mpn_sub_n (D, D, ws, len);
807 t += mpn_sub_n (D, D, ws, len2);
809 t = mpn_sub_1 (D+len2, D+len2, len-len2, t);
810 t += mpn_sub_n (D+len2, D+len2, A+len2, len-len2);
816 /* b, d := b + d, b - d */
818 #ifdef HAVE_MPN_ADD_SUB_N
819 /* #error TO DO ... */
821 t = tb + td + mpn_add_n (ws, B, D, len);
822 td = tb - td - mpn_sub_n (D, B, D, len);
824 MPN_COPY (B, ws, len);
828 t = 8 * tc + mpn_lshift (ws, C, len, 3);
829 tb -= t + mpn_sub_n (B, B, ws, len);
832 tc = 2 * tc + mpn_lshift (C, C, len, 1);
833 tc -= tb + mpn_sub_n (C, C, B, len);
836 td = (td - mpn_divexact_by3 (D, D, len)) * INVERSE_3;
838 /* b, d := b + d, b - d */
839 #ifdef HAVE_MPN_ADD_SUB_N
840 /* #error TO DO ... */
842 t = tb + td + mpn_add_n (ws, B, D, len);
843 td = tb - td - mpn_sub_n (D, B, D, len);
845 MPN_COPY (B, ws, len);
855 mpn_rshift (B, B, len, 2);
856 B[len-1] |= tb<<(BITS_PER_MP_LIMB-2);
857 ASSERT((long)tb >= 0);
861 mpn_rshift (C, C, len, 1);
862 C[len-1] |= tc<<(BITS_PER_MP_LIMB-1);
863 ASSERT((long)tc >= 0);
867 mpn_rshift (D, D, len, 2);
868 D[len-1] |= td<<(BITS_PER_MP_LIMB-2);
869 ASSERT((long)td >= 0);
897 interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E,
898 mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t l, mp_size_t ls)
900 interpolate3 (A, B, C, D, E,
901 ptb, ptc, ptd, l, ls)
914 mp_limb_t a,b,c,d,e,t, i, sb,sc,sd, ob,oc,od;
915 const mp_limb_t maskOffHalf = (~(mp_limb_t) 0) << (BITS_PER_MP_LIMB >> 1);
919 ASSERT (t == 0 || t == 2 || t == 4);
923 for (i = 0; i < l; ++i)
925 mp_limb_t tb, tc, td, tt;
933 /* Let x1, x2, x3 be the values to interpolate. We have:
934 * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e
935 * c = a + x1 + x2 + x3 + e
936 * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e
944 tb = -(a >> (BITS_PER_MP_LIMB - 4)) - (b < t);
955 td = td - (e >> (BITS_PER_MP_LIMB - 4)) - (d < t);
958 /* b, d := b + d, b - d */
960 tt = tb + td + (t < b);
961 td = tb - td - (b < d);
968 tb = tb - (tc << 3) - (c >> (BITS_PER_MP_LIMB - 3)) - (b < t);
973 tc = (tc << 1) + (c >> (BITS_PER_MP_LIMB - 1)) - tb - (t < b);
978 td = td - (d >> (BITS_PER_MP_LIMB - 1)) - (d*3 < d);
981 /* b, d := b + d, b - d */
983 tt = tb + td + (t < b);
984 td = tb - td - (b < d);
995 /* sb has period 2. */
999 sb |= sb >> (BITS_PER_MP_LIMB >> 1);
1002 /* sc has period 1. */
1005 /* TO DO: choose one of the following alternatives. */
1007 sc = (mp_limb_t)((long)sc >> (BITS_PER_MP_LIMB - 1));
1010 sc = tc - ((long)sc < 0L);
1013 /* sd has period 2. */
1017 sd |= sd >> (BITS_PER_MP_LIMB >> 1);
1022 B[-1] = ob | b << (BITS_PER_MP_LIMB - 2);
1023 C[-1] = oc | c << (BITS_PER_MP_LIMB - 1);
1024 D[-1] = od | d << (BITS_PER_MP_LIMB - 2);
1030 ++A; ++B; ++C; ++D; ++E;
1033 /* Handle top words. */
1052 B[-1] = ob | b << (BITS_PER_MP_LIMB - 2);
1053 C[-1] = oc | c << (BITS_PER_MP_LIMB - 1);
1054 D[-1] = od | d << (BITS_PER_MP_LIMB - 2);
1081 /*-- mpn_toom3_mul_n --------------------------------------------------------------*/
1083 /* Multiplies using 5 mults of one third size and so on recursively.
1084 * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
1085 * No overlap of p[...] with a[...] or b[...].
1089 /* TO DO: If TOOM3_MUL_THRESHOLD is much bigger than KARATSUBA_MUL_THRESHOLD then the
1090 * recursion in mpn_toom3_mul_n() will always bottom out with mpn_kara_mul_n()
1091 * because the "n < KARATSUBA_MUL_THRESHOLD" test here will always be false.
1094 #define TOOM3_MUL_REC(p, a, b, n, ws) \
1096 if (n < KARATSUBA_MUL_THRESHOLD) \
1097 mpn_mul_basecase (p, a, n, b, n); \
1098 else if (n < TOOM3_MUL_THRESHOLD) \
1099 mpn_kara_mul_n (p, a, b, n, ws); \
1101 mpn_toom3_mul_n (p, a, b, n, ws); \
1106 mpn_toom3_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
1108 mpn_toom3_mul_n (p, a, b, n, ws)
1116 mp_limb_t cB,cC,cD, dB,dC,dD, tB,tC,tD;
1117 mp_limb_t *A,*B,*C,*D,*E, *W;
1118 mp_size_t l,l2,l3,l4,l5,ls;
1120 /* Break n words into chunks of size l, l and ls.
1121 * n = 3*k => l = k, ls = k
1122 * n = 3*k+1 => l = k+1, ls = k-1
1123 * n = 3*k+2 => l = k+1, ls = k
1128 ASSERT (n >= TOOM3_MUL_THRESHOLD);
1148 /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/
1149 evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls);
1150 evaluate3 (A + l, B + l, C + l, &dB, &dC, &dD, b, b + l, b + l2, l, ls);
1152 /** Second stage: pointwise multiplies. **/
1153 TOOM3_MUL_REC(D, C, C + l, l, W);
1155 if (cD) tD += mpn_addmul_1 (D + l, C + l, l, cD);
1156 if (dD) tD += mpn_addmul_1 (D + l, C, l, dD);
1158 TOOM3_MUL_REC(C, B, B + l, l, W);
1160 /* TO DO: choose one of the following alternatives. */
1162 if (cC) tC += mpn_addmul_1 (C + l, B + l, l, cC);
1163 if (dC) tC += mpn_addmul_1 (C + l, B, l, dC);
1167 if (cC == 1) tC += mpn_add_n (C + l, C + l, B + l, l);
1168 else tC += add2Times (C + l, C + l, B + l, l);
1172 if (dC == 1) tC += mpn_add_n (C + l, C + l, B, l);
1173 else tC += add2Times (C + l, C + l, B, l);
1177 TOOM3_MUL_REC(B, A, A + l, l, W);
1179 if (cB) tB += mpn_addmul_1 (B + l, A + l, l, cB);
1180 if (dB) tB += mpn_addmul_1 (B + l, A, l, dB);
1182 TOOM3_MUL_REC(A, a, b, l, W);
1183 TOOM3_MUL_REC(E, a + l2, b + l2, ls, W);
1185 /** Third stage: interpolation. **/
1186 interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1);
1188 /** Final stage: add up the coefficients. **/
1191 tB += mpn_add_n (p + l, p + l, B, l2);
1192 tD += mpn_add_n (p + l3, p + l3, D, l2);
1193 mpn_incr_u (p + l3, tB);
1194 mpn_incr_u (p + l4, tC);
1195 mpn_incr_u (p + l5, tD);
1199 /*-- mpn_toom3_sqr_n --------------------------------------------------------------*/
1201 /* Like previous function but for squaring */
1203 #define TOOM3_SQR_REC(p, a, n, ws) \
1205 if (n < KARATSUBA_SQR_THRESHOLD) \
1206 mpn_sqr_basecase (p, a, n); \
1207 else if (n < TOOM3_SQR_THRESHOLD) \
1208 mpn_kara_sqr_n (p, a, n, ws); \
1210 mpn_toom3_sqr_n (p, a, n, ws); \
1215 mpn_toom3_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
1217 mpn_toom3_sqr_n (p, a, n, ws)
1224 mp_limb_t cB,cC,cD, tB,tC,tD;
1225 mp_limb_t *A,*B,*C,*D,*E, *W;
1226 mp_size_t l,l2,l3,l4,l5,ls;
1228 /* Break n words into chunks of size l, l and ls.
1229 * n = 3*k => l = k, ls = k
1230 * n = 3*k+1 => l = k+1, ls = k-1
1231 * n = 3*k+2 => l = k+1, ls = k
1236 ASSERT (n >= TOOM3_MUL_THRESHOLD);
1256 /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/
1257 evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls);
1259 /** Second stage: pointwise multiplies. **/
1260 TOOM3_SQR_REC(D, C, l, W);
1262 if (cD) tD += mpn_addmul_1 (D + l, C, l, 2*cD);
1264 TOOM3_SQR_REC(C, B, l, W);
1266 /* TO DO: choose one of the following alternatives. */
1268 if (cC) tC += mpn_addmul_1 (C + l, B, l, 2*cC);
1272 tC += add2Times (C + l, C + l, B, l);
1274 tC += add2Times (C + l, C + l, B, l);
1278 TOOM3_SQR_REC(B, A, l, W);
1280 if (cB) tB += mpn_addmul_1 (B + l, A, l, 2*cB);
1282 TOOM3_SQR_REC(A, a, l, W);
1283 TOOM3_SQR_REC(E, a + l2, ls, W);
1285 /** Third stage: interpolation. **/
1286 interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1);
1288 /** Final stage: add up the coefficients. **/
1291 tB += mpn_add_n (p + l, p + l, B, l2);
1292 tD += mpn_add_n (p + l3, p + l3, D, l2);
1293 mpn_incr_u (p + l3, tB);
1294 mpn_incr_u (p + l4, tC);
1295 mpn_incr_u (p + l5, tD);
1301 mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n)
1303 mpn_mul_n (p, a, b, n)
1310 if (n < KARATSUBA_MUL_THRESHOLD)
1311 mpn_mul_basecase (p, a, n, b, n);
1312 else if (n < TOOM3_MUL_THRESHOLD)
1314 /* Allocate workspace of fixed size on stack: fast! */
1315 #if TUNE_PROGRAM_BUILD
1316 mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD_LIMIT-1) + 2 * BITS_PER_MP_LIMB];
1318 mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD-1) + 2 * BITS_PER_MP_LIMB];
1320 mpn_kara_mul_n (p, a, b, n, ws);
1322 #if WANT_FFT || TUNE_PROGRAM_BUILD
1323 else if (n < FFT_MUL_THRESHOLD)
1328 /* Use workspace of unknown size in heap, as stack space may
1329 * be limited. Since n is at least TOOM3_MUL_THRESHOLD, the
1330 * multiplication will take much longer than malloc()/free(). */
1331 mp_limb_t wsLen, *ws;
1332 wsLen = 2 * n + 3 * BITS_PER_MP_LIMB;
1333 ws = (mp_ptr) (*_mp_allocate_func) ((size_t) wsLen * sizeof (mp_limb_t));
1334 mpn_toom3_mul_n (p, a, b, n, ws);
1335 (*_mp_free_func) (ws, (size_t) wsLen * sizeof (mp_limb_t));
1337 #if WANT_FFT || TUNE_PROGRAM_BUILD
1340 mpn_mul_fft_full (p, a, n, b, n);