1 /* An implementation in GMP of Scho"nhage's fast multiplication algorithm
2 modulo 2^N+1, by Paul Zimmermann, INRIA Lorraine, February 1998.
4 THE CONTENTS OF THIS FILE ARE FOR INTERNAL USE AND THE FUNCTIONS HAVE
5 MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED
6 INTERFACES. IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN
7 A FUTURE GNU MP RELEASE.
9 Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
11 This file is part of the GNU MP Library.
13 The GNU MP Library is free software; you can redistribute it and/or modify
14 it under the terms of the GNU Lesser General Public License as published by
15 the Free Software Foundation; either version 2.1 of the License, or (at your
16 option) any later version.
18 The GNU MP Library is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
20 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
21 License for more details.
23 You should have received a copy of the GNU Lesser General Public License
24 along with the GNU MP Library; see the file COPYING.LIB. If not, write to
25 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
26 MA 02111-1307, USA. */
31 Schnelle Multiplikation grosser Zahlen, by Arnold Scho"nhage and Volker
32 Strassen, Computing 7, p. 281-292, 1971.
34 Asymptotically fast algorithms for the numerical multiplication
35 and division of polynomials with complex coefficients, by Arnold Scho"nhage,
36 Computer Algebra, EUROCAM'82, LNCS 144, p. 3-15, 1982.
38 Tapes versus Pointers, a study in implementing fast algorithms,
39 by Arnold Scho"nhage, Bulletin of the EATCS, 30, p. 23-32, 1986.
41 See also http://www.loria.fr/~zimmerma/bignum
46 K==2 isn't needed in the current uses of this code and the bits specific
47 for that could be dropped.
49 It might be possible to avoid a small number of MPN_COPYs by using a
50 rotating temporary or two.
52 Multiplications of unequal sized operands can be done with this code, but
53 it needs a tighter test for identifying squaring (same sizes as well as
62 /* Change this to "#define TRACE(x) x" for some traces. */
67 FFT_TABLE_ATTRS mp_size_t mpn_fft_table[2][MPN_FFT_TABLE_SIZE] = {
73 static void mpn_mul_fft_internal
74 _PROTO ((mp_limb_t *op, mp_srcptr n, mp_srcptr m, mp_size_t pl,
76 mp_limb_t **Ap, mp_limb_t **Bp,
77 mp_limb_t *A, mp_limb_t *B,
78 mp_size_t nprime, mp_size_t l, mp_size_t Mp, int **_fft_l,
79 mp_limb_t *T, int rec));
82 /* Find the best k to use for a mod 2^(n*BITS_PER_MP_LIMB)+1 FFT.
83 sqr==0 if for a multiply, sqr==1 for a square */
86 mpn_fft_best_k (mp_size_t n, int sqr)
88 mpn_fft_best_k (n, sqr)
96 for (i = 0; mpn_fft_table[sqr][i] != 0; i++)
97 if (n < mpn_fft_table[sqr][i])
98 return i + FFT_FIRST_K;
100 /* treat 4*last as one further entry */
101 if (i == 0 || n < 4*mpn_fft_table[sqr][i-1])
102 return i + FFT_FIRST_K;
104 return i + FFT_FIRST_K + 1;
108 /* Returns smallest possible number of limbs >= pl for a fft of size 2^k.
109 FIXME: Is this simply pl rounded up to the next multiple of 2^k ? */
113 mpn_fft_next_size (mp_size_t pl, int k)
115 mpn_fft_next_size (pl, k)
123 /* if (k==0) k = mpn_fft_best_k (pl, sqr); */
124 N = pl*BITS_PER_MP_LIMB;
126 if (N%K) N=(N/K+1)*K;
128 if (M%BITS_PER_MP_LIMB) N=((M/BITS_PER_MP_LIMB)+1)*BITS_PER_MP_LIMB*K;
129 return (N/BITS_PER_MP_LIMB);
135 mpn_fft_initl(int **l, int k)
145 for (i=1,K=2;i<=k;i++,K*=2) {
146 for (j=0;j<K/2;j++) {
147 l[i][j] = 2*l[i-1][j];
148 l[i][K/2+j] = 1+l[i][j];
154 /* a <- -a mod 2^(n*BITS_PER_MP_LIMB)+1 */
157 mpn_fft_neg_modF(mp_limb_t *ap, mp_size_t n)
159 mpn_fft_neg_modF(ap, n)
167 mpn_com_n (ap, ap, n);
168 ap[n]=0; mpn_incr_u(ap, c);
172 /* a <- a*2^e mod 2^(n*BITS_PER_MP_LIMB)+1 */
175 mpn_fft_mul_2exp_modF(mp_limb_t *ap, int e, mp_size_t n, mp_limb_t *tp)
177 mpn_fft_mul_2exp_modF(ap, e, n, tp)
184 int d, sh, i; mp_limb_t cc;
186 d = e%(n*BITS_PER_MP_LIMB); /* 2^e = (+/-) 2^d */
187 sh = d % BITS_PER_MP_LIMB;
188 if (sh) mpn_lshift(tp, ap, n+1, sh); /* no carry here */
189 else MPN_COPY(tp, ap, n+1);
190 d /= BITS_PER_MP_LIMB; /* now shift of d limbs to the left */
192 /* ap[d..n-1] = tp[0..n-d-1], ap[0..d-1] = -tp[n-d..n-1] */
193 /* mpn_xor would be more efficient here */
194 for (i=d-1;i>=0;i--) ap[i] = ~tp[n-d+i];
195 cc = 1-mpn_add_1(ap, ap, d, 1);
196 if (cc) cc=mpn_sub_1(ap+d, tp, n-d, 1);
197 else MPN_COPY(ap+d, tp, n-d);
198 if (cc+=mpn_sub_1(ap+d, ap+d, n-d, tp[n]))
199 ap[n]=mpn_add_1(ap, ap, n, cc);
202 else if ((ap[n]=mpn_sub_1(ap, tp, n, tp[n]))) {
203 ap[n]=mpn_add_1(ap, ap, n, 1);
205 if ((e/(n*BITS_PER_MP_LIMB))%2) mpn_fft_neg_modF(ap, n);
209 /* a <- a+b mod 2^(n*BITS_PER_MP_LIMB)+1 */
212 mpn_fft_add_modF (mp_limb_t *ap, mp_limb_t *bp, int n)
214 mpn_fft_add_modF (ap, bp, n)
221 c = ap[n] + bp[n] + mpn_add_n(ap, ap, bp, n);
222 if (c>1) c -= 1+mpn_sub_1(ap,ap,n,1);
227 /* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where
229 2^omega is a primitive root mod 2^N+1
230 output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 */
234 mpn_fft_fft_sqr (mp_limb_t **Ap, mp_size_t K, int **ll,
235 mp_size_t omega, mp_size_t n, mp_size_t inc, mp_limb_t *tp)
237 mpn_fft_fft_sqr(Ap,K,ll,omega,n,inc,tp)
239 mp_size_t K,omega,n,inc;
245 if (mpn_addsub_n(Ap[0], Ap[inc], Ap[0], Ap[inc], n+1) & 1)
247 MPN_COPY(tp, Ap[0], n+1);
248 mpn_add_n(Ap[0], Ap[0], Ap[inc],n+1);
249 if (mpn_sub_n(Ap[inc], tp, Ap[inc],n+1))
251 Ap[inc][n] = mpn_add_1(Ap[inc], Ap[inc], n, 1);
260 tmp = TMP_ALLOC_LIMBS (n+1);
261 mpn_fft_fft_sqr(Ap, K/2,ll-1,2*omega,n,inc2, tp);
262 mpn_fft_fft_sqr(Ap+inc, K/2,ll-1,2*omega,n,inc2, tp);
263 /* A[2*j*inc] <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc]
264 A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */
265 for (j=0;j<K/2;j++,lk+=2,Ap+=2*inc) {
266 MPN_COPY(tp, Ap[inc], n+1);
267 mpn_fft_mul_2exp_modF(Ap[inc], lk[1]*omega, n, tmp);
268 mpn_fft_add_modF(Ap[inc], Ap[0], n);
269 mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp);
270 mpn_fft_add_modF(Ap[0], tp, n);
277 /* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where
279 2^omega is a primitive root mod 2^N+1
280 output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 */
284 mpn_fft_fft (mp_limb_t **Ap, mp_limb_t **Bp, mp_size_t K, int **ll,
285 mp_size_t omega, mp_size_t n, mp_size_t inc, mp_limb_t *tp)
287 mpn_fft_fft(Ap,Bp,K,ll,omega,n,inc,tp)
288 mp_limb_t **Ap,**Bp,*tp;
289 mp_size_t K,omega,n,inc;
295 if (mpn_addsub_n(Ap[0], Ap[inc], Ap[0], Ap[inc], n+1) & 1)
297 MPN_COPY(tp, Ap[0], n+1);
298 mpn_add_n(Ap[0], Ap[0], Ap[inc],n+1);
299 if (mpn_sub_n(Ap[inc], tp, Ap[inc],n+1))
301 Ap[inc][n] = mpn_add_1(Ap[inc], Ap[inc], n, 1);
303 if (mpn_addsub_n(Bp[0], Bp[inc], Bp[0], Bp[inc], n+1) & 1)
305 MPN_COPY(tp, Bp[0], n+1);
306 mpn_add_n(Bp[0], Bp[0], Bp[inc],n+1);
307 if (mpn_sub_n(Bp[inc], tp, Bp[inc],n+1))
309 Bp[inc][n] = mpn_add_1(Bp[inc], Bp[inc], n, 1);
318 tmp = TMP_ALLOC_LIMBS (n+1);
319 mpn_fft_fft(Ap, Bp, K/2,ll-1,2*omega,n,inc2, tp);
320 mpn_fft_fft(Ap+inc, Bp+inc, K/2,ll-1,2*omega,n,inc2, tp);
321 /* A[2*j*inc] <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc]
322 A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */
323 for (j=0;j<K/2;j++,lk+=2,Ap+=2*inc,Bp+=2*inc) {
324 MPN_COPY(tp, Ap[inc], n+1);
325 mpn_fft_mul_2exp_modF(Ap[inc], lk[1]*omega, n, tmp);
326 mpn_fft_add_modF(Ap[inc], Ap[0], n);
327 mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp);
328 mpn_fft_add_modF(Ap[0], tp, n);
329 MPN_COPY(tp, Bp[inc], n+1);
330 mpn_fft_mul_2exp_modF(Bp[inc], lk[1]*omega, n, tmp);
331 mpn_fft_add_modF(Bp[inc], Bp[0], n);
332 mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp);
333 mpn_fft_add_modF(Bp[0], tp, n);
340 /* a[i] <- a[i]*b[i] mod 2^(n*BITS_PER_MP_LIMB)+1 for 0 <= i < K */
343 mpn_fft_mul_modF_K (mp_limb_t **ap, mp_limb_t **bp, mp_size_t n, int K)
345 mpn_fft_mul_modF_K(ap, bp, n, K)
346 mp_limb_t **ap, **bp;
352 int sqr = (ap == bp);
357 if (n >= (sqr ? FFT_MODF_SQR_THRESHOLD : FFT_MODF_MUL_THRESHOLD)) {
358 int k, K2,nprime2,Nprime2,M2,maxLK,l,Mp2;
360 mp_limb_t **Ap,**Bp,*A,*B,*T;
362 k = mpn_fft_best_k (n, sqr);
364 maxLK = (K2>BITS_PER_MP_LIMB) ? K2 : BITS_PER_MP_LIMB;
365 M2 = n*BITS_PER_MP_LIMB/K2;
367 Nprime2 = ((2*M2+k+2+maxLK)/maxLK)*maxLK; /* ceil((2*M2+k+3)/maxLK)*maxLK*/
368 nprime2 = Nprime2/BITS_PER_MP_LIMB;
371 Ap = TMP_ALLOC_MP_PTRS (K2);
372 Bp = TMP_ALLOC_MP_PTRS (K2);
373 A = TMP_ALLOC_LIMBS (2*K2*(nprime2+1));
374 T = TMP_ALLOC_LIMBS (nprime2+1);
375 B = A + K2*(nprime2+1);
376 _fft_l = TMP_ALLOC_TYPE (k+1, int*);
378 _fft_l[i] = TMP_ALLOC_TYPE (1<<i, int);
379 mpn_fft_initl(_fft_l, k);
381 TRACE (printf("recurse: %dx%d limbs -> %d times %dx%d (%1.2f)\n", n,
382 n, K2, nprime2, nprime2, 2.0*(double)n/nprime2/K2));
384 for (i=0;i<K;i++,ap++,bp++)
385 mpn_mul_fft_internal(*ap, *ap, *bp, n, k, K2, Ap, Bp, A, B, nprime2,
386 l, Mp2, _fft_l, T, 1);
389 mp_limb_t *a, *b, cc, *tp, *tpn; int n2=2*n;
390 tp = TMP_ALLOC_LIMBS (n2);
392 TRACE (printf (" mpn_mul_n %d of %d limbs\n", K, n));
398 mpn_mul_n(tp, b, a, n);
399 if (a[n]) cc=mpn_add_n(tpn, tpn, b, n); else cc=0;
400 if (b[n]) cc += mpn_add_n(tpn, tpn, a, n) + a[n];
402 cc = mpn_add_1(tp, tp, n2, cc);
403 ASSERT_NOCARRY (mpn_add_1(tp, tp, n2, cc));
405 a[n] = mpn_sub_n(a, tp, tpn, n) && mpn_add_1(a, a, n, 1);
412 /* input: A^[l[k][0]] A^[l[k][1]] ... A^[l[k][K-1]]
413 output: K*A[0] K*A[K-1] ... K*A[1] */
417 mpn_fft_fftinv (mp_limb_t **Ap, int K, mp_size_t omega, mp_size_t n,
420 mpn_fft_fftinv(Ap,K,omega,n,tp)
428 if (mpn_addsub_n(Ap[0], Ap[1], Ap[0], Ap[1], n+1) & 1)
430 MPN_COPY(tp, Ap[0], n+1);
431 mpn_add_n(Ap[0], Ap[0], Ap[1], n+1);
432 if (mpn_sub_n(Ap[1], tp, Ap[1], n+1))
434 Ap[1][n] = mpn_add_1(Ap[1], Ap[1], n, 1);
437 int j, K2=K/2; mp_limb_t **Bp=Ap+K2, *tmp;
441 tmp = TMP_ALLOC_LIMBS (n+1);
442 mpn_fft_fftinv(Ap, K2, 2*omega, n, tp);
443 mpn_fft_fftinv(Bp, K2, 2*omega, n, tp);
444 /* A[j] <- A[j] + omega^j A[j+K/2]
445 A[j+K/2] <- A[j] + omega^(j+K/2) A[j+K/2] */
446 for (j=0;j<K2;j++,Ap++,Bp++) {
447 MPN_COPY(tp, Bp[0], n+1);
448 mpn_fft_mul_2exp_modF(Bp[0], (j+K2)*omega, n, tmp);
449 mpn_fft_add_modF(Bp[0], Ap[0], n);
450 mpn_fft_mul_2exp_modF(tp, j*omega, n, tmp);
451 mpn_fft_add_modF(Ap[0], tp, n);
458 /* A <- A/2^k mod 2^(n*BITS_PER_MP_LIMB)+1 */
461 mpn_fft_div_2exp_modF (mp_limb_t *ap, int k, mp_size_t n, mp_limb_t *tp)
463 mpn_fft_div_2exp_modF(ap,k,n,tp)
471 i = 2*n*BITS_PER_MP_LIMB;
473 mpn_fft_mul_2exp_modF(ap,i,n,tp);
474 /* 1/2^k = 2^(2nL-k) mod 2^(n*BITS_PER_MP_LIMB)+1 */
475 /* normalize so that A < 2^(n*BITS_PER_MP_LIMB)+1 */
477 for (i=0;i<n && ap[i]==0;i++);
480 mpn_sub_1(ap, ap, n, 1);
486 /* R <- A mod 2^(n*BITS_PER_MP_LIMB)+1, n<=an<=3*n */
489 mpn_fft_norm_modF(mp_limb_t *rp, mp_limb_t *ap, mp_size_t n, mp_size_t an)
491 mpn_fft_norm_modF(rp, ap, n, an)
502 rp[n] = mpn_add_1(rp+an-2*n, ap+an-2*n, 3*n-an,
503 mpn_add_n(rp,ap,ap+2*n,an-2*n));
510 if (mpn_sub_n(rp,rp,ap+n,l)) {
511 if (mpn_sub_1(rp+l,rp+l,n+1-l,1))
512 rp[n]=mpn_add_1(rp,rp,n,1);
519 mpn_mul_fft_internal(mp_limb_t *op, mp_srcptr n, mp_srcptr m, mp_size_t pl,
521 mp_limb_t **Ap, mp_limb_t **Bp,
522 mp_limb_t *A, mp_limb_t *B,
523 mp_size_t nprime, mp_size_t l, mp_size_t Mp,
525 mp_limb_t *T, int rec)
527 mpn_mul_fft_internal(op,n,m,pl,k,K,Ap,Bp,A,B,nprime,l,Mp,_fft_l,T,rec)
530 mp_limb_t **Ap,**Bp,*A,*B,*T;
536 int i, sqr, pla, lo, sh, j;
541 TRACE (printf ("pl=%d k=%d K=%d np=%d l=%d Mp=%d rec=%d sqr=%d\n",
542 pl,k,K,nprime,l,Mp,rec,sqr));
544 /* decomposition of inputs into arrays Ap[i] and Bp[i] */
545 if (rec) for (i=0;i<K;i++) {
546 Ap[i] = A+i*(nprime+1); Bp[i] = B+i*(nprime+1);
547 /* store the next M bits of n into A[i] */
548 /* supposes that M is a multiple of BITS_PER_MP_LIMB */
549 MPN_COPY(Ap[i], n, l); n+=l; MPN_ZERO(Ap[i]+l, nprime+1-l);
550 /* set most significant bits of n and m (important in recursive calls) */
551 if (i==K-1) Ap[i][l]=n[0];
552 mpn_fft_mul_2exp_modF(Ap[i], i*Mp, nprime, T);
554 MPN_COPY(Bp[i], m, l); m+=l; MPN_ZERO(Bp[i]+l, nprime+1-l);
555 if (i==K-1) Bp[i][l]=m[0];
556 mpn_fft_mul_2exp_modF(Bp[i], i*Mp, nprime, T);
561 if (sqr) mpn_fft_fft_sqr(Ap,K,_fft_l+k,2*Mp,nprime,1, T);
562 else mpn_fft_fft(Ap,Bp,K,_fft_l+k,2*Mp,nprime,1, T);
564 /* term to term multiplications */
565 mpn_fft_mul_modF_K(Ap, (sqr) ? Ap : Bp, nprime, K);
568 mpn_fft_fftinv(Ap, K, 2*Mp, nprime, T);
570 /* division of terms after inverse fft */
571 for (i=0;i<K;i++) mpn_fft_div_2exp_modF(Ap[i],k+((K-i)%K)*Mp,nprime, T);
573 /* addition of terms in result p */
574 MPN_ZERO(T,nprime+1);
575 pla = l*(K-1)+nprime+1; /* number of required limbs for p */
576 p = B; /* B has K*(n'+1) limbs, which is >= pla, i.e. enough */
578 sqr=0; /* will accumulate the (signed) carry at p[pla] */
579 for (i=K-1,lo=l*i+nprime,sh=l*i;i>=0;i--,lo-=l,sh-=l) {
582 if (mpn_add_n(n,n,Ap[j],nprime+1))
583 sqr += mpn_add_1(n+nprime+1,n+nprime+1,pla-sh-nprime-1,1);
584 T[2*l]=i+1; /* T = (i+1)*2^(2*M) */
585 if (mpn_cmp(Ap[j],T,nprime+1)>0) { /* subtract 2^N'+1 */
586 sqr -= mpn_sub_1(n,n,pla-sh,1);
587 sqr -= mpn_sub_1(p+lo,p+lo,pla-lo,1);
591 if ((sqr=mpn_add_1(p+pla-pl,p+pla-pl,pl,1))) {
592 /* p[pla-pl]...p[pla-1] are all zero */
593 mpn_sub_1(p+pla-pl-1,p+pla-pl-1,pl+1,1);
594 mpn_sub_1(p+pla-1,p+pla-1,1,1);
599 while ((sqr=mpn_add_1(p+pla-2*pl,p+pla-2*pl,2*pl,sqr)));
601 sqr = mpn_sub_1(p+pla-pl,p+pla-pl,pl,sqr);
608 /* here p < 2^(2M) [K 2^(M(K-1)) + (K-1) 2^(M(K-2)) + ... ]
609 < K 2^(2M) [2^(M(K-1)) + 2^(M(K-2)) + ... ]
610 < K 2^(2M) 2^(M(K-1))*2 = 2^(M*K+M+k+1) */
611 mpn_fft_norm_modF(op,p,pl,pla);
615 /* op <- n*m mod 2^N+1 with fft of size 2^k where N=pl*BITS_PER_MP_LIMB
616 n and m have respectively nl and ml limbs
617 op must have space for pl+1 limbs
618 One must have pl = mpn_fft_next_size(pl, k).
623 mpn_mul_fft (mp_ptr op, mp_size_t pl,
624 mp_srcptr n, mp_size_t nl,
625 mp_srcptr m, mp_size_t ml,
628 mpn_mul_fft (op, pl, n, nl, m, ml, k)
639 mp_size_t N,Nprime,nprime,M,Mp,l;
640 mp_limb_t **Ap,**Bp,*A,*T,*B;
642 int sqr = (n==m && nl==ml);
645 TRACE (printf ("\nmpn_mul_fft pl=%ld nl=%ld ml=%ld k=%d\n",
647 ASSERT_ALWAYS (mpn_fft_next_size(pl, k) == pl);
650 N = pl*BITS_PER_MP_LIMB;
651 _fft_l = TMP_ALLOC_TYPE (k+1, int*);
653 _fft_l[i] = TMP_ALLOC_TYPE (1<<i, int);
654 mpn_fft_initl(_fft_l, k);
656 M = N/K; /* N = 2^k M */
657 l = M/BITS_PER_MP_LIMB;
658 maxLK = (K>BITS_PER_MP_LIMB) ? K : BITS_PER_MP_LIMB;
660 Nprime = ((2*M+k+2+maxLK)/maxLK)*maxLK; /* ceil((2*M+k+3)/maxLK)*maxLK; */
661 nprime = Nprime/BITS_PER_MP_LIMB;
662 TRACE (printf ("N=%d K=%d, M=%d, l=%d, maxLK=%d, Np=%d, np=%d\n",
663 N, K, M, l, maxLK, Nprime, nprime));
664 if (nprime >= (sqr ? FFT_MODF_SQR_THRESHOLD : FFT_MODF_MUL_THRESHOLD)) {
665 maxLK = (1<<mpn_fft_best_k(nprime,n==m))*BITS_PER_MP_LIMB;
666 if (Nprime % maxLK) {
667 Nprime=((Nprime/maxLK)+1)*maxLK;
668 nprime = Nprime/BITS_PER_MP_LIMB;
670 TRACE (printf ("new maxLK=%d, Np=%d, np=%d\n", maxLK, Nprime, nprime));
673 T = TMP_ALLOC_LIMBS (nprime+1);
676 TRACE (printf("%dx%d limbs -> %d times %dx%d limbs (%1.2f)\n",
677 pl,pl,K,nprime,nprime,2.0*(double)N/Nprime/K);
678 printf(" temp space %ld\n", 2*K*(nprime+1)));
680 A = _MP_ALLOCATE_FUNC_LIMBS (2*K*(nprime+1));
682 Ap = TMP_ALLOC_MP_PTRS (K);
683 Bp = TMP_ALLOC_MP_PTRS (K);
684 /* special decomposition for main call */
686 Ap[i] = A+i*(nprime+1); Bp[i] = B+i*(nprime+1);
687 /* store the next M bits of n into A[i] */
688 /* supposes that M is a multiple of BITS_PER_MP_LIMB */
690 j = (nl>=l) ? l : nl; /* limbs to store in Ap[i] */
691 MPN_COPY(Ap[i], n, j); n+=l; MPN_ZERO(Ap[i]+j, nprime+1-j);
692 mpn_fft_mul_2exp_modF(Ap[i], i*Mp, nprime, T);
694 else MPN_ZERO(Ap[i], nprime+1);
698 j = (ml>=l) ? l : ml; /* limbs to store in Bp[i] */
699 MPN_COPY(Bp[i], m, j); m+=l; MPN_ZERO(Bp[i]+j, nprime+1-j);
700 mpn_fft_mul_2exp_modF(Bp[i], i*Mp, nprime, T);
702 else MPN_ZERO(Bp[i], nprime+1);
706 mpn_mul_fft_internal(op,n,m,pl,k,K,Ap,Bp,A,B,nprime,l,Mp,_fft_l,T,0);
708 _MP_FREE_FUNC_LIMBS (A, 2*K*(nprime+1));
715 mpn_zero_p (mp_ptr p, mp_size_t n)
724 for (i = 0; i < n; i++)
735 /* Multiply {n,nl}*{m,ml} and write the result to {op,nl+ml}.
737 FIXME: Duplicating the result like this is wasteful, do something better
738 perhaps at the norm_modF stage above. */
742 mpn_mul_fft_full (mp_ptr op,
743 mp_srcptr n, mp_size_t nl,
744 mp_srcptr m, mp_size_t ml)
746 mpn_mul_fft_full (op, n, nl, m, ml)
757 int sqr = (n==m && nl==ml);
759 k = mpn_fft_best_k (nl+ml, sqr);
760 pl = mpn_fft_next_size (nl+ml, k);
762 TRACE (printf ("mpn_mul_fft_full nl=%ld ml=%ld -> pl=%ld k=%d\n",
765 pad_op = _MP_ALLOCATE_FUNC_LIMBS (pl+1);
766 mpn_mul_fft (pad_op, pl, n, nl, m, ml, k);
768 ASSERT (mpn_zero_p (pad_op+nl+ml, pl+1-(nl+ml)));
769 MPN_COPY (op, pad_op, nl+ml);
771 _MP_FREE_FUNC_LIMBS (pad_op, pl+1);