rts/gmp/mpn/generic/mul_n.c

   1 /* mpn_mul_n and helper function -- Multiply/square natural numbers.
   2
   3    THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul_n)
   4    ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.  IT IS ONLY SAFE TO REACH
   5    THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED
   6    THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
   7
   8
   9 Copyright (C) 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software
  10 Foundation, Inc.
  11
  12 This file is part of the GNU MP Library.
  13
  14 The GNU MP Library is free software; you can redistribute it and/or modify
  15 it under the terms of the GNU Lesser General Public License as published by
  16 the Free Software Foundation; either version 2.1 of the License, or (at your
  17 option) any later version.
  18
  19 The GNU MP Library is distributed in the hope that it will be useful, but
  20 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  21 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  22 License for more details.
  23
  24 You should have received a copy of the GNU Lesser General Public License
  25 along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
  26 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  27 MA 02111-1307, USA. */
  28
  29 #include "gmp.h"
  30 #include "gmp-impl.h"
  31 #include "longlong.h"
  32
  33
  34 /* Multiplicative inverse of 3, modulo 2^BITS_PER_MP_LIMB.
  35    0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. */
  36 #define INVERSE_3      ((MP_LIMB_T_MAX / 3) * 2 + 1)
  37
  38 #if !defined (__alpha) && !defined (__mips)
  39 /* For all other machines, we want to call mpn functions for the compund
  40    operations instead of open-coding them.  */
  41 #define USE_MORE_MPN
  42 #endif
  43
  44 /*== Function declarations =================================================*/
  45
  46 static void evaluate3 _PROTO ((mp_ptr, mp_ptr, mp_ptr,
  47                                mp_ptr, mp_ptr, mp_ptr,
  48                                mp_srcptr, mp_srcptr, mp_srcptr,
  49                                mp_size_t, mp_size_t));
  50 static void interpolate3 _PROTO ((mp_srcptr,
  51                                   mp_ptr, mp_ptr, mp_ptr,
  52                                   mp_srcptr,
  53                                   mp_ptr, mp_ptr, mp_ptr,
  54                                   mp_size_t, mp_size_t));
  55 static mp_limb_t add2Times _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
  56
  57
  58 /*-- mpn_kara_mul_n ---------------------------------------------------------------*/
  59
  60 /* Multiplies using 3 half-sized mults and so on recursively.
  61  * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
  62  * No overlap of p[...] with a[...] or b[...].
  63  * ws is workspace.
  64  */
  65
  66 void
  67 #if __STDC__
  68 mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
  69 #else
  70 mpn_kara_mul_n(p, a, b, n, ws)
  71      mp_ptr    p;
  72      mp_srcptr a;
  73      mp_srcptr b;
  74      mp_size_t n;
  75      mp_ptr    ws;
  76 #endif
  77 {
  78   mp_limb_t i, sign, w, w0, w1;
  79   mp_size_t n2;
  80   mp_srcptr x, y;
  81
  82   n2 = n >> 1;
  83   ASSERT (n2 > 0);
  84
  85   if (n & 1)
  86     {
  87       /* Odd length. */
  88       mp_size_t n1, n3, nm1;
  89
  90       n3 = n - n2;
  91
  92       sign = 0;
  93       w = a[n2];
  94       if (w != 0)
  95         w -= mpn_sub_n (p, a, a + n3, n2);
  96       else
  97         {
  98           i = n2;
  99           do
 100             {
 101               --i;
 102               w0 = a[i];
 103               w1 = a[n3+i];
 104             }
 105           while (w0 == w1 && i != 0);
 106           if (w0 < w1)
 107             {
 108               x = a + n3;
 109               y = a;
 110               sign = 1;
 111             }
 112           else
 113             {
 114               x = a;
 115               y = a + n3;
 116             }
 117           mpn_sub_n (p, x, y, n2);
 118         }
 119       p[n2] = w;
 120
 121       w = b[n2];
 122       if (w != 0)
 123         w -= mpn_sub_n (p + n3, b, b + n3, n2);
 124       else
 125         {
 126           i = n2;
 127           do
 128             {
 129               --i;
 130               w0 = b[i];
 131               w1 = b[n3+i];
 132             }
 133           while (w0 == w1 && i != 0);
 134           if (w0 < w1)
 135             {
 136               x = b + n3;
 137               y = b;
 138               sign ^= 1;
 139             }
 140           else
 141             {
 142               x = b;
 143               y = b + n3;
 144             }
 145           mpn_sub_n (p + n3, x, y, n2);
 146         }
 147       p[n] = w;
 148
 149       n1 = n + 1;
 150       if (n2 < KARATSUBA_MUL_THRESHOLD)
 151         {
 152           if (n3 < KARATSUBA_MUL_THRESHOLD)
 153             {
 154               mpn_mul_basecase (ws, p, n3, p + n3, n3);
 155               mpn_mul_basecase (p, a, n3, b, n3);
 156             }
 157           else
 158             {
 159               mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
 160               mpn_kara_mul_n (p, a, b, n3, ws + n1);
 161             }
 162           mpn_mul_basecase (p + n1, a + n3, n2, b + n3, n2);
 163         }
 164       else
 165         {
 166           mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
 167           mpn_kara_mul_n (p, a, b, n3, ws + n1);
 168           mpn_kara_mul_n (p + n1, a + n3, b + n3, n2, ws + n1);
 169         }
 170
 171       if (sign)
 172         mpn_add_n (ws, p, ws, n1);
 173       else
 174         mpn_sub_n (ws, p, ws, n1);
 175
 176       nm1 = n - 1;
 177       if (mpn_add_n (ws, p + n1, ws, nm1))
 178         {
 179           mp_limb_t x = ws[nm1] + 1;
 180           ws[nm1] = x;
 181           if (x == 0)
 182             ++ws[n];
 183         }
 184       if (mpn_add_n (p + n3, p + n3, ws, n1))
 185         {
 186           mp_limb_t x;
 187           i = n1 + n3;
 188           do
 189             {
 190               x = p[i] + 1;
 191               p[i] = x;
 192               ++i;
 193             } while (x == 0);
 194         }
 195     }
 196   else
 197     {
 198       /* Even length. */
 199       mp_limb_t t;
 200
 201       i = n2;
 202       do
 203         {
 204           --i;
 205           w0 = a[i];
 206           w1 = a[n2+i];
 207         }
 208       while (w0 == w1 && i != 0);
 209       sign = 0;
 210       if (w0 < w1)
 211         {
 212           x = a + n2;
 213           y = a;
 214           sign = 1;
 215         }
 216       else
 217         {
 218           x = a;
 219           y = a + n2;
 220         }
 221       mpn_sub_n (p, x, y, n2);
 222
 223       i = n2;
 224       do
 225         {
 226           --i;
 227           w0 = b[i];
 228           w1 = b[n2+i];
 229         }
 230       while (w0 == w1 && i != 0);
 231       if (w0 < w1)
 232         {
 233           x = b + n2;
 234           y = b;
 235           sign ^= 1;
 236         }
 237       else
 238         {
 239           x = b;
 240           y = b + n2;
 241         }
 242       mpn_sub_n (p + n2, x, y, n2);
 243
 244       /* Pointwise products. */
 245       if (n2 < KARATSUBA_MUL_THRESHOLD)
 246         {
 247           mpn_mul_basecase (ws, p, n2, p + n2, n2);
 248           mpn_mul_basecase (p, a, n2, b, n2);
 249           mpn_mul_basecase (p + n, a + n2, n2, b + n2, n2);
 250         }
 251       else
 252         {
 253           mpn_kara_mul_n (ws, p, p + n2, n2, ws + n);
 254           mpn_kara_mul_n (p, a, b, n2, ws + n);
 255           mpn_kara_mul_n (p + n, a + n2, b + n2, n2, ws + n);
 256         }
 257
 258       /* Interpolate. */
 259       if (sign)
 260         w = mpn_add_n (ws, p, ws, n);
 261       else
 262         w = -mpn_sub_n (ws, p, ws, n);
 263       w += mpn_add_n (ws, p + n, ws, n);
 264       w += mpn_add_n (p + n2, p + n2, ws, n);
 265       /* TO DO: could put "if (w) { ... }" here.
 266        * Less work but badly predicted branch.
 267        * No measurable difference in speed on Alpha.
 268        */
 269       i = n + n2;
 270       t = p[i] + w;
 271       p[i] = t;
 272       if (t < w)
 273         {
 274           do
 275             {
 276               ++i;
 277               w = p[i] + 1;
 278               p[i] = w;
 279             }
 280           while (w == 0);
 281         }
 282     }
 283 }
 284
 285 void
 286 #if __STDC__
 287 mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
 288 #else
 289 mpn_kara_sqr_n (p, a, n, ws)
 290      mp_ptr    p;
 291      mp_srcptr a;
 292      mp_size_t n;
 293      mp_ptr    ws;
 294 #endif
 295 {
 296   mp_limb_t i, sign, w, w0, w1;
 297   mp_size_t n2;
 298   mp_srcptr x, y;
 299
 300   n2 = n >> 1;
 301   ASSERT (n2 > 0);
 302
 303   if (n & 1)
 304     {
 305       /* Odd length. */
 306       mp_size_t n1, n3, nm1;
 307
 308       n3 = n - n2;
 309
 310       sign = 0;
 311       w = a[n2];
 312       if (w != 0)
 313         w -= mpn_sub_n (p, a, a + n3, n2);
 314       else
 315         {
 316           i = n2;
 317           do
 318             {
 319               --i;
 320               w0 = a[i];
 321               w1 = a[n3+i];
 322             }
 323           while (w0 == w1 && i != 0);
 324           if (w0 < w1)
 325             {
 326               x = a + n3;
 327               y = a;
 328               sign = 1;
 329             }
 330           else
 331             {
 332               x = a;
 333               y = a + n3;
 334             }
 335           mpn_sub_n (p, x, y, n2);
 336         }
 337       p[n2] = w;
 338
 339       w = a[n2];
 340       if (w != 0)
 341         w -= mpn_sub_n (p + n3, a, a + n3, n2);
 342       else
 343         {
 344           i = n2;
 345           do
 346             {
 347               --i;
 348               w0 = a[i];
 349               w1 = a[n3+i];
 350             }
 351           while (w0 == w1 && i != 0);
 352           if (w0 < w1)
 353             {
 354               x = a + n3;
 355               y = a;
 356               sign ^= 1;
 357             }
 358           else
 359             {
 360               x = a;
 361               y = a + n3;
 362             }
 363           mpn_sub_n (p + n3, x, y, n2);
 364         }
 365       p[n] = w;
 366
 367       n1 = n + 1;
 368       if (n2 < KARATSUBA_SQR_THRESHOLD)
 369         {
 370           if (n3 < KARATSUBA_SQR_THRESHOLD)
 371             {
 372               mpn_sqr_basecase (ws, p, n3);
 373               mpn_sqr_basecase (p, a, n3);
 374             }
 375           else
 376             {
 377               mpn_kara_sqr_n (ws, p, n3, ws + n1);
 378               mpn_kara_sqr_n (p, a, n3, ws + n1);
 379             }
 380           mpn_sqr_basecase (p + n1, a + n3, n2);
 381         }
 382       else
 383         {
 384           mpn_kara_sqr_n (ws, p, n3, ws + n1);
 385           mpn_kara_sqr_n (p, a, n3, ws + n1);
 386           mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1);
 387         }
 388
 389       if (sign)
 390         mpn_add_n (ws, p, ws, n1);
 391       else
 392         mpn_sub_n (ws, p, ws, n1);
 393
 394       nm1 = n - 1;
 395       if (mpn_add_n (ws, p + n1, ws, nm1))
 396         {
 397           mp_limb_t x = ws[nm1] + 1;
 398           ws[nm1] = x;
 399           if (x == 0)
 400             ++ws[n];
 401         }
 402       if (mpn_add_n (p + n3, p + n3, ws, n1))
 403         {
 404           mp_limb_t x;
 405           i = n1 + n3;
 406           do
 407             {
 408               x = p[i] + 1;
 409               p[i] = x;
 410               ++i;
 411             } while (x == 0);
 412         }
 413     }
 414   else
 415     {
 416       /* Even length. */
 417       mp_limb_t t;
 418
 419       i = n2;
 420       do
 421         {
 422           --i;
 423           w0 = a[i];
 424           w1 = a[n2+i];
 425         }
 426       while (w0 == w1 && i != 0);
 427       sign = 0;
 428       if (w0 < w1)
 429         {
 430           x = a + n2;
 431           y = a;
 432           sign = 1;
 433         }
 434       else
 435         {
 436           x = a;
 437           y = a + n2;
 438         }
 439       mpn_sub_n (p, x, y, n2);
 440
 441       i = n2;
 442       do
 443         {
 444           --i;
 445           w0 = a[i];
 446           w1 = a[n2+i];
 447         }
 448       while (w0 == w1 && i != 0);
 449       if (w0 < w1)
 450         {
 451           x = a + n2;
 452           y = a;
 453           sign ^= 1;
 454         }
 455       else
 456         {
 457           x = a;
 458           y = a + n2;
 459         }
 460       mpn_sub_n (p + n2, x, y, n2);
 461
 462       /* Pointwise products. */
 463       if (n2 < KARATSUBA_SQR_THRESHOLD)
 464         {
 465           mpn_sqr_basecase (ws, p, n2);
 466           mpn_sqr_basecase (p, a, n2);
 467           mpn_sqr_basecase (p + n, a + n2, n2);
 468         }
 469       else
 470         {
 471           mpn_kara_sqr_n (ws, p, n2, ws + n);
 472           mpn_kara_sqr_n (p, a, n2, ws + n);
 473           mpn_kara_sqr_n (p + n, a + n2, n2, ws + n);
 474         }
 475
 476       /* Interpolate. */
 477       if (sign)
 478         w = mpn_add_n (ws, p, ws, n);
 479       else
 480         w = -mpn_sub_n (ws, p, ws, n);
 481       w += mpn_add_n (ws, p + n, ws, n);
 482       w += mpn_add_n (p + n2, p + n2, ws, n);
 483       /* TO DO: could put "if (w) { ... }" here.
 484        * Less work but badly predicted branch.
 485        * No measurable difference in speed on Alpha.
 486        */
 487       i = n + n2;
 488       t = p[i] + w;
 489       p[i] = t;
 490       if (t < w)
 491         {
 492           do
 493             {
 494               ++i;
 495               w = p[i] + 1;
 496               p[i] = w;
 497             }
 498           while (w == 0);
 499         }
 500     }
 501 }
 502
 503 /*-- add2Times -------------------------------------------------------------*/
 504
 505 /* z[] = x[] + 2 * y[]
 506    Note that z and x might point to the same vectors. */
 507 #ifdef USE_MORE_MPN
 508 static inline mp_limb_t
 509 #if __STDC__
 510 add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n)
 511 #else
 512 add2Times (z, x, y, n)
 513      mp_ptr    z;
 514      mp_srcptr x;
 515      mp_srcptr y;
 516      mp_size_t n;
 517 #endif
 518 {
 519   mp_ptr t;
 520   mp_limb_t c;
 521   TMP_DECL (marker);
 522   TMP_MARK (marker);
 523   t = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB);
 524   c = mpn_lshift (t, y, n, 1);
 525   c += mpn_add_n (z, x, t, n);
 526   TMP_FREE (marker);
 527   return c;
 528 }
 529 #else
 530
 531 static mp_limb_t
 532 #if __STDC__
 533 add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n)
 534 #else
 535 add2Times (z, x, y, n)
 536      mp_ptr    z;
 537      mp_srcptr x;
 538      mp_srcptr y;
 539      mp_size_t n;
 540 #endif
 541 {
 542   mp_limb_t c, v, w;
 543
 544   ASSERT (n > 0);
 545   v = *x; w = *y;
 546   c = w >> (BITS_PER_MP_LIMB - 1);
 547   w <<= 1;
 548   v += w;
 549   c += v < w;
 550   *z = v;
 551   ++x; ++y; ++z;
 552   while (--n)
 553     {
 554       v = *x;
 555       w = *y;
 556       v += c;
 557       c = v < c;
 558       c += w >> (BITS_PER_MP_LIMB - 1);
 559       w <<= 1;
 560       v += w;
 561       c += v < w;
 562       *z = v;
 563       ++x; ++y; ++z;
 564     }
 565
 566   return c;
 567 }
 568 #endif
 569
 570 /*-- evaluate3 -------------------------------------------------------------*/
 571
 572 /* Evaluates:
 573  *   ph := 4*A+2*B+C
 574  *   p1 := A+B+C
 575  *   p2 := A+2*B+4*C
 576  * where:
 577  *   ph[], p1[], p2[], A[] and B[] all have length len,
 578  *   C[] has length len2 with len-len2 = 0, 1 or 2.
 579  * Returns top words (overflow) at pth, pt1 and pt2 respectively.
 580  */
 581 #ifdef USE_MORE_MPN
 582 static void
 583 #if __STDC__
 584 evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2,
 585            mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t len, mp_size_t len2)
 586 #else
 587 evaluate3 (ph, p1, p2, pth, pt1, pt2,
 588            A, B, C, len, len2)
 589      mp_ptr    ph;
 590      mp_ptr    p1;
 591      mp_ptr    p2;
 592      mp_ptr    pth;
 593      mp_ptr    pt1;
 594      mp_ptr    pt2;
 595      mp_srcptr A;
 596      mp_srcptr B;
 597      mp_srcptr C;
 598      mp_size_t len;
 599      mp_size_t len2;
 600 #endif
 601 {
 602   mp_limb_t c, d, e;
 603
 604   ASSERT (len - len2 <= 2);
 605
 606   e = mpn_lshift (p1, B, len, 1);
 607
 608   c = mpn_lshift (ph, A, len, 2);
 609   c += e + mpn_add_n (ph, ph, p1, len);
 610   d = mpn_add_n (ph, ph, C, len2);
 611   if (len2 == len) c += d; else c += mpn_add_1 (ph + len2, ph + len2, len-len2, d);
 612   ASSERT (c < 7);
 613   *pth = c;
 614
 615   c = mpn_lshift (p2, C, len2, 2);
 616 #if 1
 617   if (len2 != len) { p2[len-1] = 0; p2[len2] = c; c = 0; }
 618   c += e + mpn_add_n (p2, p2, p1, len);
 619 #else
 620   d = mpn_add_n (p2, p2, p1, len2);
 621   c += d;
 622   if (len2 != len) c = mpn_add_1 (p2+len2, p1+len2, len-len2, c);
 623   c += e;
 624 #endif
 625   c += mpn_add_n (p2, p2, A, len);
 626   ASSERT (c < 7);
 627   *pt2 = c;
 628
 629   c = mpn_add_n (p1, A, B, len);
 630   d = mpn_add_n (p1, p1, C, len2);
 631   if (len2 == len) c += d;
 632   else c += mpn_add_1 (p1+len2, p1+len2, len-len2, d);
 633   ASSERT (c < 3);
 634   *pt1 = c;
 635
 636 }
 637
 638 #else
 639
 640 static void
 641 #if __STDC__
 642 evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2,
 643            mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t l, mp_size_t ls)
 644 #else
 645 evaluate3 (ph, p1, p2, pth, pt1, pt2,
 646            A, B, C, l, ls)
 647      mp_ptr    ph;
 648      mp_ptr    p1;
 649      mp_ptr    p2;
 650      mp_ptr    pth;
 651      mp_ptr    pt1;
 652      mp_ptr    pt2;
 653      mp_srcptr A;
 654      mp_srcptr B;
 655      mp_srcptr C;
 656      mp_size_t l;
 657      mp_size_t ls;
 658 #endif
 659 {
 660   mp_limb_t a,b,c, i, t, th,t1,t2, vh,v1,v2;
 661
 662   ASSERT (l - ls <= 2);
 663
 664   th = t1 = t2 = 0;
 665   for (i = 0; i < l; ++i)
 666     {
 667       a = *A;
 668       b = *B;
 669       c = i < ls ? *C : 0;
 670
 671       /* TO DO: choose one of the following alternatives. */
 672 #if 0
 673       t = a << 2;
 674       vh = th + t;
 675       th = vh < t;
 676       th += a >> (BITS_PER_MP_LIMB - 2);
 677       t = b << 1;
 678       vh += t;
 679       th += vh < t;
 680       th += b >> (BITS_PER_MP_LIMB - 1);
 681       vh += c;
 682       th += vh < c;
 683 #else
 684       vh = th + c;
 685       th = vh < c;
 686       t = b << 1;
 687       vh += t;
 688       th += vh < t;
 689       th += b >> (BITS_PER_MP_LIMB - 1);
 690       t = a << 2;
 691       vh += t;
 692       th += vh < t;
 693       th += a >> (BITS_PER_MP_LIMB - 2);
 694 #endif
 695
 696       v1 = t1 + a;
 697       t1 = v1 < a;
 698       v1 += b;
 699       t1 += v1 < b;
 700       v1 += c;
 701       t1 += v1 < c;
 702
 703       v2 = t2 + a;
 704       t2 = v2 < a;
 705       t = b << 1;
 706       v2 += t;
 707       t2 += v2 < t;
 708       t2 += b >> (BITS_PER_MP_LIMB - 1);
 709       t = c << 2;
 710       v2 += t;
 711       t2 += v2 < t;
 712       t2 += c >> (BITS_PER_MP_LIMB - 2);
 713
 714       *ph = vh;
 715       *p1 = v1;
 716       *p2 = v2;
 717
 718       ++A; ++B; ++C;
 719       ++ph; ++p1; ++p2;
 720     }
 721
 722   ASSERT (th < 7);
 723   ASSERT (t1 < 3);
 724   ASSERT (t2 < 7);
 725
 726   *pth = th;
 727   *pt1 = t1;
 728   *pt2 = t2;
 729 }
 730 #endif
 731
 732
 733 /*-- interpolate3 ----------------------------------------------------------*/
 734
 735 /* Interpolates B, C, D (in-place) from:
 736  *   16*A+8*B+4*C+2*D+E
 737  *   A+B+C+D+E
 738  *   A+2*B+4*C+8*D+16*E
 739  * where:
 740  *   A[], B[], C[] and D[] all have length l,
 741  *   E[] has length ls with l-ls = 0, 2 or 4.
 742  *
 743  * Reads top words (from earlier overflow) from ptb, ptc and ptd,
 744  * and returns new top words there.
 745  */
 746
 747 #ifdef USE_MORE_MPN
 748 static void
 749 #if __STDC__
 750 interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E,
 751               mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t len, mp_size_t len2)
 752 #else
 753 interpolate3 (A, B, C, D, E,
 754               ptb, ptc, ptd, len, len2)
 755      mp_srcptr A;
 756      mp_ptr    B;
 757      mp_ptr    C;
 758      mp_ptr    D;
 759      mp_srcptr E;
 760      mp_ptr    ptb;
 761      mp_ptr    ptc;
 762      mp_ptr    ptd;
 763      mp_size_t len;
 764      mp_size_t len2;
 765 #endif
 766 {
 767   mp_ptr ws;
 768   mp_limb_t t, tb,tc,td;
 769   TMP_DECL (marker);
 770   TMP_MARK (marker);
 771
 772   ASSERT (len - len2 == 0 || len - len2 == 2 || len - len2 == 4);
 773
 774   /* Let x1, x2, x3 be the values to interpolate.  We have:
 775    *         b = 16*a + 8*x1 + 4*x2 + 2*x3 +    e
 776    *         c =    a +   x1 +   x2 +   x3 +    e
 777    *         d =    a + 2*x1 + 4*x2 + 8*x3 + 16*e
 778    */
 779
 780   ws = (mp_ptr) TMP_ALLOC (len * BYTES_PER_MP_LIMB);
 781
 782   tb = *ptb; tc = *ptc; td = *ptd;
 783
 784
 785   /* b := b - 16*a -    e
 786    * c := c -    a -    e
 787    * d := d -    a - 16*e
 788    */
 789
 790   t = mpn_lshift (ws, A, len, 4);
 791   tb -= t + mpn_sub_n (B, B, ws, len);
 792   t = mpn_sub_n (B, B, E, len2);
 793   if (len2 == len) tb -= t;
 794   else tb -= mpn_sub_1 (B+len2, B+len2, len-len2, t);
 795
 796   tc -= mpn_sub_n (C, C, A, len);
 797   t = mpn_sub_n (C, C, E, len2);
 798   if (len2 == len) tc -= t;
 799   else tc -= mpn_sub_1 (C+len2, C+len2, len-len2, t);
 800
 801   t = mpn_lshift (ws, E, len2, 4);
 802   t += mpn_add_n (ws, ws, A, len2);
 803 #if 1
 804   if (len2 != len) t = mpn_add_1 (ws+len2, A+len2, len-len2, t);
 805   td -= t + mpn_sub_n (D, D, ws, len);
 806 #else
 807   t += mpn_sub_n (D, D, ws, len2);
 808   if (len2 != len) {
 809     t = mpn_sub_1 (D+len2, D+len2, len-len2, t);
 810     t += mpn_sub_n (D+len2, D+len2, A+len2, len-len2);
 811   } /* end if/else */
 812   td -= t;
 813 #endif
 814
 815
 816   /* b, d := b + d, b - d */
 817
 818 #ifdef HAVE_MPN_ADD_SUB_N
 819   /* #error TO DO ... */
 820 #else
 821   t = tb + td + mpn_add_n (ws, B, D, len);
 822   td = tb - td - mpn_sub_n (D, B, D, len);
 823   tb = t;
 824   MPN_COPY (B, ws, len);
 825 #endif
 826
 827   /* b := b-8*c */
 828   t = 8 * tc + mpn_lshift (ws, C, len, 3);
 829   tb -= t + mpn_sub_n (B, B, ws, len);
 830
 831   /* c := 2*c - b */
 832   tc = 2 * tc + mpn_lshift (C, C, len, 1);
 833   tc -= tb + mpn_sub_n (C, C, B, len);
 834
 835   /* d := d/3 */
 836   td = (td - mpn_divexact_by3 (D, D, len)) * INVERSE_3;
 837
 838   /* b, d := b + d, b - d */
 839 #ifdef HAVE_MPN_ADD_SUB_N
 840   /* #error TO DO ... */
 841 #else
 842   t = tb + td + mpn_add_n (ws, B, D, len);
 843   td = tb - td - mpn_sub_n (D, B, D, len);
 844   tb = t;
 845   MPN_COPY (B, ws, len);
 846 #endif
 847
 848       /* Now:
 849        *         b = 4*x1
 850        *         c = 2*x2
 851        *         d = 4*x3
 852        */
 853
 854   ASSERT(!(*B & 3));
 855   mpn_rshift (B, B, len, 2);
 856   B[len-1] |= tb<<(BITS_PER_MP_LIMB-2);
 857   ASSERT((long)tb >= 0);
 858   tb >>= 2;
 859
 860   ASSERT(!(*C & 1));
 861   mpn_rshift (C, C, len, 1);
 862   C[len-1] |= tc<<(BITS_PER_MP_LIMB-1);
 863   ASSERT((long)tc >= 0);
 864   tc >>= 1;
 865
 866   ASSERT(!(*D & 3));
 867   mpn_rshift (D, D, len, 2);
 868   D[len-1] |= td<<(BITS_PER_MP_LIMB-2);
 869   ASSERT((long)td >= 0);
 870   td >>= 2;
 871
 872 #if WANT_ASSERT
 873   ASSERT (tb < 2);
 874   if (len == len2)
 875     {
 876       ASSERT (tc < 3);
 877       ASSERT (td < 2);
 878     }
 879   else
 880     {
 881       ASSERT (tc < 2);
 882       ASSERT (!td);
 883     }
 884 #endif
 885
 886   *ptb = tb;
 887   *ptc = tc;
 888   *ptd = td;
 889
 890   TMP_FREE (marker);
 891 }
 892
 893 #else
 894
 895 static void
 896 #if __STDC__
 897 interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E,
 898               mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t l, mp_size_t ls)
 899 #else
 900 interpolate3 (A, B, C, D, E,
 901               ptb, ptc, ptd, l, ls)
 902      mp_srcptr A;
 903      mp_ptr    B;
 904      mp_ptr    C;
 905      mp_ptr    D;
 906      mp_srcptr E;
 907      mp_ptr    ptb;
 908      mp_ptr    ptc;
 909      mp_ptr    ptd;
 910      mp_size_t l;
 911      mp_size_t ls;
 912 #endif
 913 {
 914   mp_limb_t a,b,c,d,e,t, i, sb,sc,sd, ob,oc,od;
 915   const mp_limb_t maskOffHalf = (~(mp_limb_t) 0) << (BITS_PER_MP_LIMB >> 1);
 916
 917 #if WANT_ASSERT
 918   t = l - ls;
 919   ASSERT (t == 0 || t == 2 || t == 4);
 920 #endif
 921
 922   sb = sc = sd = 0;
 923   for (i = 0; i < l; ++i)
 924     {
 925       mp_limb_t tb, tc, td, tt;
 926
 927       a = *A;
 928       b = *B;
 929       c = *C;
 930       d = *D;
 931       e = i < ls ? *E : 0;
 932
 933       /* Let x1, x2, x3 be the values to interpolate.  We have:
 934        *         b = 16*a + 8*x1 + 4*x2 + 2*x3 +    e
 935        *         c =    a +   x1 +   x2 +   x3 +    e
 936        *         d =    a + 2*x1 + 4*x2 + 8*x3 + 16*e
 937        */
 938
 939       /* b := b - 16*a -    e
 940        * c := c -    a -    e
 941        * d := d -    a - 16*e
 942        */
 943       t = a << 4;
 944       tb = -(a >> (BITS_PER_MP_LIMB - 4)) - (b < t);
 945       b -= t;
 946       tb -= b < e;
 947       b -= e;
 948       tc = -(c < a);
 949       c -= a;
 950       tc -= c < e;
 951       c -= e;
 952       td = -(d < a);
 953       d -= a;
 954       t = e << 4;
 955       td = td - (e >> (BITS_PER_MP_LIMB - 4)) - (d < t);
 956       d -= t;
 957
 958       /* b, d := b + d, b - d */
 959       t = b + d;
 960       tt = tb + td + (t < b);
 961       td = tb - td - (b < d);
 962       d = b - d;
 963       b = t;
 964       tb = tt;
 965
 966       /* b := b-8*c */
 967       t = c << 3;
 968       tb = tb - (tc << 3) - (c >> (BITS_PER_MP_LIMB - 3)) - (b < t);
 969       b -= t;
 970
 971       /* c := 2*c - b */
 972       t = c << 1;
 973       tc = (tc << 1) + (c >> (BITS_PER_MP_LIMB - 1)) - tb - (t < b);
 974       c = t - b;
 975
 976       /* d := d/3 */
 977       d *= INVERSE_3;
 978       td = td - (d >> (BITS_PER_MP_LIMB - 1)) - (d*3 < d);
 979       td *= INVERSE_3;
 980
 981       /* b, d := b + d, b - d */
 982       t = b + d;
 983       tt = tb + td + (t < b);
 984       td = tb - td - (b < d);
 985       d = b - d;
 986       b = t;
 987       tb = tt;
 988
 989       /* Now:
 990        *         b = 4*x1
 991        *         c = 2*x2
 992        *         d = 4*x3
 993        */
 994
 995       /* sb has period 2. */
 996       b += sb;
 997       tb += b < sb;
 998       sb &= maskOffHalf;
 999       sb |= sb >> (BITS_PER_MP_LIMB >> 1);
1000       sb += tb;
1001
1002       /* sc has period 1. */
1003       c += sc;
1004       tc += c < sc;
1005       /* TO DO: choose one of the following alternatives. */
1006 #if 1
1007       sc = (mp_limb_t)((long)sc >> (BITS_PER_MP_LIMB - 1));
1008       sc += tc;
1009 #else
1010       sc = tc - ((long)sc < 0L);
1011 #endif
1012
1013       /* sd has period 2. */
1014       d += sd;
1015       td += d < sd;
1016       sd &= maskOffHalf;
1017       sd |= sd >> (BITS_PER_MP_LIMB >> 1);
1018       sd += td;
1019
1020       if (i != 0)
1021         {
1022           B[-1] = ob | b << (BITS_PER_MP_LIMB - 2);
1023           C[-1] = oc | c << (BITS_PER_MP_LIMB - 1);
1024           D[-1] = od | d << (BITS_PER_MP_LIMB - 2);
1025         }
1026       ob = b >> 2;
1027       oc = c >> 1;
1028       od = d >> 2;
1029
1030       ++A; ++B; ++C; ++D; ++E;
1031     }
1032
1033   /* Handle top words. */
1034   b = *ptb;
1035   c = *ptc;
1036   d = *ptd;
1037
1038   t = b + d;
1039   d = b - d;
1040   b = t;
1041   b -= c << 3;
1042   c = (c << 1) - b;
1043   d *= INVERSE_3;
1044   t = b + d;
1045   d = b - d;
1046   b = t;
1047
1048   b += sb;
1049   c += sc;
1050   d += sd;
1051
1052   B[-1] = ob | b << (BITS_PER_MP_LIMB - 2);
1053   C[-1] = oc | c << (BITS_PER_MP_LIMB - 1);
1054   D[-1] = od | d << (BITS_PER_MP_LIMB - 2);
1055
1056   b >>= 2;
1057   c >>= 1;
1058   d >>= 2;
1059
1060 #if WANT_ASSERT
1061   ASSERT (b < 2);
1062   if (l == ls)
1063     {
1064       ASSERT (c < 3);
1065       ASSERT (d < 2);
1066     }
1067   else
1068     {
1069       ASSERT (c < 2);
1070       ASSERT (!d);
1071     }
1072 #endif
1073
1074   *ptb = b;
1075   *ptc = c;
1076   *ptd = d;
1077 }
1078 #endif
1079
1080
1081 /*-- mpn_toom3_mul_n --------------------------------------------------------------*/
1082
1083 /* Multiplies using 5 mults of one third size and so on recursively.
1084  * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
1085  * No overlap of p[...] with a[...] or b[...].
1086  * ws is workspace.
1087  */
1088
1089 /* TO DO: If TOOM3_MUL_THRESHOLD is much bigger than KARATSUBA_MUL_THRESHOLD then the
1090  *        recursion in mpn_toom3_mul_n() will always bottom out with mpn_kara_mul_n()
1091  *        because the "n < KARATSUBA_MUL_THRESHOLD" test here will always be false.
1092  */
1093
1094 #define TOOM3_MUL_REC(p, a, b, n, ws) \
1095   do {                                                          \
1096     if (n < KARATSUBA_MUL_THRESHOLD)                            \
1097       mpn_mul_basecase (p, a, n, b, n);                         \
1098     else if (n < TOOM3_MUL_THRESHOLD)                           \
1099       mpn_kara_mul_n (p, a, b, n, ws);                          \
1100     else                                                        \
1101       mpn_toom3_mul_n (p, a, b, n, ws);                         \
1102   } while (0)
1103
1104 void
1105 #if __STDC__
1106 mpn_toom3_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
1107 #else
1108 mpn_toom3_mul_n (p, a, b, n, ws)
1109      mp_ptr    p;
1110      mp_srcptr a;
1111      mp_srcptr b;
1112      mp_size_t n;
1113      mp_ptr    ws;
1114 #endif
1115 {
1116   mp_limb_t cB,cC,cD, dB,dC,dD, tB,tC,tD;
1117   mp_limb_t *A,*B,*C,*D,*E, *W;
1118   mp_size_t l,l2,l3,l4,l5,ls;
1119
1120   /* Break n words into chunks of size l, l and ls.
1121    * n = 3*k   => l = k,   ls = k
1122    * n = 3*k+1 => l = k+1, ls = k-1
1123    * n = 3*k+2 => l = k+1, ls = k
1124    */
1125   {
1126     mp_limb_t m;
1127
1128     ASSERT (n >= TOOM3_MUL_THRESHOLD);
1129     l = ls = n / 3;
1130     m = n - l * 3;
1131     if (m != 0)
1132       ++l;
1133     if (m == 1)
1134       --ls;
1135
1136     l2 = l * 2;
1137     l3 = l * 3;
1138     l4 = l * 4;
1139     l5 = l * 5;
1140     A = p;
1141     B = ws;
1142     C = p + l2;
1143     D = ws + l2;
1144     E = p + l4;
1145     W = ws + l4;
1146   }
1147
1148   /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/
1149   evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls);
1150   evaluate3 (A + l, B + l, C + l, &dB, &dC, &dD, b, b + l, b + l2, l, ls);
1151
1152   /** Second stage: pointwise multiplies. **/
1153   TOOM3_MUL_REC(D, C, C + l, l, W);
1154   tD = cD*dD;
1155   if (cD) tD += mpn_addmul_1 (D + l, C + l, l, cD);
1156   if (dD) tD += mpn_addmul_1 (D + l, C, l, dD);
1157   ASSERT (tD < 49);
1158   TOOM3_MUL_REC(C, B, B + l, l, W);
1159   tC = cC*dC;
1160   /* TO DO: choose one of the following alternatives. */
1161 #if 0
1162   if (cC) tC += mpn_addmul_1 (C + l, B + l, l, cC);
1163   if (dC) tC += mpn_addmul_1 (C + l, B, l, dC);
1164 #else
1165   if (cC)
1166     {
1167       if (cC == 1) tC += mpn_add_n (C + l, C + l, B + l, l);
1168       else tC += add2Times (C + l, C + l, B + l, l);
1169     }
1170   if (dC)
1171     {
1172       if (dC == 1) tC += mpn_add_n (C + l, C + l, B, l);
1173       else tC += add2Times (C + l, C + l, B, l);
1174     }
1175 #endif
1176   ASSERT (tC < 9);
1177   TOOM3_MUL_REC(B, A, A + l, l, W);
1178   tB = cB*dB;
1179   if (cB) tB += mpn_addmul_1 (B + l, A + l, l, cB);
1180   if (dB) tB += mpn_addmul_1 (B + l, A, l, dB);
1181   ASSERT (tB < 49);
1182   TOOM3_MUL_REC(A, a, b, l, W);
1183   TOOM3_MUL_REC(E, a + l2, b + l2, ls, W);
1184
1185   /** Third stage: interpolation. **/
1186   interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1);
1187
1188   /** Final stage: add up the coefficients. **/
1189   {
1190     mp_limb_t i, x, y;
1191     tB += mpn_add_n (p + l, p + l, B, l2);
1192     tD += mpn_add_n (p + l3, p + l3, D, l2);
1193     mpn_incr_u (p + l3, tB);
1194     mpn_incr_u (p + l4, tC);
1195     mpn_incr_u (p + l5, tD);
1196   }
1197 }
1198
1199 /*-- mpn_toom3_sqr_n --------------------------------------------------------------*/
1200
1201 /* Like previous function but for squaring */
1202
1203 #define TOOM3_SQR_REC(p, a, n, ws) \
1204   do {                                                          \
1205     if (n < KARATSUBA_SQR_THRESHOLD)                            \
1206       mpn_sqr_basecase (p, a, n);                               \
1207     else if (n < TOOM3_SQR_THRESHOLD)                           \
1208       mpn_kara_sqr_n (p, a, n, ws);                             \
1209     else                                                        \
1210       mpn_toom3_sqr_n (p, a, n, ws);                            \
1211   } while (0)
1212
1213 void
1214 #if __STDC__
1215 mpn_toom3_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
1216 #else
1217 mpn_toom3_sqr_n (p, a, n, ws)
1218      mp_ptr    p;
1219      mp_srcptr a;
1220      mp_size_t n;
1221      mp_ptr    ws;
1222 #endif
1223 {
1224   mp_limb_t cB,cC,cD, tB,tC,tD;
1225   mp_limb_t *A,*B,*C,*D,*E, *W;
1226   mp_size_t l,l2,l3,l4,l5,ls;
1227
1228   /* Break n words into chunks of size l, l and ls.
1229    * n = 3*k   => l = k,   ls = k
1230    * n = 3*k+1 => l = k+1, ls = k-1
1231    * n = 3*k+2 => l = k+1, ls = k
1232    */
1233   {
1234     mp_limb_t m;
1235
1236     ASSERT (n >= TOOM3_MUL_THRESHOLD);
1237     l = ls = n / 3;
1238     m = n - l * 3;
1239     if (m != 0)
1240       ++l;
1241     if (m == 1)
1242       --ls;
1243
1244     l2 = l * 2;
1245     l3 = l * 3;
1246     l4 = l * 4;
1247     l5 = l * 5;
1248     A = p;
1249     B = ws;
1250     C = p + l2;
1251     D = ws + l2;
1252     E = p + l4;
1253     W = ws + l4;
1254   }
1255
1256   /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/
1257   evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls);
1258
1259   /** Second stage: pointwise multiplies. **/
1260   TOOM3_SQR_REC(D, C, l, W);
1261   tD = cD*cD;
1262   if (cD) tD += mpn_addmul_1 (D + l, C, l, 2*cD);
1263   ASSERT (tD < 49);
1264   TOOM3_SQR_REC(C, B, l, W);
1265   tC = cC*cC;
1266   /* TO DO: choose one of the following alternatives. */
1267 #if 0
1268   if (cC) tC += mpn_addmul_1 (C + l, B, l, 2*cC);
1269 #else
1270   if (cC >= 1)
1271     {
1272       tC += add2Times (C + l, C + l, B, l);
1273       if (cC == 2)
1274         tC += add2Times (C + l, C + l, B, l);
1275     }
1276 #endif
1277   ASSERT (tC < 9);
1278   TOOM3_SQR_REC(B, A, l, W);
1279   tB = cB*cB;
1280   if (cB) tB += mpn_addmul_1 (B + l, A, l, 2*cB);
1281   ASSERT (tB < 49);
1282   TOOM3_SQR_REC(A, a, l, W);
1283   TOOM3_SQR_REC(E, a + l2, ls, W);
1284
1285   /** Third stage: interpolation. **/
1286   interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1);
1287
1288   /** Final stage: add up the coefficients. **/
1289   {
1290     mp_limb_t i, x, y;
1291     tB += mpn_add_n (p + l, p + l, B, l2);
1292     tD += mpn_add_n (p + l3, p + l3, D, l2);
1293     mpn_incr_u (p + l3, tB);
1294     mpn_incr_u (p + l4, tC);
1295     mpn_incr_u (p + l5, tD);
1296   }
1297 }
1298
1299 void
1300 #if __STDC__
1301 mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n)
1302 #else
1303 mpn_mul_n (p, a, b, n)
1304      mp_ptr    p;
1305      mp_srcptr a;
1306      mp_srcptr b;
1307      mp_size_t n;
1308 #endif
1309 {
1310   if (n < KARATSUBA_MUL_THRESHOLD)
1311     mpn_mul_basecase (p, a, n, b, n);
1312   else if (n < TOOM3_MUL_THRESHOLD)
1313     {
1314       /* Allocate workspace of fixed size on stack: fast! */
1315 #if TUNE_PROGRAM_BUILD
1316       mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD_LIMIT-1) + 2 * BITS_PER_MP_LIMB];
1317 #else
1318       mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD-1) + 2 * BITS_PER_MP_LIMB];
1319 #endif
1320       mpn_kara_mul_n (p, a, b, n, ws);
1321     }
1322 #if WANT_FFT || TUNE_PROGRAM_BUILD
1323   else if (n < FFT_MUL_THRESHOLD)
1324 #else
1325   else
1326 #endif
1327     {
1328       /* Use workspace of unknown size in heap, as stack space may
1329        * be limited.  Since n is at least TOOM3_MUL_THRESHOLD, the
1330        * multiplication will take much longer than malloc()/free().  */
1331       mp_limb_t wsLen, *ws;
1332       wsLen = 2 * n + 3 * BITS_PER_MP_LIMB;
1333       ws = (mp_ptr) (*_mp_allocate_func) ((size_t) wsLen * sizeof (mp_limb_t));
1334       mpn_toom3_mul_n (p, a, b, n, ws);
1335       (*_mp_free_func) (ws, (size_t) wsLen * sizeof (mp_limb_t));
1336     }
1337 #if WANT_FFT || TUNE_PROGRAM_BUILD
1338   else
1339     {
1340       mpn_mul_fft_full (p, a, n, b, n);
1341     }
1342 #endif
1343 }