1 /* mpn_mul -- Multiply two natural numbers.
3 Copyright (C) 1991, 1992 Free Software Foundation, Inc.
5 This file is part of the GNU MP Library.
7 The GNU MP Library is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 The GNU MP Library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with the GNU MP Library; see the file COPYING. If not, write to
19 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */
25 #ifdef GMP_DEBUG /* partain: was DEBUG */
/* Debug-only checker: recompute RES = OP1 * OP2 with the straightforward
   classical algorithm (mpn_mul_classic) and compare against the product
   mpn_mul stored, dumping both operands to stderr on a mismatch.
   NOTE(review): this chunk is a fragmentary extraction -- the original
   line numbers fused into each line jump (27 -> 31, 33 -> 37, 45 -> 47,
   49 -> 54), so the function's braces, local declarations (tmp_ptr,
   tmp_size), the `else` keywords, the abort/closing lines and the #else
   are not visible here.  Comments below annotate only the visible lines. */
26 #define MPN_MUL_VERIFY(res_ptr,res_size,op1_ptr,op1_size,op2_ptr,op2_size) \
27 mpn_mul_verify (res_ptr, res_size, op1_ptr, op1_size, op2_ptr, op2_size)
/* K&R-style definition: parameter types are declared after the list. */
31 mpn_mul_verify (res_ptr, res_size, op1_ptr, op1_size, op2_ptr, op2_size)
32 mp_ptr res_ptr, op1_ptr, op2_ptr;
33 mp_size res_size, op1_size, op2_size;
/* Scratch big enough for the full op1_size + op2_size limb product. */
37 tmp_ptr = alloca ((op1_size + op2_size) * BYTES_PER_MP_LIMB);
/* Presumably mpn_mul_classic wants its first operand to be the longer
   one, hence the operand swap below -- TODO confirm in the full source. */
38 if (op1_size >= op2_size)
39 tmp_size = mpn_mul_classic (tmp_ptr,
40 op1_ptr, op1_size, op2_ptr, op2_size);
/* (the `else` line between these two calls was dropped by extraction) */
42 tmp_size = mpn_mul_classic (tmp_ptr,
43 op2_ptr, op2_size, op1_ptr, op1_size);
/* A differing size or differing limbs means mpn_mul computed a wrong
   product; report it and dump both operands for reproduction. */
44 if (tmp_size != res_size
45 || mpn_cmp (tmp_ptr, res_ptr, tmp_size) != 0)
47 fprintf (stderr, "GNU MP internal error: Wrong result in mpn_mul.\n");
48 fprintf (stderr, "op1{%d} = ", op1_size); mpn_dump (op1_ptr, op1_size);
49 fprintf (stderr, "op2{%d} = ", op2_size); mpn_dump (op2_ptr, op2_size);
/* Non-debug build: the verification hook expands to nothing. */
54 #define MPN_MUL_VERIFY(a,b,c,d,e,f)
57 /* Multiply the natural numbers u (pointed to by UP, with USIZE limbs)
58 and v (pointed to by VP, with VSIZE limbs), and store the result at
59 PRODP. USIZE + VSIZE limbs are always stored, but if the input
60 operands are normalized, the return value will reflect the true
60 operands are normalized, the return value will reflect the true
63 NOTE: The space pointed to by PRODP is overwritten before finished
64 with U and V, so overlap is an error.
68 2. PRODP != UP and PRODP != VP, i.e. the destination
69 must be distinct from the multiplier and the multiplicand. */
71 /* If KARATSUBA_THRESHOLD is not already defined, define it to a
72 value which is good on most machines. */
73 #ifndef KARATSUBA_THRESHOLD
74 #define KARATSUBA_THRESHOLD 8
77 /* The code can't handle KARATSUBA_THRESHOLD smaller than 4. */
78 #if KARATSUBA_THRESHOLD < 4
79 #undef KARATSUBA_THRESHOLD
80 #define KARATSUBA_THRESHOLD 4
/* mpn_mul: multiply the natural numbers {up, usize} and {vp, vsize} and
   store the usize + vsize limb product at prodp, returning the true result
   size (see the header comment earlier in the file).  Strategy visible in
   this extract: classical schoolbook multiplication below
   KARATSUBA_THRESHOLD, an unbalanced split of U when usize exceeds
   1.5 * vsize, and Karatsuba's divide-and-conquer otherwise.
   NOTE(review): this chunk is a fragmentary extraction -- the original
   line numbers fused into each line jump (87 -> 89, 89 -> 101, ...), so
   braces, local declarations, `else` lines, the assignment of n, and the
   final return are not visible.  Comments added below annotate only what
   the visible lines show; anything hidden by dropped lines is hedged. */
85 mpn_mul (mp_ptr prodp,
86 mp_srcptr up, mp_size usize,
87 mp_srcptr vp, mp_size vsize)
/* K&R-style definition line; the prototyped vs. K&R variants are
   presumably selected by a preprocessor conditional dropped here. */
89 mpn_mul (prodp, up, usize, vp, vsize)
/* ---- Base case: V is short, use the classical algorithm. ---- */
101 if (vsize < KARATSUBA_THRESHOLD)
103 /* Handle simple cases with traditional multiplication.
105 This is the most critical code of the entire function. All
106 multiplies rely on this, both small and huge. Small ones arrive
107 here immediately. Huge ones arrive here as this is the base case
108 for the recursive algorithm below. */
110 mp_limb prod_low, prod_high;
117 /* Offset UP and PRODP so that the inner loop can be faster. */
121 /* Multiply by the first limb in V separately, as the result can
122 be stored (not added) to PROD. We also avoid a loop for zeroing. */
/* Presumably v_limb == 1 selects the copy and v_limb == 0 the zeroing;
   the guarding conditionals are among the dropped lines -- confirm
   against the full source. */
127 MPN_COPY (prodp - usize, up - usize, usize);
129 MPN_ZERO (prodp - usize, usize);
/* General first-limb case: one multiply-and-accumulate per limb of U,
   with cy_limb carrying between limb positions. */
138 umul_ppmm (prod_high, prod_low, up[j], v_limb);
139 add_ssaaaa (cy_limb, prodp[j], prod_high, prod_low, 0, cy_limb);
148 /* For each iteration in the outer loop, multiply one limb from
149 U with one limb from V, and add it to PROD. */
150 for (i = 1; i < vsize; i++)
/* Presumably the v_limb <= 1 shortcut: adding U shifted into place is
   cheaper than multiplying by 1 -- guard dropped, confirm in full file. */
157 cy_limb = mpn_add (prodp - usize,
158 prodp - usize, usize, up - usize, usize);
/* Inner multiply-accumulate: prodp[j] += up[j] * v_limb + carry, done
   as a double-limb multiply followed by two double-limb additions. */
167 umul_ppmm (prod_high, prod_low, up[j], v_limb);
168 add_ssaaaa (cy_limb, prod_low,
169 prod_high, prod_low, 0, cy_limb);
170 add_ssaaaa (cy_limb, prodp[j],
171 cy_limb, prod_low, 0, prodp[j]);
/* When the top limb (cy_limb) is zero the true size is one limb less. */
181 return usize + vsize - (cy_limb == 0);
/* ---- Unbalanced case: U much longer than V. ---- */
186 /* Is USIZE larger than 1.5 times VSIZE? Avoid Karatsuba's algorithm. */
187 if (2 * usize > 3 * vsize)
189 /* If U has at least twice as many limbs as V. Split U in two
190 pieces, U1 and U0, such that U = U0 + U1*(2**BITS_PER_MP_LIMB)**N,
191 and recursively multiply the two pieces separately with V. */
197 /* V1 (the high part of V) is zero. */
199 /* Calculate the length of U0. It is normally equal to n, but
200 of course not for sure. */
/* Strip high zero limbs so the recursive operand is normalized. */
201 for (u0_size = n; u0_size > 0 && up[u0_size - 1] == 0; u0_size--)
204 /* Perform (U0 * V). */
/* Larger source operand goes first (same requirement stated at the
   U1 * V call below), hence the swap. */
205 if (u0_size >= vsize)
206 prod_size = mpn_mul (prodp, up, u0_size, vp, vsize);
208 prod_size = mpn_mul (prodp, vp, vsize, up, u0_size);
209 MPN_MUL_VERIFY (prodp, prod_size, up, u0_size, vp, vsize);
211 /* We have to zero-extend the lower partial product to n limbs,
212 since the mpn_add some lines below expect the first n limbs
213 to be well defined. (This is normally a no-op. It may
214 do something when U1 has many leading 0 limbs.) */
215 while (prod_size < n)
216 prodp[prod_size++] = 0;
/* Scratch for the high partial product U1 * V: usize - n + vsize limbs. */
218 tmp = (mp_ptr) alloca ((usize + vsize - n) * BYTES_PER_MP_LIMB);
220 /* Perform (U1 * V). Make sure the first source argument to mpn_mul
221 is not less than the second source argument. */
222 if (vsize <= usize - n)
223 tmp_size = mpn_mul (tmp, up + n, usize - n, vp, vsize);
225 tmp_size = mpn_mul (tmp, vp, vsize, up + n, usize - n);
226 MPN_MUL_VERIFY (tmp, tmp_size, up + n, usize - n, vp, vsize);
228 /* In this addition hides a potentially large copying of TMP. */
/* Add the high partial product at offset n; mpn_add wants its longer
   source first. */
229 if (prod_size - n >= tmp_size)
230 cy = mpn_add (prodp + n, prodp + n, prod_size - n, tmp, tmp_size);
232 cy = mpn_add (prodp + n, tmp, tmp_size, prodp + n, prod_size - n);
/* The commented-out store suggests a carry out is presumably impossible
   here for valid operands, so it is treated as an internal error. */
234 abort (); /* prodp[prod_size] = cy; */
/* ---- Balanced case: Karatsuba's divide-and-conquer. ---- */
241 /* Karatsuba's divide-and-conquer algorithm.
243 Split U in two pieces, U1 and U0, such that
245 and V in V1 and V0, such that
248 UV is then computed recursively using the identity
251 UV = (B + B )U V + B (U -U )(V -V ) + (B + 1)U V
254 Where B = 2**BITS_PER_MP_LIMB.
/* Readable form of the identity above (its exponents and subscripts
   were scattered by extraction):
     UV = (B^(2n) + B^n)*U1*V1 + B^n*(U1-U0)*(V0-V1) + (B^n + 1)*U0*V0
   with B = 2^BITS_PER_MP_LIMB; the split point n is assigned on a line
   dropped from this extract (presumably about half of vsize). */
257 /* It's possible to decrease the temporary allocation by using the
258 prodp area for temporary storage of the middle term, and doing
259 that recursive multiplication first. (Do this later.) */
/* 4n-limb scratch: low 2n limbs hold U0*V0, high 2n limbs U1*V1. */
274 pp = (mp_ptr) alloca (4 * n * BYTES_PER_MP_LIMB);
276 /* Calculate the lengths of U0 and V0. They are normally equal
277 to n, but of course not for sure. */
278 for (u0_size = n; u0_size > 0 && up[u0_size - 1] == 0; u0_size--)
280 for (v0_size = n; v0_size > 0 && vp[v0_size - 1] == 0; v0_size--)
283 /*** 1. PROD]2n..0] := U0 x V0
284 (Recursive call to mpn_mul may NOT overwrite input operands.)
285 ________________ ________________
286 |________________||____U0 x V0_____| */
288 if (u0_size >= v0_size)
289 u0v0_size = mpn_mul (pp, up, u0_size, vp, v0_size);
291 u0v0_size = mpn_mul (pp, vp, v0_size, up, u0_size);
292 MPN_MUL_VERIFY (pp, u0v0_size, up, u0_size, vp, v0_size);
294 /* Zero-extend to 2n limbs. */
295 while (u0v0_size < 2 * n)
299 /*** 2. PROD]4n..2n] := U1 x V1
300 (Recursive call to mpn_mul may NOT overwrite input operands.)
301 ________________ ________________
302 |_____U1 x V1____||____U0 x V0_____| */
304 u1v1_size = mpn_mul (pp + 2*n,
307 MPN_MUL_VERIFY (pp + 2*n, u1v1_size,
308 up + n, usize - n, vp + n, vsize - n);
309 prod_size = 2 * n + u1v1_size;
312 /*** 3. PTEM]2n..0] := (U1-U0) x (V0-V1)
313 (Recursive call to mpn_mul may overwrite input operands.)
315 |_(U1-U0)(V0-V1)_| */
/* Form |U1-U0| (temp, low n limbs) and |V0-V1| (temp + n), always
   subtracting smaller from larger; the sign of the middle term is
   presumably tracked in NEGFLG, set on lines dropped from this
   extract (see the "No change of NEGFLG" comment below). */
317 temp = (mp_ptr) alloca ((2 * n + 1) * BYTES_PER_MP_LIMB);
318 if (usize - n > u0_size
319 || (usize - n == u0_size
320 && mpn_cmp (up + n, up, u0_size) >= 0))
322 utem_size = usize - n
323 + mpn_sub (temp, up + n, usize - n, up, u0_size);
/* (else branch -- U0 > U1, subtract the other way round; its opening
   lines were dropped by extraction) */
329 + mpn_sub (temp, up, u0_size, up + n, usize - n);
332 if (vsize - n > v0_size
333 || (vsize - n == v0_size
334 && mpn_cmp (vp + n, vp, v0_size) >= 0))
336 vtem_size = vsize - n
337 + mpn_sub (temp + n, vp + n, vsize - n, vp, v0_size);
343 + mpn_sub (temp + n, vp, v0_size, vp + n, vsize - n);
344 /* No change of NEGFLG. */
/* Middle term |U1-U0| * |V0-V1|; larger operand first, as elsewhere. */
346 ptem = (mp_ptr) alloca (2 * n * BYTES_PER_MP_LIMB);
347 if (utem_size >= vtem_size)
348 ptem_size = mpn_mul (ptem, temp, utem_size, temp + n, vtem_size);
350 ptem_size = mpn_mul (ptem, temp + n, vtem_size, temp, utem_size);
351 MPN_MUL_VERIFY (ptem, ptem_size, temp, utem_size, temp + n, vtem_size);
353 /*** 4. TEMP]2n..0] := PROD]2n..0] + PROD]4n..2n]
357 |_____U0_x_V0____| */
359 cy = mpn_add (temp, pp, 2*n, pp + 2*n, u1v1_size);
367 /* Normalize temp. pp[2*n-1] might have been zero in the
368 mpn_add call above, and thus temp might be unnormalized. */
369 for (temp_size = 2*n; temp_size > 0 && temp[temp_size - 1] == 0;
/* Add U0*V0 + U1*V1 into the product at offset n -- this realizes the
   B^n weights of the identity restated above. */
374 if (prod_size - n >= temp_size)
375 cy = mpn_add (pp + n, pp + n, prod_size - n, temp, temp_size);
378 /* This is a weird special case that should not happen (often)! */
379 cy = mpn_add (pp + n, temp, temp_size, pp + n, prod_size - n);
380 prod_size = temp_size + n;
387 #ifdef GMP_DEBUG /* partain: was DEBUG */
/* Sanity check: the product must fit the 4n-limb scratch area. */
388 if (prod_size > 4 * n)
/* Middle-term application: subtract when NEGFLG is set (its guard was
   dropped from this extract), otherwise add below. */
392 prod_size = prod_size
393 + mpn_sub (pp + n, pp + n, prod_size - n, ptem, ptem_size);
396 if (prod_size - n < ptem_size)
398 cy = mpn_add (pp + n, pp + n, prod_size - n, ptem, ptem_size);
403 #ifdef GMP_DEBUG /* partain: was DEBUG */
404 if (prod_size > 4 * n)
/* Copy the finished product from scratch to the caller's destination. */
410 MPN_COPY (prodp, pp, prod_size);