1 dnl AMD K7 mpn_mul_basecase -- multiply two mpn numbers.
3 dnl K7: approx 4.42 cycles per cross product at around 20x20 limbs (16
4 dnl limbs/loop unrolling).
7 dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
9 dnl This file is part of the GNU MP Library.
11 dnl The GNU MP Library is free software; you can redistribute it and/or
12 dnl modify it under the terms of the GNU Lesser General Public License as
13 dnl published by the Free Software Foundation; either version 2.1 of the
14 dnl License, or (at your option) any later version.
16 dnl The GNU MP Library is distributed in the hope that it will be useful,
17 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
18 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 dnl Lesser General Public License for more details.
21 dnl You should have received a copy of the GNU Lesser General Public
22 dnl License along with the GNU MP Library; see the file COPYING.LIB. If
23 dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
24 dnl Suite 330, Boston, MA 02111-1307, USA.
27 include(`../config.m4')
30 dnl K7 UNROLL_COUNT cycles/product (at around 20x20)
34 dnl Maximum possible with the current code is 32.
36 dnl At 32 the typical 13-26 limb sizes from the Karatsuba code will get
37 dnl done with a straight run through a block of code, no inner loop. Using
38 dnl 32 gives 1k of code, but the K7 has a 64k L1 code cache.
40 deflit(UNROLL_COUNT, 32)
43 C void mpn_mul_basecase (mp_ptr wp,
44 C mp_srcptr xp, mp_size_t xsize,
45 C mp_srcptr yp, mp_size_t ysize);
47 C Calculate xp,xsize multiplied by yp,ysize, storing the result in
50 C This routine is essentially the same as mpn/generic/mul_basecase.c, but
51 C it's faster because it does most of the mpn_addmul_1() startup
52 C calculations only once. The saving is 15-25% on typical sizes coming from
53 C the Karatsuba multiply code.
56 deflit(UNROLL_THRESHOLD, 5)
58 deflit(UNROLL_THRESHOLD, 5)
61 defframe(PARAM_YSIZE,20)
62 defframe(PARAM_YP, 16)
63 defframe(PARAM_XSIZE,12)
69 PROLOGUE(mpn_mul_basecase)
72 movl PARAM_XSIZE, %ecx
76 movl (%eax), %eax C yp low limb
79 ja L(xsize_more_than_two)
80 je L(two_by_something)
83 C one limb by one limb
93 C -----------------------------------------------------------------------------
97 pushl %ebx defframe_pushl(`SAVE_EBX')
98 movl %eax, %ecx C yp low limb
101 pushl %esi defframe_pushl(`SAVE_ESI')
104 movl (%edx), %eax C xp low limb
108 C two limbs by one limb
114 movl %edx, %esi C carry
133 C -----------------------------------------------------------------------------
134 C Could load yp earlier into another register.
146 dnl FRAME carries on from previous
148 mull %ecx C xp[0] * yp[0]
150 push %edi defframe_pushl(`SAVE_EDI')
151 movl %edx, %edi C carry, for wp[1]
156 mull %ecx C xp[1] * yp[0]
162 movl 4(%ecx), %ecx C yp[1]
165 movl 4(%esi), %eax C xp[1]
166 movl %edx, %edi C carry, for wp[2]
168 mull %ecx C xp[1] * yp[1]
173 movl (%esi), %eax C xp[0]
175 movl %edx, %esi C carry, for wp[3]
177 mull %ecx C xp[0] * yp[1]
194 C -----------------------------------------------------------------------------
196 L(xsize_more_than_two):
198 C The first limb of yp is processed with a simple mpn_mul_1 style loop
199 C inline. Unrolling this doesn't seem worthwhile since it's only run once
200 C (whereas the addmul below is run ysize-1 many times). A call to the
201 C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
202 C popping, and doesn't seem likely to be worthwhile on the typical 13-26
203 C limb operations the Karatsuba code calls here with.
213 dnl FRAME doesn't carry on from previous, no pushes yet here
214 defframe(`SAVE_EBX',-4)
215 defframe(`SAVE_ESI',-8)
216 defframe(`SAVE_EDI',-12)
217 defframe(`SAVE_EBP',-16)
232 leal (%edx,%ecx,4), %esi C xp end
234 leal (%edi,%ecx,4), %edi C wp end of mul1
241 C ecx counter, negative
247 movl (%esi,%ecx,4), %eax
252 movl %eax, (%edi,%ecx,4)
260 movl PARAM_YSIZE, %edx
261 movl PARAM_XSIZE, %ecx
263 movl %ebx, (%edi) C final carry
266 jnz L(ysize_more_than_one)
279 L(ysize_more_than_one):
280 cmpl $UNROLL_THRESHOLD, %ecx
286 C -----------------------------------------------------------------------------
287 C simple addmul looping
297 leal 4(%eax,%edx,4), %ebp C yp end
301 movl (%esi,%ecx,4), %eax C xp low limb
302 movl %edx, PARAM_YSIZE C -(ysize-1)
305 xorl %ebx, %ebx C initial carry
306 movl %ecx, PARAM_XSIZE C -(xsize-1)
309 movl (%ebp,%edx,4), %ebp C yp second lowest limb - multiplier
310 jmp L(simple_outer_entry)
313 C this is offset 0x121 so close enough to aligned
315 C ebp ysize counter, negative
318 movl PARAM_XSIZE, %ecx C -(xsize-1)
319 xorl %ebx, %ebx C carry
321 movl %ebp, PARAM_YSIZE
322 addl $4, %edi C next position in wp
324 movl (%edx,%ebp,4), %ebp C yp limb - multiplier
325 movl -4(%esi,%ecx,4), %eax C xp low limb
328 L(simple_outer_entry):
333 C ecx loop counter (negative)
344 addl %ebx, (%edi,%ecx,4)
345 movl (%esi,%ecx,4), %eax
355 movl PARAM_YSIZE, %ebp
365 jnz L(simple_outer_top)
379 C -----------------------------------------------------------------------------
381 C The unrolled loop is the same as in mpn_addmul_1(), see that code for some
384 C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
385 C increment xp and wp. This is used to adjust back xp and wp, and rshifted
386 C to give an initial VAR_COUNTER at the top of the outer loop.
388 C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
389 C up to -1, inclusive.
391 C VAR_JMP is the computed jump into the unrolled loop.
393 C VAR_XP_LOW is the least significant limb of xp, which is needed at the
394 C start of the unrolled loop.
396 C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
399 C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
400 C added to give the location of the next limb of yp, which is the multiplier
401 C in the unrolled loop.
403 C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
404 C outer loop to take care of xp, wp and the inner loop counter.
406 defframe(VAR_COUNTER, -20)
407 defframe(VAR_ADJUST, -24)
408 defframe(VAR_JMP, -28)
409 defframe(VAR_XP_LOW, -32)
410 deflit(VAR_EXTRA_SPACE, 16)
423 movl 4(%eax), %ebp C multiplier (yp second limb)
424 leal 4(%eax,%edx,4), %eax C yp adjust for ysize indexing
430 movl %edx, PARAM_YSIZE
431 leal UNROLL_COUNT-2(%ecx), %ebx C (xsize-1)+UNROLL_COUNT-1
434 movl (%esi), %eax C xp low limb
435 andl $-UNROLL_MASK-1, %ebx
438 subl $VAR_EXTRA_SPACE, %esp
439 deflit(`FRAME',16+VAR_EXTRA_SPACE)
441 andl $UNROLL_MASK, %ecx
443 movl %ebx, VAR_ADJUST
447 sarl $UNROLL_LOG2, %ebx
449 C 17 code bytes per limb
454 leal L(unroll_entry) (%ecx,%edx,1), %ecx
458 movl %eax, VAR_XP_LOW
460 leal 4(%edi,%edx,4), %edi C wp and xp, adjust for unrolling,
461 leal 4(%esi,%edx,4), %esi C and start at second limb
462 jmp L(unroll_outer_entry)
467 C See README.family about old gas bugs
468 leal (%ecx,%edx,1), %ecx
469 addl $L(unroll_entry)-L(unroll_here), %ecx
475 C --------------------------------------------------------------------------
478 C ebp ysize counter, negative
480 movl VAR_ADJUST, %ebx
483 movl VAR_XP_LOW, %eax
484 movl %ebp, PARAM_YSIZE C store incremented ysize counter
486 leal 4(%edi,%ebx,4), %edi
487 leal (%esi,%ebx,4), %esi
488 sarl $UNROLL_LOG2, %ebx
490 movl (%edx,%ebp,4), %ebp C yp next multiplier
493 L(unroll_outer_entry):
496 testb $1, %cl C and clear carry bit
497 movl %ebx, VAR_COUNTER
501 cmovz( %eax, %ecx) C eax into low carry, zero into high carry limb
504 C Extra fetch of VAR_JMP is bad, but registers are tight
508 C -----------------------------------------------------------------------------
517 C ebp yp multiplier limb
519 C VAR_COUNTER loop counter, negative
525 deflit(CHUNK_COUNT,2)
526 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
527 deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
528 deflit(`disp1', eval(disp0 + 4))
530 Zdisp( movl, disp0,(%esi), %eax)
535 Zdisp( addl, %ecx, disp0,(%edi))
541 movl disp1(%esi), %eax
546 addl %ebx, disp1(%edi)
554 leal UNROLL_BYTES(%esi), %esi
555 leal UNROLL_BYTES(%edi), %edi
565 C edi wp, pointing at second last limb
568 C carry flag to be added to high
570 deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
571 deflit(`disp1', eval(disp0-0 + 4))
573 movl PARAM_YSIZE, %ebp
575 addl %ecx, disp0(%edi)
580 movl %edx, disp1(%edi)
581 jnz L(unroll_outer_top)