1 dnl AMD K6 mpn_sqr_basecase -- square an mpn number.
3 dnl K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular
4 dnl product (measured on the speed difference between 17 and 33 limbs,
5 dnl which is roughly the Karatsuba recursing range).
8 dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
10 dnl This file is part of the GNU MP Library.
12 dnl The GNU MP Library is free software; you can redistribute it and/or
13 dnl modify it under the terms of the GNU Lesser General Public License as
14 dnl published by the Free Software Foundation; either version 2.1 of the
15 dnl License, or (at your option) any later version.
17 dnl The GNU MP Library is distributed in the hope that it will be useful,
18 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
19 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 dnl Lesser General Public License for more details.
22 dnl You should have received a copy of the GNU Lesser General Public
23 dnl License along with the GNU MP Library; see the file COPYING.LIB. If
24 dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
25 dnl Suite 330, Boston, MA 02111-1307, USA.
28 include(`../config.m4')
31 dnl KARATSUBA_SQR_THRESHOLD_MAX is the maximum KARATSUBA_SQR_THRESHOLD this
32 dnl code supports. This value is used only by the tune program to know
33 dnl what it can go up to. (An attempt to compile with a bigger value will
34 dnl trigger some m4_assert()s in the code, making the build fail.)
36 dnl The value is determined by requiring the displacements in the unrolled
37 dnl addmul to fit in single bytes. This means a maximum UNROLL_COUNT of
38 dnl 63, giving a maximum KARATSUBA_SQR_THRESHOLD of 66.
40 deflit(KARATSUBA_SQR_THRESHOLD_MAX, 66)
43 dnl Allow a value from the tune program to override config.m4.
45 ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE',
46 `define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)')
49 dnl UNROLL_COUNT is the number of code chunks in the unrolled addmul. The
50 dnl number required is determined by KARATSUBA_SQR_THRESHOLD, since
51 dnl mpn_sqr_basecase only needs to handle sizes < KARATSUBA_SQR_THRESHOLD.
53 dnl The first addmul is the biggest, and this takes the second least
54 dnl significant limb and multiplies it by the third least significant and
55 dnl up. Hence for a maximum operand size of KARATSUBA_SQR_THRESHOLD-1
56 dnl limbs, UNROLL_COUNT needs to be KARATSUBA_SQR_THRESHOLD-3.
58 m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD')
59 deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3))
62 C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
64 C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a
65 C lot of function call overheads are avoided, especially when the given size
66 C is small.
68 C The code size might look a bit excessive, but not all of it is executed
69 C and so won't fill up the code cache. The 1x1, 2x2 and 3x3 special cases
70 C clearly apply only to those sizes; mid sizes like 10x10 only need part of
71 C the unrolled addmul; and big sizes like 35x35 that do need all of it will
72 C at least be getting value for money, because 35x35 spends something like
73 C 5780 cycles here.
75 C Different values of UNROLL_COUNT give slightly different speeds, between
76 C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs.
77 C This isn't a big difference, but it's presumably some alignment effect
78 C which if understood could give a simple speedup.
79 dnl Stack offsets of the cdecl arguments (return address at 0(%esp)):
80 defframe(PARAM_SIZE,12)
81 defframe(PARAM_SRC, 8)
82 defframe(PARAM_DST, 4)
86 PROLOGUE(mpn_sqr_basecase)
99 C -----------------------------------------------------------------------------
116 C -----------------------------------------------------------------------------
125 movl %eax, %ebx C src
146 mull %edx C src[0]*src[1]
162 C -----------------------------------------------------------------------------
169 C -----------------------------------------------------------------------------
176 movl %eax, %ebx C src
179 movl %edx, %ecx C dst
181 mull %eax C src[0] ^ 2
189 mull %eax C src[1] ^ 2
197 mull %eax C src[2] ^ 2
205 mull %edx C src[0] * src[1]
216 mull %edx C src[0] * src[2]
225 mull %edx C src[1] * src[2]
265 C -----------------------------------------------------------------------------
267 defframe(SAVE_EBX, -4)
268 defframe(SAVE_ESI, -8)
269 defframe(SAVE_EDI, -12)
270 defframe(SAVE_EBP, -16)
271 defframe(VAR_COUNTER,-20)
272 defframe(VAR_JMP, -24)
273 deflit(STACK_SPACE, 24)
286 C First multiply src[0]*src[1..size-1] and store at dst[1..size].
288 C A test was done calling mpn_mul_1 here to get the benefit of its unrolled
289 C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off
290 C a 5780 cycle operation, which is not surprising since the loop here is 8
291 C c/l and mpn_mul_1 is 6.25 c/l.
293 subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
314 movl (%eax), %ebp C multiplier
315 leal -1(%ecx), %ecx C size-1, and pad to a 16 byte boundary
344 C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
346 C The last two addmuls, which are the bottom right corner of the product
347 C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
348 C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
349 C cases that need to be done.
351 C The unrolled code is the same as mpn_addmul_1(), see that routine for some
354 C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
356 C VAR_JMP is the computed jump into the unrolled code, stepped by one code
357 C chunk each outer loop.
359 C K6 doesn't do any branch prediction on indirect jumps, which is good
360 C actually because it's a different target each time. The unrolled addmul
361 C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of
362 C the indirect jump is quickly recovered.
365 dnl This value is also implicitly encoded in a shift and add.
367 deflit(CODE_BYTES_PER_LIMB, 15)
369 dnl With the unmodified &src[size] and &dst[size] pointers, the
370 dnl displacements in the unrolled code fit in a byte for UNROLL_COUNT
371 dnl values up to 31. Above that an offset must be added to them.
374 ifelse(eval(UNROLL_COUNT>31),1,
375 eval((UNROLL_COUNT-31)*4),
386 movl PARAM_SIZE, %ecx
394 ` subl $OFFSET, %ebx')
398 ` subl $OFFSET, %edi')
406 leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
411 C The calculated jump mustn't be before the start of the available
412 C code. This is the limitation UNROLL_COUNT puts on the src operand
413 C size, but checked here using the jump address directly.
416 movl_text_address( L(unroll_inner_start), %eax)
421 C -----------------------------------------------------------------------------
425 C ebx &src[size], constant
427 C edx VAR_COUNTER, limbs, negative
428 C esi high limb to store
429 C edi dst ptr, high of last addmul
432 movl -12+OFFSET(%ebx,%edx,4), %ebp C multiplier
433 movl %edx, VAR_COUNTER
435 movl -8+OFFSET(%ebx,%edx,4), %eax C first limb of multiplicand
441 movl %edx, %esi C high carry
442 movl %ecx, %edx C jump
444 movl %eax, %ecx C low carry
445 leal CODE_BYTES_PER_LIMB(%edx), %edx
450 C A branch-free version of this using some xors was found to be a
451 C touch slower than just a conditional jump, despite the jump
452 C switching between taken and not taken on every loop.
454 ifelse(eval(UNROLL_COUNT%2),0,
455 jz,jnz) L(unroll_noswap)
456 movl %esi, %eax C high,low carry other way around
465 C Must be on an even address here so the low bit of the jump address
466 C will indicate which way around ecx/esi should start.
468 C An attempt was made at padding here to get the end of the unrolled
469 C code to come out on a good alignment, to save padding before
470 C L(corner). This worked, but turned out to run slower than just an
471 C ALIGN(2). The reason for this is not clear, it might be related
472 C to the different speeds on different UNROLL_COUNTs noted above.
476 L(unroll_inner_start):
485 C 15 code bytes each limb
486 C ecx/esi swapped on each chunk
488 forloop(`i', UNROLL_COUNT, 1, `
489 deflit(`disp_src', eval(-i*4 + OFFSET))
490 deflit(`disp_dst', eval(disp_src - 4))
492 m4_assert(`disp_src>=-128 && disp_src<128')
493 m4_assert(`disp_dst>=-128 && disp_dst<128')
496 Zdisp( movl, disp_src,(%ebx), %eax)
498 Zdisp( addl, %esi, disp_dst,(%edi))
503 dnl this one comes out last
504 Zdisp( movl, disp_src,(%ebx), %eax)
506 Zdisp( addl, %ecx, disp_dst,(%edi))
514 addl %esi, -4+OFFSET(%edi)
516 movl VAR_COUNTER, %edx
519 movl %ecx, m4_empty_if_zero(OFFSET)(%edi)
523 jnz L(unroll_outer_top)
532 C -----------------------------------------------------------------------------
573 C -----------------------------------------------------------------------------
574 C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
575 C The loop measures about 6 cycles/iteration, though it looks like it should
576 C be able to do better.
579 movl PARAM_SIZE, %ecx
582 subl $1, %ecx C size-1 and clear carry
587 xorl %eax, %eax C ready for adcl
593 C ebx src (for later use)
594 C ecx counter, decrementing
595 C edx size-1 (for later use)
597 C edi dst, incrementing
608 movl %eax, 4(%edi) C dst most significant limb
609 movl (%ebx), %eax C src[0]
611 leal 4(%ebx,%edx,4), %ebx C &src[size]
612 subl %edx, %ecx C -(size-1)
615 C -----------------------------------------------------------------------------
616 C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
617 C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
618 C low limb of src[0]^2.
623 movl %eax, (%edi,%ecx,8) C dst[0]
630 C ecx counter, negative
636 movl (%ebx,%ecx,4), %eax
641 addl %esi, 4(%edi,%ecx,8)
642 adcl %eax, 8(%edi,%ecx,8)
652 addl %edx, 4(%edi) C dst most significant limb
661 C -----------------------------------------------------------------------------
664 C See README.family about old gas bugs
666 addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx