1 dnl Intel P6 mpn_sqr_basecase -- square an mpn number.
3 dnl P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular
4 dnl product (measured on the speed difference between 20 and 40 limbs,
5 dnl which is the Karatsuba recursing range).
8 dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
10 dnl This file is part of the GNU MP Library.
12 dnl The GNU MP Library is free software; you can redistribute it and/or
13 dnl modify it under the terms of the GNU Lesser General Public License as
14 dnl published by the Free Software Foundation; either version 2.1 of the
15 dnl License, or (at your option) any later version.
17 dnl The GNU MP Library is distributed in the hope that it will be useful,
18 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
19 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 dnl Lesser General Public License for more details.
22 dnl You should have received a copy of the GNU Lesser General Public
23 dnl License along with the GNU MP Library; see the file COPYING.LIB. If
24 dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
25 dnl Suite 330, Boston, MA 02111-1307, USA.
28 include(`../config.m4')
31 dnl These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for
32 dnl a description. The only difference here is that UNROLL_COUNT can go up
33 dnl to 64 (not 63) making KARATSUBA_SQR_THRESHOLD_MAX 67.
35 deflit(KARATSUBA_SQR_THRESHOLD_MAX, 67)
dnl  A build-system override, when defined, takes precedence over the value
dnl  picked up from gmp-mparam.h by m4_config_gmp_mparam below.
37 ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE',
38 `define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)')
40 m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD')
dnl  UNROLL_COUNT is the threshold less 3, matching the note above that
dnl  UNROLL_COUNT up to 64 corresponds to a threshold max of 67 (presumably
dnl  because the smallest sizes are handled by dedicated code paths -- see
dnl  the 1x1/2x2/3x3 special cases mentioned above).
41 deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3))
44 C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
46 C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
47 C lot of function call overheads are avoided, especially when the given size is small.
50 C The code size might look a bit excessive, but not all of it is executed so
51 C it won't all get into the code cache. The 1x1, 2x2 and 3x3 special cases
52 C clearly apply only to those sizes; mid sizes like 10x10 only need part of
53 C the unrolled addmul; and big sizes like 40x40 that do use the full
54 C unrolling will at least be making good use of it, because 40x40 will take
55 C something like 7000 cycles.
dnl  Byte offsets of the incoming arguments relative to %esp at entry.
dnl  Per the prototype above (dst, src, size) and the cdecl right-to-left
dnl  push order, dst sits nearest the return address at 4(%esp).
57 defframe(PARAM_SIZE,12)
58 defframe(PARAM_SRC, 8)
59 defframe(PARAM_DST, 4)
63 PROLOGUE(mpn_sqr_basecase)
C  Squares {src,size} into {dst,2*size}.  The visible structure: form the
C  cross products src[i]*src[j] (i<j), double them with a left shift of
C  dst[1..2*size-2], then add in the diagonal squares src[i]^2 (see the
C  section comments below).
C  NOTE(review): this copy is a non-contiguous excerpt of the original
C  source -- many lines are missing between the statements below, so the
C  added annotations describe only what is visible and should be checked
C  against the full file.
78 C -----------------------------------------------------------------------------
93 C -----------------------------------------------------------------------------
C  Frame slots (negative offsets) used to preserve the callee-saved
C  registers ebx/esi/edi/ebp around this path.
100 defframe(SAVE_ESI, -4)
101 defframe(SAVE_EBX, -8)
102 defframe(SAVE_EDI, -12)
103 defframe(SAVE_EBP, -16)
104 deflit(`STACK_SPACE',16)
106 subl $STACK_SPACE, %esp
107 deflit(`FRAME',STACK_SPACE)
115 movl %eax, (%ecx) C dst[0]
119 movl %edx, %ebx C dst[1]
124 movl %eax, %edi C dst[2]
128 movl %edx, %ebp C dst[3]
130 mull 4(%esi) C src[0]*src[1]
157 C -----------------------------------------------------------------------------
165 pushl %esi defframe_pushl(`SAVE_ESI')
172 C -----------------------------------------------------------------------------
C  3x3 special case below: the three diagonal squares and the three cross
C  products src[0]*src[1], src[0]*src[2], src[1]*src[2].
183 pushl %ebp defframe_pushl(`SAVE_EBP')
184 pushl %edi defframe_pushl(`SAVE_EDI')
186 mull %eax C src[0] ^ 2
194 mull %eax C src[1] ^ 2
200 pushl %ebx defframe_pushl(`SAVE_EBX')
202 mull %eax C src[2] ^ 2
209 mull 4(%esi) C src[0] * src[1]
216 mull 8(%esi) C src[0] * src[2]
224 mull 8(%esi) C src[1] * src[2]
233 C esi zero, will be dst[5]
271 adcl %esi, %eax C no carry out of this
281 C -----------------------------------------------------------------------------
C  Locals for the unrolled addmul: the outer-loop counter and the computed
C  jump into the unrolled chunks (documented further below).
282 defframe(VAR_COUNTER,-20)
283 defframe(VAR_JMP, -24)
284 deflit(`STACK_SPACE',24)
294 deflit(`FRAME',4) dnl %esi already pushed
296 C First multiply src[0]*src[1..size-1] and store at dst[1..size].
298 subl $STACK_SPACE-FRAME, %esp
299 deflit(`FRAME',STACK_SPACE)
306 subl %edx, %ecx C -(size-1)
309 movl $0, %ebx C initial carry
C  Point esi/edi past the operands so the negative counter in ecx can index
C  upwards through src and dst.
311 leal (%esi,%edx,4), %esi C &src[size]
312 movl %eax, %ebp C multiplier
314 leal -4(%edi,%edx,4), %edi C &dst[size-1]
317 C This loop runs at just over 6 c/l.
322 C ecx counter, limbs, negative, -(size-1) to -1
336 movl %eax, 4(%edi,%ecx,4)
345 C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
347 C The last two addmuls, which are the bottom right corner of the product
348 C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
349 C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
350 C cases that need to be done.
352 C The unrolled code is the same as mpn_addmul_1(), see that routine for some
355 C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
357 C VAR_JMP is the computed jump into the unrolled code, stepped by one code
358 C chunk each outer loop.
360 dnl This is also hard-coded in the address calculation below.
361 deflit(CODE_BYTES_PER_LIMB, 15)
363 dnl With &src[size] and &dst[size-1] pointers, the displacements in the
364 dnl unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above
365 dnl that an offset must be added to them.
367 ifelse(eval(UNROLL_COUNT>32),1,
368 eval((UNROLL_COUNT-32)*4),
379 movl PARAM_SIZE, %ecx
388 ifelse(OFFSET,0,,`subl $OFFSET, %esi')
C  Computed entry point: back off from the end of the unrolled code by two
C  chunks per remaining limb (CODE_BYTES_PER_LIMB bytes per chunk).
394 leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
398 ifelse(OFFSET,0,,`subl $OFFSET, %edi')
400 C The calculated jump mustn't be before the start of the available
401 C code. This is the limit that UNROLL_COUNT puts on the src operand
402 C size, but checked here using the jump address directly.
405 `movl_text_address( L(unroll_inner_start), %eax)
409 C -----------------------------------------------------------------------------
413 C ebx high limb to store
415 C edx VAR_COUNTER, limbs, negative
416 C esi &src[size], constant
417 C edi dst ptr, second highest limb of last addmul
420 movl -12+OFFSET(%esi,%edx,4), %ebp C multiplier
421 movl %edx, VAR_COUNTER
423 movl -8+OFFSET(%esi,%edx,4), %eax C first limb of multiplicand
C  cmovX resolves to cmovz or cmovnz depending on UNROLL_COUNT parity, so
C  the ecx/ebx carry-register swap below matches the entry chunk's parity.
427 define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')')
431 movl %edx, %ebx C high carry
434 movl %ecx, %edx C jump
436 movl %eax, %ecx C low carry
437 leal CODE_BYTES_PER_LIMB(%edx), %edx
439 cmovX( %ebx, %ecx) C high carry reverse
440 cmovX( %eax, %ebx) C low carry reverse
445 C Must be on an even address here so the low bit of the jump address
446 C will indicate which way around ecx/ebx should start.
450 L(unroll_inner_start):
459 C 15 code bytes each limb
460 C ecx/ebx reversed on each chunk
C  Each iteration of this forloop emits one fixed-size code chunk; the
C  m4_asserts guarantee the byte displacements stay within signed-8-bit
C  range so every chunk really is CODE_BYTES_PER_LIMB bytes.
462 forloop(`i', UNROLL_COUNT, 1, `
463 deflit(`disp_src', eval(-i*4 + OFFSET))
464 deflit(`disp_dst', eval(disp_src))
466 m4_assert(`disp_src>=-128 && disp_src<128')
467 m4_assert(`disp_dst>=-128 && disp_dst<128')
470 Zdisp( movl, disp_src,(%esi), %eax)
472 Zdisp( addl, %ebx, disp_dst,(%edi))
477 dnl this one comes out last
478 Zdisp( movl, disp_src,(%esi), %eax)
480 Zdisp( addl, %ecx, disp_dst,(%edi))
488 addl %ebx, m4_empty_if_zero(OFFSET)(%edi)
490 movl VAR_COUNTER, %edx
493 movl %ecx, m4_empty_if_zero(OFFSET+4)(%edi)
497 jnz L(unroll_outer_top)
506 C -----------------------------------------------------------------------------
541 movl PARAM_SIZE, %ecx
552 C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
554 subl $1, %ecx C size-1
555 xorl %eax, %eax C ready for final adcl, and clear carry
564 C ecx counter, size-1 to 1
565 C edx size-1 (for later use)
566 C esi src (for later use)
567 C edi dst, incrementing
580 movl %eax, 4(%edi) C dst most significant limb
581 movl (%esi), %eax C src[0]
583 leal 4(%esi,%edx,4), %esi C &src[size]
584 subl %edx, %ecx C -(size-1)
587 C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
588 C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
589 C low limb of src[0]^2.
594 movl %eax, (%edi,%ecx,8) C dst[0]
600 C ecx counter, negative
606 movl (%esi,%ecx,4), %eax
611 addl %ebx, 4(%edi,%ecx,8)
612 adcl %eax, 8(%edi,%ecx,8)
622 addl %edx, 4(%edi) C dst most significant limb
631 C -----------------------------------------------------------------------------
C  NOTE(review): distance calculation into the unrolled code relative to
C  L(here) -- the surrounding context is not visible in this excerpt;
C  confirm against the full file.
635 addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx