X-Git-Url: http://git.megacz.com/?a=blobdiff_plain;f=rts%2Fgmp%2Fmpn%2Fx86%2Fpentium%2Fsqr_basecase.asm;fp=rts%2Fgmp%2Fmpn%2Fx86%2Fpentium%2Fsqr_basecase.asm;h=0000000000000000000000000000000000000000;hb=fdf1cd0399158308769fcb2ab7e46e215a68d865;hp=c8584df13c2f0d832a0e3099b48f127b0420331b;hpb=e552cfc427d2734b9a9629f2ab1d22f493e775f6;p=ghc-hetmet.git diff --git a/rts/gmp/mpn/x86/pentium/sqr_basecase.asm b/rts/gmp/mpn/x86/pentium/sqr_basecase.asm deleted file mode 100644 index c8584df..0000000 --- a/rts/gmp/mpn/x86/pentium/sqr_basecase.asm +++ /dev/null @@ -1,520 +0,0 @@ -dnl Intel P5 mpn_sqr_basecase -- square an mpn number. -dnl -dnl P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular -dnl product at around 20x20 limbs. - - -dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. -dnl -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or -dnl modify it under the terms of the GNU Lesser General Public License as -dnl published by the Free Software Foundation; either version 2.1 of the -dnl License, or (at your option) any later version. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, -dnl but WITHOUT ANY WARRANTY; without even the implied warranty of -dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -dnl Lesser General Public License for more details. -dnl -dnl You should have received a copy of the GNU Lesser General Public -dnl License along with the GNU MP Library; see the file COPYING.LIB. If -dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - -dnl Suite 330, Boston, MA 02111-1307, USA. - - -include(`../config.m4') - - -C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); -C -C Calculate src,size squared, storing the result in dst,2*size. -C -C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a -C lot of function call overheads are avoided, especially when the size is -C small. - -defframe(PARAM_SIZE,12) -defframe(PARAM_SRC, 8) -defframe(PARAM_DST, 4) - - .text - ALIGN(8) -PROLOGUE(mpn_sqr_basecase) -deflit(`FRAME',0) - - movl PARAM_SIZE, %edx - movl PARAM_SRC, %eax - - cmpl $2, %edx - movl PARAM_DST, %ecx - - je L(two_limbs) - - movl (%eax), %eax - ja L(three_or_more) - -C ----------------------------------------------------------------------------- -C one limb only - C eax src - C ebx - C ecx dst - C edx - - mull %eax - - movl %eax, (%ecx) - movl %edx, 4(%ecx) - - ret - -C ----------------------------------------------------------------------------- - ALIGN(8) -L(two_limbs): - C eax src - C ebx - C ecx dst - C edx size - - pushl %ebp - pushl %edi - - pushl %esi - pushl %ebx - - movl %eax, %ebx - movl (%eax), %eax - - mull %eax C src[0]^2 - - movl %eax, (%ecx) C dst[0] - movl %edx, %esi C dst[1] - - movl 4(%ebx), %eax - - mull %eax C src[1]^2 - - movl %eax, %edi C dst[2] - movl %edx, %ebp C dst[3] - - movl (%ebx), %eax - - mull 4(%ebx) C src[0]*src[1] - - addl %eax, %esi - popl %ebx - - adcl %edx, %edi - - adcl $0, %ebp - addl %esi, %eax - - adcl %edi, %edx - movl %eax, 4(%ecx) - - adcl $0, %ebp - popl %esi - - movl %edx, 8(%ecx) - movl %ebp, 12(%ecx) - - popl %edi - popl %ebp - - ret - - -C ----------------------------------------------------------------------------- - ALIGN(8) -L(three_or_more): - C eax src low limb - C ebx - C ecx dst - C edx size - - cmpl $4, %edx - pushl %ebx -deflit(`FRAME',4) - - movl PARAM_SRC, %ebx - jae L(four_or_more) - - -C ----------------------------------------------------------------------------- -C three limbs - C eax src low limb - C ebx src - C ecx dst - C edx size - - pushl %ebp - pushl %edi - - mull %eax C src[0] ^ 2 - - movl %eax, (%ecx) - movl %edx, 4(%ecx) - - movl 4(%ebx), %eax - xorl %ebp, %ebp - - mull %eax C src[1] ^ 2 - - movl %eax, 8(%ecx) - movl %edx, 12(%ecx) - - movl 8(%ebx), %eax - pushl %esi C risk of cache bank clash - - mull %eax C src[2] ^ 2 - - movl %eax, 16(%ecx) - movl %edx, 20(%ecx) - - movl (%ebx), %eax - - mull 4(%ebx) C src[0] * src[1] - - movl %eax, %esi - movl %edx, %edi - - movl (%ebx), %eax - - mull 8(%ebx) C src[0] * src[2] - - addl %eax, %edi - movl %edx, %ebp - - adcl $0, %ebp - movl 4(%ebx), %eax - - mull 8(%ebx) C src[1] * src[2] - - xorl %ebx, %ebx - addl %eax, %ebp - - C eax - C ebx zero, will be dst[5] - C ecx dst - C edx dst[4] - C esi dst[1] - C edi dst[2] - C ebp dst[3] - - adcl $0, %edx - addl %esi, %esi - - adcl %edi, %edi - - adcl %ebp, %ebp - - adcl %edx, %edx - movl 4(%ecx), %eax - - adcl $0, %ebx - addl %esi, %eax - - movl %eax, 4(%ecx) - movl 8(%ecx), %eax - - adcl %edi, %eax - movl 12(%ecx), %esi - - adcl %ebp, %esi - movl 16(%ecx), %edi - - movl %eax, 8(%ecx) - movl %esi, 12(%ecx) - - adcl %edx, %edi - popl %esi - - movl 20(%ecx), %eax - movl %edi, 16(%ecx) - - popl %edi - popl %ebp - - adcl %ebx, %eax C no carry out of this - popl %ebx - - movl %eax, 20(%ecx) - - ret - - -C ----------------------------------------------------------------------------- - ALIGN(8) -L(four_or_more): - C eax src low limb - C ebx src - C ecx dst - C edx size - C esi - C edi - C ebp - C - C First multiply src[0]*src[1..size-1] and store at dst[1..size]. - -deflit(`FRAME',4) - - pushl %edi -FRAME_pushl() - pushl %esi -FRAME_pushl() - - pushl %ebp -FRAME_pushl() - leal (%ecx,%edx,4), %edi C dst end of this mul1 - - leal (%ebx,%edx,4), %esi C src end - movl %ebx, %ebp C src - - negl %edx C -size - xorl %ebx, %ebx C clear carry limb and carry flag - - leal 1(%edx), %ecx C -(size-1) - -L(mul1): - C eax scratch - C ebx carry - C ecx counter, negative - C edx scratch - C esi &src[size] - C edi &dst[size] - C ebp src - - adcl $0, %ebx - movl (%esi,%ecx,4), %eax - - mull (%ebp) - - addl %eax, %ebx - - movl %ebx, (%edi,%ecx,4) - incl %ecx - - movl %edx, %ebx - jnz L(mul1) - - - C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for - C n=1..size-2. - C - C The last two products, which are the end corner of the product - C triangle, are handled separately to save looping overhead. These - C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1]. - C If size is 4 then it's only these that need to be done. - C - C In the outer loop %esi is a constant, and %edi just advances by 1 - C limb each time. The size of the operation decreases by 1 limb - C each time. - - C eax - C ebx carry (needing carry flag added) - C ecx - C edx - C esi &src[size] - C edi &dst[size] - C ebp - - adcl $0, %ebx - movl PARAM_SIZE, %edx - - movl %ebx, (%edi) - subl $4, %edx - - negl %edx - jz L(corner) - - -L(outer): - C ebx previous carry limb to store - C edx outer loop counter (negative) - C esi &src[size] - C edi dst, pointing at stored carry limb of previous loop - - pushl %edx C new outer loop counter - leal -2(%edx), %ecx - - movl %ebx, (%edi) - addl $4, %edi - - addl $4, %ebp - xorl %ebx, %ebx C initial carry limb, clear carry flag - -L(inner): - C eax scratch - C ebx carry (needing carry flag added) - C ecx counter, negative - C edx scratch - C esi &src[size] - C edi dst end of this addmul - C ebp &src[j] - - adcl $0, %ebx - movl (%esi,%ecx,4), %eax - - mull (%ebp) - - addl %ebx, %eax - movl (%edi,%ecx,4), %ebx - - adcl $0, %edx - addl %eax, %ebx - - movl %ebx, (%edi,%ecx,4) - incl %ecx - - movl %edx, %ebx - jnz L(inner) - - - adcl $0, %ebx - popl %edx C outer loop counter - - incl %edx - jnz L(outer) - - - movl %ebx, (%edi) - -L(corner): - C esi &src[size] - C edi &dst[2*size-4] - - movl -8(%esi), %eax - movl -4(%edi), %ebx C risk of data cache bank clash here - - mull -12(%esi) C src[size-2]*src[size-3] - - addl %eax, %ebx - movl %edx, %ecx - - adcl $0, %ecx - movl -4(%esi), %eax - - mull -12(%esi) C src[size-1]*src[size-3] - - addl %ecx, %eax - movl (%edi), %ecx - - adcl $0, %edx - movl %ebx, -4(%edi) - - addl %eax, %ecx - movl %edx, %ebx - - adcl $0, %ebx - movl -4(%esi), %eax - - mull -8(%esi) C src[size-1]*src[size-2] - - movl %ecx, 0(%edi) - addl %eax, %ebx - - adcl $0, %edx - movl PARAM_SIZE, %eax - - negl %eax - movl %ebx, 4(%edi) - - addl $1, %eax C -(size-1) and clear carry - movl %edx, 8(%edi) - - -C ----------------------------------------------------------------------------- -C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1]. - -L(lshift): - C eax counter, negative - C ebx next limb - C ecx - C edx - C esi - C edi &dst[2*size-4] - C ebp - - movl 12(%edi,%eax,8), %ebx - - rcll %ebx - movl 16(%edi,%eax,8), %ecx - - rcll %ecx - movl %ebx, 12(%edi,%eax,8) - - movl %ecx, 16(%edi,%eax,8) - incl %eax - - jnz L(lshift) - - - adcl %eax, %eax C high bit out - movl PARAM_SRC, %esi - - movl PARAM_SIZE, %ecx C risk of cache bank clash - movl %eax, 12(%edi) C dst most significant limb - - -C ----------------------------------------------------------------------------- -C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ..., -C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the -C low limb of src[0]^2. - - movl (%esi), %eax C src[0] - leal (%esi,%ecx,4), %esi C src end - - negl %ecx - - mull %eax - - movl %eax, 16(%edi,%ecx,8) C dst[0] - movl %edx, %ebx - - addl $1, %ecx C size-1 and clear carry - -L(diag): - C eax scratch (low product) - C ebx carry limb - C ecx counter, negative - C edx scratch (high product) - C esi &src[size] - C edi &dst[2*size-4] - C ebp scratch (fetched dst limbs) - - movl (%esi,%ecx,4), %eax - adcl $0, %ebx - - mull %eax - - movl 16-4(%edi,%ecx,8), %ebp - - addl %ebp, %ebx - movl 16(%edi,%ecx,8), %ebp - - adcl %eax, %ebp - movl %ebx, 16-4(%edi,%ecx,8) - - movl %ebp, 16(%edi,%ecx,8) - incl %ecx - - movl %edx, %ebx - jnz L(diag) - - - adcl $0, %edx - movl 16-4(%edi), %eax C dst most significant limb - - addl %eax, %edx - popl %ebp - - movl %edx, 16-4(%edi) - popl %esi C risk of cache bank clash - - popl %edi - popl %ebx - - ret - -EPILOGUE()