X-Git-Url: http://git.megacz.com/?a=blobdiff_plain;f=rts%2Fgmp%2Fmpn%2Fx86%2Fpentium%2Frshift.asm;fp=rts%2Fgmp%2Fmpn%2Fx86%2Fpentium%2Frshift.asm;h=0000000000000000000000000000000000000000;hb=fdf1cd0399158308769fcb2ab7e46e215a68d865;hp=e8f5ae8ec8adbd8d79b0a5ea6c12f473c42b1718;hpb=e552cfc427d2734b9a9629f2ab1d22f493e775f6;p=ghc-hetmet.git diff --git a/rts/gmp/mpn/x86/pentium/rshift.asm b/rts/gmp/mpn/x86/pentium/rshift.asm deleted file mode 100644 index e8f5ae8..0000000 --- a/rts/gmp/mpn/x86/pentium/rshift.asm +++ /dev/null @@ -1,236 +0,0 @@ -dnl Intel Pentium mpn_rshift -- mpn right shift. -dnl -dnl cycles/limb -dnl P5,P54: 6.0 -dnl P55: 5.375 - - -dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software -dnl Foundation, Inc. -dnl -dnl This file is part of the GNU MP Library. -dnl -dnl The GNU MP Library is free software; you can redistribute it and/or -dnl modify it under the terms of the GNU Lesser General Public License as -dnl published by the Free Software Foundation; either version 2.1 of the -dnl License, or (at your option) any later version. -dnl -dnl The GNU MP Library is distributed in the hope that it will be useful, -dnl but WITHOUT ANY WARRANTY; without even the implied warranty of -dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -dnl Lesser General Public License for more details. -dnl -dnl You should have received a copy of the GNU Lesser General Public -dnl License along with the GNU MP Library; see the file COPYING.LIB. If -dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - -dnl Suite 330, Boston, MA 02111-1307, USA. - - -include(`../config.m4') - - -C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, -C unsigned shift); -C -C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does, -C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere. - -defframe(PARAM_SHIFT,16) -defframe(PARAM_SIZE, 12) -defframe(PARAM_SRC, 8) -defframe(PARAM_DST, 4) - - .text - ALIGN(8) -PROLOGUE(mpn_rshift) - - pushl %edi - pushl %esi - pushl %ebx - pushl %ebp -deflit(`FRAME',16) - - movl PARAM_DST,%edi - movl PARAM_SRC,%esi - movl PARAM_SIZE,%ebp - movl PARAM_SHIFT,%ecx - -C We can use faster code for shift-by-1 under certain conditions. - cmp $1,%ecx - jne L(normal) - leal 4(%edi),%eax - cmpl %esi,%eax - jnc L(special) C jump if res_ptr + 1 >= s_ptr - leal (%edi,%ebp,4),%eax - cmpl %eax,%esi - jnc L(special) C jump if s_ptr >= res_ptr + size - -L(normal): - movl (%esi),%edx - addl $4,%esi - xorl %eax,%eax - shrdl( %cl, %edx, %eax) C compute carry limb - pushl %eax C push carry limb onto stack - - decl %ebp - pushl %ebp - shrl $3,%ebp - jz L(end) - - movl (%edi),%eax C fetch destination cache line - - ALIGN(4) -L(oop): movl 28(%edi),%eax C fetch destination cache line - movl %edx,%ebx - - movl (%esi),%eax - movl 4(%esi),%edx - shrdl( %cl, %eax, %ebx) - shrdl( %cl, %edx, %eax) - movl %ebx,(%edi) - movl %eax,4(%edi) - - movl 8(%esi),%ebx - movl 12(%esi),%eax - shrdl( %cl, %ebx, %edx) - shrdl( %cl, %eax, %ebx) - movl %edx,8(%edi) - movl %ebx,12(%edi) - - movl 16(%esi),%edx - movl 20(%esi),%ebx - shrdl( %cl, %edx, %eax) - shrdl( %cl, %ebx, %edx) - movl %eax,16(%edi) - movl %edx,20(%edi) - - movl 24(%esi),%eax - movl 28(%esi),%edx - shrdl( %cl, %eax, %ebx) - shrdl( %cl, %edx, %eax) - movl %ebx,24(%edi) - movl %eax,28(%edi) - - addl $32,%esi - addl $32,%edi - decl %ebp - jnz L(oop) - -L(end): popl %ebp - andl $7,%ebp - jz L(end2) -L(oop2): - movl (%esi),%eax - shrdl( %cl,%eax,%edx) C compute result limb - movl %edx,(%edi) - movl %eax,%edx - addl $4,%esi - addl $4,%edi - decl %ebp - jnz L(oop2) - -L(end2): - shrl %cl,%edx C compute most significant limb - movl %edx,(%edi) C store it - - popl %eax C pop carry limb - - popl %ebp - popl %ebx - popl %esi - popl %edi - ret - - -C We loop from least significant end of the arrays, which is only -C permissable if the source and destination don't overlap, since the -C function is documented to work for overlapping source and destination. - -L(special): - leal -4(%edi,%ebp,4),%edi - leal -4(%esi,%ebp,4),%esi - - movl (%esi),%edx - subl $4,%esi - - decl %ebp - pushl %ebp - shrl $3,%ebp - - shrl %edx - incl %ebp - decl %ebp - jz L(Lend) - - movl (%edi),%eax C fetch destination cache line - - ALIGN(4) -L(Loop): - movl -28(%edi),%eax C fetch destination cache line - movl %edx,%ebx - - movl (%esi),%eax - movl -4(%esi),%edx - rcrl %eax - movl %ebx,(%edi) - rcrl %edx - movl %eax,-4(%edi) - - movl -8(%esi),%ebx - movl -12(%esi),%eax - rcrl %ebx - movl %edx,-8(%edi) - rcrl %eax - movl %ebx,-12(%edi) - - movl -16(%esi),%edx - movl -20(%esi),%ebx - rcrl %edx - movl %eax,-16(%edi) - rcrl %ebx - movl %edx,-20(%edi) - - movl -24(%esi),%eax - movl -28(%esi),%edx - rcrl %eax - movl %ebx,-24(%edi) - rcrl %edx - movl %eax,-28(%edi) - - leal -32(%esi),%esi C use leal not to clobber carry - leal -32(%edi),%edi - decl %ebp - jnz L(Loop) - -L(Lend): - popl %ebp - sbbl %eax,%eax C save carry in %eax - andl $7,%ebp - jz L(Lend2) - addl %eax,%eax C restore carry from eax -L(Loop2): - movl %edx,%ebx - movl (%esi),%edx - rcrl %edx - movl %ebx,(%edi) - - leal -4(%esi),%esi C use leal not to clobber carry - leal -4(%edi),%edi - decl %ebp - jnz L(Loop2) - - jmp L(L1) -L(Lend2): - addl %eax,%eax C restore carry from eax -L(L1): movl %edx,(%edi) C store last limb - - movl $0,%eax - rcrl %eax - - popl %ebp - popl %ebx - popl %esi - popl %edi - ret - -EPILOGUE()