1 dnl AMD K6-2 mpn_rshift -- mpn right shift.
3 dnl K6-2: 1.75 cycles/limb
6 dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
8 dnl This file is part of the GNU MP Library.
10 dnl The GNU MP Library is free software; you can redistribute it and/or
11 dnl modify it under the terms of the GNU Lesser General Public License as
12 dnl published by the Free Software Foundation; either version 2.1 of the
13 dnl License, or (at your option) any later version.
15 dnl The GNU MP Library is distributed in the hope that it will be useful,
16 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
17 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 dnl Lesser General Public License for more details.
20 dnl You should have received a copy of the GNU Lesser General Public
21 dnl License along with the GNU MP Library; see the file COPYING.LIB. If
22 dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
23 dnl Suite 330, Boston, MA 02111-1307, USA.
26 include(`../config.m4')
29 C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
30 C                       unsigned shift);
33 defframe(PARAM_SHIFT,16)
34 defframe(PARAM_SIZE, 12)
35 defframe(PARAM_SRC, 8)
36 defframe(PARAM_DST, 4)
39 dnl Minimum 9, because the unrolled loop can't handle less.
41 deflit(UNROLL_THRESHOLD, 9)
49 C The 1 limb case can be done without the push %ebx, but it's then
50 C still the same speed. The push is left as a free helping hand for
51 C the two_or_more code.
54 pushl %ebx FRAME_pushl()
59 movl PARAM_SHIFT, %ecx
62 movl (%ebx), %edx C src limb
65 shrdl( %cl, %edx, %eax) C return value
69 movl %edx, (%ebx) C dst limb
75 C -----------------------------------------------------------------------------
76 ALIGN(16) C avoid offset 0x1f
83 movl (%ebx), %edx C src low limb
87 movd PARAM_SHIFT, %mm6
90 cmpl $UNROLL_THRESHOLD-1, %eax
103 leal (%ebx,%eax,4), %ebx
105 leal -4(%ecx,%eax,4), %ecx
108 C This loop runs at about 3 cycles/limb, which is the amount of
109 C decoding, and this is despite every second access being unaligned.
112 C eax counter, -(size-1) to -1
120 Zdisp( movq, 0,(%ebx,%eax,4), %mm0)
125 Zdisp( movd, %mm0, 0,(%ecx,%eax,4))
138 C -----------------------------------------------------------------------------
149 subl $7, %eax C size-8
154 movq (%ebx), %mm2 C src low qword
155 leal (%ebx,%eax,4), %ebx C src end - 32
158 leal (%ecx,%eax,4), %ecx C dst end - 32
160 notl %eax C -(size-7)
166 Zdisp( movd, %mm2, 0,(%ecx,%eax,4)) C dst low limb
167 movq 4(%ebx,%eax,4), %mm2 C new src low qword
170 movq 12(%ebx,%eax,4), %mm0 C src second lowest qword
171 nop C avoid bad cache line crossing
174 C This loop is the important bit, the rest is just support for it.
175 C Four src limbs are held at the start, and four more will be read.
176 C Four dst limbs will be written. This schedule seems necessary for
179 C The use of -(size-7) lets the loop stop when %eax becomes >= 0, and
180 C leaves it at 0 to 3, which can be tested with test $1 and $2.
183 C eax counter, -(size-7) step by +4 until >=0
201 movq 4(%ebx,%eax,4), %mm0
204 movq %mm2, -12(%ecx,%eax,4)
210 movq 12(%ebx,%eax,4), %mm0
212 movq %mm1, -4(%ecx,%eax,4)
213 ja L(top) C jump if no carry and not zero
217 C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0
218 C to 3 representing respectively 3 to 0 further limbs.
220 testl $2, %eax C testl to avoid bad cache line crossings
223 C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0
224 C becomes new mm2 and a new mm0 is loaded.
233 movq 12(%ebx,%eax,4), %mm0
235 movq %mm2, -4(%ecx,%eax,4)
249 movq %mm2, 4(%ecx,%eax,4)
253 C one further extra limb to process
255 movd 32-4(%ebx), %mm0 C src[size-1], most significant limb
264 movq %mm1, 32-12(%ecx) C dst[size-3,size-2]
265 movd %mm2, 32-4(%ecx) C dst[size-1]
267 movl %edx, %eax C retval
273 nop C avoid bad cache line crossing
275 C no further extra limbs
277 movq %mm1, 32-8(%ecx) C dst[size-2,size-1]
278 movl %edx, %eax C retval