1 dnl AMD K7 mpn_rshift -- mpn right shift.
3 dnl K7: 1.21 cycles/limb (at 16 limbs/loop).
6 dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
8 dnl This file is part of the GNU MP Library.
10 dnl The GNU MP Library is free software; you can redistribute it and/or
11 dnl modify it under the terms of the GNU Lesser General Public License as
12 dnl published by the Free Software Foundation; either version 2.1 of the
13 dnl License, or (at your option) any later version.
15 dnl The GNU MP Library is distributed in the hope that it will be useful,
16 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
17 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 dnl Lesser General Public License for more details.
20 dnl You should have received a copy of the GNU Lesser General Public
21 dnl License along with the GNU MP Library; see the file COPYING.LIB. If
22 dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
23 dnl Suite 330, Boston, MA 02111-1307, USA.
26 include(`../config.m4')
29 dnl K7: cycles/limb depends on UNROLL_COUNT (1.21 at 16 limbs/loop).
34 dnl Maximum possible with the current code is 64.
36 deflit(UNROLL_COUNT, 16)
39 C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, unsigned shift);
42 C Shift src,size right by shift many bits and store the result in dst,size.
43 C Zeros are shifted in at the left. The bits shifted out at the right are the return value.
46 C This code uses 64-bit MMX operations, which makes it possible to handle
47 C two limbs at a time, for a theoretical 1.0 cycles/limb. Plain integer
48 C code, on the other hand, suffers from shrd being a vector path decode and
49 C running at 3 cycles back-to-back.
51 C Full speed depends on source and destination being aligned, and some hairy
52 C setups and finish-ups are done to arrange this for the loop.
55 deflit(UNROLL_THRESHOLD, 10)
57 deflit(UNROLL_THRESHOLD, 10)
60 defframe(PARAM_SHIFT,16)
61 defframe(PARAM_SIZE, 12)
62 defframe(PARAM_SRC, 8)
63 defframe(PARAM_DST, 4)
65 defframe(SAVE_EDI, -4)
66 defframe(SAVE_ESI, -8)
67 defframe(SAVE_EBX, -12)
79 deflit(`FRAME',SAVE_SIZE)
81 movl PARAM_SHIFT, %ecx
86 jnz L(more_than_one_limb)
88 movl (%edx), %edx C src limb
90 shrdl( %cl, %edx, %eax) C eax was decremented to zero
94 movl %edx, (%edi) C dst limb
101 C -----------------------------------------------------------------------------
102 L(more_than_one_limb):
111 movd PARAM_SHIFT, %mm6 C rshift
112 movd (%edx), %mm5 C src low limb
113 cmp $UNROLL_THRESHOLD-1, %eax
116 leal (%edx,%eax,4), %edx C &src[size-1]
117 leal -4(%edi,%eax,4), %edi C &dst[size-2]
119 movd (%edx), %mm4 C src high limb
124 C eax loop counter, limbs, negative
137 movq (%edx,%eax,4), %mm0
142 movd %mm0, (%edi,%eax,4)
150 movd %mm4, 4(%edi) C dst high limb
152 movd %mm5, %eax C return value
155 addl $SAVE_SIZE, %esp
161 C -----------------------------------------------------------------------------
180 jz L(start_src_aligned)
183 C src isn't aligned, process low limb separately (marked xxx) and
184 C step src and dst by one limb, making src aligned.
187 C --+-------+-------+-------+
189 C --+-------+-------+-------+
193 C --+-------+-------+
195 C --+-------+-------+
197 movq (%edx), %mm0 C src low two limbs
199 movl %eax, PARAM_SIZE C size-1
202 decl %eax C size-2 is new size-1
205 movl %edi, PARAM_DST C new dst
208 L(start_src_aligned):
211 movq (%edx), %mm1 C src low two limbs
212 decl %eax C size-2, two last limbs handled at end
216 jz L(start_dst_aligned)
219 C dst isn't aligned, add 4 to make it so, and pretend the shift is
220 C 32 bits extra. Low limb of dst (marked xxx) handled here separately.
223 C --+-------+-------+
225 C --+-------+-------+
229 C --+-------+-------+-------+
231 C --+-------+-------+-------+
236 addl $32, %ecx C shift+32
240 addl $4, %edi C new dst
243 L(start_dst_aligned):
246 movq %mm1, %mm2 C copy of src low two limbs
248 andl $-2, %eax C round size down to even
254 andl $UNROLL_MASK, %eax
259 movd %ecx, %mm7 C lshift = 64-rshift
265 leal L(entry) (%eax,%eax,4), %esi
268 shrl $UNROLL_LOG2, %ebx C loop counter
270 leal ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx
271 leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
272 movl PARAM_SIZE, %eax C for use at end
279 C See README.family about old gas bugs
280 leal (%eax,%eax,4), %esi
281 addl $L(entry)-L(here), %esi
289 C -----------------------------------------------------------------------------
292 C eax size, for use at end
296 C esi was computed jump
301 C mm1, mm2 carry (alternating)
308 C The two chunks differ in whether mm1 or mm2 holds the carry.
309 C The computed jump puts the initial carry in both mm1 and mm2.
312 deflit(CHUNK_COUNT, 4)
313 forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
314 deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
315 deflit(`disp1', eval(disp0 + 8))
317 movq disp0(%edx), %mm0
324 movq %mm0, disp0(%edi)
327 movq disp1(%edx), %mm0
334 movq %mm0, disp1(%edi)
337 addl $UNROLL_BYTES, %edx
338 addl $UNROLL_BYTES, %edi
344 deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
345 deflit(`disp1', eval(disp0-0 + 8))
348 psrlq %mm6, %mm2 C wanted rshifted in all cases below
351 movd %mm5, %eax C return value
357 C Size odd, destination was aligned.
361 C +-------+---------------+--
363 C +-------+---------------+--
366 C +-------+---------------+---------------+--
368 C +-------+---------------+---------------+--
371 C mm7 = ecx = 64-shift
374 C Size odd, destination was unaligned.
378 C +-------+---------------+--
380 C +-------+---------------+--
383 C +---------------+---------------+--
385 C +---------------+---------------+--
388 C mm7 = ecx = 64-(shift+32)
391 C In both cases there's one extra limb of src to fetch and combine
392 C with mm2 to make a qword to store, and in the aligned case there's
393 C a further extra limb of dst to be formed.
396 movd disp0(%edx), %mm0
405 movq %mm0, disp0(%edi)
406 jz L(finish_odd_unaligned)
408 movd %mm1, disp1(%edi)
409 L(finish_odd_unaligned):
412 addl $SAVE_SIZE, %esp
420 C Size even, destination was aligned.
423 C +---------------+--
425 C +---------------+--
428 C +---------------+---------------+--
430 C +---------------+---------------+--
433 C mm7 = ecx = 64-shift
436 C Size even, destination was unaligned.
439 C +---------------+--
441 C +---------------+--
444 C +-------+---------------+--
446 C +-------+---------------+--
449 C mm7 = 64-(shift+32)
452 C The movd for the unaligned case is the same data as the movq for
453 C the aligned case; it's just a choice between whether one or two
454 C limbs should be written.
458 movd %mm2, disp0(%edi)
460 jz L(end_even_unaligned)
462 movq %mm2, disp0(%edi)
463 L(end_even_unaligned):
466 addl $SAVE_SIZE, %esp