dnl AMD K7 mpn_lshift -- mpn left shift.
dnl K7: 1.21 cycles/limb (at 16 limbs/loop).
dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 2.1 of the
dnl License, or (at your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl You should have received a copy of the GNU Lesser General Public
dnl License along with the GNU MP Library; see the file COPYING.LIB. If
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
dnl Suite 330, Boston, MA 02111-1307, USA.
dnl  NOTE(review): this view of the file is a sampled excerpt.  The number
dnl  fused at the start of each line is the original file's line number (an
dnl  extraction artifact), and many intervening lines are elided -- the
dnl  PROLOGUE/EPILOGUE, several branches, the actual shift instructions of
dnl  the loops, and the forloop closing quote are among the missing lines.
dnl  Comments added below describe only what the visible lines establish;
dnl  anything depending on elided lines is marked as an assumption.
26 include(`../config.m4')
dnl  Unrolling factor for the main loop; the original comment caps it at 64.
29 dnl K7: UNROLL_COUNT cycles/limb
34 dnl Maximum possible with the current code is 64.
36 deflit(UNROLL_COUNT, 16)
dnl  C-level contract: left shift of a limb vector, zeros shifted in at the
dnl  right, bits shifted out at the left returned (per the rshift cross-ref).
39 C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
42 C Shift src,size left by shift many bits and store the result in dst,size.
43 C Zeros are shifted in at the right. The bits shifted out at the left are
46 C The comments in mpn_rshift apply here too.
dnl  Sizes below UNROLL_THRESHOLD take a simple loop (per the cmp further
dnl  down).  The two identical deflits are presumably the PIC and non-PIC
dnl  arms of an ifdef whose wrapper lines are elided from this excerpt --
dnl  TODO(review): confirm against the full file; as shown they look like a
dnl  duplicate.
49 deflit(UNROLL_THRESHOLD, 10)
51 deflit(UNROLL_THRESHOLD, 10)
dnl  Stack frame layout: incoming parameters at positive offsets from the
dnl  entry %esp, callee-saved register slots at negative offsets.
54 defframe(PARAM_SHIFT,16)
55 defframe(PARAM_SIZE, 12)
56 defframe(PARAM_SRC, 8)
57 defframe(PARAM_DST, 4)
59 defframe(SAVE_EDI, -4)
60 defframe(SAVE_ESI, -8)
61 defframe(SAVE_EBX, -12)
73 deflit(`FRAME',SAVE_SIZE)
C Entry: fetch the shift count; size>1 branches away, and the size==1 case
C (surrounding lines elided) does a single shldl to produce the result.
75 movl PARAM_SHIFT, %ecx
80 jnz L(more_than_one_limb)
84 shldl( %cl, %edx, %eax) C eax was decremented to zero
95 C -----------------------------------------------------------------------------
96 L(more_than_one_limb):
C Load the shift count into mm6 and fetch the high and low src limbs: the
C high limb supplies the return value, the low limb the final dst store.
C The cmp against UNROLL_THRESHOLD-1 selects simple vs unrolled loop
C (branch instruction elided here).
105 movd PARAM_SHIFT, %mm6
106 movd (%edx,%eax,4), %mm5 C src high limb
107 cmp $UNROLL_THRESHOLD-1, %eax
111 movd (%edx), %mm4 C src low limb
C Simple (non-unrolled) loop: qword loads from the top of src, one limb
C stored per iteration, eax counting limbs downward.  The psllq/psrlq that
C performs the shift is among the elided lines.
118 C eax loop counter, limbs
132 movq -4(%edx,%eax,4), %mm0
137 movd %mm0, 4(%edi,%eax,4)
C Simple-loop finish: store the low dst limb, move the shifted-out bits to
C eax as the return value, and deallocate the register-save area (the
C restores and ret are elided).
145 movd %mm4, (%edi) C dst low limb
147 movd %mm5, %eax C return value
150 addl $SAVE_SIZE, %esp
156 C -----------------------------------------------------------------------------
C Unrolled-loop setup: point edx at the second-highest src limb and fetch
C the top src qword; the jz tests an alignment condition computed on an
C elided line.
167 C mm5 src high limb, for return value
172 leal -4(%edx,%eax,4), %edx C &src[size-2]
175 movq (%edx), %mm1 C src high qword
177 jz L(start_src_aligned)
C src not 8-byte aligned: peel off the high limb (diagrams abbreviated by
C the excerpt) so the main loop can use aligned qword fetches.
180 C src isn't aligned, process high limb (marked xxx) separately to
183 C source -4(edx,%eax,4)
185 C +-------+-------+-------+--
187 C +-------+-------+-------+--
190 C dest -4(edi,%eax,4)
192 C +-------+-------+--
194 C +-------+-------+--
198 movl %eax, PARAM_SIZE C size-1
201 decl %eax C size-2 is new size-1
203 movd %mm1, 4(%edi,%eax,4)
204 movq (%edx), %mm1 C new src high qword
205 L(start_src_aligned):
208 leal -4(%edi,%eax,4), %edi C &dst[size-2]
212 psrlq $32, %mm5 C return value
214 jz L(start_dst_aligned)
C dst not 8-byte aligned: step it back one limb and compensate by shifting
C an extra 32 bits; the displaced high dst limb is dealt with at the end.
217 C dst isn't aligned, subtract 4 bytes to make it so, and pretend the
218 C shift is 32 bits extra. High limb of dst (marked xxx) handled
222 C +-------+-------+--
224 C +-------+-------+--
228 C +-------+-------+-------+--
230 C +-------+-------+-------+--
235 addl $32, %ecx C shift+32
243 movd %ecx, %mm6 C new lshift
244 L(start_dst_aligned):
C Compute the complementary right shift in mm7 and the computed-jump entry
C point into the unrolled chunks (esi); ebx becomes the count of full
C unrolled passes, and edx/edi are biased for the chunk displacements.
246 decl %eax C size-2, two last limbs handled at end
247 movq %mm1, %mm2 C copy of src high qword
250 andl $-2, %eax C round size down to even
256 andl $UNROLL_MASK, %eax
261 movd %ecx, %mm7 C rshift = 64-lshift
267 leal L(entry) (%eax,%eax,4), %esi
269 shrl $UNROLL_LOG2, %ebx C loop counter
271 leal ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx
272 leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
273 movl PARAM_SIZE, %eax C for use at end
C Alternative entry-point computation using a label difference instead of
C an absolute address -- presumably the PIC arm of an elided ifdef; see
C README.family (TODO confirm against the full file).
279 C See README.family about old gas bugs
280 leal (%eax,%eax,4), %esi
281 addl $L(entry)-L(here), %esi
288 C -----------------------------------------------------------------------------
C Main unrolled loop: the m4 forloop below emits UNROLL_COUNT/CHUNK_COUNT
C chunks, each handling two qwords at displacements disp0/disp1.  mm1 and
C mm2 alternate as the inter-chunk carry; the computed jump above lands at
C a chunk boundary with the initial carry in both.  The shift/combine
C instructions inside each chunk are elided from this excerpt.
291 C eax size (for use at end)
300 C mm1 \ carry (alternating, mm2 first)
307 C The two chunks differ in whether mm1 or mm2 hold the carry.
308 C The computed jump puts the initial carry in both mm1 and mm2.
311 deflit(CHUNK_COUNT, 4)
312 forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
313 deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
314 deflit(`disp1', eval(disp0 - 8))
316 movq disp0(%edx), %mm0
323 movq %mm0, disp0(%edi)
326 movq disp1(%edx), %mm0
333 movq %mm0, disp1(%edi)
336 subl $UNROLL_BYTES, %edx
337 subl $UNROLL_BYTES, %edi
C disp() hides a zero displacement and applies the 128-byte bias used when
C UNROLL_BYTES is 256 (matching the leal bias above).
344 define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))')
349 psllq %mm6, %mm2 C wanted left shifted in all cases below
C Loop finish, odd remaining size: one extra src limb is fetched and
C combined with the carried bits in mm2 for the qword store at (%edi); the
C aligned case additionally stores one more dst limb before falling
C through, the unaligned case jumps past that store.
359 C Size odd, destination was aligned.
362 C --+---------------+-------+
364 C --+---------------+-------+
367 C --+---------------+---------------+-------+
369 C --+---------------+---------------+-------+
372 C mm7 = ecx = 64-shift
375 C Size odd, destination was unaligned.
378 C --+---------------+-------+
380 C --+---------------+-------+
383 C --+---------------+---------------+
385 C --+---------------+---------------+
388 C mm7 = ecx = 64-(shift+32)
391 C In both cases there's one extra limb of src to fetch and combine
392 C with mm2 to make a qword at (%edi), and in the aligned case
393 C there's an extra limb of dst to be formed from that extra src limb
396 movd disp(4) (%edx), %mm0
407 movq %mm0, disp(0) (%edi)
408 jz L(end_odd_unaligned)
409 movd %mm1, disp(-4) (%edi)
410 L(end_odd_unaligned):
413 addl $SAVE_SIZE, %esp
C Loop finish, even remaining size: the aligned case's qword store
C deliberately overwrites the movd done for the unaligned case (register
C restores and ret again elided after the esp adjustment).
421 C Size even, destination was aligned.
424 C --+---------------+
426 C --+---------------+
429 C --+---------------+---------------+
431 C --+---------------+---------------+
434 C mm7 = ecx = 64-shift
437 C Size even, destination was unaligned.
440 C --+---------------+
442 C --+---------------+
445 C --+---------------+-------+
447 C --+---------------+-------+
450 C mm7 = ecx = 64-(shift+32)
453 C The movq for the aligned case overwrites the movd for the
460 movd %mm2, disp(4) (%edi)
462 jz L(end_even_unaligned)
463 movq %mm0, disp(0) (%edi)
464 L(end_even_unaligned):
467 addl $SAVE_SIZE, %esp