1 dnl Intel P5 mpn_lshift -- mpn left shift.
3 dnl P5: 1.75 cycles/limb.
6 dnl Copyright (C) 2000 Free Software Foundation, Inc.
8 dnl This file is part of the GNU MP Library.
10 dnl The GNU MP Library is free software; you can redistribute it and/or
11 dnl modify it under the terms of the GNU Lesser General Public License as
12 dnl published by the Free Software Foundation; either version 2.1 of the
13 dnl License, or (at your option) any later version.
15 dnl The GNU MP Library is distributed in the hope that it will be useful,
16 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
17 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 dnl Lesser General Public License for more details.
20 dnl You should have received a copy of the GNU Lesser General Public
21 dnl License along with the GNU MP Library; see the file COPYING.LIB. If
22 dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
23 dnl Suite 330, Boston, MA 02111-1307, USA.
26 include(`../config.m4')
29 C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                        unsigned shift);
32 C Shift src,size left by shift many bits and store the result in dst,size.
33 C Zeros are shifted in at the right. Return the bits shifted out at the
C left.
36 C The comments in mpn_rshift apply here too.
C Frame definitions for the four arguments: dst at 4, src at 8, size at 12
C and shift at 16.
C NOTE(review): the frame macro is defined outside this file; presumably the
C numbers are byte offsets from the stack pointer at entry -- confirm.
38 defframe(PARAM_SHIFT,16)
39 defframe(PARAM_SIZE, 12)
40 defframe(PARAM_SRC, 8)
41 defframe(PARAM_DST, 4)
C The entry code compares the value in eax against this threshold.
44 dnl minimum 5, because the unrolled loop can't handle less
45 deflit(UNROLL_THRESHOLD, 5)
C Entry sequence: fetch the shift count and compare eax with the unrolling
C threshold.
C NOTE(review): the instructions that load eax, ebx and edx are not visible
C in this excerpt; judging by the operands below, eax indexes limbs, ebx
C addresses src and edx addresses dst -- confirm against the full file.
60 movl PARAM_SHIFT, %ecx
C ecx = shift count; its low byte cl is the count operand given to shldl
62 cmp $UNROLL_THRESHOLD, %eax
65 movl -4(%ebx,%eax,4), %edi C src high limb
C NOTE(review): with eax zero this presumably leaves in eax the bits
C shifted out of the top of edi, i.e. the return value -- confirm.
70 shldl( %cl, %edi, %eax) C eax was decremented to zero
74 movl %edi, (%edx) C dst low limb
C restore edi; the matching push is not visible in this excerpt
75 popl %edi C risk of data cache bank clash
82 C -----------------------------------------------------------------------------
C Limb-at-a-time MMX sequence: the visible step loads a qword of src and
C stores a single dword to dst.
C NOTE(review): the branch that selects this path and the loop label are
C not visible in this excerpt.
93 movd (%ebx,%eax,4), %mm5 C src high limb
C mm6 = shift count copied from ecx
95 movd %ecx, %mm6 C lshift
C move the upper dword of mm5 down into its lower dword
102 psrlq $32, %mm5 C retval
106 C eax counter, limbs, negative
C load a qword, i.e. two adjacent 4-byte limbs of src
118 movq -4(%ebx,%eax,4), %mm0
C store the low dword of mm0 as one limb of dst
125 movd %mm0, 4(%edx,%eax,4)
144 C -----------------------------------------------------------------------------
C Setup for the qword-at-a-time code: fetch the src high limb, then deal
C with src alignment and dst alignment in turn.
156 movd -4(%ebx,%eax,4), %mm5 C src high limb
C edi = ebx + 4*eax, the address just above the src high limb
157 leal (%ebx,%eax,4), %edi
C mm6 = shift count copied from ecx
159 movd %ecx, %mm6 C lshift
C NOTE(review): the instruction setting the flags for this jz is not
C visible here; presumably it tests the alignment of the address in edi.
163 jz L(start_src_aligned)
166 C src isn't aligned, process high limb separately (marked xxx) to
169 C source -8(%ebx,%eax,4)
171 C +-------+-------+-------+--
173 C +-------+-------+-------+--
179 C +-------+-------+--
181 C +-------+-------+--
183 movq -8(%ebx,%eax,4), %mm0 C unaligned load
C store the low dword of mm0 as one dst limb
192 movd %mm0, (%edx,%eax,4)
193 L(start_src_aligned):
195 movq -8(%ebx,%eax,4), %mm1 C src high qword
C edi = edx + 4*eax
C NOTE(review): presumably used by a dst alignment test, not visible here,
C that sets the flags for the jz below.
196 leal (%edx,%eax,4), %edi
C move the upper dword of mm5 down into its lower dword
199 psrlq $32, %mm5 C return value
201 movq -16(%ebx,%eax,4), %mm3 C src second highest qword
202 jz L(start_dst_aligned)
204 C dst isn't aligned, subtract 4 to make it so, and pretend the shift
205 C is 32 bits extra. High limb of dst (marked xxx) handled here
208 C source -8(%ebx,%eax,4)
210 C +-------+-------+--
212 C +-------+-------+--
218 C +-------+-------+-------+--
220 C +-------+-------+-------+--
224 addl $32, %ecx C new shift
231 C wasted cycle here waiting for %mm0
C store the low dword of mm0 as the dst limb at -4 from edx + 4*eax
233 movd %mm0, -4(%edx,%eax,4)
235 L(start_dst_aligned):
C NOTE(review): the comment says ecx becomes 64-shift, so ecx was
C presumably negated just before this add (not visible here) -- confirm.
241 addl $64, %ecx C 64-shift
245 subl $8, %eax C size-8
C merge mm1 into mm3 to form a complete dst qword
249 por %mm1, %mm3 C mm3 ready to store
253 C The comments in mpn_rshift apply here too.
266 C mm2 src qword from 48(%ebx,%eax,4)
267 C mm3 dst qword ready to store to 56(%edx,%eax,4)
C Visible part of the qword sequence: src qwords are loaded at offsets 8
C and 0 from ebx + 4*eax, dst qwords are stored at offsets 24 and 16 from
C edx + 4*eax.
C NOTE(review): the loop label, the shifts and the loop branch are not
C visible in this excerpt.
273 movq 8(%ebx,%eax,4), %mm0
279 movq %mm3, 24(%edx,%eax,4) C prev
282 movq (%ebx,%eax,4), %mm3 C
285 movq %mm0, 16(%edx,%eax,4)
297 C eax -4 to -1 representing respectively 0 to 3 limbs remaining
303 movq 8(%ebx,%eax,4), %mm0
309 movq %mm3, 24(%edx,%eax,4) C prev
319 C eax -4 or -3 representing respectively 0 or 1 limbs remaining
321 C mm2 src prev qword, from 48(%ebx,%eax,4)
322 C mm3 dst qword, for 56(%edx,%eax,4)
C copy the low dword of mm5 into eax as the function result
325 movd %mm5, %eax C retval
C Tail handling when one src limb is left over after the qword code.
331 C One extra src limb, destination was aligned.
334 C --+---------------+-------+
336 C --+---------------+-------+
338 C dest edx+12 edx+4 edx
339 C --+---------------+---------------+-------+
341 C --+---------------+---------------+-------+
344 C mm7 = ecx = 64-shift
347 C One extra src limb, destination was unaligned.
350 C --+---------------+-------+
352 C --+---------------+-------+
355 C --+---------------+---------------+
357 C --+---------------+---------------+
360 C mm7 = ecx = 64-(shift+32)
363 C In both cases there's one extra limb of src to fetch and combine
364 C with mm2 to make a qword at 4(%edx), and in the aligned case
365 C there's an extra limb of dst to be formed from that extra src limb
C NOTE(review): the flag-setting instruction for this jz, and whatever is
C skipped between the jz and the label, are not visible in this excerpt;
C per the comments above the branch presumably separates the aligned and
C unaligned destination cases.
387 jz L(finish_one_unaligned)
390 L(finish_one_unaligned):
C Tail handling when no src limb is left over after the qword code.
399 C No extra src limbs, destination was aligned.
402 C --+---------------+
404 C --+---------------+
407 C --+---------------+---------------+
409 C --+---------------+---------------+
412 C mm7 = ecx = 64-shift
415 C No extra src limbs, destination was unaligned.
418 C --+---------------+
420 C --+---------------+
423 C --+---------------+-------+
425 C --+---------------+-------+
428 C mm7 = ecx = 64-(shift+32)
431 C The movd for the unaligned case writes the same data to 4(%edx)
432 C that the movq does for the aligned case.
C NOTE(review): the flag-setting instruction for this jz, and whatever is
C skipped between the jz and the label, are not visible in this excerpt.
439 jz L(finish_zero_unaligned)
442 L(finish_zero_unaligned):
C copy the low dword of mm5 into eax as the function result
447 movd %mm5, %eax C retval