1 dnl Intel Pentium-II mpn_mod_1 -- mpn by limb remainder.
3 dnl P6MMX: 24.0 cycles/limb.
6 dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
8 dnl This file is part of the GNU MP Library.
10 dnl The GNU MP Library is free software; you can redistribute it and/or
11 dnl modify it under the terms of the GNU Lesser General Public License as
12 dnl published by the Free Software Foundation; either version 2.1 of the
13 dnl License, or (at your option) any later version.
15 dnl The GNU MP Library is distributed in the hope that it will be useful,
16 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
17 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 dnl Lesser General Public License for more details.
20 dnl You should have received a copy of the GNU Lesser General Public
21 dnl License along with the GNU MP Library; see the file COPYING.LIB. If
22 dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
23 dnl Suite 330, Boston, MA 02111-1307, USA.
26 include(`../config.m4')
29 C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
30 C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
33 C The code here very similar to mpn_divrem_1, but with the quotient
34 C discarded. What's here probably isn't optimal.
36 C See mpn/x86/p6/mmx/divrem_1.c and mpn/x86/k7/mmx/mod_1.asm for some
40 dnl MUL_THRESHOLD is the size at which the multiply by inverse method is
41 dnl used, rather than plain "divl"s. Minimum value 2.
43 deflit(MUL_THRESHOLD, 4)
46 defframe(PARAM_CARRY, 16)
47 defframe(PARAM_DIVISOR,12)
48 defframe(PARAM_SIZE, 8)
49 defframe(PARAM_SRC, 4)
51 defframe(SAVE_EBX, -4)
52 defframe(SAVE_ESI, -8)
53 defframe(SAVE_EDI, -12)
54 defframe(SAVE_EBP, -16)
56 defframe(VAR_NORM, -20)
57 defframe(VAR_INVERSE, -24)
58 defframe(VAR_SRC_STOP,-28)
60 deflit(STACK_SPACE, 28)
67 movl PARAM_CARRY, %edx
69 subl $STACK_SPACE, %esp
70 deflit(`FRAME',STACK_SPACE)
73 movl PARAM_DIVISOR, %ebp
77 jmp LF(mpn_mod_1,start_1c)
86 movl $0, %edx C initial carry (if can't skip a div)
88 subl $STACK_SPACE, %esp
89 deflit(`FRAME',STACK_SPACE)
95 movl PARAM_DIVISOR, %ebp
100 movl -4(%esi,%ecx,4), %eax C src high limb
102 cmpl %ebp, %eax C carry flag if high<divisor
104 cmovc( %eax, %edx) C src high limb as initial carry
105 sbbl $0, %ecx C size-1 to skip one div
119 cmpl $MUL_THRESHOLD, %ecx
120 jae L(mul_by_inverse)
128 C eax scratch (quotient)
130 C ecx counter, limbs, decrementing
131 C edx scratch (remainder)
136 movl -4(%esi,%ecx,4), %eax
149 addl $STACK_SPACE, %esp
155 C -----------------------------------------------------------------------------
169 movl %ebx, VAR_SRC_STOP
170 movl %ecx, %ebx C size
173 movl %edx, %edi C carry
175 bsrl %ebp, %ecx C 31-l
178 leal 1(%ecx), %eax C 32-l
182 shll %cl, %ebp C d normalized
186 subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
188 divl %ebp C floor (b*(b-d)-1) / d
192 movl %eax, VAR_INVERSE
193 leal -12(%esi,%ebx,4), %eax C &src[size-3]
195 movl 8(%eax), %esi C src high limb
196 movl 4(%eax), %edx C src second highest limb
198 shldl( %cl, %esi, %edi) C n2 = carry,high << l
200 shldl( %cl, %edx, %esi) C n10 = high,second << l
202 movl %eax, %ecx C &src[size-3]
205 ifelse(MUL_THRESHOLD,2,`
207 je L(inverse_two_left)
211 C The dependent chain here is the same as in mpn_divrem_1, but a few
212 C instructions are saved by not needing to store the quotient limbs. This
213 C gets it down to 24 c/l, which is still a bit away from a theoretical 19
219 C ebx scratch (nadj, q1)
220 C ecx src pointer, decrementing
226 C mm0 scratch (src qword)
227 C mm7 rshift for normalization
235 andl %eax, %ebx C -n1 & d
238 addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
239 addl %edi, %eax C n2+n1
241 mull VAR_INVERSE C m*(n2+n1)
243 movq (%ecx), %mm0 C next src limb and the one below it
252 addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
253 leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
256 adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
262 movl VAR_SRC_STOP, %ebx
272 sbbl %edx, %edi C n - (q1+1)*d
273 movl %esi, %edi C remainder -> n2
274 leal (%ebp,%esi), %edx
276 cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
283 L(inverse_loop_done):
286 C -----------------------------------------------------------------------------
290 C ebx scratch (nadj, q1)
297 C mm0 scratch (src dword)
305 andl %eax, %ebx C -n1 & d
308 addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
309 addl %edi, %eax C n2+n1
311 mull VAR_INVERSE C m*(n2+n1)
313 movd 4(%ecx), %mm0 C src low limb
321 addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
322 leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
324 adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
341 sbbl %edx, %edi C n - (q1+1)*d
342 movl %esi, %edi C remainder -> n2
343 leal (%ebp,%esi), %edx
345 cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
352 C ebx scratch (nadj, q1)
359 C mm0 src limb, shifted
367 andl %eax, %ebx C -n1 & d
370 addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
371 addl %edi, %eax C n2+n1
373 mull VAR_INVERSE C m*(n2+n1)
375 movl VAR_NORM, %ecx C for final denorm
383 addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
384 leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
386 adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
403 sbbl %edx, %edi C n - (q1+1)*d
404 leal (%ebp,%esi), %edx
407 movl %esi, %eax C remainder
410 cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
413 shrl %cl, %eax C denorm remainder
414 addl $STACK_SPACE, %esp
420 C -----------------------------------------------------------------------------
422 C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
423 C of q*d is simply -d and the remainder n-q*d = n10+d
434 leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
435 movl VAR_SRC_STOP, %edx
438 movd %mm0, %esi C next n10
442 jmp L(inverse_loop_done)