1 dnl Intel Pentium-II mpn_divrem_1 -- mpn by limb division.
3 dnl P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part.
6 dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
8 dnl This file is part of the GNU MP Library.
10 dnl The GNU MP Library is free software; you can redistribute it and/or
11 dnl modify it under the terms of the GNU Lesser General Public License as
12 dnl published by the Free Software Foundation; either version 2.1 of the
13 dnl License, or (at your option) any later version.
15 dnl The GNU MP Library is distributed in the hope that it will be useful,
16 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
17 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 dnl Lesser General Public License for more details.
20 dnl You should have received a copy of the GNU Lesser General Public
21 dnl License along with the GNU MP Library; see the file COPYING.LIB. If
22 dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
23 dnl Suite 330, Boston, MA 02111-1307, USA.
26 include(`../config.m4')
29 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
30 C mp_srcptr src, mp_size_t size,
32 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
33 C mp_srcptr src, mp_size_t size,
34 C mp_limb_t divisor, mp_limb_t carry);
36 C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm,
37 C see that file for some comments. It's likely what's here can be improved.
40 dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
41 dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
dnl The different speeds of the integer and fraction parts mean that using
dnl xsize+size isn't quite right.  The threshold wants to be a bit higher
45 dnl for the integer part and a bit lower for the fraction part. (Or what's
46 dnl really wanted is to speed up the integer part!)
48 dnl The threshold is set to make the integer part right. At 4 limbs the
49 dnl div and mul are about the same there, but on the fractional part the
50 dnl mul is much faster.
52 deflit(MUL_THRESHOLD, 4)
55 defframe(PARAM_CARRY, 24)
56 defframe(PARAM_DIVISOR,20)
57 defframe(PARAM_SIZE, 16)
58 defframe(PARAM_SRC, 12)
59 defframe(PARAM_XSIZE, 8)
60 defframe(PARAM_DST, 4)
62 defframe(SAVE_EBX, -4)
63 defframe(SAVE_ESI, -8)
64 defframe(SAVE_EDI, -12)
65 defframe(SAVE_EBP, -16)
67 defframe(VAR_NORM, -20)
68 defframe(VAR_INVERSE, -24)
69 defframe(VAR_SRC, -28)
70 defframe(VAR_DST, -32)
71 defframe(VAR_DST_STOP,-36)
73 deflit(STACK_SPACE, 36)
78 PROLOGUE(mpn_divrem_1c)
80 movl PARAM_CARRY, %edx
83 subl $STACK_SPACE, %esp
84 deflit(`FRAME',STACK_SPACE)
87 movl PARAM_XSIZE, %ebx
93 movl PARAM_DIVISOR, %ebp
98 leal -4(%edi,%ebx,4), %edi
99 jmp LF(mpn_divrem_1,start_1c)
104 C offset 0x31, close enough to aligned
105 PROLOGUE(mpn_divrem_1)
108 movl PARAM_SIZE, %ecx
109 movl $0, %edx C initial carry (if can't skip a div)
110 subl $STACK_SPACE, %esp
111 deflit(`FRAME',STACK_SPACE)
114 movl PARAM_DIVISOR, %ebp
117 movl PARAM_XSIZE, %ebx
126 leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
129 movl -4(%esi,%ecx,4), %eax C src high limb
130 cmpl %ebp, %eax C one less div if high<divisor
133 movl $0, (%edi,%ecx,4) C dst high limb
135 movl %eax, %edx C src high limb as initial carry
148 leal (%ebx,%ecx), %eax C size+xsize
149 cmpl $MUL_THRESHOLD, %eax
150 jae L(mul_by_inverse)
153 jz L(divide_no_integer)
156 C eax scratch (quotient)
159 C edx scratch (remainder)
164 movl -4(%esi,%ecx,4), %eax
168 movl %eax, (%edi,%ecx,4)
170 jnz L(divide_integer)
173 L(divide_no_integer):
176 jnz L(divide_fraction)
187 addl $STACK_SPACE, %esp
193 C eax scratch (quotient)
196 C edx scratch (remainder)
205 movl %eax, -4(%edi,%ebx,4)
207 jnz L(divide_fraction)
213 C -----------------------------------------------------------------------------
226 movl %ebx, VAR_DST_STOP
227 leal 4(%edi,%ecx,4), %edi C &dst[xsize+size]
230 movl %ecx, %ebx C size
232 bsrl %ebp, %ecx C 31-l
233 movl %edx, %edi C carry
235 leal 1(%ecx), %eax C 32-l
241 shll %cl, %ebp C d normalized
245 subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1
247 divl %ebp C floor (b*(b-d)-1) / d
249 movl %eax, VAR_INVERSE
250 orl %ebx, %ebx C size
251 leal -12(%esi,%ebx,4), %eax C &src[size-3]
256 movl 8(%eax), %esi C src high limb
260 L(start_two_or_more):
261 movl 4(%eax), %edx C src second highest limb
263 shldl( %cl, %esi, %edi) C n2 = carry,high << l
265 shldl( %cl, %edx, %esi) C n10 = high,second << l
268 je L(integer_two_left)
273 shldl( %cl, %esi, %edi) C n2 = carry,high << l
275 shll %cl, %esi C n10 = high << l
276 jmp L(integer_one_left)
280 shll %cl, %edi C n2 = carry << l
281 movl $0, %esi C n10 = 0
283 C we're here because xsize+size>=MUL_THRESHOLD, so with size==0 then
289 C -----------------------------------------------------------------------------
291 C This loop runs at about 25 cycles, which is probably sub-optimal, and
292 C certainly more than the dependent chain would suggest. A better loop, or
293 C a better rough analysis of what's possible, would be welcomed.
295 C In the current implementation, the following successively dependent
296 C micro-ops seem to exist.
308 C Lack of registers hinders explicit scheduling and it might be that the
309 C normal out of order execution isn't able to hide enough under the mul
312 C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than
313 C cmov (and takes one uop off the dependent chain). A sarl/andl/addl
314 C combination was tried for the addback (despite the fact it would lengthen
315 C the dependent chain) but found to be no faster.
321 C ebx scratch (nadj, q1)
322 C ecx scratch (src, dst)
328 C mm0 scratch (src qword)
329 C mm7 rshift for normalization
337 andl %eax, %ebx C -n1 & d
340 addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
341 addl %edi, %eax C n2+n1
342 movq (%ecx), %mm0 C next src limb and the one below it
344 mull VAR_INVERSE C m*(n2+n1)
354 addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
356 leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
358 adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
373 movl VAR_DST_STOP, %eax
375 sbbl %edx, %edi C n - (q1+1)*d
376 movl %esi, %edi C remainder -> n2
377 leal (%ebp,%esi), %edx
379 cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
392 L(integer_loop_done):
395 C -----------------------------------------------------------------------------
C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
C q1_ff special case.  This makes the code a bit smaller and simpler, and
C costs only 2 cycles (each).
403 C ebx scratch (nadj, q1)
404 C ecx scratch (src, dst)
410 C mm0 src limb, shifted
420 andl %eax, %ebx C -n1 & d
423 addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
424 addl %edi, %eax C n2+n1
426 mull VAR_INVERSE C m*(n2+n1)
428 movd (%ecx), %mm0 C src low limb
430 movl VAR_DST_STOP, %ecx
436 addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
437 leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
440 adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
456 sbbl %edx, %edi C n - (q1+1)*d
457 movl %esi, %edi C remainder -> n2
458 leal (%ebp,%esi), %edx
460 cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
468 C -----------------------------------------------------------------------------
471 C ebx scratch (nadj, q1)
478 C mm0 src limb, shifted
486 movl VAR_DST_STOP, %ecx
488 andl %eax, %ebx C -n1 & d
491 addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
492 addl %edi, %eax C n2+n1
494 mull VAR_INVERSE C m*(n2+n1)
502 addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
503 leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
508 adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
510 sbbl $0, %ebx C q1 if q1+1 overflowed
523 movl PARAM_XSIZE, %eax
525 sbbl %edx, %edi C n - (q1+1)*d
526 movl %esi, %edi C remainder -> n2
527 leal (%ebp,%esi), %edx
529 cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
538 orl %eax, %eax C xsize
551 addl $STACK_SPACE, %esp
559 C -----------------------------------------------------------------------------
561 C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
562 C of q*d is simply -d and the remainder n-q*d = n10+d
574 movl VAR_DST_STOP, %edx
579 leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
582 movd %mm0, %esi C next n10
587 jmp L(integer_loop_done)
591 C -----------------------------------------------------------------------------
593 C In the current implementation, the following successively dependent
594 C micro-ops seem to exist.
605 C The loop in fact runs at about 17.5 cycles. Using a sarl/andl/addl for
606 C the addback was found to be a touch slower.
620 movl VAR_DST_STOP, %ecx
628 C eax n2, then scratch
629 C ebx scratch (nadj, q1)
630 C ecx dst, decrementing
636 mull VAR_INVERSE C m*n2
648 addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1
660 negl %eax C low of n - (q1+1)*d
662 sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry
663 leal (%ebp,%eax), %edx
665 cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
668 movl %eax, %edi C remainder->n2
671 movl %ebx, (%ecx) C previous q