1 dnl AMD K7 mpn_divrem_1 -- mpn by limb division.
3 dnl K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part.
6 dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
8 dnl This file is part of the GNU MP Library.
10 dnl The GNU MP Library is free software; you can redistribute it and/or
11 dnl modify it under the terms of the GNU Lesser General Public License as
12 dnl published by the Free Software Foundation; either version 2.1 of the
13 dnl License, or (at your option) any later version.
15 dnl The GNU MP Library is distributed in the hope that it will be useful,
16 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
17 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 dnl Lesser General Public License for more details.
20 dnl You should have received a copy of the GNU Lesser General Public
21 dnl License along with the GNU MP Library; see the file COPYING.LIB. If
22 dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
23 dnl Suite 330, Boston, MA 02111-1307, USA.
26 include(`../config.m4')
29 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
30 C mp_srcptr src, mp_size_t size,
32 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
33 C mp_srcptr src, mp_size_t size,
34 C mp_limb_t divisor, mp_limb_t carry);
36 C The method and nomenclature follow part 8 of "Division by Invariant
37 C Integers using Multiplication" by Granlund and Montgomery, reference in
40 C The "and"s shown in the paper are done here with "cmov"s. "m" is written
41 C for m', and "d" for d_norm, which won't cause any confusion since it's
42 C only the normalized divisor that's of any use in the code. "b" is written
43 C for 2^N, the size of a limb, N being 32 here.
45 C mpn_divrem_1 avoids one division if the src high limb is less than the
46 C divisor. mpn_divrem_1c doesn't check for a zero carry, since in normal
47 C circumstances that will be a very rare event.
49 C There's a small bias towards expecting xsize==0, by having code for
50 C xsize==0 in a straight line and xsize!=0 under forward jumps.
53 dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
54 dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
56 dnl The inverse takes about 50 cycles to calculate, but after that the
57 dnl multiply is 17 c/l versus division at 42 c/l.
59 dnl At 3 limbs the mul is a touch faster than div on the integer part, and
60 dnl even more so on the fractional part.
62 deflit(MUL_THRESHOLD, 3)
C Stack frame layout.  Positive defframe offsets are the incoming
C parameters, relative to %esp at function entry (return address at 0);
C negative offsets are below the entry %esp: four saved callee-preserved
C registers, then the locals used by the multiply-by-inverse code.
65 defframe(PARAM_CARRY, 24)
66 defframe(PARAM_DIVISOR,20)
67 defframe(PARAM_SIZE, 16)
68 defframe(PARAM_SRC, 12)
69 defframe(PARAM_XSIZE, 8)
70 defframe(PARAM_DST, 4)
72 defframe(SAVE_EBX, -4)
73 defframe(SAVE_ESI, -8)
74 defframe(SAVE_EDI, -12)
75 defframe(SAVE_EBP, -16)
C VAR_NORM holds the normalization shift count l, VAR_INVERSE the
C pre-computed inverse m of the normalized divisor (see paper reference
C in the header comments).
77 defframe(VAR_NORM, -20)
78 defframe(VAR_INVERSE, -24)
79 defframe(VAR_SRC, -28)
80 defframe(VAR_DST, -32)
81 defframe(VAR_DST_STOP,-36)
C STACK_SPACE covers the deepest local slot (VAR_DST_STOP at -36).
83 deflit(STACK_SPACE, 36)
C mpn_divrem_1c: like mpn_divrem_1 but with an explicit initial carry
C (partial remainder) taken from PARAM_CARRY.  It sets up the same
C registers as mpn_divrem_1 and then joins the common code at start_1c.
C NOTE(review): this listing is elided -- the register saves and the
C PARAM_DST/PARAM_SRC loads that must occur between the visible lines
C are not shown here; confirm against the full file.
88 PROLOGUE(mpn_divrem_1c)
90 movl PARAM_CARRY, %edx
92 subl $STACK_SPACE, %esp
93 deflit(`FRAME',STACK_SPACE)
96 movl PARAM_XSIZE, %ebx
102 movl PARAM_DIVISOR, %ebp
C Point %edi at &dst[xsize-1], the last fraction limb to be stored.
107 leal -4(%edi,%ebx,4), %edi
108 jmp LF(mpn_divrem_1,start_1c)
113 C offset 0x31, close enough to aligned
C mpn_divrem_1: divide {src,size} by divisor, producing size quotient
C limbs plus xsize further fraction limbs, returning the remainder.
C NOTE(review): elided listing -- register saves, the PARAM_DST/
C PARAM_SRC loads, and the branch around the high-limb skip are among
C the lines not shown; confirm against the full file.
114 PROLOGUE(mpn_divrem_1)
117 movl PARAM_SIZE, %ecx
118 movl $0, %edx C initial carry (if can't skip a div)
119 subl $STACK_SPACE, %esp
120 deflit(`FRAME',STACK_SPACE)
123 movl PARAM_DIVISOR, %ebp
126 movl PARAM_XSIZE, %ebx
134 leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
137 movl -4(%esi,%ecx,4), %eax C src high limb
139 cmpl %ebp, %eax C one less div if high<divisor
C When high<divisor the top quotient limb is 0 and the high limb
C becomes the initial remainder, saving one division.
142 movl $0, (%edi,%ecx,4) C dst high limb
144 movl %eax, %edx C src high limb as initial carry
C Choose plain divl loops for tiny operands, otherwise the multiply
C by inverse method (see the MUL_THRESHOLD comments above).
157 leal (%ebx,%ecx), %eax C size+xsize
158 cmpl $MUL_THRESHOLD, %eax
159 jae L(mul_by_inverse)
162 C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs.
163 C It'd be possible to write them out without the looping, but no speedup
166 C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the
167 C integer part, but curiously not on the fractional part, where %ebp is a
168 C (fixed) couple of cycles faster.
171 jz L(divide_no_integer)
C Integer-part divl loop: %ecx counts limbs down, remainder carried in
C %edx between iterations.
174 C eax scratch (quotient)
177 C edx scratch (remainder)
182 movl -4(%esi,%ecx,4), %eax
C NOTE(review): the divl between the load above and the store below is
C elided from this listing.
186 movl %eax, (%edi,%ecx,4)
188 jnz L(divide_integer)
191 L(divide_no_integer):
194 jnz L(divide_fraction)
C xsize==0: restore the stack and return the remainder (pops elided).
203 addl $STACK_SPACE, %esp
C Fraction-part divl loop: source limbs are implicitly zero, dividing
C the remainder down and storing fraction limbs below dst[xsize].
209 C eax scratch (quotient)
212 C edx scratch (remainder)
221 movl %eax, -4(%edi,%ebx,4)
223 jnz L(divide_fraction)
229 C -----------------------------------------------------------------------------
C mul_by_inverse: find l = count of leading zeros of the divisor,
C normalize d = divisor<<l, and compute the inverse
C m = floor((b*(b-d)-1)/d) as in the Granlund/Montgomery paper cited in
C the header.  Then dispatch on size (>=2, ==1, ==0) to pre-shift the
C top source limbs into n2 (%edi) and n10 (%esi).
C NOTE(review): elided listing -- the instructions establishing %ecx=l
C (VAR_NORM), VAR_SRC/VAR_DST, and the edx:eax setup ahead of the divl
C are not all shown; confirm against the full file.
240 bsrl %ebp, %eax C 31-l
243 leal 4(%edi,%ecx,4), %edi C &dst[xsize+size]
246 movl %ebx, VAR_DST_STOP
248 movl %ecx, %ebx C size
251 movl %edx, %edi C carry
259 shll %cl, %ebp C d normalized
265 subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1
267 divl %ebp C floor (b*(b-d)-1) / d
269 orl %ebx, %ebx C size
270 movl %eax, VAR_INVERSE
271 leal -12(%esi,%ebx,4), %eax C &src[size-3]
C size>=2: shift the two highest src limbs into the normalized n2,n10.
277 movl 8(%eax), %esi C src high limb
280 L(start_two_or_more):
281 movl 4(%eax), %edx C src second highest limb
283 shldl( %cl, %esi, %edi) C n2 = carry,high << l
285 shldl( %cl, %edx, %esi) C n10 = high,second << l
288 je L(integer_two_left)
C size==1: only one src limb to fold into the normalized remainder.
293 shldl( %cl, %esi, %edi) C n2 = carry,high << l
295 shll %cl, %esi C n10 = high << l
297 jmp L(integer_one_left)
C size==0: only the carry contributes (mpn_divrem_1c path).
301 shll %cl, %edi C n2 = carry << l
302 movl $0, %esi C n10 = 0
304 C we're here because xsize+size>=MUL_THRESHOLD, so with size==0 then
310 C -----------------------------------------------------------------------------
312 C The multiply by inverse loop is 17 cycles, and relies on some out-of-order
313 C execution. The instruction scheduling is important, with various
314 C apparently equivalent forms running 1 to 5 cycles slower.
316 C A lower bound for the time would seem to be 16 cycles, based on the
317 C following successive dependencies.
329 C This chain is what the loop has already, but 16 cycles isn't achieved.
330 C K7 has enough decode, and probably enough execute (depending maybe on what
331 C a mul actually consumes), but nothing running under 17 has been found.
333 C In theory n2+n1 could be done in the sub and addback stages (by
334 C calculating both n2 and n2+n1 there), but lack of registers makes this an
335 C unlikely proposition.
337 C The jz in the loop keeps the q1+1 stage to 1 cycle. Handling an overflow
338 C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent
339 C chain, and nothing better than 18 cycles has been found when using it.
340 C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
341 C be an extremely rare event.
343 C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but
344 C if some special data is coming out with this always, the q1_ff special
345 C case actually runs at 15 c/l. 0x2FFF...FFFD divided by 3 is a good way to
346 C induce the q1_ff case, for speed measurements or testing. Note that
347 C 0xFFF...FFF divided by 1 or 2 doesn't induce it.
349 C The instruction groupings and empty comments show the cycles for a naive
350 C in-order view of the code (conveniently ignoring the load latency on
351 C VAR_INVERSE). This shows some of where the time is going, but is nonsense
352 C to the extent that out-of-order execution rearranges it. In this case
353 C there's 19 cycles shown, but it executes at 17.
358 C ebx scratch (nadj, q1)
359 C ecx scratch (src, dst)
365 C mm0 scratch (src qword)
366 C mm7 rshift for normalization
C Main integer loop body.  Per iteration: nadj = n10 + (-n1 & d) via a
C cmov, q1+1 from the high half of m*(n2+n1) + nadj, multiply back and
C subtract to get the remainder, with a cmov add-back of d when q1+1
C was one too many.
C NOTE(review): elided listing -- the loop label, the mull %ebp, the
C quotient store and the loop-termination branch are among the lines
C not shown here.
368 cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
372 leal (%ebp,%esi), %ebx
373 cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
374 sbbl $-1, %eax C n2+n1
376 mull VAR_INVERSE C m*(n2+n1)
378 movq (%ecx), %mm0 C next limb and the one below it
385 addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
386 leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
391 adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
404 movl VAR_DST_STOP, %eax
408 sbbl %edx, %edi C n - (q1+1)*d
409 movl %esi, %edi C remainder -> n2
410 leal (%ebp,%esi), %edx
414 cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
423 L(integer_loop_done):
426 C -----------------------------------------------------------------------------
428 C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
429 C q1_ff special case. This makes the code a bit smaller and simpler, and
430 C costs only 1 cycle (each).
C integer_two_left: the last two normalized limbs, unrolled from the
C main loop; the quotient/remainder steps mirror the loop body above.
434 C ebx scratch (nadj, q1)
435 C ecx scratch (src, dst)
441 C mm0 src limb, shifted
444 cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
448 leal (%ebp,%esi), %ebx
449 cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
450 sbbl $-1, %eax C n2+n1
452 mull VAR_INVERSE C m*(n2+n1)
454 movd (%ecx), %mm0 C src low limb
456 movl VAR_DST_STOP, %ecx
460 addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
461 leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
464 adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
480 sbbl %edx, %edi C n - (q1+1)*d
481 movl %esi, %edi C remainder -> n2
482 leal (%ebp,%esi), %edx
486 cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
492 C -----------------------------------------------------------------------------
C integer_one_left: the final normalized limb, using the sbbl $0
C handling of q1+1 overflow described above.
495 C ebx scratch (nadj, q1)
502 C mm0 src limb, shifted
505 movl VAR_DST_STOP, %ecx
506 cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
509 leal (%ebp,%esi), %ebx
510 cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
511 sbbl $-1, %eax C n2+n1
513 mull VAR_INVERSE C m*(n2+n1)
521 addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
522 leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
527 adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
529 sbbl $0, %ebx C q1 if q1+1 overflowed
543 sbbl %edx, %edi C n - (q1+1)*d
544 movl %esi, %edi C remainder -> n2
545 leal (%ebp,%esi), %edx
547 cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
C Restore the stack (register pops are elided from this listing).
568 addl $STACK_SPACE, %esp
576 C -----------------------------------------------------------------------------
578 C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
579 C of q*d is simply -d and the remainder n-q*d = n10+d
591 movl VAR_DST_STOP, %edx
595 leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
598 movd %mm0, %esi C next n10
604 jmp L(integer_loop_done)
608 C -----------------------------------------------------------------------------
610 C Being the fractional part, the "source" limbs are all zero, meaning
611 C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated.
613 C The loop runs at 15 cycles. The dependent chain is the same as the
614 C general case above, but without the n2+n1 stage (due to n1==0), so 15
615 C would seem to be the lower bound.
617 C A not entirely obvious simplification is that q1+1 never overflows a limb,
618 C and so there's no need for the sbbl $0 or jz q1_ff from the general case.
619 C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
620 C rnd() means rounding down to a multiple of d.
622 C m*n2 + b*n2 <= m*(d-1) + b*(d-1)
623 C = m*d + b*d - m - b
624 C = floor((b(b-d)-1)/d)*d + b*d - m - b
625 C = rnd(b(b-d)-1) + b*d - m - b
626 C = rnd(b(b-d)-1 + b*d) - m - b
627 C = rnd(b*b-1) - m - b
630 C Unchanged from the general case is that the final quotient limb q can be
631 C either q1 or q1+1, and the q1+1 case occurs often. This can be seen from
632 C equation 8.4 of the paper which simplifies as follows when n1==0 and
635 C n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b
637 C As before, the instruction groupings and empty comments show a naive
638 C in-order view of the code, which is made a nonsense by out of order
639 C execution. There's 17 cycles shown, but it executes at 15.
641 C Rotating the store q and remainder->n2 instructions up to the top of the
642 C loop gets the run time down from 16 to 15.
C Fraction loop setup: load the dst stop pointer and enter the loop.
655 movl VAR_DST_STOP, %ecx
660 jmp L(fraction_entry)
665 C eax n2 carry, then scratch
666 C ebx scratch (nadj, q1)
667 C ecx dst, decrementing
C Fraction loop body: with n10==0 the q1+1 step needs only the high
C word of m*n2 plus n2, and the remainder update only the low product.
C NOTE(review): elided listing -- the loop label, fraction_entry label,
C the mull %ebp and the loop-termination compare/branch are not shown
C here.
673 movl %ebx, (%ecx) C previous q
674 movl %eax, %edi C remainder->n2
677 mull VAR_INVERSE C m*n2
691 addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1
701 negl %eax C low of n - (q1+1)*d
705 sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry
706 leal (%ebp,%eax), %edx
708 cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1