1 dnl x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient.
3 dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 2.1 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public
18 dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19 dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20 dnl Suite 330, Boston, MA 02111-1307, USA.
27 dnl 486 approx 43 maybe
30 dnl The following have their own optimized divrem_1 implementations, but
31 dnl for reference the code here runs as follows.
38 include(`../config.m4')
41 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
42 C mp_srcptr src, mp_size_t size, mp_limb_t divisor);
43 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
44 C mp_srcptr src, mp_size_t size, mp_limb_t divisor);
46 C Divide src,size by divisor and store the quotient in dst+xsize,size.
47 C Extend the division to fractional quotient limbs in dst,xsize. Return the
48 C remainder. Either or both xsize and size can be 0.
50 C mpn_divrem_1c takes a carry parameter which is an initial high limb,
51 C effectively one extra limb at the top of src,size. Must have
55 C Essentially the code is the same as the division based part of
56 C mpn/generic/divrem_1.c, but has the following advantages.
58 C - If gcc isn't being used then divrem_1.c will get the generic C
59 C udiv_qrnnd() and be rather slow.
61 C - On K6, using the loop instruction is a 10% speedup, but gcc doesn't
62 C generate that instruction (as of gcc 2.95.2 at least).
64 C A test is done to see if the high limb is less the the divisor, and if so
65 C one less div is done. A div is between 20 and 40 cycles on the various
66 C x86s, so assuming high<divisor about half the time, then this test saves
67 C half that amount. The branch misprediction penalty on each chip is less
71 C K6: Back-to-back div instructions run at 20 cycles, the same as the loop
72 C here, so it seems there's nothing to gain by rearranging the loop.
73 C Pairing the mov and loop instructions was found to gain nothing. (The
74 C same is true of the mpn/x86/mod_1.asm loop.)
76 C With a "decl/jnz" rather than a "loop" this code runs at 22 cycles.
77 C The loop_or_decljnz macro is an easy way to get a 10% speedup.
79 C The fast K6 multiply might be thought to suit a multiply-by-inverse,
80 C but that algorithm has been found to suffer from the releatively poor
81 C carry handling on K6 and too many auxiliary instructions. The
82 C fractional part however could be done at about 13 c/l.
84 C P5: Moving the load down to pair with the store might save 1 cycle, but
85 C that doesn't seem worth bothering with, since it'd be only a 2.2%
88 C Again here the auxiliary instructions hinder a multiply-by-inverse,
89 C though there might be a 10-15% speedup available
92 defframe(PARAM_CARRY, 24)
93 defframe(PARAM_DIVISOR,20)
94 defframe(PARAM_SIZE, 16)
95 defframe(PARAM_SRC, 12)
96 defframe(PARAM_XSIZE, 8)
97 defframe(PARAM_DST, 4)
102 PROLOGUE(mpn_divrem_1c)
105 movl PARAM_SIZE, %ecx
106 pushl %edi FRAME_pushl()
109 pushl %esi FRAME_pushl()
111 movl PARAM_DIVISOR, %esi
112 pushl %ebx FRAME_pushl()
115 pushl %ebp FRAME_pushl()
117 movl PARAM_XSIZE, %ebp
120 movl PARAM_CARRY, %edx
121 jz LF(mpn_divrem_1,fraction)
123 leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part
124 jmp LF(mpn_divrem_1,integer_top)
129 PROLOGUE(mpn_divrem_1)
132 movl PARAM_SIZE, %ecx
133 pushl %edi FRAME_pushl()
136 pushl %esi FRAME_pushl()
138 movl PARAM_DIVISOR, %esi
142 pushl %ebx FRAME_pushl()
144 movl -4(%edi,%ecx,4), %eax C src high limb
148 pushl %ebp FRAME_pushl()
150 movl PARAM_XSIZE, %ebp
153 leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part
157 C high<divisor, so high of dst is zero, and avoid one div
159 movl %edx, (%ebx,%ecx,4)
167 C eax scratch (quotient)
170 C edx scratch (remainder)
175 movl -4(%edi,%ecx,4), %eax
180 movl %eax, (%ebx,%ecx,4)
181 loop_or_decljnz L(integer_top)
192 C eax scratch (quotient)
195 C edx scratch (remainder)
204 movl %eax, -4(%ebx,%ecx,4)
205 loop_or_decljnz L(fraction_top)
219 movl PARAM_XSIZE, %ecx
224 cld C better safe than sorry, see mpn/x86/README.family