1 dnl AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
3 dnl K6: 7.65 to 8.5 cycles/limb (at 16 limbs/loop and depending on the data),
4 dnl PIC adds about 6 cycles at the start.
7 dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
9 dnl This file is part of the GNU MP Library.
11 dnl The GNU MP Library is free software; you can redistribute it and/or
12 dnl modify it under the terms of the GNU Lesser General Public License as
13 dnl published by the Free Software Foundation; either version 2.1 of the
14 dnl License, or (at your option) any later version.
16 dnl The GNU MP Library is distributed in the hope that it will be useful,
17 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
18 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 dnl Lesser General Public License for more details.
21 dnl You should have received a copy of the GNU Lesser General Public
22 dnl License along with the GNU MP Library; see the file COPYING.LIB. If
23 dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
24 dnl Suite 330, Boston, MA 02111-1307, USA.
27 include(`../config.m4')
30 dnl K6: large multpliers small multpliers
31 dnl UNROLL_COUNT cycles/limb cycles/limb
37 dnl Maximum possible unrolling with the current code is 32.
39 dnl Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256
40 dnl byte block, which might explain the good speed at that unrolling.
42 deflit(UNROLL_COUNT, 16)
45 ifdef(`OPERATION_addmul_1', `
47 define(M4_function_1, mpn_addmul_1)
48 define(M4_function_1c, mpn_addmul_1c)
49 define(M4_description, add it to)
50 define(M4_desc_retval, carry)
51 ',`ifdef(`OPERATION_submul_1', `
53 define(M4_function_1, mpn_submul_1)
54 define(M4_function_1c, mpn_submul_1c)
55 define(M4_description, subtract it from)
56 define(M4_desc_retval, borrow)
57 ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
60 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
63 C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
65 C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
66 C mp_limb_t mult, mp_limb_t carry);
68 C Calculate src,size multiplied by mult and M4_description dst,size.
69 C Return the M4_desc_retval limb from the top of the result.
71 C The jadcl0()s in the unrolled loop makes the speed data dependent. Small
72 C multipliers (most significant few bits clear) result in few carry bits and
73 C speeds up to 7.65 cycles/limb are attained. Large multipliers (most
74 C significant few bits set) make the carry bits 50/50 and lead to something
75 C more like 8.4 c/l. (With adcl's both of these would be 9.3 c/l.)
77 C It's important that the gains for jadcl0 on small multipliers don't come
78 C at the cost of slowing down other data. Tests on uniformly distributed
79 C random data, designed to confound branch prediction, show about a 7%
80 C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all
81 C overheads included).
83 C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus
84 C 11.0 cycles/limb), and hence isn't used.
86 C In the simple loop, note that running ecx from negative to zero and using
87 C it as an index in the two movs wouldn't help. It would save one
88 C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired
89 C that would be collapsed by this.
95 C jadcl0() being faster than adcl $0 seems to be an artifact of two things,
96 C firstly the instruction decoding and secondly the fact that there's a
97 C carry bit for the jadcl0 only on average about 1/4 of the time.
99 C The code in the unrolled loop decodes something like the following.
103 C M4_inst %esi, disp(%edi) 1
105 C movl %edx, %esi \ 1
108 C 1: movl disp(%ebx), %eax /
112 C In a back-to-back style test this measures 7 with the jnc not taken, or 8
113 C with it taken (both when correctly predicted). This is opposite to the
114 C measurements showing small multipliers running faster than large ones.
115 C Watch this space for more info ...
117 C It's not clear how much branch misprediction might be costing. The K6
118 C doco says it will be 1 to 4 cycles, but presumably it's near the low end
119 C of that range to get the measured results.
122 C In the code the two carries are more or less the preceding mul product and
123 C the calculation is roughly
127 C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and
128 C v are the two limbs it's added to (being the low of the next mul, and a
129 C limb from the destination).
131 C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and
132 C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of
133 C x*y/b^2. If x, y, u and v are random and uniformly distributed between 0
134 C and b-1, then the total probability can be summed over x and y,
136 C 1 b-1 b-1 x*y 1 b*(b-1) b*(b-1)
137 C --- * sum sum --- = --- * ------- * ------- = 1/4
138 C b^2 x=0 y=1 b^2 b^4 2 2
140 C Actually it's a very tiny bit less than 1/4 of course. If y is fixed,
141 C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2.
145 deflit(UNROLL_THRESHOLD, 9)
147 deflit(UNROLL_THRESHOLD, 6)
150 defframe(PARAM_CARRY, 20)
151 defframe(PARAM_MULTIPLIER,16)
152 defframe(PARAM_SIZE, 12)
153 defframe(PARAM_SRC, 8)
154 defframe(PARAM_DST, 4)
159 PROLOGUE(M4_function_1c)
162 movl PARAM_CARRY, %esi
163 jmp LF(M4_function_1,start_nc)
166 PROLOGUE(M4_function_1)
169 xorl %esi, %esi C initial carry
172 movl PARAM_SIZE, %ecx
180 cmpl $UNROLL_THRESHOLD, %ecx
190 movl PARAM_MULTIPLIER, %ebp
211 M4_inst %eax, -4(%edi)
230 C -----------------------------------------------------------------------------
231 C The unrolled loop uses a "two carry limbs" scheme. At the top of the loop
232 C the carries are ecx=lo, esi=hi, then they swap for each limb processed.
233 C For the computed jump an odd size means they start one way around, an even
236 C VAR_JUMP holds the computed jump temporarily because there's not enough
237 C registers at the point of doing the mul for the initial two carry limbs.
239 C The add/adc for the initial carry in %esi is necessary only for the
240 C mpn_addmul/submul_1c entry points. Duplicating the startup code to
241 C eliminiate this for the plain mpn_add/submul_1 doesn't seem like a good
244 dnl overlapping with parameters already fetched
245 define(VAR_COUNTER, `PARAM_SIZE')
246 define(VAR_JUMP, `PARAM_DST')
263 shrl $UNROLL_LOG2, %edx
264 andl $UNROLL_MASK, %ecx
266 movl %edx, VAR_COUNTER
272 C 15 code bytes per limb
277 leal L(entry) (%edx,%ecx,1), %edx
279 movl (%ebx), %eax C src low limb
281 movl PARAM_MULTIPLIER, %ebp
286 addl %esi, %eax C initial carry (from _1c)
290 leal 4(%ebx,%ecx,4), %ebx
291 movl %edx, %esi C high carry
294 leal (%edi,%ecx,4), %edi
297 movl %eax, %ecx C low carry
300 movl %esi, %ecx C high,low carry other way around
310 C See README.family about old gas bugs
311 leal (%edx,%ecx,1), %edx
312 addl $L(entry)-L(here), %edx
318 C -----------------------------------------------------------
330 C 15 code bytes per limb
332 leal UNROLL_BYTES(%edi), %edi
335 forloop(`i', 0, UNROLL_COUNT/2-1, `
336 deflit(`disp0', eval(2*i*4))
337 deflit(`disp1', eval(disp0 + 4))
339 Zdisp( movl, disp0,(%ebx), %eax)
341 Zdisp( M4_inst,%ecx, disp0,(%edi))
346 movl disp1(%ebx), %eax
348 M4_inst %esi, disp1(%edi)
355 leal UNROLL_BYTES(%ebx), %ebx
361 M4_inst %ecx, UNROLL_BYTES(%edi)