X-Git-Url: http://git.megacz.com/?a=blobdiff_plain;f=ghc%2Frts%2Fgmp%2Fmpn%2Fx86%2Fk6%2Faorsmul_1.asm;fp=ghc%2Frts%2Fgmp%2Fmpn%2Fx86%2Fk6%2Faorsmul_1.asm;h=da4120fe2f64815c63cb4381ccc570b6f322ad4a;hb=e72806f1683fc7acf7bd10885e8e45ff06d790f1;hp=0000000000000000000000000000000000000000;hpb=a6c4cc3666b15a6392279d492f87861c1f18b2de;p=ghc-hetmet.git diff --git a/ghc/rts/gmp/mpn/x86/k6/aorsmul_1.asm b/ghc/rts/gmp/mpn/x86/k6/aorsmul_1.asm new file mode 100644 index 0000000..da4120f --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k6/aorsmul_1.asm @@ -0,0 +1,372 @@ +dnl AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple. +dnl +dnl K6: 7.65 to 8.5 cycles/limb (at 16 limbs/loop and depending on the data), +dnl PIC adds about 6 cycles at the start. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl K6: large multpliers small multpliers +dnl UNROLL_COUNT cycles/limb cycles/limb +dnl 4 9.5 7.78 +dnl 8 9.0 7.78 +dnl 16 8.4 7.65 +dnl 32 8.4 8.2 +dnl +dnl Maximum possible unrolling with the current code is 32. +dnl +dnl Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256 +dnl byte block, which might explain the good speed at that unrolling. + +deflit(UNROLL_COUNT, 16) + + +ifdef(`OPERATION_addmul_1', ` + define(M4_inst, addl) + define(M4_function_1, mpn_addmul_1) + define(M4_function_1c, mpn_addmul_1c) + define(M4_description, add it to) + define(M4_desc_retval, carry) +',`ifdef(`OPERATION_submul_1', ` + define(M4_inst, subl) + define(M4_function_1, mpn_submul_1) + define(M4_function_1c, mpn_submul_1c) + define(M4_description, subtract it from) + define(M4_desc_retval, borrow) +',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 +')')') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) + + +C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); +C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult, mp_limb_t carry); +C +C Calculate src,size multiplied by mult and M4_description dst,size. +C Return the M4_desc_retval limb from the top of the result. +C +C The jadcl0()s in the unrolled loop makes the speed data dependent. Small +C multipliers (most significant few bits clear) result in few carry bits and +C speeds up to 7.65 cycles/limb are attained. Large multipliers (most +C significant few bits set) make the carry bits 50/50 and lead to something +C more like 8.4 c/l. (With adcl's both of these would be 9.3 c/l.) +C +C It's important that the gains for jadcl0 on small multipliers don't come +C at the cost of slowing down other data. Tests on uniformly distributed +C random data, designed to confound branch prediction, show about a 7% +C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all +C overheads included). +C +C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus +C 11.0 cycles/limb), and hence isn't used. +C +C In the simple loop, note that running ecx from negative to zero and using +C it as an index in the two movs wouldn't help. It would save one +C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired +C that would be collapsed by this. +C +C +C jadcl0 +C ------ +C +C jadcl0() being faster than adcl $0 seems to be an artifact of two things, +C firstly the instruction decoding and secondly the fact that there's a +C carry bit for the jadcl0 only on average about 1/4 of the time. +C +C The code in the unrolled loop decodes something like the following. +C +C decode cycles +C mull %ebp 2 +C M4_inst %esi, disp(%edi) 1 +C adcl %eax, %ecx 2 +C movl %edx, %esi \ 1 +C jnc 1f / +C incl %esi \ 1 +C 1: movl disp(%ebx), %eax / +C --- +C 7 +C +C In a back-to-back style test this measures 7 with the jnc not taken, or 8 +C with it taken (both when correctly predicted). This is opposite to the +C measurements showing small multipliers running faster than large ones. +C Watch this space for more info ... +C +C It's not clear how much branch misprediction might be costing. The K6 +C doco says it will be 1 to 4 cycles, but presumably it's near the low end +C of that range to get the measured results. +C +C +C In the code the two carries are more or less the preceding mul product and +C the calculation is roughly +C +C x*y + u*b+v +C +C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and +C v are the two limbs it's added to (being the low of the next mul, and a +C limb from the destination). +C +C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and +C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of +C x*y/b^2. If x, y, u and v are random and uniformly distributed between 0 +C and b-1, then the total probability can be summed over x and y, +C +C 1 b-1 b-1 x*y 1 b*(b-1) b*(b-1) +C --- * sum sum --- = --- * ------- * ------- = 1/4 +C b^2 x=0 y=1 b^2 b^4 2 2 +C +C Actually it's a very tiny bit less than 1/4 of course. If y is fixed, +C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2. + + +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 9) +',` +deflit(UNROLL_THRESHOLD, 6) +') + +defframe(PARAM_CARRY, 20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(32) + +PROLOGUE(M4_function_1c) + pushl %esi +deflit(`FRAME',4) + movl PARAM_CARRY, %esi + jmp LF(M4_function_1,start_nc) +EPILOGUE() + +PROLOGUE(M4_function_1) + push %esi +deflit(`FRAME',4) + xorl %esi, %esi C initial carry + +L(start_nc): + movl PARAM_SIZE, %ecx + pushl %ebx +deflit(`FRAME',8) + + movl PARAM_SRC, %ebx + pushl %edi +deflit(`FRAME',12) + + cmpl $UNROLL_THRESHOLD, %ecx + movl PARAM_DST, %edi + + pushl %ebp +deflit(`FRAME',16) + jae L(unroll) + + + C simple loop + + movl PARAM_MULTIPLIER, %ebp + +L(simple): + C eax scratch + C ebx src + C ecx counter + C edx scratch + C esi carry + C edi dst + C ebp multiplier + + movl (%ebx), %eax + addl $4, %ebx + + mull %ebp + + addl $4, %edi + addl %esi, %eax + + adcl $0, %edx + + M4_inst %eax, -4(%edi) + + adcl $0, %edx + + movl %edx, %esi + loop L(simple) + + + popl %ebp + popl %edi + + popl %ebx + movl %esi, %eax + + popl %esi + ret + + + +C ----------------------------------------------------------------------------- +C The unrolled loop uses a "two carry limbs" scheme. At the top of the loop +C the carries are ecx=lo, esi=hi, then they swap for each limb processed. +C For the computed jump an odd size means they start one way around, an even +C size the other. +C +C VAR_JUMP holds the computed jump temporarily because there's not enough +C registers at the point of doing the mul for the initial two carry limbs. +C +C The add/adc for the initial carry in %esi is necessary only for the +C mpn_addmul/submul_1c entry points. Duplicating the startup code to +C eliminiate this for the plain mpn_add/submul_1 doesn't seem like a good +C idea. + +dnl overlapping with parameters already fetched +define(VAR_COUNTER, `PARAM_SIZE') +define(VAR_JUMP, `PARAM_DST') + +L(unroll): + C eax + C ebx src + C ecx size + C edx + C esi initial carry + C edi dst + C ebp + + movl %ecx, %edx + decl %ecx + + subl $2, %edx + negl %ecx + + shrl $UNROLL_LOG2, %edx + andl $UNROLL_MASK, %ecx + + movl %edx, VAR_COUNTER + movl %ecx, %edx + + shll $4, %edx + negl %ecx + + C 15 code bytes per limb +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(entry) (%edx,%ecx,1), %edx +') + movl (%ebx), %eax C src low limb + + movl PARAM_MULTIPLIER, %ebp + movl %edx, VAR_JUMP + + mull %ebp + + addl %esi, %eax C initial carry (from _1c) + jadcl0( %edx) + + + leal 4(%ebx,%ecx,4), %ebx + movl %edx, %esi C high carry + + movl VAR_JUMP, %edx + leal (%edi,%ecx,4), %edi + + testl $1, %ecx + movl %eax, %ecx C low carry + + jz L(noswap) + movl %esi, %ecx C high,low carry other way around + + movl %eax, %esi +L(noswap): + + jmp *%edx + + +ifdef(`PIC',` +L(pic_calc): + C See README.family about old gas bugs + leal (%edx,%ecx,1), %edx + addl $L(entry)-L(here), %edx + addl (%esp), %edx + ret +') + + +C ----------------------------------------------------------- + ALIGN(32) +L(top): +deflit(`FRAME',16) + C eax scratch + C ebx src + C ecx carry lo + C edx scratch + C esi carry hi + C edi dst + C ebp multiplier + C + C 15 code bytes per limb + + leal UNROLL_BYTES(%edi), %edi + +L(entry): +forloop(`i', 0, UNROLL_COUNT/2-1, ` + deflit(`disp0', eval(2*i*4)) + deflit(`disp1', eval(disp0 + 4)) + +Zdisp( movl, disp0,(%ebx), %eax) + mull %ebp +Zdisp( M4_inst,%ecx, disp0,(%edi)) + adcl %eax, %esi + movl %edx, %ecx + jadcl0( %ecx) + + movl disp1(%ebx), %eax + mull %ebp + M4_inst %esi, disp1(%edi) + adcl %eax, %ecx + movl %edx, %esi + jadcl0( %esi) +') + + decl VAR_COUNTER + leal UNROLL_BYTES(%ebx), %ebx + + jns L(top) + + + popl %ebp + M4_inst %ecx, UNROLL_BYTES(%edi) + + popl %edi + movl %esi, %eax + + popl %ebx + jadcl0( %eax) + + popl %esi + ret + +EPILOGUE()