rts/gmp/mpn/x86/k6/aorsmul_1.asm

   1 dnl  AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
   2 dnl
   3 dnl  K6: 7.65 to 8.5 cycles/limb (at 16 limbs/loop and depending on the data),
   4 dnl  PIC adds about 6 cycles at the start.
   5
   6
   7 dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
   8 dnl
   9 dnl  This file is part of the GNU MP Library.
  10 dnl
  11 dnl  The GNU MP Library is free software; you can redistribute it and/or
  12 dnl  modify it under the terms of the GNU Lesser General Public License as
  13 dnl  published by the Free Software Foundation; either version 2.1 of the
  14 dnl  License, or (at your option) any later version.
  15 dnl
  16 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  17 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19 dnl  Lesser General Public License for more details.
  20 dnl
  21 dnl  You should have received a copy of the GNU Lesser General Public
  22 dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
  23 dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
  24 dnl  Suite 330, Boston, MA 02111-1307, USA.
  25
  26
  27 include(`../config.m4')
  28
  29
  30 dnl  K6:           large multpliers  small multpliers
  31 dnl  UNROLL_COUNT    cycles/limb       cycles/limb
  32 dnl        4             9.5              7.78
  33 dnl        8             9.0              7.78
  34 dnl       16             8.4              7.65
  35 dnl       32             8.4              8.2
  36 dnl
  37 dnl  Maximum possible unrolling with the current code is 32.
  38 dnl
  39 dnl  Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256
  40 dnl  byte block, which might explain the good speed at that unrolling.
  41
  42 deflit(UNROLL_COUNT, 16)
  43
  44
  45 ifdef(`OPERATION_addmul_1', `
  46         define(M4_inst,        addl)
  47         define(M4_function_1,  mpn_addmul_1)
  48         define(M4_function_1c, mpn_addmul_1c)
  49         define(M4_description, add it to)
  50         define(M4_desc_retval, carry)
  51 ',`ifdef(`OPERATION_submul_1', `
  52         define(M4_inst,        subl)
  53         define(M4_function_1,  mpn_submul_1)
  54         define(M4_function_1c, mpn_submul_1c)
  55         define(M4_description, subtract it from)
  56         define(M4_desc_retval, borrow)
  57 ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
  58 ')')')
  59
  60 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
  61
  62
  63 C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
  64 C                          mp_limb_t mult);
  65 C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
  66 C                           mp_limb_t mult, mp_limb_t carry);
  67 C
  68 C Calculate src,size multiplied by mult and M4_description dst,size.
  69 C Return the M4_desc_retval limb from the top of the result.
  70 C
  71 C The jadcl0()s in the unrolled loop makes the speed data dependent.  Small
  72 C multipliers (most significant few bits clear) result in few carry bits and
  73 C speeds up to 7.65 cycles/limb are attained.  Large multipliers (most
  74 C significant few bits set) make the carry bits 50/50 and lead to something
  75 C more like 8.4 c/l.  (With adcl's both of these would be 9.3 c/l.)
  76 C
  77 C It's important that the gains for jadcl0 on small multipliers don't come
  78 C at the cost of slowing down other data.  Tests on uniformly distributed
  79 C random data, designed to confound branch prediction, show about a 7%
  80 C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all
  81 C overheads included).
  82 C
  83 C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus
  84 C 11.0 cycles/limb), and hence isn't used.
  85 C
  86 C In the simple loop, note that running ecx from negative to zero and using
  87 C it as an index in the two movs wouldn't help.  It would save one
  88 C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired
  89 C that would be collapsed by this.
  90 C
  91 C
  92 C jadcl0
  93 C ------
  94 C
  95 C jadcl0() being faster than adcl $0 seems to be an artifact of two things,
  96 C firstly the instruction decoding and secondly the fact that there's a
  97 C carry bit for the jadcl0 only on average about 1/4 of the time.
  98 C
  99 C The code in the unrolled loop decodes something like the following.
 100 C
 101 C                                         decode cycles
 102 C               mull    %ebp                    2
 103 C               M4_inst %esi, disp(%edi)        1
 104 C               adcl    %eax, %ecx              2
 105 C               movl    %edx, %esi            \ 1
 106 C               jnc     1f                    /
 107 C               incl    %esi                  \ 1
 108 C       1:      movl    disp(%ebx), %eax      /
 109 C                                              ---
 110 C                                               7
 111 C
 112 C In a back-to-back style test this measures 7 with the jnc not taken, or 8
 113 C with it taken (both when correctly predicted).  This is opposite to the
 114 C measurements showing small multipliers running faster than large ones.
 115 C Watch this space for more info ...
 116 C
 117 C It's not clear how much branch misprediction might be costing.  The K6
 118 C doco says it will be 1 to 4 cycles, but presumably it's near the low end
 119 C of that range to get the measured results.
 120 C
 121 C
 122 C In the code the two carries are more or less the preceding mul product and
 123 C the calculation is roughly
 124 C
 125 C       x*y + u*b+v
 126 C
 127 C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and
 128 C v are the two limbs it's added to (being the low of the next mul, and a
 129 C limb from the destination).
 130 C
 131 C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and
 132 C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of
 133 C x*y/b^2.  If x, y, u and v are random and uniformly distributed between 0
 134 C and b-1, then the total probability can be summed over x and y,
 135 C
 136 C        1    b-1 b-1 x*y    1    b*(b-1)   b*(b-1)
 137 C       --- * sum sum --- = --- * ------- * ------- = 1/4
 138 C       b^2   x=0 y=1 b^2   b^4      2         2
 139 C
 140 C Actually it's a very tiny bit less than 1/4 of course.  If y is fixed,
 141 C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2.
 142
 143
 144 ifdef(`PIC',`
 145 deflit(UNROLL_THRESHOLD, 9)
 146 ',`
 147 deflit(UNROLL_THRESHOLD, 6)
 148 ')
 149
 150 defframe(PARAM_CARRY,     20)
 151 defframe(PARAM_MULTIPLIER,16)
 152 defframe(PARAM_SIZE,      12)
 153 defframe(PARAM_SRC,       8)
 154 defframe(PARAM_DST,       4)
 155
 156         .text
 157         ALIGN(32)
 158
 159 PROLOGUE(M4_function_1c)
 160         pushl   %esi
 161 deflit(`FRAME',4)
 162         movl    PARAM_CARRY, %esi
 163         jmp     LF(M4_function_1,start_nc)
 164 EPILOGUE()
 165
 166 PROLOGUE(M4_function_1)
 167         push    %esi
 168 deflit(`FRAME',4)
 169         xorl    %esi, %esi      C initial carry
 170
 171 L(start_nc):
 172         movl    PARAM_SIZE, %ecx
 173         pushl   %ebx
 174 deflit(`FRAME',8)
 175
 176         movl    PARAM_SRC, %ebx
 177         pushl   %edi
 178 deflit(`FRAME',12)
 179
 180         cmpl    $UNROLL_THRESHOLD, %ecx
 181         movl    PARAM_DST, %edi
 182
 183         pushl   %ebp
 184 deflit(`FRAME',16)
 185         jae     L(unroll)
 186
 187
 188         C simple loop
 189
 190         movl    PARAM_MULTIPLIER, %ebp
 191
 192 L(simple):
 193         C eax   scratch
 194         C ebx   src
 195         C ecx   counter
 196         C edx   scratch
 197         C esi   carry
 198         C edi   dst
 199         C ebp   multiplier
 200
 201         movl    (%ebx), %eax
 202         addl    $4, %ebx
 203
 204         mull    %ebp
 205
 206         addl    $4, %edi
 207         addl    %esi, %eax
 208
 209         adcl    $0, %edx
 210
 211         M4_inst %eax, -4(%edi)
 212
 213         adcl    $0, %edx
 214
 215         movl    %edx, %esi
 216         loop    L(simple)
 217
 218
 219         popl    %ebp
 220         popl    %edi
 221
 222         popl    %ebx
 223         movl    %esi, %eax
 224
 225         popl    %esi
 226         ret
 227
 228
 229
 230 C -----------------------------------------------------------------------------
 231 C The unrolled loop uses a "two carry limbs" scheme.  At the top of the loop
 232 C the carries are ecx=lo, esi=hi, then they swap for each limb processed.
 233 C For the computed jump an odd size means they start one way around, an even
 234 C size the other.
 235 C
 236 C VAR_JUMP holds the computed jump temporarily because there's not enough
 237 C registers at the point of doing the mul for the initial two carry limbs.
 238 C
 239 C The add/adc for the initial carry in %esi is necessary only for the
 240 C mpn_addmul/submul_1c entry points.  Duplicating the startup code to
 241 C eliminiate this for the plain mpn_add/submul_1 doesn't seem like a good
 242 C idea.
 243
 244 dnl  overlapping with parameters already fetched
 245 define(VAR_COUNTER, `PARAM_SIZE')
 246 define(VAR_JUMP,    `PARAM_DST')
 247
 248 L(unroll):
 249         C eax
 250         C ebx   src
 251         C ecx   size
 252         C edx
 253         C esi   initial carry
 254         C edi   dst
 255         C ebp
 256
 257         movl    %ecx, %edx
 258         decl    %ecx
 259
 260         subl    $2, %edx
 261         negl    %ecx
 262
 263         shrl    $UNROLL_LOG2, %edx
 264         andl    $UNROLL_MASK, %ecx
 265
 266         movl    %edx, VAR_COUNTER
 267         movl    %ecx, %edx
 268
 269         shll    $4, %edx
 270         negl    %ecx
 271
 272         C 15 code bytes per limb
 273 ifdef(`PIC',`
 274         call    L(pic_calc)
 275 L(here):
 276 ',`
 277         leal    L(entry) (%edx,%ecx,1), %edx
 278 ')
 279         movl    (%ebx), %eax            C src low limb
 280
 281         movl    PARAM_MULTIPLIER, %ebp
 282         movl    %edx, VAR_JUMP
 283
 284         mull    %ebp
 285
 286         addl    %esi, %eax      C initial carry (from _1c)
 287         jadcl0( %edx)
 288
 289
 290         leal    4(%ebx,%ecx,4), %ebx
 291         movl    %edx, %esi      C high carry
 292
 293         movl    VAR_JUMP, %edx
 294         leal    (%edi,%ecx,4), %edi
 295
 296         testl   $1, %ecx
 297         movl    %eax, %ecx      C low carry
 298
 299         jz      L(noswap)
 300         movl    %esi, %ecx      C high,low carry other way around
 301
 302         movl    %eax, %esi
 303 L(noswap):
 304
 305         jmp     *%edx
 306
 307
 308 ifdef(`PIC',`
 309 L(pic_calc):
 310         C See README.family about old gas bugs
 311         leal    (%edx,%ecx,1), %edx
 312         addl    $L(entry)-L(here), %edx
 313         addl    (%esp), %edx
 314         ret
 315 ')
 316
 317
 318 C -----------------------------------------------------------
 319         ALIGN(32)
 320 L(top):
 321 deflit(`FRAME',16)
 322         C eax   scratch
 323         C ebx   src
 324         C ecx   carry lo
 325         C edx   scratch
 326         C esi   carry hi
 327         C edi   dst
 328         C ebp   multiplier
 329         C
 330         C 15 code bytes per limb
 331
 332         leal    UNROLL_BYTES(%edi), %edi
 333
 334 L(entry):
 335 forloop(`i', 0, UNROLL_COUNT/2-1, `
 336         deflit(`disp0', eval(2*i*4))
 337         deflit(`disp1', eval(disp0 + 4))
 338
 339 Zdisp(  movl,   disp0,(%ebx), %eax)
 340         mull    %ebp
 341 Zdisp(  M4_inst,%ecx, disp0,(%edi))
 342         adcl    %eax, %esi
 343         movl    %edx, %ecx
 344         jadcl0( %ecx)
 345
 346         movl    disp1(%ebx), %eax
 347         mull    %ebp
 348         M4_inst %esi, disp1(%edi)
 349         adcl    %eax, %ecx
 350         movl    %edx, %esi
 351         jadcl0( %esi)
 352 ')
 353
 354         decl    VAR_COUNTER
 355         leal    UNROLL_BYTES(%ebx), %ebx
 356
 357         jns     L(top)
 358
 359
 360         popl    %ebp
 361         M4_inst %ecx, UNROLL_BYTES(%edi)
 362
 363         popl    %edi
 364         movl    %esi, %eax
 365
 366         popl    %ebx
 367         jadcl0( %eax)
 368
 369         popl    %esi
 370         ret
 371
 372 EPILOGUE()