rts/gmp/mpn/x86/k7/aorsmul_1.asm

   1 dnl  AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
   2 dnl
   3 dnl  K7: 3.9 cycles/limb.
   4 dnl
   5 dnl  Future: It should be possible to avoid the separate mul after the
   6 dnl  unrolled loop by moving the movl/adcl to the top.
   7
   8
   9 dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
  10 dnl
  11 dnl  This file is part of the GNU MP Library.
  12 dnl
  13 dnl  The GNU MP Library is free software; you can redistribute it and/or
  14 dnl  modify it under the terms of the GNU Lesser General Public License as
  15 dnl  published by the Free Software Foundation; either version 2.1 of the
  16 dnl  License, or (at your option) any later version.
  17 dnl
  18 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  19 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  21 dnl  Lesser General Public License for more details.
  22 dnl
  23 dnl  You should have received a copy of the GNU Lesser General Public
  24 dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
  25 dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
  26 dnl  Suite 330, Boston, MA 02111-1307, USA.
  27
  28
  29 include(`../config.m4')
  30
  31
  32 dnl  K7: UNROLL_COUNT  cycles/limb
  33 dnl           4            4.42
  34 dnl           8            4.16
  35 dnl          16            3.9
  36 dnl          32            3.9
  37 dnl          64            3.87
  38 dnl  Maximum possible with the current code is 64.
  39 dnl  Limbs per pass of the unrolled loop (emitted two at a time further down).
  40 deflit(UNROLL_COUNT, 16)
  41 dnl  Pick the read-modify-write instruction, the exported names and the words
  42 dnl  used in the comments from whichever OPERATION_xxx symbol is defined.
  43 ifdef(`OPERATION_addmul_1',`
  44         define(M4_inst,        addl)
  45         define(M4_function_1,  mpn_addmul_1)
  46         define(M4_function_1c, mpn_addmul_1c)
  47         define(M4_description, add it to)
  48         define(M4_desc_retval, carry)
  49 ',`ifdef(`OPERATION_submul_1',`
  50         define(M4_inst,        subl)
  51         define(M4_function_1,  mpn_submul_1)
  52         define(M4_function_1c, mpn_submul_1c)
  53         define(M4_description, subtract it from)
  54         define(M4_desc_retval, borrow)
  55 ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
  56 ')')')
  57 dnl  NOTE(review): presumably lists the entry points for the build; confirm.
  58 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
  59
  60
  61 C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
  62 C                            mp_limb_t mult);
  63 C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
  64 C                             mp_limb_t mult, mp_limb_t carry);
  65 C
  66 C Calculate src,size multiplied by mult and M4_description dst,size.
  67 C Return the M4_desc_retval limb from the top of the result.
  68 dnl  The unrolled loop is used when size-1 is above this (unsigned compare).
  69 ifdef(`PIC',`
  70 deflit(UNROLL_THRESHOLD, 9)
  71 ',`
  72 deflit(UNROLL_THRESHOLD, 6)
  73 ')
  74 dnl  Arguments: dst, src, size, mult, carry (1c only); presumably esp-relative.
  75 defframe(PARAM_CARRY,     20)
  76 defframe(PARAM_MULTIPLIER,16)
  77 defframe(PARAM_SIZE,      12)
  78 defframe(PARAM_SRC,       8)
  79 defframe(PARAM_DST,       4)
  80 deflit(`FRAME',0)
  81 dnl  Four 4-byte slots for the saved registers, inside the SAVE_SIZE area.
  82 defframe(SAVE_EBX, -4)
  83 defframe(SAVE_ESI, -8)
  84 defframe(SAVE_EDI, -12)
  85 defframe(SAVE_EBP, -16)
  86 deflit(SAVE_SIZE, 16)
  87         C Entry without a carry-in: the shared code is entered with ecx zero.
  88         .text
  89         ALIGN(32)
  90 PROLOGUE(M4_function_1)
  91         movl    PARAM_SIZE, %edx
  92         movl    PARAM_SRC, %eax
  93         xorl    %ecx, %ecx      C carry-in is zero
  94         C NOTE(review): size==0 is not tested, it wraps to -1 and is taken as big.
  95         decl    %edx            C size-1, zero only for a single limb
  96         jnz     LF(M4_function_1c,start_1)      C multi-limb code in the 1c routine
  97         C single limb: no stack frame and no saved registers needed
  98         movl    (%eax), %eax    C src limb
  99         movl    PARAM_DST, %ecx
 100
 101         mull    PARAM_MULTIPLIER        C edx:eax = src limb * multiplier
 102
 103         M4_inst %eax, (%ecx)    C apply the low half to the dst limb
 104         adcl    $0, %edx        C carry or borrow out goes into the high half
 105         movl    %edx, %eax      C return value
 106
 107         ret
 108 EPILOGUE()
 109         C Entry with an explicit carry-in limb, read from PARAM_CARRY.
 110         ALIGN(16)
 111 PROLOGUE(M4_function_1c)
 112         movl    PARAM_SIZE, %edx
 113         movl    PARAM_SRC, %eax
 114
 115         decl    %edx            C size-1, zero only for a single limb
 116         jnz     L(more_than_one_limb)
 117         C single limb: no stack frame and no saved registers needed
 118         movl    (%eax), %eax    C src limb
 119         movl    PARAM_DST, %ecx
 120
 121         mull    PARAM_MULTIPLIER        C edx:eax = src limb * multiplier
 122
 123         addl    PARAM_CARRY, %eax       C fold the carry-in into the low half
 124
 125         adcl    $0, %edx
 126         M4_inst %eax, (%ecx)    C apply the low half to the dst limb
 127
 128         adcl    $0, %edx        C carry or borrow from the dst update
 129         movl    %edx, %eax      C return value
 130
 131         ret
 132
 133
 134         C offset 0x44 so close enough to aligned
 135 L(more_than_one_limb):
 136         movl    PARAM_CARRY, %ecx
 137 L(start_1):
 138         C eax   src
 139         C ecx   initial carry
 140         C edx   size-1
 141         subl    $SAVE_SIZE, %esp        C room for the four saved registers
 142 deflit(`FRAME',16)
 143         C M4_function_1 also jumps to L(start_1), with ecx set to zero.
 144         movl    %ebx, SAVE_EBX
 145         movl    %esi, SAVE_ESI
 146         movl    %edx, %ebx      C size-1
 147
 148         movl    PARAM_SRC, %esi
 149         movl    %ebp, SAVE_EBP
 150         cmpl    $UNROLL_THRESHOLD, %edx         C flags are used by the ja below
 151         C the movl instructions in between leave the compare flags alone
 152         movl    PARAM_MULTIPLIER, %ebp
 153         movl    %edi, SAVE_EDI
 154
 155         movl    (%esi), %eax    C src low limb
 156         movl    PARAM_DST, %edi
 157         ja      L(unroll)       C size-1 above UNROLL_THRESHOLD, unsigned
 158
 159
 160         C simple loop
 161
 162         leal    4(%esi,%ebx,4), %esi    C point one limb past last
 163         leal    (%edi,%ebx,4), %edi     C point at last limb
 164         negl    %ebx            C -(size-1), counts up to zero
 165
 166         C The movl to load the next source limb is done well ahead of the
 167         C mul.  This is necessary for full speed, and leads to one limb
 168         C handled separately at the end.
 169
 170 L(simple):
 171         C eax   src limb
 172         C ebx   loop counter
 173         C ecx   carry limb
 174         C edx   scratch
 175         C esi   src
 176         C edi   dst
 177         C ebp   multiplier
 178
 179         mull    %ebp            C edx:eax = src limb * multiplier
 180
 181         addl    %eax, %ecx      C low half plus the incoming carry limb
 182         adcl    $0, %edx
 183
 184         M4_inst %ecx, (%edi,%ebx,4)
 185         movl    (%esi,%ebx,4), %eax     C next src limb, flags are not touched
 186         adcl    $0, %edx        C carry or borrow from the dst update
 187
 188         incl    %ebx
 189         movl    %edx, %ecx      C high half is the next carry limb
 190         jnz     L(simple)       C tests the incl result
 191
 192         C last limb, its source value was loaded by the final loop pass
 193         mull    %ebp
 194
 195         movl    SAVE_EBX, %ebx
 196         movl    SAVE_ESI, %esi
 197         movl    SAVE_EBP, %ebp
 198
 199         addl    %eax, %ecx
 200         adcl    $0, %edx
 201
 202         M4_inst %ecx, (%edi)
 203         adcl    $0, %edx
 204         movl    SAVE_EDI, %edi  C restored last, edi was still the dst pointer above
 205
 206         addl    $SAVE_SIZE, %esp
 207         movl    %edx, %eax      C return the final carry or borrow limb
 208         ret
 209
 210
 211
 212 C -----------------------------------------------------------------------------
 213         ALIGN(16)
 214 L(unroll):
 215         C eax   src low limb
 216         C ebx   size-1
 217         C ecx   carry
 218         C edx   size-1
 219         C esi   src
 220         C edi   dst
 221         C ebp   multiplier
 222         C The two variables below overwrite the incoming stack argument slots.
 223 dnl  overlapping with parameters no longer needed
 224 define(VAR_COUNTER,`PARAM_SIZE')
 225 define(VAR_JUMP,   `PARAM_MULTIPLIER')
 226         C size-2 limb slots run inside the loop, the first pass may be partial.
 227         subl    $2, %ebx        C (size-2)-1
 228         decl    %edx            C size-2
 229
 230         shrl    $UNROLL_LOG2, %ebx      C loop passes after the first one
 231         negl    %edx
 232
 233         movl    %ebx, VAR_COUNTER
 234         andl    $UNROLL_MASK, %edx      C slots to skip in the first pass
 235         C NOTE(review): assumes UNROLL_MASK is UNROLL_COUNT-1; defined elsewhere.
 236         movl    %edx, %ebx
 237         shll    $4, %edx        C 16*skip, the leal adds skip once more for 17
 238
 239 ifdef(`PIC',`
 240         call    L(pic_calc)     C returns to L(here) with the target in edx
 241 L(here):
 242 ',`
 243         leal    L(entry) (%edx,%ebx,1), %edx
 244 ')
 245         negl    %ebx            C -skip, biases the pointers set up below
 246         movl    %edx, VAR_JUMP  C L(entry) + 17*skip
 247
 248         mull    %ebp            C product of the first src limb
 249
 250         addl    %eax, %ecx      C initial carry, becomes low carry
 251         adcl    $0, %edx
 252         testb   $1, %bl         C parity of skip, flags kept until the cmovnz
 253         C src+8 and dst are each moved back by skip limbs (the movl/leal keep flags)
 254         movl    4(%esi), %eax   C src second limb
 255         leal    ifelse(UNROLL_BYTES,256,128+) 8(%esi,%ebx,4), %esi
 256         leal    ifelse(UNROLL_BYTES,256,128)   (%edi,%ebx,4), %edi
 257         C an odd skip enters at a slot that takes its low carry from ebx
 258         movl    %edx, %ebx      C high carry
 259         cmovnz( %ecx, %ebx)     C high,low carry other way around
 260         cmovnz( %edx, %ecx)
 261
 262         jmp     *VAR_JUMP       C enter the unrolled code at slot number skip
 263
 264
 265 ifdef(`PIC',`
 266 L(pic_calc):
 267         C See README.family about old gas bugs
 268         leal    (%edx,%ebx,1), %edx     C 17*skip
 269         addl    $L(entry)-L(here), %edx C plus the distance from here to entry
 270         addl    (%esp), %edx    C plus the return address, which is L(here)
 271         ret
 272 ')
 273
 274
 275 C -----------------------------------------------------------------------------
 276 C This code uses a "two carry limbs" scheme.  At the top of the loop the
 277 C carries are ecx=lo, ebx=hi, then they swap for each limb processed.  For
 278 C the computed jump an odd size means they start one way around, an even
 279 C size the other.  Either way one limb is handled separately at the start of
 280 C the loop.
 281 C
 282 C The positioning of the movl to load the next source limb is important.
 283 C Moving it after the adcl with a view to avoiding a separate mul at the end
 284 C of the loop slows the code down.
 285
 286         ALIGN(32)
 287 L(top):
 288         C eax   src limb
 289         C ebx   carry high
 290         C ecx   carry low
 291         C edx   scratch
 292         C esi   src+8
 293         C edi   dst
 294         C ebp   multiplier
 295         C
 296         C VAR_COUNTER  loop counter
 297         C
 298         C 17 bytes each limb
 299         C NOTE(review): Zdisp presumably keeps a zero displacement byte; confirm.
 300 L(entry):
 301 deflit(CHUNK_COUNT,2)
 302 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
 303         deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
 304         deflit(`disp1', eval(disp0 + 4))
 305
 306         mull    %ebp            C product of the limb loaded by the previous slot
 307
 308 Zdisp(  M4_inst,%ecx, disp0,(%edi))
 309         movl    $0, %ecx        C movl rather than xorl so the carry flag survives
 310
 311         adcl    %eax, %ebx      C old high + new low + flag is the next low carry
 312
 313 Zdisp(  movl,   disp0,(%esi), %eax)
 314         adcl    %edx, %ecx      C new high + flag is the next high carry
 315
 316         C second slot: the same steps with the roles of ebx and ecx exchanged
 317         mull    %ebp
 318
 319         M4_inst %ebx, disp1(%edi)
 320         movl    $0, %ebx
 321
 322         adcl    %eax, %ecx
 323
 324         movl    disp1(%esi), %eax
 325         adcl    %edx, %ebx
 326 ')
 327
 328         decl    VAR_COUNTER     C sets the sign flag tested by the jns
 329         leal    UNROLL_BYTES(%esi), %esi        C leal does not change flags
 330         leal    UNROLL_BYTES(%edi), %edi
 331
 332         jns     L(top)          C loop until the counter goes negative
 333
 334         C Final limb: its source value was loaded by the last slot of the loop.
 335         C eax   src limb
 336         C ebx   carry high
 337         C ecx   carry low
 338         C edx
 339         C esi
 340         C edi   dst (points at second last limb)
 341         C ebp   multiplier
 342 deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
 343 deflit(`disp1', eval(disp0-0 + 4))
 344
 345         mull    %ebp
 346
 347         M4_inst %ecx, disp0(%edi)       C pending low carry into second last limb
 348         movl    SAVE_EBP, %ebp
 349
 350         adcl    %ebx, %eax      C low half + pending high carry + flag
 351         movl    SAVE_EBX, %ebx
 352         movl    SAVE_ESI, %esi
 353
 354         adcl    $0, %edx
 355         M4_inst %eax, disp1(%edi)       C last dst limb
 356         movl    SAVE_EDI, %edi
 357
 358         adcl    $0, %edx        C carry or borrow from the last dst update
 359         addl    $SAVE_SIZE, %esp
 360
 361         movl    %edx, %eax      C return the final carry or borrow limb
 362         ret
 363
 364 EPILOGUE()
 364 EPILOGUE()