ghc/rts/gmp/mpn/x86/k7/mul_1.asm

   1 dnl  AMD K7 mpn_mul_1 -- mpn by limb multiply.
   2 dnl
   3 dnl  K7: 3.4 cycles/limb (at 16 limbs/loop).
   4
   5
   6 dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
   7 dnl
   8 dnl  This file is part of the GNU MP Library.
   9 dnl
  10 dnl  The GNU MP Library is free software; you can redistribute it and/or
  11 dnl  modify it under the terms of the GNU Lesser General Public License as
  12 dnl  published by the Free Software Foundation; either version 2.1 of the
  13 dnl  License, or (at your option) any later version.
  14 dnl
  15 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  16 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18 dnl  Lesser General Public License for more details.
  19 dnl
  20 dnl  You should have received a copy of the GNU Lesser General Public
  21 dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
  22 dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
  23 dnl  Suite 330, Boston, MA 02111-1307, USA.
  24
  25
  26 include(`../config.m4')
  27
  28
  29 dnl  K7: UNROLL_COUNT cycles/limb
  30 dnl           8           3.9
  31 dnl          16           3.4
  32 dnl          32           3.4
  33 dnl          64           3.35
  34 dnl  Maximum possible with the current code is 64.
  35
  36 deflit(UNROLL_COUNT, 16)
     dnl  Number of per-limb code copies the forloop emits in the unrolled loop.
     dnl  UNROLL_LOG2, UNROLL_MASK and UNROLL_BYTES used further down are not
     dnl  defined in this file - presumably derived from UNROLL_COUNT elsewhere;
     dnl  TODO confirm.
  37
  38
  39 C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
  40 C                      mp_limb_t multiplier);
  41 C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
  42 C                       mp_limb_t multiplier, mp_limb_t carry);
  43 C
  44 C Multiply src,size by multiplier and store the result in dst,size.
  45 C Return the carry limb from the top of the result.
  46 C
  47 C mpn_mul_1c() accepts an initial carry for the calculation; it is added
  48 C into the low limb of the destination.
  49 C
  50 C Variations on the unrolled loop have been tried, with the current
  51 C registers or with the counter on the stack to free up ecx.  The current
  52 C code is the fastest found.
  53 C
  54 C An interesting effect is that removing the stores "movl %ebx, disp0(%edi)"
  55 C from the unrolled loop actually slows it down to 5.0 cycles/limb.  Code
  56 C with this change can be tested on sizes of the form UNROLL_COUNT*n+1
  57 C without having to change the computed jump.  There's obviously something
  58 C fishy going on, perhaps with what execution units the mul needs.
  59
  60 defframe(PARAM_CARRY,     20)
  61 defframe(PARAM_MULTIPLIER,16)
  62 defframe(PARAM_SIZE,      12)
  63 defframe(PARAM_SRC,       8)
  64 defframe(PARAM_DST,       4)
     dnl  The PARAM_ offsets above rise in steps of 4 in the order of the C
     dnl  arguments: dst, src, size, multiplier, carry.  PARAM_CARRY is read
     dnl  only by mpn_mul_1c.
  65
  66 defframe(SAVE_EBP, -4)
  67 defframe(SAVE_EDI, -8)
  68 defframe(SAVE_ESI, -12)
  69 defframe(SAVE_EBX, -16)
  70 deflit(STACK_SPACE, 16)
     dnl  STACK_SPACE covers the four 4-byte SAVE_ slots.  The code subtracts it
     dnl  from esp on entry and adds it back before each ret.
  71
  72 dnl  Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
     dnl  Sizes below UNROLL_THRESHOLD are handled by the L(simple) loop.
     dnl  NOTE(review): the higher PIC value presumably reflects the extra call
     dnl  used to form the computed jump address - confirm.
  73 ifdef(`PIC',`
  74 deflit(UNROLL_THRESHOLD, 7)
  75 ',`
  76 deflit(UNROLL_THRESHOLD, 5)
  77 ')
  78
  79         .text
  80         ALIGN(32)
  81 PROLOGUE(mpn_mul_1c)
     C Entry point taking an initial carry.  The carry argument is loaded into
     C edx and control joins mpn_mul_1 at L(start_nc), which is placed after
     C the xorl that would otherwise zero edx.
  82 deflit(`FRAME',0)
  83         movl    PARAM_CARRY, %edx
  84         jmp     LF(mpn_mul_1,start_nc)
  85 EPILOGUE()
  86
  87
  88 PROLOGUE(mpn_mul_1)
     C Entry point without an initial carry.  Each source limb is multiplied
     C by the multiplier, the running carry is added, the low limb is stored
     C to dst and the high limb plus the carry flag becomes the next carry.
     C The final carry is left in eax at ret.  Sizes below UNROLL_THRESHOLD
     C run the L(simple) loop, larger sizes go to L(unroll).
  89 deflit(`FRAME',0)
  90         xorl    %edx, %edx      C initial carry
  91 L(start_nc):
     C mpn_mul_1c joins here with its carry argument already in edx.
  92         movl    PARAM_SIZE, %ecx
  93         subl    $STACK_SPACE, %esp      C make room for the SAVE_ slots
  94 deflit(`FRAME', STACK_SPACE)
  95
  96         movl    %edi, SAVE_EDI
  97         movl    %ebx, SAVE_EBX
  98         movl    %edx, %ebx              C ebx = carry in
  99
 100         movl    %esi, SAVE_ESI
 101         movl    PARAM_SRC, %esi
 102         cmpl    $UNROLL_THRESHOLD, %ecx C flags are used by the jae below
 103
 104         movl    PARAM_DST, %edi         C movl does not alter the flags
 105         movl    %ebp, SAVE_EBP
 106         jae     L(unroll)               C unsigned size >= UNROLL_THRESHOLD
 107
 108         leal    (%esi,%ecx,4), %esi     C esi = just past the end of src
 109         leal    (%edi,%ecx,4), %edi     C edi = just past the end of dst
 110         negl    %ecx                    C index runs from -size up to 0
 111
 112         movl    PARAM_MULTIPLIER, %ebp
 113
 114 L(simple):
 115         C eax   scratch
 116         C ebx   carry
 117         C ecx   counter (negative)
 118         C edx   scratch
 119         C esi   src
 120         C edi   dst
 121         C ebp   multiplier
 122
 123         movl    (%esi,%ecx,4), %eax
 124
 125         mull    %ebp                    C edx:eax = limb * multiplier
 126
 127         addl    %ebx, %eax              C low limb + carry in, sets CF
 128         movl    %eax, (%edi,%ecx,4)
 129         movl    $0, %ebx                C movl keeps CF for the adcl
 130
 131         adcl    %edx, %ebx              C new carry = high limb + CF
 132         incl    %ecx
 133         jnz     L(simple)               C counter hits 0 after the last limb
 134
 135         movl    %ebx, %eax              C return value = final carry
 136         movl    SAVE_EBX, %ebx
 137         movl    SAVE_ESI, %esi
 138
 139         movl    SAVE_EDI, %edi
 140         movl    SAVE_EBP, %ebp
 141         addl    $STACK_SPACE, %esp
 142
 143         ret
 144
 145
 146 C -----------------------------------------------------------------------------
 147 C The mov to load the next source limb is done well ahead of the mul; this
 148 C is necessary for full speed.  It leads to one limb handled separately
 149 C after the loop.
 150 C
 151 C When unrolling to 32 or more, an offset of +4 is used on the src pointer,
 152 C to avoid having an 0x80 displacement in the code for the last limb in the
 153 C unrolled loop.  This is for a fair comparison between 16 and 32 unrolling.
 154
 155 ifelse(eval(UNROLL_COUNT >= 32),1,`
 156 deflit(SRC_OFFSET,4)
 157 ',`
 158 deflit(SRC_OFFSET,)
 159 ')
 160
 161         C this is offset 0x62, so close enough to aligned
 162 L(unroll):
 163         C eax
 164         C ebx   initial carry
 165         C ecx   size
 166         C edx
 167         C esi   src
 168         C edi   dst
 169         C ebp
     C
     C The code below works out n, the number of limbs to skip on the first
     C pass through the unrolled code, and jumps to L(entry) + 17*n.  The src
     C and dst pointers are moved back by n limbs so that the displacements
     C at the entry point address the first limbs.
     C NOTE(review): UNROLL_LOG2 and UNROLL_MASK are defined outside this file,
     C presumably as log2(UNROLL_COUNT) and UNROLL_COUNT-1 - confirm.
 170 deflit(`FRAME', STACK_SPACE)
 171
 172         leal    -1(%ecx), %edx  C one limb handled at end
 173         leal    -2(%ecx), %ecx  C and ecx is one less than edx
 174         movl    %ebp, SAVE_EBP  C NOTE(review): ebp was already saved before the jae - looks redundant, possibly kept for code layout; confirm before removing
 175
 176         negl    %edx
 177         shrl    $UNROLL_LOG2, %ecx      C unrolled loop counter
 178         movl    (%esi), %eax            C src low limb
 179
 180         andl    $UNROLL_MASK, %edx      C edx = n, limbs skipped on the first pass
 181         movl    PARAM_DST, %edi         C NOTE(review): edi already holds dst here - same remark as for SAVE_EBP above
 182
 183         movl    %edx, %ebp              C ebp = n
 184         shll    $4, %edx                C edx = 16*n, ebp is added below to give 17*n
 185
 186         C 17 code bytes per limb
 187 ifdef(`PIC',`
 188         call    L(add_eip_to_edx)
 189 L(here):
 190 ',`
 191         leal    L(entry) (%edx,%ebp), %edx
 192 ')
 193         negl    %ebp                    C ebp = -n, pulls both pointers back by n limbs
 194
 195         leal    ifelse(UNROLL_BYTES,256,128+) SRC_OFFSET(%esi,%ebp,4), %esi
 196         leal    ifelse(UNROLL_BYTES,256,128) (%edi,%ebp,4), %edi
 197         movl    PARAM_MULTIPLIER, %ebp
 198
 199         jmp     *%edx                   C enter the unrolled code part way through
 200
 201
 202 ifdef(`PIC',`
 203 L(add_eip_to_edx):
 204         C See README.family about old gas bugs
 205         leal    (%edx,%ebp), %edx       C edx = 17*n
 206         addl    $L(entry)-L(here), %edx C plus the distance from L(here) to L(entry)
 207         addl    (%esp), %edx            C plus the return address which is L(here)
 208         ret
 209 ')
 210
 211
 212 C ----------------------------------------------------------------------------
 213         ALIGN(32)
 214 L(top):
 215         C eax   next src limb
 216         C ebx   carry
 217         C ecx   counter
 218         C edx   scratch
 219         C esi   src+4
 220         C edi   dst
 221         C ebp   multiplier
 222         C
 223         C 17 code bytes per limb processed
     C
     C The forloop emits UNROLL_COUNT copies of the per-limb code.  Each copy
     C multiplies the limb already in eax, loads the next source limb into eax,
     C stores the low result limb and forms the next carry in ebx.  The computed
     C jump above relies on every copy being exactly 17 bytes.
     C NOTE(review): Zdisp is defined outside this file - presumably it forces
     C a displacement byte when the displacement is zero so that the first copy
     C is the same size as the others - confirm.
 224
 225 L(entry):
 226 forloop(i, 0, UNROLL_COUNT-1, `
 227         deflit(`disp_dst', eval(i*4 ifelse(UNROLL_BYTES,256,-128)))
 228         deflit(`disp_src', eval(disp_dst + 4-(SRC_OFFSET-0)))
 229
 230         mull    %ebp
 231
 232         addl    %eax, %ebx
 233 Zdisp(  movl,   disp_src,(%esi), %eax)
 234 Zdisp(  movl,   %ebx, disp_dst,(%edi))
 235
 236         movl    $0, %ebx
 237         adcl    %edx, %ebx
 238 ')
 239
 240         decl    %ecx
 241
 242         leal    UNROLL_BYTES(%esi), %esi        C leal leaves the flags from decl intact
 243         leal    UNROLL_BYTES(%edi), %edi
 244         jns     L(top)                          C loop until the counter goes negative
 245
 246
 247 deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
 248
 249         mull    %ebp                    C the one limb handled after the loop
 250
 251         addl    %eax, %ebx
 252         movl    $0, %eax                C movl keeps CF for the adcl below
 253         movl    SAVE_ESI, %esi
 254
 255         movl    %ebx, disp0(%edi)       C store the top result limb
 256         movl    SAVE_EBX, %ebx
 257         movl    SAVE_EDI, %edi
 258
 259         adcl    %edx, %eax              C return value = high limb + CF
 260         movl    SAVE_EBP, %ebp
 261         addl    $STACK_SPACE, %esp
 262
 263         ret
 264
 265 EPILOGUE()