rts/gmp/mpn/x86/k6/mul_1.asm

   1 dnl  AMD K6 mpn_mul_1 -- mpn by limb multiply.
   2 dnl
   3 dnl  K6: 6.25 cycles/limb.
   4
   5
   6 dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
   7 dnl
   8 dnl  This file is part of the GNU MP Library.
   9 dnl
  10 dnl  The GNU MP Library is free software; you can redistribute it and/or
  11 dnl  modify it under the terms of the GNU Lesser General Public License as
  12 dnl  published by the Free Software Foundation; either version 2.1 of the
  13 dnl  License, or (at your option) any later version.
  14 dnl
  15 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  16 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18 dnl  Lesser General Public License for more details.
  19 dnl
  20 dnl  You should have received a copy of the GNU Lesser General Public
  21 dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
  22 dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
  23 dnl  Suite 330, Boston, MA 02111-1307, USA.
  24
  25
  26 include(`../config.m4')
  27
  28
  29 C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
  30 C                      mp_limb_t multiplier);
  31 C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
  32 C                       mp_limb_t multiplier, mp_limb_t carry);
  33 C
  34 C Multiply src,size by mult and store the result in dst,size.
  35 C Return the carry limb from the top of the result.
  36 C
  37 C mpn_mul_1c() accepts an initial carry for the calculation; it is added into
  38 C the low limb of the result.
  39
  40 defframe(PARAM_CARRY,     20)
  41 defframe(PARAM_MULTIPLIER,16)
  42 defframe(PARAM_SIZE,      12)
  43 defframe(PARAM_SRC,       8)
  44 defframe(PARAM_DST,       4)
  45 dnl  The offsets above rise by 4 in C argument order (dst, src, size, multiplier, carry); PARAM_CARRY is read only by mpn_mul_1c.
  46 dnl  minimum 5 because the unrolled code can't handle less
  47 deflit(UNROLL_THRESHOLD, 5)     C unrolled path always does one 4-limb pass plus a tail of 1 to 4 limbs
  48
  49         .text
  50         ALIGN(32)
  51
  52 PROLOGUE(mpn_mul_1c)
  53         pushl   %esi                    C saved here, popped by the exit paths inside mpn_mul_1
  54 deflit(`FRAME',4)
  55         movl    PARAM_CARRY, %esi       C esi = initial carry supplied by the caller
  56         jmp     LF(mpn_mul_1,start_nc)  C join mpn_mul_1 just past its carry clearing
  57 EPILOGUE()
  58
  59
  60 PROLOGUE(mpn_mul_1)
  61         push    %esi                    C esi is used as the running carry limb
  62 deflit(`FRAME',4)
  63         xorl    %esi, %esi      C initial carry
  64         C mpn_mul_1c jumps in at start_nc with esi already pushed and holding its carry
  65 L(start_nc):
  66         mov     PARAM_SIZE, %ecx        C ecx = size in limbs
  67         push    %ebx
  68 FRAME_pushl()
  69         C NOTE(review): FRAME_pushl presumably updates FRAME so PARAM_ offsets stay valid after a push -- confirm in the m4 defs
  70         movl    PARAM_SRC, %ebx
  71         push    %edi
  72 FRAME_pushl()
  73
  74         movl    PARAM_DST, %edi
  75         pushl   %ebp
  76 FRAME_pushl()
  77
  78         cmpl    $UNROLL_THRESHOLD, %ecx
  79         movl    PARAM_MULTIPLIER, %ebp  C movl leaves the cmpl flags intact for the jae
  80
  81         jae     L(unroll)               C unsigned test: size >= UNROLL_THRESHOLD
  82
  83         C sizes below the threshold fall through to the one-limb-per-pass loop
  84         C code offset 0x22 here, close enough to aligned
  85 L(simple):
  86         C eax   scratch
  87         C ebx   src
  88         C ecx   counter
  89         C edx   scratch
  90         C esi   carry
  91         C edi   dst
  92         C ebp   multiplier
  93         C
  94         C this loop 8 cycles/limb
  95         C each pass: dst limb = low half of (src limb * ebp + esi), esi = high half
  96         movl    (%ebx), %eax
  97         addl    $4, %ebx
  98
  99         mull    %ebp                    C edx:eax = eax * ebp
 100
 101         addl    %esi, %eax
 102         movl    $0, %esi                C movl rather than xorl so CF from the addl survives
 103
 104         adcl    %edx, %esi              C next carry = high limb + CF
 105
 106         movl    %eax, (%edi)
 107         addl    $4, %edi
 108
 109         loop    L(simple)               C decrement ecx, repeat while it is nonzero
 110
 111         C restore the registers pushed on entry and return the final carry in eax
 112         popl    %ebp
 113
 114         popl    %edi
 115         popl    %ebx
 116
 117         movl    %esi, %eax
 118         popl    %esi
 119
 120         ret
 121
 122
 123 C -----------------------------------------------------------------------------
 124 C The code for each limb is 6 cycles, with instruction decoding being the
 125 C limiting factor.  At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25
 126 C cycles/limb in total.
 127 C
 128 C The secret ingredient to get 6.25 is to start the loop with the mul and
 129 C have the load/store pair at the end.  Rotating the load/store to the top
 130 C is an 0.5 c/l slowdown.  (Some address generation effect probably.)
 131 C
 132 C The whole unrolled loop fits nicely in exactly 80 bytes.
 133
 134
 135         ALIGN(16)       C already aligned to 16 here actually
 136 L(unroll):
 137         movl    (%ebx), %eax            C preload src[0] for the first mull
 138         leal    -16(%ebx,%ecx,4), %ebx  C ebx = &src[size-4]
 139
 140         leal    -16(%edi,%ecx,4), %edi  C edi = &dst[size-4]
 141         subl    $4, %ecx
 142
 143         negl    %ecx                    C ecx = 4-size, a negative limb index that counts up
 144
 145
 146         ALIGN(16)       C one byte nop for this alignment
 147 L(top):
 148         C eax   scratch
 149         C ebx   &src[size-4]
 150         C ecx   counter
 151         C edx   scratch
 152         C esi   carry
 153         C edi   &dst[size-4]
 154         C ebp   multiplier
 155         C four limbs per pass; each step stores its low product limb and loads the next src limb
 156         mull    %ebp
 157
 158         addl    %esi, %eax
 159         movl    $0, %esi                C movl keeps CF for the adcl below
 160
 161         adcl    %edx, %esi
 162
 163         movl    %eax, (%edi,%ecx,4)
 164         movl    4(%ebx,%ecx,4), %eax
 165
 166
 167         mull    %ebp
 168
 169         addl    %esi, %eax
 170         movl    $0, %esi
 171
 172         adcl    %edx, %esi
 173
 174         movl    %eax, 4(%edi,%ecx,4)
 175         movl    8(%ebx,%ecx,4), %eax
 176
 177
 178         mull    %ebp
 179
 180         addl    %esi, %eax
 181         movl    $0, %esi
 182
 183         adcl    %edx, %esi
 184
 185         movl    %eax, 8(%edi,%ecx,4)
 186         movl    12(%ebx,%ecx,4), %eax
 187
 188
 189         mull    %ebp
 190
 191         addl    %esi, %eax
 192         movl    $0, %esi
 193
 194         adcl    %edx, %esi
 195
 196         movl    %eax, 12(%edi,%ecx,4)
 197         movl    16(%ebx,%ecx,4), %eax   C first limb of the next pass, or of the tail below
 198
 199
 200         addl    $4, %ecx
 201         js      L(top)                  C loop while the index is still negative
 202
 203
 204
 205         C eax   next src limb
 206         C ebx   &src[size-4]
 207         C ecx   0 to 3 representing respectively 4 to 1 further limbs
 208         C edx
 209         C esi   carry
 210         C edi   &dst[size-4]
 211
 212         testb   $2, %cl
 213         jnz     L(finish_not_two)       C ecx is 2 or 3: skip the two-limb step
 214
 215         mull    %ebp
 216
 217         addl    %esi, %eax
 218         movl    $0, %esi
 219
 220         adcl    %edx, %esi
 221
 222         movl    %eax, (%edi,%ecx,4)
 223         movl    4(%ebx,%ecx,4), %eax
 224
 225
 226         mull    %ebp
 227
 228         addl    %esi, %eax
 229         movl    $0, %esi
 230
 231         adcl    %edx, %esi
 232
 233         movl    %eax, 4(%edi,%ecx,4)
 234         movl    8(%ebx,%ecx,4), %eax
 235
 236         addl    $2, %ecx                C ecx was 0 or 1, now 2 or 3
 237 L(finish_not_two):
 238
 239         C ecx is 2 or 3 here, so the remaining accesses use fixed offsets from ebx and edi
 240         testb   $1, %cl
 241         jnz     L(finish_not_one)       C ecx is 3: only the last limb remains
 242
 243         mull    %ebp
 244
 245         addl    %esi, %eax
 246         movl    $0, %esi
 247
 248         adcl    %edx, %esi
 249
 250         movl    %eax, 8(%edi)           C dst[size-2]
 251         movl    12(%ebx), %eax          C src[size-1]
 252 L(finish_not_one):
 253
 254         C last limb: the high half plus carry stays in edx and becomes the return value
 255         mull    %ebp
 256
 257         addl    %esi, %eax
 258         popl    %ebp                    C popl does not alter flags, so CF still reaches the adcl
 259
 260         adcl    $0, %edx
 261
 262         movl    %eax, 12(%edi)          C dst[size-1]
 263         popl    %edi
 264
 265         popl    %ebx
 266         movl    %edx, %eax              C return the carry limb
 267
 268         popl    %esi
 269
 270         ret
 271
 272 EPILOGUE()