rts/gmp/mpn/x86/k7/aors_n.asm

   1 dnl  AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.
   2 dnl
   3 dnl  K7: 1.64 cycles/limb (at 16 limb/loop).
   4
   5
   6 dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
   7 dnl
   8 dnl  This file is part of the GNU MP Library.
   9 dnl
  10 dnl  The GNU MP Library is free software; you can redistribute it and/or
  11 dnl  modify it under the terms of the GNU Lesser General Public License as
  12 dnl  published by the Free Software Foundation; either version 2.1 of the
  13 dnl  License, or (at your option) any later version.
  14 dnl
  15 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  16 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18 dnl  Lesser General Public License for more details.
  19 dnl
  20 dnl  You should have received a copy of the GNU Lesser General Public
  21 dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
  22 dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
  23 dnl  Suite 330, Boston, MA 02111-1307, USA.
  24
  25
  26 include(`../config.m4')
  27
  28
  29 dnl  K7: UNROLL_COUNT cycles/limb
  30 dnl           8           1.9
  31 dnl          16           1.64
  32 dnl          32           1.7
  33 dnl          64           2.0
  34 dnl  Maximum possible with the current code is 64.
  35 dnl  16 is the fastest setting measured above, and is the one selected.
  36 deflit(UNROLL_COUNT, 16)
  37 dnl  OPERATION_add_n selects adcl and the mpn_add_n/mpn_add_nc names, OPERATION_sub_n
  38 dnl  selects sbbl and mpn_sub_n/mpn_sub_nc; defining neither ends in m4_error.
  39 ifdef(`OPERATION_add_n', `
  40         define(M4_inst,        adcl)
  41         define(M4_function_n,  mpn_add_n)
  42         define(M4_function_nc, mpn_add_nc)
  43         define(M4_description, add)
  44 ',`ifdef(`OPERATION_sub_n', `
  45         define(M4_inst,        sbbl)
  46         define(M4_function_n,  mpn_sub_n)
  47         define(M4_function_nc, mpn_sub_nc)
  48         define(M4_description, subtract)
  49 ',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
  50 ')')')
  51 dnl  The four function names this source can provide.  NOTE(review): confirm how MULFUNC_PROLOGUE consumes the list.
  52 MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
  53
  54
  55 C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
  56 C                         mp_size_t size);
  57 C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
  58 C                          mp_size_t size, mp_limb_t carry);
  59 C
  60 C Calculate src1,size M4_description src2,size, and store the result in
  61 C dst,size.  The return value is the carry bit from the top of the result (1
  62 C or 0).
  63 C
  64 C The _nc version accepts 1 or 0 for an initial carry into the low limb of
  65 C the calculation.  Note values other than 1 or 0 here will lead to garbage
  66 C results.
  67 C
  68 C This code runs at 1.64 cycles/limb, which is probably the best possible
  69 C with plain integer operations.  Each limb is 2 loads and 1 store, and in
  70 C one cycle the K7 can do two loads, or a load and a store, leading to 1.5
  71 C c/l.
  72 dnl  Sizes of UNROLL_THRESHOLD limbs or more use the unrolled loop.  The value
  73 dnl  must be >= 2, since the unrolled loop can't handle 1.  PIC and non-PIC agree.
  74 ifdef(`PIC',`
  75 deflit(UNROLL_THRESHOLD, 8)
  76 ',`
  77 deflit(UNROLL_THRESHOLD, 8)
  78 ')
  79 dnl  Frame offsets: incoming arguments first, then the register save slots.
  80 defframe(PARAM_CARRY,20)
  81 defframe(PARAM_SIZE, 16)
  82 defframe(PARAM_SRC2, 12)
  83 defframe(PARAM_SRC1, 8)
  84 defframe(PARAM_DST,  4)
  85 dnl  Four 4-byte save slots, hence STACK_SPACE of 16.
  86 defframe(SAVE_EBP, -4)
  87 defframe(SAVE_ESI, -8)
  88 defframe(SAVE_EBX, -12)
  89 defframe(SAVE_EDI, -16)
  90 deflit(STACK_SPACE, 16)
  91 dnl  Code goes in .text; FRAME is 0 here and is set to STACK_SPACE once esp is lowered.
  92         .text
  93         ALIGN(32)
  94 deflit(`FRAME',0)
  95 C Carry-in entry: put the caller supplied carry in eax and join the common code.
  96 PROLOGUE(M4_function_nc)
  97         movl    PARAM_CARRY, %eax       C initial carry, expected to be 1 or 0
  98         jmp     LF(M4_function_n,start) C presumably L(start) of M4_function_n
  99 EPILOGUE()
 100
 101 PROLOGUE(M4_function_n)
 102         C Plain entry point: no carry-in, so eax starts at zero.
 103         xorl    %eax, %eax      C carry
 104 L(start):
 105         movl    PARAM_SIZE, %ecx        C ecx = size in limbs
 106         subl    $STACK_SPACE, %esp      C room for the register save slots
 107 deflit(`FRAME',STACK_SPACE)
 108         C Save registers and fetch the sources while choosing a code path.
 109         movl    %edi, SAVE_EDI
 110         movl    %ebx, SAVE_EBX
 111         cmpl    $UNROLL_THRESHOLD, %ecx C flags are consumed by the jae below
 112
 113         movl    PARAM_SRC2, %edx
 114         movl    PARAM_SRC1, %ebx
 115         jae     L(unroll)               C size >= UNROLL_THRESHOLD
 116         C Small sizes: address each array from its end with a negative index.
 117         movl    PARAM_DST, %edi
 118         leal    (%ebx,%ecx,4), %ebx
 119         leal    (%edx,%ecx,4), %edx
 120
 121         leal    (%edi,%ecx,4), %edi
 122         negl    %ecx                    C ecx = -size, counts up to zero
 123         shrl    %eax                    C carry-in bit 0 goes to the carry flag
 124
 125         C This loop is in a single 16 byte code block already, so no
 126         C alignment necessary.
 127 L(simple):
 128         C eax   scratch
 129         C ebx   src1
 130         C ecx   counter
 131         C edx   src2
 132         C esi
 133         C edi   dst
 134         C ebp
 135
 136         movl    (%ebx,%ecx,4), %eax
 137         M4_inst (%edx,%ecx,4), %eax
 138         movl    %eax, (%edi,%ecx,4)
 139         incl    %ecx                    C incl leaves the carry flag intact
 140         jnz     L(simple)
 141         C movl rather than xorl clears eax, so the carry flag survives for setc.
 142         movl    $0, %eax
 143         movl    SAVE_EDI, %edi
 144
 145         movl    SAVE_EBX, %ebx
 146         setc    %al                     C return value = final carry
 147         addl    $STACK_SPACE, %esp
 148
 149         ret
 150
 151
 152 C -----------------------------------------------------------------------------
 153         C This is at 0x55, close enough to aligned.
 154 L(unroll):
 155 deflit(`FRAME',STACK_SPACE)
 156         movl    %ebp, SAVE_EBP
 157         andl    $-2, %ecx               C size low bit masked out
 158         andl    $1, PARAM_SIZE          C size low bit kept
 159         C edi becomes the even limb count, ecx the loop pass counter.
 160         movl    %ecx, %edi
 161         decl    %ecx
 162         movl    PARAM_DST, %ebp
 163
 164         shrl    $UNROLL_LOG2, %ecx      C ecx = loop passes minus one
 165         negl    %edi
 166         movl    %esi, SAVE_ESI
 167
 168         andl    $UNROLL_MASK, %edi      C limbs of the first pass to skip
 169         C esi = L(entry) + 9*edi, the point at which to enter the unrolled code.
 170 ifdef(`PIC',`
 171         call    L(pic_calc)
 172 L(here):
 173 ',`
 174         leal    L(entry) (%edi,%edi,8), %esi    C 9 bytes per
 175 ')
 176         negl    %edi                    C edi = -skip, used to bias the pointers
 177         shrl    %eax                    C carry-in bit 0 goes to the carry flag
 178         C Bias pointers back by the skipped limbs; +128 when UNROLL_BYTES is 256.
 179         leal    ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
 180         leal    ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
 181         leal    ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi
 182         C Enter the unrolled code part way through.
 183         jmp     *%esi
 184
 185         C PIC form of the entry address: 9*edi + L(entry), via the return address.
 186 ifdef(`PIC',`
 187 L(pic_calc):
 188         C See README.family about old gas bugs
 189         leal    (%edi,%edi,8), %esi
 190         addl    $L(entry)-L(here), %esi
 191         addl    (%esp), %esi
 192         ret
 193 ')
 194
 195
 196 C -----------------------------------------------------------------------------
 197         ALIGN(32)
 198 L(top):
 199         C eax   zero
 200         C ebx   src1
 201         C ecx   counter
 202         C edx   src2
 203         C esi   scratch (was computed jump)
 204         C edi   dst
 205         C ebp   scratch
 206         C src2 advances here at the top, so the computed entry skips this step.
 207         leal    UNROLL_BYTES(%edx), %edx
 208         C The entry address computation assumes 9 bytes of code per limb below.
 209 L(entry):
 210 deflit(CHUNK_COUNT, 2)
 211 forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
 212         deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
 213         deflit(`disp1', eval(disp0 + 4))
 214
 215 Zdisp(  movl,   disp0,(%ebx), %esi)
 216         movl    disp1(%ebx), %ebp
 217 Zdisp(  M4_inst,disp0,(%edx), %esi)
 218 Zdisp(  movl,   %esi, disp0,(%edi))
 219         M4_inst disp1(%edx), %ebp
 220         movl    %ebp, disp1(%edi)
 221 ')
 222         C decl and leal do not touch the carry flag, so it chains into the next pass.
 223         decl    %ecx
 224         leal    UNROLL_BYTES(%ebx), %ebx
 225         leal    UNROLL_BYTES(%edi), %edi
 226         jns     L(top)                  C until the pass counter goes negative
 227         C Odd-size tail.  ebx/edx/edi still carry the +128 bias when UNROLL_BYTES
 228         C is 256, so it is removed here the same way the loop displacements do.
 229         movl    PARAM_SIZE, %esi        C size low bit, as stored at L(unroll)
 230         movl    SAVE_EBP, %ebp
 231         movl    $0, %eax                C not xorl, the carry flag must survive
 232
 233         decl    %esi                    C decl preserves the carry flag too
 234         js      L(even)                 C low bit was 0, no limb left over
 235
 236         movl    ifelse(UNROLL_BYTES,256,-128) (%ebx), %ecx
 237         M4_inst eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128))(%edx), %ecx
 238         movl    %ecx, ifelse(UNROLL_BYTES,256,-128) (%edi)
 239 L(even):
 240
 241         movl    SAVE_EDI, %edi
 242         movl    SAVE_EBX, %ebx
 243         setc    %al                     C return value = final carry
 244
 245         movl    SAVE_ESI, %esi
 246         addl    $STACK_SPACE, %esp
 247
 248         ret
 249
 250 EPILOGUE()