1 dnl AMD K6-2 mpn_copyd -- copy limb vector, decrementing.
3 dnl K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data
7 dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
9 dnl This file is part of the GNU MP Library.
11 dnl The GNU MP Library is free software; you can redistribute it and/or
12 dnl modify it under the terms of the GNU Lesser General Public License as
13 dnl published by the Free Software Foundation; either version 2.1 of the
14 dnl License, or (at your option) any later version.
16 dnl The GNU MP Library is distributed in the hope that it will be useful,
17 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
18 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 dnl Lesser General Public License for more details.
21 dnl You should have received a copy of the GNU Lesser General Public
22 dnl License along with the GNU MP Library; see the file COPYING.LIB. If
23 dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
24 dnl Suite 330, Boston, MA 02111-1307, USA.
27 include(`../config.m4')
31 dnl UNROLL_COUNT cycles/limb
36 dnl Maximum possible with the current code is 64, the minimum is 2.
38 deflit(UNROLL_COUNT, 32)
41 C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
43 C Copy src,size to dst,size, processing limbs from high to low addresses.
45 C The comments in copyi.asm apply here too.
48 defframe(PARAM_SIZE,12)
49 defframe(PARAM_SRC, 8)
50 defframe(PARAM_DST, 4)
66 cmpl $UNROLL_COUNT, %ecx
68 leal -4(%esi,%ecx,4), %esi
70 leal -4(%edi,%ecx,4), %edi
86 C if src and dst are different alignments mod8, then use rep movs
87 C if src and dst are both 4mod8 then process one limb to get 0mod8
90 leal (%esi,%edi), %ebx
98 leal -UNROLL_COUNT(%ecx), %ecx
99 jnz L(already_aligned)
107 ifelse(UNROLL_BYTES,256,`
112 C offset 0x3D here, but gets full speed without further alignment
C esi src, decrementing
C edi dst, decrementing
122 C `disp' is never 0, so don't need to force 0(%esi).
124 deflit(CHUNK_COUNT, 2)
125 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
126 deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))
127 movq disp(%esi), %mm0
128 movq %mm0, disp(%edi)
131 leal -UNROLL_BYTES(%esi), %esi
132 subl $UNROLL_COUNT, %ecx
134 leal -UNROLL_BYTES(%edi), %edi
C now %ecx is -UNROLL_COUNT to -1 representing respectively 0 to
C UNROLL_COUNT-1 limbs remaining
141 testb $eval(UNROLL_COUNT/2), %cl
143 leal UNROLL_COUNT(%ecx), %ecx
147 C at an unroll count of 32 this block of code is 16 cycles faster than
148 C the rep movs, less 3 or 4 to test whether to do it
150 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, `
151 deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))
152 movq disp(%esi), %mm0
153 movq %mm0, disp(%edi)
156 subl $eval(UNROLL_BYTES/2), %esi
157 subl $eval(UNROLL_BYTES/2), %edi
159 subl $eval(UNROLL_COUNT/2), %ecx
163 ifelse(UNROLL_BYTES,256,`