rts/gmp/mpn/x86/k6/k62mmx/copyi.asm

   1 dnl  AMD K6-2 mpn_copyi -- copy limb vector, incrementing.
   2 dnl
   3 dnl  K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data
   4 dnl  alignment.
   5
   6
   7 dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
   8 dnl
   9 dnl  This file is part of the GNU MP Library.
  10 dnl
  11 dnl  The GNU MP Library is free software; you can redistribute it and/or
  12 dnl  modify it under the terms of the GNU Lesser General Public License as
  13 dnl  published by the Free Software Foundation; either version 2.1 of the
  14 dnl  License, or (at your option) any later version.
  15 dnl
  16 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  17 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19 dnl  Lesser General Public License for more details.
  20 dnl
  21 dnl  You should have received a copy of the GNU Lesser General Public
  22 dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
  23 dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
  24 dnl  Suite 330, Boston, MA 02111-1307, USA.
  25
  26
  27 include(`../config.m4')
  28
  29
  30 dnl  K6-2 aligned:
  31 dnl  UNROLL_COUNT cycles/limb
  32 dnl        8          0.75
  33 dnl       16          0.625
  34 dnl       32          0.5625
  35 dnl       64          0.53
  36 dnl  Maximum possible with the current code is 64, the minimum is 2.
  37
  38 deflit(UNROLL_COUNT, 32)
  39
  40
  41 C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
  42 C
  43 C The MMX loop is faster than a rep movs when src and dst are both 0mod8.
  44 C With one 0mod8 and one 4mod8 it's 1.056 c/l and the rep movs at 1.0 c/l is
  45 C used instead.
  46 C
  47 C         mod8
  48 C       src  dst
  49 C        0    0    both aligned, use mmx
  50 C        0    4    unaligned, use rep movs
  51 C        4    0    unaligned, use rep movs
  52 C        4    4    do one movs, then both aligned, use mmx
  53 C
  54 C The MMX code on aligned data is 0.5 c/l, plus loop overhead of 2
  55 C cycles/loop, which is 0.0625 c/l at 32 limbs/loop.
  56 C
  57 C A pattern of two movq loads and two movq stores (or four and four) was
  58 C tried, but found to be the same speed as just one of each.
  59 C
  60 C Note that this code only suits K6-2 and K6-3.  Plain K6 does only one mmx
  61 C instruction per cycle, so "movq"s are no faster than the simple 1 c/l rep
  62 C movs.
  63 C
  64 C Enhancement:
  65 C
  66 C Addressing modes like disp(%esi,%ecx,4) aren't currently used.  They'd
  67 C make it possible to avoid incrementing %esi and %edi in the loop and hence
  68 C get loop overhead down to 1 cycle.  Care would be needed to avoid bad
  69 C cache line crossings since the "movq"s would then be 5 code bytes rather
  70 C than 4.
  71
  72
  73 defframe(PARAM_SIZE,12)
  74 defframe(PARAM_SRC, 8)
  75 defframe(PARAM_DST, 4)
  76 deflit(`FRAME',0)
  77
  78         .text
  79         ALIGN(32)
  80
  81 PROLOGUE(mpn_copyi)
  82         movl    PARAM_SIZE, %ecx
  83         movl    %esi, %eax
  84
  85         movl    PARAM_SRC, %esi
  86         movl    %edi, %edx
  87
  88         cld
  89
  90         movl    PARAM_DST, %edi
  91         cmpl    $UNROLL_COUNT, %ecx
  92
  93         ja      L(unroll)
  94
  95 L(simple):
  96         rep
  97         movsl
  98
  99         movl    %eax, %esi
 100         movl    %edx, %edi
 101
 102         ret
 103
 104
 105 L(unroll):
 106         C if src and dst are different alignments mod8, then use rep movs
 107         C if src and dst are both 4mod8 then process one limb to get 0mod8
 108
 109         pushl   %ebx
 110         leal    (%esi,%edi), %ebx
 111
 112         testb   $4, %bl
 113         popl    %ebx
 114
 115         jnz     L(simple)
 116         testl   $4, %esi
 117
 118         leal    -UNROLL_COUNT(%ecx), %ecx
 119         jz      L(already_aligned)
 120
 121         decl    %ecx
 122
 123         movsl
 124 L(already_aligned):
 125
 126
 127 ifelse(UNROLL_BYTES,256,`
 128         addl    $128, %esi
 129         addl    $128, %edi
 130 ')
 131
 132         C this is offset 0x34, no alignment needed
 133 L(top):
 134         C eax   saved esi
 135         C ebx
 136         C ecx   counter, limbs
 137         C edx   saved edi
 138         C esi   src, incrementing
 139         C edi   dst, incrementing
 140         C ebp
 141         C
 142         C Zdisp gets 0(%esi) left that way to avoid vector decode, and with
 143         C 0(%edi) keeps code aligned to 16 byte boundaries.
 144
 145 deflit(CHUNK_COUNT, 2)
 146 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
 147         deflit(`disp', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
 148 Zdisp(  movq,   disp,(%esi), %mm0)
 149 Zdisp(  movq,   %mm0, disp,(%edi))
 150 ')
 151
 152         addl    $UNROLL_BYTES, %esi
 153         subl    $UNROLL_COUNT, %ecx
 154
 155         leal    UNROLL_BYTES(%edi), %edi
 156         jns     L(top)
 157
 158
 159         C now %ecx is -UNROLL_COUNT to -1 representing repectively 0 to
 160         C UNROLL_COUNT-1 limbs remaining
 161
 162         testb   $eval(UNROLL_COUNT/2), %cl
 163
 164         leal    UNROLL_COUNT(%ecx), %ecx
 165         jz      L(not_half)
 166
 167         C at an unroll count of 32 this block of code is 16 cycles faster than
 168         C the rep movs, less 3 or 4 to test whether to do it
 169
 170 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, `
 171         deflit(`disp', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
 172         movq    disp(%esi), %mm0
 173         movq    %mm0, disp(%edi)
 174 ')
 175         addl    $eval(UNROLL_BYTES/2), %esi
 176         addl    $eval(UNROLL_BYTES/2), %edi
 177
 178         subl    $eval(UNROLL_COUNT/2), %ecx
 179 L(not_half):
 180
 181
 182 ifelse(UNROLL_BYTES,256,`
 183         subl    $128, %esi
 184         subl    $128, %edi
 185 ')
 186
 187         rep
 188         movsl
 189
 190         movl    %eax, %esi
 191         movl    %edx, %edi
 192
 193         femms
 194         ret
 195
 196 EPILOGUE()