rts/gmp/mpn/x86/k6/k62mmx/rshift.asm

   1 dnl  AMD K6-2 mpn_rshift -- mpn right shift.
   2 dnl
   3 dnl  K6-2: 1.75 cycles/limb
   4
   5
   6 dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
   7 dnl
   8 dnl  This file is part of the GNU MP Library.
   9 dnl
  10 dnl  The GNU MP Library is free software; you can redistribute it and/or
  11 dnl  modify it under the terms of the GNU Lesser General Public License as
  12 dnl  published by the Free Software Foundation; either version 2.1 of the
  13 dnl  License, or (at your option) any later version.
  14 dnl
  15 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  16 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18 dnl  Lesser General Public License for more details.
  19 dnl
  20 dnl  You should have received a copy of the GNU Lesser General Public
  21 dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
  22 dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
  23 dnl  Suite 330, Boston, MA 02111-1307, USA.
  24
  25
  26 include(`../config.m4')
  27
  28
  29 C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
  30 C                       unsigned shift);
  31 C Stores the size limbs at src, shifted right by shift bits, at dst.
  32 C Returns src[0] << (32-shift).  Assumes 1 <= shift <= 31 - TODO confirm.
  33 defframe(PARAM_SHIFT,16)
  34 defframe(PARAM_SIZE, 12)
  35 defframe(PARAM_SRC,  8)
  36 defframe(PARAM_DST,  4)
  37 deflit(`FRAME',0)
  38
  39 dnl  Minimum 9, because the unrolled loop can't handle less.
  40 dnl  L(two_or_more) sends sizes of at least this many limbs to L(unroll).
  41 deflit(UNROLL_THRESHOLD, 9)
  42
  43         .text
  44         ALIGN(32)
  45
  46 PROLOGUE(mpn_rshift)
  47 deflit(`FRAME',0)
  48         C Entry: size 1 is done here with integer shifts, else L(two_or_more).
  49         C The 1 limb case can be done without the push %ebx, but it's then
  50         C still the same speed.  The push is left as a free helping hand for
  51         C the two_or_more code.
  52
  53         movl    PARAM_SIZE, %eax        C eax = size
  54         pushl   %ebx                    FRAME_pushl()
  55
  56         movl    PARAM_SRC, %ebx         C ebx = src
  57         decl    %eax                    C size-1, ZF set when size is 1
  58
  59         movl    PARAM_SHIFT, %ecx       C ecx = shift, decl flags still live
  60         jnz     L(two_or_more)          C taken unless size is 1
  61
  62         movl    (%ebx), %edx            C src limb
  63         movl    PARAM_DST, %ebx         C ebx = dst, src pointer is finished with
  64
  65         shrdl(  %cl, %edx, %eax)        C return value (eax is 0 before this)
  66
  67         shrl    %cl, %edx               C src limb >> shift
  68
  69         movl    %edx, (%ebx)            C dst limb
  70         popl    %ebx                    C undo the push at entry
  71
  72         ret
  73
  74
  75 C -----------------------------------------------------------------------------
  76         ALIGN(16)       C avoid offset 0x1f
  77 L(two_or_more):
  78         C eax   size-1
  79         C ebx   src
  80         C ecx   shift
  81         C edx
  82
  83         movl    (%ebx), %edx    C src low limb
  84         negl    %ecx                    C -shift
  85
  86         addl    $32, %ecx               C 32-shift
  87         movd    PARAM_SHIFT, %mm6       C mm6 = shift, the psrlq count
  88
  89         shll    %cl, %edx               C retval = src[0] << (32-shift)
  90         cmpl    $UNROLL_THRESHOLD-1, %eax
  91
  92         jae     L(unroll)               C size-1 >= threshold-1, unsigned
  93         C Smaller sizes fall through to the one limb per iteration loop.
  94
  95         C eax   size-1
  96         C ebx   src
  97         C ecx   32-shift
  98         C edx   retval
  99         C
 100         C mm6   shift
 101
 102         movl    PARAM_DST, %ecx         C ecx = dst, 32-shift is finished with
 103         leal    (%ebx,%eax,4), %ebx     C ebx = &src[size-1]
 104
 105         leal    -4(%ecx,%eax,4), %ecx   C ecx = &dst[size-2]
 106         negl    %eax                    C eax = -(size-1)
 107
 108         C This loop runs at about 3 cycles/limb, which is the amount of
 109         C decoding, and this is despite every second access being unaligned.
 110
 111 L(simple):
 112         C eax   counter, -(size-1) to -1
 113         C ebx   &src[size-1]
 114         C ecx   &dst[size-2]
 115         C edx   retval
 116         C
 117         C mm0   scratch
 118         C mm6   shift
 119
 120 Zdisp(  movq,   0,(%ebx,%eax,4), %mm0)  C src[i] low, src[i+1] high, i from 0
 121         incl    %eax                    C flags are used by the jnz below
 122
 123         psrlq   %mm6, %mm0              C low dword is now dst[i]
 124
 125 Zdisp(  movd,   %mm0, 0,(%ecx,%eax,4))  C store dst[i]
 126         jnz     L(simple)
 127
 128         C The last shifted qword is still in mm0; its high dword is dst[size-1].
 129         movq    %mm0, (%ecx)            C dst[size-2,size-1]
 130         movl    %edx, %eax              C retval
 131
 132         popl    %ebx                    C undo the push at entry
 133
 134         femms                           C done with the MMX registers
 135         ret
 136
 137
 138 C -----------------------------------------------------------------------------
 139         ALIGN(16)
 140 L(unroll):
 141         C eax   size-1
 142         C ebx   src
 143         C ecx   32-shift
 144         C edx   retval
 145         C
 146         C mm6   shift
 147
 148         addl    $32, %ecx               C 64-shift
 149         subl    $7, %eax                C size-8
 150
 151         movd    %ecx, %mm7              C mm7 = 64-shift, the psllq count
 152         movl    PARAM_DST, %ecx         C ecx = dst
 153
 154         movq    (%ebx), %mm2            C src low qword
 155         leal    (%ebx,%eax,4), %ebx     C src end - 32
 156
 157         testb   $4, %cl                 C dst bit 2, flags used by the jz below
 158         leal    (%ecx,%eax,4), %ecx     C dst end - 32
 159
 160         notl    %eax                    C -(size-7)
 161         jz      L(dst_aligned)          C dst bit 2 clear
 162
 163         psrlq   %mm6, %mm2              C low dword is now dst[0]
 164         incl    %eax                    C one limb handled: -(size-8)
 165
 166 Zdisp(  movd,   %mm2, 0,(%ecx,%eax,4))  C dst low limb
 167         movq    4(%ebx,%eax,4), %mm2    C new src low qword
 168 L(dst_aligned):
 169
 170         movq    12(%ebx,%eax,4), %mm0   C src second lowest qword
 171         nop     C avoid bad cache line crossing
 172
 173
 174         C This loop is the important bit, the rest is just support for it.
 175         C Four src limbs are held at the start, and four more will be read.
 176         C Four dst limbs will be written.  This schedule seems necessary for
 177         C full speed.
 178         C
 179         C The use of -(size-7) lets the loop stop when %eax becomes >= 0 and
 180         C leaves 0 to 3 which can be tested with test $1 and $2.
 181
 182 L(top):
 183         C eax   counter, -(size-7) step by +4 until >=0
 184         C ebx   src end - 32
 185         C ecx   dst end - 32
 186         C edx   retval
 187         C
 188         C mm0   src next qword
 189         C mm1   scratch
 190         C mm2   src prev qword
 191         C mm6   shift
 192         C mm7   64-shift
 193
 194         psrlq   %mm6, %mm2              C prev >> shift
 195         addl    $4, %eax                C flags are used by the ja below
 196
 197         movq    %mm0, %mm1              C copy of next
 198         psllq   %mm7, %mm0              C next << (64-shift)
 199
 200         por     %mm0, %mm2              C first result qword
 201         movq    4(%ebx,%eax,4), %mm0    C following src qword
 202
 203         psrlq   %mm6, %mm1              C next >> shift
 204         movq    %mm2, -12(%ecx,%eax,4)  C store first result qword
 205
 206         movq    %mm0, %mm2              C becomes prev for the next pass
 207         psllq   %mm7, %mm0              C following << (64-shift)
 208
 209         por     %mm0, %mm1              C second result qword
 210         movq    12(%ebx,%eax,4), %mm0   C becomes next for the next pass
 211
 212         movq    %mm1, -4(%ecx,%eax,4)   C store second result qword
 213         ja      L(top)          C jump if no carry and not zero
 214
 215
 216
 217         C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0
 218         C to 3 representing respectively 3 to 0 further limbs.
 219
 220         testl   $2, %eax        C testl to avoid bad cache line crossings
 221         jnz     L(finish_nottwo)        C eax is 2 or 3
 222
 223         C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0
 224         C becomes new mm2 and a new mm0 is loaded.
 225
 226         psrlq   %mm6, %mm2
 227         movq    %mm0, %mm1
 228
 229         psllq   %mm7, %mm0
 230         addl    $2, %eax                C eax becomes 2 or 3
 231
 232         por     %mm0, %mm2
 233         movq    12(%ebx,%eax,4), %mm0
 234
 235         movq    %mm2, -4(%ecx,%eax,4)
 236         movq    %mm1, %mm2
 237 L(finish_nottwo):
 238
 239
 240         testb   $1, %al                 C flags are used by the jnz below
 241         psrlq   %mm6, %mm2
 242
 243         movq    %mm0, %mm1
 244         psllq   %mm7, %mm0
 245
 246         por     %mm0, %mm2
 247         psrlq   %mm6, %mm1
 248
 249         movq    %mm2, 4(%ecx,%eax,4)
 250         jnz     L(finish_even)          C eax is 3
 251
 252
 253         C one further extra limb to process
 254
 255         movd    32-4(%ebx), %mm0        C src[size-1], most significant limb
 256         popl    %ebx                    C ebx is finished with
 257
 258         movq    %mm0, %mm2
 259         psllq   %mm7, %mm0
 260
 261         por     %mm0, %mm1
 262         psrlq   %mm6, %mm2
 263
 264         movq    %mm1, 32-12(%ecx)       C dst[size-3,size-2]
 265         movd    %mm2, 32-4(%ecx)        C dst[size-1]
 266
 267         movl    %edx, %eax              C retval
 268
 269         femms
 270         ret
 271
 272
 273         nop     C avoid bad cache line crossing
 274 L(finish_even):
 275         C no further extra limbs
 276
 277         movq    %mm1, 32-8(%ecx)        C dst[size-2,size-1]
 278         movl    %edx, %eax              C retval
 279
 280         popl    %ebx
 281
 282         femms
 283         ret
 284
 285 EPILOGUE()