rts/gmp/mpn/x86/pentium/mmx/rshift.asm

   1 dnl  Intel P5 mpn_rshift -- mpn right shift.
   2 dnl
   3 dnl  P5: 1.75 cycles/limb.
   4
   5
   6 dnl  Copyright (C) 2000 Free Software Foundation, Inc.
   7 dnl
   8 dnl  This file is part of the GNU MP Library.
   9 dnl
  10 dnl  The GNU MP Library is free software; you can redistribute it and/or
  11 dnl  modify it under the terms of the GNU Lesser General Public License as
  12 dnl  published by the Free Software Foundation; either version 2.1 of the
  13 dnl  License, or (at your option) any later version.
  14 dnl
  15 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  16 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18 dnl  Lesser General Public License for more details.
  19 dnl
  20 dnl  You should have received a copy of the GNU Lesser General Public
  21 dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
  22 dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
  23 dnl  Suite 330, Boston, MA 02111-1307, USA.
  24
  25
  26 include(`../config.m4')
  27
  28
  29 C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
  30 C                       unsigned shift);
  31 C
  32 C Shift src,size right by shift many bits and store the result in dst,size.
  33 C Zeros are shifted in at the left.  Return the bits shifted out at the
  34 C right.
  35 C
  36 C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb,
  37 C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l.
  38 C
  39 C Full speed depends on source and destination being aligned.  Unaligned mmx
  40 C loads and stores on P5 don't pair and have a 2 cycle penalty.  Some hairy
  41 C setups and finish-ups are done to ensure alignment for the loop.
  42 C
  43 C MMX shifts work out a bit faster even for the simple loop.
  44
  45 defframe(PARAM_SHIFT,16)        C 4th argument: shift count
  46 defframe(PARAM_SIZE, 12)        C 3rd argument: size in limbs
  47 defframe(PARAM_SRC,  8)         C 2nd argument: src pointer
  48 defframe(PARAM_DST,  4)         C 1st argument: dst pointer
  49 deflit(`FRAME',0)
  50 dnl  Sizes of UNROLL_THRESHOLD or more are sent to L(unroll) at entry.
  51 dnl  Minimum 5, because the unrolled loop can't handle less.
  52 deflit(UNROLL_THRESHOLD, 5)
  53
  54         .text
  55         ALIGN(8)
  56
  57 PROLOGUE(mpn_rshift)
  58         C Entry: fetch the arguments and pick a code path by size.
  59         pushl   %ebx                    C ebx and edi are restored before every ret
  60         pushl   %edi
  61 deflit(`FRAME',8)
  62
  63         movl    PARAM_SIZE, %eax
  64         movl    PARAM_DST, %edx
  65
  66         movl    PARAM_SRC, %ebx
  67         movl    PARAM_SHIFT, %ecx
  68
  69         cmp     $UNROLL_THRESHOLD, %eax
  70         jae     L(unroll)               C unsigned size >= UNROLL_THRESHOLD
  71         C NOTE(review): size==0 is not checked, presumably callers pass size >= 1.
  72         decl    %eax                    C size-1, zero flag set if size is 1
  73         movl    (%ebx), %edi            C src low limb
  74
  75         jnz     L(simple)               C size != 1, flags from decl (movl keeps them)
  76
  77         shrdl(  %cl, %edi, %eax)        C eax was decremented to zero, becomes the return value
  78
  79         shrl    %cl, %edi
  80
  81         movl    %edi, (%edx)            C dst low limb
  82         popl    %edi                    C risk of data cache bank clash
  83
  84         popl    %ebx
  85         C No MMX registers were touched on this path, so no emms.
  86         ret
  87
  88
  89 C -----------------------------------------------------------------------------
  90         ALIGN(8)
  91 L(simple):
  92         C eax   size-1 (nonzero, size==1 is handled at entry)
  93         C ebx   src
  94         C ecx   shift
  95         C edx   dst
  96         C esi
  97         C edi   src[0] (loaded at entry, not used here)
  98         C ebp
  99 deflit(`FRAME',8)
 100
 101         movd    (%ebx), %mm5            C src[0]
 102         leal    (%ebx,%eax,4), %ebx     C &src[size-1]
 103
 104         movd    %ecx, %mm6              C rshift
 105         leal    -4(%edx,%eax,4), %edx   C &dst[size-2]
 106
 107         psllq   $32, %mm5               C src[0] in the high dword
 108         negl    %eax                    C -(size-1), counts up to zero
 109
 110
 111 C This loop is 5 or 8 cycles, with every second load unaligned and a wasted
 112 C cycle waiting for the mm0 result to be ready.  For comparison a shrdl is 4
 113 C cycles and would be 8 in a simple loop.  Using mmx helps the return value
 114 C and last limb calculations too.
 115
 116 L(simple_top):
 117         C eax   counter, limbs, negative
 118         C ebx   &src[size-1]
 119         C ecx   shift (not used here, already copied to mm6)
 120         C edx   &dst[size-2]
 121         C
 122         C mm0   scratch
 123         C mm5   return value
 124         C mm6   shift
 125         C Each pass loads the qword src[i],src[i+1] and stores its shifted low dword as dst[i].
 126         movq    (%ebx,%eax,4), %mm0
 127         incl    %eax                    C sets the flags for the jnz below
 128
 129         psrlq   %mm6, %mm0
 130
 131         movd    %mm0, (%edx,%eax,4)
 132         jnz     L(simple_top)
 133
 134         C Top limb: zeros are shifted in from above, so a movd load is used.
 135         movd    (%ebx), %mm0
 136         psrlq   %mm6, %mm5              C return value
 137
 138         psrlq   %mm6, %mm0
 139         popl    %edi
 140
 141         movd    %mm5, %eax              C return the low dword of mm5
 142         popl    %ebx
 143
 144         movd    %mm0, 4(%edx)           C dst[size-1]
 145
 146         emms                            C MMX was used, so emms before returning
 147
 148         ret
 149
 150
 151 C -----------------------------------------------------------------------------
 152         ALIGN(8)
 153 L(unroll):
 154         C eax   size, at least UNROLL_THRESHOLD
 155         C ebx   src
 156         C ecx   shift
 157         C edx   dst
 158         C esi
 159         C edi   (set to 4 below, the address bit tested for 8-byte alignment)
 160         C ebp
 161 deflit(`FRAME',8)
 162
 163         movd    (%ebx), %mm5            C src[0]
 164         movl    $4, %edi                C alignment test mask
 165
 166         movd    %ecx, %mm6              C rshift
 167         testl   %edi, %ebx              C is src at 4 mod 8?
 168
 169         psllq   $32, %mm5               C src[0] in the high dword
 170         jz      L(start_src_aligned)    C flags from testl
 171
 172
 173         C src isn't aligned, process low limb separately (marked xxx) and
 174         C step src and dst by one limb, making src aligned.
 175         C
 176         C source                  ebx
 177         C --+-------+-------+-------+
 178         C           |          xxx  |
 179         C --+-------+-------+-------+
 180         C         4mod8   0mod8   4mod8
 181         C
 182         C         dest            edx
 183         C         --+-------+-------+
 184         C           |       |  xxx  |
 185         C         --+-------+-------+
 186
 187         movq    (%ebx), %mm0            C unaligned load
 188
 189         psrlq   %mm6, %mm0              C low dword is the xxx limb
 190         addl    $4, %ebx
 191
 192         decl    %eax                    C one limb done
 193
 194         movd    %mm0, (%edx)
 195         addl    $4, %edx
 196 L(start_src_aligned):
 197         C At least 4 limbs remain (UNROLL_THRESHOLD is 5 and at most one was
 198         C done above), so the qword loads at (%ebx) and 8(%ebx) are within src.
 199         movq    (%ebx), %mm1            C first aligned src qword
 200         testl   %edi, %edx              C is dst at 4 mod 8?
 201
 202         psrlq   %mm6, %mm5              C retval
 203         jz      L(start_dst_aligned)
 204
 205         C dst isn't aligned, add 4 to make it so, and pretend the shift is
 206         C 32 bits extra.  Low limb of dst (marked xxx) handled here
 207         C separately.
 208         C
 209         C          source          ebx
 210         C          --+-------+-------+
 211         C            |      mm1      |
 212         C          --+-------+-------+
 213         C                  4mod8   0mod8
 214         C
 215         C  dest                    edx
 216         C  --+-------+-------+-------+
 217         C                    |  xxx  |
 218         C  --+-------+-------+-------+
 219         C          4mod8   0mod8   4mod8
 220
 221         movq    %mm1, %mm0
 222         addl    $32, %ecx               C new shift
 223
 224         psrlq   %mm6, %mm0              C mm6 is still the original shift
 225
 226         movd    %ecx, %mm6              C shift+32 from here on
 227
 228         movd    %mm0, (%edx)            C the xxx limb
 229         addl    $4, %edx
 230 L(start_dst_aligned):
 231
 232
 233         movq    8(%ebx), %mm3           C second src qword
 234         negl    %ecx
 235
 236         movq    %mm3, %mm2              C mm2 src qword
 237         addl    $64, %ecx               C 64 minus the shift in mm6
 238
 239         movd    %ecx, %mm7              C lshift
 240         psrlq   %mm6, %mm1
 241
 242         leal    -12(%ebx,%eax,4), %ebx  C src + 4*size - 12
 243         leal    -20(%edx,%eax,4), %edx  C dst + 4*size - 20
 244
 245         psllq   %mm7, %mm3
 246         subl    $7, %eax                C size-7
 247
 248         por     %mm1, %mm3              C mm3 ready to store
 249         negl    %eax                    C -(size-7)
 250
 251         jns     L(finish)               C size <= 7, skip the loop
 252
 253
 254         C This loop is the important bit, the rest is just support.  Careful
 255         C instruction scheduling achieves the claimed 1.75 c/l.  The
 256         C relevant parts of the pairing rules are:
 257         C
 258         C - mmx loads and stores execute only in the U pipe
 259         C - only one mmx shift in a pair
 260         C - wait one cycle before storing an mmx register result
 261         C - the usual address generation interlock
 262         C
 263         C Two qword calculations are slightly interleaved.  The instructions
 264         C marked "C" belong to the second qword, and the "C prev" one is for
 265         C the second qword from the previous iteration.
 266
 267         ALIGN(8)
 268 L(unroll_loop):
 269         C eax   counter, limbs, negative
 270         C ebx   src + 4*size - 12 bytes, ie. &src[size-3]
 271         C ecx   64 minus the shift in mm6 (tested again in the finish code)
 272         C edx   dst + 4*size - 20 bytes, dst as aligned at L(start_dst_aligned)
 273         C esi
 274         C edi
 275         C
 276         C mm0
 277         C mm1
 278         C mm2   src qword from -8(%ebx,%eax,4)
 279         C mm3   dst qword ready to store to -8(%edx,%eax,4)
 280         C
 281         C mm5   return value
 282         C mm6   rshift
 283         C mm7   lshift
 284
 285         movq    (%ebx,%eax,4), %mm0
 286         psrlq   %mm6, %mm2
 287
 288         movq    %mm0, %mm1
 289         psllq   %mm7, %mm0
 290
 291         movq    %mm3, -8(%edx,%eax,4)   C prev
 292         por     %mm2, %mm0
 293
 294         movq    8(%ebx,%eax,4), %mm3    C
 295         psrlq   %mm6, %mm1              C
 296
 297         movq    %mm0, (%edx,%eax,4)
 298         movq    %mm3, %mm2              C
 299
 300         psllq   %mm7, %mm3              C
 301         addl    $4, %eax
 302
 303         por     %mm1, %mm3              C
 304         js      L(unroll_loop)
 305
 306
 307 L(finish):
 308         C eax   0 to 3 representing respectively 3 to 0 limbs remaining
 309
 310         testb   $2, %al                 C eax 2 or 3 means under two limbs remain
 311
 312         jnz     L(finish_no_two)
 313
 314         movq    (%ebx,%eax,4), %mm0
 315         psrlq   %mm6, %mm2
 316
 317         movq    %mm0, %mm1
 318         psllq   %mm7, %mm0
 319
 320         movq    %mm3, -8(%edx,%eax,4)   C prev
 321         por     %mm2, %mm0
 322
 323         movq    %mm1, %mm2              C new prev src qword
 324         movq    %mm0, %mm3              C new dst qword to store
 325
 326         addl    $2, %eax
 327 L(finish_no_two):
 328
 329
 330         C eax   2 or 3 representing respectively 1 or 0 limbs remaining
 331         C
 332         C mm2   src prev qword, from -8(%ebx,%eax,4)
 333         C mm3   dst qword, for -8(%edx,%eax,4)
 334
 335         testb   $1, %al                 C eax 3 means nothing remains
 336         popl    %edi
 337
 338         movd    %mm5, %eax      C retval
 339         jnz     L(finish_zero)          C flags from testb
 340
 341
 342         C One extra limb, destination was aligned.
 343         C
 344         C source                ebx
 345         C +-------+---------------+--
 346         C |       |      mm2      |
 347         C +-------+---------------+--
 348         C
 349         C dest                                  edx
 350         C +-------+---------------+---------------+--
 351         C |       |               |      mm3      |
 352         C +-------+---------------+---------------+--
 353         C
 354         C mm6 = shift
 355         C mm7 = ecx = 64-shift
 356
 357
 358         C One extra limb, destination was unaligned.
 359         C
 360         C source                ebx
 361         C +-------+---------------+--
 362         C |       |      mm2      |
 363         C +-------+---------------+--
 364         C
 365         C dest                          edx
 366         C +---------------+---------------+--
 367         C |               |      mm3      |
 368         C +---------------+---------------+--
 369         C
 370         C mm6 = shift+32
 371         C mm7 = ecx = 64-(shift+32)
 372
 373
 374         C In both cases there's one extra limb of src to fetch and combine
 375         C with mm2 to make a qword at 8(%edx), and in the aligned case
 376         C there's a further extra limb of dst to be formed.
 377
 378
 379         movd    8(%ebx), %mm0           C last src limb, zero extended
 380         psrlq   %mm6, %mm2
 381
 382         movq    %mm0, %mm1
 383         psllq   %mm7, %mm0
 384
 385         movq    %mm3, (%edx)
 386         por     %mm2, %mm0
 387
 388         psrlq   %mm6, %mm1              C top limb, for the aligned case
 389         andl    $32, %ecx               C nonzero when dst was aligned (ecx was 64-shift)
 390
 391         popl    %ebx
 392         jz      L(finish_one_unaligned)
 393
 394         C dst was aligned, must store one extra limb
 395         movd    %mm1, 16(%edx)
 396 L(finish_one_unaligned):
 397
 398         movq    %mm0, 8(%edx)
 399
 400         emms
 401
 402         ret
 403
 404
 405 L(finish_zero):
 406
 407         C No extra limbs, destination was aligned.
 408         C
 409         C source        ebx
 410         C +---------------+--
 411         C |      mm2      |
 412         C +---------------+--
 413         C
 414         C dest                        edx+4
 415         C +---------------+---------------+--
 416         C |               |      mm3      |
 417         C +---------------+---------------+--
 418         C
 419         C mm6 = shift
 420         C mm7 = ecx = 64-shift
 421
 422
 423         C No extra limbs, destination was unaligned.
 424         C
 425         C source        ebx
 426         C +---------------+--
 427         C |      mm2      |
 428         C +---------------+--
 429         C
 430         C dest                edx+4
 431         C +-------+---------------+--
 432         C |       |      mm3      |
 433         C +-------+---------------+--
 434         C
 435         C mm6 = shift+32
 436         C mm7 = 64-(shift+32)
 437
 438
 439         C The movd for the unaligned case is clearly the same data as the
 440         C movq for the aligned case, it's just a choice between whether one
 441         C or two limbs should be written.
 442
 443
 444         movq    %mm3, 4(%edx)
 445         psrlq   %mm6, %mm2
 446
 447         movd    %mm2, 12(%edx)          C one limb, enough if dst was unaligned
 448         andl    $32, %ecx               C nonzero when dst was aligned (ecx was 64-shift)
 449
 450         popl    %ebx
 451         jz      L(finish_zero_unaligned)
 452
 453         movq    %mm2, 12(%edx)          C dst was aligned, store both limbs
 454 L(finish_zero_unaligned):
 455
 456         emms
 457
 458         ret
 459
 460 EPILOGUE()