ghc/rts/gmp/mpn/x86/k7/mmx/rshift.asm

   1 dnl  AMD K7 mpn_rshift -- mpn right shift.
   2 dnl
   3 dnl  K7: 1.21 cycles/limb (at 16 limbs/loop).
   4
   5
   6 dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
   7 dnl
   8 dnl  This file is part of the GNU MP Library.
   9 dnl
  10 dnl  The GNU MP Library is free software; you can redistribute it and/or
  11 dnl  modify it under the terms of the GNU Lesser General Public License as
  12 dnl  published by the Free Software Foundation; either version 2.1 of the
  13 dnl  License, or (at your option) any later version.
  14 dnl
  15 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  16 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18 dnl  Lesser General Public License for more details.
  19 dnl
  20 dnl  You should have received a copy of the GNU Lesser General Public
  21 dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
  22 dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
  23 dnl  Suite 330, Boston, MA 02111-1307, USA.
  24
  25
  26 include(`../config.m4')
  27
  28
  29 dnl  K7: UNROLL_COUNT cycles/limb
  30 dnl           4           1.51
  31 dnl           8           1.26
  32 dnl          16           1.21
  33 dnl          32           1.2
  34 dnl  Maximum possible with the current code is 64.
     dnl
     dnl  UNROLL_COUNT is the number of limbs handled per pass of the unrolled
     dnl  loop at L(top), which expands to UNROLL_COUNT/CHUNK_COUNT chunks.
     dnl  NOTE(review): UNROLL_MASK, UNROLL_LOG2 and UNROLL_BYTES are used by the
     dnl  code but not defined in this file; presumably they are derived from
     dnl  UNROLL_COUNT by the included m4 definitions -- confirm.
  35
  36 deflit(UNROLL_COUNT, 16)
  37
  38
  39 C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
  40 C                       unsigned shift);
  41 C
  42 C Shift src,size right by shift many bits and store the result in dst,size.
  43 C Zeros are shifted in at the left.  The bits shifted out at the right are
  44 C the return value.
  45 C
  46 C This code uses 64-bit MMX operations, which makes it possible to handle
  47 C two limbs at a time, for a theoretical 1.0 cycles/limb.  Plain integer
  48 C code, on the other hand, suffers from shrd being a vector path decode and
  49 C running at 3 cycles back-to-back.
  50 C
  51 C Full speed depends on source and destination being aligned, and some hairy
  52 C setups and finish-ups are done to arrange this for the loop.
  53
  54 ifdef(`PIC',`
  55 deflit(UNROLL_THRESHOLD, 10)
  56 ',`
  57 deflit(UNROLL_THRESHOLD, 10)
  58 ')
     C UNROLL_THRESHOLD is the smallest size sent to the unrolled code at
     C L(unroll); smaller sizes (above one limb) use the loop at L(simple_top).
     C The PIC and non-PIC settings are currently both 10.
  59
     C Incoming stack arguments: dst, src, size, shift.
     C NOTE(review): defframe is defined outside this file; the offsets look
     C like byte offsets from the stack pointer at entry, with FRAME tracking
     C later changes to esp -- confirm against the macro definition.
  60 defframe(PARAM_SHIFT,16)
  61 defframe(PARAM_SIZE, 12)
  62 defframe(PARAM_SRC,  8)
  63 defframe(PARAM_DST,  4)
  64
     C Save slots for edi, esi and ebx.  SAVE_SIZE bytes are reserved for them
     C by the subl at function entry and released before each ret.
  65 defframe(SAVE_EDI, -4)
  66 defframe(SAVE_ESI, -8)
  67 defframe(SAVE_EBX, -12)
  68 deflit(SAVE_SIZE, 12)
  69
  70         .text
  71         ALIGN(32)
  72
  73 PROLOGUE(mpn_rshift)
             C Entry.  Loads the four stack arguments, saves edi, and handles
             C size==1 with plain integer code.  Larger sizes continue at
             C L(more_than_one_limb).
             C
             C The return value in eax is the bits shifted out of the low src
             C limb, placed at the top of the register.
  74 deflit(`FRAME',0)
  75
  76         movl    PARAM_SIZE, %eax
  77         movl    PARAM_SRC, %edx
  78         subl    $SAVE_SIZE, %esp        C reserve the register save slots
  79 deflit(`FRAME',SAVE_SIZE)
  80
  81         movl    PARAM_SHIFT, %ecx
  82         movl    %edi, SAVE_EDI
  83
  84         movl    PARAM_DST, %edi
  85         decl    %eax                    C size-1, zero means a single limb
  86         jnz     L(more_than_one_limb)
  87
             C Single limb.  No MMX register has been touched on this path, so
             C there is no emms before the ret.
  88         movl    (%edx), %edx            C src limb
  89
  90         shrdl(  %cl, %edx, %eax)        C eax was decremented to zero, so it ends up holding only the bits shifted out of edx
  91
  92         shrl    %cl, %edx
  93
  94         movl    %edx, (%edi)            C dst limb
  95         movl    SAVE_EDI, %edi
  96         addl    $SAVE_SIZE, %esp
  97
  98         ret
  99
 100
 101 C -----------------------------------------------------------------------------
 102 L(more_than_one_limb):
 103         C eax   size-1
 104         C ebx
 105         C ecx   shift
 106         C edx   src
 107         C esi
 108         C edi   dst
 109         C ebp
             C
             C Sizes of UNROLL_THRESHOLD or more go to L(unroll).  Smaller sizes
             C use the loop below, which reads two adjacent src limbs as one
             C qword, shifts it right, and stores the low half as one dst limb.
             C The top dst limb and the return value are formed after the loop.
 110
 111         movd    PARAM_SHIFT, %mm6       C rshift
 112         movd    (%edx), %mm5            C src low limb
 113         cmp     $UNROLL_THRESHOLD-1, %eax
 114
 115         jae     L(unroll)
 116         leal    (%edx,%eax,4), %edx     C &src[size-1]
 117         leal    -4(%edi,%eax,4), %edi   C &dst[size-2]
 118
 119         movd    (%edx), %mm4            C src high limb
 120         negl    %eax                    C -(size-1), counts up to zero
 121
 122
 123 L(simple_top):
 124         C eax   loop counter, limbs, negative
 125         C ebx
 126         C ecx   shift
 127         C edx   &src[size-1]
 128         C esi
 129         C edi   &dst[size-2]
 130         C ebp
 131         C
 132         C mm0   scratch
 133         C mm4   src high limb
 134         C mm5   src low limb
 135         C mm6   shift
 136
 137         movq    (%edx,%eax,4), %mm0     C two adjacent src limbs
 138         incl    %eax                    C sets the flags tested by jnz below
 139
 140         psrlq   %mm6, %mm0
 141
 142         movd    %mm0, (%edi,%eax,4)     C index already stepped, edi base is one limb lower to match
 143         jnz     L(simple_top)
 144
 145
 146         psllq   $32, %mm5               C src low limb to the high dword
 147         psrlq   %mm6, %mm4
 148
 149         psrlq   %mm6, %mm5              C low dword now holds the bits shifted out
 150         movd    %mm4, 4(%edi)           C dst high limb
 151
 152         movd    %mm5, %eax              C return value
 153
 154         movl    SAVE_EDI, %edi
 155         addl    $SAVE_SIZE, %esp
 156         emms
 157
 158         ret
 159
 160
 161 C -----------------------------------------------------------------------------
 162         ALIGN(16)
 163 L(unroll):
 164         C eax   size-1
 165         C ebx
 166         C ecx   shift
 167         C edx   src
 168         C esi
 169         C edi   dst
 170         C ebp
 171         C
 172         C mm5   src low limb
 173         C mm6   rshift
             C
             C Setup for the unrolled loop: save esi and ebx, handle one low limb
             C separately if src is at 4 mod 8, handle another if dst is then at
             C 4 mod 8, and finally jump to a computed point inside the unrolled
             C code at L(entry).
 174
 175         testb   $4, %dl                 C is src at 4 mod 8, result used by jz below
 176         movl    %esi, SAVE_ESI
 177         movl    %ebx, SAVE_EBX
 178
 179         psllq   $32, %mm5               C src low limb to the high dword, for the return value
 180         jz      L(start_src_aligned)
 181
 182
 183         C src isn't aligned, process low limb separately (marked xxx) and
 184         C step src and dst by one limb, making src aligned.
 185         C
 186         C source                  edx
 187         C --+-------+-------+-------+
 188         C           |          xxx  |
 189         C --+-------+-------+-------+
 190         C         4mod8   0mod8   4mod8
 191         C
 192         C         dest            edi
 193         C         --+-------+-------+
 194         C           |       |  xxx  |
 195         C         --+-------+-------+
 196
 197         movq    (%edx), %mm0            C src low two limbs
 198         addl    $4, %edx
 199         movl    %eax, PARAM_SIZE        C size-1 is the new size, reloaded before the computed jump
 200
 201         addl    $4, %edi
 202         decl    %eax                    C size-2 is new size-1
 203
 204         psrlq   %mm6, %mm0
 205         movl    %edi, PARAM_DST         C new dst (not read again in the code shown here)
 206
 207         movd    %mm0, -4(%edi)          C the limb marked xxx
 208 L(start_src_aligned):
 209
 210
 211         movq    (%edx), %mm1            C src low two limbs
 212         decl    %eax                    C size-2, two last limbs handled at end
 213         testl   $4, %edi
 214
 215         psrlq   %mm6, %mm5              C return value, mm6 is still the plain shift here
 216         jz      L(start_dst_aligned)
 217
 218
 219         C dst isn't aligned, add 4 to make it so, and pretend the shift is
 220         C 32 bits extra.  Low limb of dst (marked xxx) handled here separately.
 221         C
 222         C          source          edx
 223         C          --+-------+-------+
 224         C            |      mm1      |
 225         C          --+-------+-------+
 226         C                  4mod8   0mod8
 227         C
 228         C  dest                    edi
 229         C  --+-------+-------+-------+
 230         C                    |  xxx  |
 231         C  --+-------+-------+-------+
 232         C          4mod8   0mod8   4mod8
 233
 234         movq    %mm1, %mm0              C keep an unshifted copy
 235         psrlq   %mm6, %mm1
 236         addl    $32, %ecx               C shift+32
 237
 238         movd    %mm1, (%edi)            C the limb marked xxx
 239         movq    %mm0, %mm1              C restore the unshifted qword
 240         addl    $4, %edi                C new dst
 241
 242         movd    %ecx, %mm6              C rshift is now shift+32
 243 L(start_dst_aligned):
 244
 245
             C Here ecx and mm6 hold the effective rshift (shift, or shift+32 if
             C dst was stepped above).  The code below forms lshift = 64-rshift
             C in ecx and mm7, the loop counter in ebx, and the entry offset in
             C eax.
             C NOTE(review): UNROLL_MASK is defined outside this file;
             C presumably the masked value is the number of limbs the computed
             C jump skips in the first pass -- confirm.
 246         movq    %mm1, %mm2              C copy of src low two limbs
 247         negl    %ecx
 248         andl    $-2, %eax               C round size down to even
 249
 250         movl    %eax, %ebx
 251         negl    %eax
 252         addl    $64, %ecx
 253
 254         andl    $UNROLL_MASK, %eax
 255         decl    %ebx
 256
 257         shll    %eax                    C doubled, see the address arithmetic below
 258
 259         movd    %ecx, %mm7              C lshift = 64-rshift
 260
             C esi = L(entry) + 5*eax.  eax is twice a limb count at this point,
             C which gives 10 bytes per limb, the code size noted at L(top).
             C eax is then negated, so the leal instructions after the ifdef move
             C edx and edi back by 4 bytes for each of those limbs.  The extra 8
             C on edx steps past the qword already loaded into mm1 and mm2.
             C In the PIC case the same esi is formed in L(pic_calc) from the
             C return address pushed by the call.
 261 ifdef(`PIC',`
 262         call    L(pic_calc)
 263 L(here):
 264 ',`
 265         leal    L(entry) (%eax,%eax,4), %esi
 266         negl    %eax
 267 ')
 268         shrl    $UNROLL_LOG2, %ebx      C loop counter
 269
 270         leal    ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx
 271         leal    ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
 272         movl    PARAM_SIZE, %eax        C for use at end
 273
 274         jmp     *%esi
 275
 276
 277 ifdef(`PIC',`
 278 L(pic_calc):
 279         C See README.family about old gas bugs
 280         leal    (%eax,%eax,4), %esi
 281         addl    $L(entry)-L(here), %esi
 282         addl    (%esp), %esi
 283         negl    %eax
 284
 285         ret
 286 ')
 286 ')
 287
 288
 289 C -----------------------------------------------------------------------------
 290         ALIGN(64)
 291 L(top):
 292         C eax   size, for use at end
 293         C ebx   loop counter
 294         C ecx   lshift
 295         C edx   src
 296         C esi   was computed jump
 297         C edi   dst
 298         C ebp
 299         C
 300         C mm0   scratch
 301         C mm1   \ carry (alternating)
 302         C mm2   /
 303         C mm6   rshift
 304         C mm7   lshift
 305         C
 306         C 10 code bytes/limb
 307         C
 308         C The two chunks differ in whether mm1 or mm2 hold the carry.
 309         C The computed jump puts the initial carry in both mm1 and mm2.
             C
             C Each expansion of the forloop body below handles CHUNK_COUNT limbs
             C as two qwords.  For each qword: load it from src, keep an
             C unshifted copy as the next carry, shift the loaded value left by
             C lshift, OR in the previous carry shifted right by rshift, and
             C store the result to dst.
             C
             C The setup code jumps into this sequence at an address computed
             C from the 10 bytes/limb figure above, so the encoded size of the
             C instructions in the body must not change.
 310
 311 L(entry):
 312 deflit(CHUNK_COUNT, 4)
 313 forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
 314         deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
 315         deflit(`disp1', eval(disp0 + 8))
 316
 317         movq    disp0(%edx), %mm0
 318         psrlq   %mm6, %mm2
 319
 320         movq    %mm0, %mm1
 321         psllq   %mm7, %mm0
 322
 323         por     %mm2, %mm0
 324         movq    %mm0, disp0(%edi)
 325
 326
 327         movq    disp1(%edx), %mm0
 328         psrlq   %mm6, %mm1
 329
 330         movq    %mm0, %mm2
 331         psllq   %mm7, %mm0
 332
 333         por     %mm1, %mm0
 334         movq    %mm0, disp1(%edi)
 335 ')
 336
 337         addl    $UNROLL_BYTES, %edx
 338         addl    $UNROLL_BYTES, %edi
 339         decl    %ebx
 340
 341         jns     L(top)                  C repeat until the counter goes negative
 342
 343
 344 deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
 345 deflit(`disp1', eval(disp0-0 + 8))
 346
             C Unrolled loop finished.  mm2 holds the last src qword loaded,
             C still unshifted.  eax is the PARAM_SIZE value reloaded before the
             C computed jump, and its low bit selects the odd or even finish.
             C esi and ebx are restored here, and eax is replaced by the return
             C value; none of the instructions between testb and jz change the
             C flags.
 347         testb   $1, %al
 348         psrlq   %mm6, %mm2      C wanted rshifted in all cases below
 349         movl    SAVE_ESI, %esi
 350
 351         movd    %mm5, %eax              C return value
 352
 353         movl    SAVE_EBX, %ebx
 354         jz      L(end_even)
 355
 356
 357         C Size odd, destination was aligned.
 358         C
 359         C source
 360         C       edx
 361         C +-------+---------------+--
 362         C |       |      mm2      |
 363         C +-------+---------------+--
 364         C
 365         C dest                  edi
 366         C +-------+---------------+---------------+--
 367         C |       |               |    written    |
 368         C +-------+---------------+---------------+--
 369         C
 370         C mm6 = shift
 371         C mm7 = ecx = 64-shift
 372
 373
 374         C Size odd, destination was unaligned.
 375         C
 376         C source
 377         C       edx
 378         C +-------+---------------+--
 379         C |       |      mm2      |
 380         C +-------+---------------+--
 381         C
 382         C dest          edi
 383         C +---------------+---------------+--
 384         C |               |    written    |
 385         C +---------------+---------------+--
 386         C
 387         C mm6 = shift+32
 388         C mm7 = ecx = 64-(shift+32)
 389
 390
 391         C In both cases there's one extra limb of src to fetch and combine
 392         C with mm2 to make a qword to store, and in the aligned case there's
 393         C a further extra limb of dst to be formed.
 394
 395
 396         movd    disp0(%edx), %mm0       C the one extra src limb
 397         movq    %mm0, %mm1
 398
 399         psllq   %mm7, %mm0
 400         testb   $32, %cl                C tells 64-shift from 64-(shift+32), assuming 1 <= shift <= 31 -- TODO confirm range
 401
 402         por     %mm2, %mm0
 403         psrlq   %mm6, %mm1
 404
 405         movq    %mm0, disp0(%edi)
 406         jz      L(finish_odd_unaligned)
 407
 408         movd    %mm1, disp1(%edi)       C aligned case only, the further dst limb
 409 L(finish_odd_unaligned):
 410
 411         movl    SAVE_EDI, %edi
 412         addl    $SAVE_SIZE, %esp
 413         emms
 414
 415         ret
 416
 417
 418 L(end_even):
 419
 420         C Size even, destination was aligned.
 421         C
 422         C source
 423         C +---------------+--
 424         C |      mm2      |
 425         C +---------------+--
 426         C
 427         C dest          edi
 428         C +---------------+---------------+--
 429         C |               |      mm3      |
 430         C +---------------+---------------+--
 431         C
 432         C mm6 = shift
 433         C mm7 = ecx = 64-shift
 434
 435
 436         C Size even, destination was unaligned.
 437         C
 438         C source
 439         C +---------------+--
 440         C |      mm2      |
 441         C +---------------+--
 442         C
 443         C dest  edi
 444         C +-------+---------------+--
 445         C |       |      mm3      |
 446         C +-------+---------------+--
 447         C
 448         C mm6 = shift+32
 449         C mm7 = 64-(shift+32)
 450
 451
 452         C The movd for the unaligned case is the same data as the movq for
 453         C the aligned case, it's just a choice between whether one or two
 454         C limbs should be written.
             C
             C NOTE(review): the dest boxes in the diagrams above are labelled
             C mm3, but the stores below write mm2, which was already shifted
             C right by mm6 before the branch to L(end_even).
 455
 456
 457         testb   $32, %cl                C tells 64-shift from 64-(shift+32), assuming 1 <= shift <= 31 -- TODO confirm range
 458         movd    %mm2, disp0(%edi)       C one limb, all that the unaligned case writes
 459
 460         jz      L(end_even_unaligned)
 461
 462         movq    %mm2, disp0(%edi)       C aligned case, write both limbs
 463 L(end_even_unaligned):
 464
 465         movl    SAVE_EDI, %edi
 466         addl    $SAVE_SIZE, %esp
 467         emms
 468
 469         ret
 470
 471 EPILOGUE()