ghc/rts/gmp/mpn/x86/k7/mmx/lshift.asm

   1 dnl  AMD K7 mpn_lshift -- mpn left shift.
   2 dnl
   3 dnl  K7: 1.21 cycles/limb (at 16 limbs/loop).
   4
   5
   6 dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
   7 dnl
   8 dnl  This file is part of the GNU MP Library.
   9 dnl
  10 dnl  The GNU MP Library is free software; you can redistribute it and/or
  11 dnl  modify it under the terms of the GNU Lesser General Public License as
  12 dnl  published by the Free Software Foundation; either version 2.1 of the
  13 dnl  License, or (at your option) any later version.
  14 dnl
  15 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  16 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18 dnl  Lesser General Public License for more details.
  19 dnl
  20 dnl  You should have received a copy of the GNU Lesser General Public
  21 dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
  22 dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
  23 dnl  Suite 330, Boston, MA 02111-1307, USA.
  24
  25
  26 include(`../config.m4')
  27
  28
  29 dnl  K7: UNROLL_COUNT cycles/limb
  30 dnl           4           1.51
  31 dnl           8           1.26
  32 dnl          16           1.21
  33 dnl          32           1.2
  34 dnl  Maximum possible with the current code is 64.
  35
  36 deflit(UNROLL_COUNT, 16)
  37
  38
  39 C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
  40 C                       unsigned shift);
  41 C
  42 C Shift src,size left by shift many bits and store the result in dst,size.
  43 C Zeros are shifted in at the right.  The bits shifted out at the left are
  44 C the return value.
  45 C
  46 C The comments in mpn_rshift apply here too.
  47
  48 ifdef(`PIC',`
  49 deflit(UNROLL_THRESHOLD, 10)
  50 ',`
  51 deflit(UNROLL_THRESHOLD, 10)
  52 ')
  53
  54 defframe(PARAM_SHIFT,16)
  55 defframe(PARAM_SIZE, 12)
  56 defframe(PARAM_SRC,  8)
  57 defframe(PARAM_DST,  4)
  58
  59 defframe(SAVE_EDI, -4)
  60 defframe(SAVE_ESI, -8)
  61 defframe(SAVE_EBX, -12)
  62 deflit(SAVE_SIZE, 12)
  63
  64         .text
  65         ALIGN(32)
  66
  67 PROLOGUE(mpn_lshift)
  68 deflit(`FRAME',0)
  69
  70         movl    PARAM_SIZE, %eax
  71         movl    PARAM_SRC, %edx
  72         subl    $SAVE_SIZE, %esp
  73 deflit(`FRAME',SAVE_SIZE)
  74
  75         movl    PARAM_SHIFT, %ecx
  76         movl    %edi, SAVE_EDI
  77
  78         movl    PARAM_DST, %edi
  79         decl    %eax
  80         jnz     L(more_than_one_limb)
  81
  82         movl    (%edx), %edx
  83
  84         shldl(  %cl, %edx, %eax)        C eax was decremented to zero
  85
  86         shll    %cl, %edx
  87
  88         movl    %edx, (%edi)
  89         movl    SAVE_EDI, %edi
  90         addl    $SAVE_SIZE, %esp
  91
  92         ret
  93
  94
  95 C -----------------------------------------------------------------------------
  96 L(more_than_one_limb):
  97         C eax   size-1
  98         C ebx
  99         C ecx   shift
 100         C edx   src
 101         C esi
 102         C edi   dst
 103         C ebp
 104
 105         movd    PARAM_SHIFT, %mm6
 106         movd    (%edx,%eax,4), %mm5     C src high limb
 107         cmp     $UNROLL_THRESHOLD-1, %eax
 108
 109         jae     L(unroll)
 110         negl    %ecx
 111         movd    (%edx), %mm4            C src low limb
 112
 113         addl    $32, %ecx
 114
 115         movd    %ecx, %mm7
 116
 117 L(simple_top):
 118         C eax   loop counter, limbs
 119         C ebx
 120         C ecx
 121         C edx   src
 122         C esi
 123         C edi   dst
 124         C ebp
 125         C
 126         C mm0   scratch
 127         C mm4   src low limb
 128         C mm5   src high limb
 129         C mm6   shift
 130         C mm7   32-shift
 131
 132         movq    -4(%edx,%eax,4), %mm0
 133         decl    %eax
 134
 135         psrlq   %mm7, %mm0
 136
 137         movd    %mm0, 4(%edi,%eax,4)
 138         jnz     L(simple_top)
 139
 140
 141         psllq   %mm6, %mm5
 142         psllq   %mm6, %mm4
 143
 144         psrlq   $32, %mm5
 145         movd    %mm4, (%edi)            C dst low limb
 146
 147         movd    %mm5, %eax              C return value
 148
 149         movl    SAVE_EDI, %edi
 150         addl    $SAVE_SIZE, %esp
 151         emms
 152
 153         ret
 154
 155
 156 C -----------------------------------------------------------------------------
 157         ALIGN(16)
 158 L(unroll):
 159         C eax   size-1
 160         C ebx   (saved)
 161         C ecx   shift
 162         C edx   src
 163         C esi
 164         C edi   dst
 165         C ebp
 166         C
 167         C mm5   src high limb, for return value
 168         C mm6   lshift
 169
 170         movl    %esi, SAVE_ESI
 171         movl    %ebx, SAVE_EBX
 172         leal    -4(%edx,%eax,4), %edx   C &src[size-2]
 173
 174         testb   $4, %dl
 175         movq    (%edx), %mm1            C src high qword
 176
 177         jz      L(start_src_aligned)
 178
 179
 180         C src isn't aligned, process high limb (marked xxx) separately to
 181         C make it so
 182         C
 183         C  source    -4(edx,%eax,4)
 184         C                  |
 185         C  +-------+-------+-------+--
 186         C  |  xxx          |
 187         C  +-------+-------+-------+--
 188         C        0mod8   4mod8   0mod8
 189         C
 190         C  dest      -4(edi,%eax,4)
 191         C                  |
 192         C  +-------+-------+--
 193         C  |  xxx  |       |
 194         C  +-------+-------+--
 195
 196         psllq   %mm6, %mm1
 197         subl    $4, %edx
 198         movl    %eax, PARAM_SIZE        C size-1
 199
 200         psrlq   $32, %mm1
 201         decl    %eax                    C size-2 is new size-1
 202
 203         movd    %mm1, 4(%edi,%eax,4)
 204         movq    (%edx), %mm1            C new src high qword
 205 L(start_src_aligned):
 206
 207
 208         leal    -4(%edi,%eax,4), %edi   C &dst[size-2]
 209         psllq   %mm6, %mm5
 210
 211         testl   $4, %edi
 212         psrlq   $32, %mm5               C return value
 213
 214         jz      L(start_dst_aligned)
 215
 216
 217         C dst isn't aligned, subtract 4 bytes to make it so, and pretend the
 218         C shift is 32 bits extra.  High limb of dst (marked xxx) handled
 219         C here separately.
 220         C
 221         C  source       %edx
 222         C  +-------+-------+--
 223         C  |      mm1      |
 224         C  +-------+-------+--
 225         C                0mod8   4mod8
 226         C
 227         C  dest         %edi
 228         C  +-------+-------+-------+--
 229         C  |  xxx  |
 230         C  +-------+-------+-------+--
 231         C        0mod8   4mod8   0mod8
 232
 233         movq    %mm1, %mm0
 234         psllq   %mm6, %mm1
 235         addl    $32, %ecx               C shift+32
 236
 237         psrlq   $32, %mm1
 238
 239         movd    %mm1, 4(%edi)
 240         movq    %mm0, %mm1
 241         subl    $4, %edi
 242
 243         movd    %ecx, %mm6              C new lshift
 244 L(start_dst_aligned):
 245
 246         decl    %eax                    C size-2, two last limbs handled at end
 247         movq    %mm1, %mm2              C copy of src high qword
 248         negl    %ecx
 249
 250         andl    $-2, %eax               C round size down to even
 251         addl    $64, %ecx
 252
 253         movl    %eax, %ebx
 254         negl    %eax
 255
 256         andl    $UNROLL_MASK, %eax
 257         decl    %ebx
 258
 259         shll    %eax
 260
 261         movd    %ecx, %mm7              C rshift = 64-lshift
 262
 263 ifdef(`PIC',`
 264         call    L(pic_calc)
 265 L(here):
 266 ',`
 267         leal    L(entry) (%eax,%eax,4), %esi
 268 ')
 269         shrl    $UNROLL_LOG2, %ebx      C loop counter
 270
 271         leal    ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx
 272         leal    ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
 273         movl    PARAM_SIZE, %eax        C for use at end
 274         jmp     *%esi
 275
 276
 277 ifdef(`PIC',`
 278 L(pic_calc):
 279         C See README.family about old gas bugs
 280         leal    (%eax,%eax,4), %esi
 281         addl    $L(entry)-L(here), %esi
 282         addl    (%esp), %esi
 283
 284         ret
 285 ')
 286
 287
 288 C -----------------------------------------------------------------------------
 289         ALIGN(32)
 290 L(top):
 291         C eax   size (for use at end)
 292         C ebx   loop counter
 293         C ecx   rshift
 294         C edx   src
 295         C esi   computed jump
 296         C edi   dst
 297         C ebp
 298         C
 299         C mm0   scratch
 300         C mm1   \ carry (alternating, mm2 first)
 301         C mm2   /
 302         C mm6   lshift
 303         C mm7   rshift
 304         C
 305         C 10 code bytes/limb
 306         C
 307         C The two chunks differ in whether mm1 or mm2 hold the carry.
 308         C The computed jump puts the initial carry in both mm1 and mm2.
 309
 310 L(entry):
 311 deflit(CHUNK_COUNT, 4)
 312 forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
 313         deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
 314         deflit(`disp1', eval(disp0 - 8))
 315
 316         movq    disp0(%edx), %mm0
 317         psllq   %mm6, %mm2
 318
 319         movq    %mm0, %mm1
 320         psrlq   %mm7, %mm0
 321
 322         por     %mm2, %mm0
 323         movq    %mm0, disp0(%edi)
 324
 325
 326         movq    disp1(%edx), %mm0
 327         psllq   %mm6, %mm1
 328
 329         movq    %mm0, %mm2
 330         psrlq   %mm7, %mm0
 331
 332         por     %mm1, %mm0
 333         movq    %mm0, disp1(%edi)
 334 ')
 335
 336         subl    $UNROLL_BYTES, %edx
 337         subl    $UNROLL_BYTES, %edi
 338         decl    %ebx
 339
 340         jns     L(top)
 341
 342
 343
 344 define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))')
 345
 346 L(end):
 347         testb   $1, %al
 348         movl    SAVE_EBX, %ebx
 349         psllq   %mm6, %mm2      C wanted left shifted in all cases below
 350
 351         movd    %mm5, %eax
 352
 353         movl    SAVE_ESI, %esi
 354         jz      L(end_even)
 355
 356
 357 L(end_odd):
 358
 359         C Size odd, destination was aligned.
 360         C
 361         C                 source        edx+8   edx+4
 362         C                 --+---------------+-------+
 363         C                   |      mm2      |       |
 364         C                 --+---------------+-------+
 365         C
 366         C dest                            edi
 367         C --+---------------+---------------+-------+
 368         C   |   written     |               |       |
 369         C --+---------------+---------------+-------+
 370         C
 371         C mm6 = shift
 372         C mm7 = ecx = 64-shift
 373
 374
 375         C Size odd, destination was unaligned.
 376         C
 377         C                 source        edx+8   edx+4
 378         C                 --+---------------+-------+
 379         C                   |      mm2      |       |
 380         C                 --+---------------+-------+
 381         C
 382         C         dest                            edi
 383         C         --+---------------+---------------+
 384         C           |   written     |               |
 385         C         --+---------------+---------------+
 386         C
 387         C mm6 = shift+32
 388         C mm7 = ecx = 64-(shift+32)
 389
 390
 391         C In both cases there's one extra limb of src to fetch and combine
 392         C with mm2 to make a qword at (%edi), and in the aligned case
 393         C there's an extra limb of dst to be formed from that extra src limb
 394         C left shifted.
 395
 396         movd    disp(4) (%edx), %mm0
 397         testb   $32, %cl
 398
 399         movq    %mm0, %mm1
 400         psllq   $32, %mm0
 401
 402         psrlq   %mm7, %mm0
 403         psllq   %mm6, %mm1
 404
 405         por     %mm2, %mm0
 406
 407         movq    %mm0, disp(0) (%edi)
 408         jz      L(end_odd_unaligned)
 409         movd    %mm1, disp(-4) (%edi)
 410 L(end_odd_unaligned):
 411
 412         movl    SAVE_EDI, %edi
 413         addl    $SAVE_SIZE, %esp
 414         emms
 415
 416         ret
 417
 418
 419 L(end_even):
 420
 421         C Size even, destination was aligned.
 422         C
 423         C                 source        edx+8
 424         C                 --+---------------+
 425         C                   |      mm2      |
 426         C                 --+---------------+
 427         C
 428         C dest                            edi
 429         C --+---------------+---------------+
 430         C   |   written     |               |
 431         C --+---------------+---------------+
 432         C
 433         C mm6 = shift
 434         C mm7 = ecx = 64-shift
 435
 436
 437         C Size even, destination was unaligned.
 438         C
 439         C               source          edx+8
 440         C                 --+---------------+
 441         C                   |      mm2      |
 442         C                 --+---------------+
 443         C
 444         C         dest                  edi+4
 445         C         --+---------------+-------+
 446         C           |    written    |       |
 447         C         --+---------------+-------+
 448         C
 449         C mm6 = shift+32
 450         C mm7 = ecx = 64-(shift+32)
 451
 452
 453         C The movq for the aligned case overwrites the movd for the
 454         C unaligned case.
 455
 456         movq    %mm2, %mm0
 457         psrlq   $32, %mm2
 458
 459         testb   $32, %cl
 460         movd    %mm2, disp(4) (%edi)
 461
 462         jz      L(end_even_unaligned)
 463         movq    %mm0, disp(0) (%edi)
 464 L(end_even_unaligned):
 465
 466         movl    SAVE_EDI, %edi
 467         addl    $SAVE_SIZE, %esp
 468         emms
 469
 470         ret
 471
 472 EPILOGUE()