rts/gmp/mpn/x86/pentium/mmx/lshift.asm

   1 dnl  Intel P5 mpn_lshift -- mpn left shift.
   2 dnl
   3 dnl  P5: 1.75 cycles/limb.
   4
   5
   6 dnl  Copyright (C) 2000 Free Software Foundation, Inc.
   7 dnl
   8 dnl  This file is part of the GNU MP Library.
   9 dnl
  10 dnl  The GNU MP Library is free software; you can redistribute it and/or
  11 dnl  modify it under the terms of the GNU Lesser General Public License as
  12 dnl  published by the Free Software Foundation; either version 2.1 of the
  13 dnl  License, or (at your option) any later version.
  14 dnl
  15 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  16 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18 dnl  Lesser General Public License for more details.
  19 dnl
  20 dnl  You should have received a copy of the GNU Lesser General Public
  21 dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
  22 dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
  23 dnl  Suite 330, Boston, MA 02111-1307, USA.
  24
  25
  26 include(`../config.m4')
  27
  28
  29 C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
  30 C                       unsigned shift);
  31 C
  32 C Shift src,size left by shift many bits and store the result in dst,size.
  33 C Zeros are shifted in at the right.  Return the bits shifted out at the
  34 C left.
  35 C
  36 C The comments in mpn_rshift apply here too.
  37
  38 defframe(PARAM_SHIFT,16)
  39 defframe(PARAM_SIZE, 12)
  40 defframe(PARAM_SRC,  8)
  41 defframe(PARAM_DST,  4)
  42 deflit(`FRAME',0)
  43 dnl  Smallest size sent to the unrolled code.  It must be at least 5:
  44 dnl  one limb may be peeled to align src, then two whole qwords are read.
  45 deflit(UNROLL_THRESHOLD, 5)
  46
  47         .text
  48         ALIGN(8)
  49
  50 PROLOGUE(mpn_lshift)
  51
  52         pushl   %ebx                    C ebx and edi are popped again before each ret
  53         pushl   %edi
  54 deflit(`FRAME',8)
  55
  56         movl    PARAM_SIZE, %eax        C eax = size, in limbs
  57         movl    PARAM_DST, %edx
  58
  59         movl    PARAM_SRC, %ebx
  60         movl    PARAM_SHIFT, %ecx       C shift count, used from cl and later from mm6
  61
  62         cmp     $UNROLL_THRESHOLD, %eax
  63         jae     L(unroll)               C unsigned size >= UNROLL_THRESHOLD
  64
  65         movl    -4(%ebx,%eax,4), %edi   C src high limb
  66         decl    %eax
  67
  68         jnz     L(simple)               C size-1 non-zero, use the simple MMX loop
  69
  70         shldl(  %cl, %edi, %eax)        C eax was decremented to zero, gets the out-shifted bits
  71
  72         shll    %cl, %edi
  73
  74         movl    %edi, (%edx)            C dst low limb
  75         popl    %edi                    C risk of data cache bank clash
  76
  77         popl    %ebx
  78
  79         ret                             C size==1 path used no MMX registers, so no emms
  80
  81
  82 C -----------------------------------------------------------------------------
  83 L(simple):
  84         C eax   size-1
  85         C ebx   src
  86         C ecx   shift
  87         C edx   dst
  88         C esi
  89         C edi
  90         C ebp
  91 deflit(`FRAME',8)
  92
  93         movd    (%ebx,%eax,4), %mm5     C src high limb
  94
  95         movd    %ecx, %mm6              C lshift
  96         negl    %ecx                    C -shift
  97
  98         psllq   %mm6, %mm5
  99         addl    $32, %ecx               C 32-shift
 100
 101         movd    %ecx, %mm7              C rshift, 32-shift
 102         psrlq   $32, %mm5               C retval, bits shifted out of the high limb
 103
 104
 105 L(simple_top):
 106         C eax   counter, limbs, size-1 down to 1
 107         C ebx   src
 108         C ecx
 109         C edx   dst
 110         C esi
 111         C edi
 112         C
 113         C mm0   scratch
 114         C mm5   return value
 115         C mm6   shift
 116         C mm7   32-shift
 117
 118         movq    -4(%ebx,%eax,4), %mm0   C src limbs eax-1 and eax
 119         decl    %eax                    C sets the flags tested by the jnz below
 120
 121         psrlq   %mm7, %mm0              C low dword is now the dst limb
 122
 123         C
 124
 125         movd    %mm0, 4(%edx,%eax,4)    C dst limb, at the index from before the decl
 126         jnz     L(simple_top)
 127
 128
 129         movd    (%ebx), %mm0            C src low limb
 130
 131         movd    %mm5, %eax              C retval
 132         psllq   %mm6, %mm0
 133
 134         popl    %edi
 135         popl    %ebx
 136
 137         movd    %mm0, (%edx)            C dst low limb
 138
 139         emms
 140
 141         ret
 142
 143
 144 C -----------------------------------------------------------------------------
 145         ALIGN(8)
 146 L(unroll):
 147         C eax   size
 148         C ebx   src
 149         C ecx   shift
 150         C edx   dst
 151         C esi
 152         C edi
 153         C ebp
 154 deflit(`FRAME',8)
 155
 156         movd    -4(%ebx,%eax,4), %mm5   C src high limb
 157         leal    (%ebx,%eax,4), %edi     C address just past the src high limb
 158
 159         movd    %ecx, %mm6              C lshift
 160         andl    $4, %edi                C ZF set if that address is 0 mod 8
 161
 162         psllq   %mm6, %mm5
 163         jz      L(start_src_aligned)    C flags still from the andl
 164
 165
 166         C src isn't aligned, process high limb separately (marked xxx) to
 167         C make it so.
 168         C
 169         C  source     -8(ebx,%eax,4)
 170         C                  |
 171         C  +-------+-------+-------+--
 172         C  |               |
 173         C  +-------+-------+-------+--
 174         C        0mod8   4mod8   0mod8
 175         C
 176         C  dest
 177         C     -4(edx,%eax,4)
 178         C          |
 179         C  +-------+-------+--
 180         C  |  xxx  |       |
 181         C  +-------+-------+--
 182
 183         movq    -8(%ebx,%eax,4), %mm0   C unaligned load
 184
 185         psllq   %mm6, %mm0
 186         decl    %eax                    C one limb fewer left to do
 187
 188         psrlq   $32, %mm0               C low dword is the dst high limb
 189
 190         C
 191
 192         movd    %mm0, (%edx,%eax,4)     C dst high limb (eax already decremented)
 193 L(start_src_aligned):
 194
 195         movq    -8(%ebx,%eax,4), %mm1   C src high qword
 196         leal    (%edx,%eax,4), %edi     C address just past the remaining dst limbs
 197
 198         andl    $4, %edi                C ZF set if that address is 0 mod 8
 199         psrlq   $32, %mm5               C return value
 200
 201         movq    -16(%ebx,%eax,4), %mm3  C src second highest qword
 202         jz      L(start_dst_aligned)    C flags still from the andl
 203
 204         C dst isn't aligned, subtract 4 to make it so, and pretend the shift
 205         C is 32 bits extra.  High limb of dst (marked xxx) handled here
 206         C separately.
 207         C
 208         C  source     -8(ebx,%eax,4)
 209         C                  |
 210         C  +-------+-------+--
 211         C  |      mm1      |
 212         C  +-------+-------+--
 213         C                0mod8   4mod8
 214         C
 215         C  dest
 216         C     -4(edx,%eax,4)
 217         C          |
 218         C  +-------+-------+-------+--
 219         C  |  xxx  |               |
 220         C  +-------+-------+-------+--
 221         C        0mod8   4mod8   0mod8
 222
 223         movq    %mm1, %mm0
 224         addl    $32, %ecx               C new shift
 225
 226         psllq   %mm6, %mm0              C mm6 is still the original shift here
 227
 228         movd    %ecx, %mm6              C mm6 = shift+32 from now on
 229         psrlq   $32, %mm0
 230
 231         C wasted cycle here waiting for %mm0
 232
 233         movd    %mm0, -4(%edx,%eax,4)   C dst high limb
 234         subl    $4, %edx                C edx is now dst-4
 235 L(start_dst_aligned):
 236
 237
 238         psllq   %mm6, %mm1
 239         negl    %ecx                    C -shift
 240
 241         addl    $64, %ecx               C 64-shift
 242         movq    %mm3, %mm2              C unshifted copy, needed for the next qword down
 243
 244         movd    %ecx, %mm7              C rshift
 245         subl    $8, %eax                C size-8, carry set if fewer than 8 limbs remain
 246
 247         psrlq   %mm7, %mm3
 248
 249         por     %mm1, %mm3              C mm3 ready to store
 250         jc      L(finish)               C carry from the subl, skip the loop
 251
 252
 253         C The comments in mpn_rshift apply here too.
 254
 255         ALIGN(8)
 256 L(unroll_loop):
 257         C eax   counter, limbs
 258         C ebx   src
 259         C ecx
 260         C edx   dst
 261         C esi
 262         C edi
 263         C
 264         C mm0
 265         C mm1
 266         C mm2   src qword from 16(%ebx,%eax,4)
 267         C mm3   dst qword ready to store to 24(%edx,%eax,4)
 268         C
 269         C mm5   return value
 270         C mm6   lshift
 271         C mm7   rshift
 272
 273         movq    8(%ebx,%eax,4), %mm0    C next src qword down
 274         psllq   %mm6, %mm2
 275
 276         movq    %mm0, %mm1
 277         psrlq   %mm7, %mm0
 278
 279         movq    %mm3, 24(%edx,%eax,4)   C prev
 280         por     %mm2, %mm0
 281
 282         movq    (%ebx,%eax,4), %mm3     C
 283         psllq   %mm6, %mm1              C
 284
 285         movq    %mm0, 16(%edx,%eax,4)
 286         movq    %mm3, %mm2              C
 287
 288         psrlq   %mm7, %mm3              C
 289         subl    $4, %eax                C four limbs per pass, carry set once eax goes below zero
 290
 291         por     %mm1, %mm3              C
 292         jnc     L(unroll_loop)          C carry from the subl
 293
 294
 295
 296 L(finish):
 297         C eax   -4 to -1 representing respectively 0 to 3 limbs remaining
 298
 299         testb   $2, %al                 C bit 1 is set for eax -2 or -1, ie. 2 or 3 limbs
 300
 301         jz      L(finish_no_two)
 302
 303         movq    8(%ebx,%eax,4), %mm0
 304         psllq   %mm6, %mm2
 305
 306         movq    %mm0, %mm1
 307         psrlq   %mm7, %mm0
 308
 309         movq    %mm3, 24(%edx,%eax,4)   C prev
 310         por     %mm2, %mm0
 311
 312         movq    %mm1, %mm2
 313         movq    %mm0, %mm3
 314
 315         subl    $2, %eax
 316 L(finish_no_two):
 317
 318
 319         C eax   -4 or -3 representing respectively 0 or 1 limbs remaining
 320         C
 321         C mm2   src prev qword, from 16(%ebx,%eax,4)
 322         C mm3   dst qword, for 24(%edx,%eax,4)
 323
 324         testb   $1, %al                 C ZF set if no odd limb remains
 325         movd    %mm5, %eax      C retval
 326
 327         popl    %edi
 328         jz      L(finish_zero)          C flags still from the testb
 329
 330
 331         C One extra src limb, destination was aligned.
 332         C
 333         C                 source                  ebx
 334         C                 --+---------------+-------+
 335         C                   |      mm2      |       |
 336         C                 --+---------------+-------+
 337         C
 338         C dest         edx+12           edx+4     edx
 339         C --+---------------+---------------+-------+
 340         C   |      mm3      |               |       |
 341         C --+---------------+---------------+-------+
 342         C
 343         C mm6 = shift
 344         C mm7 = ecx = 64-shift
 345
 346
 347         C One extra src limb, destination was unaligned.
 348         C
 349         C                 source                  ebx
 350         C                 --+---------------+-------+
 351         C                   |      mm2      |       |
 352         C                 --+---------------+-------+
 353         C
 354         C         dest         edx+12           edx+4
 355         C         --+---------------+---------------+
 356         C           |      mm3      |               |
 357         C         --+---------------+---------------+
 358         C
 359         C mm6 = shift+32
 360         C mm7 = ecx = 64-(shift+32)
 361
 362
 363         C In both cases there's one extra limb of src to fetch and combine
 364         C with mm2 to make a qword at 4(%edx), and in the aligned case
 365         C there's an extra limb of dst to be formed from that extra src limb
 366         C left shifted.
 367
 368
 369         movd    (%ebx), %mm0            C src low limb
 370         psllq   %mm6, %mm2
 371
 372         movq    %mm3, 12(%edx)
 373         psllq   $32, %mm0
 374
 375         movq    %mm0, %mm1
 376         psrlq   %mm7, %mm0
 377
 378         por     %mm2, %mm0
 379         psllq   %mm6, %mm1
 380
 381         movq    %mm0, 4(%edx)
 382         psrlq   $32, %mm1               C low dword is the src low limb left shifted
 383
 384         andl    $32, %ecx               C non-zero only if dst was aligned (assuming shift is 1 to 31)
 385         popl    %ebx
 386
 387         jz      L(finish_one_unaligned)
 388
 389         movd    %mm1, (%edx)            C extra dst low limb, aligned case only
 390 L(finish_one_unaligned):
 391
 392         emms
 393
 394         ret
 395
 396
 397 L(finish_zero):
 398
 399         C No extra src limbs, destination was aligned.
 400         C
 401         C                 source          ebx
 402         C                 --+---------------+
 403         C                   |      mm2      |
 404         C                 --+---------------+
 405         C
 406         C dest          edx+8             edx
 407         C --+---------------+---------------+
 408         C   |      mm3      |               |
 409         C --+---------------+---------------+
 410         C
 411         C mm6 = shift
 412         C mm7 = ecx = 64-shift
 413
 414
 415         C No extra src limbs, destination was unaligned.
 416         C
 417         C               source            ebx
 418         C                 --+---------------+
 419         C                   |      mm2      |
 420         C                 --+---------------+
 421         C
 422         C         dest          edx+8   edx+4
 423         C         --+---------------+-------+
 424         C           |      mm3      |       |
 425         C         --+---------------+-------+
 426         C
 427         C mm6 = shift+32
 428         C mm7 = ecx = 64-(shift+32)
 429
 430
 431         C The movd for the unaligned case writes the same data to 4(%edx)
 432         C that the movq does for the aligned case.
 433
 434
 435         movq    %mm3, 8(%edx)
 436         andl    $32, %ecx               C non-zero only if dst was aligned (assuming shift is 1 to 31)
 437
 438         psllq   %mm6, %mm2
 439         jz      L(finish_zero_unaligned)
 440
 441         movq    %mm2, (%edx)            C aligned case, low dst qword
 442 L(finish_zero_unaligned):
 443
 444         psrlq   $32, %mm2
 445         popl    %ebx
 446
 447         movd    %mm5, %eax      C retval (eax already holds it from finish_no_two)
 448
 449         movd    %mm2, 4(%edx)
 450
 451         emms
 452
 453         ret
 454
 455 EPILOGUE()