ghc/rts/gmp/mpn/x86/k7/mmx/mod_1.asm

   1 dnl  AMD K7 mpn_mod_1 -- mpn by limb remainder.
   2 dnl
   3 dnl  K7: 17.0 cycles/limb.
   4
   5
   6 dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
   7 dnl
   8 dnl  This file is part of the GNU MP Library.
   9 dnl
  10 dnl  The GNU MP Library is free software; you can redistribute it and/or
  11 dnl  modify it under the terms of the GNU Lesser General Public License as
  12 dnl  published by the Free Software Foundation; either version 2.1 of the
  13 dnl  License, or (at your option) any later version.
  14 dnl
  15 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  16 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18 dnl  Lesser General Public License for more details.
  19 dnl
  20 dnl  You should have received a copy of the GNU Lesser General Public
  21 dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
  22 dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
  23 dnl  Suite 330, Boston, MA 02111-1307, USA.
  24
  25
  26 include(`../config.m4')
  27
  28
  29 C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
  30 C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
  31 C                       mp_limb_t carry);
  32 C
  33 C The code here is the same as mpn_divrem_1, but with the quotient
  34 C discarded.  See mpn/x86/k7/mmx/divrem_1.c for some comments.
  35
  36
  37 dnl  MUL_THRESHOLD is the size at which the multiply by inverse method is
  38 dnl  used, rather than plain "divl"s.  Minimum value 2.
  39 dnl
  40 dnl  The inverse takes about 50 cycles to calculate, but after that the
  41 dnl  multiply is 17 c/l versus division at 41 c/l.
  42 dnl
  43 dnl  Using mul or div is about the same speed at 3 limbs, so the threshold
  44 dnl  is set to 4 to get the smaller div code used at 3.
  45
  46 deflit(MUL_THRESHOLD, 4)
  47
  48
  49 defframe(PARAM_CARRY,  16)
  50 defframe(PARAM_DIVISOR,12)
  51 defframe(PARAM_SIZE,    8)
  52 defframe(PARAM_SRC,     4)
  53
  54 defframe(SAVE_EBX,    -4)
  55 defframe(SAVE_ESI,    -8)
  56 defframe(SAVE_EDI,    -12)
  57 defframe(SAVE_EBP,    -16)
  58
  59 defframe(VAR_NORM,    -20)
  60 defframe(VAR_INVERSE, -24)
  61 defframe(VAR_SRC_STOP,-28)
  62
  63 deflit(STACK_SPACE, 28)
  64
  65         .text
  66         ALIGN(32)
  67
  68 PROLOGUE(mpn_mod_1c)
  69 deflit(`FRAME',0)
  70         movl    PARAM_CARRY, %edx
  71         movl    PARAM_SIZE, %ecx
  72         subl    $STACK_SPACE, %esp
  73 deflit(`FRAME',STACK_SPACE)
  74
  75         movl    %ebp, SAVE_EBP
  76         movl    PARAM_DIVISOR, %ebp
  77
  78         movl    %esi, SAVE_ESI
  79         movl    PARAM_SRC, %esi
  80         jmp     LF(mpn_mod_1,start_1c)
  81
  82 EPILOGUE()
  83
  84
  85         ALIGN(32)
  86 PROLOGUE(mpn_mod_1)
  87 deflit(`FRAME',0)
  88
  89         movl    PARAM_SIZE, %ecx
  90         movl    $0, %edx                C initial carry (if can't skip a div)
  91         subl    $STACK_SPACE, %esp
  92 deflit(`FRAME',STACK_SPACE)
  93
  94         movl    %esi, SAVE_ESI
  95         movl    PARAM_SRC, %esi
  96
  97         movl    %ebp, SAVE_EBP
  98         movl    PARAM_DIVISOR, %ebp
  99
 100         orl     %ecx, %ecx
 101         jz      L(divide_done)
 102
 103         movl    -4(%esi,%ecx,4), %eax   C src high limb
 104
 105         cmpl    %ebp, %eax              C carry flag if high<divisor
 106
 107         cmovc(  %eax, %edx)             C src high limb as initial carry
 108         sbbl    $0, %ecx                C size-1 to skip one div
 109         jz      L(divide_done)
 110
 111
 112         ALIGN(16)
 113 L(start_1c):
 114         C eax
 115         C ebx
 116         C ecx   size
 117         C edx   carry
 118         C esi   src
 119         C edi
 120         C ebp   divisor
 121
 122         cmpl    $MUL_THRESHOLD, %ecx
 123         jae     L(mul_by_inverse)
 124
 125
 126
 127 C With a MUL_THRESHOLD of 4, this "loop" only ever does 1 to 3 iterations,
 128 C but it's already fast and compact, and there's nothing to gain by
 129 C expanding it out.
 130 C
 131 C Using PARAM_DIVISOR in the divl is a couple of cycles faster than %ebp.
 132
 133         orl     %ecx, %ecx
 134         jz      L(divide_done)
 135
 136
 137 L(divide_top):
 138         C eax   scratch (quotient)
 139         C ebx
 140         C ecx   counter, limbs, decrementing
 141         C edx   scratch (remainder)
 142         C esi   src
 143         C edi
 144         C ebp
 145
 146         movl    -4(%esi,%ecx,4), %eax
 147
 148         divl    PARAM_DIVISOR
 149
 150         decl    %ecx
 151         jnz     L(divide_top)
 152
 153
 154 L(divide_done):
 155         movl    SAVE_ESI, %esi
 156         movl    SAVE_EBP, %ebp
 157         addl    $STACK_SPACE, %esp
 158
 159         movl    %edx, %eax
 160
 161         ret
 162
 163
 164
 165 C -----------------------------------------------------------------------------
 166
 167 L(mul_by_inverse):
 168         C eax
 169         C ebx
 170         C ecx   size
 171         C edx   carry
 172         C esi   src
 173         C edi
 174         C ebp   divisor
 175
 176         bsrl    %ebp, %eax              C 31-l
 177
 178         movl    %ebx, SAVE_EBX
 179         leal    -4(%esi), %ebx
 180
 181         movl    %ebx, VAR_SRC_STOP
 182         movl    %edi, SAVE_EDI
 183
 184         movl    %ecx, %ebx              C size
 185         movl    $31, %ecx
 186
 187         movl    %edx, %edi              C carry
 188         movl    $-1, %edx
 189
 190         C
 191
 192         xorl    %eax, %ecx              C l
 193         incl    %eax                    C 32-l
 194
 195         shll    %cl, %ebp               C d normalized
 196         movl    %ecx, VAR_NORM
 197
 198         movd    %eax, %mm7
 199
 200         movl    $-1, %eax
 201         subl    %ebp, %edx              C (b-d)-1 so  edx:eax = b*(b-d)-1
 202
 203         divl    %ebp                    C floor (b*(b-d)-1) / d
 204
 205         C
 206
 207         movl    %eax, VAR_INVERSE
 208         leal    -12(%esi,%ebx,4), %eax  C &src[size-3]
 209
 210         movl    8(%eax), %esi           C src high limb
 211         movl    4(%eax), %edx           C src second highest limb
 212
 213         shldl(  %cl, %esi, %edi)        C n2 = carry,high << l
 214
 215         shldl(  %cl, %edx, %esi)        C n10 = high,second << l
 216
 217         movl    %eax, %ecx              C &src[size-3]
 218
 219
 220 ifelse(MUL_THRESHOLD,2,`
 221         cmpl    $2, %ebx
 222         je      L(inverse_two_left)
 223 ')
 224
 225
 226 C The dependent chain here is the same as in mpn_divrem_1, but a few
 227 C instructions are saved by not needing to store the quotient limbs.
 228 C Unfortunately this doesn't get the code down to the theoretical 16 c/l.
 229 C
 230 C There's four dummy instructions in the loop, all of which are necessary
 231 C for the claimed 17 c/l.  It's a 1 to 3 cycle slowdown if any are removed,
 232 C or changed from load to store or vice versa.  They're not completely
 233 C random, since they correspond to what mpn_divrem_1 has, but there's no
 234 C obvious reason why they're necessary.  Presumably they induce something
 235 C good in the out of order execution, perhaps through some load/store
 236 C ordering and/or decoding effects.
 237 C
 238 C The q1==0xFFFFFFFF case is handled here the same as in mpn_divrem_1.  On
 239 C on special data that comes out as q1==0xFFFFFFFF always, the loop runs at
 240 C about 13.5 c/l.
 241
 242         ALIGN(32)
 243 L(inverse_top):
 244         C eax   scratch
 245         C ebx   scratch (nadj, q1)
 246         C ecx   src pointer, decrementing
 247         C edx   scratch
 248         C esi   n10
 249         C edi   n2
 250         C ebp   divisor
 251         C
 252         C mm0   scratch (src qword)
 253         C mm7   rshift for normalization
 254
 255         cmpl    $0x80000000, %esi  C n1 as 0=c, 1=nc
 256         movl    %edi, %eax         C n2
 257         movl    PARAM_SIZE, %ebx   C dummy
 258
 259         leal    (%ebp,%esi), %ebx
 260         cmovc(  %esi, %ebx)        C nadj = n10 + (-n1 & d), ignoring overflow
 261         sbbl    $-1, %eax          C n2+n1
 262
 263         mull    VAR_INVERSE        C m*(n2+n1)
 264
 265         movq    (%ecx), %mm0       C next src limb and the one below it
 266         subl    $4, %ecx
 267
 268         movl    %ecx, PARAM_SIZE   C dummy
 269
 270         C
 271
 272         addl    %ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
 273         leal    1(%edi), %ebx      C n2<<32 + m*(n2+n1))
 274         movl    %ebp, %eax         C d
 275
 276         C
 277
 278         adcl    %edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
 279         jz      L(q1_ff)
 280         nop                        C dummy
 281
 282         mull    %ebx               C (q1+1)*d
 283
 284         psrlq   %mm7, %mm0
 285         leal    0(%ecx), %ecx      C dummy
 286
 287         C
 288
 289         C
 290
 291         subl    %eax, %esi
 292         movl    VAR_SRC_STOP, %eax
 293
 294         C
 295
 296         sbbl    %edx, %edi         C n - (q1+1)*d
 297         movl    %esi, %edi         C remainder -> n2
 298         leal    (%ebp,%esi), %edx
 299
 300         movd    %mm0, %esi
 301
 302         cmovc(  %edx, %edi)        C n - q1*d if underflow from using q1+1
 303         cmpl    %eax, %ecx
 304         jne     L(inverse_top)
 305
 306
 307 L(inverse_loop_done):
 308
 309
 310 C -----------------------------------------------------------------------------
 311
 312 L(inverse_two_left):
 313         C eax   scratch
 314         C ebx   scratch (nadj, q1)
 315         C ecx   &src[-1]
 316         C edx   scratch
 317         C esi   n10
 318         C edi   n2
 319         C ebp   divisor
 320         C
 321         C mm0   scratch (src dword)
 322         C mm7   rshift
 323
 324         cmpl    $0x80000000, %esi  C n1 as 0=c, 1=nc
 325         movl    %edi, %eax         C n2
 326
 327         leal    (%ebp,%esi), %ebx
 328         cmovc(  %esi, %ebx)        C nadj = n10 + (-n1 & d), ignoring overflow
 329         sbbl    $-1, %eax          C n2+n1
 330
 331         mull    VAR_INVERSE        C m*(n2+n1)
 332
 333         movd    4(%ecx), %mm0      C src low limb
 334
 335         C
 336
 337         C
 338
 339         addl    %ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
 340         leal    1(%edi), %ebx      C n2<<32 + m*(n2+n1))
 341         movl    %ebp, %eax         C d
 342
 343         adcl    %edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
 344
 345         sbbl    $0, %ebx
 346
 347         mull    %ebx               C (q1+1)*d
 348
 349         psllq   $32, %mm0
 350
 351         psrlq   %mm7, %mm0
 352
 353         C
 354
 355         subl    %eax, %esi
 356
 357         C
 358
 359         sbbl    %edx, %edi         C n - (q1+1)*d
 360         movl    %esi, %edi         C remainder -> n2
 361         leal    (%ebp,%esi), %edx
 362
 363         movd    %mm0, %esi
 364
 365         cmovc(  %edx, %edi)        C n - q1*d if underflow from using q1+1
 366
 367
 368 C One limb left
 369
 370         C eax   scratch
 371         C ebx   scratch (nadj, q1)
 372         C ecx
 373         C edx   scratch
 374         C esi   n10
 375         C edi   n2
 376         C ebp   divisor
 377         C
 378         C mm0   src limb, shifted
 379         C mm7   rshift
 380
 381         cmpl    $0x80000000, %esi  C n1 as 0=c, 1=nc
 382         movl    %edi, %eax         C n2
 383
 384         leal    (%ebp,%esi), %ebx
 385         cmovc(  %esi, %ebx)        C nadj = n10 + (-n1 & d), ignoring overflow
 386         sbbl    $-1, %eax          C n2+n1
 387
 388         mull    VAR_INVERSE        C m*(n2+n1)
 389
 390         movl    VAR_NORM, %ecx     C for final denorm
 391
 392         C
 393
 394         C
 395
 396         addl    %ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
 397         leal    1(%edi), %ebx      C n2<<32 + m*(n2+n1))
 398         movl    %ebp, %eax         C d
 399
 400         C
 401
 402         adcl    %edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
 403
 404         sbbl    $0, %ebx
 405
 406         mull    %ebx               C (q1+1)*d
 407
 408         movl    SAVE_EBX, %ebx
 409
 410         C
 411
 412         C
 413
 414         subl    %eax, %esi
 415
 416         movl    %esi, %eax         C remainder
 417         movl    SAVE_ESI, %esi
 418
 419         sbbl    %edx, %edi         C n - (q1+1)*d
 420         leal    (%ebp,%eax), %edx
 421         movl    SAVE_EBP, %ebp
 422
 423         cmovc(  %edx, %eax)        C n - q1*d if underflow from using q1+1
 424         movl    SAVE_EDI, %edi
 425
 426         shrl    %cl, %eax          C denorm remainder
 427         addl    $STACK_SPACE, %esp
 428         emms
 429
 430         ret
 431
 432
 433 C -----------------------------------------------------------------------------
 434 C
 435 C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
 436 C of q*d is simply -d and the remainder n-q*d = n10+d
 437
 438 L(q1_ff):
 439         C eax   (divisor)
 440         C ebx   (q1+1 == 0)
 441         C ecx   src pointer
 442         C edx
 443         C esi   n10
 444         C edi   (n2)
 445         C ebp   divisor
 446
 447         movl    VAR_SRC_STOP, %edx
 448         leal    (%ebp,%esi), %edi       C n-q*d remainder -> next n2
 449         psrlq   %mm7, %mm0
 450
 451         movd    %mm0, %esi              C next n10
 452
 453         cmpl    %ecx, %edx
 454         jne     L(inverse_top)
 455         jmp     L(inverse_loop_done)
 456
 457 EPILOGUE()