rts/gmp/mpn/x86/p6/mmx/mod_1.asm

   1 dnl  Intel Pentium-II mpn_mod_1 -- mpn by limb remainder.
   2 dnl
   3 dnl  P6MMX: 24.0 cycles/limb.
   4
   5
   6 dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
   7 dnl
   8 dnl  This file is part of the GNU MP Library.
   9 dnl
  10 dnl  The GNU MP Library is free software; you can redistribute it and/or
  11 dnl  modify it under the terms of the GNU Lesser General Public License as
  12 dnl  published by the Free Software Foundation; either version 2.1 of the
  13 dnl  License, or (at your option) any later version.
  14 dnl
  15 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  16 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18 dnl  Lesser General Public License for more details.
  19 dnl
  20 dnl  You should have received a copy of the GNU Lesser General Public
  21 dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
  22 dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
  23 dnl  Suite 330, Boston, MA 02111-1307, USA.
  24
  25
  26 include(`../config.m4')
  27
  28
  29 C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
  30 C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
  31 C                       mp_limb_t carry);
  32 C
  33 C The code here very similar to mpn_divrem_1, but with the quotient
  34 C discarded.  What's here probably isn't optimal.
  35 C
  36 C See mpn/x86/p6/mmx/divrem_1.c and mpn/x86/k7/mmx/mod_1.asm for some
  37 C comments.
  38
  39
  40 dnl  MUL_THRESHOLD is the size at which the multiply by inverse method is
  41 dnl  used, rather than plain "divl"s.  Minimum value 2.
  42
  43 deflit(MUL_THRESHOLD, 4)
  44
  45
  46 defframe(PARAM_CARRY,  16)
  47 defframe(PARAM_DIVISOR,12)
  48 defframe(PARAM_SIZE,    8)
  49 defframe(PARAM_SRC,     4)
  50
  51 defframe(SAVE_EBX,    -4)
  52 defframe(SAVE_ESI,    -8)
  53 defframe(SAVE_EDI,    -12)
  54 defframe(SAVE_EBP,    -16)
  55
  56 defframe(VAR_NORM,    -20)
  57 defframe(VAR_INVERSE, -24)
  58 defframe(VAR_SRC_STOP,-28)
  59
  60 deflit(STACK_SPACE, 28)
  61
  62         .text
  63         ALIGN(16)
  64
  65 PROLOGUE(mpn_mod_1c)
  66 deflit(`FRAME',0)
  67         movl    PARAM_CARRY, %edx
  68         movl    PARAM_SIZE, %ecx
  69         subl    $STACK_SPACE, %esp
  70 deflit(`FRAME',STACK_SPACE)
  71
  72         movl    %ebp, SAVE_EBP
  73         movl    PARAM_DIVISOR, %ebp
  74
  75         movl    %esi, SAVE_ESI
  76         movl    PARAM_SRC, %esi
  77         jmp     LF(mpn_mod_1,start_1c)
  78
  79 EPILOGUE()
  80
  81
  82         ALIGN(16)
  83 PROLOGUE(mpn_mod_1)
  84 deflit(`FRAME',0)
  85
  86         movl    $0, %edx                C initial carry (if can't skip a div)
  87         movl    PARAM_SIZE, %ecx
  88         subl    $STACK_SPACE, %esp
  89 deflit(`FRAME',STACK_SPACE)
  90
  91         movl    %esi, SAVE_ESI
  92         movl    PARAM_SRC, %esi
  93
  94         movl    %ebp, SAVE_EBP
  95         movl    PARAM_DIVISOR, %ebp
  96
  97         orl     %ecx, %ecx
  98         jz      L(divide_done)
  99
 100         movl    -4(%esi,%ecx,4), %eax   C src high limb
 101
 102         cmpl    %ebp, %eax              C carry flag if high<divisor
 103
 104         cmovc(  %eax, %edx)             C src high limb as initial carry
 105         sbbl    $0, %ecx                C size-1 to skip one div
 106         jz      L(divide_done)
 107
 108
 109         ALIGN(16)
 110 L(start_1c):
 111         C eax
 112         C ebx
 113         C ecx   size
 114         C edx   carry
 115         C esi   src
 116         C edi
 117         C ebp   divisor
 118
 119         cmpl    $MUL_THRESHOLD, %ecx
 120         jae     L(mul_by_inverse)
 121
 122
 123         orl     %ecx, %ecx
 124         jz      L(divide_done)
 125
 126
 127 L(divide_top):
 128         C eax   scratch (quotient)
 129         C ebx
 130         C ecx   counter, limbs, decrementing
 131         C edx   scratch (remainder)
 132         C esi   src
 133         C edi
 134         C ebp
 135
 136         movl    -4(%esi,%ecx,4), %eax
 137
 138         divl    %ebp
 139
 140         decl    %ecx
 141         jnz     L(divide_top)
 142
 143
 144 L(divide_done):
 145         movl    SAVE_ESI, %esi
 146         movl    %edx, %eax
 147
 148         movl    SAVE_EBP, %ebp
 149         addl    $STACK_SPACE, %esp
 150
 151         ret
 152
 153
 154
 155 C -----------------------------------------------------------------------------
 156
 157 L(mul_by_inverse):
 158         C eax
 159         C ebx
 160         C ecx   size
 161         C edx   carry
 162         C esi   src
 163         C edi
 164         C ebp   divisor
 165
 166         movl    %ebx, SAVE_EBX
 167         leal    -4(%esi), %ebx
 168
 169         movl    %ebx, VAR_SRC_STOP
 170         movl    %ecx, %ebx              C size
 171
 172         movl    %edi, SAVE_EDI
 173         movl    %edx, %edi              C carry
 174
 175         bsrl    %ebp, %ecx              C 31-l
 176         movl    $-1, %edx
 177
 178         leal    1(%ecx), %eax           C 32-l
 179         xorl    $31, %ecx               C l
 180
 181         movl    %ecx, VAR_NORM
 182         shll    %cl, %ebp               C d normalized
 183
 184         movd    %eax, %mm7
 185         movl    $-1, %eax
 186         subl    %ebp, %edx              C (b-d)-1 so  edx:eax = b*(b-d)-1
 187
 188         divl    %ebp                    C floor (b*(b-d)-1) / d
 189
 190         C
 191
 192         movl    %eax, VAR_INVERSE
 193         leal    -12(%esi,%ebx,4), %eax  C &src[size-3]
 194
 195         movl    8(%eax), %esi           C src high limb
 196         movl    4(%eax), %edx           C src second highest limb
 197
 198         shldl(  %cl, %esi, %edi)        C n2 = carry,high << l
 199
 200         shldl(  %cl, %edx, %esi)        C n10 = high,second << l
 201
 202         movl    %eax, %ecx              C &src[size-3]
 203
 204
 205 ifelse(MUL_THRESHOLD,2,`
 206         cmpl    $2, %ebx
 207         je      L(inverse_two_left)
 208 ')
 209
 210
 211 C The dependent chain here is the same as in mpn_divrem_1, but a few
 212 C instructions are saved by not needing to store the quotient limbs.  This
 213 C gets it down to 24 c/l, which is still a bit away from a theoretical 19
 214 C c/l.
 215
 216         ALIGN(16)
 217 L(inverse_top):
 218         C eax   scratch
 219         C ebx   scratch (nadj, q1)
 220         C ecx   src pointer, decrementing
 221         C edx   scratch
 222         C esi   n10
 223         C edi   n2
 224         C ebp   divisor
 225         C
 226         C mm0   scratch (src qword)
 227         C mm7   rshift for normalization
 228
 229
 230         movl    %esi, %eax
 231         movl    %ebp, %ebx
 232
 233         sarl    $31, %eax          C -n1
 234
 235         andl    %eax, %ebx         C -n1 & d
 236         negl    %eax               C n1
 237
 238         addl    %esi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow
 239         addl    %edi, %eax         C n2+n1
 240
 241         mull    VAR_INVERSE        C m*(n2+n1)
 242
 243         movq    (%ecx), %mm0       C next src limb and the one below it
 244         subl    $4, %ecx
 245
 246         C
 247
 248         C
 249
 250         C
 251
 252         addl    %ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
 253         leal    1(%edi), %ebx      C n2<<32 + m*(n2+n1))
 254         movl    %ebp, %eax         C d
 255
 256         adcl    %edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
 257         jz      L(q1_ff)
 258
 259         mull    %ebx               C (q1+1)*d
 260
 261         psrlq   %mm7, %mm0
 262         movl    VAR_SRC_STOP, %ebx
 263
 264         C
 265
 266         C
 267
 268         C
 269
 270         subl    %eax, %esi
 271
 272         sbbl    %edx, %edi         C n - (q1+1)*d
 273         movl    %esi, %edi         C remainder -> n2
 274         leal    (%ebp,%esi), %edx
 275
 276         cmovc(  %edx, %edi)        C n - q1*d if underflow from using q1+1
 277         movd    %mm0, %esi
 278         cmpl    %ebx, %ecx
 279
 280         jne     L(inverse_top)
 281
 282
 283 L(inverse_loop_done):
 284
 285
 286 C -----------------------------------------------------------------------------
 287
 288 L(inverse_two_left):
 289         C eax   scratch
 290         C ebx   scratch (nadj, q1)
 291         C ecx   &src[-1]
 292         C edx   scratch
 293         C esi   n10
 294         C edi   n2
 295         C ebp   divisor
 296         C
 297         C mm0   scratch (src dword)
 298         C mm7   rshift
 299
 300         movl    %esi, %eax
 301         movl    %ebp, %ebx
 302
 303         sarl    $31, %eax          C -n1
 304
 305         andl    %eax, %ebx         C -n1 & d
 306         negl    %eax               C n1
 307
 308         addl    %esi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow
 309         addl    %edi, %eax         C n2+n1
 310
 311         mull    VAR_INVERSE        C m*(n2+n1)
 312
 313         movd    4(%ecx), %mm0      C src low limb
 314
 315         C
 316
 317         C
 318
 319         C
 320
 321         addl    %ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
 322         leal    1(%edi), %ebx      C n2<<32 + m*(n2+n1))
 323
 324         adcl    %edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
 325
 326         sbbl    $0, %ebx
 327         movl    %ebp, %eax         C d
 328
 329         mull    %ebx               C (q1+1)*d
 330
 331         psllq   $32, %mm0
 332
 333         psrlq   %mm7, %mm0
 334
 335         C
 336
 337         C
 338
 339         subl    %eax, %esi
 340
 341         sbbl    %edx, %edi         C n - (q1+1)*d
 342         movl    %esi, %edi         C remainder -> n2
 343         leal    (%ebp,%esi), %edx
 344
 345         cmovc(  %edx, %edi)        C n - q1*d if underflow from using q1+1
 346         movd    %mm0, %esi
 347
 348
 349 C One limb left
 350
 351         C eax   scratch
 352         C ebx   scratch (nadj, q1)
 353         C ecx
 354         C edx   scratch
 355         C esi   n10
 356         C edi   n2
 357         C ebp   divisor
 358         C
 359         C mm0   src limb, shifted
 360         C mm7   rshift
 361
 362         movl    %esi, %eax
 363         movl    %ebp, %ebx
 364
 365         sarl    $31, %eax          C -n1
 366
 367         andl    %eax, %ebx         C -n1 & d
 368         negl    %eax               C n1
 369
 370         addl    %esi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow
 371         addl    %edi, %eax         C n2+n1
 372
 373         mull    VAR_INVERSE        C m*(n2+n1)
 374
 375         movl    VAR_NORM, %ecx     C for final denorm
 376
 377         C
 378
 379         C
 380
 381         C
 382
 383         addl    %ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
 384         leal    1(%edi), %ebx      C n2<<32 + m*(n2+n1))
 385
 386         adcl    %edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
 387
 388         sbbl    $0, %ebx
 389         movl    %ebp, %eax         C d
 390
 391         mull    %ebx               C (q1+1)*d
 392
 393         movl    SAVE_EBX, %ebx
 394
 395         C
 396
 397         C
 398
 399         C
 400
 401         subl    %eax, %esi
 402
 403         sbbl    %edx, %edi         C n - (q1+1)*d
 404         leal    (%ebp,%esi), %edx
 405         movl    SAVE_EBP, %ebp
 406
 407         movl    %esi, %eax         C remainder
 408         movl    SAVE_ESI, %esi
 409
 410         cmovc(  %edx, %eax)        C n - q1*d if underflow from using q1+1
 411         movl    SAVE_EDI, %edi
 412
 413         shrl    %cl, %eax          C denorm remainder
 414         addl    $STACK_SPACE, %esp
 415         emms
 416
 417         ret
 418
 419
 420 C -----------------------------------------------------------------------------
 421 C
 422 C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
 423 C of q*d is simply -d and the remainder n-q*d = n10+d
 424
 425 L(q1_ff):
 426         C eax   (divisor)
 427         C ebx   (q1+1 == 0)
 428         C ecx   src pointer
 429         C edx
 430         C esi   n10
 431         C edi   (n2)
 432         C ebp   divisor
 433
 434         leal    (%ebp,%esi), %edi       C n-q*d remainder -> next n2
 435         movl    VAR_SRC_STOP, %edx
 436         psrlq   %mm7, %mm0
 437
 438         movd    %mm0, %esi              C next n10
 439         cmpl    %ecx, %edx
 440         jne     L(inverse_top)
 441
 442         jmp     L(inverse_loop_done)
 443
 444 EPILOGUE()