1 dnl Intel Pentium-II mpn_divrem_1 -- mpn by limb division.
3 dnl P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part.
6 dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
8 dnl This file is part of the GNU MP Library.
10 dnl The GNU MP Library is free software; you can redistribute it and/or
11 dnl modify it under the terms of the GNU Lesser General Public License as
12 dnl published by the Free Software Foundation; either version 2.1 of the
13 dnl License, or (at your option) any later version.
15 dnl The GNU MP Library is distributed in the hope that it will be useful,
16 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
17 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 dnl Lesser General Public License for more details.
20 dnl You should have received a copy of the GNU Lesser General Public
21 dnl License along with the GNU MP Library; see the file COPYING.LIB. If
22 dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
23 dnl Suite 330, Boston, MA 02111-1307, USA.
26 include(`../config.m4')
29 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
30 C mp_srcptr src, mp_size_t size,
32 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
33 C mp_srcptr src, mp_size_t size,
34 C mp_limb_t divisor, mp_limb_t carry);
36 C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm,
37 C see that file for some comments. It's likely what's here can be improved.
40 dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
41 dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
dnl The different speeds of the integer and fraction parts mean that using
dnl xsize+size isn't quite right.  The threshold wants to be a bit higher
45 dnl for the integer part and a bit lower for the fraction part. (Or what's
46 dnl really wanted is to speed up the integer part!)
48 dnl The threshold is set to make the integer part right. At 4 limbs the
49 dnl div and mul are about the same there, but on the fractional part the
50 dnl mul is much faster.
52 deflit(MUL_THRESHOLD, 4)
55 defframe(PARAM_CARRY, 24)
56 defframe(PARAM_DIVISOR,20)
57 defframe(PARAM_SIZE, 16)
58 defframe(PARAM_SRC, 12)
59 defframe(PARAM_XSIZE, 8)
60 defframe(PARAM_DST, 4)
62 defframe(SAVE_EBX, -4)
63 defframe(SAVE_ESI, -8)
64 defframe(SAVE_EDI, -12)
65 defframe(SAVE_EBP, -16)
67 defframe(VAR_NORM, -20)
68 defframe(VAR_INVERSE, -24)
69 defframe(VAR_SRC, -28)
70 defframe(VAR_DST, -32)
71 defframe(VAR_DST_STOP,-36)
73 deflit(STACK_SPACE, 36)
78 PROLOGUE(mpn_divrem_1c)
80 movl PARAM_CARRY, %edx
83 subl $STACK_SPACE, %esp
84 deflit(`FRAME',STACK_SPACE)
87 movl PARAM_XSIZE, %ebx
93 movl PARAM_DIVISOR, %ebp
98 leal -4(%edi,%ebx,4), %edi
99 jmp LF(mpn_divrem_1,start_1c)
104 C offset 0x31, close enough to aligned
105 PROLOGUE(mpn_divrem_1)
108 movl PARAM_SIZE, %ecx
109 movl $0, %edx C initial carry (if can't skip a div)
110 subl $STACK_SPACE, %esp
111 deflit(`FRAME',STACK_SPACE)
114 movl PARAM_DIVISOR, %ebp
117 movl PARAM_XSIZE, %ebx
126 leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
129 movl -4(%esi,%ecx,4), %eax C src high limb
130 cmpl %ebp, %eax C one less div if high<divisor
133 movl $0, (%edi,%ecx,4) C dst high limb
135 movl %eax, %edx C src high limb as initial carry
148 leal (%ebx,%ecx), %eax C size+xsize
149 cmpl $MUL_THRESHOLD, %eax
150 jae L(mul_by_inverse)
153 jz L(divide_no_integer)
156 C eax scratch (quotient)
159 C edx scratch (remainder)
164 movl -4(%esi,%ecx,4), %eax
168 movl %eax, (%edi,%ecx,4)
170 jnz L(divide_integer)
173 L(divide_no_integer):
176 jnz L(divide_fraction)
187 addl $STACK_SPACE, %esp
193 C eax scratch (quotient)
196 C edx scratch (remainder)
205 movl %eax, -4(%edi,%ebx,4)
207 jnz L(divide_fraction)
213 C -----------------------------------------------------------------------------
226 movl %ebx, VAR_DST_STOP
227 leal 4(%edi,%ecx,4), %edi C &dst[xsize+size]
230 movl %ecx, %ebx C size
232 bsrl %ebp, %ecx C 31-l
233 movl %edx, %edi C carry
235 leal 1(%ecx), %eax C 32-l
241 shll %cl, %ebp C d normalized
245 subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1
247 divl %ebp C floor (b*(b-d)-1) / d
249 movl %eax, VAR_INVERSE
250 orl %ebx, %ebx C size
251 leal -12(%esi,%ebx,4), %eax C &src[size-3]
256 movl 8(%eax), %esi C src high limb
260 L(start_two_or_more):
261 movl 4(%eax), %edx C src second highest limb
263 shldl( %cl, %esi, %edi) C n2 = carry,high << l
265 shldl( %cl, %edx, %esi) C n10 = high,second << l
268 je L(integer_two_left)
273 shldl( %cl, %esi, %edi) C n2 = carry,high << l
275 shll %cl, %esi C n10 = high << l
276 jmp L(integer_one_left)
280 shll %cl, %edi C n2 = carry << l
281 movl $0, %esi C n10 = 0
283 C we're here because xsize+size>=MUL_THRESHOLD, so with size==0 then
289 C -----------------------------------------------------------------------------
291 C This loop runs at about 25 cycles, which is probably sub-optimal, and
292 C certainly more than the dependent chain would suggest. A better loop, or
293 C a better rough analysis of what's possible, would be welcomed.
295 C In the current implementation, the following successively dependent
296 C micro-ops seem to exist.
308 C Lack of registers hinders explicit scheduling and it might be that the
309 C normal out of order execution isn't able to hide enough under the mul
312 C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than
313 C cmov (and takes one uop off the dependent chain). A sarl/andl/addl
314 C combination was tried for the addback (despite the fact it would lengthen
315 C the dependent chain) but found to be no faster.
321 C ebx scratch (nadj, q1)
322 C ecx scratch (src, dst)
328 C mm0 scratch (src qword)
329 C mm7 rshift for normalization
337 andl %eax, %ebx C -n1 & d
340 addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
341 addl %edi, %eax C n2+n1
342 movq (%ecx), %mm0 C next src limb and the one below it
344 mull VAR_INVERSE C m*(n2+n1)
354 addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
356 leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
358 adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
373 movl VAR_DST_STOP, %eax
375 sbbl %edx, %edi C n - (q1+1)*d
376 movl %esi, %edi C remainder -> n2
377 leal (%ebp,%esi), %edx
379 cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
392 L(integer_loop_done):
395 C -----------------------------------------------------------------------------
C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
C q1_ff special case.  This makes the code a bit smaller and simpler, and
C costs only 2 cycles (each).
403 C ebx scratch (nadj, q1)
404 C ecx scratch (src, dst)
410 C mm0 src limb, shifted
420 andl %eax, %ebx C -n1 & d
423 addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
424 addl %edi, %eax C n2+n1
426 mull VAR_INVERSE C m*(n2+n1)
428 movd (%ecx), %mm0 C src low limb
430 movl VAR_DST_STOP, %ecx
436 addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
437 leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
440 adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
456 sbbl %edx, %edi C n - (q1+1)*d
457 movl %esi, %edi C remainder -> n2
458 leal (%ebp,%esi), %edx
460 cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
468 C -----------------------------------------------------------------------------
471 C ebx scratch (nadj, q1)
478 C mm0 src limb, shifted
486 movl VAR_DST_STOP, %ecx
488 andl %eax, %ebx C -n1 & d
491 addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
492 addl %edi, %eax C n2+n1
494 mull VAR_INVERSE C m*(n2+n1)
502 addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
503 leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
508 adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
510 sbbl $0, %ebx C q1 if q1+1 overflowed
523 movl PARAM_XSIZE, %eax
525 sbbl %edx, %edi C n - (q1+1)*d
526 movl %esi, %edi C remainder -> n2
527 leal (%ebp,%esi), %edx
529 cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
538 orl %eax, %eax C xsize
551 addl $STACK_SPACE, %esp
559 C -----------------------------------------------------------------------------
561 C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
562 C of q*d is simply -d and the remainder n-q*d = n10+d
574 movl VAR_DST_STOP, %edx
579 leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
582 movd %mm0, %esi C next n10
587 jmp L(integer_loop_done)
591 C -----------------------------------------------------------------------------
593 C In the current implementation, the following successively dependent
594 C micro-ops seem to exist.
605 C The loop in fact runs at about 17.5 cycles. Using a sarl/andl/addl for
606 C the addback was found to be a touch slower.
620 movl VAR_DST_STOP, %ecx
628 C eax n2, then scratch
629 C ebx scratch (nadj, q1)
630 C ecx dst, decrementing
636 mull VAR_INVERSE C m*n2
648 addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1
660 negl %eax C low of n - (q1+1)*d
662 sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry
663 leal (%ebp,%eax), %edx
665 cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
668 movl %eax, %edi C remainder->n2
671 movl %ebx, (%ecx) C previous q