rts/gmp/mpn/x86/k7/mmx/divrem_1.asm

   1 dnl  AMD K7 mpn_divrem_1 -- mpn by limb division.
   2 dnl
   3 dnl  K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part.
   4
   5
   6 dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
   7 dnl
   8 dnl  This file is part of the GNU MP Library.
   9 dnl
  10 dnl  The GNU MP Library is free software; you can redistribute it and/or
  11 dnl  modify it under the terms of the GNU Lesser General Public License as
  12 dnl  published by the Free Software Foundation; either version 2.1 of the
  13 dnl  License, or (at your option) any later version.
  14 dnl
  15 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  16 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18 dnl  Lesser General Public License for more details.
  19 dnl
  20 dnl  You should have received a copy of the GNU Lesser General Public
  21 dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
  22 dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
  23 dnl  Suite 330, Boston, MA 02111-1307, USA.
  24
  25
  26 include(`../config.m4')
  27
  28
  29 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
  30 C                         mp_srcptr src, mp_size_t size,
  31 C                         mp_limb_t divisor);
  32 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
  33 C                          mp_srcptr src, mp_size_t size,
  34 C                          mp_limb_t divisor, mp_limb_t carry);
  35 C
  36 C The method and nomenclature follow part 8 of "Division by Invariant
  37 C Integers using Multiplication" by Granlund and Montgomery, reference in
  38 C gmp.texi.
  39 C
  40 C The "and"s shown in the paper are done here with "cmov"s.  "m" is written
  41 C for m', and "d" for d_norm, which won't cause any confusion since it's
  42 C only the normalized divisor that's of any use in the code.  "b" is written
  43 C for 2^N, the size of a limb, N being 32 here.
  44 C
  45 C mpn_divrem_1 avoids one division if the src high limb is less than the
  46 C divisor.  mpn_divrem_1c doesn't check for a zero carry, since in normal
  47 C circumstances that will be a very rare event.
  48 C
  49 C There's a small bias towards expecting xsize==0, by having code for
  50 C xsize==0 in a straight line and xsize!=0 under forward jumps.
  51
  52
  53 dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
  54 dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
  55 dnl
  56 dnl  The inverse takes about 50 cycles to calculate, but after that the
  57 dnl  multiply is 17 c/l versus division at 42 c/l.
  58 dnl
  59 dnl  At 3 limbs the mul is a touch faster than div on the integer part, and
  60 dnl  even more so on the fractional part.
  61
  62 deflit(MUL_THRESHOLD, 3)
  63
  64
  65 defframe(PARAM_CARRY,  24)
  66 defframe(PARAM_DIVISOR,20)
  67 defframe(PARAM_SIZE,   16)
  68 defframe(PARAM_SRC,    12)
  69 defframe(PARAM_XSIZE,  8)
  70 defframe(PARAM_DST,    4)
  71
  72 defframe(SAVE_EBX,    -4)
  73 defframe(SAVE_ESI,    -8)
  74 defframe(SAVE_EDI,    -12)
  75 defframe(SAVE_EBP,    -16)
  76
  77 defframe(VAR_NORM,    -20)
  78 defframe(VAR_INVERSE, -24)
  79 defframe(VAR_SRC,     -28)
  80 defframe(VAR_DST,     -32)
  81 defframe(VAR_DST_STOP,-36)
  82
  83 deflit(STACK_SPACE, 36)
  84
  85         .text
  86         ALIGN(32)
  87
  88 PROLOGUE(mpn_divrem_1c)
  89 deflit(`FRAME',0)
  90         movl    PARAM_CARRY, %edx
  91         movl    PARAM_SIZE, %ecx
  92         subl    $STACK_SPACE, %esp
  93 deflit(`FRAME',STACK_SPACE)
  94
  95         movl    %ebx, SAVE_EBX
  96         movl    PARAM_XSIZE, %ebx
  97
  98         movl    %edi, SAVE_EDI
  99         movl    PARAM_DST, %edi
 100
 101         movl    %ebp, SAVE_EBP
 102         movl    PARAM_DIVISOR, %ebp
 103
 104         movl    %esi, SAVE_ESI
 105         movl    PARAM_SRC, %esi
 106
 107         leal    -4(%edi,%ebx,4), %edi
 108         jmp     LF(mpn_divrem_1,start_1c)
 109
 110 EPILOGUE()
 111
 112
 113         C offset 0x31, close enough to aligned
 114 PROLOGUE(mpn_divrem_1)
 115 deflit(`FRAME',0)
 116
 117         movl    PARAM_SIZE, %ecx
 118         movl    $0, %edx                C initial carry (if can't skip a div)
 119         subl    $STACK_SPACE, %esp
 120 deflit(`FRAME',STACK_SPACE)
 121
 122         movl    %ebp, SAVE_EBP
 123         movl    PARAM_DIVISOR, %ebp
 124
 125         movl    %ebx, SAVE_EBX
 126         movl    PARAM_XSIZE, %ebx
 127
 128         movl    %esi, SAVE_ESI
 129         movl    PARAM_SRC, %esi
 130         orl     %ecx, %ecx
 131
 132         movl    %edi, SAVE_EDI
 133         movl    PARAM_DST, %edi
 134         leal    -4(%edi,%ebx,4), %edi   C &dst[xsize-1]
 135
 136         jz      L(no_skip_div)
 137         movl    -4(%esi,%ecx,4), %eax   C src high limb
 138
 139         cmpl    %ebp, %eax              C one less div if high<divisor
 140         jnb     L(no_skip_div)
 141
 142         movl    $0, (%edi,%ecx,4)       C dst high limb
 143         decl    %ecx                    C size-1
 144         movl    %eax, %edx              C src high limb as initial carry
 145 L(no_skip_div):
 146
 147
 148 L(start_1c):
 149         C eax
 150         C ebx   xsize
 151         C ecx   size
 152         C edx   carry
 153         C esi   src
 154         C edi   &dst[xsize-1]
 155         C ebp   divisor
 156
 157         leal    (%ebx,%ecx), %eax       C size+xsize
 158         cmpl    $MUL_THRESHOLD, %eax
 159         jae     L(mul_by_inverse)
 160
 161
 162 C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs.
 163 C It'd be possible to write them out without the looping, but no speedup
 164 C would be expected.
 165 C
 166 C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the
 167 C integer part, but curiously not on the fractional part, where %ebp is a
 168 C (fixed) couple of cycles faster.
 169
 170         orl     %ecx, %ecx
 171         jz      L(divide_no_integer)
 172
 173 L(divide_integer):
 174         C eax   scratch (quotient)
 175         C ebx   xsize
 176         C ecx   counter
 177         C edx   scratch (remainder)
 178         C esi   src
 179         C edi   &dst[xsize-1]
 180         C ebp   divisor
 181
 182         movl    -4(%esi,%ecx,4), %eax
 183
 184         divl    PARAM_DIVISOR
 185
 186         movl    %eax, (%edi,%ecx,4)
 187         decl    %ecx
 188         jnz     L(divide_integer)
 189
 190
 191 L(divide_no_integer):
 192         movl    PARAM_DST, %edi
 193         orl     %ebx, %ebx
 194         jnz     L(divide_fraction)
 195
 196 L(divide_done):
 197         movl    SAVE_ESI, %esi
 198         movl    SAVE_EDI, %edi
 199         movl    %edx, %eax
 200
 201         movl    SAVE_EBX, %ebx
 202         movl    SAVE_EBP, %ebp
 203         addl    $STACK_SPACE, %esp
 204
 205         ret
 206
 207
 208 L(divide_fraction):
 209         C eax   scratch (quotient)
 210         C ebx   counter
 211         C ecx
 212         C edx   scratch (remainder)
 213         C esi
 214         C edi   dst
 215         C ebp   divisor
 216
 217         movl    $0, %eax
 218
 219         divl    %ebp
 220
 221         movl    %eax, -4(%edi,%ebx,4)
 222         decl    %ebx
 223         jnz     L(divide_fraction)
 224
 225         jmp     L(divide_done)
 226
 227
 228
 229 C -----------------------------------------------------------------------------
 230
 231 L(mul_by_inverse):
 232         C eax
 233         C ebx   xsize
 234         C ecx   size
 235         C edx   carry
 236         C esi   src
 237         C edi   &dst[xsize-1]
 238         C ebp   divisor
 239
 240         bsrl    %ebp, %eax              C 31-l
 241
 242         leal    12(%edi), %ebx
 243         leal    4(%edi,%ecx,4), %edi    C &dst[xsize+size]
 244
 245         movl    %edi, VAR_DST
 246         movl    %ebx, VAR_DST_STOP
 247
 248         movl    %ecx, %ebx              C size
 249         movl    $31, %ecx
 250
 251         movl    %edx, %edi              C carry
 252         movl    $-1, %edx
 253
 254         C
 255
 256         xorl    %eax, %ecx              C l
 257         incl    %eax                    C 32-l
 258
 259         shll    %cl, %ebp               C d normalized
 260         movl    %ecx, VAR_NORM
 261
 262         movd    %eax, %mm7
 263
 264         movl    $-1, %eax
 265         subl    %ebp, %edx              C (b-d)-1 giving edx:eax = b*(b-d)-1
 266
 267         divl    %ebp                    C floor (b*(b-d)-1) / d
 268
 269         orl     %ebx, %ebx              C size
 270         movl    %eax, VAR_INVERSE
 271         leal    -12(%esi,%ebx,4), %eax  C &src[size-3]
 272
 273         jz      L(start_zero)
 274         movl    %eax, VAR_SRC
 275         cmpl    $1, %ebx
 276
 277         movl    8(%eax), %esi           C src high limb
 278         jz      L(start_one)
 279
 280 L(start_two_or_more):
 281         movl    4(%eax), %edx           C src second highest limb
 282
 283         shldl(  %cl, %esi, %edi)        C n2 = carry,high << l
 284
 285         shldl(  %cl, %edx, %esi)        C n10 = high,second << l
 286
 287         cmpl    $2, %ebx
 288         je      L(integer_two_left)
 289         jmp     L(integer_top)
 290
 291
 292 L(start_one):
 293         shldl(  %cl, %esi, %edi)        C n2 = carry,high << l
 294
 295         shll    %cl, %esi               C n10 = high << l
 296         movl    %eax, VAR_SRC
 297         jmp     L(integer_one_left)
 298
 299
 300 L(start_zero):
 301         shll    %cl, %edi               C n2 = carry << l
 302         movl    $0, %esi                C n10 = 0
 303
 304         C we're here because xsize+size>=MUL_THRESHOLD, so with size==0 then
 305         C must have xsize!=0
 306         jmp     L(fraction_some)
 307
 308
 309
 310 C -----------------------------------------------------------------------------
 311 C
 312 C The multiply by inverse loop is 17 cycles, and relies on some out-of-order
 313 C execution.  The instruction scheduling is important, with various
 314 C apparently equivalent forms running 1 to 5 cycles slower.
 315 C
 316 C A lower bound for the time would seem to be 16 cycles, based on the
 317 C following successive dependencies.
 318 C
 319 C                     cycles
 320 C               n2+n1   1
 321 C               mul     6
 322 C               q1+1    1
 323 C               mul     6
 324 C               sub     1
 325 C               addback 1
 326 C                      ---
 327 C                      16
 328 C
 329 C This chain is what the loop has already, but 16 cycles isn't achieved.
 330 C K7 has enough decode, and probably enough execute (depending maybe on what
 331 C a mul actually consumes), but nothing running under 17 has been found.
 332 C
 333 C In theory n2+n1 could be done in the sub and addback stages (by
 334 C calculating both n2 and n2+n1 there), but lack of registers makes this an
 335 C unlikely proposition.
 336 C
 337 C The jz in the loop keeps the q1+1 stage to 1 cycle.  Handling an overflow
 338 C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent
 339 C chain, and nothing better than 18 cycles has been found when using it.
 340 C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
 341 C be an extremely rare event.
 342 C
 343 C Branch mispredictions will hit random occurrances of q1==0xFFFFFFFF, but
 344 C if some special data is coming out with this always, the q1_ff special
 345 C case actually runs at 15 c/l.  0x2FFF...FFFD divided by 3 is a good way to
 346 C induce the q1_ff case, for speed measurements or testing.  Note that
 347 C 0xFFF...FFF divided by 1 or 2 doesn't induce it.
 348 C
 349 C The instruction groupings and empty comments show the cycles for a naive
 350 C in-order view of the code (conveniently ignoring the load latency on
 351 C VAR_INVERSE).  This shows some of where the time is going, but is nonsense
 352 C to the extent that out-of-order execution rearranges it.  In this case
 353 C there's 19 cycles shown, but it executes at 17.
 354
 355         ALIGN(16)
 356 L(integer_top):
 357         C eax   scratch
 358         C ebx   scratch (nadj, q1)
 359         C ecx   scratch (src, dst)
 360         C edx   scratch
 361         C esi   n10
 362         C edi   n2
 363         C ebp   divisor
 364         C
 365         C mm0   scratch (src qword)
 366         C mm7   rshift for normalization
 367
 368         cmpl    $0x80000000, %esi  C n1 as 0=c, 1=nc
 369         movl    %edi, %eax         C n2
 370         movl    VAR_SRC, %ecx
 371
 372         leal    (%ebp,%esi), %ebx
 373         cmovc(  %esi, %ebx)        C nadj = n10 + (-n1 & d), ignoring overflow
 374         sbbl    $-1, %eax          C n2+n1
 375
 376         mull    VAR_INVERSE        C m*(n2+n1)
 377
 378         movq    (%ecx), %mm0       C next limb and the one below it
 379         subl    $4, %ecx
 380
 381         movl    %ecx, VAR_SRC
 382
 383         C
 384
 385         addl    %ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
 386         leal    1(%edi), %ebx      C n2<<32 + m*(n2+n1))
 387         movl    %ebp, %eax         C d
 388
 389         C
 390
 391         adcl    %edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
 392         jz      L(q1_ff)
 393         movl    VAR_DST, %ecx
 394
 395         mull    %ebx               C (q1+1)*d
 396
 397         psrlq   %mm7, %mm0
 398
 399         leal    -4(%ecx), %ecx
 400
 401         C
 402
 403         subl    %eax, %esi
 404         movl    VAR_DST_STOP, %eax
 405
 406         C
 407
 408         sbbl    %edx, %edi         C n - (q1+1)*d
 409         movl    %esi, %edi         C remainder -> n2
 410         leal    (%ebp,%esi), %edx
 411
 412         movd    %mm0, %esi
 413
 414         cmovc(  %edx, %edi)        C n - q1*d if underflow from using q1+1
 415         sbbl    $0, %ebx           C q
 416         cmpl    %eax, %ecx
 417
 418         movl    %ebx, (%ecx)
 419         movl    %ecx, VAR_DST
 420         jne     L(integer_top)
 421
 422
 423 L(integer_loop_done):
 424
 425
 426 C -----------------------------------------------------------------------------
 427 C
 428 C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
 429 C q1_ff special case.  This make the code a bit smaller and simpler, and
 430 C costs only 1 cycle (each).
 431
 432 L(integer_two_left):
 433         C eax   scratch
 434         C ebx   scratch (nadj, q1)
 435         C ecx   scratch (src, dst)
 436         C edx   scratch
 437         C esi   n10
 438         C edi   n2
 439         C ebp   divisor
 440         C
 441         C mm0   src limb, shifted
 442         C mm7   rshift
 443
 444         cmpl    $0x80000000, %esi  C n1 as 0=c, 1=nc
 445         movl    %edi, %eax         C n2
 446         movl    PARAM_SRC, %ecx
 447
 448         leal    (%ebp,%esi), %ebx
 449         cmovc(  %esi, %ebx)        C nadj = n10 + (-n1 & d), ignoring overflow
 450         sbbl    $-1, %eax          C n2+n1
 451
 452         mull    VAR_INVERSE        C m*(n2+n1)
 453
 454         movd    (%ecx), %mm0       C src low limb
 455
 456         movl    VAR_DST_STOP, %ecx
 457
 458         C
 459
 460         addl    %ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
 461         leal    1(%edi), %ebx      C n2<<32 + m*(n2+n1))
 462         movl    %ebp, %eax         C d
 463
 464         adcl    %edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
 465
 466         sbbl    $0, %ebx
 467
 468         mull    %ebx               C (q1+1)*d
 469
 470         psllq   $32, %mm0
 471
 472         psrlq   %mm7, %mm0
 473
 474         C
 475
 476         subl    %eax, %esi
 477
 478         C
 479
 480         sbbl    %edx, %edi         C n - (q1+1)*d
 481         movl    %esi, %edi         C remainder -> n2
 482         leal    (%ebp,%esi), %edx
 483
 484         movd    %mm0, %esi
 485
 486         cmovc(  %edx, %edi)        C n - q1*d if underflow from using q1+1
 487         sbbl    $0, %ebx           C q
 488
 489         movl    %ebx, -4(%ecx)
 490
 491
 492 C -----------------------------------------------------------------------------
 493 L(integer_one_left):
 494         C eax   scratch
 495         C ebx   scratch (nadj, q1)
 496         C ecx   dst
 497         C edx   scratch
 498         C esi   n10
 499         C edi   n2
 500         C ebp   divisor
 501         C
 502         C mm0   src limb, shifted
 503         C mm7   rshift
 504
 505         movl    VAR_DST_STOP, %ecx
 506         cmpl    $0x80000000, %esi  C n1 as 0=c, 1=nc
 507         movl    %edi, %eax         C n2
 508
 509         leal    (%ebp,%esi), %ebx
 510         cmovc(  %esi, %ebx)        C nadj = n10 + (-n1 & d), ignoring overflow
 511         sbbl    $-1, %eax          C n2+n1
 512
 513         mull    VAR_INVERSE        C m*(n2+n1)
 514
 515         C
 516
 517         C
 518
 519         C
 520
 521         addl    %ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
 522         leal    1(%edi), %ebx      C n2<<32 + m*(n2+n1))
 523         movl    %ebp, %eax         C d
 524
 525         C
 526
 527         adcl    %edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
 528
 529         sbbl    $0, %ebx           C q1 if q1+1 overflowed
 530
 531         mull    %ebx
 532
 533         C
 534
 535         C
 536
 537         C
 538
 539         subl    %eax, %esi
 540
 541         C
 542
 543         sbbl    %edx, %edi         C n - (q1+1)*d
 544         movl    %esi, %edi         C remainder -> n2
 545         leal    (%ebp,%esi), %edx
 546
 547         cmovc(  %edx, %edi)        C n - q1*d if underflow from using q1+1
 548         sbbl    $0, %ebx           C q
 549
 550         movl    %ebx, -8(%ecx)
 551         subl    $8, %ecx
 552
 553
 554
 555 L(integer_none):
 556         cmpl    $0, PARAM_XSIZE
 557         jne     L(fraction_some)
 558
 559         movl    %edi, %eax
 560 L(fraction_done):
 561         movl    VAR_NORM, %ecx
 562         movl    SAVE_EBP, %ebp
 563
 564         movl    SAVE_EDI, %edi
 565         movl    SAVE_ESI, %esi
 566
 567         movl    SAVE_EBX, %ebx
 568         addl    $STACK_SPACE, %esp
 569
 570         shrl    %cl, %eax
 571         emms
 572
 573         ret
 574
 575
 576 C -----------------------------------------------------------------------------
 577 C
 578 C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
 579 C of q*d is simply -d and the remainder n-q*d = n10+d
 580
 581 L(q1_ff):
 582         C eax   (divisor)
 583         C ebx   (q1+1 == 0)
 584         C ecx
 585         C edx
 586         C esi   n10
 587         C edi   n2
 588         C ebp   divisor
 589
 590         movl    VAR_DST, %ecx
 591         movl    VAR_DST_STOP, %edx
 592         subl    $4, %ecx
 593
 594         psrlq   %mm7, %mm0
 595         leal    (%ebp,%esi), %edi       C n-q*d remainder -> next n2
 596         movl    %ecx, VAR_DST
 597
 598         movd    %mm0, %esi              C next n10
 599
 600         movl    $-1, (%ecx)
 601         cmpl    %ecx, %edx
 602         jne     L(integer_top)
 603
 604         jmp     L(integer_loop_done)
 605
 606
 607
 608 C -----------------------------------------------------------------------------
 609 C
 610 C Being the fractional part, the "source" limbs are all zero, meaning
 611 C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated.
 612 C
 613 C The loop runs at 15 cycles.  The dependent chain is the same as the
 614 C general case above, but without the n2+n1 stage (due to n1==0), so 15
 615 C would seem to be the lower bound.
 616 C
 617 C A not entirely obvious simplification is that q1+1 never overflows a limb,
 618 C and so there's no need for the sbbl $0 or jz q1_ff from the general case.
 619 C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
 620 C rnd() means rounding down to a multiple of d.
 621 C
 622 C       m*n2 + b*n2 <= m*(d-1) + b*(d-1)
 623 C                    = m*d + b*d - m - b
 624 C                    = floor((b(b-d)-1)/d)*d + b*d - m - b
 625 C                    = rnd(b(b-d)-1) + b*d - m - b
 626 C                    = rnd(b(b-d)-1 + b*d) - m - b
 627 C                    = rnd(b*b-1) - m - b
 628 C                    <= (b-2)*b
 629 C
 630 C Unchanged from the general case is that the final quotient limb q can be
 631 C either q1 or q1+1, and the q1+1 case occurs often.  This can be seen from
 632 C equation 8.4 of the paper which simplifies as follows when n1==0 and
 633 C n0==0.
 634 C
 635 C       n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b
 636 C
 637 C As before, the instruction groupings and empty comments show a naive
 638 C in-order view of the code, which is made a nonsense by out of order
 639 C execution.  There's 17 cycles shown, but it executes at 15.
 640 C
 641 C Rotating the store q and remainder->n2 instructions up to the top of the
 642 C loop gets the run time down from 16 to 15.
 643
 644         ALIGN(16)
 645 L(fraction_some):
 646         C eax
 647         C ebx
 648         C ecx
 649         C edx
 650         C esi
 651         C edi   carry
 652         C ebp   divisor
 653
 654         movl    PARAM_DST, %esi
 655         movl    VAR_DST_STOP, %ecx
 656         movl    %edi, %eax
 657
 658         subl    $8, %ecx
 659
 660         jmp     L(fraction_entry)
 661
 662
 663         ALIGN(16)
 664 L(fraction_top):
 665         C eax   n2 carry, then scratch
 666         C ebx   scratch (nadj, q1)
 667         C ecx   dst, decrementing
 668         C edx   scratch
 669         C esi   dst stop point
 670         C edi   (will be n2)
 671         C ebp   divisor
 672
 673         movl    %ebx, (%ecx)    C previous q
 674         movl    %eax, %edi      C remainder->n2
 675
 676 L(fraction_entry):
 677         mull    VAR_INVERSE     C m*n2
 678
 679         movl    %ebp, %eax      C d
 680         subl    $4, %ecx        C dst
 681         leal    1(%edi), %ebx
 682
 683         C
 684
 685         C
 686
 687         C
 688
 689         C
 690
 691         addl    %edx, %ebx      C 1 + high(n2<<32 + m*n2) = q1+1
 692
 693         mull    %ebx            C (q1+1)*d
 694
 695         C
 696
 697         C
 698
 699         C
 700
 701         negl    %eax            C low of n - (q1+1)*d
 702
 703         C
 704
 705         sbbl    %edx, %edi      C high of n - (q1+1)*d, caring only about carry
 706         leal    (%ebp,%eax), %edx
 707
 708         cmovc(  %edx, %eax)     C n - q1*d if underflow from using q1+1
 709         sbbl    $0, %ebx        C q
 710         cmpl    %esi, %ecx
 711
 712         jne     L(fraction_top)
 713
 714
 715         movl    %ebx, (%ecx)
 716         jmp     L(fraction_done)
 717
 718 EPILOGUE()