rts/gmp/mpn/x86/k7/mul_basecase.asm

   1 dnl  AMD K7 mpn_mul_basecase -- multiply two mpn numbers.
   2 dnl
   3 dnl  K7: approx 4.42 cycles per cross product at around 20x20 limbs (32
   4 dnl      limbs/loop unrolling).
   5
   6
   7 dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
   8 dnl
   9 dnl  This file is part of the GNU MP Library.
  10 dnl
  11 dnl  The GNU MP Library is free software; you can redistribute it and/or
  12 dnl  modify it under the terms of the GNU Lesser General Public License as
  13 dnl  published by the Free Software Foundation; either version 2.1 of the
  14 dnl  License, or (at your option) any later version.
  15 dnl
  16 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  17 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19 dnl  Lesser General Public License for more details.
  20 dnl
  21 dnl  You should have received a copy of the GNU Lesser General Public
  22 dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
  23 dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
  24 dnl  Suite 330, Boston, MA 02111-1307, USA.
  25
  26
  27 include(`../config.m4')
  28
  29
  30 dnl  K7 UNROLL_COUNT cycles/product (at around 20x20)
  31 dnl           8           4.67
  32 dnl          16           4.59
  33 dnl          32           4.42
  34 dnl  Maximum possible with the current code is 32.
  35 dnl
  36 dnl  At 32 the typical 13-26 limb sizes from the karatsuba code will get
  37 dnl  done with a straight run through a block of code, no inner loop.  Using
  38 dnl  32 gives 1k of code, but the k7 has a 64k L1 code cache.
  39
  40 deflit(UNROLL_COUNT, 32)        C limbs per pass of the unrolled loop; the forloop at L(unroll_entry) emits UNROLL_COUNT/CHUNK_COUNT chunks
  41
  42
  43 C void mpn_mul_basecase (mp_ptr wp,
  44 C                        mp_srcptr xp, mp_size_t xsize,
  45 C                        mp_srcptr yp, mp_size_t ysize);
  46 C
  47 C Calculate xp,xsize multiplied by yp,ysize, storing the result in
  48 C wp,xsize+ysize.
  49 C
  50 C This routine is essentially the same as mpn/generic/mul_basecase.c, but
  51 C it's faster because it does most of the mpn_addmul_1() startup
  52 C calculations only once.  The saving is 15-25% on typical sizes coming from
  53 C the Karatsuba multiply code.
  54
  55 ifdef(`PIC',`
  56 deflit(UNROLL_THRESHOLD, 5)
  57 ',`
  58 deflit(UNROLL_THRESHOLD, 5)
  59 ')
     C UNROLL_THRESHOLD is the smallest xsize that is sent to L(unroll) by the
     C cmpl/jae at L(ysize_more_than_one); smaller sizes use the simple addmul
     C loop.  The PIC and non-PIC settings are kept separate here but both are
     C currently 5.
  60
  61 defframe(PARAM_YSIZE,20)
  62 defframe(PARAM_YP,   16)
  63 defframe(PARAM_XSIZE,12)
  64 defframe(PARAM_XP,   8)
  65 defframe(PARAM_WP,   4)
     C Stack argument slots, in the order of the prototype (wp, xp, xsize, yp,
     C ysize), 4 bytes apart starting at offset 4.  The PARAM_YSIZE, PARAM_XSIZE
     C and PARAM_YP slots are not read-only: the loops further down overwrite
     C them and reuse them as counters and an adjusted yp pointer.
     C NOTE(review): defframe is defined elsewhere; presumably it resolves each
     C name against the current FRAME value -- confirm in the x86 m4 definitions.
  66
  67         .text
  68         ALIGN(32)
  69 PROLOGUE(mpn_mul_basecase)
  70 deflit(`FRAME',0)
  71
             C Entry: dispatch on xsize alone.  yp[0] is fetched up front since
             C every path below starts by multiplying with it.
  72         movl    PARAM_XSIZE, %ecx
  73         movl    PARAM_YP, %eax
  74
  75         movl    PARAM_XP, %edx
  76         movl    (%eax), %eax    C yp low limb
  77
  78         cmpl    $2, %ecx
  79         ja      L(xsize_more_than_two)  C unsigned xsize > 2
  80         je      L(two_by_something)     C xsize == 2
  81
  82
  83         C one limb by one limb
             C Reached for any xsize below 2; ysize is not examined on this path.
             C NOTE(review): presumably callers guarantee xsize >= ysize >= 1, so
             C this really is the 1x1 case -- confirm against the mpn interface.
  84
  85         mull    (%edx)          C edx:eax = yp[0] * xp[0], xp pointer in edx is consumed
  86
  87         movl    PARAM_WP, %ecx
  88         movl    %eax, (%ecx)    C wp[0] = low half
  89         movl    %edx, 4(%ecx)   C wp[1] = high half
  90         ret                     C nothing was pushed, no frame to release
  91
  92
  93 C -----------------------------------------------------------------------------
  94 L(two_by_something):
  95 deflit(`FRAME',0)
             C xsize == 2.  On entry eax = yp[0] and edx = xp.
             C The decl decrements the ysize argument slot in place and sets ZF
             C when ysize was 1.  The pushl and movl instructions that follow do
             C not alter the flags, so the jnz below still tests that decrement.
  96         decl    PARAM_YSIZE
  97         pushl   %ebx            defframe_pushl(`SAVE_EBX')
  98         movl    %eax, %ecx      C yp low limb
  99
 100         movl    PARAM_WP, %ebx
 101         pushl   %esi            defframe_pushl(`SAVE_ESI')
 102         movl    %edx, %esi      C xp
 103
 104         movl    (%edx), %eax    C xp low limb
 105         jnz     L(two_by_two)   C ysize-1 != 0
             C NOTE(review): every ysize other than 1 goes to the 2x2 code;
             C presumably ysize <= xsize is a caller guarantee -- confirm.
 106
 107
 108         C two limbs by one limb
 109
 110         mull    %ecx            C xp[0] * yp[0]
 111
 112         movl    %eax, (%ebx)    C wp[0]
 113         movl    4(%esi), %eax   C xp[1]
 114         movl    %edx, %esi      C carry
 115
 116         mull    %ecx            C xp[1] * yp[0]
 117
 118         addl    %eax, %esi
 119
 120         movl    %esi, 4(%ebx)   C wp[1]
 121         movl    SAVE_ESI, %esi  C restore early, movl keeps the carry for the adcl
 122
 123         adcl    $0, %edx
 124
 125         movl    %edx, 8(%ebx)   C wp[2]
 126         movl    SAVE_EBX, %ebx
             C Release the save slots.  FRAME is maintained by defframe_pushl,
             C which is defined elsewhere; presumably it now covers both pushes.
 127         addl    $FRAME, %esp
 128
 129         ret
 130
 131
 132
 133 C -----------------------------------------------------------------------------
 134 C Could load yp earlier into another register.
 135
 136         ALIGN(16)
 137 L(two_by_two):
 138         C eax   xp low limb
 139         C ebx   wp
 140         C ecx   yp low limb
 141         C edx
 142         C esi   xp
 143         C edi
 144         C ebp
 145
 146 dnl  FRAME carries on from previous
 147
             C 2x2 schoolbook product written to wp[0..3].  The four multiplies
             C run in the order xp[0]*yp[0], xp[1]*yp[0], xp[1]*yp[1], xp[0]*yp[1],
             C with edi accumulating wp[2] and esi accumulating wp[3].
 148         mull    %ecx            C xp[0] * yp[0]
 149
 150         push    %edi            defframe_pushl(`SAVE_EDI')
 151         movl    %edx, %edi      C carry, for wp[1]
 152
 153         movl    %eax, (%ebx)    C wp[0] is final
 154         movl    4(%esi), %eax   C xp[1]
 155
 156         mull    %ecx            C xp[1] * yp[0]
 157
 158         addl    %eax, %edi
 159         movl    PARAM_YP, %ecx  C yp[0] is finished with, movl keeps the carry
 160
 161         adcl    $0, %edx
 162         movl    4(%ecx), %ecx   C yp[1]
 163         movl    %edi, 4(%ebx)   C provisional wp[1], completed by the addl below
 164
 165         movl    4(%esi), %eax   C xp[1]
 166         movl    %edx, %edi      C carry, for wp[2]
 167
 168         mull    %ecx            C xp[1] * yp[1]
 169
 170         addl    %eax, %edi
 171
 172         adcl    $0, %edx
 173         movl    (%esi), %eax    C xp[0]
 174
 175         movl    %edx, %esi      C carry, for wp[3]
 176
 177         mull    %ecx            C xp[0] * yp[1]
 178
 179         addl    %eax, 4(%ebx)   C wp[1] is final
 180         adcl    %edx, %edi
 181         movl    %edi, 8(%ebx)   C wp[2], movl keeps the carry for the next adcl
 182
 183         adcl    $0, %esi
 184         movl    SAVE_EDI, %edi
 185         movl    %esi, 12(%ebx)  C wp[3]
 186
 187         movl    SAVE_ESI, %esi
 188         movl    SAVE_EBX, %ebx
 189         addl    $FRAME, %esp    C release the three save slots pushed so far
 190
 191         ret
 192
 193
 194 C -----------------------------------------------------------------------------
 195         ALIGN(16)
 196 L(xsize_more_than_two):
 197
 198 C The first limb of yp is processed with a simple mpn_mul_1 style loop
 199 C inline.  Unrolling this doesn't seem worthwhile since it's only run once
 200 C (whereas the addmul below is run ysize-1 many times).  A call to the
 201 C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
 202 C popping, and doesn't seem likely to be worthwhile on the typical 13-26
 203 C limb operations the Karatsuba code calls here with.
 204
 205         C eax   yp[0]
 206         C ebx
 207         C ecx   xsize
 208         C edx   xp
 209         C esi
 210         C edi
 211         C ebp
 212
 213 dnl  FRAME doesn't carry on from previous, no pushes yet here
 214 defframe(`SAVE_EBX',-4)
 215 defframe(`SAVE_ESI',-8)
 216 defframe(`SAVE_EDI',-12)
 217 defframe(`SAVE_EBP',-16)
 218 deflit(`FRAME',0)
 219
             C Reserve the four save slots in one step instead of pushing, then
             C fill them while setting up the loop registers.
 220         subl    $16, %esp
 221 deflit(`FRAME',16)
 222
 223         movl    %edi, SAVE_EDI
 224         movl    PARAM_WP, %edi
 225
 226         movl    %ebx, SAVE_EBX
 227         movl    %ebp, SAVE_EBP
 228         movl    %eax, %ebp              C multiplier yp[0]
 229
 230         movl    %esi, SAVE_ESI
 231         xorl    %ebx, %ebx              C initial carry limb
 232         leal    (%edx,%ecx,4), %esi     C xp end
 233
 234         leal    (%edi,%ecx,4), %edi     C wp end of mul1
 235         negl    %ecx                    C -xsize, counts up to zero
 236
 237
 238 L(mul1):
 239         C eax   scratch
 240         C ebx   carry
 241         C ecx   counter, negative
 242         C edx   scratch
 243         C esi   xp end
 244         C edi   wp end of mul1
 245         C ebp   multiplier
 246
 247         movl    (%esi,%ecx,4), %eax     C next xp limb, indexed back from the end
 248
 249         mull    %ebp
 250
 251         addl    %ebx, %eax              C low half plus incoming carry limb
 252         movl    %eax, (%edi,%ecx,4)
 253         movl    $0, %ebx                C zero ebx without disturbing the carry flag
 254
 255         adcl    %edx, %ebx              C new carry limb = high half + carry flag
 256         incl    %ecx
 257         jnz     L(mul1)
 258
 259
 260         movl    PARAM_YSIZE, %edx
 261         movl    PARAM_XSIZE, %ecx
 262
 263         movl    %ebx, (%edi)            C final carry
 264         decl    %edx                    C ysize-1, zero when yp has a single limb
 265
 266         jnz     L(ysize_more_than_one)
 267
 268
             C ysize == 1: the product is complete, restore and return.
 269         movl    SAVE_EDI, %edi
 270         movl    SAVE_EBX, %ebx
 271
 272         movl    SAVE_EBP, %ebp
 273         movl    SAVE_ESI, %esi
 274         addl    $FRAME, %esp
 275
 276         ret
 277
 278
 279 L(ysize_more_than_one):
 280         cmpl    $UNROLL_THRESHOLD, %ecx
 281         movl    PARAM_YP, %eax          C movl leaves the cmpl flags intact
 282
 283         jae     L(unroll)               C unsigned xsize >= UNROLL_THRESHOLD
 284
 285
 286 C -----------------------------------------------------------------------------
 287         C simple addmul looping
 288         C
 289         C eax   yp
 290         C ebx
 291         C ecx   xsize
 292         C edx   ysize-1
 293         C esi   xp end
 294         C edi   wp end of mul1
 295         C ebp
 296
             C Setup.  The PARAM_YSIZE, PARAM_XSIZE and PARAM_YP argument slots
             C are overwritten here and reused as loop state: a negative outer
             C counter, a negative inner counter start, and the yp end pointer.
 297         leal    4(%eax,%edx,4), %ebp    C yp end
 298         negl    %ecx
 299         negl    %edx
 300
 301         movl    (%esi,%ecx,4), %eax     C xp low limb
 302         movl    %edx, PARAM_YSIZE       C -(ysize-1)
 303         incl    %ecx
 304
 305         xorl    %ebx, %ebx              C initial carry
 306         movl    %ecx, PARAM_XSIZE       C -(xsize-1)
 307         movl    %ebp, PARAM_YP
 308
 309         movl    (%ebp,%edx,4), %ebp     C yp second lowest limb - multiplier
 310         jmp     L(simple_outer_entry)
 311
 312
 313         C this is offset 0x121 so close enough to aligned
 314 L(simple_outer_top):
 315         C ebp   ysize counter, negative
 316
             C Start of each further row: reload the inner counter, clear the
             C carry limb, step wp on by one limb and fetch the next multiplier.
 317         movl    PARAM_YP, %edx
 318         movl    PARAM_XSIZE, %ecx       C -(xsize-1)
 319         xorl    %ebx, %ebx              C carry
 320
 321         movl    %ebp, PARAM_YSIZE
 322         addl    $4, %edi                C next position in wp
 323
 324         movl    (%edx,%ebp,4), %ebp     C yp limb - multiplier
 325         movl    -4(%esi,%ecx,4), %eax   C xp low limb
 326
 327
 328 L(simple_outer_entry):
 329
 330 L(simple_inner):
 331         C eax   xp limb
 332         C ebx   carry limb
 333         C ecx   loop counter (negative)
 334         C edx   scratch
 335         C esi   xp end
 336         C edi   wp end
 337         C ebp   multiplier
 338
             C One addmul step.  The xp limb for the following step is loaded
             C before the counter is incremented, so the loop exits with the last
             C xp limb already in eax and its product still to be added.
 339         mull    %ebp
 340
 341         addl    %eax, %ebx
 342         adcl    $0, %edx
 343
 344         addl    %ebx, (%edi,%ecx,4)
 345         movl    (%esi,%ecx,4), %eax
 346         adcl    $0, %edx
 347
 348         incl    %ecx
 349         movl    %edx, %ebx              C movl keeps ZF from the incl for the jnz
 350         jnz     L(simple_inner)
 351
 352
             C Last limb of the row: add its product in at (edi) and store the
             C high limb one position above.
 353         mull    %ebp
 354
 355         movl    PARAM_YSIZE, %ebp
 356         addl    %eax, %ebx
 357
 358         adcl    $0, %edx
 359         addl    %ebx, (%edi)
 360
 361         adcl    $0, %edx
 362         incl    %ebp                    C outer counter, reaches zero after the last yp limb
 363
 364         movl    %edx, 4(%edi)           C movl keeps ZF from the incl for the jnz
 365         jnz     L(simple_outer_top)
 366
 367
 368         movl    SAVE_EBX, %ebx
 369         movl    SAVE_ESI, %esi
 370
 371         movl    SAVE_EDI, %edi
 372         movl    SAVE_EBP, %ebp
 373         addl    $FRAME, %esp
 374
 375         ret
 376
 377
 378
 379 C -----------------------------------------------------------------------------
 380 C
 381 C The unrolled loop is the same as in mpn_addmul_1(), see that code for some
 382 C comments.
 383 C
 384 C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
 385 C increment xp and wp.  This is used to adjust back xp and wp, and is
 386 C right-shifted to give an initial VAR_COUNTER at the top of the outer loop.
 387 C
 388 C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
 389 C up to -1, inclusive.
 390 C
 391 C VAR_JMP is the computed jump into the unrolled loop.
 392 C
 393 C VAR_XP_LOW is the least significant limb of xp, which is needed at the
 394 C start of the unrolled loop.
 395 C
 396 C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
 397 C inclusive.
 398 C
 399 C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
 400 C added to give the location of the next limb of yp, which is the multiplier
 401 C in the unrolled loop.
 402 C
 403 C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
 404 C outer loop to take care of xp, wp and the inner loop counter.
 405
 406 defframe(VAR_COUNTER,  -20)
 407 defframe(VAR_ADJUST,   -24)
 408 defframe(VAR_JMP,      -28)
 409 defframe(VAR_XP_LOW,   -32)
 410 deflit(VAR_EXTRA_SPACE, 16)
     C The four VAR_ slots sit directly below the four SAVE_ slots (-4 to -16)
     C and are reserved by the subl of VAR_EXTRA_SPACE in the setup below.
 411
 412
 413 L(unroll):
 414         C eax   yp
 415         C ebx
 416         C ecx   xsize
 417         C edx   ysize-1
 418         C esi   xp end
 419         C edi   wp end of mul1
 420         C ebp
 421
             C Setup for the unrolled addmul rows.  esi and edi are reloaded with
             C the start pointers xp and wp.  PARAM_YP and PARAM_YSIZE are
             C overwritten with an end-adjusted yp and a negative outer counter.
 422         movl    PARAM_XP, %esi
 423         movl    4(%eax), %ebp           C multiplier (yp second limb)
 424         leal    4(%eax,%edx,4), %eax    C yp adjust for ysize indexing
 425
 426         movl    PARAM_WP, %edi
 427         movl    %eax, PARAM_YP
 428         negl    %edx
 429
 430         movl    %edx, PARAM_YSIZE
 431         leal    UNROLL_COUNT-2(%ecx), %ebx      C (xsize-1)+UNROLL_COUNT-1
 432         decl    %ecx                            C xsize-1
 433
 434         movl    (%esi), %eax            C xp low limb
             C NOTE(review): UNROLL_MASK and UNROLL_LOG2 are defined elsewhere.
             C Presumably UNROLL_MASK is UNROLL_COUNT-1, in which case the andl
             C below clears the low bits so that ebx is xsize-1 rounded up to a
             C multiple of UNROLL_COUNT -- confirm in the m4 definitions.
 435         andl    $-UNROLL_MASK-1, %ebx
 436         negl    %ecx
 437
 438         subl    $VAR_EXTRA_SPACE, %esp
 439 deflit(`FRAME',16+VAR_EXTRA_SPACE)
 440         negl    %ebx
 441         andl    $UNROLL_MASK, %ecx      C (-(xsize-1)) masked: limb slot at which to enter
 442
 443         movl    %ebx, VAR_ADJUST
 444         movl    %ecx, %edx
 445         shll    $4, %ecx                C 16*slot, the leal or pic_calc adds 1*slot more
 446
 447         sarl    $UNROLL_LOG2, %ebx      C arithmetic shift, negative count for VAR_COUNTER
 448
             C Entry address = L(unroll_entry) + 17*slot, computed either directly
             C or, for PIC, relative to the return address of the call.
 449         C 17 code bytes per limb
 450 ifdef(`PIC',`
 451         call    L(pic_calc)
 452 L(unroll_here):
 453 ',`
 454         leal    L(unroll_entry) (%ecx,%edx,1), %ecx
 455 ')
 456         negl    %edx                    C -slot
 457
 458         movl    %eax, VAR_XP_LOW
 459         movl    %ecx, VAR_JMP
 460         leal    4(%edi,%edx,4), %edi    C wp and xp, adjust for unrolling,
 461         leal    4(%esi,%edx,4), %esi    C  and start at second limb
 462         jmp     L(unroll_outer_entry)
 463
 464
     C pic_calc, below, builds the same entry address without an absolute
     C reference: 17*slot plus the assemble-time distance from L(unroll_here) to
     C L(unroll_entry) plus the return address at the top of the stack, which
     C is the address of L(unroll_here).
 465 ifdef(`PIC',`
 466 L(pic_calc):
 467         C See README.family about old gas bugs
 468         leal    (%ecx,%edx,1), %ecx
 469         addl    $L(unroll_entry)-L(unroll_here), %ecx
 470         addl    (%esp), %ecx
 471         ret
 472 ')
 473
 474
 475 C --------------------------------------------------------------------------
 476         ALIGN(32)
 477 L(unroll_outer_top):
 478         C ebp   ysize counter, negative
 479
             C Start of each further row.  VAR_ADJUST undoes the advance that the
             C inner loop applied to esi and edi; edi additionally moves on by one
             C limb for the next row of the product.
 480         movl    VAR_ADJUST, %ebx
 481         movl    PARAM_YP, %edx
 482
 483         movl    VAR_XP_LOW, %eax
 484         movl    %ebp, PARAM_YSIZE       C store incremented ysize counter
 485
 486         leal    4(%edi,%ebx,4), %edi
 487         leal    (%esi,%ebx,4), %esi
 488         sarl    $UNROLL_LOG2, %ebx      C negative pass count, stored to VAR_COUNTER below
 489
 490         movl    (%edx,%ebp,4), %ebp     C yp next multiplier
 491         movl    VAR_JMP, %ecx
 492
 493 L(unroll_outer_entry):
             C eax = xp low limb, ebp = multiplier, ebx = negative pass count,
             C ecx = entry address (still held from setup on the first row).
 494         mull    %ebp
 495
             C Bit 0 of the entry address says whether the entry point is the
             C first or second limb of a two-limb chunk: L(unroll_entry) follows
             C an ALIGN(32), and each limb slot is 17 bytes.  That decides which
             C of ecx and ebx receives the low product half as pending carry.
             C The zeroing uses movl so the flags from testb reach the cmov pair.
             C NOTE(review): cmovz and cmovnz are macros defined elsewhere,
             C presumably emitting the conditional move instructions -- confirm.
 496         testb   $1, %cl         C and clear carry bit
 497         movl    %ebx, VAR_COUNTER
 498         movl    $0, %ebx
 499
 500         movl    $0, %ecx
 501         cmovz(  %eax, %ecx)     C eax into low carry, zero into high carry limb
 502         cmovnz( %eax, %ebx)
 503
 504         C Extra fetch of VAR_JMP is bad, but registers are tight
 505         jmp     *VAR_JMP
 506
 507
 508 C -----------------------------------------------------------------------------
 509         ALIGN(32)
 510 L(unroll_top):
 511         C eax   xp limb
 512         C ebx   carry high
 513         C ecx   carry low
 514         C edx   scratch
 515         C esi   xp+8
 516         C edi   wp
 517         C ebp   yp multiplier limb
 518         C
 519         C VAR_COUNTER  loop counter, negative
 520         C
 521         C 17 bytes each limb
 522
 523 L(unroll_entry):
 524
     C Each forloop iteration emits one two-limb chunk.  Within a chunk ecx and
     C ebx swap roles: the first limb adds ecx into wp and accumulates into
     C ebx, the second adds ebx into wp and accumulates into ecx.  The carry
     C flag is carried from each adcl to the next limb, which is why the
     C registers are zeroed with movl rather than xorl.
     C NOTE(review): UNROLL_BYTES and Zdisp are defined elsewhere.  The ifelse
     C biases the displacements by -128 only when UNROLL_BYTES is 256; no
     C matching pointer bias is visible in the setup code of this file, which
     C fits the stated maximum UNROLL_COUNT of 32 -- confirm.
 525 deflit(CHUNK_COUNT,2)
 526 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
 527         deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
 528         deflit(`disp1', eval(disp0 + 4))
 529
 530 Zdisp(  movl,   disp0,(%esi), %eax)
 531         adcl    %edx, %ebx
 532
 533         mull    %ebp
 534
 535 Zdisp(  addl,   %ecx, disp0,(%edi))
 536         movl    $0, %ecx
 537
 538         adcl    %eax, %ebx
 539
 540
 541         movl    disp1(%esi), %eax
 542         adcl    %edx, %ecx
 543
 544         mull    %ebp
 545
 546         addl    %ebx, disp1(%edi)
 547         movl    $0, %ebx
 548
 549         adcl    %eax, %ecx
 550 ')
 551
 552
             C incl sets ZF for the jnz but leaves the carry flag alone, and leal
             C changes no flags, so the pending carry from the last adcl survives
             C into the next pass or into the tail below.
 553         incl    VAR_COUNTER
 554         leal    UNROLL_BYTES(%esi), %esi
 555         leal    UNROLL_BYTES(%edi), %edi
 556
 557         jnz     L(unroll_top)
 558
 559
 560         C eax
 561         C ebx   zero
 562         C ecx   low
 563         C edx   high
 564         C esi
 565         C edi   wp, pointing at second last limb
 566         C ebp
 567         C
 568         C carry flag to be added to high
 569
 570 deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
 571 deflit(`disp1', eval(disp0-0 + 4))
 572
             C Row tail: fold the pending carry into the high limb, add the low
             C limb into wp, store the high limb above it, then step the outer
             C counter.
 573         movl    PARAM_YSIZE, %ebp       C movl keeps the pending carry flag
 574         adcl    $0, %edx
 575         addl    %ecx, disp0(%edi)
 576
 577         adcl    $0, %edx
 578         incl    %ebp                    C outer counter, reaches zero after the last yp limb
 579
 580         movl    %edx, disp1(%edi)       C movl keeps ZF from the incl for the jnz
 581         jnz     L(unroll_outer_top)
 582
 583
 584         movl    SAVE_ESI, %esi
 585         movl    SAVE_EBP, %ebp
 586
 587         movl    SAVE_EDI, %edi
 588         movl    SAVE_EBX, %ebx
 589         addl    $FRAME, %esp            C FRAME is 16+VAR_EXTRA_SPACE here, releasing the SAVE_ and VAR_ slots together
 590
 591         ret
 592
 593 EPILOGUE()