rts/gmp/mpn/x86/k6/mul_basecase.asm

   1 dnl  AMD K6 mpn_mul_basecase -- multiply two mpn numbers.
   2 dnl
   3 dnl  K6: approx 9.0 cycles per cross product on 30x30 limbs (with 16 limbs/loop
   4 dnl      unrolling).
   5
   6
   7 dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
   8 dnl
   9 dnl  This file is part of the GNU MP Library.
  10 dnl
  11 dnl  The GNU MP Library is free software; you can redistribute it and/or
  12 dnl  modify it under the terms of the GNU Lesser General Public License as
  13 dnl  published by the Free Software Foundation; either version 2.1 of the
  14 dnl  License, or (at your option) any later version.
  15 dnl
  16 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  17 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19 dnl  Lesser General Public License for more details.
  20 dnl
  21 dnl  You should have received a copy of the GNU Lesser General Public
  22 dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
  23 dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
  24 dnl  Suite 330, Boston, MA 02111-1307, USA.
  25
  26
  27 include(`../config.m4')
  28
  29
  30 dnl  K6: UNROLL_COUNT cycles/product (approx)
  31 dnl           8           9.75
  32 dnl          16           9.3
  33 dnl          32           9.3
  34 dnl  Maximum possible with the current code is 32.
  35 dnl
  36 dnl  With 16 the inner unrolled loop fits exactly in a 256 byte block, which
  37 dnl  might explain it's good performance.
  38
  39 deflit(UNROLL_COUNT, 16)
  40
  41
  42 C void mpn_mul_basecase (mp_ptr wp,
  43 C                        mp_srcptr xp, mp_size_t xsize,
  44 C                        mp_srcptr yp, mp_size_t ysize);
  45 C
  46 C Calculate xp,xsize multiplied by yp,ysize, storing the result in
  47 C wp,xsize+ysize.
  48 C
  49 C This routine is essentially the same as mpn/generic/mul_basecase.c, but
  50 C it's faster because it does most of the mpn_addmul_1() entry code only
  51 C once.  The saving is about 10-20% on typical sizes coming from the
  52 C Karatsuba multiply code.
  53 C
  54 C Future:
  55 C
  56 C The unrolled loop could be shared by mpn_addmul_1, with some extra stack
  57 C setups and maybe 2 or 3 wasted cycles at the end.  Code saving would be
  58 C 256 bytes.
  59
  60 ifdef(`PIC',`
  61 deflit(UNROLL_THRESHOLD, 8)
  62 ',`
  63 deflit(UNROLL_THRESHOLD, 8)
  64 ')
  65
  66 defframe(PARAM_YSIZE,20)
  67 defframe(PARAM_YP,   16)
  68 defframe(PARAM_XSIZE,12)
  69 defframe(PARAM_XP,   8)
  70 defframe(PARAM_WP,   4)
  71
  72         .text
  73         ALIGN(32)
  74 PROLOGUE(mpn_mul_basecase)
  75 deflit(`FRAME',0)
  76
  77         movl    PARAM_XSIZE, %ecx
  78         movl    PARAM_YP, %eax
  79
  80         movl    PARAM_XP, %edx
  81         movl    (%eax), %eax    C yp low limb
  82
  83         cmpl    $2, %ecx
  84         ja      L(xsize_more_than_two_limbs)
  85         je      L(two_by_something)
  86
  87
  88         C one limb by one limb
  89
  90         movl    (%edx), %edx    C xp low limb
  91         movl    PARAM_WP, %ecx
  92
  93         mull    %edx
  94
  95         movl    %eax, (%ecx)
  96         movl    %edx, 4(%ecx)
  97         ret
  98
  99
 100 C -----------------------------------------------------------------------------
 101 L(two_by_something):
 102         decl    PARAM_YSIZE
 103         pushl   %ebx
 104 deflit(`FRAME',4)
 105
 106         movl    PARAM_WP, %ebx
 107         pushl   %esi
 108 deflit(`FRAME',8)
 109
 110         movl    %eax, %ecx      C yp low limb
 111         movl    (%edx), %eax    C xp low limb
 112
 113         movl    %edx, %esi      C xp
 114         jnz     L(two_by_two)
 115
 116
 117         C two limbs by one limb
 118
 119         mull    %ecx
 120
 121         movl    %eax, (%ebx)
 122         movl    4(%esi), %eax
 123
 124         movl    %edx, %esi      C carry
 125
 126         mull    %ecx
 127
 128         addl    %eax, %esi
 129         movl    %esi, 4(%ebx)
 130
 131         adcl    $0, %edx
 132
 133         movl    %edx, 8(%ebx)
 134         popl    %esi
 135
 136         popl    %ebx
 137         ret
 138
 139
 140
 141 C -----------------------------------------------------------------------------
 142         ALIGN(16)
 143 L(two_by_two):
 144         C eax   xp low limb
 145         C ebx   wp
 146         C ecx   yp low limb
 147         C edx
 148         C esi   xp
 149         C edi
 150         C ebp
 151 deflit(`FRAME',8)
 152
 153         mull    %ecx            C xp[0] * yp[0]
 154
 155         push    %edi
 156 deflit(`FRAME',12)
 157         movl    %eax, (%ebx)
 158
 159         movl    4(%esi), %eax
 160         movl    %edx, %edi      C carry, for wp[1]
 161
 162         mull    %ecx            C xp[1] * yp[0]
 163
 164         addl    %eax, %edi
 165         movl    PARAM_YP, %ecx
 166
 167         adcl    $0, %edx
 168
 169         movl    %edi, 4(%ebx)
 170         movl    4(%ecx), %ecx   C yp[1]
 171
 172         movl    4(%esi), %eax   C xp[1]
 173         movl    %edx, %edi      C carry, for wp[2]
 174
 175         mull    %ecx            C xp[1] * yp[1]
 176
 177         addl    %eax, %edi
 178
 179         adcl    $0, %edx
 180
 181         movl    (%esi), %eax    C xp[0]
 182         movl    %edx, %esi      C carry, for wp[3]
 183
 184         mull    %ecx            C xp[0] * yp[1]
 185
 186         addl    %eax, 4(%ebx)
 187         adcl    %edx, %edi
 188         adcl    $0, %esi
 189
 190         movl    %edi, 8(%ebx)
 191         popl    %edi
 192
 193         movl    %esi, 12(%ebx)
 194         popl    %esi
 195
 196         popl    %ebx
 197         ret
 198
 199
 200 C -----------------------------------------------------------------------------
 201         ALIGN(16)
 202 L(xsize_more_than_two_limbs):
 203
 204 C The first limb of yp is processed with a simple mpn_mul_1 style loop
 205 C inline.  Unrolling this doesn't seem worthwhile since it's only run once
 206 C (whereas the addmul below is run ysize-1 many times).  A call to the
 207 C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
 208 C popping, and doesn't seem likely to be worthwhile on the typical 10-20
 209 C limb operations the Karatsuba code calls here with.
 210
 211         C eax   yp[0]
 212         C ebx
 213         C ecx   xsize
 214         C edx   xp
 215         C esi
 216         C edi
 217         C ebp
 218 deflit(`FRAME',0)
 219
 220         pushl   %edi            defframe_pushl(SAVE_EDI)
 221         pushl   %ebp            defframe_pushl(SAVE_EBP)
 222
 223         movl    PARAM_WP, %edi
 224         pushl   %esi            defframe_pushl(SAVE_ESI)
 225
 226         movl    %eax, %ebp
 227         pushl   %ebx            defframe_pushl(SAVE_EBX)
 228
 229         leal    (%edx,%ecx,4), %ebx     C xp end
 230         xorl    %esi, %esi
 231
 232         leal    (%edi,%ecx,4), %edi     C wp end of mul1
 233         negl    %ecx
 234
 235
 236 L(mul1):
 237         C eax   scratch
 238         C ebx   xp end
 239         C ecx   counter, negative
 240         C edx   scratch
 241         C esi   carry
 242         C edi   wp end of mul1
 243         C ebp   multiplier
 244
 245         movl    (%ebx,%ecx,4), %eax
 246
 247         mull    %ebp
 248
 249         addl    %esi, %eax
 250         movl    $0, %esi
 251
 252         adcl    %edx, %esi
 253
 254         movl    %eax, (%edi,%ecx,4)
 255         incl    %ecx
 256
 257         jnz     L(mul1)
 258
 259
 260         movl    PARAM_YSIZE, %edx
 261         movl    %esi, (%edi)            C final carry
 262
 263         movl    PARAM_XSIZE, %ecx
 264         decl    %edx
 265
 266         jnz     L(ysize_more_than_one_limb)
 267
 268         popl    %ebx
 269         popl    %esi
 270         popl    %ebp
 271         popl    %edi
 272         ret
 273
 274
 275 L(ysize_more_than_one_limb):
 276         cmpl    $UNROLL_THRESHOLD, %ecx
 277         movl    PARAM_YP, %eax
 278
 279         jae     L(unroll)
 280
 281
 282 C -----------------------------------------------------------------------------
 283 C Simple addmul loop.
 284 C
 285 C Using ebx and edi pointing at the ends of their respective locations saves
 286 C a couple of instructions in the outer loop.  The inner loop is still 11
 287 C cycles, the same as the simple loop in aorsmul_1.asm.
 288
 289         C eax   yp
 290         C ebx   xp end
 291         C ecx   xsize
 292         C edx   ysize-1
 293         C esi
 294         C edi   wp end of mul1
 295         C ebp
 296
 297         movl    4(%eax), %ebp           C multiplier
 298         negl    %ecx
 299
 300         movl    %ecx, PARAM_XSIZE       C -xsize
 301         xorl    %esi, %esi              C initial carry
 302
 303         leal    4(%eax,%edx,4), %eax    C yp end
 304         negl    %edx
 305
 306         movl    %eax, PARAM_YP
 307         movl    %edx, PARAM_YSIZE
 308
 309         jmp     L(simple_outer_entry)
 310
 311
 312         C aligning here saves a couple of cycles
 313         ALIGN(16)
 314 L(simple_outer_top):
 315         C edx   ysize counter, negative
 316
 317         movl    PARAM_YP, %eax          C yp end
 318         xorl    %esi, %esi              C carry
 319
 320         movl    PARAM_XSIZE, %ecx       C -xsize
 321         movl    %edx, PARAM_YSIZE
 322
 323         movl    (%eax,%edx,4), %ebp     C yp limb multiplier
 324 L(simple_outer_entry):
 325         addl    $4, %edi
 326
 327
 328 L(simple_inner):
 329         C eax   scratch
 330         C ebx   xp end
 331         C ecx   counter, negative
 332         C edx   scratch
 333         C esi   carry
 334         C edi   wp end of this addmul
 335         C ebp   multiplier
 336
 337         movl    (%ebx,%ecx,4), %eax
 338
 339         mull    %ebp
 340
 341         addl    %esi, %eax
 342         movl    $0, %esi
 343
 344         adcl    $0, %edx
 345         addl    %eax, (%edi,%ecx,4)
 346         adcl    %edx, %esi
 347
 348         incl    %ecx
 349         jnz     L(simple_inner)
 350
 351
 352         movl    PARAM_YSIZE, %edx
 353         movl    %esi, (%edi)
 354
 355         incl    %edx
 356         jnz     L(simple_outer_top)
 357
 358
 359         popl    %ebx
 360         popl    %esi
 361         popl    %ebp
 362         popl    %edi
 363         ret
 364
 365
 366 C -----------------------------------------------------------------------------
 367 C Unrolled loop.
 368 C
 369 C The unrolled inner loop is the same as in aorsmul_1.asm, see that code for
 370 C some comments.
 371 C
 372 C VAR_COUNTER is for the inner loop, running from VAR_COUNTER_INIT down to
 373 C 0, inclusive.
 374 C
 375 C VAR_JMP is the computed jump into the unrolled loop.
 376 C
 377 C PARAM_XP and PARAM_WP get offset appropriately for where the unrolled loop
 378 C is entered.
 379 C
 380 C VAR_XP_LOW is the least significant limb of xp, which is needed at the
 381 C start of the unrolled loop.  This can't just be fetched through the xp
 382 C pointer because of the offset applied to it.
 383 C
 384 C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
 385 C inclusive.
 386 C
 387 C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
 388 C added to give the location of the next limb of yp, which is the multiplier
 389 C in the unrolled loop.
 390 C
 391 C PARAM_WP is similarly offset so that the PARAM_YSIZE counter can be added
 392 C to give the starting point in the destination for each unrolled loop (this
 393 C point is one limb upwards for each limb of yp processed).
 394 C
 395 C Having PARAM_YSIZE count negative to zero means it's not necessary to
 396 C store new values of PARAM_YP and PARAM_WP on each loop.  Those values on
 397 C the stack remain constant and on each loop an leal adjusts them with the
 398 C PARAM_YSIZE counter value.
 399
 400
 401 defframe(VAR_COUNTER,      -20)
 402 defframe(VAR_COUNTER_INIT, -24)
 403 defframe(VAR_JMP,          -28)
 404 defframe(VAR_XP_LOW,       -32)
 405 deflit(VAR_STACK_SPACE, 16)
 406
 407 dnl  For some strange reason using (%esp) instead of 0(%esp) is a touch
 408 dnl  slower in this code, hence the defframe empty-if-zero feature is
 409 dnl  disabled.
 410 dnl
 411 dnl  If VAR_COUNTER is at (%esp), the effect is worse.  In this case the
 412 dnl  unrolled loop is 255 instead of 256 bytes, but quite how this affects
 413 dnl  anything isn't clear.
 414 dnl
 415 define(`defframe_empty_if_zero_disabled',1)
 416
 417 L(unroll):
 418         C eax   yp (not used)
 419         C ebx   xp end (not used)
 420         C ecx   xsize
 421         C edx   ysize-1
 422         C esi
 423         C edi   wp end of mul1 (not used)
 424         C ebp
 425 deflit(`FRAME', 16)
 426
 427         leal    -2(%ecx), %ebp  C one limb processed at start,
 428         decl    %ecx            C and ebp is one less
 429
 430         shrl    $UNROLL_LOG2, %ebp
 431         negl    %ecx
 432
 433         subl    $VAR_STACK_SPACE, %esp
 434 deflit(`FRAME', 16+VAR_STACK_SPACE)
 435         andl    $UNROLL_MASK, %ecx
 436
 437         movl    %ecx, %esi
 438         shll    $4, %ecx
 439
 440         movl    %ebp, VAR_COUNTER_INIT
 441         negl    %esi
 442
 443         C 15 code bytes per limb
 444 ifdef(`PIC',`
 445         call    L(pic_calc)
 446 L(unroll_here):
 447 ',`
 448         leal    L(unroll_entry) (%ecx,%esi,1), %ecx
 449 ')
 450
 451         movl    PARAM_XP, %ebx
 452         movl    %ebp, VAR_COUNTER
 453
 454         movl    PARAM_WP, %edi
 455         movl    %ecx, VAR_JMP
 456
 457         movl    (%ebx), %eax
 458         leal    4(%edi,%esi,4), %edi    C wp adjust for unrolling and mul1
 459
 460         leal    (%ebx,%esi,4), %ebx     C xp adjust for unrolling
 461
 462         movl    %eax, VAR_XP_LOW
 463
 464         movl    %ebx, PARAM_XP
 465         movl    PARAM_YP, %ebx
 466
 467         leal    (%edi,%edx,4), %ecx     C wp adjust for ysize indexing
 468         movl    4(%ebx), %ebp           C multiplier (yp second limb)
 469
 470         leal    4(%ebx,%edx,4), %ebx    C yp adjust for ysize indexing
 471
 472         movl    %ecx, PARAM_WP
 473
 474         leal    1(%esi), %ecx   C adjust parity for decl %ecx above
 475
 476         movl    %ebx, PARAM_YP
 477         negl    %edx
 478
 479         movl    %edx, PARAM_YSIZE
 480         jmp     L(unroll_outer_entry)
 481
 482
 483 ifdef(`PIC',`
 484 L(pic_calc):
 485         C See README.family about old gas bugs
 486         leal    (%ecx,%esi,1), %ecx
 487         addl    $L(unroll_entry)-L(unroll_here), %ecx
 488         addl    (%esp), %ecx
 489         ret
 490 ')
 491
 492
 493 C -----------------------------------------------------------------------------
 494         C Aligning here saves a couple of cycles per loop.  Using 32 doesn't
 495         C cost any extra space, since the inner unrolled loop below is
 496         C aligned to 32.
 497         ALIGN(32)
 498 L(unroll_outer_top):
 499         C edx   ysize
 500
 501         movl    PARAM_YP, %eax
 502         movl    %edx, PARAM_YSIZE       C incremented ysize counter
 503
 504         movl    PARAM_WP, %edi
 505
 506         movl    VAR_COUNTER_INIT, %ebx
 507         movl    (%eax,%edx,4), %ebp     C next multiplier
 508
 509         movl    PARAM_XSIZE, %ecx
 510         leal    (%edi,%edx,4), %edi     C adjust wp for where we are in yp
 511
 512         movl    VAR_XP_LOW, %eax
 513         movl    %ebx, VAR_COUNTER
 514
 515 L(unroll_outer_entry):
 516         mull    %ebp
 517
 518         C using testb is a tiny bit faster than testl
 519         testb   $1, %cl
 520
 521         movl    %eax, %ecx      C low carry
 522         movl    VAR_JMP, %eax
 523
 524         movl    %edx, %esi      C high carry
 525         movl    PARAM_XP, %ebx
 526
 527         jnz     L(unroll_noswap)
 528         movl    %ecx, %esi      C high,low carry other way around
 529
 530         movl    %edx, %ecx
 531 L(unroll_noswap):
 532
 533         jmp     *%eax
 534
 535
 536
 537 C -----------------------------------------------------------------------------
 538         ALIGN(32)
 539 L(unroll_top):
 540         C eax   scratch
 541         C ebx   xp
 542         C ecx   carry low
 543         C edx   scratch
 544         C esi   carry high
 545         C edi   wp
 546         C ebp   multiplier
 547         C VAR_COUNTER  loop counter
 548         C
 549         C 15 code bytes each limb
 550
 551         leal    UNROLL_BYTES(%edi), %edi
 552
 553 L(unroll_entry):
 554 deflit(CHUNK_COUNT,2)
 555 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
 556         deflit(`disp0', eval(i*CHUNK_COUNT*4))
 557         deflit(`disp1', eval(disp0 + 4))
 558         deflit(`disp2', eval(disp1 + 4))
 559
 560         movl    disp1(%ebx), %eax
 561         mull    %ebp
 562 Zdisp(  addl,   %ecx, disp0,(%edi))
 563         adcl    %eax, %esi
 564         movl    %edx, %ecx
 565         jadcl0( %ecx)
 566
 567         movl    disp2(%ebx), %eax
 568         mull    %ebp
 569         addl    %esi, disp1(%edi)
 570         adcl    %eax, %ecx
 571         movl    %edx, %esi
 572         jadcl0( %esi)
 573 ')
 574
 575         decl    VAR_COUNTER
 576         leal    UNROLL_BYTES(%ebx), %ebx
 577
 578         jns     L(unroll_top)
 579
 580
 581         movl    PARAM_YSIZE, %edx
 582         addl    %ecx, UNROLL_BYTES(%edi)
 583
 584         adcl    $0, %esi
 585
 586         incl    %edx
 587         movl    %esi, UNROLL_BYTES+4(%edi)
 588
 589         jnz     L(unroll_outer_top)
 590
 591
 592         movl    SAVE_ESI, %esi
 593         movl    SAVE_EBP, %ebp
 594         movl    SAVE_EDI, %edi
 595         movl    SAVE_EBX, %ebx
 596
 597         addl    $FRAME, %esp
 598         ret
 599
 600 EPILOGUE()