rts/gmp/mpn/x86/k6/sqr_basecase.asm

   1 dnl  AMD K6 mpn_sqr_basecase -- square an mpn number.
   2 dnl
   3 dnl  K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular
   4 dnl  product (measured on the speed difference between 17 and 33 limbs,
   5 dnl  which is roughly the Karatsuba recursing range).
   6
   7
   8 dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
   9 dnl
  10 dnl  This file is part of the GNU MP Library.
  11 dnl
  12 dnl  The GNU MP Library is free software; you can redistribute it and/or
  13 dnl  modify it under the terms of the GNU Lesser General Public License as
  14 dnl  published by the Free Software Foundation; either version 2.1 of the
  15 dnl  License, or (at your option) any later version.
  16 dnl
  17 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  18 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20 dnl  Lesser General Public License for more details.
  21 dnl
  22 dnl  You should have received a copy of the GNU Lesser General Public
  23 dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
  24 dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
  25 dnl  Suite 330, Boston, MA 02111-1307, USA.
  26
  27
  28 include(`../config.m4')
  29
  30
  31 dnl  KARATSUBA_SQR_THRESHOLD_MAX is the maximum KARATSUBA_SQR_THRESHOLD this
  32 dnl  code supports.  This value is used only by the tune program to know
  33 dnl  what it can go up to.  (An attempt to compile with a bigger value will
  34 dnl  trigger some m4_assert()s in the code, making the build fail.)
  35 dnl
  36 dnl  The value is determined by requiring the displacements in the unrolled
  37 dnl  addmul to fit in single bytes.  This means a maximum UNROLL_COUNT of
  38 dnl  63, giving a maximum KARATSUBA_SQR_THRESHOLD of 66.
  39
  40 deflit(KARATSUBA_SQR_THRESHOLD_MAX, 66)
  41
  42
  43 dnl  Allow a value from the tune program to override config.m4.
  44
  45 ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE',
  46 `define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)')
  47
  48
  49 dnl  UNROLL_COUNT is the number of code chunks in the unrolled addmul.  The
  50 dnl  number required is determined by KARATSUBA_SQR_THRESHOLD, since
  51 dnl  mpn_sqr_basecase only needs to handle sizes < KARATSUBA_SQR_THRESHOLD.
  52 dnl
  53 dnl  The first addmul is the biggest, and this takes the second least
  54 dnl  significant limb and multiplies it by the third least significant and
  55 dnl  up.  Hence for a maximum operand size of KARATSUBA_SQR_THRESHOLD-1
  56 dnl  limbs, UNROLL_COUNT needs to be KARATSUBA_SQR_THRESHOLD-3.
  57
  58 m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD')
  59 deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3))
  60
  61
  62 C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
  63 C
  64 C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a
  65 C lot of function call overheads are avoided, especially when the given size
  66 C is small.
  67 C
  68 C The code size might look a bit excessive, but not all of it is executed
  69 C and so won't fill up the code cache.  The 1x1, 2x2 and 3x3 special cases
  70 C clearly apply only to those sizes; mid sizes like 10x10 only need part of
  71 C the unrolled addmul; and big sizes like 35x35 that do need all of it will
  72 C at least be getting value for money, because 35x35 spends something like
  73 C 5780 cycles here.
  74 C
  75 C Different values of UNROLL_COUNT give slightly different speeds, between
  76 C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs.
  77 C This isn't a big difference, but it's presumably some alignment effect
  78 C which if understood could give a simple speedup.
  79
  80 defframe(PARAM_SIZE,12)
  81 defframe(PARAM_SRC, 8)
  82 defframe(PARAM_DST, 4)
  83
  84         .text
  85         ALIGN(32)
  86 PROLOGUE(mpn_sqr_basecase)
  87 deflit(`FRAME',0)
  88
  89         movl    PARAM_SIZE, %ecx
  90         movl    PARAM_SRC, %eax
  91
  92         cmpl    $2, %ecx
  93         je      L(two_limbs)
  94
  95         movl    PARAM_DST, %edx
  96         ja      L(three_or_more)
  97
  98
  99 C -----------------------------------------------------------------------------
 100 C one limb only
 101         C eax   src
 102         C ebx
 103         C ecx   size
 104         C edx   dst
 105
 106         movl    (%eax), %eax
 107         movl    %edx, %ecx
 108
 109         mull    %eax
 110
 111         movl    %eax, (%ecx)
 112         movl    %edx, 4(%ecx)
 113         ret
 114
 115
 116 C -----------------------------------------------------------------------------
 117         ALIGN(16)
 118 L(two_limbs):
 119         C eax   src
 120         C ebx
 121         C ecx   size
 122         C edx   dst
 123
 124         pushl   %ebx
 125         movl    %eax, %ebx      C src
 126 deflit(`FRAME',4)
 127
 128         movl    (%ebx), %eax
 129         movl    PARAM_DST, %ecx
 130
 131         mull    %eax            C src[0]^2
 132
 133         movl    %eax, (%ecx)
 134         movl    4(%ebx), %eax
 135
 136         movl    %edx, 4(%ecx)
 137
 138         mull    %eax            C src[1]^2
 139
 140         movl    %eax, 8(%ecx)
 141         movl    (%ebx), %eax
 142
 143         movl    %edx, 12(%ecx)
 144         movl    4(%ebx), %edx
 145
 146         mull    %edx            C src[0]*src[1]
 147
 148         addl    %eax, 4(%ecx)
 149
 150         adcl    %edx, 8(%ecx)
 151         adcl    $0, 12(%ecx)
 152
 153         popl    %ebx
 154         addl    %eax, 4(%ecx)
 155
 156         adcl    %edx, 8(%ecx)
 157         adcl    $0, 12(%ecx)
 158
 159         ret
 160
 161
 162 C -----------------------------------------------------------------------------
 163 L(three_or_more):
 164 deflit(`FRAME',0)
 165         cmpl    $4, %ecx
 166         jae     L(four_or_more)
 167
 168
 169 C -----------------------------------------------------------------------------
 170 C three limbs
 171         C eax   src
 172         C ecx   size
 173         C edx   dst
 174
 175         pushl   %ebx
 176         movl    %eax, %ebx      C src
 177
 178         movl    (%ebx), %eax
 179         movl    %edx, %ecx      C dst
 180
 181         mull    %eax            C src[0] ^ 2
 182
 183         movl    %eax, (%ecx)
 184         movl    4(%ebx), %eax
 185
 186         movl    %edx, 4(%ecx)
 187         pushl   %esi
 188
 189         mull    %eax            C src[1] ^ 2
 190
 191         movl    %eax, 8(%ecx)
 192         movl    8(%ebx), %eax
 193
 194         movl    %edx, 12(%ecx)
 195         pushl   %edi
 196
 197         mull    %eax            C src[2] ^ 2
 198
 199         movl    %eax, 16(%ecx)
 200         movl    (%ebx), %eax
 201
 202         movl    %edx, 20(%ecx)
 203         movl    4(%ebx), %edx
 204
 205         mull    %edx            C src[0] * src[1]
 206
 207         movl    %eax, %esi
 208         movl    (%ebx), %eax
 209
 210         movl    %edx, %edi
 211         movl    8(%ebx), %edx
 212
 213         pushl   %ebp
 214         xorl    %ebp, %ebp
 215
 216         mull    %edx            C src[0] * src[2]
 217
 218         addl    %eax, %edi
 219         movl    4(%ebx), %eax
 220
 221         adcl    %edx, %ebp
 222
 223         movl    8(%ebx), %edx
 224
 225         mull    %edx            C src[1] * src[2]
 226
 227         addl    %eax, %ebp
 228
 229         adcl    $0, %edx
 230
 231
 232         C eax   will be dst[5]
 233         C ebx
 234         C ecx   dst
 235         C edx   dst[4]
 236         C esi   dst[1]
 237         C edi   dst[2]
 238         C ebp   dst[3]
 239
 240         xorl    %eax, %eax
 241         addl    %esi, %esi
 242         adcl    %edi, %edi
 243         adcl    %ebp, %ebp
 244         adcl    %edx, %edx
 245         adcl    $0, %eax
 246
 247         addl    %esi, 4(%ecx)
 248         adcl    %edi, 8(%ecx)
 249         adcl    %ebp, 12(%ecx)
 250
 251         popl    %ebp
 252         popl    %edi
 253
 254         adcl    %edx, 16(%ecx)
 255
 256         popl    %esi
 257         popl    %ebx
 258
 259         adcl    %eax, 20(%ecx)
 260         ASSERT(nc)
 261
 262         ret
 263
 264
 265 C -----------------------------------------------------------------------------
 266
 267 defframe(SAVE_EBX,   -4)
 268 defframe(SAVE_ESI,   -8)
 269 defframe(SAVE_EDI,   -12)
 270 defframe(SAVE_EBP,   -16)
 271 defframe(VAR_COUNTER,-20)
 272 defframe(VAR_JMP,    -24)
 273 deflit(STACK_SPACE, 24)
 274
 275         ALIGN(16)
 276 L(four_or_more):
 277
 278         C eax   src
 279         C ebx
 280         C ecx   size
 281         C edx   dst
 282         C esi
 283         C edi
 284         C ebp
 285
 286 C First multiply src[0]*src[1..size-1] and store at dst[1..size].
 287 C
 288 C A test was done calling mpn_mul_1 here to get the benefit of its unrolled
 289 C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off
 290 C a 5780 cycle operation, which is not surprising since the loop here is 8
 291 C c/l and mpn_mul_1 is 6.25 c/l.
 292
 293         subl    $STACK_SPACE, %esp      deflit(`FRAME',STACK_SPACE)
 294
 295         movl    %edi, SAVE_EDI
 296         leal    4(%edx), %edi
 297
 298         movl    %ebx, SAVE_EBX
 299         leal    4(%eax), %ebx
 300
 301         movl    %esi, SAVE_ESI
 302         xorl    %esi, %esi
 303
 304         movl    %ebp, SAVE_EBP
 305
 306         C eax
 307         C ebx   src+4
 308         C ecx   size
 309         C edx
 310         C esi
 311         C edi   dst+4
 312         C ebp
 313
 314         movl    (%eax), %ebp    C multiplier
 315         leal    -1(%ecx), %ecx  C size-1, and pad to a 16 byte boundary
 316
 317
 318         ALIGN(16)
 319 L(mul_1):
 320         C eax   scratch
 321         C ebx   src ptr
 322         C ecx   counter
 323         C edx   scratch
 324         C esi   carry
 325         C edi   dst ptr
 326         C ebp   multiplier
 327
 328         movl    (%ebx), %eax
 329         addl    $4, %ebx
 330
 331         mull    %ebp
 332
 333         addl    %esi, %eax
 334         movl    $0, %esi
 335
 336         adcl    %edx, %esi
 337
 338         movl    %eax, (%edi)
 339         addl    $4, %edi
 340
 341         loop    L(mul_1)
 342
 343
 344 C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
 345 C
 346 C The last two addmuls, which are the bottom right corner of the product
 347 C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
 348 C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
 349 C cases that need to be done.
 350 C
 351 C The unrolled code is the same as mpn_addmul_1(), see that routine for some
 352 C comments.
 353 C
 354 C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
 355 C
 356 C VAR_JMP is the computed jump into the unrolled code, stepped by one code
 357 C chunk each outer loop.
 358 C
 359 C K6 doesn't do any branch prediction on indirect jumps, which is good
 360 C actually because it's a different target each time.  The unrolled addmul
 361 C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of
 362 C the indirect jump is quickly recovered.
 363
 364
 365 dnl  This value is also implicitly encoded in a shift and add.
 366 dnl
 367 deflit(CODE_BYTES_PER_LIMB, 15)
 368
 369 dnl  With the unmodified &src[size] and &dst[size] pointers, the
 370 dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
 371 dnl  values up to 31.  Above that an offset must be added to them.
 372 dnl
 373 deflit(OFFSET,
 374 ifelse(eval(UNROLL_COUNT>31),1,
 375 eval((UNROLL_COUNT-31)*4),
 376 0))
 377
 378         C eax
 379         C ebx   &src[size]
 380         C ecx
 381         C edx
 382         C esi   carry
 383         C edi   &dst[size]
 384         C ebp
 385
 386         movl    PARAM_SIZE, %ecx
 387         movl    %esi, (%edi)
 388
 389         subl    $4, %ecx
 390         jz      L(corner)
 391
 392         movl    %ecx, %edx
 393 ifelse(OFFSET,0,,
 394 `       subl    $OFFSET, %ebx')
 395
 396         shll    $4, %ecx
 397 ifelse(OFFSET,0,,
 398 `       subl    $OFFSET, %edi')
 399
 400         negl    %ecx
 401
 402 ifdef(`PIC',`
 403         call    L(pic_calc)
 404 L(here):
 405 ',`
 406         leal    L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
 407 ')
 408         negl    %edx
 409
 410
 411         C The calculated jump mustn't be before the start of the available
 412         C code.  This is the limitation UNROLL_COUNT puts on the src operand
 413         C size, but checked here using the jump address directly.
 414         C
 415         ASSERT(ae,`
 416         movl_text_address( L(unroll_inner_start), %eax)
 417         cmpl    %eax, %ecx
 418         ')
 419
 420
 421 C -----------------------------------------------------------------------------
 422         ALIGN(16)
 423 L(unroll_outer_top):
 424         C eax
 425         C ebx   &src[size], constant
 426         C ecx   VAR_JMP
 427         C edx   VAR_COUNTER, limbs, negative
 428         C esi   high limb to store
 429         C edi   dst ptr, high of last addmul
 430         C ebp
 431
 432         movl    -12+OFFSET(%ebx,%edx,4), %ebp   C multiplier
 433         movl    %edx, VAR_COUNTER
 434
 435         movl    -8+OFFSET(%ebx,%edx,4), %eax    C first limb of multiplicand
 436
 437         mull    %ebp
 438
 439         testb   $1, %cl
 440
 441         movl    %edx, %esi      C high carry
 442         movl    %ecx, %edx      C jump
 443
 444         movl    %eax, %ecx      C low carry
 445         leal    CODE_BYTES_PER_LIMB(%edx), %edx
 446
 447         movl    %edx, VAR_JMP
 448         leal    4(%edi), %edi
 449
 450         C A branch-free version of this using some xors was found to be a
 451         C touch slower than just a conditional jump, despite the jump
 452         C switching between taken and not taken on every loop.
 453
 454 ifelse(eval(UNROLL_COUNT%2),0,
 455         jz,jnz) L(unroll_noswap)
 456         movl    %esi, %eax      C high,low carry other way around
 457
 458         movl    %ecx, %esi
 459         movl    %eax, %ecx
 460 L(unroll_noswap):
 461
 462         jmp     *%edx
 463
 464
 465         C Must be on an even address here so the low bit of the jump address
 466         C will indicate which way around ecx/esi should start.
 467         C
 468         C An attempt was made at padding here to get the end of the unrolled
 469         C code to come out on a good alignment, to save padding before
 470         C L(corner).  This worked, but turned out to run slower than just an
 471         C ALIGN(2).  The reason for this is not clear, it might be related
 472         C to the different speeds on different UNROLL_COUNTs noted above.
 473
 474         ALIGN(2)
 475
 476 L(unroll_inner_start):
 477         C eax   scratch
 478         C ebx   src
 479         C ecx   carry low
 480         C edx   scratch
 481         C esi   carry high
 482         C edi   dst
 483         C ebp   multiplier
 484         C
 485         C 15 code bytes each limb
 486         C ecx/esi swapped on each chunk
 487
 488 forloop(`i', UNROLL_COUNT, 1, `
 489         deflit(`disp_src', eval(-i*4 + OFFSET))
 490         deflit(`disp_dst', eval(disp_src - 4))
 491
 492         m4_assert(`disp_src>=-128 && disp_src<128')
 493         m4_assert(`disp_dst>=-128 && disp_dst<128')
 494
 495 ifelse(eval(i%2),0,`
 496 Zdisp(  movl,   disp_src,(%ebx), %eax)
 497         mull    %ebp
 498 Zdisp(  addl,   %esi, disp_dst,(%edi))
 499         adcl    %eax, %ecx
 500         movl    %edx, %esi
 501         jadcl0( %esi)
 502 ',`
 503         dnl  this one comes out last
 504 Zdisp(  movl,   disp_src,(%ebx), %eax)
 505         mull    %ebp
 506 Zdisp(  addl,   %ecx, disp_dst,(%edi))
 507         adcl    %eax, %esi
 508         movl    %edx, %ecx
 509         jadcl0( %ecx)
 510 ')
 511 ')
 512 L(unroll_inner_end):
 513
 514         addl    %esi, -4+OFFSET(%edi)
 515
 516         movl    VAR_COUNTER, %edx
 517         jadcl0( %ecx)
 518
 519         movl    %ecx, m4_empty_if_zero(OFFSET)(%edi)
 520         movl    VAR_JMP, %ecx
 521
 522         incl    %edx
 523         jnz     L(unroll_outer_top)
 524
 525
 526 ifelse(OFFSET,0,,`
 527         addl    $OFFSET, %ebx
 528         addl    $OFFSET, %edi
 529 ')
 530
 531
 532 C -----------------------------------------------------------------------------
 533         ALIGN(16)
 534 L(corner):
 535         C ebx   &src[size]
 536         C edi   &dst[2*size-5]
 537
 538         movl    -12(%ebx), %ebp
 539
 540         movl    -8(%ebx), %eax
 541         movl    %eax, %ecx
 542
 543         mull    %ebp
 544
 545         addl    %eax, -4(%edi)
 546         adcl    $0, %edx
 547
 548         movl    -4(%ebx), %eax
 549         movl    %edx, %esi
 550         movl    %eax, %ebx
 551
 552         mull    %ebp
 553
 554         addl    %esi, %eax
 555         adcl    $0, %edx
 556
 557         addl    %eax, (%edi)
 558         adcl    $0, %edx
 559
 560         movl    %edx, %esi
 561         movl    %ebx, %eax
 562
 563         mull    %ecx
 564
 565         addl    %esi, %eax
 566         movl    %eax, 4(%edi)
 567
 568         adcl    $0, %edx
 569
 570         movl    %edx, 8(%edi)
 571
 572
 573 C -----------------------------------------------------------------------------
 574 C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
 575 C The loop measures about 6 cycles/iteration, though it looks like it should
 576 C decode in 5.
 577
 578 L(lshift_start):
 579         movl    PARAM_SIZE, %ecx
 580
 581         movl    PARAM_DST, %edi
 582         subl    $1, %ecx                C size-1 and clear carry
 583
 584         movl    PARAM_SRC, %ebx
 585         movl    %ecx, %edx
 586
 587         xorl    %eax, %eax              C ready for adcl
 588
 589
 590         ALIGN(16)
 591 L(lshift):
 592         C eax
 593         C ebx   src (for later use)
 594         C ecx   counter, decrementing
 595         C edx   size-1 (for later use)
 596         C esi
 597         C edi   dst, incrementing
 598         C ebp
 599
 600         rcll    4(%edi)
 601         rcll    8(%edi)
 602         leal    8(%edi), %edi
 603         loop    L(lshift)
 604
 605
 606         adcl    %eax, %eax
 607
 608         movl    %eax, 4(%edi)           C dst most significant limb
 609         movl    (%ebx), %eax            C src[0]
 610
 611         leal    4(%ebx,%edx,4), %ebx    C &src[size]
 612         subl    %edx, %ecx              C -(size-1)
 613
 614
 615 C -----------------------------------------------------------------------------
 616 C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
 617 C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
 618 C low limb of src[0]^2.
 619
 620
 621         mull    %eax
 622
 623         movl    %eax, (%edi,%ecx,8)     C dst[0]
 624
 625
 626         ALIGN(16)
 627 L(diag):
 628         C eax   scratch
 629         C ebx   &src[size]
 630         C ecx   counter, negative
 631         C edx   carry
 632         C esi   scratch
 633         C edi   dst[2*size-2]
 634         C ebp
 635
 636         movl    (%ebx,%ecx,4), %eax
 637         movl    %edx, %esi
 638
 639         mull    %eax
 640
 641         addl    %esi, 4(%edi,%ecx,8)
 642         adcl    %eax, 8(%edi,%ecx,8)
 643         adcl    $0, %edx
 644
 645         incl    %ecx
 646         jnz     L(diag)
 647
 648
 649         movl    SAVE_EBX, %ebx
 650         movl    SAVE_ESI, %esi
 651
 652         addl    %edx, 4(%edi)           C dst most significant limb
 653
 654         movl    SAVE_EDI, %edi
 655         movl    SAVE_EBP, %ebp
 656         addl    $FRAME, %esp
 657         ret
 658
 659
 660
 661 C -----------------------------------------------------------------------------
 662 ifdef(`PIC',`
 663 L(pic_calc):
 664         C See README.family about old gas bugs
 665         addl    (%esp), %ecx
 666         addl    $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
 667         addl    %edx, %ecx
 668         ret
 669 ')
 670
 671
 672 EPILOGUE()