rts/gmp/mpn/x86/k7/sqr_basecase.asm

   1 dnl  AMD K7 mpn_sqr_basecase -- square an mpn number.
   2 dnl
   3 dnl  K7: approx 2.3 cycles/crossproduct, or 4.55 cycles/triangular product
   4 dnl  (measured on the speed difference between 25 and 50 limbs, which is
   5 dnl  roughly the Karatsuba recursing range).
   6
   7
   8 dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
   9 dnl
  10 dnl  This file is part of the GNU MP Library.
  11 dnl
  12 dnl  The GNU MP Library is free software; you can redistribute it and/or
  13 dnl  modify it under the terms of the GNU Lesser General Public License as
  14 dnl  published by the Free Software Foundation; either version 2.1 of the
  15 dnl  License, or (at your option) any later version.
  16 dnl
  17 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  18 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20 dnl  Lesser General Public License for more details.
  21 dnl
  22 dnl  You should have received a copy of the GNU Lesser General Public
  23 dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
  24 dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
  25 dnl  Suite 330, Boston, MA 02111-1307, USA.
  26
  27
  28 include(`../config.m4')
  29
  30
  31 dnl  These are the same as mpn/x86/k6/sqr_basecase.asm, see that code for
  32 dnl  some comments.
  33
  34 deflit(KARATSUBA_SQR_THRESHOLD_MAX, 66)
  35
  36 ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE',
  37 `define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)')
  38
  39 m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD')
  40 deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3))
  41
  42
  43 C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
  44 C
  45 C With a KARATSUBA_SQR_THRESHOLD around 50 this code is about 1500 bytes,
  46 C which is quite a bit, but is considered good value since squares big
  47 C enough to use most of the code will be spending quite a few cycles in it.
  48
  49
  50 defframe(PARAM_SIZE,12)
  51 defframe(PARAM_SRC, 8)
  52 defframe(PARAM_DST, 4)
  53
  54         .text
  55         ALIGN(32)
  56 PROLOGUE(mpn_sqr_basecase)
  57 deflit(`FRAME',0)
  58
  59         movl    PARAM_SIZE, %ecx
  60         movl    PARAM_SRC, %eax
  61         cmpl    $2, %ecx
  62
  63         movl    PARAM_DST, %edx
  64         je      L(two_limbs)
  65         ja      L(three_or_more)
  66
  67
  68 C------------------------------------------------------------------------------
  69 C one limb only
  70         C eax   src
  71         C ecx   size
  72         C edx   dst
  73
  74         movl    (%eax), %eax
  75         movl    %edx, %ecx
  76
  77         mull    %eax
  78
  79         movl    %edx, 4(%ecx)
  80         movl    %eax, (%ecx)
  81         ret
  82
  83
  84 C------------------------------------------------------------------------------
  85 C
  86 C Using the read/modify/write "add"s seems to be faster than saving and
  87 C restoring registers.  Perhaps the loads for the first set hide under the
  88 C mul latency and the second gets store to load forwarding.
  89
  90         ALIGN(16)
  91 L(two_limbs):
  92         C eax   src
  93         C ebx
  94         C ecx   size
  95         C edx   dst
  96 deflit(`FRAME',0)
  97
  98         pushl   %ebx            FRAME_pushl()
  99         movl    %eax, %ebx      C src
 100         movl    (%eax), %eax
 101
 102         movl    %edx, %ecx      C dst
 103
 104         mull    %eax            C src[0]^2
 105
 106         movl    %eax, (%ecx)    C dst[0]
 107         movl    4(%ebx), %eax
 108
 109         movl    %edx, 4(%ecx)   C dst[1]
 110
 111         mull    %eax            C src[1]^2
 112
 113         movl    %eax, 8(%ecx)   C dst[2]
 114         movl    (%ebx), %eax
 115
 116         movl    %edx, 12(%ecx)  C dst[3]
 117
 118         mull    4(%ebx)         C src[0]*src[1]
 119
 120         popl    %ebx
 121
 122         addl    %eax, 4(%ecx)
 123         adcl    %edx, 8(%ecx)
 124         adcl    $0, 12(%ecx)
 125         ASSERT(nc)
 126
 127         addl    %eax, 4(%ecx)
 128         adcl    %edx, 8(%ecx)
 129         adcl    $0, 12(%ecx)
 130         ASSERT(nc)
 131
 132         ret
 133
 134
 135 C------------------------------------------------------------------------------
 136 defframe(SAVE_EBX,  -4)
 137 defframe(SAVE_ESI,  -8)
 138 defframe(SAVE_EDI, -12)
 139 defframe(SAVE_EBP, -16)
 140 deflit(STACK_SPACE, 16)
 141
 142 L(three_or_more):
 143         subl    $STACK_SPACE, %esp
 144         cmpl    $4, %ecx
 145         jae     L(four_or_more)
 146 deflit(`FRAME',STACK_SPACE)
 147
 148
 149 C------------------------------------------------------------------------------
 150 C Three limbs
 151 C
 152 C Writing out the loads and stores separately at the end of this code comes
 153 C out about 10 cycles faster than using adcls to memory.
 154
 155         C eax   src
 156         C ecx   size
 157         C edx   dst
 158
 159         movl    %ebx, SAVE_EBX
 160         movl    %eax, %ebx      C src
 161         movl    (%eax), %eax
 162
 163         movl    %edx, %ecx      C dst
 164         movl    %esi, SAVE_ESI
 165         movl    %edi, SAVE_EDI
 166
 167         mull    %eax            C src[0] ^ 2
 168
 169         movl    %eax, (%ecx)
 170         movl    4(%ebx), %eax
 171         movl    %edx, 4(%ecx)
 172
 173         mull    %eax            C src[1] ^ 2
 174
 175         movl    %eax, 8(%ecx)
 176         movl    8(%ebx), %eax
 177         movl    %edx, 12(%ecx)
 178
 179         mull    %eax            C src[2] ^ 2
 180
 181         movl    %eax, 16(%ecx)
 182         movl    (%ebx), %eax
 183         movl    %edx, 20(%ecx)
 184
 185         mull    4(%ebx)         C src[0] * src[1]
 186
 187         movl    %eax, %esi
 188         movl    (%ebx), %eax
 189         movl    %edx, %edi
 190
 191         mull    8(%ebx)         C src[0] * src[2]
 192
 193         addl    %eax, %edi
 194         movl    %ebp, SAVE_EBP
 195         movl    $0, %ebp
 196
 197         movl    4(%ebx), %eax
 198         adcl    %edx, %ebp
 199
 200         mull    8(%ebx)         C src[1] * src[2]
 201
 202         xorl    %ebx, %ebx
 203         addl    %eax, %ebp
 204
 205         adcl    $0, %edx
 206
 207         C eax
 208         C ebx   zero, will be dst[5]
 209         C ecx   dst
 210         C edx   dst[4]
 211         C esi   dst[1]
 212         C edi   dst[2]
 213         C ebp   dst[3]
 214
 215         adcl    $0, %edx
 216         addl    %esi, %esi
 217
 218         adcl    %edi, %edi
 219         movl    4(%ecx), %eax
 220
 221         adcl    %ebp, %ebp
 222
 223         adcl    %edx, %edx
 224
 225         adcl    $0, %ebx
 226         addl    %eax, %esi
 227         movl    8(%ecx), %eax
 228
 229         adcl    %eax, %edi
 230         movl    12(%ecx), %eax
 231         movl    %esi, 4(%ecx)
 232
 233         adcl    %eax, %ebp
 234         movl    16(%ecx), %eax
 235         movl    %edi, 8(%ecx)
 236
 237         movl    SAVE_ESI, %esi
 238         movl    SAVE_EDI, %edi
 239
 240         adcl    %eax, %edx
 241         movl    20(%ecx), %eax
 242         movl    %ebp, 12(%ecx)
 243
 244         adcl    %ebx, %eax
 245         ASSERT(nc)
 246         movl    SAVE_EBX, %ebx
 247         movl    SAVE_EBP, %ebp
 248
 249         movl    %edx, 16(%ecx)
 250         movl    %eax, 20(%ecx)
 251         addl    $FRAME, %esp
 252
 253         ret
 254
 255
 256 C------------------------------------------------------------------------------
 257 L(four_or_more):
 258
 259 C First multiply src[0]*src[1..size-1] and store at dst[1..size].
 260 C Further products are added in rather than stored.
 261
 262         C eax   src
 263         C ebx
 264         C ecx   size
 265         C edx   dst
 266         C esi
 267         C edi
 268         C ebp
 269
 270 defframe(`VAR_COUNTER',-20)
 271 defframe(`VAR_JMP',    -24)
 272 deflit(EXTRA_STACK_SPACE, 8)
 273
 274         movl    %ebx, SAVE_EBX
 275         movl    %edi, SAVE_EDI
 276         leal    (%edx,%ecx,4), %edi     C &dst[size]
 277
 278         movl    %esi, SAVE_ESI
 279         movl    %ebp, SAVE_EBP
 280         leal    (%eax,%ecx,4), %esi     C &src[size]
 281
 282         movl    (%eax), %ebp            C multiplier
 283         movl    $0, %ebx
 284         decl    %ecx
 285
 286         negl    %ecx
 287         subl    $EXTRA_STACK_SPACE, %esp
 288 FRAME_subl_esp(EXTRA_STACK_SPACE)
 289
 290 L(mul_1):
 291         C eax   scratch
 292         C ebx   carry
 293         C ecx   counter
 294         C edx   scratch
 295         C esi   &src[size]
 296         C edi   &dst[size]
 297         C ebp   multiplier
 298
 299         movl    (%esi,%ecx,4), %eax
 300
 301         mull    %ebp
 302
 303         addl    %ebx, %eax
 304         movl    %eax, (%edi,%ecx,4)
 305         movl    $0, %ebx
 306
 307         adcl    %edx, %ebx
 308         incl    %ecx
 309         jnz     L(mul_1)
 310
 311
 312 C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
 313 C
 314 C The last two products, which are the bottom right corner of the product
 315 C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
 316 C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
 317 C cases that need to be done.
 318 C
 319 C The unrolled code is the same as in mpn_addmul_1, see that routine for
 320 C some comments.
 321 C
 322 C VAR_COUNTER is the outer loop, running from -size+4 to -1, inclusive.
 323 C
 324 C VAR_JMP is the computed jump into the unrolled code, stepped by one code
 325 C chunk each outer loop.
 326 C
 327 C K7 does branch prediction on indirect jumps, which is bad since it's a
 328 C different target each time.  There seems no way to avoid this.
 329
 330 dnl  This value also hard coded in some shifts and adds
 331 deflit(CODE_BYTES_PER_LIMB, 17)
 332
 333 dnl  With the unmodified &src[size] and &dst[size] pointers, the
 334 dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
 335 dnl  values up to 31, but above that an offset must be added to them.
 336
 337 deflit(OFFSET,
 338 ifelse(eval(UNROLL_COUNT>31),1,
 339 eval((UNROLL_COUNT-31)*4),
 340 0))
 341
 342 dnl  Because the last chunk of code is generated differently, a label placed
 343 dnl  at the end doesn't work.  Instead calculate the implied end using the
 344 dnl  start and how many chunks of code there are.
 345
 346 deflit(UNROLL_INNER_END,
 347 `L(unroll_inner_start)+eval(UNROLL_COUNT*CODE_BYTES_PER_LIMB)')
 348
 349         C eax
 350         C ebx   carry
 351         C ecx
 352         C edx
 353         C esi   &src[size]
 354         C edi   &dst[size]
 355         C ebp
 356
 357         movl    PARAM_SIZE, %ecx
 358         movl    %ebx, (%edi)
 359
 360         subl    $4, %ecx
 361         jz      L(corner)
 362
 363         negl    %ecx
 364 ifelse(OFFSET,0,,`subl  $OFFSET, %edi')
 365 ifelse(OFFSET,0,,`subl  $OFFSET, %esi')
 366
 367         movl    %ecx, %edx
 368         shll    $4, %ecx
 369
 370 ifdef(`PIC',`
 371         call    L(pic_calc)
 372 L(here):
 373 ',`
 374         leal    UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
 375 ')
 376
 377
 378         C The calculated jump mustn't come out to before the start of the
 379         C code available.  This is the limit UNROLL_COUNT puts on the src
 380         C operand size, but checked here directly using the jump address.
 381         ASSERT(ae,
 382         `movl_text_address(L(unroll_inner_start), %eax)
 383         cmpl    %eax, %ecx')
 384
 385
 386 C------------------------------------------------------------------------------
 387         ALIGN(16)
 388 L(unroll_outer_top):
 389         C eax
 390         C ebx   high limb to store
 391         C ecx   VAR_JMP
 392         C edx   VAR_COUNTER, limbs, negative
 393         C esi   &src[size], constant
 394         C edi   dst ptr, high of last addmul
 395         C ebp
 396
 397         movl    -12+OFFSET(%esi,%edx,4), %ebp   C next multiplier
 398         movl    -8+OFFSET(%esi,%edx,4), %eax    C first of multiplicand
 399
 400         movl    %edx, VAR_COUNTER
 401
 402         mull    %ebp
 403
 404 define(cmovX,`ifelse(eval(UNROLL_COUNT%2),0,`cmovz($@)',`cmovnz($@)')')
 405
 406         testb   $1, %cl
 407         movl    %edx, %ebx      C high carry
 408         movl    %ecx, %edx      C jump
 409
 410         movl    %eax, %ecx      C low carry
 411         cmovX(  %ebx, %ecx)     C high carry reverse
 412         cmovX(  %eax, %ebx)     C low carry reverse
 413
 414         leal    CODE_BYTES_PER_LIMB(%edx), %eax
 415         xorl    %edx, %edx
 416         leal    4(%edi), %edi
 417
 418         movl    %eax, VAR_JMP
 419
 420         jmp     *%eax
 421
 422
 423 ifdef(`PIC',`
 424 L(pic_calc):
 425         addl    (%esp), %ecx
 426         addl    $UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)-L(here), %ecx
 427         addl    %edx, %ecx
 428         ret
 429 ')
 430
 431
 432         C Must be an even address to preserve the significance of the low
 433         C bit of the jump address indicating which way around ecx/ebx should
 434         C start.
 435         ALIGN(2)
 436
 437 L(unroll_inner_start):
 438         C eax   next limb
 439         C ebx   carry high
 440         C ecx   carry low
 441         C edx   scratch
 442         C esi   src
 443         C edi   dst
 444         C ebp   multiplier
 445
 446 forloop(`i', UNROLL_COUNT, 1, `
 447         deflit(`disp_src', eval(-i*4 + OFFSET))
 448         deflit(`disp_dst', eval(disp_src - 4))
 449
 450         m4_assert(`disp_src>=-128 && disp_src<128')
 451         m4_assert(`disp_dst>=-128 && disp_dst<128')
 452
 453 ifelse(eval(i%2),0,`
 454 Zdisp(  movl,   disp_src,(%esi), %eax)
 455         adcl    %edx, %ebx
 456
 457         mull    %ebp
 458
 459 Zdisp(  addl,   %ecx, disp_dst,(%edi))
 460         movl    $0, %ecx
 461
 462         adcl    %eax, %ebx
 463
 464 ',`
 465         dnl  this bit comes out last
 466 Zdisp(  movl,   disp_src,(%esi), %eax)
 467         adcl    %edx, %ecx
 468
 469         mull    %ebp
 470
 471 dnl Zdisp(      addl    %ebx, disp_src,(%edi))
 472         addl    %ebx, disp_dst(%edi)
 473 ifelse(forloop_last,0,
 474 `       movl    $0, %ebx')
 475
 476         adcl    %eax, %ecx
 477 ')
 478 ')
 479
 480         C eax   next limb
 481         C ebx   carry high
 482         C ecx   carry low
 483         C edx   scratch
 484         C esi   src
 485         C edi   dst
 486         C ebp   multiplier
 487
 488         adcl    $0, %edx
 489         addl    %ecx, -4+OFFSET(%edi)
 490         movl    VAR_JMP, %ecx
 491
 492         adcl    $0, %edx
 493
 494         movl    %edx, m4_empty_if_zero(OFFSET) (%edi)
 495         movl    VAR_COUNTER, %edx
 496
 497         incl    %edx
 498         jnz     L(unroll_outer_top)
 499
 500
 501 ifelse(OFFSET,0,,`
 502         addl    $OFFSET, %esi
 503         addl    $OFFSET, %edi
 504 ')
 505
 506
 507 C------------------------------------------------------------------------------
 508 L(corner):
 509         C esi   &src[size]
 510         C edi   &dst[2*size-5]
 511
 512         movl    -12(%esi), %ebp
 513         movl    -8(%esi), %eax
 514         movl    %eax, %ecx
 515
 516         mull    %ebp
 517
 518         addl    %eax, -4(%edi)
 519         movl    -4(%esi), %eax
 520
 521         adcl    $0, %edx
 522         movl    %edx, %ebx
 523         movl    %eax, %esi
 524
 525         mull    %ebp
 526
 527         addl    %ebx, %eax
 528
 529         adcl    $0, %edx
 530         addl    %eax, (%edi)
 531         movl    %esi, %eax
 532
 533         adcl    $0, %edx
 534         movl    %edx, %ebx
 535
 536         mull    %ecx
 537
 538         addl    %ebx, %eax
 539         movl    %eax, 4(%edi)
 540
 541         adcl    $0, %edx
 542         movl    %edx, 8(%edi)
 543
 544
 545
 546 C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
 547
 548 L(lshift_start):
 549         movl    PARAM_SIZE, %eax
 550         movl    PARAM_DST, %edi
 551         xorl    %ecx, %ecx              C clear carry
 552
 553         leal    (%edi,%eax,8), %edi
 554         notl    %eax                    C -size-1, preserve carry
 555
 556         leal    2(%eax), %eax           C -(size-1)
 557
 558 L(lshift):
 559         C eax   counter, negative
 560         C ebx
 561         C ecx
 562         C edx
 563         C esi
 564         C edi   dst, pointing just after last limb
 565         C ebp
 566
 567         rcll    -4(%edi,%eax,8)
 568         rcll    (%edi,%eax,8)
 569         incl    %eax
 570         jnz     L(lshift)
 571
 572         setc    %al
 573
 574         movl    PARAM_SRC, %esi
 575         movl    %eax, -4(%edi)          C dst most significant limb
 576
 577         movl    PARAM_SIZE, %ecx
 578
 579
 580 C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
 581 C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
 582 C low limb of src[0]^2.
 583
 584         movl    (%esi), %eax            C src[0]
 585
 586         mull    %eax
 587
 588         leal    (%esi,%ecx,4), %esi     C src point just after last limb
 589         negl    %ecx
 590
 591         movl    %eax, (%edi,%ecx,8)     C dst[0]
 592         incl    %ecx
 593
 594 L(diag):
 595         C eax   scratch
 596         C ebx   scratch
 597         C ecx   counter, negative
 598         C edx   carry
 599         C esi   src just after last limb
 600         C edi   dst just after last limb
 601         C ebp
 602
 603         movl    (%esi,%ecx,4), %eax
 604         movl    %edx, %ebx
 605
 606         mull    %eax
 607
 608         addl    %ebx, -4(%edi,%ecx,8)
 609         adcl    %eax, (%edi,%ecx,8)
 610         adcl    $0, %edx
 611
 612         incl    %ecx
 613         jnz     L(diag)
 614
 615
 616         movl    SAVE_ESI, %esi
 617         movl    SAVE_EBX, %ebx
 618
 619         addl    %edx, -4(%edi)          C dst most significant limb
 620         movl    SAVE_EDI, %edi
 621
 622         movl    SAVE_EBP, %ebp
 623         addl    $FRAME, %esp
 624
 625         ret
 626
 627 EPILOGUE()