1 dnl Intel P6 mpn_sqr_basecase -- square an mpn number.
3 dnl P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular
4 dnl product (measured on the speed difference between 20 and 40 limbs,
5 dnl which is the Karatsuba recursing range).
8 dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
10 dnl This file is part of the GNU MP Library.
12 dnl The GNU MP Library is free software; you can redistribute it and/or
13 dnl modify it under the terms of the GNU Lesser General Public License as
14 dnl published by the Free Software Foundation; either version 2.1 of the
15 dnl License, or (at your option) any later version.
17 dnl The GNU MP Library is distributed in the hope that it will be useful,
18 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
19 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 dnl Lesser General Public License for more details.
22 dnl You should have received a copy of the GNU Lesser General Public
23 dnl License along with the GNU MP Library; see the file COPYING.LIB. If
24 dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
25 dnl Suite 330, Boston, MA 02111-1307, USA.
28 include(`../config.m4')
31 dnl These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for
32 dnl a description. The only difference here is that UNROLL_COUNT can go up
33 dnl to 64 (not 63) making KARATSUBA_SQR_THRESHOLD_MAX 67.
35 deflit(KARATSUBA_SQR_THRESHOLD_MAX, 67)
dnl  A build-system override, when defined, takes precedence over the value
dnl  picked up from gmp-mparam.h by m4_config_gmp_mparam below.
37 ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE',
38 `define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)')
40 m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD')
dnl  UNROLL_COUNT is the threshold less 3, matching the note above that
dnl  UNROLL_COUNT up to 64 corresponds to a threshold max of 67 (presumably
dnl  because the smallest sizes are handled by dedicated code paths -- see
dnl  the 1x1/2x2/3x3 special cases mentioned above).
41 deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3))
44 C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
46 C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
47 C lot of function call overheads are avoided, especially when the given size is small.
50 C The code size might look a bit excessive, but not all of it is executed so
51 C it won't all get into the code cache. The 1x1, 2x2 and 3x3 special cases
52 C clearly apply only to those sizes; mid sizes like 10x10 only need part of
53 C the unrolled addmul; and big sizes like 40x40 that do use the full
54 C unrolling will at least be making good use of it, because 40x40 will take
55 C something like 7000 cycles.
dnl  Byte offsets of the incoming arguments relative to %esp at entry.
dnl  Per the prototype above (dst, src, size) and the cdecl right-to-left
dnl  push order, dst sits nearest the return address at 4(%esp).
57 defframe(PARAM_SIZE,12)
58 defframe(PARAM_SRC, 8)
59 defframe(PARAM_DST, 4)
63 PROLOGUE(mpn_sqr_basecase)
C  Squares {src,size} into {dst,2*size}.  The visible structure: form the
C  cross products src[i]*src[j] (i<j), double them with a left shift of
C  dst[1..2*size-2], then add in the diagonal squares src[i]^2 (see the
C  section comments below).
C  NOTE(review): this copy is a non-contiguous excerpt of the original
C  source -- many lines are missing between the statements below, so the
C  added annotations describe only what is visible and should be checked
C  against the full file.
78 C -----------------------------------------------------------------------------
93 C -----------------------------------------------------------------------------
C  Frame slots (negative offsets) used to preserve the callee-saved
C  registers ebx/esi/edi/ebp around this path.
100 defframe(SAVE_ESI, -4)
101 defframe(SAVE_EBX, -8)
102 defframe(SAVE_EDI, -12)
103 defframe(SAVE_EBP, -16)
104 deflit(`STACK_SPACE',16)
106 subl $STACK_SPACE, %esp
107 deflit(`FRAME',STACK_SPACE)
115 movl %eax, (%ecx) C dst[0]
119 movl %edx, %ebx C dst[1]
124 movl %eax, %edi C dst[2]
128 movl %edx, %ebp C dst[3]
130 mull 4(%esi) C src[0]*src[1]
157 C -----------------------------------------------------------------------------
165 pushl %esi defframe_pushl(`SAVE_ESI')
172 C -----------------------------------------------------------------------------
C  3x3 special case below: the three diagonal squares and the three cross
C  products src[0]*src[1], src[0]*src[2], src[1]*src[2].
183 pushl %ebp defframe_pushl(`SAVE_EBP')
184 pushl %edi defframe_pushl(`SAVE_EDI')
186 mull %eax C src[0] ^ 2
194 mull %eax C src[1] ^ 2
200 pushl %ebx defframe_pushl(`SAVE_EBX')
202 mull %eax C src[2] ^ 2
209 mull 4(%esi) C src[0] * src[1]
216 mull 8(%esi) C src[0] * src[2]
224 mull 8(%esi) C src[1] * src[2]
233 C esi zero, will be dst[5]
271 adcl %esi, %eax C no carry out of this
281 C -----------------------------------------------------------------------------
C  Locals for the unrolled addmul: the outer-loop counter and the computed
C  jump into the unrolled chunks (documented further below).
282 defframe(VAR_COUNTER,-20)
283 defframe(VAR_JMP, -24)
284 deflit(`STACK_SPACE',24)
294 deflit(`FRAME',4) dnl %esi already pushed
296 C First multiply src[0]*src[1..size-1] and store at dst[1..size].
298 subl $STACK_SPACE-FRAME, %esp
299 deflit(`FRAME',STACK_SPACE)
306 subl %edx, %ecx C -(size-1)
309 movl $0, %ebx C initial carry
C  Point esi/edi past the operands so the negative counter in ecx can index
C  upwards through src and dst.
311 leal (%esi,%edx,4), %esi C &src[size]
312 movl %eax, %ebp C multiplier
314 leal -4(%edi,%edx,4), %edi C &dst[size-1]
317 C This loop runs at just over 6 c/l.
322 C ecx counter, limbs, negative, -(size-1) to -1
336 movl %eax, 4(%edi,%ecx,4)
345 C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
347 C The last two addmuls, which are the bottom right corner of the product
348 C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
349 C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
350 C cases that need to be done.
352 C The unrolled code is the same as mpn_addmul_1(), see that routine for some
355 C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
357 C VAR_JMP is the computed jump into the unrolled code, stepped by one code
358 C chunk each outer loop.
360 dnl This is also hard-coded in the address calculation below.
361 deflit(CODE_BYTES_PER_LIMB, 15)
363 dnl With &src[size] and &dst[size-1] pointers, the displacements in the
364 dnl unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above
365 dnl that an offset must be added to them.
367 ifelse(eval(UNROLL_COUNT>32),1,
368 eval((UNROLL_COUNT-32)*4),
379 movl PARAM_SIZE, %ecx
388 ifelse(OFFSET,0,,`subl $OFFSET, %esi')
C  Computed entry point: back off from the end of the unrolled code by two
C  chunks per remaining limb (CODE_BYTES_PER_LIMB bytes per chunk).
394 leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
398 ifelse(OFFSET,0,,`subl $OFFSET, %edi')
400 C The calculated jump mustn't be before the start of the available
401 C code. This is the limit that UNROLL_COUNT puts on the src operand
402 C size, but checked here using the jump address directly.
405 `movl_text_address( L(unroll_inner_start), %eax)
409 C -----------------------------------------------------------------------------
413 C ebx high limb to store
415 C edx VAR_COUNTER, limbs, negative
416 C esi &src[size], constant
417 C edi dst ptr, second highest limb of last addmul
420 movl -12+OFFSET(%esi,%edx,4), %ebp C multiplier
421 movl %edx, VAR_COUNTER
423 movl -8+OFFSET(%esi,%edx,4), %eax C first limb of multiplicand
C  cmovX resolves to cmovz or cmovnz depending on UNROLL_COUNT parity, so
C  the ecx/ebx carry-register swap below matches the entry chunk's parity.
427 define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')')
431 movl %edx, %ebx C high carry
434 movl %ecx, %edx C jump
436 movl %eax, %ecx C low carry
437 leal CODE_BYTES_PER_LIMB(%edx), %edx
439 cmovX( %ebx, %ecx) C high carry reverse
440 cmovX( %eax, %ebx) C low carry reverse
445 C Must be on an even address here so the low bit of the jump address
446 C will indicate which way around ecx/ebx should start.
450 L(unroll_inner_start):
459 C 15 code bytes each limb
460 C ecx/ebx reversed on each chunk
C  Each iteration of this forloop emits one fixed-size code chunk; the
C  m4_asserts guarantee the byte displacements stay within signed-8-bit
C  range so every chunk really is CODE_BYTES_PER_LIMB bytes.
462 forloop(`i', UNROLL_COUNT, 1, `
463 deflit(`disp_src', eval(-i*4 + OFFSET))
464 deflit(`disp_dst', eval(disp_src))
466 m4_assert(`disp_src>=-128 && disp_src<128')
467 m4_assert(`disp_dst>=-128 && disp_dst<128')
470 Zdisp( movl, disp_src,(%esi), %eax)
472 Zdisp( addl, %ebx, disp_dst,(%edi))
477 dnl this one comes out last
478 Zdisp( movl, disp_src,(%esi), %eax)
480 Zdisp( addl, %ecx, disp_dst,(%edi))
488 addl %ebx, m4_empty_if_zero(OFFSET)(%edi)
490 movl VAR_COUNTER, %edx
493 movl %ecx, m4_empty_if_zero(OFFSET+4)(%edi)
497 jnz L(unroll_outer_top)
506 C -----------------------------------------------------------------------------
541 movl PARAM_SIZE, %ecx
552 C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
554 subl $1, %ecx C size-1
555 xorl %eax, %eax C ready for final adcl, and clear carry
564 C ecx counter, size-1 to 1
565 C edx size-1 (for later use)
566 C esi src (for later use)
567 C edi dst, incrementing
580 movl %eax, 4(%edi) C dst most significant limb
581 movl (%esi), %eax C src[0]
583 leal 4(%esi,%edx,4), %esi C &src[size]
584 subl %edx, %ecx C -(size-1)
587 C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
588 C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
589 C low limb of src[0]^2.
594 movl %eax, (%edi,%ecx,8) C dst[0]
600 C ecx counter, negative
606 movl (%esi,%ecx,4), %eax
611 addl %ebx, 4(%edi,%ecx,8)
612 adcl %eax, 8(%edi,%ecx,8)
622 addl %edx, 4(%edi) C dst most significant limb
631 C -----------------------------------------------------------------------------
C  NOTE(review): distance calculation into the unrolled code relative to
C  L(here) -- the surrounding context is not visible in this excerpt;
C  confirm against the full file.
635 addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx