ghc/rts/gmp/mpn/x86/pentium/sqr_basecase.asm

   1 dnl  Intel P5 mpn_sqr_basecase -- square an mpn number.
   2 dnl
   3 dnl  P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
   4 dnl  product at around 20x20 limbs.
   5
   6
   7 dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
   8 dnl
   9 dnl  This file is part of the GNU MP Library.
  10 dnl
  11 dnl  The GNU MP Library is free software; you can redistribute it and/or
  12 dnl  modify it under the terms of the GNU Lesser General Public License as
  13 dnl  published by the Free Software Foundation; either version 2.1 of the
  14 dnl  License, or (at your option) any later version.
  15 dnl
  16 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  17 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19 dnl  Lesser General Public License for more details.
  20 dnl
  21 dnl  You should have received a copy of the GNU Lesser General Public
  22 dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
  23 dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
  24 dnl  Suite 330, Boston, MA 02111-1307, USA.
  25
  26
  27 include(`../config.m4')
  28
  29
  30 C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
  31 C
  32 C Calculate src,size squared, storing the result in dst,2*size.
  33 C
  34 C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
  35 C lot of function call overheads are avoided, especially when the size is
  36 C small.
     C
     C The entry code dispatches on size: exactly 2 goes to L(two_limbs), more
     C than 2 (unsigned compare) goes to L(three_or_more), and anything else
     C falls through to the one limb code.  Size 0 is not checked for and would
     C also take the one limb path, reading src[0] and writing dst[0..1] --
     C presumably callers always pass size >= 1; confirm against the mpn
     C interface documentation.
  37
     C Parameter slots.  defframe comes from the included m4 setup and is not
     C visible here; the numbers look like byte offsets from the return address
     C at function entry -- confirm against its definition.
  38 defframe(PARAM_SIZE,12)
  39 defframe(PARAM_SRC, 8)
  40 defframe(PARAM_DST, 4)
  41
  42         .text
  43         ALIGN(8)
  44 PROLOGUE(mpn_sqr_basecase)
  45 deflit(`FRAME',0)
  46
  47         movl    PARAM_SIZE, %edx        C edx = size
  48         movl    PARAM_SRC, %eax         C eax = src pointer
  49
  50         cmpl    $2, %edx                C flags used by both je and ja below
  51         movl    PARAM_DST, %ecx         C ecx = dst pointer
  52
  53         je      L(two_limbs)            C size == 2, eax is still the src pointer
  54
  55         movl    (%eax), %eax            C eax = src[0], movl leaves the cmpl flags intact
  56         ja      L(three_or_more)        C size > 2
  57
  58 C -----------------------------------------------------------------------------
  59 C one limb only
  60         C eax   src[0] (low limb, loaded above)
  61         C ebx
  62         C ecx   dst
  63         C edx   size
  64
  65         mull    %eax                    C edx:eax = src[0]^2
  66
  67         movl    %eax, (%ecx)            C dst[0] = low limb
  68         movl    %edx, 4(%ecx)           C dst[1] = high limb
  69
  70         ret
  71
  72 C -----------------------------------------------------------------------------
  73         ALIGN(8)
  74 L(two_limbs):
  75         C eax   src
  76         C ebx
  77         C ecx   dst
  78         C edx   size
             C
             C The two squares are formed first, giving dst[0] (stored straight
             C away) and dst[1..3] held in esi,edi,ebp.  The cross product
             C src[0]*src[1] is then added into dst[1..3] twice, instead of
             C being doubled and added once.
             C
             C No PARAM_* reference follows the pushes below, and FRAME is not
             C adjusted for them.
  79
  80         pushl   %ebp
  81         pushl   %edi
  82
  83         pushl   %esi
  84         pushl   %ebx
  85
  86         movl    %eax, %ebx      C ebx = src pointer, eax is needed by mull
  87         movl    (%eax), %eax    C eax = src[0]
  88
  89         mull    %eax            C src[0]^2
  90
  91         movl    %eax, (%ecx)    C dst[0]
  92         movl    %edx, %esi      C dst[1]
  93
  94         movl    4(%ebx), %eax   C eax = src[1]
  95
  96         mull    %eax            C src[1]^2
  97
  98         movl    %eax, %edi      C dst[2]
  99         movl    %edx, %ebp      C dst[3]
 100
 101         movl    (%ebx), %eax    C eax = src[0]
 102
 103         mull    4(%ebx)         C src[0]*src[1]
 104
 105         addl    %eax, %esi      C first addition of the cross product: low limb into dst[1]
 106         popl    %ebx            C src pointer no longer needed; popl and movl leave carry alone
 107
 108         adcl    %edx, %edi      C high limb plus carry into dst[2]
 109
 110         adcl    $0, %ebp        C carry into dst[3]
 111         addl    %esi, %eax      C second addition: eax = final dst[1]
 112
 113         adcl    %edi, %edx      C edx = final dst[2]
 114         movl    %eax, 4(%ecx)   C store dst[1]
 115
 116         adcl    $0, %ebp        C ebp = final dst[3]
 117         popl    %esi
 118
 119         movl    %edx, 8(%ecx)   C store dst[2]
 120         movl    %ebp, 12(%ecx)  C store dst[3]
 121
 122         popl    %edi
 123         popl    %ebp
 124
 125         ret
 126
 127
 128 C -----------------------------------------------------------------------------
 129         ALIGN(8)
 130 L(three_or_more):
 131         C eax   src low limb
 132         C ebx
 133         C ecx   dst
 134         C edx   size
             C
             C Reached with size > 2.  Saves ebx, reloads the src pointer into
             C it, and sends size >= 4 to L(four_or_more); size == 3 falls
             C through to the straight-line code below.
 135
 136         cmpl    $4, %edx        C flags consumed by the jae below
 137         pushl   %ebx
 138 deflit(`FRAME',4)
 139
 140         movl    PARAM_SRC, %ebx C ebx = src pointer, cmpl flags are untouched by pushl and movl
 141         jae     L(four_or_more)
 142
 143
 144 C -----------------------------------------------------------------------------
 145 C three limbs
 146         C eax   src low limb
 147         C ebx   src
 148         C ecx   dst
 149         C edx   size
             C
             C The three squares are stored directly into dst[0..5].  The three
             C cross products are accumulated in registers as a four limb value
             C for dst[1..4], doubled in place (the bit shifted out is kept for
             C dst[5]), and then added to the squares already in dst[1..5].
 150
 151         pushl   %ebp
 152         pushl   %edi
 153
 154         mull    %eax            C src[0] ^ 2
 155
 156         movl    %eax, (%ecx)    C dst[0]
 157         movl    %edx, 4(%ecx)   C dst[1]
 158
 159         movl    4(%ebx), %eax   C eax = src[1]
 160         xorl    %ebp, %ebp      C ebp = 0 (it is rewritten from edx below before any read)
 161
 162         mull    %eax            C src[1] ^ 2
 163
 164         movl    %eax, 8(%ecx)   C dst[2]
 165         movl    %edx, 12(%ecx)  C dst[3]
 166
 167         movl    8(%ebx), %eax   C eax = src[2]
 168         pushl   %esi            C risk of cache bank clash
 169
 170         mull    %eax            C src[2] ^ 2
 171
 172         movl    %eax, 16(%ecx)  C dst[4]
 173         movl    %edx, 20(%ecx)  C dst[5]
 174
 175         movl    (%ebx), %eax    C eax = src[0]
 176
 177         mull    4(%ebx)         C src[0] * src[1]
 178
 179         movl    %eax, %esi      C esi = cross term limb for dst[1]
 180         movl    %edx, %edi      C edi = cross term limb for dst[2] so far
 181
 182         movl    (%ebx), %eax    C eax = src[0]
 183
 184         mull    8(%ebx)         C src[0] * src[2]
 185
 186         addl    %eax, %edi      C low limb into the dst[2] term
 187         movl    %edx, %ebp      C ebp = cross term limb for dst[3] so far
 188
 189         adcl    $0, %ebp        C plus carry out of the dst[2] term
 190         movl    4(%ebx), %eax   C eax = src[1]
 191
 192         mull    8(%ebx)         C src[1] * src[2]
 193
 194         xorl    %ebx, %ebx      C src pointer is finished with; must precede the addl as xorl resets carry
 195         addl    %eax, %ebp      C low limb into the dst[3] term
 196
 197         C eax
 198         C ebx   zero, will be dst[5]
 199         C ecx   dst
 200         C edx   dst[4]
 201         C esi   dst[1]
 202         C edi   dst[2]
 203         C ebp   dst[3]
 204
 205         adcl    $0, %edx        C carry from the dst[3] term into the dst[4] term
 206         addl    %esi, %esi      C double the cross terms, carry rippling esi -> edi -> ebp -> edx
 207
 208         adcl    %edi, %edi
 209
 210         adcl    %ebp, %ebp
 211
 212         adcl    %edx, %edx
 213         movl    4(%ecx), %eax   C eax = dst[1] square part (movl keeps the carry flag)
 214
 215         adcl    $0, %ebx        C ebx = bit doubled out of the dst[4] term
 216         addl    %esi, %eax      C start of the add chain: dst[1] += doubled term
 217
 218         movl    %eax, 4(%ecx)   C store dst[1]
 219         movl    8(%ecx), %eax   C eax = dst[2] square part
 220
 221         adcl    %edi, %eax      C dst[2] += doubled term + carry
 222         movl    12(%ecx), %esi  C esi reused for dst[3] square part
 223
 224         adcl    %ebp, %esi      C dst[3] += doubled term + carry
 225         movl    16(%ecx), %edi  C edi reused for dst[4] square part
 226
 227         movl    %eax, 8(%ecx)   C store dst[2]
 228         movl    %esi, 12(%ecx)  C store dst[3]
 229
 230         adcl    %edx, %edi      C dst[4] += doubled term + carry
 231         popl    %esi            C the pops and moves from here on do not disturb the carry flag
 232
 233         movl    20(%ecx), %eax  C eax = dst[5] square part
 234         movl    %edi, 16(%ecx)  C store dst[4]
 235
 236         popl    %edi
 237         popl    %ebp
 238
 239         adcl    %ebx, %eax      C no carry out of this
 240         popl    %ebx
 241
 242         movl    %eax, 20(%ecx)  C store dst[5]
 243
 244         ret
 245
 246
 247 C -----------------------------------------------------------------------------
 248         ALIGN(8)
 249 L(four_or_more):
 250         C eax   src low limb
 251         C ebx   src
 252         C ecx   dst
 253         C edx   size
 254         C esi
 255         C edi
 256         C ebp
 257         C
 258         C First multiply src[0]*src[1..size-1] and store at dst[1..size].
             C
             C ebx was already pushed at L(three_or_more), hence FRAME starts
             C at 4 here.  FRAME_pushl is defined outside this file; it
             C presumably bumps FRAME after each push so that the PARAM_*
             C references further down still resolve -- confirm in the m4
             C setup.
 259
 260 deflit(`FRAME',4)
 261
 262         pushl   %edi
 263 FRAME_pushl()
 264         pushl   %esi
 265 FRAME_pushl()
 266
 267         pushl   %ebp
 268 FRAME_pushl()
 269         leal    (%ecx,%edx,4), %edi     C dst end of this mul1
 270
 271         leal    (%ebx,%edx,4), %esi     C src end
 272         movl    %ebx, %ebp              C src
 273
 274         negl    %edx                    C -size
 275         xorl    %ebx, %ebx              C clear carry limb and carry flag
 276
 277         leal    1(%edx), %ecx           C -(size-1)
 278
 279 L(mul1):
 280         C eax   scratch
 281         C ebx   carry
 282         C ecx   counter, negative
 283         C edx   scratch
 284         C esi   &src[size]
 285         C edi   &dst[size]
 286         C ebp   src
             C
             C Each pass forms src[size+ecx] * src[0], adds the carry limb,
             C and stores one limb at dst[size+ecx]; the first pass handles
             C src[1] and dst[1].  The carry flag from the addl is deliberately
             C left pending across the loop branch and absorbed by the adcl at
             C the top of the next pass (or by the adcl after the loop).
 287
 288         adcl    $0, %ebx                C absorb carry flag left by the previous addl
 289         movl    (%esi,%ecx,4), %eax     C eax = src[size+ecx]
 290
 291         mull    (%ebp)                  C edx:eax = eax * src[0]
 292
 293         addl    %eax, %ebx              C low product + carry limb
 294
 295         movl    %ebx, (%edi,%ecx,4)     C dst[size+ecx]
 296         incl    %ecx                    C sets zero flag for the jnz, leaves carry flag alone
 297
 298         movl    %edx, %ebx              C high product becomes the next carry limb
 299         jnz     L(mul1)
 300
 301
 302         C Add products src[n]*src[n+1..size-1] at dst[2*n+1...], for
 303         C n=1..size-2.
 304         C
 305         C The last two products, which are the end corner of the product
 306         C triangle, are handled separately to save looping overhead.  These
 307         C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
 308         C If size is 4 then it's only these that need to be done.
 309         C
 310         C In the outer loop %esi is a constant, and %edi just advances by 1
 311         C limb each time.  The size of the operation decreases by 1 limb
 312         C each time.
 313
 314         C eax
 315         C ebx   carry (needing carry flag added)
 316         C ecx
 317         C edx
 318         C esi   &src[size]
 319         C edi   &dst[size]
 320         C ebp   src
 321
 322         adcl    $0, %ebx                C finish the carry limb of the mul1 loop
 323         movl    PARAM_SIZE, %edx
 324
 325         movl    %ebx, (%edi)            C dst[size] = top limb of src[0]*src[1..size-1]
 326         subl    $4, %edx
 327
 328         negl    %edx                    C edx = -(size-4), number of outer passes negated
 329         jz      L(corner)               C size == 4: only the corner products remain
 330
 331
 332 L(outer):
 333         C ebx   previous carry limb to store
 334         C edx   outer loop counter (negative)
 335         C esi   &src[size]
 336         C edi   dst, pointing at stored carry limb of previous loop
             C
             C The counter pushed here is popped again before any PARAM_*
             C reference, so FRAME is not adjusted for it.
 337
 338         pushl   %edx                    C new outer loop counter
 339         leal    -2(%edx), %ecx          C inner counter: two more limbs than outer passes left
 340
 341         movl    %ebx, (%edi)            C store carry limb of the previous pass
 342         addl    $4, %edi                C dst end moves up one limb
 343
 344         addl    $4, %ebp                C next multiplier limb src[n]
 345         xorl    %ebx, %ebx              C initial carry limb, clear carry flag
 346
 347 L(inner):
 348         C eax   scratch
 349         C ebx   carry (needing carry flag added)
 350         C ecx   counter, negative
 351         C edx   scratch
 352         C esi   &src[size]
 353         C edi   dst end of this addmul
 354         C ebp   &src[j]
             C
             C One limb of multiply-and-accumulate per pass: the product of
             C src[size+ecx] and the limb at ebp, plus the carry limb, is
             C added to the dst limb at (edi,ecx,4).  As in the mul1 loop the
             C carry flag of the final addl is left pending for the adcl at
             C the top of the next pass.
 355
 356         adcl    $0, %ebx                C absorb carry flag left by the previous pass
 357         movl    (%esi,%ecx,4), %eax     C eax = src[size+ecx]
 358
 359         mull    (%ebp)                  C edx:eax = eax * multiplier limb
 360
 361         addl    %ebx, %eax              C low product + carry limb
 362         movl    (%edi,%ecx,4), %ebx     C ebx = existing dst limb
 363
 364         adcl    $0, %edx                C carry into high product
 365         addl    %eax, %ebx              C accumulate into the dst limb
 366
 367         movl    %ebx, (%edi,%ecx,4)
 368         incl    %ecx                    C sets zero flag for the jnz, leaves carry flag alone
 369
 370         movl    %edx, %ebx              C high product becomes the next carry limb
 371         jnz     L(inner)
 372
 373
 374         adcl    $0, %ebx                C finish the carry limb of this pass
 375         popl    %edx            C outer loop counter
 376
 377         incl    %edx
 378         jnz     L(outer)
 379
 380
 381         movl    %ebx, (%edi)            C store carry limb of the final pass
 382
 383 L(corner):
 384         C esi   &src[size]
 385         C edi   &dst[2*size-4]
             C
             C The last three cross products, done without a loop.  The first
             C two are accumulated into the existing limbs at -4(%edi) and
             C (%edi); the third supplies 4(%edi) and 8(%edi), which are
             C written here for the first time.
 386
 387         movl    -8(%esi), %eax          C eax = src[size-2]
 388         movl    -4(%edi), %ebx          C risk of data cache bank clash here
 389
 390         mull    -12(%esi)               C src[size-2]*src[size-3]
 391
 392         addl    %eax, %ebx              C accumulate low limb into dst[2*size-5]
 393         movl    %edx, %ecx              C ecx = carry limb
 394
 395         adcl    $0, %ecx
 396         movl    -4(%esi), %eax          C eax = src[size-1]
 397
 398         mull    -12(%esi)               C src[size-1]*src[size-3]
 399
 400         addl    %ecx, %eax              C low product + carry limb
 401         movl    (%edi), %ecx            C ecx = existing dst[2*size-4]
 402
 403         adcl    $0, %edx                C carry into high product
 404         movl    %ebx, -4(%edi)          C store dst[2*size-5]
 405
 406         addl    %eax, %ecx              C accumulate into dst[2*size-4]
 407         movl    %edx, %ebx              C ebx = carry limb
 408
 409         adcl    $0, %ebx
 410         movl    -4(%esi), %eax          C eax = src[size-1]
 411
 412         mull    -8(%esi)                C src[size-1]*src[size-2]
 413
 414         movl    %ecx, 0(%edi)           C store dst[2*size-4]
 415         addl    %eax, %ebx              C low product + carry limb
 416
 417         adcl    $0, %edx                C carry into high product
 418         movl    PARAM_SIZE, %eax
 419
 420         negl    %eax
 421         movl    %ebx, 4(%edi)           C dst[2*size-3]
 422
 423         addl    $1, %eax                C -(size-1) and clear carry
 424         movl    %edx, 8(%edi)           C dst[2*size-2]; movl keeps the carry flag clear for lshift
 425
 426
 427 C -----------------------------------------------------------------------------
 428 C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
 429
 430 L(lshift):
 431         C eax   counter, negative
 432         C ebx   next limb
 433         C ecx   limb after that
 434         C edx
 435         C esi
 436         C edi   &dst[2*size-4]
 437         C ebp
             C
             C Doubles the accumulated cross products by rotating each limb
             C left one bit through the carry flag.  Two limbs are handled per
             C pass (the counter is scaled by 8), starting at dst[1] and
             C dst[2], for size-1 passes.  The carry flag links one limb to
             C the next, and is preserved between passes because only movl,
             C incl and jnz come after the second rcll.
 438
 439         movl    12(%edi,%eax,8), %ebx   C lower limb of the pair
 440
 441         rcll    %ebx                    C shift left 1, carry in at bottom, top bit out to carry
 442         movl    16(%edi,%eax,8), %ecx   C upper limb of the pair
 443
 444         rcll    %ecx
 445         movl    %ebx, 12(%edi,%eax,8)
 446
 447         movl    %ecx, 16(%edi,%eax,8)
 448         incl    %eax                    C sets zero flag for the jnz, leaves carry flag alone
 449
 450         jnz     L(lshift)
 451
 452
 453         adcl    %eax, %eax              C high bit out
 454         movl    PARAM_SRC, %esi         C esi = src pointer, reloaded for the diagonal pass
 455
 456         movl    PARAM_SIZE, %ecx        C risk of cache bank clash
 457         movl    %eax, 12(%edi)          C dst most significant limb
 458
 459
 460 C -----------------------------------------------------------------------------
 461 C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
 462 C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
 463 C low limb of src[0]^2.
     C
     C On entry esi is the src pointer, ecx is size and edi is &dst[2*size-4],
     C as left by the code just above.
 464
 465         movl    (%esi), %eax            C src[0]
 466         leal    (%esi,%ecx,4), %esi     C src end
 467
 468         negl    %ecx                    C -size
 469
 470         mull    %eax                    C edx:eax = src[0]^2
 471
 472         movl    %eax, 16(%edi,%ecx,8)   C dst[0]
 473         movl    %edx, %ebx              C high limb is the first carry limb
 474
 475         addl    $1, %ecx                C -(size-1) and clear carry
 476
 477 L(diag):
 478         C eax   scratch (low product)
 479         C ebx   carry limb
 480         C ecx   counter, negative
 481         C edx   scratch (high product)
 482         C esi   &src[size]
 483         C edi   &dst[2*size-4]
 484         C ebp   scratch (fetched dst limbs)
             C
             C With i = size+ecx, each pass squares src[i], adds the high limb
             C of the previous square (in ebx) to dst[2*i-1], and adds the low
             C limb of this square plus carry to dst[2*i].  The carry flag out
             C of that last adcl is left pending and absorbed by the adcl at the
             C top of the next pass, which is placed before the mull.
 485
 486         movl    (%esi,%ecx,4), %eax     C eax = src[i]
 487         adcl    $0, %ebx                C absorb carry flag left by the previous pass
 488
 489         mull    %eax                    C edx:eax = src[i]^2
 490
 491         movl    16-4(%edi,%ecx,8), %ebp C ebp = dst[2*i-1]
 492
 493         addl    %ebp, %ebx              C add high limb of the previous square
 494         movl    16(%edi,%ecx,8), %ebp   C ebp = dst[2*i]
 495
 496         adcl    %eax, %ebp              C add low limb of this square plus carry
 497         movl    %ebx, 16-4(%edi,%ecx,8)
 498
 499         movl    %ebp, 16(%edi,%ecx,8)
 500         incl    %ecx                    C sets zero flag for the jnz, leaves carry flag alone
 501
 502         movl    %edx, %ebx              C high limb becomes the next carry limb
 503         jnz     L(diag)
 504
 505
 506         adcl    $0, %edx                C absorb carry flag left by the final pass
 507         movl    16-4(%edi), %eax        C dst most significant limb
 508
 509         addl    %eax, %edx              C add high limb of the last square
 510         popl    %ebp
 511
 512         movl    %edx, 16-4(%edi)        C store dst[2*size-1]
 513         popl    %esi            C risk of cache bank clash
 514
 515         popl    %edi
 516         popl    %ebx                    C saved back at L(three_or_more)
 517
 518         ret
 519
 520 EPILOGUE()