1 dnl Intel P5 mpn_sqr_basecase -- square an mpn number.
3 dnl P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
4 dnl product at around 20x20 limbs.
7 dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
9 dnl This file is part of the GNU MP Library.
11 dnl The GNU MP Library is free software; you can redistribute it and/or
12 dnl modify it under the terms of the GNU Lesser General Public License as
13 dnl published by the Free Software Foundation; either version 2.1 of the
14 dnl License, or (at your option) any later version.
16 dnl The GNU MP Library is distributed in the hope that it will be useful,
17 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
18 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 dnl Lesser General Public License for more details.
21 dnl You should have received a copy of the GNU Lesser General Public
22 dnl License along with the GNU MP Library; see the file COPYING.LIB. If
23 dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
24 dnl Suite 330, Boston, MA 02111-1307, USA.
27 include(`../config.m4')
30 C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
32 C Calculate src,size squared, storing the result in dst,2*size.
34 C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
35 C lot of function call overheads are avoided, especially when the size is small.
38 defframe(PARAM_SIZE,12)
39 defframe(PARAM_SRC, 8)
40 defframe(PARAM_DST, 4)
44 PROLOGUE(mpn_sqr_basecase)
58 C -----------------------------------------------------------------------------
72 C -----------------------------------------------------------------------------
91 movl %eax, (%ecx) C dst[0]
92 movl %edx, %esi C dst[1]
98 movl %eax, %edi C dst[2]
99 movl %edx, %ebp C dst[3]
103 mull 4(%ebx) C src[0]*src[1]
128 C -----------------------------------------------------------------------------
144 C -----------------------------------------------------------------------------
154 mull %eax C src[0] ^ 2
162 mull %eax C src[1] ^ 2
168 pushl %esi C risk of cache bank clash
170 mull %eax C src[2] ^ 2
177 mull 4(%ebx) C src[0] * src[1]
184 mull 8(%ebx) C src[0] * src[2]
192 mull 8(%ebx) C src[1] * src[2]
198 C ebx zero, will be dst[5]
239 adcl %ebx, %eax C no carry out of this
247 C -----------------------------------------------------------------------------
258 C First multiply src[0]*src[1..size-1] and store at dst[1..size].
269 leal (%ecx,%edx,4), %edi C dst end of this mul1
271 leal (%ebx,%edx,4), %esi C src end
272 movl %ebx, %ebp C src
275 xorl %ebx, %ebx C clear carry limb and carry flag
277 leal 1(%edx), %ecx C -(size-1)
282 C ecx counter, negative
289 movl (%esi,%ecx,4), %eax
295 movl %ebx, (%edi,%ecx,4)
302 C Add products src[n]*src[n+1..size-1] at dst[2*n+1...], for n=1..size-2.
305 C The last two products, which are the end corner of the product
306 C triangle, are handled separately to save looping overhead. These
307 C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
308 C If size is 4 then it's only these that need to be done.
310 C In the outer loop %esi is a constant, and %edi just advances by 1
311 C limb each time. The size of the operation decreases by 1 limb each time.
315 C ebx carry (needing carry flag added)
323 movl PARAM_SIZE, %edx
333 C ebx previous carry limb to store
334 C edx outer loop counter (negative)
336 C edi dst, pointing at stored carry limb of previous loop
338 pushl %edx C new outer loop counter
345 xorl %ebx, %ebx C initial carry limb, clear carry flag
349 C ebx carry (needing carry flag added)
350 C ecx counter, negative
353 C edi dst end of this addmul
357 movl (%esi,%ecx,4), %eax
362 movl (%edi,%ecx,4), %ebx
367 movl %ebx, (%edi,%ecx,4)
375 popl %edx C outer loop counter
388 movl -4(%edi), %ebx C risk of data cache bank clash here
390 mull -12(%esi) C src[size-2]*src[size-3]
398 mull -12(%esi) C src[size-1]*src[size-3]
412 mull -8(%esi) C src[size-1]*src[size-2]
418 movl PARAM_SIZE, %eax
423 addl $1, %eax C -(size-1) and clear carry
427 C -----------------------------------------------------------------------------
428 C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
431 C eax counter, negative
439 movl 12(%edi,%eax,8), %ebx
442 movl 16(%edi,%eax,8), %ecx
445 movl %ebx, 12(%edi,%eax,8)
447 movl %ecx, 16(%edi,%eax,8)
453 adcl %eax, %eax C high bit out
456 movl PARAM_SIZE, %ecx C risk of cache bank clash
457 movl %eax, 12(%edi) C dst most significant limb
460 C -----------------------------------------------------------------------------
461 C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
462 C src[size-1]^2. dst[0] hasn't been set at all yet, and just gets the
463 C low limb of src[0]^2.
465 movl (%esi), %eax C src[0]
466 leal (%esi,%ecx,4), %esi C src end
472 movl %eax, 16(%edi,%ecx,8) C dst[0]
475 addl $1, %ecx C size-1 and clear carry
478 C eax scratch (low product)
480 C ecx counter, negative
481 C edx scratch (high product)
484 C ebp scratch (fetched dst limbs)
486 movl (%esi,%ecx,4), %eax
491 movl 16-4(%edi,%ecx,8), %ebp
494 movl 16(%edi,%ecx,8), %ebp
497 movl %ebx, 16-4(%edi,%ecx,8)
499 movl %ebp, 16(%edi,%ecx,8)
507 movl 16-4(%edi), %eax C dst most significant limb
512 movl %edx, 16-4(%edi)
513 popl %esi C risk of cache bank clash