1 dnl Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
3 dnl P6: 6.35 cycles/limb (at 16 limbs/loop).
6 dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
8 dnl This file is part of the GNU MP Library.
10 dnl The GNU MP Library is free software; you can redistribute it and/or
11 dnl modify it under the terms of the GNU Lesser General Public License as
12 dnl published by the Free Software Foundation; either version 2.1 of the
13 dnl License, or (at your option) any later version.
15 dnl The GNU MP Library is distributed in the hope that it will be useful,
16 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
17 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 dnl Lesser General Public License for more details.
20 dnl You should have received a copy of the GNU Lesser General Public
21 dnl License along with the GNU MP Library; see the file COPYING.LIB. If
22 dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
23 dnl Suite 330, Boston, MA 02111-1307, USA.
26 include(`../config.m4')
29 dnl P6 UNROLL_COUNT cycles/limb
34 dnl Maximum possible with the current code is 64.
dnl Number of limbs covered by one pass of the unrolled loop. The speed
dnl quoted at the top of the file is for this setting of 16 limbs/loop.
36 deflit(UNROLL_COUNT, 16)
dnl Select the function names and the descriptive words used in the comments
dnl for whichever operation is being built: OPERATION_addmul_1 gives the
dnl mpn_addmul_1 names and OPERATION_submul_1 gives the mpn_submul_1 names.
dnl If neither symbol is defined, m4_error is invoked.
39 ifdef(`OPERATION_addmul_1', `
41 define(M4_function_1, mpn_addmul_1)
42 define(M4_function_1c, mpn_addmul_1c)
43 define(M4_description, add it to)
44 define(M4_desc_retval, carry)
45 ',`ifdef(`OPERATION_submul_1', `
47 define(M4_function_1, mpn_submul_1)
48 define(M4_function_1c, mpn_submul_1c)
49 define(M4_description, subtract it from)
50 define(M4_desc_retval, borrow)
51 ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
dnl NOTE(review): presumably lists every entry point this file can be built
dnl to provide - confirm against the definition of MULFUNC_PROLOGUE.
54 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
57 C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
59 C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
60 C mp_limb_t mult, mp_limb_t carry);
62 C Calculate src,size multiplied by mult and M4_description dst,size.
63 C Return the M4_desc_retval limb from the top of the result.
65 C This code is pretty much the same as the K6 code. The unrolled loop is
66 C the same, but there are just a few scheduling tweaks in the setups and the
69 C A number of variations have been tried for the unrolled loop, with one or
70 C two carries, and with loads scheduled earlier, but nothing faster than 6
71 C cycles/limb has been found.
C The size is compared against UNROLL_THRESHOLD on entry to M4_function_1.
C NOTE(review): the value is set twice to the same number in the lines shown,
C presumably the two arms of a conditional that is not visible here - confirm.
74 deflit(UNROLL_THRESHOLD, 5)
76 deflit(UNROLL_THRESHOLD, 5)
C Stack argument slots, following the order of the C prototype: dst, src,
C size, mult and, for the _1c entry point, carry.
79 defframe(PARAM_CARRY, 20)
80 defframe(PARAM_MULTIPLIER,16)
81 defframe(PARAM_SIZE, 12)
82 defframe(PARAM_SRC, 8)
83 defframe(PARAM_DST, 4)
C Carry-in entry point: load the carry argument into %ebx, the register the
C plain entry point clears as its initial carry, then jump to the start_nc
C label given under M4_function_1.
88 PROLOGUE(M4_function_1c)
91 movl PARAM_CARRY, %ebx
92 jmp LF(M4_function_1,start_nc)
C Plain entry point: there is no carry argument, so %ebx starts at zero.
95 PROLOGUE(M4_function_1)
98 xorl %ebx, %ebx C initial carry
C %ecx = size, compared against UNROLL_THRESHOLD. The branch that consumes
C this comparison is not among the lines shown here; the movl that follows
C the cmpl does not alter the flags.
101 movl PARAM_SIZE, %ecx
112 cmpl $UNROLL_THRESHOLD, %ecx
114 movl PARAM_MULTIPLIER, %ebp
119 C this is offset 0x22, so close enough to aligned
C Apply %eax to the word at -4(%edi), presumably a dst limb - confirm against
C the surrounding loop, which is not visible here.
137 M4_inst %eax, -4(%edi)
158 C------------------------------------------------------------------------------
159 C VAR_JUMP holds the computed jump temporarily because there are not enough
160 C registers when doing the mul for the initial two carry limbs.
162 C The add/adc for the initial carry in %ebx is necessary only for the
163 C mpn_add/submul_1c entry points. Duplicating the startup code to
164 C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
167 dnl overlapping with parameters already fetched
C VAR_COUNTER reuses the PARAM_SIZE stack slot and VAR_JUMP reuses the
C PARAM_DST stack slot.
168 define(VAR_COUNTER,`PARAM_SIZE')
169 define(VAR_JUMP, `PARAM_DST')
171 C this is offset 0x43, so close enough to aligned
C %edx shifted right by UNROLL_LOG2 is stored as the loop counter, and %ecx
C is reduced with UNROLL_MASK. NOTE(review): what %edx and %ecx hold before
C these instructions is not visible in this excerpt.
187 shrl $UNROLL_LOG2, %edx
188 andl $UNROLL_MASK, %ecx
190 movl %edx, VAR_COUNTER
193 C 15 code bytes per limb
C Computed jump target: L(entry) plus the offsets held in %edx and %ecx.
201 leal L(entry) (%edx,%ecx,1), %edx
203 movl (%esi), %eax C src low limb
C With UNROLL_BYTES equal to 256 both pointers are given an extra 128, which
C matches the 128 taken off the displacements in the unrolled loop.
206 leal ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi
210 addl %ebx, %eax C initial carry (from _1c)
213 movl %edx, %ebx C high carry
214 leal ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi
218 movl %eax, %ecx C low carry
220 cmovnz( %ebx, %ecx) C high,low carry other way around
231 C See README.family about old gas bugs
C NOTE(review): this leal and addl also form an address from L(entry), here
C taken relative to L(here); presumably the position independent form of the
C leal further up - confirm.
232 leal (%edx,%ecx,1), %edx
233 addl $L(entry)-L(here), %edx
241 C -----------------------------------------------------------
253 C VAR_COUNTER loop counter
255 C 15 code bytes per limb
C Advance %edi by UNROLL_BYTES, presumably the byte size of one unrolled
C block. %edi addresses the words updated by M4_inst and %esi the words that
C are loaded.
257 addl $UNROLL_BYTES, %edi
C Each pass of the forloop emits code for CHUNK_COUNT limbs, at displacements
C disp0 and disp1 = disp0 + 4. With UNROLL_BYTES equal to 256 the
C displacements are lowered by 128.
260 deflit(CHUNK_COUNT,2)
261 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
262 deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
263 deflit(`disp1', eval(disp0 + 4))
265 Zdisp( movl, disp0,(%esi), %eax)
267 Zdisp( M4_inst,%ecx, disp0,(%edi))
272 movl disp1(%esi), %eax
274 M4_inst %ebx, disp1(%edi)
C Advance %esi by UNROLL_BYTES; leal is used, which leaves the flags alone.
281 leal UNROLL_BYTES(%esi), %esi
C disp0 is redefined as UNROLL_BYTES, less 128 when UNROLL_BYTES is 256, and
C %ecx is applied to the word at that displacement from %edi.
286 deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
288 M4_inst %ecx, disp0(%edi)