1 dnl AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
3 dnl K7: 3.9 cycles/limb.
5 dnl Future: It should be possible to avoid the separate mul after the
6 dnl unrolled loop by moving the movl/adcl to the top.
9 dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
11 dnl This file is part of the GNU MP Library.
13 dnl The GNU MP Library is free software; you can redistribute it and/or
14 dnl modify it under the terms of the GNU Lesser General Public License as
15 dnl published by the Free Software Foundation; either version 2.1 of the
16 dnl License, or (at your option) any later version.
18 dnl The GNU MP Library is distributed in the hope that it will be useful,
19 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
20 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 dnl Lesser General Public License for more details.
23 dnl You should have received a copy of the GNU Lesser General Public
24 dnl License along with the GNU MP Library; see the file COPYING.LIB. If
25 dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
26 dnl Suite 330, Boston, MA 02111-1307, USA.
29 include(`../config.m4')
32 dnl K7: UNROLL_COUNT cycles/limb
38 dnl Maximum possible with the current code is 64.
40 deflit(UNROLL_COUNT, 16) dnl limbs per pass of the unrolled loop: the forloop emits UNROLL_COUNT/CHUNK_COUNT chunks
43 ifdef(`OPERATION_addmul_1',`
45 define(M4_function_1, mpn_addmul_1)
46 define(M4_function_1c, mpn_addmul_1c)
47 define(M4_description, add it to)
48 define(M4_desc_retval, carry)
49 ',`ifdef(`OPERATION_submul_1',`
51 define(M4_function_1, mpn_submul_1)
52 define(M4_function_1c, mpn_submul_1c)
53 define(M4_description, subtract it from)
54 define(M4_desc_retval, borrow)
55 ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
58 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
61 C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t mult);
63 C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
64 C mp_limb_t mult, mp_limb_t carry);
66 C Calculate src,size multiplied by mult and M4_description dst,size.
67 C Return the M4_desc_retval limb from the top of the result.
dnl Unroll threshold plus the named stack offsets used by both entry points.
dnl PARAM_* offsets follow the prototype argument order (dst, src, size, mult,
dnl carry); SAVE_* are negative offsets, i.e. slots below the parameters.
70 deflit(UNROLL_THRESHOLD, 9) dnl NOTE(review): UNROLL_THRESHOLD is set twice in a row; presumably each value belongs to one arm of a conditional -- confirm
72 deflit(UNROLL_THRESHOLD, 6) dnl compared with edx (size-1) at more_than_one_limb
75 defframe(PARAM_CARRY, 20) dnl carry-in limb, read only on the M4_function_1c paths
76 defframe(PARAM_MULTIPLIER,16) dnl mult; slot is reused as VAR_JUMP by the unrolled code
77 defframe(PARAM_SIZE, 12) dnl size; slot is reused as VAR_COUNTER by the unrolled code
78 defframe(PARAM_SRC, 8) dnl src pointer
79 defframe(PARAM_DST, 4) dnl dst pointer
82 defframe(SAVE_EBX, -4) dnl SAVE_* slots: presumably hold the callee-saved registers named, within the SAVE_SIZE area -- confirm
83 defframe(SAVE_ESI, -8)
84 defframe(SAVE_EDI, -12)
85 defframe(SAVE_EBP, -16)
C Entry point without a carry-in argument (see the M4_function_1 prototype).
90 PROLOGUE(M4_function_1)
96 jnz LF(M4_function_1c,start_1) C taken: continue at the start_1 label inside M4_function_1c
101 mull PARAM_MULTIPLIER C edx:eax = eax * mult (unsigned)
C Entry point taking an explicit carry-in limb (PARAM_CARRY).
111 PROLOGUE(M4_function_1c)
112 movl PARAM_SIZE, %edx C edx = size
116 jnz L(more_than_one_limb) C taken: general path for more than one limb
121 mull PARAM_MULTIPLIER C fall-through case: edx:eax = eax * mult (unsigned)
123 addl PARAM_CARRY, %eax C low product limb += carry-in
C General path: load carry and multiplier, reserve the save area, and point
C esi/edi at the ends of src/dst so ebx can be used as an index.
C NOTE(review): the compare with UNROLL_THRESHOLD presumably selects between
C the simple loop below and the unrolled code -- confirm.
134 C offset 0x44 so close enough to aligned
135 L(more_than_one_limb):
136 movl PARAM_CARRY, %ecx C ecx = carry-in limb
141 subl $SAVE_SIZE, %esp C reserve stack, presumably for the SAVE_* slots
146 movl %edx, %ebx C size-1
150 cmpl $UNROLL_THRESHOLD, %edx C size-1 against the unroll threshold
152 movl PARAM_MULTIPLIER, %ebp C ebp = mult
155 movl (%esi), %eax C src low limb
162 leal 4(%esi,%ebx,4), %esi C point one limb past last
163 leal (%edi,%ebx,4), %edi C point at last limb
166 C The movl to load the next source limb is done well ahead of the
167 C mul. This is necessary for full speed, and leads to one limb
168 C handled separately at the end.
184 M4_inst %ecx, (%edi,%ebx,4) C apply ecx to the dst limb indexed by ebx; M4_inst presumably is the add or subtract for the selected OPERATION -- confirm
185 movl (%esi,%ebx,4), %eax C fetch the next src limb ahead of its mul
206 addl $SAVE_SIZE, %esp C release the stack reserved above
212 C -----------------------------------------------------------------------------
C Setup for the unrolled loop: a pass count goes to VAR_COUNTER, an address
C relative to L(entry) is formed in edx for the computed jump, and the
C pointers and the two carry registers are arranged for the loop.
223 dnl VAR_COUNTER and VAR_JUMP reuse the stack slots of parameters no longer needed
224 define(VAR_COUNTER,`PARAM_SIZE')
225 define(VAR_JUMP, `PARAM_MULTIPLIER')
227 subl $2, %ebx C (size-2)-1
230 shrl $UNROLL_LOG2, %ebx C scale down by the unroll factor (UNROLL_LOG2 presumably log2 of UNROLL_COUNT -- confirm)
233 movl %ebx, VAR_COUNTER C keep as the loop counter in the PARAM_SIZE slot
234 andl $UNROLL_MASK, %edx C keep only the bits selected by UNROLL_MASK
243 leal L(entry) (%edx,%ebx,1), %edx C edx = L(entry) + edx + ebx
250 addl %eax, %ecx C initial carry, becomes low carry
254 movl 4(%esi), %eax C src second limb
255 leal ifelse(UNROLL_BYTES,256,128+) 8(%esi,%ebx,4), %esi C extra +128 bias when UNROLL_BYTES is 256; loop displacements carry a matching -128
256 leal ifelse(UNROLL_BYTES,256,128) (%edi,%ebx,4), %edi C same bias rule for dst
258 movl %edx, %ebx C high carry
259 cmovnz( %ecx, %ebx) C high,low carry other way around
267 C See README.family about old gas bugs
268 leal (%edx,%ebx,1), %edx C edx += ebx
269 addl $L(entry)-L(here), %edx C add the assemble-time distance from L(here) to L(entry); NOTE(review): looks like an alternative to the leal L(entry) form above -- confirm which is assembled
275 C -----------------------------------------------------------------------------
276 C This code uses a "two carry limbs" scheme. At the top of the loop the
277 C carries are ebx=lo, ecx=hi, then they swap for each limb processed. For
278 C the computed jump an odd size means they start one way around, an even
279 C size the other. Either way one limb is handled separately at the start of the loop.
282 C The positioning of the movl to load the next source limb is important.
283 C Moving it after the adcl with a view to avoiding a separate mul at the end
284 C of the loop slows the code down.
C Unrolled loop body and tail. The forloop emits UNROLL_COUNT/CHUNK_COUNT
C chunks; each chunk touches two limbs, at disp0 and disp1, applying ecx to
C the first and ebx to the second. The tail after the loop applies ecx and
C eax to the limbs at disp0 and disp1 and releases the stack area.
296 C VAR_COUNTER loop counter
301 deflit(CHUNK_COUNT,2) C limbs handled per forloop chunk
302 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
303 deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) C byte offset of the first limb of this chunk, less 128 when UNROLL_BYTES is 256
304 deflit(`disp1', eval(disp0 + 4)) C byte offset of the following limb
308 Zdisp( M4_inst,%ecx, disp0,(%edi)) C apply ecx to the dst limb at disp0 (through the Zdisp wrapper)
313 Zdisp( movl, disp0,(%esi), %eax) C load the src limb at disp0
319 M4_inst %ebx, disp1(%edi) C apply ebx to the dst limb at disp1
324 movl disp1(%esi), %eax C load the src limb at disp1
329 leal UNROLL_BYTES(%esi), %esi C advance src by UNROLL_BYTES
330 leal UNROLL_BYTES(%edi), %edi C advance dst by UNROLL_BYTES
340 C edi dst (points at second last limb)
342 deflit(`disp0', ifelse(UNROLL_BYTES,256,-128)) C tail offsets use the same -128 rule as the loop
343 deflit(`disp1', eval(disp0-0 + 4)) C offset of the limb after disp0
347 M4_inst %ecx, disp0(%edi) C apply ecx to the dst limb at disp0
355 M4_inst %eax, disp1(%edi) C apply eax to the dst limb at disp1
359 addl $SAVE_SIZE, %esp C release the stack area reserved with subl