1 dnl AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.
3 dnl K7: 1.64 cycles/limb (at 16 limb/loop).
6 dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
8 dnl This file is part of the GNU MP Library.
10 dnl The GNU MP Library is free software; you can redistribute it and/or
11 dnl modify it under the terms of the GNU Lesser General Public License as
12 dnl published by the Free Software Foundation; either version 2.1 of the
13 dnl License, or (at your option) any later version.
15 dnl The GNU MP Library is distributed in the hope that it will be useful,
16 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
17 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 dnl Lesser General Public License for more details.
20 dnl You should have received a copy of the GNU Lesser General Public
21 dnl License along with the GNU MP Library; see the file COPYING.LIB. If
22 dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
23 dnl Suite 330, Boston, MA 02111-1307, USA.
26 include(`../config.m4')
29 dnl K7: UNROLL_COUNT cycles/limb
34 dnl Maximum possible with the current code is 64.
dnl Limbs processed per pass of the unrolled loop. The forloop further down
dnl emits UNROLL_COUNT/CHUNK_COUNT chunks from this value.
36 deflit(UNROLL_COUNT, 16)
dnl Pick the public entry point names and the description word according to
dnl whichever of OPERATION_add_n or OPERATION_sub_n is defined. If neither
dnl is defined the m4_error branch is taken.
dnl NOTE(review): the M4_inst definition used by the code below and the
dnl closing quotes of this conditional are not visible in this view.
39 ifdef(`OPERATION_add_n', `
41 define(M4_function_n, mpn_add_n)
42 define(M4_function_nc, mpn_add_nc)
43 define(M4_description, add)
44 ',`ifdef(`OPERATION_sub_n', `
46 define(M4_function_n, mpn_sub_n)
47 define(M4_function_nc, mpn_sub_nc)
48 define(M4_description, subtract)
49 ',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
52 MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
55 C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
57 C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
58 C mp_size_t size, mp_limb_t carry);
60 C Calculate src1,size M4_description src2,size, and store the result in
61 C dst,size. The return value is the carry bit from the top of the result (1
64 C The _nc version accepts 1 or 0 for an initial carry into the low limb of
65 C the calculation. Note values other than 1 or 0 here will lead to garbage
68 C This code runs at 1.64 cycles/limb, which is probably the best possible
69 C with plain integer operations. Each limb is 2 loads and 1 store, and in
70 C one cycle the K7 can do two loads, or a load and a store, leading to 1.5
73 dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
dnl UNROLL_THRESHOLD is the value the size is compared against at the top
dnl of M4_function_n below.
dnl NOTE(review): the same value is defined twice here. Any conditional
dnl selecting between the two definitions is not visible in this view, so
dnl confirm against the full file before merging them.
75 deflit(UNROLL_THRESHOLD, 8)
77 deflit(UNROLL_THRESHOLD, 8)
dnl Stack frame layout. The PARAM_* names are the five arguments at positive
dnl offsets, 4 bytes apart: dst, src1, src2, size, carry. The SAVE_* names are
dnl four 4-byte slots at negative offsets, which is what the 16 bytes of
dnl STACK_SPACE account for.
dnl NOTE(review): presumably the SAVE_* slots hold callee-saved registers.
dnl The stores into them are not visible in this view.
80 defframe(PARAM_CARRY,20)
81 defframe(PARAM_SIZE, 16)
82 defframe(PARAM_SRC2, 12)
83 defframe(PARAM_SRC1, 8)
84 defframe(PARAM_DST, 4)
86 defframe(SAVE_EBP, -4)
87 defframe(SAVE_ESI, -8)
88 defframe(SAVE_EBX, -12)
89 defframe(SAVE_EDI, -16)
90 deflit(STACK_SPACE, 16)
C Entry point taking an initial carry. It loads the carry parameter into
C eax and joins the common code at the start label inside M4_function_n.
C NOTE(review): the start label itself is not visible in this view.
96 PROLOGUE(M4_function_nc)
97 movl PARAM_CARRY, %eax
98 jmp LF(M4_function_n,start)
C Entry point without a carry parameter. eax starts at zero here, where the
C M4_function_nc entry loads it from PARAM_CARRY instead.
C NOTE(review): several lines of this routine are not visible in this view
C (labels, branches, register saves, the load of edi), so the notes below
C describe only the instructions shown.
101 PROLOGUE(M4_function_n)
103 xorl %eax, %eax C carry
105 movl PARAM_SIZE, %ecx C ecx = size in limbs
106 subl $STACK_SPACE, %esp C reserve the 16 byte frame
107 deflit(`FRAME',STACK_SPACE)
111 cmpl $UNROLL_THRESHOLD, %ecx C size versus the unrolling threshold
113 movl PARAM_SRC2, %edx C edx = src2
114 movl PARAM_SRC1, %ebx C ebx = src1
118 leal (%ebx,%ecx,4), %ebx C ebx += 4*ecx
119 leal (%edx,%ecx,4), %edx C edx += 4*ecx
121 leal (%edi,%ecx,4), %edi C edi += 4*ecx
125 C This loop is in a single 16 byte code block already, so no
126 C alignment necessary.
136 movl (%ebx,%ecx,4), %eax C load a limb via ebx
137 M4_inst (%edx,%ecx,4), %eax C apply M4_inst with the limb via edx
138 movl %eax, (%edi,%ecx,4) C store the result limb via edi
147 addl $STACK_SPACE, %esp C release the frame
152 C -----------------------------------------------------------------------------
153 C This is at 0x55, close enough to aligned.
C Setup for the unrolled loop. esi is formed as the entry label plus 9 times
C edi, in two variants: a single leal, and a leal plus an addl of the entry
C minus here label difference.
C NOTE(review): only part of this sequence is visible in this view. The
C UNROLL_LOG2, UNROLL_MASK and UNROLL_BYTES definitions, the entry and here
C labels, and whatever selects between the two esi variants are elsewhere.
155 deflit(`FRAME',STACK_SPACE)
157 andl $-2, %ecx C size low bit masked out
158 andl $1, PARAM_SIZE C size low bit kept
164 shrl $UNROLL_LOG2, %ecx C divide ecx by 2^UNROLL_LOG2
168 andl $UNROLL_MASK, %edi
174 leal L(entry) (%edi,%edi,8), %esi C 9 bytes per
179 leal ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
180 leal ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
181 leal ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi
188 C See README.family about old gas bugs
189 leal (%edi,%edi,8), %esi C esi = 9*edi
190 addl $L(entry)-L(here), %esi C add the entry minus here offset
196 C -----------------------------------------------------------------------------
203 C esi scratch (was computed jump)
C Unrolled loop body. The forloop emits UNROLL_COUNT/CHUNK_COUNT chunks,
C each handling CHUNK_COUNT = 2 limbs: two loads through ebx into esi and
C ebp, two M4_inst operations against memory at edx, and two stores through
C edi. disp0 is the byte offset of the first limb of chunk i, lowered by 128
C when UNROLL_BYTES is 256, and disp1 is the limb after it.
C edx is advanced by UNROLL_BYTES before the chunks, ebx and edi after them.
C NOTE(review): the pointer advances use leal, presumably so that the carry
C flag survives between M4_inst instructions. The M4_inst definition, the
C loop label, the loop branch and the return are not visible in this view.
207 leal UNROLL_BYTES(%edx), %edx
210 deflit(CHUNK_COUNT, 2)
211 forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
212 deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
213 deflit(`disp1', eval(disp0 + 4))
215 Zdisp( movl, disp0,(%ebx), %esi)
216 movl disp1(%ebx), %ebp
217 Zdisp( M4_inst,disp0,(%edx), %esi)
218 Zdisp( movl, %esi, disp0,(%edi))
219 M4_inst disp1(%edx), %ebp
220 movl %ebp, disp1(%edi)
224 leal UNROLL_BYTES(%ebx), %ebx
225 leal UNROLL_BYTES(%edi), %edi
237 M4_inst UNROLL_BYTES(%edx), %ecx
246 addl $STACK_SPACE, %esp