1 dnl AMD K6 mpn_add/sub_n -- mpn addition or subtraction.
3 dnl K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb.
6 dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
8 dnl This file is part of the GNU MP Library.
10 dnl The GNU MP Library is free software; you can redistribute it and/or
11 dnl modify it under the terms of the GNU Lesser General Public License as
12 dnl published by the Free Software Foundation; either version 2.1 of the
13 dnl License, or (at your option) any later version.
15 dnl The GNU MP Library is distributed in the hope that it will be useful,
16 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
17 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 dnl Lesser General Public License for more details.
20 dnl You should have received a copy of the GNU Lesser General Public
21 dnl License along with the GNU MP Library; see the file COPYING.LIB. If
22 dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
23 dnl Suite 330, Boston, MA 02111-1307, USA.
26 include(`../config.m4')
dnl Pick the entry-point names and the description word according to which
dnl of OPERATION_add_n or OPERATION_sub_n is defined. If neither is defined
dnl the m4_error branch is taken.
29 ifdef(`OPERATION_add_n', `
31 define(M4_function_n, mpn_add_n)
32 define(M4_function_nc, mpn_add_nc)
33 define(M4_description, add)
34 ',`ifdef(`OPERATION_sub_n', `
36 define(M4_function_n, mpn_sub_n)
37 define(M4_function_nc, mpn_sub_nc)
38 define(M4_description, subtract)
39 ',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
42 MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
45 C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
47 C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
48 C mp_size_t size, mp_limb_t carry);
50 C Calculate src1,size M4_description src2,size, and store the result in
51 C dst,size. The return value is the carry bit from the top of the result
54 C The _nc version accepts 1 or 0 for an initial carry into the low limb of
55 C the calculation. Note values other than 1 or 0 here will lead to garbage
58 C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and
59 C an in-place dst+=src to 2.5 c/l. The unrolled loops have 1 cycle/loop of
60 C loop control, which with 4 limbs/loop means an extra 0.25 c/l.
dnl Stack locations of the arguments, each written as an offset from FRAME
dnl on top of %esp. PARAM_CARRY is the extra fifth argument that appears
dnl only in the M4_function_nc prototype.
62 define(PARAM_CARRY, `FRAME+20(%esp)')
63 define(PARAM_SIZE, `FRAME+16(%esp)')
64 define(PARAM_SRC2, `FRAME+12(%esp)')
65 define(PARAM_SRC1, `FRAME+8(%esp)')
66 define(PARAM_DST, `FRAME+4(%esp)')
69 dnl minimum 5 because the unrolled code can't handle less
70 deflit(UNROLL_THRESHOLD, 5)
C Entry point with an explicit carry-in: load the carry argument into %eax
C and join the code of M4_function_n at its start label.
75 PROLOGUE(M4_function_nc)
76 movl PARAM_CARRY, %eax C eax = caller-supplied initial carry
77 jmp LF(M4_function_n,start) C continue at the start label of M4_function_n
C Entry point without a carry-in argument. Only part of the body is visible
C in this excerpt; the notes below cover just the lines shown.
81 PROLOGUE(M4_function_n)
93 cmpl $UNROLL_THRESHOLD, %ecx C compare %ecx with the minimum the unrolled code accepts
99 shrl %eax C initial carry flag (bit 0 of %eax is shifted out into CF)
101 C offset 0x21 here, close enough to aligned
111 C The store to (%edi) could be done with a stosl; it'd be smaller
112 C code, but there's no speed gain and a cld would have to be added
113 C (per mpn/x86/README.family).
136 C -----------------------------------------------------------------------------
C Unrolled path for the case where dst is distinct from the sources: each
C limb is loaded from the %ebx array, combined with the %edx array by
C M4_inst, and stored to the %edi array. M4_inst is defined outside this
C excerpt (presumably the carry-propagating add or subtract - confirm).
151 ifdef(`OPERATION_add_n',`
154 je L(inplace_reverse)
162 leal (%ebx,%ecx,4), %ebx C advance %ebx by %ecx limbs (4 bytes each)
163 leal (%edx,%ecx,4), %edx C likewise for %edx
164 leal (%edi,%ecx,4), %edi C likewise for %edi
171 C eax counter, qwords, negative
C NOTE(review): the loads below overwrite %eax and all addressing is indexed
C by %ecx; confirm which register the line above is meant to describe.
179 movl (%ebx,%ecx,4), %eax C limb 0: load
C NOTE(review): the -20 displacements from here on suggest %ecx is advanced
C by 5 between the load above and the next instruction; that update is not
C visible in this excerpt - confirm.
181 M4_inst -20(%edx,%ecx,4), %eax C limb 0: combine with the %edx array
182 movl %eax, -20(%edi,%ecx,4) C limb 0: store
184 movl 4-20(%ebx,%ecx,4), %eax C limb 1: load
185 M4_inst 4-20(%edx,%ecx,4), %eax C limb 1: combine
186 movl %eax, 4-20(%edi,%ecx,4) C limb 1: store
188 movl 8-20(%ebx,%ecx,4), %eax C limb 2: load
189 M4_inst 8-20(%edx,%ecx,4), %eax C limb 2: combine
190 movl %eax, 8-20(%edi,%ecx,4) C limb 2: store
192 movl 12-20(%ebx,%ecx,4), %eax C limb 3: load
193 M4_inst 12-20(%edx,%ecx,4), %eax C limb 3: combine
194 movl %eax, 12-20(%edi,%ecx,4) C limb 3: store
200 jz L(normal_finish_one) C zero flag set: go straight to the single-limb tail
203 C two or three more limbs
210 M4_inst 4(%edx), %eax C combine with the limb at offset 4 of the %edx array
217 L(normal_finish_one):
C Single trailing limb at index %ecx: load, combine, store.
218 movl (%ebx,%ecx,4), %eax
219 M4_inst (%edx,%ecx,4), %eax
220 movl %eax, (%edi,%ecx,4)
234 C -----------------------------------------------------------------------------
C In-place path: source limbs are loaded from the %edx array into %eax or
C %ebx and applied by M4_inst directly to the memory operand based at %edi,
C so each destination limb is updated by a read-modify-write. M4_inst is
C defined outside this excerpt.
236 ifdef(`OPERATION_add_n',`
258 movl (%edx), %ebx C src low limb
259 leal (%edx,%ecx,4), %edx C advance %edx by %ecx limbs (4 bytes each)
261 leal (%edi,%ecx,4), %edi C likewise for %edi
277 M4_inst %ebx, (%edi,%ecx,4) C apply the limb held in %ebx to the limb at index %ecx
279 movl 4(%edx,%ecx,4), %eax C load the next source limb
C NOTE(review): the -20 displacements from here on suggest %ecx is advanced
C by 5 between the load above and the next instruction; that update is not
C visible in this excerpt - confirm.
282 M4_inst %eax, 4-20(%edi,%ecx,4) C apply it in place
284 movl 8-20(%edx,%ecx,4), %eax C load two more source limbs
285 movl 12-20(%edx,%ecx,4), %ebx
287 M4_inst %eax, 8-20(%edi,%ecx,4) C apply both in place, in ascending address order
288 M4_inst %ebx, 12-20(%edi,%ecx,4)
290 movl 16-20(%edx,%ecx,4), %ebx C load the following source limb into %ebx
294 C now %esi is 0 to 3 representing respectively 1 to 4 limbs more
299 jz L(inplace_finish_one) C zero flag set: go straight to the single-limb tail
302 C two or three more limbs
306 M4_inst %eax, 4(%edi) C apply %eax then %ebx to the limbs at offsets 4 and 8
307 M4_inst %ebx, 8(%edi)
314 L(inplace_finish_one):
C Single trailing limb: load from the %edx array and apply it in place.
315 movl 4(%edx,%ecx,4), %eax
316 M4_inst %eax, 4(%edi,%ecx,4)