1 dnl AMD K6-2 mpn_copyd -- copy limb vector, decrementing.
3 dnl K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data
7 dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
9 dnl This file is part of the GNU MP Library.
11 dnl The GNU MP Library is free software; you can redistribute it and/or
12 dnl modify it under the terms of the GNU Lesser General Public License as
13 dnl published by the Free Software Foundation; either version 2.1 of the
14 dnl License, or (at your option) any later version.
16 dnl The GNU MP Library is distributed in the hope that it will be useful,
17 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
18 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 dnl Lesser General Public License for more details.
21 dnl You should have received a copy of the GNU Lesser General Public
22 dnl License along with the GNU MP Library; see the file COPYING.LIB. If
23 dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
24 dnl Suite 330, Boston, MA 02111-1307, USA.
27 include(`../config.m4')
31 dnl UNROLL_COUNT cycles/limb
36 dnl Maximum possible with the current code is 64, the minimum is 2.
38 deflit(UNROLL_COUNT, 32)
41 C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
43 C Copy src,size to dst,size, processing limbs from high to low addresses.
45 C The comments in copyi.asm apply here too.
48 defframe(PARAM_SIZE,12)
49 defframe(PARAM_SRC, 8)
50 defframe(PARAM_DST, 4)
66 cmpl $UNROLL_COUNT, %ecx
68 leal -4(%esi,%ecx,4), %esi
70 leal -4(%edi,%ecx,4), %edi
86 C if src and dst are different alignments mod8, then use rep movs
87 C if src and dst are both 4mod8 then process one limb to get 0mod8
90 leal (%esi,%edi), %ebx
98 leal -UNROLL_COUNT(%ecx), %ecx
99 jnz L(already_aligned)
107 ifelse(UNROLL_BYTES,256,`
112 C offset 0x3D here, but gets full speed without further alignment
C esi src, decrementing
C edi dst, decrementing
122 C `disp' is never 0, so don't need to force 0(%esi).
124 deflit(CHUNK_COUNT, 2)
125 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
126 deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))
127 movq disp(%esi), %mm0
128 movq %mm0, disp(%edi)
131 leal -UNROLL_BYTES(%esi), %esi
132 subl $UNROLL_COUNT, %ecx
134 leal -UNROLL_BYTES(%edi), %edi
C now %ecx is -UNROLL_COUNT to -1 representing respectively 0 to
C UNROLL_COUNT-1 limbs remaining
141 testb $eval(UNROLL_COUNT/2), %cl
143 leal UNROLL_COUNT(%ecx), %ecx
147 C at an unroll count of 32 this block of code is 16 cycles faster than
148 C the rep movs, less 3 or 4 to test whether to do it
150 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, `
151 deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))
152 movq disp(%esi), %mm0
153 movq %mm0, disp(%edi)
156 subl $eval(UNROLL_BYTES/2), %esi
157 subl $eval(UNROLL_BYTES/2), %edi
159 subl $eval(UNROLL_COUNT/2), %ecx
163 ifelse(UNROLL_BYTES,256,`