1 dnl AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming distance.
4 dnl K7: popcount 5.0 cycles/limb, hamdist 6.0 cycles/limb
7 dnl Copyright (C) 2000 Free Software Foundation, Inc.
9 dnl This file is part of the GNU MP Library.
11 dnl The GNU MP Library is free software; you can redistribute it and/or
12 dnl modify it under the terms of the GNU Lesser General Public License as
13 dnl published by the Free Software Foundation; either version 2.1 of the
14 dnl License, or (at your option) any later version.
16 dnl The GNU MP Library is distributed in the hope that it will be useful,
17 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
18 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 dnl Lesser General Public License for more details.
21 dnl You should have received a copy of the GNU Lesser General Public
22 dnl License along with the GNU MP Library; see the file COPYING.LIB. If
23 dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
24 dnl Suite 330, Boston, MA 02111-1307, USA.
27 include(`../config.m4')
30 dnl Only recent versions of gas know psadbw, in particular gas 2.9.1 on
31 dnl FreeBSD 3.3 and 3.4 doesn't recognise it.
33 define(psadbw_mm4_mm0,
34 `ifelse(m4_ifdef_anyof_p(`HAVE_TARGET_CPU_athlon',
35 `HAVE_TARGET_CPU_pentium3'),1,
36 `.byte 0x0f,0xf6,0xc4 C psadbw %mm4, %mm0',
38 `m4_warning(`warning, using simulated and only partly functional psadbw, use for testing only
39 ') C this works enough for the sum of bytes done below, making it
40 C possible to test on an older cpu
56 C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
57 C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
59 C The code here is almost certainly not optimal, but is already a 3x speedup
60 C over the generic C code. The main improvement would be to interleave
61 C processing of two qwords in the loop so as to fully exploit the available
62 C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs).
64 C The loop is based on the example "Efficient 64-bit population count using
65 C MMX instructions" in the Athlon Optimization Guide, AMD document 22007,
66 C page 158 of rev E (reference in mpn/x86/k7/README).
68 ifdef(`OPERATION_popcount',,
69 `ifdef(`OPERATION_hamdist',,
70 `m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
75 `ifdef(`OPERATION_hamdist',`$1')')
79 `ifdef(`OPERATION_popcount',`$1')')
82 defframe(PARAM_SIZE, 12) C hamdist layout: size at frame offset 12 - third argument in the mpn_hamdist prototype
83 defframe(PARAM_SRC2, 8) C hamdist layout: src2 at frame offset 8 - second argument in the prototype
84 defframe(PARAM_SRC, 4) C hamdist layout: src at frame offset 4 - first argument in the prototype
85 define(M4_function,mpn_hamdist) C name that the function prologue is given for the hamdist build
88 defframe(PARAM_SIZE, 8) C popcount layout: size at frame offset 8 - second argument in the mpn_popcount prototype. NOTE review: this redefines PARAM_SIZE so presumably only one layout is selected per build - the selector is not visible here - confirm
89 defframe(PARAM_SRC, 4) C popcount layout: src at frame offset 4 - first argument in the prototype
90 define(M4_function,mpn_popcount) C name that the function prologue is given for the popcount build
93 MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
104 `LF(M4_function,`$1')')
106 LS(rodata_AAAAAAAAAAAAAAAA):
110 LS(rodata_3333333333333333):
114 LS(rodata_0F0F0F0F0F0F0F0F):
122 PROLOGUE(M4_function)
125 movl PARAM_SIZE, %ecx
130 movl $0xAAAAAAAA, %eax
131 movl $0x33333333, %edx
136 movl $0x0F0F0F0F, %eax
147 movq LS(rodata_AAAAAAAAAAAAAAAA), %mm7
148 movq LS(rodata_3333333333333333), %mm6
149 movq LS(rodata_0F0F0F0F0F0F0F0F), %mm5
153 define(REG_AAAAAAAAAAAAAAAA,%mm7)
154 define(REG_3333333333333333,%mm6)
155 define(REG_0F0F0F0F0F0F0F0F,%mm5)
156 define(REG_0000000000000000,%mm4)
160 HAM(` movl PARAM_SRC2, %edx')
162 pxor %mm2, %mm2 C total
167 movd (%eax,%ecx,8), %mm1
169 HAM(` movd 0(%edx,%ecx,8), %mm0
180 C ecx counter, qwords, decrementing
185 C mm2 total (low dword)
188 C mm5 | special constants
192 movq -8(%eax,%ecx,8), %mm1
194 HAM(` pxor -8(%edx,%ecx,8), %mm1')
199 pand REG_AAAAAAAAAAAAAAAA, %mm1
203 psubd %mm1, %mm0 C bit pairs
209 pand REG_3333333333333333, %mm0
210 pand REG_3333333333333333, %mm1
212 paddd %mm1, %mm0 C nibbles
218 pand REG_0F0F0F0F0F0F0F0F, %mm0
219 pand REG_0F0F0F0F0F0F0F0F, %mm1
221 paddd %mm1, %mm0 C bytes
226 paddd %mm0, %mm2 C add to total