1 dnl AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and hamming distance.
5 dnl K6-2: popcount 9.0, hamdist 11.5 cycles/limb
9 dnl Copyright (C) 2000 Free Software Foundation, Inc.
11 dnl This file is part of the GNU MP Library.
13 dnl The GNU MP Library is free software; you can redistribute it and/or
14 dnl modify it under the terms of the GNU Lesser General Public License as
15 dnl published by the Free Software Foundation; either version 2.1 of the
16 dnl License, or (at your option) any later version.
18 dnl The GNU MP Library is distributed in the hope that it will be useful,
19 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
20 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 dnl Lesser General Public License for more details.
23 dnl You should have received a copy of the GNU Lesser General Public
24 dnl License along with the GNU MP Library; see the file COPYING.LIB. If
25 dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
26 dnl Suite 330, Boston, MA 02111-1307, USA.
29 include(`../config.m4')
32 C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
33 C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
35 C The code here isn't optimal, but it's already a 2x speedup over the plain
36 C integer code in mpn/generic/popcount.c and mpn/generic/hamdist.c.
39 ifdef(`OPERATION_popcount',,
40 `ifdef(`OPERATION_hamdist',,
41 `m4_error(`Need OPERATION_popcount or OPERATION_hamdist
46 `ifdef(`OPERATION_hamdist',`$1')')
50 `ifdef(`OPERATION_popcount',`$1')')
53 defframe(PARAM_SIZE, 12)
54 defframe(PARAM_SRC2, 8)
55 defframe(PARAM_SRC, 4)
56 define(M4_function,mpn_hamdist)
59 defframe(PARAM_SIZE, 8)
60 defframe(PARAM_SRC, 4)
61 define(M4_function,mpn_popcount)
64 MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
75 `LF(M4_function,`$1')')
77 LS(rodata_AAAAAAAAAAAAAAAA):
81 LS(rodata_3333333333333333):
85 LS(rodata_0F0F0F0F0F0F0F0F):
89 LS(rodata_000000FF000000FF):
98 C avoid shrl crossing a 32-byte boundary
101 PROLOGUE(M4_function)
104 movl PARAM_SIZE, %ecx
109 movl $0xAAAAAAAA, %eax
110 movl $0x33333333, %edx
115 movl $0x0F0F0F0F, %eax
116 movl $0x000000FF, %edx
128 movq LS(rodata_AAAAAAAAAAAAAAAA), %mm7
129 movq LS(rodata_3333333333333333), %mm6
130 movq LS(rodata_0F0F0F0F0F0F0F0F), %mm5
131 movq LS(rodata_000000FF000000FF), %mm4
134 define(REG_AAAAAAAAAAAAAAAA, %mm7)
135 define(REG_3333333333333333, %mm6)
136 define(REG_0F0F0F0F0F0F0F0F, %mm5)
137 define(REG_000000FF000000FF, %mm4)
141 HAM(` movl PARAM_SRC2, %edx')
143 pxor %mm2, %mm2 C total
148 Zdisp( movd, 0,(%eax,%ecx,8), %mm1)
151 Zdisp( movd, 0,(%edx,%ecx,8), %mm0)
160 POP(` nop C alignment to avoid crossing 32-byte boundaries')
165 C ecx counter, qwords, decrementing
170 C mm2 total (low dword)
173 C mm4-mm7 special constants (the REG_* bit masks loaded above)
177 movq -8(%eax,%ecx,8), %mm1
178 HAM(` pxor -8(%edx,%ecx,8), %mm1')
182 pand REG_AAAAAAAAAAAAAAAA, %mm1
185 HAM(` nop C code alignment')
187 psubd %mm1, %mm0 C bit pairs
188 HAM(` nop C code alignment')
194 pand REG_3333333333333333, %mm0
195 pand REG_3333333333333333, %mm1
197 paddd %mm1, %mm0 C nibbles
203 pand REG_0F0F0F0F0F0F0F0F, %mm0
204 pand REG_0F0F0F0F0F0F0F0F, %mm1
206 paddd %mm1, %mm0 C bytes
212 paddb %mm1, %mm0 C words
218 paddd %mm1, %mm0 C dwords
220 pand REG_000000FF000000FF, %mm0
222 paddd %mm0, %mm2 C low to total
225 paddd %mm0, %mm2 C high to total