1 dnl Intel P5 mpn_divexact_by3 -- mpn division by 3, expecting no remainder.
3 dnl P5: 15.0 cycles/limb
6 dnl Copyright (C) 2000 Free Software Foundation, Inc.
8 dnl This file is part of the GNU MP Library.
10 dnl The GNU MP Library is free software; you can redistribute it and/or
11 dnl modify it under the terms of the GNU Lesser General Public License as
12 dnl published by the Free Software Foundation; either version 2.1 of the
13 dnl License, or (at your option) any later version.
15 dnl The GNU MP Library is distributed in the hope that it will be useful,
16 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
17 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 dnl Lesser General Public License for more details.
20 dnl You should have received a copy of the GNU Lesser General Public
21 dnl License along with the GNU MP Library; see the file COPYING.LIB. If
22 dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
23 dnl Suite 330, Boston, MA 02111-1307, USA.
26 include(`../config.m4')
29 C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size,
32 defframe(PARAM_CARRY,16)
33 defframe(PARAM_SIZE, 12)
34 defframe(PARAM_SRC, 8)
35 defframe(PARAM_DST, 4)
37 dnl multiplicative inverse of 3, modulo 2^32
38 deflit(INVERSE_3, 0xAAAAAAAB)
40 dnl ceil(b/3), ceil(b*2/3) and floor(b*2/3) where b=2^32
41 deflit(ONE_THIRD_CEIL, 0x55555556)
42 deflit(TWO_THIRDS_CEIL, 0xAAAAAAAB)
43 deflit(TWO_THIRDS_FLOOR, 0xAAAAAAAA)
48 PROLOGUE(mpn_divexact_by3c)
58 movl PARAM_CARRY, %eax C risk of cache bank clash here
63 sbbl %eax, %eax C 0 or -1
65 imull $INVERSE_3, %edx, %edx
68 cmpl $ONE_THIRD_CEIL, %edx
70 sbbl $-1, %eax C +1 if edx>=ceil(b/3)
71 cmpl $TWO_THIRDS_CEIL, %edx
73 sbbl $-1, %eax C +1 if edx>=ceil(b*2/3)
88 pushl %ebx FRAME_pushl()
89 pushl %esi FRAME_pushl()
91 pushl %edi FRAME_pushl()
92 pushl %ebp FRAME_pushl()
95 movl PARAM_CARRY, %esi
97 movl (%ecx), %eax C src low limb
101 movl $TWO_THIRDS_FLOOR, %esi
103 leal (%ecx,%edx,4), %ecx C &src[size-1]
104 leal (%edi,%edx,4), %edi C &dst[size-1]
106 adcl $0, %ebx C carry, 0 or 1
107 negl %edx C -(size-1)
110 C The loop needs a source limb ready at the top, which leads to one limb
111 C handled separately at the end, and the special case above for size==1.
112 C There doesn't seem to be any scheduling that would keep the speed but move
113 C the source load and carry subtract up to the top.
115 C The destination cache line prefetching adds 1 cycle to the loop but is
116 C considered worthwhile. The slowdown is a factor of 1.07, but will prevent
117 C repeated write-throughs if the destination isn't in L1. A version using
118 C an outer loop to prefetch only every 8 limbs (a cache line) proved to be
119 C no faster, due to unavoidable branch mispreditions in the inner loop.
121 C setc is 2 cycles on P54, so an adcl is used instead. If the movl $0,%ebx
122 C could be avoided then the src limb fetch could pair up and save a cycle.
123 C This would probably mean going to a two limb loop with the carry limb
124 C alternately positive or negative, since an sbbl %ebx,%ebx will leave a
125 C value which is in the opposite sense to the preceding sbbl/adcl %ebx,%eax.
127 C A register is used for TWO_THIRDS_FLOOR because a cmp can't be done as
128 C "cmpl %edx, $n" with the immediate as the second operand.
130 C The "4" source displacement is in the loop rather than the setup because
131 C this gets L(top) aligned to 8 bytes at no cost.
135 C eax source limb, carry subtracted
138 C edx counter, limbs, negative
139 C esi TWO_THIRDS_FLOOR
141 C ebp scratch (result limb)
143 imull $INVERSE_3, %eax, %ebp
145 cmpl $ONE_THIRD_CEIL, %ebp
146 movl (%edi,%edx,4), %eax C dst cache line prefetch
148 sbbl $-1, %ebx C +1 if ebp>=ceil(b/3)
151 movl 4(%ecx,%edx,4), %eax C next src limb
153 sbbl %ebx, %eax C and further -1 if ebp>=ceil(b*2/3)
156 adcl $0, %ebx C new carry
157 movl %ebp, (%edi,%edx,4)
164 imull $INVERSE_3, %eax, %edx
166 cmpl $ONE_THIRD_CEIL, %edx
169 sbbl $-1, %ebx C +1 if edx>=ceil(b/3)
170 cmpl $TWO_THIRDS_CEIL, %edx
172 sbbl $-1, %ebx C +1 if edx>=ceil(b*2/3)