1 dnl Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
2 dnl the result to a second limb vector.
4 dnl Copyright (C) 2000 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of the GNU Lesser General Public License as published
10 dnl by the Free Software Foundation; either version 2.1 of the License, or (at
11 dnl your option) any later version.
13 dnl The GNU MP Library is distributed in the hope that it will be useful, but
14 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 dnl License for more details.
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
20 dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21 dnl MA 02111-1307, USA.
23 include(`../config.m4')
31 dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and
32 dnl exactly 3.625 cycles/limb on EV6...
34 dnl This code was written in close cooperation with ev6 pipeline expert
35 dnl Steve Root (root@toober.hlo.dec.com). Any errors are tege's fault, though.
37 dnl Register usages for unrolled loop:
42 dnl 22,23 save for stores
44 dnl Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
46 dnl The stores can issue a cycle late so we have paired no-op's to 'catch'
47 dnl them, so that further disturbance to the schedule is damped.
49 dnl We couldn't pair the loads, because the entangled schedule of the
50 dnl carries has to happen on one side {0} of the machine. Note, the total
51 dnl use of U0, and the total use of L0 (after attending to the stores),
52 dnl which is part of the reason why....
54 dnl This is a great schedule for the d_cache, a poor schedule for the
55 dnl b_cache. The lockup on U0 means that any stall can't be recovered
56 dnl from. Consider a ldq in L1. Say that load gets stalled because it
57 dnl collides with a fill from the b_Cache. On the next cycle, this load
58 dnl gets priority. It first looks at L0, and goes there. The instruction
59 dnl we intended for L0 gets to look at L1, which is NOT where we want
60 dnl it. It either stalls 1, because it can't go in L0, or goes there, and
61 dnl causes a further instruction to stall.
63 dnl So for b_cache, we're likely going to want to put one or more cycles
64 dnl back into the code! And, of course, put in prefetches. For the
65 dnl accumulator, lds, intent to modify. For the multiplier, you might
66 dnl want ldq, evict next, if you're not wanting to use it again soon. Use
67 dnl 256 ahead of present pointer value. At a place where we have an mt
68 dnl followed by a bookkeeping, put the bookkeeping in upper, and the
69 dnl prefetch into lower.
71 dnl Note, the usage of physical registers per cycle is smoothed off, as
74 dnl Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd
75 dnl like not to have a ldq or stq precede a conditional branch in a
76 dnl quadpack. The conditional branch moves the retire pointer one cycle
79 dnl Optimization notes:
80 dnl Callee-saves regs: r9 r10 r11 r12 r13 r14 r15 r26 ?r27?
81 dnl Reserved regs: r29 r30 r31
82 dnl Free caller-saves regs in unrolled code: r24 r25 r28
83 dnl We should swap some of the callee-saves regs for some of the free
84 dnl caller-saves regs, saving some overhead cycles.
85 dnl Most importantly, we should write fast code for the 0-7 case.
86 dnl The code we use there is for the 21164, and runs at 7 cycles/limb
87 dnl on the 21264. Should not be hard, if we write specialized code for
88 dnl 1-7 limbs (the one for 0 limbs should be straightforward). We then just
89 dnl need a jump table indexed by the low 3 bits of the count argument.
C mpn_addmul_1 -- multiply a limb vector by a single limb and add the
C products into a second limb vector (see the file header).
C
C Register roles, as named by the per-line comments below:
C   r16  res_ptr: vector that is loaded from, added to and stored back
C   r17  s1_ptr: vector whose limbs are multiplied
C   r18  size: limb count (limbs are 8 bytes; both pointers step by 8)
C   r19  the single limb that every s1 limb is multiplied by
C   r0   running carry limb in the simple loops; it is also the register
C        written by the last instruction before EPILOGUE, presumably the
C        return value -- TODO confirm against the calling convention
C
C Carry idiom used throughout: addq a, b, s followed by cmpult s, b, c
C leaves c = 1 exactly when the 64-bit addition wrapped around.
C
C NOTE(review): the branch targets $Lend0b, $Lend0a, $Lend1b, $Lend1a,
C $Lend and $Loop are used below but not defined on any line visible in
C this listing, and the test that selects between the simple path and the
C unrolled path is not visible either -- confirm against the full file.
C NOTE(review): r9-r15 are listed as callee-saves in the optimization notes
C and are written by the unrolled code; their save and restore are not on
C the lines shown here -- confirm.
93 PROLOGUE(mpn_addmul_1)
C ---- Simple loop, first copy: counts r18 down directly ----
C Startup: fetch the first s1 limb, start its low and high products, fetch
C the first result limb, and leave early when size was 1 or 2.
97 ldq r2, 0(r17) C r2 = s1_limb
98 addq r17, 8, r17 C s1_ptr++
99 subq r18, 1, r18 C size--
100 mulq r2, r19, r3 C r3 = prod_low
101 ldq r5, 0(r16) C r5 = *res_ptr
102 umulh r2, r19, r0 C r0 = prod_high
103 beq r18, $Lend0b C jump if size was == 1
104 ldq r2, 0(r17) C r2 = s1_limb
105 addq r17, 8, r17 C s1_ptr++
106 subq r18, 1, r18 C size--
110 addq r16, 8, r16 C res_ptr++
111 beq r18, $Lend0a C jump if size was == 2
C Steady state, one limb per pass: start the products for the limb already
C in r2, load the next result limb (r5) and the next s1 limb (r2), and fold
C the pending high product and carries into r0.
114 $Loop0: mulq r2, r19, r3 C r3 = prod_low
115 ldq r5, 0(r16) C r5 = *res_ptr
116 addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
117 subq r18, 1, r18 C size--
118 umulh r2, r19, r4 C r4 = cy_limb
119 ldq r2, 0(r17) C r2 = s1_limb
120 addq r17, 8, r17 C s1_ptr++
121 addq r3, r0, r3 C r3 = cy_limb + prod_low
122 cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
126 addq r16, 8, r16 C res_ptr++
127 addq r5, r0, r0 C combine carries
C Wind-down for the last limb: same arithmetic as the loop body but with no
C further s1 load or count update; r0 ends up as high product plus carry.
130 mulq r2, r19, r3 C r3 = prod_low
131 ldq r5, 0(r16) C r5 = *res_ptr
132 addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
133 umulh r2, r19, r4 C r4 = cy_limb
134 addq r3, r0, r3 C r3 = cy_limb + prod_low
135 cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
139 addq r5, r0, r0 C combine carries
140 addq r4, r0, r0 C cy_limb = prod_high + cy
C ---- Larger sizes: split the count ----
C r20 = size mod 8 is handled by a second copy of the simple loop, and
C r18 = size / 8 becomes the pass count of the 8-limb unrolled loop.
159 and r18, 7, r20 C count for the first loop, 0-7
160 srl r18, 3, r18 C count for unrolled loop
C ---- Simple loop, second copy: same code as above, counting r20 ----
163 ldq r2, 0(r17) C r2 = s1_limb
164 addq r17, 8, r17 C s1_ptr++
165 subq r20, 1, r20 C size--
166 mulq r2, r19, r3 C r3 = prod_low
167 ldq r5, 0(r16) C r5 = *res_ptr
168 umulh r2, r19, r0 C r0 = prod_high
169 beq r20, $Lend1b C jump if size was == 1
170 ldq r2, 0(r17) C r2 = s1_limb
171 addq r17, 8, r17 C s1_ptr++
172 subq r20, 1, r20 C size--
176 addq r16, 8, r16 C res_ptr++
177 beq r20, $Lend1a C jump if size was == 2
C Steady state of the second copy (r20 is the counter here).
180 $Loop1: mulq r2, r19, r3 C r3 = prod_low
181 ldq r5, 0(r16) C r5 = *res_ptr
182 addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
183 subq r20, 1, r20 C size--
184 umulh r2, r19, r4 C r4 = cy_limb
185 ldq r2, 0(r17) C r2 = s1_limb
186 addq r17, 8, r17 C s1_ptr++
187 addq r3, r0, r3 C r3 = cy_limb + prod_low
188 cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
192 addq r16, 8, r16 C res_ptr++
193 addq r5, r0, r0 C combine carries
C Wind-down of the second copy; unlike the first copy it also steps res_ptr
C because the unrolled code continues from here.
197 mulq r2, r19, r3 C r3 = prod_low
198 ldq r5, 0(r16) C r5 = *res_ptr
199 addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
200 umulh r2, r19, r4 C r4 = cy_limb
201 addq r3, r0, r3 C r3 = cy_limb + prod_low
202 cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
206 addq r16, 8, r16 C res_ptr++
207 addq r5, r0, r0 C combine carries
208 addq r4, r0, r0 C cy_limb = prod_high + cy
214 addq r16, 8, r16 C res_ptr++
C Back both pointers up by 16 bytes (two limbs). The unrolled code advances
C them in 64-byte steps and addresses memory with negative displacements.
218 lda r17, -16(r17) C L1 bookkeeping
219 lda r16, -16(r16) C L1 bookkeeping
C Register roles visible in the unrolled code:
C   r2, r3, r0, r1    s1 limbs (each multiplied by r19)
C   r6, r7, r4, r5    result limbs being accumulated into
C   r13/r14, r15/r8, r9/r10, r11/r12   low/high product pairs
C   r20, r21          carries produced by cmpult
C   r22, r23          finished limbs waiting to be stored
C NOTE(review): only the loads of r2 and r6 are visible in this listing; the
C loads of the other limb registers are presumably on lines not shown.
C The U0/U1/L0/L1 tag that starts each line comment presumably names the
C issue slot the instruction is scheduled for (see the header discussion),
C so instruction order must be kept exactly as is.
222 C ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
C Startup: pre-decrement the pass count, issue the first products and
C carry chains, and skip the main loop when no further pass remains.
226 lda r18, -1(r18) C L1 bookkeeping
230 mulq r19, r2, r13 C U1
232 umulh r19, r2, r14 C U1
233 mulq r19, r3, r15 C U1
234 lda r17, 64(r17) C L1 bookkeeping
237 umulh r19, r3, r8 C U1
238 ldq r2, -16(r17) C L1
239 mulq r19, r0, r9 C U1
241 umulh r19, r0, r10 C U1
242 addq r6, r13, r6 C L0 lo + acc
243 mulq r19, r1, r11 C U1
244 cmpult r6, r13, r20 C L0 lo add => carry
245 lda r16, 64(r16) C L1 bookkeeping
246 addq r6, r12, r22 C U0 hi add => answer
247 cmpult r22, r12, r21 C L0 hi add => carry
248 addq r14, r20, r14 C U0 hi mul + carry
249 ldq r6, -16(r16) C L1
250 addq r7, r15, r23 C L0 lo + acc
251 addq r14, r21, r14 C U0 hi mul + carry
253 umulh r19, r1, r12 C U1
254 cmpult r23, r15, r20 C L0 lo add => carry
255 addq r23, r14, r23 C U0 hi add => answer
257 mulq r19, r2, r13 C U1
258 cmpult r23, r14, r21 C L0 hi add => carry
259 addq r8, r20, r8 C U0 hi mul + carry
261 umulh r19, r2, r14 C U1
262 addq r4, r9, r4 C L0 lo + acc
263 stq r22, -48(r16) C L0
264 stq r23, -40(r16) C L1
265 mulq r19, r3, r15 C U1
266 addq r8, r21, r8 C U0 hi mul + carry
267 cmpult r4, r9, r20 C L0 lo add => carry
268 addq r4, r8, r22 C U0 hi add => answer
269 ble r18, $Lend C U1 bookkeeping
271 C ____ MAIN UNROLLED LOOP ____
C Each pass issues eight mulq/umulh pairs (eight limbs), advances r17 and
C r16 by 64 bytes, decrements r18 once, and repeats while r18 > 0.
C The bis r31, r31, r31 lines are the no-ops described in the header notes:
C the ones tagged st slosh sit after store pairs to catch late stores, and
C the ones tagged mt presumably fill otherwise empty issue slots.
274 bis r31, r31, r31 C U1 mt
275 cmpult r22, r8, r21 C L0 hi add => carry
276 addq r10, r20, r10 C U0 hi mul + carry
279 bis r31, r31, r31 C U1 mt
280 addq r5, r11, r23 C L0 lo + acc
281 addq r10, r21, r10 C L0 hi mul + carry
284 umulh r19, r3, r8 C U1
285 cmpult r23, r11, r20 C L0 lo add => carry
286 addq r23, r10, r23 C U0 hi add => answer
289 mulq r19, r0, r9 C U1
290 cmpult r23, r10, r21 C L0 hi add => carry
291 addq r12, r20, r12 C U0 hi mul + carry
294 umulh r19, r0, r10 C U1
295 addq r6, r13, r6 C L0 lo + acc
296 stq r22, -32(r16) C L0
297 stq r23, -24(r16) C L1
299 bis r31, r31, r31 C L0 st slosh
300 mulq r19, r1, r11 C U1
301 bis r31, r31, r31 C L1 st slosh
302 addq r12, r21, r12 C U0 hi mul + carry
304 cmpult r6, r13, r20 C L0 lo add => carry
305 bis r31, r31, r31 C U1 mt
306 lda r18, -1(r18) C L1 bookkeeping
307 addq r6, r12, r22 C U0 hi add => answer
309 bis r31, r31, r31 C U1 mt
310 cmpult r22, r12, r21 C L0 hi add => carry
311 addq r14, r20, r14 C U0 hi mul + carry
314 bis r31, r31, r31 C U1 mt
315 addq r7, r15, r23 C L0 lo + acc
316 addq r14, r21, r14 C U0 hi mul + carry
319 umulh r19, r1, r12 C U1
320 cmpult r23, r15, r20 C L0 lo add => carry
321 addq r23, r14, r23 C U0 hi add => answer
324 mulq r19, r2, r13 C U1
325 cmpult r23, r14, r21 C L0 hi add => carry
326 addq r8, r20, r8 C U0 hi mul + carry
329 umulh r19, r2, r14 C U1
330 addq r4, r9, r4 C U0 lo + acc
331 stq r22, -16(r16) C L0
332 stq r23, -8(r16) C L1
334 bis r31, r31, r31 C L0 st slosh
335 mulq r19, r3, r15 C U1
336 bis r31, r31, r31 C L1 st slosh
337 addq r8, r21, r8 C L0 hi mul + carry
339 cmpult r4, r9, r20 C L0 lo add => carry
340 bis r31, r31, r31 C U1 mt
341 lda r17, 64(r17) C L1 bookkeeping
342 addq r4, r8, r22 C U0 hi add => answer
344 bis r31, r31, r31 C U1 mt
345 cmpult r22, r8, r21 C L0 hi add => carry
346 addq r10, r20, r10 C U0 hi mul + carry
349 bis r31, r31, r31 C U1 mt
350 addq r5, r11, r23 C L0 lo + acc
351 addq r10, r21, r10 C L0 hi mul + carry
354 umulh r19, r3, r8 C U1
355 cmpult r23, r11, r20 C L0 lo add => carry
356 addq r23, r10, r23 C U0 hi add => answer
357 ldq r2, -16(r17) C L1
359 mulq r19, r0, r9 C U1
360 cmpult r23, r10, r21 C L0 hi add => carry
361 addq r12, r20, r12 C U0 hi mul + carry
364 umulh r19, r0, r10 C U1
365 addq r6, r13, r6 C L0 lo + acc
369 bis r31, r31, r31 C L0 st slosh
370 mulq r19, r1, r11 C U1
371 bis r31, r31, r31 C L1 st slosh
372 addq r12, r21, r12 C U0 hi mul + carry
374 cmpult r6, r13, r20 C L0 lo add => carry
375 bis r31, r31, r31 C U1 mt
376 lda r16, 64(r16) C L1 bookkeeping
377 addq r6, r12, r22 C U0 hi add => answer
379 bis r31, r31, r31 C U1 mt
380 cmpult r22, r12, r21 C L0 hi add => carry
381 addq r14, r20, r14 C U0 hi mul + carry
382 ldq r6, -16(r16) C L1
384 bis r31, r31, r31 C U1 mt
385 addq r7, r15, r23 C L0 lo + acc
386 addq r14, r21, r14 C U0 hi mul + carry
389 umulh r19, r1, r12 C U1
390 cmpult r23, r15, r20 C L0 lo add => carry
391 addq r23, r14, r23 C U0 hi add => answer
394 mulq r19, r2, r13 C U1
395 cmpult r23, r14, r21 C L0 hi add => carry
396 addq r8, r20, r8 C U0 hi mul + carry
399 umulh r19, r2, r14 C U1
400 addq r4, r9, r4 C L0 lo + acc
401 stq r22, -48(r16) C L0
402 stq r23, -40(r16) C L1
404 bis r31, r31, r31 C L0 st slosh
405 mulq r19, r3, r15 C U1
406 bis r31, r31, r31 C L1 st slosh
407 addq r8, r21, r8 C U0 hi mul + carry
409 cmpult r4, r9, r20 C L0 lo add => carry
410 addq r4, r8, r22 C U0 hi add => answer
411 bis r31, r31, r31 C L1 mt
412 bgt r18, $Loop C U1 bookkeeping
414 C ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
C Finish: drain the pipeline with the same add/carry chains as the loop
C body, but with no pointer or count updates and no new s1 loads among
C the visible lines.
416 cmpult r22, r8, r21 C L0 hi add => carry
417 addq r10, r20, r10 C U0 hi mul + carry
419 addq r5, r11, r23 C L0 lo + acc
420 addq r10, r21, r10 C L0 hi mul + carry
422 umulh r19, r3, r8 C U1
423 cmpult r23, r11, r20 C L0 lo add => carry
424 addq r23, r10, r23 C U0 hi add => answer
425 mulq r19, r0, r9 C U1
426 cmpult r23, r10, r21 C L0 hi add => carry
427 addq r12, r20, r12 C U0 hi mul + carry
428 umulh r19, r0, r10 C U1
429 addq r6, r13, r6 C L0 lo + acc
430 stq r22, -32(r16) C L0
431 stq r23, -24(r16) C L1
432 mulq r19, r1, r11 C U1
433 addq r12, r21, r12 C U0 hi mul + carry
434 cmpult r6, r13, r20 C L0 lo add => carry
435 addq r6, r12, r22 C U0 hi add => answer
436 cmpult r22, r12, r21 C L0 hi add => carry
437 addq r14, r20, r14 C U0 hi mul + carry
438 addq r7, r15, r23 C L0 lo + acc
439 addq r14, r21, r14 C U0 hi mul + carry
440 umulh r19, r1, r12 C U1
441 cmpult r23, r15, r20 C L0 lo add => carry
442 addq r23, r14, r23 C U0 hi add => answer
443 cmpult r23, r14, r21 C L0 hi add => carry
444 addq r8, r20, r8 C U0 hi mul + carry
445 addq r4, r9, r4 C U0 lo + acc
446 stq r22, -16(r16) C L0
447 stq r23, -8(r16) C L1
448 bis r31, r31, r31 C L0 st slosh
449 addq r8, r21, r8 C L0 hi mul + carry
450 cmpult r4, r9, r20 C L0 lo add => carry
451 addq r4, r8, r22 C U0 hi add => answer
452 cmpult r22, r8, r21 C L0 hi add => carry
453 addq r10, r20, r10 C U0 hi mul + carry
454 addq r5, r11, r23 C L0 lo + acc
455 addq r10, r21, r10 C L0 hi mul + carry
456 cmpult r23, r11, r20 C L0 lo add => carry
457 addq r23, r10, r23 C U0 hi add => answer
458 cmpult r23, r10, r21 C L0 hi add => carry
459 addq r12, r20, r12 C U0 hi mul + carry
C Last high product plus the last carry goes to r0, presumably the carry
C limb handed back to the caller -- TODO confirm.
462 addq r12, r21, r0 C U0 hi mul + carry
473 EPILOGUE(mpn_addmul_1)