--- /dev/null
+! SPARC v9 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+! store difference in a third limb vector.
+
+! Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr %o0
+! s1_ptr %o1
+! s2_ptr %o2
+! size %o3
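+
+! RETURN VALUE
+! borrow	%o0	(0 or 1)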
+
+include(`../config.m4')
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+PROLOGUE(mpn_sub_n)
+
+! 12 mem ops >= 12 cycles
+! 8 shift insns >= 8 cycles
+! 8 subccc, executing alone, +8 cycles
+! Unrolling is not mandatory; perhaps 2-way would be enough?
+! Put one ldx/stx and one srlx per issue tuple, and fill the remaining
+! slots with pointer arithmetic and loop control.
+! All in all, this loop runs at 5 cycles/limb.
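+
+! SPARC v9 subccc consumes and produces only the 32-bit (icc) carry bit;
+! there is no subtract-with-borrow driven by the 64-bit (xcc) carry.  The
+! borrow chain is therefore run through icc: each subccc still yields the
+! correct full 64-bit limb difference, and the high 32-bit halves (extracted
+! with srlx) are then subtracted into %g0 purely to propagate the borrow out
+! of bit 63 back into icc for the next limb.
+
+! A portable C sketch of one limb step (illustrative only, not part of the
+! build; u, v, d are 64-bit limbs, cy is the running borrow, 0 or 1):
+!
+!	d  = u - v - cy;			/* stored difference */
+!	cy = (u < v) | ((u - v) < cy);		/* borrow out */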
+
+ save %sp,-160,%sp
+
+	addcc	%g0,%g0,%g0	! clear the icc carry bit
+
+	add	%i3,-4,%i3	! size -= 4
+	brlz,pn	%i3,L(there)	! fewer than four limbs: use the simple loop
+	nop
+
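+! Software pipeline: preload the first four limbs from each operand.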
+ ldx [%i1+0],%l0
+ ldx [%i2+0],%l4
+ ldx [%i1+8],%l1
+ ldx [%i2+8],%l5
+ ldx [%i1+16],%l2
+ ldx [%i2+16],%l6
+ ldx [%i1+24],%l3
+ ldx [%i2+24],%l7
+ add %i1,32,%i1
+ add %i2,32,%i2
+
+	add	%i3,-4,%i3	! account for the four limbs just loaded
+	brlz,pn	%i3,L(skip)	! fewer than eight limbs in total: drain now
+	nop
+ b L(loop1) ! jump instead of executing many NOPs
+ nop
+ ALIGN(32)
+!--------- Start main loop ---------
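+! Each !- below separates an intended issue tuple: at most one load/store
+! and one shift per tuple, as outlined above.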
+L(loop1):
+	subccc	%l0,%l4,%g1	! full 64-bit difference; icc.c = low-half borrow
+!-
+ srlx %l0,32,%o0
+ ldx [%i1+0],%l0
+!-
+ srlx %l4,32,%o4
+ ldx [%i2+0],%l4
+!-
+	subccc	%o0,%o4,%g0	! high halves only, to recover the bit-63 borrow
+!-
+ subccc %l1,%l5,%g2
+!-
+ srlx %l1,32,%o1
+ ldx [%i1+8],%l1
+!-
+ srlx %l5,32,%o5
+ ldx [%i2+8],%l5
+!-
+ subccc %o1,%o5,%g0
+!-
+ subccc %l2,%l6,%g3
+!-
+ srlx %l2,32,%o2
+ ldx [%i1+16],%l2
+!-
+	srlx	%l6,32,%g5	! asymmetry: %o6 is the stack pointer, so use %g5
+ ldx [%i2+16],%l6
+!-
+ subccc %o2,%g5,%g0
+!-
+ subccc %l3,%l7,%g4
+!-
+ srlx %l3,32,%o3
+ ldx [%i1+24],%l3
+ add %i1,32,%i1
+!-
+ srlx %l7,32,%o7
+ ldx [%i2+24],%l7
+ add %i2,32,%i2
+!-
+ subccc %o3,%o7,%g0
+!-
+ stx %g1,[%i0+0]
+!-
+ stx %g2,[%i0+8]
+!-
+ stx %g3,[%i0+16]
+ add %i3,-4,%i3
+!-
+ stx %g4,[%i0+24]
+ add %i0,32,%i0
+
+ brgez,pt %i3,L(loop1)
+ nop
+!--------- End main loop ---------
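+! Drain the pipeline: subtract and store the four limbs that were loaded
+! but not yet processed when the main loop exited.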
+L(skip):
+ subccc %l0,%l4,%g1
+ srlx %l0,32,%o0
+ srlx %l4,32,%o4
+ subccc %o0,%o4,%g0
+ subccc %l1,%l5,%g2
+ srlx %l1,32,%o1
+ srlx %l5,32,%o5
+ subccc %o1,%o5,%g0
+ subccc %l2,%l6,%g3
+ srlx %l2,32,%o2
+	srlx	%l6,32,%g5	! asymmetry: %o6 is the stack pointer, so use %g5
+ subccc %o2,%g5,%g0
+ subccc %l3,%l7,%g4
+ srlx %l3,32,%o3
+ srlx %l7,32,%o7
+ subccc %o3,%o7,%g0
+ stx %g1,[%i0+0]
+ stx %g2,[%i0+8]
+ stx %g3,[%i0+16]
+ stx %g4,[%i0+24]
+ add %i0,32,%i0
+
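+! At most three limbs remain.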
+L(there):
+	add	%i3,4,%i3	! undo the -4 bias; %i3 = limbs remaining (0..3)
+ brz,pt %i3,L(end)
+ nop
+
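+! One limb per iteration, using the same split-borrow trick as above.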
+L(loop2):
+ ldx [%i1+0],%l0
+ add %i1,8,%i1
+ ldx [%i2+0],%l4
+ add %i2,8,%i2
+ srlx %l0,32,%g2
+ srlx %l4,32,%g3
+ subccc %l0,%l4,%g1
+ subccc %g2,%g3,%g0
+ stx %g1,[%i0+0]
+ add %i0,8,%i0
+ add %i3,-1,%i3
+ brgz,pt %i3,L(loop2)
+ nop
+
+L(end):	addc	%g0,%g0,%i0	! return the final borrow (0 or 1)
+ ret
+ restore
+EPILOGUE(mpn_sub_n)