; HP-PA 2.0 64-bit __gmpn_submul_1 -- Multiply a limb vector with a limb and
; subtract the result from a second limb vector.

; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.

; This file is part of the GNU MP Library.

; The GNU MP Library is free software; you can redistribute it and/or modify
; it under the terms of the GNU Lesser General Public License as published by
; the Free Software Foundation; either version 2.1 of the License, or (at your
; option) any later version.

; The GNU MP Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
; License for more details.

; You should have received a copy of the GNU Lesser General Public License
; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
; MA 02111-1307, USA.

; INPUT PARAMETERS
#define rptr		%r26
#define sptr		%r25
#define size		%r24
#define s2limb		%r23

; This runs at 11 cycles/limb on a PA8000.  It might be possible to make
; it faster, but the PA8000 pipeline is not publicly documented and it
; is very complex to reverse engineer.

#define t1	%r19
#define rlimb	%r20
#define hi	%r21
#define lo	%r22
#define m0	%r28
#define m1	%r3
#define cylimb	%r29
#define t3	%r4
#define t2	%r6
#define t5	%r23
#define t4	%r31

	.level	2.0w
	.code
	.export	__gmpn_submul_1,entry
__gmpn_submul_1
	.proc
	.callinfo frame=128,no_calls
	.entry
	std		s2limb,-56(%r30)
	fldd		-56(%r30),%fr5
	ldo		128(%r30),%r30
	add		%r0,%r0,cylimb		; clear cy and cylimb
	std		%r3,-96(%r30)		; save callee-saved registers
	std		%r4,-88(%r30)
	std		%r5,-80(%r30)
	std		%r6,-72(%r30)
	depdi,z		1,31,1,%r5		; %r5 = 2^32, mid-product carry fix-up

	fldd		0(sptr),%fr4
	ldo		8(sptr),sptr

	xmpyu		%fr5R,%fr4R,%fr6
	fstd		%fr6,-128(%r30)
	xmpyu		%fr5R,%fr4L,%fr7
	fstd		%fr7,-120(%r30)
	xmpyu		%fr5L,%fr4R,%fr8
	fstd		%fr8,-112(%r30)
	xmpyu		%fr5L,%fr4L,%fr9
	fstd		%fr9,-104(%r30)
	ldd		-128(%r30),lo		; lo = low 64 bit of product
	ldd		-120(%r30),m0		; m0 = mid0 64 bit of product
	ldd		-112(%r30),m1		; m1 = mid1 64 bit of product
	ldd		-104(%r30),hi		; hi = high 64 bit of product
	addib,=		-1,%r24,L$end1
	nop
	fldd		0(sptr),%fr4
	ldo		8(sptr),sptr
	addib,=		-1,%r24,L$end2
	nop
L$loop
	xmpyu		%fr5R,%fr4R,%fr6
	fstd		%fr6,-128(%r30)
	xmpyu		%fr5R,%fr4L,%fr7
	fstd		%fr7,-120(%r30)
	xmpyu		%fr5L,%fr4R,%fr8
	fstd		%fr8,-112(%r30)
	xmpyu		%fr5L,%fr4L,%fr9
	fstd		%fr9,-104(%r30)
	ldd		0(rptr),rlimb
	extrd,u		lo,31,32,t1		; t1 = hi32(lo)
	extrd,u		lo,63,32,t4		; t4 = lo32(lo)
	add,l		m0,t1,t1		; t1 += m0
	add,l,*nuv	m1,t1,t1		; t1 += m1
	add,l		%r5,hi,hi		; propagate carry
	extrd,u		t1,31,32,t2		; t2 = hi32(t1)
	depd,z		t1,31,32,t5		; t5 = lo32(t1)
	add,l		t5,t4,t4		; t4 += lo32(t1)
	ldd		-128(%r30),lo		; lo = low 64 bit of product
	add		cylimb,t4,t4		; t4 += carry-in limb
	ldd		-120(%r30),m0		; m0 = mid0 64 bit of product
	add,dc		t2,hi,cylimb		; cylimb = hi + t2 + carry
	ldd		-112(%r30),m1		; m1 = mid1 64 bit of product
	sub		rlimb,t4,t3		; t3 = rlimb - t4
	add		t4,t3,%r0		; regenerate the borrow as a carry
	ldd		-104(%r30),hi		; hi = high 64 bit of product
	add,dc		%r0,cylimb,cylimb	; cylimb += borrow
	fldd		0(sptr),%fr4
	ldo		8(sptr),sptr
	std		t3,0(rptr)
	addib,<>	-1,%r24,L$loop
	ldo		8(rptr),rptr
L$end2
	xmpyu		%fr5R,%fr4R,%fr6
	fstd		%fr6,-128(%r30)
	xmpyu		%fr5R,%fr4L,%fr7
	fstd		%fr7,-120(%r30)
	xmpyu		%fr5L,%fr4R,%fr8
	fstd		%fr8,-112(%r30)
	xmpyu		%fr5L,%fr4L,%fr9
	fstd		%fr9,-104(%r30)
	ldd		0(rptr),rlimb
	extrd,u		lo,31,32,t1		; t1 = hi32(lo)
	extrd,u		lo,63,32,t4		; t4 = lo32(lo)
	add,l		m0,t1,t1		; t1 += m0
	add,l,*nuv	m1,t1,t1		; t1 += m1
	add,l		%r5,hi,hi		; propagate carry
	extrd,u		t1,31,32,t2		; t2 = hi32(t1)
	depd,z		t1,31,32,t5		; t5 = lo32(t1)
	add,l		t5,t4,t4		; t4 += lo32(t1)
	ldd		-128(%r30),lo		; lo = low 64 bit of product
	add		cylimb,t4,t4		; t4 += carry-in limb
	ldd		-120(%r30),m0		; m0 = mid0 64 bit of product
	add,dc		t2,hi,cylimb		; cylimb = hi + t2 + carry
	ldd		-112(%r30),m1		; m1 = mid1 64 bit of product
	sub		rlimb,t4,t3		; t3 = rlimb - t4
	add		t4,t3,%r0		; regenerate the borrow as a carry
	ldd		-104(%r30),hi		; hi = high 64 bit of product
	add,dc		%r0,cylimb,cylimb	; cylimb += borrow
	std		t3,0(rptr)
	ldo		8(rptr),rptr
L$end1
	ldd		0(rptr),rlimb
	extrd,u		lo,31,32,t1		; t1 = hi32(lo)
	extrd,u		lo,63,32,t4		; t4 = lo32(lo)
	add,l		m0,t1,t1		; t1 += m0
	add,l,*nuv	m1,t1,t1		; t1 += m1
	add,l		%r5,hi,hi		; propagate carry
	extrd,u		t1,31,32,t2		; t2 = hi32(t1)
	depd,z		t1,31,32,t5		; t5 = lo32(t1)
	add,l		t5,t4,t4		; t4 += lo32(t1)
	add		cylimb,t4,t4		; t4 += carry-in limb
	add,dc		t2,hi,cylimb		; cylimb = hi + t2 + carry
	sub		rlimb,t4,t3		; t3 = rlimb - t4
	add		t4,t3,%r0		; regenerate the borrow as a carry
	add,dc		%r0,cylimb,cylimb	; cylimb += borrow
	std		t3,0(rptr)
	ldo		8(rptr),rptr
	ldd		-96(%r30),%r3		; restore callee-saved registers
	ldd		-88(%r30),%r4
	ldd		-80(%r30),%r5
	ldd		-72(%r30),%r6
	copy		cylimb,%r28		; return the carry limb
	bve		(%r2)
	.exit
	ldo		-128(%r30),%r30
	.procend
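
; The routine computes {rptr, size} -= {sptr, size} * s2limb and returns
; the borrow limb in %r28.  The sketch below is not part of the original
; source: it is a minimal C reference, assuming 64-bit limbs, that mirrors
; how the four xmpyu 32x32-bit partial products are recombined above.  The
; names (ref_submul_1, rp, sp, n, v) are illustrative only.
;
;   typedef unsigned long long mp_limb_t;            /* 64-bit limb */
;
;   mp_limb_t
;   ref_submul_1 (mp_limb_t *rp, const mp_limb_t *sp, long n, mp_limb_t v)
;   {
;     mp_limb_t cy = 0;                              /* cylimb */
;     for (long i = 0; i < n; i++)
;       {
;         mp_limb_t s = sp[i];
;         /* four 32x32->64 partial products (the xmpyu instructions) */
;         mp_limb_t lo = (v & 0xffffffff) * (s & 0xffffffff); /* fr5R*fr4R */
;         mp_limb_t m0 = (v & 0xffffffff) * (s >> 32);        /* fr5R*fr4L */
;         mp_limb_t m1 = (v >> 32) * (s & 0xffffffff);        /* fr5L*fr4R */
;         mp_limb_t hi = (v >> 32) * (s >> 32);               /* fr5L*fr4L */
;         /* recombine; a carry out of t1 is worth 2^32 in hi */
;         mp_limb_t t1 = (lo >> 32) + m0;
;         t1 += m1;
;         if (t1 < m1)
;           hi += 1ULL << 32;                        /* the add,l %r5,hi,hi */
;         mp_limb_t plo = (lo & 0xffffffff) | (t1 << 32);
;         mp_limb_t phi = hi + (t1 >> 32);
;         /* add carry-in limb, subtract from rp[i], compute carry-out */
;         plo += cy;
;         phi += plo < cy;
;         mp_limb_t r = rp[i];
;         rp[i] = r - plo;
;         cy = phi + (r < plo);                      /* borrow out */
;       }
;     return cy;
;   }
;
; The sub/add/add,dc sequence in the assembly implements the last three C
; lines: adding t4 back to rlimb - t4 sets the hardware carry exactly when
; the subtraction borrowed, so add,dc can fold the borrow into cylimb.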