From: simonm Date: Fri, 5 Jun 1998 14:43:46 +0000 (+0000) Subject: [project @ 1998-06-05 14:43:45 by simonm] X-Git-Tag: Approx_2487_patches~600 X-Git-Url: http://git.megacz.com/?a=commitdiff_plain;ds=inline;h=7d11a7c78fa988c079a43b092284f255c3788a10;p=ghc-hetmet.git [project @ 1998-06-05 14:43:45 by simonm] Initial revision --- diff --git a/ghc/rts/gmp/mpn/alpha/add_n.s b/ghc/rts/gmp/mpn/alpha/add_n.s new file mode 100644 index 0000000..426556e --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/add_n.s @@ -0,0 +1,120 @@ + # Alpha __mpn_add_n -- Add two limb vectors of the same length > 0 and + # store sum in a third limb vector. + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. + + + # INPUT PARAMETERS + # res_ptr $16 + # s1_ptr $17 + # s2_ptr $18 + # size $19 + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_add_n + .ent __mpn_add_n +__mpn_add_n: + .frame $30,0,$26,0 + + ldq $3,0($17) + ldq $4,0($18) + + subq $19,1,$19 + and $19,4-1,$2 # number of limbs in first loop + bis $31,$31,$0 + beq $2,.L0 # if multiple of 4 limbs, skip first loop + + subq $19,$2,$19 + +.Loop0: subq $2,1,$2 + ldq $5,8($17) + addq $4,$0,$4 + ldq $6,8($18) + cmpult $4,$0,$1 + addq $3,$4,$4 + cmpult $4,$3,$0 + stq $4,0($16) + or $0,$1,$0 + + addq $17,8,$17 + addq $18,8,$18 + bis $5,$5,$3 + bis $6,$6,$4 + addq $16,8,$16 + bne $2,.Loop0 + +.L0: beq $19,.Lend + + .align 3 +.Loop: subq $19,4,$19 + + ldq $5,8($17) + addq $4,$0,$4 + ldq $6,8($18) + cmpult $4,$0,$1 + addq $3,$4,$4 + cmpult $4,$3,$0 + stq $4,0($16) + or $0,$1,$0 + + ldq $3,16($17) + addq $6,$0,$6 + ldq $4,16($18) + cmpult $6,$0,$1 + addq $5,$6,$6 + cmpult $6,$5,$0 + stq $6,8($16) + or $0,$1,$0 + + ldq $5,24($17) + addq $4,$0,$4 + ldq $6,24($18) + cmpult $4,$0,$1 + addq $3,$4,$4 + cmpult $4,$3,$0 + stq $4,16($16) + or $0,$1,$0 + + ldq $3,32($17) + addq $6,$0,$6 + ldq $4,32($18) + cmpult $6,$0,$1 + addq $5,$6,$6 + cmpult $6,$5,$0 + stq $6,24($16) + or $0,$1,$0 + + addq $17,32,$17 + addq $18,32,$18 + addq $16,32,$16 + bne $19,.Loop + +.Lend: addq $4,$0,$4 + cmpult $4,$0,$1 + addq $3,$4,$4 + cmpult $4,$3,$0 + stq $4,0($16) + or $0,$1,$0 + ret $31,($26),1 + + .end __mpn_add_n diff --git a/ghc/rts/gmp/mpn/alpha/addmul_1.s b/ghc/rts/gmp/mpn/alpha/addmul_1.s new file mode 100644 index 0000000..048238a --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/addmul_1.s @@ -0,0 +1,92 @@ + # Alpha 21064 __mpn_addmul_1 -- Multiply a limb vector with a limb and add + # the result to a second limb vector. + + # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # s2_limb r19 + + # This code runs at 42 cycles/limb on EV4 and 18 cycles/limb on EV5. + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_addmul_1 + .ent __mpn_addmul_1 2 +__mpn_addmul_1: + .frame $30,0,$26 + + ldq $2,0($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + subq $18,1,$18 # size-- + mulq $2,$19,$3 # $3 = prod_low + ldq $5,0($16) # $5 = *res_ptr + umulh $2,$19,$0 # $0 = prod_high + beq $18,.Lend1 # jump if size was == 1 + ldq $2,0($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + subq $18,1,$18 # size-- + addq $5,$3,$3 + cmpult $3,$5,$4 + stq $3,0($16) + addq $16,8,$16 # res_ptr++ + beq $18,.Lend2 # jump if size was == 2 + + .align 3 +.Loop: mulq $2,$19,$3 # $3 = prod_low + ldq $5,0($16) # $5 = *res_ptr + addq $4,$0,$0 # cy_limb = cy_limb + 'cy' + subq $18,1,$18 # size-- + umulh $2,$19,$4 # $4 = cy_limb + ldq $2,0($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + addq $3,$0,$3 # $3 = cy_limb + prod_low + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + addq $5,$3,$3 + cmpult $3,$5,$5 + stq $3,0($16) + addq $16,8,$16 # res_ptr++ + addq $5,$0,$0 # combine carries + bne $18,.Loop + +.Lend2: mulq $2,$19,$3 # $3 = prod_low + ldq $5,0($16) # $5 = *res_ptr + addq $4,$0,$0 # cy_limb = cy_limb + 'cy' + umulh $2,$19,$4 # $4 = cy_limb + addq $3,$0,$3 # $3 = cy_limb + prod_low + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + addq $5,$3,$3 + cmpult $3,$5,$5 + stq $3,0($16) + addq $5,$0,$0 # combine carries + addq $4,$0,$0 # cy_limb = prod_high + cy + ret $31,($26),1 +.Lend1: addq $5,$3,$3 + cmpult $3,$5,$5 + stq $3,0($16) + addq $0,$5,$0 + ret $31,($26),1 + + .end __mpn_addmul_1 diff --git a/ghc/rts/gmp/mpn/alpha/ev5/add_n.s b/ghc/rts/gmp/mpn/alpha/ev5/add_n.s new file mode 100644 index 0000000..1251a1f --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/ev5/add_n.s @@ -0,0 +1,148 @@ + # Alpha __mpn_add_n -- Add two limb vectors of the same length > 0 and + # store sum in a third limb vector. + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. + + + # INPUT PARAMETERS + # res_ptr $16 + # s1_ptr $17 + # s2_ptr $18 + # size $19 + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_add_n + .ent __mpn_add_n +__mpn_add_n: + .frame $30,0,$26,0 + + or $31,$31,$25 # clear cy + subq $19,4,$19 # decr loop cnt + blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop + # Start software pipeline for 1st loop + ldq $0,0($18) + ldq $1,8($18) + ldq $4,0($17) + ldq $5,8($17) + addq $17,32,$17 # update s1_ptr + ldq $2,16($18) + addq $0,$4,$20 # 1st main add + ldq $3,24($18) + subq $19,4,$19 # decr loop cnt + ldq $6,-16($17) + cmpult $20,$0,$25 # compute cy from last add + ldq $7,-8($17) + addq $1,$25,$28 # cy add + addq $18,32,$18 # update s2_ptr + addq $5,$28,$21 # 2nd main add + cmpult $28,$25,$8 # compute cy from last add + blt $19,.Lend1 # if less than 4 limbs remain, jump + # 1st loop handles groups of 4 limbs in a software pipeline + .align 4 +.Loop: cmpult $21,$28,$25 # compute cy from last add + ldq $0,0($18) + or $8,$25,$25 # combine cy from the two adds + ldq $1,8($18) + addq $2,$25,$28 # cy add + ldq $4,0($17) + addq $28,$6,$22 # 3rd main add + ldq $5,8($17) + cmpult $28,$25,$8 # compute cy from last add + cmpult $22,$28,$25 # compute cy from last add + stq $20,0($16) + or $8,$25,$25 # combine cy from the two adds + stq $21,8($16) + addq $3,$25,$28 # cy add + addq $28,$7,$23 # 4th main add + cmpult $28,$25,$8 # compute cy from last add + cmpult $23,$28,$25 # compute cy from last add + addq $17,32,$17 # update s1_ptr + or $8,$25,$25 # combine cy from the two adds + addq $16,32,$16 # update res_ptr + addq $0,$25,$28 # cy add + ldq $2,16($18) + addq $4,$28,$20 # 1st main add + ldq $3,24($18) + cmpult $28,$25,$8 # compute cy from last add + ldq $6,-16($17) + cmpult $20,$28,$25 # compute cy from last add + ldq $7,-8($17) + or $8,$25,$25 # combine cy from the two adds + subq $19,4,$19 # decr loop cnt + stq $22,-16($16) + addq $1,$25,$28 # cy add + stq $23,-8($16) + addq $5,$28,$21 # 2nd main add + addq $18,32,$18 # update s2_ptr + cmpult $28,$25,$8 # compute cy from last add + bge $19,.Loop + # Finish software pipeline for 1st loop +.Lend1: cmpult $21,$28,$25 # compute cy from last add + or $8,$25,$25 # combine cy from the two adds + addq $2,$25,$28 # cy add + addq $28,$6,$22 # 3rd main add + cmpult $28,$25,$8 # compute cy from last add + cmpult $22,$28,$25 # compute cy from last add + stq $20,0($16) + or $8,$25,$25 # combine cy from the two adds + stq $21,8($16) + addq $3,$25,$28 # cy add + addq $28,$7,$23 # 4th main add + cmpult $28,$25,$8 # compute cy from last add + cmpult $23,$28,$25 # compute cy from last add + or $8,$25,$25 # combine cy from the two adds + addq $16,32,$16 # update res_ptr + stq $22,-16($16) + stq $23,-8($16) +.Lend2: addq $19,4,$19 # restore loop cnt + beq $19,.Lret + # Start software pipeline for 2nd loop + ldq $0,0($18) + ldq $4,0($17) + subq $19,1,$19 + beq $19,.Lend0 + # 2nd loop handles remaining 1-3 limbs + .align 4 +.Loop0: addq $0,$25,$28 # cy add + ldq $0,8($18) + addq $4,$28,$20 # main add + ldq $4,8($17) + addq $18,8,$18 + cmpult $28,$25,$8 # compute cy from last add + addq $17,8,$17 + stq $20,0($16) + cmpult $20,$28,$25 # compute cy from last add + subq $19,1,$19 # decr loop cnt + or $8,$25,$25 # combine cy from the two adds + addq $16,8,$16 + bne $19,.Loop0 +.Lend0: addq $0,$25,$28 # cy add + addq $4,$28,$20 # main add + cmpult $28,$25,$8 # compute cy from last add + cmpult $20,$28,$25 # compute cy from last add + stq $20,0($16) + or $8,$25,$25 # combine cy from the two adds + +.Lret: or $25,$31,$0 # return cy + ret $31,($26),1 + .end __mpn_add_n diff --git a/ghc/rts/gmp/mpn/alpha/ev5/lshift.s b/ghc/rts/gmp/mpn/alpha/ev5/lshift.s new file mode 100644 index 0000000..ced55b7 --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/ev5/lshift.s @@ -0,0 +1,174 @@ + # Alpha EV5 __mpn_lshift -- + + # Copyright (C) 1994, 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # cnt r19 + + # This code runs at 3.25 cycles/limb on the EV5. + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_lshift + .ent __mpn_lshift +__mpn_lshift: + .frame $30,0,$26,0 + + s8addq $18,$17,$17 # make r17 point at end of s1 + ldq $4,-8($17) # load first limb + subq $31,$19,$20 + s8addq $18,$16,$16 # make r16 point at end of RES + subq $18,1,$18 + and $18,4-1,$28 # number of limbs in first loop + srl $4,$20,$0 # compute function result + + beq $28,.L0 + subq $18,$28,$18 + + .align 3 +.Loop0: ldq $3,-16($17) + subq $16,8,$16 + sll $4,$19,$5 + subq $17,8,$17 + subq $28,1,$28 + srl $3,$20,$6 + or $3,$3,$4 + or $5,$6,$8 + stq $8,0($16) + bne $28,.Loop0 + +.L0: sll $4,$19,$24 + beq $18,.Lend + # warm up phase 1 + ldq $1,-16($17) + subq $18,4,$18 + ldq $2,-24($17) + ldq $3,-32($17) + ldq $4,-40($17) + beq $18,.Lend1 + # warm up phase 2 + srl $1,$20,$7 + sll $1,$19,$21 + srl $2,$20,$8 + ldq $1,-48($17) + sll $2,$19,$22 + ldq $2,-56($17) + srl $3,$20,$5 + or $7,$24,$7 + sll $3,$19,$23 + or $8,$21,$8 + srl $4,$20,$6 + ldq $3,-64($17) + sll $4,$19,$24 + ldq $4,-72($17) + subq $18,4,$18 + beq $18,.Lend2 + .align 4 + # main loop +.Loop: stq $7,-8($16) + or $5,$22,$5 + stq $8,-16($16) + or $6,$23,$6 + + srl $1,$20,$7 + subq $18,4,$18 + sll $1,$19,$21 + unop # ldq $31,-96($17) + + srl $2,$20,$8 + ldq $1,-80($17) + sll $2,$19,$22 + ldq $2,-88($17) + + stq $5,-24($16) + or $7,$24,$7 + stq $6,-32($16) + or $8,$21,$8 + + srl $3,$20,$5 + unop # ldq $31,-96($17) + sll $3,$19,$23 + subq $16,32,$16 + + srl $4,$20,$6 + ldq $3,-96($17) + sll $4,$19,$24 + ldq $4,-104($17) + + subq $17,32,$17 + bne $18,.Loop + # cool down phase 2/1 +.Lend2: stq $7,-8($16) + or $5,$22,$5 + stq $8,-16($16) + or $6,$23,$6 + srl $1,$20,$7 + sll $1,$19,$21 + srl $2,$20,$8 + sll $2,$19,$22 + stq $5,-24($16) + or $7,$24,$7 + stq $6,-32($16) + or $8,$21,$8 + srl $3,$20,$5 + sll $3,$19,$23 + srl $4,$20,$6 + sll $4,$19,$24 + # cool down phase 2/2 + stq $7,-40($16) + or $5,$22,$5 + stq $8,-48($16) + or $6,$23,$6 + stq $5,-56($16) + stq $6,-64($16) + # cool down phase 2/3 + stq $24,-72($16) + ret $31,($26),1 + + # cool down phase 1/1 +.Lend1: srl $1,$20,$7 + sll $1,$19,$21 + srl $2,$20,$8 + sll $2,$19,$22 + srl $3,$20,$5 + or $7,$24,$7 + sll $3,$19,$23 + or $8,$21,$8 + srl $4,$20,$6 + sll $4,$19,$24 + # cool down phase 1/2 + stq $7,-8($16) + or $5,$22,$5 + stq $8,-16($16) + or $6,$23,$6 + stq $5,-24($16) + stq $6,-32($16) + stq $24,-40($16) + ret $31,($26),1 + +.Lend: stq $24,-8($16) + ret $31,($26),1 + .end __mpn_lshift diff --git a/ghc/rts/gmp/mpn/alpha/ev5/rshift.s b/ghc/rts/gmp/mpn/alpha/ev5/rshift.s new file mode 100644 index 0000000..6e24fef --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/ev5/rshift.s @@ -0,0 +1,172 @@ + # Alpha EV5 __mpn_rshift -- + + # Copyright (C) 1994, 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # cnt r19 + + # This code runs at 3.25 cycles/limb on the EV5. + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_rshift + .ent __mpn_rshift +__mpn_rshift: + .frame $30,0,$26,0 + + ldq $4,0($17) # load first limb + subq $31,$19,$20 + subq $18,1,$18 + and $18,4-1,$28 # number of limbs in first loop + sll $4,$20,$0 # compute function result + + beq $28,.L0 + subq $18,$28,$18 + + .align 3 +.Loop0: ldq $3,8($17) + addq $16,8,$16 + srl $4,$19,$5 + addq $17,8,$17 + subq $28,1,$28 + sll $3,$20,$6 + or $3,$3,$4 + or $5,$6,$8 + stq $8,-8($16) + bne $28,.Loop0 + +.L0: srl $4,$19,$24 + beq $18,.Lend + # warm up phase 1 + ldq $1,8($17) + subq $18,4,$18 + ldq $2,16($17) + ldq $3,24($17) + ldq $4,32($17) + beq $18,.Lend1 + # warm up phase 2 + sll $1,$20,$7 + srl $1,$19,$21 + sll $2,$20,$8 + ldq $1,40($17) + srl $2,$19,$22 + ldq $2,48($17) + sll $3,$20,$5 + or $7,$24,$7 + srl $3,$19,$23 + or $8,$21,$8 + sll $4,$20,$6 + ldq $3,56($17) + srl $4,$19,$24 + ldq $4,64($17) + subq $18,4,$18 + beq $18,.Lend2 + .align 4 + # main loop +.Loop: stq $7,0($16) + or $5,$22,$5 + stq $8,8($16) + or $6,$23,$6 + + sll $1,$20,$7 + subq $18,4,$18 + srl $1,$19,$21 + unop # ldq $31,-96($17) + + sll $2,$20,$8 + ldq $1,72($17) + srl $2,$19,$22 + ldq $2,80($17) + + stq $5,16($16) + or $7,$24,$7 + stq $6,24($16) + or $8,$21,$8 + + sll $3,$20,$5 + unop # ldq $31,-96($17) + srl $3,$19,$23 + addq $16,32,$16 + + sll $4,$20,$6 + ldq $3,88($17) + srl $4,$19,$24 + ldq $4,96($17) + + addq $17,32,$17 + bne $18,.Loop + # cool down phase 2/1 +.Lend2: stq $7,0($16) + or $5,$22,$5 + stq $8,8($16) + or $6,$23,$6 + sll $1,$20,$7 + srl $1,$19,$21 + sll $2,$20,$8 + srl $2,$19,$22 + stq $5,16($16) + or $7,$24,$7 + stq $6,24($16) + or $8,$21,$8 + sll $3,$20,$5 + srl $3,$19,$23 + sll $4,$20,$6 + srl $4,$19,$24 + # cool down phase 2/2 + stq $7,32($16) + or $5,$22,$5 + stq $8,40($16) + or $6,$23,$6 + stq $5,48($16) + stq $6,56($16) + # cool down phase 2/3 + stq $24,64($16) + ret $31,($26),1 + + # cool down phase 1/1 +.Lend1: sll $1,$20,$7 + srl $1,$19,$21 + sll $2,$20,$8 + srl $2,$19,$22 + sll $3,$20,$5 + or $7,$24,$7 + srl $3,$19,$23 + or $8,$21,$8 + sll $4,$20,$6 + srl $4,$19,$24 + # cool down phase 1/2 + stq $7,0($16) + or $5,$22,$5 + stq $8,8($16) + or $6,$23,$6 + stq $5,16($16) + stq $6,24($16) + stq $24,32($16) + ret $31,($26),1 + +.Lend: stq $24,0($16) + ret $31,($26),1 + .end __mpn_rshift diff --git a/ghc/rts/gmp/mpn/alpha/ev5/sub_n.s b/ghc/rts/gmp/mpn/alpha/ev5/sub_n.s new file mode 100644 index 0000000..6743af5 --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/ev5/sub_n.s @@ -0,0 +1,149 @@ + # Alpha __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and + # store difference in a third limb vector. + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. + + + # INPUT PARAMETERS + # res_ptr $16 + # s1_ptr $17 + # s2_ptr $18 + # size $19 + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_sub_n + .ent __mpn_sub_n +__mpn_sub_n: + .frame $30,0,$26,0 + + or $31,$31,$25 # clear cy + subq $19,4,$19 # decr loop cnt + blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop + # Start software pipeline for 1st loop + ldq $0,0($18) + ldq $1,8($18) + ldq $4,0($17) + ldq $5,8($17) + addq $17,32,$17 # update s1_ptr + ldq $2,16($18) + subq $4,$0,$20 # 1st main sub + ldq $3,24($18) + subq $19,4,$19 # decr loop cnt + ldq $6,-16($17) + cmpult $4,$20,$25 # compute cy from last sub + ldq $7,-8($17) + addq $1,$25,$28 # cy add + addq $18,32,$18 # update s2_ptr + subq $5,$28,$21 # 2nd main sub + cmpult $28,$25,$8 # compute cy from last add + blt $19,.Lend1 # if less than 4 limbs remain, jump + # 1st loop handles groups of 4 limbs in a software pipeline + .align 4 +.Loop: cmpult $5,$21,$25 # compute cy from last add + ldq $0,0($18) + or $8,$25,$25 # combine cy from the two adds + ldq $1,8($18) + addq $2,$25,$28 # cy add + ldq $4,0($17) + subq $6,$28,$22 # 3rd main sub + ldq $5,8($17) + cmpult $28,$25,$8 # compute cy from last add + cmpult $6,$22,$25 # compute cy from last add + stq $20,0($16) + or $8,$25,$25 # combine cy from the two adds + stq $21,8($16) + addq $3,$25,$28 # cy add + subq $7,$28,$23 # 4th main sub + cmpult $28,$25,$8 # compute cy from last add + cmpult $7,$23,$25 # compute cy from last add + addq $17,32,$17 # update s1_ptr + or $8,$25,$25 # combine cy from the two adds + addq $16,32,$16 # update res_ptr + addq $0,$25,$28 # cy add + ldq $2,16($18) + subq $4,$28,$20 # 1st main sub + ldq $3,24($18) + cmpult $28,$25,$8 # compute cy from last add + ldq $6,-16($17) + cmpult $4,$20,$25 # compute cy from last add + ldq $7,-8($17) + or $8,$25,$25 # combine cy from the two adds + subq $19,4,$19 # decr loop cnt + stq $22,-16($16) + addq $1,$25,$28 # cy add + stq $23,-8($16) + subq $5,$28,$21 # 2nd main sub + addq $18,32,$18 # update s2_ptr + cmpult $28,$25,$8 # compute cy from last add + bge $19,.Loop + # Finish software pipeline for 1st loop +.Lend1: cmpult $5,$21,$25 # compute cy from last add + or $8,$25,$25 # combine cy from the two adds + addq $2,$25,$28 # cy add + subq $6,$28,$22 # 3rd main sub + cmpult $28,$25,$8 # compute cy from last add + cmpult $6,$22,$25 # compute cy from last add + stq $20,0($16) + or $8,$25,$25 # combine cy from the two adds + stq $21,8($16) + addq $3,$25,$28 # cy add + subq $7,$28,$23 # 4th main sub + cmpult $28,$25,$8 # compute cy from last add + cmpult $7,$23,$25 # compute cy from last add + or $8,$25,$25 # combine cy from the two adds + addq $16,32,$16 # update res_ptr + stq $22,-16($16) + stq $23,-8($16) +.Lend2: addq $19,4,$19 # restore loop cnt + beq $19,.Lret + # Start software pipeline for 2nd loop + ldq $0,0($18) + ldq $4,0($17) + subq $19,1,$19 + beq $19,.Lend0 + # 2nd loop handles remaining 1-3 limbs + .align 4 +.Loop0: addq $0,$25,$28 # cy add + ldq $0,8($18) + subq $4,$28,$20 # main sub + ldq $1,8($17) + addq $18,8,$18 + cmpult $28,$25,$8 # compute cy from last add + addq $17,8,$17 + stq $20,0($16) + cmpult $4,$20,$25 # compute cy from last add + subq $19,1,$19 # decr loop cnt + or $8,$25,$25 # combine cy from the two adds + addq $16,8,$16 + or $1,$31,$4 + bne $19,.Loop0 +.Lend0: addq $0,$25,$28 # cy add + subq $4,$28,$20 # main sub + cmpult $28,$25,$8 # compute cy from last add + cmpult $4,$20,$25 # compute cy from last add + stq $20,0($16) + or $8,$25,$25 # combine cy from the two adds + +.Lret: or $25,$31,$0 # return cy + ret $31,($26),1 + .end __mpn_sub_n diff --git a/ghc/rts/gmp/mpn/alpha/lshift.s b/ghc/rts/gmp/mpn/alpha/lshift.s new file mode 100644 index 0000000..13bd24a --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/lshift.s @@ -0,0 +1,109 @@ + # Alpha 21064 __mpn_lshift -- + + # Copyright (C) 1994, 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # cnt r19 + + # This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling, + # it would take 4 cycles/limb. It should be possible to get down to 3 + # cycles/limb since both ldq and stq can be paired with the other used + # instructions. But there are many restrictions in the 21064 pipeline that + # makes it hard, if not impossible, to get down to 3 cycles/limb: + + # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay. + # 2. Only aligned instruction pairs can be paired. + # 3. The store buffer or silo might not be able to deal with the bandwidth. + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_lshift + .ent __mpn_lshift +__mpn_lshift: + .frame $30,0,$26,0 + + s8addq $18,$17,$17 # make r17 point at end of s1 + ldq $4,-8($17) # load first limb + subq $17,8,$17 + subq $31,$19,$7 + s8addq $18,$16,$16 # make r16 point at end of RES + subq $18,1,$18 + and $18,4-1,$20 # number of limbs in first loop + srl $4,$7,$0 # compute function result + + beq $20,.L0 + subq $18,$20,$18 + + .align 3 +.Loop0: + ldq $3,-8($17) + subq $16,8,$16 + subq $17,8,$17 + subq $20,1,$20 + sll $4,$19,$5 + srl $3,$7,$6 + bis $3,$3,$4 + bis $5,$6,$8 + stq $8,0($16) + bne $20,.Loop0 + +.L0: beq $18,.Lend + + .align 3 +.Loop: ldq $3,-8($17) + subq $16,32,$16 + subq $18,4,$18 + sll $4,$19,$5 + srl $3,$7,$6 + + ldq $4,-16($17) + sll $3,$19,$1 + bis $5,$6,$8 + stq $8,24($16) + srl $4,$7,$2 + + ldq $3,-24($17) + sll $4,$19,$5 + bis $1,$2,$8 + stq $8,16($16) + srl $3,$7,$6 + + ldq $4,-32($17) + sll $3,$19,$1 + bis $5,$6,$8 + stq $8,8($16) + srl $4,$7,$2 + + subq $17,32,$17 + bis $1,$2,$8 + stq $8,0($16) + + bgt $18,.Loop + +.Lend: sll $4,$19,$8 + stq $8,-8($16) + ret $31,($26),1 + .end __mpn_lshift diff --git a/ghc/rts/gmp/mpn/alpha/mul_1.s b/ghc/rts/gmp/mpn/alpha/mul_1.s new file mode 100644 index 0000000..a1f5a94 --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/mul_1.s @@ -0,0 +1,85 @@ + # Alpha 21064 __mpn_mul_1 -- Multiply a limb vector with a limb and store + # the result in a second limb vector. + + # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # s2_limb r19 + + # This code runs at 42 cycles/limb on the EV4 and 18 cycles/limb on the EV5. + + # To improve performance for long multiplications, we would use + # 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use + # these instructions without slowing down the general code: 1. We can + # only have two prefetches in operation at any time in the Alpha + # architecture. 2. There will seldom be any special alignment + # between RES_PTR and S1_PTR. Maybe we can simply divide the current + # loop into an inner and outer loop, having the inner loop handle + # exactly one prefetch block? + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_mul_1 + .ent __mpn_mul_1 2 +__mpn_mul_1: + .frame $30,0,$26 + + ldq $2,0($17) # $2 = s1_limb + subq $18,1,$18 # size-- + mulq $2,$19,$3 # $3 = prod_low + bic $31,$31,$4 # clear cy_limb + umulh $2,$19,$0 # $0 = prod_high + beq $18,Lend1 # jump if size was == 1 + ldq $2,8($17) # $2 = s1_limb + subq $18,1,$18 # size-- + stq $3,0($16) + beq $18,Lend2 # jump if size was == 2 + + .align 3 +Loop: mulq $2,$19,$3 # $3 = prod_low + addq $4,$0,$0 # cy_limb = cy_limb + 'cy' + subq $18,1,$18 # size-- + umulh $2,$19,$4 # $4 = cy_limb + ldq $2,16($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + addq $3,$0,$3 # $3 = cy_limb + prod_low + stq $3,8($16) + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + addq $16,8,$16 # res_ptr++ + bne $18,Loop + +Lend2: mulq $2,$19,$3 # $3 = prod_low + addq $4,$0,$0 # cy_limb = cy_limb + 'cy' + umulh $2,$19,$4 # $4 = cy_limb + addq $3,$0,$3 # $3 = cy_limb + prod_low + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + stq $3,8($16) + addq $4,$0,$0 # cy_limb = prod_high + cy + ret $31,($26),1 +Lend1: stq $3,0($16) + ret $31,($26),1 + + .end __mpn_mul_1 diff --git a/ghc/rts/gmp/mpn/alpha/rshift.s b/ghc/rts/gmp/mpn/alpha/rshift.s new file mode 100644 index 0000000..389054a --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/rshift.s @@ -0,0 +1,107 @@ + # Alpha 21064 __mpn_rshift -- + + # Copyright (C) 1994, 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # cnt r19 + + # This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling, + # it would take 4 cycles/limb. It should be possible to get down to 3 + # cycles/limb since both ldq and stq can be paired with the other used + # instructions. But there are many restrictions in the 21064 pipeline that + # makes it hard, if not impossible, to get down to 3 cycles/limb: + + # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay. + # 2. Only aligned instruction pairs can be paired. + # 3. The store buffer or silo might not be able to deal with the bandwidth. + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_rshift + .ent __mpn_rshift +__mpn_rshift: + .frame $30,0,$26,0 + + ldq $4,0($17) # load first limb + addq $17,8,$17 + subq $31,$19,$7 + subq $18,1,$18 + and $18,4-1,$20 # number of limbs in first loop + sll $4,$7,$0 # compute function result + + beq $20,.L0 + subq $18,$20,$18 + + .align 3 +.Loop0: + ldq $3,0($17) + addq $16,8,$16 + addq $17,8,$17 + subq $20,1,$20 + srl $4,$19,$5 + sll $3,$7,$6 + bis $3,$3,$4 + bis $5,$6,$8 + stq $8,-8($16) + bne $20,.Loop0 + +.L0: beq $18,.Lend + + .align 3 +.Loop: ldq $3,0($17) + addq $16,32,$16 + subq $18,4,$18 + srl $4,$19,$5 + sll $3,$7,$6 + + ldq $4,8($17) + srl $3,$19,$1 + bis $5,$6,$8 + stq $8,-32($16) + sll $4,$7,$2 + + ldq $3,16($17) + srl $4,$19,$5 + bis $1,$2,$8 + stq $8,-24($16) + sll $3,$7,$6 + + ldq $4,24($17) + srl $3,$19,$1 + bis $5,$6,$8 + stq $8,-16($16) + sll $4,$7,$2 + + addq $17,32,$17 + bis $1,$2,$8 + stq $8,-8($16) + + bgt $18,.Loop + +.Lend: srl $4,$19,$8 + stq $8,0($16) + ret $31,($26),1 + .end __mpn_rshift diff --git a/ghc/rts/gmp/mpn/alpha/sub_n.s b/ghc/rts/gmp/mpn/alpha/sub_n.s new file mode 100644 index 0000000..3c90c11 --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/sub_n.s @@ -0,0 +1,120 @@ + # Alpha __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and + # store difference in a third limb vector. + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. + + + # INPUT PARAMETERS + # res_ptr $16 + # s1_ptr $17 + # s2_ptr $18 + # size $19 + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_sub_n + .ent __mpn_sub_n +__mpn_sub_n: + .frame $30,0,$26,0 + + ldq $3,0($17) + ldq $4,0($18) + + subq $19,1,$19 + and $19,4-1,$2 # number of limbs in first loop + bis $31,$31,$0 + beq $2,.L0 # if multiple of 4 limbs, skip first loop + + subq $19,$2,$19 + +.Loop0: subq $2,1,$2 + ldq $5,8($17) + addq $4,$0,$4 + ldq $6,8($18) + cmpult $4,$0,$1 + subq $3,$4,$4 + cmpult $3,$4,$0 + stq $4,0($16) + or $0,$1,$0 + + addq $17,8,$17 + addq $18,8,$18 + bis $5,$5,$3 + bis $6,$6,$4 + addq $16,8,$16 + bne $2,.Loop0 + +.L0: beq $19,.Lend + + .align 3 +.Loop: subq $19,4,$19 + + ldq $5,8($17) + addq $4,$0,$4 + ldq $6,8($18) + cmpult $4,$0,$1 + subq $3,$4,$4 + cmpult $3,$4,$0 + stq $4,0($16) + or $0,$1,$0 + + ldq $3,16($17) + addq $6,$0,$6 + ldq $4,16($18) + cmpult $6,$0,$1 + subq $5,$6,$6 + cmpult $5,$6,$0 + stq $6,8($16) + or $0,$1,$0 + + ldq $5,24($17) + addq $4,$0,$4 + ldq $6,24($18) + cmpult $4,$0,$1 + subq $3,$4,$4 + cmpult $3,$4,$0 + stq $4,16($16) + or $0,$1,$0 + + ldq $3,32($17) + addq $6,$0,$6 + ldq $4,32($18) + cmpult $6,$0,$1 + subq $5,$6,$6 + cmpult $5,$6,$0 + stq $6,24($16) + or $0,$1,$0 + + addq $17,32,$17 + addq $18,32,$18 + addq $16,32,$16 + bne $19,.Loop + +.Lend: addq $4,$0,$4 + cmpult $4,$0,$1 + subq $3,$4,$4 + cmpult $3,$4,$0 + stq $4,0($16) + or $0,$1,$0 + ret $31,($26),1 + + .end __mpn_sub_n diff --git a/ghc/rts/gmp/mpn/alpha/submul_1.s b/ghc/rts/gmp/mpn/alpha/submul_1.s new file mode 100644 index 0000000..1ed0c6a --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/submul_1.s @@ -0,0 +1,92 @@ + # Alpha 21064 __mpn_submul_1 -- Multiply a limb vector with a limb and + # subtract the result from a second limb vector. + + # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # s2_limb r19 + + # This code runs at 42 cycles/limb on EV4 and 18 cycles/limb on EV5. + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_submul_1 + .ent __mpn_submul_1 2 +__mpn_submul_1: + .frame $30,0,$26 + + ldq $2,0($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + subq $18,1,$18 # size-- + mulq $2,$19,$3 # $3 = prod_low + ldq $5,0($16) # $5 = *res_ptr + umulh $2,$19,$0 # $0 = prod_high + beq $18,.Lend1 # jump if size was == 1 + ldq $2,0($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + subq $18,1,$18 # size-- + subq $5,$3,$3 + cmpult $5,$3,$4 + stq $3,0($16) + addq $16,8,$16 # res_ptr++ + beq $18,.Lend2 # jump if size was == 2 + + .align 3 +.Loop: mulq $2,$19,$3 # $3 = prod_low + ldq $5,0($16) # $5 = *res_ptr + addq $4,$0,$0 # cy_limb = cy_limb + 'cy' + subq $18,1,$18 # size-- + umulh $2,$19,$4 # $4 = cy_limb + ldq $2,0($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + addq $3,$0,$3 # $3 = cy_limb + prod_low + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + subq $5,$3,$3 + cmpult $5,$3,$5 + stq $3,0($16) + addq $16,8,$16 # res_ptr++ + addq $5,$0,$0 # combine carries + bne $18,.Loop + +.Lend2: mulq $2,$19,$3 # $3 = prod_low + ldq $5,0($16) # $5 = *res_ptr + addq $4,$0,$0 # cy_limb = cy_limb + 'cy' + umulh $2,$19,$4 # $4 = cy_limb + addq $3,$0,$3 # $3 = cy_limb + prod_low + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + subq $5,$3,$3 + cmpult $5,$3,$5 + stq $3,0($16) + addq $5,$0,$0 # combine carries + addq $4,$0,$0 # cy_limb = prod_high + cy + ret $31,($26),1 +.Lend1: subq $5,$3,$3 + cmpult $5,$3,$5 + stq $3,0($16) + addq $0,$5,$0 + ret $31,($26),1 + + .end __mpn_submul_1