[project @ 1998-06-05 14:43:45 by simonm]
author simonm <unknown>
Fri, 5 Jun 1998 14:43:46 +0000 (14:43 +0000)
committer simonm <unknown>
Fri, 5 Jun 1998 14:43:46 +0000 (14:43 +0000)
Initial revision

ghc/rts/gmp/mpn/alpha/add_n.s [new file with mode: 0644]
ghc/rts/gmp/mpn/alpha/addmul_1.s [new file with mode: 0644]
ghc/rts/gmp/mpn/alpha/ev5/add_n.s [new file with mode: 0644]
ghc/rts/gmp/mpn/alpha/ev5/lshift.s [new file with mode: 0644]
ghc/rts/gmp/mpn/alpha/ev5/rshift.s [new file with mode: 0644]
ghc/rts/gmp/mpn/alpha/ev5/sub_n.s [new file with mode: 0644]
ghc/rts/gmp/mpn/alpha/lshift.s [new file with mode: 0644]
ghc/rts/gmp/mpn/alpha/mul_1.s [new file with mode: 0644]
ghc/rts/gmp/mpn/alpha/rshift.s [new file with mode: 0644]
ghc/rts/gmp/mpn/alpha/sub_n.s [new file with mode: 0644]
ghc/rts/gmp/mpn/alpha/submul_1.s [new file with mode: 0644]

diff --git a/ghc/rts/gmp/mpn/alpha/add_n.s b/ghc/rts/gmp/mpn/alpha/add_n.s
new file mode 100644 (file)
index 0000000..426556e
--- /dev/null
@@ -0,0 +1,120 @@
+ # Alpha __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $16
+ # s1_ptr      $17
+ # s2_ptr      $18
+ # size                $19
+
+       .set    noreorder
+       .set    noat
+.text
+       .align  3
+       .globl  __mpn_add_n
+       .ent    __mpn_add_n
+__mpn_add_n:
+       .frame  $30,0,$26,0     # no stack frame; return address in $26
+ # $0 = running carry (0 or 1), $1 = scratch carry; $3/$5 hold s1 limbs, $4/$6 hold s2 limbs.
+       ldq     $3,0($17)       # $3 = first s1 limb
+       ldq     $4,0($18)       # $4 = first s2 limb
+
+       subq    $19,1,$19       # size - 1: the last limb is always done at .Lend
+       and     $19,4-1,$2      # number of limbs in first loop
+       bis     $31,$31,$0      # carry = 0
+       beq     $2,.L0          # if multiple of 4 limbs, skip first loop
+
+       subq    $19,$2,$19      # $19 = limbs left for the unrolled loop (multiple of 4)
+
+.Loop0:        subq    $2,1,$2         # first loop: one limb per iteration
+       ldq     $5,8($17)       # preload next s1 limb
+       addq    $4,$0,$4        # s2 limb + carry in
+       ldq     $6,8($18)       # preload next s2 limb
+       cmpult  $4,$0,$1        # $1 = 1 if adding the carry wrapped
+       addq    $3,$4,$4        # add the s1 limb
+       cmpult  $4,$3,$0        # $0 = 1 if that add wrapped
+       stq     $4,0($16)       # store sum limb
+       or      $0,$1,$0        # carry out = either wrap
+
+       addq    $17,8,$17
+       addq    $18,8,$18
+       bis     $5,$5,$3        # preloaded limbs become the current operands
+       bis     $6,$6,$4
+       addq    $16,8,$16
+       bne     $2,.Loop0
+
+.L0:   beq     $19,.Lend       # only the last limb remains
+
+       .align  3
+.Loop: subq    $19,4,$19       # unrolled loop: four limbs per iteration
+
+       ldq     $5,8($17)       # limb 0 uses $3/$4; preload limb 1 into $5/$6
+       addq    $4,$0,$4
+       ldq     $6,8($18)
+       cmpult  $4,$0,$1
+       addq    $3,$4,$4
+       cmpult  $4,$3,$0
+       stq     $4,0($16)
+       or      $0,$1,$0
+
+       ldq     $3,16($17)      # limb 1 uses $5/$6; preload limb 2 into $3/$4
+       addq    $6,$0,$6
+       ldq     $4,16($18)
+       cmpult  $6,$0,$1
+       addq    $5,$6,$6
+       cmpult  $6,$5,$0
+       stq     $6,8($16)
+       or      $0,$1,$0
+
+       ldq     $5,24($17)      # limb 2 uses $3/$4; preload limb 3 into $5/$6
+       addq    $4,$0,$4
+       ldq     $6,24($18)
+       cmpult  $4,$0,$1
+       addq    $3,$4,$4
+       cmpult  $4,$3,$0
+       stq     $4,16($16)
+       or      $0,$1,$0
+
+       ldq     $3,32($17)      # limb 3 uses $5/$6; preload the following limb into $3/$4
+       addq    $6,$0,$6
+       ldq     $4,32($18)
+       cmpult  $6,$0,$1
+       addq    $5,$6,$6
+       cmpult  $6,$5,$0
+       stq     $6,24($16)
+       or      $0,$1,$0
+
+       addq    $17,32,$17
+       addq    $18,32,$18
+       addq    $16,32,$16
+       bne     $19,.Loop
+
+.Lend: addq    $4,$0,$4        # last limb: operands are already in $3/$4
+       cmpult  $4,$0,$1
+       addq    $3,$4,$4
+       cmpult  $4,$3,$0
+       stq     $4,0($16)
+       or      $0,$1,$0        # final carry out is left in $0
+       ret     $31,($26),1
+
+       .end    __mpn_add_n
diff --git a/ghc/rts/gmp/mpn/alpha/addmul_1.s b/ghc/rts/gmp/mpn/alpha/addmul_1.s
new file mode 100644 (file)
index 0000000..048238a
--- /dev/null
@@ -0,0 +1,92 @@
+ # Alpha 21064 __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+ # the result to a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     r16
+ # s1_ptr      r17
+ # size                r18
+ # s2_limb     r19
+
+ # This code runs at 42 cycles/limb on EV4 and 18 cycles/limb on EV5.
+
+       .set    noreorder
+       .set    noat
+.text
+       .align  3
+       .globl  __mpn_addmul_1
+       .ent    __mpn_addmul_1 2
+__mpn_addmul_1:
+       .frame  $30,0,$26       # no stack frame; return address in $26
+
+       ldq     $2,0($17)       # $2 = s1_limb
+       addq    $17,8,$17       # s1_ptr++
+       subq    $18,1,$18       # size--
+       mulq    $2,$19,$3       # $3 = prod_low
+       ldq     $5,0($16)       # $5 = *res_ptr
+       umulh   $2,$19,$0       # $0 = prod_high
+       beq     $18,.Lend1      # jump if size was == 1
+       ldq     $2,0($17)       # $2 = s1_limb
+       addq    $17,8,$17       # s1_ptr++
+       subq    $18,1,$18       # size--
+       addq    $5,$3,$3        # $3 = *res_ptr + prod_low
+       cmpult  $3,$5,$4        # $4 = carry from that add ('cy')
+       stq     $3,0($16)       # *res_ptr = $3
+       addq    $16,8,$16       # res_ptr++
+       beq     $18,.Lend2      # jump if size was == 2
+
+       .align  3
+.Loop: mulq    $2,$19,$3       # $3 = prod_low
+       ldq     $5,0($16)       # $5 = *res_ptr
+       addq    $4,$0,$0        # cy_limb = cy_limb + 'cy'
+       subq    $18,1,$18       # size--
+       umulh   $2,$19,$4       # $4 = cy_limb
+       ldq     $2,0($17)       # $2 = s1_limb
+       addq    $17,8,$17       # s1_ptr++
+       addq    $3,$0,$3        # $3 = cy_limb + prod_low
+       cmpult  $3,$0,$0        # $0 = carry from (cy_limb + prod_low)
+       addq    $5,$3,$3        # add in *res_ptr
+       cmpult  $3,$5,$5        # $5 = carry from that add
+       stq     $3,0($16)       # *res_ptr = $3
+       addq    $16,8,$16       # res_ptr++
+       addq    $5,$0,$0        # combine carries
+       bne     $18,.Loop
+
+.Lend2:        mulq    $2,$19,$3       # $3 = prod_low
+       ldq     $5,0($16)       # $5 = *res_ptr
+       addq    $4,$0,$0        # cy_limb = cy_limb + 'cy'
+       umulh   $2,$19,$4       # $4 = cy_limb
+       addq    $3,$0,$3        # $3 = cy_limb + prod_low
+       cmpult  $3,$0,$0        # $0 = carry from (cy_limb + prod_low)
+       addq    $5,$3,$3        # add in *res_ptr
+       cmpult  $3,$5,$5        # $5 = carry from that add
+       stq     $3,0($16)       # store the last result limb
+       addq    $5,$0,$0        # combine carries
+       addq    $4,$0,$0        # cy_limb = prod_high + cy
+       ret     $31,($26),1
+.Lend1:        addq    $5,$3,$3        # size == 1: $3 = *res_ptr + prod_low
+       cmpult  $3,$5,$5        # $5 = carry from that add
+       stq     $3,0($16)       # store the only result limb
+       addq    $0,$5,$0        # $0 = prod_high + carry
+       ret     $31,($26),1
+
+       .end    __mpn_addmul_1
diff --git a/ghc/rts/gmp/mpn/alpha/ev5/add_n.s b/ghc/rts/gmp/mpn/alpha/ev5/add_n.s
new file mode 100644 (file)
index 0000000..1251a1f
--- /dev/null
@@ -0,0 +1,148 @@
+ # Alpha __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $16
+ # s1_ptr      $17
+ # s2_ptr      $18
+ # size                $19
+
+       .set    noreorder
+       .set    noat            # $28 ($at) is used as a plain register below
+.text
+       .align  3
+       .globl  __mpn_add_n
+       .ent    __mpn_add_n
+__mpn_add_n:
+       .frame  $30,0,$26,0     # no stack frame; return address in $26
+
+       or      $31,$31,$25             # clear cy
+       subq    $19,4,$19               # decr loop cnt
+       blt     $19,.Lend2              # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+       ldq     $0,0($18)               # $0-$3 = four s2 limbs
+       ldq     $1,8($18)
+       ldq     $4,0($17)               # $4-$7 = four s1 limbs
+       ldq     $5,8($17)
+       addq    $17,32,$17              # update s1_ptr
+       ldq     $2,16($18)
+       addq    $0,$4,$20               # 1st main add
+       ldq     $3,24($18)
+       subq    $19,4,$19               # decr loop cnt
+       ldq     $6,-16($17)             # negative offset: s1_ptr was already advanced
+       cmpult  $20,$0,$25              # compute cy from last add
+       ldq     $7,-8($17)
+       addq    $1,$25,$28              # cy add
+       addq    $18,32,$18              # update s2_ptr
+       addq    $5,$28,$21              # 2nd main add
+       cmpult  $28,$25,$8              # compute cy from last add
+       blt     $19,.Lend1              # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+       .align  4
+.Loop: cmpult  $21,$28,$25             # compute cy from last add
+       ldq     $0,0($18)               # start loading the next group of four
+       or      $8,$25,$25              # combine cy from the two adds
+       ldq     $1,8($18)
+       addq    $2,$25,$28              # cy add
+       ldq     $4,0($17)
+       addq    $28,$6,$22              # 3rd main add
+       ldq     $5,8($17)
+       cmpult  $28,$25,$8              # compute cy from last add
+       cmpult  $22,$28,$25             # compute cy from last add
+       stq     $20,0($16)              # store 1st sum
+       or      $8,$25,$25              # combine cy from the two adds
+       stq     $21,8($16)              # store 2nd sum
+       addq    $3,$25,$28              # cy add
+       addq    $28,$7,$23              # 4th main add
+       cmpult  $28,$25,$8              # compute cy from last add
+       cmpult  $23,$28,$25             # compute cy from last add
+       addq    $17,32,$17              # update s1_ptr
+       or      $8,$25,$25              # combine cy from the two adds
+       addq    $16,32,$16              # update res_ptr
+       addq    $0,$25,$28              # cy add
+       ldq     $2,16($18)
+       addq    $4,$28,$20              # 1st main add
+       ldq     $3,24($18)
+       cmpult  $28,$25,$8              # compute cy from last add
+       ldq     $6,-16($17)
+       cmpult  $20,$28,$25             # compute cy from last add
+       ldq     $7,-8($17)
+       or      $8,$25,$25              # combine cy from the two adds
+       subq    $19,4,$19               # decr loop cnt
+       stq     $22,-16($16)            # store 3rd sum (res_ptr already advanced)
+       addq    $1,$25,$28              # cy add
+       stq     $23,-8($16)             # store 4th sum
+       addq    $5,$28,$21              # 2nd main add
+       addq    $18,32,$18              # update s2_ptr
+       cmpult  $28,$25,$8              # compute cy from last add
+       bge     $19,.Loop               # loop while another full group of 4 remains
+ # Finish software pipeline for 1st loop
+.Lend1:        cmpult  $21,$28,$25             # compute cy from last add
+       or      $8,$25,$25              # combine cy from the two adds
+       addq    $2,$25,$28              # cy add
+       addq    $28,$6,$22              # 3rd main add
+       cmpult  $28,$25,$8              # compute cy from last add
+       cmpult  $22,$28,$25             # compute cy from last add
+       stq     $20,0($16)              # store 1st sum
+       or      $8,$25,$25              # combine cy from the two adds
+       stq     $21,8($16)              # store 2nd sum
+       addq    $3,$25,$28              # cy add
+       addq    $28,$7,$23              # 4th main add
+       cmpult  $28,$25,$8              # compute cy from last add
+       cmpult  $23,$28,$25             # compute cy from last add
+       or      $8,$25,$25              # combine cy from the two adds
+       addq    $16,32,$16              # update res_ptr
+       stq     $22,-16($16)            # store 3rd sum
+       stq     $23,-8($16)             # store 4th sum
+.Lend2:        addq    $19,4,$19               # restore loop cnt
+       beq     $19,.Lret               # no limbs left: just return cy
+ # Start software pipeline for 2nd loop
+       ldq     $0,0($18)               # $0 = s2 limb
+       ldq     $4,0($17)               # $4 = s1 limb
+       subq    $19,1,$19               # the last limb is handled at .Lend0
+       beq     $19,.Lend0
+ # 2nd loop handles remaining 1-3 limbs
+       .align  4
+.Loop0:        addq    $0,$25,$28              # cy add
+       ldq     $0,8($18)               # preload next s2 limb
+       addq    $4,$28,$20              # main add
+       ldq     $4,8($17)               # preload next s1 limb
+       addq    $18,8,$18
+       cmpult  $28,$25,$8              # compute cy from last add
+       addq    $17,8,$17
+       stq     $20,0($16)              # store sum limb
+       cmpult  $20,$28,$25             # compute cy from last add
+       subq    $19,1,$19               # decr loop cnt
+       or      $8,$25,$25              # combine cy from the two adds
+       addq    $16,8,$16
+       bne     $19,.Loop0
+.Lend0:        addq    $0,$25,$28              # cy add
+       addq    $4,$28,$20              # main add
+       cmpult  $28,$25,$8              # compute cy from last add
+       cmpult  $20,$28,$25             # compute cy from last add
+       stq     $20,0($16)              # store the final sum limb
+       or      $8,$25,$25              # combine cy from the two adds
+
+.Lret: or      $25,$31,$0              # return cy
+       ret     $31,($26),1
+       .end    __mpn_add_n
diff --git a/ghc/rts/gmp/mpn/alpha/ev5/lshift.s b/ghc/rts/gmp/mpn/alpha/ev5/lshift.s
new file mode 100644 (file)
index 0000000..ced55b7
--- /dev/null
@@ -0,0 +1,174 @@
+ # Alpha EV5 __mpn_lshift -- Shift a limb vector left by cnt bits and store the result in a second limb vector.
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     r16
+ # s1_ptr      r17
+ # size                r18
+ # cnt         r19
+
+ # This code runs at 3.25 cycles/limb on the EV5.
+
+       .set    noreorder
+       .set    noat            # $28 ($at) is used as a plain register below
+.text
+       .align  3
+       .globl  __mpn_lshift
+       .ent    __mpn_lshift
+__mpn_lshift:
+       .frame  $30,0,$26,0     # no stack frame; return address in $26
+
+       s8addq  $18,$17,$17     # make r17 point at end of s1
+       ldq     $4,-8($17)      # load first limb
+       subq    $31,$19,$20     # $20 = -cnt; shifts use only the low 6 bits, so it acts as 64-cnt
+       s8addq  $18,$16,$16     # make r16 point at end of RES
+       subq    $18,1,$18       # one limb is already loaded in $4
+       and     $18,4-1,$28     # number of limbs in first loop
+       srl     $4,$20,$0       # compute function result
+
+       beq     $28,.L0         # nothing for the first loop
+       subq    $18,$28,$18     # $18 = limbs left for the unrolled code (multiple of 4)
+
+       .align  3
+.Loop0:        ldq     $3,-16($17)     # next lower limb
+       subq    $16,8,$16
+       sll     $4,$19,$5       # high part: current limb << cnt
+       subq    $17,8,$17
+       subq    $28,1,$28
+       srl     $3,$20,$6       # low part: next limb >> (64-cnt)
+       or      $3,$3,$4        # next limb becomes the current limb
+       or      $5,$6,$8        # combine both parts
+       stq     $8,0($16)       # store result limb
+       bne     $28,.Loop0
+
+.L0:   sll     $4,$19,$24      # $24 = pending high part of the current limb
+       beq     $18,.Lend       # no limbs left: only $24 remains to be stored
+ # warm up phase 1
+       ldq     $1,-16($17)     # $1-$4 = next four limbs, descending addresses
+       subq    $18,4,$18
+       ldq     $2,-24($17)
+       ldq     $3,-32($17)
+       ldq     $4,-40($17)
+       beq     $18,.Lend1      # those were the last four limbs
+ # warm up phase 2
+       srl     $1,$20,$7       # $7,$8,$5,$6 = low parts (limb >> (64-cnt))
+       sll     $1,$19,$21      # $21-$24 = high parts (limb << cnt) for the next lower result limb
+       srl     $2,$20,$8
+       ldq     $1,-48($17)     # reload $1-$4 with the following four limbs
+       sll     $2,$19,$22
+       ldq     $2,-56($17)
+       srl     $3,$20,$5
+       or      $7,$24,$7       # result limb = low part | high part of the limb above
+       sll     $3,$19,$23
+       or      $8,$21,$8
+       srl     $4,$20,$6
+       ldq     $3,-64($17)
+       sll     $4,$19,$24
+       ldq     $4,-72($17)
+       subq    $18,4,$18
+       beq     $18,.Lend2      # the limbs just loaded were the last four
+       .align  4
+ # main loop
+.Loop: stq     $7,-8($16)      # store the two result limbs completed so far
+       or      $5,$22,$5
+       stq     $8,-16($16)
+       or      $6,$23,$6
+
+       srl     $1,$20,$7
+       subq    $18,4,$18
+       sll     $1,$19,$21
+       unop    # ldq   $31,-96($17)
+
+       srl     $2,$20,$8
+       ldq     $1,-80($17)
+       sll     $2,$19,$22
+       ldq     $2,-88($17)
+
+       stq     $5,-24($16)
+       or      $7,$24,$7
+       stq     $6,-32($16)
+       or      $8,$21,$8
+
+       srl     $3,$20,$5
+       unop    # ldq   $31,-96($17)
+       sll     $3,$19,$23
+       subq    $16,32,$16      # four result limbs stored this iteration
+
+       srl     $4,$20,$6
+       ldq     $3,-96($17)
+       sll     $4,$19,$24
+       ldq     $4,-104($17)
+
+       subq    $17,32,$17      # four source limbs consumed this iteration
+       bne     $18,.Loop
+ # cool down phase 2/1
+.Lend2:        stq     $7,-8($16)
+       or      $5,$22,$5
+       stq     $8,-16($16)
+       or      $6,$23,$6
+       srl     $1,$20,$7
+       sll     $1,$19,$21
+       srl     $2,$20,$8
+       sll     $2,$19,$22
+       stq     $5,-24($16)
+       or      $7,$24,$7
+       stq     $6,-32($16)
+       or      $8,$21,$8
+       srl     $3,$20,$5
+       sll     $3,$19,$23
+       srl     $4,$20,$6
+       sll     $4,$19,$24
+ # cool down phase 2/2
+       stq     $7,-40($16)
+       or      $5,$22,$5
+       stq     $8,-48($16)
+       or      $6,$23,$6
+       stq     $5,-56($16)
+       stq     $6,-64($16)
+ # cool down phase 2/3
+       stq     $24,-72($16)    # lowest result limb: high part only, zeros shifted in
+       ret     $31,($26),1
+
+ # cool down phase 1/1
+.Lend1:        srl     $1,$20,$7
+       sll     $1,$19,$21
+       srl     $2,$20,$8
+       sll     $2,$19,$22
+       srl     $3,$20,$5
+       or      $7,$24,$7
+       sll     $3,$19,$23
+       or      $8,$21,$8
+       srl     $4,$20,$6
+       sll     $4,$19,$24
+ # cool down phase 1/2
+       stq     $7,-8($16)
+       or      $5,$22,$5
+       stq     $8,-16($16)
+       or      $6,$23,$6
+       stq     $5,-24($16)
+       stq     $6,-32($16)
+       stq     $24,-40($16)    # lowest result limb: high part only, zeros shifted in
+       ret     $31,($26),1
+
+.Lend: stq     $24,-8($16)     # lowest result limb: high part only, zeros shifted in
+       ret     $31,($26),1
+       .end    __mpn_lshift
diff --git a/ghc/rts/gmp/mpn/alpha/ev5/rshift.s b/ghc/rts/gmp/mpn/alpha/ev5/rshift.s
new file mode 100644 (file)
index 0000000..6e24fef
--- /dev/null
@@ -0,0 +1,172 @@
+ # Alpha EV5 __mpn_rshift -- Shift a limb vector right by cnt bits and store the result in a second limb vector.
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     r16
+ # s1_ptr      r17
+ # size                r18
+ # cnt         r19
+
+ # This code runs at 3.25 cycles/limb on the EV5.
+
+       .set    noreorder
+       .set    noat            # $28 ($at) is used as a plain register below
+.text
+       .align  3
+       .globl  __mpn_rshift
+       .ent    __mpn_rshift
+__mpn_rshift:
+       .frame  $30,0,$26,0     # no stack frame; return address in $26
+
+       ldq     $4,0($17)       # load first limb
+       subq    $31,$19,$20     # $20 = -cnt; shifts use only the low 6 bits, so it acts as 64-cnt
+       subq    $18,1,$18       # one limb is already loaded in $4
+       and     $18,4-1,$28     # number of limbs in first loop
+       sll     $4,$20,$0       # compute function result
+
+       beq     $28,.L0         # nothing for the first loop
+       subq    $18,$28,$18     # $18 = limbs left for the unrolled code (multiple of 4)
+
+       .align  3
+.Loop0:        ldq     $3,8($17)       # next higher limb
+       addq    $16,8,$16
+       srl     $4,$19,$5       # low part: current limb >> cnt
+       addq    $17,8,$17
+       subq    $28,1,$28
+       sll     $3,$20,$6       # high part: next limb << (64-cnt)
+       or      $3,$3,$4        # next limb becomes the current limb
+       or      $5,$6,$8        # combine both parts
+       stq     $8,-8($16)      # store result limb (res_ptr already advanced)
+       bne     $28,.Loop0
+
+.L0:   srl     $4,$19,$24      # $24 = pending low part of the current limb
+       beq     $18,.Lend       # no limbs left: only $24 remains to be stored
+ # warm up phase 1
+       ldq     $1,8($17)       # $1-$4 = next four limbs, ascending addresses
+       subq    $18,4,$18
+       ldq     $2,16($17)
+       ldq     $3,24($17)
+       ldq     $4,32($17)
+       beq     $18,.Lend1      # those were the last four limbs
+ # warm up phase 2
+       sll     $1,$20,$7       # $7,$8,$5,$6 = high parts (limb << (64-cnt))
+       srl     $1,$19,$21      # $21-$24 = low parts (limb >> cnt) for the next higher result limb
+       sll     $2,$20,$8
+       ldq     $1,40($17)      # reload $1-$4 with the following four limbs
+       srl     $2,$19,$22
+       ldq     $2,48($17)
+       sll     $3,$20,$5
+       or      $7,$24,$7       # result limb = high part | low part of the limb below
+       srl     $3,$19,$23
+       or      $8,$21,$8
+       sll     $4,$20,$6
+       ldq     $3,56($17)
+       srl     $4,$19,$24
+       ldq     $4,64($17)
+       subq    $18,4,$18
+       beq     $18,.Lend2      # the limbs just loaded were the last four
+       .align  4
+ # main loop
+.Loop: stq     $7,0($16)       # store the two result limbs completed so far
+       or      $5,$22,$5
+       stq     $8,8($16)
+       or      $6,$23,$6
+
+       sll     $1,$20,$7
+       subq    $18,4,$18
+       srl     $1,$19,$21
+       unop    # ldq   $31,-96($17)  -- NOTE(review): disabled load; negative offset looks copied from lshift
+
+       sll     $2,$20,$8
+       ldq     $1,72($17)
+       srl     $2,$19,$22
+       ldq     $2,80($17)
+
+       stq     $5,16($16)
+       or      $7,$24,$7
+       stq     $6,24($16)
+       or      $8,$21,$8
+
+       sll     $3,$20,$5
+       unop    # ldq   $31,-96($17)  -- NOTE(review): same stale offset as above
+       srl     $3,$19,$23
+       addq    $16,32,$16      # four result limbs stored this iteration
+
+       sll     $4,$20,$6
+       ldq     $3,88($17)
+       srl     $4,$19,$24
+       ldq     $4,96($17)
+
+       addq    $17,32,$17      # four source limbs consumed this iteration
+       bne     $18,.Loop
+ # cool down phase 2/1
+.Lend2:        stq     $7,0($16)
+       or      $5,$22,$5
+       stq     $8,8($16)
+       or      $6,$23,$6
+       sll     $1,$20,$7
+       srl     $1,$19,$21
+       sll     $2,$20,$8
+       srl     $2,$19,$22
+       stq     $5,16($16)
+       or      $7,$24,$7
+       stq     $6,24($16)
+       or      $8,$21,$8
+       sll     $3,$20,$5
+       srl     $3,$19,$23
+       sll     $4,$20,$6
+       srl     $4,$19,$24
+ # cool down phase 2/2
+       stq     $7,32($16)
+       or      $5,$22,$5
+       stq     $8,40($16)
+       or      $6,$23,$6
+       stq     $5,48($16)
+       stq     $6,56($16)
+ # cool down phase 2/3
+       stq     $24,64($16)     # highest result limb: low part only, zeros shifted in
+       ret     $31,($26),1
+
+ # cool down phase 1/1
+.Lend1:        sll     $1,$20,$7
+       srl     $1,$19,$21
+       sll     $2,$20,$8
+       srl     $2,$19,$22
+       sll     $3,$20,$5
+       or      $7,$24,$7
+       srl     $3,$19,$23
+       or      $8,$21,$8
+       sll     $4,$20,$6
+       srl     $4,$19,$24
+ # cool down phase 1/2
+       stq     $7,0($16)
+       or      $5,$22,$5
+       stq     $8,8($16)
+       or      $6,$23,$6
+       stq     $5,16($16)
+       stq     $6,24($16)
+       stq     $24,32($16)     # highest result limb: low part only, zeros shifted in
+       ret     $31,($26),1
+
+.Lend: stq     $24,0($16)      # highest result limb: low part only, zeros shifted in
+       ret     $31,($26),1
+       .end    __mpn_rshift
diff --git a/ghc/rts/gmp/mpn/alpha/ev5/sub_n.s b/ghc/rts/gmp/mpn/alpha/ev5/sub_n.s
new file mode 100644 (file)
index 0000000..6743af5
--- /dev/null
@@ -0,0 +1,149 @@
+ # Alpha __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $16
+ # s1_ptr      $17
+ # s2_ptr      $18
+ # size                $19
+
+       .set    noreorder
+       .set    noat            # $28 ($at) is used as a plain register below
+.text
+       .align  3
+       .globl  __mpn_sub_n
+       .ent    __mpn_sub_n
+__mpn_sub_n:
+       .frame  $30,0,$26,0     # no stack frame; return address in $26
+
+       or      $31,$31,$25             # clear cy
+       subq    $19,4,$19               # decr loop cnt
+       blt     $19,.Lend2              # if less than 4 limbs, goto 2nd loop
+ # Start software pipeline for 1st loop
+       ldq     $0,0($18)               # $0-$3 = four s2 limbs (subtrahend)
+       ldq     $1,8($18)
+       ldq     $4,0($17)               # $4-$7 = four s1 limbs (minuend)
+       ldq     $5,8($17)
+       addq    $17,32,$17              # update s1_ptr
+       ldq     $2,16($18)
+       subq    $4,$0,$20               # 1st main sub
+       ldq     $3,24($18)
+       subq    $19,4,$19               # decr loop cnt
+       ldq     $6,-16($17)             # negative offset: s1_ptr was already advanced
+       cmpult  $4,$20,$25              # compute cy from last sub
+       ldq     $7,-8($17)
+       addq    $1,$25,$28              # cy add
+       addq    $18,32,$18              # update s2_ptr
+       subq    $5,$28,$21              # 2nd main sub
+       cmpult  $28,$25,$8              # compute cy from last add
+       blt     $19,.Lend1              # if less than 4 limbs remain, jump
+ # 1st loop handles groups of 4 limbs in a software pipeline
+       .align  4
+.Loop: cmpult  $5,$21,$25              # compute cy (borrow) from last sub
+       ldq     $0,0($18)               # start loading the next group of four
+       or      $8,$25,$25              # combine cy from the two adds
+       ldq     $1,8($18)
+       addq    $2,$25,$28              # cy add
+       ldq     $4,0($17)
+       subq    $6,$28,$22              # 3rd main sub
+       ldq     $5,8($17)
+       cmpult  $28,$25,$8              # compute cy from last add
+       cmpult  $6,$22,$25              # compute cy (borrow) from last sub
+       stq     $20,0($16)              # store 1st difference
+       or      $8,$25,$25              # combine cy from the two adds
+       stq     $21,8($16)              # store 2nd difference
+       addq    $3,$25,$28              # cy add
+       subq    $7,$28,$23              # 4th main sub
+       cmpult  $28,$25,$8              # compute cy from last add
+       cmpult  $7,$23,$25              # compute cy (borrow) from last sub
+       addq    $17,32,$17              # update s1_ptr
+       or      $8,$25,$25              # combine cy from the two adds
+       addq    $16,32,$16              # update res_ptr
+       addq    $0,$25,$28              # cy add
+       ldq     $2,16($18)
+       subq    $4,$28,$20              # 1st main sub
+       ldq     $3,24($18)
+       cmpult  $28,$25,$8              # compute cy from last add
+       ldq     $6,-16($17)
+       cmpult  $4,$20,$25              # compute cy (borrow) from last sub
+       ldq     $7,-8($17)
+       or      $8,$25,$25              # combine cy from the two adds
+       subq    $19,4,$19               # decr loop cnt
+       stq     $22,-16($16)            # store 3rd difference (res_ptr already advanced)
+       addq    $1,$25,$28              # cy add
+       stq     $23,-8($16)             # store 4th difference
+       subq    $5,$28,$21              # 2nd main sub
+       addq    $18,32,$18              # update s2_ptr
+       cmpult  $28,$25,$8              # compute cy from last add
+       bge     $19,.Loop               # loop while another full group of 4 remains
+ # Finish software pipeline for 1st loop
+.Lend1:        cmpult  $5,$21,$25              # compute cy (borrow) from last sub
+       or      $8,$25,$25              # combine cy from the two adds
+       addq    $2,$25,$28              # cy add
+       subq    $6,$28,$22              # 3rd main sub
+       cmpult  $28,$25,$8              # compute cy from last add
+       cmpult  $6,$22,$25              # compute cy (borrow) from last sub
+       stq     $20,0($16)              # store 1st difference
+       or      $8,$25,$25              # combine cy from the two adds
+       stq     $21,8($16)              # store 2nd difference
+       addq    $3,$25,$28              # cy add
+       subq    $7,$28,$23              # 4th main sub
+       cmpult  $28,$25,$8              # compute cy from last add
+       cmpult  $7,$23,$25              # compute cy (borrow) from last sub
+       or      $8,$25,$25              # combine cy from the two adds
+       addq    $16,32,$16              # update res_ptr
+       stq     $22,-16($16)            # store 3rd difference
+       stq     $23,-8($16)             # store 4th difference
+.Lend2:        addq    $19,4,$19               # restore loop cnt
+       beq     $19,.Lret               # no limbs left: just return cy
+ # Start software pipeline for 2nd loop
+       ldq     $0,0($18)               # $0 = s2 limb
+       ldq     $4,0($17)               # $4 = s1 limb
+       subq    $19,1,$19               # the last limb is handled at .Lend0
+       beq     $19,.Lend0
+ # 2nd loop handles remaining 1-3 limbs
+       .align  4
+.Loop0:        addq    $0,$25,$28              # cy add
+       ldq     $0,8($18)               # preload next s2 limb
+       subq    $4,$28,$20              # main sub
+       ldq     $1,8($17)               # next s1 limb goes to $1: $4 is still needed for the borrow test
+       addq    $18,8,$18
+       cmpult  $28,$25,$8              # compute cy from last add
+       addq    $17,8,$17
+       stq     $20,0($16)              # store difference limb
+       cmpult  $4,$20,$25              # compute cy (borrow) from last sub
+       subq    $19,1,$19               # decr loop cnt
+       or      $8,$25,$25              # combine cy from the two adds
+       addq    $16,8,$16
+       or      $1,$31,$4               # $4 = next s1 limb
+       bne     $19,.Loop0
+.Lend0:        addq    $0,$25,$28              # cy add
+       subq    $4,$28,$20              # main sub
+       cmpult  $28,$25,$8              # compute cy from last add
+       cmpult  $4,$20,$25              # compute cy (borrow) from last sub
+       stq     $20,0($16)              # store the final difference limb
+       or      $8,$25,$25              # combine cy from the two adds
+
+.Lret: or      $25,$31,$0              # return cy
+       ret     $31,($26),1
+       .end    __mpn_sub_n
diff --git a/ghc/rts/gmp/mpn/alpha/lshift.s b/ghc/rts/gmp/mpn/alpha/lshift.s
new file mode 100644 (file)
index 0000000..13bd24a
--- /dev/null
@@ -0,0 +1,109 @@
+ # Alpha 21064 __mpn_lshift -- Shift a limb vector left by cnt bits and store
+ # the result at res_ptr.
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     r16
+ # s1_ptr      r17
+ # size                r18
+ # cnt         r19
+
+ # This code runs at 4.8 cycles/limb on the 21064.  With infinite unrolling,
+ # it would take 4 cycles/limb.  It should be possible to get down to 3
+ # cycles/limb since both ldq and stq can be paired with the other used
+ # instructions.  But there are many restrictions in the 21064 pipeline that
+ # make it hard, if not impossible, to get down to 3 cycles/limb:
+
+ # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
+ # 2. Only aligned instruction pairs can be paired.
+ # 3. The store buffer or silo might not be able to deal with the bandwidth.
+
+       .set    noreorder       # keep the hand-scheduled instruction order below
+       .set    noat            # the assembler may not use the $at scratch register on its own
+.text
+       .align  3
+       .globl  __mpn_lshift
+       .ent    __mpn_lshift
+__mpn_lshift:                  # shift the size-limb vector at s1_ptr left by cnt bits, store at res_ptr
+       .frame  $30,0,$26,0     # no stack frame; return address is in $26
+ # Limbs are processed from the most significant end downwards; at least one limb is always read.
+       s8addq  $18,$17,$17     # r17 = s1_ptr + 8*size (one past the top limb of s1)
+       ldq     $4,-8($17)      # load first (most significant) limb
+       subq    $17,8,$17       # r17 now points at the limb just loaded
+       subq    $31,$19,$7      # r7 = -cnt; shifts use only the low 6 bits, i.e. 64-cnt
+       s8addq  $18,$16,$16     # r16 = res_ptr + 8*size (one past the top limb of RES)
+       subq    $18,1,$18       # r18 = size-1 = result limbs built from two source limbs
+       and     $18,4-1,$20     # r20 = (size-1) mod 4, number of limbs in first loop
+       srl     $4,$7,$0        # function result: bits shifted out of the top limb
+ # NOTE(review): with cnt == 0 r7 is also 0, so the srl terms return whole limbs -- presumably cnt must be 1..63.
+       beq     $20,.L0         # no leftover limbs: skip the first loop
+       subq    $18,$20,$18     # r18 = limbs left for the unrolled loop (multiple of 4)
+ # First loop: one result limb per iteration; r4 = upper source limb, r3 = next lower one.
+       .align  3
+.Loop0:
+       ldq     $3,-8($17)      # r3 = next lower source limb
+       subq    $16,8,$16       # step res_ptr down one limb
+       subq    $17,8,$17       # step s1_ptr down one limb
+       subq    $20,1,$20       # decrement first-loop count
+       sll     $4,$19,$5       # high part: upper limb << cnt
+       srl     $3,$7,$6        # low part: lower limb >> (64-cnt)
+       bis     $3,$3,$4        # lower limb becomes the upper limb of the next step
+       bis     $5,$6,$8        # merge the two parts
+       stq     $8,0($16)       # store result limb
+       bne     $20,.Loop0
+ # Main loop: four result limbs per iteration, source limbs alternating between r3 and r4.
+.L0:   beq     $18,.Lend       # no groups of four left
+ # r5/r6 and r1/r2 alternate as the (high part, low part) pair of consecutive result limbs.
+       .align  3
+.Loop: ldq     $3,-8($17)      # next lower source limb
+       subq    $16,32,$16      # res_ptr -= 4 limbs; the stores below use offsets 24..0
+       subq    $18,4,$18       # four limbs handled per iteration
+       sll     $4,$19,$5
+       srl     $3,$7,$6
+ # 1st result limb of the group (stored at 24)
+       ldq     $4,-16($17)
+       sll     $3,$19,$1
+       bis     $5,$6,$8
+       stq     $8,24($16)
+       srl     $4,$7,$2
+ # 2nd result limb of the group (stored at 16)
+       ldq     $3,-24($17)
+       sll     $4,$19,$5
+       bis     $1,$2,$8
+       stq     $8,16($16)
+       srl     $3,$7,$6
+ # 3rd result limb of the group (stored at 8)
+       ldq     $4,-32($17)     # r4 ends the iteration holding the lowest limb read so far
+       sll     $3,$19,$1
+       bis     $5,$6,$8
+       stq     $8,8($16)
+       srl     $4,$7,$2
+ # 4th result limb of the group (stored at 0)
+       subq    $17,32,$17      # s1_ptr -= 4 limbs
+       bis     $1,$2,$8
+       stq     $8,0($16)
+ # loop while limbs remain
+       bgt     $18,.Loop
+ # Lowest result limb: only the left shift contributes, zeros enter from below.
+.Lend: sll     $4,$19,$8
+       stq     $8,-8($16)      # one limb below the last limb stored
+       ret     $31,($26),1     # return; result is in $0
+       .end    __mpn_lshift
diff --git a/ghc/rts/gmp/mpn/alpha/mul_1.s b/ghc/rts/gmp/mpn/alpha/mul_1.s
new file mode 100644 (file)
index 0000000..a1f5a94
--- /dev/null
@@ -0,0 +1,85 @@
+ # Alpha 21064 __mpn_mul_1 -- Multiply a limb vector with a limb and store
+ # the result in a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     r16
+ # s1_ptr      r17
+ # size                r18
+ # s2_limb     r19
+
+ # This code runs at 42 cycles/limb on the EV4 and 18 cycles/limb on the EV5.
+
+ # To improve performance for long multiplications, we would use
+ # 'fetch' for S1 and 'fetch_m' for RES.  It's not obvious how to use
+ # these instructions without slowing down the general code: 1. We can
+ # only have two prefetches in operation at any time in the Alpha
+ # architecture.  2. There will seldom be any special alignment
+ # between RES_PTR and S1_PTR.  Maybe we can simply divide the current
+ # loop into an inner and outer loop, having the inner loop handle
+ # exactly one prefetch block?
+
+       .set    noreorder       # keep the hand-scheduled instruction order below
+       .set    noat            # the assembler may not use the $at scratch register on its own
+.text
+       .align  3
+       .globl  __mpn_mul_1
+       .ent    __mpn_mul_1 2
+__mpn_mul_1:                   # res[i] = low(s1[i]*s2_limb) + carry-in; returns the last carry limb in $0
+       .frame  $30,0,$26
+ # Labels carry the .L/. prefix used by the other alpha mpn files so they stay assembler-local.
+       ldq     $2,0($17)       # $2 = s1_limb
+       subq    $18,1,$18       # size--
+       mulq    $2,$19,$3       # $3 = prod_low
+       bic     $31,$31,$4      # clear cy_limb
+       umulh   $2,$19,$0       # $0 = prod_high
+       beq     $18,.Lend1      # jump if size was == 1
+       ldq     $2,8($17)       # $2 = s1_limb
+       subq    $18,1,$18       # size--
+       stq     $3,0($16)       # first result limb needs no carry-in
+       beq     $18,.Lend2      # jump if size was == 2
+ # Loop invariant: $2 = current s1 limb, $4 + $0 = carry limb into this position.
+       .align  3
+.Loop: mulq    $2,$19,$3       # $3 = prod_low
+       addq    $4,$0,$0        # cy_limb = cy_limb + 'cy'
+       subq    $18,1,$18       # size--
+       umulh   $2,$19,$4       # $4 = cy_limb
+       ldq     $2,16($17)      # $2 = s1_limb
+       addq    $17,8,$17       # s1_ptr++
+       addq    $3,$0,$3        # $3 = cy_limb + prod_low
+       stq     $3,8($16)       # res_ptr is bumped below, hence offset 8
+       cmpult  $3,$0,$0        # $0 = carry from (cy_limb + prod_low)
+       addq    $16,8,$16       # res_ptr++
+       bne     $18,.Loop
+ # Last limb: same step as the loop body without the load-ahead.
+.Lend2:        mulq    $2,$19,$3       # $3 = prod_low
+       addq    $4,$0,$0        # cy_limb = cy_limb + 'cy'
+       umulh   $2,$19,$4       # $4 = cy_limb
+       addq    $3,$0,$3        # $3 = cy_limb + prod_low
+       cmpult  $3,$0,$0        # $0 = carry from (cy_limb + prod_low)
+       stq     $3,8($16)
+       addq    $4,$0,$0        # cy_limb = prod_high + cy
+       ret     $31,($26),1     # return cy_limb in $0
+.Lend1:        stq     $3,0($16)       # size == 1: store prod_low, $0 already holds prod_high
+       ret     $31,($26),1
+
+       .end    __mpn_mul_1
diff --git a/ghc/rts/gmp/mpn/alpha/rshift.s b/ghc/rts/gmp/mpn/alpha/rshift.s
new file mode 100644 (file)
index 0000000..389054a
--- /dev/null
@@ -0,0 +1,107 @@
+ # Alpha 21064 __mpn_rshift -- Shift a limb vector right by cnt bits and store
+ # the result at res_ptr.
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     r16
+ # s1_ptr      r17
+ # size                r18
+ # cnt         r19
+
+ # This code runs at 4.8 cycles/limb on the 21064.  With infinite unrolling,
+ # it would take 4 cycles/limb.  It should be possible to get down to 3
+ # cycles/limb since both ldq and stq can be paired with the other used
+ # instructions.  But there are many restrictions in the 21064 pipeline that
+ # make it hard, if not impossible, to get down to 3 cycles/limb:
+
+ # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
+ # 2. Only aligned instruction pairs can be paired.
+ # 3. The store buffer or silo might not be able to deal with the bandwidth.
+
+       .set    noreorder       # keep the hand-scheduled instruction order below
+       .set    noat            # the assembler may not use the $at scratch register on its own
+.text
+       .align  3
+       .globl  __mpn_rshift
+       .ent    __mpn_rshift
+__mpn_rshift:                  # shift the size-limb vector at s1_ptr right by cnt bits, store at res_ptr
+       .frame  $30,0,$26,0     # no stack frame; return address is in $26
+ # Limbs are processed from the least significant end upwards; at least one limb is always read.
+       ldq     $4,0($17)       # load first (least significant) limb
+       addq    $17,8,$17       # s1_ptr now points at the next higher limb
+       subq    $31,$19,$7      # r7 = -cnt; shifts use only the low 6 bits, i.e. 64-cnt
+       subq    $18,1,$18       # r18 = size-1 = result limbs built from two source limbs
+       and     $18,4-1,$20     # r20 = (size-1) mod 4, number of limbs in first loop
+       sll     $4,$7,$0        # function result: bits shifted out of the bottom limb, left-justified
+ # NOTE(review): with cnt == 0 r7 is also 0, so the sll terms return whole limbs -- presumably cnt must be 1..63.
+       beq     $20,.L0         # no leftover limbs: skip the first loop
+       subq    $18,$20,$18     # r18 = limbs left for the unrolled loop (multiple of 4)
+ # First loop: one result limb per iteration; r4 = lower source limb, r3 = next higher one.
+       .align  3
+.Loop0:
+       ldq     $3,0($17)       # r3 = next higher source limb
+       addq    $16,8,$16       # step res_ptr up one limb (store below uses offset -8)
+       addq    $17,8,$17       # step s1_ptr up one limb
+       subq    $20,1,$20       # decrement first-loop count
+       srl     $4,$19,$5       # low part: lower limb >> cnt
+       sll     $3,$7,$6        # high part: higher limb << (64-cnt)
+       bis     $3,$3,$4        # higher limb becomes the lower limb of the next step
+       bis     $5,$6,$8        # merge the two parts
+       stq     $8,-8($16)      # store result limb
+       bne     $20,.Loop0
+ # Main loop: four result limbs per iteration, source limbs alternating between r3 and r4.
+.L0:   beq     $18,.Lend       # no groups of four left
+ # r5/r6 and r1/r2 alternate as the (low part, high part) pair of consecutive result limbs.
+       .align  3
+.Loop: ldq     $3,0($17)       # next higher source limb
+       addq    $16,32,$16      # res_ptr += 4 limbs; the stores below use offsets -32..-8
+       subq    $18,4,$18       # four limbs handled per iteration
+       srl     $4,$19,$5
+       sll     $3,$7,$6
+ # 1st result limb of the group (stored at -32)
+       ldq     $4,8($17)
+       srl     $3,$19,$1
+       bis     $5,$6,$8
+       stq     $8,-32($16)
+       sll     $4,$7,$2
+ # 2nd result limb of the group (stored at -24)
+       ldq     $3,16($17)
+       srl     $4,$19,$5
+       bis     $1,$2,$8
+       stq     $8,-24($16)
+       sll     $3,$7,$6
+ # 3rd result limb of the group (stored at -16)
+       ldq     $4,24($17)      # r4 ends the iteration holding the highest limb read so far
+       srl     $3,$19,$1
+       bis     $5,$6,$8
+       stq     $8,-16($16)
+       sll     $4,$7,$2
+ # 4th result limb of the group (stored at -8)
+       addq    $17,32,$17      # s1_ptr += 4 limbs
+       bis     $1,$2,$8
+       stq     $8,-8($16)
+ # loop while limbs remain
+       bgt     $18,.Loop
+ # Highest result limb: only the right shift contributes, zeros enter from above.
+.Lend: srl     $4,$19,$8
+       stq     $8,0($16)       # one limb above the last limb stored
+       ret     $31,($26),1     # return; result is in $0
+       .end    __mpn_rshift
diff --git a/ghc/rts/gmp/mpn/alpha/sub_n.s b/ghc/rts/gmp/mpn/alpha/sub_n.s
new file mode 100644 (file)
index 0000000..3c90c11
--- /dev/null
@@ -0,0 +1,120 @@
+ # Alpha __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     $16
+ # s1_ptr      $17
+ # s2_ptr      $18
+ # size                $19
+
+       .set    noreorder       # keep the hand-scheduled instruction order below
+       .set    noat            # the assembler may not use the $at scratch register on its own
+.text
+       .align  3
+       .globl  __mpn_sub_n
+       .ent    __mpn_sub_n
+__mpn_sub_n:                   # res = s1 - s2 over size limbs; returns the final borrow (0 or 1) in $0
+       .frame  $30,0,$26,0     # no stack frame; return address is in $26
+ # The first limb pair is loaded unconditionally, so size must be at least 1.
+       ldq     $3,0($17)       # r3 = first s1 limb
+       ldq     $4,0($18)       # r4 = first s2 limb
+ # The last limb is always handled at .Lend, so only size-1 limbs go through the loops.
+       subq    $19,1,$19       # r19 = size-1
+       and     $19,4-1,$2      # number of limbs in first loop
+       bis     $31,$31,$0      # r0 = borrow = 0
+       beq     $2,.L0          # if multiple of 4 limbs, skip first loop
+ # r19 becomes the limb count for the 4-way unrolled loop.
+       subq    $19,$2,$19
+ # First loop: one limb per iteration; the next limb pair is loaded ahead into r5/r6.
+.Loop0:        subq    $2,1,$2         # decrement first-loop count
+       ldq     $5,8($17)       # next s1 limb
+       addq    $4,$0,$4        # s2 limb + incoming borrow
+       ldq     $6,8($18)       # next s2 limb
+       cmpult  $4,$0,$1        # r1 = 1 if that addition wrapped
+       subq    $3,$4,$4        # difference limb
+       cmpult  $3,$4,$0        # r0 = 1 if the subtraction wrapped
+       stq     $4,0($16)       # store difference limb
+       or      $0,$1,$0        # outgoing borrow
+ # advance the pointers and make the preloaded pair current
+       addq    $17,8,$17
+       addq    $18,8,$18
+       bis     $5,$5,$3        # r3 = next s1 limb
+       bis     $6,$6,$4        # r4 = next s2 limb
+       addq    $16,8,$16
+       bne     $2,.Loop0
+ # Main loop: four limbs per iteration, operands alternating between (r3,r4) and (r5,r6).
+.L0:   beq     $19,.Lend       # no groups of four left
+ # Each group below repeats the add-borrow / subtract / combine-borrow step of the first loop.
+       .align  3
+.Loop: subq    $19,4,$19       # four limbs handled per iteration
+ # limb 0 of the group, operands in r3/r4
+       ldq     $5,8($17)
+       addq    $4,$0,$4
+       ldq     $6,8($18)
+       cmpult  $4,$0,$1
+       subq    $3,$4,$4
+       cmpult  $3,$4,$0
+       stq     $4,0($16)
+       or      $0,$1,$0
+ # limb 1 of the group, operands in r5/r6
+       ldq     $3,16($17)
+       addq    $6,$0,$6
+       ldq     $4,16($18)
+       cmpult  $6,$0,$1
+       subq    $5,$6,$6
+       cmpult  $5,$6,$0
+       stq     $6,8($16)
+       or      $0,$1,$0
+ # limb 2 of the group, operands in r3/r4
+       ldq     $5,24($17)
+       addq    $4,$0,$4
+       ldq     $6,24($18)
+       cmpult  $4,$0,$1
+       subq    $3,$4,$4
+       cmpult  $3,$4,$0
+       stq     $4,16($16)
+       or      $0,$1,$0
+ # limb 3 of the group, operands in r5/r6; r3/r4 are reloaded for the next round
+       ldq     $3,32($17)
+       addq    $6,$0,$6
+       ldq     $4,32($18)
+       cmpult  $6,$0,$1
+       subq    $5,$6,$6
+       cmpult  $5,$6,$0
+       stq     $6,24($16)
+       or      $0,$1,$0
+ # advance all three pointers by four limbs
+       addq    $17,32,$17
+       addq    $18,32,$18
+       addq    $16,32,$16
+       bne     $19,.Loop
+ # Final limb: operands are already in r3/r4, nothing further is loaded.
+.Lend: addq    $4,$0,$4        # s2 limb + incoming borrow
+       cmpult  $4,$0,$1        # r1 = 1 if that addition wrapped
+       subq    $3,$4,$4        # difference limb
+       cmpult  $3,$4,$0        # r0 = 1 if the subtraction wrapped
+       stq     $4,0($16)
+       or      $0,$1,$0        # final borrow, returned in $0
+       ret     $31,($26),1
+
+       .end    __mpn_sub_n
diff --git a/ghc/rts/gmp/mpn/alpha/submul_1.s b/ghc/rts/gmp/mpn/alpha/submul_1.s
new file mode 100644 (file)
index 0000000..1ed0c6a
--- /dev/null
@@ -0,0 +1,92 @@
+ # Alpha 21064 __mpn_submul_1 -- Multiply a limb vector with a limb and
+ # subtract the result from a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr     r16
+ # s1_ptr      r17
+ # size                r18
+ # s2_limb     r19
+
+ # This code runs at 42 cycles/limb on EV4 and 18 cycles/limb on EV5.
+
+       .set    noreorder       # keep the hand-scheduled instruction order below
+       .set    noat            # the assembler may not use the $at scratch register on its own
+.text
+       .align  3
+       .globl  __mpn_submul_1
+       .ent    __mpn_submul_1 2
+__mpn_submul_1:                # res[i] -= s1[i]*s2_limb over size limbs; returns the final carry limb in $0
+       .frame  $30,0,$26
+ # The first limb is loaded unconditionally, so size must be at least 1.
+       ldq     $2,0($17)       # $2 = s1_limb
+       addq    $17,8,$17       # s1_ptr++
+       subq    $18,1,$18       # size--
+       mulq    $2,$19,$3       # $3 = prod_low
+       ldq     $5,0($16)       # $5 = *res_ptr
+       umulh   $2,$19,$0       # $0 = prod_high
+       beq     $18,.Lend1      # jump if size was == 1
+       ldq     $2,0($17)       # $2 = s1_limb
+       addq    $17,8,$17       # s1_ptr++
+       subq    $18,1,$18       # size--
+       subq    $5,$3,$3        # $3 = *res_ptr - prod_low
+       cmpult  $5,$3,$4        # $4 = borrow from that subtraction
+       stq     $3,0($16)
+       addq    $16,8,$16       # res_ptr++
+       beq     $18,.Lend2      # jump if size was == 2
+ # Loop invariant: $2 = current s1 limb, $4 + $0 = carry limb into this position.
+       .align  3
+.Loop: mulq    $2,$19,$3       # $3 = prod_low
+       ldq     $5,0($16)       # $5 = *res_ptr
+       addq    $4,$0,$0        # cy_limb = cy_limb + 'cy'
+       subq    $18,1,$18       # size--
+       umulh   $2,$19,$4       # $4 = cy_limb
+       ldq     $2,0($17)       # $2 = s1_limb
+       addq    $17,8,$17       # s1_ptr++
+       addq    $3,$0,$3        # $3 = cy_limb + prod_low
+       cmpult  $3,$0,$0        # $0 = carry from (cy_limb + prod_low)
+       subq    $5,$3,$3        # $3 = *res_ptr - (cy_limb + prod_low)
+       cmpult  $5,$3,$5        # $5 = borrow from that subtraction
+       stq     $3,0($16)
+       addq    $16,8,$16       # res_ptr++
+       addq    $5,$0,$0        # combine carries
+       bne     $18,.Loop
+ # Last limb: same step as the loop body without the load-ahead.
+.Lend2:        mulq    $2,$19,$3       # $3 = prod_low
+       ldq     $5,0($16)       # $5 = *res_ptr
+       addq    $4,$0,$0        # cy_limb = cy_limb + 'cy'
+       umulh   $2,$19,$4       # $4 = cy_limb
+       addq    $3,$0,$3        # $3 = cy_limb + prod_low
+       cmpult  $3,$0,$0        # $0 = carry from (cy_limb + prod_low)
+       subq    $5,$3,$3        # $3 = *res_ptr - (cy_limb + prod_low)
+       cmpult  $5,$3,$5        # $5 = borrow from that subtraction
+       stq     $3,0($16)
+       addq    $5,$0,$0        # combine carries
+       addq    $4,$0,$0        # cy_limb = prod_high + cy
+       ret     $31,($26),1     # return cy_limb in $0
+.Lend1:        subq    $5,$3,$3        # size == 1: $3 = *res_ptr - prod_low
+       cmpult  $5,$3,$5        # $5 = borrow from that subtraction
+       stq     $3,0($16)
+       addq    $0,$5,$0        # return prod_high + borrow
+       ret     $31,($26),1
+
+       .end    __mpn_submul_1