From e72806f1683fc7acf7bd10885e8e45ff06d790f1 Mon Sep 17 00:00:00 2001 From: simonmar Date: Fri, 18 Aug 2000 15:44:37 +0000 Subject: [PATCH] [project @ 2000-08-18 15:44:28 by simonmar] Import stripped-down gmp-3.1. This will cause severe breakage until I can resolve the conflicts and check in the rest of the changes, so I'd advise not updating this directory for a while (unless you're using a system-installed gmp, in which case it won't matter). --- ghc/rts/gmp/.gdbinit | 34 + ghc/rts/gmp/assert.c | 52 + ghc/rts/gmp/compat.c | 46 + ghc/rts/gmp/depcomp | 269 ++ ghc/rts/gmp/errno.c | 26 + ghc/rts/gmp/extract-dbl.c | 187 + ghc/rts/gmp/insert-dbl.c | 98 + ghc/rts/gmp/install-sh | 251 ++ ghc/rts/gmp/ltconfig | 3109 ++++++++++++++++ ghc/rts/gmp/ltmain.sh | 4692 ++++++++++++++++++++++++ ghc/rts/gmp/mdate-sh | 92 + ghc/rts/gmp/memory.c | 95 +- ghc/rts/gmp/missing | 244 ++ ghc/rts/gmp/mp_bpl.c | 23 + ghc/rts/gmp/mp_clz_tab.c | 14 +- ghc/rts/gmp/mp_minv_tab.c | 50 + ghc/rts/gmp/mp_set_fns.c | 8 +- ghc/rts/gmp/mpn/Makefile.am | 94 + ghc/rts/gmp/mpn/README | 18 +- ghc/rts/gmp/mpn/a29k/add_n.s | 16 +- ghc/rts/gmp/mpn/a29k/addmul_1.s | 16 +- ghc/rts/gmp/mpn/a29k/lshift.s | 16 +- ghc/rts/gmp/mpn/a29k/mul_1.s | 16 +- ghc/rts/gmp/mpn/a29k/rshift.s | 16 +- ghc/rts/gmp/mpn/a29k/sub_n.s | 16 +- ghc/rts/gmp/mpn/a29k/submul_1.s | 16 +- ghc/rts/gmp/mpn/a29k/udiv.s | 30 + ghc/rts/gmp/mpn/a29k/umul.s | 29 + ghc/rts/gmp/mpn/alpha/README | 223 +- ghc/rts/gmp/mpn/alpha/add_n.asm | 114 + ghc/rts/gmp/mpn/alpha/addmul_1.asm | 87 + ghc/rts/gmp/mpn/alpha/cntlz.asm | 68 + ghc/rts/gmp/mpn/alpha/default.m4 | 77 + ghc/rts/gmp/mpn/alpha/ev5/add_n.asm | 143 + ghc/rts/gmp/mpn/alpha/ev5/lshift.asm | 169 + ghc/rts/gmp/mpn/alpha/ev5/rshift.asm | 167 + ghc/rts/gmp/mpn/alpha/ev5/sub_n.asm | 143 + ghc/rts/gmp/mpn/alpha/ev6/addmul_1.asm | 474 +++ ghc/rts/gmp/mpn/alpha/ev6/gmp-mparam.h | 62 + ghc/rts/gmp/mpn/alpha/gmp-mparam.h | 47 +- ghc/rts/gmp/mpn/alpha/invert_limb.asm | 345 ++ 
ghc/rts/gmp/mpn/alpha/lshift.asm | 104 + ghc/rts/gmp/mpn/alpha/mul_1.asm | 71 + ghc/rts/gmp/mpn/alpha/rshift.asm | 102 + ghc/rts/gmp/mpn/alpha/sub_n.asm | 114 + ghc/rts/gmp/mpn/alpha/submul_1.asm | 87 + ghc/rts/gmp/mpn/alpha/udiv_qrnnd.S | 28 +- ghc/rts/gmp/mpn/alpha/umul.asm | 39 + ghc/rts/gmp/mpn/alpha/unicos.m4 | 63 + ghc/rts/gmp/mpn/arm/add_n.S | 77 + ghc/rts/gmp/mpn/arm/addmul_1.S | 89 + ghc/rts/gmp/mpn/arm/gmp-mparam.h | 34 + ghc/rts/gmp/mpn/arm/mul_1.S | 81 + ghc/rts/gmp/mpn/arm/sub_n.S | 79 + ghc/rts/gmp/mpn/asm-defs.m4 | 1182 ++++++ ghc/rts/gmp/mpn/clipper/add_n.s | 16 +- ghc/rts/gmp/mpn/clipper/mul_1.s | 16 +- ghc/rts/gmp/mpn/clipper/sub_n.s | 16 +- ghc/rts/gmp/mpn/cray/README | 14 + ghc/rts/gmp/mpn/cray/add_n.c | 96 + ghc/rts/gmp/mpn/cray/addmul_1.c | 46 + ghc/rts/gmp/mpn/cray/gmp-mparam.h | 8 +- ghc/rts/gmp/mpn/cray/mul_1.c | 44 + ghc/rts/gmp/mpn/cray/mulww.f | 54 + ghc/rts/gmp/mpn/cray/mulww.s | 245 ++ ghc/rts/gmp/mpn/cray/sub_n.c | 97 + ghc/rts/gmp/mpn/cray/submul_1.c | 46 + ghc/rts/gmp/mpn/generic/add_n.c | 8 +- ghc/rts/gmp/mpn/generic/addmul_1.c | 8 +- ghc/rts/gmp/mpn/generic/addsub_n.c | 167 + ghc/rts/gmp/mpn/generic/bdivmod.c | 47 +- ghc/rts/gmp/mpn/generic/bz_divrem_n.c | 223 ++ ghc/rts/gmp/mpn/generic/cmp.c | 8 +- ghc/rts/gmp/mpn/generic/diveby3.c | 77 + ghc/rts/gmp/mpn/generic/divrem.c | 278 +- ghc/rts/gmp/mpn/generic/divrem_1.c | 220 +- ghc/rts/gmp/mpn/generic/divrem_2.c | 151 + ghc/rts/gmp/mpn/generic/dump.c | 72 +- ghc/rts/gmp/mpn/generic/gcd.c | 94 +- ghc/rts/gmp/mpn/generic/gcd_1.c | 14 +- ghc/rts/gmp/mpn/generic/gcdext.c | 659 +++- ghc/rts/gmp/mpn/generic/get_str.c | 15 +- ghc/rts/gmp/mpn/generic/gmp-mparam.h | 8 +- ghc/rts/gmp/mpn/generic/hamdist.c | 24 +- ghc/rts/gmp/mpn/generic/inlines.c | 21 + ghc/rts/gmp/mpn/generic/jacbase.c | 136 + ghc/rts/gmp/mpn/generic/lshift.c | 8 +- ghc/rts/gmp/mpn/generic/mod_1.c | 36 +- ghc/rts/gmp/mpn/generic/mod_1_rs.c | 111 + ghc/rts/gmp/mpn/generic/mul.c | 244 +- ghc/rts/gmp/mpn/generic/mul_1.c | 8 +- 
ghc/rts/gmp/mpn/generic/mul_basecase.c | 87 + ghc/rts/gmp/mpn/generic/mul_fft.c | 772 ++++ ghc/rts/gmp/mpn/generic/mul_n.c | 1508 ++++++-- ghc/rts/gmp/mpn/generic/perfsqr.c | 41 +- ghc/rts/gmp/mpn/generic/popcount.c | 24 +- ghc/rts/gmp/mpn/generic/pre_mod_1.c | 8 +- ghc/rts/gmp/mpn/generic/random.c | 43 + ghc/rts/gmp/mpn/generic/rshift.c | 8 +- ghc/rts/gmp/mpn/generic/sb_divrem_mn.c | 201 + ghc/rts/gmp/mpn/generic/scan0.c | 8 +- ghc/rts/gmp/mpn/generic/scan1.c | 8 +- ghc/rts/gmp/mpn/generic/set_str.c | 15 +- ghc/rts/gmp/mpn/generic/sqr_basecase.c | 83 + ghc/rts/gmp/mpn/generic/sqrtrem.c | 67 +- ghc/rts/gmp/mpn/generic/sub_n.c | 8 +- ghc/rts/gmp/mpn/generic/submul_1.c | 8 +- ghc/rts/gmp/mpn/generic/tdiv_qr.c | 401 ++ ghc/rts/gmp/mpn/generic/udiv_w_sdiv.c | 16 +- ghc/rts/gmp/mpn/hppa/README | 9 +- ghc/rts/gmp/mpn/hppa/add_n.s | 16 +- ghc/rts/gmp/mpn/hppa/gmp-mparam.h | 63 + ghc/rts/gmp/mpn/hppa/hppa1_1/addmul_1.s | 16 +- ghc/rts/gmp/mpn/hppa/hppa1_1/mul_1.s | 16 +- ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/add_n.s | 16 +- ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/addmul_1.S | 16 +- ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/lshift.s | 16 +- ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/rshift.s | 16 +- ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/sub_n.s | 16 +- ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/submul_1.S | 16 +- ghc/rts/gmp/mpn/hppa/hppa1_1/submul_1.s | 18 +- ghc/rts/gmp/mpn/hppa/hppa1_1/udiv_qrnnd.S | 80 + ghc/rts/gmp/mpn/hppa/hppa1_1/umul.s | 42 + ghc/rts/gmp/mpn/hppa/hppa2_0/add_n.s | 88 + ghc/rts/gmp/mpn/hppa/hppa2_0/sub_n.s | 88 + ghc/rts/gmp/mpn/hppa/lshift.s | 16 +- ghc/rts/gmp/mpn/hppa/rshift.s | 16 +- ghc/rts/gmp/mpn/hppa/sub_n.s | 16 +- ghc/rts/gmp/mpn/hppa/udiv_qrnnd.s | 14 +- ghc/rts/gmp/mpn/i960/add_n.s | 16 +- ghc/rts/gmp/mpn/i960/addmul_1.s | 16 +- ghc/rts/gmp/mpn/i960/mul_1.s | 16 +- ghc/rts/gmp/mpn/i960/sub_n.s | 16 +- ghc/rts/gmp/mpn/lisp/gmpasm-mode.el | 351 ++ ghc/rts/gmp/mpn/m68k/add_n.S | 21 +- ghc/rts/gmp/mpn/m68k/lshift.S | 21 +- ghc/rts/gmp/mpn/m68k/mc68020/addmul_1.S | 21 +- 
ghc/rts/gmp/mpn/m68k/mc68020/mul_1.S | 21 +- ghc/rts/gmp/mpn/m68k/mc68020/submul_1.S | 21 +- ghc/rts/gmp/mpn/m68k/mc68020/udiv.S | 31 + ghc/rts/gmp/mpn/m68k/mc68020/umul.S | 31 + ghc/rts/gmp/mpn/m68k/rshift.S | 21 +- ghc/rts/gmp/mpn/m68k/sub_n.S | 21 +- ghc/rts/gmp/mpn/m68k/syntax.h | 8 +- ghc/rts/gmp/mpn/m88k/add_n.s | 16 +- ghc/rts/gmp/mpn/m88k/mc88110/add_n.S | 16 +- ghc/rts/gmp/mpn/m88k/mc88110/addmul_1.s | 16 +- ghc/rts/gmp/mpn/m88k/mc88110/mul_1.s | 16 +- ghc/rts/gmp/mpn/m88k/mc88110/sub_n.S | 16 +- ghc/rts/gmp/mpn/m88k/mul_1.s | 16 +- ghc/rts/gmp/mpn/m88k/sub_n.s | 16 +- ghc/rts/gmp/mpn/mips2/add_n.s | 20 +- ghc/rts/gmp/mpn/mips2/addmul_1.s | 20 +- ghc/rts/gmp/mpn/mips2/lshift.s | 20 +- ghc/rts/gmp/mpn/mips2/mul_1.s | 20 +- ghc/rts/gmp/mpn/mips2/rshift.s | 20 +- ghc/rts/gmp/mpn/mips2/sub_n.s | 20 +- ghc/rts/gmp/mpn/mips2/submul_1.s | 20 +- ghc/rts/gmp/mpn/mips2/umul.s | 30 + ghc/rts/gmp/mpn/mips3/add_n.s | 20 +- ghc/rts/gmp/mpn/mips3/addmul_1.s | 20 +- ghc/rts/gmp/mpn/mips3/gmp-mparam.h | 41 +- ghc/rts/gmp/mpn/mips3/lshift.s | 20 +- ghc/rts/gmp/mpn/mips3/mul_1.s | 20 +- ghc/rts/gmp/mpn/mips3/rshift.s | 20 +- ghc/rts/gmp/mpn/mips3/sub_n.s | 20 +- ghc/rts/gmp/mpn/mips3/submul_1.s | 20 +- ghc/rts/gmp/mpn/mp_bases.c | 1025 +++--- ghc/rts/gmp/mpn/ns32k/add_n.s | 16 +- ghc/rts/gmp/mpn/ns32k/addmul_1.s | 16 +- ghc/rts/gmp/mpn/ns32k/mul_1.s | 16 +- ghc/rts/gmp/mpn/ns32k/sub_n.s | 16 +- ghc/rts/gmp/mpn/ns32k/submul_1.s | 16 +- ghc/rts/gmp/mpn/pa64/README | 38 + ghc/rts/gmp/mpn/pa64/add_n.s | 90 + ghc/rts/gmp/mpn/pa64/addmul_1.S | 167 + ghc/rts/gmp/mpn/pa64/gmp-mparam.h | 65 + ghc/rts/gmp/mpn/pa64/lshift.s | 103 + ghc/rts/gmp/mpn/pa64/mul_1.S | 158 + ghc/rts/gmp/mpn/pa64/rshift.s | 100 + ghc/rts/gmp/mpn/pa64/sub_n.s | 90 + ghc/rts/gmp/mpn/pa64/submul_1.S | 170 + ghc/rts/gmp/mpn/pa64/udiv_qrnnd.c | 111 + ghc/rts/gmp/mpn/pa64/umul_ppmm.S | 74 + ghc/rts/gmp/mpn/pa64w/README | 2 + ghc/rts/gmp/mpn/pa64w/add_n.s | 90 + ghc/rts/gmp/mpn/pa64w/addmul_1.S | 168 + 
ghc/rts/gmp/mpn/pa64w/gmp-mparam.h | 65 + ghc/rts/gmp/mpn/pa64w/lshift.s | 103 + ghc/rts/gmp/mpn/pa64w/mul_1.S | 159 + ghc/rts/gmp/mpn/pa64w/rshift.s | 100 + ghc/rts/gmp/mpn/pa64w/sub_n.s | 90 + ghc/rts/gmp/mpn/pa64w/submul_1.S | 171 + ghc/rts/gmp/mpn/pa64w/udiv_qrnnd.c | 117 + ghc/rts/gmp/mpn/pa64w/umul_ppmm.S | 72 + ghc/rts/gmp/mpn/power/add_n.s | 34 +- ghc/rts/gmp/mpn/power/addmul_1.s | 45 +- ghc/rts/gmp/mpn/power/lshift.s | 31 +- ghc/rts/gmp/mpn/power/mul_1.s | 45 +- ghc/rts/gmp/mpn/power/rshift.s | 31 +- ghc/rts/gmp/mpn/power/sdiv.s | 34 + ghc/rts/gmp/mpn/power/sub_n.s | 34 +- ghc/rts/gmp/mpn/power/submul_1.s | 45 +- ghc/rts/gmp/mpn/power/umul.s | 38 + ghc/rts/gmp/mpn/powerpc32/add_n.asm | 61 + ghc/rts/gmp/mpn/powerpc32/addmul_1.asm | 124 + ghc/rts/gmp/mpn/powerpc32/aix.m4 | 39 + ghc/rts/gmp/mpn/powerpc32/gmp-mparam.h | 66 + ghc/rts/gmp/mpn/powerpc32/lshift.asm | 145 + ghc/rts/gmp/mpn/powerpc32/mul_1.asm | 86 + ghc/rts/gmp/mpn/powerpc32/regmap.m4 | 34 + ghc/rts/gmp/mpn/powerpc32/rshift.asm | 60 + ghc/rts/gmp/mpn/powerpc32/sub_n.asm | 61 + ghc/rts/gmp/mpn/powerpc32/submul_1.asm | 130 + ghc/rts/gmp/mpn/powerpc32/umul.asm | 32 + ghc/rts/gmp/mpn/powerpc64/README | 36 + ghc/rts/gmp/mpn/powerpc64/add_n.asm | 61 + ghc/rts/gmp/mpn/powerpc64/addmul_1.asm | 52 + ghc/rts/gmp/mpn/powerpc64/addsub_n.asm | 107 + ghc/rts/gmp/mpn/powerpc64/aix.m4 | 40 + ghc/rts/gmp/mpn/powerpc64/copyd.asm | 45 + ghc/rts/gmp/mpn/powerpc64/copyi.asm | 44 + ghc/rts/gmp/mpn/powerpc64/gmp-mparam.h | 45 +- ghc/rts/gmp/mpn/powerpc64/lshift.asm | 159 + ghc/rts/gmp/mpn/powerpc64/mul_1.asm | 49 + ghc/rts/gmp/mpn/powerpc64/rshift.asm | 60 + ghc/rts/gmp/mpn/powerpc64/sub_n.asm | 61 + ghc/rts/gmp/mpn/powerpc64/submul_1.asm | 54 + ghc/rts/gmp/mpn/pyr/add_n.s | 16 +- ghc/rts/gmp/mpn/pyr/addmul_1.s | 16 +- ghc/rts/gmp/mpn/pyr/mul_1.s | 16 +- ghc/rts/gmp/mpn/pyr/sub_n.s | 16 +- ghc/rts/gmp/mpn/sh/add_n.s | 18 +- ghc/rts/gmp/mpn/sh/sh2/addmul_1.s | 16 +- ghc/rts/gmp/mpn/sh/sh2/mul_1.s | 16 +- 
ghc/rts/gmp/mpn/sh/sh2/submul_1.s | 16 +- ghc/rts/gmp/mpn/sh/sub_n.s | 18 +- ghc/rts/gmp/mpn/sparc32/add_n.asm | 236 ++ ghc/rts/gmp/mpn/sparc32/addmul_1.asm | 146 + ghc/rts/gmp/mpn/sparc32/lshift.asm | 97 + ghc/rts/gmp/mpn/sparc32/mul_1.asm | 137 + ghc/rts/gmp/mpn/sparc32/rshift.asm | 93 + ghc/rts/gmp/mpn/sparc32/sub_n.asm | 326 ++ ghc/rts/gmp/mpn/sparc32/submul_1.asm | 146 + ghc/rts/gmp/mpn/sparc32/udiv_fp.asm | 158 + ghc/rts/gmp/mpn/sparc32/udiv_nfp.asm | 193 + ghc/rts/gmp/mpn/sparc32/umul.asm | 68 + ghc/rts/gmp/mpn/sparc32/v8/addmul_1.asm | 122 + ghc/rts/gmp/mpn/sparc32/v8/mul_1.asm | 103 + ghc/rts/gmp/mpn/sparc32/v8/submul_1.asm | 58 + ghc/rts/gmp/mpn/sparc32/v8/supersparc/udiv.asm | 122 + ghc/rts/gmp/mpn/sparc32/v8/umul.asm | 31 + ghc/rts/gmp/mpn/sparc32/v9/README | 4 + ghc/rts/gmp/mpn/sparc32/v9/addmul_1.asm | 288 ++ ghc/rts/gmp/mpn/sparc32/v9/gmp-mparam.h | 69 + ghc/rts/gmp/mpn/sparc32/v9/mul_1.asm | 267 ++ ghc/rts/gmp/mpn/sparc32/v9/submul_1.asm | 291 ++ ghc/rts/gmp/mpn/sparc64/README | 48 + ghc/rts/gmp/mpn/sparc64/add_n.asm | 172 + ghc/rts/gmp/mpn/sparc64/addmul1h.asm | 203 + ghc/rts/gmp/mpn/sparc64/addmul_1.asm | 114 + ghc/rts/gmp/mpn/sparc64/copyi.asm | 79 + ghc/rts/gmp/mpn/sparc64/gmp-mparam.h | 73 +- ghc/rts/gmp/mpn/sparc64/lshift.asm | 97 + ghc/rts/gmp/mpn/sparc64/mul_1.asm | 113 + ghc/rts/gmp/mpn/sparc64/mul_1h.asm | 183 + ghc/rts/gmp/mpn/sparc64/rshift.asm | 94 + ghc/rts/gmp/mpn/sparc64/sub_n.asm | 172 + ghc/rts/gmp/mpn/sparc64/submul1h.asm | 204 ++ ghc/rts/gmp/mpn/sparc64/submul_1.asm | 114 + ghc/rts/gmp/mpn/thumb/add_n.s | 50 + ghc/rts/gmp/mpn/thumb/sub_n.s | 50 + ghc/rts/gmp/mpn/underscore.h | 26 + ghc/rts/gmp/mpn/vax/add_n.s | 37 +- ghc/rts/gmp/mpn/vax/addmul_1.s | 24 +- ghc/rts/gmp/mpn/vax/lshift.s | 58 + ghc/rts/gmp/mpn/vax/mul_1.s | 24 +- ghc/rts/gmp/mpn/vax/rshift.s | 56 + ghc/rts/gmp/mpn/vax/sub_n.s | 37 +- ghc/rts/gmp/mpn/vax/submul_1.s | 24 +- ghc/rts/gmp/mpn/x86/README | 40 + ghc/rts/gmp/mpn/x86/README.family | 333 ++ 
ghc/rts/gmp/mpn/x86/addsub_n.S | 174 + ghc/rts/gmp/mpn/x86/aors_n.asm | 187 + ghc/rts/gmp/mpn/x86/aorsmul_1.asm | 134 + ghc/rts/gmp/mpn/x86/copyd.asm | 80 + ghc/rts/gmp/mpn/x86/copyi.asm | 79 + ghc/rts/gmp/mpn/x86/diveby3.asm | 115 + ghc/rts/gmp/mpn/x86/divrem_1.asm | 232 ++ ghc/rts/gmp/mpn/x86/k6/README | 237 ++ ghc/rts/gmp/mpn/x86/k6/aors_n.asm | 329 ++ ghc/rts/gmp/mpn/x86/k6/aorsmul_1.asm | 372 ++ ghc/rts/gmp/mpn/x86/k6/cross.pl | 141 + ghc/rts/gmp/mpn/x86/k6/diveby3.asm | 110 + ghc/rts/gmp/mpn/x86/k6/gmp-mparam.h | 97 + ghc/rts/gmp/mpn/x86/k6/k62mmx/copyd.asm | 179 + ghc/rts/gmp/mpn/x86/k6/k62mmx/copyi.asm | 196 + ghc/rts/gmp/mpn/x86/k6/k62mmx/lshift.asm | 286 ++ ghc/rts/gmp/mpn/x86/k6/k62mmx/rshift.asm | 285 ++ ghc/rts/gmp/mpn/x86/k6/mmx/com_n.asm | 91 + ghc/rts/gmp/mpn/x86/k6/mmx/logops_n.asm | 212 ++ ghc/rts/gmp/mpn/x86/k6/mmx/lshift.asm | 122 + ghc/rts/gmp/mpn/x86/k6/mmx/popham.asm | 238 ++ ghc/rts/gmp/mpn/x86/k6/mmx/rshift.asm | 122 + ghc/rts/gmp/mpn/x86/k6/mul_1.asm | 272 ++ ghc/rts/gmp/mpn/x86/k6/mul_basecase.asm | 600 +++ ghc/rts/gmp/mpn/x86/k6/sqr_basecase.asm | 672 ++++ ghc/rts/gmp/mpn/x86/k7/README | 145 + ghc/rts/gmp/mpn/x86/k7/aors_n.asm | 250 ++ ghc/rts/gmp/mpn/x86/k7/aorsmul_1.asm | 364 ++ ghc/rts/gmp/mpn/x86/k7/diveby3.asm | 131 + ghc/rts/gmp/mpn/x86/k7/gmp-mparam.h | 100 + ghc/rts/gmp/mpn/x86/k7/mmx/copyd.asm | 136 + ghc/rts/gmp/mpn/x86/k7/mmx/copyi.asm | 147 + ghc/rts/gmp/mpn/x86/k7/mmx/divrem_1.asm | 718 ++++ ghc/rts/gmp/mpn/x86/k7/mmx/lshift.asm | 472 +++ ghc/rts/gmp/mpn/x86/k7/mmx/mod_1.asm | 457 +++ ghc/rts/gmp/mpn/x86/k7/mmx/popham.asm | 239 ++ ghc/rts/gmp/mpn/x86/k7/mmx/rshift.asm | 471 +++ ghc/rts/gmp/mpn/x86/k7/mul_1.asm | 265 ++ ghc/rts/gmp/mpn/x86/k7/mul_basecase.asm | 593 +++ ghc/rts/gmp/mpn/x86/k7/sqr_basecase.asm | 627 ++++ ghc/rts/gmp/mpn/x86/lshift.asm | 90 + ghc/rts/gmp/mpn/x86/mod_1.asm | 141 + ghc/rts/gmp/mpn/x86/mul_1.asm | 130 + ghc/rts/gmp/mpn/x86/mul_basecase.asm | 209 ++ ghc/rts/gmp/mpn/x86/p6/README | 95 + 
ghc/rts/gmp/mpn/x86/p6/aorsmul_1.asm | 300 ++ ghc/rts/gmp/mpn/x86/p6/diveby3.asm | 37 + ghc/rts/gmp/mpn/x86/p6/gmp-mparam.h | 96 + ghc/rts/gmp/mpn/x86/p6/mmx/divrem_1.asm | 677 ++++ ghc/rts/gmp/mpn/x86/p6/mmx/mod_1.asm | 444 +++ ghc/rts/gmp/mpn/x86/p6/mmx/popham.asm | 31 + ghc/rts/gmp/mpn/x86/p6/p3mmx/popham.asm | 30 + ghc/rts/gmp/mpn/x86/p6/sqr_basecase.asm | 641 ++++ ghc/rts/gmp/mpn/x86/pentium/README | 71 +- ghc/rts/gmp/mpn/x86/pentium/aors_n.asm | 196 + ghc/rts/gmp/mpn/x86/pentium/aorsmul_1.asm | 99 + ghc/rts/gmp/mpn/x86/pentium/diveby3.asm | 183 + ghc/rts/gmp/mpn/x86/pentium/gmp-mparam.h | 97 + ghc/rts/gmp/mpn/x86/pentium/lshift.asm | 236 ++ ghc/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h | 97 + ghc/rts/gmp/mpn/x86/pentium/mmx/lshift.asm | 455 +++ ghc/rts/gmp/mpn/x86/pentium/mmx/popham.asm | 30 + ghc/rts/gmp/mpn/x86/pentium/mmx/rshift.asm | 460 +++ ghc/rts/gmp/mpn/x86/pentium/mul_1.asm | 79 + ghc/rts/gmp/mpn/x86/pentium/mul_basecase.asm | 135 + ghc/rts/gmp/mpn/x86/pentium/rshift.asm | 236 ++ ghc/rts/gmp/mpn/x86/pentium/sqr_basecase.asm | 520 +++ ghc/rts/gmp/mpn/x86/rshift.asm | 92 + ghc/rts/gmp/mpn/x86/udiv.asm | 44 + ghc/rts/gmp/mpn/x86/umul.asm | 43 + ghc/rts/gmp/mpn/x86/x86-defs.m4 | 713 ++++ ghc/rts/gmp/mpn/z8000/add_n.s | 16 +- ghc/rts/gmp/mpn/z8000/gmp-mparam.h | 8 +- ghc/rts/gmp/mpn/z8000/mul_1.s | 16 +- ghc/rts/gmp/mpn/z8000/sub_n.s | 16 +- ghc/rts/gmp/mpn/z8000x/add_n.s | 16 +- ghc/rts/gmp/mpn/z8000x/sub_n.s | 16 +- ghc/rts/gmp/mpz/Makefile.am | 58 + ghc/rts/gmp/mpz/README | 23 + ghc/rts/gmp/mpz/abs.c | 8 +- ghc/rts/gmp/mpz/add.c | 19 +- ghc/rts/gmp/mpz/add_ui.c | 14 +- ghc/rts/gmp/mpz/addmul_ui.c | 214 ++ ghc/rts/gmp/mpz/and.c | 24 +- ghc/rts/gmp/mpz/array_init.c | 8 +- ghc/rts/gmp/mpz/bin_ui.c | 141 + ghc/rts/gmp/mpz/bin_uiui.c | 120 + ghc/rts/gmp/mpz/cdiv_q.c | 12 +- ghc/rts/gmp/mpz/cdiv_q_ui.c | 15 +- ghc/rts/gmp/mpz/cdiv_qr.c | 14 +- ghc/rts/gmp/mpz/cdiv_qr_ui.c | 17 +- ghc/rts/gmp/mpz/cdiv_r.c | 8 +- ghc/rts/gmp/mpz/cdiv_r_ui.c | 11 +- 
ghc/rts/gmp/mpz/cdiv_ui.c | 8 +- ghc/rts/gmp/mpz/clear.c | 8 +- ghc/rts/gmp/mpz/clrbit.c | 8 +- ghc/rts/gmp/mpz/cmp.c | 8 +- ghc/rts/gmp/mpz/cmp_si.c | 22 +- ghc/rts/gmp/mpz/cmp_ui.c | 17 +- ghc/rts/gmp/mpz/cmpabs.c | 57 + ghc/rts/gmp/mpz/cmpabs_ui.c | 56 + ghc/rts/gmp/mpz/com.c | 8 +- ghc/rts/gmp/mpz/divexact.c | 77 +- ghc/rts/gmp/mpz/dump.c | 44 + ghc/rts/gmp/mpz/fac_ui.c | 8 +- ghc/rts/gmp/mpz/fdiv_q.c | 12 +- ghc/rts/gmp/mpz/fdiv_q_2exp.c | 26 +- ghc/rts/gmp/mpz/fdiv_q_ui.c | 15 +- ghc/rts/gmp/mpz/fdiv_qr.c | 14 +- ghc/rts/gmp/mpz/fdiv_qr_ui.c | 17 +- ghc/rts/gmp/mpz/fdiv_r.c | 8 +- ghc/rts/gmp/mpz/fdiv_r_2exp.c | 81 +- ghc/rts/gmp/mpz/fdiv_r_ui.c | 11 +- ghc/rts/gmp/mpz/fdiv_ui.c | 8 +- ghc/rts/gmp/mpz/fib_ui.c | 165 + ghc/rts/gmp/mpz/fits_sint_p.c | 50 + ghc/rts/gmp/mpz/fits_slong_p.c | 50 + ghc/rts/gmp/mpz/fits_sshort_p.c | 50 + ghc/rts/gmp/mpz/fits_uint_p.c | 41 + ghc/rts/gmp/mpz/fits_ulong_p.c | 41 + ghc/rts/gmp/mpz/fits_ushort_p.c | 41 + ghc/rts/gmp/mpz/gcd.c | 20 +- ghc/rts/gmp/mpz/gcd_ui.c | 13 +- ghc/rts/gmp/mpz/gcdext.c | 137 +- ghc/rts/gmp/mpz/get_d.c | 96 +- ghc/rts/gmp/mpz/get_si.c | 8 +- ghc/rts/gmp/mpz/get_str.c | 8 +- ghc/rts/gmp/mpz/get_ui.c | 8 +- ghc/rts/gmp/mpz/getlimbn.c | 12 +- ghc/rts/gmp/mpz/hamdist.c | 8 +- ghc/rts/gmp/mpz/init.c | 8 +- ghc/rts/gmp/mpz/inp_raw.c | 8 +- ghc/rts/gmp/mpz/inp_str.c | 63 +- ghc/rts/gmp/mpz/invert.c | 62 +- ghc/rts/gmp/mpz/ior.c | 21 +- ghc/rts/gmp/mpz/iset.c | 8 +- ghc/rts/gmp/mpz/iset_d.c | 8 +- ghc/rts/gmp/mpz/iset_si.c | 12 +- ghc/rts/gmp/mpz/iset_str.c | 13 +- ghc/rts/gmp/mpz/iset_ui.c | 8 +- ghc/rts/gmp/mpz/jacobi.c | 8 +- ghc/rts/gmp/mpz/kronsz.c | 126 + ghc/rts/gmp/mpz/kronuz.c | 115 + ghc/rts/gmp/mpz/kronzs.c | 74 + ghc/rts/gmp/mpz/kronzu.c | 66 + ghc/rts/gmp/mpz/lcm.c | 56 + ghc/rts/gmp/mpz/legendre.c | 8 +- ghc/rts/gmp/mpz/mod.c | 8 +- ghc/rts/gmp/mpz/mul.c | 14 +- ghc/rts/gmp/mpz/mul_2exp.c | 8 +- ghc/rts/gmp/mpz/mul_siui.c | 81 + ghc/rts/gmp/mpz/neg.c | 8 +- ghc/rts/gmp/mpz/nextprime.c | 120 + 
ghc/rts/gmp/mpz/out_raw.c | 8 +- ghc/rts/gmp/mpz/out_str.c | 8 +- ghc/rts/gmp/mpz/perfpow.c | 272 ++ ghc/rts/gmp/mpz/perfsqr.c | 14 +- ghc/rts/gmp/mpz/popcount.c | 8 +- ghc/rts/gmp/mpz/pow_ui.c | 20 +- ghc/rts/gmp/mpz/powm.c | 484 ++- ghc/rts/gmp/mpz/powm_ui.c | 30 +- ghc/rts/gmp/mpz/pprime_p.c | 253 +- ghc/rts/gmp/mpz/random.c | 8 +- ghc/rts/gmp/mpz/random2.c | 8 +- ghc/rts/gmp/mpz/realloc.c | 8 +- ghc/rts/gmp/mpz/remove.c | 93 + ghc/rts/gmp/mpz/root.c | 183 + ghc/rts/gmp/mpz/rrandomb.c | 117 + ghc/rts/gmp/mpz/scan0.c | 8 +- ghc/rts/gmp/mpz/scan1.c | 8 +- ghc/rts/gmp/mpz/set.c | 8 +- ghc/rts/gmp/mpz/set_d.c | 39 +- ghc/rts/gmp/mpz/set_f.c | 8 +- ghc/rts/gmp/mpz/set_q.c | 8 +- ghc/rts/gmp/mpz/set_si.c | 12 +- ghc/rts/gmp/mpz/set_str.c | 37 +- ghc/rts/gmp/mpz/set_ui.c | 8 +- ghc/rts/gmp/mpz/setbit.c | 20 +- ghc/rts/gmp/mpz/size.c | 8 +- ghc/rts/gmp/mpz/sizeinbase.c | 8 +- ghc/rts/gmp/mpz/sqrt.c | 13 +- ghc/rts/gmp/mpz/sqrtrem.c | 16 +- ghc/rts/gmp/mpz/sub.c | 19 +- ghc/rts/gmp/mpz/sub_ui.c | 14 +- ghc/rts/gmp/mpz/swap.c | 52 + ghc/rts/gmp/mpz/tdiv_q.c | 118 +- ghc/rts/gmp/mpz/tdiv_q_2exp.c | 8 +- ghc/rts/gmp/mpz/tdiv_q_ui.c | 29 +- ghc/rts/gmp/mpz/tdiv_qr.c | 105 +- ghc/rts/gmp/mpz/tdiv_qr_ui.c | 28 +- ghc/rts/gmp/mpz/tdiv_r.c | 75 +- ghc/rts/gmp/mpz/tdiv_r_2exp.c | 8 +- ghc/rts/gmp/mpz/tdiv_r_ui.c | 23 +- ghc/rts/gmp/mpz/tdiv_ui.c | 53 + ghc/rts/gmp/mpz/tstbit.c | 70 + ghc/rts/gmp/mpz/ui_pow_ui.c | 118 +- ghc/rts/gmp/mpz/urandomb.c | 49 + ghc/rts/gmp/mpz/urandomm.c | 73 + ghc/rts/gmp/mpz/xor.c | 217 ++ ghc/rts/gmp/rand.c | 171 + ghc/rts/gmp/randclr.c | 54 + ghc/rts/gmp/randlc.c | 56 + ghc/rts/gmp/randlc2x.c | 59 + ghc/rts/gmp/randraw.c | 360 ++ ghc/rts/gmp/randsd.c | 37 + ghc/rts/gmp/randsdui.c | 37 + ghc/rts/gmp/stack-alloc.c | 76 +- ghc/rts/gmp/stack-alloc.h | 36 +- ghc/rts/gmp/stamp-vti | 3 + ghc/rts/gmp/version.c | 27 +- ghc/rts/gmp/version.texi | 3 + 493 files changed, 53567 insertions(+), 3564 deletions(-) create mode 100644 ghc/rts/gmp/.gdbinit create mode 
100644 ghc/rts/gmp/assert.c create mode 100644 ghc/rts/gmp/compat.c create mode 100644 ghc/rts/gmp/depcomp create mode 100644 ghc/rts/gmp/errno.c create mode 100644 ghc/rts/gmp/extract-dbl.c create mode 100644 ghc/rts/gmp/insert-dbl.c create mode 100644 ghc/rts/gmp/install-sh create mode 100644 ghc/rts/gmp/ltconfig create mode 100644 ghc/rts/gmp/ltmain.sh create mode 100644 ghc/rts/gmp/mdate-sh create mode 100644 ghc/rts/gmp/missing create mode 100644 ghc/rts/gmp/mp_minv_tab.c create mode 100644 ghc/rts/gmp/mpn/Makefile.am create mode 100644 ghc/rts/gmp/mpn/a29k/udiv.s create mode 100644 ghc/rts/gmp/mpn/a29k/umul.s create mode 100644 ghc/rts/gmp/mpn/alpha/add_n.asm create mode 100644 ghc/rts/gmp/mpn/alpha/addmul_1.asm create mode 100644 ghc/rts/gmp/mpn/alpha/cntlz.asm create mode 100644 ghc/rts/gmp/mpn/alpha/default.m4 create mode 100644 ghc/rts/gmp/mpn/alpha/ev5/add_n.asm create mode 100644 ghc/rts/gmp/mpn/alpha/ev5/lshift.asm create mode 100644 ghc/rts/gmp/mpn/alpha/ev5/rshift.asm create mode 100644 ghc/rts/gmp/mpn/alpha/ev5/sub_n.asm create mode 100644 ghc/rts/gmp/mpn/alpha/ev6/addmul_1.asm create mode 100644 ghc/rts/gmp/mpn/alpha/ev6/gmp-mparam.h create mode 100644 ghc/rts/gmp/mpn/alpha/invert_limb.asm create mode 100644 ghc/rts/gmp/mpn/alpha/lshift.asm create mode 100644 ghc/rts/gmp/mpn/alpha/mul_1.asm create mode 100644 ghc/rts/gmp/mpn/alpha/rshift.asm create mode 100644 ghc/rts/gmp/mpn/alpha/sub_n.asm create mode 100644 ghc/rts/gmp/mpn/alpha/submul_1.asm create mode 100644 ghc/rts/gmp/mpn/alpha/umul.asm create mode 100644 ghc/rts/gmp/mpn/alpha/unicos.m4 create mode 100644 ghc/rts/gmp/mpn/arm/add_n.S create mode 100644 ghc/rts/gmp/mpn/arm/addmul_1.S create mode 100644 ghc/rts/gmp/mpn/arm/gmp-mparam.h create mode 100644 ghc/rts/gmp/mpn/arm/mul_1.S create mode 100644 ghc/rts/gmp/mpn/arm/sub_n.S create mode 100644 ghc/rts/gmp/mpn/asm-defs.m4 create mode 100644 ghc/rts/gmp/mpn/cray/README create mode 100644 ghc/rts/gmp/mpn/cray/add_n.c create mode 100644 
ghc/rts/gmp/mpn/cray/addmul_1.c create mode 100644 ghc/rts/gmp/mpn/cray/mul_1.c create mode 100644 ghc/rts/gmp/mpn/cray/mulww.f create mode 100644 ghc/rts/gmp/mpn/cray/mulww.s create mode 100644 ghc/rts/gmp/mpn/cray/sub_n.c create mode 100644 ghc/rts/gmp/mpn/cray/submul_1.c create mode 100644 ghc/rts/gmp/mpn/generic/addsub_n.c create mode 100644 ghc/rts/gmp/mpn/generic/bz_divrem_n.c create mode 100644 ghc/rts/gmp/mpn/generic/diveby3.c create mode 100644 ghc/rts/gmp/mpn/generic/divrem_2.c create mode 100644 ghc/rts/gmp/mpn/generic/jacbase.c create mode 100644 ghc/rts/gmp/mpn/generic/mod_1_rs.c create mode 100644 ghc/rts/gmp/mpn/generic/mul_basecase.c create mode 100644 ghc/rts/gmp/mpn/generic/mul_fft.c create mode 100644 ghc/rts/gmp/mpn/generic/random.c create mode 100644 ghc/rts/gmp/mpn/generic/sb_divrem_mn.c create mode 100644 ghc/rts/gmp/mpn/generic/sqr_basecase.c create mode 100644 ghc/rts/gmp/mpn/generic/tdiv_qr.c create mode 100644 ghc/rts/gmp/mpn/hppa/gmp-mparam.h create mode 100644 ghc/rts/gmp/mpn/hppa/hppa1_1/udiv_qrnnd.S create mode 100644 ghc/rts/gmp/mpn/hppa/hppa1_1/umul.s create mode 100644 ghc/rts/gmp/mpn/hppa/hppa2_0/add_n.s create mode 100644 ghc/rts/gmp/mpn/hppa/hppa2_0/sub_n.s create mode 100644 ghc/rts/gmp/mpn/lisp/gmpasm-mode.el create mode 100644 ghc/rts/gmp/mpn/m68k/mc68020/udiv.S create mode 100644 ghc/rts/gmp/mpn/m68k/mc68020/umul.S create mode 100644 ghc/rts/gmp/mpn/mips2/umul.s create mode 100644 ghc/rts/gmp/mpn/pa64/README create mode 100644 ghc/rts/gmp/mpn/pa64/add_n.s create mode 100644 ghc/rts/gmp/mpn/pa64/addmul_1.S create mode 100644 ghc/rts/gmp/mpn/pa64/gmp-mparam.h create mode 100644 ghc/rts/gmp/mpn/pa64/lshift.s create mode 100644 ghc/rts/gmp/mpn/pa64/mul_1.S create mode 100644 ghc/rts/gmp/mpn/pa64/rshift.s create mode 100644 ghc/rts/gmp/mpn/pa64/sub_n.s create mode 100644 ghc/rts/gmp/mpn/pa64/submul_1.S create mode 100644 ghc/rts/gmp/mpn/pa64/udiv_qrnnd.c create mode 100644 ghc/rts/gmp/mpn/pa64/umul_ppmm.S create mode 100644 
ghc/rts/gmp/mpn/pa64w/README create mode 100644 ghc/rts/gmp/mpn/pa64w/add_n.s create mode 100644 ghc/rts/gmp/mpn/pa64w/addmul_1.S create mode 100644 ghc/rts/gmp/mpn/pa64w/gmp-mparam.h create mode 100644 ghc/rts/gmp/mpn/pa64w/lshift.s create mode 100644 ghc/rts/gmp/mpn/pa64w/mul_1.S create mode 100644 ghc/rts/gmp/mpn/pa64w/rshift.s create mode 100644 ghc/rts/gmp/mpn/pa64w/sub_n.s create mode 100644 ghc/rts/gmp/mpn/pa64w/submul_1.S create mode 100644 ghc/rts/gmp/mpn/pa64w/udiv_qrnnd.c create mode 100644 ghc/rts/gmp/mpn/pa64w/umul_ppmm.S create mode 100644 ghc/rts/gmp/mpn/power/sdiv.s create mode 100644 ghc/rts/gmp/mpn/power/umul.s create mode 100644 ghc/rts/gmp/mpn/powerpc32/add_n.asm create mode 100644 ghc/rts/gmp/mpn/powerpc32/addmul_1.asm create mode 100644 ghc/rts/gmp/mpn/powerpc32/aix.m4 create mode 100644 ghc/rts/gmp/mpn/powerpc32/gmp-mparam.h create mode 100644 ghc/rts/gmp/mpn/powerpc32/lshift.asm create mode 100644 ghc/rts/gmp/mpn/powerpc32/mul_1.asm create mode 100644 ghc/rts/gmp/mpn/powerpc32/regmap.m4 create mode 100644 ghc/rts/gmp/mpn/powerpc32/rshift.asm create mode 100644 ghc/rts/gmp/mpn/powerpc32/sub_n.asm create mode 100644 ghc/rts/gmp/mpn/powerpc32/submul_1.asm create mode 100644 ghc/rts/gmp/mpn/powerpc32/umul.asm create mode 100644 ghc/rts/gmp/mpn/powerpc64/README create mode 100644 ghc/rts/gmp/mpn/powerpc64/add_n.asm create mode 100644 ghc/rts/gmp/mpn/powerpc64/addmul_1.asm create mode 100644 ghc/rts/gmp/mpn/powerpc64/addsub_n.asm create mode 100644 ghc/rts/gmp/mpn/powerpc64/aix.m4 create mode 100644 ghc/rts/gmp/mpn/powerpc64/copyd.asm create mode 100644 ghc/rts/gmp/mpn/powerpc64/copyi.asm create mode 100644 ghc/rts/gmp/mpn/powerpc64/lshift.asm create mode 100644 ghc/rts/gmp/mpn/powerpc64/mul_1.asm create mode 100644 ghc/rts/gmp/mpn/powerpc64/rshift.asm create mode 100644 ghc/rts/gmp/mpn/powerpc64/sub_n.asm create mode 100644 ghc/rts/gmp/mpn/powerpc64/submul_1.asm create mode 100644 ghc/rts/gmp/mpn/sparc32/add_n.asm create mode 100644 
ghc/rts/gmp/mpn/sparc32/addmul_1.asm create mode 100644 ghc/rts/gmp/mpn/sparc32/lshift.asm create mode 100644 ghc/rts/gmp/mpn/sparc32/mul_1.asm create mode 100644 ghc/rts/gmp/mpn/sparc32/rshift.asm create mode 100644 ghc/rts/gmp/mpn/sparc32/sub_n.asm create mode 100644 ghc/rts/gmp/mpn/sparc32/submul_1.asm create mode 100644 ghc/rts/gmp/mpn/sparc32/udiv_fp.asm create mode 100644 ghc/rts/gmp/mpn/sparc32/udiv_nfp.asm create mode 100644 ghc/rts/gmp/mpn/sparc32/umul.asm create mode 100644 ghc/rts/gmp/mpn/sparc32/v8/addmul_1.asm create mode 100644 ghc/rts/gmp/mpn/sparc32/v8/mul_1.asm create mode 100644 ghc/rts/gmp/mpn/sparc32/v8/submul_1.asm create mode 100644 ghc/rts/gmp/mpn/sparc32/v8/supersparc/udiv.asm create mode 100644 ghc/rts/gmp/mpn/sparc32/v8/umul.asm create mode 100644 ghc/rts/gmp/mpn/sparc32/v9/README create mode 100644 ghc/rts/gmp/mpn/sparc32/v9/addmul_1.asm create mode 100644 ghc/rts/gmp/mpn/sparc32/v9/gmp-mparam.h create mode 100644 ghc/rts/gmp/mpn/sparc32/v9/mul_1.asm create mode 100644 ghc/rts/gmp/mpn/sparc32/v9/submul_1.asm create mode 100644 ghc/rts/gmp/mpn/sparc64/README create mode 100644 ghc/rts/gmp/mpn/sparc64/add_n.asm create mode 100644 ghc/rts/gmp/mpn/sparc64/addmul1h.asm create mode 100644 ghc/rts/gmp/mpn/sparc64/addmul_1.asm create mode 100644 ghc/rts/gmp/mpn/sparc64/copyi.asm create mode 100644 ghc/rts/gmp/mpn/sparc64/lshift.asm create mode 100644 ghc/rts/gmp/mpn/sparc64/mul_1.asm create mode 100644 ghc/rts/gmp/mpn/sparc64/mul_1h.asm create mode 100644 ghc/rts/gmp/mpn/sparc64/rshift.asm create mode 100644 ghc/rts/gmp/mpn/sparc64/sub_n.asm create mode 100644 ghc/rts/gmp/mpn/sparc64/submul1h.asm create mode 100644 ghc/rts/gmp/mpn/sparc64/submul_1.asm create mode 100644 ghc/rts/gmp/mpn/thumb/add_n.s create mode 100644 ghc/rts/gmp/mpn/thumb/sub_n.s create mode 100644 ghc/rts/gmp/mpn/underscore.h create mode 100644 ghc/rts/gmp/mpn/vax/lshift.s create mode 100644 ghc/rts/gmp/mpn/vax/rshift.s create mode 100644 ghc/rts/gmp/mpn/x86/README create mode 
100644 ghc/rts/gmp/mpn/x86/README.family create mode 100644 ghc/rts/gmp/mpn/x86/addsub_n.S create mode 100644 ghc/rts/gmp/mpn/x86/aors_n.asm create mode 100644 ghc/rts/gmp/mpn/x86/aorsmul_1.asm create mode 100644 ghc/rts/gmp/mpn/x86/copyd.asm create mode 100644 ghc/rts/gmp/mpn/x86/copyi.asm create mode 100644 ghc/rts/gmp/mpn/x86/diveby3.asm create mode 100644 ghc/rts/gmp/mpn/x86/divrem_1.asm create mode 100644 ghc/rts/gmp/mpn/x86/k6/README create mode 100644 ghc/rts/gmp/mpn/x86/k6/aors_n.asm create mode 100644 ghc/rts/gmp/mpn/x86/k6/aorsmul_1.asm create mode 100644 ghc/rts/gmp/mpn/x86/k6/cross.pl create mode 100644 ghc/rts/gmp/mpn/x86/k6/diveby3.asm create mode 100644 ghc/rts/gmp/mpn/x86/k6/gmp-mparam.h create mode 100644 ghc/rts/gmp/mpn/x86/k6/k62mmx/copyd.asm create mode 100644 ghc/rts/gmp/mpn/x86/k6/k62mmx/copyi.asm create mode 100644 ghc/rts/gmp/mpn/x86/k6/k62mmx/lshift.asm create mode 100644 ghc/rts/gmp/mpn/x86/k6/k62mmx/rshift.asm create mode 100644 ghc/rts/gmp/mpn/x86/k6/mmx/com_n.asm create mode 100644 ghc/rts/gmp/mpn/x86/k6/mmx/logops_n.asm create mode 100644 ghc/rts/gmp/mpn/x86/k6/mmx/lshift.asm create mode 100644 ghc/rts/gmp/mpn/x86/k6/mmx/popham.asm create mode 100644 ghc/rts/gmp/mpn/x86/k6/mmx/rshift.asm create mode 100644 ghc/rts/gmp/mpn/x86/k6/mul_1.asm create mode 100644 ghc/rts/gmp/mpn/x86/k6/mul_basecase.asm create mode 100644 ghc/rts/gmp/mpn/x86/k6/sqr_basecase.asm create mode 100644 ghc/rts/gmp/mpn/x86/k7/README create mode 100644 ghc/rts/gmp/mpn/x86/k7/aors_n.asm create mode 100644 ghc/rts/gmp/mpn/x86/k7/aorsmul_1.asm create mode 100644 ghc/rts/gmp/mpn/x86/k7/diveby3.asm create mode 100644 ghc/rts/gmp/mpn/x86/k7/gmp-mparam.h create mode 100644 ghc/rts/gmp/mpn/x86/k7/mmx/copyd.asm create mode 100644 ghc/rts/gmp/mpn/x86/k7/mmx/copyi.asm create mode 100644 ghc/rts/gmp/mpn/x86/k7/mmx/divrem_1.asm create mode 100644 ghc/rts/gmp/mpn/x86/k7/mmx/lshift.asm create mode 100644 ghc/rts/gmp/mpn/x86/k7/mmx/mod_1.asm create mode 100644 
ghc/rts/gmp/mpn/x86/k7/mmx/popham.asm create mode 100644 ghc/rts/gmp/mpn/x86/k7/mmx/rshift.asm create mode 100644 ghc/rts/gmp/mpn/x86/k7/mul_1.asm create mode 100644 ghc/rts/gmp/mpn/x86/k7/mul_basecase.asm create mode 100644 ghc/rts/gmp/mpn/x86/k7/sqr_basecase.asm create mode 100644 ghc/rts/gmp/mpn/x86/lshift.asm create mode 100644 ghc/rts/gmp/mpn/x86/mod_1.asm create mode 100644 ghc/rts/gmp/mpn/x86/mul_1.asm create mode 100644 ghc/rts/gmp/mpn/x86/mul_basecase.asm create mode 100644 ghc/rts/gmp/mpn/x86/p6/README create mode 100644 ghc/rts/gmp/mpn/x86/p6/aorsmul_1.asm create mode 100644 ghc/rts/gmp/mpn/x86/p6/diveby3.asm create mode 100644 ghc/rts/gmp/mpn/x86/p6/gmp-mparam.h create mode 100644 ghc/rts/gmp/mpn/x86/p6/mmx/divrem_1.asm create mode 100644 ghc/rts/gmp/mpn/x86/p6/mmx/mod_1.asm create mode 100644 ghc/rts/gmp/mpn/x86/p6/mmx/popham.asm create mode 100644 ghc/rts/gmp/mpn/x86/p6/p3mmx/popham.asm create mode 100644 ghc/rts/gmp/mpn/x86/p6/sqr_basecase.asm create mode 100644 ghc/rts/gmp/mpn/x86/pentium/aors_n.asm create mode 100644 ghc/rts/gmp/mpn/x86/pentium/aorsmul_1.asm create mode 100644 ghc/rts/gmp/mpn/x86/pentium/diveby3.asm create mode 100644 ghc/rts/gmp/mpn/x86/pentium/gmp-mparam.h create mode 100644 ghc/rts/gmp/mpn/x86/pentium/lshift.asm create mode 100644 ghc/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h create mode 100644 ghc/rts/gmp/mpn/x86/pentium/mmx/lshift.asm create mode 100644 ghc/rts/gmp/mpn/x86/pentium/mmx/popham.asm create mode 100644 ghc/rts/gmp/mpn/x86/pentium/mmx/rshift.asm create mode 100644 ghc/rts/gmp/mpn/x86/pentium/mul_1.asm create mode 100644 ghc/rts/gmp/mpn/x86/pentium/mul_basecase.asm create mode 100644 ghc/rts/gmp/mpn/x86/pentium/rshift.asm create mode 100644 ghc/rts/gmp/mpn/x86/pentium/sqr_basecase.asm create mode 100644 ghc/rts/gmp/mpn/x86/rshift.asm create mode 100644 ghc/rts/gmp/mpn/x86/udiv.asm create mode 100644 ghc/rts/gmp/mpn/x86/umul.asm create mode 100644 ghc/rts/gmp/mpn/x86/x86-defs.m4 create mode 100644 
ghc/rts/gmp/mpz/Makefile.am create mode 100644 ghc/rts/gmp/mpz/README create mode 100644 ghc/rts/gmp/mpz/addmul_ui.c create mode 100644 ghc/rts/gmp/mpz/bin_ui.c create mode 100644 ghc/rts/gmp/mpz/bin_uiui.c create mode 100644 ghc/rts/gmp/mpz/cmpabs.c create mode 100644 ghc/rts/gmp/mpz/cmpabs_ui.c create mode 100644 ghc/rts/gmp/mpz/dump.c create mode 100644 ghc/rts/gmp/mpz/fib_ui.c create mode 100644 ghc/rts/gmp/mpz/fits_sint_p.c create mode 100644 ghc/rts/gmp/mpz/fits_slong_p.c create mode 100644 ghc/rts/gmp/mpz/fits_sshort_p.c create mode 100644 ghc/rts/gmp/mpz/fits_uint_p.c create mode 100644 ghc/rts/gmp/mpz/fits_ulong_p.c create mode 100644 ghc/rts/gmp/mpz/fits_ushort_p.c create mode 100644 ghc/rts/gmp/mpz/kronsz.c create mode 100644 ghc/rts/gmp/mpz/kronuz.c create mode 100644 ghc/rts/gmp/mpz/kronzs.c create mode 100644 ghc/rts/gmp/mpz/kronzu.c create mode 100644 ghc/rts/gmp/mpz/lcm.c create mode 100644 ghc/rts/gmp/mpz/mul_siui.c create mode 100644 ghc/rts/gmp/mpz/nextprime.c create mode 100644 ghc/rts/gmp/mpz/perfpow.c create mode 100644 ghc/rts/gmp/mpz/remove.c create mode 100644 ghc/rts/gmp/mpz/root.c create mode 100644 ghc/rts/gmp/mpz/rrandomb.c create mode 100644 ghc/rts/gmp/mpz/swap.c create mode 100644 ghc/rts/gmp/mpz/tdiv_ui.c create mode 100644 ghc/rts/gmp/mpz/tstbit.c create mode 100644 ghc/rts/gmp/mpz/urandomb.c create mode 100644 ghc/rts/gmp/mpz/urandomm.c create mode 100644 ghc/rts/gmp/mpz/xor.c create mode 100644 ghc/rts/gmp/rand.c create mode 100644 ghc/rts/gmp/randclr.c create mode 100644 ghc/rts/gmp/randlc.c create mode 100644 ghc/rts/gmp/randlc2x.c create mode 100644 ghc/rts/gmp/randraw.c create mode 100644 ghc/rts/gmp/randsd.c create mode 100644 ghc/rts/gmp/randsdui.c create mode 100644 ghc/rts/gmp/stamp-vti create mode 100644 ghc/rts/gmp/version.texi diff --git a/ghc/rts/gmp/.gdbinit b/ghc/rts/gmp/.gdbinit new file mode 100644 index 0000000..843c109 --- /dev/null +++ b/ghc/rts/gmp/.gdbinit @@ -0,0 +1,34 @@ +# Copyright (C) 1999 Free Software 
Foundation, Inc. +# +# This file is part of the GNU MP Library. +# +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. +# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +define pz +set __gmpz_dump ($) +end + +define pq +set __gmpz_dump ($->_mp_num) +echo / +set __gmpz_dump ($->_mp_den) +end + +define pf +set __gmpf_dump ($) +end + diff --git a/ghc/rts/gmp/assert.c b/ghc/rts/gmp/assert.c new file mode 100644 index 0000000..65eccfa --- /dev/null +++ b/ghc/rts/gmp/assert.c @@ -0,0 +1,52 @@ +/* GMP assertion failure handler. */ + +/* +Copyright (C) 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + +#include +#include "gmp.h" +#include "gmp-impl.h" + + +int +#if __STDC__ +__gmp_assert_fail (const char *filename, int linenum, + const char *expr) +#else +__gmp_assert_fail (filename, linenum, expr) +char *filename; +int linenum; +char *expr; +#endif +{ + if (filename != NULL && filename[0] != '\0') + { + fprintf (stderr, "%s:", filename); + if (linenum != -1) + fprintf (stderr, "%d: ", linenum); + } + + fprintf (stderr, "GNU MP assertion failed: %s\n", expr); + abort(); + + /*NOTREACHED*/ + return 0; +} diff --git a/ghc/rts/gmp/compat.c b/ghc/rts/gmp/compat.c new file mode 100644 index 0000000..ab7529f --- /dev/null +++ b/ghc/rts/gmp/compat.c @@ -0,0 +1,46 @@ +/* Old function entrypoints retained for binary compatibility. */ + +/* +Copyright (C) 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + +#include +#include "gmp.h" +#include "gmp-impl.h" + + +/* mpn_divexact_by3 was a function in gmp 3.0, but as of gmp 3.1 it's a + macro calling mpn_divexact_by3c. 
*/ +int +__MPN (divexact_by3) (mp_ptr dst, mp_srcptr src, mp_size_t size) +{ + mpn_divexact_by3 (dst, src, size); +} + + +/* mpn_divmod_1 was a function in gmp 3.0 and earlier, but marked obsolete + in gmp 2 and 3. As of gmp 3.1 it's a macro calling mpn_divrem_1. */ +int +__MPN (divmod_1) (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor) +{ + mpn_divmod_1 (dst, src, size, divisor); +} + + diff --git a/ghc/rts/gmp/depcomp b/ghc/rts/gmp/depcomp new file mode 100644 index 0000000..7906096 --- /dev/null +++ b/ghc/rts/gmp/depcomp @@ -0,0 +1,269 @@ +#! /bin/sh + +# depcomp - compile a program generating dependencies as side-effects +# Copyright (C) 1999 Free Software Foundation, Inc. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA +# 02111-1307, USA. + +# Originally written by Alexandre Oliva . + +if test -z "$depmode" || test -z "$source" || test -z "$object"; then + echo "depcomp: Variables source, object and depmode must be set" 1>&2 + exit 1 +fi +# `libtool' can also be set to `yes' or `no'. + +depfile=${depfile-`echo "$object" | sed 's,\([^/]*\)$,.deps/\1,;s/\.\([^.]*\)$/.P\1/'`} +tmpdepfile=${tmpdepfile-`echo "$depfile" | sed 's/\.\([^.]*\)$/.T\1/'`} + +rm -f "$tmpdepfile" + +# Some modes work just like other modes, but use different flags. 
We +# parameterize here, but still list the modes in the big case below, +# to make depend.m4 easier to write. Note that we *cannot* use a case +# here, because this file can only contain one case statement. +if test "$depmode" = hp; then + # HP compiler uses -M and no extra arg. + gccflag=-M + depmode=gcc +fi + +if test "$depmode" = dashXmstdout; then + # This is just like dashmstdout with a different argument. + dashmflag=-xM + depmode=dashmstdout +fi + +case "$depmode" in +gcc) +## There are various ways to get dependency output from gcc. Here's +## why we pick this rather obscure method: +## - Don't want to use -MD because we'd like the dependencies to end +## up in a subdir. Having to rename by hand is ugly. +## (We might end up doing this anyway to support other compilers.) +## - The DEPENDENCIES_OUTPUT environment variable makes gcc act like +## -MM, not -M (despite what the docs say). +## - Using -M directly means running the compiler twice (even worse +## than renaming). + if test -z "$gccflag"; then + gccflag=-MD, + fi + if "$@" -Wp,"$gccflag$tmpdepfile"; then : + else + stat=$? + rm -f "$tmpdepfile" + exit $stat + fi + rm -f "$depfile" + echo "$object : \\" > "$depfile" + sed 's/^[^:]*: / /' < "$tmpdepfile" >> "$depfile" +## This next piece of magic avoids the `deleted header file' problem. +## The problem is that when a header file which appears in a .P file +## is deleted, the dependency causes make to die (because there is +## typically no way to rebuild the header). We avoid this by adding +## dummy dependencies for each header file. Too bad gcc doesn't do +## this for us directly. + tr ' ' ' +' < "$tmpdepfile" | +## Some versions of gcc put a space before the `:'. On the theory +## that the space means something, we add a space to the output as +## well. +## Some versions of the HPUX 10.20 sed can't process this invocation +## correctly. Breaking it into two sed invocations is a workaround. 
+ sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile" + rm -f "$tmpdepfile" + ;; + +hp) + # This case exists only to let depend.m4 do its work. It works by + # looking at the text of this script. This case will never be run, + # since it is checked for above. + exit 1 + ;; + +dashmd) + # The Java front end to gcc doesn't run cpp, so we can't use the -Wp + # trick. Instead we must use -M and then rename the resulting .d + # file. This is also the case for older versions of gcc, which + # don't implement -Wp. + if "$@" -MD; then : + else + stat=$? + rm -f FIXME + exit $stat + fi + FIXME: rewrite the file + ;; + +sgi) + if test "$libtool" = yes; then + "$@" "-Wc,-MDupdate,$tmpdepfile" + else + "$@" -MDupdate "$tmpdepfile" + fi + stat=$? + if test $stat -eq 0; then : + else + stat=$? + rm -f "$tmpdepfile" + exit $stat + fi + rm -f "$depfile" + echo "$object : \\" > "$depfile" + sed 's/^[^:]*: / /' < "$tmpdepfile" >> "$depfile" + tr ' ' ' +' < "$tmpdepfile" | \ +## Some versions of the HPUX 10.20 sed can't process this invocation +## correctly. Breaking it into two sed invocations is a workaround. + sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile" + rm -f "$tmpdepfile" + ;; + +#nosideeffect) + # This comment above is used by automake to tell side-effect + # dependency tracking mechanisms from slower ones. + +dashmstdout) + # Important note: in order to support this mode, a compiler *must* + # always write the proprocessed file to stdout, regardless of -o, + # because we must use -o when running libtool. 
+ test -z "$dashmflag" && dashmflag=-M + ( IFS=" " + case " $* " in + *" --mode=compile "*) # this is libtool, let us make it quiet + for arg + do # cycle over the arguments + case "$arg" in + "--mode=compile") + # insert --quiet before "--mode=compile" + set fnord "$@" --quiet + shift # fnord + ;; + esac + set fnord "$@" "$arg" + shift # fnord + shift # "$arg" + done + ;; + esac + "$@" $dashmflag | sed 's:^[^:]*\:[ ]*:'"$object"'\: :' > "$tmpdepfile" + ) & + proc=$! + "$@" + stat=$? + wait "$proc" + if test "$stat" != 0; then exit $stat; fi + rm -f "$depfile" + cat < "$tmpdepfile" > "$depfile" + tr ' ' ' +' < "$tmpdepfile" | \ +## Some versions of the HPUX 10.20 sed can't process this invocation +## correctly. Breaking it into two sed invocations is a workaround. + sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile" + rm -f "$tmpdepfile" + ;; + +dashXmstdout) + # This case only exists to satisfy depend.m4. It is never actually + # run, as this mode is specially recognized in the preamble. + exit 1 + ;; + +makedepend) + # X makedepend + ( + shift + cleared=no + for arg in "$@"; do + case $cleared in no) + set ""; shift + cleared=yes + esac + case "$arg" in + -D*|-I*) + set fnord "$@" "$arg"; shift;; + -*) + ;; + *) + set fnord "$@" "$arg"; shift;; + esac + done + obj_suffix="`echo $object | sed 's/^.*\././'`" + touch "$tmpdepfile" + ${MAKEDEPEND-makedepend} 2>/dev/null -o"$obj_suffix" -f"$tmpdepfile" "$@" + ) & + proc=$! + "$@" + stat=$? + wait "$proc" + if test "$stat" != 0; then exit $stat; fi + rm -f "$depfile" + cat < "$tmpdepfile" > "$depfile" + tail +3 "$tmpdepfile" | tr ' ' ' +' | \ +## Some versions of the HPUX 10.20 sed can't process this invocation +## correctly. Breaking it into two sed invocations is a workaround. 
+ sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile" + rm -f "$tmpdepfile" "$tmpdepfile".bak + ;; + +cpp) + # Important note: in order to support this mode, a compiler *must* + # always write the proprocessed file to stdout, regardless of -o, + # because we must use -o when running libtool. + ( IFS=" " + case " $* " in + *" --mode=compile "*) + for arg + do # cycle over the arguments + case "$arg" in + "--mode=compile") + # insert --quiet before "--mode=compile" + set fnord "$@" --quiet + shift # fnord + ;; + esac + set fnord "$@" "$arg" + shift # fnord + shift # "$arg" + done + ;; + esac + "$@" -E | + sed -n '/^# [0-9][0-9]* "\([^"]*\)"/ s::'"$object"'\: \1:p' > "$tmpdepfile" + ) & + proc=$! + "$@" + stat=$? + wait "$proc" + if test "$stat" != 0; then exit $stat; fi + rm -f "$depfile" + cat < "$tmpdepfile" > "$depfile" + sed < "$tmpdepfile" -e 's/^[^:]*: //' -e 's/$/ :/' >> "$depfile" + rm -f "$tmpdepfile" + ;; + +none) + exec "$@" + ;; + +*) + echo "Unknown depmode $depmode" 1>&2 + exit 1 + ;; +esac + +exit 0 diff --git a/ghc/rts/gmp/errno.c b/ghc/rts/gmp/errno.c new file mode 100644 index 0000000..7dd223c --- /dev/null +++ b/ghc/rts/gmp/errno.c @@ -0,0 +1,26 @@ +/* gmp_errno -- The largest and most complex file in GMP. + +Copyright (C) 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. 
+ +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + + +#include "gmp.h" +#include "gmp-impl.h" + +int gmp_errno = 0; diff --git a/ghc/rts/gmp/extract-dbl.c b/ghc/rts/gmp/extract-dbl.c new file mode 100644 index 0000000..2d70d9a --- /dev/null +++ b/ghc/rts/gmp/extract-dbl.c @@ -0,0 +1,187 @@ +/* __gmp_extract_double -- convert from double to array of mp_limb_t. + +Copyright (C) 1996, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +#ifdef XDEBUG +#undef _GMP_IEEE_FLOATS +#endif + +#ifndef _GMP_IEEE_FLOATS +#define _GMP_IEEE_FLOATS 0 +#endif + +/* Extract a non-negative double in d. */ + +int +#if __STDC__ +__gmp_extract_double (mp_ptr rp, double d) +#else +__gmp_extract_double (rp, d) + mp_ptr rp; + double d; +#endif +{ + long exp; + unsigned sc; + mp_limb_t manh, manl; + + /* BUGS + + 1. Should handle Inf and NaN in IEEE specific code. + 2. Handle Inf and NaN also in default code, to avoid hangs. + 3. 
Generalize to handle all BITS_PER_MP_LIMB >= 32. + 4. This lits is incomplete and misspelled. + */ + + if (d == 0.0) + { + rp[0] = 0; + rp[1] = 0; +#if BITS_PER_MP_LIMB == 32 + rp[2] = 0; +#endif + return 0; + } + +#if _GMP_IEEE_FLOATS + { +#if defined (__alpha) && __GNUC__ == 2 && __GNUC_MINOR__ == 8 + /* Work around alpha-specific bug in GCC 2.8.x. */ + volatile +#endif + union ieee_double_extract x; + x.d = d; + exp = x.s.exp; +#if BITS_PER_MP_LIMB == 64 + manl = (((mp_limb_t) 1 << 63) + | ((mp_limb_t) x.s.manh << 43) | ((mp_limb_t) x.s.manl << 11)); + if (exp == 0) + { + /* Denormalized number. Don't try to be clever about this, + since it is not an important case to make fast. */ + exp = 1; + do + { + manl = manl << 1; + exp--; + } + while ((mp_limb_signed_t) manl >= 0); + } +#else + manh = ((mp_limb_t) 1 << 31) | (x.s.manh << 11) | (x.s.manl >> 21); + manl = x.s.manl << 11; + if (exp == 0) + { + /* Denormalized number. Don't try to be clever about this, + since it is not an important case to make fast. */ + exp = 1; + do + { + manh = (manh << 1) | (manl >> 31); + manl = manl << 1; + exp--; + } + while ((mp_limb_signed_t) manh >= 0); + } +#endif + exp -= 1022; /* Remove IEEE bias. */ + } +#else + { + /* Unknown (or known to be non-IEEE) double format. */ + exp = 0; + if (d >= 1.0) + { + if (d * 0.5 == d) + abort (); + + while (d >= 32768.0) + { + d *= (1.0 / 65536.0); + exp += 16; + } + while (d >= 1.0) + { + d *= 0.5; + exp += 1; + } + } + else if (d < 0.5) + { + while (d < (1.0 / 65536.0)) + { + d *= 65536.0; + exp -= 16; + } + while (d < 0.5) + { + d *= 2.0; + exp -= 1; + } + } + + d *= MP_BASE_AS_DOUBLE; +#if BITS_PER_MP_LIMB == 64 + manl = d; +#else + manh = d; + manl = (d - manh) * MP_BASE_AS_DOUBLE; +#endif + } +#endif + + sc = (unsigned) exp % BITS_PER_MP_LIMB; + + /* We add something here to get rounding right. 
*/ + exp = (exp + 2048) / BITS_PER_MP_LIMB - 2048 / BITS_PER_MP_LIMB + 1; + +#if BITS_PER_MP_LIMB == 64 + if (sc != 0) + { + rp[1] = manl >> (BITS_PER_MP_LIMB - sc); + rp[0] = manl << sc; + } + else + { + rp[1] = manl; + rp[0] = 0; + exp--; + } +#else + if (sc != 0) + { + rp[2] = manh >> (BITS_PER_MP_LIMB - sc); + rp[1] = (manl >> (BITS_PER_MP_LIMB - sc)) | (manh << sc); + rp[0] = manl << sc; + } + else + { + rp[2] = manh; + rp[1] = manl; + rp[0] = 0; + exp--; + } +#endif + + return exp; +} diff --git a/ghc/rts/gmp/insert-dbl.c b/ghc/rts/gmp/insert-dbl.c new file mode 100644 index 0000000..dc88a56 --- /dev/null +++ b/ghc/rts/gmp/insert-dbl.c @@ -0,0 +1,98 @@ +/* __gmp_insert_double -- convert from array of mp_limb_t to double. + +Copyright (C) 1996, 1997, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" + +#ifdef XDEBUG +#undef _GMP_IEEE_FLOATS +#endif + +#ifndef _GMP_IEEE_FLOATS +#define _GMP_IEEE_FLOATS 0 +#endif + +double +#if __STDC__ +__gmp_scale2 (double d, int exp) +#else +__gmp_scale2 (d, exp) + double d; + int exp; +#endif +{ +#if _GMP_IEEE_FLOATS + { +#if defined (__alpha) && __GNUC__ == 2 && __GNUC_MINOR__ == 8 + /* Work around alpha-specific bug in GCC 2.8.x. */ + volatile +#endif + union ieee_double_extract x; + x.d = d; + exp += x.s.exp; + x.s.exp = exp; + if (exp >= 2047) + { + /* Return +-infinity */ + x.s.exp = 2047; + x.s.manl = x.s.manh = 0; + } + else if (exp < 1) + { + x.s.exp = 1; /* smallest exponent (biased) */ + /* Divide result by 2 until we have scaled it to the right IEEE + denormalized number, but stop if it becomes zero. */ + while (exp < 1 && x.d != 0) + { + x.d *= 0.5; + exp++; + } + } + return x.d; + } +#else + { + double factor, r; + + factor = 2.0; + if (exp < 0) + { + factor = 0.5; + exp = -exp; + } + r = d; + if (exp != 0) + { + if ((exp & 1) != 0) + r *= factor; + exp >>= 1; + while (exp != 0) + { + factor *= factor; + if ((exp & 1) != 0) + r *= factor; + exp >>= 1; + } + } + return r; + } +#endif +} diff --git a/ghc/rts/gmp/install-sh b/ghc/rts/gmp/install-sh new file mode 100644 index 0000000..e9de238 --- /dev/null +++ b/ghc/rts/gmp/install-sh @@ -0,0 +1,251 @@ +#!/bin/sh +# +# install - install a program, script, or datafile +# This comes from X11R5 (mit/util/scripts/install.sh). +# +# Copyright 1991 by the Massachusetts Institute of Technology +# +# Permission to use, copy, modify, distribute, and sell this software and its +# documentation for any purpose is hereby granted without fee, provided that +# the above copyright notice appear in all copies and that both that +# copyright notice and this permission notice appear in supporting +# documentation, and that the name of M.I.T. 
not be used in advertising or +# publicity pertaining to distribution of the software without specific, +# written prior permission. M.I.T. makes no representations about the +# suitability of this software for any purpose. It is provided "as is" +# without express or implied warranty. +# +# Calling this script install-sh is preferred over install.sh, to prevent +# `make' implicit rules from creating a file called install from it +# when there is no Makefile. +# +# This script is compatible with the BSD install script, but was written +# from scratch. It can only install one file at a time, a restriction +# shared with many OS's install programs. + + +# set DOITPROG to echo to test this script + +# Don't use :- since 4.3BSD and earlier shells don't like it. +doit="${DOITPROG-}" + + +# put in absolute paths if you don't have them in your path; or use env. vars. + +mvprog="${MVPROG-mv}" +cpprog="${CPPROG-cp}" +chmodprog="${CHMODPROG-chmod}" +chownprog="${CHOWNPROG-chown}" +chgrpprog="${CHGRPPROG-chgrp}" +stripprog="${STRIPPROG-strip}" +rmprog="${RMPROG-rm}" +mkdirprog="${MKDIRPROG-mkdir}" + +transformbasename="" +transform_arg="" +instcmd="$mvprog" +chmodcmd="$chmodprog 0755" +chowncmd="" +chgrpcmd="" +stripcmd="" +rmcmd="$rmprog -f" +mvcmd="$mvprog" +src="" +dst="" +dir_arg="" + +while [ x"$1" != x ]; do + case $1 in + -c) instcmd="$cpprog" + shift + continue;; + + -d) dir_arg=true + shift + continue;; + + -m) chmodcmd="$chmodprog $2" + shift + shift + continue;; + + -o) chowncmd="$chownprog $2" + shift + shift + continue;; + + -g) chgrpcmd="$chgrpprog $2" + shift + shift + continue;; + + -s) stripcmd="$stripprog" + shift + continue;; + + -t=*) transformarg=`echo $1 | sed 's/-t=//'` + shift + continue;; + + -b=*) transformbasename=`echo $1 | sed 's/-b=//'` + shift + continue;; + + *) if [ x"$src" = x ] + then + src=$1 + else + # this colon is to work around a 386BSD /bin/sh bug + : + dst=$1 + fi + shift + continue;; + esac +done + +if [ x"$src" = x ] +then + echo 
"install: no input file specified" + exit 1 +else + true +fi + +if [ x"$dir_arg" != x ]; then + dst=$src + src="" + + if [ -d $dst ]; then + instcmd=: + chmodcmd="" + else + instcmd=mkdir + fi +else + +# Waiting for this to be detected by the "$instcmd $src $dsttmp" command +# might cause directories to be created, which would be especially bad +# if $src (and thus $dsttmp) contains '*'. + + if [ -f $src -o -d $src ] + then + true + else + echo "install: $src does not exist" + exit 1 + fi + + if [ x"$dst" = x ] + then + echo "install: no destination specified" + exit 1 + else + true + fi + +# If destination is a directory, append the input filename; if your system +# does not like double slashes in filenames, you may need to add some logic + + if [ -d $dst ] + then + dst="$dst"/`basename $src` + else + true + fi +fi + +## this sed command emulates the dirname command +dstdir=`echo $dst | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'` + +# Make sure that the destination directory exists. +# this part is taken from Noah Friedman's mkinstalldirs script + +# Skip lots of stat calls in the usual case. +if [ ! -d "$dstdir" ]; then +defaultIFS=' +' +IFS="${IFS-${defaultIFS}}" + +oIFS="${IFS}" +# Some sh's can't handle IFS=/ for some reason. +IFS='%' +set - `echo ${dstdir} | sed -e 's@/@%@g' -e 's@^%@/@'` +IFS="${oIFS}" + +pathcomp='' + +while [ $# -ne 0 ] ; do + pathcomp="${pathcomp}${1}" + shift + + if [ ! -d "${pathcomp}" ] ; + then + $mkdirprog "${pathcomp}" + else + true + fi + + pathcomp="${pathcomp}/" +done +fi + +if [ x"$dir_arg" != x ] +then + $doit $instcmd $dst && + + if [ x"$chowncmd" != x ]; then $doit $chowncmd $dst; else true ; fi && + if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dst; else true ; fi && + if [ x"$stripcmd" != x ]; then $doit $stripcmd $dst; else true ; fi && + if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dst; else true ; fi +else + +# If we're going to rename the final executable, determine the name now. 
+ + if [ x"$transformarg" = x ] + then + dstfile=`basename $dst` + else + dstfile=`basename $dst $transformbasename | + sed $transformarg`$transformbasename + fi + +# don't allow the sed command to completely eliminate the filename + + if [ x"$dstfile" = x ] + then + dstfile=`basename $dst` + else + true + fi + +# Make a temp file name in the proper directory. + + dsttmp=$dstdir/#inst.$$# + +# Move or copy the file name to the temp name + + $doit $instcmd $src $dsttmp && + + trap "rm -f ${dsttmp}" 0 && + +# and set any options; do chmod last to preserve setuid bits + +# If any of these fail, we abort the whole thing. If we want to +# ignore errors from any of these, just make sure not to ignore +# errors from the above "$doit $instcmd $src $dsttmp" command. + + if [ x"$chowncmd" != x ]; then $doit $chowncmd $dsttmp; else true;fi && + if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dsttmp; else true;fi && + if [ x"$stripcmd" != x ]; then $doit $stripcmd $dsttmp; else true;fi && + if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dsttmp; else true;fi && + +# Now rename the file to the real destination. + + $doit $rmcmd -f $dstdir/$dstfile && + $doit $mvcmd $dsttmp $dstdir/$dstfile + +fi && + + +exit 0 diff --git a/ghc/rts/gmp/ltconfig b/ghc/rts/gmp/ltconfig new file mode 100644 index 0000000..6d8cf33 --- /dev/null +++ b/ghc/rts/gmp/ltconfig @@ -0,0 +1,3109 @@ +#! /bin/sh + +# ltconfig - Create a system-specific libtool. +# Copyright (C) 1996-2000 Free Software Foundation, Inc. +# Originally by Gordon Matzigkeit , 1996 +# +# This file is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + +# A lot of this script is taken from autoconf-2.10. + +# Check that we are running under the correct shell. +SHELL=${CONFIG_SHELL-/bin/sh} +echo=echo +if test "X$1" = X--no-reexec; then + # Discard the --no-reexec flag, and continue. + shift +elif test "X$1" = X--fallback-echo; then + # Avoid inline document here, it may be left over + : +elif test "X`($echo '\t') 2>/dev/null`" = 'X\t'; then + # Yippee, $echo works! + : +else + # Restart under the correct shell. + exec "$SHELL" "$0" --no-reexec ${1+"$@"} +fi + +if test "X$1" = X--fallback-echo; then + # used as fallback echo + shift + cat </dev/null`} + case X$UNAME in + *-DOS) PATH_SEPARATOR=';' ;; + *) PATH_SEPARATOR=':' ;; + esac +fi + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +if test "X${CDPATH+set}" = Xset; then CDPATH=:; export CDPATH; fi + +if test "X${echo_test_string+set}" != Xset; then + # find a string as large as possible, as long as the shell can cope with it + for cmd in 'sed 50q "$0"' 'sed 20q "$0"' 'sed 10q "$0"' 'sed 2q "$0"' 'echo test'; do + # expected sizes: less than 2Kb, 1Kb, 512 bytes, 16 bytes, ... 
+ if (echo_test_string="`eval $cmd`") 2>/dev/null && + echo_test_string="`eval $cmd`" && + (test "X$echo_test_string" = "X$echo_test_string") 2>/dev/null; then + break + fi + done +fi + +if test "X`($echo '\t') 2>/dev/null`" = 'X\t' && + echo_testing_string=`($echo "$echo_test_string") 2>/dev/null` && + test "X$echo_testing_string" = "X$echo_test_string"; then + : +else + # The Solaris, AIX, and Digital Unix default echo programs unquote + # backslashes. This makes it impossible to quote backslashes using + # echo "$something" | sed 's/\\/\\\\/g' + # + # So, first we look for a working echo in the user's PATH. + + IFS="${IFS= }"; save_ifs="$IFS"; IFS="${IFS}${PATH_SEPARATOR}" + for dir in $PATH /usr/ucb; do + if (test -f $dir/echo || test -f $dir/echo$ac_exeext) && + test "X`($dir/echo '\t') 2>/dev/null`" = 'X\t' && + echo_testing_string=`($dir/echo "$echo_test_string") 2>/dev/null` && + test "X$echo_testing_string" = "X$echo_test_string"; then + echo="$dir/echo" + break + fi + done + IFS="$save_ifs" + + if test "X$echo" = Xecho; then + # We didn't find a better echo, so look for alternatives. + if test "X`(print -r '\t') 2>/dev/null`" = 'X\t' && + echo_testing_string=`(print -r "$echo_test_string") 2>/dev/null` && + test "X$echo_testing_string" = "X$echo_test_string"; then + # This shell has a builtin print -r that does the trick. + echo='print -r' + elif (test -f /bin/ksh || test -f /bin/ksh$ac_exeext) && + test "X$CONFIG_SHELL" != X/bin/ksh; then + # If we have ksh, try running ltconfig again with it. + ORIGINAL_CONFIG_SHELL="${CONFIG_SHELL-/bin/sh}" + export ORIGINAL_CONFIG_SHELL + CONFIG_SHELL=/bin/ksh + export CONFIG_SHELL + exec "$CONFIG_SHELL" "$0" --no-reexec ${1+"$@"} + else + # Try using printf. 
+ echo='printf "%s\n"' + if test "X`($echo '\t') 2>/dev/null`" = 'X\t' && + echo_testing_string=`($echo "$echo_test_string") 2>/dev/null` && + test "X$echo_testing_string" = "X$echo_test_string"; then + # Cool, printf works + : + elif echo_testing_string=`("$ORIGINAL_CONFIG_SHELL" "$0" --fallback-echo '\t') 2>/dev/null` && + test "X$echo_testing_string" = 'X\t' && + echo_testing_string=`("$ORIGINAL_CONFIG_SHELL" "$0" --fallback-echo "$echo_test_string") 2>/dev/null` && + test "X$echo_testing_string" = "X$echo_test_string"; then + CONFIG_SHELL="$ORIGINAL_CONFIG_SHELL" + export CONFIG_SHELL + SHELL="$CONFIG_SHELL" + export SHELL + echo="$CONFIG_SHELL $0 --fallback-echo" + elif echo_testing_string=`("$CONFIG_SHELL" "$0" --fallback-echo '\t') 2>/dev/null` && + test "X$echo_testing_string" = 'X\t' && + echo_testing_string=`("$CONFIG_SHELL" "$0" --fallback-echo "$echo_test_string") 2>/dev/null` && + test "X$echo_testing_string" = "X$echo_test_string"; then + echo="$CONFIG_SHELL $0 --fallback-echo" + else + # maybe with a smaller string... + prev=: + + for cmd in 'echo test' 'sed 2q "$0"' 'sed 10q "$0"' 'sed 20q "$0"' 'sed 50q "$0"'; do + if (test "X$echo_test_string" = "X`eval $cmd`") 2>/dev/null; then + break + fi + prev="$cmd" + done + + if test "$prev" != 'sed 50q "$0"'; then + echo_test_string=`eval $prev` + export echo_test_string + exec "${ORIGINAL_CONFIG_SHELL}" "$0" ${1+"$@"} + else + # Oops. We lost completely, so just stick with echo. + echo=echo + fi + fi + fi + fi +fi + +# Sed substitution that helps us do robust quoting. It backslashifies +# metacharacters that are still active within double-quoted strings. +Xsed='sed -e s/^X//' +sed_quote_subst='s/\([\\"\\`$\\\\]\)/\\\1/g' + +# Same as above, but do not quote variable references. +double_quote_subst='s/\([\\"\\`\\\\]\)/\\\1/g' + +# Sed substitution to delay expansion of an escaped shell variable in a +# double_quote_subst'ed string. 
+delay_variable_subst='s/\\\\\\\\\\\$/\\\\\\$/g' + +# The name of this program. +progname=`$echo "X$0" | $Xsed -e 's%^.*/%%'` + +# Constants: +PROGRAM=ltconfig +PACKAGE=libtool +VERSION=1.3c +TIMESTAMP=" (1.696 2000/03/14 20:22:42)" +ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5' +ac_link='${CC-cc} -o conftest $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5' +rm="rm -f" + +help="Try \`$progname --help' for more information." + +# Global variables: +default_ofile=libtool +can_build_shared=yes +enable_shared=yes +# All known linkers require a `.a' archive for static linking (except M$VC, +# which needs '.lib'). +enable_static=yes +enable_fast_install=yes +enable_dlopen=unknown +enable_win32_dll=no +pic_mode=default +ltmain= +silent= +srcdir= +ac_config_guess= +ac_config_sub= +host= +build=NONE +nonopt=NONE +ofile="$default_ofile" +verify_host=yes +with_gcc=no +with_gnu_ld=no +need_locks=yes +ac_ext=c +libext=a +cache_file= + +old_AR="$AR" +old_CC="$CC" +old_CFLAGS="$CFLAGS" +old_CPPFLAGS="$CPPFLAGS" +old_LDFLAGS="$LDFLAGS" +old_LIBS="$LIBS" +old_MAGIC="$MAGIC" +old_LD="$LD" +old_LN_S="$LN_S" +old_NM="$NM" +old_RANLIB="$RANLIB" +old_STRIP="$STRIP" +old_AS="$AS" +old_DLLTOOL="$DLLTOOL" +old_OBJDUMP="$OBJDUMP" +old_OBJEXT="$OBJEXT" +old_EXEEXT="$EXEEXT" +old_reload_Flag="$reload_flag" +old_deplibs_check_method="$deplibs_check_method" +old_file_magic_cmd="$file_magic_cmd" + +# Parse the command line options. +args= +prev= +for option +do + case "$option" in + -*=*) optarg=`echo "$option" | sed 's/[-_a-zA-Z0-9]*=//'` ;; + *) optarg= ;; + esac + + # If the previous option needs an argument, assign it. 
+ if test -n "$prev"; then + eval "$prev=\$option" + prev= + continue + fi + + case "$option" in + --help) cat <&2 + echo "$help" 1>&2 + exit 1 + ;; + + *) + if test -z "$ltmain"; then + ltmain="$option" + elif test -z "$host"; then +# This generates an unnecessary warning for sparc-sun-solaris4.1.3_U1 +# if test -n "`echo $option| sed 's/[-a-z0-9.]//g'`"; then +# echo "$progname: warning \`$option' is not a valid host type" 1>&2 +# fi + host="$option" + else + echo "$progname: too many arguments" 1>&2 + echo "$help" 1>&2 + exit 1 + fi ;; + esac +done + +if test -z "$ltmain"; then + echo "$progname: you must specify a LTMAIN file" 1>&2 + echo "$help" 1>&2 + exit 1 +fi + +if test ! -f "$ltmain"; then + echo "$progname: \`$ltmain' does not exist" 1>&2 + echo "$help" 1>&2 + exit 1 +fi + +# Quote any args containing shell metacharacters. +ltconfig_args= +for arg +do + case "$arg" in + *" "*|*" "*|*[\[\]\~\#\$\^\&\*\(\)\{\}\\\|\;\<\>\?]*) + ltconfig_args="$ltconfig_args '$arg'" ;; + *) ltconfig_args="$ltconfig_args $arg" ;; + esac +done + +# A relevant subset of AC_INIT. + +# File descriptor usage: +# 0 standard input +# 1 file creation +# 2 errors and warnings +# 3 some systems may open it to /dev/tty +# 4 used on the Kubota Titan +# 5 compiler messages saved in config.log +# 6 checking for... messages and results +if test "$silent" = yes; then + exec 6>/dev/null +else + exec 6>&1 +fi +exec 5>>./config.log + +# NLS nuisances. +# Only set LANG and LC_ALL to C if already set. +# These must not be set unconditionally because not all systems understand +# e.g. LANG=C (notably SCO). +if test "X${LC_ALL+set}" = Xset; then LC_ALL=C; export LC_ALL; fi +if test "X${LANG+set}" = Xset; then LANG=C; export LANG; fi + +if test -n "$cache_file" && test -r "$cache_file"; then + echo "loading cache $cache_file within ltconfig" + . $cache_file +fi + +if (echo "testing\c"; echo 1,2,3) | grep c >/dev/null; then + # Stardent Vistra SVR4 grep lacks -e, says ghazi@caip.rutgers.edu. 
+ if (echo -n testing; echo 1,2,3) | sed s/-n/xn/ | grep xn >/dev/null; then + ac_n= ac_c=' +' ac_t=' ' + else + ac_n=-n ac_c= ac_t= + fi +else + ac_n= ac_c='\c' ac_t= +fi + +if test -z "$srcdir"; then + # Assume the source directory is the same one as the path to LTMAIN. + srcdir=`$echo "X$ltmain" | $Xsed -e 's%/[^/]*$%%'` + test "$srcdir" = "$ltmain" && srcdir=. +fi + +trap "$rm conftest*; exit 1" 1 2 15 +if test "$verify_host" = yes; then + # Check for config.guess and config.sub. + ac_aux_dir= + for ac_dir in $srcdir $srcdir/.. $srcdir/../..; do + if test -f $ac_dir/config.guess; then + ac_aux_dir=$ac_dir + break + fi + done + if test -z "$ac_aux_dir"; then + echo "$progname: cannot find config.guess in $srcdir $srcdir/.. $srcdir/../.." 1>&2 + echo "$help" 1>&2 + exit 1 + fi + ac_config_guess=$ac_aux_dir/config.guess + ac_config_sub=$ac_aux_dir/config.sub + + # Make sure we can run config.sub. + if $SHELL $ac_config_sub sun4 >/dev/null 2>&1; then : + else + echo "$progname: cannot run $ac_config_sub" 1>&2 + echo "$help" 1>&2 + exit 1 + fi + + echo $ac_n "checking host system type""... $ac_c" 1>&6 + + host_alias=$host + case "$host_alias" in + "") + if host_alias=`$SHELL $ac_config_guess`; then : + else + echo "$progname: cannot guess host type; you must specify one" 1>&2 + echo "$help" 1>&2 + exit 1 + fi ;; + esac + host=`$SHELL $ac_config_sub $host_alias` + echo "$ac_t$host" 1>&6 + + # Make sure the host verified. + test -z "$host" && exit 1 + + # Check for the build system type + echo $ac_n "checking build system type... 
$ac_c" 1>&6 + + build_alias=$build + case "$build_alias" in + NONE) + case $nonopt in + NONE) build_alias=$host_alias ;; + *) build_alias=$nonopt ;; + esac ;; + esac + + build=`$SHELL $ac_config_sub $build_alias` + build_cpu=`echo $build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'` + build_vendor=`echo $build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'` + build_os=`echo $build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'` + echo "$ac_t""$build" 1>&6 + +elif test -z "$host"; then + echo "$progname: you must specify a host type if you use \`--no-verify'" 1>&2 + echo "$help" 1>&2 + exit 1 +else + host_alias=$host + build_alias=$host_alias + build=$host +fi + +if test x"$host" != x"$build"; then + ac_tool_prefix=${host_alias}- +else + ac_tool_prefix= +fi + +host_cpu=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'` +host_vendor=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'` +host_os=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'` + +# Transform linux* to *-*-linux-gnu*, to support old configure scripts. +case "$host_os" in +linux-gnu*) ;; +linux*) host=`echo $host | sed 's/^\(.*-.*-linux\)\(.*\)$/\1-gnu\2/'` +esac + +case "$host_os" in +aix3*) + # AIX sometimes has problems with the GCC collect2 program. For some + # reason, if we set the COLLECT_NAMES environment variable, the problems + # vanish in a puff of smoke. + if test "X${COLLECT_NAMES+set}" != Xset; then + COLLECT_NAMES= + export COLLECT_NAMES + fi + ;; +esac + +# Determine commands to create old-style static archives. 
+old_archive_cmds='$AR cru $oldlib$oldobjs$old_deplibs' +old_postinstall_cmds='chmod 644 $oldlib' +old_postuninstall_cmds= + +# Set sane defaults for various variables +test -z "$AR" && AR=ar +test -z "$AS" && AS=as +test -z "$CC" && CC=cc +test -z "$DLLTOOL" && DLLTOOL=dlltool +test -z "$MAGIC" && MAGIC=file +test -z "$LD" && LD=ld +test -z "$LN_S" && LN_S="ln -s" +test -z "$NM" && NM=nm +test -z "$OBJDUMP" && OBJDUMP=objdump +test -z "$RANLIB" && RANLIB=: +test -z "$STRIP" && STRIP=: +test -z "$objext" && objext=o + +echo $ac_n "checking for objdir... $ac_c" 1>&6 +rm -f .libs 2>/dev/null +mkdir .libs 2>/dev/null +if test -d .libs; then + objdir=.libs +else + # MS-DOS does not allow filenames that begin with a dot. + objdir=_libs +fi +rmdir .libs 2>/dev/null +echo "$ac_t$objdir" 1>&6 + +# Allow CC to be a program name with arguments. +set dummy $CC +compiler="$2" + +# We assume here that the value for ac_cv_prog_cc_pic will not be cached +# in isolation, and that seeing it set (from the cache) indicates that +# the associated values are set (in the cache) correctly too. +echo $ac_n "checking for $compiler option to produce PIC... $ac_c" 1>&6 +echo "$progname:563:checking for $compiler option to produce PIC" 1>&5 +if test "X${ac_cv_prog_cc_pic+set}" = Xset; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + ac_cv_prog_cc_pic= + ac_cv_prog_cc_shlib= + ac_cv_prog_cc_wl= + ac_cv_prog_cc_static= + ac_cv_prog_cc_no_builtin= + ac_cv_prog_cc_can_build_shared=$can_build_shared + + if test "$with_gcc" = yes; then + ac_cv_prog_cc_wl='-Wl,' + ac_cv_prog_cc_static='-static' + + case "$host_os" in + beos* | irix5* | irix6* | osf3* | osf4* | osf5*) + # PIC is the default for these OSes. + ;; + aix*) + # Below there is a dirty hack to force normal static linking with -ldl + # The problem is because libdl dynamically linked with both libc and + # libC (AIX C++ library), which obviously doesn't included in libraries + # list by gcc. This cause undefined symbols with -static flags. 
+ # This hack allows C programs to be linked with "-static -ldl", but + # we not sure about C++ programs. + ac_cv_prog_cc_static="$ac_cv_prog_cc_static ${ac_cv_prog_cc_wl}-lC" + ;; + cygwin* | mingw* | os2*) + # This hack is so that the source file can tell whether it is being + # built for inclusion in a dll (and should export symbols for example). + ac_cv_prog_cc_pic='-DDLL_EXPORT' + ;; + amigaos*) + # FIXME: we need at least 68020 code to build shared libraries, but + # adding the `-m68020' flag to GCC prevents building anything better, + # like `-m68040'. + ac_cv_prog_cc_pic='-m68020 -resident32 -malways-restore-a4' + ;; + sysv4*MP*) + if test -d /usr/nec; then + ac_cv_prog_cc_pic=-Kconform_pic + fi + ;; + *) + ac_cv_prog_cc_pic='-fPIC' + ;; + esac + else + # PORTME Check for PIC flags for the system compiler. + case "$host_os" in + aix3* | aix4*) + # All AIX code is PIC. + ac_cv_prog_cc_static='-bnso -bI:/lib/syscalls.exp' + ;; + + hpux9* | hpux10* | hpux11*) + # Is there a better ac_cv_prog_cc_static that works with the bundled CC? + ac_cv_prog_cc_wl='-Wl,' + ac_cv_prog_cc_static="${ac_cv_prog_cc_wl}-a ${ac_cv_prog_cc_wl}archive" + ac_cv_prog_cc_pic='+Z' + ;; + + irix5* | irix6*) + ac_cv_prog_cc_wl='-Wl,' + ac_cv_prog_cc_static='-non_shared' + # PIC (with -KPIC) is the default. + ;; + + cygwin* | mingw* | os2*) + # This hack is so that the source file can tell whether it is being + # built for inclusion in a dll (and should export symbols for example). + ac_cv_prog_cc_pic='-DDLL_EXPORT' + ;; + + osf3* | osf4* | osf5*) + # All OSF/1 code is PIC. 
+ ac_cv_prog_cc_wl='-Wl,' + ac_cv_prog_cc_static='-non_shared' + ;; + + sco3.2v5*) + ac_cv_prog_cc_pic='-Kpic' + ac_cv_prog_cc_static='-dn' + ac_cv_prog_cc_shlib='-belf' + ;; + + solaris*) + ac_cv_prog_cc_pic='-KPIC' + ac_cv_prog_cc_static='-Bstatic' + ac_cv_prog_cc_wl='-Wl,' + ;; + + sunos4*) + ac_cv_prog_cc_pic='-PIC' + ac_cv_prog_cc_static='-Bstatic' + ac_cv_prog_cc_wl='-Qoption ld ' + ;; + + sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*) + ac_cv_prog_cc_pic='-KPIC' + ac_cv_prog_cc_static='-Bstatic' + ac_cv_prog_cc_wl='-Wl,' + ;; + + uts4*) + ac_cv_prog_cc_pic='-pic' + ac_cv_prog_cc_static='-Bstatic' + ;; + + sysv4*MP*) + if test -d /usr/nec ;then + ac_cv_prog_cc_pic='-Kconform_pic' + ac_cv_prog_cc_static='-Bstatic' + fi + ;; + + *) + ac_cv_prog_cc_can_build_shared=no + ;; + esac + fi +fi +if test -z "$ac_cv_prog_cc_pic"; then + echo "$ac_t"none 1>&6 +else + echo "$ac_t""$ac_cv_prog_cc_pic" 1>&6 + + # Check to make sure the pic_flag actually works. + echo $ac_n "checking if $compiler PIC flag $ac_cv_prog_cc_pic works... $ac_c" 1>&6 + echo "$progname:693:checking that $compiler PIC flag $ac_cv_prog_cc_pic works." 1>&5 + if test "X${ac_cv_prog_cc_pic_works+set}" = Xset; then + echo $ac_n "(cached) $ac_c" 1>&6 + else + ac_cv_prog_cc_pic_works=yes + $rm conftest* + echo "int some_variable = 0;" > conftest.c + save_CFLAGS="$CFLAGS" + CFLAGS="$CFLAGS $ac_cv_prog_cc_pic -DPIC" + if { (eval echo $progname:702: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>conftest.err; } && test -s conftest.$objext; then + # Append any warnings to the config.log. + cat conftest.err 1>&5 + + case "$host_os" in + hpux9* | hpux10* | hpux11*) + # On HP-UX, both CC and GCC only warn that PIC is supported... then + # they create non-PIC objects. So, if there were any warnings, we + # assume that PIC is not supported. 
+ if test -s conftest.err; then + ac_cv_prog_cc_pic_works=no + ac_cv_prog_cc_can_build_shared=no + ac_cv_prog_cc_pic= + else + ac_cv_prog_cc_pic_works=yes + ac_cv_prog_cc_pic=" $ac_cv_prog_cc_pic" + fi + ;; + *) + ac_cv_prog_cc_pic_works=yes + ac_cv_prog_cc_pic=" $ac_cv_prog_cc_pic" + ;; + esac + else + # Append any errors to the config.log. + cat conftest.err 1>&5 + ac_cv_prog_cc_pic_works=no + ac_cv_prog_cc_can_build_shared=no + ac_cv_prog_cc_pic= + fi + CFLAGS="$save_CFLAGS" + $rm conftest* + fi + # Belt *and* braces to stop my trousers falling down: + if test "X$ac_cv_prog_cc_pic_works" = Xno; then + ac_cv_prog_cc_pic= + ac_cv_prog_cc_can_build_shared=no + fi + echo "$ac_t""$ac_cv_prog_cc_pic_works" 1>&6 +fi + +# Check for any special shared library compilation flags. +if test -n "$ac_cv_prog_cc_shlib"; then + echo "$progname: warning: \`$CC' requires \`$ac_cv_prog_cc_shlib' to build shared libraries" 1>&2 + if echo "$old_CC $old_CFLAGS " | egrep -e "[ ]$ac_cv_prog_cc_shlib[ ]" >/dev/null; then : + else + echo "$progname: add \`$ac_cv_prog_cc_shlib' to the CC or CFLAGS env variable and reconfigure" 1>&2 + ac_cv_prog_cc_can_build_shared=no + fi +fi + +echo $ac_n "checking if $compiler static flag $ac_cv_prog_cc_static works... 
$ac_c" 1>&6 +echo "$progname:754: checking if $compiler static flag $ac_cv_prog_cc_static works" >&5 +if test "X${ac_cv_prog_cc_static_works+set}" = Xset; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + $rm conftest* + echo 'main(){return(0);}' > conftest.c + save_LDFLAGS="$LDFLAGS" + LDFLAGS="$LDFLAGS $ac_cv_prog_cc_static" + if { (eval echo $progname:762: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then + ac_cv_prog_cc_static_works=yes + else + ac_cv_prog_cc_static_works=no + ac_cv_prog_cc_static= + fi + LDFLAGS="$save_LDFLAGS" + $rm conftest* +fi +# Belt *and* braces to stop my trousers falling down: +if test "X$ac_cv_prog_cc_static_works" = Xno; then + ac_cv_prog_cc_static= +fi +echo "$ac_t""$ac_cv_prog_cc_static_works" 1>&6 +pic_flag="$ac_cv_prog_cc_pic" +special_shlib_compile_flags="$ac_cv_prog_cc_shlib" +wl="$ac_cv_prog_cc_wl" +link_static_flag="$ac_cv_prog_cc_static" +no_builtin_flag="$ac_cv_prog_cc_no_builtin" +can_build_shared="$ac_cv_prog_cc_can_build_shared" + +# Check to see if options -o and -c are simultaneously supported by compiler +echo $ac_n "checking if $compiler supports -c -o file.o... $ac_c" 1>&6 +$rm -r conftest 2>/dev/null +mkdir conftest +cd conftest +$rm conftest* +echo "int some_variable = 0;" > conftest.c +mkdir out +# According to Tom Tromey, Ian Lance Taylor reported there are C compilers +# that will create temporary files in the current directory regardless of +# the output directory. Thus, making CWD read-only will cause this test +# to fail, enabling locking or at least warning the user not to do parallel +# builds. +chmod -w . 
+save_CFLAGS="$CFLAGS" +CFLAGS="$CFLAGS -o out/conftest2.o" +echo "$progname:799: checking if $compiler supports -c -o file.o" >&5 +if { (eval echo $progname:800: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>out/conftest.err; } && test -s out/conftest2.o; then + + # The compiler can only warn and ignore the option if not recognized + # So say no if there are warnings + if test -s out/conftest.err; then + echo "$ac_t"no 1>&6 + compiler_c_o=no + else + echo "$ac_t"yes 1>&6 + compiler_c_o=yes + fi +else + # Append any errors to the config.log. + cat out/conftest.err 1>&5 + compiler_c_o=no + echo "$ac_t"no 1>&6 +fi +CFLAGS="$save_CFLAGS" +chmod u+w . +$rm conftest* out/* +rmdir out +cd .. +rmdir conftest +$rm -r conftest 2>/dev/null + +if test x"$compiler_c_o" = x"yes"; then + # Check to see if we can write to a .lo + echo $ac_n "checking if $compiler supports -c -o file.lo... $ac_c" 1>&6 + $rm conftest* + echo "int some_variable = 0;" > conftest.c + save_CFLAGS="$CFLAGS" + CFLAGS="$CFLAGS -c -o conftest.lo" + echo "$progname:832: checking if $compiler supports -c -o file.lo" >&5 +if { (eval echo $progname:833: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>conftest.err; } && test -s conftest.lo; then + + # The compiler can only warn and ignore the option if not recognized + # So say no if there are warnings + if test -s conftest.err; then + echo "$ac_t"no 1>&6 + compiler_o_lo=no + else + echo "$ac_t"yes 1>&6 + compiler_o_lo=yes + fi + else + # Append any errors to the config.log. + cat conftest.err 1>&5 + compiler_o_lo=no + echo "$ac_t"no 1>&6 + fi + CFLAGS="$save_CFLAGS" + $rm conftest* +else + compiler_o_lo=no +fi + +# Check to see if we can do hard links to lock some files if needed +hard_links="nottested" +if test "$compiler_c_o" = no && test "$need_locks" != no; then + # do not overwrite the value of need_locks provided by the user + echo $ac_n "checking if we can lock with hard links... 
$ac_c" 1>&6 + hard_links=yes + $rm conftest* + ln conftest.a conftest.b 2>/dev/null && hard_links=no + touch conftest.a + ln conftest.a conftest.b 2>&5 || hard_links=no + ln conftest.a conftest.b 2>/dev/null && hard_links=no + echo "$ac_t$hard_links" 1>&6 + $rm conftest* + if test "$hard_links" = no; then + echo "*** WARNING: \`$CC' does not support \`-c -o', so \`make -j' may be unsafe" >&2 + need_locks=warn + fi +else + need_locks=no +fi + +if test "$with_gcc" = yes; then + # Check to see if options -fno-rtti -fno-exceptions are supported by compiler + echo $ac_n "checking if $compiler supports -fno-rtti -fno-exceptions ... $ac_c" 1>&6 + $rm conftest* + echo "int some_variable = 0;" > conftest.c + save_CFLAGS="$CFLAGS" + CFLAGS="$CFLAGS -fno-rtti -fno-exceptions -c conftest.c" + echo "$progname:884: checking if $compiler supports -fno-rtti -fno-exceptions" >&5 + if { (eval echo $progname:885: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>conftest.err; } && test -s conftest.o; then + + # The compiler can only warn and ignore the option if not recognized + # So say no if there are warnings + if test -s conftest.err; then + echo "$ac_t"no 1>&6 + compiler_rtti_exceptions=no + else + echo "$ac_t"yes 1>&6 + compiler_rtti_exceptions=yes + fi + else + # Append any errors to the config.log. + cat conftest.err 1>&5 + compiler_rtti_exceptions=no + echo "$ac_t"no 1>&6 + fi + CFLAGS="$save_CFLAGS" + $rm conftest* + + if test "$compiler_rtti_exceptions" = "yes"; then + no_builtin_flag=' -fno-builtin -fno-rtti -fno-exceptions' + else + no_builtin_flag=' -fno-builtin' + fi + +fi + +# See if the linker supports building shared libraries. +echo $ac_n "checking whether the linker ($LD) supports shared libraries... 
$ac_c" 1>&6 + +allow_undefined_flag= +no_undefined_flag= +need_lib_prefix=unknown +need_version=unknown +# when you set need_version to no, make sure it does not cause -set_version +# flags to be left without arguments +archive_cmds= +archive_expsym_cmds= +old_archive_from_new_cmds= +old_archive_from_expsyms_cmds= +striplib= +old_striplib= +export_dynamic_flag_spec= +whole_archive_flag_spec= +thread_safe_flag_spec= +hardcode_into_libs=no +hardcode_libdir_flag_spec= +hardcode_libdir_separator= +hardcode_direct=no +hardcode_minus_L=no +hardcode_shlibpath_var=unsupported +runpath_var= +link_all_deplibs=unknown +always_export_symbols=no +export_symbols_cmds='$NM $libobjs $convenience | $global_symbol_pipe | sed '\''s/.* //'\'' | sort | uniq > $export_symbols' +# include_expsyms should be a list of space-separated symbols to be *always* +# included in the symbol list +include_expsyms= +# exclude_expsyms can be an egrep regular expression of symbols to exclude +# it will be wrapped by ` (' and `)$', so one must not match beginning or +# end of line. Example: `a|bc|.*d.*' will exclude the symbols `a' and `bc', +# as well as any symbol that contains `d'. +exclude_expsyms="_GLOBAL_OFFSET_TABLE_" +# Although _GLOBAL_OFFSET_TABLE_ is a valid symbol C name, most a.out +# platforms (ab)use it in PIC code, but their linkers get confused if +# the symbol is explicitly referenced. Since portable code cannot +# rely on this symbol name, it's probably fine to never include it in +# preloaded symbol tables. +extract_expsyms_cmds= + +case "$host_os" in +cygwin* | mingw*) + # FIXME: the MSVC++ port hasn't been tested in a loooong time + # When not using gcc, we currently assume that we are using + # Microsoft Visual C++. + if test "$with_gcc" != yes; then + with_gnu_ld=no + fi + ;; + +esac + +ld_shlibs=yes +if test "$with_gnu_ld" = yes; then + # If archive_cmds runs LD, not CC, wlarc should be empty + wlarc='${wl}' + + # See if GNU ld supports shared libraries. 
+ case "$host_os" in + aix3* | aix4*) + # On AIX, the GNU linker is very broken + ld_shlibs=no + cat <&2 + +*** Warning: the GNU linker, at least up to release 2.9.1, is reported +*** to be unable to reliably create shared libraries on AIX. +*** Therefore, libtool is disabling shared libraries support. If you +*** really care for shared libraries, you may want to modify your PATH +*** so that a non-GNU linker is found, and then restart. + +EOF + ;; + + amigaos*) + archive_cmds='$rm $output_objdir/a2ixlibrary.data~$echo "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$echo "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$echo "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$echo "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR cru $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)' + hardcode_libdir_flag_spec='-L$libdir' + hardcode_minus_L=yes + + # Samuel A. Falvo II reports + # that the semantics of dynamic libraries on AmigaOS, at least up + # to version 4, is to share data among multiple programs linked + # with the same dynamic library. Since this doesn't match the + # behavior of shared libraries on other platforms, we cannot use + # them. + ld_shlibs=no + ;; + + beos*) + if $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then + allow_undefined_flag=unsupported + # Joseph Beckenbach says some releases of gcc + # support --undefined. This deserves some investigation. FIXME + archive_cmds='$CC -nostart $libobjs $deplibs $linker_flags ${wl}-soname $wl$soname -o $lib' + else + ld_shlibs=no + fi + ;; + + cygwin* | mingw*) + # hardcode_libdir_flag_spec is actually meaningless, as there is + # no search path for DLLs. 
+ hardcode_libdir_flag_spec='-L$libdir' + allow_undefined_flag=unsupported + always_export_symbols=yes + + extract_expsyms_cmds='test -f $output_objdir/impgen.c || \ + sed -e "/^# \/\* impgen\.c starts here \*\//,/^# \/\* impgen.c ends here \*\// { s/^# //; p; }" -e d < $0 > $output_objdir/impgen.c~ + test -f $output_objdir/impgen.exe || (cd $output_objdir && \ + if test "x$HOST_CC" != "x" ; then $HOST_CC -o impgen impgen.c ; \ + else $CC -o impgen impgen.c ; fi)~ + $output_objdir/impgen $dir/$soname > $output_objdir/$soname-def' + + old_archive_from_expsyms_cmds='$DLLTOOL --as=$AS --dllname $soname --def $output_objdir/$soname-def --output-lib $output_objdir/$newlib' + + # cygwin and mingw dlls have different entry points and sets of symbols + # to exclude. + # FIXME: what about values for MSVC? + dll_entry=__cygwin_dll_entry@12 + dll_exclude_symbols=DllMain@12,_cygwin_dll_entry@12,_cygwin_noncygwin_dll_entry@12~ + case "$host_os" in + mingw*) + # mingw values + dll_entry=_DllMainCRTStartup@12 + dll_exclude_symbols=DllMain@12,DllMainCRTStartup@12,DllEntryPoint@12~ + ;; + esac + + # mingw and cygwin differ, and it's simplest to just exclude the union + # of the two symbol sets. 
+ dll_exclude_symbols=DllMain@12,_cygwin_dll_entry@12,_cygwin_noncygwin_dll_entry@12,DllMainCRTStartup@12,DllEntryPoint@12 + + # recent cygwin and mingw systems supply a stub DllMain which the user + # can override, but on older systems we have to supply one (in ltdll.c) + if test "x$lt_cv_need_dllmain" = "xyes"; then + ltdll_obj='$output_objdir/$soname-ltdll.'"$objext " + ltdll_cmds='test -f $output_objdir/$soname-ltdll.c || sed -e "/^# \/\* ltdll\.c starts here \*\//,/^# \/\* ltdll.c ends here \*\// { s/^# //; p; }" -e d < $0 > $output_objdir/$soname-ltdll.c~ + test -f $output_objdir/$soname-ltdll.$objext || (cd $output_objdir && $CC -c $soname-ltdll.c)~' + else + ltdll_obj= + ltdll_cmds= + fi + + # Extract the symbol export list from an `--export-all' def file, + # then regenerate the def file from the symbol export list, so that + # the compiled dll only exports the symbol export list. + # Be careful not to strip the DATA tag left by newer dlltools. + export_symbols_cmds="$ltdll_cmds"' + $DLLTOOL --export-all --exclude-symbols '$dll_exclude_symbols' --output-def $output_objdir/$soname-def '$ltdll_obj'$libobjs $convenience~ + sed -e "1,/EXPORTS/d" -e "s/ @ [0-9]*//" -e "s/ *;.*$//" < $output_objdir/$soname-def > $export_symbols' + + # If DATA tags from a recent dlltool are present, honour them! 
+ archive_expsym_cmds='echo EXPORTS > $output_objdir/$soname-def~ + _lt_hint=1; + cat $export_symbols | while read symbol; do + set dummy \$symbol; + case \$# in + 2) echo " \$2 @ \$_lt_hint ; " >> $output_objdir/$soname-def;; + *) echo " \$2 @ \$_lt_hint \$3 ; " >> $output_objdir/$soname-def;; + esac; + _lt_hint=`expr 1 + \$_lt_hint`; + done~ + '"$ltdll_cmds"' + $CC -Wl,--base-file,$output_objdir/$soname-base '$lt_cv_cc_dll_switch' -Wl,-e,'$dll_entry' -o $lib '$ltdll_obj'$libobjs $deplibs $compiler_flags~ + $DLLTOOL --as=$AS --dllname $soname --exclude-symbols '$dll_exclude_symbols' --def $output_objdir/$soname-def --base-file $output_objdir/$soname-base --output-exp $output_objdir/$soname-exp~ + $CC -Wl,--base-file,$output_objdir/$soname-base $output_objdir/$soname-exp '$lt_cv_cc_dll_switch' -Wl,-e,'$dll_entry' -o $lib '$ltdll_obj'$libobjs $deplibs $compiler_flags~ + $DLLTOOL --as=$AS --dllname $soname --exclude-symbols '$dll_exclude_symbols' --def $output_objdir/$soname-def --base-file $output_objdir/$soname-base --output-exp $output_objdir/$soname-exp~ + $CC $output_objdir/$soname-exp '$lt_cv_cc_dll_switch' -Wl,-e,'$dll_entry' -o $lib '$ltdll_obj'$libobjs $deplibs $compiler_flags' + ;; + + netbsd*) + if echo __ELF__ | $CC -E - | grep __ELF__ >/dev/null; then + archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib' + archive_expsym_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib' + else + archive_cmds='$LD -Bshareable $libobjs $deplibs $linker_flags -o $lib' + fi + ;; + + solaris* | sysv5*) + if $LD -v 2>&1 | egrep 'BFD 2\.8' > /dev/null; then + ld_shlibs=no + cat <&2 + +*** Warning: The releases 2.8.* of the GNU linker cannot reliably +*** create shared libraries on Solaris systems. Therefore, libtool +*** is disabling shared libraries support. We urge you to upgrade GNU +*** binutils to release 2.9.1 or newer. 
Another option is to modify +*** your PATH or compiler configuration so that the native linker is +*** used, and then restart. + +EOF + elif $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then + archive_cmds='$CC -shared $libobjs $deplibs $linker_flags ${wl}-soname $wl$soname -o $lib' + archive_expsym_cmds='$CC -shared $libobjs $deplibs $linker_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib' + else + ld_shlibs=no + fi + ;; + + sunos4*) + archive_cmds='$LD -assert pure-text -Bshareable -o $lib $libobjs $deplibs $linker_flags' + wlarc= + hardcode_direct=yes + hardcode_shlibpath_var=no + ;; + + *) + if $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then + archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib' + archive_expsym_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib' + else + ld_shlibs=no + fi + ;; + esac + + if test "$ld_shlibs" = yes; then + runpath_var=LD_RUN_PATH + hardcode_libdir_flag_spec='${wl}--rpath ${wl}$libdir' + export_dynamic_flag_spec='${wl}--export-dynamic' + case $host_os in + cygwin* | mingw*) + # dlltool doesn't understand --whole-archive et. al. + whole_archive_flag_spec= + ;; + *) + # ancient GNU ld didn't support --whole-archive et. al. 
+ if $LD --help 2>&1 | egrep 'no-whole-archive' > /dev/null; then + whole_archive_flag_spec="$wlarc"'--whole-archive$convenience '"$wlarc"'--no-whole-archive' + else + whole_archive_flag_spec= + fi + ;; + esac + fi +else + # PORTME fill in a description of your system's linker (not GNU ld) + case "$host_os" in + aix3*) + allow_undefined_flag=unsupported + always_export_symbols=yes + archive_expsym_cmds='$LD -o $output_objdir/$soname $libobjs $deplibs $linker_flags -bE:$export_symbols -T512 -H512 -bM:SRE~$AR cru $lib $output_objdir/$soname' + # Note: this linker hardcodes the directories in LIBPATH if there + # are no directories specified by -L. + hardcode_minus_L=yes + if test "$with_gcc" = yes && test -z "$link_static_flag"; then + # Neither direct hardcoding nor static linking is supported with a + # broken collect2. + hardcode_direct=unsupported + fi + ;; + + aix4*) + hardcode_libdir_flag_spec='${wl}-b ${wl}nolibpath ${wl}-b ${wl}libpath:$libdir:/usr/lib:/lib' + hardcode_libdir_separator=':' + if test "$with_gcc" = yes; then + collect2name=`${CC} -print-prog-name=collect2` + if test -f "$collect2name" && \ + strings "$collect2name" | grep resolve_lib_name >/dev/null + then + # We have reworked collect2 + hardcode_direct=yes + else + # We have old collect2 + hardcode_direct=unsupported + # It fails to find uninstalled libraries when the uninstalled + # path is not listed in the libpath. 
Setting hardcode_minus_L + # to unsupported forces relinking + hardcode_minus_L=yes + hardcode_libdir_flag_spec='-L$libdir' + hardcode_libdir_separator= + fi + shared_flag='-shared' + else + shared_flag='${wl}-bM:SRE' + hardcode_direct=yes + fi + allow_undefined_flag=' ${wl}-berok' + archive_cmds="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs $compiler_flags ${wl}-bexpall ${wl}-bnoentry${allow_undefined_flag}' + archive_expsym_cmds="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs $compiler_flags ${wl}-bE:$export_symbols ${wl}-bnoentry${allow_undefined_flag}' + case "$host_os" in aix4.[01]|aix4.[01].*) + # According to Greg Wooledge, -bexpall is only supported from AIX 4.2 on + always_export_symbols=yes ;; + esac + ;; + + amigaos*) + archive_cmds='$rm $output_objdir/a2ixlibrary.data~$echo "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$echo "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$echo "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$echo "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR cru $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)' + hardcode_libdir_flag_spec='-L$libdir' + hardcode_minus_L=yes + # see comment about different semantics on the GNU ld section + ld_shlibs=no + ;; + + cygwin* | mingw*) + # When not using gcc, we currently assume that we are using + # Microsoft Visual C++. + # hardcode_libdir_flag_spec is actually meaningless, as there is + # no search path for DLLs. + hardcode_libdir_flag_spec=' ' + allow_undefined_flag=unsupported + # Tell ltmain to make .lib files, not .a files. + libext=lib + # FIXME: Setting linknames here is a bad hack. + archive_cmds='$CC -o $lib $libobjs $compiler_flags `echo "$deplibs" | sed -e '\''s/ -lc$//'\''` -link -dll~linknames=' + # The linker will automatically build a .lib file if we build a DLL. + old_archive_from_new_cmds='true' + # FIXME: Should let the user specify the lib program. 
+ old_archive_cmds='lib /OUT:$oldlib$oldobjs$old_deplibs' + fix_srcfile_path='`cygpath -w $srcfile`' + ;; + + freebsd1*) + ld_shlibs=no + ;; + + # FreeBSD 2.2.[012] allows us to include c++rt0.o to get C++ constructor + # support. Future versions do this automatically, but an explicit c++rt0.o + # does not break anything, and helps significantly (at the cost of a little + # extra space). + freebsd2.2*) + archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags /usr/lib/c++rt0.o' + hardcode_libdir_flag_spec='-R$libdir' + hardcode_direct=yes + hardcode_shlibpath_var=no + ;; + + # Unfortunately, older versions of FreeBSD 2 do not have this feature. + freebsd2*) + archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags' + hardcode_direct=yes + hardcode_minus_L=yes + hardcode_shlibpath_var=no + ;; + + # FreeBSD 3 and greater uses gcc -shared to do shared libraries. + freebsd*) + archive_cmds='$CC -shared -o $lib $libobjs $deplibs $compiler_flags' + hardcode_libdir_flag_spec='-R$libdir' + hardcode_direct=yes + hardcode_shlibpath_var=no + ;; + + hpux9* | hpux10* | hpux11*) + case "$host_os" in + hpux9*) archive_cmds='$rm $output_objdir/$soname~$LD -b +b $install_libdir -o $output_objdir/$soname $libobjs $deplibs $linker_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib' ;; + *) archive_cmds='$LD -b +h $soname +b $install_libdir -o $lib $libobjs $deplibs $linker_flags' ;; + esac + hardcode_libdir_flag_spec='${wl}+b ${wl}$libdir' + hardcode_libdir_separator=: + hardcode_direct=yes + hardcode_minus_L=yes # Not in the search PATH, but as the default + # location of the library. 
+ export_dynamic_flag_spec='${wl}-E' + ;; + + irix5* | irix6*) + if test "$with_gcc" = yes; then + archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && echo ${wl}-set_version ${wl}$verstring` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib' + else + archive_cmds='$LD -shared $libobjs $deplibs $linker_flags -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${output_objdir}/so_locations -o $lib' + fi + hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir' + hardcode_libdir_separator=: + link_all_deplibs=yes + ;; + + netbsd*) + if echo __ELF__ | $CC -E - | grep __ELF__ >/dev/null; then + archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags' # a.out + else + archive_cmds='$LD -shared -o $lib $libobjs $deplibs $linker_flags' # ELF + fi + hardcode_libdir_flag_spec='${wl}-R$libdir' + hardcode_direct=yes + hardcode_shlibpath_var=no + ;; + + openbsd*) + archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags' + hardcode_libdir_flag_spec='-R$libdir' + hardcode_direct=yes + hardcode_shlibpath_var=no + ;; + + os2*) + hardcode_libdir_flag_spec='-L$libdir' + hardcode_minus_L=yes + allow_undefined_flag=unsupported + archive_cmds='$echo "LIBRARY $libname INITINSTANCE" > $output_objdir/$libname.def~$echo "DESCRIPTION \"$libname\"" >> $output_objdir/$libname.def~$echo DATA >> $output_objdir/$libname.def~$echo " SINGLE NONSHARED" >> $output_objdir/$libname.def~$echo EXPORTS >> $output_objdir/$libname.def~emxexp $libobjs >> $output_objdir/$libname.def~$CC -Zdll -Zcrtdll -o $lib $libobjs $deplibs $compiler_flags $output_objdir/$libname.def' + old_archive_from_new_cmds='emximp -o $output_objdir/$libname.a $output_objdir/$libname.def' + ;; + + osf3*) + if test "$with_gcc" = yes; then + allow_undefined_flag=' ${wl}-expect_unresolved ${wl}\*' + archive_cmds='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags ${wl}-soname 
${wl}$soname `test -n "$verstring" && echo ${wl}-set_version ${wl}$verstring` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib' + else + allow_undefined_flag=' -expect_unresolved \*' + archive_cmds='$LD -shared${allow_undefined_flag} $libobjs $deplibs $linker_flags -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${output_objdir}/so_locations -o $lib' + fi + hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir' + hardcode_libdir_separator=: + ;; + + osf4* | osf5*) # as osf3* with the addition of -msym flag + if test "$with_gcc" = yes; then + allow_undefined_flag=' ${wl}-expect_unresolved ${wl}\*' + archive_cmds='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags ${wl}-msym ${wl}-soname ${wl}$soname `test -n "$verstring" && echo ${wl}-set_version ${wl}$verstring` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib' + else + allow_undefined_flag=' -expect_unresolved \*' + archive_cmds='$LD -shared${allow_undefined_flag} $libobjs $deplibs $linker_flags -msym -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${output_objdir}/so_locations -o $lib' + fi + hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir' + hardcode_libdir_separator=: + ;; + + sco3.2v5*) + archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags' + hardcode_shlibpath_var=no + runpath_var=LD_RUN_PATH + hardcode_runpath_var=yes + ;; + + solaris*) + no_undefined_flag=' -z text' + # $CC -shared without GNU ld will not create a library from C++ + # object files and a static libstdc++, better avoid it by now + archive_cmds='$LD -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $linker_flags' + archive_expsym_cmds='$echo "{ global:" > $lib.exp~cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $lib.exp~$echo "local: *; };" >> $lib.exp~ + $LD -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $linker_flags~$rm $lib.exp' + 
hardcode_libdir_flag_spec='-R$libdir' + hardcode_shlibpath_var=no + case "$host_os" in + solaris2.[0-5] | solaris2.[0-5].*) ;; + *) # Supported since Solaris 2.6 (maybe 2.5.1?) + whole_archive_flag_spec='-z allextract$convenience -z defaultextract' ;; + esac + link_all_deplibs=yes + ;; + + sunos4*) + archive_cmds='$LD -assert pure-text -Bstatic -o $lib $libobjs $deplibs $linker_flags' + hardcode_libdir_flag_spec='-L$libdir' + hardcode_direct=yes + hardcode_minus_L=yes + hardcode_shlibpath_var=no + ;; + + sysv4) + archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags' + runpath_var='LD_RUN_PATH' + hardcode_shlibpath_var=no + hardcode_direct=no #Motorola manual says yes, but my tests say they lie + ;; + + sysv4.3*) + archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags' + hardcode_shlibpath_var=no + export_dynamic_flag_spec='-Bexport' + ;; + + sysv5*) + no_undefined_flag=' -z text' + # $CC -shared without GNU ld will not create a library from C++ + # object files and a static libstdc++, better avoid it by now + archive_cmds='$LD -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $linker_flags' + archive_expsym_cmds='$echo "{ global:" > $lib.exp~cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $lib.exp~$echo "local: *; };" >> $lib.exp~ + $LD -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $linker_flags~$rm $lib.exp' + hardcode_libdir_flag_spec= + hardcode_shlibpath_var=no + runpath_var='LD_RUN_PATH' + ;; + + uts4*) + archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags' + hardcode_libdir_flag_spec='-L$libdir' + hardcode_shlibpath_var=no + ;; + + dgux*) + archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags' + hardcode_libdir_flag_spec='-L$libdir' + hardcode_shlibpath_var=no + ;; + + sysv4*MP*) + if test -d /usr/nec; then + archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags' + hardcode_shlibpath_var=no + runpath_var=LD_RUN_PATH + 
hardcode_runpath_var=yes + ld_shlibs=yes + fi + ;; + + sysv4.2uw2*) + archive_cmds='$LD -G -o $lib $libobjs $deplibs $linker_flags' + hardcode_direct=yes + hardcode_minus_L=no + hardcode_shlibpath_var=no + hardcode_runpath_var=yes + runpath_var=LD_RUN_PATH + ;; + + unixware7*) + archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags' + runpath_var='LD_RUN_PATH' + hardcode_shlibpath_var=no + ;; + + *) + ld_shlibs=no + ;; + esac +fi +echo "$ac_t$ld_shlibs" 1>&6 +test "$ld_shlibs" = no && can_build_shared=no + +# Check hardcoding attributes. +echo $ac_n "checking how to hardcode library paths into programs... $ac_c" 1>&6 +hardcode_action= +if test -n "$hardcode_libdir_flag_spec" || \ + test -n "$runpath_var"; then + + # We can hardcode nonexistent directories. + if test "$hardcode_direct" != no && + # If the only mechanism to avoid hardcoding is shlibpath_var, we + # have to relink, otherwise we might link with an installed library + # when we should be linking with a yet-to-be-installed one + ## test "$hardcode_shlibpath_var" != no && + test "$hardcode_minus_L" != no; then + # Linking always hardcodes the temporary library directory. + hardcode_action=relink + else + # We can link without hardcoding, and we can hardcode nonexisting dirs. + hardcode_action=immediate + fi +else + # We cannot hardcode anything, or else we can only hardcode existing + # directories. + hardcode_action=unsupported +fi +echo "$ac_t$hardcode_action" 1>&6 + +echo $ac_n "checking whether stripping libraries is possible... 
$ac_c" 1>&6 +if test -n "$STRIP" && $STRIP -V 2>&1 | grep "GNU strip" >/dev/null; then + test -z "$old_striplib" && old_striplib="$STRIP --strip-debug" + test -z "$striplib" && striplib="$STRIP --strip-unneeded" + echo "${ac_t}yes" 1>&6 +else + echo "${ac_t}no" 1>&6 +fi + +reload_cmds='$LD$reload_flag -o $output$reload_objs' +test -z "$deplibs_check_method" && deplibs_check_method=unknown + +# PORTME Fill in your ld.so characteristics +library_names_spec= +libname_spec='lib$name' +soname_spec= +postinstall_cmds= +postuninstall_cmds= +finish_cmds= +finish_eval= +shlibpath_var= +shlibpath_overrides_runpath=unknown +version_type=none +dynamic_linker="$host_os ld.so" +sys_lib_dlsearch_path_spec="/lib /usr/lib" +sys_lib_search_path_spec="/lib /usr/lib /usr/local/lib" + +echo $ac_n "checking dynamic linker characteristics... $ac_c" 1>&6 +case "$host_os" in +aix3*) + version_type=linux + library_names_spec='${libname}${release}.so$versuffix $libname.a' + shlibpath_var=LIBPATH + + # AIX has no versioning support, so we append a major version to the name. + soname_spec='${libname}${release}.so$major' + ;; + +aix4*) + version_type=linux + # AIX has no versioning support, so currently we can not hardcode correct + # soname into executable. Probably we can add versioning support to + # collect2, so additional links can be useful in future. + # We preserve .a as extension for shared libraries though AIX4.2 + # and later linker supports .so + library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.a' + shlibpath_var=LIBPATH + ;; + +amigaos*) + library_names_spec='$libname.ixlibrary $libname.a' + # Create ${libname}_ixlibrary.a entries in /sys/libs. 
+ finish_eval='for lib in `ls $libdir/*.ixlibrary 2>/dev/null`; do libname=`$echo "X$lib" | $Xsed -e '\''s%^.*/\([^/]*\)\.ixlibrary$%\1%'\''`; test $rm /sys/libs/${libname}_ixlibrary.a; $show "(cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a)"; (cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a) || exit 1; done' + ;; + +beos*) + library_names_spec='${libname}.so' + dynamic_linker="$host_os ld.so" + shlibpath_var=LIBRARY_PATH + lt_cv_dlopen="load_add_on" + lt_cv_dlopen_libs= + lt_cv_dlopen_self=yes + ;; + +bsdi4*) + version_type=linux + need_version=no + library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so' + soname_spec='${libname}${release}.so$major' + finish_cmds='PATH="\$PATH:/sbin" ldconfig $libdir' + shlibpath_var=LD_LIBRARY_PATH + sys_lib_search_path_spec="/shlib /usr/lib /usr/X11/lib /usr/contrib/lib /lib /usr/local/lib" + sys_lib_dlsearch_path_spec="/shlib /usr/lib /usr/local/lib" + export_dynamic_flag_spec=-rdynamic + # the default ld.so.conf also contains /usr/contrib/lib and + # /usr/X11R6/lib (/usr/X11 is a link to /usr/X11R6), but let us allow + # libtool to hard-code these into programs + ;; + +cygwin* | mingw*) + version_type=windows + need_version=no + need_lib_prefix=no + if test "$with_gcc" = yes; then + library_names_spec='${libname}`echo ${release} | sed -e 's/[.]/-/g'`${versuffix}.dll' + else + library_names_spec='${libname}`echo ${release} | sed -e 's/[.]/-/g'`${versuffix}.dll $libname.lib' + fi + dynamic_linker='Win32 ld.exe' + # FIXME: first we should search . 
and the directory the executable is in + shlibpath_var=PATH + lt_cv_dlopen="LoadLibrary" + lt_cv_dlopen_libs= + ;; + +freebsd1*) + dynamic_linker=no + ;; + +freebsd*) + objformat=`test -x /usr/bin/objformat && /usr/bin/objformat || echo aout` + version_type=freebsd-$objformat + case "$version_type" in + freebsd-elf*) + library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so $libname.so' + need_version=no + need_lib_prefix=no + ;; + freebsd-*) + library_names_spec='${libname}${release}.so$versuffix $libname.so$versuffix' + need_version=yes + ;; + esac + shlibpath_var=LD_LIBRARY_PATH + case "$host_os" in + freebsd2*) + shlibpath_overrides_runpath=yes + ;; + freebsd3.[01]* | freebsdelf3.[01]*) + shlibpath_overrides_runpath=yes + hardcode_into_libs=yes + ;; + *) # from 3.2 on + shlibpath_overrides_runpath=no + hardcode_into_libs=yes + ;; + esac + ;; + +gnu*) + version_type=linux + need_lib_prefix=no + need_version=no + library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so${major} ${libname}.so' + soname_spec='${libname}${release}.so$major' + shlibpath_var=LD_LIBRARY_PATH + hardcode_into_libs=yes + ;; + +hpux9* | hpux10* | hpux11*) + # Give a soname corresponding to the major version so that dld.sl refuses to + # link against other versions. + dynamic_linker="$host_os dld.sl" + version_type=sunos + need_lib_prefix=no + need_version=no + shlibpath_var=SHLIB_PATH + shlibpath_overrides_runpath=no # +s is required to enable SHLIB_PATH + library_names_spec='${libname}${release}.sl$versuffix ${libname}${release}.sl$major $libname.sl' + soname_spec='${libname}${release}.sl$major' + # HP-UX runs *really* slowly unless shared libraries are mode 555. 
+ postinstall_cmds='chmod 555 $lib' + ;; + +irix5* | irix6*) + version_type=irix + need_lib_prefix=no + need_version=no + soname_spec='${libname}${release}.so.$major' + library_names_spec='${libname}${release}.so.$versuffix ${libname}${release}.so.$major ${libname}${release}.so $libname.so' + case "$host_os" in + irix5*) + libsuff= shlibsuff= + ;; + *) + case "$LD" in # libtool.m4 will add one of these switches to LD + *-32|*"-32 ") libsuff= shlibsuff= libmagic=32-bit;; + *-n32|*"-n32 ") libsuff=32 shlibsuff=N32 libmagic=N32;; + *-64|*"-64 ") libsuff=64 shlibsuff=64 libmagic=64-bit;; + *) libsuff= shlibsuff= libmagic=never-match;; + esac + ;; + esac + shlibpath_var=LD_LIBRARY${shlibsuff}_PATH + shlibpath_overrides_runpath=no + sys_lib_search_path_spec="/usr/lib${libsuff} /lib${libsuff} /usr/local/lib${libsuff}" + sys_lib_dlsearch_path_spec="/usr/lib${libsuff} /lib${libsuff}" + ;; + +# No shared lib support for Linux oldld, aout, or coff. +linux-gnuoldld* | linux-gnuaout* | linux-gnucoff*) + dynamic_linker=no + ;; + +# This must be Linux ELF. +linux-gnu*) + version_type=linux + need_lib_prefix=no + need_version=no + library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so' + soname_spec='${libname}${release}.so$major' + finish_cmds='PATH="\$PATH:/sbin" ldconfig -n $libdir' + shlibpath_var=LD_LIBRARY_PATH + shlibpath_overrides_runpath=no + # This implies no fast_install, which is unacceptable. + # Some rework will be needed to allow for fast_install + # before this can be enabled. + hardcode_into_libs=yes + + if test -f /lib/ld.so.1; then + dynamic_linker='GNU ld.so' + else + # Only the GNU ld.so supports shared libraries on MkLinux. 
+ case "$host_cpu" in + powerpc*) dynamic_linker=no ;; + *) dynamic_linker='Linux ld.so' ;; + esac + fi + ;; + +netbsd*) + version_type=sunos + if echo __ELF__ | $CC -E - | grep __ELF__ >/dev/null; then + library_names_spec='${libname}${release}.so$versuffix ${libname}.so$versuffix' + finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir' + dynamic_linker='NetBSD (a.out) ld.so' + else + library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major ${libname}${release}.so ${libname}.so' + soname_spec='${libname}${release}.so$major' + dynamic_linker='NetBSD ld.elf_so' + fi + shlibpath_var=LD_LIBRARY_PATH + ;; + +openbsd*) + version_type=sunos + if test "$with_gnu_ld" = yes; then + need_lib_prefix=no + need_version=no + fi + library_names_spec='${libname}${release}.so$versuffix ${libname}.so$versuffix' + finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir' + shlibpath_var=LD_LIBRARY_PATH + ;; + +os2*) + libname_spec='$name' + need_lib_prefix=no + library_names_spec='$libname.dll $libname.a' + dynamic_linker='OS/2 ld.exe' + shlibpath_var=LIBPATH + ;; + +osf3* | osf4* | osf5*) + version_type=osf + need_version=no + soname_spec='${libname}${release}.so' + library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so $libname.so' + shlibpath_var=LD_LIBRARY_PATH + sys_lib_search_path_spec="/usr/shlib /usr/ccs/lib /usr/lib/cmplrs/cc /usr/lib /usr/local/lib /var/shlib" + sys_lib_dlsearch_path_spec="$sys_lib_search_path_spec" + ;; + +sco3.2v5*) + version_type=osf + soname_spec='${libname}${release}.so$major' + library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so' + shlibpath_var=LD_LIBRARY_PATH + ;; + +solaris*) + version_type=linux + need_lib_prefix=no + need_version=no + library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so' + soname_spec='${libname}${release}.so$major' + shlibpath_var=LD_LIBRARY_PATH + shlibpath_overrides_runpath=yes + 
hardcode_into_libs=yes + # ldd complains unless libraries are executable + postinstall_cmds='chmod +x $lib' + ;; + +sunos4*) + version_type=sunos + library_names_spec='${libname}${release}.so$versuffix ${libname}.so$versuffix' + finish_cmds='PATH="\$PATH:/usr/etc" ldconfig $libdir' + shlibpath_var=LD_LIBRARY_PATH + shlibpath_overrides_runpath=yes + if test "$with_gnu_ld" = yes; then + need_lib_prefix=no + fi + need_version=yes + ;; + +sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*) + version_type=linux + library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so' + soname_spec='${libname}${release}.so$major' + shlibpath_var=LD_LIBRARY_PATH + case "$host_vendor" in + motorola) + need_lib_prefix=no + need_version=no + shlibpath_overrides_runpath=no + sys_lib_search_path_spec='/lib /usr/lib /usr/ccs/lib' + ;; + esac + ;; + +uts4*) + version_type=linux + library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so' + soname_spec='${libname}${release}.so$major' + shlibpath_var=LD_LIBRARY_PATH + ;; + +dgux*) + version_type=linux + need_lib_prefix=no + need_version=no + library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so' + soname_spec='${libname}${release}.so$major' + shlibpath_var=LD_LIBRARY_PATH + ;; + +sysv4*MP*) + if test -d /usr/nec ;then + version_type=linux + library_names_spec='$libname.so.$versuffix $libname.so.$major $libname.so' + soname_spec='$libname.so.$major' + shlibpath_var=LD_LIBRARY_PATH + fi + ;; + +*) + dynamic_linker=no + ;; +esac +echo "$ac_t$dynamic_linker" 1>&6 +test "$dynamic_linker" = no && can_build_shared=no + +# Check for command to grab the raw symbol name followed by C symbol from nm. +echo $ac_n "checking command to parse $NM output... $ac_c" 1>&6 + +# These are sane defaults that work on at least a few old systems. +# [They come from Ultrix. What could be older than Ultrix?!! 
;)] + +# Character class describing NM global symbol codes. +symcode='[BCDEGRST]' + +# Regexp to match symbols that can be accessed directly from C. +sympat='\([_A-Za-z][_A-Za-z0-9]*\)' + +# Transform the above into a raw symbol and a C symbol. +symxfrm='\1 \2\3 \3' + +# Transform an extracted symbol line into a proper C declaration +global_symbol_to_cdecl="sed -n -e 's/^. .* \(.*\)$/extern char \1;/p'" + +# Define system-specific variables. +case "$host_os" in +aix*) + symcode='[BCDT]' + ;; +cygwin* | mingw*) + symcode='[ABCDGISTW]' + ;; +hpux*) # Its linker distinguishes data from code symbols + global_symbol_to_cdecl="sed -n -e 's/^T .* \(.*\)$/extern char \1();/p' -e 's/^. .* \(.*\)$/extern char \1;/p'" + ;; +irix*) + symcode='[BCDEGRST]' + ;; +solaris* | sysv5*) + symcode='[BDT]' + ;; +sysv4) + symcode='[DFNSTU]' + ;; +esac + +# Handle CRLF in mingw tool chain +opt_cr= +case "$host_os" in +mingw*) + opt_cr=`echo 'x\{0,1\}' | tr x '\015'` # option cr in regexp + ;; +esac + +# If we're using GNU nm, then use its standard symbol codes. +if $NM -V 2>&1 | egrep '(GNU|with BFD)' > /dev/null; then + symcode='[ABCDGISTW]' +fi + +# Try without a prefix underscore, then with it. +for ac_symprfx in "" "_"; do + + # Write the raw and C identifiers. +global_symbol_pipe="sed -n -e 's/^.*[ ]\($symcode\)[ ][ ]*\($ac_symprfx\)$sympat$opt_cr$/$symxfrm/p'" + + # Check to see that the pipe works correctly. + pipe_works=no + $rm conftest* + cat > conftest.c <&5 + if { (eval echo $progname:1868: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; } && test -s conftest.$objext; then + # Now try to grab the symbols. + nlist=conftest.nm + if { echo "$progname:1871: eval \"$NM conftest.$objext | $global_symbol_pipe > $nlist\"" >&5; eval "$NM conftest.$objext | $global_symbol_pipe > $nlist 2>&5"; } && test -s "$nlist"; then + + # Try sorting and uniquifying the output. 
+ if sort "$nlist" | uniq > "$nlist"T; then + mv -f "$nlist"T "$nlist" + else + rm -f "$nlist"T + fi + + # Make sure that we snagged all the symbols we need. + if egrep ' nm_test_var$' "$nlist" >/dev/null; then + if egrep ' nm_test_func$' "$nlist" >/dev/null; then + cat < conftest.c +#ifdef __cplusplus +extern "C" { +#endif + +EOF + # Now generate the symbol file. + eval "$global_symbol_to_cdecl"' < "$nlist" >> conftest.c' + + cat <> conftest.c +#if defined (__STDC__) && __STDC__ +# define lt_ptr_t void * +#else +# define lt_ptr_t char * +# define const +#endif + +/* The mapping between symbol names and symbols. */ +const struct { + const char *name; + lt_ptr_t address; +} +lt_preloaded_symbols[] = +{ +EOF + sed 's/^. \(.*\) \(.*\)$/ {"\2", (lt_ptr_t) \&\2},/' < "$nlist" >> conftest.c + cat <<\EOF >> conftest.c + {0, (lt_ptr_t) 0} +}; + +#ifdef __cplusplus +} +#endif +EOF + # Now try linking the two files. + mv conftest.$objext conftstm.$objext + save_LIBS="$LIBS" + save_CFLAGS="$CFLAGS" + LIBS="conftstm.$objext" + CFLAGS="$CFLAGS$no_builtin_flag" + if { (eval echo $progname:1923: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then + pipe_works=yes + else + echo "$progname: failed program was:" >&5 + cat conftest.c >&5 + fi + LIBS="$save_LIBS" + else + echo "cannot find nm_test_func in $nlist" >&5 + fi + else + echo "cannot find nm_test_var in $nlist" >&5 + fi + else + echo "cannot run $global_symbol_pipe" >&5 + fi + else + echo "$progname: failed program was:" >&5 + cat conftest.c >&5 + fi + $rm conftest* conftst* + + # Do not use the global_symbol_pipe unless it works. + if test "$pipe_works" = yes; then + break + else + global_symbol_pipe= + fi +done +if test "$pipe_works" = yes; then + echo "${ac_t}ok" 1>&6 +else + echo "${ac_t}failed" 1>&6 +fi + +if test -z "$global_symbol_pipe"; then + global_symbol_to_cdecl= +fi + +# Report the final consequences. +echo "checking if libtool supports shared libraries... 
$can_build_shared" 1>&6 + +# Only try to build win32 dlls if AC_LIBTOOL_WIN32_DLL was used in +# configure.in, otherwise build static only libraries. +case "$host_os" in +cygwin* | mingw* | os2*) + if test x$can_build_shared = xyes; then + test x$enable_win32_dll = xno && can_build_shared=no + echo "checking if package supports dlls... $can_build_shared" 1>&6 + fi +;; +esac + +echo $ac_n "checking whether to build shared libraries... $ac_c" 1>&6 +test "$can_build_shared" = "no" && enable_shared=no + +# On AIX, shared libraries and static libraries use the same namespace, and +# are all built from PIC. +case "$host_os" in +aix3*) + test "$enable_shared" = yes && enable_static=no + if test -n "$RANLIB"; then + archive_cmds="$archive_cmds~\$RANLIB \$lib" + postinstall_cmds='$RANLIB $lib' + fi + ;; + +aix4*) + test "$enable_shared" = yes && enable_static=no + ;; +esac + +echo "$ac_t$enable_shared" 1>&6 + +# Make sure either enable_shared or enable_static is yes. +test "$enable_shared" = yes || enable_static=yes + +echo "checking whether to build static libraries... 
$enable_static" 1>&6 + +if test "$hardcode_action" = relink || test "$hardcode_into_libs" = all; then + # Fast installation is not supported + enable_fast_install=no +elif test "$shlibpath_overrides_runpath" = yes || + test "$enable_shared" = no; then + # Fast installation is not necessary + enable_fast_install=needless +fi + +# Check whether we must set pic_mode to default +test -z "$pic_flag" && pic_mode=default +# On Cygwin there's no "real" PIC flag so we must build both object types +case "$host_os" in +cygwin* | mingw* | os2*) + pic_mode=default + ;; +esac +if test $pic_mode = no && test "$deplibs_check_method" != pass_all; then + # non-PIC code in shared libraries is not supported + pic_mode=default +fi + +if test "x$enable_dlopen" != xyes; then + enable_dlopen=unknown + enable_dlopen_self=unknown + enable_dlopen_self_static=unknown +else +if test "X${lt_cv_dlopen+set}" != Xset; then + lt_cv_dlopen=no lt_cv_dlopen_libs= +echo $ac_n "checking for dlopen in -ldl""... $ac_c" 1>&6 +echo "$progname:2032: checking for dlopen in -ldl" >&5 +if test "X${ac_cv_lib_dl_dlopen+set}" = Xset; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + ac_save_LIBS="$LIBS" +LIBS="-ldl $LIBS" +cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + ac_cv_lib_dl_dlopen=yes +else + echo "$progname: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + ac_cv_lib_dl_dlopen=no +fi +rm -f conftest* +LIBS="$ac_save_LIBS" + +fi +if test "X$ac_cv_lib_dl_dlopen" = Xyes; then + echo "$ac_t""yes" 1>&6 + lt_cv_dlopen="dlopen" lt_cv_dlopen_libs="-ldl" +else + echo "$ac_t""no" 1>&6 +echo $ac_n "checking for dlopen""... $ac_c" 1>&6 +echo "$progname:2071: checking for dlopen" >&5 +if test "X${ac_cv_func_dlopen+set}" = Xset; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +/* Override any gcc2 internal prototype to avoid an error. 
*/ +/* We use char because int might match the return type of a gcc2 + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char dlopen(); + +int main() { + +/* The GNU C library defines this for functions which it implements + to always fail with ENOSYS. Some functions are actually named + something starting with __ and the normal name is an alias. */ +#if defined (__stub_dlopen) || defined (__stub___dlopen) +choke me +#else +dlopen(); +#endif + +; return 0; } +EOF +if { (eval echo $progname:2101: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + ac_cv_func_dlopen=yes +else + echo "$progname: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + ac_cv_func_dlopen=no +fi +rm -f conftest* +fi +if test "X$ac_cv_func_dlopen" = Xyes; then + echo "$ac_t""yes" 1>&6 + lt_cv_dlopen="dlopen" +else + echo "$ac_t""no" 1>&6 +echo $ac_n "checking for dld_link in -ldld""... $ac_c" 1>&6 +echo "$progname:2118: checking for dld_link in -ldld" >&5 +if test "X${ac_cv_lib_dld_dld_link+set}" = Xset; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + ac_save_LIBS="$LIBS" +LIBS="-ldld $LIBS" +cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + ac_cv_lib_dld_dld_link=yes +else + echo "$progname: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + ac_cv_lib_dld_dld_link=no +fi +rm -f conftest* +LIBS="$ac_save_LIBS" + +fi +if test "X$ac_cv_lib_dld_dld_link" = Xyes; then + echo "$ac_t""yes" 1>&6 + lt_cv_dlopen="dld_link" lt_cv_dlopen_libs="-ldld" +else + echo "$ac_t""no" 1>&6 +echo $ac_n "checking for shl_load""... $ac_c" 1>&6 +echo "$progname:2157: checking for shl_load" >&5 +if test "X${ac_cv_func_shl_load+set}" = Xset; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +/* Override any gcc2 internal prototype to avoid an error. 
*/ +/* We use char because int might match the return type of a gcc2 + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char shl_load(); + +int main() { + +/* The GNU C library defines this for functions which it implements + to always fail with ENOSYS. Some functions are actually named + something starting with __ and the normal name is an alias. */ +#if defined (__stub_shl_load) || defined (__stub___shl_load) +choke me +#else +shl_load(); +#endif + +; return 0; } +EOF +if { (eval echo $progname:2187: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + ac_cv_func_shl_load=yes +else + echo "$progname: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + ac_cv_func_shl_load=no +fi +rm -f conftest* +fi + +if test "X$ac_cv_func_shl_load" = Xyes; then + echo "$ac_t""yes" 1>&6 + lt_cv_dlopen="shl_load" +else + echo "$ac_t""no" 1>&6 +echo $ac_n "checking for shl_load in -ldld""... $ac_c" 1>&6 +echo "$progname:2205: checking for shl_load in -ldld" >&5 +if test "X${ac_cv_lib_dld_shl_load+set}" = Xset; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + ac_save_LIBS="$LIBS" +LIBS="-ldld $LIBS" +cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + ac_cv_lib_dld_shl_load=yes +else + echo "$progname: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + ac_cv_lib_dld_shl_load=no +fi +rm -f conftest* +LIBS="$ac_save_LIBS" + +fi +if test "X$ac_cv_lib_dld_shl_load" = Xyes; then + echo "$ac_t""yes" 1>&6 + lt_cv_dlopen="shl_load" lt_cv_dlopen_libs="-ldld" +else + echo "$ac_t""no" 1>&6 +fi + + +fi + + +fi + + +fi + + +fi + +fi + + if test "x$lt_cv_dlopen" != xno; then + enable_dlopen=yes + fi + + case "$lt_cv_dlopen" in + dlopen) +for ac_hdr in dlfcn.h; do +ac_safe=`echo "$ac_hdr" | sed 'y%./+-%__p_%'` +echo $ac_n "checking for $ac_hdr""... 
$ac_c" 1>&6 +echo "$progname:2269: checking for $ac_hdr" >&5 +if eval "test \"`echo 'X$''{'ac_cv_header_$ac_safe'+set}'`\" = Xset"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +int fnord = 0; +int main () { } +EOF +ac_try="$ac_compile >/dev/null 2>conftest.out" +{ (eval echo $progname:2280: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } +ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"` +if test -z "$ac_err"; then + rm -rf conftest* + eval "ac_cv_header_$ac_safe=yes" +else + echo "$ac_err" >&5 + echo "$progname: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_header_$ac_safe=no" +fi +rm -f conftest* +fi +if eval "test \"`echo '$ac_cv_header_'$ac_safe`\" = yes"; then + echo "$ac_t""yes" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi +done + + if test "x$ac_cv_header_dlfcn_h" = xyes; then + CPPFLAGS="$CPPFLAGS -DHAVE_DLFCN_H" + fi + eval LDFLAGS=\"\$LDFLAGS $export_dynamic_flag_spec\" + LIBS="$lt_cv_dlopen_libs $LIBS" + + echo $ac_n "checking whether a program can dlopen itself""... $ac_c" 1>&6 +echo "$progname:2308: checking whether a program can dlopen itself" >&5 +if test "X${lt_cv_dlopen_self+set}" = Xset; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test "$cross_compiling" = yes; then + lt_cv_dlopen_self=cross + else + cat > conftest.c < +#endif + +#include + +#ifdef RTLD_GLOBAL +# define LTDL_GLOBAL RTLD_GLOBAL +#else +# ifdef DL_GLOBAL +# define LTDL_GLOBAL DL_GLOBAL +# else +# define LTDL_GLOBAL 0 +# endif +#endif + +/* We may have to define LTDL_LAZY_OR_NOW in the command line if we + find out it does not work in some platform. 
*/ +#ifndef LTDL_LAZY_OR_NOW +# ifdef RTLD_LAZY +# define LTDL_LAZY_OR_NOW RTLD_LAZY +# else +# ifdef DL_LAZY +# define LTDL_LAZY_OR_NOW DL_LAZY +# else +# ifdef RTLD_NOW +# define LTDL_LAZY_OR_NOW RTLD_NOW +# else +# ifdef DL_NOW +# define LTDL_LAZY_OR_NOW DL_NOW +# else +# define LTDL_LAZY_OR_NOW 0 +# endif +# endif +# endif +# endif +#endif + +fnord() { int i=42;} +main() { void *self, *ptr1, *ptr2; self=dlopen(0,LTDL_GLOBAL|LTDL_LAZY_OR_NOW); + if(self) { ptr1=dlsym(self,"fnord"); ptr2=dlsym(self,"_fnord"); + if(ptr1 || ptr2) { dlclose(self); exit(0); } } exit(1); } + +EOF +if { (eval echo $progname:2362: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest && (./conftest; exit) 2>/dev/null +then + lt_cv_dlopen_self=yes +else + echo "$progname: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -fr conftest* + lt_cv_dlopen_self=no +fi +rm -fr conftest* +fi + +fi + +echo "$ac_t""$lt_cv_dlopen_self" 1>&6 + + if test "$lt_cv_dlopen_self" = yes; then + LDFLAGS="$LDFLAGS $link_static_flag" + echo $ac_n "checking whether a statically linked program can dlopen itself""... $ac_c" 1>&6 +echo "$progname:2381: checking whether a statically linked program can dlopen itself" >&5 +if test "X${lt_cv_dlopen_self_static+set}" = Xset; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test "$cross_compiling" = yes; then + lt_cv_dlopen_self_static=cross + else + cat > conftest.c < +#endif + +#include + +#ifdef RTLD_GLOBAL +# define LTDL_GLOBAL RTLD_GLOBAL +#else +# ifdef DL_GLOBAL +# define LTDL_GLOBAL DL_GLOBAL +# else +# define LTDL_GLOBAL 0 +# endif +#endif + +/* We may have to define LTDL_LAZY_OR_NOW in the command line if we + find out it does not work in some platform. 
*/ +#ifndef LTDL_LAZY_OR_NOW +# ifdef RTLD_LAZY +# define LTDL_LAZY_OR_NOW RTLD_LAZY +# else +# ifdef DL_LAZY +# define LTDL_LAZY_OR_NOW DL_LAZY +# else +# ifdef RTLD_NOW +# define LTDL_LAZY_OR_NOW RTLD_NOW +# else +# ifdef DL_NOW +# define LTDL_LAZY_OR_NOW DL_NOW +# else +# define LTDL_LAZY_OR_NOW 0 +# endif +# endif +# endif +# endif +#endif + +fnord() { int i=42;} +main() { void *self, *ptr1, *ptr2; self=dlopen(0,LTDL_GLOBAL|LTDL_LAZY_OR_NOW); + if(self) { ptr1=dlsym(self,"fnord"); ptr2=dlsym(self,"_fnord"); + if(ptr1 || ptr2) { dlclose(self); exit(0); } } exit(1); } + +EOF +if { (eval echo $progname:2435: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest && (./conftest; exit) 2>/dev/null +then + lt_cv_dlopen_self_static=yes +else + echo "$progname: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -fr conftest* + lt_cv_dlopen_self_static=no +fi +rm -fr conftest* +fi + +fi + +echo "$ac_t""$lt_cv_dlopen_self_static" 1>&6 +fi + ;; + esac + + case "$lt_cv_dlopen_self" in + yes|no) enable_dlopen_self=$lt_cv_dlopen_self ;; + *) enable_dlopen_self=unknown ;; + esac + + case "$lt_cv_dlopen_self_static" in + yes|no) enable_dlopen_self_static=$lt_cv_dlopen_self_static ;; + *) enable_dlopen_self_static=unknown ;; + esac +fi + +# Copy echo and quote the copy, instead of the original, because it is +# used later. +ltecho="$echo" +if test "X$ltecho" = "X$CONFIG_SHELL $0 --fallback-echo"; then + ltecho="$CONFIG_SHELL \$0 --fallback-echo" +fi +LTSHELL="$SHELL" + +LTCONFIG_VERSION="$VERSION" + +# Only quote variables if we're using ltmain.sh. +case "$ltmain" in +*.sh) + # Now quote all the things that may contain metacharacters. 
+ for var in ltecho old_AR old_CC old_CFLAGS old_CPPFLAGS \ + old_MAGIC old_LD old_LDFLAGS old_LIBS \ + old_LN_S old_NM old_RANLIB old_STRIP \ + old_AS old_DLLTOOL old_OBJDUMP \ + old_OBJEXT old_EXEEXT old_reload_flag \ + old_deplibs_check_method old_file_magic_cmd \ + AR CC LD LN_S NM LTSHELL LTCONFIG_VERSION \ + reload_flag reload_cmds wl \ + pic_flag link_static_flag no_builtin_flag export_dynamic_flag_spec \ + thread_safe_flag_spec whole_archive_flag_spec libname_spec \ + library_names_spec soname_spec \ + RANLIB old_archive_cmds old_archive_from_new_cmds old_postinstall_cmds \ + old_postuninstall_cmds archive_cmds archive_expsym_cmds postinstall_cmds \ + postuninstall_cmds extract_expsyms_cmds old_archive_from_expsyms_cmds \ + old_striplib striplib file_magic_cmd export_symbols_cmds \ + deplibs_check_method allow_undefined_flag no_undefined_flag \ + finish_cmds finish_eval global_symbol_pipe global_symbol_to_cdecl \ + hardcode_libdir_flag_spec hardcode_libdir_separator \ + sys_lib_search_path_spec sys_lib_dlsearch_path_spec \ + compiler_c_o compiler_o_lo need_locks exclude_expsyms include_expsyms; do + + case "$var" in + reload_cmds | old_archive_cmds | old_archive_from_new_cmds | \ + old_postinstall_cmds | old_postuninstall_cmds | \ + export_symbols_cmds | archive_cmds | archive_expsym_cmds | \ + extract_expsyms_cmds | old_archive_from_expsyms_cmds | \ + postinstall_cmds | postuninstall_cmds | \ + finish_cmds | sys_lib_search_path_spec | sys_lib_dlsearch_path_spec) + # Double-quote double-evaled strings. 
+ eval "$var=\\\"\`\$echo \"X\$$var\" | \$Xsed -e \"\$double_quote_subst\" -e \"\$sed_quote_subst\" -e \"\$delay_variable_subst\"\`\\\"" ### testsuite: skip nested quoting test + ;; + *) + eval "$var=\\\"\`\$echo \"X\$$var\" | \$Xsed -e \"\$sed_quote_subst\"\`\\\"" ### testsuite: skip nested quoting test + ;; + esac + done + + case "$ltecho" in + *'\$0 --fallback-echo"') + ltecho=`$echo "X$ltecho" | $Xsed -e 's/\\\\\\\$0 --fallback-echo"$/$0 --fallback-echo"/'` + ;; + esac + + trap "$rm \"$ofile\"; exit 1" 1 2 15 + echo "creating $ofile" + $rm "$ofile" + cat < "$ofile" +#! $SHELL + +# `$echo "$ofile" | sed 's%^.*/%%'` - Provide generalized library-building support services. +# Generated automatically by $PROGRAM (GNU $PACKAGE $VERSION$TIMESTAMP) +# NOTE: Changes made to this file will be lost: look at ltconfig or ltmain.sh. +# +# Copyright (C) 1996-2000 Free Software Foundation, Inc. +# Originally by Gordon Matzigkeit , 1996 +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. 
+ +# Sed that helps us avoid accidentally triggering echo(1) options like -n. +Xsed="sed -e s/^X//" + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +if test "X\${CDPATH+set}" = Xset; then CDPATH=:; export CDPATH; fi + +### BEGIN LIBTOOL CONFIG +EOF + cfgfile="$ofile" + ;; + +*) + # Double-quote the variables that need it (for aesthetics). + for var in old_AR old_CC old_CFLAGS old_CPPFLAGS \ + old_MAGIC old_LD old_LDFLAGS old_LIBS \ + old_LN_S old_NM old_RANLIB old_STRIP \ + old_AS old_DLLTOOL old_OBJDUMP \ + old_OBJEXT old_EXEEXT old_reload_flag \ + old_deplibs_check_method old_file_magic_cmd; do + eval "$var=\\\"\$var\\\"" + done + + # Just create a config file. + cfgfile="$ofile.cfg" + trap "$rm \"$cfgfile\"; exit 1" 1 2 15 + echo "creating $cfgfile" + $rm "$cfgfile" + cat < "$cfgfile" +# `$echo "$cfgfile" | sed 's%^.*/%%'` - Libtool configuration file. +# Generated automatically by $PROGRAM (GNU $PACKAGE $VERSION$TIMESTAMP) +EOF + ;; +esac + +cat <> "$cfgfile" +# Libtool was configured as follows, on host `(hostname || uname -n) 2>/dev/null | sed 1q`: +# +# AR=$old_AR CC=$old_CC CFLAGS=$old_CFLAGS CPPFLAGS=$old_CPPFLAGS \\ +# MAGIC=$old_MAGIC LD=$old_LD LDFLAGS=$old_LDFLAGS LIBS=$old_LIBS \\ +# LN_S=$old_LN_S NM=$old_NM RANLIB=$old_RANLIB STRIP=$old_STRIP \\ +# AS=$old_AS DLLTOOL=$old_DLLTOOL OBJDUMP=$old_OBJDUMP \\ +# objext=$old_OBJEXT exeext=$old_EXEEXT reload_flag=$old_reload_flag \\ +# deplibs_check_method=$old_deplibs_check_method file_magic_cmd=$old_file_magic_cmd \\ +# $0$ltconfig_args +# +# Compiler and other test output produced by $progname, useful for +# debugging $progname, is in ./config.log if it exists. +# The version of $progname that generated this script. +LTCONFIG_VERSION=$LTCONFIG_VERSION + +# Shell to use when invoking shell scripts. +SHELL=$LTSHELL + +# Whether or not to build shared libraries. +build_libtool_libs=$enable_shared + +# Whether or not to build static libraries. 
+build_old_libs=$enable_static + +# Whether or not to optimize for fast installation. +fast_install=$enable_fast_install + +# The host system. +host_alias=$host_alias +host=$host + +# An echo program that does not interpret backslashes. +echo=$ltecho + +# The archiver. +AR=$AR + +# The default C compiler. +CC=$CC + +# The linker used to build libraries. +LD=$LD + +# Whether we need hard or soft links. +LN_S=$LN_S + +# A BSD-compatible nm program. +NM=$NM + +# A symbol stripping program +STRIP=$STRIP + +# Used to examine libraries when file_magic_cmd begins "file" +MAGIC=$MAGIC + +# Used on cygwin: DLL creation program. +DLLTOOL="$DLLTOOL" + +# Used on cygwin: object dumper. +OBJDUMP="$OBJDUMP" + +# Used on cygwin: assembler. +AS="$AS" + +# The name of the directory that contains temporary libtool files. +objdir=$objdir + +# How to create reloadable object files. +reload_flag=$reload_flag +reload_cmds=$reload_cmds + +# How to pass a linker flag through the compiler. +wl=$wl + +# Object file suffix (normally "o"). +objext="$objext" + +# Old archive suffix (normally "a"). +libext="$libext" + +# Executable file suffix (normally ""). +exeext="$exeext" + +# Additional compiler flags for building library objects. +pic_flag=$pic_flag +pic_mode=$pic_mode + +# Does compiler simultaneously support -c and -o options? +compiler_c_o=$compiler_c_o + +# Can we write directly to a .lo ? +compiler_o_lo=$compiler_o_lo + +# Must we lock files when doing compilation ? +need_locks=$need_locks + +# Do we need the lib prefix for modules? +need_lib_prefix=$need_lib_prefix + +# Do we need a version for libraries? +need_version=$need_version + +# Whether dlopen is supported. +dlopen_support=$enable_dlopen + +# Whether dlopen of programs is supported. +dlopen_self=$enable_dlopen_self + +# Whether dlopen of statically linked programs is supported. +dlopen_self_static=$enable_dlopen_self_static + +# Compiler flag to prevent dynamic linking. 
+link_static_flag=$link_static_flag + +# Compiler flag to turn off builtin functions. +no_builtin_flag=$no_builtin_flag + +# Compiler flag to allow reflexive dlopens. +export_dynamic_flag_spec=$export_dynamic_flag_spec + +# Compiler flag to generate shared objects directly from archives. +whole_archive_flag_spec=$whole_archive_flag_spec + +# Compiler flag to generate thread-safe objects. +thread_safe_flag_spec=$thread_safe_flag_spec + +# Library versioning type. +version_type=$version_type + +# Format of library name prefix. +libname_spec=$libname_spec + +# List of archive names. First name is the real one, the rest are links. +# The last name is the one that the linker finds with -lNAME. +library_names_spec=$library_names_spec + +# The coded name of the library, if different from the real name. +soname_spec=$soname_spec + +# Commands used to build and install an old-style archive. +RANLIB=$RANLIB +old_archive_cmds=$old_archive_cmds +old_postinstall_cmds=$old_postinstall_cmds +old_postuninstall_cmds=$old_postuninstall_cmds + +# Create an old-style archive from a shared archive. +old_archive_from_new_cmds=$old_archive_from_new_cmds + +# Create a temporary old-style archive to link instead of a shared archive. +old_archive_from_expsyms_cmds=$old_archive_from_expsyms_cmds + +# Commands used to build and install a shared archive. +archive_cmds=$archive_cmds +archive_expsym_cmds=$archive_expsym_cmds +postinstall_cmds=$postinstall_cmds +postuninstall_cmds=$postuninstall_cmds + +# Commands to strip libraries. +old_striplib=$old_striplib +striplib=$striplib + +# Method to check whether dependent libraries are shared objects. +deplibs_check_method=$deplibs_check_method + +# Command to use when deplibs_check_method == file_magic. +file_magic_cmd=$file_magic_cmd + +# Flag that allows shared libraries with undefined symbols to be built. +allow_undefined_flag=$allow_undefined_flag + +# Flag that forces no undefined symbols. 
+no_undefined_flag=$no_undefined_flag + +# Commands used to finish a libtool library installation in a directory. +finish_cmds=$finish_cmds + +# Same as above, but a single script fragment to be evaled but not shown. +finish_eval=$finish_eval + +# Take the output of nm and produce a listing of raw symbols and C names. +global_symbol_pipe=$global_symbol_pipe + +# Transform the output of nm in a proper C declaration +global_symbol_to_cdecl=$global_symbol_to_cdecl + +# This is the shared library runtime path variable. +runpath_var=$runpath_var + +# This is the shared library path variable. +shlibpath_var=$shlibpath_var + +# Is shlibpath searched before the hard-coded library search path? +shlibpath_overrides_runpath=$shlibpath_overrides_runpath + +# How to hardcode a shared library path into an executable. +hardcode_action=$hardcode_action + +# Whether we should hardcode library paths into libraries. +hardcode_into_libs=$hardcode_into_libs + +# Flag to hardcode \$libdir into a binary during linking. +# This must work even if \$libdir does not exist. +hardcode_libdir_flag_spec=$hardcode_libdir_flag_spec + +# Whether we need a single -rpath flag with a separated argument. +hardcode_libdir_separator=$hardcode_libdir_separator + +# Set to yes if using DIR/libNAME.so during linking hardcodes DIR into the +# resulting binary. +hardcode_direct=$hardcode_direct + +# Set to yes if using the -LDIR flag during linking hardcodes DIR into the +# resulting binary. +hardcode_minus_L=$hardcode_minus_L + +# Set to yes if using SHLIBPATH_VAR=DIR during linking hardcodes DIR into +# the resulting binary. +hardcode_shlibpath_var=$hardcode_shlibpath_var + +# Whether libtool must link a program against all its dependency libraries. 
+link_all_deplibs=$link_all_deplibs + +# Compile-time system search path for libraries +sys_lib_search_path_spec=$sys_lib_search_path_spec + +# Run-time system search path for libraries +sys_lib_dlsearch_path_spec=$sys_lib_dlsearch_path_spec + +# Fix the shell variable \$srcfile for the compiler. +fix_srcfile_path="$fix_srcfile_path" + +# Set to yes if exported symbols are required. +always_export_symbols=$always_export_symbols + +# The commands to list exported symbols. +export_symbols_cmds=$export_symbols_cmds + +# The commands to extract the exported symbol list from a shared archive. +extract_expsyms_cmds=$extract_expsyms_cmds + +# Symbols that should not be listed in the preloaded symbols. +exclude_expsyms=$exclude_expsyms + +# Symbols that must always be exported. +include_expsyms=$include_expsyms + +EOF + +case "$ltmain" in +*.sh) + echo '### END LIBTOOL CONFIG' >> "$ofile" + echo >> "$ofile" + case "$host_os" in + aix3*) + cat <<\EOF >> "$ofile" + +# AIX sometimes has problems with the GCC collect2 program. For some +# reason, if we set the COLLECT_NAMES environment variable, the problems +# vanish in a puff of smoke. 
+if test "X${COLLECT_NAMES+set}" != Xset; then + COLLECT_NAMES= + export COLLECT_NAMES +fi +EOF + ;; + esac + case "$host" in + *-*-cygwin* | *-*-mingw* | *-*-os2*) + cat <<'EOF' >> "$ofile" + # This is a source program that is used to create dlls on Windows + # Don't remove nor modify the starting and closing comments +# /* ltdll.c starts here */ +# #define WIN32_LEAN_AND_MEAN +# #include +# #undef WIN32_LEAN_AND_MEAN +# #include +# +# #ifndef __CYGWIN__ +# # ifdef __CYGWIN32__ +# # define __CYGWIN__ __CYGWIN32__ +# # endif +# #endif +# +# #ifdef __cplusplus +# extern "C" { +# #endif +# BOOL APIENTRY DllMain (HINSTANCE hInst, DWORD reason, LPVOID reserved); +# #ifdef __cplusplus +# } +# #endif +# +# #ifdef __CYGWIN__ +# #include +# DECLARE_CYGWIN_DLL( DllMain ); +# #endif +# HINSTANCE __hDllInstance_base; +# +# BOOL APIENTRY +# DllMain (HINSTANCE hInst, DWORD reason, LPVOID reserved) +# { +# __hDllInstance_base = hInst; +# return TRUE; +# } +# /* ltdll.c ends here */ + # This is a source program that is used to create import libraries + # on Windows for dlls which lack them. Don't remove nor modify the + # starting and closing comments +# /* impgen.c starts here */ +# /* Copyright (C) 1999-2000 Free Software Foundation, Inc. +# +# This file is part of GNU libtool. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# */ +# +# #include /* for printf() */ +# #include /* for open(), lseek(), read() */ +# #include /* for O_RDONLY, O_BINARY */ +# #include /* for strdup() */ +# +# /* O_BINARY isn't required (or even defined sometimes) under Unix */ +# #ifndef O_BINARY +# #define O_BINARY 0 +# #endif +# +# static unsigned int +# pe_get16 (fd, offset) +# int fd; +# int offset; +# { +# unsigned char b[2]; +# lseek (fd, offset, SEEK_SET); +# read (fd, b, 2); +# return b[0] + (b[1]<<8); +# } +# +# static unsigned int +# pe_get32 (fd, offset) +# int fd; +# int offset; +# { +# unsigned char b[4]; +# lseek (fd, offset, SEEK_SET); +# read (fd, b, 4); +# return b[0] + (b[1]<<8) + (b[2]<<16) + (b[3]<<24); +# } +# +# static unsigned int +# pe_as32 (ptr) +# void *ptr; +# { +# unsigned char *b = ptr; +# return b[0] + (b[1]<<8) + (b[2]<<16) + (b[3]<<24); +# } +# +# int +# main (argc, argv) +# int argc; +# char *argv[]; +# { +# int dll; +# unsigned long pe_header_offset, opthdr_ofs, num_entries, i; +# unsigned long export_rva, export_size, nsections, secptr, expptr; +# unsigned long name_rvas, nexp; +# unsigned char *expdata, *erva; +# char *filename, *dll_name; +# +# filename = argv[1]; +# +# dll = open(filename, O_RDONLY|O_BINARY); +# if (!dll) +# return 1; +# +# dll_name = filename; +# +# for (i=0; filename[i]; i++) +# if (filename[i] == '/' || filename[i] == '\\' || filename[i] == ':') +# dll_name = filename + i +1; +# +# pe_header_offset = pe_get32 (dll, 0x3c); +# opthdr_ofs = pe_header_offset + 4 + 20; +# num_entries = pe_get32 (dll, opthdr_ofs + 92); +# +# if (num_entries < 1) /* no exports */ +# return 1; +# +# export_rva = pe_get32 (dll, opthdr_ofs + 96); +# export_size = pe_get32 (dll, opthdr_ofs + 100); +# nsections = pe_get16 (dll, pe_header_offset + 4 +2); +# secptr = 
(pe_header_offset + 4 + 20 + +# pe_get16 (dll, pe_header_offset + 4 + 16)); +# +# expptr = 0; +# for (i = 0; i < nsections; i++) +# { +# char sname[8]; +# unsigned long secptr1 = secptr + 40 * i; +# unsigned long vaddr = pe_get32 (dll, secptr1 + 12); +# unsigned long vsize = pe_get32 (dll, secptr1 + 16); +# unsigned long fptr = pe_get32 (dll, secptr1 + 20); +# lseek(dll, secptr1, SEEK_SET); +# read(dll, sname, 8); +# if (vaddr <= export_rva && vaddr+vsize > export_rva) +# { +# expptr = fptr + (export_rva - vaddr); +# if (export_rva + export_size > vaddr + vsize) +# export_size = vsize - (export_rva - vaddr); +# break; +# } +# } +# +# expdata = (unsigned char*)malloc(export_size); +# lseek (dll, expptr, SEEK_SET); +# read (dll, expdata, export_size); +# erva = expdata - export_rva; +# +# nexp = pe_as32 (expdata+24); +# name_rvas = pe_as32 (expdata+32); +# +# printf ("EXPORTS\n"); +# for (i = 0; i> "$ofile" || (rm -f "$ofile"; exit 1) + # We use sed instead of cat because bash on DJGPP gets confused if + # it finds mixed CR/LF and LF-only lines. Since sed operates in + # text mode, it properly converts lines to CR/LF. This bash problem + # is reportedly fixed, but why not run on old versions too? + + chmod +x "$ofile" + ;; + +*) + # Compile the libtool program. + echo "FIXME: would compile $ltmain" + ;; +esac + +test -n "$cache_file" || exit 0 + +# AC_CACHE_SAVE +trap '' 1 2 15 +cat > confcache <<\EOF +# This file is a shell script that caches the results of configure +# tests run on this system so they can be shared between configure +# scripts and configure runs. It is not useful on other systems. +# If it contains results you don't want to keep, you may remove or edit it. +# +# By default, configure uses ./config.cache as the cache file, +# creating it if it does not exist already. 
You can give configure +# the --cache-file=FILE option to use a different cache file; that is +# what configure does when it calls configure scripts in +# subdirectories, so they share the cache. +# Giving --cache-file=/dev/null disables caching, for debugging configure. +# config.status only pays attention to the cache file if you give it the +# --recheck option to rerun configure. +# +EOF +# The following way of writing the cache mishandles newlines in values, +# but we know of no workaround that is simple, portable, and efficient. +# So, don't put newlines in cache variables' values. +# Ultrix sh set writes to stderr and can't be redirected directly, +# and sets the high bit in the cache file unless we assign to the vars. +(set) 2>&1 | + case `(ac_space=' '; set | grep ac_space) 2>&1` in + *ac_space=\ *) + # `set' does not quote correctly, so add quotes (double-quote substitution + # turns \\\\ into \\, and sed turns \\ into \). + sed -n \ + -e "s/'/'\\\\''/g" \ + -e "s/^\\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\\)=\\(.*\\)/\\1=\${\\1='\\2'}/p" + ;; + *) + # `set' quotes correctly as required by POSIX, so do not add quotes. + sed -n -e 's/^\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\)=\(.*\)/\1=${\1=\2}/p' + ;; + esac >> confcache +if cmp -s $cache_file confcache; then + : +else + if test -w $cache_file; then + echo "updating cache $cache_file" + cat confcache > $cache_file + else + echo "not updating unwritable cache $cache_file" + fi +fi +rm -f confcache + +exit 0 + +# Local Variables: +# mode:shell-script +# sh-indentation:2 +# End: diff --git a/ghc/rts/gmp/ltmain.sh b/ghc/rts/gmp/ltmain.sh new file mode 100644 index 0000000..d81d89f --- /dev/null +++ b/ghc/rts/gmp/ltmain.sh @@ -0,0 +1,4692 @@ +# ltmain.sh - Provide generalized library-building support services. +# NOTE: Changing this file will not affect anything until you rerun ltconfig. +# +# Copyright (C) 1996-2000 Free Software Foundation, Inc. 
+# Originally by Gordon Matzigkeit , 1996 +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + +# Check that we have a working $echo. +if test "X$1" = X--no-reexec; then + # Discard the --no-reexec flag, and continue. + shift +elif test "X$1" = X--fallback-echo; then + # Avoid inline document here, it may be left over + : +elif test "X`($echo '\t') 2>/dev/null`" = 'X\t'; then + # Yippee, $echo works! + : +else + # Restart under the correct shell, and then maybe $echo will work. + exec $SHELL "$0" --no-reexec ${1+"$@"} +fi + +if test "X$1" = X--fallback-echo; then + # used as fallback echo + shift + cat <&2 + echo "Fatal configuration error. See the $PACKAGE docs for more information." 1>&2 + exit 1 +fi + +if test "$build_libtool_libs" != yes && test "$build_old_libs" != yes; then + echo "$modename: not configured to build any kind of library" 1>&2 + echo "Fatal configuration error. See the $PACKAGE docs for more information." 1>&2 + exit 1 +fi + +# Global variables. 
+mode=$default_mode +nonopt= +prev= +prevopt= +run= +show="$echo" +show_help= +execute_dlfiles= +lo2o="s/\\.lo\$/.${objext}/" +o2lo="s/\\.${objext}\$/.lo/" + +# Parse our command line options once, thoroughly. +while test $# -gt 0 +do + arg="$1" + shift + + case "$arg" in + -*=*) optarg=`$echo "X$arg" | $Xsed -e 's/[-_a-zA-Z0-9]*=//'` ;; + *) optarg= ;; + esac + + # If the previous option needs an argument, assign it. + if test -n "$prev"; then + case "$prev" in + execute_dlfiles) + eval "$prev=\"\$$prev \$arg\"" + ;; + *) + eval "$prev=\$arg" + ;; + esac + + prev= + prevopt= + continue + fi + + # Have we seen a non-optional argument yet? + case "$arg" in + --help) + show_help=yes + ;; + + --version) + echo "$PROGRAM (GNU $PACKAGE) $VERSION$TIMESTAMP" + exit 0 + ;; + + --config) + sed -e '1,/^### BEGIN LIBTOOL CONFIG/d' -e '/^### END LIBTOOL CONFIG/,$d' $0 + exit 0 + ;; + + --debug) + echo "$progname: enabling shell trace mode" + set -x + ;; + + --dry-run | -n) + run=: + ;; + + --features) + echo "host: $host" + if test "$build_libtool_libs" = yes; then + echo "enable shared libraries" + else + echo "disable shared libraries" + fi + if test "$build_old_libs" = yes; then + echo "enable static libraries" + else + echo "disable static libraries" + fi + exit 0 + ;; + + --finish) mode="finish" ;; + + --mode) prevopt="--mode" prev=mode ;; + --mode=*) mode="$optarg" ;; + + --quiet | --silent) + show=: + ;; + + -dlopen) + prevopt="-dlopen" + prev=execute_dlfiles + ;; + + -*) + $echo "$modename: unrecognized option \`$arg'" 1>&2 + $echo "$help" 1>&2 + exit 1 + ;; + + *) + nonopt="$arg" + break + ;; + esac +done + +if test -n "$prevopt"; then + $echo "$modename: option \`$prevopt' requires an argument" 1>&2 + $echo "$help" 1>&2 + exit 1 +fi + +if test -z "$show_help"; then + + # Infer the operation mode. 
+ if test -z "$mode"; then + case "$nonopt" in + *cc | *++ | gcc* | *-gcc*) + mode=link + for arg + do + case "$arg" in + -c) + mode=compile + break + ;; + esac + done + ;; + *db | *dbx | *strace | *truss) + mode=execute + ;; + *install*|cp|mv) + mode=install + ;; + *rm) + mode=uninstall + ;; + *) + # If we have no mode, but dlfiles were specified, then do execute mode. + test -n "$execute_dlfiles" && mode=execute + + # Just use the default operation mode. + if test -z "$mode"; then + if test -n "$nonopt"; then + $echo "$modename: warning: cannot infer operation mode from \`$nonopt'" 1>&2 + else + $echo "$modename: warning: cannot infer operation mode without MODE-ARGS" 1>&2 + fi + fi + ;; + esac + fi + + # Only execute mode is allowed to have -dlopen flags. + if test -n "$execute_dlfiles" && test "$mode" != execute; then + $echo "$modename: unrecognized option \`-dlopen'" 1>&2 + $echo "$help" 1>&2 + exit 1 + fi + + # Change the help message to a mode-specific one. + generic_help="$help" + help="Try \`$modename --help --mode=$mode' for more information." + + # These modes are in order of execution frequency so that they run quickly. + case "$mode" in + # libtool compile mode + compile) + modename="$modename: compile" + # Get the compilation command and the source file. + base_compile= + prev= + lastarg= + srcfile="$nonopt" + suppress_output= + + user_target=no + for arg + do + case "$prev" in + "") ;; + xcompiler) + # Aesthetically quote the previous argument. + prev= + lastarg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"` + + case "$arg" in + # Double-quote args containing other shell metacharacters. + # Many Bourne shells cannot handle close brackets correctly + # in scan sets, so we specify it separately. + *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"") + arg="\"$arg\"" + ;; + esac + + # Add the previous argument to base_compile. 
+ if test -z "$base_compile"; then + base_compile="$lastarg" + else + base_compile="$base_compile $lastarg" + fi + continue + ;; + esac + + # Accept any command-line options. + case "$arg" in + -o) + if test "$user_target" != "no"; then + $echo "$modename: you cannot specify \`-o' more than once" 1>&2 + exit 1 + fi + user_target=next + ;; + + -static) + build_old_libs=yes + continue + ;; + + -Xcompiler) + prev=xcompiler + continue + ;; + + -Wc,*) + args=`$echo "X$arg" | $Xsed -e "s/^-Wc,//"` + lastarg= + IFS="${IFS= }"; save_ifs="$IFS"; IFS=',' + for arg in $args; do + IFS="$save_ifs" + + # Double-quote args containing other shell metacharacters. + # Many Bourne shells cannot handle close brackets correctly + # in scan sets, so we specify it separately. + case "$arg" in + *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"") + arg="\"$arg\"" + ;; + esac + lastarg="$lastarg $arg" + done + IFS="$save_ifs" + lastarg=`$echo "X$lastarg" | $Xsed -e "s/^ //"` + + # Add the arguments to base_compile. + if test -z "$base_compile"; then + base_compile="$lastarg" + else + base_compile="$base_compile $lastarg" + fi + continue + ;; + esac + + case "$user_target" in + next) + # The next one is the -o target name + user_target=yes + continue + ;; + yes) + # We got the output file + user_target=set + libobj="$arg" + continue + ;; + esac + + # Accept the current argument as the source file. + lastarg="$srcfile" + srcfile="$arg" + + # Aesthetically quote the previous argument. + + # Backslashify any backslashes, double quotes, and dollar signs. + # These are the only characters that are still specially + # interpreted inside of double-quoted strings. + lastarg=`$echo "X$lastarg" | $Xsed -e "$sed_quote_subst"` + + # Double-quote args containing other shell metacharacters. + # Many Bourne shells cannot handle close brackets correctly + # in scan sets, so we specify it separately. 
+ case "$lastarg" in + *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"") + lastarg="\"$lastarg\"" + ;; + esac + + # Add the previous argument to base_compile. + if test -z "$base_compile"; then + base_compile="$lastarg" + else + base_compile="$base_compile $lastarg" + fi + done + + case "$user_target" in + set) + ;; + no) + # Get the name of the library object. + libobj=`$echo "X$srcfile" | $Xsed -e 's%^.*/%%'` + ;; + *) + $echo "$modename: you must specify a target with \`-o'" 1>&2 + exit 1 + ;; + esac + + # Recognize several different file suffixes. + # If the user specifies -o file.o, it is replaced with file.lo + xform='[cCFSfmso]' + case "$libobj" in + *.ada) xform=ada ;; + *.adb) xform=adb ;; + *.ads) xform=ads ;; + *.asm) xform=asm ;; + *.c++) xform=c++ ;; + *.cc) xform=cc ;; + *.cpp) xform=cpp ;; + *.cxx) xform=cxx ;; + *.f90) xform=f90 ;; + *.for) xform=for ;; + esac + + libobj=`$echo "X$libobj" | $Xsed -e "s/\.$xform$/.lo/"` + + case "$libobj" in + *.lo) obj=`$echo "X$libobj" | $Xsed -e "$lo2o"` ;; + *) + $echo "$modename: cannot determine name of library object from \`$libobj'" 1>&2 + exit 1 + ;; + esac + + if test -z "$base_compile"; then + $echo "$modename: you must specify a compilation command" 1>&2 + $echo "$help" 1>&2 + exit 1 + fi + + # Delete any leftover library objects. 
+ if test "$build_old_libs" = yes; then + removelist="$obj $libobj" + else + removelist="$libobj" + fi + + $run $rm $removelist + trap "$run $rm $removelist; exit 1" 1 2 15 + + # Calculate the filename of the output object if compiler does + # not support -o with -c + if test "$compiler_c_o" = no; then + output_obj=`$echo "X$srcfile" | $Xsed -e 's%^.*/%%' -e 's%\..*$%%'`.${objext} + lockfile="$output_obj.lock" + removelist="$removelist $output_obj $lockfile" + trap "$run $rm $removelist; exit 1" 1 2 15 + else + need_locks=no + lockfile= + fi + + # Lock this critical section if it is needed + # We use this script file to make the link, it avoids creating a new file + if test "$need_locks" = yes; then + until ln "$0" "$lockfile" 2>/dev/null; do + $show "Waiting for $lockfile to be removed" + sleep 2 + done + elif test "$need_locks" = warn; then + if test -f "$lockfile"; then + echo "\ +*** ERROR, $lockfile exists and contains: +`cat $lockfile 2>/dev/null` + +This indicates that another process is trying to use the same +temporary object file, and libtool could not work around it because +your compiler does not support \`-c' and \`-o' together. If you +repeat this compilation, it may succeed, by chance, but you had better +avoid parallel builds (make -j) in this platform, or get a better +compiler." + + $run $rm $removelist + exit 1 + fi + echo $srcfile > "$lockfile" + fi + + if test -n "$fix_srcfile_path"; then + eval srcfile=\"$fix_srcfile_path\" + fi + + # Only build a PIC object if we are building libtool libraries. + if test "$build_libtool_libs" = yes; then + # Without this assignment, base_compile gets emptied. + fbsd_hideous_sh_bug=$base_compile + + if test "$pic_mode" != no; then + # All platforms use -DPIC, to notify preprocessed assembler code. 
+ command="$base_compile $srcfile $pic_flag -DPIC" + else + # Don't build PIC code + command="$base_compile $srcfile" + fi + if test "$build_old_libs" = yes; then + lo_libobj="$libobj" + dir=`$echo "X$libobj" | $Xsed -e 's%/[^/]*$%%'` + if test "X$dir" = "X$libobj"; then + dir="$objdir" + else + dir="$dir/$objdir" + fi + libobj="$dir/"`$echo "X$libobj" | $Xsed -e 's%^.*/%%'` + + if test -d "$dir"; then + $show "$rm $libobj" + $run $rm $libobj + else + $show "$mkdir $dir" + $run $mkdir $dir + status=$? + if test $status -ne 0 && test ! -d $dir; then + exit $status + fi + fi + fi + if test "$compiler_o_lo" = yes; then + output_obj="$libobj" + command="$command -o $output_obj" + elif test "$compiler_c_o" = yes; then + output_obj="$obj" + command="$command -o $output_obj" + fi + + $run $rm "$output_obj" + $show "$command" + if $run eval "$command"; then : + else + test -n "$output_obj" && $run $rm $removelist + exit 1 + fi + + if test "$need_locks" = warn && + test x"`cat $lockfile 2>/dev/null`" != x"$srcfile"; then + echo "\ +*** ERROR, $lockfile contains: +`cat $lockfile 2>/dev/null` + +but it should contain: +$srcfile + +This indicates that another process is trying to use the same +temporary object file, and libtool could not work around it because +your compiler does not support \`-c' and \`-o' together. If you +repeat this compilation, it may succeed, by chance, but you had better +avoid parallel builds (make -j) in this platform, or get a better +compiler." + + $run $rm $removelist + exit 1 + fi + + # Just move the object if needed, then go on to compile the next one + if test x"$output_obj" != x"$libobj"; then + $show "$mv $output_obj $libobj" + if $run $mv $output_obj $libobj; then : + else + error=$? + $run $rm $removelist + exit $error + fi + fi + + # If we have no pic_flag, then copy the object into place and finish. 
+ if (test -z "$pic_flag" || test "$pic_mode" != default) && + test "$build_old_libs" = yes; then + # Rename the .lo from within objdir to obj + if test -f $obj; then + $show $rm $obj + $run $rm $obj + fi + + $show "$mv $libobj $obj" + if $run $mv $libobj $obj; then : + else + error=$? + $run $rm $removelist + exit $error + fi + + xdir=`$echo "X$obj" | $Xsed -e 's%/[^/]*$%%'` + if test "X$xdir" = "X$obj"; then + xdir="." + else + xdir="$xdir" + fi + baseobj=`$echo "X$obj" | $Xsed -e "s%.*/%%"` + libobj=`$echo "X$baseobj" | $Xsed -e "$o2lo"` + # Now arrange that obj and lo_libobj become the same file + $show "(cd $xdir && $LN_S $baseobj $libobj)" + if $run eval '(cd $xdir && $LN_S $baseobj $libobj)'; then + exit 0 + else + error=$? + $run $rm $removelist + exit $error + fi + fi + + # Allow error messages only from the first compilation. + suppress_output=' >/dev/null 2>&1' + fi + + # Only build a position-dependent object if we build old libraries. + if test "$build_old_libs" = yes; then + if test "$pic_mode" != yes; then + # Don't build PIC code + command="$base_compile $srcfile" + else + # All platforms use -DPIC, to notify preprocessed assembler code. + command="$base_compile $srcfile $pic_flag -DPIC" + fi + if test "$compiler_c_o" = yes; then + command="$command -o $obj" + output_obj="$obj" + fi + + # Suppress compiler output if we already did a PIC compilation. + command="$command$suppress_output" + $run $rm "$output_obj" + $show "$command" + if $run eval "$command"; then : + else + $run $rm $removelist + exit 1 + fi + + if test "$need_locks" = warn && + test x"`cat $lockfile 2>/dev/null`" != x"$srcfile"; then + echo "\ +*** ERROR, $lockfile contains: +`cat $lockfile 2>/dev/null` + +but it should contain: +$srcfile + +This indicates that another process is trying to use the same +temporary object file, and libtool could not work around it because +your compiler does not support \`-c' and \`-o' together. 
If you +repeat this compilation, it may succeed, by chance, but you had better +avoid parallel builds (make -j) in this platform, or get a better +compiler." + + $run $rm $removelist + exit 1 + fi + + # Just move the object if needed + if test x"$output_obj" != x"$obj"; then + $show "$mv $output_obj $obj" + if $run $mv $output_obj $obj; then : + else + error=$? + $run $rm $removelist + exit $error + fi + fi + + # Create an invalid libtool object if no PIC, so that we do not + # accidentally link it into a program. + if test "$build_libtool_libs" != yes; then + $show "echo timestamp > $libobj" + $run eval "echo timestamp > \$libobj" || exit $? + else + # Move the .lo from within objdir + $show "$mv $libobj $lo_libobj" + if $run $mv $libobj $lo_libobj; then : + else + error=$? + $run $rm $removelist + exit $error + fi + fi + fi + + # Unlock the critical section if it was locked + if test "$need_locks" != no; then + $rm "$lockfile" + fi + + exit 0 + ;; + + # libtool link mode + link | relink) + modename="$modename: link" + case "$host" in + *-*-cygwin* | *-*-mingw* | *-*-os2*) + # It is impossible to link a dll without this setting, and + # we shouldn't force the makefile maintainer to figure out + # which system we are compiling for in order to pass an extra + # flag for every libtool invokation. + # allow_undefined=no + + # FIXME: Unfortunately, there are problems with the above when trying + # to make a dll which has undefined symbols, in which case not + # even a static library is built. For now, we need to specify + # -no-undefined on the libtool link line when we can be certain + # that all symbols are satisfied, otherwise we get a static library. 
+ allow_undefined=yes + ;; + *) + allow_undefined=yes + ;; + esac + libtool_args="$nonopt" + compile_command="$nonopt" + finalize_command="$nonopt" + + compile_rpath= + finalize_rpath= + compile_shlibpath= + finalize_shlibpath= + convenience= + old_convenience= + deplibs= + old_deplibs= + compiler_flags= + linker_flags= + dllsearchpath= + lib_search_path=`pwd` + + avoid_version=no + dlfiles= + dlprefiles= + dlself=no + export_dynamic=no + export_symbols= + export_symbols_regex= + generated= + libobjs= + ltlibs= + module=no + no_install=no + objs= + prefer_static_libs=no + preload=no + prev= + prevarg= + release= + rpath= + xrpath= + perm_rpath= + temp_rpath= + thread_safe=no + vinfo= + + # We need to know -static, to get the right output filenames. + for arg + do + case "$arg" in + -all-static | -static) + if test "X$arg" = "X-all-static"; then + if test "$build_libtool_libs" = yes && test -z "$link_static_flag"; then + $echo "$modename: warning: complete static linking is impossible in this configuration" 1>&2 + fi + if test -n "$link_static_flag"; then + dlopen_self=$dlopen_self_static + fi + else + if test -z "$pic_flag" && test -n "$link_static_flag"; then + dlopen_self=$dlopen_self_static + fi + fi + build_libtool_libs=no + build_old_libs=yes + prefer_static_libs=yes + break + ;; + esac + done + + # See if our shared archives depend on static archives. + test -n "$old_archive_from_new_cmds" && build_old_libs=yes + + # Go through the arguments, transforming them on the way. + while test $# -gt 0; do + arg="$1" + shift + case "$arg" in + *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"") + qarg=\"`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`\" ### testsuite: skip nested quoting test + ;; + *) qarg=$arg ;; + esac + libtool_args="$libtool_args $qarg" + + # If the previous option needs an argument, assign it. 
+ if test -n "$prev"; then + case "$prev" in + output) + compile_command="$compile_command @OUTPUT@" + finalize_command="$finalize_command @OUTPUT@" + ;; + esac + + case "$prev" in + dlfiles|dlprefiles) + if test "$preload" = no; then + # Add the symbol object into the linking commands. + compile_command="$compile_command @SYMFILE@" + finalize_command="$finalize_command @SYMFILE@" + preload=yes + fi + case "$arg" in + *.la | *.lo) ;; # We handle these cases below. + force) + if test "$dlself" = no; then + dlself=needless + export_dynamic=yes + fi + prev= + continue + ;; + self) + if test "$prev" = dlprefiles; then + dlself=yes + elif test "$prev" = dlfiles && test "$dlopen_self" != yes; then + dlself=yes + else + dlself=needless + export_dynamic=yes + fi + prev= + continue + ;; + *) + if test "$prev" = dlfiles; then + dlfiles="$dlfiles $arg" + else + dlprefiles="$dlprefiles $arg" + fi + prev= + continue + ;; + esac + ;; + expsyms) + export_symbols="$arg" + if test ! -f "$arg"; then + $echo "$modename: symbol file \`$arg' does not exist" + exit 1 + fi + prev= + continue + ;; + expsyms_regex) + export_symbols_regex="$arg" + prev= + continue + ;; + release) + release="-$arg" + prev= + continue + ;; + rpath | xrpath) + # We need an absolute path. 
+ case "$arg" in + [\\/]* | [A-Za-z]:[\\/]*) ;; + *) + $echo "$modename: only absolute run-paths are allowed" 1>&2 + exit 1 + ;; + esac + if test "$prev" = rpath; then + case "$rpath " in + *" $arg "*) ;; + *) rpath="$rpath $arg" ;; + esac + else + case "$xrpath " in + *" $arg "*) ;; + *) xrpath="$xrpath $arg" ;; + esac + fi + prev= + continue + ;; + xcompiler) + compiler_flags="$compiler_flags $qarg" + prev= + compile_command="$compile_command $qarg" + finalize_command="$finalize_command $qarg" + continue + ;; + xlinker) + linker_flags="$linker_flags $qarg" + compiler_flags="$compiler_flags $wl$qarg" + prev= + compile_command="$compile_command $wl$qarg" + finalize_command="$finalize_command $wl$qarg" + continue + ;; + *) + eval "$prev=\"\$arg\"" + prev= + continue + ;; + esac + fi + + prevarg="$arg" + + case "$arg" in + -all-static) + if test -n "$link_static_flag"; then + compile_command="$compile_command $link_static_flag" + finalize_command="$finalize_command $link_static_flag" + fi + continue + ;; + + -allow-undefined) + # FIXME: remove this flag sometime in the future. + $echo "$modename: \`-allow-undefined' is deprecated because it is the default" 1>&2 + continue + ;; + + -avoid-version) + avoid_version=yes + continue + ;; + + -dlopen) + prev=dlfiles + continue + ;; + + -dlpreopen) + prev=dlprefiles + continue + ;; + + -export-dynamic) + export_dynamic=yes + continue + ;; + + -export-symbols | -export-symbols-regex) + if test -n "$export_symbols" || test -n "$export_symbols_regex"; then + $echo "$modename: not more than one -exported-symbols argument allowed" + exit 1 + fi + if test "X$arg" = "X-export-symbols"; then + prev=expsyms + else + prev=expsyms_regex + fi + continue + ;; + + -L*) + dir=`$echo "X$arg" | $Xsed -e 's/^-L//'` + # We need an absolute path. 
+ case "$dir" in + [\\/]* | [A-Za-z]:[\\/]*) ;; + *) + absdir=`cd "$dir" && pwd` + if test -z "$absdir"; then + $echo "$modename: cannot determine absolute directory name of \`$dir'" 1>&2 + exit 1 + fi + dir="$absdir" + ;; + esac + case "$deplibs " in + *" -L$dir "*) ;; + *) + deplibs="$deplibs -L$dir" + lib_search_path="$lib_search_path $dir" + ;; + esac + case "$host" in + *-*-cygwin* | *-*-mingw* | *-*-os2*) + case ":$dllsearchpath:" in + *":$dir:"*) ;; + *) dllsearchpath="$dllsearchpath:$dir";; + esac + ;; + esac + continue + ;; + + -l*) + if test "$arg" = "-lc"; then + case "$host" in + *-*-cygwin* | *-*-mingw* | *-*-os2* | *-*-beos*) + # These systems don't actually have c library (as such) + continue + ;; + esac + elif test "$arg" = "-lm"; then + case "$host" in + *-*-cygwin* | *-*-beos*) + # These systems don't actually have math library (as such) + continue + ;; + esac + fi + deplibs="$deplibs $arg" + continue + ;; + + -module) + module=yes + continue + ;; + + -no-fast-install) + fast_install=no + continue + ;; + + -no-install) + case "$host" in + *-*-cygwin* | *-*-mingw* | *-*-os2*) + # The PATH hackery in wrapper scripts is required on Windows + # in order for the loader to find any dlls it needs. + $echo "$modename: warning: \`-no-install' is ignored for $host" 1>&2 + $echo "$modename: warning: assuming \`-no-fast-install' instead" 1>&2 + fast_install=no + ;; + *) + no_install=yes + ;; + esac + continue + ;; + + -no-undefined) + allow_undefined=no + continue + ;; + + -o) prev=output ;; + + -release) + prev=release + continue + ;; + + -rpath) + prev=rpath + continue + ;; + + -R) + prev=xrpath + continue + ;; + + -R*) + dir=`$echo "X$arg" | $Xsed -e 's/^-R//'` + # We need an absolute path. 
+ case "$dir" in + [\\/]* | [A-Za-z]:[\\/]*) ;; + *) + $echo "$modename: only absolute run-paths are allowed" 1>&2 + exit 1 + ;; + esac + case "$xrpath " in + *" $dir "*) ;; + *) xrpath="$xrpath $dir" ;; + esac + continue + ;; + + -static) + # If we have no pic_flag, then this is the same as -all-static. + if test -z "$pic_flag" && test -n "$link_static_flag"; then + compile_command="$compile_command $link_static_flag" + finalize_command="$finalize_command $link_static_flag" + fi + continue + ;; + + -thread-safe) + thread_safe=yes + continue + ;; + + -version-info) + prev=vinfo + continue + ;; + + -Wc,*) + args=`$echo "X$arg" | $Xsed -e "$sed_quote_subst" -e 's/^-Wc,//'` + arg= + IFS="${IFS= }"; save_ifs="$IFS"; IFS=',' + for flag in $args; do + IFS="$save_ifs" + case "$flag" in + *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"") + flag="\"$flag\"" + ;; + esac + arg="$arg $wl$flag" + compiler_flags="$compiler_flags $flag" + done + IFS="$save_ifs" + arg=`$echo "X$arg" | $Xsed -e "s/^ //"` + ;; + + -Wl,*) + args=`$echo "X$arg" | $Xsed -e "$sed_quote_subst" -e 's/^-Wl,//'` + arg= + IFS="${IFS= }"; save_ifs="$IFS"; IFS=',' + for flag in $args; do + IFS="$save_ifs" + case "$flag" in + *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"") + flag="\"$flag\"" + ;; + esac + arg="$arg $wl$flag" + compiler_flags="$compiler_flags $wl$flag" + linker_flags="$linker_flags $flag" + done + IFS="$save_ifs" + arg=`$echo "X$arg" | $Xsed -e "s/^ //"` + ;; + + -Xcompiler) + prev=xcompiler + continue + ;; + + -Xlinker) + prev=xlinker + continue + ;; + + # Some other compiler flag. + -* | +*) + # Unknown arguments in both finalize_command and compile_command need + # to be aesthetically quoted because they are evaled later. + arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"` + case "$arg" in + *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"") + arg="\"$arg\"" + ;; + esac + ;; + + *.$objext) + # A standard object. + objs="$objs $arg" + ;; + + *.lo) + # A library object. 
+ if test "$prev" = dlfiles; then + # This file was specified with -dlopen. + if test "$build_libtool_libs" = yes && test "$dlopen_support" = yes; then + dlfiles="$dlfiles $arg" + prev= + continue + else + # If libtool objects are unsupported, then we need to preload. + prev=dlprefiles + fi + fi + + if test "$prev" = dlprefiles; then + # Preload the old-style object. + dlprefiles="$dlprefiles "`$echo "X$arg" | $Xsed -e "$lo2o"` + prev= + else + libobjs="$libobjs $arg" + fi + ;; + + *.$libext) + # An archive. + deplibs="$deplibs $arg" + old_deplibs="$old_deplibs $arg" + continue + ;; + + *.la) + # A libtool-controlled library. + + if test "$prev" = dlfiles; then + # This library was specified with -dlopen. + dlfiles="$dlfiles $arg" + prev= + elif test "$prev" = dlprefiles; then + # The library was specified with -dlpreopen. + dlprefiles="$dlprefiles $arg" + prev= + else + deplibs="$deplibs $arg" + fi + continue + ;; + + # Some other compiler argument. + *) + # Unknown arguments in both finalize_command and compile_command need + # to be aesthetically quoted because they are evaled later. + arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"` + case "$arg" in + *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"") + arg="\"$arg\"" + ;; + esac + ;; + esac + + # Now actually substitute the argument into the commands. 
+ if test -n "$arg"; then + compile_command="$compile_command $arg" + finalize_command="$finalize_command $arg" + fi + done + + if test -n "$prev"; then + $echo "$modename: the \`$prevarg' option requires an argument" 1>&2 + $echo "$help" 1>&2 + exit 1 + fi + + if test "$export_dynamic" = yes && test -n "$export_dynamic_flag_spec"; then + eval arg=\"$export_dynamic_flag_spec\" + compile_command="$compile_command $arg" + finalize_command="$finalize_command $arg" + fi + + oldlibs= + # calculate the name of the file, without its directory + outputname=`$echo "X$output" | $Xsed -e 's%^.*/%%'` + libobjs_save="$libobjs" + + if test -n "$shlibpath_var"; then + # get the directories listed in $shlibpath_var + eval shlib_search_path=\`\$echo \"X \${$shlibpath_var}\" \| \$Xsed -e \'s/:/ /g\'\` + else + shlib_search_path= + fi + eval sys_lib_search_path=\"$sys_lib_search_path_spec\" + eval sys_lib_dlsearch_path=\"$sys_lib_dlsearch_path_spec\" + lib_search_path="$lib_search_path $sys_lib_search_path $shlib_search_path" + + output_objdir=`$echo "X$output" | $Xsed -e 's%/[^/]*$%%'` + if test "X$output_objdir" = "X$output"; then + output_objdir="$objdir" + else + output_objdir="$output_objdir/$objdir" + fi + # Create the object directory. + if test ! -d $output_objdir; then + $show "$mkdir $output_objdir" + $run $mkdir $output_objdir + status=$? + if test $status -ne 0 && test ! -d $output_objdir; then + exit $status + fi + fi + + case "$output" in + "") + $echo "$modename: you must specify an output file" 1>&2 + $echo "$help" 1>&2 + exit 1 + ;; + *.$libext) + linkmode=oldlib ;; + *.lo | *.$objext) + linkmode=obj ;; + *.la) + linkmode=lib ;; + *) # Anything else should be a program. + linkmode=prog ;; + esac + + specialdeplibs= + libs= + # Find all interdependent deplibs that + # are linked more than once (e.g. 
-la -lb -la) + for deplib in $deplibs; do + case "$libs " in + *" $deplib "*) specialdeplibs="$specialdeplibs $deplib" ;; + esac + libs="$libs $deplib" + done + deplibs= + newdependency_libs= + uninst_path= # paths that contain uninstalled libtool libraries + new_lib_search_path= + need_relink=no # whether we're linking any uninstalled libtool libraries + case $linkmode in + lib) + passes="link" + for file in $dlfiles $dlprefiles; do + case "$file" in + *.la) ;; + *) + $echo "$modename: libraries can \`-dlopen' only libtool libraries" 1>&2 + exit 1 + ;; + esac + done + ;; + prog) + compile_deplibs= + finalize_deplibs= + alldeplibs=no + newdlfiles= + newdlprefiles= + link_against_libtool_libs= + passes="scan dlopen dlpreopen link" + ;; + *) passes="link" + ;; + esac + for pass in $passes; do + if test $linkmode = prog; then + case $pass in + dlopen) libs="$dlfiles" ;; + dlpreopen) libs="$dlprefiles" ;; + link) libs="$deplibs %DEPLIBS% $dependency_libs" ;; + esac + fi + if test $pass = dlopen; then + # Collect dlpreopened libraries + save_deplibs="$deplibs" + deplibs= + fi + for deplib in $libs; do + lib= + found=no + case "$deplib" in + -l*) + if test $linkmode != lib && test $linkmode != prog; then + $echo "$modename: warning: \`-l' is ignored for archives/objects" 1>&2 + continue + fi + name=`$echo "X$deplib" | $Xsed -e 's/^-l//'` + for searchdir in $lib_search_path; do + # Search the libtool library + lib="$searchdir/lib${name}.la" + if test -f "$lib"; then + found=yes + break + fi + done + if test "$found" != yes; then + if test "$linkmode,$pass" = "prog,link"; then + compile_deplibs="$deplib $compile_deplibs" + finalize_deplibs="$deplib $finalize_deplibs" + else + deplibs="$deplib $deplibs" + test $linkmode = lib && newdependency_libs="$deplib $newdependency_libs" + fi + continue + fi + ;; + -L*) + case $linkmode in + lib) + deplibs="$deplib $deplibs" + newdependency_libs="$deplib $newdependency_libs" + new_lib_search_path="$new_lib_search_path "`$echo 
"X$deplib" | $Xsed -e 's/^-L//'` + ;; + prog) + if test $pass = scan; then + deplibs="$deplib $deplibs" + new_lib_search_path="$new_lib_search_path "`$echo "X$deplib" | $Xsed -e 's/^-L//'` + else + compile_deplibs="$deplib $compile_deplibs" + finalize_deplibs="$deplib $finalize_deplibs" + fi + ;; + *) + $echo "$modename: warning: \`-L' is ignored for archives/objects" 1>&2 + ;; + esac + continue + ;; + -R*) + if test "$linkmode,$pass" = "prog,link"; then + dir=`$echo "X$deplib" | $Xsed -e 's/^-R//'` + # Make sure the xrpath contains only unique directories. + case "$xrpath " in + *" $dir "*) ;; + *) xrpath="$xrpath $dir" ;; + esac + fi + continue + ;; + *.la) lib="$deplib" ;; + *.$libext) + case $linkmode in + lib) + if test "$deplibs_check_method" != pass_all; then + echo + echo "*** Warning: This library needs some functionality provided by $deplib." + echo "*** I have the capability to make that library automatically link in when" + echo "*** you link to this library. But I can only do this if you have a" + echo "*** shared version of the library, which you do not appear to have." + else + echo + echo "*** Warning: Linking the shared library $output against the" + echo "*** static library $deplib is not portable!" + deplibs="$deplib $deplibs" + fi + continue + ;; + prog) + if test $pass != link; then + deplibs="$deplib $deplibs" + else + compile_deplibs="$deplib $compile_deplibs" + finalize_deplibs="$deplib $finalize_deplibs" + fi + continue + ;; + esac + ;; + *.lo | *.$objext) + if test $linkmode = prog; then + if test $pass = dlpreopen || test "$dlopen_support" != yes || test "$build_libtool_libs" = no; then + # If there is no dlopen support or we're linking statically, + # we need to preload. 
+ newdlprefiles="$newdlprefiles $deplib" + compile_deplibs="$deplib $compile_deplibs" + finalize_deplibs="$deplib $finalize_deplibs" + else + newdlfiles="$newdlfiles $deplib" + fi + fi + continue + ;; + %DEPLIBS%) + alldeplibs=yes + continue + ;; + esac + if test $found = yes || test -f "$lib"; then : + else + $echo "$modename: cannot find the library \`$lib'" 1>&2 + exit 1 + fi + + # Check to see that this really is a libtool archive. + if (sed -e '2q' $lib | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then : + else + $echo "$modename: \`$lib' is not a valid libtool archive" 1>&2 + exit 1 + fi + + ladir=`$echo "X$lib" | $Xsed -e 's%/[^/]*$%%'` + test "X$ladir" = "X$lib" && ladir="." + + dlname= + dlopen= + dlpreopen= + libdir= + library_names= + old_library= + # If the library was installed with an old release of libtool, + # it will not redefine variable installed. + installed=yes + + # Read the .la file + case "$lib" in + */* | *\\*) . $lib ;; + *) . ./$lib ;; + esac + + if test $linkmode = lib || test "$linkmode,$pass" = "prog,scan"; then + test -n "$dlopen" && dlfiles="$dlfiles $dlopen" + test -n "$dlpreopen" && dlprefiles="$dlprefiles $dlpreopen" + fi + + if test $linkmode != lib && test $linkmode != prog; then + # only check for convenience libraries + if test -z "$old_library"; then + $echo "$modename: cannot find name of link library for \`$lib'" 1>&2 + exit 1 + fi + if test -n "$libdir"; then + $echo "$modename: \`$lib' is not a convenience library" 1>&2 + exit 1 + fi + # It is a libtool convenience library, so add in its objects. + convenience="$convenience $ladir/$objdir/$old_library" + old_convenience="$old_convenience $ladir/$objdir/$old_library" + continue + fi + + # Get the name of the library we link against. 
+ linklib= + for l in $old_library $library_names; do + linklib="$l" + done + if test -z "$linklib"; then + $echo "$modename: cannot find name of link library for \`$lib'" 1>&2 + exit 1 + fi + + # This library was specified with -dlopen. + if test $pass = dlopen; then + if test -z "$dlname" || test "$dlopen_support" != yes || test "$build_libtool_libs" = no; then + # If there is no dlname, no dlopen support or we're linking statically, + # we need to preload. + dlprefiles="$dlprefiles $lib" + else + newdlfiles="$newdlfiles $lib" + fi + continue + fi + + # We need an absolute path. + case "$ladir" in + [\\/]* | [A-Za-z]:[\\/]*) abs_ladir="$ladir" ;; + *) + abs_ladir=`cd "$ladir" && pwd` + if test -z "$abs_ladir"; then + $echo "$modename: warning: cannot determine absolute directory name of \`$ladir'" 1>&2 + $echo "$modename: passing it literally to the linker, although it might fail" 1>&2 + abs_ladir="$ladir" + fi + ;; + esac + laname=`$echo "X$lib" | $Xsed -e 's%^.*/%%'` + + # Find the relevant object directory and library name. + if test "X$installed" = Xyes; then + if test ! -f "$libdir/$linklib" && test -f "$abs_ladir/$linklib"; then + $echo "$modename: warning: library \`$lib' was moved." 1>&2 + dir="$ladir" + absdir="$abs_ladir" + libdir="$abs_ladir" + else + dir="$libdir" + absdir="$libdir" + fi + else + dir="$ladir/$objdir" + absdir="$abs_ladir/$objdir" + # Remove this search path later + uninst_path="$uninst_path $abs_ladir" + fi + name=`$echo "X$laname" | $Xsed -e 's/\.la$//' -e 's/^lib//'` + + # This library was specified with -dlpreopen. + if test $pass = dlpreopen; then + # Prefer using a static library (so that no silly _DYNAMIC symbols + # are required to link). 
+ if test -n "$old_library"; then + newdlprefiles="$newdlprefiles $dir/$old_library" + else + newdlprefiles="$newdlprefiles $dir/$linklib" + fi + fi + + if test $linkmode = prog && test $pass != link; then + new_lib_search_path="$new_lib_search_path $ladir" + deplibs="$lib $deplibs" + + linkalldeplibs=no + if test "$link_all_deplibs" != no || test "$fast_install" != no || \ + test "$build_libtool_libs" = no || test -z "$library_names"; then + linkalldeplibs=yes + fi + + tmp_libs= + for deplib in $dependency_libs; do + case "$deplib" in + -L*) new_lib_search_path="$new_lib_search_path "`$echo "X$deplib" | $Xsed -e 's/^-L//'`;; ### testsuite: skip nested quoting test + esac + # Need to link against all dependency_libs? + if test $linkalldeplibs = yes; then + deplibs="$deplib $deplibs" + else + # Need to hardcode shared library paths + # or/and link against static libraries + newdependency_libs="$deplib $newdependency_libs" + fi + case "$tmp_libs " in + *" $deplib "*) specialdeplibs="$specialdeplibs $deplib" ;; + esac + tmp_libs="$tmp_libs $deplib" + done + continue + fi + + if test -z "$libdir"; then + # It is a libtool convenience library, so add in its objects. 
+ convenience="$convenience $dir/$old_library" + old_convenience="$old_convenience $dir/$old_library" + if test $linkmode = lib; then + deplibs="$dir/$old_library $deplibs" + tmp_libs= + for deplib in $dependency_libs; do + newdependency_libs="$deplib $newdependency_libs" + case "$tmp_libs " in + *" $deplib "*) specialdeplibs="$specialdeplibs $deplib" ;; + esac + tmp_libs="$tmp_libs $deplib" + done + elif test "$linkmode,$pass" = "prog,link"; then + compile_deplibs="$dir/$old_library $compile_deplibs" + finalize_deplibs="$dir/$old_library $finalize_deplibs" + fi + continue + fi + + if test "$linkmode,$pass" = "prog,link"; then + if test -n "$library_names" && + { test "$hardcode_into_libs" != all || test "$alldeplibs" != yes; } && + { test "$prefer_static_libs" = no || test -z "$old_library"; }; then + # We need to hardcode the library path + if test -n "$shlibpath_var"; then + # Make sure the rpath contains only unique directories. + case "$temp_rpath " in + *" $dir "*) ;; + *" $absdir "*) ;; + *) temp_rpath="$temp_rpath $dir" ;; + esac + fi + + # Hardcode the library path. + # Skip directories that are in the system default run-time + # search path. + case " $sys_lib_dlsearch_path " in + *" $absdir "*) ;; + *) + case "$compile_rpath " in + *" $absdir "*) ;; + *) compile_rpath="$compile_rpath $absdir" + esac + ;; + esac + + case " $sys_lib_dlsearch_path " in + *" $libdir "*) ;; + *) + case "$finalize_rpath " in + *" $libdir "*) ;; + *) finalize_rpath="$finalize_rpath $libdir" + esac + ;; + esac + fi + + if test "$alldeplibs" = yes && + { test "$deplibs_check_method" = pass_all || + { test "$build_libtool_libs" = yes && + test -n "$library_names"; }; }; then + # Do we only need to link against static libraries? 
+ continue + fi + fi + + link_static=no # Whether this library is linked statically + if test -n "$library_names" && + { test "$prefer_static_libs" = no || test -z "$old_library"; }; then + link_against_libtool_libs="$link_against_libtool_libs $lib" + test "X$installed" = Xno && need_relink=yes + # This is a shared library + if test $linkmode = lib && test "$hardcode_into_libs" = all; then + # Hardcode the library path. + # Skip directories that are in the system default run-time + # search path. + case " $sys_lib_dlsearch_path " in + *" $absdir "*) ;; + *) + case "$compile_rpath " in + *" $absdir "*) ;; + *) compile_rpath="$compile_rpath $absdir" + esac + ;; + esac + case " $sys_lib_dlsearch_path " in + *" $libdir "*) ;; + *) + case "$finalize_rpath " in + *" $libdir "*) ;; + *) finalize_rpath="$finalize_rpath $libdir" + esac + ;; + esac + fi + + if test -n "$old_archive_from_expsyms_cmds"; then + # figure out the soname + set dummy $library_names + realname="$2" + shift; shift + libname=`eval \\$echo \"$libname_spec\"` + if test -n "$soname_spec"; then + eval soname=\"$soname_spec\" + else + soname="$realname" + fi + + # Make a new name for the extract_expsyms_cmds to use + newlib="libimp-`echo $soname | sed 's/^lib//;s/\.dll$//'`.a" + + # If the library has no export list, then create one now + if test -f "$output_objdir/$soname-def"; then : + else + $show "extracting exported symbol list from \`$soname'" + IFS="${IFS= }"; save_ifs="$IFS"; IFS='~' + eval cmds=\"$extract_expsyms_cmds\" + for cmd in $cmds; do + IFS="$save_ifs" + $show "$cmd" + $run eval "$cmd" || exit $? + done + IFS="$save_ifs" + fi + + # Create $newlib + if test -f "$output_objdir/$newlib"; then :; else + $show "generating import library for \`$soname'" + IFS="${IFS= }"; save_ifs="$IFS"; IFS='~' + eval cmds=\"$old_archive_from_expsyms_cmds\" + for cmd in $cmds; do + IFS="$save_ifs" + $show "$cmd" + $run eval "$cmd" || exit $?
+ done + IFS="$save_ifs" + fi + # make sure the library variables are pointing to the new library + dir=$output_objdir + linklib=$newlib + fi + + if test $linkmode = prog || test "$mode" != relink; then + add_shlibpath= + add_dir= + add= + lib_linked=yes + case "$hardcode_action" in + immediate | unsupported) + if test "$hardcode_direct" = no; then + add="$dir/$linklib" + elif test "$hardcode_minus_L" = no; then + case "$host" in + *-*-sunos*) add_shlibpath="$dir" ;; + esac + add_dir="-L$dir" + add="-l$name" + elif test "$hardcode_shlibpath_var" = no; then + add_shlibpath="$dir" + add="-l$name" + else + lib_linked=no + fi + ;; + relink) + if test "$hardcode_direct" = yes; then + add="$dir/$linklib" + elif test "$hardcode_minus_L" = yes; then + add_dir="-L$dir" + add="-l$name" + elif test "$hardcode_shlibpath_var" = yes; then + add_shlibpath="$dir" + add="-l$name" + else + lib_linked=no + fi + ;; + *) lib_linked=no ;; + esac + + if test "$lib_linked" != yes; then + $echo "$modename: configuration error: unsupported hardcode properties" + exit 1 + fi + + if test -n "$add_shlibpath"; then + case ":$compile_shlibpath:" in + *":$add_shlibpath:"*) ;; + *) compile_shlibpath="$compile_shlibpath$add_shlibpath:" ;; + esac + fi + if test $linkmode = prog; then + test -n "$add_dir" && compile_deplibs="$add_dir $compile_deplibs" + test -n "$add" && compile_deplibs="$add $compile_deplibs" + else + test -n "$add_dir" && deplibs="$add_dir $deplibs" + test -n "$add" && deplibs="$add $deplibs" + if test "$hardcode_direct" != yes && \ + test "$hardcode_minus_L" != yes && \ + test "$hardcode_shlibpath_var" = yes; then + case ":$finalize_shlibpath:" in + *":$libdir:"*) ;; + *) finalize_shlibpath="$finalize_shlibpath$libdir:" ;; + esac + fi + fi + fi + + if test $linkmode = prog || test "$mode" = relink; then + add_shlibpath= + add_dir= + add= + # Finalize command for both is simple: just hardcode it. 
+ if test "$hardcode_direct" = yes; then + add="$libdir/$linklib" + elif test "$hardcode_minus_L" = yes; then + add_dir="-L$libdir" + add="-l$name" + elif test "$hardcode_shlibpath_var" = yes; then + case ":$finalize_shlibpath:" in + *":$libdir:"*) ;; + *) finalize_shlibpath="$finalize_shlibpath$libdir:" ;; + esac + add="-l$name" + else + # We cannot seem to hardcode it, guess we'll fake it. + add_dir="-L$libdir" + add="-l$name" + fi + + if test $linkmode = prog; then + test -n "$add_dir" && finalize_deplibs="$add_dir $finalize_deplibs" + test -n "$add" && finalize_deplibs="$add $finalize_deplibs" + else + test -n "$add_dir" && deplibs="$add_dir $deplibs" + test -n "$add" && deplibs="$add $deplibs" + fi + fi + elif test $linkmode = prog; then + # Here we assume that one of hardcode_direct or hardcode_minus_L + # is not unsupported. This is valid on all known static and + # shared platforms. + if test "$hardcode_direct" != unsupported; then + test -n "$old_library" && linklib="$old_library" + compile_deplibs="$dir/$linklib $compile_deplibs" + finalize_deplibs="$dir/$linklib $finalize_deplibs" + else + compile_deplibs="-l$name -L$dir $compile_deplibs" + finalize_deplibs="-l$name -L$dir $finalize_deplibs" + fi + elif test "$build_libtool_libs" = yes; then + # Not a shared library + if test "$deplibs_check_method" != pass_all; then + # We're trying link a shared library against a static one + # but the system doesn't support it. + # Just print a warning and add the library to dependency_libs so + # that the program can be linked against the static library. + echo + echo "*** Warning: This library needs some functionality provided by $lib." + echo "*** I have the capability to make that library automatically link in when" + echo "*** you link to this library. But I can only do this if you have a" + echo "*** shared version of the library, which you do not appear to have."
+ else + convenience="$convenience $dir/$old_library" + old_convenience="$old_convenience $dir/$old_library" + deplibs="$dir/$old_library $deplibs" + link_static=yes + fi + fi + + if test $linkmode = lib; then + if test -n "$dependency_libs" && + { test "$hardcode_into_libs" = no || test $build_old_libs = yes || + test $link_static = yes; }; then + # Extract -R from dependency_libs + temp_deplibs= + for libdir in $dependency_libs; do + case "$libdir" in + -R*) temp_xrpath=`$echo "X$libdir" | $Xsed -e 's/^-R//'` + case " $xrpath " in + *" $temp_xrpath "*) ;; + *) xrpath="$xrpath $temp_xrpath";; + esac;; + *) temp_deplibs="$temp_deplibs $libdir";; + esac + done + dependency_libs="$temp_deplibs" + fi + + new_lib_search_path="$new_lib_search_path $absdir" + # Link against this library + test "$link_static" = no && newdependency_libs="$abs_ladir/$laname $newdependency_libs" + # ... and its dependency_libs + tmp_libs= + for deplib in $dependency_libs; do + newdependency_libs="$deplib $newdependency_libs" + case "$tmp_libs " in + *" $deplib "*) specialdeplibs="$specialdeplibs $deplib" ;; + esac + tmp_libs="$tmp_libs $deplib" + done + + if test $link_all_deplibs != no; then + # Add the search paths of all dependency libraries + for deplib in $dependency_libs; do + case "$deplib" in + -L*) path="$deplib" ;; + *.la) + dir=`$echo "X$deplib" | $Xsed -e 's%/[^/]*$%%'` + test "X$dir" = "X$deplib" && dir="." + # We need an absolute path. 
+ case "$dir" in + [\\/]* | [A-Za-z]:[\\/]*) absdir="$dir" ;; + *) + absdir=`cd "$dir" && pwd` + if test -z "$absdir"; then + $echo "$modename: warning: cannot determine absolute directory name of \`$dir'" 1>&2 + absdir="$dir" + fi + ;; + esac + if grep "^installed=no" $deplib > /dev/null; then + path="-L$absdir/$objdir" + else + eval libdir=`sed -n -e 's/^libdir=\(.*\)$/\1/p' $deplib` + if test -z "$libdir"; then + $echo "$modename: \`$deplib' is not a valid libtool archive" 1>&2 + exit 1 + fi + if test "$absdir" != "$libdir"; then + $echo "$modename: warning: \`$deplib' seems to be moved" 1>&2 + fi + path="-L$absdir" + fi + ;; + *) continue ;; + esac + case " $deplibs " in + *" $path "*) ;; + *) deplibs="$deplibs $path" ;; + esac + done + fi + fi + done + dependency_libs="$newdependency_libs" + if test $pass = dlpreopen; then + # Link the dlpreopened libraries before other libraries + deplibs="$deplibs $save_deplibs" + elif test $pass != dlopen; then + # Make sure lib_search_path contains only unique directories. 
+ lib_search_path= + for dir in $new_lib_search_path; do + case "$lib_search_path " in + *" $dir "*) ;; + *) lib_search_path="$lib_search_path $dir" ;; + esac + done + lib_search_path="$lib_search_path $sys_lib_search_path" + + if test "$linkmode,$pass" != "prog,link"; then + vars="deplibs" + else + vars="compile_deplibs finalize_deplibs" + fi + for var in $vars dependency_libs; do + # Make sure that $var contains only unique libraries + # and add them in reverse order + eval tmp_libs=\"\$$var\" + new_libs= + for deplib in $tmp_libs; do + case "$deplib" in + -L*) new_libs="$deplib $new_libs" ;; + *) + case " $specialdeplibs " in + *" $deplib "*) new_libs="$deplib $new_libs" ;; + *) + case " $new_libs " in + *" $deplib "*) ;; + *) new_libs="$deplib $new_libs" ;; + esac + ;; + esac + ;; + esac + done + tmp_libs= + for deplib in $new_libs; do + case "$deplib" in + -L*) + case " $tmp_libs " in + *" $deplib "*) ;; + *) tmp_libs="$tmp_libs $deplib" ;; + esac + ;; + *) tmp_libs="$tmp_libs $deplib" ;; + esac + done + eval $var=\"$tmp_libs\" + done + fi + done + if test $linkmode = prog; then + dlfiles="$newdlfiles" + dlprefiles="$newdlprefiles" + fi + + case $linkmode in + oldlib) + if test -n "$deplibs"; then + $echo "$modename: warning: \`-l' and \`-L' are ignored for archives" 1>&2 + fi + + if test -n "$dlfiles$dlprefiles" || test "$dlself" != no; then + $echo "$modename: warning: \`-dlopen' is ignored for archives" 1>&2 + fi + + if test -n "$rpath"; then + $echo "$modename: warning: \`-rpath' is ignored for archives" 1>&2 + fi + + if test -n "$xrpath"; then + $echo "$modename: warning: \`-R' is ignored for archives" 1>&2 + fi + + if test -n "$vinfo"; then + $echo "$modename: warning: \`-version-info' is ignored for archives" 1>&2 + fi + + if test -n "$release"; then + $echo "$modename: warning: \`-release' is ignored for archives" 1>&2 + fi + + if test -n "$export_symbols" || test -n "$export_symbols_regex"; then + $echo "$modename: warning: \`-export-symbols' is 
ignored for archives" 1>&2 + fi + + # Now set the variables for building old libraries. + build_libtool_libs=no + oldlibs="$output" + objs="$objs$old_deplibs" + ;; + + lib) + # Make sure we only generate libraries of the form `libNAME.la'. + case "$outputname" in + lib*) + name=`$echo "X$outputname" | $Xsed -e 's/\.la$//' -e 's/^lib//'` + eval libname=\"$libname_spec\" + ;; + *) + if test "$module" = no; then + $echo "$modename: libtool library \`$output' must begin with \`lib'" 1>&2 + $echo "$help" 1>&2 + exit 1 + fi + if test "$need_lib_prefix" != no; then + # Add the "lib" prefix for modules if required + name=`$echo "X$outputname" | $Xsed -e 's/\.la$//'` + eval libname=\"$libname_spec\" + else + libname=`$echo "X$outputname" | $Xsed -e 's/\.la$//'` + fi + ;; + esac + + if test -n "$objs"; then + if test "$deplibs_check_method" != pass_all; then + $echo "$modename: cannot build libtool library \`$output' from non-libtool objects on this host:$objs" 1>&2 + exit 1 + else + echo + echo "*** Warning: Linking the shared library $output against the non-libtool" + echo "*** objects $objs is not portable!" + libobjs="$libobjs $objs" + fi + fi + + if test "$dlself" != no; then + $echo "$modename: warning: \`-dlopen self' is ignored for libtool libraries" 1>&2 + fi + + set dummy $rpath + if test $# -gt 2; then + $echo "$modename: warning: ignoring multiple \`-rpath's for a libtool library" 1>&2 + fi + install_libdir="$2" + + oldlibs= + if test -z "$rpath"; then + if test "$build_libtool_libs" = yes; then + # Building a libtool convenience library. + libext=al + oldlibs="$output_objdir/$libname.$libext $oldlibs" + build_libtool_libs=convenience + build_old_libs=yes + fi + + if test -n "$vinfo"; then + $echo "$modename: warning: \`-version-info' is ignored for convenience libraries" 1>&2 + fi + + if test -n "$release"; then + $echo "$modename: warning: \`-release' is ignored for convenience libraries" 1>&2 + fi + else + + # Parse the version information argument. 
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS=':' + set dummy $vinfo 0 0 0 + IFS="$save_ifs" + + if test -n "$8"; then + $echo "$modename: too many parameters to \`-version-info'" 1>&2 + $echo "$help" 1>&2 + exit 1 + fi + + current="$2" + revision="$3" + age="$4" + + # Check that each of the things are valid numbers. + case "$current" in + 0 | [1-9] | [1-9][0-9]*) ;; + *) + $echo "$modename: CURRENT \`$current' is not a nonnegative integer" 1>&2 + $echo "$modename: \`$vinfo' is not valid version information" 1>&2 + exit 1 + ;; + esac + + case "$revision" in + 0 | [1-9] | [1-9][0-9]*) ;; + *) + $echo "$modename: REVISION \`$revision' is not a nonnegative integer" 1>&2 + $echo "$modename: \`$vinfo' is not valid version information" 1>&2 + exit 1 + ;; + esac + + case "$age" in + 0 | [1-9] | [1-9][0-9]*) ;; + *) + $echo "$modename: AGE \`$age' is not a nonnegative integer" 1>&2 + $echo "$modename: \`$vinfo' is not valid version information" 1>&2 + exit 1 + ;; + esac + + if test $age -gt $current; then + $echo "$modename: AGE \`$age' is greater than the current interface number \`$current'" 1>&2 + $echo "$modename: \`$vinfo' is not valid version information" 1>&2 + exit 1 + fi + + # Calculate the version variables. + major= + versuffix= + verstring= + case "$version_type" in + none) ;; + + irix) + major=`expr $current - $age + 1` + versuffix="$major.$revision" + verstring="sgi$major.$revision" + + # Add in all the interfaces that we are compatible with. + loop=$revision + while test $loop != 0; do + iface=`expr $revision - $loop` + loop=`expr $loop - 1` + verstring="sgi$major.$iface:$verstring" + done + ;; + + linux) + major=.`expr $current - $age` + versuffix="$major.$age.$revision" + ;; + + osf) + major=`expr $current - $age` + versuffix=".$current.$age.$revision" + verstring="$current.$age.$revision" + + # Add in all the interfaces that we are compatible with. 
+ loop=$age + while test $loop != 0; do + iface=`expr $current - $loop` + loop=`expr $loop - 1` + verstring="$verstring:${iface}.0" + done + + # Make executables depend on our current version. + verstring="$verstring:${current}.0" + ;; + + sunos) + major=".$current" + versuffix=".$current.$revision" + ;; + + freebsd-aout) + major=".$current" + versuffix=".$current.$revision"; + ;; + + freebsd-elf) + major=".$current" + versuffix=".$current"; + ;; + + windows) + # Like Linux, but with '-' rather than '.', since we only + # want one extension on Windows 95. + major=`expr $current - $age` + versuffix="-$major-$age-$revision" + ;; + + *) + $echo "$modename: unknown library version type \`$version_type'" 1>&2 + echo "Fatal configuration error. See the $PACKAGE docs for more information." 1>&2 + exit 1 + ;; + esac + + # Clear the version info if we defaulted, and they specified a release. + if test -z "$vinfo" && test -n "$release"; then + major= + verstring="0.0" + if test "$need_version" = no; then + versuffix= + else + versuffix=".0.0" + fi + fi + + # Remove version info from name if versioning should be avoided + if test "$avoid_version" = yes && test "$need_version" = no; then + major= + versuffix= + verstring="" + fi + + # Check to see if the archive will have undefined symbols. + if test "$allow_undefined" = yes; then + if test "$allow_undefined_flag" = unsupported; then + $echo "$modename: warning: undefined symbols not allowed in $host shared libraries" 1>&2 + build_libtool_libs=no + build_old_libs=yes + fi + else + # Don't allow undefined symbols. + allow_undefined_flag="$no_undefined_flag" + fi + fi + + if test "$mode" != relink; then + # Remove our outputs. + $show "${rm}r $output_objdir/$outputname $output_objdir/$libname.* $output_objdir/${libname}${release}.*" + $run ${rm}r $output_objdir/$outputname $output_objdir/$libname.* $output_objdir/${libname}${release}.* + fi + + # Now set the variables for building old libraries. 
+ if test "$build_old_libs" = yes && test "$build_libtool_libs" != convenience ; then + oldlibs="$oldlibs $output_objdir/$libname.$libext" + + # Transform .lo files to .o files. + oldobjs="$objs "`$echo "X$libobjs" | $SP2NL | $Xsed -e '/\.'${libext}'$/d' -e "$lo2o" | $NL2SP` + fi + + # Eliminate all temporary directories (sed scripts are double-quoted so that $path is expanded). + for path in $uninst_path; do + lib_search_path=`echo "$lib_search_path " | sed -e "s% $path % %g"` + deplibs=`echo "$deplibs " | sed -e "s% -L$path % %g"` + dependency_libs=`echo "$dependency_libs " | sed -e "s% -L$path % %g"` + done + + if test -n "$xrpath"; then + # If the user specified any rpath flags, then add them. + temp_xrpath= + for libdir in $xrpath; do + temp_xrpath="$temp_xrpath -R$libdir" + case "$finalize_rpath " in + *" $libdir "*) ;; + *) finalize_rpath="$finalize_rpath $libdir" ;; + esac + done + if test "$hardcode_into_libs" = no || test $build_old_libs = yes; then + dependency_libs="$temp_xrpath $dependency_libs" + fi + fi + + # Make sure dlfiles contains only unique files that won't be dlpreopened + old_dlfiles="$dlfiles" + dlfiles= + for lib in $old_dlfiles; do + case " $dlprefiles $dlfiles " in + *" $lib "*) ;; + *) dlfiles="$dlfiles $lib" ;; + esac + done + + # Make sure dlprefiles contains only unique files + old_dlprefiles="$dlprefiles" + dlprefiles= + for lib in $old_dlprefiles; do + case "$dlprefiles " in + *" $lib "*) ;; + *) dlprefiles="$dlprefiles $lib" ;; + esac + done + + if test "$build_libtool_libs" = yes; then + if test -n "$rpath"; then + case "$host" in + *-*-cygwin* | *-*-mingw* | *-*-os2* | *-*-beos*) + # these systems don't actually have a c library (as such)! + ;; + *) + # Add libc to deplibs on all other systems. + deplibs="$deplibs -lc" + ;; + esac + fi + + # Transform deplibs into only deplibs that can be linked in shared. + name_save=$name + libname_save=$libname + release_save=$release + versuffix_save=$versuffix + major_save=$major + # I'm not sure if I'm treating the release correctly. 
I think + # release should show up in the -l (ie -lgmp5) so we don't want to + # add it in twice. Is that correct? + release="" + versuffix="" + major="" + newdeplibs= + droppeddeps=no + case "$deplibs_check_method" in + pass_all) + # Don't check for shared/static. Everything works. + # This might be a little naive. We might want to check + # whether the library exists or not. But this is on + # osf3 & osf4 and I'm not really sure... Just + # implementing what was already the behaviour. + newdeplibs=$deplibs + ;; + test_compile) + # This code stresses the "libraries are programs" paradigm to its + # limits. Maybe even breaks it. We compile a program, linking it + # against the deplibs as a proxy for the library. Then we can check + # whether they linked in statically or dynamically with ldd. + $rm conftest.c + cat > conftest.c </dev/null` + for potent_lib in $potential_libs; do + # Follow soft links. + if ls -lLd "$potent_lib" 2>/dev/null \ + | grep " -> " >/dev/null; then + continue + fi + # The statement above tries to avoid entering an + # endless loop below, in case of cyclic links. + # We might still enter an endless loop, since a link + # loop can be closed while we follow links, + # but so what? + potlib="$potent_lib" + while test -h "$potlib" 2>/dev/null; do + potliblink=`ls -ld $potlib | sed 's/.* -> //'` + case "$potliblink" in + [\\/]* | [A-Za-z]:[\\/]*) potlib="$potliblink";; + *) potlib=`$echo "X$potlib" | $Xsed -e 's,[^/]*$,,'`"$potliblink";; + esac + done + if eval $file_magic_cmd \"\$potlib\" 2>/dev/null \ + | sed 10q \ + | egrep "$file_magic_regex" > /dev/null; then + newdeplibs="$newdeplibs $a_deplib" + a_deplib="" + break 2 + fi + done + done + if test -n "$a_deplib" ; then + droppeddeps=yes + echo + echo "*** Warning: This library needs some functionality provided by $a_deplib." + echo "*** I have the capability to make that library automatically link in when" + echo "*** you link to this library. 
But I can only do this if you have a" + echo "*** shared version of the library, which you do not appear to have." + fi + else + # Add a -L argument. + newdeplibs="$newdeplibs $a_deplib" + fi + done # Gone through all deplibs. + ;; + none | unknown | *) + newdeplibs="" + if $echo "X $deplibs" | $Xsed -e 's/ -lc$//' \ + -e 's/ -[LR][^ ]*//g' -e 's/[ ]//g' | + grep . >/dev/null; then + echo + if test "X$deplibs_check_method" = "Xnone"; then + echo "*** Warning: inter-library dependencies are not supported in this platform." + else + echo "*** Warning: inter-library dependencies are not known to be supported." + fi + echo "*** All declared inter-library dependencies are being dropped." + droppeddeps=yes + fi + ;; + esac + versuffix=$versuffix_save + major=$major_save + release=$release_save + libname=$libname_save + name=$name_save + + if test "$droppeddeps" = yes; then + if test "$module" = yes; then + echo + echo "*** Warning: libtool could not satisfy all declared inter-library" + echo "*** dependencies of module $libname. Therefore, libtool will create" + echo "*** a static module, that should work as long as the dlopening" + echo "*** application is linked with the -dlopen flag." + if test -z "$global_symbol_pipe"; then + echo + echo "*** However, this would only work if libtool was able to extract symbol" + echo "*** lists from a program, using \`nm' or equivalent, but libtool could" + echo "*** not find such a program. So, this module is probably useless." + echo "*** \`nm' from GNU binutils and a full rebuild may help." + fi + if test "$build_old_libs" = no; then + oldlibs="$output_objdir/$libname.$libext" + build_libtool_libs=module + build_old_libs=yes + else + build_libtool_libs=no + fi + else + echo "*** The inter-library dependencies that have been dropped here will be" + echo "*** automatically added whenever a program is linked with this library" + echo "*** or is declared to -dlopen it." + fi + fi + # Done checking deplibs! 
+ deplibs=$newdeplibs + fi + + # All the library-specific variables (install_libdir is set above). + library_names= + old_library= + dlname= + + # Test again, we may have decided not to build it any more + if test "$build_libtool_libs" = yes; then + if test "$hardcode_into_libs" != no; then + # Hardcode the library paths + hardcode_libdirs= + dep_rpath= + rpath="$finalize_rpath" + test "$mode" != relink && rpath="$compile_rpath$rpath" + for libdir in $rpath; do + if test -n "$hardcode_libdir_flag_spec"; then + if test -n "$hardcode_libdir_separator"; then + if test -z "$hardcode_libdirs"; then + hardcode_libdirs="$libdir" + else + # Just accumulate the unique libdirs. + case "$hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator" in + *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*) + ;; + *) + hardcode_libdirs="$hardcode_libdirs$hardcode_libdir_separator$libdir" + ;; + esac + fi + else + eval flag=\"$hardcode_libdir_flag_spec\" + dep_rpath="$dep_rpath $flag" + fi + elif test -n "$runpath_var"; then + case "$perm_rpath " in + *" $libdir "*) ;; + *) perm_rpath="$perm_rpath $libdir" ;; + esac + fi + done + # Substitute the hardcoded libdirs into the rpath. + if test -n "$hardcode_libdir_separator" && + test -n "$hardcode_libdirs"; then + libdir="$hardcode_libdirs" + eval dep_rpath=\"$hardcode_libdir_flag_spec\" + fi + if test -n "$runpath_var" && test -n "$perm_rpath"; then + # We should set the runpath_var. + rpath= + for dir in $perm_rpath; do + rpath="$rpath$dir:" + done + eval "$runpath_var='$rpath\$$runpath_var'; export $runpath_var" + fi + test -n "$dep_rpath" && deplibs="$dep_rpath $deplibs" + fi + + shlibpath="$finalize_shlibpath" + test "$mode" != relink && shlibpath="$compile_shlibpath$shlibpath" + if test -n "$shlibpath"; then + eval "$shlibpath_var='$shlibpath\$$shlibpath_var'; export $shlibpath_var" + fi + + # Get the real and link names of the library. 
+ eval library_names=\"$library_names_spec\" + set dummy $library_names + realname="$2" + shift; shift + + if test -n "$soname_spec"; then + eval soname=\"$soname_spec\" + else + soname="$realname" + fi + + lib="$output_objdir/$realname" + for link + do + linknames="$linknames $link" + done + + # Ensure that we have .o objects for linkers which dislike .lo + # (e.g. aix) in case we are running --disable-static + for obj in $libobjs; do + xdir=`$echo "X$obj" | $Xsed -e 's%/[^/]*$%%'` + if test "X$xdir" = "X$obj"; then + xdir="." + else + xdir="$xdir" + fi + baseobj=`$echo "X$obj" | $Xsed -e 's%^.*/%%'` + oldobj=`$echo "X$baseobj" | $Xsed -e "$lo2o"` + if test ! -f $xdir/$oldobj; then + $show "(cd $xdir && ${LN_S} $baseobj $oldobj)" + $run eval '(cd $xdir && ${LN_S} $baseobj $oldobj)' || exit $? + fi + done + + # Use standard objects if they are pic + test -z "$pic_flag" && libobjs=`$echo "X$libobjs" | $SP2NL | $Xsed -e "$lo2o" | $NL2SP` + + # Prepare the list of exported symbols + if test -z "$export_symbols"; then + if test "$always_export_symbols" = yes || test -n "$export_symbols_regex"; then + $show "generating symbol list for \`$libname.la'" + export_symbols="$output_objdir/$libname.exp" + $run $rm $export_symbols + eval cmds=\"$export_symbols_cmds\" + IFS="${IFS= }"; save_ifs="$IFS"; IFS='~' + for cmd in $cmds; do + IFS="$save_ifs" + $show "$cmd" + $run eval "$cmd" || exit $? 
+ done + IFS="$save_ifs" + if test -n "$export_symbols_regex"; then + $show "egrep -e \"$export_symbols_regex\" \"$export_symbols\" > \"${export_symbols}T\"" + $run eval 'egrep -e "$export_symbols_regex" "$export_symbols" > "${export_symbols}T"' + $show "$mv \"${export_symbols}T\" \"$export_symbols\"" + $run eval '$mv "${export_symbols}T" "$export_symbols"' + fi + fi + fi + + if test -n "$export_symbols" && test -n "$include_expsyms"; then + $run eval '$echo "X$include_expsyms" | $SP2NL >> "$export_symbols"' + fi + + if test -n "$convenience"; then + if test -n "$whole_archive_flag_spec"; then + eval libobjs=\"\$libobjs $whole_archive_flag_spec\" + else + gentop="$output_objdir/${outputname}x" + $show "${rm}r $gentop" + $run ${rm}r "$gentop" + $show "mkdir $gentop" + $run mkdir "$gentop" + status=$? + if test $status -ne 0 && test ! -d "$gentop"; then + exit $status + fi + generated="$generated $gentop" + + for xlib in $convenience; do + # Extract the objects. + case "$xlib" in + [\\/]* | [A-Za-z]:[\\/]*) xabs="$xlib" ;; + *) xabs=`pwd`"/$xlib" ;; + esac + xlib=`$echo "X$xlib" | $Xsed -e 's%^.*/%%'` + xdir="$gentop/$xlib" + + $show "${rm}r $xdir" + $run ${rm}r "$xdir" + $show "mkdir $xdir" + $run mkdir "$xdir" + status=$? + if test $status -ne 0 && test ! -d "$xdir"; then + exit $status + fi + $show "(cd $xdir && $AR x $xabs)" + $run eval "(cd \$xdir && $AR x \$xabs)" || exit $? + + libobjs="$libobjs "`find $xdir -name \*.o -print -o -name \*.lo -print | $NL2SP` + done + fi + fi + + if test "$thread_safe" = yes && test -n "$thread_safe_flag_spec"; then + eval flag=\"$thread_safe_flag_spec\" + linker_flags="$linker_flags $flag" + fi + + # Make a backup of the uninstalled library when relinking + if test "$mode" = relink && test "$hardcode_into_libs" = all; then + $run eval '(cd $output_objdir && $rm ${realname}U && $mv $realname ${realname}U)' || exit $? + fi + + # Do each of the archive commands. 
+ if test -n "$export_symbols" && test -n "$archive_expsym_cmds"; then + eval cmds=\"$archive_expsym_cmds\" + else + eval cmds=\"$archive_cmds\" + fi + IFS="${IFS= }"; save_ifs="$IFS"; IFS='~' + for cmd in $cmds; do + IFS="$save_ifs" + $show "$cmd" + $run eval "$cmd" || exit $? + done + IFS="$save_ifs" + + # Restore the uninstalled library and exit + if test "$mode" = relink && test "$hardcode_into_libs" = all; then + $run eval '(cd $output_objdir && $rm ${realname}T && $mv $realname ${realname}T && $mv "$realname"U $realname)' || exit $? + exit 0 + fi + + # Create links to the real library. + for linkname in $linknames; do + if test "$realname" != "$linkname"; then + $show "(cd $output_objdir && $rm $linkname && $LN_S $realname $linkname)" + $run eval '(cd $output_objdir && $rm $linkname && $LN_S $realname $linkname)' || exit $? + fi + done + + # If -module or -export-dynamic was specified, set the dlname. + if test "$module" = yes || test "$export_dynamic" = yes; then + # On all known operating systems, these are identical. 
+ dlname="$soname" + fi + fi + ;; + + obj) + if test -n "$deplibs"; then + $echo "$modename: warning: \`-l' and \`-L' are ignored for objects" 1>&2 + fi + + if test -n "$dlfiles$dlprefiles" || test "$dlself" != no; then + $echo "$modename: warning: \`-dlopen' is ignored for objects" 1>&2 + fi + + if test -n "$rpath"; then + $echo "$modename: warning: \`-rpath' is ignored for objects" 1>&2 + fi + + if test -n "$xrpath"; then + $echo "$modename: warning: \`-R' is ignored for objects" 1>&2 + fi + + if test -n "$vinfo"; then + $echo "$modename: warning: \`-version-info' is ignored for objects" 1>&2 + fi + + if test -n "$release"; then + $echo "$modename: warning: \`-release' is ignored for objects" 1>&2 + fi + + case "$output" in + *.lo) + if test -n "$objs$old_deplibs"; then + $echo "$modename: cannot build library object \`$output' from non-libtool objects" 1>&2 + exit 1 + fi + libobj="$output" + obj=`$echo "X$output" | $Xsed -e "$lo2o"` + ;; + *) + libobj= + obj="$output" + ;; + esac + + # Delete the old objects. + $run $rm $obj $libobj + + # Objects from convenience libraries. This assumes + # single-version convenience libraries. Whenever we create + # different ones for PIC/non-PIC, this we'll have to duplicate + # the extraction. + reload_conv_objs= + gentop= + # reload_cmds runs $LD directly, so let us get rid of + # -Wl from whole_archive_flag_spec + wl= + + if test -n "$convenience"; then + if test -n "$whole_archive_flag_spec"; then + eval reload_conv_objs=\"\$reload_objs $whole_archive_flag_spec\" + else + gentop="$output_objdir/${obj}x" + $show "${rm}r $gentop" + $run ${rm}r "$gentop" + $show "mkdir $gentop" + $run mkdir "$gentop" + status=$? + if test $status -ne 0 && test ! -d "$gentop"; then + exit $status + fi + generated="$generated $gentop" + + for xlib in $convenience; do + # Extract the objects. 
+ case "$xlib" in + [\\/]* | [A-Za-z]:[\\/]*) xabs="$xlib" ;; + *) xabs=`pwd`"/$xlib" ;; + esac + xlib=`$echo "X$xlib" | $Xsed -e 's%^.*/%%'` + xdir="$gentop/$xlib" + + $show "${rm}r $xdir" + $run ${rm}r "$xdir" + $show "mkdir $xdir" + $run mkdir "$xdir" + status=$? + if test $status -ne 0 && test ! -d "$xdir"; then + exit $status + fi + $show "(cd $xdir && $AR x $xabs)" + $run eval "(cd \$xdir && $AR x \$xabs)" || exit $? + + reload_conv_objs="$reload_objs "`find $xdir -name \*.o -print -o -name \*.lo -print | $NL2SP` + done + fi + fi + + # Create the old-style object. + reload_objs="$objs$old_deplibs "`$echo "X$libobjs" | $SP2NL | $Xsed -e '/\.'${libext}$'/d' -e '/\.lib$/d' -e "$lo2o" | $NL2SP`" $reload_conv_objs" ### testsuite: skip nested quoting test + + output="$obj" + eval cmds=\"$reload_cmds\" + IFS="${IFS= }"; save_ifs="$IFS"; IFS='~' + for cmd in $cmds; do + IFS="$save_ifs" + $show "$cmd" + $run eval "$cmd" || exit $? + done + IFS="$save_ifs" + + # Exit if we aren't doing a library object file. + if test -z "$libobj"; then + if test -n "$gentop"; then + $show "${rm}r $gentop" + $run ${rm}r $gentop + fi + + exit 0 + fi + + if test "$build_libtool_libs" != yes; then + if test -n "$gentop"; then + $show "${rm}r $gentop" + $run ${rm}r $gentop + fi + + # Create an invalid libtool object if no PIC, so that we don't + # accidentally link it into a program. + $show "echo timestamp > $libobj" + $run eval "echo timestamp > $libobj" || exit $? + exit 0 + fi + + if test -n "$pic_flag" || test "$pic_mode" != default; then + # Only do commands if we really have different PIC objects. + reload_objs="$libobjs $reload_conv_objs" + output="$libobj" + eval cmds=\"$reload_cmds\" + IFS="${IFS= }"; save_ifs="$IFS"; IFS='~' + for cmd in $cmds; do + IFS="$save_ifs" + $show "$cmd" + $run eval "$cmd" || exit $? + done + IFS="$save_ifs" + else + # Just create a symlink. 
+ $show $rm $libobj + $run $rm $libobj + xdir=`$echo "X$libobj" | $Xsed -e 's%/[^/]*$%%'` + if test "X$xdir" = "X$libobj"; then + xdir="." + else + xdir="$xdir" + fi + baseobj=`$echo "X$libobj" | $Xsed -e 's%^.*/%%'` + oldobj=`$echo "X$baseobj" | $Xsed -e "$lo2o"` + $show "(cd $xdir && $LN_S $oldobj $baseobj)" + $run eval '(cd $xdir && $LN_S $oldobj $baseobj)' || exit $? + fi + + if test -n "$gentop"; then + $show "${rm}r $gentop" + $run ${rm}r $gentop + fi + + exit 0 + ;; + + prog) + if test -n "$vinfo"; then + $echo "$modename: warning: \`-version-info' is ignored for programs" 1>&2 + fi + + if test -n "$release"; then + $echo "$modename: warning: \`-release' is ignored for programs" 1>&2 + fi + + if test "$preload" = yes; then + if test "$dlopen_support" = unknown && test "$dlopen_self" = unknown && + test "$dlopen_self_static" = unknown; then + $echo "$modename: warning: \`AC_LIBTOOL_DLOPEN' not used. Assuming no dlopen support." 1>&2 + fi + fi + + compile_command="$compile_command $compile_deplibs" + finalize_command="$finalize_command $finalize_deplibs" + + if test -n "$rpath$xrpath"; then + # If the user specified any rpath flags, then add them. + for libdir in $rpath $xrpath; do + # This is the magic to use -rpath. + case "$finalize_rpath " in + *" $libdir "*) ;; + *) finalize_rpath="$finalize_rpath $libdir" ;; + esac + done + fi + + # Now hardcode the library paths + rpath= + hardcode_libdirs= + for libdir in $compile_rpath $finalize_rpath; do + if test -n "$hardcode_libdir_flag_spec"; then + if test -n "$hardcode_libdir_separator"; then + if test -z "$hardcode_libdirs"; then + hardcode_libdirs="$libdir" + else + # Just accumulate the unique libdirs. 
+ case "$hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator" in + *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*) + ;; + *) + hardcode_libdirs="$hardcode_libdirs$hardcode_libdir_separator$libdir" + ;; + esac + fi + else + eval flag=\"$hardcode_libdir_flag_spec\" + rpath="$rpath $flag" + fi + elif test -n "$runpath_var"; then + case "$perm_rpath " in + *" $libdir "*) ;; + *) perm_rpath="$perm_rpath $libdir" ;; + esac + fi + case "$host" in + *-*-cygwin* | *-*-mingw* | *-*-os2*) + case ":$dllsearchpath:" in + *":$libdir:"*) ;; + *) dllsearchpath="$dllsearchpath:$libdir";; + esac + ;; + esac + done + # Substitute the hardcoded libdirs into the rpath. + if test -n "$hardcode_libdir_separator" && + test -n "$hardcode_libdirs"; then + libdir="$hardcode_libdirs" + eval rpath=\" $hardcode_libdir_flag_spec\" + fi + compile_rpath="$rpath" + + rpath= + hardcode_libdirs= + for libdir in $finalize_rpath; do + if test -n "$hardcode_libdir_flag_spec"; then + if test -n "$hardcode_libdir_separator"; then + if test -z "$hardcode_libdirs"; then + hardcode_libdirs="$libdir" + else + # Just accumulate the unique libdirs. + case "$hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator" in + *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*) + ;; + *) + hardcode_libdirs="$hardcode_libdirs$hardcode_libdir_separator$libdir" + ;; + esac + fi + else + eval flag=\"$hardcode_libdir_flag_spec\" + rpath="$rpath $flag" + fi + elif test -n "$runpath_var"; then + case "$finalize_perm_rpath " in + *" $libdir "*) ;; + *) finalize_perm_rpath="$finalize_perm_rpath $libdir" ;; + esac + fi + done + # Substitute the hardcoded libdirs into the rpath. 
+ if test -n "$hardcode_libdir_separator" && + test -n "$hardcode_libdirs"; then + libdir="$hardcode_libdirs" + eval rpath=\" $hardcode_libdir_flag_spec\" + fi + finalize_rpath="$rpath" + + if test -n "$libobjs" && test "$build_old_libs" = yes; then + # Transform all the library objects into standard objects. + compile_command=`$echo "X$compile_command" | $SP2NL | $Xsed -e "$lo2o" | $NL2SP` + finalize_command=`$echo "X$finalize_command" | $SP2NL | $Xsed -e "$lo2o" | $NL2SP` + fi + + dlsyms= + if test -n "$dlfiles$dlprefiles" || test "$dlself" != no; then + if test -n "$NM" && test -n "$global_symbol_pipe"; then + dlsyms="${outputname}S.c" + else + $echo "$modename: not configured to extract global symbols from dlpreopened files" 1>&2 + fi + fi + + if test -n "$dlsyms"; then + case "$dlsyms" in + "") ;; + *.c) + # Discover the nlist of each of the dlfiles. + nlist="$output_objdir/${outputname}.nm" + + $show "$rm $nlist ${nlist}S ${nlist}T" + $run $rm "$nlist" "${nlist}S" "${nlist}T" + + # Parse the name list into a source file. + $show "creating $output_objdir/$dlsyms" + + test -z "$run" && $echo > "$output_objdir/$dlsyms" "\ +/* $dlsyms - symbol resolution table for \`$outputname' dlsym emulation. */ +/* Generated by $PROGRAM - GNU $PACKAGE $VERSION$TIMESTAMP */ + +#ifdef __cplusplus +extern \"C\" { +#endif + +/* Prevent the only kind of declaration conflicts we can make. */ +#define lt_preloaded_symbols some_other_symbol + +/* External symbol declarations for the compiler. */\ +" + + if test "$dlself" = yes; then + $show "generating symbol list for \`$output'" + + test -z "$run" && $echo ': @PROGRAM@ ' > "$nlist" + + # Add our own program objects to the symbol list. 
+ progfiles=`$echo "X$objs$old_deplibs" | $SP2NL | $Xsed -e "$lo2o" | $NL2SP` + for arg in $progfiles; do + $show "extracting global C symbols from \`$arg'" + $run eval "$NM $arg | $global_symbol_pipe >> '$nlist'" + done + + if test -n "$exclude_expsyms"; then + $run eval 'egrep -v " ($exclude_expsyms)$" "$nlist" > "$nlist"T' + $run eval '$mv "$nlist"T "$nlist"' + fi + + if test -n "$export_symbols_regex"; then + $run eval 'egrep -e "$export_symbols_regex" "$nlist" > "$nlist"T' + $run eval '$mv "$nlist"T "$nlist"' + fi + + # Prepare the list of exported symbols + if test -z "$export_symbols"; then + export_symbols="$output_objdir/$output.exp" + $run $rm $export_symbols + $run eval "sed -n -e '/^: @PROGRAM@$/d' -e 's/^.* \(.*\)$/\1/p' "'< "$nlist" > "$export_symbols"' + else + $run eval "sed -e 's/\([][.*^$]\)/\\\1/g' -e 's/^/ /' -e 's/$/$/'"' < "$export_symbols" > "$output_objdir/$output.exp"' + $run eval 'grep -f "$output_objdir/$output.exp" < "$nlist" > "$nlist"T' + $run eval '$mv "$nlist"T "$nlist"' + fi + fi + + for arg in $dlprefiles; do + $show "extracting global C symbols from \`$arg'" + name=`echo "$arg" | sed -e 's%^.*/%%'` + $run eval 'echo ": $name " >> "$nlist"' + $run eval "$NM $arg | $global_symbol_pipe >> '$nlist'" + done + + if test -z "$run"; then + # Make sure we have at least an empty file. + test -f "$nlist" || : > "$nlist" + + if test -n "$exclude_expsyms"; then + egrep -v " ($exclude_expsyms)$" "$nlist" > "$nlist"T + $mv "$nlist"T "$nlist" + fi + + # Try sorting and uniquifying the output. 
+ if grep -v "^: " < "$nlist" | sort +2 | uniq > "$nlist"S; then + : + else + grep -v "^: " < "$nlist" > "$nlist"S + fi + + if test -f "$nlist"S; then + eval "$global_symbol_to_cdecl"' < "$nlist"S >> "$output_objdir/$dlsyms"' + else + echo '/* NONE */' >> "$output_objdir/$dlsyms" + fi + + $echo >> "$output_objdir/$dlsyms" "\ + +#undef lt_preloaded_symbols + +#if defined (__STDC__) && __STDC__ +# define lt_ptr_t void * +#else +# define lt_ptr_t char * +# define const +#endif + +/* The mapping between symbol names and symbols. */ +const struct { + const char *name; + lt_ptr_t address; +} +lt_preloaded_symbols[] = +{\ +" + + sed -n -e 's/^: \([^ ]*\) $/ {\"\1\", (lt_ptr_t) 0},/p' \ + -e 's/^. \([^ ]*\) \([^ ]*\)$/ {"\2", (lt_ptr_t) \&\2},/p' \ + < "$nlist" >> "$output_objdir/$dlsyms" + + $echo >> "$output_objdir/$dlsyms" "\ + {0, (lt_ptr_t) 0} +}; + +/* This works around a problem in FreeBSD linker */ +#ifdef FREEBSD_WORKAROUND +static const void *lt_preloaded_setup() { + return lt_preloaded_symbols; +} +#endif + +#ifdef __cplusplus +} +#endif\ +" + fi + + pic_flag_for_symtable= + case "$host" in + # compiling the symbol table file with pic_flag works around + # a FreeBSD bug that causes programs to crash when -lm is + # linked before any other PIC object. But we must not use + # pic_flag when linking with -static. The problem exists in + # FreeBSD 2.2.6 and is fixed in FreeBSD 3.1. + *-*-freebsd2*|*-*-freebsd3.0*|*-*-freebsdelf3.0*) + case "$compile_command " in + *" -static "*) ;; + *) pic_flag_for_symtable=" $pic_flag -DPIC -DFREEBSD_WORKAROUND";; + esac;; + *-*-hpux*) + case "$compile_command " in + *" -static "*) ;; + *) pic_flag_for_symtable=" $pic_flag -DPIC";; + esac + esac + + # Now compile the dynamic symbol file. + $show "(cd $output_objdir && $CC -c$no_builtin_flag$pic_flag_for_symtable \"$dlsyms\")" + $run eval '(cd $output_objdir && $CC -c$no_builtin_flag$pic_flag_for_symtable "$dlsyms")' || exit $? + + # Clean up the generated files. 
+ $show "$rm $output_objdir/$dlsyms $nlist ${nlist}S ${nlist}T" + $run $rm "$output_objdir/$dlsyms" "$nlist" "${nlist}S" "${nlist}T" + + # Transform the symbol file into the correct name. + compile_command=`$echo "X$compile_command" | $Xsed -e "s%@SYMFILE@%$output_objdir/${outputname}S.${objext}%"` + finalize_command=`$echo "X$finalize_command" | $Xsed -e "s%@SYMFILE@%$output_objdir/${outputname}S.${objext}%"` + ;; + *) + $echo "$modename: unknown suffix for \`$dlsyms'" 1>&2 + exit 1 + ;; + esac + else + # We keep going just in case the user didn't refer to + # lt_preloaded_symbols. The linker will fail if global_symbol_pipe + # really was required. + + # Nullify the symbol file. + compile_command=`$echo "X$compile_command" | $Xsed -e "s% @SYMFILE@%%"` + finalize_command=`$echo "X$finalize_command" | $Xsed -e "s% @SYMFILE@%%"` + fi + + if test -z "$link_against_libtool_libs" || test "$build_libtool_libs" != yes; then + # Replace the output file specification. + compile_command=`$echo "X$compile_command" | $Xsed -e 's%@OUTPUT@%'"$output"'%g'` + link_command="$compile_command$compile_rpath" + + # We have no uninstalled library dependencies, so finalize right now. + $show "$link_command" + $run eval "$link_command" + status=$? + + # Delete the generated files. + if test -n "$dlsyms"; then + $show "$rm $output_objdir/${outputname}S.${objext}" + $run $rm "$output_objdir/${outputname}S.${objext}" + fi + + exit $status + fi + + if test -n "$shlibpath_var"; then + # We should set the shlibpath_var + rpath= + for dir in $temp_rpath; do + case "$dir" in + [\\/]* | [A-Za-z]:[\\/]*) + # Absolute path. + rpath="$rpath$dir:" + ;; + *) + # Relative path: add a thisdir entry. 
+ rpath="$rpath\$thisdir/$dir:" + ;; + esac + done + temp_rpath="$rpath" + fi + + if test -n "$compile_shlibpath$finalize_shlibpath"; then + compile_command="$shlibpath_var=\"$compile_shlibpath$finalize_shlibpath\$$shlibpath_var\" $compile_command" + fi + if test -n "$finalize_shlibpath"; then + finalize_command="$shlibpath_var=\"$finalize_shlibpath\$$shlibpath_var\" $finalize_command" + fi + + compile_var= + finalize_var= + if test -n "$runpath_var"; then + if test -n "$perm_rpath"; then + # We should set the runpath_var. + rpath= + for dir in $perm_rpath; do + rpath="$rpath$dir:" + done + compile_var="$runpath_var=\"$rpath\$$runpath_var\" " + fi + if test -n "$finalize_perm_rpath"; then + # We should set the runpath_var. + rpath= + for dir in $finalize_perm_rpath; do + rpath="$rpath$dir:" + done + finalize_var="$runpath_var=\"$rpath\$$runpath_var\" " + fi + fi + + if test "$no_install" = yes; then + # We don't need to create a wrapper script. + link_command="$compile_var$compile_command$compile_rpath" + # Replace the output file specification. + link_command=`$echo "X$link_command" | $Xsed -e 's%@OUTPUT@%'"$output"'%g'` + # Delete the old output file. + $run $rm $output + # Link the executable and exit + $show "$link_command" + $run eval "$link_command" || exit $? 
+ exit 0 + fi + + if test "$hardcode_action" = relink || test "$hardcode_into_libs" = all; then + # Fast installation is not supported + link_command="$compile_var$compile_command$compile_rpath" + relink_command="$finalize_var$finalize_command$finalize_rpath" + + $echo "$modename: warning: this platform does not like uninstalled shared libraries" 1>&2 + $echo "$modename: \`$output' will be relinked during installation" 1>&2 + else + if test "$fast_install" != no; then + link_command="$finalize_var$compile_command$finalize_rpath" + if test "$fast_install" = yes; then + relink_command=`$echo "X$compile_var$compile_command$compile_rpath" | $Xsed -e 's%@OUTPUT@%\$progdir/\$file%g'` + else + # fast_install is set to needless + relink_command= + fi + else + link_command="$compile_var$compile_command$compile_rpath" + relink_command="$finalize_var$finalize_command$finalize_rpath" + fi + fi + + # Replace the output file specification. + link_command=`$echo "X$link_command" | $Xsed -e 's%@OUTPUT@%'"$output_objdir/$outputname"'%g'` + + # Delete the old output files. + $run $rm $output $output_objdir/$outputname $output_objdir/lt-$outputname + + $show "$link_command" + $run eval "$link_command" || exit $? + + # Now create the wrapper script. + $show "creating $output" + + # Quote the relink command for shipping. + if test -n "$relink_command"; then + relink_command="cd `pwd`; $relink_command" + relink_command=`$echo "X$relink_command" | $Xsed -e "$sed_quote_subst"` + fi + + # Quote $echo for shipping. + if test "X$echo" = "X$SHELL $0 --fallback-echo"; then + case "$0" in + [\\/]* | [A-Za-z]:[\\/]*) qecho="$SHELL $0 --fallback-echo";; + *) qecho="$SHELL `pwd`/$0 --fallback-echo";; + esac + qecho=`$echo "X$qecho" | $Xsed -e "$sed_quote_subst"` + else + qecho=`$echo "X$echo" | $Xsed -e "$sed_quote_subst"` + fi + + # Only actually do things if our run command is non-null. 
+ if test -z "$run"; then + # win32 will think the script is a binary if it has + # a .exe suffix, so we strip it off here. + case $output in + *.exe) output=`echo $output|sed 's,.exe$,,'` ;; + esac + $rm $output + trap "$rm $output; exit 1" 1 2 15 + + $echo > $output "\ +#! $SHELL + +# $output - temporary wrapper script for $objdir/$outputname +# Generated by $PROGRAM - GNU $PACKAGE $VERSION$TIMESTAMP +# +# The $output program cannot be directly executed until all the libtool +# libraries that it depends on are installed. +# +# This wrapper script should never be moved out of the build directory. +# If it is, it will not operate correctly. + +# Sed substitution that helps us do robust quoting. It backslashifies +# metacharacters that are still active within double-quoted strings. +Xsed='sed -e 1s/^X//' +sed_quote_subst='$sed_quote_subst' + +# The HP-UX ksh and POSIX shell print the target directory to stdout +# if CDPATH is set. +if test \"\${CDPATH+set}\" = set; then CDPATH=:; export CDPATH; fi + +relink_command=\"$relink_command\" + +# This environment variable determines our operation mode. +if test \"\$libtool_install_magic\" = \"$magic\"; then + # install mode needs the following variable: + link_against_libtool_libs='$link_against_libtool_libs' +else + # When we are sourced in execute mode, \$file and \$echo are already set. + if test \"\$libtool_execute_magic\" != \"$magic\"; then + echo=\"$qecho\" + file=\"\$0\" + # Make sure echo works. + if test \"X\$1\" = X--no-reexec; then + # Discard the --no-reexec flag, and continue. + shift + elif test \"X\`(\$echo '\t') 2>/dev/null\`\" = 'X\t'; then + # Yippee, \$echo works! + : + else + # Restart under the correct shell, and then maybe \$echo will work. + exec $SHELL \"\$0\" --no-reexec \${1+\"\$@\"} + fi + fi\ +" + $echo >> $output "\ + + # Find the directory that this script lives in. + thisdir=\`\$echo \"X\$file\" | \$Xsed -e 's%/[^/]*$%%'\` + test \"x\$thisdir\" = \"x\$file\" && thisdir=. 
+ + # Follow symbolic links until we get to the real thisdir. + file=\`ls -ld \"\$file\" | sed -n 's/.*-> //p'\` + while test -n \"\$file\"; do + destdir=\`\$echo \"X\$file\" | \$Xsed -e 's%/[^/]*\$%%'\` + + # If there was a directory component, then change thisdir. + if test \"x\$destdir\" != \"x\$file\"; then + case \"\$destdir\" in + [\\/]* | [A-Za-z]:[\\/]*) thisdir=\"\$destdir\" ;; + *) thisdir=\"\$thisdir/\$destdir\" ;; + esac + fi + + file=\`\$echo \"X\$file\" | \$Xsed -e 's%^.*/%%'\` + file=\`ls -ld \"\$thisdir/\$file\" | sed -n 's/.*-> //p'\` + done + + # Try to get the absolute directory name. + absdir=\`cd \"\$thisdir\" && pwd\` + test -n \"\$absdir\" && thisdir=\"\$absdir\" +" + + if test "$fast_install" = yes; then + echo >> $output "\ + program=lt-'$outputname' + progdir=\"\$thisdir/$objdir\" + + if test ! -f \"\$progdir/\$program\" || \\ + { file=\`ls -1dt \"\$progdir/\$program\" \"\$progdir/../\$program\" 2>/dev/null | sed 1q\`; \\ + test \"X\$file\" != \"X\$progdir/\$program\"; }; then + + file=\"\$\$-\$program\" + + if test ! -d \"\$progdir\"; then + $mkdir \"\$progdir\" + else + $rm \"\$progdir/\$file\" + fi" + + echo >> $output "\ + + # relink executable if necessary + if test -n \"\$relink_command\"; then + if (eval \$relink_command); then : + else + $rm \"\$progdir/\$file\" + exit 1 + fi + fi + + $mv \"\$progdir/\$file\" \"\$progdir/\$program\" 2>/dev/null || + { $rm \"\$progdir/\$program\"; + $mv \"\$progdir/\$file\" \"\$progdir/\$program\"; } + $rm \"\$progdir/\$file\" + fi" + else + echo >> $output "\ + program='$outputname' + progdir=\"\$thisdir/$objdir\" +" + fi + + echo >> $output "\ + + if test -f \"\$progdir/\$program\"; then" + + # Export our shlibpath_var if we have one. 
+ if test "$shlibpath_overrides_runpath" = yes && test -n "$shlibpath_var" && test -n "$temp_rpath"; then + $echo >> $output "\ + # Add our own library path to $shlibpath_var + $shlibpath_var=\"$temp_rpath\$$shlibpath_var\" + + # Some systems cannot cope with colon-terminated $shlibpath_var + # The second colon is a workaround for a bug in BeOS R4 sed + $shlibpath_var=\`\$echo \"X\$$shlibpath_var\" | \$Xsed -e 's/::*\$//'\` + + export $shlibpath_var +" + fi + + # fixup the dll searchpath if we need to. + if test -n "$dllsearchpath"; then + $echo >> $output "\ + # Add the dll search path components to the executable PATH + PATH=$dllsearchpath:\$PATH +" + fi + + $echo >> $output "\ + if test \"\$libtool_execute_magic\" != \"$magic\"; then + # Run the actual program with our arguments. +" + case $host in + *-*-cygwin* | *-*-mingw | *-*-os2*) + # win32 systems need to use the prog path for dll + # lookup to work + $echo >> $output "\ + exec \$progdir\\\\\$program \${1+\"\$@\"} +" + ;; + *) + $echo >> $output "\ + # Export the path to the program. + PATH=\"\$progdir:\$PATH\" + export PATH + + exec \$program \${1+\"\$@\"} +" + ;; + esac + $echo >> $output "\ + \$echo \"\$0: cannot exec \$program \${1+\"\$@\"}\" + exit 1 + fi + else + # The program doesn't exist. + \$echo \"\$0: error: \$progdir/\$program does not exist\" 1>&2 + \$echo \"This script is just a wrapper for \$program.\" 1>&2 + echo \"See the $PACKAGE documentation for more information.\" 1>&2 + exit 1 + fi +fi\ +" + chmod +x $output + fi + exit 0 + ;; + esac + + # See if we need to build an old-fashioned archive. 
+ for oldlib in $oldlibs; do + + if test "$build_libtool_libs" = convenience; then + oldobjs="$libobjs_save" + addlibs="$convenience" + build_libtool_libs=no + else + if test "$build_libtool_libs" = module; then + oldobjs="$libobjs_save" + build_libtool_libs=no + else + oldobjs="$objs$old_deplibs "`$echo "X$libobjs_save" | $SP2NL | $Xsed -e '/\.'${libext}'$/d' -e '/\.lib$/d' -e "$lo2o" | $NL2SP` + fi + addlibs="$old_convenience" + fi + + if test -n "$addlibs"; then + gentop="$output_objdir/${outputname}x" + $show "${rm}r $gentop" + $run ${rm}r "$gentop" + $show "mkdir $gentop" + $run mkdir "$gentop" + status=$? + if test $status -ne 0 && test ! -d "$gentop"; then + exit $status + fi + generated="$generated $gentop" + + # Add in members from convenience archives. + for xlib in $addlibs; do + # Extract the objects. + case "$xlib" in + [\\/]* | [A-Za-z]:[\\/]*) xabs="$xlib" ;; + *) xabs=`pwd`"/$xlib" ;; + esac + xlib=`$echo "X$xlib" | $Xsed -e 's%^.*/%%'` + xdir="$gentop/$xlib" + + $show "${rm}r $xdir" + $run ${rm}r "$xdir" + $show "mkdir $xdir" + $run mkdir "$xdir" + status=$? + if test $status -ne 0 && test ! -d "$xdir"; then + exit $status + fi + $show "(cd $xdir && $AR x $xabs)" + $run eval "(cd \$xdir && $AR x \$xabs)" || exit $? + + oldobjs="$oldobjs "`find $xdir -name \*.${objext} -print -o -name \*.lo -print | $NL2SP` + done + fi + + # Do each command in the archive commands. + if test -n "$old_archive_from_new_cmds" && test "$build_libtool_libs" = yes; then + eval cmds=\"$old_archive_from_new_cmds\" + else + # Ensure that we have .o objects in place in case we decided + # not to build a shared library, and have fallen back to building + # static libs even though --disable-static was passed! + for oldobj in $oldobjs; do + if test ! -f $oldobj; then + xdir=`$echo "X$oldobj" | $Xsed -e 's%/[^/]*$%%'` + if test "X$xdir" = "X$oldobj"; then + xdir="." 
+ else + xdir="$xdir" + fi + baseobj=`$echo "X$oldobj" | $Xsed -e 's%^.*/%%'` + obj=`$echo "X$baseobj" | $Xsed -e "$o2lo"` + $show "(cd $xdir && ${LN_S} $obj $baseobj)" + $run eval '(cd $xdir && ${LN_S} $obj $baseobj)' || exit $? + fi + done + + eval cmds=\"$old_archive_cmds\" + fi + IFS="${IFS= }"; save_ifs="$IFS"; IFS='~' + for cmd in $cmds; do + IFS="$save_ifs" + $show "$cmd" + $run eval "$cmd" || exit $? + done + IFS="$save_ifs" + done + + if test -n "$generated"; then + $show "${rm}r$generated" + $run ${rm}r$generated + fi + + # Now create the libtool archive. + case "$output" in + *.la) + old_library= + test "$build_old_libs" = yes && old_library="$libname.$libext" + $show "creating $output" + + # Quote the link command for shipping. + relink_command="cd `pwd`; $SHELL $0 --mode=relink $libtool_args" + relink_command=`$echo "X$relink_command" | $Xsed -e "$sed_quote_subst"` + + # Only create the output if not a dry run. + if test -z "$run"; then + for installed in no yes; do + if test "$installed" = yes; then + if test -z "$install_libdir"; then + break + fi + output="$output_objdir/$outputname"i + # Replace all uninstalled libtool libraries with the installed ones + newdependency_libs= + for deplib in $dependency_libs; do + case "$deplib" in + *.la) + name=`$echo "X$deplib" | $Xsed -e 's%^.*/%%'` + eval libdir=`sed -n -e 's/^libdir=\(.*\)$/\1/p' $deplib` + if test -z "$libdir"; then + $echo "$modename: \`$deplib' is not a valid libtool archive" 1>&2 + exit 1 + fi + newdependency_libs="$newdependency_libs $libdir/$name" + ;; + *) newdependency_libs="$newdependency_libs $deplib" ;; + esac + done + dependency_libs="$newdependency_libs" + newdlfiles= + for lib in $dlfiles; do + name=`$echo "X$lib" | $Xsed -e 's%^.*/%%'` + eval libdir=`sed -n -e 's/^libdir=\(.*\)$/\1/p' $lib` + if test -z "$libdir"; then + $echo "$modename: \`$lib' is not a valid libtool archive" 1>&2 + exit 1 + fi + newdlfiles="$newdlfiles $libdir/$name" + done + dlfiles="$newdlfiles" + 
newdlprefiles= + for lib in $dlprefiles; do + name=`$echo "X$lib" | $Xsed -e 's%^.*/%%'` + eval libdir=`sed -n -e 's/^libdir=\(.*\)$/\1/p' $lib` + if test -z "$libdir"; then + $echo "$modename: \`$lib' is not a valid libtool archive" 1>&2 + exit 1 + fi + newdlprefiles="$newdlprefiles $libdir/$name" + done + dlprefiles="$newdlprefiles" + fi + $rm $output + $echo > $output "\ +# $outputname - a libtool library file +# Generated by $PROGRAM - GNU $PACKAGE $VERSION$TIMESTAMP +# +# Please DO NOT delete this file! +# It is necessary for linking the library. + +# The name that we can dlopen(3). +dlname='$dlname' + +# Names of this library. +library_names='$library_names' + +# The name of the static archive. +old_library='$old_library' + +# Libraries that this one depends upon. +dependency_libs='$dependency_libs' + +# Version information for $libname. +current=$current +age=$age +revision=$revision + +# Is this an already installed library? +installed=$installed + +# Files to dlopen/dlpreopen +dlopen='$dlfiles' +dlpreopen='$dlprefiles' + +# Directory that this library needs to be installed in: +libdir='$install_libdir'" + if test "$installed" = no; then + $echo >> $output "\ +relink_command=\"$relink_command\"" + fi + done + fi + + # Do a symbolic link so that the libtool archive can be found in + # LD_LIBRARY_PATH before the program is installed. + $show "(cd $output_objdir && $rm $outputname && $LN_S ../$outputname $outputname)" + $run eval '(cd $output_objdir && $rm $outputname && $LN_S ../$outputname $outputname)' || exit $? + ;; + esac + exit 0 + ;; + + # libtool install mode + install) + modename="$modename: install" + + # There may be an optional sh(1) argument at the beginning of + # install_prog (especially on Windows NT). + if test "$nonopt" = "$SHELL" || test "$nonopt" = /bin/sh; then + # Aesthetically quote it. 
+ arg=`$echo "X$nonopt" | $Xsed -e "$sed_quote_subst"` + case "$arg" in + *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*) + arg="\"$arg\"" + ;; + esac + install_prog="$arg " + arg="$1" + shift + else + install_prog= + arg="$nonopt" + fi + + # The real first argument should be the name of the installation program. + # Aesthetically quote it. + arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"` + case "$arg" in + *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*) + arg="\"$arg\"" + ;; + esac + install_prog="$install_prog$arg" + + # We need to accept at least all the BSD install flags. + dest= + files= + opts= + prev= + install_type= + isdir=no + stripme= + for arg + do + if test -n "$dest"; then + files="$files $dest" + dest="$arg" + continue + fi + + case "$arg" in + -d) isdir=yes ;; + -f) prev="-f" ;; + -g) prev="-g" ;; + -m) prev="-m" ;; + -o) prev="-o" ;; + -s) + stripme=" -s" + continue + ;; + -*) ;; + + *) + # If the previous option needed an argument, then skip it. + if test -n "$prev"; then + prev= + else + dest="$arg" + continue + fi + ;; + esac + + # Aesthetically quote the argument. + arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"` + case "$arg" in + *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*) + arg="\"$arg\"" + ;; + esac + install_prog="$install_prog $arg" + done + + if test -z "$install_prog"; then + $echo "$modename: you must specify an install program" 1>&2 + $echo "$help" 1>&2 + exit 1 + fi + + if test -n "$prev"; then + $echo "$modename: the \`$prev' option requires an argument" 1>&2 + $echo "$help" 1>&2 + exit 1 + fi + + if test -z "$files"; then + if test -z "$dest"; then + $echo "$modename: no file or destination specified" 1>&2 + else + $echo "$modename: you must specify a destination" 1>&2 + fi + $echo "$help" 1>&2 + exit 1 + fi + + # Strip any trailing slash from the destination. + dest=`$echo "X$dest" | $Xsed -e 's%/$%%'` + + # Check to see that the destination is a directory. 
+ test -d "$dest" && isdir=yes + if test "$isdir" = yes; then + destdir="$dest" + destname= + else + destdir=`$echo "X$dest" | $Xsed -e 's%/[^/]*$%%'` + test "X$destdir" = "X$dest" && destdir=. + destname=`$echo "X$dest" | $Xsed -e 's%^.*/%%'` + + # Not a directory, so check to see that there is only one file specified. + set dummy $files + if test $# -gt 2; then + $echo "$modename: \`$dest' is not a directory" 1>&2 + $echo "$help" 1>&2 + exit 1 + fi + fi + case "$destdir" in + [\\/]* | [A-Za-z]:[\\/]*) ;; + *) + for file in $files; do + case "$file" in + *.lo) ;; + *) + $echo "$modename: \`$destdir' must be an absolute directory name" 1>&2 + $echo "$help" 1>&2 + exit 1 + ;; + esac + done + ;; + esac + + # This variable tells wrapper scripts just to set variables rather + # than running their programs. + libtool_install_magic="$magic" + + staticlibs= + future_libdirs= + current_libdirs= + for file in $files; do + + # Do each installation. + case "$file" in + *.$libext) + # Do the static libraries later. + staticlibs="$staticlibs $file" + ;; + + *.la) + # Check to see that this really is a libtool archive. + if (sed -e '2q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then : + else + $echo "$modename: \`$file' is not a valid libtool archive" 1>&2 + $echo "$help" 1>&2 + exit 1 + fi + + library_names= + old_library= + relink_command= + # If there is no directory component, then add one. + case "$file" in + */* | *\\*) . $file ;; + *) . ./$file ;; + esac + + # Add the libdir to current_libdirs if it is the destination. + if test "X$destdir" = "X$libdir"; then + case "$current_libdirs " in + *" $libdir "*) ;; + *) current_libdirs="$current_libdirs $libdir" ;; + esac + else + # Note the libdir as a future libdir. 
+ case "$future_libdirs " in + *" $libdir "*) ;; + *) future_libdirs="$future_libdirs $libdir" ;; + esac + fi + + dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`/ + test "X$dir" = "X$file/" && dir= + dir="$dir$objdir" + + if test "$hardcode_into_libs" = all; then + if test -z "$relink_command"; then + $echo "$modename: invalid libtool pseudo library \`$file'" 1>&2 + exit 1 + fi + $echo "$modename: warning: relinking \`$file'" 1>&2 + $show "$relink_command" + if $run eval "$relink_command"; then : + else + $echo "$modename: error: relink \`$file' with the above command before installing it" 1>&2 + continue + fi + fi + + # See the names of the shared library. + set dummy $library_names + if test -n "$2"; then + realname="$2" + shift + shift + + srcname="$realname" + test "$hardcode_into_libs" = all && srcname="$realname"T + + # Install the shared library and build the symlinks. + $show "$install_prog $dir/$srcname $destdir/$realname" + $run eval "$install_prog $dir/$srcname $destdir/$realname" || exit $? + if test -n "$stripme" && test -n "$striplib"; then + $show "$striplib $destdir/$realname" + $run eval "$striplib $destdir/$realname" || exit $? + fi + + if test $# -gt 0; then + # Delete the old symlinks, and create new ones. + for linkname + do + if test "$linkname" != "$realname"; then + $show "(cd $destdir && $rm $linkname && $LN_S $realname $linkname)" + $run eval "(cd $destdir && $rm $linkname && $LN_S $realname $linkname)" + fi + done + fi + + # Do each command in the postinstall commands. + lib="$destdir/$realname" + eval cmds=\"$postinstall_cmds\" + IFS="${IFS= }"; save_ifs="$IFS"; IFS='~' + for cmd in $cmds; do + IFS="$save_ifs" + $show "$cmd" + $run eval "$cmd" || exit $? + done + IFS="$save_ifs" + fi + + # Install the pseudo-library for information purposes. + name=`$echo "X$file" | $Xsed -e 's%^.*/%%'` + instname="$dir/$name"i + $show "$install_prog $instname $destdir/$name" + $run eval "$install_prog $instname $destdir/$name" || exit $? 
+ + # Maybe install the static library, too. + test -n "$old_library" && staticlibs="$staticlibs $dir/$old_library" + ;; + + *.lo) + # Install (i.e. copy) a libtool object. + + # Figure out destination file name, if it wasn't already specified. + if test -n "$destname"; then + destfile="$destdir/$destname" + else + destfile=`$echo "X$file" | $Xsed -e 's%^.*/%%'` + destfile="$destdir/$destfile" + fi + + # Deduce the name of the destination old-style object file. + case "$destfile" in + *.lo) + staticdest=`$echo "X$destfile" | $Xsed -e "$lo2o"` + ;; + *.$objext) + staticdest="$destfile" + destfile= + ;; + *) + $echo "$modename: cannot copy a libtool object to \`$destfile'" 1>&2 + $echo "$help" 1>&2 + exit 1 + ;; + esac + + # Install the libtool object if requested. + if test -n "$destfile"; then + $show "$install_prog $file $destfile" + $run eval "$install_prog $file $destfile" || exit $? + fi + + # Install the old object if enabled. + if test "$build_old_libs" = yes; then + # Deduce the name of the old-style object file. + staticobj=`$echo "X$file" | $Xsed -e "$lo2o"` + + $show "$install_prog $staticobj $staticdest" + $run eval "$install_prog \$staticobj \$staticdest" || exit $? + fi + exit 0 + ;; + + *) + # Figure out destination file name, if it wasn't already specified. + if test -n "$destname"; then + destfile="$destdir/$destname" + else + destfile=`$echo "X$file" | $Xsed -e 's%^.*/%%'` + destfile="$destdir/$destfile" + fi + + # Do a test to see if this is really a libtool program. + if (sed -e '4q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then + link_against_libtool_libs= + relink_command= + + # If there is no directory component, then add one. + case "$file" in + */* | *\\*) . $file ;; + *) . ./$file ;; + esac + + # Check the variables that should have been set. 
+ if test -z "$link_against_libtool_libs"; then + $echo "$modename: invalid libtool wrapper script \`$file'" 1>&2 + exit 1 + fi + + finalize=yes + for lib in $link_against_libtool_libs; do + # Check to see that each library is installed. + libdir= + if test -f "$lib"; then + # If there is no directory component, then add one. + case "$lib" in + */* | *\\*) . $lib ;; + *) . ./$lib ;; + esac + fi + libfile="$libdir/"`$echo "X$lib" | $Xsed -e 's%^.*/%%g'` ### testsuite: skip nested quoting test + if test -n "$libdir" && test ! -f "$libfile"; then + $echo "$modename: warning: \`$lib' has not been installed in \`$libdir'" 1>&2 + finalize=no + fi + done + + relink_command= + # If there is no directory component, then add one. + case "$file" in + */* | *\\*) . $file ;; + *) . ./$file ;; + esac + + outputname= + if test "$fast_install" = no && test -n "$relink_command"; then + if test "$finalize" = yes && test -z "$run"; then + tmpdir="/tmp" + test -n "$TMPDIR" && tmpdir="$TMPDIR" + tmpdir="$tmpdir/libtool-$$" + if $mkdir -p "$tmpdir" && chmod 700 "$tmpdir"; then : + else + $echo "$modename: error: cannot create temporary directory \`$tmpdir'" 1>&2 + continue + fi + outputname="$tmpdir/$file" + # Replace the output file specification. + relink_command=`$echo "X$relink_command" | $Xsed -e 's%@OUTPUT@%'"$outputname"'%g'` + + $show "$relink_command" + if $run eval "$relink_command"; then : + else + $echo "$modename: error: relink \`$file' with the above command before installing it" 1>&2 + ${rm}r "$tmpdir" + continue + fi + file="$outputname" + else + $echo "$modename: warning: cannot relink \`$file'" 1>&2 + fi + else + # Install the binary that we compiled earlier. + file=`$echo "X$file" | $Xsed -e "s%\([^/]*\)$%$objdir/\1%"` + fi + fi + + $show "$install_prog$stripme $file $destfile" + $run eval "$install_prog\$stripme \$file \$destfile" || exit $? 
+ test -n "$outputname" && ${rm}r "$tmpdir" + ;; + esac + done + + for file in $staticlibs; do + name=`$echo "X$file" | $Xsed -e 's%^.*/%%'` + + # Set up the ranlib parameters. + oldlib="$destdir/$name" + + $show "$install_prog $file $oldlib" + $run eval "$install_prog \$file \$oldlib" || exit $? + + # Strip the static archive only when an archive strip command + # (old_striplib, the one actually run below) is configured. + if test -n "$stripme" && test -n "$old_striplib"; then + $show "$old_striplib $oldlib" + $run eval "$old_striplib $oldlib" || exit $? + fi + + # Do each command in the postinstall commands. + eval cmds=\"$old_postinstall_cmds\" + IFS="${IFS= }"; save_ifs="$IFS"; IFS='~' + for cmd in $cmds; do + IFS="$save_ifs" + $show "$cmd" + $run eval "$cmd" || exit $? + done + IFS="$save_ifs" + done + + if test -n "$future_libdirs"; then + $echo "$modename: warning: remember to run \`$progname --finish$future_libdirs'" 1>&2 + fi + + if test -n "$current_libdirs"; then + # Maybe just do a dry run. + test -n "$run" && current_libdirs=" -n$current_libdirs" + exec $SHELL $0 --finish$current_libdirs + exit 1 + fi + + exit 0 + ;; + + # libtool finish mode + finish) + modename="$modename: finish" + libdirs="$nonopt" + admincmds= + + if test -n "$finish_cmds$finish_eval" && test -n "$libdirs"; then + for dir + do + libdirs="$libdirs $dir" + done + + for libdir in $libdirs; do + if test -n "$finish_cmds"; then + # Do each command in the finish commands. + eval cmds=\"$finish_cmds\" + IFS="${IFS= }"; save_ifs="$IFS"; IFS='~' + for cmd in $cmds; do + IFS="$save_ifs" + $show "$cmd" + $run eval "$cmd" || admincmds="$admincmds + $cmd" + done + IFS="$save_ifs" + fi + if test -n "$finish_eval"; then + # Do the single finish_eval. + eval cmds=\"$finish_eval\" + $run eval "$cmds" || admincmds="$admincmds + $cmds" + fi + done + fi + + # Exit here if they wanted silent mode.
+ test "$show" = : && exit 0 + + echo "----------------------------------------------------------------------" + echo "Libraries have been installed in:" + for libdir in $libdirs; do + echo " $libdir" + done + echo + echo "If you ever happen to want to link against installed libraries" + echo "in a given directory, LIBDIR, you must either use libtool, and" + echo "specify the full pathname of the library, or use \`-LLIBDIR'" + echo "flag during linking and do at least one of the following:" + if test -n "$shlibpath_var"; then + echo " - add LIBDIR to the \`$shlibpath_var' environment variable" + echo " during execution" + fi + if test -n "$runpath_var"; then + echo " - add LIBDIR to the \`$runpath_var' environment variable" + echo " during linking" + fi + if test -n "$hardcode_libdir_flag_spec"; then + libdir=LIBDIR + eval flag=\"$hardcode_libdir_flag_spec\" + + echo " - use the \`$flag' linker flag" + fi + if test -n "$admincmds"; then + echo " - have your system administrator run these commands:$admincmds" + fi + if test -f /etc/ld.so.conf; then + echo " - have your system administrator add LIBDIR to \`/etc/ld.so.conf'" + fi + echo + echo "See any operating system documentation about shared libraries for" + echo "more information, such as the ld(1) and ld.so(8) manual pages." + echo "----------------------------------------------------------------------" + exit 0 + ;; + + # libtool execute mode + execute) + modename="$modename: execute" + + # The first argument is the command name. + cmd="$nonopt" + if test -z "$cmd"; then + # Usage errors go to stderr, as in the other failure paths of this mode. + $echo "$modename: you must specify a COMMAND" 1>&2 + $echo "$help" 1>&2 + exit 1 + fi + + # Handle -dlopen flags immediately. + for file in $execute_dlfiles; do + if test ! -f "$file"; then + $echo "$modename: \`$file' is not a file" 1>&2 + $echo "$help" 1>&2 + exit 1 + fi + + dir= + case "$file" in + *.la) + # Check to see that this really is a libtool archive.
+ if (sed -e '2q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then : + else + $echo "$modename: \`$lib' is not a valid libtool archive" 1>&2 + $echo "$help" 1>&2 + exit 1 + fi + + # Read the libtool library. + dlname= + library_names= + + # If there is no directory component, then add one. + case "$file" in + */* | *\\*) . $file ;; + *) . ./$file ;; + esac + + # Skip this library if it cannot be dlopened. + if test -z "$dlname"; then + # Warn if it was a shared library. + test -n "$library_names" && $echo "$modename: warning: \`$file' was not linked with \`-export-dynamic'" + continue + fi + + dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'` + test "X$dir" = "X$file" && dir=. + + if test -f "$dir/$objdir/$dlname"; then + dir="$dir/$objdir" + else + $echo "$modename: cannot find \`$dlname' in \`$dir' or \`$dir/$objdir'" 1>&2 + exit 1 + fi + ;; + + *.lo) + # Just add the directory containing the .lo file. + dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'` + test "X$dir" = "X$file" && dir=. + ;; + + *) + $echo "$modename: warning \`-dlopen' is ignored for non-libtool libraries and objects" 1>&2 + continue + ;; + esac + + # Get the absolute pathname. + absdir=`cd "$dir" && pwd` + test -n "$absdir" && dir="$absdir" + + # Now add the directory to shlibpath_var. + if eval "test -z \"\$$shlibpath_var\""; then + eval "$shlibpath_var=\"\$dir\"" + else + eval "$shlibpath_var=\"\$dir:\$$shlibpath_var\"" + fi + done + + # This variable tells wrapper scripts just to set shlibpath_var + # rather than running their programs. + libtool_execute_magic="$magic" + + # Check if any of the arguments is a wrapper script. + args= + for file + do + case "$file" in + -*) ;; + *) + # Do a test to see if this is really a libtool program. + if (sed -e '4q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then + # If there is no directory component, then add one. + case "$file" in + */* | *\\*) . $file ;; + *) . ./$file ;; + esac + + # Transform arg to wrapped name. 
+ file="$progdir/$program" + fi + ;; + esac + # Quote arguments (to preserve shell metacharacters). + file=`$echo "X$file" | $Xsed -e "$sed_quote_subst"` + args="$args \"$file\"" + done + + if test -z "$run"; then + if test -n "$shlibpath_var"; then + # Export the shlibpath_var. + eval "export $shlibpath_var" + fi + + # Restore saved environment variables + if test "${save_LC_ALL+set}" = set; then + LC_ALL="$save_LC_ALL"; export LC_ALL + fi + if test "${save_LANG+set}" = set; then + LANG="$save_LANG"; export LANG + fi + + # Now actually exec the command. + eval "exec \$cmd$args" + + # Only reached if exec failed; report the real command on stderr. + $echo "$modename: cannot exec $cmd$args" 1>&2 + exit 1 + else + # Display what would be done. + if test -n "$shlibpath_var"; then + eval "\$echo \"\$shlibpath_var=\$$shlibpath_var\"" + $echo "export $shlibpath_var" + fi + $echo "$cmd$args" + exit 0 + fi + ;; + + # libtool clean and uninstall mode + clean | uninstall) + modename="$modename: $mode" + rm="$nonopt" + files= + + # This variable tells wrapper scripts just to set variables rather + # than running their programs. + libtool_install_magic="$magic" + + for arg + do + case "$arg" in + -*) rm="$rm $arg" ;; + *) files="$files $arg" ;; + esac + done + + if test -z "$rm"; then + $echo "$modename: you must specify an RM program" 1>&2 + $echo "$help" 1>&2 + exit 1 + fi + + # Remember the configured objdir: the loop below derives a per-file + # objdir from it and must not build on the previous file's value. + origobjdir="$objdir" + for file in $files; do + dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'` + if test "X$dir" = "X$file"; then + dir=. + objdir="$origobjdir" + else + objdir="$dir/$origobjdir" + fi + name=`$echo "X$file" | $Xsed -e 's%^.*/%%'` + test $mode = uninstall && objdir="$dir" + + rmfiles="$file" + + case "$name" in + *.la) + # Possibly a libtool archive, so verify it. + if (sed -e '2q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then + . $dir/$name + + # Delete the libtool libraries and symlinks.
+ for n in $library_names; do + rmfiles="$rmfiles $objdir/$n" + done + test -n "$old_library" && rmfiles="$rmfiles $objdir/$old_library" + test $mode = clean && rmfiles="$rmfiles $objdir/$name $objdir/${name}i" + + if test $mode = uninstall; then + if test -n "$library_names"; then + # Do each command in the postuninstall commands. + eval cmds=\"$postuninstall_cmds\" + IFS="${IFS= }"; save_ifs="$IFS"; IFS='~' + for cmd in $cmds; do + IFS="$save_ifs" + $show "$cmd" + $run eval "$cmd" + done + IFS="$save_ifs" + fi + + if test -n "$old_library"; then + # Do each command in the old_postuninstall commands. + eval cmds=\"$old_postuninstall_cmds\" + IFS="${IFS= }"; save_ifs="$IFS"; IFS='~' + for cmd in $cmds; do + IFS="$save_ifs" + $show "$cmd" + $run eval "$cmd" + done + IFS="$save_ifs" + fi + # FIXME: should reinstall the best remaining shared library. + fi + fi + ;; + + *.lo) + if test "$build_old_libs" = yes; then + oldobj=`$echo "X$name" | $Xsed -e "$lo2o"` + rmfiles="$rmfiles $dir/$oldobj" + fi + ;; + + *) + # Do a test to see if this is a libtool program. + if test $mode = clean && + (sed -e '4q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then + relink_command= + # Source the wrapper via $dir/$name: $file already contains the + # directory part, so $dir/$file would repeat it. + . $dir/$name + + rmfiles="$rmfiles $objdir/$name $objdir/${name}S.${objext}" + if test "$fast_install" = yes && test -n "$relink_command"; then + rmfiles="$rmfiles $objdir/lt-$name" + fi + fi + ;; + esac + $show "$rm $rmfiles" + $run $rm $rmfiles + done + exit 0 + ;; + + "") + $echo "$modename: you must specify a MODE" 1>&2 + $echo "$generic_help" 1>&2 + exit 1 + ;; + esac + + $echo "$modename: invalid operation mode \`$mode'" 1>&2 + $echo "$generic_help" 1>&2 + exit 1 +fi # test -z "$show_help" + +# We need to display help for each of the modes. +case "$mode" in +"") $echo \ +"Usage: $modename [OPTION]... [MODE-ARG]... + +Provide generalized library-building support services.
+ + --config show all configuration variables + --debug enable verbose shell tracing +-n, --dry-run display commands without modifying any files + --features display basic configuration information and exit + --finish same as \`--mode=finish' + --help display this help message and exit + --mode=MODE use operation mode MODE [default=inferred from MODE-ARGS] + --quiet same as \`--silent' + --silent don't print informational messages + --version print version information + +MODE must be one of the following: + + clean remove files from the build directory + compile compile a source file into a libtool object + execute automatically set library path, then run a program + finish complete the installation of libtool libraries + install install libraries or executables + link create a library or an executable + uninstall remove libraries from an installed directory + +MODE-ARGS vary depending on the MODE. Try \`$modename --help --mode=MODE' for +a more detailed description of MODE." + exit 0 + ;; + +clean) + $echo \ +"Usage: $modename [OPTION]... --mode=clean RM [RM-OPTION]... FILE... + +Remove files from the build directory. + +RM is the name of the program to use to delete files associated with each FILE +(typically \`/bin/rm'). RM-OPTIONS are options (such as \`-f') to be passed +to RM. + +If FILE is a libtool library, object or program, all the files associated +with it are deleted. Otherwise, only FILE itself is deleted using RM." + ;; + +compile) + $echo \ +"Usage: $modename [OPTION]... --mode=compile COMPILE-COMMAND... SOURCEFILE + +Compile a source file into a libtool library object. + +This mode accepts the following additional options: + + -o OUTPUT-FILE set the output file name to OUTPUT-FILE + -static always build a \`.o' file suitable for static linking + +COMPILE-COMMAND is a command to be used in creating a \`standard' object file +from the given SOURCEFILE. 
+ +The output file name is determined by removing the directory component from +SOURCEFILE, then substituting the C source code suffix \`.c' with the +library object suffix, \`.lo'." + ;; + +execute) + $echo \ +"Usage: $modename [OPTION]... --mode=execute COMMAND [ARGS]... + +Automatically set library path, then run a program. + +This mode accepts the following additional options: + + -dlopen FILE add the directory containing FILE to the library path + +This mode sets the library path environment variable according to \`-dlopen' +flags. + +If any of the ARGS are libtool executable wrappers, then they are translated +into their corresponding uninstalled binary, and any of their required library +directories are added to the library path. + +Then, COMMAND is executed, with ARGS as arguments." + ;; + +finish) + $echo \ +"Usage: $modename [OPTION]... --mode=finish [LIBDIR]... + +Complete the installation of libtool libraries. + +Each LIBDIR is a directory that contains libtool libraries. + +The commands that this mode executes may require superuser privileges. Use +the \`--dry-run' option if you just want to see what would be executed." + ;; + +install) + $echo \ +"Usage: $modename [OPTION]... --mode=install INSTALL-COMMAND... + +Install executables or libraries. + +INSTALL-COMMAND is the installation command. The first component should be +either the \`install' or \`cp' program. + +The rest of the components are interpreted as arguments to that command (only +BSD-compatible install options are recognized)." + ;; + +link) + $echo \ +"Usage: $modename [OPTION]... --mode=link LINK-COMMAND... + +Link object files or libraries together to form another library, or to +create an executable program. + +LINK-COMMAND is a command using the C compiler that you would use to create +a program from several object files. 
+ +The following components of LINK-COMMAND are treated specially: + + -all-static do not do any dynamic linking at all + -avoid-version do not add a version suffix if possible + -dlopen FILE \`-dlpreopen' FILE if it cannot be dlopened at runtime + -dlpreopen FILE link in FILE and add its symbols to lt_preloaded_symbols + -export-dynamic allow symbols from OUTPUT-FILE to be resolved with dlsym(3) + -export-symbols SYMFILE + try to export only the symbols listed in SYMFILE + -export-symbols-regex REGEX + try to export only the symbols matching REGEX + -LLIBDIR search LIBDIR for required installed libraries + -lNAME OUTPUT-FILE requires the installed library libNAME + -module build a library that can dlopened + -no-fast-install disable the fast-install mode + -no-install link a not-installable executable + -no-undefined declare that a library does not refer to external symbols + -o OUTPUT-FILE create OUTPUT-FILE from the specified objects + -release RELEASE specify package release information + -rpath LIBDIR the created library will eventually be installed in LIBDIR + -R[ ]LIBDIR add LIBDIR to the runtime path of programs and libraries + -static do not do any dynamic linking of libtool libraries + -version-info CURRENT[:REVISION[:AGE]] + specify library version info [each variable defaults to 0] + +All other options (arguments beginning with \`-') are ignored. + +Every other argument is treated as a filename. Files ending in \`.la' are +treated as uninstalled libtool libraries, other files are standard or library +object files. + +If the OUTPUT-FILE ends in \`.la', then a libtool library is created, +only library objects (\`.lo' files) may be specified, and \`-rpath' is +required, except when creating a convenience library. + +If OUTPUT-FILE ends in \`.a' or \`.lib', then a standard library is created +using \`ar' and \`ranlib', or on Windows using \`lib'. 
+ +If OUTPUT-FILE ends in \`.lo' or \`.${objext}', then a reloadable object file +is created, otherwise an executable program is created." + ;; + +uninstall) + $echo \ +"Usage: $modename [OPTION]... --mode=uninstall RM [RM-OPTION]... FILE... + +Remove libraries from an installation directory. + +RM is the name of the program to use to delete files associated with each FILE +(typically \`/bin/rm'). RM-OPTIONS are options (such as \`-f') to be passed +to RM. + +If FILE is a libtool library, all the files associated with it are deleted. +Otherwise, only FILE itself is deleted using RM." + ;; + +*) + $echo "$modename: invalid operation mode \`$mode'" 1>&2 + $echo "$help" 1>&2 + exit 1 + ;; +esac + +echo +$echo "Try \`$modename --help' for more information about other modes." + +exit 0 + +# Local Variables: +# mode:shell-script +# sh-indentation:2 +# End: diff --git a/ghc/rts/gmp/mdate-sh b/ghc/rts/gmp/mdate-sh new file mode 100644 index 0000000..37171f2 --- /dev/null +++ b/ghc/rts/gmp/mdate-sh @@ -0,0 +1,92 @@ +#!/bin/sh +# Get modification time of a file or directory and pretty-print it. +# Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc. +# written by Ulrich Drepper , June 1995 +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +# Prevent date giving response in another language. 
+LANG=C +export LANG +LC_ALL=C +export LC_ALL +LC_TIME=C +export LC_TIME + +# Get the extended ls output of the file or directory. +# On HPUX /bin/sh, "set" interprets "-rw-r--r--" as options, so the "x" below. +if ls -L /dev/null 1>/dev/null 2>&1; then + set - x`ls -L -l -d $1` +else + set - x`ls -l -d $1` +fi +# The month is at least the fourth argument +# (3 shifts here, the next inside the loop). +shift +shift +shift + +# Find the month. Next argument is day, followed by the year or time. +month= +until test $month +do + shift + case $1 in + Jan) month=January; nummonth=1;; + Feb) month=February; nummonth=2;; + Mar) month=March; nummonth=3;; + Apr) month=April; nummonth=4;; + May) month=May; nummonth=5;; + Jun) month=June; nummonth=6;; + Jul) month=July; nummonth=7;; + Aug) month=August; nummonth=8;; + Sep) month=September; nummonth=9;; + Oct) month=October; nummonth=10;; + Nov) month=November; nummonth=11;; + Dec) month=December; nummonth=12;; + esac +done + +day=$2 + +# Here we have to deal with the problem that the ls output gives either +# the time of day or the year. +case $3 in + *:*) set `date`; eval year=\$$# + case $2 in + Jan) nummonthtod=1;; + Feb) nummonthtod=2;; + Mar) nummonthtod=3;; + Apr) nummonthtod=4;; + May) nummonthtod=5;; + Jun) nummonthtod=6;; + Jul) nummonthtod=7;; + Aug) nummonthtod=8;; + Sep) nummonthtod=9;; + Oct) nummonthtod=10;; + Nov) nummonthtod=11;; + Dec) nummonthtod=12;; + esac + # For the first six month of the year the time notation can also + # be used for files modified in the last year. + if (expr $nummonth \> $nummonthtod) > /dev/null; + then + year=`expr $year - 1` + fi;; + *) year=$3;; +esac + +# The result. +echo $day $month $year diff --git a/ghc/rts/gmp/memory.c b/ghc/rts/gmp/memory.c index 9ee1da8..9df440c 100644 --- a/ghc/rts/gmp/memory.c +++ b/ghc/rts/gmp/memory.c @@ -1,25 +1,26 @@ /* Memory allocation routines. -Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc. 
+Copyright (C) 1991, 1993, 1994, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include +#include /* for malloc, realloc, free */ #include "gmp.h" #include "gmp-impl.h" @@ -28,16 +29,12 @@ MA 02111-1307, USA. */ #define static #endif -#if __STDC__ -void * (*_mp_allocate_func) (size_t) = _mp_default_allocate; -void * (*_mp_reallocate_func) (void *, size_t, size_t) + +void * (*_mp_allocate_func) _PROTO ((size_t)) = _mp_default_allocate; +void * (*_mp_reallocate_func) _PROTO ((void *, size_t, size_t)) = _mp_default_reallocate; -void (*_mp_free_func) (void *, size_t) = _mp_default_free; -#else -void * (*_mp_allocate_func) () = _mp_default_allocate; -void * (*_mp_reallocate_func) () = _mp_default_reallocate; -void (*_mp_free_func) () = _mp_default_free; -#endif +void (*_mp_free_func) _PROTO ((void *, size_t)) = _mp_default_free; + /* Default allocation functions. 
In case of failure to allocate/reallocate an error message is written to stderr and the program aborts. */ @@ -51,14 +48,27 @@ _mp_default_allocate (size) #endif { void *ret; - +#ifdef DEBUG + size_t req_size = size; + size += 2 * BYTES_PER_MP_LIMB; +#endif ret = malloc (size); if (ret == 0) { perror ("cannot allocate in gmp"); abort (); } - + +#ifdef DEBUG + { + mp_ptr p = ret; + p++; + p[-1] = (0xdeadbeef << 31) + 0xdeafdeed; + if (req_size % BYTES_PER_MP_LIMB == 0) + p[req_size / BYTES_PER_MP_LIMB] = ~((0xdeadbeef << 31) + 0xdeafdeed); + ret = p; + } +#endif return ret; } @@ -74,6 +84,29 @@ _mp_default_reallocate (oldptr, old_size, new_size) { void *ret; +#ifdef DEBUG + size_t req_size = new_size; + + if (old_size != 0) + { + mp_ptr p = oldptr; + if (p[-1] != (0xdeadbeef << 31) + 0xdeafdeed) + { + fprintf (stderr, "gmp: (realloc) data clobbered before allocation block\n"); + abort (); + } + if (old_size % BYTES_PER_MP_LIMB == 0) + if (p[old_size / BYTES_PER_MP_LIMB] != ~((0xdeadbeef << 31) + 0xdeafdeed)) + { + fprintf (stderr, "gmp: (realloc) data clobbered after allocation block\n"); + abort (); + } + oldptr = p - 1; + } + + new_size += 2 * BYTES_PER_MP_LIMB; +#endif + ret = realloc (oldptr, new_size); if (ret == 0) { @@ -81,6 +114,16 @@ _mp_default_reallocate (oldptr, old_size, new_size) abort (); } +#ifdef DEBUG + { + mp_ptr p = ret; + p++; + p[-1] = (0xdeadbeef << 31) + 0xdeafdeed; + if (req_size % BYTES_PER_MP_LIMB == 0) + p[req_size / BYTES_PER_MP_LIMB] = ~((0xdeadbeef << 31) + 0xdeafdeed); + ret = p; + } +#endif return ret; } @@ -93,5 +136,25 @@ _mp_default_free (blk_ptr, blk_size) size_t blk_size; #endif { +#ifdef DEBUG + { + mp_ptr p = blk_ptr; + if (blk_size != 0) + { + if (p[-1] != (0xdeadbeef << 31) + 0xdeafdeed) + { + fprintf (stderr, "gmp: (free) data clobbered before allocation block\n"); + abort (); + } + if (blk_size % BYTES_PER_MP_LIMB == 0) + if (p[blk_size / BYTES_PER_MP_LIMB] != ~((0xdeadbeef << 31) + 0xdeafdeed)) + { + fprintf (stderr, 
"gmp: (free) data clobbered after allocation block\n"); + abort (); + } + } + blk_ptr = p - 1; + } +#endif free (blk_ptr); } diff --git a/ghc/rts/gmp/missing b/ghc/rts/gmp/missing new file mode 100644 index 0000000..c60e9d7 --- /dev/null +++ b/ghc/rts/gmp/missing @@ -0,0 +1,244 @@ +#! /bin/sh +# Common stub for a few missing GNU programs while installing. +# Copyright (C) 1996, 1997, 1999 Free Software Foundation, Inc. +# Originally by Fran,cois Pinard , 1996. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA +# 02111-1307, USA. + +if test $# -eq 0; then + echo 1>&2 "Try \`$0 --help' for more information" + exit 1 +fi + +run=: + +case "$1" in +--run) + # Try to run requested program, and just exit if it succeeds. + run= + shift + "$@" && exit 0 + ;; +esac + +# If it does not exist, or fails to run (possibly an outdated version), +# try to emulate it. +case "$1" in + + -h|--h|--he|--hel|--help) + echo "\ +$0 [OPTION]... PROGRAM [ARGUMENT]... + +Handle \`PROGRAM [ARGUMENT]...' for when PROGRAM is missing, or return an +error status if there is no known handling for PROGRAM. 
+ +Options: + -h, --help display this help and exit + -v, --version output version information and exit + --run try to run the given command, and emulate it if it fails + +Supported PROGRAM values: + aclocal touch file \`aclocal.m4' + autoconf touch file \`configure' + autoheader touch file \`config.h.in' + automake touch all \`Makefile.in' files + bison create \`y.tab.[ch]', if possible, from existing .[ch] + flex create \`lex.yy.c', if possible, from existing .c + lex create \`lex.yy.c', if possible, from existing .c + makeinfo touch the output file + tar try tar, gnutar, gtar, then tar without non-portable flags + yacc create \`y.tab.[ch]', if possible, from existing .[ch]" + ;; + + -v|--v|--ve|--ver|--vers|--versi|--versio|--version) + echo "missing 0.2 - GNU automake" + ;; + + -*) + echo 1>&2 "$0: Unknown \`$1' option" + echo 1>&2 "Try \`$0 --help' for more information" + exit 1 + ;; + + aclocal) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified \`acinclude.m4' or \`configure.in'. You might want + to install the \`Automake' and \`Perl' packages. Grab them from + any GNU archive site." + touch aclocal.m4 + ;; + + autoconf) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified \`configure.in'. You might want to install the + \`Autoconf' and \`GNU m4' packages. Grab them from any GNU + archive site." + touch configure + ;; + + autoheader) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified \`acconfig.h' or \`configure.in'. You might want + to install the \`Autoconf' and \`GNU m4' packages. Grab them + from any GNU archive site." 
+ files=`sed -n 's/^[ ]*A[CM]_CONFIG_HEADER(\([^)]*\)).*/\1/p' configure.in` + test -z "$files" && files="config.h" + touch_files= + for f in $files; do + case "$f" in + *:*) touch_files="$touch_files "`echo "$f" | + sed -e 's/^[^:]*://' -e 's/:.*//'`;; + *) touch_files="$touch_files $f.in";; + esac + done + touch $touch_files + ;; + + automake) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified \`Makefile.am', \`acinclude.m4' or \`configure.in'. + You might want to install the \`Automake' and \`Perl' packages. + Grab them from any GNU archive site." + find . -type f -name Makefile.am -print | + sed 's/\.am$/.in/' | + while read f; do touch "$f"; done + ;; + + bison|yacc) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified a \`.y' file. You may need the \`Bison' package + in order for those modifications to take effect. You can get + \`Bison' from any GNU archive site." + rm -f y.tab.c y.tab.h + if [ $# -ne 1 ]; then + eval LASTARG="\${$#}" + case "$LASTARG" in + *.y) + SRCFILE=`echo "$LASTARG" | sed 's/y$/c/'` + if [ -f "$SRCFILE" ]; then + cp "$SRCFILE" y.tab.c + fi + SRCFILE=`echo "$LASTARG" | sed 's/y$/h/'` + if [ -f "$SRCFILE" ]; then + cp "$SRCFILE" y.tab.h + fi + ;; + esac + fi + if [ ! -f y.tab.h ]; then + echo >y.tab.h + fi + if [ ! -f y.tab.c ]; then + echo 'main() { return 0; }' >y.tab.c + fi + ;; + + lex|flex) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified a \`.l' file. You may need the \`Flex' package + in order for those modifications to take effect. You can get + \`Flex' from any GNU archive site." + rm -f lex.yy.c + if [ $# -ne 1 ]; then + eval LASTARG="\${$#}" + case "$LASTARG" in + *.l) + SRCFILE=`echo "$LASTARG" | sed 's/l$/c/'` + if [ -f "$SRCFILE" ]; then + cp "$SRCFILE" lex.yy.c + fi + ;; + esac + fi + if [ ! 
-f lex.yy.c ]; then + echo 'main() { return 0; }' >lex.yy.c + fi + ;; + + makeinfo) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified a \`.texi' or \`.texinfo' file, or any other file + indirectly affecting the aspect of the manual. The spurious + call might also be the consequence of using a buggy \`make' (AIX, + DU, IRIX). You might want to install the \`Texinfo' package or + the \`GNU make' package. Grab either from any GNU archive site." + file=`echo "$*" | sed -n 's/.*-o \([^ ]*\).*/\1/p'` + if test -z "$file"; then + file=`echo "$*" | sed 's/.* \([^ ]*\) *$/\1/'` + file=`sed -n '/^@setfilename/ { s/.* \([^ ]*\) *$/\1/; p; q; }' $file` + fi + touch $file + ;; + + tar) + shift + if test -n "$run"; then + echo 1>&2 "ERROR: \`tar' requires --run" + exit 1 + fi + + # We have already tried tar in the generic part. + # Look for gnutar/gtar before invocation to avoid ugly error + # messages. + if (gnutar --version > /dev/null 2>&1); then + gnutar ${1+"$@"} && exit 0 + fi + if (gtar --version > /dev/null 2>&1); then + gtar ${1+"$@"} && exit 0 + fi + firstarg="$1" + if shift; then + case "$firstarg" in + *o*) + firstarg=`echo "$firstarg" | sed s/o//` + tar "$firstarg" ${1+"$@"} && exit 0 + ;; + esac + case "$firstarg" in + *h*) + firstarg=`echo "$firstarg" | sed s/h//` + tar "$firstarg" ${1+"$@"} && exit 0 + ;; + esac + fi + + echo 1>&2 "\ +WARNING: I can't seem to be able to run \`tar' with the given arguments. + You may want to install GNU tar or Free paxutils, or check the + command line arguments." + exit 1 + ;; + + *) + echo 1>&2 "\ +WARNING: \`$1' is needed, and you do not seem to have it handy on your + system. You might have modified some files without having the + proper tools for further handling them. Check the \`README' file, + it often tells you about the needed prerequirements for installing + this package. 
You may also peek at any GNU archive site, in case + some other package would contain this missing \`$1' program." + exit 1 + ;; +esac + +exit 0 diff --git a/ghc/rts/gmp/mp_bpl.c b/ghc/rts/gmp/mp_bpl.c index d817ac8..df8b03e 100644 --- a/ghc/rts/gmp/mp_bpl.c +++ b/ghc/rts/gmp/mp_bpl.c @@ -1,4 +1,27 @@ +/* +Copyright (C) 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + #include "gmp.h" #include "gmp-impl.h" const int mp_bits_per_limb = BITS_PER_MP_LIMB; +const int __gmp_0 = 0; +int __gmp_junk; diff --git a/ghc/rts/gmp/mp_clz_tab.c b/ghc/rts/gmp/mp_clz_tab.c index 6fd7e90..1bbd1d6 100644 --- a/ghc/rts/gmp/mp_clz_tab.c +++ b/ghc/rts/gmp/mp_clz_tab.c @@ -1,32 +1,28 @@ /* __clz_tab -- support for longlong.h -Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1996, 1997 Free Software Foundation, Inc. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ -#if 0 #include "gmp.h" #include "gmp-impl.h" -#endif -#if 0 const -#endif unsigned char __clz_tab[] = { 0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, diff --git a/ghc/rts/gmp/mp_minv_tab.c b/ghc/rts/gmp/mp_minv_tab.c new file mode 100644 index 0000000..4afff85 --- /dev/null +++ b/ghc/rts/gmp/mp_minv_tab.c @@ -0,0 +1,50 @@ +/* A table of data supporting modlimb_invert(). + + THE CONTENTS OF THIS FILE ARE FOR INTERNAL USE AND MAY CHANGE + INCOMPATIBLY OR DISAPPEAR IN A FUTURE GNU MP RELEASE. */ + +/* +Copyright (C) 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + + +/* modlimb_invert_table[i] is the multiplicative inverse of 2*i+1 mod 256, + ie. (modlimb_invert_table[i] * (2*i+1)) % 256 == 1 */ + +const unsigned char modlimb_invert_table[128] = { + 0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF, + 0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF, + 0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF, + 0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF, + 0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF, + 0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F, + 0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F, + 0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F, + 0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F, + 0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F, + 0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F, + 0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F, + 0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F, + 0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F, + 0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F, + 0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF +}; diff --git a/ghc/rts/gmp/mp_set_fns.c b/ghc/rts/gmp/mp_set_fns.c index 35a462c..55d4d9d 100644 --- a/ghc/rts/gmp/mp_set_fns.c +++ b/ghc/rts/gmp/mp_set_fns.c @@ -6,16 +6,16 @@ Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpn/Makefile.am b/ghc/rts/gmp/mpn/Makefile.am new file mode 100644 index 0000000..1c49ccd --- /dev/null +++ b/ghc/rts/gmp/mpn/Makefile.am @@ -0,0 +1,94 @@ +## Process this file with automake to generate Makefile.in + +# Copyright (C) 1996, 1998, 1999, 2000 Free Software Foundation, Inc. +# +# This file is part of the GNU MP Library. +# +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. +# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +AUTOMAKE_OPTIONS = gnu no-dependencies +SUBDIRS = tests + +CPP = @CPP@ + +# -DOPERATION_$* tells multi-function files which function to produce. +INCLUDES = -I$(top_srcdir) -DOPERATION_$* + +GENERIC_SOURCES = mp_bases.c +OFILES = @mpn_objects@ + +noinst_LTLIBRARIES = libmpn.la +libmpn_la_SOURCES = $(GENERIC_SOURCES) +libmpn_la_LIBADD = $(OFILES) +libmpn_la_DEPENDENCIES = $(OFILES) + +TARG_DIST = a29k alpha arm clipper cray generic hppa i960 lisp m68k m88k \ + mips2 mips3 ns32k pa64 pa64w power powerpc32 powerpc64 pyr sh sparc32 \ + sparc64 thumb vax x86 z8000 z8000x + +EXTRA_DIST = underscore.h asm-defs.m4 $(TARG_DIST) + +# COMPILE minus CC. FIXME: Really pass *_CFLAGS to CPP? +COMPILE_FLAGS = \ + $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) + +SUFFIXES = .s .S .asm + +# *.s are not preprocessed at all. +.s.o: + $(CCAS) $(COMPILE_FLAGS) $< +.s.obj: + $(CCAS) $(COMPILE_FLAGS) `cygpath -w $<` +.s.lo: + $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) $< + +# *.S are preprocessed with CPP. +.S.o: + $(CPP) $(COMPILE_FLAGS) $< | grep -v '^#' >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + rm -f tmp-$*.s +.S.obj: + $(CPP) $(COMPILE_FLAGS) `cygpath -w $<` | grep -v '^#' >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + rm -f tmp-$*.s + +# We have to rebuild the static object file without passing -DPIC to +# preprocessor. The overhead cost is one extra assemblation. FIXME: +# Teach libtool how to assemble with a preprocessor pass (CPP or m4). 
+ +.S.lo: + $(CPP) $(COMPILE_FLAGS) -DPIC $< | grep -v '^#' >tmp-$*.s + $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(CPP) $(COMPILE_FLAGS) $< | grep -v '^#' >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $*.o + rm -f tmp-$*.s + +# *.m4 are preprocessed with m4. +.asm.o: + $(M4) -DOPERATION_$* $< >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + rm -f tmp-$*.s +.asm.obj: + $(M4) -DOPERATION_$* `cygpath -w $<` >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + rm -f tmp-$*.s +.asm.lo: + $(M4) -DPIC -DOPERATION_$* $< >tmp-$*.s + $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(M4) -DOPERATION_$* $< >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $*.o + rm -f tmp-$*.s diff --git a/ghc/rts/gmp/mpn/README b/ghc/rts/gmp/mpn/README index 3da559e..7453c9d 100644 --- a/ghc/rts/gmp/mpn/README +++ b/ghc/rts/gmp/mpn/README @@ -1,15 +1,13 @@ This directory contains all code for the mpn layer of GMP. -Most subdirectories contain machine-dependent code, written in assembly or -C. The `generic' subdirectory contains default code, used when there is no +Most subdirectories contain machine-dependent code, written in assembly or C. +The `generic' subdirectory contains default code, used when there is no machine-dependent replacement for a particular machine. -There is one subdirectory for each architecture. Note that e.g., 32-bit -sparc and 64-bit sparc cannot share any code, and are therefore considered -completely different architecture. +There is one subdirectory for each ISA family. Note that e.g., 32-bit SPARC +and 64-bit SPARC are very different ISA's, and thus cannot share any code. -A particular machine will only use code from one such subdirectory, and the -`generic' subdirectory. The architecture-specific subdirectory contains a -hierachy of directories for various architecture variants and -implementations; the top-most level contains code that runs correctly on all -variants. 
+A particular compile will only use code from one subdirectory, and the +`generic' subdirectory. The ISA-specific subdirectories contain hierachies of +directories for various architecture variants and implementations; the +top-most level contains code that runs correctly on all variants. diff --git a/ghc/rts/gmp/mpn/a29k/add_n.s b/ghc/rts/gmp/mpn/a29k/add_n.s index 74c20e3..e3ee6df 100644 --- a/ghc/rts/gmp/mpn/a29k/add_n.s +++ b/ghc/rts/gmp/mpn/a29k/add_n.s @@ -1,21 +1,21 @@ -; 29000 __mpn_add -- Add two limb vectors of the same length > 0 and store +; 29000 __gmpn_add -- Add two limb vectors of the same length > 0 and store ; sum in a third limb vector. -; Copyright (C) 1992, 1994 Free Software Foundation, Inc. +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. ; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. 
@@ -38,9 +38,9 @@ .sect .lit,lit .text .align 4 - .global ___mpn_add_n + .global ___gmpn_add_n .word 0x60000 -___mpn_add_n: +___gmpn_add_n: srl gr117,lr5,3 sub gr118,gr117,1 jmpt gr118,Ltail diff --git a/ghc/rts/gmp/mpn/a29k/addmul_1.s b/ghc/rts/gmp/mpn/a29k/addmul_1.s index 8c0ec96..f51b6d7 100644 --- a/ghc/rts/gmp/mpn/a29k/addmul_1.s +++ b/ghc/rts/gmp/mpn/a29k/addmul_1.s @@ -1,21 +1,21 @@ -; 29000 __mpn_addmul_1 -- Multiply a limb vector with a single limb and +; 29000 __gmpn_addmul_1 -- Multiply a limb vector with a single limb and ; add the product to a second limb vector. -; Copyright (C) 1992, 1994 Free Software Foundation, Inc. +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. ; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. 
@@ -31,9 +31,9 @@ .sect .lit,lit .text .align 4 - .global ___mpn_addmul_1 + .global ___gmpn_addmul_1 .word 0x60000 -___mpn_addmul_1: +___gmpn_addmul_1: sub lr4,lr4,8 jmpt lr4,Ltail const gr120,0 ; init cylimb reg diff --git a/ghc/rts/gmp/mpn/a29k/lshift.s b/ghc/rts/gmp/mpn/a29k/lshift.s index 7554e2c..93e1917 100644 --- a/ghc/rts/gmp/mpn/a29k/lshift.s +++ b/ghc/rts/gmp/mpn/a29k/lshift.s @@ -1,20 +1,20 @@ -; 29000 __mpn_lshift -- +; 29000 __gmpn_lshift -- -; Copyright (C) 1992, 1994 Free Software Foundation, Inc. +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. ; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. 
@@ -32,9 +32,9 @@ .sect .lit,lit .text .align 4 - .global ___mpn_lshift + .global ___gmpn_lshift .word 0x60000 -___mpn_lshift: +___gmpn_lshift: sll gr116,lr4,2 add lr3,gr116,lr3 add lr2,gr116,lr2 diff --git a/ghc/rts/gmp/mpn/a29k/mul_1.s b/ghc/rts/gmp/mpn/a29k/mul_1.s index 5d120f4..6bcf7ce 100644 --- a/ghc/rts/gmp/mpn/a29k/mul_1.s +++ b/ghc/rts/gmp/mpn/a29k/mul_1.s @@ -1,21 +1,21 @@ -; 29000 __mpn_mul_1 -- Multiply a limb vector with a single limb and +; 29000 __gmpn_mul_1 -- Multiply a limb vector with a single limb and ; store the product in a second limb vector. -; Copyright (C) 1992, 1994 Free Software Foundation, Inc. +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. ; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. 
@@ -31,9 +31,9 @@ .sect .lit,lit .text .align 4 - .global ___mpn_mul_1 + .global ___gmpn_mul_1 .word 0x60000 -___mpn_mul_1: +___gmpn_mul_1: sub lr4,lr4,8 jmpt lr4,Ltail const gr120,0 ; init cylimb reg diff --git a/ghc/rts/gmp/mpn/a29k/rshift.s b/ghc/rts/gmp/mpn/a29k/rshift.s index fe53b71..ea163bf 100644 --- a/ghc/rts/gmp/mpn/a29k/rshift.s +++ b/ghc/rts/gmp/mpn/a29k/rshift.s @@ -1,20 +1,20 @@ -; 29000 __mpn_rshift -- +; 29000 __gmpn_rshift -- -; Copyright (C) 1992, 1994 Free Software Foundation, Inc. +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. ; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. 
@@ -32,9 +32,9 @@ .sect .lit,lit .text .align 4 - .global ___mpn_rshift + .global ___gmpn_rshift .word 0x60000 -___mpn_rshift: +___gmpn_rshift: load 0,0,gr119,lr3 add lr3,lr3,4 diff --git a/ghc/rts/gmp/mpn/a29k/sub_n.s b/ghc/rts/gmp/mpn/a29k/sub_n.s index 3c8d610..c6b64c5 100644 --- a/ghc/rts/gmp/mpn/a29k/sub_n.s +++ b/ghc/rts/gmp/mpn/a29k/sub_n.s @@ -1,21 +1,21 @@ -; 29000 __mpn_sub -- Subtract two limb vectors of the same length > 0 and +; 29000 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and ; store difference in a third limb vector. -; Copyright (C) 1992, 1994 Free Software Foundation, Inc. +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. ; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA.
@@ -38,9 +38,9 @@ .sect .lit,lit .text .align 4 - .global ___mpn_sub_n + .global ___gmpn_sub_n .word 0x60000 -___mpn_sub_n: +___gmpn_sub_n: srl gr117,lr5,3 sub gr118,gr117,1 jmpt gr118,Ltail diff --git a/ghc/rts/gmp/mpn/a29k/submul_1.s b/ghc/rts/gmp/mpn/a29k/submul_1.s index ca2ef72..ef97d8d 100644 --- a/ghc/rts/gmp/mpn/a29k/submul_1.s +++ b/ghc/rts/gmp/mpn/a29k/submul_1.s @@ -1,21 +1,21 @@ -; 29000 __mpn_submul_1 -- Multiply a limb vector with a single limb and +; 29000 __gmpn_submul_1 -- Multiply a limb vector with a single limb and ; subtract the product from a second limb vector. -; Copyright (C) 1992, 1994 Free Software Foundation, Inc. +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. ; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. 
@@ -31,9 +31,9 @@ .sect .lit,lit .text .align 4 - .global ___mpn_submul_1 + .global ___gmpn_submul_1 .word 0x60000 -___mpn_submul_1: +___gmpn_submul_1: sub lr4,lr4,8 jmpt lr4,Ltail const gr120,0 ; init cylimb reg diff --git a/ghc/rts/gmp/mpn/a29k/udiv.s b/ghc/rts/gmp/mpn/a29k/udiv.s new file mode 100644 index 0000000..fdd53a9 --- /dev/null +++ b/ghc/rts/gmp/mpn/a29k/udiv.s @@ -0,0 +1,30 @@ +; Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + .sect .lit,lit + .text + .align 4 + .global ___udiv_qrnnd + .word 0x60000 +___udiv_qrnnd: + mtsr q,lr3 + dividu gr96,lr4,lr5 + mfsr gr116,q + jmpi lr0 + store 0,0,gr116,lr2 diff --git a/ghc/rts/gmp/mpn/a29k/umul.s b/ghc/rts/gmp/mpn/a29k/umul.s new file mode 100644 index 0000000..7741981 --- /dev/null +++ b/ghc/rts/gmp/mpn/a29k/umul.s @@ -0,0 +1,29 @@ +; Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. 
+ +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + .sect .lit,lit + .text + .align 4 + .global ___umul_ppmm + .word 0x50000 +___umul_ppmm: + multiplu gr116,lr3,lr4 + multmu gr96,lr3,lr4 + jmpi lr0 + store 0,0,gr116,lr2 diff --git a/ghc/rts/gmp/mpn/alpha/README b/ghc/rts/gmp/mpn/alpha/README index 55c0a29..744260c 100644 --- a/ghc/rts/gmp/mpn/alpha/README +++ b/ghc/rts/gmp/mpn/alpha/README @@ -1,53 +1,224 @@ This directory contains mpn functions optimized for DEC Alpha processors. +ALPHA ASSEMBLY RULES AND REGULATIONS + +The `.prologue N' pseudo op marks the end of instruction that needs +special handling by unwinding. It also says whether $27 is really +needed for computing the gp. The `.mask M' pseudo op says which +registers are saved on the stack, and at what offset in the frame. + +Cray code is very very different... + + RELEVANT OPTIMIZATION ISSUES EV4 1. This chip has very limited store bandwidth. The on-chip L1 cache is -write-through, and a cache line is transfered from the store buffer to the -off-chip L2 in as much 15 cycles on most systems. This delay hurts -mpn_add_n, mpn_sub_n, mpn_lshift, and mpn_rshift. 
+ write-through, and a cache line is transferred from the store buffer to + the off-chip L2 in as much as 15 cycles on most systems. This delay hurts + mpn_add_n, mpn_sub_n, mpn_lshift, and mpn_rshift. 2. Pairing is possible between memory instructions and integer arithmetic -instructions. + instructions. -3. mulq and umulh is documented to have a latency of 23 cycles, but 2 of -these cycles are pipelined. Thus, multiply instructions can be issued at a -rate of one each 21nd cycle. +3. mulq and umulh are documented to have a latency of 23 cycles, but 2 of + these cycles are pipelined. Thus, multiply instructions can be issued at + a rate of one each 21st cycle. EV5 1. The memory bandwidth of this chip seems excellent, both for loads and -stores. Even when the working set is larger than the on-chip L1 and L2 -caches, the perfromance remain almost unaffected. + stores. Even when the working set is larger than the on-chip L1 and L2 + caches, the performance remains almost unaffected. -2. mulq has a measured latency of 13 cycles and an issue rate of 1 each 8th -cycle. umulh has a measured latency of 15 cycles and an issue rate of 1 -each 10th cycle. But the exact timing is somewhat confusing. +2. mulq has a latency of 12 cycles and an issue rate of 1 each 8th cycle. + umulh has a measured latency of 14 cycles and an issue rate of 1 each + 10th cycle. But the exact timing is somewhat confusing. 3. mpn_add_n. With 4-fold unrolling, we need 37 instructions, whereof 12 are memory operations. This will take at least - ceil(37/2) [dual issue] + 1 [taken branch] = 20 cycles + ceil(37/2) [dual issue] + 1 [taken branch] = 19 cycles We have 12 memory cycles, plus 4 after-store conflict cycles, or 16 data - cache cycles, which should be completely hidden in the 20 issue cycles. + cache cycles, which should be completely hidden in the 19 issue cycles.
The computation is inherently serial, with these dependencies: - addq - / \ - addq cmpult - | | - cmpult | - \ / - or - I.e., there is a 4 cycle path for each limb, making 16 cycles the absolute - minimum. We could replace the `or' with a cmoveq/cmovne, which would save - a cycle on EV5, but that might waste a cycle on EV4. Also, cmov takes 2 - cycles. + + ldq ldq + \ /\ + (or) addq | + |\ / \ | + | addq cmpult + \ | | + cmpult | + \ / + or + + I.e., 3 operations are needed between carry-in and carry-out, making 12 + cycles the absolute minimum for the 4 limbs. We could replace the `or' + with a cmoveq/cmovne, which could issue one cycle earlier than the `or', + but that might waste a cycle on EV4. The total depth remains unaffected, + since cmov has a latency of 2 cycles. + addq / \ addq cmpult | \ cmpult -> cmovne -STATUS +Montgomery has a slightly different way of computing carry that requires one +less instruction, but has depth 4 (instead of the current 3). Since the +code is currently instruction issue bound, Montgomery's idea should save us +1/2 cycle per limb, or bring us down to a total of 17 cycles or 4.25 +cycles/limb. Unfortunately, this method will not be good for the EV6. + +EV6 + +Here we have a really parallel pipeline, capable of issuing up to 4 integer +instructions per cycle. One integer multiply instruction can issue each +cycle. To get optimal speed, we need to pretend we are vectorizing the code, +i.e., minimize the iterative dependencies. + +There are two dependencies to watch out for. 1) Address arithmetic +dependencies, and 2) carry propagation dependencies. + +We can avoid serializing due to address arithmetic by unrolling the loop, so +that addresses don't depend heavily on an index variable. Avoiding +serializing because of carry propagation is trickier; the ultimate performance +of the code will be determined by the number of latency cycles it takes from +accepting carry-in to a vector point until we can generate carry-out.
+ +Most integer instructions can execute in either the L0, U0, L1, or U1 +pipelines. Shifts only execute in U0 and U1, and multiply only in U1. + +CMOV instructions split into two internal instructions, CMOV1 and CMOV2, but +they execute efficiently. But CMOV splits the mapping process (see pg 2-26 in +cmpwrgd.pdf), suggesting the CMOV should always be placed as the last +instruction of an aligned 4 instruction block (?). + +Perhaps the most important issue is the latency between the L0/U0 and L1/U1 +clusters; a result obtained on either cluster has an extra cycle of latency +for consumers in the opposite cluster. Because of the dynamic nature of the +implementation, it is hard to predict where an instruction will execute. + +The shift loops need (per limb): + 1 load (Lx pipes) + 1 store (Lx pipes) + 2 shift (Ux pipes) + 1 iaddlog (Lx pipes, Ux pipes) +Obviously, since the pipes are very equally loaded, we should get 4 insn/cycle, or 1.25 cycles/limb. + +For mpn_add_n, we currently have + 2 load (Lx pipes) + 1 store (Lx pipes) + 5 iaddlog (Lx pipes, Ux pipes) + +Again, we have a perfect balance and will be limited by carry propagation +delays, currently three cycles. The superoptimizer indicates that there +might be sequences that--using a final cmov--have a carry propagation delay +of just two. Montgomery's subtraction sequence could perhaps be used, by +complementing some operands. All in all, we should get down to 2 cycles +without much problems. + +For mpn_mul_1, we could do, just like for mpn_add_n: + not newlo,notnewlo + addq cylimb,newlo,newlo || cmpult cylimb,notnewlo,cyout + addq cyout,newhi,cylimb +and get 2-cycle carry propagation. The instructions needed will be + 1 ld (Lx pipes) + 1 st (Lx pipes) + 2 mul (U1 pipe) + 4 iaddlog (Lx pipes, Ux pipes) +issue1: addq not mul ld +issue2: cmpult addq mul st +Conclusion: no cluster delays and 2-cycle carry delays will give us 2 cycles/limb! + +Last, we have mpn_addmul_1.
Almost certainly, we will get down to 3 +cycles/limb, which would be absolutely awesome. + +Old, perhaps obsolete addmul_1 dependency diagram (needs 175 columns wide screen): + i + s + s i + u n + e s + d t + r + i u +l n c +i s t +v t i +e r o + u n +v c +a t t +l i y +u o p +e n e +s s s + issue + in + cycle + -1 ldq + / \ + 0 | \ + | \ + 1 | | + | | + 2 | | ldq + | | / \ + 3 | mulq | \ + | \ | \ + 4 umulh \ | | + | | | | + 5 | | | | ldq + | | | | / \ + 4calm 6 | | ldq | mulq | \ + | | / | \ | \ + 4casm 7 | | / umulh \ | | +6 | || | | | | + 3aal 8 | || | | | | ldq +7 | || | | | | / \ + 4calm 9 | || | | ldq | mulq | \ +9 | || | | / | \ | \ + 4casm 10 | || | | / umulh \ | | +9 | || | || | | | | + 3aal 11 | addq | || | | | | ldq +9 | // \ | || | | | | / \ + 4calm 12 \ cmpult addq<-cy | || | | ldq | mulq | \ +13 \ / // \ | || | | / | \ | \ + 4casm 13 addq cmpult stq | || | | / umulh \ | | +11 \ / | || | || | | | | + 3aal 14 addq | addq | || | | | | ldq +10 \ | // \ | || | | | | / \ + 4calm 15 cy ----> \ cmpult addq<-cy | || | | ldq | mulq | \ +13 \ / // \ | || | | / | \ | \ + 4casm 16 addq cmpult stq | || | | / umulh \ | | +11 \ / | || | || | | | | + 3aal 17 addq | addq | || | | | | +10 \ | // \ | || | | | | + 4calm 18 cy ----> \ cmpult addq<-cy | || | | ldq | mulq +13 \ / // \ | || | | / | \ + 4casm 19 addq cmpult stq | || | | / umulh \ +11 \ / | || | || | | + 3aal 20 addq | addq | || | | +10 \ | // \ | || | | + 4calm 21 cy ----> \ cmpult addq<-cy | || | | ldq + \ / // \ | || | | / + 22 addq cmpult stq | || | | / + \ / | || | || + 23 addq | addq | || + \ | // \ | || + 24 cy ----> \ cmpult addq<-cy | || + \ / // \ | || + 25 addq cmpult stq | || + \ / | || + 26 addq | addq + \ | // \ + 27 cy ----> \ cmpult addq<-cy + \ / // \ + 28 addq cmpult stq + \ / +As many as 6 consecutive points will be under execution simultaneously, or if we addq +schedule loads even further away, maybe 7 or 8. But the number of live quantities \ +is reasonable, and can easily be satisfied. 
cy ----> diff --git a/ghc/rts/gmp/mpn/alpha/add_n.asm b/ghc/rts/gmp/mpn/alpha/add_n.asm new file mode 100644 index 0000000..08d6a9f --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/add_n.asm @@ -0,0 +1,114 @@ +dnl Alpha mpn_add_n -- Add two limb vectors of the same length > 0 and +dnl store sum in a third limb vector. + +dnl Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
+ +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl s2_ptr r18 +dnl size r19 + +ASM_START() +PROLOGUE(mpn_add_n) + ldq r3,0(r17) + ldq r4,0(r18) + + subq r19,1,r19 + and r19,4-1,r2 C number of limbs in first loop + bis r31,r31,r0 + beq r2,$L0 C if multiple of 4 limbs, skip first loop + + subq r19,r2,r19 + +$Loop0: subq r2,1,r2 + ldq r5,8(r17) + addq r4,r0,r4 + ldq r6,8(r18) + cmpult r4,r0,r1 + addq r3,r4,r4 + cmpult r4,r3,r0 + stq r4,0(r16) + bis r0,r1,r0 + + addq r17,8,r17 + addq r18,8,r18 + bis r5,r5,r3 + bis r6,r6,r4 + addq r16,8,r16 + bne r2,$Loop0 + +$L0: beq r19,$Lend + + ALIGN(8) +$Loop: subq r19,4,r19 + + ldq r5,8(r17) + addq r4,r0,r4 + ldq r6,8(r18) + cmpult r4,r0,r1 + addq r3,r4,r4 + cmpult r4,r3,r0 + stq r4,0(r16) + bis r0,r1,r0 + + ldq r3,16(r17) + addq r6,r0,r6 + ldq r4,16(r18) + cmpult r6,r0,r1 + addq r5,r6,r6 + cmpult r6,r5,r0 + stq r6,8(r16) + bis r0,r1,r0 + + ldq r5,24(r17) + addq r4,r0,r4 + ldq r6,24(r18) + cmpult r4,r0,r1 + addq r3,r4,r4 + cmpult r4,r3,r0 + stq r4,16(r16) + bis r0,r1,r0 + + ldq r3,32(r17) + addq r6,r0,r6 + ldq r4,32(r18) + cmpult r6,r0,r1 + addq r5,r6,r6 + cmpult r6,r5,r0 + stq r6,24(r16) + bis r0,r1,r0 + + addq r17,32,r17 + addq r18,32,r18 + addq r16,32,r16 + bne r19,$Loop + +$Lend: addq r4,r0,r4 + cmpult r4,r0,r1 + addq r3,r4,r4 + cmpult r4,r3,r0 + stq r4,0(r16) + bis r0,r1,r0 + ret r31,(r26),1 +EPILOGUE(mpn_add_n) +ASM_END() diff --git a/ghc/rts/gmp/mpn/alpha/addmul_1.asm b/ghc/rts/gmp/mpn/alpha/addmul_1.asm new file mode 100644 index 0000000..4ea900b --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/addmul_1.asm @@ -0,0 +1,87 @@ +dnl Alpha __gmpn_addmul_1 -- Multiply a limb vector with a limb and add +dnl the result to a second limb vector. + +dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+ +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl s2_limb r19 + +dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7 +dnl cycles/limb on EV6. 
+ +ASM_START() +PROLOGUE(mpn_addmul_1) + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + subq r18,1,r18 C size-- + mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + umulh r2,r19,r0 C r0 = prod_high + beq r18,$Lend1 C jump if size was == 1 + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + subq r18,1,r18 C size-- + addq r5,r3,r3 + cmpult r3,r5,r4 + stq r3,0(r16) + addq r16,8,r16 C res_ptr++ + beq r18,$Lend2 C jump if size was == 2 + + ALIGN(8) +$Loop: mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + subq r18,1,r18 C size-- + umulh r2,r19,r4 C r4 = cy_limb + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + addq r5,r3,r3 + cmpult r3,r5,r5 + stq r3,0(r16) + addq r16,8,r16 C res_ptr++ + addq r5,r0,r0 C combine carries + bne r18,$Loop + +$Lend2: mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + umulh r2,r19,r4 C r4 = cy_limb + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + addq r5,r3,r3 + cmpult r3,r5,r5 + stq r3,0(r16) + addq r5,r0,r0 C combine carries + addq r4,r0,r0 C cy_limb = prod_high + cy + ret r31,(r26),1 +$Lend1: addq r5,r3,r3 + cmpult r3,r5,r5 + stq r3,0(r16) + addq r0,r5,r0 + ret r31,(r26),1 +EPILOGUE(mpn_addmul_1) +ASM_END() diff --git a/ghc/rts/gmp/mpn/alpha/cntlz.asm b/ghc/rts/gmp/mpn/alpha/cntlz.asm new file mode 100644 index 0000000..febb3b7 --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/cntlz.asm @@ -0,0 +1,68 @@ +dnl Alpha auxiliary for longlong.h's count_leading_zeros + +dnl Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+ +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl DISCUSSION: + +dnl Other methods have been tried, and using a 128-entry table actually trims +dnl about 10% of the execution time (on a 21164) when the table is in the L1 +dnl cache. But under non-benchmarking conditions, the table will hardly be in +dnl the L1 cache. Tricky bit-fiddling methods with multiplies and magic tables +dnl are also possible, but they require many more instructions than the current +dnl code. (But for count_trailing_zeros, such tricks are beneficial.) +dnl Finally, converting to floating-point and extracting the exponent is much +dnl slower. 
+ +ASM_START() +PROLOGUE(MPN(count_leading_zeros)) + bis r31,63,r0 C initialize partial result count + + srl r16,32,r1 C shift down 32 steps -> r1 + cmovne r1,r1,r16 C select r1 if non-zero + cmovne r1,31,r0 C if r1 is nonzero choose smaller count + + srl r16,16,r1 C shift down 16 steps -> r1 + subq r0,16,r2 C generate new partial result count + cmovne r1,r1,r16 C choose new r1 if non-zero + cmovne r1,r2,r0 C choose new count if r1 was non-zero + + srl r16,8,r1 + subq r0,8,r2 + cmovne r1,r1,r16 + cmovne r1,r2,r0 + + srl r16,4,r1 + subq r0,4,r2 + cmovne r1,r1,r16 + cmovne r1,r2,r0 + + srl r16,2,r1 + subq r0,2,r2 + cmovne r1,r1,r16 + cmovne r1,r2,r0 + + srl r16,1,r1 C extract bit 1 + subq r0,r1,r0 C subtract it from partial result + + ret r31,(r26),1 +EPILOGUE(MPN(count_leading_zeros)) +ASM_END() diff --git a/ghc/rts/gmp/mpn/alpha/default.m4 b/ghc/rts/gmp/mpn/alpha/default.m4 new file mode 100644 index 0000000..5f4c48d --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/default.m4 @@ -0,0 +1,77 @@ +divert(-1) + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
+ + +define(`ASM_START', + ` + .set noreorder + .set noat') + +define(`X',`0x$1') +define(`FLOAT64', + ` + .align 3 +$1: .t_floating $2') + +define(`PROLOGUE', + ` + .text + .align 3 + .globl $1 + .ent $1 +$1: + .frame r30,0,r26 + .prologue 0') + +define(`PROLOGUE_GP', + ` + .text + .align 3 + .globl $1 + .ent $1 +$1: + ldgp r29,0(r27) + .frame r30,0,r26 + .prologue 1') + +define(`EPILOGUE', + ` + .end $1') + +dnl Map register names r0, r1, etc, to `$0', `$1', etc. +dnl This is needed on all systems but Unicos +forloop(i,0,31, +`define(`r'i,``$''i)' +) +forloop(i,0,31, +`define(`f'i,``$f''i)' +) + +define(`DATASTART', + `dnl + DATA +$1:') +define(`DATAEND',`dnl') + +define(`ASM_END',`dnl') + +divert diff --git a/ghc/rts/gmp/mpn/alpha/ev5/add_n.asm b/ghc/rts/gmp/mpn/alpha/ev5/add_n.asm new file mode 100644 index 0000000..716d640 --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/ev5/add_n.asm @@ -0,0 +1,143 @@ +dnl Alpha EV5 __gmpn_add_n -- Add two limb vectors of the same length > 0 and +dnl store sum in a third limb vector. + +dnl Copyright (C) 1995, 1999, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
+ +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl s2_ptr r18 +dnl size r19 + +ASM_START() +PROLOGUE(mpn_add_n) + bis r31,r31,r25 C clear cy + subq r19,4,r19 C decr loop cnt + blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop +C Start software pipeline for 1st loop + ldq r0,0(r18) C s2 limb 0 + ldq r4,0(r17) C s1 limb 0 + ldq r1,8(r18) C s2 limb 1 + ldq r5,8(r17) C s1 limb 1 + addq r17,32,r17 C update s1_ptr + ldq r2,16(r18) C s2 limb 2 + addq r0,r4,r20 C 1st main add + ldq r3,24(r18) C s2 limb 3 + subq r19,4,r19 C decr loop cnt + ldq r6,-16(r17) C s1 limb 2 (s1_ptr already updated) + cmpult r20,r0,r25 C compute cy from last add + ldq r7,-8(r17) C s1 limb 3 (s1_ptr already updated) + addq r1,r5,r28 C 2nd main add + addq r18,32,r18 C update s2_ptr + addq r28,r25,r21 C 2nd carry add + cmpult r28,r5,r8 C compute cy from last add + blt r19,$Lend1 C if less than 4 limbs remain, jump +C 1st loop handles groups of 4 limbs in a software pipeline + ALIGN(16) +$Loop: cmpult r21,r28,r25 C compute cy from last add + ldq r0,0(r18) + bis r8,r25,r25 C combine cy from the two adds + ldq r1,8(r18) + addq r2,r6,r28 C 3rd main add + ldq r4,0(r17) + addq r28,r25,r22 C 3rd carry add + ldq r5,8(r17) + cmpult r28,r6,r8 C compute cy from last add + cmpult r22,r28,r25 C compute cy from last add + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two adds + stq r21,8(r16) + addq r3,r7,r28 C 4th main add + addq r28,r25,r23 C 4th carry add + cmpult r28,r7,r8 C compute cy from last add + cmpult r23,r28,r25 C compute cy from last add + addq r17,32,r17 C update s1_ptr + bis r8,r25,r25 C combine cy from the two adds + addq r16,32,r16 C update res_ptr + addq r0,r4,r28 C 1st main add + ldq r2,16(r18) + addq r25,r28,r20 C 1st carry add + ldq r3,24(r18) + cmpult r28,r4,r8 C compute cy from last add + ldq r6,-16(r17) + cmpult r20,r28,r25 C compute cy from last add + ldq r7,-8(r17) + bis r8,r25,r25 C combine cy from the two adds + subq r19,4,r19 C decr loop cnt + stq r22,-16(r16) + addq r1,r5,r28 C 2nd main add + stq r23,-8(r16) + addq r25,r28,r21 C 2nd carry add + addq r18,32,r18 C update s2_ptr + 
cmpult r28,r5,r8 C compute cy from last add + bge r19,$Loop C loop while cnt >= 0 +C Finish software pipeline for 1st loop +$Lend1: cmpult r21,r28,r25 C compute cy from last add + bis r8,r25,r25 C combine cy from the two adds + addq r2,r6,r28 C 3rd main add + addq r28,r25,r22 C 3rd carry add + cmpult r28,r6,r8 C compute cy from last add + cmpult r22,r28,r25 C compute cy from last add + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two adds + stq r21,8(r16) + addq r3,r7,r28 C 4th main add + addq r28,r25,r23 C 4th carry add + cmpult r28,r7,r8 C compute cy from last add + cmpult r23,r28,r25 C compute cy from last add + bis r8,r25,r25 C combine cy from the two adds + addq r16,32,r16 C update res_ptr + stq r22,-16(r16) + stq r23,-8(r16) +$Lend2: addq r19,4,r19 C restore loop cnt + beq r19,$Lret C no limbs left, return cy +C Start software pipeline for 2nd loop + ldq r0,0(r18) C next s2 limb + ldq r4,0(r17) C next s1 limb + subq r19,1,r19 C decr loop cnt + beq r19,$Lend0 C only one limb left +C 2nd loop handles remaining 1-3 limbs + ALIGN(16) +$Loop0: addq r0,r4,r28 C main add + ldq r0,8(r18) + cmpult r28,r4,r8 C compute cy from last add + ldq r4,8(r17) + addq r28,r25,r20 C carry add + addq r18,8,r18 C update s2_ptr + addq r17,8,r17 C update s1_ptr + stq r20,0(r16) C store result limb + cmpult r20,r28,r25 C compute cy from last add + subq r19,1,r19 C decr loop cnt + bis r8,r25,r25 C combine cy from the two adds + addq r16,8,r16 C update res_ptr + bne r19,$Loop0 +$Lend0: addq r0,r4,r28 C main add + addq r28,r25,r20 C carry add + cmpult r28,r4,r8 C compute cy from last add + cmpult r20,r28,r25 C compute cy from last add + stq r20,0(r16) C store last result limb + bis r8,r25,r25 C combine cy from the two adds + +$Lret: bis r25,r31,r0 C return cy + ret r31,(r26),1 +EPILOGUE(mpn_add_n) +ASM_END() diff --git a/ghc/rts/gmp/mpn/alpha/ev5/lshift.asm b/ghc/rts/gmp/mpn/alpha/ev5/lshift.asm new file mode 100644 index 0000000..cb181dd --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/ev5/lshift.asm @@ -0,0 +1,169 @@ +dnl Alpha EV5 __gmpn_lshift -- Shift a number left. + +dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+ +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl cnt r19 + +dnl This code runs at 3.25 cycles/limb on the EV5. + +ASM_START() +PROLOGUE(mpn_lshift) + s8addq r18,r17,r17 C make r17 point at end of s1 + ldq r4,-8(r17) C load first limb + subq r31,r19,r20 C r20 = -cnt, count for the opposite-direction shifts + s8addq r18,r16,r16 C make r16 point at end of RES + subq r18,1,r18 C size - 1 + and r18,4-1,r28 C number of limbs in first loop + srl r4,r20,r0 C compute function result + + beq r28,$L0 C first loop has nothing to do + subq r18,r28,r18 C limbs left for the unrolled code + + ALIGN(8) +$Loop0: ldq r3,-16(r17) C load next limb + subq r16,8,r16 C update res_ptr + sll r4,r19,r5 C part from current limb + subq r17,8,r17 C update s1_ptr + subq r28,1,r28 C decr first loop cnt + srl r3,r20,r6 C part from next limb + bis r3,r3,r4 C next limb becomes current limb + bis r5,r6,r8 C combine the two parts + stq r8,0(r16) C store result limb + bne r28,$Loop0 + +$L0: sll r4,r19,r24 C part from current limb, completed later + beq r18,$Lend C no limbs left for the unrolled code +C warm up phase 1 + ldq r1,-16(r17) + subq r18,4,r18 + ldq r2,-24(r17) + ldq r3,-32(r17) + ldq r4,-40(r17) + beq r18,$Lend1 +C warm up phase 2 + srl r1,r20,r7 + sll r1,r19,r21 + srl r2,r20,r8 + ldq r1,-48(r17) + sll r2,r19,r22 + ldq r2,-56(r17) + srl r3,r20,r5 + bis r7,r24,r7 + sll r3,r19,r23 + bis r8,r21,r8 + srl r4,r20,r6 + ldq r3,-64(r17) + sll r4,r19,r24 + ldq r4,-72(r17) + subq r18,4,r18 + beq r18,$Lend2 + ALIGN(16) +C main loop +$Loop: stq r7,-8(r16) + 
bis r5,r22,r5 + stq r8,-16(r16) + bis r6,r23,r6 + + srl r1,r20,r7 + subq r18,4,r18 + sll r1,r19,r21 + unop C ldq r31,-96(r17) + + srl r2,r20,r8 + ldq r1,-80(r17) + sll r2,r19,r22 + ldq r2,-88(r17) + + stq r5,-24(r16) + bis r7,r24,r7 + stq r6,-32(r16) + bis r8,r21,r8 + + srl r3,r20,r5 + unop C ldq r31,-96(r17) + sll r3,r19,r23 + subq r16,32,r16 C update res_ptr + + srl r4,r20,r6 + ldq r3,-96(r17) + sll r4,r19,r24 + ldq r4,-104(r17) + + subq r17,32,r17 C update s1_ptr + bne r18,$Loop +C cool down phase 2/1 +$Lend2: stq r7,-8(r16) + bis r5,r22,r5 + stq r8,-16(r16) + bis r6,r23,r6 + srl r1,r20,r7 + sll r1,r19,r21 + srl r2,r20,r8 + sll r2,r19,r22 + stq r5,-24(r16) + bis r7,r24,r7 + stq r6,-32(r16) + bis r8,r21,r8 + srl r3,r20,r5 + sll r3,r19,r23 + srl r4,r20,r6 + sll r4,r19,r24 +C cool down phase 2/2 + stq r7,-40(r16) + bis r5,r22,r5 + stq r8,-48(r16) + bis r6,r23,r6 + stq r5,-56(r16) + stq r6,-64(r16) +C cool down phase 2/3 + stq r24,-72(r16) C store last result limb + ret r31,(r26),1 + +C cool down phase 1/1 +$Lend1: srl r1,r20,r7 + sll r1,r19,r21 + srl r2,r20,r8 + sll r2,r19,r22 + srl r3,r20,r5 + bis r7,r24,r7 + sll r3,r19,r23 + bis r8,r21,r8 + srl r4,r20,r6 + sll r4,r19,r24 +C cool down phase 1/2 + stq r7,-8(r16) + bis r5,r22,r5 + stq r8,-16(r16) + bis r6,r23,r6 + stq r5,-24(r16) + stq r6,-32(r16) + stq r24,-40(r16) C store last result limb + ret r31,(r26),1 + +$Lend: stq r24,-8(r16) C store last result limb + ret r31,(r26),1 +EPILOGUE(mpn_lshift) +ASM_END() diff --git a/ghc/rts/gmp/mpn/alpha/ev5/rshift.asm b/ghc/rts/gmp/mpn/alpha/ev5/rshift.asm new file mode 100644 index 0000000..9940d83 --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/ev5/rshift.asm @@ -0,0 +1,167 @@ +dnl Alpha EV5 __gmpn_rshift -- Shift a number right. + +dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+ +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl cnt r19 + +dnl This code runs at 3.25 cycles/limb on the EV5. + +ASM_START() +PROLOGUE(mpn_rshift) + ldq r4,0(r17) C load first limb + subq r31,r19,r20 C r20 = -cnt, count for the opposite-direction shifts + subq r18,1,r18 C size - 1 + and r18,4-1,r28 C number of limbs in first loop + sll r4,r20,r0 C compute function result + + beq r28,$L0 C first loop has nothing to do + subq r18,r28,r18 C limbs left for the unrolled code + + ALIGN(8) +$Loop0: ldq r3,8(r17) C load next limb + addq r16,8,r16 C update res_ptr + srl r4,r19,r5 C part from current limb + addq r17,8,r17 C update s1_ptr + subq r28,1,r28 C decr first loop cnt + sll r3,r20,r6 C part from next limb + bis r3,r3,r4 C next limb becomes current limb + bis r5,r6,r8 C combine the two parts + stq r8,-8(r16) C store result limb + bne r28,$Loop0 + +$L0: srl r4,r19,r24 C part from current limb, completed later + beq r18,$Lend C no limbs left for the unrolled code +C warm up phase 1 + ldq r1,8(r17) + subq r18,4,r18 + ldq r2,16(r17) + ldq r3,24(r17) + ldq r4,32(r17) + beq r18,$Lend1 +C warm up phase 2 + sll r1,r20,r7 + srl r1,r19,r21 + sll r2,r20,r8 + ldq r1,40(r17) + srl r2,r19,r22 + ldq r2,48(r17) + sll r3,r20,r5 + bis r7,r24,r7 + srl r3,r19,r23 + bis r8,r21,r8 + sll r4,r20,r6 + ldq r3,56(r17) + srl r4,r19,r24 + ldq r4,64(r17) + subq r18,4,r18 + beq r18,$Lend2 + ALIGN(16) +C main loop +$Loop: stq r7,0(r16) + bis r5,r22,r5 + stq r8,8(r16) + bis r6,r23,r6 + + sll r1,r20,r7 + subq r18,4,r18 + srl r1,r19,r21 + unop C ldq 
r31,-96(r17) + + sll r2,r20,r8 + ldq r1,72(r17) + srl r2,r19,r22 + ldq r2,80(r17) + + stq r5,16(r16) + bis r7,r24,r7 + stq r6,24(r16) + bis r8,r21,r8 + + sll r3,r20,r5 + unop C ldq r31,-96(r17) + srl r3,r19,r23 + addq r16,32,r16 C update res_ptr + + sll r4,r20,r6 + ldq r3,88(r17) + srl r4,r19,r24 + ldq r4,96(r17) + + addq r17,32,r17 C update s1_ptr + bne r18,$Loop +C cool down phase 2/1 +$Lend2: stq r7,0(r16) + bis r5,r22,r5 + stq r8,8(r16) + bis r6,r23,r6 + sll r1,r20,r7 + srl r1,r19,r21 + sll r2,r20,r8 + srl r2,r19,r22 + stq r5,16(r16) + bis r7,r24,r7 + stq r6,24(r16) + bis r8,r21,r8 + sll r3,r20,r5 + srl r3,r19,r23 + sll r4,r20,r6 + srl r4,r19,r24 +C cool down phase 2/2 + stq r7,32(r16) + bis r5,r22,r5 + stq r8,40(r16) + bis r6,r23,r6 + stq r5,48(r16) + stq r6,56(r16) +C cool down phase 2/3 + stq r24,64(r16) C store last result limb + ret r31,(r26),1 + +C cool down phase 1/1 +$Lend1: sll r1,r20,r7 + srl r1,r19,r21 + sll r2,r20,r8 + srl r2,r19,r22 + sll r3,r20,r5 + bis r7,r24,r7 + srl r3,r19,r23 + bis r8,r21,r8 + sll r4,r20,r6 + srl r4,r19,r24 +C cool down phase 1/2 + stq r7,0(r16) + bis r5,r22,r5 + stq r8,8(r16) + bis r6,r23,r6 + stq r5,16(r16) + stq r6,24(r16) + stq r24,32(r16) C store last result limb + ret r31,(r26),1 + +$Lend: stq r24,0(r16) C store last result limb + ret r31,(r26),1 +EPILOGUE(mpn_rshift) +ASM_END() diff --git a/ghc/rts/gmp/mpn/alpha/ev5/sub_n.asm b/ghc/rts/gmp/mpn/alpha/ev5/sub_n.asm new file mode 100644 index 0000000..5248a2a --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/ev5/sub_n.asm @@ -0,0 +1,143 @@ +dnl Alpha EV5 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 +dnl and store difference in a third limb vector. + +dnl Copyright (C) 1995, 1999, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. 
+ +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl s2_ptr r18 +dnl size r19 + +ASM_START() +PROLOGUE(mpn_sub_n) + bis r31,r31,r25 C clear cy + subq r19,4,r19 C decr loop cnt + blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop +C Start software pipeline for 1st loop + ldq r0,0(r18) C s2 limb 0 + ldq r4,0(r17) C s1 limb 0 + ldq r1,8(r18) C s2 limb 1 + ldq r5,8(r17) C s1 limb 1 + addq r17,32,r17 C update s1_ptr + ldq r2,16(r18) C s2 limb 2 + subq r4,r0,r20 C 1st main subtract + ldq r3,24(r18) C s2 limb 3 + subq r19,4,r19 C decr loop cnt + ldq r6,-16(r17) C s1 limb 2 (s1_ptr already updated) + cmpult r4,r0,r25 C compute cy from last subtract + ldq r7,-8(r17) C s1 limb 3 (s1_ptr already updated) + subq r5,r1,r28 C 2nd main subtract + addq r18,32,r18 C update s2_ptr + subq r28,r25,r21 C 2nd carry subtract + cmpult r5,r1,r8 C compute cy from last subtract + blt r19,$Lend1 C if less than 4 limbs remain, jump +C 1st loop handles groups of 4 limbs in a software pipeline + ALIGN(16) +$Loop: cmpult r28,r25,r25 C compute cy from last subtract + ldq r0,0(r18) + bis r8,r25,r25 C combine cy from the two subtracts + ldq r1,8(r18) + subq r6,r2,r28 C 3rd main subtract + ldq r4,0(r17) + subq r28,r25,r22 C 3rd carry subtract + ldq r5,8(r17) + cmpult r6,r2,r8 C compute cy from last subtract + cmpult r28,r25,r25 C compute cy from last subtract + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two subtracts + stq r21,8(r16) + subq r7,r3,r28 C 4th main subtract + subq r28,r25,r23 C 4th carry subtract + cmpult r7,r3,r8 C compute cy from last subtract + cmpult 
r28,r25,r25 C compute cy from last subtract + addq r17,32,r17 C update s1_ptr + bis r8,r25,r25 C combine cy from the two subtracts + addq r16,32,r16 C update res_ptr + subq r4,r0,r28 C 1st main subtract + ldq r2,16(r18) + subq r28,r25,r20 C 1st carry subtract + ldq r3,24(r18) + cmpult r4,r0,r8 C compute cy from last subtract + ldq r6,-16(r17) + cmpult r28,r25,r25 C compute cy from last subtract + ldq r7,-8(r17) + bis r8,r25,r25 C combine cy from the two subtracts + subq r19,4,r19 C decr loop cnt + stq r22,-16(r16) + subq r5,r1,r28 C 2nd main subtract + stq r23,-8(r16) + subq r28,r25,r21 C 2nd carry subtract + addq r18,32,r18 C update s2_ptr + cmpult r5,r1,r8 C compute cy from last subtract + bge r19,$Loop C loop while cnt >= 0 +C Finish software pipeline for 1st loop +$Lend1: cmpult r28,r25,r25 C compute cy from last subtract + bis r8,r25,r25 C combine cy from the two subtracts + subq r6,r2,r28 C 3rd main subtract + subq r28,r25,r22 C 3rd carry subtract + cmpult r6,r2,r8 C compute cy from last subtract + cmpult r28,r25,r25 C compute cy from last subtract + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two subtracts + stq r21,8(r16) + subq r7,r3,r28 C 4th main subtract + subq r28,r25,r23 C 4th carry subtract + cmpult r7,r3,r8 C compute cy from last subtract + cmpult r28,r25,r25 C compute cy from last subtract + bis r8,r25,r25 C combine cy from the two subtracts + addq r16,32,r16 C update res_ptr + stq r22,-16(r16) + stq r23,-8(r16) +$Lend2: addq r19,4,r19 C restore loop cnt + beq r19,$Lret C no limbs left, return cy +C Start software pipeline for 2nd loop + ldq r0,0(r18) C next s2 limb + ldq r4,0(r17) C next s1 limb + subq r19,1,r19 C decr loop cnt + beq r19,$Lend0 C only one limb left +C 2nd loop handles remaining 1-3 limbs + ALIGN(16) +$Loop0: subq r4,r0,r28 C main subtract + cmpult r4,r0,r8 C compute cy from last subtract + ldq r0,8(r18) + ldq r4,8(r17) + subq r28,r25,r20 C carry subtract + addq r18,8,r18 C update s2_ptr + addq r17,8,r17 C update s1_ptr + stq r20,0(r16) C store result limb + cmpult r28,r25,r25 C compute cy from last subtract + subq r19,1,r19 C decr loop cnt + bis r8,r25,r25 C combine cy from the two subtracts + addq r16,8,r16 C update res_ptr 
+ bne r19,$Loop0 +$Lend0: subq r4,r0,r28 C main subtract + subq r28,r25,r20 C carry subtract + cmpult r4,r0,r8 C compute cy from last subtract + cmpult r28,r25,r25 C compute cy from last subtract + stq r20,0(r16) C store last result limb + bis r8,r25,r25 C combine cy from the two subtracts + +$Lret: bis r25,r31,r0 C return cy + ret r31,(r26),1 +EPILOGUE(mpn_sub_n) +ASM_END() diff --git a/ghc/rts/gmp/mpn/alpha/ev6/addmul_1.asm b/ghc/rts/gmp/mpn/alpha/ev6/addmul_1.asm new file mode 100644 index 0000000..dde187c --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/ev6/addmul_1.asm @@ -0,0 +1,474 @@ +dnl Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add +dnl the result to a second limb vector. + +dnl Copyright (C) 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl s2_limb r19 + +dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and +dnl exactly 3.625 cycles/limb on EV6... + +dnl This code was written in close cooperation with ev6 pipeline expert +dnl Steve Root (root@toober.hlo.dec.com). Any errors are tege's fault, though. 
+dnl +dnl Register usages for unrolled loop: +dnl 0-3 mul's +dnl 4-7 acc's +dnl 8-15 mul results +dnl 20,21 carry's +dnl 22,23 save for stores + +dnl Sustains 8 mul-adds in 29 cycles in the unrolled inner loop. + +dnl The stores can issue a cycle late so we have paired no-op's to 'catch' +dnl them, so that further disturbance to the schedule is damped. + +dnl We couldn't pair the loads, because the entangled schedule of the +dnl carry's has to happen on one side {0} of the machine. Note, the total +dnl use of U0, and the total use of L0 (after attending to the stores), +dnl which is part of the reason why.... + +dnl This is a great schedule for the d_cache, a poor schedule for the +dnl b_cache. The lockup on U0 means that any stall can't be recovered +dnl from. Consider a ldq in L1. Say that load gets stalled because it +dnl collides with a fill from the b_cache. On the next cycle, this load +dnl gets priority. It first looks at L0, and goes there. The instruction +dnl we intended for L0 gets to look at L1, which is NOT where we want +dnl it. It either stalls 1, because it can't go in L0, or goes there, and +dnl causes a further instruction to stall. + +dnl So for b_cache, we're likely going to want to put one or more cycles +dnl back into the code! And, of course, put in prefetches. For the +dnl accumulator, lds, intent to modify. For the multiplier, you might +dnl want ldq, evict next, if you're not wanting to use it again soon. Use +dnl 256 ahead of present pointer value. At a place where we have an mt +dnl followed by a bookkeeping, put the bookkeeping in upper, and the +dnl prefetch into lower. + +dnl Note, the usage of physical registers per cycle is smoothed off, as +dnl much as possible. + +dnl Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd +dnl like not to have a ldq or stq precede a conditional branch in a +dnl quadpack. The conditional branch moves the retire pointer one cycle +dnl later. 
+ +dnl Optimization notes: +dnl Callee-saves regs: r9 r10 r11 r12 r13 r14 r15 r26 ?r27? +dnl Reserved regs: r29 r30 r31 +dnl Free caller-saves regs in unrolled code: r24 r25 r28 +dnl We should swap some of the callee-saves regs for some of the free +dnl caller-saves regs, saving some overhead cycles. +dnl Most importantly, we should write fast code for the 0-7 case. +dnl The code we use there is for the 21164, and runs at 7 cycles/limb +dnl on the 21264. Should not be hard, if we write specialized code for +dnl 1-7 limbs (the one for 0 limbs should be straightforward). We then just +dnl need a jump table indexed by the low 3 bits of the count argument. + + +ASM_START() +PROLOGUE(mpn_addmul_1) + cmpult r18, 8, r1 C r1 = (size < 8) + beq r1, $Large C size >= 8, use the unrolled code + + ldq r2, 0(r17) C r2 = s1_limb + addq r17, 8, r17 C s1_ptr++ + subq r18, 1, r18 C size-- + mulq r2, r19, r3 C r3 = prod_low + ldq r5, 0(r16) C r5 = *res_ptr + umulh r2, r19, r0 C r0 = prod_high + beq r18, $Lend0b C jump if size was == 1 + ldq r2, 0(r17) C r2 = s1_limb + addq r17, 8, r17 C s1_ptr++ + subq r18, 1, r18 C size-- + addq r5, r3, r3 + cmpult r3, r5, r4 + stq r3, 0(r16) + addq r16, 8, r16 C res_ptr++ + beq r18, $Lend0a C jump if size was == 2 + + ALIGN(8) +$Loop0: mulq r2, r19, r3 C r3 = prod_low + ldq r5, 0(r16) C r5 = *res_ptr + addq r4, r0, r0 C cy_limb = cy_limb + 'cy' + subq r18, 1, r18 C size-- + umulh r2, r19, r4 C r4 = cy_limb + ldq r2, 0(r17) C r2 = s1_limb + addq r17, 8, r17 C s1_ptr++ + addq r3, r0, r3 C r3 = cy_limb + prod_low + cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low) + addq r5, r3, r3 + cmpult r3, r5, r5 + stq r3, 0(r16) + addq r16, 8, r16 C res_ptr++ + addq r5, r0, r0 C combine carries + bne r18, $Loop0 +$Lend0a: + mulq r2, r19, r3 C r3 = prod_low + ldq r5, 0(r16) C r5 = *res_ptr + addq r4, r0, r0 C cy_limb = cy_limb + 'cy' + umulh r2, r19, r4 C r4 = cy_limb + addq r3, r0, r3 C r3 = cy_limb + prod_low + cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low) + addq r5, r3, r3 + cmpult r3, r5, r5 
+ stq r3, 0(r16) + addq r5, r0, r0 C combine carries + addq r4, r0, r0 C cy_limb = prod_high + cy + ret r31, (r26), 1 +$Lend0b: + addq r5, r3, r3 + cmpult r3, r5, r5 + stq r3, 0(r16) + addq r0, r5, r0 C cy_limb = prod_high + cy + ret r31, (r26), 1 + +$Large: + lda $30, -240($30) C allocate 240 bytes of stack + stq $9, 8($30) C save r9-r15, used for mul results below + stq $10, 16($30) + stq $11, 24($30) + stq $12, 32($30) + stq $13, 40($30) + stq $14, 48($30) + stq $15, 56($30) + + and r18, 7, r20 C count for the first loop, 0-7 + srl r18, 3, r18 C count for unrolled loop + bis r31, r31, r0 C clear cy_limb + beq r20, $Lunroll C first loop has nothing to do + ldq r2, 0(r17) C r2 = s1_limb + addq r17, 8, r17 C s1_ptr++ + subq r20, 1, r20 C size-- + mulq r2, r19, r3 C r3 = prod_low + ldq r5, 0(r16) C r5 = *res_ptr + umulh r2, r19, r0 C r0 = prod_high + beq r20, $Lend1b C jump if size was == 1 + ldq r2, 0(r17) C r2 = s1_limb + addq r17, 8, r17 C s1_ptr++ + subq r20, 1, r20 C size-- + addq r5, r3, r3 + cmpult r3, r5, r4 + stq r3, 0(r16) + addq r16, 8, r16 C res_ptr++ + beq r20, $Lend1a C jump if size was == 2 + + ALIGN(8) +$Loop1: mulq r2, r19, r3 C r3 = prod_low + ldq r5, 0(r16) C r5 = *res_ptr + addq r4, r0, r0 C cy_limb = cy_limb + 'cy' + subq r20, 1, r20 C size-- + umulh r2, r19, r4 C r4 = cy_limb + ldq r2, 0(r17) C r2 = s1_limb + addq r17, 8, r17 C s1_ptr++ + addq r3, r0, r3 C r3 = cy_limb + prod_low + cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low) + addq r5, r3, r3 + cmpult r3, r5, r5 + stq r3, 0(r16) + addq r16, 8, r16 C res_ptr++ + addq r5, r0, r0 C combine carries + bne r20, $Loop1 + +$Lend1a: + mulq r2, r19, r3 C r3 = prod_low + ldq r5, 0(r16) C r5 = *res_ptr + addq r4, r0, r0 C cy_limb = cy_limb + 'cy' + umulh r2, r19, r4 C r4 = cy_limb + addq r3, r0, r3 C r3 = cy_limb + prod_low + cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low) + addq r5, r3, r3 + cmpult r3, r5, r5 + stq r3, 0(r16) + addq r16, 8, r16 C res_ptr++ + addq r5, r0, r0 C combine carries + addq r4, r0, r0 C cy_limb = prod_high + cy + br r31, $Lunroll +$Lend1b: + addq r5, r3, r3 + cmpult r3, r5, r5 + stq r3, 0(r16) + addq 
r16, 8, r16 C res_ptr++ + addq r0, r5, r0 C cy_limb = prod_high + cy + +$Lunroll: + lda r17, -16(r17) C L1 bookkeeping + lda r16, -16(r16) C L1 bookkeeping + bis r0, r31, r12 C r12 = cy_limb from the first loop + +C ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____ + + ldq r2, 16(r17) C L1 + ldq r3, 24(r17) C L1 + lda r18, -1(r18) C L1 bookkeeping + ldq r6, 16(r16) C L1 + ldq r7, 24(r16) C L1 + ldq r0, 32(r17) C L1 + mulq r19, r2, r13 C U1 + ldq r1, 40(r17) C L1 + umulh r19, r2, r14 C U1 + mulq r19, r3, r15 C U1 + lda r17, 64(r17) C L1 bookkeeping + ldq r4, 32(r16) C L1 + ldq r5, 40(r16) C L1 + umulh r19, r3, r8 C U1 + ldq r2, -16(r17) C L1 + mulq r19, r0, r9 C U1 + ldq r3, -8(r17) C L1 + umulh r19, r0, r10 C U1 + addq r6, r13, r6 C L0 lo + acc + mulq r19, r1, r11 C U1 + cmpult r6, r13, r20 C L0 lo add => carry + lda r16, 64(r16) C L1 bookkeeping + addq r6, r12, r22 C U0 hi add => answer + cmpult r22, r12, r21 C L0 hi add => carry + addq r14, r20, r14 C U0 hi mul + carry + ldq r6, -16(r16) C L1 + addq r7, r15, r23 C L0 lo + acc + addq r14, r21, r14 C U0 hi mul + carry + ldq r7, -8(r16) C L1 + umulh r19, r1, r12 C U1 + cmpult r23, r15, r20 C L0 lo add => carry + addq r23, r14, r23 C U0 hi add => answer + ldq r0, (r17) C L1 + mulq r19, r2, r13 C U1 + cmpult r23, r14, r21 C L0 hi add => carry + addq r8, r20, r8 C U0 hi mul + carry + ldq r1, 8(r17) C L1 + umulh r19, r2, r14 C U1 + addq r4, r9, r4 C L0 lo + acc + stq r22, -48(r16) C L0 + stq r23, -40(r16) C L1 + mulq r19, r3, r15 C U1 + addq r8, r21, r8 C U0 hi mul + carry + cmpult r4, r9, r20 C L0 lo add => carry + addq r4, r8, r22 C U0 hi add => answer + ble r18, $Lend C U1 bookkeeping + +C ____ MAIN UNROLLED LOOP ____ + ALIGN(16) +$Loop: + bis r31, r31, r31 C U1 mt + cmpult r22, r8, r21 C L0 hi add => carry + addq r10, r20, r10 C U0 hi mul + carry + ldq r4, (r16) C L1 + + bis r31, r31, r31 C U1 mt + addq r5, r11, r23 C L0 lo + acc + addq r10, r21, r10 C L0 hi mul + carry + ldq r5, 8(r16) C L1 + + umulh r19, r3, r8 C U1 + cmpult r23, r11, r20 C L0 lo add => carry + addq r23, r10, 
r23 C U0 hi add => answer + ldq r2, 16(r17) C L1 + + mulq r19, r0, r9 C U1 + cmpult r23, r10, r21 C L0 hi add => carry + addq r12, r20, r12 C U0 hi mul + carry + ldq r3, 24(r17) C L1 + + umulh r19, r0, r10 C U1 + addq r6, r13, r6 C L0 lo + acc + stq r22, -32(r16) C L0 + stq r23, -24(r16) C L1 + + bis r31, r31, r31 C L0 st slosh + mulq r19, r1, r11 C U1 + bis r31, r31, r31 C L1 st slosh + addq r12, r21, r12 C U0 hi mul + carry + + cmpult r6, r13, r20 C L0 lo add => carry + bis r31, r31, r31 C U1 mt + lda r18, -1(r18) C L1 bookkeeping + addq r6, r12, r22 C U0 hi add => answer + + bis r31, r31, r31 C U1 mt + cmpult r22, r12, r21 C L0 hi add => carry + addq r14, r20, r14 C U0 hi mul + carry + ldq r6, 16(r16) C L1 + + bis r31, r31, r31 C U1 mt + addq r7, r15, r23 C L0 lo + acc + addq r14, r21, r14 C U0 hi mul + carry + ldq r7, 24(r16) C L1 + + umulh r19, r1, r12 C U1 + cmpult r23, r15, r20 C L0 lo add => carry + addq r23, r14, r23 C U0 hi add => answer + ldq r0, 32(r17) C L1 + + mulq r19, r2, r13 C U1 + cmpult r23, r14, r21 C L0 hi add => carry + addq r8, r20, r8 C U0 hi mul + carry + ldq r1, 40(r17) C L1 + + umulh r19, r2, r14 C U1 + addq r4, r9, r4 C U0 lo + acc + stq r22, -16(r16) C L0 + stq r23, -8(r16) C L1 + + bis r31, r31, r31 C L0 st slosh + mulq r19, r3, r15 C U1 + bis r31, r31, r31 C L1 st slosh + addq r8, r21, r8 C L0 hi mul + carry + + cmpult r4, r9, r20 C L0 lo add => carry + bis r31, r31, r31 C U1 mt + lda r17, 64(r17) C L1 bookkeeping + addq r4, r8, r22 C U0 hi add => answer + + bis r31, r31, r31 C U1 mt + cmpult r22, r8, r21 C L0 hi add => carry + addq r10, r20, r10 C U0 hi mul + carry + ldq r4, 32(r16) C L1 + + bis r31, r31, r31 C U1 mt + addq r5, r11, r23 C L0 lo + acc + addq r10, r21, r10 C L0 hi mul + carry + ldq r5, 40(r16) C L1 + + umulh r19, r3, r8 C U1 + cmpult r23, r11, r20 C L0 lo add => carry + addq r23, r10, r23 C U0 hi add => answer + ldq r2, -16(r17) C L1 + + mulq r19, r0, r9 C U1 + cmpult r23, r10, r21 C L0 hi add => carry + addq r12, r20, 
r12 C U0 hi mul + carry + ldq r3, -8(r17) C L1 + + umulh r19, r0, r10 C U1 + addq r6, r13, r6 C L0 lo + acc + stq r22, (r16) C L0 + stq r23, 8(r16) C L1 + + bis r31, r31, r31 C L0 st slosh + mulq r19, r1, r11 C U1 + bis r31, r31, r31 C L1 st slosh + addq r12, r21, r12 C U0 hi mul + carry + + cmpult r6, r13, r20 C L0 lo add => carry + bis r31, r31, r31 C U1 mt + lda r16, 64(r16) C L1 bookkeeping + addq r6, r12, r22 C U0 hi add => answer + + bis r31, r31, r31 C U1 mt + cmpult r22, r12, r21 C L0 hi add => carry + addq r14, r20, r14 C U0 hi mul + carry + ldq r6, -16(r16) C L1 + + bis r31, r31, r31 C U1 mt + addq r7, r15, r23 C L0 lo + acc + addq r14, r21, r14 C U0 hi mul + carry + ldq r7, -8(r16) C L1 + + umulh r19, r1, r12 C U1 + cmpult r23, r15, r20 C L0 lo add => carry + addq r23, r14, r23 C U0 hi add => answer + ldq r0, (r17) C L1 + + mulq r19, r2, r13 C U1 + cmpult r23, r14, r21 C L0 hi add => carry + addq r8, r20, r8 C U0 hi mul + carry + ldq r1, 8(r17) C L1 + + umulh r19, r2, r14 C U1 + addq r4, r9, r4 C L0 lo + acc + stq r22, -48(r16) C L0 + stq r23, -40(r16) C L1 + + bis r31, r31, r31 C L0 st slosh + mulq r19, r3, r15 C U1 + bis r31, r31, r31 C L1 st slosh + addq r8, r21, r8 C U0 hi mul + carry + + cmpult r4, r9, r20 C L0 lo add => carry + addq r4, r8, r22 C U0 hi add => answer + bis r31, r31, r31 C L1 mt + bgt r18, $Loop C U1 bookkeeping + +C ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____ +$Lend: + cmpult r22, r8, r21 C L0 hi add => carry + addq r10, r20, r10 C U0 hi mul + carry + ldq r4, (r16) C L1 + addq r5, r11, r23 C L0 lo + acc + addq r10, r21, r10 C L0 hi mul + carry + ldq r5, 8(r16) C L1 + umulh r19, r3, r8 C U1 + cmpult r23, r11, r20 C L0 lo add => carry + addq r23, r10, r23 C U0 hi add => answer + mulq r19, r0, r9 C U1 + cmpult r23, r10, r21 C L0 hi add => carry + addq r12, r20, r12 C U0 hi mul + carry + umulh r19, r0, r10 C U1 + addq r6, r13, r6 C L0 lo + acc + stq r22, -32(r16) C L0 + stq r23, -24(r16) C L1 + mulq r19, r1, r11 C U1 + addq r12, 
r21, r12 C U0 hi mul + carry + cmpult r6, r13, r20 C L0 lo add => carry + addq r6, r12, r22 C U0 hi add => answer + cmpult r22, r12, r21 C L0 hi add => carry + addq r14, r20, r14 C U0 hi mul + carry + addq r7, r15, r23 C L0 lo + acc + addq r14, r21, r14 C U0 hi mul + carry + umulh r19, r1, r12 C U1 + cmpult r23, r15, r20 C L0 lo add => carry + addq r23, r14, r23 C U0 hi add => answer + cmpult r23, r14, r21 C L0 hi add => carry + addq r8, r20, r8 C U0 hi mul + carry + addq r4, r9, r4 C U0 lo + acc + stq r22, -16(r16) C L0 + stq r23, -8(r16) C L1 + bis r31, r31, r31 C L0 st slosh + addq r8, r21, r8 C L0 hi mul + carry + cmpult r4, r9, r20 C L0 lo add => carry + addq r4, r8, r22 C U0 hi add => answer + cmpult r22, r8, r21 C L0 hi add => carry + addq r10, r20, r10 C U0 hi mul + carry + addq r5, r11, r23 C L0 lo + acc + addq r10, r21, r10 C L0 hi mul + carry + cmpult r23, r11, r20 C L0 lo add => carry + addq r23, r10, r23 C U0 hi add => answer + cmpult r23, r10, r21 C L0 hi add => carry + addq r12, r20, r12 C U0 hi mul + carry + stq r22, (r16) C L0 + stq r23, 8(r16) C L1 + addq r12, r21, r0 C U0 hi mul + carry + + ldq $9, 8($30) C restore r9-r15 + ldq $10, 16($30) + ldq $11, 24($30) + ldq $12, 32($30) + ldq $13, 40($30) + ldq $14, 48($30) + ldq $15, 56($30) + lda $30, 240($30) C release the 240 bytes of stack + ret r31, (r26), 1 +EPILOGUE(mpn_addmul_1) +ASM_END() diff --git a/ghc/rts/gmp/mpn/alpha/ev6/gmp-mparam.h b/ghc/rts/gmp/mpn/alpha/ev6/gmp-mparam.h new file mode 100644 index 0000000..7ea2057 --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/ev6/gmp-mparam.h @@ -0,0 +1,62 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 64 +#define BYTES_PER_MP_LIMB 8 +#define BITS_PER_LONGINT 64 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +/* Generated by tuneup.c, 2000-08-02. Every threshold below is wrapped in #ifndef, so a value defined earlier takes precedence. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 47 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 70 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 94 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 101 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 33 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 70 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 29 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 46 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 33 +#endif diff --git a/ghc/rts/gmp/mpn/alpha/gmp-mparam.h b/ghc/rts/gmp/mpn/alpha/gmp-mparam.h index a3c6697..054ff2f 100644 --- a/ghc/rts/gmp/mpn/alpha/gmp-mparam.h +++ b/ghc/rts/gmp/mpn/alpha/gmp-mparam.h @@ -1,20 +1,20 @@ /* gmp-mparam.h -- Compiler/machine parameter header file. -Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -25,3 +25,40 @@ MA 02111-1307, USA. */ #define BITS_PER_INT 32 #define BITS_PER_SHORTINT 16 #define BITS_PER_CHAR 8 + +/* These values are for the 21164 family. The 21264 will require + different values, since it has such quick multiplication. */ +/* Generated by tuneup.c, 2000-07-19. 
*/ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 22 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 53 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 31 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 47 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 64 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 98 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 17 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 4 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 4 +#endif diff --git a/ghc/rts/gmp/mpn/alpha/invert_limb.asm b/ghc/rts/gmp/mpn/alpha/invert_limb.asm new file mode 100644 index 0000000..a921b32 --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/invert_limb.asm @@ -0,0 +1,345 @@ +dnl Alpha mpn_invert_limb -- Invert a normalized limb. + +dnl Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +dnl +dnl This is based on sophie:/gmp-stuff/dbg-inv-limb.c. +dnl The ideas are due to Peter L. Montgomery +dnl +dnl The table below uses 4096 bytes. 
The file mentioned above has an +dnl alternative function that doesn't require the table, but it runs 50% +dnl slower than this. + +include(`../config.m4') + +ASM_START() + +FLOAT64($C36,9223372036854775808.0) C 2^63 + +PROLOGUE_GP(mpn_invert_limb) + lda r30,-16(r30) C reserve 16 bytes of stack as scratch for integer/FP register transfers + addq r16,r16,r1 C r1 = input + input; zero when no bit below the top bit is set + bne r1,$73 + lda r0,-1 C special case: result is all ones + br r31,$Lend +$73: + srl r16,1,r1 C r1 = input >> 1 + stq r1,0(r30) + ldt f11,0(r30) C move it to an FP register via the stack slot + cvtqt f11,f1 C f1 = floating-point value of input >> 1 + lda r1,$C36 + ldt f10,0(r1) C f10 = the 2^63 constant + divt f10,f1,f10 C f10 = 2^63 / (input >> 1) + lda r2,$invtab-4096 C table base biased down by 4096 bytes; assumes the top bit of the input is set so that 2*index >= 4096 + srl r16,52,r1 C r1 = top 12 bits of input + addq r1,r1,r1 C scale index by 2 since entries are 16-bit words + addq r1,r2,r1 C r1 = address of the table entry + bic r1,6,r2 C r2 = entry address with bits 1-2 cleared + ldq r2,0(r2) C load the quadword at that address + bic r1,1,r1 + extwl r2,r1,r2 C r2 = 16-bit word extracted from the quadword at the byte offset in the low bits of r1 + sll r2,48,r0 C r0 = table entry << 48 + umulh r16,r0,r1 C r1 = high 64 bits of input * r0 + addq r16,r1,r3 C r3 = input + r1 + stq r3,0(r30) + ldt f11,0(r30) + cvtqt f11,f1 C f1 = floating-point value of r3 + mult f1,f10,f1 C f1 = f1 * f10 + cvttqc f1,f1 C convert the product back to an integer + stt f1,0(r30) + ldq r4,0(r30) C r4 = that integer, moved back via the stack slot + subq r0,r4,r0 C refine the estimate: r0 -= r4 + umulh r16,r0,r1 C r1 = high 64 bits of input * r0 + mulq r16,r0,r2 C r2 = low 64 bits of input * r0 + addq r16,r1,r3 C r3 = input + r1 + bge r3,$Loop2 C skip the upward adjustment when r3 is non-negative +$Loop1: addq r2,r16,r2 C low word += input + cmpult r2,r16,r1 C r1 = carry out of that add + addq r3,r1,r3 C propagate the carry into r3 + addq r0,1,r0 C estimate++ + blt r3,$Loop1 C repeat while r3 is negative +$Loop2: cmpult r2,r16,r1 C r1 = borrow for low word - input + subq r0,1,r0 C estimate-- + subq r3,r1,r3 C propagate the borrow into r3 + subq r2,r16,r2 C low word -= input + bge r3,$Loop2 C repeat while r3 is non-negative +$Lend: + lda r30,16(r30) C release the stack scratch area + ret r31,(r26),1 C return; result is in r0 +EPILOGUE(mpn_invert_limb) +DATASTART(`$invtab',4) + .word 0xffff,0xffc0,0xff80,0xff40,0xff00,0xfec0,0xfe81,0xfe41 + .word 0xfe01,0xfdc2,0xfd83,0xfd43,0xfd04,0xfcc5,0xfc86,0xfc46 + .word 0xfc07,0xfbc8,0xfb8a,0xfb4b,0xfb0c,0xfacd,0xfa8e,0xfa50 + .word 0xfa11,0xf9d3,0xf994,0xf956,0xf918,0xf8d9,0xf89b,0xf85d + .word 0xf81f,0xf7e1,0xf7a3,0xf765,0xf727,0xf6ea,0xf6ac,0xf66e + .word 0xf631,0xf5f3,0xf5b6,0xf578,0xf53b,0xf4fd,0xf4c0,0xf483 + .word 0xf446,0xf409,0xf3cc,0xf38f,0xf352,0xf315,0xf2d8,0xf29c + .word 0xf25f,0xf222,0xf1e6,0xf1a9,0xf16d,0xf130,0xf0f4,0xf0b8 + .word 0xf07c,0xf03f,0xf003,0xefc7,0xef8b,0xef4f,0xef14,0xeed8 + .word 0xee9c,0xee60,0xee25,0xede9,0xedae,0xed72,0xed37,0xecfb + .word 0xecc0,0xec85,0xec4a,0xec0e,0xebd3,0xeb98,0xeb5d,0xeb22 + .word 0xeae8,0xeaad,0xea72,0xea37,0xe9fd,0xe9c2,0xe988,0xe94d + .word 0xe913,0xe8d8,0xe89e,0xe864,0xe829,0xe7ef,0xe7b5,0xe77b + .word 0xe741,0xe707,0xe6cd,0xe694,0xe65a,0xe620,0xe5e6,0xe5ad + .word 
0xe573,0xe53a,0xe500,0xe4c7,0xe48d,0xe454,0xe41b,0xe3e2 + .word 0xe3a9,0xe370,0xe336,0xe2fd,0xe2c5,0xe28c,0xe253,0xe21a + .word 0xe1e1,0xe1a9,0xe170,0xe138,0xe0ff,0xe0c7,0xe08e,0xe056 + .word 0xe01e,0xdfe5,0xdfad,0xdf75,0xdf3d,0xdf05,0xdecd,0xde95 + .word 0xde5d,0xde25,0xdded,0xddb6,0xdd7e,0xdd46,0xdd0f,0xdcd7 + .word 0xdca0,0xdc68,0xdc31,0xdbf9,0xdbc2,0xdb8b,0xdb54,0xdb1d + .word 0xdae6,0xdaae,0xda78,0xda41,0xda0a,0xd9d3,0xd99c,0xd965 + .word 0xd92f,0xd8f8,0xd8c1,0xd88b,0xd854,0xd81e,0xd7e8,0xd7b1 + .word 0xd77b,0xd745,0xd70e,0xd6d8,0xd6a2,0xd66c,0xd636,0xd600 + .word 0xd5ca,0xd594,0xd55f,0xd529,0xd4f3,0xd4bd,0xd488,0xd452 + .word 0xd41d,0xd3e7,0xd3b2,0xd37c,0xd347,0xd312,0xd2dd,0xd2a7 + .word 0xd272,0xd23d,0xd208,0xd1d3,0xd19e,0xd169,0xd134,0xd100 + .word 0xd0cb,0xd096,0xd061,0xd02d,0xcff8,0xcfc4,0xcf8f,0xcf5b + .word 0xcf26,0xcef2,0xcebe,0xce89,0xce55,0xce21,0xcded,0xcdb9 + .word 0xcd85,0xcd51,0xcd1d,0xcce9,0xccb5,0xcc81,0xcc4e,0xcc1a + .word 0xcbe6,0xcbb3,0xcb7f,0xcb4c,0xcb18,0xcae5,0xcab1,0xca7e + .word 0xca4b,0xca17,0xc9e4,0xc9b1,0xc97e,0xc94b,0xc918,0xc8e5 + .word 0xc8b2,0xc87f,0xc84c,0xc819,0xc7e7,0xc7b4,0xc781,0xc74f + .word 0xc71c,0xc6e9,0xc6b7,0xc684,0xc652,0xc620,0xc5ed,0xc5bb + .word 0xc589,0xc557,0xc524,0xc4f2,0xc4c0,0xc48e,0xc45c,0xc42a + .word 0xc3f8,0xc3c7,0xc395,0xc363,0xc331,0xc300,0xc2ce,0xc29c + .word 0xc26b,0xc239,0xc208,0xc1d6,0xc1a5,0xc174,0xc142,0xc111 + .word 0xc0e0,0xc0af,0xc07e,0xc04d,0xc01c,0xbfeb,0xbfba,0xbf89 + .word 0xbf58,0xbf27,0xbef6,0xbec5,0xbe95,0xbe64,0xbe33,0xbe03 + .word 0xbdd2,0xbda2,0xbd71,0xbd41,0xbd10,0xbce0,0xbcb0,0xbc80 + .word 0xbc4f,0xbc1f,0xbbef,0xbbbf,0xbb8f,0xbb5f,0xbb2f,0xbaff + .word 0xbacf,0xba9f,0xba6f,0xba40,0xba10,0xb9e0,0xb9b1,0xb981 + .word 0xb951,0xb922,0xb8f2,0xb8c3,0xb894,0xb864,0xb835,0xb806 + .word 0xb7d6,0xb7a7,0xb778,0xb749,0xb71a,0xb6eb,0xb6bc,0xb68d + .word 0xb65e,0xb62f,0xb600,0xb5d1,0xb5a2,0xb574,0xb545,0xb516 + .word 0xb4e8,0xb4b9,0xb48a,0xb45c,0xb42e,0xb3ff,0xb3d1,0xb3a2 + .word 
0xb374,0xb346,0xb318,0xb2e9,0xb2bb,0xb28d,0xb25f,0xb231 + .word 0xb203,0xb1d5,0xb1a7,0xb179,0xb14b,0xb11d,0xb0f0,0xb0c2 + .word 0xb094,0xb067,0xb039,0xb00b,0xafde,0xafb0,0xaf83,0xaf55 + .word 0xaf28,0xaefb,0xaecd,0xaea0,0xae73,0xae45,0xae18,0xadeb + .word 0xadbe,0xad91,0xad64,0xad37,0xad0a,0xacdd,0xacb0,0xac83 + .word 0xac57,0xac2a,0xabfd,0xabd0,0xaba4,0xab77,0xab4a,0xab1e + .word 0xaaf1,0xaac5,0xaa98,0xaa6c,0xaa40,0xaa13,0xa9e7,0xa9bb + .word 0xa98e,0xa962,0xa936,0xa90a,0xa8de,0xa8b2,0xa886,0xa85a + .word 0xa82e,0xa802,0xa7d6,0xa7aa,0xa77e,0xa753,0xa727,0xa6fb + .word 0xa6d0,0xa6a4,0xa678,0xa64d,0xa621,0xa5f6,0xa5ca,0xa59f + .word 0xa574,0xa548,0xa51d,0xa4f2,0xa4c6,0xa49b,0xa470,0xa445 + .word 0xa41a,0xa3ef,0xa3c4,0xa399,0xa36e,0xa343,0xa318,0xa2ed + .word 0xa2c2,0xa297,0xa26d,0xa242,0xa217,0xa1ed,0xa1c2,0xa197 + .word 0xa16d,0xa142,0xa118,0xa0ed,0xa0c3,0xa098,0xa06e,0xa044 + .word 0xa01a,0x9fef,0x9fc5,0x9f9b,0x9f71,0x9f47,0x9f1c,0x9ef2 + .word 0x9ec8,0x9e9e,0x9e74,0x9e4b,0x9e21,0x9df7,0x9dcd,0x9da3 + .word 0x9d79,0x9d50,0x9d26,0x9cfc,0x9cd3,0x9ca9,0x9c80,0x9c56 + .word 0x9c2d,0x9c03,0x9bda,0x9bb0,0x9b87,0x9b5e,0x9b34,0x9b0b + .word 0x9ae2,0x9ab9,0x9a8f,0x9a66,0x9a3d,0x9a14,0x99eb,0x99c2 + .word 0x9999,0x9970,0x9947,0x991e,0x98f6,0x98cd,0x98a4,0x987b + .word 0x9852,0x982a,0x9801,0x97d8,0x97b0,0x9787,0x975f,0x9736 + .word 0x970e,0x96e5,0x96bd,0x9695,0x966c,0x9644,0x961c,0x95f3 + .word 0x95cb,0x95a3,0x957b,0x9553,0x952b,0x9503,0x94db,0x94b3 + .word 0x948b,0x9463,0x943b,0x9413,0x93eb,0x93c3,0x939b,0x9374 + .word 0x934c,0x9324,0x92fd,0x92d5,0x92ad,0x9286,0x925e,0x9237 + .word 0x920f,0x91e8,0x91c0,0x9199,0x9172,0x914a,0x9123,0x90fc + .word 0x90d4,0x90ad,0x9086,0x905f,0x9038,0x9011,0x8fea,0x8fc3 + .word 0x8f9c,0x8f75,0x8f4e,0x8f27,0x8f00,0x8ed9,0x8eb2,0x8e8b + .word 0x8e65,0x8e3e,0x8e17,0x8df1,0x8dca,0x8da3,0x8d7d,0x8d56 + .word 0x8d30,0x8d09,0x8ce3,0x8cbc,0x8c96,0x8c6f,0x8c49,0x8c23 + .word 0x8bfc,0x8bd6,0x8bb0,0x8b8a,0x8b64,0x8b3d,0x8b17,0x8af1 + .word 
0x8acb,0x8aa5,0x8a7f,0x8a59,0x8a33,0x8a0d,0x89e7,0x89c1 + .word 0x899c,0x8976,0x8950,0x892a,0x8904,0x88df,0x88b9,0x8893 + .word 0x886e,0x8848,0x8823,0x87fd,0x87d8,0x87b2,0x878d,0x8767 + .word 0x8742,0x871d,0x86f7,0x86d2,0x86ad,0x8687,0x8662,0x863d + .word 0x8618,0x85f3,0x85ce,0x85a9,0x8583,0x855e,0x8539,0x8514 + .word 0x84f0,0x84cb,0x84a6,0x8481,0x845c,0x8437,0x8412,0x83ee + .word 0x83c9,0x83a4,0x8380,0x835b,0x8336,0x8312,0x82ed,0x82c9 + .word 0x82a4,0x8280,0x825b,0x8237,0x8212,0x81ee,0x81ca,0x81a5 + .word 0x8181,0x815d,0x8138,0x8114,0x80f0,0x80cc,0x80a8,0x8084 + .word 0x8060,0x803c,0x8018,0x7ff4,0x7fd0,0x7fac,0x7f88,0x7f64 + .word 0x7f40,0x7f1c,0x7ef8,0x7ed4,0x7eb1,0x7e8d,0x7e69,0x7e45 + .word 0x7e22,0x7dfe,0x7ddb,0x7db7,0x7d93,0x7d70,0x7d4c,0x7d29 + .word 0x7d05,0x7ce2,0x7cbf,0x7c9b,0x7c78,0x7c55,0x7c31,0x7c0e + .word 0x7beb,0x7bc7,0x7ba4,0x7b81,0x7b5e,0x7b3b,0x7b18,0x7af5 + .word 0x7ad2,0x7aaf,0x7a8c,0x7a69,0x7a46,0x7a23,0x7a00,0x79dd + .word 0x79ba,0x7997,0x7975,0x7952,0x792f,0x790c,0x78ea,0x78c7 + .word 0x78a4,0x7882,0x785f,0x783c,0x781a,0x77f7,0x77d5,0x77b2 + .word 0x7790,0x776e,0x774b,0x7729,0x7706,0x76e4,0x76c2,0x76a0 + .word 0x767d,0x765b,0x7639,0x7617,0x75f5,0x75d2,0x75b0,0x758e + .word 0x756c,0x754a,0x7528,0x7506,0x74e4,0x74c2,0x74a0,0x747e + .word 0x745d,0x743b,0x7419,0x73f7,0x73d5,0x73b4,0x7392,0x7370 + .word 0x734f,0x732d,0x730b,0x72ea,0x72c8,0x72a7,0x7285,0x7264 + .word 0x7242,0x7221,0x71ff,0x71de,0x71bc,0x719b,0x717a,0x7158 + .word 0x7137,0x7116,0x70f5,0x70d3,0x70b2,0x7091,0x7070,0x704f + .word 0x702e,0x700c,0x6feb,0x6fca,0x6fa9,0x6f88,0x6f67,0x6f46 + .word 0x6f26,0x6f05,0x6ee4,0x6ec3,0x6ea2,0x6e81,0x6e60,0x6e40 + .word 0x6e1f,0x6dfe,0x6dde,0x6dbd,0x6d9c,0x6d7c,0x6d5b,0x6d3a + .word 0x6d1a,0x6cf9,0x6cd9,0x6cb8,0x6c98,0x6c77,0x6c57,0x6c37 + .word 0x6c16,0x6bf6,0x6bd6,0x6bb5,0x6b95,0x6b75,0x6b54,0x6b34 + .word 0x6b14,0x6af4,0x6ad4,0x6ab4,0x6a94,0x6a73,0x6a53,0x6a33 + .word 0x6a13,0x69f3,0x69d3,0x69b3,0x6993,0x6974,0x6954,0x6934 + .word 
0x6914,0x68f4,0x68d4,0x68b5,0x6895,0x6875,0x6855,0x6836 + .word 0x6816,0x67f6,0x67d7,0x67b7,0x6798,0x6778,0x6758,0x6739 + .word 0x6719,0x66fa,0x66db,0x66bb,0x669c,0x667c,0x665d,0x663e + .word 0x661e,0x65ff,0x65e0,0x65c0,0x65a1,0x6582,0x6563,0x6544 + .word 0x6524,0x6505,0x64e6,0x64c7,0x64a8,0x6489,0x646a,0x644b + .word 0x642c,0x640d,0x63ee,0x63cf,0x63b0,0x6391,0x6373,0x6354 + .word 0x6335,0x6316,0x62f7,0x62d9,0x62ba,0x629b,0x627c,0x625e + .word 0x623f,0x6221,0x6202,0x61e3,0x61c5,0x61a6,0x6188,0x6169 + .word 0x614b,0x612c,0x610e,0x60ef,0x60d1,0x60b3,0x6094,0x6076 + .word 0x6058,0x6039,0x601b,0x5ffd,0x5fdf,0x5fc0,0x5fa2,0x5f84 + .word 0x5f66,0x5f48,0x5f2a,0x5f0b,0x5eed,0x5ecf,0x5eb1,0x5e93 + .word 0x5e75,0x5e57,0x5e39,0x5e1b,0x5dfd,0x5de0,0x5dc2,0x5da4 + .word 0x5d86,0x5d68,0x5d4a,0x5d2d,0x5d0f,0x5cf1,0x5cd3,0x5cb6 + .word 0x5c98,0x5c7a,0x5c5d,0x5c3f,0x5c21,0x5c04,0x5be6,0x5bc9 + .word 0x5bab,0x5b8e,0x5b70,0x5b53,0x5b35,0x5b18,0x5afb,0x5add + .word 0x5ac0,0x5aa2,0x5a85,0x5a68,0x5a4b,0x5a2d,0x5a10,0x59f3 + .word 0x59d6,0x59b8,0x599b,0x597e,0x5961,0x5944,0x5927,0x590a + .word 0x58ed,0x58d0,0x58b3,0x5896,0x5879,0x585c,0x583f,0x5822 + .word 0x5805,0x57e8,0x57cb,0x57ae,0x5791,0x5775,0x5758,0x573b + .word 0x571e,0x5702,0x56e5,0x56c8,0x56ac,0x568f,0x5672,0x5656 + .word 0x5639,0x561c,0x5600,0x55e3,0x55c7,0x55aa,0x558e,0x5571 + .word 0x5555,0x5538,0x551c,0x5500,0x54e3,0x54c7,0x54aa,0x548e + .word 0x5472,0x5456,0x5439,0x541d,0x5401,0x53e5,0x53c8,0x53ac + .word 0x5390,0x5374,0x5358,0x533c,0x5320,0x5304,0x52e8,0x52cb + .word 0x52af,0x5293,0x5277,0x525c,0x5240,0x5224,0x5208,0x51ec + .word 0x51d0,0x51b4,0x5198,0x517c,0x5161,0x5145,0x5129,0x510d + .word 0x50f2,0x50d6,0x50ba,0x509f,0x5083,0x5067,0x504c,0x5030 + .word 0x5015,0x4ff9,0x4fdd,0x4fc2,0x4fa6,0x4f8b,0x4f6f,0x4f54 + .word 0x4f38,0x4f1d,0x4f02,0x4ee6,0x4ecb,0x4eb0,0x4e94,0x4e79 + .word 0x4e5e,0x4e42,0x4e27,0x4e0c,0x4df0,0x4dd5,0x4dba,0x4d9f + .word 0x4d84,0x4d69,0x4d4d,0x4d32,0x4d17,0x4cfc,0x4ce1,0x4cc6 + .word 
0x4cab,0x4c90,0x4c75,0x4c5a,0x4c3f,0x4c24,0x4c09,0x4bee + .word 0x4bd3,0x4bb9,0x4b9e,0x4b83,0x4b68,0x4b4d,0x4b32,0x4b18 + .word 0x4afd,0x4ae2,0x4ac7,0x4aad,0x4a92,0x4a77,0x4a5d,0x4a42 + .word 0x4a27,0x4a0d,0x49f2,0x49d8,0x49bd,0x49a3,0x4988,0x496e + .word 0x4953,0x4939,0x491e,0x4904,0x48e9,0x48cf,0x48b5,0x489a + .word 0x4880,0x4865,0x484b,0x4831,0x4817,0x47fc,0x47e2,0x47c8 + .word 0x47ae,0x4793,0x4779,0x475f,0x4745,0x472b,0x4711,0x46f6 + .word 0x46dc,0x46c2,0x46a8,0x468e,0x4674,0x465a,0x4640,0x4626 + .word 0x460c,0x45f2,0x45d8,0x45be,0x45a5,0x458b,0x4571,0x4557 + .word 0x453d,0x4523,0x4509,0x44f0,0x44d6,0x44bc,0x44a2,0x4489 + .word 0x446f,0x4455,0x443c,0x4422,0x4408,0x43ef,0x43d5,0x43bc + .word 0x43a2,0x4388,0x436f,0x4355,0x433c,0x4322,0x4309,0x42ef + .word 0x42d6,0x42bc,0x42a3,0x428a,0x4270,0x4257,0x423d,0x4224 + .word 0x420b,0x41f2,0x41d8,0x41bf,0x41a6,0x418c,0x4173,0x415a + .word 0x4141,0x4128,0x410e,0x40f5,0x40dc,0x40c3,0x40aa,0x4091 + .word 0x4078,0x405f,0x4046,0x402d,0x4014,0x3ffb,0x3fe2,0x3fc9 + .word 0x3fb0,0x3f97,0x3f7e,0x3f65,0x3f4c,0x3f33,0x3f1a,0x3f01 + .word 0x3ee8,0x3ed0,0x3eb7,0x3e9e,0x3e85,0x3e6c,0x3e54,0x3e3b + .word 0x3e22,0x3e0a,0x3df1,0x3dd8,0x3dc0,0x3da7,0x3d8e,0x3d76 + .word 0x3d5d,0x3d45,0x3d2c,0x3d13,0x3cfb,0x3ce2,0x3cca,0x3cb1 + .word 0x3c99,0x3c80,0x3c68,0x3c50,0x3c37,0x3c1f,0x3c06,0x3bee + .word 0x3bd6,0x3bbd,0x3ba5,0x3b8d,0x3b74,0x3b5c,0x3b44,0x3b2b + .word 0x3b13,0x3afb,0x3ae3,0x3acb,0x3ab2,0x3a9a,0x3a82,0x3a6a + .word 0x3a52,0x3a3a,0x3a22,0x3a09,0x39f1,0x39d9,0x39c1,0x39a9 + .word 0x3991,0x3979,0x3961,0x3949,0x3931,0x3919,0x3901,0x38ea + .word 0x38d2,0x38ba,0x38a2,0x388a,0x3872,0x385a,0x3843,0x382b + .word 0x3813,0x37fb,0x37e3,0x37cc,0x37b4,0x379c,0x3785,0x376d + .word 0x3755,0x373e,0x3726,0x370e,0x36f7,0x36df,0x36c8,0x36b0 + .word 0x3698,0x3681,0x3669,0x3652,0x363a,0x3623,0x360b,0x35f4 + .word 0x35dc,0x35c5,0x35ae,0x3596,0x357f,0x3567,0x3550,0x3539 + .word 0x3521,0x350a,0x34f3,0x34db,0x34c4,0x34ad,0x3496,0x347e + .word 
0x3467,0x3450,0x3439,0x3422,0x340a,0x33f3,0x33dc,0x33c5 + .word 0x33ae,0x3397,0x3380,0x3368,0x3351,0x333a,0x3323,0x330c + .word 0x32f5,0x32de,0x32c7,0x32b0,0x3299,0x3282,0x326c,0x3255 + .word 0x323e,0x3227,0x3210,0x31f9,0x31e2,0x31cb,0x31b5,0x319e + .word 0x3187,0x3170,0x3159,0x3143,0x312c,0x3115,0x30fe,0x30e8 + .word 0x30d1,0x30ba,0x30a4,0x308d,0x3076,0x3060,0x3049,0x3033 + .word 0x301c,0x3005,0x2fef,0x2fd8,0x2fc2,0x2fab,0x2f95,0x2f7e + .word 0x2f68,0x2f51,0x2f3b,0x2f24,0x2f0e,0x2ef8,0x2ee1,0x2ecb + .word 0x2eb4,0x2e9e,0x2e88,0x2e71,0x2e5b,0x2e45,0x2e2e,0x2e18 + .word 0x2e02,0x2dec,0x2dd5,0x2dbf,0x2da9,0x2d93,0x2d7c,0x2d66 + .word 0x2d50,0x2d3a,0x2d24,0x2d0e,0x2cf8,0x2ce1,0x2ccb,0x2cb5 + .word 0x2c9f,0x2c89,0x2c73,0x2c5d,0x2c47,0x2c31,0x2c1b,0x2c05 + .word 0x2bef,0x2bd9,0x2bc3,0x2bad,0x2b97,0x2b81,0x2b6c,0x2b56 + .word 0x2b40,0x2b2a,0x2b14,0x2afe,0x2ae8,0x2ad3,0x2abd,0x2aa7 + .word 0x2a91,0x2a7c,0x2a66,0x2a50,0x2a3a,0x2a25,0x2a0f,0x29f9 + .word 0x29e4,0x29ce,0x29b8,0x29a3,0x298d,0x2977,0x2962,0x294c + .word 0x2937,0x2921,0x290c,0x28f6,0x28e0,0x28cb,0x28b5,0x28a0 + .word 0x288b,0x2875,0x2860,0x284a,0x2835,0x281f,0x280a,0x27f5 + .word 0x27df,0x27ca,0x27b4,0x279f,0x278a,0x2774,0x275f,0x274a + .word 0x2735,0x271f,0x270a,0x26f5,0x26e0,0x26ca,0x26b5,0x26a0 + .word 0x268b,0x2676,0x2660,0x264b,0x2636,0x2621,0x260c,0x25f7 + .word 0x25e2,0x25cd,0x25b8,0x25a2,0x258d,0x2578,0x2563,0x254e + .word 0x2539,0x2524,0x250f,0x24fa,0x24e5,0x24d1,0x24bc,0x24a7 + .word 0x2492,0x247d,0x2468,0x2453,0x243e,0x2429,0x2415,0x2400 + .word 0x23eb,0x23d6,0x23c1,0x23ad,0x2398,0x2383,0x236e,0x235a + .word 0x2345,0x2330,0x231c,0x2307,0x22f2,0x22dd,0x22c9,0x22b4 + .word 0x22a0,0x228b,0x2276,0x2262,0x224d,0x2239,0x2224,0x2210 + .word 0x21fb,0x21e6,0x21d2,0x21bd,0x21a9,0x2194,0x2180,0x216c + .word 0x2157,0x2143,0x212e,0x211a,0x2105,0x20f1,0x20dd,0x20c8 + .word 0x20b4,0x20a0,0x208b,0x2077,0x2063,0x204e,0x203a,0x2026 + .word 0x2012,0x1ffd,0x1fe9,0x1fd5,0x1fc1,0x1fac,0x1f98,0x1f84 + .word 
0x1f70,0x1f5c,0x1f47,0x1f33,0x1f1f,0x1f0b,0x1ef7,0x1ee3 + .word 0x1ecf,0x1ebb,0x1ea7,0x1e93,0x1e7f,0x1e6a,0x1e56,0x1e42 + .word 0x1e2e,0x1e1a,0x1e06,0x1df3,0x1ddf,0x1dcb,0x1db7,0x1da3 + .word 0x1d8f,0x1d7b,0x1d67,0x1d53,0x1d3f,0x1d2b,0x1d18,0x1d04 + .word 0x1cf0,0x1cdc,0x1cc8,0x1cb5,0x1ca1,0x1c8d,0x1c79,0x1c65 + .word 0x1c52,0x1c3e,0x1c2a,0x1c17,0x1c03,0x1bef,0x1bdb,0x1bc8 + .word 0x1bb4,0x1ba0,0x1b8d,0x1b79,0x1b66,0x1b52,0x1b3e,0x1b2b + .word 0x1b17,0x1b04,0x1af0,0x1add,0x1ac9,0x1ab6,0x1aa2,0x1a8f + .word 0x1a7b,0x1a68,0x1a54,0x1a41,0x1a2d,0x1a1a,0x1a06,0x19f3 + .word 0x19e0,0x19cc,0x19b9,0x19a5,0x1992,0x197f,0x196b,0x1958 + .word 0x1945,0x1931,0x191e,0x190b,0x18f8,0x18e4,0x18d1,0x18be + .word 0x18ab,0x1897,0x1884,0x1871,0x185e,0x184b,0x1837,0x1824 + .word 0x1811,0x17fe,0x17eb,0x17d8,0x17c4,0x17b1,0x179e,0x178b + .word 0x1778,0x1765,0x1752,0x173f,0x172c,0x1719,0x1706,0x16f3 + .word 0x16e0,0x16cd,0x16ba,0x16a7,0x1694,0x1681,0x166e,0x165b + .word 0x1648,0x1635,0x1623,0x1610,0x15fd,0x15ea,0x15d7,0x15c4 + .word 0x15b1,0x159f,0x158c,0x1579,0x1566,0x1553,0x1541,0x152e + .word 0x151b,0x1508,0x14f6,0x14e3,0x14d0,0x14bd,0x14ab,0x1498 + .word 0x1485,0x1473,0x1460,0x144d,0x143b,0x1428,0x1416,0x1403 + .word 0x13f0,0x13de,0x13cb,0x13b9,0x13a6,0x1394,0x1381,0x136f + .word 0x135c,0x1349,0x1337,0x1325,0x1312,0x1300,0x12ed,0x12db + .word 0x12c8,0x12b6,0x12a3,0x1291,0x127f,0x126c,0x125a,0x1247 + .word 0x1235,0x1223,0x1210,0x11fe,0x11ec,0x11d9,0x11c7,0x11b5 + .word 0x11a3,0x1190,0x117e,0x116c,0x1159,0x1147,0x1135,0x1123 + .word 0x1111,0x10fe,0x10ec,0x10da,0x10c8,0x10b6,0x10a4,0x1091 + .word 0x107f,0x106d,0x105b,0x1049,0x1037,0x1025,0x1013,0x1001 + .word 0x0fef,0x0fdc,0x0fca,0x0fb8,0x0fa6,0x0f94,0x0f82,0x0f70 + .word 0x0f5e,0x0f4c,0x0f3a,0x0f28,0x0f17,0x0f05,0x0ef3,0x0ee1 + .word 0x0ecf,0x0ebd,0x0eab,0x0e99,0x0e87,0x0e75,0x0e64,0x0e52 + .word 0x0e40,0x0e2e,0x0e1c,0x0e0a,0x0df9,0x0de7,0x0dd5,0x0dc3 + .word 0x0db2,0x0da0,0x0d8e,0x0d7c,0x0d6b,0x0d59,0x0d47,0x0d35 + .word 
0x0d24,0x0d12,0x0d00,0x0cef,0x0cdd,0x0ccb,0x0cba,0x0ca8 + .word 0x0c97,0x0c85,0x0c73,0x0c62,0x0c50,0x0c3f,0x0c2d,0x0c1c + .word 0x0c0a,0x0bf8,0x0be7,0x0bd5,0x0bc4,0x0bb2,0x0ba1,0x0b8f + .word 0x0b7e,0x0b6c,0x0b5b,0x0b4a,0x0b38,0x0b27,0x0b15,0x0b04 + .word 0x0af2,0x0ae1,0x0ad0,0x0abe,0x0aad,0x0a9c,0x0a8a,0x0a79 + .word 0x0a68,0x0a56,0x0a45,0x0a34,0x0a22,0x0a11,0x0a00,0x09ee + .word 0x09dd,0x09cc,0x09bb,0x09a9,0x0998,0x0987,0x0976,0x0965 + .word 0x0953,0x0942,0x0931,0x0920,0x090f,0x08fe,0x08ec,0x08db + .word 0x08ca,0x08b9,0x08a8,0x0897,0x0886,0x0875,0x0864,0x0853 + .word 0x0842,0x0831,0x081f,0x080e,0x07fd,0x07ec,0x07db,0x07ca + .word 0x07b9,0x07a8,0x0798,0x0787,0x0776,0x0765,0x0754,0x0743 + .word 0x0732,0x0721,0x0710,0x06ff,0x06ee,0x06dd,0x06cd,0x06bc + .word 0x06ab,0x069a,0x0689,0x0678,0x0668,0x0657,0x0646,0x0635 + .word 0x0624,0x0614,0x0603,0x05f2,0x05e1,0x05d1,0x05c0,0x05af + .word 0x059e,0x058e,0x057d,0x056c,0x055c,0x054b,0x053a,0x052a + .word 0x0519,0x0508,0x04f8,0x04e7,0x04d6,0x04c6,0x04b5,0x04a5 + .word 0x0494,0x0484,0x0473,0x0462,0x0452,0x0441,0x0431,0x0420 + .word 0x0410,0x03ff,0x03ef,0x03de,0x03ce,0x03bd,0x03ad,0x039c + .word 0x038c,0x037b,0x036b,0x035b,0x034a,0x033a,0x0329,0x0319 + .word 0x0309,0x02f8,0x02e8,0x02d7,0x02c7,0x02b7,0x02a6,0x0296 + .word 0x0286,0x0275,0x0265,0x0255,0x0245,0x0234,0x0224,0x0214 + .word 0x0204,0x01f3,0x01e3,0x01d3,0x01c3,0x01b2,0x01a2,0x0192 + .word 0x0182,0x0172,0x0161,0x0151,0x0141,0x0131,0x0121,0x0111 + .word 0x0101,0x00f0,0x00e0,0x00d0,0x00c0,0x00b0,0x00a0,0x0090 + .word 0x0080,0x0070,0x0060,0x0050,0x0040,0x0030,0x0020,0x0010 +DATAEND() +ASM_END() diff --git a/ghc/rts/gmp/mpn/alpha/lshift.asm b/ghc/rts/gmp/mpn/alpha/lshift.asm new file mode 100644 index 0000000..87c46f6 --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/lshift.asm @@ -0,0 +1,104 @@ +dnl Alpha mpn_lshift -- Shift a number left. + +dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+ +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl cnt r19 + +dnl This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling, +dnl it would take 4 cycles/limb. It should be possible to get down to 3 +dnl cycles/limb since both ldq and stq can be paired with the other used +dnl instructions. But there are many restrictions in the 21064 pipeline that +dnl make it hard, if not impossible, to get down to 3 cycles/limb: + +dnl 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay. +dnl 2. Only aligned instruction pairs can be paired. +dnl 3. The store buffer or silo might not be able to deal with the bandwidth. 
+ +ASM_START() +PROLOGUE(mpn_lshift) + s8addq r18,r17,r17 C make r17 point at end of s1 + ldq r4,-8(r17) C load first limb + subq r17,8,r17 C step r17 down past the limb just loaded + subq r31,r19,r7 C r7 = 0 - cnt, the right-shift count paired with cnt; presumably relies on shifts using only the low 6 bits of the count + s8addq r18,r16,r16 C make r16 point at end of RES + subq r18,1,r18 C r18 = size - 1 limbs still to load + and r18,4-1,r20 C number of limbs in first loop + srl r4,r7,r0 C compute function result + + beq r20,$L0 C first loop not needed when size - 1 is a multiple of 4 + subq r18,r20,r18 C r18 = limbs left for the 4-way unrolled loop + + ALIGN(8) +$Loop0: + ldq r3,-8(r17) C r3 = next lower limb + subq r16,8,r16 + subq r17,8,r17 + subq r20,1,r20 + sll r4,r19,r5 C high part comes from the current limb + srl r3,r7,r6 C low part comes from the next lower limb + bis r3,r3,r4 C next limb becomes the current limb + bis r5,r6,r8 C combine both parts + stq r8,0(r16) C store result limb + bne r20,$Loop0 + +$L0: beq r18,$Lend C no limbs left for the unrolled loop + + ALIGN(8) +$Loop: ldq r3,-8(r17) C unrolled loop: four limbs per iteration, r3 and r4 alternate as current limb + subq r16,32,r16 + subq r18,4,r18 + sll r4,r19,r5 + srl r3,r7,r6 + + ldq r4,-16(r17) + sll r3,r19,r1 + bis r5,r6,r8 + stq r8,24(r16) + srl r4,r7,r2 + + ldq r3,-24(r17) + sll r4,r19,r5 + bis r1,r2,r8 + stq r8,16(r16) + srl r3,r7,r6 + + ldq r4,-32(r17) + sll r3,r19,r1 + bis r5,r6,r8 + stq r8,8(r16) + srl r4,r7,r2 + + subq r17,32,r17 + bis r1,r2,r8 + stq r8,0(r16) + + bgt r18,$Loop + +$Lend: sll r4,r19,r8 C lowest limb: no lower limb contributes bits + stq r8,-8(r16) + ret r31,(r26),1 C return; function result is in r0 +EPILOGUE(mpn_lshift) +ASM_END() diff --git a/ghc/rts/gmp/mpn/alpha/mul_1.asm b/ghc/rts/gmp/mpn/alpha/mul_1.asm new file mode 100644 index 0000000..46b8df3 --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/mul_1.asm @@ -0,0 +1,71 @@ +dnl Alpha __gmpn_mul_1 -- Multiply a limb vector with a limb and store +dnl the result in a second limb vector. + +dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl s2_limb r19 + +dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7 +dnl cycles/limb on EV6. + +ASM_START() +PROLOGUE(mpn_mul_1) + ldq r2,0(r17) C r2 = s1_limb + subq r18,1,r18 C size-- + mulq r2,r19,r3 C r3 = prod_low + bic r31,r31,r4 C clear cy_limb + umulh r2,r19,r0 C r0 = prod_high + beq r18,$Lend1 C jump if size was == 1 + ldq r2,8(r17) C r2 = s1_limb + subq r18,1,r18 C size-- + stq r3,0(r16) C store lowest result limb + beq r18,$Lend2 C jump if size was == 2 + + ALIGN(8) +$Loop: mulq r2,r19,r3 C r3 = prod_low + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + subq r18,1,r18 C size-- + umulh r2,r19,r4 C r4 = cy_limb + ldq r2,16(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + addq r3,r0,r3 C r3 = cy_limb + prod_low + stq r3,8(r16) C store result limb one slot ahead of res_ptr + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + addq r16,8,r16 C res_ptr++ + bne r18,$Loop + +$Lend2: mulq r2,r19,r3 C r3 = prod_low + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + umulh r2,r19,r4 C r4 = cy_limb + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + stq r3,8(r16) C store final result limb + addq r4,r0,r0 C cy_limb = prod_high + cy + ret r31,(r26),1 C return; final carry limb is in r0 +$Lend1: stq r3,0(r16) C size was 1: store the only result limb + ret r31,(r26),1 C return; r0 still holds prod_high +EPILOGUE(mpn_mul_1) +ASM_END() diff --git a/ghc/rts/gmp/mpn/alpha/rshift.asm b/ghc/rts/gmp/mpn/alpha/rshift.asm new file mode 100644 index 0000000..aa25eda --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/rshift.asm @@ -0,0 +1,102 @@ +dnl Alpha mpn_rshift -- Shift a number right. + +dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+ +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl cnt r19 + +dnl This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling, +dnl it would take 4 cycles/limb. It should be possible to get down to 3 +dnl cycles/limb since both ldq and stq can be paired with the other used +dnl instructions. But there are many restrictions in the 21064 pipeline that +dnl make it hard, if not impossible, to get down to 3 cycles/limb: + +dnl 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay. +dnl 2. Only aligned instruction pairs can be paired. +dnl 3. The store buffer or silo might not be able to deal with the bandwidth. 
+ +ASM_START() +PROLOGUE(mpn_rshift) + ldq r4,0(r17) C load first limb + addq r17,8,r17 C step r17 up past the limb just loaded + subq r31,r19,r7 C r7 = 0 - cnt, the left-shift count paired with cnt; presumably relies on shifts using only the low 6 bits of the count + subq r18,1,r18 C r18 = size - 1 limbs still to load + and r18,4-1,r20 C number of limbs in first loop + sll r4,r7,r0 C compute function result + + beq r20,$L0 C first loop not needed when size - 1 is a multiple of 4 + subq r18,r20,r18 C r18 = limbs left for the 4-way unrolled loop + + ALIGN(8) +$Loop0: + ldq r3,0(r17) C r3 = next higher limb + addq r16,8,r16 + addq r17,8,r17 + subq r20,1,r20 + srl r4,r19,r5 C low part comes from the current limb + sll r3,r7,r6 C high part comes from the next higher limb + bis r3,r3,r4 C next limb becomes the current limb + bis r5,r6,r8 C combine both parts + stq r8,-8(r16) C store result limb + bne r20,$Loop0 + +$L0: beq r18,$Lend C no limbs left for the unrolled loop + + ALIGN(8) +$Loop: ldq r3,0(r17) C unrolled loop: four limbs per iteration, r3 and r4 alternate as current limb + addq r16,32,r16 + subq r18,4,r18 + srl r4,r19,r5 + sll r3,r7,r6 + + ldq r4,8(r17) + srl r3,r19,r1 + bis r5,r6,r8 + stq r8,-32(r16) + sll r4,r7,r2 + + ldq r3,16(r17) + srl r4,r19,r5 + bis r1,r2,r8 + stq r8,-24(r16) + sll r3,r7,r6 + + ldq r4,24(r17) + srl r3,r19,r1 + bis r5,r6,r8 + stq r8,-16(r16) + sll r4,r7,r2 + + addq r17,32,r17 + bis r1,r2,r8 + stq r8,-8(r16) + + bgt r18,$Loop + +$Lend: srl r4,r19,r8 C highest limb: no higher limb contributes bits + stq r8,0(r16) + ret r31,(r26),1 C return; function result is in r0 +EPILOGUE(mpn_rshift) +ASM_END() diff --git a/ghc/rts/gmp/mpn/alpha/sub_n.asm b/ghc/rts/gmp/mpn/alpha/sub_n.asm new file mode 100644 index 0000000..718f657 --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/sub_n.asm @@ -0,0 +1,114 @@ +dnl Alpha mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +dnl store difference in a third limb vector. + +dnl Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. 
+ +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl s2_ptr r18 +dnl size r19 + +ASM_START() +PROLOGUE(mpn_sub_n) + ldq r3,0(r17) C r3 = first s1 limb + ldq r4,0(r18) C r4 = first s2 limb + + subq r19,1,r19 C r19 = size - 1 + and r19,4-1,r2 C number of limbs in first loop + bis r31,r31,r0 C clear borrow + beq r2,$L0 C if size - 1 is a multiple of 4, skip first loop + + subq r19,r2,r19 C r19 = limbs left for the 4-way unrolled loop + +$Loop0: subq r2,1,r2 + ldq r5,8(r17) C preload next s1 limb + addq r4,r0,r4 C s2 limb + borrow-in + ldq r6,8(r18) C preload next s2 limb + cmpult r4,r0,r1 C r1 = carry out of that add + subq r3,r4,r4 C r4 = s1 limb - (s2 limb + borrow-in) + cmpult r3,r4,r0 C r0 = borrow from the subtract + stq r4,0(r16) C store difference limb + bis r0,r1,r0 C combined borrow-out + + addq r17,8,r17 + addq r18,8,r18 + bis r5,r5,r3 C preloaded limbs become the current limbs + bis r6,r6,r4 + addq r16,8,r16 + bne r2,$Loop0 + +$L0: beq r19,$Lend C no limbs left for the unrolled loop + + ALIGN(8) +$Loop: subq r19,4,r19 C unrolled loop: four limbs per iteration, register pairs r3/r4 and r5/r6 alternate + + ldq r5,8(r17) + addq r4,r0,r4 + ldq r6,8(r18) + cmpult r4,r0,r1 + subq r3,r4,r4 + cmpult r3,r4,r0 + stq r4,0(r16) + bis r0,r1,r0 + + ldq r3,16(r17) + addq r6,r0,r6 + ldq r4,16(r18) + cmpult r6,r0,r1 + subq r5,r6,r6 + cmpult r5,r6,r0 + stq r6,8(r16) + bis r0,r1,r0 + + ldq r5,24(r17) + addq r4,r0,r4 + ldq r6,24(r18) + cmpult r4,r0,r1 + subq r3,r4,r4 + cmpult r3,r4,r0 + stq r4,16(r16) + bis r0,r1,r0 + + ldq r3,32(r17) + addq r6,r0,r6 + ldq r4,32(r18) + cmpult r6,r0,r1 + subq r5,r6,r6 + cmpult r5,r6,r0 + stq r6,24(r16) + bis r0,r1,r0 + + addq r17,32,r17 + addq r18,32,r18 + addq r16,32,r16 + bne r19,$Loop + +$Lend: addq r4,r0,r4 C final limb: same steps without preloading + cmpult r4,r0,r1 + subq r3,r4,r4 + cmpult r3,r4,r0 + stq r4,0(r16) + bis r0,r1,r0 C r0 = final borrow + ret r31,(r26),1 C return; borrow is in r0 +EPILOGUE(mpn_sub_n) +ASM_END() diff --git a/ghc/rts/gmp/mpn/alpha/submul_1.asm b/ghc/rts/gmp/mpn/alpha/submul_1.asm new file mode 100644 index 0000000..caec1a7 --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/submul_1.asm @@ -0,0 +1,87 @@ +dnl Alpha __gmpn_submul_1 -- Multiply a limb vector with a limb and +dnl subtract the result from a second limb 
vector. + +dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl s2_limb r19 + +dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7 +dnl cycles/limb on EV6. 
+ +ASM_START() +PROLOGUE(mpn_submul_1) + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + subq r18,1,r18 C size-- + mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + umulh r2,r19,r0 C r0 = prod_high + beq r18,$Lend1 C jump if size was == 1 + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + subq r18,1,r18 C size-- + subq r5,r3,r3 + cmpult r5,r3,r4 + stq r3,0(r16) + addq r16,8,r16 C res_ptr++ + beq r18,$Lend2 C jump if size was == 2 + + ALIGN(8) +$Loop: mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + subq r18,1,r18 C size-- + umulh r2,r19,r4 C r4 = cy_limb + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + subq r5,r3,r3 + cmpult r5,r3,r5 + stq r3,0(r16) + addq r16,8,r16 C res_ptr++ + addq r5,r0,r0 C combine carries + bne r18,$Loop + +$Lend2: mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + umulh r2,r19,r4 C r4 = cy_limb + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + subq r5,r3,r3 + cmpult r5,r3,r5 + stq r3,0(r16) + addq r5,r0,r0 C combine carries + addq r4,r0,r0 C cy_limb = prod_high + cy + ret r31,(r26),1 +$Lend1: subq r5,r3,r3 + cmpult r5,r3,r5 + stq r3,0(r16) + addq r0,r5,r0 + ret r31,(r26),1 +EPILOGUE(mpn_submul_1) +ASM_END() diff --git a/ghc/rts/gmp/mpn/alpha/udiv_qrnnd.S b/ghc/rts/gmp/mpn/alpha/udiv_qrnnd.S index d3d2cee..53814bb 100644 --- a/ghc/rts/gmp/mpn/alpha/udiv_qrnnd.S +++ b/ghc/rts/gmp/mpn/alpha/udiv_qrnnd.S @@ -1,34 +1,34 @@ # Alpha 21064 __udiv_qrnnd - # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + # Copyright (C) 1992, 1994, 1995, 1997, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. 
# The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. - # You should have received a copy of the GNU Library General Public License + # You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. - .set noreorder - .set noat + .set noreorder + .set noat .text - .align 3 - .globl __udiv_qrnnd - .ent __udiv_qrnnd -__udiv_qrnnd: - .frame $30,0,$26,0 - .prologue 0 + .align 3 + .globl __gmpn_udiv_qrnnd + .ent __gmpn_udiv_qrnnd +__gmpn_udiv_qrnnd: + .frame $30,0,$26,0 + .prologue 0 #define cnt $2 #define tmp $3 #define rem_ptr $16 @@ -148,4 +148,4 @@ __udiv_qrnnd: bis $31,n0,$0 ret $31,($26),1 - .end __udiv_qrnnd + .end __gmpn_udiv_qrnnd diff --git a/ghc/rts/gmp/mpn/alpha/umul.asm b/ghc/rts/gmp/mpn/alpha/umul.asm new file mode 100644 index 0000000..44428ed --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/umul.asm @@ -0,0 +1,39 @@ +dnl Currently unused. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+ +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + .set noreorder + .set volatile + .set noat + +.text + .align 3 + .globl __umul_ppmm + .ent __umul_ppmm +__umul_ppmm: +__umul_ppmm..ng: + .frame $30,0,$26,0 + .prologue 0 + mulq $17,$18,$1 + umulh $17,$18,$0 + stq $1,0($16) + ret $31,($26),1 + .end __umul_ppmm diff --git a/ghc/rts/gmp/mpn/alpha/unicos.m4 b/ghc/rts/gmp/mpn/alpha/unicos.m4 new file mode 100644 index 0000000..7ff26c0 --- /dev/null +++ b/ghc/rts/gmp/mpn/alpha/unicos.m4 @@ -0,0 +1,63 @@ +divert(-1) + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. 
+dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +define(`ASM_START', + `.ident dummy') + +define(`X',`^X$1') +define(`FLOAT64', + `dnl + .psect $1@crud,data +$1: .t_floating $2 + .endp') + +define(`PROLOGUE', + `dnl + .stack 192 ; What does this mean? Only Cray knows. + .psect $1@code,code,cache +$1::') +define(`PROLOGUE_GP', `PROLOGUE($1)') + +define(`EPILOGUE', + `dnl + .endp') + +define(`DATASTART', + `dnl + .psect $1@crud,data +$1:') +define(`DATAEND', + `dnl + .endp') + +define(`ASM_END', + `dnl + .end') + +define(`unop',`bis r31,r31,r31') ; Unicos assembler lacks unop +define(`cvttqc',`cvttq/c') + +define(`ALIGN',`') ; Unicos assembler seems to align using garbage + +divert + diff --git a/ghc/rts/gmp/mpn/arm/add_n.S b/ghc/rts/gmp/mpn/arm/add_n.S new file mode 100644 index 0000000..fb3f8f7 --- /dev/null +++ b/ghc/rts/gmp/mpn/arm/add_n.S @@ -0,0 +1,77 @@ +@ ARM mpn_add_n -- Add two limb vectors of the same length > 0 and store sum in +@ a third limb vector. +@ Contributed by Robert Harley. + +@ Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +@ This file is part of the GNU MP Library. + +@ The GNU MP Library is free software; you can redistribute it and/or modify +@ it under the terms of the GNU Lesser General Public License as published by +@ the Free Software Foundation; either version 2.1 of the License, or (at your +@ option) any later version. + +@ The GNU MP Library is distributed in the hope that it will be useful, but +@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +@ License for more details.
+ +@ You should have received a copy of the GNU Lesser General Public License +@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to +@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +@ MA 02111-1307, USA. + +#define s r0 +#define a r1 +#define b r2 +#define n r3 + +#define sl r10 +#define fp r11 +#define ip r12 +#define sp r13 +#define lr r14 +#define pc r15 + +.text + .align 0 + .global __gmpn_add_n + .type __gmpn_add_n,%function +__gmpn_add_n: + stmfd sp!, { r8, r9, lr } + movs n, n, lsr #1 + bcc skip1 + ldr ip, [a], #4 + ldr lr, [b], #4 + adds ip, ip, lr + str ip, [s], #4 +skip1: + tst n, #1 + beq skip2 + ldmia a!, { r8, r9 } + ldmia b!, { ip, lr } + adcs r8, r8, ip + adcs r9, r9, lr + stmia s!, { r8, r9 } +skip2: + bics n, n, #1 + beq return + stmfd sp!, { r4, r5, r6, r7 } +add_n_loop: + ldmia a!, { r4, r5, r6, r7 } + ldmia b!, { r8, r9, ip, lr } + adcs r4, r4, r8 + ldr r8, [s] /* Bring stuff into cache. */ + adcs r5, r5, r9 + adcs r6, r6, ip + adcs r7, r7, lr + stmia s!, { r4, r5, r6, r7 } + sub n, n, #2 + teq n, #0 + bne add_n_loop + ldmfd sp!, { r4, r5, r6, r7 } +return: + adc r0, n, #0 + ldmfd sp!, { r8, r9, pc } +end: + .size __gmpn_add_n, end - __gmpn_add_n diff --git a/ghc/rts/gmp/mpn/arm/addmul_1.S b/ghc/rts/gmp/mpn/arm/addmul_1.S new file mode 100644 index 0000000..396fff7 --- /dev/null +++ b/ghc/rts/gmp/mpn/arm/addmul_1.S @@ -0,0 +1,89 @@ +@ ARM mpn_addmul_1 -- Multiply a limb vector with a limb and add the result to a +@ second limb vector. +@ Contributed by Robert Harley. + +@ Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +@ This file is part of the GNU MP Library. + +@ The GNU MP Library is free software; you can redistribute it and/or modify +@ it under the terms of the GNU Lesser General Public License as published by +@ the Free Software Foundation; either version 2.1 of the License, or (at your +@ option) any later version.
+ +@ The GNU MP Library is distributed in the hope that it will be useful, but +@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +@ License for more details. + +@ You should have received a copy of the GNU Lesser General Public License +@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to +@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +@ MA 02111-1307, USA. + +#define p r0 +#define a r1 +#define n r2 +#define w r3 + +#define z r11 + +#define ip r12 +#define sp r13 +#define lr r14 +#define pc r15 + +.text + .align 0 + .global __gmpn_addmul_1 + .type __gmpn_addmul_1,%function +__gmpn_addmul_1: + stmfd sp!, { r8-r11, lr } + mov z, #0 + mov ip, #0 + movs n, n, lsr #1 + bcc skip1 + ldr lr, [a], #4 + ldr r9, [p] + umlal r9, ip, w, lr + str r9, [p], #4 +skip1: + movs n, n, lsr #1 + bcc skip2 + ldmia p, { r9, r10 } + adds r8, ip, r9 + adc r9, z, #0 + ldmia a!, { ip, lr } + umlal r8, r9, w, ip + adds r9, r9, r10 + adc ip, z, #0 + umlal r9, ip, w, lr + stmia p!, { r8, r9 } +skip2: + teq n, #0 + beq return + stmfd sp!, { r4-r7 } +addmul_loop: + ldmia p, { r5, r6, r7, r8 } + adds r4, ip, r5 + adc r5, z, #0 + ldmia a!, { r9, r10, ip, lr } + umlal r4, r5, w, r9 + adds r5, r5, r6 + adc r6, z, #0 + umlal r5, r6, w, r10 + adds r6, r6, r7 + adc r7, z, #0 + umlal r6, r7, w, ip + adds r7, r7, r8 + adc ip, z, #0 + umlal r7, ip, w, lr + subs n, n, #1 + stmia p!, { r4, r5, r6, r7 } + bne addmul_loop + ldmfd sp!, { r4-r7 } +return: + mov r0, ip + ldmfd sp!, { r8-r11, pc } +end: + .size __gmpn_addmul_1, end - __gmpn_addmul_1 diff --git a/ghc/rts/gmp/mpn/arm/gmp-mparam.h b/ghc/rts/gmp/mpn/arm/gmp-mparam.h new file mode 100644 index 0000000..a35b0c7 --- /dev/null +++ b/ghc/rts/gmp/mpn/arm/gmp-mparam.h @@ -0,0 +1,34 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. 
+ +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 21 +#endif +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 48 +#endif diff --git a/ghc/rts/gmp/mpn/arm/mul_1.S b/ghc/rts/gmp/mpn/arm/mul_1.S new file mode 100644 index 0000000..bae526a --- /dev/null +++ b/ghc/rts/gmp/mpn/arm/mul_1.S @@ -0,0 +1,81 @@ +@ ARM mpn_mul_1 -- Multiply a limb vector with a limb and store the result +@ in a second limb vector. +@ Contributed by Robert Harley. + +@ Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +@ This file is part of the GNU MP Library. + +@ The GNU MP Library is free software; you can redistribute it and/or modify +@ it under the terms of the GNU Lesser General Public License as published by +@ the Free Software Foundation; either version 2.1 of the License, or (at your +@ option) any later version.
+ +@ The GNU MP Library is distributed in the hope that it will be useful, but +@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +@ License for more details. + +@ You should have received a copy of the GNU Lesser General Public License +@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to +@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +@ MA 02111-1307, USA. + +#define p r0 +#define a r1 +#define n r2 +#define w r3 + +#define sl r10 +#define fp r11 +#define ip r12 +#define sp r13 +#define lr r14 +#define pc r15 + +.text + .align 0 + .global __gmpn_mul_1 + .type __gmpn_mul_1,%function +__gmpn_mul_1: + stmfd sp!, { r8, r9, lr } + ands ip, n, #1 + beq skip1 + ldr lr, [a], #4 + umull r9, ip, w, lr + str r9, [p], #4 +skip1: + tst n, #2 + beq skip2 + mov r8, ip + ldmia a!, { ip, lr } + mov r9, #0 + umlal r8, r9, w, ip + mov ip, #0 + umlal r9, ip, w, lr + stmia p!, { r8, r9 } +skip2: + bics n, n, #3 + beq return + stmfd sp!, { r6, r7 } +mul_1_loop: + mov r6, ip + ldmia a!, { r8, r9, ip, lr } + ldr r7, [p] /* Bring stuff into cache. */ + mov r7, #0 + umlal r6, r7, w, r8 + mov r8, #0 + umlal r7, r8, w, r9 + mov r9, #0 + umlal r8, r9, w, ip + mov ip, #0 + umlal r9, ip, w, lr + subs n, n, #4 + stmia p!, { r6, r7, r8, r9 } + bne mul_1_loop + ldmfd sp!, { r6, r7 } +return: + mov r0, ip + ldmfd sp!, { r8, r9, pc } +end: + .size __gmpn_mul_1, end - __gmpn_mul_1 diff --git a/ghc/rts/gmp/mpn/arm/sub_n.S b/ghc/rts/gmp/mpn/arm/sub_n.S new file mode 100644 index 0000000..856505f --- /dev/null +++ b/ghc/rts/gmp/mpn/arm/sub_n.S @@ -0,0 +1,79 @@ +@ ARM mpn_sub_n -- Subtract two limb vectors of the same length > 0 and store +@ difference in a third limb vector. +@ Contributed by Robert Harley. + +@ Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +@ This file is part of the GNU MP Library.
+ +@ The GNU MP Library is free software; you can redistribute it and/or modify +@ it under the terms of the GNU Lesser General Public License as published by +@ the Free Software Foundation; either version 2.1 of the License, or (at your +@ option) any later version. + +@ The GNU MP Library is distributed in the hope that it will be useful, but +@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +@ License for more details. + +@ You should have received a copy of the GNU Lesser General Public License +@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to +@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +@ MA 02111-1307, USA. + +#define d r0 +#define a r1 +#define b r2 +#define n r3 + +#define sl r10 +#define fp r11 +#define ip r12 +#define sp r13 +#define lr r14 +#define pc r15 + +.text + .align 0 + .global __gmpn_sub_n + .type __gmpn_sub_n,%function +__gmpn_sub_n: + stmfd sp!, { r8, r9, lr } + subs ip, ip, ip + tst n, #1 + beq skip1 + ldr ip, [a], #4 + ldr lr, [b], #4 + subs ip, ip, lr + str ip, [d], #4 +skip1: + tst n, #2 + beq skip2 + ldmia a!, { r8, r9 } + ldmia b!, { ip, lr } + sbcs r8, r8, ip + sbcs r9, r9, lr + stmia d!, { r8, r9 } +skip2: + bics n, n, #3 + beq return + stmfd sp!, { r4, r5, r6, r7 } +sub_n_loop: + ldmia a!, { r4, r5, r6, r7 } + ldmia b!, { r8, r9, ip, lr } + sbcs r4, r4, r8 + ldr r8, [d] /* Bring stuff into cache. 
*/ + sbcs r5, r5, r9 + sbcs r6, r6, ip + sbcs r7, r7, lr + stmia d!, { r4, r5, r6, r7 } + sub n, n, #4 + teq n, #0 + bne sub_n_loop + ldmfd sp!, { r4, r5, r6, r7 } +return: + sbc r0, r0, r0 + and r0, r0, #1 + ldmfd sp!, { r8, r9, pc } +end: + .size __gmpn_sub_n, end - __gmpn_sub_n diff --git a/ghc/rts/gmp/mpn/asm-defs.m4 b/ghc/rts/gmp/mpn/asm-defs.m4 new file mode 100644 index 0000000..aa20241 --- /dev/null +++ b/ghc/rts/gmp/mpn/asm-defs.m4 @@ -0,0 +1,1182 @@ +divert(-1) +dnl +dnl m4 macros for gmp assembly code, shared by all CPUs. +dnl +dnl These macros are designed for use with any m4 and have been used on +dnl GNU, FreeBSD, OpenBSD and SysV. +dnl +dnl GNU m4 and OpenBSD 2.7 m4 will give filenames and line numbers in error +dnl messages. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +dnl Macros: +dnl +dnl Most new m4 specific macros have an "m4_" prefix to emphasise they're +dnl m4 expansions. But new defining things like deflit() and defreg() are +dnl named like the builtin define(), and forloop() is named following the +dnl GNU m4 example on which it's based.
+dnl +dnl GNU m4 with the -P option uses "m4_" as a prefix for builtins, but that +dnl option isn't going to be used, so there's no conflict or confusion. +dnl +dnl +dnl Comments in output: +dnl +dnl The m4 comment delimiters are left at # and \n, the normal assembler +dnl commenting for most CPUs. m4 passes comment text through without +dnl expanding macros in it, which is generally a good thing since it stops +dnl unexpected expansions and possible resultant errors. +dnl +dnl But note that when a quoted string is being read, a # isn't special, so +dnl apostrophes in comments in quoted strings must be avoided or they'll be +dnl interpreted as a closing quote mark. But when the quoted text is +dnl re-read # will still act like a normal comment, supressing macro +dnl expansion. +dnl +dnl For example, +dnl +dnl # apostrophes in comments that're outside quotes are ok +dnl # and using macro names like PROLOGUE is ok too +dnl ... +dnl ifdef(`PIC',` +dnl # but apostrophes aren't ok inside quotes +dnl # ^--wrong +dnl ... +dnl # though macro names like PROLOGUE are still ok +dnl ... +dnl ') +dnl +dnl If macro expansion in a comment is wanted, use `#' in the .asm (ie. a +dnl quoted hash symbol), which will turn into # in the .s but get +dnl expansions done on that line. This can make the .s more readable to +dnl humans, but it won't make a blind bit of difference to the assembler. +dnl +dnl All the above applies, mutatis mutandis, when changecom() is used to +dnl select @ ! ; or whatever other commenting. +dnl +dnl +dnl Variations in m4 affecting gmp: +dnl +dnl $# - When a macro is called as "foo" with no brackets, BSD m4 sets $# +dnl to 1, whereas GNU or SysV m4 set it to 0. In all cases though +dnl "foo()" sets $# to 1. This is worked around in various places. +dnl +dnl len() - When "len()" is given an empty argument, BSD m4 evaluates to +dnl nothing, whereas GNU, SysV, and the new OpenBSD, evaluate to 0. +dnl See m4_length() below which works around this. 
+dnl +dnl translit() - GNU m4 accepts character ranges like A-Z, and the new +dnl OpenBSD m4 does under option -g, but basic BSD and SysV don't. +dnl +dnl popdef() - in BSD and SysV m4 popdef() takes multiple arguments and +dnl pops each, but GNU m4 only takes one argument. +dnl +dnl push back - BSD m4 has some limits on the amount of text that can be +dnl pushed back. The limit is reasonably big and so long as macros +dnl don't gratuitously duplicate big arguments it isn't a problem. +dnl Normally an error message is given, but sometimes it just hangs. +dnl +dnl eval() &,|,^ - GNU and SysV m4 have bitwise operators &,|,^ available, +dnl but BSD m4 doesn't (contrary to what the man page suggests) and +dnl instead ^ is exponentiation. +dnl +dnl eval() ?: - The C ternary operator "?:" is available in BSD m4, but not +dnl in SysV or GNU m4 (as of GNU m4 1.4 and betas of 1.5). +dnl +dnl eval() -2^31 - BSD m4 has a bug where an eval() resulting in -2^31 +dnl (ie. -2147483648) gives "-(". Using -2147483648 within an +dnl expression is ok, it just can't be a final result. "-(" will of +dnl course upset parsing, with all sorts of strange effects. +dnl +dnl eval() <<,>> - SysV m4 doesn't support shift operators in eval() (on +dnl SunOS 5.7 /usr/xpg4/m4 has them but /usr/ccs/m4 doesn't). See +dnl m4_lshift() and m4_rshift() below for workarounds. +dnl +dnl m4wrap() - in BSD m4, m4wrap() replaces any previous m4wrap() string, +dnl in SysV m4 it appends to it, and in GNU m4 it prepends. See +dnl m4wrap_prepend() below which brings uniformity to this. +dnl +dnl __file__,__line__ - GNU m4 and OpenBSD 2.7 m4 provide these, and +dnl they're used here to make error messages more informative. GNU m4 +dnl gives an unhelpful "NONE 0" in an m4wrap(), but that's worked +dnl around. 
+dnl +dnl __file__ quoting - OpenBSD m4, unlike GNU m4, doesn't quote the +dnl filename in __file__, so care should be taken that no macro has +dnl the same name as a file, or an unwanted expansion will occur when +dnl printing an error or warning. +dnl +dnl OpenBSD 2.6 m4 - this m4 rejects decimal constants containing an 8 or 9 +dnl in eval(), making it pretty much unusable. This bug is confined +dnl to version 2.6 (it's not in 2.5, and has been fixed in 2.7). +dnl +dnl SunOS /usr/bin/m4 - this m4 lacks a number of desired features, +dnl including $# and $@, defn(), m4exit(), m4wrap(), pushdef(), +dnl popdef(). /usr/5bin/m4 is a SysV style m4 which should always be +dnl available, and "configure" will reject /usr/bin/m4 in favour of +dnl /usr/5bin/m4 (if necessary). +dnl +dnl The sparc code actually has modest m4 requirements currently and +dnl could manage with /usr/bin/m4, but there's no reason to put our +dnl macros through contortions when /usr/5bin/m4 is available or GNU +dnl m4 can be installed. + + +ifdef(`__ASM_DEFS_M4_INCLUDED__', +`m4_error(`asm-defs.m4 already included, dont include it twice +')m4exit(1)') +define(`__ASM_DEFS_M4_INCLUDED__') + + +dnl Detect and give a message about the unsuitable OpenBSD 2.6 m4. + +ifelse(eval(89),89,, +`errprint( +`This m4 doesnt accept 8 and/or 9 in constants in eval(), making it unusable. +This is probably OpenBSD 2.6 m4 (September 1999). Upgrade to OpenBSD 2.7, +or get a bug fix from the CVS (expr.c rev 1.9), or get GNU m4. Dont forget +to configure with M4=/wherever/m4 if you install one of these in a directory +not in $PATH. +')m4exit(1)') + + +dnl Detect and give a message about the unsuitable SunOS /usr/bin/m4. +dnl +dnl Unfortunately this test doesn't work when m4 is run in the normal way +dnl from mpn/Makefile with "m4 -DOPERATION_foo foo.asm", since the bad m4 +dnl takes "-" in "-D..." to mean read stdin, so it will look like it just +dnl hangs. But running "m4 asm-defs.m4" to try it out will work. 
+dnl +dnl We'd like to abort immediately on finding a problem, but unfortunately +dnl the bad m4 doesn't have an m4exit(), nor does an invalid eval() kill +dnl it. Unexpanded $#'s in some m4_assert_numargs() later on will comment +dnl out some closing parentheses and kill it with "m4: arg stack overflow". + +define(m4_dollarhash_works_test,``$#'') +ifelse(m4_dollarhash_works_test(x),1,, +`errprint( +`This m4 doesnt support $# and cant be used for GMP asm processing. +If this is on SunOS, ./configure should choose /usr/5bin/m4 if you have that +or can get it, otherwise install GNU m4. Dont forget to configure with +M4=/wherever/m4 if you install in a directory not in $PATH. +')') +undefine(`m4_dollarhash_works_test') + + +dnl -------------------------------------------------------------------------- +dnl Basic error handling things. + + +dnl Usage: m4_dollarhash_1_if_noparen_p +dnl +dnl Expand to 1 if a call "foo" gives $# set to 1 (as opposed to 0 like GNU +dnl and SysV m4 give). + +define(m4_dollarhash_1_if_noparen_test,`$#') +define(m4_dollarhash_1_if_noparen_p, +eval(m4_dollarhash_1_if_noparen_test==1)) +undefine(`m4_dollarhash_1_if_noparen_test') + + +dnl Usage: m4wrap_prepend(string) +dnl +dnl Prepend the given string to what will be expanded under m4wrap at the +dnl end of input. +dnl +dnl This macro exists to work around variations in m4wrap() behaviour in +dnl the various m4s (notes at the start of this file). Don't use m4wrap() +dnl directly since it will interfere with this scheme. + +define(m4wrap_prepend, +m4_assert_numargs(1) +`define(`m4wrap_string',`$1'defn(`m4wrap_string'))') + +m4wrap(`m4wrap_string') +define(m4wrap_string,`') + + +dnl Usage: m4_file_and_line +dnl +dnl Expand to the current file and line number, if the GNU m4 extensions +dnl __file__ and __line__ are available. +dnl +dnl In GNU m4 1.4 at the end of input when m4wrap text is expanded, +dnl __file__ is NONE and __line__ is 0, which is not a helpful thing to +dnl print.
If m4_file_seen() has been called to note the last file seen, +dnl then that file at a big line number is used, otherwise "end of input" +dnl is used (although "end of input" won't parse as an error message). + +define(m4_file_and_line, +`ifdef(`__file__', +`ifelse(__file__`'__line__,`NONE0', +`ifdef(`m4_file_seen_last',`m4_file_seen_last: 999999: ',`end of input: ')', +`__file__: __line__: ')')') + + +dnl Usage: m4_errprint_commas(arg,...) +dnl +dnl The same as errprint(), but commas are printed between arguments +dnl instead of spaces. + +define(m4_errprint_commas, +`errprint(`$1')dnl +ifelse(eval($#>1),1,`errprint(`,')m4_errprint_commas(shift($@))')') + + +dnl Usage: m4_error(args...) +dnl m4_warning(args...) +dnl +dnl Print an error message, using m4_errprint_commas, prefixed with the +dnl current filename and line number (if available). m4_error sets up to +dnl give an error exit at the end of processing, m4_warning just prints. +dnl These macros are the recommended way to print errors. +dnl +dnl The arguments here should be quoted in the usual way to prevent them +dnl being expanded when the macro call is read. (m4_error takes care not +dnl to do any further expansion.) +dnl +dnl For example, +dnl +dnl m4_error(`some error message +dnl ') +dnl +dnl which prints +dnl +dnl foo.asm:123: some error message +dnl +dnl or if __file__ and __line__ aren't available +dnl +dnl some error message +dnl +dnl The "file:line:" format is a basic style, used by gcc and GNU m4, so +dnl emacs and other editors will recognise it in their normal error message +dnl parsing. + +define(m4_warning, +`m4_errprint_commas(m4_file_and_line`'$@)') + +define(m4_error, +`define(`m4_error_occurred',1)m4_warning($@)') + +define(`m4_error_occurred',0) + +dnl This m4wrap_prepend() is first, so it'll be executed last. 
+m4wrap_prepend( +`ifelse(m4_error_occurred,1, +`m4_error(`Errors occurred during m4 processing +')m4exit(1)')') + + +dnl Usage: m4_assert_numargs(num) +dnl +dnl Put this unquoted on a line on its own at the start of a macro +dnl definition to add some code to check that num many arguments get passed +dnl to the macro. For example, +dnl +dnl define(foo, +dnl m4_assert_numargs(2) +dnl `something `$1' and `$2' blah blah') +dnl +dnl Then a call like foo(one,two,three) will provoke an error like +dnl +dnl file:10: foo expected 2 arguments, got 3 arguments +dnl +dnl Here are some calls and how many arguments they're interpreted as passing. +dnl +dnl foo(abc,def) 2 +dnl foo(xyz) 1 +dnl foo() 0 +dnl foo -1 +dnl +dnl The -1 for no parentheses at all means a macro that's meant to be used +dnl that way can be checked with m4_assert_numargs(-1). For example, +dnl +dnl define(SPECIAL_SUFFIX, +dnl m4_assert_numargs(-1) +dnl `ifdef(`FOO',`_foo',`_bar')') +dnl +dnl But as an alternative see also deflit() below where parenthesized +dnl expressions following a macro are passed through to the output. +dnl +dnl Note that in BSD m4 there's no way to differentiate calls "foo" and +dnl "foo()", so in BSD m4 the distinction between the two isn't enforced. +dnl (In GNU and SysV m4 it can be checked, and is.) + + +dnl m4_assert_numargs is able to check its own arguments by calling +dnl assert_numargs_internal directly. +dnl +dnl m4_doublequote($`'0) expands to ``$0'', whereas ``$`'0'' would expand +dnl to `$`'0' and do the wrong thing, and likewise for $1. The same is +dnl done in other assert macros. +dnl +dnl $`#' leaves $# in the new macro being defined, and stops # being +dnl interpreted as a comment character. +dnl +dnl `dnl ' means an explicit dnl isn't necessary when m4_assert_numargs is +dnl used. The space means that if there is a dnl it'll still work. 
+ +dnl Usage: m4_doublequote(x) expands to ``x'' +define(m4_doublequote, +`m4_assert_numargs_internal(`$0',1,$#,len(`$1'))``$1''') + +define(m4_assert_numargs, +`m4_assert_numargs_internal(`$0',1,$#,len(`$1'))dnl +`m4_assert_numargs_internal'(m4_doublequote($`'0),$1,$`#',`len'(m4_doublequote($`'1)))`dnl '') + +dnl Called: m4_assert_numargs_internal(`macroname',wantargs,$#,len(`$1')) +define(m4_assert_numargs_internal, +`m4_assert_numargs_internal_check(`$1',`$2',m4_numargs_count(`$3',`$4'))') + +dnl Called: m4_assert_numargs_internal_check(`macroname',wantargs,gotargs) +dnl +dnl If m4_dollarhash_1_if_noparen_p (BSD m4) then gotargs can be 0 when it +dnl should be -1. If wantargs is -1 but gotargs is 0 and the two can't be +dnl distinguished then it's allowed to pass. +dnl +define(m4_assert_numargs_internal_check, +`ifelse(eval($2 == $3 + || ($2==-1 && $3==0 && m4_dollarhash_1_if_noparen_p)),0, +`m4_error(`$1 expected 'm4_Narguments(`$2')`, got 'm4_Narguments(`$3') +)')') + +dnl Called: m4_numargs_count($#,len(`$1')) +dnl If $#==0 then -1 args, if $#==1 but len(`$1')==0 then 0 args, otherwise +dnl $# args. +define(m4_numargs_count, +`ifelse($1,0, -1, +`ifelse(eval($1==1 && $2-0==0),1, 0, $1)')') + +dnl Usage: m4_Narguments(N) +dnl "$1 argument" or "$1 arguments" with the plural according to $1. +define(m4_Narguments, +`$1 argument`'ifelse(`$1',1,,s)') + + +dnl -------------------------------------------------------------------------- +dnl Additional error checking things. + + +dnl Usage: m4_file_seen() +dnl +dnl Record __file__ for the benefit of m4_file_and_line in m4wrap text. +dnl The basic __file__ macro comes out quoted, like `foo.asm', and +dnl m4_file_seen_last is defined like that too. +dnl +dnl This only needs to be used with something that could generate an error +dnl message in m4wrap text. The x86 PROLOGUE is the only such at the +dnl moment (at end of input its m4wrap checks for missing EPILOGUE). 
A few +dnl include()s can easily trick this scheme, but you'd expect an EPILOGUE +dnl in the same file as the PROLOGUE. + +define(m4_file_seen, +m4_assert_numargs(0) +`ifelse(__file__,`NONE',, +`define(`m4_file_seen_last',m4_doublequote(__file__))')') + + +dnl Usage: m4_assert_onearg() +dnl +dnl Put this, unquoted, at the start of a macro definition to add some code +dnl to check that one argument is passed to the macro, but with that +dnl argument allowed to be empty. For example, +dnl +dnl define(foo, +dnl m4_assert_onearg() +dnl `blah blah $1 blah blah') +dnl +dnl Calls "foo(xyz)" or "foo()" are accepted. A call "foo(xyz,abc)" fails. +dnl A call "foo" fails too, but BSD m4 can't detect this case (GNU and SysV +dnl m4 can). + +define(m4_assert_onearg, +m4_assert_numargs(0) +`m4_assert_onearg_internal'(m4_doublequote($`'0),$`#')`dnl ') + +dnl Called: m4_assert_onearg_internal(`macroname',$#) +define(m4_assert_onearg_internal, +`ifelse($2,1,, +`m4_error(`$1 expected 1 argument, got 'm4_Narguments(`$2') +)')') + + +dnl Usage: m4_assert_numargs_range(low,high) +dnl +dnl Put this, unquoted, at the start of a macro definition to add some code +dnl to check that between low and high many arguments get passed to the +dnl macro. For example, +dnl +dnl define(foo, +dnl m4_assert_numargs_range(3,5) +dnl `mandatory $1 $2 $3 optional $4 $5 end') +dnl +dnl See m4_assert_numargs() for more info. + +define(m4_assert_numargs_range, +m4_assert_numargs(2) +``m4_assert_numargs_range_internal'(m4_doublequote($`'0),$1,$2,$`#',`len'(m4_doublequote($`'1)))`dnl '') + +dnl Called: m4_assert_numargs_range_internal(`name',low,high,$#,len(`$1')) +define(m4_assert_numargs_range_internal, +m4_assert_numargs(5) +`m4_assert_numargs_range_check(`$1',`$2',`$3',m4_numargs_count(`$4',`$5'))') + +dnl Called: m4_assert_numargs_range_check(`name',low,high,gotargs) +dnl +dnl If m4_dollarhash_1_if_noparen_p (BSD m4) then gotargs can be 0 when it +dnl should be -1.
To ensure a `high' of -1 works, a fudge is applied to +dnl gotargs if it's 0 and the 0 and -1 cases can't be distinguished. +dnl +define(m4_assert_numargs_range_check, +m4_assert_numargs(4) +`ifelse(eval($2 <= $4 && + ($4 - ($4==0 && m4_dollarhash_1_if_noparen_p) <= $3)),0, +`m4_error(`$1 expected $2 to $3 arguments, got 'm4_Narguments(`$4') +)')') + + +dnl Usage: m4_assert_defined(symbol) +dnl +dnl Put this unquoted on a line of its own at the start of a macro +dnl definition to add some code to check that the given symbol is defined +dnl when the macro is used. For example, +dnl +dnl define(foo, +dnl m4_assert_defined(`FOO_PREFIX') +dnl `FOO_PREFIX whatever') +dnl +dnl This is a convenient way to check that the user or ./configure or +dnl whatever has defined the things needed by a macro, as opposed to +dnl silently generating garbage. + +define(m4_assert_defined, +m4_assert_numargs(1) +``m4_assert_defined_internal'(m4_doublequote($`'0),``$1'')`dnl '') + +dnl Called: m4_assert_defined_internal(`macroname',`define_required') +define(m4_assert_defined_internal, +m4_assert_numargs(2) +`ifdef(`$2',, +`m4_error(`$1 needs $2 defined +')')') + + +dnl Usage: m4_not_for_expansion(`SYMBOL') +dnl define_not_for_expansion(`SYMBOL') +dnl +dnl m4_not_for_expansion turns SYMBOL, if defined, into something which +dnl will give an error if expanded. For example, +dnl +dnl m4_not_for_expansion(`PIC') +dnl +dnl define_not_for_expansion is the same, but always makes a definition. +dnl +dnl These are for symbols that should be tested with ifdef(`FOO',...) +dnl rather than be expanded as such. They guard against accidentally +dnl omitting the quotes, as in ifdef(FOO,...). Note though that they only +dnl catch this when FOO is defined, so be sure to test code both with and +dnl without each definition.
+ +define(m4_not_for_expansion, +m4_assert_numargs(1) +`ifdef(`$1',`define_not_for_expansion(`$1')')') + +define(define_not_for_expansion, +m4_assert_numargs(1) +`ifelse(defn(`$1'),,, +`m4_error(``$1' has a non-empty value, maybe it shouldnt be munged with m4_not_for_expansion() +')')dnl +define(`$1',`m4_not_for_expansion_internal(`$1')')') + +define(m4_not_for_expansion_internal, +`m4_error(``$1' is not meant to be expanded, perhaps you mean `ifdef(`$1',...)' +')') + + +dnl -------------------------------------------------------------------------- +dnl Various generic m4 things. + + +dnl Usage: m4_ifdef_anyof_p(`symbol',...) +dnl +dnl Expand to 1 if any of the symbols in the argument list are defined, or +dnl to 0 if not. + +define(m4_ifdef_anyof_p, +`ifelse(eval($#<=1 && m4_length(`$1')==0),1, 0, +`ifdef(`$1', 1, +`m4_ifdef_anyof_p(shift($@))')')') + + +dnl Usage: m4_length(string) +dnl +dnl Determine the length of a string. This is the same as len(), but +dnl always expands to a number, working around the BSD len() which +dnl evaluates to nothing given an empty argument. + +define(m4_length, +m4_assert_onearg() +`eval(len(`$1')-0)') + + +dnl Usage: m4_stringequal_p(x,y) +dnl +dnl Expand to 1 or 0 according as strings x and y are equal or not. + +define(m4_stringequal_p, +`ifelse(`$1',`$2',1,0)') + + +dnl Usage: m4_incr_or_decr(n,last) +dnl +dnl Do an incr(n) or decr(n), whichever is in the direction of "last". +dnl Both n and last must be numbers of course. + +define(m4_incr_or_decr, +m4_assert_numargs(2) +`ifelse(eval($1<$2),1,incr($1),decr($1))') + + +dnl Usage: forloop(i, first, last, statement) +dnl +dnl Based on GNU m4 examples/forloop.m4, but extended. +dnl +dnl statement is expanded repeatedly, with i successively defined as +dnl +dnl first, first+1, ..., last-1, last +dnl +dnl Or if first > last, then it's +dnl +dnl first, first-1, ..., last+1, last +dnl +dnl If first == last, then one expansion is done. 
+dnl +dnl A pushdef/popdef of i is done to preserve any previous definition (or +dnl lack of definition). first and last are eval()ed and so can be +dnl expressions. +dnl +dnl forloop_first is defined to 1 on the first iteration, 0 on the rest. +dnl forloop_last is defined to 1 on the last iteration, 0 on the others. +dnl Nested forloops are allowed, in which case forloop_first and +dnl forloop_last apply to the innermost loop that's open. +dnl +dnl A simple example, +dnl +dnl forloop(i, 1, 2*2+1, `dnl +dnl iteration number i ... ifelse(forloop_first,1,FIRST) +dnl ') + + +dnl "i" and "statement" are carefully quoted, but "first" and "last" are +dnl just plain numbers once eval()ed. + +define(`forloop', +m4_assert_numargs(4) +`pushdef(`$1',eval(`$2'))dnl +pushdef(`forloop_first',1)dnl +pushdef(`forloop_last',0)dnl +forloop_internal(`$1',eval(`$3'),`$4')`'dnl +popdef(`forloop_first')dnl +popdef(`forloop_last')dnl +popdef(`$1')') + +dnl Called: forloop_internal(`var',last,statement) +define(`forloop_internal', +m4_assert_numargs(3) +`ifelse($1,$2, +`define(`forloop_last',1)$3', +`$3`'dnl +define(`forloop_first',0)dnl +define(`$1',m4_incr_or_decr($1,$2))dnl +forloop_internal(`$1',$2,`$3')')') + + +dnl Usage: m4_toupper(x) +dnl m4_tolower(x) +dnl +dnl Convert the argument string to upper or lower case, respectively. +dnl Only one argument accepted. +dnl +dnl BSD m4 doesn't take ranges like a-z in translit(), so the full alphabet +dnl is written out. + +define(m4_alphabet_lower, `abcdefghijklmnopqrstuvwxyz') +define(m4_alphabet_upper, `ABCDEFGHIJKLMNOPQRSTUVWXYZ') + +define(m4_toupper, +m4_assert_onearg() +`translit(`$1', m4_alphabet_lower, m4_alphabet_upper)') + +define(m4_tolower, +m4_assert_onearg() +`translit(`$1', m4_alphabet_upper, m4_alphabet_lower)') + + +dnl Usage: m4_empty_if_zero(x) +dnl +dnl Evaluate to x, or to nothing if x is 0. x is eval()ed and so can be an +dnl expression. 
+dnl +dnl This is useful for x86 addressing mode displacements since forms like +dnl (%ebx) are one byte shorter than 0(%ebx). A macro `foo' for use as +dnl foo(%ebx) could be defined with the following so it'll be empty if the +dnl expression comes out zero. +dnl +dnl deflit(`foo', `m4_empty_if_zero(a+b*4-c)') +dnl +dnl Naturally this shouldn't be done if, say, a computed jump depends on +dnl the code being a particular size. + +define(m4_empty_if_zero, +m4_assert_onearg() +`ifelse(eval($1),0,,eval($1))') + + +dnl Usage: m4_log2(x) +dnl +dnl Calculate a logarithm to base 2. +dnl x must be an integral power of 2, between 2**0 and 2**30. +dnl x is eval()ed, so it can be an expression. +dnl An error results if x is invalid. +dnl +dnl 2**31 isn't supported, because an unsigned 2147483648 is out of range +dnl of a 32-bit signed int. Also, the bug in BSD m4 where an eval() +dnl resulting in 2147483648 (or -2147483648 as the case may be) gives `-(' +dnl means tests like eval(1<<31==(x)) would be necessary, but that then +dnl gives an unattractive explosion of eval() error messages if x isn't +dnl numeric. + +define(m4_log2, +m4_assert_numargs(1) +`m4_log2_internal(0,1,eval(`$1'))') + +dnl Called: m4_log2_internal(n,2**n,target) +define(m4_log2_internal, +m4_assert_numargs(3) +`ifelse($2,$3,$1, +`ifelse($1,30, +`m4_error(`m4_log2() argument too big or not a power of two: $3 +')', +`m4_log2_internal(incr($1),eval(2*$2),$3)')')') + + +dnl Usage: m4_div2_towards_zero +dnl +dnl m4 division is probably whatever a C signed division is, and C doesn't +dnl specify what rounding gets used on negatives, so this expression forces +dnl a rounding towards zero. + +define(m4_div2_towards_zero, +m4_assert_numargs(1) +`eval((($1) + ((($1)<0) & ($1))) / 2)') + + +dnl Usage: m4_lshift(n,count) +dnl m4_rshift(n,count) +dnl +dnl Calculate n shifted left or right by count many bits. Both n and count +dnl are eval()ed and so can be expressions. 
+dnl +dnl Negative counts are allowed and mean a shift in the opposite direction. +dnl Negative n is allowed and right shifts will be arithmetic (meaning +dnl divide by 2**count, rounding towards zero, also meaning the sign bit is +dnl duplicated). +dnl +dnl Use these macros instead of << and >> in eval() since the basic ccs +dnl SysV m4 doesn't have those operators. + +define(m4_rshift, +m4_assert_numargs(2) +`m4_lshift(`$1',-(`$2'))') + +define(m4_lshift, +m4_assert_numargs(2) +`m4_lshift_internal(eval(`$1'),eval(`$2'))') + +define(m4_lshift_internal, +m4_assert_numargs(2) +`ifelse(eval($2-0==0),1,$1, +`ifelse(eval($2>0),1, +`m4_lshift_internal(eval($1*2),decr($2))', +`m4_lshift_internal(m4_div2_towards_zero($1),incr($2))')')') + + +dnl Usage: deflit(name,value) +dnl +dnl Like define(), but "name" expands like a literal, rather than taking +dnl arguments. For example "name(%eax)" expands to "value(%eax)". +dnl +dnl Limitations: +dnl +dnl $ characters in the value part must have quotes to stop them looking +dnl like macro parameters. For example, deflit(reg,`123+$`'4+567'). See +dnl defreg() below for handling simple register definitions like $7 etc. +dnl +dnl "name()" is turned into "name", unfortunately. In GNU and SysV m4 an +dnl error is generated when this happens, but in BSD m4 it will happen +dnl silently. The problem is that in BSD m4 $# is 1 in both "name" or +dnl "name()", so there's no way to differentiate them. Because we want +dnl plain "name" to turn into plain "value", we end up with "name()" +dnl turning into plain "value" too. +dnl +dnl "name(foo)" will lose any whitespace after commas in "foo", for example +dnl "disp(%eax, %ecx)" would become "128(%eax,%ecx)". +dnl +dnl These parentheses oddities shouldn't matter in assembler text, but if +dnl they do the suggested workaround is to write "name ()" or "name (foo)" +dnl to stop the parentheses looking like a macro argument list. 
If a space +dnl isn't acceptable in the output, then write "name`'()" or "name`'(foo)". +dnl The `' is stripped when read, but again stops the parentheses looking +dnl like parameters. + +dnl Quoting for deflit_emptyargcheck is similar to m4_assert_numargs. The +dnl stuff in the ifelse gives a $#, $1 and $@ evaluated in the new macro +dnl created, not in deflit. +define(deflit, +m4_assert_numargs(2) +`define(`$1', +`deflit_emptyargcheck'(``$1'',$`#',m4_doublequote($`'1))`dnl +$2`'dnl +ifelse(eval($'`#>1 || m4_length('m4_doublequote($`'1)`)!=0),1,($'`@))')') + +dnl Called: deflit_emptyargcheck(macroname,$#,`$1') +define(deflit_emptyargcheck, +`ifelse(eval($2==1 && !m4_dollarhash_1_if_noparen_p && m4_length(`$3')==0),1, +`m4_error(`dont use a deflit as $1() because it loses the brackets (see deflit in asm-incl.m4 for more information) +')')') + + +dnl Usage: m4_assert(`expr') +dnl +dnl Test a compile-time requirement with an m4 expression. The expression +dnl should be quoted, and will be eval()ed and expected to be non-zero. +dnl For example, +dnl +dnl m4_assert(`FOO*2+6 < 14') + +define(m4_assert, +m4_assert_numargs(1) +`ifelse(eval($1),1,, +`m4_error(`assertion failed: $1 +')')') + + +dnl -------------------------------------------------------------------------- +dnl Various assembler things, not specific to any particular CPU. +dnl + + +dnl Usage: include_mpn(`filename') +dnl +dnl Like include(), but adds a path to the mpn source directory. For +dnl example, +dnl +dnl include_mpn(`sparc64/addmul_1h.asm') + +define(include_mpn, +m4_assert_numargs(1) +m4_assert_defined(`CONFIG_TOP_SRCDIR') +`include(CONFIG_TOP_SRCDIR`/mpn/$1')') + + +dnl Usage: C comment ... +dnl +dnl "C" works like a FORTRAN-style comment character. This can be used for +dnl comments to the right of assembly instructions, where just dnl would +dnl remove the linefeed, and concatenate adjacent lines. 
+dnl +dnl "C" and/or "dnl" are useful when an assembler doesn't support comments, +dnl or where different assemblers for a particular CPU have different +dnl comment styles. The intermediate ".s" files will end up with no +dnl comments, just code. +dnl +dnl Using "C" is not intended to cause offence to anyone who doesn't like +dnl FORTRAN; but if that happens it's an unexpected bonus. + +define(C, ` +dnl') + + +dnl Various possible defines passed from the Makefile that are to be tested +dnl with ifdef() rather than be expanded. + +m4_not_for_expansion(`PIC') + +dnl aors_n +m4_not_for_expansion(`OPERATION_add_n') +m4_not_for_expansion(`OPERATION_sub_n') + +dnl aorsmul_n +m4_not_for_expansion(`OPERATION_addmul_1') +m4_not_for_expansion(`OPERATION_submul_1') + +dnl logops_n +m4_not_for_expansion(`OPERATION_and_n') +m4_not_for_expansion(`OPERATION_andn_n') +m4_not_for_expansion(`OPERATION_nand_n') +m4_not_for_expansion(`OPERATION_ior_n') +m4_not_for_expansion(`OPERATION_iorn_n') +m4_not_for_expansion(`OPERATION_nior_n') +m4_not_for_expansion(`OPERATION_xor_n') +m4_not_for_expansion(`OPERATION_xnor_n') + +dnl popham +m4_not_for_expansion(`OPERATION_popcount') +m4_not_for_expansion(`OPERATION_hamdist') + + +dnl Usage: m4_config_gmp_mparam(`symbol') +dnl +dnl Check that `symbol' is defined. If it isn't, issue an error and +dnl terminate immediately. The error message explains that the symbol +dnl should be in config.m4, copied from gmp-mparam.h. +dnl +dnl Processing is terminated immediately since missing something like +dnl KARATSUBA_SQR_THRESHOLD can lead to infinite loops with endless error +dnl messages. + +define(m4_config_gmp_mparam, +m4_assert_numargs(1) +`ifdef(`$1',, +`m4_error(`$1 is not defined. + "configure" should have extracted this from gmp-mparam.h and put it + in config.m4, but somehow this has failed. +')m4exit(1)')') + + +dnl Usage: defreg(name,reg) +dnl +dnl Give a name to a $ style register. 
For example, +dnl +dnl defreg(foo,$12) +dnl +dnl defreg() inserts an extra pair of quotes after the $ so that it's not +dnl interpreted as an m4 macro parameter, ie. foo is actually $`'12. m4 +dnl strips those quotes when foo is expanded. +dnl +dnl deflit() is used to make the new definition, so it will expand +dnl literally even if followed by parentheses ie. foo(99) will become +dnl $12(99). (But there's nowhere that would be used is there?) +dnl +dnl When making further definitions from existing defreg() macros, remember +dnl to use defreg() again to protect the $ in the new definitions too. For +dnl example, +dnl +dnl defreg(a0,$4) +dnl defreg(a1,$5) +dnl ... +dnl +dnl defreg(PARAM_DST,a0) +dnl +dnl This is only because a0 is expanding at the time the PARAM_DST +dnl definition is made, leaving a literal $4 that must be re-quoted. On +dnl the other hand in something like the following ra is only expanded when +dnl ret is used and its $`'31 protection will have its desired effect at +dnl that time. +dnl +dnl defreg(ra,$31) +dnl ... +dnl define(ret,`j ra') +dnl +dnl Note that only $n forms are meant to be used here, and something like +dnl 128($30) doesn't get protected and will come out wrong. + +define(defreg, +m4_assert_numargs(2) +`deflit(`$1', +substr(`$2',0,1)``''substr(`$2',1))') + + +dnl Usage: m4_instruction_wrapper() +dnl +dnl Put this, unquoted, on a line on its own, at the start of a macro +dnl that's a wrapper around an assembler instruction. It adds code to give +dnl a descriptive error message if the macro is invoked without arguments. +dnl +dnl For example, suppose jmp needs to be wrapped, +dnl +dnl define(jmp, +dnl m4_instruction_wrapper() +dnl m4_assert_numargs(1) +dnl `.byte 0x42 +dnl .long $1 +dnl nop') +dnl +dnl The point of m4_instruction_wrapper is to get a better error message +dnl than m4_assert_numargs would give if jmp is accidentally used as plain +dnl "jmp foo" instead of the intended "jmp( foo)".
"jmp()" with no +dnl argument also provokes the error message. +dnl +dnl m4_instruction_wrapper should only be used with wrapped instructions +dnl that take arguments, since obviously something meant to be used as +dnl plain "ret", say, doesn't want to give an error when used that way. + +define(m4_instruction_wrapper, +m4_assert_numargs(0) +``m4_instruction_wrapper_internal'(m4_doublequote($`'0),dnl +m4_doublequote(ifdef(`__file__',__file__,`the m4 sources')),dnl +$`#',m4_doublequote($`'1))`dnl'') + +dnl Called: m4_instruction_wrapper_internal($0,`filename',$#,$1) +define(m4_instruction_wrapper_internal, +`ifelse(eval($3<=1 && m4_length(`$4')==0),1, +`m4_error(`$1 is a macro replacing that instruction and needs arguments, see $2 for details +')')') + + +dnl Usage: UNROLL_LOG2, UNROLL_MASK, UNROLL_BYTES +dnl CHUNK_LOG2, CHUNK_MASK, CHUNK_BYTES +dnl +dnl When code supports a variable amount of loop unrolling, the convention +dnl is to define UNROLL_COUNT to the number of limbs processed per loop. +dnl When testing code this can be varied to see how much the loop overhead +dnl is costing. For example, +dnl +dnl deflit(UNROLL_COUNT, 32) +dnl +dnl If the forloop() generating the unrolled loop has a pattern processing +dnl more than one limb, the convention is to express this with CHUNK_COUNT. +dnl For example, +dnl +dnl deflit(CHUNK_COUNT, 2) +dnl +dnl The LOG2, MASK and BYTES definitions below are derived from these COUNT +dnl definitions. If COUNT is redefined, the LOG2, MASK and BYTES follow +dnl the new definition automatically. +dnl +dnl LOG2 is the log base 2 of COUNT. MASK is COUNT-1, which can be used as +dnl a bit mask. BYTES is BYTES_PER_MP_LIMB*COUNT, the number of bytes +dnl processed in each unrolled loop. +dnl +dnl BYTES_PER_MP_LIMB is defined in a CPU specific m4 include file. It +dnl exists only so the BYTES definitions here can be common to all CPUs. 
+dnl In the actual code for a given CPU, an explicit 4 or 8 may as well be +dnl used because the code is only for a particular CPU, it doesn't need to +dnl be general. +dnl +dnl Note that none of these macros do anything except give conventional +dnl names to commonly used things. You still have to write your own +dnl expressions for a forloop() and the resulting address displacements. +dnl Something like the following would be typical for 4 bytes per limb. +dnl +dnl forloop(`i',0,UNROLL_COUNT-1,` +dnl deflit(`disp',eval(i*4)) +dnl ... +dnl ') +dnl +dnl Or when using CHUNK_COUNT, +dnl +dnl forloop(`i',0,UNROLL_COUNT/CHUNK_COUNT-1,` +dnl deflit(`disp0',eval(i*CHUNK_COUNT*4)) +dnl deflit(`disp1',eval(disp0+4)) +dnl ... +dnl ') +dnl +dnl Clearly `i' can be run starting from 1, or from high to low or whatever +dnl best suits. + +deflit(UNROLL_LOG2, +m4_assert_defined(`UNROLL_COUNT') +`m4_log2(UNROLL_COUNT)') + +deflit(UNROLL_MASK, +m4_assert_defined(`UNROLL_COUNT') +`eval(UNROLL_COUNT-1)') + +deflit(UNROLL_BYTES, +m4_assert_defined(`UNROLL_COUNT') +m4_assert_defined(`BYTES_PER_MP_LIMB') +`eval(UNROLL_COUNT * BYTES_PER_MP_LIMB)') + +deflit(CHUNK_LOG2, +m4_assert_defined(`CHUNK_COUNT') +`m4_log2(CHUNK_COUNT)') + +deflit(CHUNK_MASK, +m4_assert_defined(`CHUNK_COUNT') +`eval(CHUNK_COUNT-1)') + +deflit(CHUNK_BYTES, +m4_assert_defined(`CHUNK_COUNT') +m4_assert_defined(`BYTES_PER_MP_LIMB') +`eval(CHUNK_COUNT * BYTES_PER_MP_LIMB)') + + +dnl Usage: MPN(name) +dnl +dnl Add MPN_PREFIX to a name. +dnl MPN_PREFIX defaults to "__gmpn_" if not defined. + +ifdef(`MPN_PREFIX',, +`define(`MPN_PREFIX',`__gmpn_')') + +define(MPN, +m4_assert_numargs(1) +`MPN_PREFIX`'$1') + + +dnl Usage: mpn_add_n, etc +dnl +dnl Convenience definitions using MPN(), like the #defines in gmp.h. Each +dnl function that might be implemented in assembler is here. 
+ +define(define_mpn, +m4_assert_numargs(1) +`define(`mpn_$1',`MPN(`$1')')') + +define_mpn(add) +define_mpn(add_1) +define_mpn(add_n) +define_mpn(add_nc) +define_mpn(addmul_1) +define_mpn(addmul_1c) +define_mpn(addsub_n) +define_mpn(addsub_nc) +define_mpn(and_n) +define_mpn(andn_n) +define_mpn(bdivmod) +define_mpn(cmp) +define_mpn(com_n) +define_mpn(copyd) +define_mpn(copyi) +define_mpn(divexact_by3c) +define_mpn(divrem) +define_mpn(divrem_1) +define_mpn(divrem_1c) +define_mpn(divrem_2) +define_mpn(divrem_classic) +define_mpn(divrem_newton) +define_mpn(dump) +define_mpn(gcd) +define_mpn(gcd_1) +define_mpn(gcdext) +define_mpn(get_str) +define_mpn(hamdist) +define_mpn(invert_limb) +define_mpn(ior_n) +define_mpn(iorn_n) +define_mpn(kara_mul_n) +define_mpn(kara_sqr_n) +define_mpn(lshift) +define_mpn(lshiftc) +define_mpn(mod_1) +define_mpn(mod_1c) +define_mpn(mul) +define_mpn(mul_1) +define_mpn(mul_1c) +define_mpn(mul_basecase) +define_mpn(mul_n) +define_mpn(perfect_square_p) +define_mpn(popcount) +define_mpn(preinv_mod_1) +define_mpn(nand_n) +define_mpn(nior_n) +define_mpn(random) +define_mpn(random2) +define_mpn(rshift) +define_mpn(rshiftc) +define_mpn(scan0) +define_mpn(scan1) +define_mpn(set_str) +define_mpn(sqr_basecase) +define_mpn(sub_n) +define_mpn(sqrtrem) +define_mpn(sub) +define_mpn(sub_1) +define_mpn(sub_n) +define_mpn(sub_nc) +define_mpn(submul_1) +define_mpn(submul_1c) +define_mpn(toom3_mul_n) +define_mpn(toom3_sqr_n) +define_mpn(umul_ppmm) +define_mpn(udiv_qrnnd) +define_mpn(xnor_n) +define_mpn(xor_n) + +define(`ASM_START', + `') + +define(`PROLOGUE', + ` + TEXT + ALIGN(4) + GLOBL GSYM_PREFIX`$1' + TYPE(GSYM_PREFIX`$1',`function') +GSYM_PREFIX`$1':') + +define(`EPILOGUE', + ` + SIZE(GSYM_PREFIX`$1',.-GSYM_PREFIX`$1')') + +dnl LSYM_PREFIX might be L$, so defn() must be used to quote it or the L +dnl will expand as the L macro, an infinite recursion. 
+define(`L',`defn(`LSYM_PREFIX')$1') + +define(`INT32', + ` + ALIGN(4) +$1: + W32 $2 + ') + +define(`INT64', + ` + ALIGN(8) +$1: + W32 $2 + W32 $3 + ') + + +dnl Usage: ALIGN(bytes) +dnl +dnl Emit a ".align" directive. The alignment is specified in bytes, and +dnl will normally need to be a power of 2. The actual ".align" generated +dnl is either bytes or logarithmic according to what ./configure detects. +dnl +dnl ALIGN_FILL_0x90, if defined and equal to "yes", means a ", 0x90" should +dnl be appended (this is for x86). + +define(ALIGN, +m4_assert_numargs(1) +m4_assert_defined(`ALIGN_LOGARITHMIC') +`.align ifelse(ALIGN_LOGARITHMIC,yes,`m4_log2($1)',`eval($1)')dnl +ifelse(ALIGN_FILL_0x90,yes,`, 0x90')') + + +dnl Usage: MULFUNC_PROLOGUE(function function...) +dnl +dnl A dummy macro which is grepped for by ./configure to know what +dnl functions a multi-function file is providing. Use this if there aren't +dnl explicit PROLOGUE()s for each possible function. +dnl +dnl Multiple MULFUNC_PROLOGUEs can be used, or just one with the function +dnl names separated by spaces. + +define(`MULFUNC_PROLOGUE', +m4_assert_numargs(1) +`') + + +divert`'dnl diff --git a/ghc/rts/gmp/mpn/clipper/add_n.s b/ghc/rts/gmp/mpn/clipper/add_n.s index 8d9b986..538a1ca 100644 --- a/ghc/rts/gmp/mpn/clipper/add_n.s +++ b/ghc/rts/gmp/mpn/clipper/add_n.s @@ -1,29 +1,29 @@ -; Clipper __mpn_add_n -- Add two limb vectors of the same length > 0 and store +; Clipper __gmpn_add_n -- Add two limb vectors of the same length > 0 and store ; sum in a third limb vector. -; Copyright (C) 1995 Free Software Foundation, Inc. +; Copyright (C) 1995, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. 
; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. .text .align 16 -.globl ___mpn_add_n -___mpn_add_n: +.globl ___gmpn_add_n +___gmpn_add_n: subq $8,sp storw r6,(sp) loadw 12(sp),r2 diff --git a/ghc/rts/gmp/mpn/clipper/mul_1.s b/ghc/rts/gmp/mpn/clipper/mul_1.s index 44d92c3..c0c7564 100644 --- a/ghc/rts/gmp/mpn/clipper/mul_1.s +++ b/ghc/rts/gmp/mpn/clipper/mul_1.s @@ -1,29 +1,29 @@ -; Clipper __mpn_mul_1 -- Multiply a limb vector with a limb and store +; Clipper __gmpn_mul_1 -- Multiply a limb vector with a limb and store ; the result in a second limb vector. -; Copyright (C) 1995 Free Software Foundation, Inc. +; Copyright (C) 1995, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. 
; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. .text .align 16 -.globl ___mpn_mul_1 -___mpn_mul_1: +.globl ___gmpn_mul_1 +___gmpn_mul_1: subq $8,sp storw r6,(sp) loadw 12(sp),r2 diff --git a/ghc/rts/gmp/mpn/clipper/sub_n.s b/ghc/rts/gmp/mpn/clipper/sub_n.s index 882c991..44d8797 100644 --- a/ghc/rts/gmp/mpn/clipper/sub_n.s +++ b/ghc/rts/gmp/mpn/clipper/sub_n.s @@ -1,29 +1,29 @@ -; Clipper __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +; Clipper __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and ; store difference in a third limb vector. -; Copyright (C) 1995 Free Software Foundation, Inc. +; Copyright (C) 1995, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. 
; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. .text .align 16 -.globl ___mpn_sub_n -___mpn_sub_n: +.globl ___gmpn_sub_n +___gmpn_sub_n: subq $8,sp storw r6,(sp) loadw 12(sp),r2 diff --git a/ghc/rts/gmp/mpn/cray/README b/ghc/rts/gmp/mpn/cray/README new file mode 100644 index 0000000..8195c67 --- /dev/null +++ b/ghc/rts/gmp/mpn/cray/README @@ -0,0 +1,14 @@ +The (poorly optimized) code in this directory was originally written for a +j90 system, but finished on a c90. It should work on all Cray vector +computers. For the T3E and T3D systems, the `alpha' subdirectory at the +same level as the directory containing this file, is much better. + +* `+' seems to be faster than `|' when combining carries. + +* It is possible that the best multiply performance would be achieved by + storing only 24 bits per element, and using lazy carry propagation. Before + calling i24mult, full carry propagation would be needed. + +* Supply tasking versions of the C loops.
+ + diff --git a/ghc/rts/gmp/mpn/cray/add_n.c b/ghc/rts/gmp/mpn/cray/add_n.c new file mode 100644 index 0000000..1fdb394 --- /dev/null +++ b/ghc/rts/gmp/mpn/cray/add_n.c @@ -0,0 +1,96 @@ +/* mpn_add_n -- Add two limb vectors of equal, non-zero length. + For Cray vector processors. + + Copyright (C) 1996, 2000 Free Software Foundation, Inc. + + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, write to + the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +mp_limb_t +mpn_add_n (c, a, b, n) + mp_ptr c; + mp_srcptr a, b; + mp_size_t n; +{ + mp_size_t i; + mp_size_t nm1 = n - 1; + int more_carries = 0; + int carry_out; + + /* For small operands the non-vector code is faster. 
*/ + if (n < 16) + goto sequential; + + if (a == c || b == c) + { + TMP_DECL (marker); + TMP_MARK (marker); + if (c == a) + { + /* allocate temp space for a */ + mp_ptr ax = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB); + MPN_COPY (ax, a, n); + a = (mp_srcptr) ax; + } + if (c == b) + { + /* allocate temp space for b */ + mp_ptr bx = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB); + MPN_COPY (bx, b, n); + b = (mp_srcptr) bx; + } + carry_out = mpn_add_n (c, a, b, n); + TMP_FREE (marker); + return carry_out; + } + + carry_out = a[nm1] + b[nm1] < a[nm1]; + +#pragma _CRI ivdep /* Cray PVP systems */ + for (i = nm1; i > 0; i--) + { + int cy_in; + cy_in = a[i - 1] + b[i - 1] < a[i - 1]; + c[i] = a[i] + b[i] + cy_in; + more_carries += c[i] < cy_in; + } + c[0] = a[0] + b[0]; + + if (more_carries) + { + /* This won't vectorize, but we should come here rarely. */ + int cy; + sequential: + cy = 0; + for (i = 0; i < n; i++) + { + mp_limb_t ai, ci, t; + ai = a[i]; + t = b[i] + cy; + cy = t < cy; + ci = ai + t; + cy += ci < ai; + c[i] = ci; + } + carry_out = cy; + } + + return carry_out; +} diff --git a/ghc/rts/gmp/mpn/cray/addmul_1.c b/ghc/rts/gmp/mpn/cray/addmul_1.c new file mode 100644 index 0000000..031b4e8 --- /dev/null +++ b/ghc/rts/gmp/mpn/cray/addmul_1.c @@ -0,0 +1,46 @@ +/* mpn_addmul_1 for Cray PVP. + +Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. 
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Add up[0..n-1] * limb to rp[0..n-1] and return the limb carried out of
+   the top.  GMPN_MULWW fills one vector with the high words and one with
+   the low words of the n single-limb products (presumably the Fortran
+   routine gmpn_mulww from mulww.f -- confirm the external name mapping).
+   The low words line up with rp; the high words belong one limb higher. */
+mp_limb_t
+mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t limb)
+{
+  mp_ptr hi, lo, sum;
+  mp_limb_t carry;
+  TMP_DECL (marker);
+  TMP_MARK (marker);
+
+  hi = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+  lo = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+  sum = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+
+  /* n and limb go by address, as the callee takes all arguments by reference. */
+  GMPN_MULWW (hi, lo, up, &n, &limb);
+
+  /* sum = rp + low words */
+  carry = mpn_add_n (sum, rp, lo, n);
+
+  /* limb 0 is final; the remaining limbs still need the high words */
+  rp[0] = sum[0];
+  carry = carry + mpn_add_n (rp + 1, sum + 1, hi, n - 1);
+
+  /* the topmost high word has no result limb left and goes to the carry */
+  carry = carry + hi[n - 1];
+
+  TMP_FREE (marker);
+  return carry;
+}
diff --git a/ghc/rts/gmp/mpn/cray/gmp-mparam.h b/ghc/rts/gmp/mpn/cray/gmp-mparam.h
index 349c812..14f7b8e 100644
--- a/ghc/rts/gmp/mpn/cray/gmp-mparam.h
+++ b/ghc/rts/gmp/mpn/cray/gmp-mparam.h
@@ -5,16 +5,16 @@ Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
 This file is part of the GNU MP Library.
 
 The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of the GNU Library General Public License as published by
-the Free Software Foundation; either version 2 of the License, or (at your
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
 option) any later version.
 
 The GNU MP Library is distributed in the hope that it will be useful, but
 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
 License for more details.
-You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpn/cray/mul_1.c b/ghc/rts/gmp/mpn/cray/mul_1.c new file mode 100644 index 0000000..0c8750b --- /dev/null +++ b/ghc/rts/gmp/mpn/cray/mul_1.c @@ -0,0 +1,44 @@ +/* mpn_mul_1 for Cray PVP. + +Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/
+
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Store up[0..n-1] * limb into rp[0..n-1] and return the limb carried out
+   of the top.  GMPN_MULWW delivers the high and the low words of the n
+   single-limb products in two separate vectors; result limb i is low word
+   i plus high word i-1. */
+mp_limb_t
+mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t limb)
+{
+  mp_ptr hi, lo;
+  mp_limb_t carry;
+  TMP_DECL (marker);
+  TMP_MARK (marker);
+
+  hi = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+  lo = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+
+  /* n and limb go by address, as the callee takes all arguments by reference. */
+  GMPN_MULWW (hi, lo, up, &n, &limb);
+
+  /* nothing is added into the lowest limb */
+  rp[0] = lo[0];
+
+  /* limbs 1..n-1: low word plus the high word of the product below */
+  carry = mpn_add_n (rp + 1, lo + 1, hi, n - 1);
+
+  /* the topmost high word has no result limb left and goes to the carry */
+  carry = carry + hi[n - 1];
+
+  TMP_FREE (marker);
+  return carry;
+}
diff --git a/ghc/rts/gmp/mpn/cray/mulww.f b/ghc/rts/gmp/mpn/cray/mulww.f
new file mode 100644
index 0000000..99507c1
--- /dev/null
+++ b/ghc/rts/gmp/mpn/cray/mulww.f
@@ -0,0 +1,54 @@
+c Helper for mpn_mul_1, mpn_addmul_1, and mpn_submul_1 for Cray PVP.
+
+c Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+c This file is part of the GNU MP Library.
+
+c The GNU MP Library is free software; you can redistribute it and/or
+c modify it under the terms of the GNU Lesser General Public License as
+c published by the Free Software Foundation; either version 2.1 of the
+c License, or (at your option) any later version.
+
+c The GNU MP Library is distributed in the hope that it will be useful,
+c but WITHOUT ANY WARRANTY; without even the implied warranty of
+c MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+c Lesser General Public License for more details.
+
+c You should have received a copy of the GNU Lesser General Public
+c License along with the GNU MP Library; see the file COPYING.LIB. If
+c not, write to the Free Software Foundation, Inc., 59 Temple Place -
+c Suite 330, Boston, MA 02111-1307, USA.
+
+c p1[] = hi(a[]*s); the upper limbs of each product
+c p0[] = low(a[]*s); the corresponding lower limbs
+c n is number of limbs in the vectors
+c
+c Method: s and every a(i) are cut into three pieces at bit positions
+c 0, 22 and 44 using the mask 4194303 = 2**22-1, and each piece is
+c shifted left by 24 before it is handed to i24mult.
+c NOTE(review): presumably i24mult wants its operands left-justified
+c that way and returns the plain product of the two pieces - confirm
+c against the compiler's intrinsic documentation.
+c
+c With that reading, t0..t4 are the column sums of the piece products:
+c     a(i)*s = t0 + t1*2**22 + t2*2**44 + t3*2**66 + t4*2**88
+c p0(i) keeps what falls below bit 64 (the 64-bit adds wrap around),
+c p1(i) collects what lies at bit 64 and above, plus the carry c that
+c the three terms of p0(i) produce when summed.
+c Other masks used: 4398046511103 = 2**42-1 and 1048575 = 2**20-1.
+
+      subroutine gmpn_mulww(p1,p0,a,n,s)
+      integer*8 p1(0:*),p0(0:*),a(0:*),s
+      integer n
+
+      integer*8 a0,a1,a2,s0,s1,s2,c
+      integer*8 ai,t0,t1,t2,t3,t4
+
+c The pieces of the multiplier s are the same for every limb.
+      s0 = shiftl(and(s,4194303),24)
+      s1 = shiftl(and(shiftr(s,22),4194303),24)
+      s2 = shiftl(and(shiftr(s,44),4194303),24)
+
+      do i = 0,n-1
+         ai = a(i)
+         a0 = shiftl(and(ai,4194303),24)
+         a1 = shiftl(and(shiftr(ai,22),4194303),24)
+         a2 = shiftl(and(shiftr(ai,44),4194303),24)
+
+c Column sums, weights 2**0, 2**22, 2**44, 2**66 and 2**88.
+         t0 = i24mult(a0,s0)
+         t1 = i24mult(a0,s1)+i24mult(a1,s0)
+         t2 = i24mult(a0,s2)+i24mult(a1,s1)+i24mult(a2,s0)
+         t3 = i24mult(a1,s2)+i24mult(a2,s1)
+         t4 = i24mult(a2,s2)
+
+         p0(i)=shiftl(t2,44)+shiftl(t1,22)+t0
+c c is the carry out of bit 63 of the sum above.  It is computed in
+c units of 2**22 from only those parts of t0, t1 and t2 that stay
+c below bit 64; the final shift by 42 drops the 42 bits that remain
+c inside the low limb.
+         c=shiftr(shiftr(t0,22)+and(t1,4398046511103)+
+     $        shiftl(and(t2,1048575),22),42)
+c High limb: t4 and t3 moved down by 64 bits (88-64=24, 66-64=2), the
+c parts of t2 and t1 that reach bit 64 and above, and the carry c.
+         p1(i)=shiftl(t4,24)+shiftl(t3,2)+shiftr(t2,20)+shiftr(t1,42)+c
+      end do
+      end
diff --git a/ghc/rts/gmp/mpn/cray/mulww.s b/ghc/rts/gmp/mpn/cray/mulww.s
new file mode 100644
index 0000000..890cdcf
--- /dev/null
+++ b/ghc/rts/gmp/mpn/cray/mulww.s
@@ -0,0 +1,245 @@
+* Helper for mpn_mul_1, mpn_addmul_1, and mpn_submul_1 for Cray PVP.
+
+* Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+* This file is generated from mulww.f in this same directory.
+
+* This file is part of the GNU MP Library.
+
+* The GNU MP Library is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public License as
+* published by the Free Software Foundation; either version 2.1 of the
+* License, or (at your option) any later version.
+
+* The GNU MP Library is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* Lesser General Public License for more details.
+ +* You should have received a copy of the GNU Lesser General Public +* License along with the GNU MP Library; see the file COPYING.LIB. If +* not, write to the Free Software Foundation, Inc., 59 Temple Place - +* Suite 330, Boston, MA 02111-1307, USA. + + IDENT GMPN_MULWW +********************************************** +* Assemble with Cal Version 2.0 * +* * +* Generated by CFT77 6.0.4.19 * +* on 06/27/00 at 04:34:13 * +* * +********************************************** +* ALLOW UNDERSCORES IN IDENTIFIERS + EDIT OFF + FORMAT NEW +@DATA SECTION DATA,CM +@DATA = W.* + CON O'0000000000040000000000 + CON O'0435152404713723252514 ;GMPN_MUL 1 + CON O'0535270000000000000000 ;WW 1 + CON O'0000000000000001200012 ;trbk tbl 1 + VWD 32/0,32/P.GMPN_MULWW ;trbk tbl 1 + CON O'0014003000000000001416 ;trbk tbl 1 + CON O'0000000000000000000011 ;trbk tbl 1 + CON O'0000000000000000000215 ;trbk tbl 1 + BSSZ 1 ;trbk tbl 1 +@CODE SECTION CODE +@CODE = P.* +L3 = P.* ; 1 + A0 A6 ;arg base 1 + A5 6 ;num Darg 1 + B03,A5 0,A0 ;load DAs 1 + A0 A1+A2 ; 1 + A5 1 ;num Ts 1 + 0,A0 T00,A5 ; 1 + B02 A2 ;new base 1 + B66 A3 ;stk top 1 + B01 A6 ;arg base 1 + A7 P.L4 ;ofrn rtn 1 + B00 A7 ;return 1 + A6 @DATA ; 1 + J $STKOFEN ;$STKOFEN 1 +GMPN_MULWW = P.* ; 1 + A0 @DATA+3 ;(trbk) 1 + B77 A0 ;(trbk) 1 + A1 13 ;num Bs 1 + A0 B66 ;stk top 1 + A2 B66 ;stk tmp 1 + A4 B67 ;stk limt 1 + 0,A0 B77,A1 ; 1 + A7 782 ;stk size 1 + A3 A2+A7 ; 1 + A0 A4-A3 ; 1 + JAM L3 ;overflow 1 + A0 A6 ;arg base 1 + A5 6 ;num Darg 1 + B03,A5 0,A0 ;load DAs 1 + A0 A1+A2 ; 1 + A5 1 ;num Ts 1 + 0,A0 T00,A5 ; 1 + B02 A2 ;new base 1 + B66 A3 ;new top 1 + B01 A6 ;arg base 1 +L4 = P.* ;ofrn rtn 1 + A7 B07 ;regs 14 + S7 0,A7 ; 14 + A6 B10 ;regs 9 + S6 0,A6 ; 9 + S5 1 ; 14 + S4 <22 ; 9 + S7 S7-S5 ; 14 + S5 #S7 ; 14 + T00 S6 ;regs 10 + S6 S6>22 ; 10 + S7 T00 ;regs 11 + S7 S7>44 ; 11 + S3 T00 ;regs 9 + S3 S3&S4 ; 9 + S6 S6&S4 ; 10 + S7 S7&S4 ; 11 + S3 S3<24 ; 9 + S6 S6<24 ; 10 + S7 S7<24 ; 11 + S0 S5 ;regs 14 + S4 S5 ;regs 14 + S1 S6 ;regs 
14 + S2 S3 ;regs 14 + S3 S7 ;regs 14 + JSP L5 ; 14 +L6 = P.* ; 14 + S7 -S4 ; 14 + A2 S7 ;regs 14 + VL A2 ;regs 14 + A3 B06 ;s_bt_sp 14 + A5 B05 ;s_bt_sp 14 + A4 B04 ;s_bt_sp 14 + A1 VL ; 14 + A2 S4 ;regs 14 +L7 = P.* ; 14 + A0 A3 ;regs 15 + VL A1 ;regs 15 + V7 ,A0,1 ; 15 + B11 A5 ;s_bt_sp 15 + A7 22 ; 17 + B12 A4 ;s_bt_sp 17 + V6 V7>A7 ; 17 + B13 A3 ;s_bt_sp 17 + S7 <22 ; 17 + A3 B02 ;s_bt_sp 17 + V5 S7&V6 ; 17 + A6 24 ; 17 + V4 V5A5 ; 18 + V2 S1*FV1 ; 21 + V3 S7&V5 ; 18 + A0 14 ; 34 + B77 A0 ;regs 34 + A4 B77 ;regs 34 + A0 A4+A3 ; 34 + ,A0,1 V2 ;v_ld_str 34 + V0 V3A7 ; 28 + V2 S2*FV0 ; 22 + V3 V6+V2 ; 22 + S7 <20 ; 28 + V1 S7&V3 ; 28 + A4 270 ; 34 + A0 A4+A3 ; 34 + ,A0,1 V0 ;v_ld_str 34 + A4 14 ; 34 + A0 A4+A3 ; 34 + V7 ,A0,1 ;v_ld_str 34 + V6 V1A5 ; 32 + V0 S1*FV4 ; 23 + A5 654 ; 34 + A0 A5+A3 ; 34 + ,A0,1 V1 ;v_ld_str 34 + V6 V7+V0 ; 23 + A5 2 ; 32 + V2 V6A6 ; 28 + A5 654 ; 34 + CPW ;cmr_vrsp 34 + A0 A5+A3 ; 34 + V1 ,A0,1 ;v_ld_str 34 + A5 398 ; 34 + A0 A5+A3 ; 34 + V3 ,A0,1 ;v_ld_str 34 + V6 V4+V1 ; 32 + V2 V3>A6 ; 32 + V5 V6+V2 ; 32 + A6 B12 ;s_bt_sp 32 + V4 V3 0; i--) + { + int cy_in; mp_limb_t t; + cy_in = a[i - 1] < b[i - 1]; + t = a[i] - b[i]; + more_carries += t < cy_in; + c[i] = t - cy_in; + } + c[0] = a[0] - b[0]; + + if (more_carries) + { + /* This won't vectorize, but we should come here rarely. */ + int cy; + sequential: + cy = 0; + for (i = 0; i < n; i++) + { + mp_limb_t ai, ci, t; + ai = a[i]; + t = b[i] + cy; + cy = t < cy; + ci = ai - t; + cy += ci > ai; + c[i] = ci; + } + carry_out = cy; + } + + return carry_out; +} diff --git a/ghc/rts/gmp/mpn/cray/submul_1.c b/ghc/rts/gmp/mpn/cray/submul_1.c new file mode 100644 index 0000000..4d2fb13 --- /dev/null +++ b/ghc/rts/gmp/mpn/cray/submul_1.c @@ -0,0 +1,46 @@ +/* mpn_submul_1 for Cray PVP. + +Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Subtract up[0..n-1] * limb from rp[0..n-1] and return the limb borrowed
+   from above the top.  GMPN_MULWW delivers the high and the low words of
+   the n single-limb products in two separate vectors; the low words line
+   up with rp, the high words belong one limb higher. */
+mp_limb_t
+mpn_submul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t limb)
+{
+  mp_ptr hi, lo, diff;
+  mp_limb_t borrow;
+  TMP_DECL (marker);
+  TMP_MARK (marker);
+
+  hi = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+  lo = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+  diff = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+
+  /* n and limb go by address, as the callee takes all arguments by reference. */
+  GMPN_MULWW (hi, lo, up, &n, &limb);
+
+  /* diff = rp - low words */
+  borrow = mpn_sub_n (diff, rp, lo, n);
+
+  /* limb 0 is final; the remaining limbs still lose the high words */
+  rp[0] = diff[0];
+  borrow = borrow + mpn_sub_n (rp + 1, diff + 1, hi, n - 1);
+
+  /* the topmost high word has no result limb left and goes to the borrow */
+  borrow = borrow + hi[n - 1];
+
+  TMP_FREE (marker);
+  return borrow;
+}
diff --git a/ghc/rts/gmp/mpn/generic/add_n.c b/ghc/rts/gmp/mpn/generic/add_n.c
index 9d71df1..5fcb7e4 100644
--- a/ghc/rts/gmp/mpn/generic/add_n.c
+++ b/ghc/rts/gmp/mpn/generic/add_n.c
@@ -5,16 +5,16 @@ Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc.
 This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpn/generic/addmul_1.c b/ghc/rts/gmp/mpn/generic/addmul_1.c index 3a5e214..746ae31 100644 --- a/ghc/rts/gmp/mpn/generic/addmul_1.c +++ b/ghc/rts/gmp/mpn/generic/addmul_1.c @@ -8,16 +8,16 @@ Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Library General Public
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
 License for more details.
 
-You should have received a copy of the GNU Library General Public License
+You should have received a copy of the GNU Lesser General Public License
 along with the GNU MP Library; see the file COPYING.LIB. If not, write to
 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
 MA 02111-1307, USA. */
diff --git a/ghc/rts/gmp/mpn/generic/addsub_n.c b/ghc/rts/gmp/mpn/generic/addsub_n.c
new file mode 100644
index 0000000..c9bab3e
--- /dev/null
+++ b/ghc/rts/gmp/mpn/generic/addsub_n.c
@@ -0,0 +1,167 @@
+/* mpn_addsub_n -- Add and Subtract two limb vectors of equal, non-zero length.
+
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#ifndef L1_CACHE_SIZE
+#define L1_CACHE_SIZE 8192 /* only 68040 has less than this */
+#endif
+
+/* Number of limbs handled per chunk. */
+#define PART_SIZE (L1_CACHE_SIZE / BYTES_PER_MP_LIMB / 6)
+
+
+/* mpn_addsub_n.
+   r1[] = s1[] + s2[]
+   r2[] = s1[] - s2[]
+   All operands have n limbs.
+   In-place operations allowed.
+
+   Return value: 2 * (carry out of the addition) + (borrow out of the
+   subtraction).
+
+   The work is done in chunks of at most PART_SIZE limbs; the carry and the
+   borrow are handed from one chunk to the next, either through
+   mpn_add_nc/mpn_sub_nc or, where only the plain mpn_add_n/mpn_sub_n exist
+   natively, by a follow-up mpn_add_1/mpn_sub_1 on the chunk just written. */
+mp_limb_t
+#if __STDC__
+mpn_addsub_n (mp_ptr r1p, mp_ptr r2p, mp_srcptr s1p, mp_srcptr s2p, mp_size_t n)
+#else
+mpn_addsub_n (r1p, r2p, s1p, s2p, n)
+     mp_ptr r1p, r2p;
+     mp_srcptr s1p, s2p;
+     mp_size_t n;
+#endif
+{
+  mp_limb_t acyn, acyo; /* carry for add */
+  mp_limb_t scyn, scyo; /* carry for subtract */
+  mp_size_t off; /* offset in operands */
+  mp_size_t this_n; /* size of current chunk */
+
+  /* We alternatingly add and subtract in chunks that fit into the (L1)
+     cache. Since the chunks are several hundred limbs, the function call
+     overhead is insignificant, but we get much better locality. */
+
+  /* We have three variants of the inner loop; the proper loop is chosen
+     depending on whether r1 or r2 are the same operand as s1 or s2. */
+
+  if (r1p != s1p && r1p != s2p)
+    {
+      /* r1 is not identical to either input operand. We can therefore write
+         to r1 directly, without using temporary storage.  The sum is
+         produced first, so the difference may then overwrite an input. */
+      acyo = 0;
+      scyo = 0;
+      for (off = 0; off < n; off += PART_SIZE)
+        {
+          this_n = MIN (n - off, PART_SIZE);
+#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n
+          acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo);
+#else
+          acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n);
+          acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo);
+#endif
+#if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n
+          scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
+#else
+          scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
+          scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
+#endif
+        }
+    }
+  else if (r2p != s1p && r2p != s2p)
+    {
+      /* r2 is not identical to either input operand. We can therefore write
+         to r2 directly, without using temporary storage.  The difference is
+         produced first, so the sum may then overwrite an input. */
+      acyo = 0;
+      scyo = 0;
+      for (off = 0; off < n; off += PART_SIZE)
+        {
+          this_n = MIN (n - off, PART_SIZE);
+#if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n
+          scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
+#else
+          scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
+          scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
+#endif
+#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n
+          acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo);
+#else
+          acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n);
+          acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo);
+#endif
+        }
+    }
+  else
+    {
+      /* r1 and r2 are identical to s1 and s2 (r1==s1 and r2=s2 or vice versa)
+         Need temporary storage.  The sum of a chunk goes to tp, the
+         difference is written in place, and only then is tp copied to r1. */
+      mp_limb_t tp[PART_SIZE];
+      acyo = 0;
+      scyo = 0;
+      for (off = 0; off < n; off += PART_SIZE)
+        {
+          this_n = MIN (n - off, PART_SIZE);
+#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n
+          acyo = mpn_add_nc (tp, s1p + off, s2p + off, this_n, acyo);
+#else
+          acyn = mpn_add_n (tp, s1p + off, s2p + off, this_n);
+          acyo = acyn + mpn_add_1 (tp, tp, this_n, acyo);
+#endif
+#if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n
+          scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
+#else
+          scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
+          scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
+#endif
+          MPN_COPY (r1p + off, tp, this_n);
+        }
+    }
+
+  return 2 * acyo + scyo;
+}
+
+#ifdef MAIN
+#include <stdio.h>
+#include <stdlib.h>
+#include "timing.h"
+
+long cputime ();
+
+/* Timing driver, built only with -DMAIN.  The single argument is the
+   operand size in limbs.  The operands are left uninitialized; only the
+   running time is of interest. */
+int
+main (int argc, char **argv)
+{
+  mp_ptr r1p, r2p, s1p, s2p;
+  double t;
+  mp_size_t n;
+
+  if (argc < 2)
+    {
+      fprintf (stderr, "usage: addsub_n limbs\n");
+      return 1;
+    }
+
+  n = strtol (argv[1], 0, 0);
+  if (n < 1)
+    {
+      /* the mpn functions require non-zero length operands */
+      fprintf (stderr, "limb count must be at least 1\n");
+      return 1;
+    }
+
+  r1p = malloc (n * BYTES_PER_MP_LIMB);
+  r2p = malloc (n * BYTES_PER_MP_LIMB);
+  s1p = malloc (n * BYTES_PER_MP_LIMB);
+  s2p = malloc (n * BYTES_PER_MP_LIMB);
+  if (r1p == 0 || r2p == 0 || s1p == 0 || s2p == 0)
+    {
+      fprintf (stderr, "out of memory\n");
+      return 1;
+    }
+
+  TIME (t,(mpn_add_n(r1p,s1p,s2p,n),mpn_sub_n(r1p,s1p,s2p,n)));
+  printf (" separate add and sub: %.3f\n", t);
+  TIME (t,mpn_addsub_n(r1p,r2p,s1p,s2p,n));
+  printf ("combined addsub separate variables: %.3f\n", t);
+  /* r1 == s1: takes the second variant of the inner loop */
+  TIME (t,mpn_addsub_n(r1p,r2p,r1p,s2p,n));
+  printf (" combined addsub r1 overlap: %.3f\n", t);
+  /* r2 == s1: takes the first variant of the inner loop.  This call used
+     to repeat the r1 case, so r2 overlap was never measured. */
+  TIME (t,mpn_addsub_n(r1p,r2p,r2p,s2p,n));
+  printf (" combined addsub r2 overlap: %.3f\n", t);
+  /* r1 == s1 and r2 == s2: takes the variant with temporary storage */
+  TIME (t,mpn_addsub_n(r1p,r2p,r1p,r2p,n));
+  printf (" combined addsub in-place: %.3f\n", t);
+
+  return 0;
+}
+#endif
diff --git a/ghc/rts/gmp/mpn/generic/bdivmod.c b/ghc/rts/gmp/mpn/generic/bdivmod.c
index f095288..c4bcb41 100644
--- a/ghc/rts/gmp/mpn/generic/bdivmod.c
+++ b/ghc/rts/gmp/mpn/generic/bdivmod.c
@@ -1,20 +1,21 @@
 /* mpn/bdivmod.c: mpn_bdivmod for computing U/V mod 2^d.
 
-Copyright (C) 1991, 1993, 1994, 1995, 1996 Free Software Foundation, Inc.
+Copyright (C) 1991, 1993, 1994, 1995, 1996, 1999, 2000 Free Software
+Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
 The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of the GNU Library General Public License as published by
-the Free Software Foundation; either version 2 of the License, or (at your
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
 option) any later version.
 
 The GNU MP Library is distributed in the hope that it will be useful, but
 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
 License for more details.
 
-You should have received a copy of the GNU Library General Public License
+You should have received a copy of the GNU Lesser General Public License
 along with the GNU MP Library; see the file COPYING.LIB. If not, write to
 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
 MA 02111-1307, USA.
*/ @@ -65,27 +66,13 @@ mpn_bdivmod (qp, up, usize, vp, vsize, d) unsigned long int d; #endif { - /* Cache for v_inv is used to make mpn_accelgcd faster. */ - static mp_limb_t previous_low_vlimb = 0; - static mp_limb_t v_inv; /* 1/V mod 2^BITS_PER_MP_LIMB. */ + mp_limb_t v_inv; - if (vp[0] != previous_low_vlimb) /* Cache miss; compute v_inv. */ - { - mp_limb_t v = previous_low_vlimb = vp[0]; - mp_limb_t make_zero = 1; - mp_limb_t two_i = 1; - v_inv = 0; - do - { - while ((two_i & make_zero) == 0) - two_i <<= 1, v <<= 1; - v_inv += two_i; - make_zero -= v; - } - while (make_zero); - } + /* 1/V mod 2^BITS_PER_MP_LIMB. */ + modlimb_invert (v_inv, vp[0]); - /* Need faster computation for some common cases in mpn_accelgcd. */ + /* Fast code for two cases previously used by the accel part of mpn_gcd. + (Could probably remove this now it's inlined there.) */ if (usize == 2 && vsize == 2 && (d == BITS_PER_MP_LIMB || d == 2*BITS_PER_MP_LIMB)) { @@ -114,12 +101,16 @@ mpn_bdivmod (qp, up, usize, vp, vsize, d) { mp_limb_t b; mp_limb_t q = (up[0] * v_inv) & (((mp_limb_t)1< vsize) mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b); return q; diff --git a/ghc/rts/gmp/mpn/generic/bz_divrem_n.c b/ghc/rts/gmp/mpn/generic/bz_divrem_n.c new file mode 100644 index 0000000..9d2df1b --- /dev/null +++ b/ghc/rts/gmp/mpn/generic/bz_divrem_n.c @@ -0,0 +1,223 @@ +/* mpn_bz_divrem_n and auxilliary routines. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE + INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. + IN FACT, IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A + FUTURE GNU MP RELEASE. + + +Copyright (C) 2000 Free Software Foundation, Inc. +Contributed by Paul Zimmermann. + +This file is part of the GNU MP Library. 
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/*
+[1] Fast Recursive Division, by Christoph Burnikel and Joachim Ziegler,
+    Technical report MPI-I-98-1-022, october 1998.
+    http://www.mpi-sb.mpg.de/~ziegler/TechRep.ps.gz
+*/
+
+static mp_limb_t mpn_bz_div_3_halves_by_2 _PROTO ((mp_ptr, mp_ptr, mp_srcptr,
+                                                   mp_size_t, mp_ptr));
+
+static mp_limb_t mpn_bz_divrem_aux _PROTO ((mp_ptr, mp_ptr, mp_srcptr,
+                                            mp_size_t, mp_ptr));
+
+/* mpn_bz_divrem_n(n) calls 2*mul(n/2)+2*div(n/2), thus to be faster than
+   div(n) = 4*div(n/2), we need mul(n/2) to be faster than the classic way,
+   i.e. n/2 >= KARATSUBA_MUL_THRESHOLD */
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD (7 * KARATSUBA_MUL_THRESHOLD)
+#endif
+
+#if 0
+static
+unused_mpn_divrem (qp, qxn, np, nn, dp, dn)
+     mp_ptr qp;
+     mp_size_t qxn;
+     mp_ptr np;
+     mp_size_t nn;
+     mp_srcptr dp;
+     mp_size_t dn;
+{
+  /* This might be useful: */
+  if (qxn != 0)
+    {
+      mp_limb_t c;
+      mp_ptr tp = alloca ((nn + qxn) * BYTES_PER_MP_LIMB);
+      MPN_COPY (tp + qxn - nn, np, nn);
+      MPN_ZERO (tp, qxn);
+      c = mpn_divrem (qp, 0L, tp, nn + qxn, dp, dn);
+      /* Maybe copy proper part of tp to np? Documentation is unclear about
+         the returned np value when qxn != 0 */
+      return c;
+    }
+}
+#endif
+
+/* mpn_bz_divrem_n - Implements algorithm of page 8 in [1]: divides (np,2n)
+   by (dp,n) and puts the quotient in (qp,n), the remainder in (np,n).
+   Returns most significant limb of the quotient, which is 0 or 1.
+   Requires that the most significant bit of the divisor is set.
+
+   Odd n: the top 2n-2 limbs of np are divided by the top n-1 limbs of dp
+   (schoolbook below BZ_THRESHOLD, recursively otherwise), the result is
+   corrected for the ignored limb dp[0], and one schoolbook step produces
+   the lowest quotient limb.
+   Even n: two calls to mpn_bz_div_3_halves_by_2 on halves of n/2 limbs,
+   using n limbs of scratch obtained with TMP_ALLOC.
+
+   NOTE(review): when the high half (np+n,n) is >= (dp,n) the code below
+   sets qhl = 1, subtracts the divisor and then calls abort (), so as
+   written such a call terminates the process instead of returning 1.
+   The even-n branch also assigns qhl rather than adding to it, which
+   suggests this case was never completed -- confirm that all callers
+   guarantee (np+n,n) < (dp,n). */
+
+mp_limb_t
+#if __STDC__
+mpn_bz_divrem_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n)
+#else
+mpn_bz_divrem_n (qp, np, dp, n)
+     mp_ptr qp;
+     mp_ptr np;
+     mp_srcptr dp;
+     mp_size_t n;
+#endif
+{
+  mp_limb_t qhl = 0;
+  if (mpn_cmp (np + n, dp, n) >= 0)
+    {
+      qhl = 1;
+      mpn_sub_n (np + n, np + n, dp, n);
+      abort ();         /* see the NOTE(review) in the header comment */
+    }
+  if (n % 2 != 0)
+    {
+      /* divide (2n - 2) most significant limbs from np by those (n - 1) from dp */
+      if (n < BZ_THRESHOLD)
+        qhl += mpn_sb_divrem_mn (qp + 1, np + 2, 2 * (n - 1), dp + 1, n - 1);
+      else
+        qhl += mpn_bz_divrem_n (qp + 1, np + 2, dp + 1, n - 1);
+      /* now (qp + 1, n - 1) contains the quotient of (np + 2, 2n - 2) by
+         (dp + 1, n - 1) and (np + 2, n - 1) contains the remainder */
+      if (mpn_sub_1 (np + n, np + n, 1,
+                     mpn_submul_1 (np + 1, qp + 1, n - 1, dp[0])))
+        {
+          /* quotient too large */
+          qhl -= mpn_sub_1 (qp + 1, qp + 1, n - 1, 1L);
+          if (mpn_add_n (np + 1, np + 1, dp, n) == 0)
+            { /* still too large */
+              qhl -= mpn_sub_1 (qp + 1, qp + 1, n - 1, 1L);
+              mpn_add_n (np + 1, np + 1, dp, n); /* always carry here */
+            }
+        }
+      /* now divide (np, n + 1) by (dp, n) */
+      qhl += mpn_add_1 (qp + 1, qp + 1, n - 1,
+                        mpn_sb_divrem_mn (qp, np, n + 1, dp, n));
+    }
+  else
+    {
+      mp_ptr tmp;
+      mp_size_t n2 = n/2;
+      TMP_DECL (marker);
+      TMP_MARK (marker);
+      tmp = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB);         /* room for the 2*n2-limb product formed in mpn_bz_div_3_halves_by_2 */
+      qhl = mpn_bz_div_3_halves_by_2 (qp + n2, np + n2, dp, n2, tmp);  /* upper half of the quotient */
+      qhl += mpn_add_1 (qp + n2, qp + n2, n2,
+                        mpn_bz_div_3_halves_by_2 (qp, np, dp, n2, tmp));       /* lower half; its high limb is added into the upper half */
+      TMP_FREE (marker);
+    }
+  return qhl;
+}
+
+/* Like mpn_bz_divrem_n, but without memory allocation.
Also
+   assumes mpn_cmp (np + n, dp, n) < 0.
+
+   tmp is scratch space supplied by the caller; it is only passed on to
+   mpn_bz_div_3_halves_by_2, which stores a product of two (n/2)-limb
+   operands there.  Apart from the missing comparison/abort prologue and
+   the unconditional recursion in the odd case, the steps are those of
+   mpn_bz_divrem_n. */
+
+static mp_limb_t
+#if __STDC__
+mpn_bz_divrem_aux (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n, mp_ptr tmp)
+#else
+mpn_bz_divrem_aux (qp, np, dp, n, tmp)
+     mp_ptr qp;
+     mp_ptr np;
+     mp_srcptr dp;
+     mp_size_t n;
+     mp_ptr tmp;
+#endif
+{
+  mp_limb_t qhl;
+
+  if (n % 2 != 0)
+    {
+      /* divide (2n - 2) most significant limbs from np by those (n - 1) from dp */
+      qhl = mpn_bz_divrem_aux (qp + 1, np + 2, dp + 1, n - 1, tmp);
+      /* now (qp + 1, n - 1) contains the quotient of (np + 2, 2n - 2) by
+         (dp + 1, n - 1) and (np + 2, n - 1) contains the remainder */
+      if (mpn_sub_1 (np + n, np + n, 1,
+                     mpn_submul_1 (np + 1, qp + 1, n - 1, dp[0])))
+        {
+          /* quotient too large */
+          qhl -= mpn_sub_1 (qp + 1, qp + 1, n - 1, 1L);
+          if (mpn_add_n (np + 1, np + 1, dp, n) == 0)
+            { /* still too large */
+              qhl -= mpn_sub_1 (qp + 1, qp + 1, n - 1, 1L);
+              mpn_add_n (np + 1, np + 1, dp, n); /* always carry here */
+            }
+        }
+      /* now divide (np, n + 1) by (dp, n) */
+      qhl += mpn_add_1 (qp + 1, qp + 1, n - 1,
+                        mpn_sb_divrem_mn (qp, np, n + 1, dp, n));
+    }
+  else
+    {
+      mp_size_t n2 = n/2;
+      qhl = mpn_bz_div_3_halves_by_2 (qp + n2, np + n2, dp, n2, tmp);  /* upper half of the quotient */
+      qhl += mpn_add_1 (qp + n2, qp + n2, n2,
+                        mpn_bz_div_3_halves_by_2 (qp, np, dp, n2, tmp));       /* lower half; its high limb is added into the upper half */
+    }
+  return qhl;
+}
+
+/* divides (np, 3n) by (dp, 2n) and puts the quotient in (qp, n),
+   the remainder in (np, 2n).
+
+   The quotient is first estimated from the top 2n limbs of np and the top
+   n limbs of dp (schoolbook below BZ_THRESHOLD, mpn_bz_divrem_aux
+   otherwise).  The estimate times the low n limbs of dp is then formed in
+   tmp (2n limbs) and subtracted from np; if that borrows, the quotient is
+   decremented and dp added back, at most twice.  Returns the high limb of
+   the quotient after those corrections. */
+
+static mp_limb_t
+#if __STDC__
+mpn_bz_div_3_halves_by_2 (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n,
+                          mp_ptr tmp)
+#else
+mpn_bz_div_3_halves_by_2 (qp, np, dp, n, tmp)
+     mp_ptr qp;
+     mp_ptr np;
+     mp_srcptr dp;
+     mp_size_t n;
+     mp_ptr tmp;
+#endif
+{
+  mp_size_t twon = n + n;
+  mp_limb_t qhl;
+
+  if (n < BZ_THRESHOLD)
+    qhl = mpn_sb_divrem_mn (qp, np + n, twon, dp + n, n);
+  else
+    qhl = mpn_bz_divrem_aux (qp, np + n, dp + n, n, tmp);
+  /* q = (qp, n), c = (np + n, n) with the notations of [1] */
+  mpn_mul_n (tmp, qp, dp, n);
+  if (qhl != 0)
+    mpn_add_n (tmp + n, tmp + n, dp, n);        /* account for the quotient's high limb: adds dp one limb-block up */
+  if (mpn_sub_n (np, np, tmp, twon)) /* R = (np, 2n) */
+    {
+      qhl -= mpn_sub_1 (qp, qp, n, 1L);
+      if (mpn_add_n (np, np, dp, twon) == 0)
+        { /* qp still too large */
+          qhl -= mpn_sub_1 (qp, qp, n, 1L);
+          mpn_add_n (np, np, dp, twon); /* always carry here */
+        }
+    }
+  return qhl;
+}
diff --git a/ghc/rts/gmp/mpn/generic/cmp.c b/ghc/rts/gmp/mpn/generic/cmp.c
index 4e9c60d..8e9792f 100644
--- a/ghc/rts/gmp/mpn/generic/cmp.c
+++ b/ghc/rts/gmp/mpn/generic/cmp.c
@@ -5,16 +5,16 @@ Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
 This file is part of the GNU MP Library.
 
 The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of the GNU Library General Public License as published by
-the Free Software Foundation; either version 2 of the License, or (at your
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
 option) any later version.
 
 The GNU MP Library is distributed in the hope that it will be useful, but
 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
 License for more details.
 
-You should have received a copy of the GNU Library General Public License
+You should have received a copy of the GNU Lesser General Public License
 along with the GNU MP Library; see the file COPYING.LIB. If not, write to
 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
 MA 02111-1307, USA. */
diff --git a/ghc/rts/gmp/mpn/generic/diveby3.c b/ghc/rts/gmp/mpn/generic/diveby3.c
new file mode 100644
index 0000000..a2fb552
--- /dev/null
+++ b/ghc/rts/gmp/mpn/generic/diveby3.c
@@ -0,0 +1,77 @@
+/* mpn_divexact_by3 -- mpn division by 3, expecting no remainder. */
+
+/*
+Copyright (C) 2000 Free Software Foundation, Inc.
+ +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + + +#include "gmp.h" +#include "gmp-impl.h" + + +/* Multiplicative inverse of 3, modulo 2^BITS_PER_MP_LIMB. + 0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. */ +#define INVERSE_3 ((MP_LIMB_T_MAX / 3) * 2 + 1) + + +/* The "c += ..."s are adding the high limb of 3*l to c. That high limb + will be 0, 1 or 2. Doing two separate "+="s seems to turn out better + code on gcc (as of 2.95.2 at least). + + When a subtraction of a 0,1,2 carry value causes a borrow, that leaves a + limb value of either 0xFF...FF or 0xFF...FE and the multiply by INVERSE_3 + gives 0x55...55 or 0xAA...AA respectively, producing a further borrow of + only 0 or 1 respectively. Hence the carry out of each stage and for the + return value is always only 0, 1 or 2. 
*/ + +mp_limb_t +#if __STDC__ +mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t c) +#else +mpn_divexact_by3c (dst, src, size, c) + mp_ptr dst; + mp_srcptr src; + mp_size_t size; + mp_limb_t c; +#endif +{ + mp_size_t i; + + ASSERT (size >= 1); + + i = 0; + do + { + mp_limb_t l, s; + + s = src[i]; + l = s - c; + c = (l > s); + + l *= INVERSE_3; + dst[i] = l; + + c += (l > MP_LIMB_T_MAX/3); + c += (l > (MP_LIMB_T_MAX/3)*2); + } + while (++i < size); + + return c; +} diff --git a/ghc/rts/gmp/mpn/generic/divrem.c b/ghc/rts/gmp/mpn/generic/divrem.c index 1fe865a..30673e7 100644 --- a/ghc/rts/gmp/mpn/generic/divrem.c +++ b/ghc/rts/gmp/mpn/generic/divrem.c @@ -1,21 +1,23 @@ /* mpn_divrem -- Divide natural numbers, producing both remainder and - quotient. + quotient. This is now just a middle layer for calling the new + internal mpn_tdiv_qr. -Copyright (C) 1993, 1994, 1995, 1996 Free Software Foundation, Inc. +Copyright (C) 1993, 1994, 1995, 1996, 1997, 1999, 2000 Free Software +Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
-You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -24,222 +26,76 @@ MA 02111-1307, USA. */ #include "gmp-impl.h" #include "longlong.h" -/* Divide num (NP/NSIZE) by den (DP/DSIZE) and write - the NSIZE-DSIZE least significant quotient limbs at QP - and the DSIZE long remainder at NP. If QEXTRA_LIMBS is - non-zero, generate that many fraction bits and append them after the - other quotient limbs. - Return the most significant limb of the quotient, this is always 0 or 1. - - Preconditions: - 0. NSIZE >= DSIZE. - 1. The most significant bit of the divisor must be set. - 2. QP must either not overlap with the input operands at all, or - QP + DSIZE >= NP must hold true. (This means that it's - possible to put the quotient in the high part of NUM, right after the - remainder in NUM. - 3. NSIZE >= DSIZE, even if QEXTRA_LIMBS is non-zero. */ - mp_limb_t #if __STDC__ -mpn_divrem (mp_ptr qp, mp_size_t qextra_limbs, - mp_ptr np, mp_size_t nsize, - mp_srcptr dp, mp_size_t dsize) +mpn_divrem (mp_ptr qp, mp_size_t qxn, + mp_ptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn) #else -mpn_divrem (qp, qextra_limbs, np, nsize, dp, dsize) +mpn_divrem (qp, qxn, np, nn, dp, dn) mp_ptr qp; - mp_size_t qextra_limbs; + mp_size_t qxn; mp_ptr np; - mp_size_t nsize; + mp_size_t nn; mp_srcptr dp; - mp_size_t dsize; + mp_size_t dn; #endif { - mp_limb_t most_significant_q_limb = 0; - - switch (dsize) + if (dn == 1) { - case 0: - /* We are asked to divide by zero, so go ahead and do it! (To make - the compiler not remove this statement, return the value.) 
*/ - return 1 / dsize; - - case 1: - { - mp_size_t i; - mp_limb_t n1; - mp_limb_t d; - - d = dp[0]; - n1 = np[nsize - 1]; - - if (n1 >= d) - { - n1 -= d; - most_significant_q_limb = 1; - } - - qp += qextra_limbs; - for (i = nsize - 2; i >= 0; i--) - udiv_qrnnd (qp[i], n1, n1, np[i], d); - qp -= qextra_limbs; - - for (i = qextra_limbs - 1; i >= 0; i--) - udiv_qrnnd (qp[i], n1, n1, 0, d); - - np[0] = n1; - } - break; - - case 2: - { - mp_size_t i; - mp_limb_t n1, n0, n2; - mp_limb_t d1, d0; - - np += nsize - 2; - d1 = dp[1]; - d0 = dp[0]; - n1 = np[1]; - n0 = np[0]; - - if (n1 >= d1 && (n1 > d1 || n0 >= d0)) - { - sub_ddmmss (n1, n0, n1, n0, d1, d0); - most_significant_q_limb = 1; - } - - for (i = qextra_limbs + nsize - 2 - 1; i >= 0; i--) - { - mp_limb_t q; - mp_limb_t r; - - if (i >= qextra_limbs) - np--; - else - np[0] = 0; - - if (n1 == d1) - { - /* Q should be either 111..111 or 111..110. Need special - treatment of this rare case as normal division would - give overflow. */ - q = ~(mp_limb_t) 0; - - r = n0 + d1; - if (r < d1) /* Carry in the addition? */ - { - add_ssaaaa (n1, n0, r - d0, np[0], 0, d0); - qp[i] = q; - continue; - } - n1 = d0 - (d0 != 0); - n0 = -d0; - } - else - { - udiv_qrnnd (q, r, n1, n0, d1); - umul_ppmm (n1, n0, d0, q); - } - - n2 = np[0]; - q_test: - if (n1 > r || (n1 == r && n0 > n2)) - { - /* The estimated Q was too large. */ - q--; + mp_limb_t ret; + mp_ptr q2p; + mp_size_t qn; + TMP_DECL (marker); - sub_ddmmss (n1, n0, n1, n0, 0, d0); - r += d1; - if (r >= d1) /* If not carry, test Q again. 
*/ - goto q_test; - } + TMP_MARK (marker); + q2p = (mp_ptr) TMP_ALLOC ((nn + qxn) * BYTES_PER_MP_LIMB); - qp[i] = q; - sub_ddmmss (n1, n0, r, n2, n1, n0); - } - np[1] = n1; - np[0] = n0; - } - break; + np[0] = mpn_divrem_1 (q2p, qxn, np, nn, dp[0]); + qn = nn + qxn - 1; + MPN_COPY (qp, q2p, qn); + ret = q2p[qn]; - default: - { - mp_size_t i; - mp_limb_t dX, d1, n0; - - np += nsize - dsize; - dX = dp[dsize - 1]; - d1 = dp[dsize - 2]; - n0 = np[dsize - 1]; - - if (n0 >= dX) - { - if (n0 > dX || mpn_cmp (np, dp, dsize - 1) >= 0) - { - mpn_sub_n (np, np, dp, dsize); - n0 = np[dsize - 1]; - most_significant_q_limb = 1; - } - } - - for (i = qextra_limbs + nsize - dsize - 1; i >= 0; i--) - { - mp_limb_t q; - mp_limb_t n1, n2; - mp_limb_t cy_limb; - - if (i >= qextra_limbs) - { - np--; - n2 = np[dsize]; - } - else - { - n2 = np[dsize - 1]; - MPN_COPY_DECR (np + 1, np, dsize); - np[0] = 0; - } - - if (n0 == dX) - /* This might over-estimate q, but it's probably not worth - the extra code here to find out. */ - q = ~(mp_limb_t) 0; - else - { - mp_limb_t r; - - udiv_qrnnd (q, r, n0, np[dsize - 1], dX); - umul_ppmm (n1, n0, d1, q); - - while (n1 > r || (n1 == r && n0 > np[dsize - 2])) - { - q--; - r += dX; - if (r < dX) /* I.e. "carry in previous addition?" */ - break; - n1 -= n0 < d1; - n0 -= d1; - } - } - - /* Possible optimization: We already have (q * n0) and (1 * n1) - after the calculation of q. Taking advantage of that, we - could make this loop make two iterations less. 
*/ - - cy_limb = mpn_submul_1 (np, dp, dsize, q); - - if (n2 != cy_limb) - { - mpn_add_n (np, np, dp, dsize); - q--; - } - - qp[i] = q; - n0 = np[dsize - 1]; - } - } + TMP_FREE (marker); + return ret; + } + else if (dn == 2) + { + return mpn_divrem_2 (qp, qxn, np, nn, dp); + } + else + { + mp_ptr rp, q2p; + mp_limb_t qhl; + mp_size_t qn; + TMP_DECL (marker); + + TMP_MARK (marker); + if (qxn != 0) + { + mp_ptr n2p; + n2p = (mp_ptr) TMP_ALLOC ((nn + qxn) * BYTES_PER_MP_LIMB); + MPN_ZERO (n2p, qxn); + MPN_COPY (n2p + qxn, np, nn); + q2p = (mp_ptr) TMP_ALLOC ((nn - dn + qxn + 1) * BYTES_PER_MP_LIMB); + rp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB); + mpn_tdiv_qr (q2p, rp, 0L, n2p, nn + qxn, dp, dn); + MPN_COPY (np, rp, dn); + qn = nn - dn + qxn; + MPN_COPY (qp, q2p, qn); + qhl = q2p[qn]; + } + else + { + q2p = (mp_ptr) TMP_ALLOC ((nn - dn + 1) * BYTES_PER_MP_LIMB); + rp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB); + mpn_tdiv_qr (q2p, rp, 0L, np, nn, dp, dn); + MPN_COPY (np, rp, dn); /* overwrite np area with remainder */ + qn = nn - dn; + MPN_COPY (qp, q2p, qn); + qhl = q2p[qn]; + } + TMP_FREE (marker); + return qhl; } - - return most_significant_q_limb; } diff --git a/ghc/rts/gmp/mpn/generic/divrem_1.c b/ghc/rts/gmp/mpn/generic/divrem_1.c index d213267..e93f241 100644 --- a/ghc/rts/gmp/mpn/generic/divrem_1.c +++ b/ghc/rts/gmp/mpn/generic/divrem_1.c @@ -6,21 +6,22 @@ QUOT_PTR and DIVIDEND_PTR might point to the same limb. -Copyright (C) 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1996, 1998, 1999, 2000 Free Software +Foundation, Inc. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -29,30 +30,219 @@ MA 02111-1307, USA. */ #include "gmp-impl.h" #include "longlong.h" -mp_limb_t + + +/* __gmpn_divmod_1_internal(quot_ptr,dividend_ptr,dividend_size,divisor_limb) + Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB. + Write DIVIDEND_SIZE limbs of quotient at QUOT_PTR. + Return the single-limb remainder. + There are no constraints on the value of the divisor. + + QUOT_PTR and DIVIDEND_PTR might point to the same limb. 
*/ + +#ifndef UMUL_TIME +#define UMUL_TIME 1 +#endif + +#ifndef UDIV_TIME +#define UDIV_TIME UMUL_TIME +#endif + +static mp_limb_t #if __STDC__ -mpn_divrem_1 (mp_ptr qp, mp_size_t qsize, +__gmpn_divmod_1_internal (mp_ptr quot_ptr, mp_srcptr dividend_ptr, mp_size_t dividend_size, mp_limb_t divisor_limb) #else -mpn_divrem_1 (qp, qsize, dividend_ptr, dividend_size, divisor_limb) - mp_ptr qp; - mp_size_t qsize; +__gmpn_divmod_1_internal (quot_ptr, dividend_ptr, dividend_size, divisor_limb) + mp_ptr quot_ptr; mp_srcptr dividend_ptr; mp_size_t dividend_size; mp_limb_t divisor_limb; #endif { + mp_size_t i; + mp_limb_t n1, n0, r; + int dummy; + + /* ??? Should this be handled at all? Rely on callers? */ + if (dividend_size == 0) + return 0; + + /* If multiplication is much faster than division, and the + dividend is large, pre-invert the divisor, and use + only multiplications in the inner loop. */ + + /* This test should be read: + Does it ever help to use udiv_qrnnd_preinv? + && Does what we save compensate for the inversion overhead? */ + if (UDIV_TIME > (2 * UMUL_TIME + 6) + && (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME) + { + int normalization_steps; + + count_leading_zeros (normalization_steps, divisor_limb); + if (normalization_steps != 0) + { + mp_limb_t divisor_limb_inverted; + + divisor_limb <<= normalization_steps; + invert_limb (divisor_limb_inverted, divisor_limb); + + n1 = dividend_ptr[dividend_size - 1]; + r = n1 >> (BITS_PER_MP_LIMB - normalization_steps); + + /* Possible optimization: + if (r == 0 + && divisor_limb > ((n1 << normalization_steps) + | (dividend_ptr[dividend_size - 2] >> ...))) + ...one division less... 
*/ + + for (i = dividend_size - 2; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd_preinv (quot_ptr[i + 1], r, r, + ((n1 << normalization_steps) + | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))), + divisor_limb, divisor_limb_inverted); + n1 = n0; + } + udiv_qrnnd_preinv (quot_ptr[0], r, r, + n1 << normalization_steps, + divisor_limb, divisor_limb_inverted); + return r >> normalization_steps; + } + else + { + mp_limb_t divisor_limb_inverted; + + invert_limb (divisor_limb_inverted, divisor_limb); + + i = dividend_size - 1; + r = dividend_ptr[i]; + + if (r >= divisor_limb) + r = 0; + else + { + quot_ptr[i] = 0; + i--; + } + + for (; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd_preinv (quot_ptr[i], r, r, + n0, divisor_limb, divisor_limb_inverted); + } + return r; + } + } + else + { + if (UDIV_NEEDS_NORMALIZATION) + { + int normalization_steps; + + count_leading_zeros (normalization_steps, divisor_limb); + if (normalization_steps != 0) + { + divisor_limb <<= normalization_steps; + + n1 = dividend_ptr[dividend_size - 1]; + r = n1 >> (BITS_PER_MP_LIMB - normalization_steps); + + /* Possible optimization: + if (r == 0 + && divisor_limb > ((n1 << normalization_steps) + | (dividend_ptr[dividend_size - 2] >> ...))) + ...one division less... */ + + for (i = dividend_size - 2; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd (quot_ptr[i + 1], r, r, + ((n1 << normalization_steps) + | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))), + divisor_limb); + n1 = n0; + } + udiv_qrnnd (quot_ptr[0], r, r, + n1 << normalization_steps, + divisor_limb); + return r >> normalization_steps; + } + } + /* No normalization needed, either because udiv_qrnnd doesn't require + it, or because DIVISOR_LIMB is already normalized. 
*/ + + i = dividend_size - 1; + r = dividend_ptr[i]; + + if (r >= divisor_limb) + r = 0; + else + { + quot_ptr[i] = 0; + i--; + } + + for (; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd (quot_ptr[i], r, r, n0, divisor_limb); + } + return r; + } +} + + + +mp_limb_t +#if __STDC__ +mpn_divrem_1 (mp_ptr qp, mp_size_t qxn, + mp_srcptr np, mp_size_t nn, + mp_limb_t d) +#else +mpn_divrem_1 (qp, qxn, np, nn, d) + mp_ptr qp; + mp_size_t qxn; + mp_srcptr np; + mp_size_t nn; + mp_limb_t d; +#endif +{ mp_limb_t rlimb; - long i; + mp_size_t i; /* Develop integer part of quotient. */ - rlimb = mpn_divmod_1 (qp + qsize, dividend_ptr, dividend_size, divisor_limb); + rlimb = __gmpn_divmod_1_internal (qp + qxn, np, nn, d); - if (qsize != 0) + /* Develop fraction part of quotient. This is not as fast as it should; + the preinvert stuff from __gmpn_divmod_1_internal ought to be used here + too. */ + if (UDIV_NEEDS_NORMALIZATION) { - for (i = qsize - 1; i >= 0; i--) - udiv_qrnnd (qp[i], rlimb, rlimb, 0, divisor_limb); + int normalization_steps; + + count_leading_zeros (normalization_steps, d); + if (normalization_steps != 0) + { + d <<= normalization_steps; + rlimb <<= normalization_steps; + + for (i = qxn - 1; i >= 0; i--) + udiv_qrnnd (qp[i], rlimb, rlimb, 0, d); + + return rlimb >> normalization_steps; + } + else + /* fall out */ + ; } + + for (i = qxn - 1; i >= 0; i--) + udiv_qrnnd (qp[i], rlimb, rlimb, 0, d); + return rlimb; } diff --git a/ghc/rts/gmp/mpn/generic/divrem_2.c b/ghc/rts/gmp/mpn/generic/divrem_2.c new file mode 100644 index 0000000..0bc31ae --- /dev/null +++ b/ghc/rts/gmp/mpn/generic/divrem_2.c @@ -0,0 +1,151 @@ +/* mpn_divrem_2 -- Divide natural numbers, producing both remainder and + quotient. The divisor is two limbs. + + THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS + ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS + ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP + RELEASE. 
+ + +Copyright (C) 1993, 1994, 1995, 1996, 1999, 2000 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Divide num (NP/NSIZE) by den (DP/2) and write + the NSIZE-2 least significant quotient limbs at QP + and the 2-limb remainder at NP. If QXN is + non-zero, generate that many fraction limbs and append them after the + other quotient limbs. + Return the most significant limb of the quotient; this is always 0 or 1. + + Preconditions: + 0. NSIZE >= 2. + 1. The most significant bit of the divisor must be set. + 2. QP must either not overlap with the input operands at all, or + QP + 2 >= NP must hold true. (This means that it's + possible to put the quotient in the high part of NUM, right after the + remainder in NUM.) + 3. NSIZE >= 2, even if QXN is non-zero.
*/ + +mp_limb_t +#if __STDC__ +mpn_divrem_2 (mp_ptr qp, mp_size_t qxn, + mp_ptr np, mp_size_t nsize, + mp_srcptr dp) +#else +mpn_divrem_2 (qp, qxn, np, nsize, dp) + mp_ptr qp; + mp_size_t qxn; + mp_ptr np; + mp_size_t nsize; + mp_srcptr dp; +#endif +{ + mp_limb_t most_significant_q_limb = 0; + mp_size_t i; + mp_limb_t n1, n0, n2; + mp_limb_t d1, d0; + mp_limb_t d1inv; + int have_preinv; + + np += nsize - 2; + d1 = dp[1]; + d0 = dp[0]; + n1 = np[1]; + n0 = np[0]; + + if (n1 >= d1 && (n1 > d1 || n0 >= d0)) + { + sub_ddmmss (n1, n0, n1, n0, d1, d0); + most_significant_q_limb = 1; + } + + /* If multiplication is much faster than division, preinvert the most + significant divisor limb before entering the loop. */ + if (UDIV_TIME > 2 * UMUL_TIME + 6) + { + have_preinv = 0; + if ((UDIV_TIME - (2 * UMUL_TIME + 6)) * (nsize - 2) > UDIV_TIME) + { + invert_limb (d1inv, d1); + have_preinv = 1; + } + } + + for (i = qxn + nsize - 2 - 1; i >= 0; i--) + { + mp_limb_t q; + mp_limb_t r; + + if (i >= qxn) + np--; + else + np[0] = 0; + + if (n1 == d1) + { + /* Q should be either 111..111 or 111..110. Need special treatment + of this rare case as normal division would give overflow. */ + q = ~(mp_limb_t) 0; + + r = n0 + d1; + if (r < d1) /* Carry in the addition? */ + { + add_ssaaaa (n1, n0, r - d0, np[0], 0, d0); + qp[i] = q; + continue; + } + n1 = d0 - (d0 != 0); + n0 = -d0; + } + else + { + if (UDIV_TIME > 2 * UMUL_TIME + 6 && have_preinv) + udiv_qrnnd_preinv (q, r, n1, n0, d1, d1inv); + else + udiv_qrnnd (q, r, n1, n0, d1); + umul_ppmm (n1, n0, d0, q); + } + + n2 = np[0]; + + q_test: + if (n1 > r || (n1 == r && n0 > n2)) + { + /* The estimated Q was too large. */ + q--; + + sub_ddmmss (n1, n0, n1, n0, 0, d0); + r += d1; + if (r >= d1) /* If not carry, test Q again. 
*/ + goto q_test; + } + + qp[i] = q; + sub_ddmmss (n1, n0, r, n2, n1, n0); + } + np[1] = n1; + np[0] = n0; + + return most_significant_q_limb; +} diff --git a/ghc/rts/gmp/mpn/generic/dump.c b/ghc/rts/gmp/mpn/generic/dump.c index a5831c4..66f375c 100644 --- a/ghc/rts/gmp/mpn/generic/dump.c +++ b/ghc/rts/gmp/mpn/generic/dump.c @@ -1,20 +1,76 @@ +/* THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS NOT SAFE TO + CALL THIS FUNCTION DIRECTLY. IN FACT, IT IS ALMOST GUARANTEED THAT THIS + FUNCTION WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + + +Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
+*/ + #include #include "gmp.h" #include "gmp-impl.h" void +#if __STDC__ +mpn_dump (mp_srcptr ptr, mp_size_t size) +#else mpn_dump (ptr, size) mp_srcptr ptr; mp_size_t size; +#endif { + MPN_NORMALIZE (ptr, size); + if (size == 0) printf ("0\n"); - { - while (size) - { - size--; - printf ("%0*lX", (int) (2 * BYTES_PER_MP_LIMB), ptr[size]); - } - printf ("\n"); - } + else + { + size--; + if (BYTES_PER_MP_LIMB > sizeof (long)) + { + if ((ptr[size] >> BITS_PER_MP_LIMB/2) != 0) + { + printf ("%lX", + (unsigned long) (ptr[size] >> BITS_PER_MP_LIMB/2)); + printf ("%0*lX", (int) (BYTES_PER_MP_LIMB), + (unsigned long) ptr[size]); + } + else + printf ("%lX", (unsigned long) ptr[size]); + } + else + printf ("%lX", ptr[size]); + + while (size) + { + size--; + if (BYTES_PER_MP_LIMB > sizeof (long)) + { + printf ("%0*lX", (int) (BYTES_PER_MP_LIMB), + (unsigned long) (ptr[size] >> BITS_PER_MP_LIMB/2)); + printf ("%0*lX", (int) (BYTES_PER_MP_LIMB), + (unsigned long) ptr[size]); + } + else + printf ("%0*lX", (int) (2 * BYTES_PER_MP_LIMB), ptr[size]); + } + printf ("\n"); + } } diff --git a/ghc/rts/gmp/mpn/generic/gcd.c b/ghc/rts/gmp/mpn/generic/gcd.c index 8c2bbf0..059e219 100644 --- a/ghc/rts/gmp/mpn/generic/gcd.c +++ b/ghc/rts/gmp/mpn/generic/gcd.c @@ -1,20 +1,21 @@ /* mpn/gcd.c: mpn_gcd for gcd of two odd integers. -Copyright (C) 1991, 1993, 1994, 1995, 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 1998, 2000 Free Software +Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. 
The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -22,7 +23,7 @@ MA 02111-1307, USA. */ /* Integer greatest common divisor of two unsigned integers, using the accelerated algorithm (see reference below). - mp_size_t mpn_gcd (vp, vsize, up, usize). + mp_size_t mpn_gcd (up, usize, vp, vsize). Preconditions [U = (up, usize) and V = (vp, vsize)]: @@ -47,11 +48,11 @@ MA 02111-1307, USA. */ #include "gmp-impl.h" #include "longlong.h" -/* If MIN (usize, vsize) > ACCEL_THRESHOLD, then the accelerated algorithm is - used, otherwise the binary algorithm is used. This may be adjusted for - different architectures. */ -#ifndef ACCEL_THRESHOLD -#define ACCEL_THRESHOLD 4 +/* If MIN (usize, vsize) >= GCD_ACCEL_THRESHOLD, then the accelerated + algorithm is used, otherwise the binary algorithm is used. This may be + adjusted for different architectures. 
*/ +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 5 #endif /* When U and V differ in size by more than BMOD_THRESHOLD, the accelerated @@ -62,13 +63,6 @@ enum BMOD_THRESHOLD = BITS_PER_MP_LIMB/2 }; -#define SIGN_BIT (~(~(mp_limb_t)0 >> 1)) - - -#define SWAP_LIMB(UL, VL) do{mp_limb_t __l=(UL);(UL)=(VL);(VL)=__l;}while(0) -#define SWAP_PTR(UP, VP) do{mp_ptr __p=(UP);(UP)=(VP);(VP)=__p;}while(0) -#define SWAP_SZ(US, VS) do{mp_size_t __s=(US);(US)=(VS);(VS)=__s;}while(0) -#define SWAP_MPN(UP, US, VP, VS) do{SWAP_PTR(UP,VP);SWAP_SZ(US,VS);}while(0) /* Use binary algorithm to compute V <-- GCD (V, U) for usize, vsize == 2. Both U and V must be odd. */ @@ -129,7 +123,11 @@ gcd_2 (vp, up) precision. If N2 > N1 initially, the first iteration of the while loop will swap them. In all other situations, N1 >= N2 is maintained. */ -static __gmp_inline mp_limb_t +static +#if ! defined (__i386__) +__gmp_inline /* don't inline this for the x86 */ +#endif +mp_limb_t #if __STDC__ find_a (mp_srcptr cp) #else @@ -149,7 +147,7 @@ find_a (cp) while (n2_h) /* While N2 >= 2^BITS_PER_MP_LIMB. */ { /* N1 <-- N1 % N2. 
*/ - if ((SIGN_BIT >> leading_zero_bits & n2_h) == 0) + if ((MP_LIMB_T_HIGHBIT >> leading_zero_bits & n2_h) == 0) { unsigned long int i; count_leading_zeros (i, n2_h); @@ -167,8 +165,8 @@ find_a (cp) if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l)) n1_h -= n2_h + (n1_l < n2_l), n1_l -= n2_l; - SWAP_LIMB (n1_h, n2_h); - SWAP_LIMB (n1_l, n2_l); + MP_LIMB_T_SWAP (n1_h, n2_h); + MP_LIMB_T_SWAP (n1_l, n2_l); } return n2_l; @@ -176,14 +174,14 @@ find_a (cp) mp_size_t #if __STDC__ -mpn_gcd (mp_ptr gp, mp_ptr vp, mp_size_t vsize, mp_ptr up, mp_size_t usize) +mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize) #else -mpn_gcd (gp, vp, vsize, up, usize) +mpn_gcd (gp, up, usize, vp, vsize) mp_ptr gp; - mp_ptr vp; - mp_size_t vsize; mp_ptr up; mp_size_t usize; + mp_ptr vp; + mp_size_t vsize; #endif { mp_ptr orig_vp = vp; @@ -193,9 +191,9 @@ mpn_gcd (gp, vp, vsize, up, usize) TMP_MARK (marker); - /* Use accelerated algorithm if vsize is over ACCEL_THRESHOLD. + /* Use accelerated algorithm if vsize is over GCD_ACCEL_THRESHOLD. Two EXTRA limbs for U and V are required for kary reduction. */ - if (vsize > ACCEL_THRESHOLD) + if (vsize >= GCD_ACCEL_THRESHOLD) { unsigned long int vbitsize, d; mp_ptr orig_up = up; @@ -228,7 +226,9 @@ mpn_gcd (gp, vp, vsize, up, usize) do /* Main loop. */ { - if (up[usize-1] & SIGN_BIT) /* U < 0; take twos' compl. */ + /* mpn_com_n can't be used here because anchor_up and up may + partially overlap */ + if (up[usize-1] & MP_LIMB_T_HIGHBIT) /* U < 0; take twos' compl. */ { mp_size_t i; anchor_up[0] = -up[0]; @@ -241,15 +241,15 @@ mpn_gcd (gp, vp, vsize, up, usize) if ((up[0] & 1) == 0) /* Result even; remove twos. 
*/ { - unsigned long int r; + unsigned int r; count_trailing_zeros (r, up[0]); mpn_rshift (anchor_up, up, usize, r); usize -= (anchor_up[usize-1] == 0); } else if (anchor_up != up) - MPN_COPY (anchor_up, up, usize); + MPN_COPY_INCR (anchor_up, up, usize); - SWAP_MPN (anchor_up, usize, vp, vsize); + MPN_PTR_SWAP (anchor_up,usize, vp,vsize); up = anchor_up; if (vsize <= 2) /* Kary can't handle < 2 limbs and */ @@ -271,8 +271,13 @@ mpn_gcd (gp, vp, vsize, up, usize) mp_limb_t bp[2], cp[2]; /* C <-- V/U mod 2^(2*BITS_PER_MP_LIMB). */ - cp[0] = vp[0], cp[1] = vp[1]; - mpn_bdivmod (cp, cp, 2, up, 2, 2*BITS_PER_MP_LIMB); + { + mp_limb_t u_inv, hi, lo; + modlimb_invert (u_inv, up[0]); + cp[0] = vp[0] * u_inv; + umul_ppmm (hi, lo, cp[0], up[0]); + cp[1] = (vp[1] - hi - cp[0] * up[1]) * u_inv; + } /* U <-- find_a (C) * U. */ up[usize] = mpn_mul_1 (up, up, usize, find_a (cp)); @@ -280,10 +285,17 @@ mpn_gcd (gp, vp, vsize, up, usize) /* B <-- A/C == U/V mod 2^(BITS_PER_MP_LIMB + 1). bp[0] <-- U/V mod 2^BITS_PER_MP_LIMB and - bp[1] <-- ( (U - bp[0] * V)/2^BITS_PER_MP_LIMB ) / V mod 2 */ - bp[0] = up[0], bp[1] = up[1]; - mpn_bdivmod (bp, bp, 2, vp, 2, BITS_PER_MP_LIMB); - bp[1] &= 1; /* Since V is odd, division is unnecessary. */ + bp[1] <-- ( (U - bp[0] * V)/2^BITS_PER_MP_LIMB ) / V mod 2 + + Like V/U above, but simplified because only the low bit of + bp[1] is wanted. */ + { + mp_limb_t v_inv, hi, lo; + modlimb_invert (v_inv, vp[0]); + bp[0] = up[0] * v_inv; + umul_ppmm (hi, lo, bp[0], vp[0]); + bp[1] = (up[1] + hi + (bp[0]&vp[1])) & 1; + } up[usize++] = 0; if (bp[1]) /* B < 0: U <-- U + (-B) * V. */ @@ -342,7 +354,7 @@ mpn_gcd (gp, vp, vsize, up, usize) up += 1, usize -= 1; if ((up[0] & 1) == 0) { - unsigned long int r; + unsigned int r; count_trailing_zeros (r, up[0]); mpn_rshift (up, up, usize, r); usize -= (up[usize-1] == 0); @@ -350,7 +362,7 @@ mpn_gcd (gp, vp, vsize, up, usize) /* Keep usize >= vsize. 
*/ if (usize < vsize) - SWAP_MPN (up, usize, vp, vsize); + MPN_PTR_SWAP (up, usize, vp, vsize); if (usize <= 2) /* Double precision. */ { @@ -375,7 +387,7 @@ mpn_gcd (gp, vp, vsize, up, usize) size--; while (up[size] == vp[size]); if (up[size] < vp[size]) /* usize == vsize. */ - SWAP_PTR (up, vp); + MP_PTR_SWAP (up, vp); up += zeros, usize = size + 1 - zeros; mpn_sub_n (up, up, vp + zeros, usize); } diff --git a/ghc/rts/gmp/mpn/generic/gcd_1.c b/ghc/rts/gmp/mpn/generic/gcd_1.c index ebcdfb5..1832636 100644 --- a/ghc/rts/gmp/mpn/generic/gcd_1.c +++ b/ghc/rts/gmp/mpn/generic/gcd_1.c @@ -1,20 +1,20 @@ /* mpn_gcd_1 -- -Copyright (C) 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -27,10 +27,14 @@ MA 02111-1307, USA. */ V == 0 since gcd(x,0) = x, and U does not generally fit in an mp_limb_t. 
*/ mp_limb_t +#if __STDC__ +mpn_gcd_1 (mp_srcptr up, mp_size_t size, mp_limb_t vlimb) +#else mpn_gcd_1 (up, size, vlimb) mp_srcptr up; mp_size_t size; mp_limb_t vlimb; +#endif { mp_limb_t ulimb; unsigned long int u_low_zero_bits, v_low_zero_bits; diff --git a/ghc/rts/gmp/mpn/generic/gcdext.c b/ghc/rts/gmp/mpn/generic/gcdext.c index 245e20a..fe22d77 100644 --- a/ghc/rts/gmp/mpn/generic/gcdext.c +++ b/ghc/rts/gmp/mpn/generic/gcdext.c @@ -1,20 +1,20 @@ /* mpn_gcdext -- Extended Greatest Common Divisor. -Copyright (C) 1996 Free Software Foundation, Inc. +Copyright (C) 1996, 1998, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -23,6 +23,10 @@ MA 02111-1307, USA. */ #include "gmp-impl.h" #include "longlong.h" +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 17 +#endif + #ifndef EXTEND #define EXTEND 1 #endif @@ -31,28 +35,125 @@ MA 02111-1307, USA. 
*/ int arr[BITS_PER_MP_LIMB]; #endif -#define SGN(A) (((A) < 0) ? -1 : ((A) > 0)) -/* Idea 1: After we have performed a full division, don't shift operands back, +/* mpn_gcdext (GP, SP, SSIZE, UP, USIZE, VP, VSIZE) + + Compute the extended GCD of {UP,USIZE} and {VP,VSIZE} and store the + greatest common divisor at GP (unless it is 0), and the first cofactor at + SP. Write the size of the cofactor through the pointer SSIZE. Return the + size of the value at GP. Note that SP might be a negative number; this is + denoted by storing the negative of the size through SSIZE. + + {UP,USIZE} and {VP,VSIZE} are both clobbered. + + The space allocation for all four areas needs to be USIZE+1. + + Preconditions: 1) U >= V. + 2) V > 0. */ + +/* We use Lehmer's algorithm. The idea is to extract the most significant + bits of the operands, and compute the continued fraction for them. We then + apply the gathered cofactors to the full operands. + + Idea 1: After we have performed a full division, don't shift operands back, but instead account for the extra factors-of-2 thus introduced. Idea 2: Simple generalization to use divide-and-conquer would give us an algorithm that runs faster than O(n^2). Idea 3: The input numbers need less space as the computation progresses, - while the s0 and s1 variables need more space. To save space, we + while the s0 and s1 variables need more space. To save memory, we could make them share space, and have the latter variables grow - into the former. */ + into the former. + Idea 4: We should not do double-limb arithmetic from the start. Instead, + do things in single-limb arithmetic until the quotients differ, + and then switch to double-limb arithmetic. */ + + +/* Division optimized for small quotients. If the quotient is more than one limb, + store 1 in *qh and return 0. 
*/ +static mp_limb_t +#if __STDC__ +div2 (mp_limb_t *qh, mp_limb_t n1, mp_limb_t n0, mp_limb_t d1, mp_limb_t d0) +#else +div2 (qh, n1, n0, d1, d0) + mp_limb_t *qh; + mp_limb_t n1; + mp_limb_t n0; + mp_limb_t d1; + mp_limb_t d0; +#endif +{ + if (d1 == 0) + { + *qh = 1; + return 0; + } + + if ((mp_limb_signed_t) n1 < 0) + { + mp_limb_t q; + int cnt; + for (cnt = 1; (mp_limb_signed_t) d1 >= 0; cnt++) + { + d1 = (d1 << 1) | (d0 >> (BITS_PER_MP_LIMB - 1)); + d0 = d0 << 1; + } + + q = 0; + while (cnt) + { + q <<= 1; + if (n1 > d1 || (n1 == d1 && n0 >= d0)) + { + sub_ddmmss (n1, n0, n1, n0, d1, d0); + q |= 1; + } + d0 = (d1 << (BITS_PER_MP_LIMB - 1)) | (d0 >> 1); + d1 = d1 >> 1; + cnt--; + } + + *qh = 0; + return q; + } + else + { + mp_limb_t q; + int cnt; + for (cnt = 0; n1 > d1 || (n1 == d1 && n0 >= d0); cnt++) + { + d1 = (d1 << 1) | (d0 >> (BITS_PER_MP_LIMB - 1)); + d0 = d0 << 1; + } -/* Precondition: U >= V. */ + q = 0; + while (cnt) + { + d0 = (d1 << (BITS_PER_MP_LIMB - 1)) | (d0 >> 1); + d1 = d1 >> 1; + q <<= 1; + if (n1 > d1 || (n1 == d1 && n0 >= d0)) + { + sub_ddmmss (n1, n0, n1, n0, d1, d0); + q |= 1; + } + cnt--; + } + + *qh = 0; + return q; + } +} mp_size_t #if EXTEND #if __STDC__ -mpn_gcdext (mp_ptr gp, mp_ptr s0p, +mpn_gcdext (mp_ptr gp, mp_ptr s0p, mp_size_t *s0size, mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize) #else -mpn_gcdext (gp, s0p, up, size, vp, vsize) +mpn_gcdext (gp, s0p, s0size, up, size, vp, vsize) mp_ptr gp; mp_ptr s0p; + mp_size_t *s0size; mp_ptr up; mp_size_t size; mp_ptr vp; @@ -72,24 +173,29 @@ mpn_gcd (gp, up, size, vp, vsize) #endif #endif { - mp_limb_t uh, vh; - mp_limb_signed_t A, B, C, D; + mp_limb_t A, B, C, D; int cnt; mp_ptr tp, wp; #if RECORD - mp_limb_signed_t min = 0, max = 0; + mp_limb_t max = 0; #endif #if EXTEND mp_ptr s1p; mp_ptr orig_s0p = s0p; - mp_size_t ssize, orig_size = size; + mp_size_t ssize; + int sign = 1; +#endif + int use_double_flag; TMP_DECL (mark); TMP_MARK (mark); + use_double_flag = (size >= 
GCDEXT_THRESHOLD); + tp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB); wp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB); - s1p = (mp_ptr) TMP_ALLOC (size * BYTES_PER_MP_LIMB); +#if EXTEND + s1p = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB); MPN_ZERO (s0p, size); MPN_ZERO (s1p, size); @@ -117,6 +223,7 @@ mpn_gcd (gp, up, size, vp, vsize) /* This is really what it boils down to in this case... */ s0p[0] = 0; s1p[0] = 1; + sign = -sign; #endif size = vsize; if (cnt != 0) @@ -124,66 +231,192 @@ mpn_gcd (gp, up, size, vp, vsize) mpn_rshift (up, up, size, cnt); mpn_rshift (vp, vp, size, cnt); } - { - mp_ptr xp; - xp = up; up = vp; vp = xp; - } + MP_PTR_SWAP (up, vp); } for (;;) { + mp_limb_t asign; /* Figure out exact size of V. */ vsize = size; MPN_NORMALIZE (vp, vsize); if (vsize <= 1) break; - /* Make UH be the most significant limb of U, and make VH be - corresponding bits from V. */ - uh = up[size - 1]; - vh = vp[size - 1]; - count_leading_zeros (cnt, uh); - if (cnt != 0) + if (use_double_flag) { - uh = (uh << cnt) | (up[size - 2] >> (BITS_PER_MP_LIMB - cnt)); - vh = (vh << cnt) | (vp[size - 2] >> (BITS_PER_MP_LIMB - cnt)); - } + mp_limb_t uh, vh, ul, vl; + /* Let UH,UL be the most significant limbs of U, and let VH,VL be + the corresponding bits from V. */ + uh = up[size - 1]; + vh = vp[size - 1]; + ul = up[size - 2]; + vl = vp[size - 2]; + count_leading_zeros (cnt, uh); + if (cnt != 0) + { + uh = (uh << cnt) | (ul >> (BITS_PER_MP_LIMB - cnt)); + vh = (vh << cnt) | (vl >> (BITS_PER_MP_LIMB - cnt)); + vl <<= cnt; + ul <<= cnt; + if (size >= 3) + { + ul |= (up[size - 3] >> (BITS_PER_MP_LIMB - cnt)); + vl |= (vp[size - 3] >> (BITS_PER_MP_LIMB - cnt)); + } + } -#if 0 - /* For now, only handle BITS_PER_MP_LIMB-1 bits. This makes - room for sign bit. 
*/ - uh >>= 1; - vh >>= 1; -#endif - A = 1; - B = 0; - C = 0; - D = 1; + A = 1; + B = 0; + C = 0; + D = 1; - for (;;) + asign = 0; + for (;;) + { + mp_limb_t T; + mp_limb_t qh, q1, q2; + mp_limb_t nh, nl, dh, dl; + mp_limb_t t1, t0; + mp_limb_t Th, Tl; + + sub_ddmmss (dh, dl, vh, vl, 0, C); + if ((dl | dh) == 0) + break; + add_ssaaaa (nh, nl, uh, ul, 0, A); + q1 = div2 (&qh, nh, nl, dh, dl); + if (qh != 0) + break; /* could handle this */ + + add_ssaaaa (dh, dl, vh, vl, 0, D); + if ((dl | dh) == 0) + break; + sub_ddmmss (nh, nl, uh, ul, 0, B); + q2 = div2 (&qh, nh, nl, dh, dl); + if (qh != 0) + break; /* could handle this */ + + if (q1 != q2) + break; + + asign = ~asign; + + T = A + q1 * C; + A = C; + C = T; + T = B + q1 * D; + B = D; + D = T; + umul_ppmm (t1, t0, q1, vl); + t1 += q1 * vh; + sub_ddmmss (Th, Tl, uh, ul, t1, t0); + uh = vh, ul = vl; + vh = Th, vl = Tl; + + add_ssaaaa (dh, dl, vh, vl, 0, C); + sub_ddmmss (nh, nl, uh, ul, 0, A); + q1 = div2 (&qh, nh, nl, dh, dl); + if (qh != 0) + break; /* could handle this */ + + sub_ddmmss (dh, dl, vh, vl, 0, D); + if ((dl | dh) == 0) + break; + add_ssaaaa (nh, nl, uh, ul, 0, B); + q2 = div2 (&qh, nh, nl, dh, dl); + if (qh != 0) + break; /* could handle this */ + + if (q1 != q2) + break; + + asign = ~asign; + + T = A + q1 * C; + A = C; + C = T; + T = B + q1 * D; + B = D; + D = T; + umul_ppmm (t1, t0, q1, vl); + t1 += q1 * vh; + sub_ddmmss (Th, Tl, uh, ul, t1, t0); + uh = vh, ul = vl; + vh = Th, vl = Tl; + } +#if EXTEND + if (asign) + sign = -sign; +#endif + } + else /* Same, but using single-limb calculations. */ { - mp_limb_signed_t q, T; - if (vh + C == 0 || vh + D == 0) - break; - - q = (uh + A) / (vh + C); - if (q != (uh + B) / (vh + D)) - break; - - T = A - q * C; - A = C; - C = T; - T = B - q * D; - B = D; - D = T; - T = uh - q * vh; - uh = vh; - vh = T; + mp_limb_t uh, vh; + /* Make UH be the most significant limb of U, and make VH be + corresponding bits from V. 
*/ + uh = up[size - 1]; + vh = vp[size - 1]; + count_leading_zeros (cnt, uh); + if (cnt != 0) + { + uh = (uh << cnt) | (up[size - 2] >> (BITS_PER_MP_LIMB - cnt)); + vh = (vh << cnt) | (vp[size - 2] >> (BITS_PER_MP_LIMB - cnt)); + } + + A = 1; + B = 0; + C = 0; + D = 1; + + asign = 0; + for (;;) + { + mp_limb_t q, T; + if (vh - C == 0 || vh + D == 0) + break; + + q = (uh + A) / (vh - C); + if (q != (uh - B) / (vh + D)) + break; + + asign = ~asign; + + T = A + q * C; + A = C; + C = T; + T = B + q * D; + B = D; + D = T; + T = uh - q * vh; + uh = vh; + vh = T; + + if (vh - D == 0) + break; + + q = (uh - A) / (vh + C); + if (q != (uh + B) / (vh - D)) + break; + + asign = ~asign; + + T = A + q * C; + A = C; + C = T; + T = B + q * D; + B = D; + D = T; + T = uh - q * vh; + uh = vh; + vh = T; + } +#if EXTEND + if (asign) + sign = -sign; +#endif } #if RECORD - min = MIN (A, min); min = MIN (B, min); - min = MIN (C, min); min = MIN (D, min); max = MAX (A, max); max = MAX (B, max); max = MAX (C, max); max = MAX (D, max); #endif @@ -192,7 +425,6 @@ mpn_gcd (gp, up, size, vp, vsize) { mp_limb_t qh; mp_size_t i; - /* This is quite rare. I.e., optimize something else! */ /* Normalize V (and shift up U the same amount). 
*/ @@ -209,32 +441,56 @@ mpn_gcd (gp, up, size, vp, vsize) qh = mpn_divmod (up + vsize, up, size, vp, vsize); #if EXTEND MPN_COPY (tp, s0p, ssize); - for (i = 0; i < size - vsize; i++) - { - mp_limb_t cy; - cy = mpn_addmul_1 (tp + i, s1p, ssize, up[vsize + i]); - if (cy != 0) - tp[ssize++] = cy; - } - if (qh != 0) - { - mp_limb_t cy; - abort (); - /* XXX since qh == 1, mpn_addmul_1 is overkill */ - cy = mpn_addmul_1 (tp + size - vsize, s1p, ssize, qh); - if (cy != 0) - tp[ssize++] = cy; - } -#if 0 - MPN_COPY (s0p, s1p, ssize); /* should be old ssize, kind of */ - MPN_COPY (s1p, tp, ssize); -#else { - mp_ptr xp; - xp = s0p; s0p = s1p; s1p = xp; - xp = s1p; s1p = tp; tp = xp; + mp_size_t qsize; + + qsize = size - vsize; /* size of stored quotient from division */ + if (ssize < qsize) + { + MPN_ZERO (tp + ssize, qsize - ssize); + MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */ + for (i = 0; i < ssize; i++) + { + mp_limb_t cy; + cy = mpn_addmul_1 (tp + i, up + vsize, qsize, s1p[i]); + tp[qsize + i] = cy; + } + if (qh != 0) + { + mp_limb_t cy; + cy = mpn_add_n (tp + qsize, tp + qsize, s1p, ssize); + if (cy != 0) + abort (); + } + } + else + { + MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */ + for (i = 0; i < qsize; i++) + { + mp_limb_t cy; + cy = mpn_addmul_1 (tp + i, s1p, ssize, up[vsize + i]); + tp[ssize + i] = cy; + } + if (qh != 0) + { + mp_limb_t cy; + cy = mpn_add_n (tp + qsize, tp + qsize, s1p, ssize); + if (cy != 0) + { + tp[qsize + ssize] = cy; + s1p[qsize + ssize] = 0; + ssize++; + } + } + } + ssize += qsize; + ssize -= tp[ssize - 1] == 0; } -#endif + + sign = -sign; + MP_PTR_SWAP (s0p, s1p); + MP_PTR_SWAP (s1p, tp); #endif size = vsize; if (cnt != 0) @@ -242,129 +498,115 @@ mpn_gcd (gp, up, size, vp, vsize) mpn_rshift (up, up, size, cnt); mpn_rshift (vp, vp, size, cnt); } - - { - mp_ptr xp; - xp = up; up = vp; vp = xp; - } - MPN_NORMALIZE (up, size); + MP_PTR_SWAP (up, vp); } else { +#if EXTEND + mp_size_t tsize, wsize; +#endif /* T = U*A + V*B W = U*C + 
V*D U = T V = W */ - if (SGN(A) == SGN(B)) /* should be different sign */ - abort (); - if (SGN(C) == SGN(D)) /* should be different sign */ - abort (); #if STAT - { mp_limb_t x; - x = ABS (A) | ABS (B) | ABS (C) | ABS (D); - count_leading_zeros (cnt, x); - arr[BITS_PER_MP_LIMB - cnt]++; } + { mp_limb_t x; x = A | B | C | D; count_leading_zeros (cnt, x); + arr[BITS_PER_MP_LIMB - cnt]++; } #endif if (A == 0) { - if (B != 1) abort (); + /* B == 1 and C == 1 (D is arbitrary) */ + mp_limb_t cy; MPN_COPY (tp, vp, size); - } - else - { - if (A < 0) - { - mpn_mul_1 (tp, vp, size, B); - mpn_submul_1 (tp, up, size, -A); - } - else - { - mpn_mul_1 (tp, up, size, A); - mpn_submul_1 (tp, vp, size, -B); - } - } - if (C < 0) - { - mpn_mul_1 (wp, vp, size, D); - mpn_submul_1 (wp, up, size, -C); - } - else - { - mpn_mul_1 (wp, up, size, C); - mpn_submul_1 (wp, vp, size, -D); - } - - { - mp_ptr xp; - xp = tp; tp = up; up = xp; - xp = wp; wp = vp; vp = xp; - } - + MPN_COPY (wp, up, size); + mpn_submul_1 (wp, vp, size, D); + MP_PTR_SWAP (tp, up); + MP_PTR_SWAP (wp, vp); #if EXTEND - { mp_limb_t cy; - MPN_ZERO (tp, orig_size); - if (A == 0) - { - if (B != 1) abort (); MPN_COPY (tp, s1p, ssize); + tsize = ssize; + tp[ssize] = 0; /* must zero since wp might spill below */ + MPN_COPY (wp, s0p, ssize); + cy = mpn_addmul_1 (wp, s1p, ssize, D); + wp[ssize] = cy; + wsize = ssize + (cy != 0); + MP_PTR_SWAP (tp, s0p); + MP_PTR_SWAP (wp, s1p); + ssize = MAX (wsize, tsize); +#endif } else { - if (A < 0) + if (asign) { + mp_limb_t cy; + mpn_mul_1 (tp, vp, size, B); + mpn_submul_1 (tp, up, size, A); + mpn_mul_1 (wp, up, size, C); + mpn_submul_1 (wp, vp, size, D); + MP_PTR_SWAP (tp, up); + MP_PTR_SWAP (wp, vp); +#if EXTEND cy = mpn_mul_1 (tp, s1p, ssize, B); - cy += mpn_addmul_1 (tp, s0p, ssize, -A); + cy += mpn_addmul_1 (tp, s0p, ssize, A); + tp[ssize] = cy; + tsize = ssize + (cy != 0); + cy = mpn_mul_1 (wp, s0p, ssize, C); + cy += mpn_addmul_1 (wp, s1p, ssize, D); + wp[ssize] = cy; + wsize = 
ssize + (cy != 0); + MP_PTR_SWAP (tp, s0p); + MP_PTR_SWAP (wp, s1p); + ssize = MAX (wsize, tsize); +#endif } else { + mp_limb_t cy; + mpn_mul_1 (tp, up, size, A); + mpn_submul_1 (tp, vp, size, B); + mpn_mul_1 (wp, vp, size, D); + mpn_submul_1 (wp, up, size, C); + MP_PTR_SWAP (tp, up); + MP_PTR_SWAP (wp, vp); +#if EXTEND cy = mpn_mul_1 (tp, s0p, ssize, A); - cy += mpn_addmul_1 (tp, s1p, ssize, -B); + cy += mpn_addmul_1 (tp, s1p, ssize, B); + tp[ssize] = cy; + tsize = ssize + (cy != 0); + cy = mpn_mul_1 (wp, s1p, ssize, D); + cy += mpn_addmul_1 (wp, s0p, ssize, C); + wp[ssize] = cy; + wsize = ssize + (cy != 0); + MP_PTR_SWAP (tp, s0p); + MP_PTR_SWAP (wp, s1p); + ssize = MAX (wsize, tsize); +#endif } - if (cy != 0) - tp[ssize++] = cy; - } - MPN_ZERO (wp, orig_size); - if (C < 0) - { - cy = mpn_mul_1 (wp, s1p, ssize, D); - cy += mpn_addmul_1 (wp, s0p, ssize, -C); } - else - { - cy = mpn_mul_1 (wp, s0p, ssize, C); - cy += mpn_addmul_1 (wp, s1p, ssize, -D); - } - if (cy != 0) - wp[ssize++] = cy; - } - { - mp_ptr xp; - xp = tp; tp = s0p; s0p = xp; - xp = wp; wp = s1p; s1p = xp; - } -#endif -#if 0 /* Is it a win to remove multiple zeros here? */ - MPN_NORMALIZE (up, size); -#else - if (up[size - 1] == 0) - size--; -#endif + + size -= up[size - 1] == 0; } } #if RECORD - printf ("min: %ld\n", min); - printf ("max: %ld\n", max); + printf ("max: %lx\n", max); +#endif + +#if STAT + {int i; for (i = 0; i < BITS_PER_MP_LIMB; i++) printf ("%d:%d\n", i, arr[i]);} #endif if (vsize == 0) { - if (gp != up) + if (gp != up && gp != 0) MPN_COPY (gp, up, size); #if EXTEND + MPN_NORMALIZE (s0p, ssize); if (orig_s0p != s0p) MPN_COPY (orig_s0p, s0p, ssize); + *s0size = sign >= 0 ? 
ssize : -ssize; #endif TMP_FREE (mark); return size; @@ -373,29 +615,42 @@ mpn_gcd (gp, up, size, vp, vsize) { mp_limb_t vl, ul, t; #if EXTEND - mp_limb_t cy; - mp_size_t i; + mp_size_t qsize, i; #endif vl = vp[0]; #if EXTEND t = mpn_divmod_1 (wp, up, size, vl); + MPN_COPY (tp, s0p, ssize); - for (i = 0; i < size; i++) + + qsize = size - (wp[size - 1] == 0); /* size of quotient from division */ + if (ssize < qsize) { - cy = mpn_addmul_1 (tp + i, s1p, ssize, wp[i]); - if (cy != 0) - tp[ssize++] = cy; + MPN_ZERO (tp + ssize, qsize - ssize); + MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */ + for (i = 0; i < ssize; i++) + { + mp_limb_t cy; + cy = mpn_addmul_1 (tp + i, wp, qsize, s1p[i]); + tp[qsize + i] = cy; + } } -#if 0 - MPN_COPY (s0p, s1p, ssize); - MPN_COPY (s1p, tp, ssize); -#else - { - mp_ptr xp; - xp = s0p; s0p = s1p; s1p = xp; - xp = s1p; s1p = tp; tp = xp; - } -#endif + else + { + MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */ + for (i = 0; i < qsize; i++) + { + mp_limb_t cy; + cy = mpn_addmul_1 (tp + i, s1p, ssize, wp[i]); + tp[ssize + i] = cy; + } + } + ssize += qsize; + ssize -= tp[ssize - 1] == 0; + + sign = -sign; + MP_PTR_SWAP (s0p, s1p); + MP_PTR_SWAP (s1p, tp); #else t = mpn_mod_1 (up, size, vl); #endif @@ -405,35 +660,39 @@ mpn_gcd (gp, up, size, vp, vsize) { mp_limb_t t; #if EXTEND - mp_limb_t q, cy; + mp_limb_t q; q = ul / vl; - t = ul - q*vl; + t = ul - q * vl; MPN_COPY (tp, s0p, ssize); - cy = mpn_addmul_1 (tp, s1p, ssize, q); - if (cy != 0) - tp[ssize++] = cy; -#if 0 - MPN_COPY (s0p, s1p, ssize); - MPN_COPY (s1p, tp, ssize); -#else + + MPN_ZERO (s1p + ssize, 1); /* zero s1 too */ + { - mp_ptr xp; - xp = s0p; s0p = s1p; s1p = xp; - xp = s1p; s1p = tp; tp = xp; + mp_limb_t cy; + cy = mpn_addmul_1 (tp, s1p, ssize, q); + tp[ssize] = cy; } -#endif + ssize += 1; + ssize -= tp[ssize - 1] == 0; + + sign = -sign; + MP_PTR_SWAP (s0p, s1p); + MP_PTR_SWAP (s1p, tp); #else t = ul % vl; #endif ul = vl; vl = t; } - gp[0] = ul; + if (gp != 0) + gp[0] = ul; 
#if EXTEND + MPN_NORMALIZE (s0p, ssize); if (orig_s0p != s0p) MPN_COPY (orig_s0p, s0p, ssize); + *s0size = sign >= 0 ? ssize : -ssize; #endif TMP_FREE (mark); return 1; diff --git a/ghc/rts/gmp/mpn/generic/get_str.c b/ghc/rts/gmp/mpn/generic/get_str.c index 0e7fc60..a713b61 100644 --- a/ghc/rts/gmp/mpn/generic/get_str.c +++ b/ghc/rts/gmp/mpn/generic/get_str.c @@ -1,21 +1,22 @@ /* mpn_get_str -- Convert a MSIZE long limb vector pointed to by MPTR to a printable string in STR in base BASE. -Copyright (C) 1991, 1992, 1993, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1992, 1993, 1994, 1996, 2000 Free Software Foundation, +Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -39,11 +40,15 @@ MA 02111-1307, USA. */ The limb vector pointed to by MPTR is clobbered. 
*/ size_t +#if __STDC__ +mpn_get_str (unsigned char *str, int base, mp_ptr mptr, mp_size_t msize) +#else mpn_get_str (str, base, mptr, msize) unsigned char *str; int base; mp_ptr mptr; mp_size_t msize; +#endif { mp_limb_t big_base; #if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME diff --git a/ghc/rts/gmp/mpn/generic/gmp-mparam.h b/ghc/rts/gmp/mpn/generic/gmp-mparam.h index 7c88557..14bcaec 100644 --- a/ghc/rts/gmp/mpn/generic/gmp-mparam.h +++ b/ghc/rts/gmp/mpn/generic/gmp-mparam.h @@ -5,16 +5,16 @@ Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpn/generic/hamdist.c b/ghc/rts/gmp/mpn/generic/hamdist.c index 2190b63..35c10e8 100644 --- a/ghc/rts/gmp/mpn/generic/hamdist.c +++ b/ghc/rts/gmp/mpn/generic/hamdist.c @@ -1,20 +1,20 @@ /* mpn_hamdist -- -Copyright (C) 1994, 1996 Free Software Foundation, Inc. 
+Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -23,7 +23,9 @@ MA 02111-1307, USA. */ #include "gmp-impl.h" #if defined __GNUC__ -#if defined __sparc_v9__ && BITS_PER_MP_LIMB == 64 +/* No processor claiming to be SPARC v9 compliant seems to + implement the POPC instruction. Disable pattern for now. */ +#if 0 && defined __sparc_v9__ && BITS_PER_MP_LIMB == 64 #define popc_limb(a) \ ({ \ DItype __res; \ @@ -39,15 +41,19 @@ MA 02111-1307, USA. */ You have to figure out how this works, I won't tell you! */ static inline unsigned int +#if __STDC__ +popc_limb (mp_limb_t x) +#else popc_limb (x) mp_limb_t x; +#endif { #if BITS_PER_MP_LIMB == 64 /* We have to go into some trouble to define these constants. (For mp_limb_t being `long long'.)
*/ mp_limb_t cnst; - cnst = 0x55555555L | ((mp_limb_t) 0x55555555L << BITS_PER_MP_LIMB/2); - x = ((x & ~cnst) >> 1) + (x & cnst); + cnst = 0xaaaaaaaaL | ((mp_limb_t) 0xaaaaaaaaL << BITS_PER_MP_LIMB/2); + x -= (x & cnst) >> 1; cnst = 0x33333333L | ((mp_limb_t) 0x33333333L << BITS_PER_MP_LIMB/2); x = ((x & ~cnst) >> 2) + (x & cnst); cnst = 0x0f0f0f0fL | ((mp_limb_t) 0x0f0f0f0fL << BITS_PER_MP_LIMB/2); @@ -57,7 +63,7 @@ popc_limb (x) x = ((x >> 32) + x) & 0xff; #endif #if BITS_PER_MP_LIMB == 32 - x = ((x >> 1) & 0x55555555L) + (x & 0x55555555L); + x -= (x & 0xaaaaaaaa) >> 1; x = ((x >> 2) & 0x33333333L) + (x & 0x33333333L); x = ((x >> 4) + x) & 0x0f0f0f0fL; x = ((x >> 8) + x); diff --git a/ghc/rts/gmp/mpn/generic/inlines.c b/ghc/rts/gmp/mpn/generic/inlines.c index dca305e..9487e58 100644 --- a/ghc/rts/gmp/mpn/generic/inlines.c +++ b/ghc/rts/gmp/mpn/generic/inlines.c @@ -1,3 +1,24 @@ +/* +Copyright (C) 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
+*/ + #define _FORCE_INLINES #define _EXTERN_INLINE /* empty */ #include "gmp.h" diff --git a/ghc/rts/gmp/mpn/generic/jacbase.c b/ghc/rts/gmp/mpn/generic/jacbase.c new file mode 100644 index 0000000..dd437f1 --- /dev/null +++ b/ghc/rts/gmp/mpn/generic/jacbase.c @@ -0,0 +1,136 @@ +/* mpn_jacobi_base -- limb/limb Jacobi symbol with restricted arguments. + + THIS INTERFACE IS PRELIMINARY AND MIGHT DISAPPEAR OR BE SUBJECT TO + INCOMPATIBLE CHANGES IN A FUTURE RELEASE OF GMP. */ + +/* +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + + +#if COUNT_TRAILING_ZEROS_TIME <= 7 +/* If count_trailing_zeros is fast, use it. + K7 at 7 cycles and P6 at 2 are good here. K6 at 12-27 and P5 at 18-42 + are not. The default 15 in longlong.h is meant to mean not good here. */ + +#define PROCESS_TWOS_ANY \ + { \ + mp_limb_t twos; \ + count_trailing_zeros (twos, a); \ + result_bit1 ^= JACOBI_TWOS_U_BIT1 (twos, b); \ + a >>= twos; \ + } + +#define PROCESS_TWOS_EVEN PROCESS_TWOS_ANY + +#else +/* Use a loop instead. With "a" uniformly distributed there will usually be + only a few trailing zeros. 
+ + Unfortunately the branch for the while loop here will be on a 50/50 + chance of a 1 or 0, which is bad for branch prediction. */ + +#define PROCESS_TWOS_EVEN \ + { \ + int two; \ + two = JACOBI_TWO_U_BIT1 (b); \ + do \ + { \ + a >>= 1; \ + result_bit1 ^= two; \ + ASSERT (a != 0); \ + } \ + while ((a & 1) == 0); \ + } + +#define PROCESS_TWOS_ANY \ + if ((a & 1) == 0) \ + PROCESS_TWOS_EVEN; + +#endif + + +/* Calculate the value of the Jacobi symbol (a/b) of two mp_limb_t's, but + with a restricted range of inputs accepted, namely b>1, b odd, and a<=b. + + The initial result_bit1 is taken as a parameter for the convenience of + mpz_kronecker_zi_ui() et al. The sign changes both here and in those + routines accumulate nicely in bit 1, see the JACOBI macros. + + The return value here is the normal +1, 0, or -1. Note that +1 and -1 + have bit 1 in the "BIT1" sense, which could be useful if the caller is + accumulating it into some extended calculation. + + Duplicating the loop body to avoid the MP_LIMB_T_SWAP(a,b) would be + possible, but a couple of tests suggest it's not a significant speedup, + and may even be a slowdown, so what's here is good enough for now. + + Future: The code doesn't demand a<=b actually, so maybe this could be + relaxed. All the places this is used currently call with a<=b though. 
*/ + +int +#if __STDC__ +mpn_jacobi_base (mp_limb_t a, mp_limb_t b, int result_bit1) +#else +mpn_jacobi_base (a, b, result_bit1) + mp_limb_t a; + mp_limb_t b; + int result_bit1; +#endif +{ + ASSERT (b & 1); /* b odd */ + ASSERT (b != 1); + ASSERT (a <= b); + + if (a == 0) + return 0; + + PROCESS_TWOS_ANY; + if (a == 1) + goto done; + + for (;;) + { + result_bit1 ^= JACOBI_RECIP_UU_BIT1 (a, b); + MP_LIMB_T_SWAP (a, b); + + do + { + /* working on (a/b), a,b odd, a>=b */ + ASSERT (a & 1); + ASSERT (b & 1); + ASSERT (a >= b); + + if ((a -= b) == 0) + return 0; + + PROCESS_TWOS_EVEN; + if (a == 1) + goto done; + } + while (a >= b); + } + + done: + return JACOBI_BIT1_TO_PN (result_bit1); +} diff --git a/ghc/rts/gmp/mpn/generic/lshift.c b/ghc/rts/gmp/mpn/generic/lshift.c index e244bc5..0b58389 100644 --- a/ghc/rts/gmp/mpn/generic/lshift.c +++ b/ghc/rts/gmp/mpn/generic/lshift.c @@ -5,16 +5,16 @@ Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpn/generic/mod_1.c b/ghc/rts/gmp/mpn/generic/mod_1.c index 314d11b..168ec9d 100644 --- a/ghc/rts/gmp/mpn/generic/mod_1.c +++ b/ghc/rts/gmp/mpn/generic/mod_1.c @@ -3,21 +3,21 @@ Return the single-limb remainder. There are no constraints on the value of the divisor. -Copyright (C) 1991, 1993, 1994, Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1999 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -34,9 +34,6 @@ MA 02111-1307, USA. */ #define UDIV_TIME UMUL_TIME #endif -/* FIXME: We should be using invert_limb (or invert_normalized_limb) - here (not udiv_qrnnd). 
*/ - mp_limb_t #if __STDC__ mpn_mod_1 (mp_srcptr dividend_ptr, mp_size_t dividend_size, @@ -74,17 +71,7 @@ mpn_mod_1 (dividend_ptr, dividend_size, divisor_limb) mp_limb_t divisor_limb_inverted; divisor_limb <<= normalization_steps; - - /* Compute (2**2N - 2**N * DIVISOR_LIMB) / DIVISOR_LIMB. The - result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the - most significant bit (with weight 2**N) implicit. */ - - /* Special case for DIVISOR_LIMB == 100...000. */ - if (divisor_limb << 1 == 0) - divisor_limb_inverted = ~(mp_limb_t) 0; - else - udiv_qrnnd (divisor_limb_inverted, dummy, - -divisor_limb, 0, divisor_limb); + invert_limb (divisor_limb_inverted, divisor_limb); n1 = dividend_ptr[dividend_size - 1]; r = n1 >> (BITS_PER_MP_LIMB - normalization_steps); @@ -113,16 +100,7 @@ mpn_mod_1 (dividend_ptr, dividend_size, divisor_limb) { mp_limb_t divisor_limb_inverted; - /* Compute (2**2N - 2**N * DIVISOR_LIMB) / DIVISOR_LIMB. The - result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the - most significant bit (with weight 2**N) implicit. */ - - /* Special case for DIVISOR_LIMB == 100...000. */ - if (divisor_limb << 1 == 0) - divisor_limb_inverted = ~(mp_limb_t) 0; - else - udiv_qrnnd (divisor_limb_inverted, dummy, - -divisor_limb, 0, divisor_limb); + invert_limb (divisor_limb_inverted, divisor_limb); i = dividend_size - 1; r = dividend_ptr[i]; diff --git a/ghc/rts/gmp/mpn/generic/mod_1_rs.c b/ghc/rts/gmp/mpn/generic/mod_1_rs.c new file mode 100644 index 0000000..62aaa94 --- /dev/null +++ b/ghc/rts/gmp/mpn/generic/mod_1_rs.c @@ -0,0 +1,111 @@ +/* mpn_mod_1_rshift -- mpn remainder under hypothetical right shift. + + THE FUNCTION IN THIS FILE IS FOR INTERNAL USE AND HAS A MUTABLE + INTERFACE. IT IS ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. + IT'S ALMOST GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP + RELEASE. */ + +/* +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + + +/* When testing on a CPU with UDIV_NEEDS_NORMALIZATION equal to 0, it can be + changed to 1 temporarily to test the code under that case too. */ +#if 0 +#undef UDIV_NEEDS_NORMALIZATION +#define UDIV_NEEDS_NORMALIZATION 1 +#endif + + +/* Calculate the remainder "(ptr,size >> shift) % divisor". Note ptr,size + is unchanged, the shift is only for its effect on the remainder. + The shift doesn't even need to be considered until the last limb. + + This function has the normal size!=0 restriction, unlike the basic + mpn_mod_1. 
*/ + +mp_limb_t +#if __STDC__ +mpn_mod_1_rshift (mp_srcptr ptr, mp_size_t size, unsigned shift, + mp_limb_t divisor) +#else +mpn_mod_1_rshift (ptr, size, shift, divisor) + mp_srcptr ptr; + mp_size_t size; + unsigned shift; + mp_limb_t divisor; +#endif +{ + mp_limb_t quot, rem; + + ASSERT (shift >= 1); + ASSERT (shift < BITS_PER_MP_LIMB); + ASSERT (size >= 1); + + if (size == 1) + return (ptr[0] >> shift) % divisor; + +#if UDIV_NEEDS_NORMALIZATION + { + int norm; + int delta; + + count_leading_zeros (norm, divisor); + divisor <<= norm; + + delta = shift - norm; + if (delta == 0) + return mpn_mod_1 (ptr, size, divisor) >> norm; + + if (delta > 0) + { + rem = mpn_mod_1 (ptr+1, size-1, divisor); + udiv_qrnnd (quot, rem, + rem >> delta, + (rem << (BITS_PER_MP_LIMB-delta)) | (ptr[0] >> delta), + divisor); + return rem >> norm; + } + else + { + rem = mpn_mod_1 (ptr, size, divisor); + udiv_qrnnd (quot, rem, + rem >> (BITS_PER_MP_LIMB+delta), + rem << -delta, + divisor); + return rem >> norm; + } + } + +#else /* !UDIV_NEEDS_NORMALIZATION */ + + rem = mpn_mod_1 (ptr+1, size-1, divisor); + udiv_qrnnd (quot, rem, + rem >> shift, + (rem << (BITS_PER_MP_LIMB-shift)) | (ptr[0] >> shift), + divisor); + return rem; + +#endif +} diff --git a/ghc/rts/gmp/mpn/generic/mul.c b/ghc/rts/gmp/mpn/generic/mul.c index dcf8cb4..cecfa19 100644 --- a/ghc/rts/gmp/mpn/generic/mul.c +++ b/ghc/rts/gmp/mpn/generic/mul.c @@ -1,20 +1,27 @@ /* mpn_mul -- Multiply two natural numbers. -Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. + THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul) + ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH + THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED + THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + + +Copyright (C) 1991, 1993, 1994, 1996, 1997, 1999, 2000 Free Software +Foundation, Inc. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -22,131 +29,162 @@ MA 02111-1307, USA. */ #include "gmp.h" #include "gmp-impl.h" -/* Multiply the natural numbers u (pointed to by UP, with USIZE limbs) - and v (pointed to by VP, with VSIZE limbs), and store the result at - PRODP. USIZE + VSIZE limbs are always stored, but if the input - operands are normalized. Return the most significant limb of the - result. +/* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v + (pointed to by VP, with VN limbs), and store the result at PRODP. The + result is UN + VN limbs. Return the most significant limb of the result. - NOTE: The space pointed to by PRODP is overwritten before finished - with U and V, so overlap is an error. + NOTE: The space pointed to by PRODP is overwritten before finished with U + and V, so overlap is an error. Argument constraints: - 1. USIZE >= VSIZE. - 2. PRODP != UP and PRODP != VP, i.e. 
the destination - must be distinct from the multiplier and the multiplicand. */ - -/* If KARATSUBA_THRESHOLD is not already defined, define it to a - value which is good on most machines. */ -#ifndef KARATSUBA_THRESHOLD -#define KARATSUBA_THRESHOLD 32 + 1. UN >= VN. + 2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from + the multiplier and the multiplicand. */ + +void +#if __STDC__ +mpn_sqr_n (mp_ptr prodp, + mp_srcptr up, mp_size_t un) +#else +mpn_sqr_n (prodp, up, un) + mp_ptr prodp; + mp_srcptr up; + mp_size_t un; +#endif +{ + if (un < KARATSUBA_SQR_THRESHOLD) + { /* plain schoolbook multiplication */ + if (un == 0) + return; + mpn_sqr_basecase (prodp, up, un); + } + else if (un < TOOM3_SQR_THRESHOLD) + { /* karatsuba multiplication */ + mp_ptr tspace; + TMP_DECL (marker); + TMP_MARK (marker); + tspace = (mp_ptr) TMP_ALLOC (2 * (un + BITS_PER_MP_LIMB) * BYTES_PER_MP_LIMB); + mpn_kara_sqr_n (prodp, up, un, tspace); + TMP_FREE (marker); + } +#if WANT_FFT || TUNE_PROGRAM_BUILD + else if (un < FFT_SQR_THRESHOLD) +#else + else +#endif + { /* toom3 multiplication */ + mp_ptr tspace; + TMP_DECL (marker); + TMP_MARK (marker); + tspace = (mp_ptr) TMP_ALLOC (2 * (un + BITS_PER_MP_LIMB) * BYTES_PER_MP_LIMB); + mpn_toom3_sqr_n (prodp, up, un, tspace); + TMP_FREE (marker); + } +#if WANT_FFT || TUNE_PROGRAM_BUILD + else + { + /* schoenhage multiplication */ + mpn_mul_fft_full (prodp, up, un, up, un); + } #endif +} mp_limb_t #if __STDC__ mpn_mul (mp_ptr prodp, - mp_srcptr up, mp_size_t usize, - mp_srcptr vp, mp_size_t vsize) + mp_srcptr up, mp_size_t un, + mp_srcptr vp, mp_size_t vn) #else -mpn_mul (prodp, up, usize, vp, vsize) +mpn_mul (prodp, up, un, vp, vn) mp_ptr prodp; mp_srcptr up; - mp_size_t usize; + mp_size_t un; mp_srcptr vp; - mp_size_t vsize; + mp_size_t vn; #endif { - mp_ptr prod_endp = prodp + usize + vsize - 1; - mp_limb_t cy; - mp_ptr tspace; - TMP_DECL (marker); + mp_size_t l; + mp_limb_t c; - if (vsize < KARATSUBA_THRESHOLD) + if (up == 
vp && un == vn) { - /* Handle simple cases with traditional multiplication. - - This is the most critical code of the entire function. All - multiplies rely on this, both small and huge. Small ones arrive - here immediately. Huge ones arrive here as this is the base case - for Karatsuba's recursive algorithm below. */ - mp_size_t i; - mp_limb_t cy_limb; - mp_limb_t v_limb; - - if (vsize == 0) - return 0; - - /* Multiply by the first limb in V separately, as the result can be - stored (not added) to PROD. We also avoid a loop for zeroing. */ - v_limb = vp[0]; - if (v_limb <= 1) + mpn_sqr_n (prodp, up, un); + return prodp[2 * un - 1]; + } + + if (vn < KARATSUBA_MUL_THRESHOLD) + { /* long multiplication */ + mpn_mul_basecase (prodp, up, un, vp, vn); + return prodp[un + vn - 1]; + } + + mpn_mul_n (prodp, up, vp, vn); + if (un != vn) + { mp_limb_t t; + mp_ptr ws; + TMP_DECL (marker); + TMP_MARK (marker); + + prodp += vn; + l = vn; + up += vn; + un -= vn; + + if (un < vn) { - if (v_limb == 1) - MPN_COPY (prodp, up, usize); - else - MPN_ZERO (prodp, usize); - cy_limb = 0; + /* Swap u's and v's. */ + MPN_SRCPTR_SWAP (up,un, vp,vn); } - else - cy_limb = mpn_mul_1 (prodp, up, usize, v_limb); - prodp[usize] = cy_limb; - prodp++; + ws = (mp_ptr) TMP_ALLOC (((vn >= KARATSUBA_MUL_THRESHOLD ? vn : un) + vn) + * BYTES_PER_MP_LIMB); - /* For each iteration in the outer loop, multiply one limb from - U with one limb from V, and add it to PROD. 
*/ - for (i = 1; i < vsize; i++) + t = 0; + while (vn >= KARATSUBA_MUL_THRESHOLD) { - v_limb = vp[i]; - if (v_limb <= 1) + mpn_mul_n (ws, up, vp, vn); + if (l <= 2*vn) { - cy_limb = 0; - if (v_limb == 1) - cy_limb = mpn_add_n (prodp, prodp, up, usize); + t += mpn_add_n (prodp, prodp, ws, l); + if (l != 2*vn) + { + t = mpn_add_1 (prodp + l, ws + l, 2*vn - l, t); + l = 2*vn; + } } else - cy_limb = mpn_addmul_1 (prodp, up, usize, v_limb); - - prodp[usize] = cy_limb; - prodp++; + { + c = mpn_add_n (prodp, prodp, ws, 2*vn); + t += mpn_add_1 (prodp + 2*vn, prodp + 2*vn, l - 2*vn, c); + } + prodp += vn; + l -= vn; + up += vn; + un -= vn; + if (un < vn) + { + /* Swap u's and v's. */ + MPN_SRCPTR_SWAP (up,un, vp,vn); + } } - return cy_limb; - } - - TMP_MARK (marker); - tspace = (mp_ptr) TMP_ALLOC (2 * vsize * BYTES_PER_MP_LIMB); - MPN_MUL_N_RECURSE (prodp, up, vp, vsize, tspace); - - prodp += vsize; - up += vsize; - usize -= vsize; - if (usize >= vsize) - { - mp_ptr tp = (mp_ptr) TMP_ALLOC (2 * vsize * BYTES_PER_MP_LIMB); - do + if (vn) { - MPN_MUL_N_RECURSE (tp, up, vp, vsize, tspace); - cy = mpn_add_n (prodp, prodp, tp, vsize); - mpn_add_1 (prodp + vsize, tp + vsize, vsize, cy); - prodp += vsize; - up += vsize; - usize -= vsize; + mpn_mul_basecase (ws, up, un, vp, vn); + if (l <= un + vn) + { + t += mpn_add_n (prodp, prodp, ws, l); + if (l != un + vn) + t = mpn_add_1 (prodp + l, ws + l, un + vn - l, t); + } + else + { + c = mpn_add_n (prodp, prodp, ws, un + vn); + t += mpn_add_1 (prodp + un + vn, prodp + un + vn, l - un - vn, c); + } } - while (usize >= vsize); - } - - /* True: usize < vsize. */ - - /* Make life simple: Recurse. 
*/ - - if (usize != 0) - { - mpn_mul (tspace, vp, vsize, up, usize); - cy = mpn_add_n (prodp, prodp, tspace, vsize); - mpn_add_1 (prodp + vsize, tspace + vsize, usize, cy); - } - TMP_FREE (marker); - return *prod_endp; + TMP_FREE (marker); + } + return prodp[un + vn - 1]; } diff --git a/ghc/rts/gmp/mpn/generic/mul_1.c b/ghc/rts/gmp/mpn/generic/mul_1.c index 2de680a..1c36b5f 100644 --- a/ghc/rts/gmp/mpn/generic/mul_1.c +++ b/ghc/rts/gmp/mpn/generic/mul_1.c @@ -6,16 +6,16 @@ Copyright (C) 1991, 1992, 1993, 1994, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpn/generic/mul_basecase.c b/ghc/rts/gmp/mpn/generic/mul_basecase.c new file mode 100644 index 0000000..00c06aa --- /dev/null +++ b/ghc/rts/gmp/mpn/generic/mul_basecase.c @@ -0,0 +1,87 @@ +/* mpn_mul_basecase -- Internal routine to multiply two natural numbers + of length m and n. 
+ + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. + + +Copyright (C) 1991, 1992, 1993, 1994, 1996, 1997, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +/* Handle simple cases with traditional multiplication. + + This is the most critical code of multiplication. All multiplies rely on + this, both small and huge. Small ones arrive here immediately, huge ones + arrive here as this is the base case for Karatsuba's recursive algorithm. */ + +void +#if __STDC__ +mpn_mul_basecase (mp_ptr prodp, + mp_srcptr up, mp_size_t usize, + mp_srcptr vp, mp_size_t vsize) +#else +mpn_mul_basecase (prodp, up, usize, vp, vsize) + mp_ptr prodp; + mp_srcptr up; + mp_size_t usize; + mp_srcptr vp; + mp_size_t vsize; +#endif +{ + /* We first multiply by the low order one or two limbs, as the result can + be stored, not added, to PROD. We also avoid a loop for zeroing this + way. 
*/ +#if HAVE_NATIVE_mpn_mul_2 + if (vsize >= 2) + { + prodp[usize + 1] = mpn_mul_2 (prodp, up, usize, vp[0], vp[1]); + prodp += 2, vp += 2, vsize -= 2; + } + else + { + prodp[usize] = mpn_mul_1 (prodp, up, usize, vp[0]); + return; + } +#else + prodp[usize] = mpn_mul_1 (prodp, up, usize, vp[0]); + prodp += 1, vp += 1, vsize -= 1; +#endif + +#if HAVE_NATIVE_mpn_addmul_2 + while (vsize >= 2) + { + prodp[usize + 1] = mpn_addmul_2 (prodp, up, usize, vp[0], vp[1]); + prodp += 2, vp += 2, vsize -= 2; + } + if (vsize != 0) + prodp[usize] = mpn_addmul_1 (prodp, up, usize, vp[0]); +#else + /* For each iteration in the loop, multiply U with one limb from V, and + add the result to PROD. */ + while (vsize != 0) + { + prodp[usize] = mpn_addmul_1 (prodp, up, usize, vp[0]); + prodp += 1, vp += 1, vsize -= 1; + } +#endif +} diff --git a/ghc/rts/gmp/mpn/generic/mul_fft.c b/ghc/rts/gmp/mpn/generic/mul_fft.c new file mode 100644 index 0000000..00fd6d7 --- /dev/null +++ b/ghc/rts/gmp/mpn/generic/mul_fft.c @@ -0,0 +1,772 @@ +/* An implementation in GMP of Scho"nhage's fast multiplication algorithm + modulo 2^N+1, by Paul Zimmermann, INRIA Lorraine, February 1998. + + THE CONTENTS OF THIS FILE ARE FOR INTERNAL USE AND THE FUNCTIONS HAVE + MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED + INTERFACES. IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN + A FUTURE GNU MP RELEASE. + +Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + + +/* References: + + Schnelle Multiplikation grosser Zahlen, by Arnold Scho"nhage and Volker + Strassen, Computing 7, p. 281-292, 1971. + + Asymptotically fast algorithms for the numerical multiplication + and division of polynomials with complex coefficients, by Arnold Scho"nhage, + Computer Algebra, EUROCAM'82, LNCS 144, p. 3-15, 1982. + + Tapes versus Pointers, a study in implementing fast algorithms, + by Arnold Scho"nhage, Bulletin of the EATCS, 30, p. 23-32, 1986. + + See also http://www.loria.fr/~zimmerma/bignum + + + Future: + + K==2 isn't needed in the current uses of this code and the bits specific + for that could be dropped. + + It might be possible to avoid a small number of MPN_COPYs by using a + rotating temporary or two. + + Multiplications of unequal sized operands can be done with this code, but + it needs a tighter test for identifying squaring (same sizes as well as + same pointers). */ + + +#include +#include "gmp.h" +#include "gmp-impl.h" + + +/* Change this to "#define TRACE(x) x" for some traces. */ +#define TRACE(x) + + + +FFT_TABLE_ATTRS mp_size_t mpn_fft_table[2][MPN_FFT_TABLE_SIZE] = { + FFT_MUL_TABLE, + FFT_SQR_TABLE +}; + + +static void mpn_mul_fft_internal +_PROTO ((mp_limb_t *op, mp_srcptr n, mp_srcptr m, mp_size_t pl, + int k, int K, + mp_limb_t **Ap, mp_limb_t **Bp, + mp_limb_t *A, mp_limb_t *B, + mp_size_t nprime, mp_size_t l, mp_size_t Mp, int **_fft_l, + mp_limb_t *T, int rec)); + + +/* Find the best k to use for a mod 2^(n*BITS_PER_MP_LIMB)+1 FFT. 
+ sqr==0 if for a multiply, sqr==1 for a square */ +int +#if __STDC__ +mpn_fft_best_k (mp_size_t n, int sqr) +#else +mpn_fft_best_k (n, sqr) + mp_size_t n; + int sqr; +#endif +{ + mp_size_t t; + int i; + + for (i = 0; mpn_fft_table[sqr][i] != 0; i++) + if (n < mpn_fft_table[sqr][i]) + return i + FFT_FIRST_K; + + /* treat 4*last as one further entry */ + if (i == 0 || n < 4*mpn_fft_table[sqr][i-1]) + return i + FFT_FIRST_K; + else + return i + FFT_FIRST_K + 1; +} + + +/* Returns smallest possible number of limbs >= pl for a fft of size 2^k. + FIXME: Is this simply pl rounded up to the next multiple of 2^k ? */ + +mp_size_t +#if __STDC__ +mpn_fft_next_size (mp_size_t pl, int k) +#else +mpn_fft_next_size (pl, k) + mp_size_t pl; + int k; +#endif +{ + mp_size_t N, M; + int K; + + /* if (k==0) k = mpn_fft_best_k (pl, sqr); */ + N = pl*BITS_PER_MP_LIMB; + K = 1<=0;i--) ap[i] = ~tp[n-d+i]; + cc = 1-mpn_add_1(ap, ap, d, 1); + if (cc) cc=mpn_sub_1(ap+d, tp, n-d, 1); + else MPN_COPY(ap+d, tp, n-d); + if (cc+=mpn_sub_1(ap+d, ap+d, n-d, tp[n])) + ap[n]=mpn_add_1(ap, ap, n, cc); + else ap[n]=0; + } + else if ((ap[n]=mpn_sub_1(ap, tp, n, tp[n]))) { + ap[n]=mpn_add_1(ap, ap, n, 1); + } + if ((e/(n*BITS_PER_MP_LIMB))%2) mpn_fft_neg_modF(ap, n); +} + + +/* a <- a+b mod 2^(n*BITS_PER_MP_LIMB)+1 */ +static void +#if __STDC__ +mpn_fft_add_modF (mp_limb_t *ap, mp_limb_t *bp, int n) +#else +mpn_fft_add_modF (ap, bp, n) + mp_limb_t *ap,*bp; + int n; +#endif +{ + mp_limb_t c; + + c = ap[n] + bp[n] + mpn_add_n(ap, ap, bp, n); + if (c>1) c -= 1+mpn_sub_1(ap,ap,n,1); + ap[n]=c; +} + + +/* input: A[0] ... 
A[inc*(K-1)] are residues mod 2^N+1 where + N=n*BITS_PER_MP_LIMB + 2^omega is a primitive root mod 2^N+1 + output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 */ + +static void +#if __STDC__ +mpn_fft_fft_sqr (mp_limb_t **Ap, mp_size_t K, int **ll, + mp_size_t omega, mp_size_t n, mp_size_t inc, mp_limb_t *tp) +#else +mpn_fft_fft_sqr(Ap,K,ll,omega,n,inc,tp) +mp_limb_t **Ap,*tp; +mp_size_t K,omega,n,inc; +int **ll; +#endif +{ + if (K==2) { +#ifdef ADDSUB + if (mpn_addsub_n(Ap[0], Ap[inc], Ap[0], Ap[inc], n+1) & 1) +#else + MPN_COPY(tp, Ap[0], n+1); + mpn_add_n(Ap[0], Ap[0], Ap[inc],n+1); + if (mpn_sub_n(Ap[inc], tp, Ap[inc],n+1)) +#endif + Ap[inc][n] = mpn_add_1(Ap[inc], Ap[inc], n, 1); + } + else { + int j, inc2=2*inc; + int *lk = *ll; + mp_limb_t *tmp; + TMP_DECL(marker); + + TMP_MARK(marker); + tmp = TMP_ALLOC_LIMBS (n+1); + mpn_fft_fft_sqr(Ap, K/2,ll-1,2*omega,n,inc2, tp); + mpn_fft_fft_sqr(Ap+inc, K/2,ll-1,2*omega,n,inc2, tp); + /* A[2*j*inc] <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc] + A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */ + for (j=0;j= (sqr ? FFT_MODF_SQR_THRESHOLD : FFT_MODF_MUL_THRESHOLD)) { + int k, K2,nprime2,Nprime2,M2,maxLK,l,Mp2; + int **_fft_l; + mp_limb_t **Ap,**Bp,*A,*B,*T; + + k = mpn_fft_best_k (n, sqr); + K2 = 1<BITS_PER_MP_LIMB) ? 
K2 : BITS_PER_MP_LIMB; + M2 = n*BITS_PER_MP_LIMB/K2; + l = n/K2; + Nprime2 = ((2*M2+k+2+maxLK)/maxLK)*maxLK; /* ceil((2*M2+k+3)/maxLK)*maxLK*/ + nprime2 = Nprime2/BITS_PER_MP_LIMB; + Mp2 = Nprime2/K2; + + Ap = TMP_ALLOC_MP_PTRS (K2); + Bp = TMP_ALLOC_MP_PTRS (K2); + A = TMP_ALLOC_LIMBS (2*K2*(nprime2+1)); + T = TMP_ALLOC_LIMBS (nprime2+1); + B = A + K2*(nprime2+1); + _fft_l = TMP_ALLOC_TYPE (k+1, int*); + for (i=0;i<=k;i++) + _fft_l[i] = TMP_ALLOC_TYPE (1< %d times %dx%d (%1.2f)\n", n, + n, K2, nprime2, nprime2, 2.0*(double)n/nprime2/K2)); + + for (i=0;i2*n) { + l = n; + rp[n] = mpn_add_1(rp+an-2*n, ap+an-2*n, 3*n-an, + mpn_add_n(rp,ap,ap+2*n,an-2*n)); + } + else { + l = an-n; + MPN_COPY(rp, ap, n); + rp[n]=0; + } + if (mpn_sub_n(rp,rp,ap+n,l)) { + if (mpn_sub_1(rp+l,rp+l,n+1-l,1)) + rp[n]=mpn_add_1(rp,rp,n,1); + } +} + + +static void +#if __STDC__ +mpn_mul_fft_internal(mp_limb_t *op, mp_srcptr n, mp_srcptr m, mp_size_t pl, + int k, int K, + mp_limb_t **Ap, mp_limb_t **Bp, + mp_limb_t *A, mp_limb_t *B, + mp_size_t nprime, mp_size_t l, mp_size_t Mp, + int **_fft_l, + mp_limb_t *T, int rec) +#else +mpn_mul_fft_internal(op,n,m,pl,k,K,Ap,Bp,A,B,nprime,l,Mp,_fft_l,T,rec) + mp_limb_t *op; + mp_srcptr n, m; + mp_limb_t **Ap,**Bp,*A,*B,*T; + mp_size_t pl,nprime; + int **_fft_l; + int k,K,l,Mp,rec; +#endif +{ + int i, sqr, pla, lo, sh, j; + mp_limb_t *p; + + sqr = (n==m); + + TRACE (printf ("pl=%d k=%d K=%d np=%d l=%d Mp=%d rec=%d sqr=%d\n", + pl,k,K,nprime,l,Mp,rec,sqr)); + + /* decomposition of inputs into arrays Ap[i] and Bp[i] */ + if (rec) for (i=0;i= pla, i.e. 
enough */ + MPN_ZERO(p, pla); + sqr=0; /* will accumulate the (signed) carry at p[pla] */ + for (i=K-1,lo=l*i+nprime,sh=l*i;i>=0;i--,lo-=l,sh-=l) { + mp_ptr n = p+sh; + j = (K-i)%K; + if (mpn_add_n(n,n,Ap[j],nprime+1)) + sqr += mpn_add_1(n+nprime+1,n+nprime+1,pla-sh-nprime-1,1); + T[2*l]=i+1; /* T = (i+1)*2^(2*M) */ + if (mpn_cmp(Ap[j],T,nprime+1)>0) { /* subtract 2^N'+1 */ + sqr -= mpn_sub_1(n,n,pla-sh,1); + sqr -= mpn_sub_1(p+lo,p+lo,pla-lo,1); + } + } + if (sqr==-1) { + if ((sqr=mpn_add_1(p+pla-pl,p+pla-pl,pl,1))) { + /* p[pla-pl]...p[pla-1] are all zero */ + mpn_sub_1(p+pla-pl-1,p+pla-pl-1,pl+1,1); + mpn_sub_1(p+pla-1,p+pla-1,1,1); + } + } + else if (sqr==1) { + if (pla>=2*pl) + while ((sqr=mpn_add_1(p+pla-2*pl,p+pla-2*pl,2*pl,sqr))); + else { + sqr = mpn_sub_1(p+pla-pl,p+pla-pl,pl,sqr); + ASSERT (sqr == 0); + } + } + else + ASSERT (sqr == 0); + + /* here p < 2^(2M) [K 2^(M(K-1)) + (K-1) 2^(M(K-2)) + ... ] + < K 2^(2M) [2^(M(K-1)) + 2^(M(K-2)) + ... ] + < K 2^(2M) 2^(M(K-1))*2 = 2^(M*K+M+k+1) */ + mpn_fft_norm_modF(op,p,pl,pla); +} + + +/* op <- n*m mod 2^N+1 with fft of size 2^k where N=pl*BITS_PER_MP_LIMB + n and m have respectively nl and ml limbs + op must have space for pl+1 limbs + One must have pl = mpn_fft_next_size(pl, k). +*/ + +void +#if __STDC__ +mpn_mul_fft (mp_ptr op, mp_size_t pl, + mp_srcptr n, mp_size_t nl, + mp_srcptr m, mp_size_t ml, + int k) +#else +mpn_mul_fft (op, pl, n, nl, m, ml, k) + mp_ptr op; + mp_size_t pl; + mp_srcptr n; + mp_size_t nl; + mp_srcptr m; + mp_size_t ml; + int k; +#endif +{ + int K,maxLK,i,j; + mp_size_t N,Nprime,nprime,M,Mp,l; + mp_limb_t **Ap,**Bp,*A,*T,*B; + int **_fft_l; + int sqr = (n==m && nl==ml); + TMP_DECL(marker); + + TRACE (printf ("\nmpn_mul_fft pl=%ld nl=%ld ml=%ld k=%d\n", + pl, nl, ml, k)); + ASSERT_ALWAYS (mpn_fft_next_size(pl, k) == pl); + + TMP_MARK(marker); + N = pl*BITS_PER_MP_LIMB; + _fft_l = TMP_ALLOC_TYPE (k+1, int*); + for (i=0;i<=k;i++) + _fft_l[i] = TMP_ALLOC_TYPE (1<BITS_PER_MP_LIMB) ? 
K : BITS_PER_MP_LIMB; + + Nprime = ((2*M+k+2+maxLK)/maxLK)*maxLK; /* ceil((2*M+k+3)/maxLK)*maxLK; */ + nprime = Nprime/BITS_PER_MP_LIMB; + TRACE (printf ("N=%d K=%d, M=%d, l=%d, maxLK=%d, Np=%d, np=%d\n", + N, K, M, l, maxLK, Nprime, nprime)); + if (nprime >= (sqr ? FFT_MODF_SQR_THRESHOLD : FFT_MODF_MUL_THRESHOLD)) { + maxLK = (1< %d times %dx%d limbs (%1.2f)\n", + pl,pl,K,nprime,nprime,2.0*(double)N/Nprime/K); + printf(" temp space %ld\n", 2*K*(nprime+1))); + + A = _MP_ALLOCATE_FUNC_LIMBS (2*K*(nprime+1)); + B = A+K*(nprime+1); + Ap = TMP_ALLOC_MP_PTRS (K); + Bp = TMP_ALLOC_MP_PTRS (K); + /* special decomposition for main call */ + for (i=0;i0) { + j = (nl>=l) ? l : nl; /* limbs to store in Ap[i] */ + MPN_COPY(Ap[i], n, j); n+=l; MPN_ZERO(Ap[i]+j, nprime+1-j); + mpn_fft_mul_2exp_modF(Ap[i], i*Mp, nprime, T); + } + else MPN_ZERO(Ap[i], nprime+1); + nl -= l; + if (n!=m) { + if (ml>0) { + j = (ml>=l) ? l : ml; /* limbs to store in Bp[i] */ + MPN_COPY(Bp[i], m, j); m+=l; MPN_ZERO(Bp[i]+j, nprime+1-j); + mpn_fft_mul_2exp_modF(Bp[i], i*Mp, nprime, T); + } + else MPN_ZERO(Bp[i], nprime+1); + } + ml -= l; + } + mpn_mul_fft_internal(op,n,m,pl,k,K,Ap,Bp,A,B,nprime,l,Mp,_fft_l,T,0); + TMP_FREE(marker); + _MP_FREE_FUNC_LIMBS (A, 2*K*(nprime+1)); +} + + +#if WANT_ASSERT +static int +#if __STDC__ +mpn_zero_p (mp_ptr p, mp_size_t n) +#else + mpn_zero_p (p, n) + mp_ptr p; + mp_size_t n; +#endif +{ + mp_size_t i; + + for (i = 0; i < n; i++) + { + if (p[i] != 0) + return 0; + } + + return 1; +} +#endif + + +/* Multiply {n,nl}*{m,ml} and write the result to {op,nl+ml}. + + FIXME: Duplicating the result like this is wasteful, do something better + perhaps at the norm_modF stage above. 
*/ + +void +#if __STDC__ +mpn_mul_fft_full (mp_ptr op, + mp_srcptr n, mp_size_t nl, + mp_srcptr m, mp_size_t ml) +#else +mpn_mul_fft_full (op, n, nl, m, ml) + mp_ptr op; + mp_srcptr n; + mp_size_t nl; + mp_srcptr m; + mp_size_t ml; +#endif +{ + mp_ptr pad_op; + mp_size_t pl; + int k; + int sqr = (n==m && nl==ml); + + k = mpn_fft_best_k (nl+ml, sqr); + pl = mpn_fft_next_size (nl+ml, k); + + TRACE (printf ("mpn_mul_fft_full nl=%ld ml=%ld -> pl=%ld k=%d\n", + nl, ml, pl, k)); + + pad_op = _MP_ALLOCATE_FUNC_LIMBS (pl+1); + mpn_mul_fft (pad_op, pl, n, nl, m, ml, k); + + ASSERT (mpn_zero_p (pad_op+nl+ml, pl+1-(nl+ml))); + MPN_COPY (op, pad_op, nl+ml); + + _MP_FREE_FUNC_LIMBS (pad_op, pl+1); +} diff --git a/ghc/rts/gmp/mpn/generic/mul_n.c b/ghc/rts/gmp/mpn/generic/mul_n.c index b38e8ad..b7563be 100644 --- a/ghc/rts/gmp/mpn/generic/mul_n.c +++ b/ghc/rts/gmp/mpn/generic/mul_n.c @@ -1,401 +1,1343 @@ -/* mpn_mul_n -- Multiply two natural numbers of length n. +/* mpn_mul_n and helper function -- Multiply/square natural numbers. -Copyright (C) 1991, 1992, 1993, 1994, 1996 Free Software Foundation, Inc. + THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul_n) + ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH + THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED + THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + + +Copyright (C) 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software +Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. 
The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include "gmp.h" #include "gmp-impl.h" +#include "longlong.h" -/* Multiply the natural numbers u (pointed to by UP) and v (pointed to by VP), - both with SIZE limbs, and store the result at PRODP. 2 * SIZE limbs are - always stored. Return the most significant limb. - Argument constraints: - 1. PRODP != UP and PRODP != VP, i.e. the destination - must be distinct from the multiplier and the multiplicand. */ +/* Multiplicative inverse of 3, modulo 2^BITS_PER_MP_LIMB. + 0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. */ +#define INVERSE_3 ((MP_LIMB_T_MAX / 3) * 2 + 1) -/* If KARATSUBA_THRESHOLD is not already defined, define it to a - value which is good on most machines. */ -#ifndef KARATSUBA_THRESHOLD -#define KARATSUBA_THRESHOLD 32 +#if !defined (__alpha) && !defined (__mips) +/* For all other machines, we want to call mpn functions for the compound + operations instead of open-coding them. */ +#define USE_MORE_MPN #endif -/* The code can't handle KARATSUBA_THRESHOLD smaller than 2. 
*/ -#if KARATSUBA_THRESHOLD < 2 -#undef KARATSUBA_THRESHOLD -#define KARATSUBA_THRESHOLD 2 -#endif +/*== Function declarations =================================================*/ + +static void evaluate3 _PROTO ((mp_ptr, mp_ptr, mp_ptr, + mp_ptr, mp_ptr, mp_ptr, + mp_srcptr, mp_srcptr, mp_srcptr, + mp_size_t, mp_size_t)); +static void interpolate3 _PROTO ((mp_srcptr, + mp_ptr, mp_ptr, mp_ptr, + mp_srcptr, + mp_ptr, mp_ptr, mp_ptr, + mp_size_t, mp_size_t)); +static mp_limb_t add2Times _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t)); + -/* Handle simple cases with traditional multiplication. +/*-- mpn_kara_mul_n ---------------------------------------------------------------*/ - This is the most critical code of multiplication. All multiplies rely - on this, both small and huge. Small ones arrive here immediately. Huge - ones arrive here as this is the base case for Karatsuba's recursive - algorithm below. */ +/* Multiplies using 3 half-sized mults and so on recursively. + * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1]. + * No overlap of p[...] with a[...] or b[...]. + * ws is workspace. + */ void #if __STDC__ -impn_mul_n_basecase (mp_ptr prodp, mp_srcptr up, mp_srcptr vp, mp_size_t size) +mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws) #else -impn_mul_n_basecase (prodp, up, vp, size) - mp_ptr prodp; - mp_srcptr up; - mp_srcptr vp; - mp_size_t size; +mpn_kara_mul_n(p, a, b, n, ws) + mp_ptr p; + mp_srcptr a; + mp_srcptr b; + mp_size_t n; + mp_ptr ws; #endif { - mp_size_t i; - mp_limb_t cy_limb; - mp_limb_t v_limb; - - /* Multiply by the first limb in V separately, as the result can be - stored (not added) to PROD. We also avoid a loop for zeroing. */ - v_limb = vp[0]; - if (v_limb <= 1) + mp_limb_t i, sign, w, w0, w1; + mp_size_t n2; + mp_srcptr x, y; + + n2 = n >> 1; + ASSERT (n2 > 0); + + if (n & 1) { - if (v_limb == 1) - MPN_COPY (prodp, up, size); + /* Odd length. 
*/ + mp_size_t n1, n3, nm1; + + n3 = n - n2; + + sign = 0; + w = a[n2]; + if (w != 0) + w -= mpn_sub_n (p, a, a + n3, n2); else - MPN_ZERO (prodp, size); - cy_limb = 0; + { + i = n2; + do + { + --i; + w0 = a[i]; + w1 = a[n3+i]; + } + while (w0 == w1 && i != 0); + if (w0 < w1) + { + x = a + n3; + y = a; + sign = 1; + } + else + { + x = a; + y = a + n3; + } + mpn_sub_n (p, x, y, n2); + } + p[n2] = w; + + w = b[n2]; + if (w != 0) + w -= mpn_sub_n (p + n3, b, b + n3, n2); + else + { + i = n2; + do + { + --i; + w0 = b[i]; + w1 = b[n3+i]; + } + while (w0 == w1 && i != 0); + if (w0 < w1) + { + x = b + n3; + y = b; + sign ^= 1; + } + else + { + x = b; + y = b + n3; + } + mpn_sub_n (p + n3, x, y, n2); + } + p[n] = w; + + n1 = n + 1; + if (n2 < KARATSUBA_MUL_THRESHOLD) + { + if (n3 < KARATSUBA_MUL_THRESHOLD) + { + mpn_mul_basecase (ws, p, n3, p + n3, n3); + mpn_mul_basecase (p, a, n3, b, n3); + } + else + { + mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1); + mpn_kara_mul_n (p, a, b, n3, ws + n1); + } + mpn_mul_basecase (p + n1, a + n3, n2, b + n3, n2); + } + else + { + mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1); + mpn_kara_mul_n (p, a, b, n3, ws + n1); + mpn_kara_mul_n (p + n1, a + n3, b + n3, n2, ws + n1); + } + + if (sign) + mpn_add_n (ws, p, ws, n1); + else + mpn_sub_n (ws, p, ws, n1); + + nm1 = n - 1; + if (mpn_add_n (ws, p + n1, ws, nm1)) + { + mp_limb_t x = ws[nm1] + 1; + ws[nm1] = x; + if (x == 0) + ++ws[n]; + } + if (mpn_add_n (p + n3, p + n3, ws, n1)) + { + mp_limb_t x; + i = n1 + n3; + do + { + x = p[i] + 1; + p[i] = x; + ++i; + } while (x == 0); + } } else - cy_limb = mpn_mul_1 (prodp, up, size, v_limb); + { + /* Even length. 
*/ + mp_limb_t t; - prodp[size] = cy_limb; - prodp++; + i = n2; + do + { + --i; + w0 = a[i]; + w1 = a[n2+i]; + } + while (w0 == w1 && i != 0); + sign = 0; + if (w0 < w1) + { + x = a + n2; + y = a; + sign = 1; + } + else + { + x = a; + y = a + n2; + } + mpn_sub_n (p, x, y, n2); - /* For each iteration in the outer loop, multiply one limb from - U with one limb from V, and add it to PROD. */ - for (i = 1; i < size; i++) - { - v_limb = vp[i]; - if (v_limb <= 1) + i = n2; + do { - cy_limb = 0; - if (v_limb == 1) - cy_limb = mpn_add_n (prodp, prodp, up, size); + --i; + w0 = b[i]; + w1 = b[n2+i]; + } + while (w0 == w1 && i != 0); + if (w0 < w1) + { + x = b + n2; + y = b; + sign ^= 1; } else - cy_limb = mpn_addmul_1 (prodp, up, size, v_limb); + { + x = b; + y = b + n2; + } + mpn_sub_n (p + n2, x, y, n2); - prodp[size] = cy_limb; - prodp++; + /* Pointwise products. */ + if (n2 < KARATSUBA_MUL_THRESHOLD) + { + mpn_mul_basecase (ws, p, n2, p + n2, n2); + mpn_mul_basecase (p, a, n2, b, n2); + mpn_mul_basecase (p + n, a + n2, n2, b + n2, n2); + } + else + { + mpn_kara_mul_n (ws, p, p + n2, n2, ws + n); + mpn_kara_mul_n (p, a, b, n2, ws + n); + mpn_kara_mul_n (p + n, a + n2, b + n2, n2, ws + n); + } + + /* Interpolate. */ + if (sign) + w = mpn_add_n (ws, p, ws, n); + else + w = -mpn_sub_n (ws, p, ws, n); + w += mpn_add_n (ws, p + n, ws, n); + w += mpn_add_n (p + n2, p + n2, ws, n); + /* TO DO: could put "if (w) { ... }" here. + * Less work but badly predicted branch. + * No measurable difference in speed on Alpha. 
+ */ + i = n + n2; + t = p[i] + w; + p[i] = t; + if (t < w) + { + do + { + ++i; + w = p[i] + 1; + p[i] = w; + } + while (w == 0); + } } } void #if __STDC__ -impn_mul_n (mp_ptr prodp, - mp_srcptr up, mp_srcptr vp, mp_size_t size, mp_ptr tspace) +mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws) #else -impn_mul_n (prodp, up, vp, size, tspace) - mp_ptr prodp; - mp_srcptr up; - mp_srcptr vp; - mp_size_t size; - mp_ptr tspace; +mpn_kara_sqr_n (p, a, n, ws) + mp_ptr p; + mp_srcptr a; + mp_size_t n; + mp_ptr ws; #endif { - if ((size & 1) != 0) - { - /* The size is odd, the code code below doesn't handle that. - Multiply the least significant (size - 1) limbs with a recursive - call, and handle the most significant limb of S1 and S2 - separately. */ - /* A slightly faster way to do this would be to make the Karatsuba - code below behave as if the size were even, and let it check for - odd size in the end. I.e., in essence move this code to the end. - Doing so would save us a recursive call, and potentially make the - stack grow a lot less. */ - - mp_size_t esize = size - 1; /* even size */ - mp_limb_t cy_limb; - - MPN_MUL_N_RECURSE (prodp, up, vp, esize, tspace); - cy_limb = mpn_addmul_1 (prodp + esize, up, esize, vp[esize]); - prodp[esize + esize] = cy_limb; - cy_limb = mpn_addmul_1 (prodp + esize, vp, size, up[esize]); - - prodp[esize + size] = cy_limb; - } - else + mp_limb_t i, sign, w, w0, w1; + mp_size_t n2; + mp_srcptr x, y; + + n2 = n >> 1; + ASSERT (n2 > 0); + + if (n & 1) { - /* Anatolij Alekseevich Karatsuba's divide-and-conquer algorithm. + /* Odd length. */ + mp_size_t n1, n3, nm1; - Split U in two pieces, U1 and U0, such that - U = U0 + U1*(B**n), - and V in V1 and V0, such that - V = V0 + V1*(B**n). 
+ n3 = n - n2; - UV is then computed recursively using the identity + sign = 0; + w = a[n2]; + if (w != 0) + w -= mpn_sub_n (p, a, a + n3, n2); + else + { + i = n2; + do + { + --i; + w0 = a[i]; + w1 = a[n3+i]; + } + while (w0 == w1 && i != 0); + if (w0 < w1) + { + x = a + n3; + y = a; + sign = 1; + } + else + { + x = a; + y = a + n3; + } + mpn_sub_n (p, x, y, n2); + } + p[n2] = w; - 2n n n n - UV = (B + B )U V + B (U -U )(V -V ) + (B + 1)U V - 1 1 1 0 0 1 0 0 + w = a[n2]; + if (w != 0) + w -= mpn_sub_n (p + n3, a, a + n3, n2); + else + { + i = n2; + do + { + --i; + w0 = a[i]; + w1 = a[n3+i]; + } + while (w0 == w1 && i != 0); + if (w0 < w1) + { + x = a + n3; + y = a; + sign ^= 1; + } + else + { + x = a; + y = a + n3; + } + mpn_sub_n (p + n3, x, y, n2); + } + p[n] = w; - Where B = 2**BITS_PER_MP_LIMB. */ + n1 = n + 1; + if (n2 < KARATSUBA_SQR_THRESHOLD) + { + if (n3 < KARATSUBA_SQR_THRESHOLD) + { + mpn_sqr_basecase (ws, p, n3); + mpn_sqr_basecase (p, a, n3); + } + else + { + mpn_kara_sqr_n (ws, p, n3, ws + n1); + mpn_kara_sqr_n (p, a, n3, ws + n1); + } + mpn_sqr_basecase (p + n1, a + n3, n2); + } + else + { + mpn_kara_sqr_n (ws, p, n3, ws + n1); + mpn_kara_sqr_n (p, a, n3, ws + n1); + mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1); + } + + if (sign) + mpn_add_n (ws, p, ws, n1); + else + mpn_sub_n (ws, p, ws, n1); - mp_size_t hsize = size >> 1; - mp_limb_t cy; - int negflg; + nm1 = n - 1; + if (mpn_add_n (ws, p + n1, ws, nm1)) + { + mp_limb_t x = ws[nm1] + 1; + ws[nm1] = x; + if (x == 0) + ++ws[n]; + } + if (mpn_add_n (p + n3, p + n3, ws, n1)) + { + mp_limb_t x; + i = n1 + n3; + do + { + x = p[i] + 1; + p[i] = x; + ++i; + } while (x == 0); + } + } + else + { + /* Even length. */ + mp_limb_t t; - /*** Product H. ________________ ________________ - |_____U1 x V1____||____U0 x V0_____| */ - /* Put result in upper part of PROD and pass low part of TSPACE - as new TSPACE. 
*/ - MPN_MUL_N_RECURSE (prodp + size, up + hsize, vp + hsize, hsize, tspace); + i = n2; + do + { + --i; + w0 = a[i]; + w1 = a[n2+i]; + } + while (w0 == w1 && i != 0); + sign = 0; + if (w0 < w1) + { + x = a + n2; + y = a; + sign = 1; + } + else + { + x = a; + y = a + n2; + } + mpn_sub_n (p, x, y, n2); - /*** Product M. ________________ - |_(U1-U0)(V0-V1)_| */ - if (mpn_cmp (up + hsize, up, hsize) >= 0) + i = n2; + do { - mpn_sub_n (prodp, up + hsize, up, hsize); - negflg = 0; + --i; + w0 = a[i]; + w1 = a[n2+i]; + } + while (w0 == w1 && i != 0); + if (w0 < w1) + { + x = a + n2; + y = a; + sign ^= 1; } else { - mpn_sub_n (prodp, up, up + hsize, hsize); - negflg = 1; + x = a; + y = a + n2; } - if (mpn_cmp (vp + hsize, vp, hsize) >= 0) + mpn_sub_n (p + n2, x, y, n2); + + /* Pointwise products. */ + if (n2 < KARATSUBA_SQR_THRESHOLD) { - mpn_sub_n (prodp + hsize, vp + hsize, vp, hsize); - negflg ^= 1; + mpn_sqr_basecase (ws, p, n2); + mpn_sqr_basecase (p, a, n2); + mpn_sqr_basecase (p + n, a + n2, n2); } else { - mpn_sub_n (prodp + hsize, vp, vp + hsize, hsize); - /* No change of NEGFLG. */ + mpn_kara_sqr_n (ws, p, n2, ws + n); + mpn_kara_sqr_n (p, a, n2, ws + n); + mpn_kara_sqr_n (p + n, a + n2, n2, ws + n); } - /* Read temporary operands from low part of PROD. - Put result in low part of TSPACE using upper part of TSPACE - as new TSPACE. */ - MPN_MUL_N_RECURSE (tspace, prodp, prodp + hsize, hsize, tspace + size); - - /*** Add/copy product H. */ - MPN_COPY (prodp + hsize, prodp + size, hsize); - cy = mpn_add_n (prodp + size, prodp + size, prodp + size + hsize, hsize); - - /*** Add product M (if NEGFLG M is a negative number). */ - if (negflg) - cy -= mpn_sub_n (prodp + hsize, prodp + hsize, tspace, size); + + /* Interpolate. 
*/ + if (sign) + w = mpn_add_n (ws, p, ws, n); else - cy += mpn_add_n (prodp + hsize, prodp + hsize, tspace, size); + w = -mpn_sub_n (ws, p, ws, n); + w += mpn_add_n (ws, p + n, ws, n); + w += mpn_add_n (p + n2, p + n2, ws, n); + /* TO DO: could put "if (w) { ... }" here. + * Less work but badly predicted branch. + * No measurable difference in speed on Alpha. + */ + i = n + n2; + t = p[i] + w; + p[i] = t; + if (t < w) + { + do + { + ++i; + w = p[i] + 1; + p[i] = w; + } + while (w == 0); + } + } +} - /*** Product L. ________________ ________________ - |________________||____U0 x V0_____| */ - /* Read temporary operands from low part of PROD. - Put result in low part of TSPACE using upper part of TSPACE - as new TSPACE. */ - MPN_MUL_N_RECURSE (tspace, up, vp, hsize, tspace + size); +/*-- add2Times -------------------------------------------------------------*/ - /*** Add/copy Product L (twice). */ +/* z[] = x[] + 2 * y[] + Note that z and x might point to the same vectors. */ +#ifdef USE_MORE_MPN +static inline mp_limb_t +#if __STDC__ +add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n) +#else +add2Times (z, x, y, n) + mp_ptr z; + mp_srcptr x; + mp_srcptr y; + mp_size_t n; +#endif +{ + mp_ptr t; + mp_limb_t c; + TMP_DECL (marker); + TMP_MARK (marker); + t = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB); + c = mpn_lshift (t, y, n, 1); + c += mpn_add_n (z, x, t, n); + TMP_FREE (marker); + return c; +} +#else - cy += mpn_add_n (prodp + hsize, prodp + hsize, tspace, size); - if (cy) - mpn_add_1 (prodp + hsize + size, prodp + hsize + size, hsize, cy); +static mp_limb_t +#if __STDC__ +add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n) +#else +add2Times (z, x, y, n) + mp_ptr z; + mp_srcptr x; + mp_srcptr y; + mp_size_t n; +#endif +{ + mp_limb_t c, v, w; - MPN_COPY (prodp, tspace, hsize); - cy = mpn_add_n (prodp + hsize, prodp + hsize, tspace + hsize, hsize); - if (cy) - mpn_add_1 (prodp + size, prodp + size, size, 1); + ASSERT (n > 0); + v = *x; w = *y; + c = 
w >> (BITS_PER_MP_LIMB - 1); + w <<= 1; + v += w; + c += v < w; + *z = v; + ++x; ++y; ++z; + while (--n) + { + v = *x; + w = *y; + v += c; + c = v < c; + c += w >> (BITS_PER_MP_LIMB - 1); + w <<= 1; + v += w; + c += v < w; + *z = v; + ++x; ++y; ++z; } + + return c; } +#endif -void +/*-- evaluate3 -------------------------------------------------------------*/ + +/* Evaluates: + * ph := 4*A+2*B+C + * p1 := A+B+C + * p2 := A+2*B+4*C + * where: + * ph[], p1[], p2[], A[] and B[] all have length len, + * C[] has length len2 with len-len2 = 0, 1 or 2. + * Returns top words (overflow) at pth, pt1 and pt2 respectively. + */ +#ifdef USE_MORE_MPN +static void +#if __STDC__ +evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2, + mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t len, mp_size_t len2) +#else +evaluate3 (ph, p1, p2, pth, pt1, pt2, + A, B, C, len, len2) + mp_ptr ph; + mp_ptr p1; + mp_ptr p2; + mp_ptr pth; + mp_ptr pt1; + mp_ptr pt2; + mp_srcptr A; + mp_srcptr B; + mp_srcptr C; + mp_size_t len; + mp_size_t len2; +#endif +{ + mp_limb_t c, d, e; + + ASSERT (len - len2 <= 2); + + e = mpn_lshift (p1, B, len, 1); + + c = mpn_lshift (ph, A, len, 2); + c += e + mpn_add_n (ph, ph, p1, len); + d = mpn_add_n (ph, ph, C, len2); + if (len2 == len) c += d; else c += mpn_add_1 (ph + len2, ph + len2, len-len2, d); + ASSERT (c < 7); + *pth = c; + + c = mpn_lshift (p2, C, len2, 2); +#if 1 + if (len2 != len) { p2[len-1] = 0; p2[len2] = c; c = 0; } + c += e + mpn_add_n (p2, p2, p1, len); +#else + d = mpn_add_n (p2, p2, p1, len2); + c += d; + if (len2 != len) c = mpn_add_1 (p2+len2, p1+len2, len-len2, c); + c += e; +#endif + c += mpn_add_n (p2, p2, A, len); + ASSERT (c < 7); + *pt2 = c; + + c = mpn_add_n (p1, A, B, len); + d = mpn_add_n (p1, p1, C, len2); + if (len2 == len) c += d; + else c += mpn_add_1 (p1+len2, p1+len2, len-len2, d); + ASSERT (c < 3); + *pt1 = c; + +} + +#else + +static void #if __STDC__ -impn_sqr_n_basecase (mp_ptr prodp, mp_srcptr up, 
mp_size_t size) +evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2, + mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t l, mp_size_t ls) #else -impn_sqr_n_basecase (prodp, up, size) - mp_ptr prodp; - mp_srcptr up; - mp_size_t size; +evaluate3 (ph, p1, p2, pth, pt1, pt2, + A, B, C, l, ls) + mp_ptr ph; + mp_ptr p1; + mp_ptr p2; + mp_ptr pth; + mp_ptr pt1; + mp_ptr pt2; + mp_srcptr A; + mp_srcptr B; + mp_srcptr C; + mp_size_t l; + mp_size_t ls; #endif { - mp_size_t i; - mp_limb_t cy_limb; - mp_limb_t v_limb; - - /* Multiply by the first limb in V separately, as the result can be - stored (not added) to PROD. We also avoid a loop for zeroing. */ - v_limb = up[0]; - if (v_limb <= 1) + mp_limb_t a,b,c, i, t, th,t1,t2, vh,v1,v2; + + ASSERT (l - ls <= 2); + + th = t1 = t2 = 0; + for (i = 0; i < l; ++i) { - if (v_limb == 1) - MPN_COPY (prodp, up, size); - else - MPN_ZERO (prodp, size); - cy_limb = 0; + a = *A; + b = *B; + c = i < ls ? *C : 0; + + /* TO DO: choose one of the following alternatives. 
*/ +#if 0 + t = a << 2; + vh = th + t; + th = vh < t; + th += a >> (BITS_PER_MP_LIMB - 2); + t = b << 1; + vh += t; + th += vh < t; + th += b >> (BITS_PER_MP_LIMB - 1); + vh += c; + th += vh < c; +#else + vh = th + c; + th = vh < c; + t = b << 1; + vh += t; + th += vh < t; + th += b >> (BITS_PER_MP_LIMB - 1); + t = a << 2; + vh += t; + th += vh < t; + th += a >> (BITS_PER_MP_LIMB - 2); +#endif + + v1 = t1 + a; + t1 = v1 < a; + v1 += b; + t1 += v1 < b; + v1 += c; + t1 += v1 < c; + + v2 = t2 + a; + t2 = v2 < a; + t = b << 1; + v2 += t; + t2 += v2 < t; + t2 += b >> (BITS_PER_MP_LIMB - 1); + t = c << 2; + v2 += t; + t2 += v2 < t; + t2 += c >> (BITS_PER_MP_LIMB - 2); + + *ph = vh; + *p1 = v1; + *p2 = v2; + + ++A; ++B; ++C; + ++ph; ++p1; ++p2; + } + + ASSERT (th < 7); + ASSERT (t1 < 3); + ASSERT (t2 < 7); + + *pth = th; + *pt1 = t1; + *pt2 = t2; +} +#endif + + +/*-- interpolate3 ----------------------------------------------------------*/ + +/* Interpolates B, C, D (in-place) from: + * 16*A+8*B+4*C+2*D+E + * A+B+C+D+E + * A+2*B+4*C+8*D+16*E + * where: + * A[], B[], C[] and D[] all have length l, + * E[] has length ls with l-ls = 0, 2 or 4. + * + * Reads top words (from earlier overflow) from ptb, ptc and ptd, + * and returns new top words there. + */ + +#ifdef USE_MORE_MPN +static void +#if __STDC__ +interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E, + mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t len, mp_size_t len2) +#else +interpolate3 (A, B, C, D, E, + ptb, ptc, ptd, len, len2) + mp_srcptr A; + mp_ptr B; + mp_ptr C; + mp_ptr D; + mp_srcptr E; + mp_ptr ptb; + mp_ptr ptc; + mp_ptr ptd; + mp_size_t len; + mp_size_t len2; +#endif +{ + mp_ptr ws; + mp_limb_t t, tb,tc,td; + TMP_DECL (marker); + TMP_MARK (marker); + + ASSERT (len - len2 == 0 || len - len2 == 2 || len - len2 == 4); + + /* Let x1, x2, x3 be the values to interpolate. 
We have: + * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e + * c = a + x1 + x2 + x3 + e + * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e + */ + + ws = (mp_ptr) TMP_ALLOC (len * BYTES_PER_MP_LIMB); + + tb = *ptb; tc = *ptc; td = *ptd; + + + /* b := b - 16*a - e + * c := c - a - e + * d := d - a - 16*e + */ + + t = mpn_lshift (ws, A, len, 4); + tb -= t + mpn_sub_n (B, B, ws, len); + t = mpn_sub_n (B, B, E, len2); + if (len2 == len) tb -= t; + else tb -= mpn_sub_1 (B+len2, B+len2, len-len2, t); + + tc -= mpn_sub_n (C, C, A, len); + t = mpn_sub_n (C, C, E, len2); + if (len2 == len) tc -= t; + else tc -= mpn_sub_1 (C+len2, C+len2, len-len2, t); + + t = mpn_lshift (ws, E, len2, 4); + t += mpn_add_n (ws, ws, A, len2); +#if 1 + if (len2 != len) t = mpn_add_1 (ws+len2, A+len2, len-len2, t); + td -= t + mpn_sub_n (D, D, ws, len); +#else + t += mpn_sub_n (D, D, ws, len2); + if (len2 != len) { + t = mpn_sub_1 (D+len2, D+len2, len-len2, t); + t += mpn_sub_n (D+len2, D+len2, A+len2, len-len2); + } /* end if/else */ + td -= t; +#endif + + + /* b, d := b + d, b - d */ + +#ifdef HAVE_MPN_ADD_SUB_N + /* #error TO DO ... */ +#else + t = tb + td + mpn_add_n (ws, B, D, len); + td = tb - td - mpn_sub_n (D, B, D, len); + tb = t; + MPN_COPY (B, ws, len); +#endif + + /* b := b-8*c */ + t = 8 * tc + mpn_lshift (ws, C, len, 3); + tb -= t + mpn_sub_n (B, B, ws, len); + + /* c := 2*c - b */ + tc = 2 * tc + mpn_lshift (C, C, len, 1); + tc -= tb + mpn_sub_n (C, C, B, len); + + /* d := d/3 */ + td = (td - mpn_divexact_by3 (D, D, len)) * INVERSE_3; + + /* b, d := b + d, b - d */ +#ifdef HAVE_MPN_ADD_SUB_N + /* #error TO DO ... 
*/ +#else + t = tb + td + mpn_add_n (ws, B, D, len); + td = tb - td - mpn_sub_n (D, B, D, len); + tb = t; + MPN_COPY (B, ws, len); +#endif + + /* Now: + * b = 4*x1 + * c = 2*x2 + * d = 4*x3 + */ + + ASSERT(!(*B & 3)); + mpn_rshift (B, B, len, 2); + B[len-1] |= tb<<(BITS_PER_MP_LIMB-2); + ASSERT((long)tb >= 0); + tb >>= 2; + + ASSERT(!(*C & 1)); + mpn_rshift (C, C, len, 1); + C[len-1] |= tc<<(BITS_PER_MP_LIMB-1); + ASSERT((long)tc >= 0); + tc >>= 1; + + ASSERT(!(*D & 3)); + mpn_rshift (D, D, len, 2); + D[len-1] |= td<<(BITS_PER_MP_LIMB-2); + ASSERT((long)td >= 0); + td >>= 2; + +#if WANT_ASSERT + ASSERT (tb < 2); + if (len == len2) + { + ASSERT (tc < 3); + ASSERT (td < 2); } else - cy_limb = mpn_mul_1 (prodp, up, size, v_limb); + { + ASSERT (tc < 2); + ASSERT (!td); + } +#endif + + *ptb = tb; + *ptc = tc; + *ptd = td; + + TMP_FREE (marker); +} - prodp[size] = cy_limb; - prodp++; +#else + +static void +#if __STDC__ +interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E, + mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t l, mp_size_t ls) +#else +interpolate3 (A, B, C, D, E, + ptb, ptc, ptd, l, ls) + mp_srcptr A; + mp_ptr B; + mp_ptr C; + mp_ptr D; + mp_srcptr E; + mp_ptr ptb; + mp_ptr ptc; + mp_ptr ptd; + mp_size_t l; + mp_size_t ls; +#endif +{ + mp_limb_t a,b,c,d,e,t, i, sb,sc,sd, ob,oc,od; + const mp_limb_t maskOffHalf = (~(mp_limb_t) 0) << (BITS_PER_MP_LIMB >> 1); + +#if WANT_ASSERT + t = l - ls; + ASSERT (t == 0 || t == 2 || t == 4); +#endif - /* For each iteration in the outer loop, multiply one limb from - U with one limb from V, and add it to PROD. */ - for (i = 1; i < size; i++) + sb = sc = sd = 0; + for (i = 0; i < l; ++i) { - v_limb = up[i]; - if (v_limb <= 1) + mp_limb_t tb, tc, td, tt; + + a = *A; + b = *B; + c = *C; + d = *D; + e = i < ls ? *E : 0; + + /* Let x1, x2, x3 be the values to interpolate. 
We have: + * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e + * c = a + x1 + x2 + x3 + e + * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e + */ + + /* b := b - 16*a - e + * c := c - a - e + * d := d - a - 16*e + */ + t = a << 4; + tb = -(a >> (BITS_PER_MP_LIMB - 4)) - (b < t); + b -= t; + tb -= b < e; + b -= e; + tc = -(c < a); + c -= a; + tc -= c < e; + c -= e; + td = -(d < a); + d -= a; + t = e << 4; + td = td - (e >> (BITS_PER_MP_LIMB - 4)) - (d < t); + d -= t; + + /* b, d := b + d, b - d */ + t = b + d; + tt = tb + td + (t < b); + td = tb - td - (b < d); + d = b - d; + b = t; + tb = tt; + + /* b := b-8*c */ + t = c << 3; + tb = tb - (tc << 3) - (c >> (BITS_PER_MP_LIMB - 3)) - (b < t); + b -= t; + + /* c := 2*c - b */ + t = c << 1; + tc = (tc << 1) + (c >> (BITS_PER_MP_LIMB - 1)) - tb - (t < b); + c = t - b; + + /* d := d/3 */ + d *= INVERSE_3; + td = td - (d >> (BITS_PER_MP_LIMB - 1)) - (d*3 < d); + td *= INVERSE_3; + + /* b, d := b + d, b - d */ + t = b + d; + tt = tb + td + (t < b); + td = tb - td - (b < d); + d = b - d; + b = t; + tb = tt; + + /* Now: + * b = 4*x1 + * c = 2*x2 + * d = 4*x3 + */ + + /* sb has period 2. */ + b += sb; + tb += b < sb; + sb &= maskOffHalf; + sb |= sb >> (BITS_PER_MP_LIMB >> 1); + sb += tb; + + /* sc has period 1. */ + c += sc; + tc += c < sc; + /* TO DO: choose one of the following alternatives. */ +#if 1 + sc = (mp_limb_t)((long)sc >> (BITS_PER_MP_LIMB - 1)); + sc += tc; +#else + sc = tc - ((long)sc < 0L); +#endif + + /* sd has period 2. */ + d += sd; + td += d < sd; + sd &= maskOffHalf; + sd |= sd >> (BITS_PER_MP_LIMB >> 1); + sd += td; + + if (i != 0) { - cy_limb = 0; - if (v_limb == 1) - cy_limb = mpn_add_n (prodp, prodp, up, size); + B[-1] = ob | b << (BITS_PER_MP_LIMB - 2); + C[-1] = oc | c << (BITS_PER_MP_LIMB - 1); + D[-1] = od | d << (BITS_PER_MP_LIMB - 2); } - else - cy_limb = mpn_addmul_1 (prodp, up, size, v_limb); + ob = b >> 2; + oc = c >> 1; + od = d >> 2; + + ++A; ++B; ++C; ++D; ++E; + } + + /* Handle top words. 
*/ + b = *ptb; + c = *ptc; + d = *ptd; + + t = b + d; + d = b - d; + b = t; + b -= c << 3; + c = (c << 1) - b; + d *= INVERSE_3; + t = b + d; + d = b - d; + b = t; + + b += sb; + c += sc; + d += sd; + + B[-1] = ob | b << (BITS_PER_MP_LIMB - 2); + C[-1] = oc | c << (BITS_PER_MP_LIMB - 1); + D[-1] = od | d << (BITS_PER_MP_LIMB - 2); - prodp[size] = cy_limb; - prodp++; + b >>= 2; + c >>= 1; + d >>= 2; + +#if WANT_ASSERT + ASSERT (b < 2); + if (l == ls) + { + ASSERT (c < 3); + ASSERT (d < 2); + } + else + { + ASSERT (c < 2); + ASSERT (!d); } +#endif + + *ptb = b; + *ptc = c; + *ptd = d; } +#endif + + +/*-- mpn_toom3_mul_n --------------------------------------------------------------*/ + +/* Multiplies using 5 mults of one third size and so on recursively. + * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1]. + * No overlap of p[...] with a[...] or b[...]. + * ws is workspace. + */ + +/* TO DO: If TOOM3_MUL_THRESHOLD is much bigger than KARATSUBA_MUL_THRESHOLD then the + * recursion in mpn_toom3_mul_n() will always bottom out with mpn_kara_mul_n() + * because the "n < KARATSUBA_MUL_THRESHOLD" test here will always be false. + */ + +#define TOOM3_MUL_REC(p, a, b, n, ws) \ + do { \ + if (n < KARATSUBA_MUL_THRESHOLD) \ + mpn_mul_basecase (p, a, n, b, n); \ + else if (n < TOOM3_MUL_THRESHOLD) \ + mpn_kara_mul_n (p, a, b, n, ws); \ + else \ + mpn_toom3_mul_n (p, a, b, n, ws); \ + } while (0) void #if __STDC__ -impn_sqr_n (mp_ptr prodp, - mp_srcptr up, mp_size_t size, mp_ptr tspace) +mpn_toom3_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws) #else -impn_sqr_n (prodp, up, size, tspace) - mp_ptr prodp; - mp_srcptr up; - mp_size_t size; - mp_ptr tspace; +mpn_toom3_mul_n (p, a, b, n, ws) + mp_ptr p; + mp_srcptr a; + mp_srcptr b; + mp_size_t n; + mp_ptr ws; #endif { - if ((size & 1) != 0) + mp_limb_t cB,cC,cD, dB,dC,dD, tB,tC,tD; + mp_limb_t *A,*B,*C,*D,*E, *W; + mp_size_t l,l2,l3,l4,l5,ls; + + /* Break n words into chunks of size l, l and ls. 
+ * n = 3*k => l = k, ls = k + * n = 3*k+1 => l = k+1, ls = k-1 + * n = 3*k+2 => l = k+1, ls = k + */ + { + mp_limb_t m; + + ASSERT (n >= TOOM3_MUL_THRESHOLD); + l = ls = n / 3; + m = n - l * 3; + if (m != 0) + ++l; + if (m == 1) + --ls; + + l2 = l * 2; + l3 = l * 3; + l4 = l * 4; + l5 = l * 5; + A = p; + B = ws; + C = p + l2; + D = ws + l2; + E = p + l4; + W = ws + l4; + } + + /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/ + evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls); + evaluate3 (A + l, B + l, C + l, &dB, &dC, &dD, b, b + l, b + l2, l, ls); + + /** Second stage: pointwise multiplies. **/ + TOOM3_MUL_REC(D, C, C + l, l, W); + tD = cD*dD; + if (cD) tD += mpn_addmul_1 (D + l, C + l, l, cD); + if (dD) tD += mpn_addmul_1 (D + l, C, l, dD); + ASSERT (tD < 49); + TOOM3_MUL_REC(C, B, B + l, l, W); + tC = cC*dC; + /* TO DO: choose one of the following alternatives. */ +#if 0 + if (cC) tC += mpn_addmul_1 (C + l, B + l, l, cC); + if (dC) tC += mpn_addmul_1 (C + l, B, l, dC); +#else + if (cC) { - /* The size is odd, the code code below doesn't handle that. - Multiply the least significant (size - 1) limbs with a recursive - call, and handle the most significant limb of S1 and S2 - separately. */ - /* A slightly faster way to do this would be to make the Karatsuba - code below behave as if the size were even, and let it check for - odd size in the end. I.e., in essence move this code to the end. - Doing so would save us a recursive call, and potentially make the - stack grow a lot less. 
*/ - - mp_size_t esize = size - 1; /* even size */ - mp_limb_t cy_limb; - - MPN_SQR_N_RECURSE (prodp, up, esize, tspace); - cy_limb = mpn_addmul_1 (prodp + esize, up, esize, up[esize]); - prodp[esize + esize] = cy_limb; - cy_limb = mpn_addmul_1 (prodp + esize, up, size, up[esize]); - - prodp[esize + size] = cy_limb; + if (cC == 1) tC += mpn_add_n (C + l, C + l, B + l, l); + else tC += add2Times (C + l, C + l, B + l, l); } - else + if (dC) { - mp_size_t hsize = size >> 1; - mp_limb_t cy; - - /*** Product H. ________________ ________________ - |_____U1 x U1____||____U0 x U0_____| */ - /* Put result in upper part of PROD and pass low part of TSPACE - as new TSPACE. */ - MPN_SQR_N_RECURSE (prodp + size, up + hsize, hsize, tspace); - - /*** Product M. ________________ - |_(U1-U0)(U0-U1)_| */ - if (mpn_cmp (up + hsize, up, hsize) >= 0) - { - mpn_sub_n (prodp, up + hsize, up, hsize); - } - else - { - mpn_sub_n (prodp, up, up + hsize, hsize); - } + if (dC == 1) tC += mpn_add_n (C + l, C + l, B, l); + else tC += add2Times (C + l, C + l, B, l); + } +#endif + ASSERT (tC < 9); + TOOM3_MUL_REC(B, A, A + l, l, W); + tB = cB*dB; + if (cB) tB += mpn_addmul_1 (B + l, A + l, l, cB); + if (dB) tB += mpn_addmul_1 (B + l, A, l, dB); + ASSERT (tB < 49); + TOOM3_MUL_REC(A, a, b, l, W); + TOOM3_MUL_REC(E, a + l2, b + l2, ls, W); - /* Read temporary operands from low part of PROD. - Put result in low part of TSPACE using upper part of TSPACE - as new TSPACE. */ - MPN_SQR_N_RECURSE (tspace, prodp, hsize, tspace + size); + /** Third stage: interpolation. **/ + interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1); - /*** Add/copy product H. */ - MPN_COPY (prodp + hsize, prodp + size, hsize); - cy = mpn_add_n (prodp + size, prodp + size, prodp + size + hsize, hsize); + /** Final stage: add up the coefficients. 
**/ + { + mp_limb_t i, x, y; + tB += mpn_add_n (p + l, p + l, B, l2); + tD += mpn_add_n (p + l3, p + l3, D, l2); + mpn_incr_u (p + l3, tB); + mpn_incr_u (p + l4, tC); + mpn_incr_u (p + l5, tD); + } +} - /*** Add product M (if NEGFLG M is a negative number). */ - cy -= mpn_sub_n (prodp + hsize, prodp + hsize, tspace, size); +/*-- mpn_toom3_sqr_n --------------------------------------------------------------*/ - /*** Product L. ________________ ________________ - |________________||____U0 x U0_____| */ - /* Read temporary operands from low part of PROD. - Put result in low part of TSPACE using upper part of TSPACE - as new TSPACE. */ - MPN_SQR_N_RECURSE (tspace, up, hsize, tspace + size); +/* Like previous function but for squaring */ - /*** Add/copy Product L (twice). */ +#define TOOM3_SQR_REC(p, a, n, ws) \ + do { \ + if (n < KARATSUBA_SQR_THRESHOLD) \ + mpn_sqr_basecase (p, a, n); \ + else if (n < TOOM3_SQR_THRESHOLD) \ + mpn_kara_sqr_n (p, a, n, ws); \ + else \ + mpn_toom3_sqr_n (p, a, n, ws); \ + } while (0) - cy += mpn_add_n (prodp + hsize, prodp + hsize, tspace, size); - if (cy) - mpn_add_1 (prodp + hsize + size, prodp + hsize + size, hsize, cy); +void +#if __STDC__ +mpn_toom3_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws) +#else +mpn_toom3_sqr_n (p, a, n, ws) + mp_ptr p; + mp_srcptr a; + mp_size_t n; + mp_ptr ws; +#endif +{ + mp_limb_t cB,cC,cD, tB,tC,tD; + mp_limb_t *A,*B,*C,*D,*E, *W; + mp_size_t l,l2,l3,l4,l5,ls; + + /* Break n words into chunks of size l, l and ls. 
+ * n = 3*k => l = k, ls = k + * n = 3*k+1 => l = k+1, ls = k-1 + * n = 3*k+2 => l = k+1, ls = k + */ + { + mp_limb_t m; + + ASSERT (n >= TOOM3_MUL_THRESHOLD); + l = ls = n / 3; + m = n - l * 3; + if (m != 0) + ++l; + if (m == 1) + --ls; - MPN_COPY (prodp, tspace, hsize); - cy = mpn_add_n (prodp + hsize, prodp + hsize, tspace + hsize, hsize); - if (cy) - mpn_add_1 (prodp + size, prodp + size, size, 1); + l2 = l * 2; + l3 = l * 3; + l4 = l * 4; + l5 = l * 5; + A = p; + B = ws; + C = p + l2; + D = ws + l2; + E = p + l4; + W = ws + l4; + } + + /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/ + evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls); + + /** Second stage: pointwise multiplies. **/ + TOOM3_SQR_REC(D, C, l, W); + tD = cD*cD; + if (cD) tD += mpn_addmul_1 (D + l, C, l, 2*cD); + ASSERT (tD < 49); + TOOM3_SQR_REC(C, B, l, W); + tC = cC*cC; + /* TO DO: choose one of the following alternatives. */ +#if 0 + if (cC) tC += mpn_addmul_1 (C + l, B, l, 2*cC); +#else + if (cC >= 1) + { + tC += add2Times (C + l, C + l, B, l); + if (cC == 2) + tC += add2Times (C + l, C + l, B, l); } +#endif + ASSERT (tC < 9); + TOOM3_SQR_REC(B, A, l, W); + tB = cB*cB; + if (cB) tB += mpn_addmul_1 (B + l, A, l, 2*cB); + ASSERT (tB < 49); + TOOM3_SQR_REC(A, a, l, W); + TOOM3_SQR_REC(E, a + l2, ls, W); + + /** Third stage: interpolation. **/ + interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1); + + /** Final stage: add up the coefficients. **/ + { + mp_limb_t i, x, y; + tB += mpn_add_n (p + l, p + l, B, l2); + tD += mpn_add_n (p + l3, p + l3, D, l2); + mpn_incr_u (p + l3, tB); + mpn_incr_u (p + l4, tC); + mpn_incr_u (p + l5, tD); + } } -/* This should be made into an inline function in gmp.h. 
*/ -inline void +void #if __STDC__ -mpn_mul_n (mp_ptr prodp, mp_srcptr up, mp_srcptr vp, mp_size_t size) +mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n) #else -mpn_mul_n (prodp, up, vp, size) - mp_ptr prodp; - mp_srcptr up; - mp_srcptr vp; - mp_size_t size; +mpn_mul_n (p, a, b, n) + mp_ptr p; + mp_srcptr a; + mp_srcptr b; + mp_size_t n; #endif { - TMP_DECL (marker); - TMP_MARK (marker); - if (up == vp) + if (n < KARATSUBA_MUL_THRESHOLD) + mpn_mul_basecase (p, a, n, b, n); + else if (n < TOOM3_MUL_THRESHOLD) { - if (size < KARATSUBA_THRESHOLD) - { - impn_sqr_n_basecase (prodp, up, size); - } - else - { - mp_ptr tspace; - tspace = (mp_ptr) TMP_ALLOC (2 * size * BYTES_PER_MP_LIMB); - impn_sqr_n (prodp, up, size, tspace); - } + /* Allocate workspace of fixed size on stack: fast! */ +#if TUNE_PROGRAM_BUILD + mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD_LIMIT-1) + 2 * BITS_PER_MP_LIMB]; +#else + mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD-1) + 2 * BITS_PER_MP_LIMB]; +#endif + mpn_kara_mul_n (p, a, b, n, ws); } +#if WANT_FFT || TUNE_PROGRAM_BUILD + else if (n < FFT_MUL_THRESHOLD) +#else else +#endif { - if (size < KARATSUBA_THRESHOLD) - { - impn_mul_n_basecase (prodp, up, vp, size); - } - else - { - mp_ptr tspace; - tspace = (mp_ptr) TMP_ALLOC (2 * size * BYTES_PER_MP_LIMB); - impn_mul_n (prodp, up, vp, size, tspace); - } + /* Use workspace of unknown size in heap, as stack space may + * be limited. Since n is at least TOOM3_MUL_THRESHOLD, the + * multiplication will take much longer than malloc()/free(). 
*/ + mp_limb_t wsLen, *ws; + wsLen = 2 * n + 3 * BITS_PER_MP_LIMB; + ws = (mp_ptr) (*_mp_allocate_func) ((size_t) wsLen * sizeof (mp_limb_t)); + mpn_toom3_mul_n (p, a, b, n, ws); + (*_mp_free_func) (ws, (size_t) wsLen * sizeof (mp_limb_t)); } - TMP_FREE (marker); +#if WANT_FFT || TUNE_PROGRAM_BUILD + else + { + mpn_mul_fft_full (p, a, n, b, n); + } +#endif } diff --git a/ghc/rts/gmp/mpn/generic/perfsqr.c b/ghc/rts/gmp/mpn/generic/perfsqr.c index 5a6e2af..42ee340 100644 --- a/ghc/rts/gmp/mpn/generic/perfsqr.c +++ b/ghc/rts/gmp/mpn/generic/perfsqr.c @@ -1,46 +1,31 @@ /* mpn_perfect_square_p(u,usize) -- Return non-zero if U is a perfect square, zero otherwise. -Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation, +Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ +#include /* for NULL */ #include "gmp.h" #include "gmp-impl.h" #include "longlong.h" -#ifndef UMUL_TIME -#define UMUL_TIME 1 -#endif - -#ifndef UDIV_TIME -#define UDIV_TIME UMUL_TIME -#endif - -#if BITS_PER_MP_LIMB == 32 -#define PP 0xC0CFD797L /* 3 x 5 x 7 x 11 x 13 x ... x 29 */ -#define PP_INVERTED 0x53E5645CL -#endif - -#if BITS_PER_MP_LIMB == 64 -#define PP 0xE221F97C30E94E1DL /* 3 x 5 x 7 x 11 x 13 x ... x 53 */ -#define PP_INVERTED 0x21CFE6CFC938B36BL -#endif /* sq_res_0x100[x mod 0x100] == 1 iff x mod 0x100 is a quadratic residue modulo 0x100. */ @@ -92,17 +77,17 @@ mpn_perfect_square_p (up, usize) size of A. */ #if BITS_PER_MP_LIMB == 64 - if (((0x12DD703303AED3L >> rem % 53) & 1) == 0) + if (((CNST_LIMB(0x12DD703303AED3) >> rem % 53) & 1) == 0) return 0; - if (((0x4351B2753DFL >> rem % 47) & 1) == 0) + if (((CNST_LIMB(0x4351B2753DF) >> rem % 47) & 1) == 0) return 0; - if (((0x35883A3EE53L >> rem % 43) & 1) == 0) + if (((CNST_LIMB(0x35883A3EE53) >> rem % 43) & 1) == 0) return 0; - if (((0x1B382B50737L >> rem % 41) & 1) == 0) + if (((CNST_LIMB(0x1B382B50737) >> rem % 41) & 1) == 0) return 0; - if (((0x165E211E9BL >> rem % 37) & 1) == 0) + if (((CNST_LIMB(0x165E211E9B) >> rem % 37) & 1) == 0) return 0; - if (((0x121D47B7L >> rem % 31) & 1) == 0) + if (((CNST_LIMB(0x121D47B7) >> rem % 31) & 1) == 0) return 0; #endif if (((0x13D122F3L >> rem % 29) & 1) == 0) diff --git a/ghc/rts/gmp/mpn/generic/popcount.c b/ghc/rts/gmp/mpn/generic/popcount.c index c48573a..387be95 100644 --- a/ghc/rts/gmp/mpn/generic/popcount.c +++ b/ghc/rts/gmp/mpn/generic/popcount.c @@ -1,20 +1,20 @@ /* popcount.c -Copyright (C) 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -23,7 +23,9 @@ MA 02111-1307, USA. */ #include "gmp-impl.h" #if defined __GNUC__ -#if defined __sparc_v9__ && BITS_PER_MP_LIMB == 64 +/* No processor claiming to be SPARC v9 compliant seem to + implement the POPC instruction. Disable pattern for now. */ +#if 0 && defined __sparc_v9__ && BITS_PER_MP_LIMB == 64 #define popc_limb(a) \ ({ \ DItype __res; \ @@ -39,15 +41,19 @@ MA 02111-1307, USA. */ You have to figure out how this works, I won't tell you! */ static inline unsigned int +#if __STDC__ +popc_limb (mp_limb_t x) +#else popc_limb (x) mp_limb_t x; +#endif { #if BITS_PER_MP_LIMB == 64 /* We have to go into some trouble to define these constants. (For mp_limb_t being `long long'.) 
*/ mp_limb_t cnst; - cnst = 0x55555555L | ((mp_limb_t) 0x55555555L << BITS_PER_MP_LIMB/2); - x = ((x & ~cnst) >> 1) + (x & cnst); + cnst = 0xaaaaaaaaL | ((mp_limb_t) 0xaaaaaaaaL << BITS_PER_MP_LIMB/2); + x -= (x & cnst) >> 1; cnst = 0x33333333L | ((mp_limb_t) 0x33333333L << BITS_PER_MP_LIMB/2); x = ((x & ~cnst) >> 2) + (x & cnst); cnst = 0x0f0f0f0fL | ((mp_limb_t) 0x0f0f0f0fL << BITS_PER_MP_LIMB/2); @@ -57,7 +63,7 @@ popc_limb (x) x = ((x >> 32) + x) & 0xff; #endif #if BITS_PER_MP_LIMB == 32 - x = ((x >> 1) & 0x55555555L) + (x & 0x55555555L); + x -= (x & 0xaaaaaaaa) >> 1; x = ((x >> 2) & 0x33333333L) + (x & 0x33333333L); x = ((x >> 4) + x) & 0x0f0f0f0fL; x = ((x >> 8) + x); diff --git a/ghc/rts/gmp/mpn/generic/pre_mod_1.c b/ghc/rts/gmp/mpn/generic/pre_mod_1.c index 92d413b..2717968 100644 --- a/ghc/rts/gmp/mpn/generic/pre_mod_1.c +++ b/ghc/rts/gmp/mpn/generic/pre_mod_1.c @@ -10,16 +10,16 @@ Copyright (C) 1991, 1993, 1994, Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpn/generic/random.c b/ghc/rts/gmp/mpn/generic/random.c new file mode 100644 index 0000000..dea4e20 --- /dev/null +++ b/ghc/rts/gmp/mpn/generic/random.c @@ -0,0 +1,43 @@ +/* mpn_random -- Generate random numbers. + +Copyright (C) 1996, 1997, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "urandom.h" + +void +#if __STDC__ +mpn_random (mp_ptr res_ptr, mp_size_t size) +#else +mpn_random (res_ptr, size) + mp_ptr res_ptr; + mp_size_t size; +#endif +{ + mp_size_t i; + + for (i = 0; i < size; i++) + res_ptr[i] = urandom (); + + /* Make the most significant limb non-zero; skipped when SIZE <= 0, as there is no limb to fix. */ + while (size > 0 && res_ptr[size - 1] == 0) + res_ptr[size - 1] = urandom (); +} diff --git a/ghc/rts/gmp/mpn/generic/rshift.c b/ghc/rts/gmp/mpn/generic/rshift.c index 804f9be..59caf73 100644 --- a/ghc/rts/gmp/mpn/generic/rshift.c +++ b/ghc/rts/gmp/mpn/generic/rshift.c @@ -5,16 +5,16 @@ Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpn/generic/sb_divrem_mn.c b/ghc/rts/gmp/mpn/generic/sb_divrem_mn.c new file mode 100644 index 0000000..a269e34 --- /dev/null +++ b/ghc/rts/gmp/mpn/generic/sb_divrem_mn.c @@ -0,0 +1,201 @@ +/* mpn_sb_divrem_mn -- Divide natural numbers, producing both remainder and + quotient. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE + INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. + IN FACT, IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A + FUTURE GNU MP RELEASE. + + +Copyright (C) 1993, 1994, 1995, 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Divide num (NP/NSIZE) by den (DP/DSIZE) and write + the NSIZE-DSIZE least significant quotient limbs at QP + and the DSIZE long remainder at NP. If QEXTRA_LIMBS is + non-zero, generate that many fraction bits and append them after the + other quotient limbs. + Return the most significant limb of the quotient; this is always 0 or 1. + + Preconditions: + 0. NSIZE >= DSIZE. + 1. The most significant bit of the divisor must be set. + 2. QP must either not overlap with the input operands at all, or + QP + DSIZE >= NP must hold true. (This means that it's + possible to put the quotient in the high part of NUM, right after the + remainder in NUM.) + 3. NSIZE >= DSIZE, even if QEXTRA_LIMBS is non-zero. + 4. DSIZE > 2. */ + + +#define PREINVERT_VIABLE \ + (UDIV_TIME > 2 * UMUL_TIME + 6 /* && !
TARGET_REGISTER_STARVED */) + +mp_limb_t +#if __STDC__ +mpn_sb_divrem_mn (mp_ptr qp, + mp_ptr np, mp_size_t nsize, + mp_srcptr dp, mp_size_t dsize) +#else +mpn_sb_divrem_mn (qp, np, nsize, dp, dsize) + mp_ptr qp; + mp_ptr np; + mp_size_t nsize; + mp_srcptr dp; + mp_size_t dsize; +#endif +{ + mp_limb_t most_significant_q_limb = 0; + mp_size_t i; + mp_limb_t dx, d1, n0; + mp_limb_t dxinv; + int have_preinv; + + ASSERT_ALWAYS (dsize > 2); + + np += nsize - dsize; + dx = dp[dsize - 1]; + d1 = dp[dsize - 2]; + n0 = np[dsize - 1]; + + if (n0 >= dx) + { + if (n0 > dx || mpn_cmp (np, dp, dsize - 1) >= 0) + { + mpn_sub_n (np, np, dp, dsize); + most_significant_q_limb = 1; + } + } + + /* If multiplication is much faster than division, preinvert the + most significant divisor limb before entering the loop. */ + if (PREINVERT_VIABLE) + { + have_preinv = 0; + if ((UDIV_TIME - (2 * UMUL_TIME + 6)) * (nsize - dsize) > UDIV_TIME) + { + invert_limb (dxinv, dx); + have_preinv = 1; + } + } + + for (i = nsize - dsize - 1; i >= 0; i--) + { + mp_limb_t q; + mp_limb_t nx; + mp_limb_t cy_limb; + + nx = np[dsize - 1]; + np--; + + if (nx == dx) + { + /* This might over-estimate q, but it's probably not worth + the extra code here to find out. */ + q = ~(mp_limb_t) 0; + +#if 1 + cy_limb = mpn_submul_1 (np, dp, dsize, q); +#else + /* This should be faster on many machines */ + cy_limb = mpn_sub_n (np + 1, np + 1, dp, dsize); + cy = mpn_add_n (np, np, dp, dsize); + np[dsize] += cy; +#endif + + if (nx != cy_limb) + { + mpn_add_n (np, np, dp, dsize); + q--; + } + + qp[i] = q; + } + else + { + mp_limb_t rx, r1, r0, p1, p0; + + /* "workaround" avoids a problem with gcc 2.7.2.3 i386 register + usage when np[dsize-1] is used in an asm statement like + umul_ppmm in udiv_qrnnd_preinv. The symptom is seg faults due + to registers being clobbered. gcc 2.95 i386 doesn't have the + problem. 
*/ + { + mp_limb_t workaround = np[dsize - 1]; + if (PREINVERT_VIABLE && have_preinv) + udiv_qrnnd_preinv (q, r1, nx, workaround, dx, dxinv); + else + udiv_qrnnd (q, r1, nx, workaround, dx); + } + umul_ppmm (p1, p0, d1, q); + + r0 = np[dsize - 2]; + rx = 0; + if (r1 < p1 || (r1 == p1 && r0 < p0)) + { + p1 -= p0 < d1; + p0 -= d1; + q--; + r1 += dx; + rx = r1 < dx; + } + + p1 += r0 < p0; /* cannot carry! */ + rx -= r1 < p1; /* may become 11..1 if q is still too large */ + r1 -= p1; + r0 -= p0; + + cy_limb = mpn_submul_1 (np, dp, dsize - 2, q); + + { + mp_limb_t cy1, cy2; + cy1 = r0 < cy_limb; + r0 -= cy_limb; + cy2 = r1 < cy1; + r1 -= cy1; + np[dsize - 1] = r1; + np[dsize - 2] = r0; + if (cy2 != rx) + { + mpn_add_n (np, np, dp, dsize); + q--; + } + } + qp[i] = q; + } + } + + /* ______ ______ ______ + |__rx__|__r1__|__r0__| partial remainder + ______ ______ + - |__p1__|__p0__| partial product to subtract + ______ ______ + - |______|cylimb| + + rx is -1, 0 or 1. If rx=1, then q is correct (it should match + carry out). If rx=-1 then q is too large. If rx=0, then q might + be too large, but it is most likely correct. + */ + + return most_significant_q_limb; +} diff --git a/ghc/rts/gmp/mpn/generic/scan0.c b/ghc/rts/gmp/mpn/generic/scan0.c index d6f6580..96f05ce 100644 --- a/ghc/rts/gmp/mpn/generic/scan0.c +++ b/ghc/rts/gmp/mpn/generic/scan0.c @@ -5,16 +5,16 @@ Copyright (C) 1994, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. 
The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpn/generic/scan1.c b/ghc/rts/gmp/mpn/generic/scan1.c index c95d090..98e2e0d 100644 --- a/ghc/rts/gmp/mpn/generic/scan1.c +++ b/ghc/rts/gmp/mpn/generic/scan1.c @@ -5,16 +5,16 @@ Copyright (C) 1994, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ diff --git a/ghc/rts/gmp/mpn/generic/set_str.c b/ghc/rts/gmp/mpn/generic/set_str.c index 424fad3..e6ccc92 100644 --- a/ghc/rts/gmp/mpn/generic/set_str.c +++ b/ghc/rts/gmp/mpn/generic/set_str.c @@ -3,21 +3,22 @@ limb vector pointed to by RES_PTR. Return the number of limbs in RES_PTR. -Copyright (C) 1991, 1992, 1993, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1992, 1993, 1994, 1996, 2000 Free Software Foundation, +Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -26,11 +27,15 @@ MA 02111-1307, USA. 
*/ #include "gmp-impl.h" mp_size_t +#if __STDC__ +mpn_set_str (mp_ptr xp, const unsigned char *str, size_t str_len, int base) +#else mpn_set_str (xp, str, str_len, base) mp_ptr xp; const unsigned char *str; size_t str_len; int base; +#endif { mp_size_t size; mp_limb_t big_base; diff --git a/ghc/rts/gmp/mpn/generic/sqr_basecase.c b/ghc/rts/gmp/mpn/generic/sqr_basecase.c new file mode 100644 index 0000000..760258a --- /dev/null +++ b/ghc/rts/gmp/mpn/generic/sqr_basecase.c @@ -0,0 +1,83 @@ +/* mpn_sqr_basecase -- Internal routine to square a natural number + of length n. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. + + +Copyright (C) 1991, 1992, 1993, 1994, 1996, 1997, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +void +#if __STDC__ +mpn_sqr_basecase (mp_ptr prodp, mp_srcptr up, mp_size_t n) +#else +mpn_sqr_basecase (prodp, up, n) + mp_ptr prodp; + mp_srcptr up; + mp_size_t n; +#endif +{ + mp_size_t i; + + { + /* N.B.! We need the superfluous indirection through argh to work around + a reloader bug in GCC 2.7.*.
*/ + mp_limb_t x; + mp_limb_t argh; + x = up[0]; + umul_ppmm (argh, prodp[0], x, x); + prodp[1] = argh; + } + if (n > 1) + { + mp_limb_t tarr[2 * KARATSUBA_SQR_THRESHOLD]; + mp_ptr tp = tarr; + mp_limb_t cy; + + /* must fit 2*n limbs in tarr */ + ASSERT (n <= KARATSUBA_SQR_THRESHOLD); + + cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]); + tp[n - 1] = cy; + for (i = 2; i < n; i++) + { + mp_limb_t cy; + cy = mpn_addmul_1 (tp + 2 * i - 2, up + i, n - i, up[i - 1]); + tp[n + i - 2] = cy; + } + for (i = 1; i < n; i++) + { + mp_limb_t x; + x = up[i]; + umul_ppmm (prodp[2 * i + 1], prodp[2 * i], x, x); + } + { + mp_limb_t cy; + cy = mpn_lshift (tp, tp, 2 * n - 2, 1); + cy += mpn_add_n (prodp + 1, prodp + 1, tp, 2 * n - 2); + prodp[2 * n - 1] += cy; + } + } +} diff --git a/ghc/rts/gmp/mpn/generic/sqrtrem.c b/ghc/rts/gmp/mpn/generic/sqrtrem.c index 539480d..ee3b514 100644 --- a/ghc/rts/gmp/mpn/generic/sqrtrem.c +++ b/ghc/rts/gmp/mpn/generic/sqrtrem.c @@ -12,21 +12,22 @@ the function is 0 if OP is a perfect square, and *any* non-zero number otherwise. -Copyright (C) 1993, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software +Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
-You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -35,6 +36,7 @@ MA 02111-1307, USA. */ doesn't help to use CHAR_BIT from limits.h, as the real problem is the static arrays. */ +#include <stdio.h> /* for NULL */ #include "gmp.h" #include "gmp-impl.h" #include "longlong.h" @@ -59,7 +61,7 @@ MA 02111-1307, USA. */ /* Define this macro for IEEE P854 machines with a fast sqrt instruction. */ #if defined __GNUC__ && ! defined __SOFT_FLOAT -#if defined __sparc__ +#if defined (__sparc__) && BITS_PER_MP_LIMB == 32 #define SQRT(a) \ ({ \ double __sqrt_res; \ @@ -68,7 +70,7 @@ MA 02111-1307, USA. */ }) #endif -#if defined __HAVE_68881__ +#if defined (__HAVE_68881__) #define SQRT(a) \ ({ \ double __sqrt_res; \ @@ -77,7 +79,7 @@ MA 02111-1307, USA. */ }) #endif -#if defined __hppa +#if defined (__hppa) && BITS_PER_MP_LIMB == 32 #define SQRT(a) \ ({ \ double __sqrt_res; \ @@ -86,7 +88,7 @@ MA 02111-1307, USA. */ }) #endif -#if defined _ARCH_PWR2 +#if defined (_ARCH_PWR2) && BITS_PER_MP_LIMB == 32 #define SQRT(a) \ ({ \ double __sqrt_res; \ @@ -95,6 +97,17 @@ MA 02111-1307, USA. */ }) #endif +#if 0 +#if defined (__i386__) || defined (__i486__) +#define SQRT(a) \ + ({ \ + double __sqrt_res; \ + asm ("fsqrt" : "=t" (__sqrt_res) : "0" (a)); \ + __sqrt_res; \ + }) +#endif +#endif + #endif #ifndef SQRT @@ -112,7 +125,7 @@ MA 02111-1307, USA. */ square root of numbers with the same initial digits and an even difference in the total number of digits. Consider the square root of 1, 10, 100, 1000, ...)
*/ -static unsigned char even_approx_tab[256] = +static const unsigned char even_approx_tab[256] = { 0x6a, 0x6a, 0x6b, 0x6c, 0x6c, 0x6d, 0x6e, 0x6e, 0x6f, 0x70, 0x71, 0x71, 0x72, 0x73, 0x73, 0x74, @@ -150,7 +163,7 @@ static unsigned char even_approx_tab[256] = /* Table to be used for operands with an odd total number of bits. (Further comments before previous table.) */ -static unsigned char odd_approx_tab[256] = +static const unsigned char odd_approx_tab[256] = { 0x00, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, @@ -272,9 +285,7 @@ mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size) /* Is there a fast sqrt instruction defined for this machine? */ #ifdef SQRT { - initial_approx = SQRT (t_high0 * 2.0 - * ((mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1)) - + t_high1); + initial_approx = SQRT (t_high0 * MP_BASE_AS_DOUBLE + t_high1); /* If t_high0,,t_high1 is big, the result in INITIAL_APPROX might have become incorrect due to overflow in the conversion from double to mp_limb_t above. It will typically be zero in that case, but might be @@ -293,14 +304,14 @@ mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size) if ((cnt & 1) == 0) { - /* The most sign bit of t_high0 is set. */ + /* The most significant bit of t_high0 is set. */ initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 1); initial_approx &= 0xff; initial_approx = even_approx_tab[initial_approx]; } else { - /* The most significant bit of T_HIGH0 is unset, + /* The most significant bit of t_high0 is unset, the second most significant is set. */ initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 2); initial_approx &= 0xff; @@ -310,7 +321,7 @@ mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size) initial_approx <<= BITS_PER_MP_LIMB - 8 - 1; /* Perform small precision Newtonian iterations to get a full word - approximation. For small operands, these iteration will make the + approximation. For small operands, these iterations will do the entire job. 
*/ if (t_high0 == ~(mp_limb_t)0) initial_approx = t_high0; @@ -328,7 +339,7 @@ mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size) /* Now get a full word by one (or for > 36 bit machines) several iterations. */ - for (i = 16; i < BITS_PER_MP_LIMB; i <<= 1) + for (i = 18; i < BITS_PER_MP_LIMB; i <<= 1) { mp_limb_t ignored_remainder; @@ -343,7 +354,7 @@ mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size) rp[0] = initial_approx; rsize = 1; -#ifdef DEBUG +#ifdef SQRT_DEBUG printf ("\n\nT = "); mpn_dump (tp, tsize); #endif @@ -373,7 +384,7 @@ mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size) time is spent here. */ /* It is possible to do a great optimization here. The successive - divisors in the mpn_divmod call below has more and more leading + divisors in the mpn_divmod call below have more and more leading words equal to its predecessor. Therefore the beginning of each division will repeat the same work as did the last division. If we could guarantee that the leading words of two @@ -392,7 +403,7 @@ mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size) while (--i >= 0) { mp_limb_t cy; -#ifdef DEBUG +#ifdef SQRT_DEBUG mp_limb_t old_least_sign_r = rp[0]; mp_size_t old_rsize = rsize; @@ -408,7 +419,7 @@ mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size) cy = mpn_divmod (xp, ttp, tsize, rp, rsize); xsize = tsize - rsize; -#ifdef DEBUG +#ifdef SQRT_DEBUG printf ("X =%d ", cy); mpn_dump (xp, xsize); #endif @@ -435,7 +446,7 @@ mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size) mpn_rshift (rp, xp, xsize, 1); rp[xsize - 1] |= ((mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1)); rsize = xsize; -#ifdef DEBUG +#ifdef SQRT_DEBUG if (old_least_sign_r != rp[rsize - old_rsize]) printf (">>>>>>>> %d: %0*lX, %0*lX <<<<<<<<\n", i, 2 * BYTES_PER_MP_LIMB, old_least_sign_r, @@ -444,7 +455,7 @@ mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size) } } -#ifdef DEBUG +#ifdef SQRT_DEBUG printf ("(final) R = "); mpn_dump (rp, rsize); #endif @@ -470,12 +481,12 @@ mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size) /* These 
operations can't overflow. */ cy_limb = mpn_sub_n (tp, tp, rp, rsize); cy_limb += mpn_sub_n (tp, tp, rp, rsize); - mpn_sub_1 (tp + rsize, tp + rsize, tsize - rsize, cy_limb); - mpn_add_1 (tp, tp, tsize, (mp_limb_t) 1); + mpn_decr_u (tp + rsize, cy_limb); + mpn_incr_u (tp, (mp_limb_t) 1); - mpn_sub_1 (rp, rp, rsize, (mp_limb_t) 1); + mpn_decr_u (rp, (mp_limb_t) 1); -#ifdef DEBUG +#ifdef SQRT_DEBUG printf ("(adjusted) R = "); mpn_dump (rp, rsize); #endif diff --git a/ghc/rts/gmp/mpn/generic/sub_n.c b/ghc/rts/gmp/mpn/generic/sub_n.c index 9d4b216..4f2f060 100644 --- a/ghc/rts/gmp/mpn/generic/sub_n.c +++ b/ghc/rts/gmp/mpn/generic/sub_n.c @@ -5,16 +5,16 @@ Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ diff --git a/ghc/rts/gmp/mpn/generic/submul_1.c b/ghc/rts/gmp/mpn/generic/submul_1.c index b144283..c7c08ee 100644 --- a/ghc/rts/gmp/mpn/generic/submul_1.c +++ b/ghc/rts/gmp/mpn/generic/submul_1.c @@ -8,16 +8,16 @@ Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpn/generic/tdiv_qr.c b/ghc/rts/gmp/mpn/generic/tdiv_qr.c new file mode 100644 index 0000000..b748b5d --- /dev/null +++ b/ghc/rts/gmp/mpn/generic/tdiv_qr.c @@ -0,0 +1,401 @@ +/* mpn_tdiv_qr -- Divide the numerator (np,nn) by the denominator (dp,dn) and + write the nn-dn+1 quotient limbs at qp and the dn remainder limbs at rp. If + qxn is non-zero, generate that many fraction limbs and append them after the + other quotient limbs, and update the remainder accordingly. The input + operands are unaffected. + + Preconditions: + 1. The most significant limb of the divisor must be non-zero.
+ 2. No argument overlap is permitted. (??? relax this ???) + 3. nn >= dn, even if qxn is non-zero. (??? relax this ???) + + The time complexity of this is O(qn*qn+M(dn,qn)), where M(m,n) is the time + complexity of multiplication. + +Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + /* Default for BZ_THRESHOLD, used only when it is not already defined. Below, a divisor (dn) or quotient (qn) size under this value selects mpn_sb_divrem_mn, otherwise mpn_bz_divrem_n. */ +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD (7 * KARATSUBA_MUL_THRESHOLD) +#endif + +/* Extract the middle limb from ((h,,l) << cnt). The low limb is shifted right in two steps, by 1 and then by (~cnt) mod BITS_PER_MP_LIMB, which adds up to BITS_PER_MP_LIMB - cnt, so cnt == 0 never requests a shift by the full limb width. The arguments are not parenthesized in the expansion; pass simple expressions only. */ +#define SHL(h,l,cnt) \ + ((h << cnt) | ((l >> 1) >> ((~cnt) & (BITS_PER_MP_LIMB - 1)))) + /* mpn_tdiv_qr -- divide {np,nn} by {dp,dn}. qp receives the nn-dn+1 quotient limbs and rp the dn remainder limbs. qxn must be 0: any other value calls abort (). dn == 0 invokes DIVIDE_BY_ZERO. np and dp are only read here; the dn >= 2 paths work on scratch copies obtained with TMP_ALLOC and call TMP_FREE before returning. */ +void +#if __STDC__ +mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn, + mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn) +#else +mpn_tdiv_qr (qp, rp, qxn, np, nn, dp, dn) + mp_ptr qp; + mp_ptr rp; + mp_size_t qxn; + mp_srcptr np; + mp_size_t nn; + mp_srcptr dp; + mp_size_t dn; +#endif +{ + /* FIXME: + 1. qxn + 2. pass allocated storage in additional parameter?
+ */ + if (qxn != 0) + abort (); + /* Dispatch on the divisor size; 0, 1 and 2 limbs are special-cased. */ + switch (dn) + { + case 0: + DIVIDE_BY_ZERO; + /* NOTE(review): there is no break or return after DIVIDE_BY_ZERO; presumably it does not return -- confirm, otherwise control falls through into case 1. */ + case 1: + { /* Single-limb divisor: the value returned by mpn_divmod_1 is stored as the one-limb remainder. */ + rp[0] = mpn_divmod_1 (qp, np, nn, dp[0]); + return; + } + + case 2: + { /* Two-limb divisor: divide a scratch copy of the numerator, shifted left by cnt together with the divisor when the top divisor limb has leading zeros, then shift the remainder back. */ + int cnt; + mp_ptr n2p, d2p; + mp_limb_t qhl, cy; + TMP_DECL (marker); + TMP_MARK (marker); + count_leading_zeros (cnt, dp[dn - 1]); + if (cnt != 0) + { + d2p = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB); + mpn_lshift (d2p, dp, dn, cnt); + n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB); + cy = mpn_lshift (n2p, np, nn, cnt); + n2p[nn] = cy; + qhl = mpn_divrem_2 (qp, 0L, n2p, nn + (cy != 0), d2p); /* the carry limb is counted in the numerator only when it is non-zero */ + if (cy == 0) + qp[nn - 2] = qhl; /* always store nn-dn+1 quotient limbs */ + } + else + { + d2p = (mp_ptr) dp; + n2p = (mp_ptr) TMP_ALLOC (nn * BYTES_PER_MP_LIMB); + MPN_COPY (n2p, np, nn); + qhl = mpn_divrem_2 (qp, 0L, n2p, nn, d2p); + qp[nn - 2] = qhl; /* always store nn-dn+1 quotient limbs */ + } + + if (cnt != 0) + mpn_rshift (rp, n2p, dn, cnt); + else + MPN_COPY (rp, n2p, dn); + TMP_FREE (marker); + return; + } + + default: + { /* General case: every dn not handled above. */ + int adjust; + TMP_DECL (marker); + TMP_MARK (marker); + adjust = np[nn - 1] >= dp[dn - 1]; /* conservative tests for quotient size */ + if (nn + adjust >= 2 * dn) + { /* Long quotient (nn - dn + adjust >= dn): divide a normalized scratch copy of the whole numerator. */ + mp_ptr n2p, d2p; + mp_limb_t cy; + int cnt; + count_leading_zeros (cnt, dp[dn - 1]); + + qp[nn - dn] = 0; /* zero high quotient limb */ + if (cnt != 0) /* normalize divisor if needed */ + { + d2p = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB); + mpn_lshift (d2p, dp, dn, cnt); + n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB); + cy = mpn_lshift (n2p, np, nn, cnt); + n2p[nn] = cy; + nn += adjust; + } + else + { + d2p = (mp_ptr) dp; + n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB); + MPN_COPY (n2p, np, nn); + n2p[nn] = 0; + nn += adjust; + } + /* NOTE(review): dn == 2 is handled by case 2 of the switch, so the dn == 2 tests in this branch look unreachable. */ + if (dn == 2) + mpn_divrem_2 (qp, 0L, n2p, nn, d2p); + else if (dn < BZ_THRESHOLD) + mpn_sb_divrem_mn (qp, n2p, nn, d2p, dn); + else + { + /* Perform 2*dn / dn limb divisions as long as the limbs + in np last.
*/ + mp_ptr q2p = qp + nn - 2 * dn; + n2p += nn - 2 * dn; + mpn_bz_divrem_n (q2p, n2p, d2p, dn); + nn -= dn; + while (nn >= 2 * dn) + { + mp_limb_t c; + q2p -= dn; n2p -= dn; + c = mpn_bz_divrem_n (q2p, n2p, d2p, dn); + ASSERT_ALWAYS (c == 0); + nn -= dn; + } + + if (nn != dn) + { + n2p -= nn - dn; + /* In theory, we could fall out to the cute code below + since we now have exactly the situation that code + is designed to handle. We botch this badly and call + the basic mpn_sb_divrem_mn! */ + if (dn == 2) + mpn_divrem_2 (qp, 0L, n2p, nn, d2p); + else + mpn_sb_divrem_mn (qp, n2p, nn, d2p, dn); + } + } + + /* Copy dn limbs from n2p out as the remainder, undoing the normalization shift. */ + if (cnt != 0) + mpn_rshift (rp, n2p, dn, cnt); + else + MPN_COPY (rp, n2p, dn); + TMP_FREE (marker); + return; + } + + /* When we come here, the numerator/partial remainder is less + than twice the size of the denominator. */ + + { + /* Problem: + + Divide a numerator N with nn limbs by a denominator D with dn + limbs forming a quotient of nn-dn+1 limbs. When qn is small + compared to dn, conventional division algorithms perform poorly. + We want an algorithm that has an expected running time that is + dependent only on qn. It is assumed that the most significant + limb of the numerator is smaller than the most significant limb + of the denominator. + + Algorithm (very informally stated): + + 1) Divide the 2 x qn most significant limbs from the numerator + by the qn most significant limbs from the denominator. Call + the result qest. This is either the correct quotient, or it + might be 1 or 2 too large. Compute the remainder from the + division. (This step is implemented by a mpn_divrem call.) + + 2) Is the most significant limb from the remainder < p, where p + is the product of the most significant limb from the quotient + and the next(d). (Next(d) denotes the next ignored limb from + the denominator.) If it is, decrement qest, and adjust the + remainder accordingly. + + 3) Is the remainder >= qest? If it is, qest is the desired + quotient.
The algorithm terminates. + + 4) Subtract qest x next(d) from the remainder. If there is + borrow out, decrement qest, and adjust the remainder + accordingly. + + 5) Skip one word from the denominator (i.e., let next(d) denote + the next less significant limb). */ + + mp_size_t qn; + mp_ptr n2p, d2p; + mp_ptr tp; + mp_limb_t cy; + mp_size_t in, rn; + mp_limb_t quotient_too_large; + int cnt; + + qn = nn - dn; + qp[qn] = 0; /* zero high quotient limb */ + qn += adjust; /* qn cannot become bigger */ + + if (qn == 0) + { /* Empty quotient: the numerator is copied out unchanged as the remainder. */ + MPN_COPY (rp, np, dn); + TMP_FREE (marker); + return; + } + + in = dn - qn; /* (at least partially) ignored # of limbs in ops */ + /* Normalize denominator by shifting it to the left such that its + most significant bit is set. Then shift the numerator the same + amount, to mathematically preserve quotient. */ + count_leading_zeros (cnt, dp[dn - 1]); + if (cnt != 0) + { + d2p = (mp_ptr) TMP_ALLOC (qn * BYTES_PER_MP_LIMB); + + mpn_lshift (d2p, dp + in, qn, cnt); + d2p[0] |= dp[in - 1] >> (BITS_PER_MP_LIMB - cnt); + + n2p = (mp_ptr) TMP_ALLOC ((2 * qn + 1) * BYTES_PER_MP_LIMB); + cy = mpn_lshift (n2p, np + nn - 2 * qn, 2 * qn, cnt); + if (adjust) + { /* the shifted-out limb becomes the top limb and the window used below starts one limb higher */ + n2p[2 * qn] = cy; + n2p++; + } + else + { + n2p[0] |= np[nn - 2 * qn - 1] >> (BITS_PER_MP_LIMB - cnt); + } + } + else + { + d2p = (mp_ptr) dp + in; + + n2p = (mp_ptr) TMP_ALLOC ((2 * qn + 1) * BYTES_PER_MP_LIMB); + MPN_COPY (n2p, np + nn - 2 * qn, 2 * qn); + if (adjust) + { + n2p[2 * qn] = 0; + n2p++; + } + } + + /* Get an approximate quotient using the extracted operands. */ + if (qn == 1) + { + mp_limb_t q0, r0; + mp_limb_t gcc272bug_n1, gcc272bug_n0, gcc272bug_d0; + /* Due to a gcc 2.7.2.3 reload pass bug, we have to use some + temps here. This doesn't hurt code quality on any machines + so we do it unconditionally.
*/ + gcc272bug_n1 = n2p[1]; + gcc272bug_n0 = n2p[0]; + gcc272bug_d0 = d2p[0]; + udiv_qrnnd (q0, r0, gcc272bug_n1, gcc272bug_n0, gcc272bug_d0); + n2p[0] = r0; + qp[0] = q0; + } + else if (qn == 2) + mpn_divrem_2 (qp, 0L, n2p, 4L, d2p); + else if (qn < BZ_THRESHOLD) + mpn_sb_divrem_mn (qp, n2p, qn * 2, d2p, qn); + else + mpn_bz_divrem_n (qp, n2p, d2p, qn); + + rn = qn; + /* Multiply the first ignored divisor limb by the most significant + quotient limb. If that product is > the partial remainder's + most significant limb, we know the quotient is too large. This + test quickly catches most cases where the quotient is too large; + it catches all cases where the quotient is 2 too large. */ + { + mp_limb_t dl, x; + mp_limb_t h, l; + /* dl is the divisor limb below dp[in - 1], or 0 when there is none. */ + if (in - 2 < 0) + dl = 0; + else + dl = dp[in - 2]; + + x = SHL (dp[in - 1], dl, cnt); + umul_ppmm (h, l, x, qp[qn - 1]); + + if (n2p[qn - 1] < h) + { + mp_limb_t cy; + + mpn_decr_u (qp, (mp_limb_t) 1); + cy = mpn_add_n (n2p, n2p, d2p, qn); + if (cy) + { + /* The partial remainder is safely large. */ + n2p[qn] = cy; + ++rn; + } + } + } + + quotient_too_large = 0; + if (cnt != 0) + { + mp_limb_t cy1, cy2; + + /* Append partially used numerator limb to partial remainder. */ + cy1 = mpn_lshift (n2p, n2p, rn, BITS_PER_MP_LIMB - cnt); + n2p[0] |= np[in - 1] & (~(mp_limb_t) 0 >> cnt); + + /* Update partial remainder with partially used divisor limb. */ + cy2 = mpn_submul_1 (n2p, qp, qn, dp[in - 1] & (~(mp_limb_t) 0 >> cnt)); + if (qn != rn) + { + if (n2p[qn] < cy2) + abort (); + n2p[qn] -= cy2; + } + else + { + n2p[qn] = cy1 - cy2; + + quotient_too_large = (cy1 < cy2); + ++rn; + } + --in; + } + /* True: partial remainder now is neutral, i.e., it is not shifted up.
*/ + + tp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB); + /* Multiply the quotient by the in ignored divisor limbs into tp; the two mpn_mul calls differ only in operand order, the longer operand being passed first. */ + if (in < qn) + { + if (in == 0) + { /* No ignored divisor limbs are left: the partial remainder is copied out as the remainder. */ + MPN_COPY (rp, n2p, rn); + if (rn != dn) + abort (); + goto foo; + } + mpn_mul (tp, qp, qn, dp, in); + } + else + mpn_mul (tp, dp, in, qp, qn); + + cy = mpn_sub (n2p, n2p, rn, tp + in, qn); + MPN_COPY (rp + in, n2p, dn - in); + quotient_too_large |= cy; + cy = mpn_sub_n (rp, np, tp, in); + cy = mpn_sub_1 (rp + in, rp + in, rn, cy); + quotient_too_large |= cy; + foo: + if (quotient_too_large) + { /* Final correction: decrement the quotient and add the divisor back to the remainder. */ + mpn_decr_u (qp, (mp_limb_t) 1); + mpn_add_n (rp, rp, dp, dn); + } + } + TMP_FREE (marker); + return; + } + } +} diff --git a/ghc/rts/gmp/mpn/generic/udiv_w_sdiv.c b/ghc/rts/gmp/mpn/generic/udiv_w_sdiv.c index d9e71b7..061cce8 100644 --- a/ghc/rts/gmp/mpn/generic/udiv_w_sdiv.c +++ b/ghc/rts/gmp/mpn/generic/udiv_w_sdiv.c @@ -3,21 +3,27 @@ Contributed by Peter L. Montgomery. -Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc. + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY SAFE + TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS + ALMOST GUARANTEED THAT THIS FUNCTION WILL CHANGE OR DISAPPEAR IN A FUTURE + GNU MP RELEASE. + + +Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpn/hppa/README b/ghc/rts/gmp/mpn/hppa/README index 5a2d5fd..97e7abe 100644 --- a/ghc/rts/gmp/mpn/hppa/README +++ b/ghc/rts/gmp/mpn/hppa/README @@ -15,7 +15,7 @@ dependent instruction really far from each other. STATUS 1. mpn_mul_1 could be improved to 6.5 cycles/limb on the PA7100, using the - instructions bwlow (but some sw pipelining is needed to avoid the + instructions below (but some sw pipelining is needed to avoid the xmpyu-fstds delay): fldds s1_ptr @@ -82,3 +82,10 @@ STATUS stws res_ptr addib + +3. For the PA8000 we have to stick to using 32-bit limbs until compiler + support emerges. But we want to use 64-bit operations whenever possible, + in particular for loads and stores. It is possible to handle mpn_add_n + efficiently by rotating (when s1/s2 are aligned), or by masking+bit field + inserting (when they are not). The speed should double compared to the + code used today. diff --git a/ghc/rts/gmp/mpn/hppa/add_n.s b/ghc/rts/gmp/mpn/hppa/add_n.s index b4a1428..c53b2f7 100644 --- a/ghc/rts/gmp/mpn/hppa/add_n.s +++ b/ghc/rts/gmp/mpn/hppa/add_n.s @@ -1,21 +1,21 @@ -; HP-PA __mpn_add_n -- Add two limb vectors of the same length > 0 and store +; HP-PA __gmpn_add_n -- Add two limb vectors of the same length > 0 and store ; sum in a third limb vector. -; Copyright (C) 1992, 1994 Free Software Foundation, Inc. +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library.
; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. @@ -32,8 +32,8 @@ ; unrolling useless. We can't come under 5 cycles/limb anyway. .code - .export __mpn_add_n -__mpn_add_n + .export __gmpn_add_n +__gmpn_add_n .proc .callinfo frame=0,no_calls .entry diff --git a/ghc/rts/gmp/mpn/hppa/gmp-mparam.h b/ghc/rts/gmp/mpn/hppa/gmp-mparam.h new file mode 100644 index 0000000..98b6d9c --- /dev/null +++ b/ghc/rts/gmp/mpn/hppa/gmp-mparam.h @@ -0,0 +1,63 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + /* Limb size (in bits and in bytes) and the bit widths of the basic C integer types for this target. */ +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +/* These values are for the PA7100 using GCC. */ +/* Generated by tuneup.c, 2000-07-25. */ + /* Each threshold below is defined only when it is not already defined, so a build can supply its own value. */ +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 30 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 172 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 59 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 185 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 96 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 122 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 18 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 46 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 33 +#endif diff --git a/ghc/rts/gmp/mpn/hppa/hppa1_1/addmul_1.s b/ghc/rts/gmp/mpn/hppa/hppa1_1/addmul_1.s index 0fdcb3c..c7d218f 100644 --- a/ghc/rts/gmp/mpn/hppa/hppa1_1/addmul_1.s +++ b/ghc/rts/gmp/mpn/hppa/hppa1_1/addmul_1.s @@ -1,21 +1,21 @@ -; HP-PA-1.1 __mpn_addmul_1 -- Multiply a limb vector with a limb and +; HP-PA-1.1 __gmpn_addmul_1 -- Multiply a limb vector with a limb and ; add the result to a second limb vector. -; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc. +; Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library.
; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. @@ -37,8 +37,8 @@ ; There are some ideas described in mul_1.s that applies to this code too. .code - .export __mpn_addmul_1 -__mpn_addmul_1 + .export __gmpn_addmul_1 +__gmpn_addmul_1 .proc .callinfo frame=64,no_calls .entry diff --git a/ghc/rts/gmp/mpn/hppa/hppa1_1/mul_1.s b/ghc/rts/gmp/mpn/hppa/hppa1_1/mul_1.s index cdd0c1d..4512fdd 100644 --- a/ghc/rts/gmp/mpn/hppa/hppa1_1/mul_1.s +++ b/ghc/rts/gmp/mpn/hppa/hppa1_1/mul_1.s @@ -1,21 +1,21 @@ -; HP-PA-1.1 __mpn_mul_1 -- Multiply a limb vector with a limb and store +; HP-PA-1.1 __gmpn_mul_1 -- Multiply a limb vector with a limb and store ; the result in a second limb vector. -; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc. +; Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. 
; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. @@ -45,8 +45,8 @@ ; in the cache.) .code - .export __mpn_mul_1 -__mpn_mul_1 + .export __gmpn_mul_1 +__gmpn_mul_1 .proc .callinfo frame=64,no_calls .entry diff --git a/ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/add_n.s b/ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/add_n.s index 21fe161..4f4be08 100644 --- a/ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/add_n.s +++ b/ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/add_n.s @@ -1,22 +1,22 @@ -; HP-PA __mpn_add_n -- Add two limb vectors of the same length > 0 and store +; HP-PA __gmpn_add_n -- Add two limb vectors of the same length > 0 and store ; sum in a third limb vector. ; This is optimized for the PA7100, where is runs at 4.25 cycles/limb -; Copyright (C) 1992, 1994 Free Software Foundation, Inc. +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. 
; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. @@ -29,8 +29,8 @@ ; size gr23 .code - .export __mpn_add_n -__mpn_add_n + .export __gmpn_add_n +__gmpn_add_n .proc .callinfo frame=0,no_calls .entry diff --git a/ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/addmul_1.S b/ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/addmul_1.S index eb1d12b..04db068 100644 --- a/ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/addmul_1.S +++ b/ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/addmul_1.S @@ -1,21 +1,21 @@ -; HP-PA 7100/7200 __mpn_addmul_1 -- Multiply a limb vector with a limb and +; HP-PA 7100/7200 __gmpn_addmul_1 -- Multiply a limb vector with a limb and ; add the result to a second limb vector. -; Copyright (C) 1995 Free Software Foundation, Inc. +; Copyright (C) 1995, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. 
; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. @@ -41,8 +41,8 @@ #define hi3 %r1 .code - .export __mpn_addmul_1 -__mpn_addmul_1 + .export __gmpn_addmul_1 +__gmpn_addmul_1 .proc .callinfo frame=128,no_calls .entry diff --git a/ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/lshift.s b/ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/lshift.s index 4c74a50..31669b1 100644 --- a/ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/lshift.s +++ b/ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/lshift.s @@ -1,21 +1,21 @@ -; HP-PA __mpn_lshift -- +; HP-PA __gmpn_lshift -- ; This is optimized for the PA7100, where is runs at 3.25 cycles/limb -; Copyright (C) 1992, 1994 Free Software Foundation, Inc. +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. 
; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. @@ -28,8 +28,8 @@ ; cnt gr23 .code - .export __mpn_lshift -__mpn_lshift + .export __gmpn_lshift +__gmpn_lshift .proc .callinfo frame=64,no_calls .entry diff --git a/ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/rshift.s b/ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/rshift.s index 845418c..d32b10b 100644 --- a/ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/rshift.s +++ b/ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/rshift.s @@ -1,21 +1,21 @@ -; HP-PA __mpn_rshift -- +; HP-PA __gmpn_rshift -- ; This is optimized for the PA7100, where is runs at 3.25 cycles/limb -; Copyright (C) 1992, 1994 Free Software Foundation, Inc. +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. 
; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. @@ -28,8 +28,8 @@ ; cnt gr23 .code - .export __mpn_rshift -__mpn_rshift + .export __gmpn_rshift +__gmpn_rshift .proc .callinfo frame=64,no_calls .entry diff --git a/ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/sub_n.s b/ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/sub_n.s index 1e1ebcf..0eec41c 100644 --- a/ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/sub_n.s +++ b/ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/sub_n.s @@ -1,22 +1,22 @@ -; HP-PA __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +; HP-PA __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and ; store difference in a third limb vector. ; This is optimized for the PA7100, where is runs at 4.25 cycles/limb -; Copyright (C) 1992, 1994 Free Software Foundation, Inc. +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. 
; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. @@ -29,8 +29,8 @@ ; size gr23 .code - .export __mpn_sub_n -__mpn_sub_n + .export __gmpn_sub_n +__gmpn_sub_n .proc .callinfo frame=0,no_calls .entry diff --git a/ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/submul_1.S b/ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/submul_1.S index a71176e..0fba21d 100644 --- a/ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/submul_1.S +++ b/ghc/rts/gmp/mpn/hppa/hppa1_1/pa7100/submul_1.S @@ -1,21 +1,21 @@ -; HP-PA 7100/7200 __mpn_submul_1 -- Multiply a limb vector with a limb and +; HP-PA 7100/7200 __gmpn_submul_1 -- Multiply a limb vector with a limb and ; subtract the result from a second limb vector. -; Copyright (C) 1995 Free Software Foundation, Inc. +; Copyright (C) 1995, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. 
; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. @@ -41,8 +41,8 @@ #define hi3 %r1 .code - .export __mpn_submul_1 -__mpn_submul_1 + .export __gmpn_submul_1 +__gmpn_submul_1 .proc .callinfo frame=128,no_calls .entry diff --git a/ghc/rts/gmp/mpn/hppa/hppa1_1/submul_1.s b/ghc/rts/gmp/mpn/hppa/hppa1_1/submul_1.s index a4a3854..20a5b5c 100644 --- a/ghc/rts/gmp/mpn/hppa/hppa1_1/submul_1.s +++ b/ghc/rts/gmp/mpn/hppa/hppa1_1/submul_1.s @@ -1,21 +1,21 @@ -; HP-PA-1.1 __mpn_submul_1 -- Multiply a limb vector with a limb and +; HP-PA-1.1 __gmpn_submul_1 -- Multiply a limb vector with a limb and ; subtract the result from a second limb vector. -; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc. +; Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. 
; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. @@ -36,14 +36,14 @@ ; There are some ideas described in mul_1.s that applies to this code too. -; It seems possible to make this run as fast as __mpn_addmul_1, if we use +; It seems possible to make this run as fast as __gmpn_addmul_1, if we use ; sub,>>= %r29,%r19,%r22 ; addi 1,%r28,%r28 ; but that requires reworking the hairy software pipeline... .code - .export __mpn_submul_1 -__mpn_submul_1 + .export __gmpn_submul_1 +__gmpn_submul_1 .proc .callinfo frame=64,no_calls .entry diff --git a/ghc/rts/gmp/mpn/hppa/hppa1_1/udiv_qrnnd.S b/ghc/rts/gmp/mpn/hppa/hppa1_1/udiv_qrnnd.S new file mode 100644 index 0000000..b83d6f4 --- /dev/null +++ b/ghc/rts/gmp/mpn/hppa/hppa1_1/udiv_qrnnd.S @@ -0,0 +1,80 @@ +; HP-PA __udiv_qrnnd division support, used from longlong.h. +; This version runs fast on PA 7000 and later. + +; Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc. 
+ +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +; Outline, as read from the code below: the doubleword n1,n0 is converted to a double (adding the 2^64 constant at L$0000 when n1 is negative as a signed word), divided by d converted to a double, and the quotient converted back to an integer. The product of that quotient and d is formed with xmpyu and subtracted from n1,n0. When the high word of the difference is non-zero the quotient is decremented and d is added to the low word. The quotient is left in gr28 and the low word is stored through rem_ptr as the remainder. NOTE(review): gr28 is presumably the return-value register -- confirm against the calling convention. +; INPUT PARAMETERS +; rem_ptr gr26 +; n1 gr25 +; n0 gr24 +; d gr23 + + .code +L$0000 .word 0x43f00000 ; 2^64 + .word 0x0 + .export __gmpn_udiv_qrnnd +__gmpn_udiv_qrnnd + .proc + .callinfo frame=64,no_calls + .entry + ldo 64(%r30),%r30 + + stws %r25,-16(0,%r30) ; n_hi + stws %r24,-12(0,%r30) ; n_lo +#ifdef PIC + addil LT%L$0000,%r19 + ldo RT%L$0000(%r1),%r19 +#else + ldil L%L$0000,%r19 + ldo R%L$0000(%r19),%r19 +#endif + fldds -16(0,%r30),%fr5 + stws %r23,-12(0,%r30) + comib,<= 0,%r25,L$1 + fcnvxf,dbl,dbl %fr5,%fr5 + fldds 0(0,%r19),%fr4 + fadd,dbl %fr4,%fr5,%fr5 +L$1 + fcpy,sgl %fr0,%fr6L + fldws -12(0,%r30),%fr6R + fcnvxf,dbl,dbl %fr6,%fr4 + + fdiv,dbl %fr5,%fr4,%fr5 + + fcnvfx,dbl,dbl %fr5,%fr4 + fstws %fr4R,-16(%r30) + xmpyu %fr4R,%fr6R,%fr6 + ldws -16(%r30),%r28 + fstds %fr6,-16(0,%r30) + ldws -12(0,%r30),%r21 + ldws -16(0,%r30),%r20 + sub %r24,%r21,%r22 + subb %r25,%r20,%r19 + comib,= 0,%r19,L$2 + ldo -64(%r30),%r30 + + add %r22,%r23,%r22 + ldo -1(%r28),%r28 +L$2 bv 0(%r2) + stws %r22,0(0,%r26) + + .exit + .procend diff --git a/ghc/rts/gmp/mpn/hppa/hppa1_1/umul.s b/ghc/rts/gmp/mpn/hppa/hppa1_1/umul.s new file mode 100644 index
0000000..1f1300a --- /dev/null +++ b/ghc/rts/gmp/mpn/hppa/hppa1_1/umul.s @@ -0,0 +1,42 @@ +; Copyright (C) 1999 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. +; __umul_ppmm: the words in gr25 and gr24 are moved through the stack frame into the two halves of fr22 and multiplied with xmpyu. The doubleword product is stored back to the frame and reloaded as two words into gr28 and gr29, and gr29 is also stored through the pointer in gr26. NOTE(review): presumably gr28 holds the high word and gr29 the low word of the product -- confirm. + .code + .export __umul_ppmm + .align 4 +__umul_ppmm + .proc + .callinfo frame=64,no_calls + .entry + + ldo 64(%r30),%r30 + stw %r25,-16(0,%r30) + fldws -16(0,%r30),%fr22R + stw %r24,-16(0,%r30) + fldws -16(0,%r30),%fr22L + xmpyu %fr22R,%fr22L,%fr22 + fstds %fr22,-16(0,%r30) + ldw -16(0,%r30),%r28 + ldw -12(0,%r30),%r29 + stw %r29,0(0,%r26) + bv 0(%r2) + ldo -64(%r30),%r30 + + .exit + .procend diff --git a/ghc/rts/gmp/mpn/hppa/hppa2_0/add_n.s b/ghc/rts/gmp/mpn/hppa/hppa2_0/add_n.s new file mode 100644 index 0000000..6e97278 --- /dev/null +++ b/ghc/rts/gmp/mpn/hppa/hppa2_0/add_n.s @@ -0,0 +1,88 @@ +; HP-PA 2.0 32-bit __gmpn_add_n -- Add two limb vectors of the same length > 0 +; and store sum in a third limb vector. + +; Copyright (C) 1997, 1998, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library.
+ +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; This runs at 2 cycles/limb on PA8000. + + .code + .export __gmpn_add_n +__gmpn_add_n + .proc + .callinfo frame=0,no_calls + .entry + + sub %r0,%r23,%r22 + zdep %r22,30,3,%r28 ; r28 = 2 * (-n & 7) + zdep %r22,29,3,%r22 ; r22 = 4 * (-n & 7) + sub %r25,%r22,%r25 ; offset s1_ptr + sub %r24,%r22,%r24 ; offset s2_ptr + sub %r26,%r22,%r26 ; offset res_ptr + blr %r28,%r0 ; branch into loop + add %r0,%r0,%r0 ; reset carry + +L$loop ldw 0(%r25),%r20 + ldw 0(%r24),%r31 + addc %r20,%r31,%r20 + stw %r20,0(%r26) +L$7 ldw 4(%r25),%r21 + ldw 4(%r24),%r19 + addc %r21,%r19,%r21 + stw %r21,4(%r26) +L$6 ldw 8(%r25),%r20 + ldw 8(%r24),%r31 + addc %r20,%r31,%r20 + stw %r20,8(%r26) +L$5 ldw 12(%r25),%r21 + ldw 12(%r24),%r19 + addc %r21,%r19,%r21 + stw %r21,12(%r26) +L$4 ldw 16(%r25),%r20 + ldw 16(%r24),%r31 + addc %r20,%r31,%r20 + stw %r20,16(%r26) +L$3 ldw 20(%r25),%r21 + ldw 20(%r24),%r19 + addc %r21,%r19,%r21 + stw %r21,20(%r26) +L$2 ldw 24(%r25),%r20 + ldw 24(%r24),%r31 + addc %r20,%r31,%r20 + stw %r20,24(%r26) +L$1 ldw 28(%r25),%r21 + ldo 32(%r25),%r25 + ldw 28(%r24),%r19 + addc %r21,%r19,%r21 + stw %r21,28(%r26) + ldo 
32(%r24),%r24 + addib,> -8,%r23,L$loop + ldo 32(%r26),%r26 + + bv (%r2) + .exit + addc %r0,%r0,%r28 + .procend diff --git a/ghc/rts/gmp/mpn/hppa/hppa2_0/sub_n.s b/ghc/rts/gmp/mpn/hppa/hppa2_0/sub_n.s new file mode 100644 index 0000000..7d9b50f --- /dev/null +++ b/ghc/rts/gmp/mpn/hppa/hppa2_0/sub_n.s @@ -0,0 +1,88 @@ +; HP-PA 2.0 32-bit __gmpn_sub_n -- Subtract two limb vectors of the same +; length > 0 and store difference in a third limb vector. + +; Copyright (C) 1997, 1998, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; This runs at 2 cycles/limb on PA8000. 
+ + .code + .export __gmpn_sub_n +__gmpn_sub_n + .proc + .callinfo frame=0,no_calls + .entry + + sub %r0,%r23,%r22 + zdep %r22,30,3,%r28 ; r28 = 2 * (-n & 7) + zdep %r22,29,3,%r22 ; r22 = 4 * (-n & 7) + sub %r25,%r22,%r25 ; offset s1_ptr + sub %r24,%r22,%r24 ; offset s2_ptr + blr %r28,%r0 ; branch into loop + sub %r26,%r22,%r26 ; offset res_ptr and set carry + +L$loop ldw 0(%r25),%r20 + ldw 0(%r24),%r31 + subb %r20,%r31,%r20 + stw %r20,0(%r26) +L$7 ldw 4(%r25),%r21 + ldw 4(%r24),%r19 + subb %r21,%r19,%r21 + stw %r21,4(%r26) +L$6 ldw 8(%r25),%r20 + ldw 8(%r24),%r31 + subb %r20,%r31,%r20 + stw %r20,8(%r26) +L$5 ldw 12(%r25),%r21 + ldw 12(%r24),%r19 + subb %r21,%r19,%r21 + stw %r21,12(%r26) +L$4 ldw 16(%r25),%r20 + ldw 16(%r24),%r31 + subb %r20,%r31,%r20 + stw %r20,16(%r26) +L$3 ldw 20(%r25),%r21 + ldw 20(%r24),%r19 + subb %r21,%r19,%r21 + stw %r21,20(%r26) +L$2 ldw 24(%r25),%r20 + ldw 24(%r24),%r31 + subb %r20,%r31,%r20 + stw %r20,24(%r26) +L$1 ldw 28(%r25),%r21 + ldo 32(%r25),%r25 + ldw 28(%r24),%r19 + subb %r21,%r19,%r21 + stw %r21,28(%r26) + ldo 32(%r24),%r24 + addib,> -8,%r23,L$loop + ldo 32(%r26),%r26 + + addc %r0,%r0,%r28 + bv (%r2) + .exit + subi 1,%r28,%r28 + .procend diff --git a/ghc/rts/gmp/mpn/hppa/lshift.s b/ghc/rts/gmp/mpn/hppa/lshift.s index abac6ec..f5a2daa 100644 --- a/ghc/rts/gmp/mpn/hppa/lshift.s +++ b/ghc/rts/gmp/mpn/hppa/lshift.s @@ -1,20 +1,20 @@ -; HP-PA __mpn_lshift -- +; HP-PA __gmpn_lshift -- -; Copyright (C) 1992, 1994 Free Software Foundation, Inc. +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. 
; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. @@ -27,8 +27,8 @@ ; cnt gr23 .code - .export __mpn_lshift -__mpn_lshift + .export __gmpn_lshift +__gmpn_lshift .proc .callinfo frame=64,no_calls .entry diff --git a/ghc/rts/gmp/mpn/hppa/rshift.s b/ghc/rts/gmp/mpn/hppa/rshift.s index c1480e5..e05e2f1 100644 --- a/ghc/rts/gmp/mpn/hppa/rshift.s +++ b/ghc/rts/gmp/mpn/hppa/rshift.s @@ -1,20 +1,20 @@ -; HP-PA __mpn_rshift -- +; HP-PA __gmpn_rshift -- -; Copyright (C) 1992, 1994 Free Software Foundation, Inc. +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. 
; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. @@ -27,8 +27,8 @@ ; cnt gr23 .code - .export __mpn_rshift -__mpn_rshift + .export __gmpn_rshift +__gmpn_rshift .proc .callinfo frame=64,no_calls .entry diff --git a/ghc/rts/gmp/mpn/hppa/sub_n.s b/ghc/rts/gmp/mpn/hppa/sub_n.s index 04fa3e1..8f770ad 100644 --- a/ghc/rts/gmp/mpn/hppa/sub_n.s +++ b/ghc/rts/gmp/mpn/hppa/sub_n.s @@ -1,21 +1,21 @@ -; HP-PA __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +; HP-PA __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and ; store difference in a third limb vector. -; Copyright (C) 1992, 1994 Free Software Foundation, Inc. +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. 
; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. @@ -32,8 +32,8 @@ ; unrolling useless. We can't come under 5 cycles/limb anyway. .code - .export __mpn_sub_n -__mpn_sub_n + .export __gmpn_sub_n +__gmpn_sub_n .proc .callinfo frame=0,no_calls .entry diff --git a/ghc/rts/gmp/mpn/hppa/udiv_qrnnd.s b/ghc/rts/gmp/mpn/hppa/udiv_qrnnd.s index 9b45eb4..9aa3b8a 100644 --- a/ghc/rts/gmp/mpn/hppa/udiv_qrnnd.s +++ b/ghc/rts/gmp/mpn/hppa/udiv_qrnnd.s @@ -1,21 +1,21 @@ ; HP-PA __udiv_qrnnd division support, used from longlong.h. ; This version runs fast on pre-PA7000 CPUs. -; Copyright (C) 1993, 1994 Free Software Foundation, Inc. +; Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. 
; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. @@ -32,8 +32,8 @@ ; trouble is the FFFFFFFF code that would need some hacking. .code - .export __udiv_qrnnd -__udiv_qrnnd + .export __gmpn_udiv_qrnnd +__gmpn_udiv_qrnnd .proc .callinfo frame=0,no_calls .entry diff --git a/ghc/rts/gmp/mpn/i960/add_n.s b/ghc/rts/gmp/mpn/i960/add_n.s index 6e67482..387317a 100644 --- a/ghc/rts/gmp/mpn/i960/add_n.s +++ b/ghc/rts/gmp/mpn/i960/add_n.s @@ -1,29 +1,29 @@ -# I960 __mpn_add_n -- Add two limb vectors of the same length > 0 and store +# I960 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store # sum in a third limb vector. -# Copyright (C) 1995 Free Software Foundation, Inc. +# Copyright (C) 1995, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. 
# The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Library General Public License as published by -# the Free Software Foundation; either version 2 of the License, or (at your +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. -# You should have received a copy of the GNU Library General Public License +# You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. .text .align 4 - .globl ___mpn_add_n -___mpn_add_n: + .globl ___gmpn_add_n +___gmpn_add_n: mov 0,g6 # clear carry-save register cmpo 1,0 # clear cy diff --git a/ghc/rts/gmp/mpn/i960/addmul_1.s b/ghc/rts/gmp/mpn/i960/addmul_1.s index db53f64..7df1418 100644 --- a/ghc/rts/gmp/mpn/i960/addmul_1.s +++ b/ghc/rts/gmp/mpn/i960/addmul_1.s @@ -1,29 +1,29 @@ -# I960 __mpn_addmul_1 -- Multiply a limb vector with a limb and add +# I960 __gmpn_addmul_1 -- Multiply a limb vector with a limb and add # the result to a second limb vector. -# Copyright (C) 1995 Free Software Foundation, Inc. +# Copyright (C) 1995, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. 
# The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Library General Public License as published by -# the Free Software Foundation; either version 2 of the License, or (at your +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. -# You should have received a copy of the GNU Library General Public License +# You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. .text .align 4 - .globl ___mpn_mul_1 -___mpn_mul_1: + .globl ___gmpn_addmul_1 +___gmpn_addmul_1: subo g2,0,g2 shlo 2,g2,g4 subo g4,g1,g1 diff --git a/ghc/rts/gmp/mpn/i960/mul_1.s b/ghc/rts/gmp/mpn/i960/mul_1.s index 4ccaeab..5c0c985 100644 --- a/ghc/rts/gmp/mpn/i960/mul_1.s +++ b/ghc/rts/gmp/mpn/i960/mul_1.s @@ -1,29 +1,29 @@ -# I960 __mpn_mul_1 -- Multiply a limb vector with a limb and store +# I960 __gmpn_mul_1 -- Multiply a limb vector with a limb and store # the result in a second limb vector. -# Copyright (C) 1995 Free Software Foundation, Inc. +# Copyright (C) 1995, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library.
# The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Library General Public License as published by -# the Free Software Foundation; either version 2 of the License, or (at your +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. -# You should have received a copy of the GNU Library General Public License +# You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. .text .align 4 - .globl ___mpn_mul_1 -___mpn_mul_1: + .globl ___gmpn_mul_1 +___gmpn_mul_1: subo g2,0,g2 shlo 2,g2,g4 subo g4,g1,g1 diff --git a/ghc/rts/gmp/mpn/i960/sub_n.s b/ghc/rts/gmp/mpn/i960/sub_n.s index 01b94a1..2db2d46 100644 --- a/ghc/rts/gmp/mpn/i960/sub_n.s +++ b/ghc/rts/gmp/mpn/i960/sub_n.s @@ -1,29 +1,29 @@ -# I960 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +# I960 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and # store difference in a third limb vector. -# Copyright (C) 1995 Free Software Foundation, Inc. +# Copyright (C) 1995, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. 
# The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Library General Public License as published by -# the Free Software Foundation; either version 2 of the License, or (at your +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. -# You should have received a copy of the GNU Library General Public License +# You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. .text .align 4 - .globl ___mpn_sub_n -___mpn_sub_n: + .globl ___gmpn_sub_n +___gmpn_sub_n: mov 1,g6 # set carry-save register cmpo 1,0 # clear cy diff --git a/ghc/rts/gmp/mpn/lisp/gmpasm-mode.el b/ghc/rts/gmp/mpn/lisp/gmpasm-mode.el new file mode 100644 index 0000000..5d9da7f --- /dev/null +++ b/ghc/rts/gmp/mpn/lisp/gmpasm-mode.el @@ -0,0 +1,351 @@ +;;; gmpasm-mode.el -- GNU MP asm and m4 editing mode. + + +;; Copyright (C) 1999, 2000 Free Software Foundation, Inc. +;; +;; This file is part of the GNU MP Library. +;; +;; The GNU MP Library is free software; you can redistribute it and/or modify +;; it under the terms of the GNU Lesser General Public License as published by +;; the Free Software Foundation; either version 2.1 of the License, or (at your +;; option) any later version. 
+;; +;; The GNU MP Library is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +;; License for more details. +;; +;; You should have received a copy of the GNU Lesser General Public License +;; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +;; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +;; MA 02111-1307, USA. + + +;;; Commentary: +;; +;; gmpasm-mode is an editing mode for m4 processed assembler code and m4 +;; macro files in GMP. It's similar to m4-mode, but has a number of +;; settings better suited to GMP. +;; +;; +;; Install +;; ------- +;; +;; To make M-x gmpasm-mode available, put gmpasm-mode.el somewhere in the +;; load-path and the following in .emacs +;; +;; (autoload 'gmpasm-mode "gmpasm-mode" nil t) +;; +;; To use gmpasm-mode automatically on all .asm and .m4 files, put the +;; following in .emacs +;; +;; (add-to-list 'auto-mode-alist '("\\.asm\\'" . gmpasm-mode)) +;; (add-to-list 'auto-mode-alist '("\\.m4\\'" . gmpasm-mode)) +;; +;; To have gmpasm-mode only on gmp files, try instead something like the +;; following, which uses it only in a directory starting with "gmp", or a +;; sub-directory of such. +;; +;; (add-to-list 'auto-mode-alist +;; '("/gmp.*/.*\\.\\(asm\\|m4\\)\\'" . gmpasm-mode)) +;; +;; Byte compiling will slightly speed up loading. If you want a docstring +;; in the autoload you can use M-x update-file-autoloads if you set it up +;; right. +;; +;; +;; Emacsen +;; ------- +;; +;; FSF Emacs 20.x - gmpasm-mode is designed for this. +;; XEmacs 20.x - seems to work. +;; +;; FSF Emacs 19.x - should work if replacements for some 20.x-isms are +;; available. comment-region with "C" won't really do the right thing +;; though. + + +;;; Code: + +(defgroup gmpasm nil + "GNU MP m4 and asm editing." 
+ :prefix "gmpasm-" + :group 'languages) + +(defcustom gmpasm-mode-hook nil + "*Hook called by `gmpasm-mode'." + :type 'hook + :group 'gmpasm) + +(defcustom gmpasm-comment-start-regexp "[#;!@C]" + "*Regexp matching possible comment styles. +See `gmpasm-mode' docstring for how this is used." + :type 'regexp + :group 'gmpasm) + + +(defun gmpasm-add-to-list-second (list-var element) + "(gmpasm-add-to-list-second LIST-VAR ELEMENT) + +Add ELEMENT to LIST-VAR as the second element in the list, if it isn't +already in the list. If LIST-VAR is nil, then ELEMENT is just added as the +sole element in the list. + +This is like `add-to-list', but it puts the new value second in the list. + +The first cons cell is copied rather than changed in-place, so references to +the list elsewhere won't be affected." + + (if (member element (symbol-value list-var)) + (symbol-value list-var) + (set list-var + (if (symbol-value list-var) + (cons (car (symbol-value list-var)) + (cons element + (cdr (symbol-value list-var)))) + (list element))))) + + +(defun gmpasm-delete-from-list (list-var element) + "(gmpasm-delete-from-list LIST-VAR ELEMENT) + +Delete ELEMENT from LIST-VAR, using `delete'. +This is like `add-to-list', but the element is deleted from the list. +The list is copied rather than changed in-place, so references to it elsewhere +won't be affected." 
+ + (set list-var (delete element (copy-sequence (symbol-value list-var))))) + + +(defvar gmpasm-mode-map + (let ((map (make-sparse-keymap))) + + ;; assembler and dnl commenting + (define-key map "\C-c\C-c" 'comment-region) + (define-key map "\C-c\C-d" 'gmpasm-comment-region-dnl) + + ;; kill an M-x compile, since it's not hard to put m4 into an infinite + ;; loop + (define-key map "\C-c\C-k" 'kill-compilation) + + map) + "Keymap for `gmpasm-mode'.") + + +(defvar gmpasm-mode-syntax-table + (let ((table (make-syntax-table))) + ;; underscore left as a symbol char, like C mode + + ;; m4 quotes + (modify-syntax-entry ?` "('" table) + (modify-syntax-entry ?' ")`" table) + + table) + "Syntax table used in `gmpasm-mode'. + +m4 ignores quote marks in # comments at the top level, but inside quotes # +isn't special and all quotes are active. There seems no easy way to express +this in the syntax table, so nothing is done for comments. Usually this is +best, since it picks up invalid apostrophes in comments inside quotes.") + + +(defvar gmpasm-font-lock-keywords + (eval-when-compile + (list + (cons + (concat + "\\b" + (regexp-opt + '("deflit" "defreg" "defframe" "defframe_pushl" + "define_not_for_expansion" + "ASM_START" "ASM_END" "PROLOGUE" "EPILOGUE" + "forloop" + "TEXT" "DATA" "ALIGN" "W32" + "builtin" "changecom" "changequote" "changeword" "debugfile" + "debugmode" "decr" "define" "defn" "divert" "divnum" "dumpdef" + "errprint" "esyscmd" "eval" "__file__" "format" "gnu" "ifdef" + "ifelse" "include" "incr" "index" "indir" "len" "__line__" + "m4exit" "m4wrap" "maketemp" "patsubst" "popdef" "pushdef" + "regexp" "shift" "sinclude" "substr" "syscmd" "sysval" + "traceoff" "traceon" "translit" "undefine" "undivert" "unix") + t) + "\\b") 'font-lock-keyword-face))) + + "`font-lock-keywords' for `gmpasm-mode'. + +The keywords are m4 builtins and some of the GMP macros used in asm files. +L and LF don't look good fontified, so they're omitted. 
+ +The right assembler comment regexp is added dynamically buffer-local (with +dnl too).") + + +;; Initialized if gmpasm-mode finds filladapt loaded. +(defvar gmpasm-filladapt-token-table nil + "Filladapt token table used in `gmpasm-mode'.") +(defvar gmpasm-filladapt-token-match-table nil + "Filladapt token match table used in `gmpasm-mode'.") +(defvar gmpasm-filladapt-token-conversion-table nil + "Filladapt token conversion table used in `gmpasm-mode'.") + + +;;;###autoload +(defun gmpasm-mode () + "A major mode for editing GNU MP asm and m4 files. + +\\{gmpasm-mode-map} +`comment-start' and `comment-end' are set buffer-local to assembler +commenting appropriate for the CPU by looking for something matching +`gmpasm-comment-start-regexp' at the start of a line, or \"#\" is used if +there's no match (if \"#\" isn't what you want, type in a desired comment +and do \\[gmpasm-mode] to reinitialize). + +`adaptive-fill-regexp' is set buffer-local to the standard regexp with +`comment-start' and dnl added. If filladapt.el has been loaded it similarly +gets `comment-start' and dnl added as buffer-local fill prefixes. + +Font locking has the m4 builtins, some of the GMP macros, m4 dnl commenting, +and assembler commenting (based on the `comment-start' determined). + +Note that `gmpasm-comment-start-regexp' is only matched as a whole word, so +the `C' in it is only matched as a whole word, not on something that happens +to start with `C'. Also it's only the particular `comment-start' determined +that's added for filling etc, not the whole `gmpasm-comment-start-regexp'. + +`gmpasm-mode-hook' is run after initializations are complete. +" + + (interactive) + (kill-all-local-variables) + (setq major-mode 'gmpasm-mode + mode-name "gmpasm") + (use-local-map gmpasm-mode-map) + (set-syntax-table gmpasm-mode-syntax-table) + (setq fill-column 76) + + ;; Short instructions might fit with 32, but anything with labels or + ;; expressions soon needs the comments pushed out to column 40. 
+ (setq comment-column 40) + + ;; Don't want to find out the hard way which dumb assemblers don't like a + ;; missing final newline. + (set (make-local-variable 'require-final-newline) t) + + ;; The first match of gmpasm-comment-start-regexp at the start of a line + ;; determines comment-start, or "#" if no match. + (set (make-local-variable 'comment-start) + (save-excursion + (goto-char (point-min)) + (if (re-search-forward + (concat "^\\(" gmpasm-comment-start-regexp "\\)\\(\\s-\\|$\\)") + nil t) + (match-string 1) + "#"))) + (set (make-local-variable 'comment-end) "") + + ;; If comment-start ends in an alphanumeric then \b is used to match it + ;; only as a separate word. The test is for an alphanumeric rather than + ;; \w since we might try # or ! as \w characters but without wanting \b. + (let ((comment-regexp + (concat (regexp-quote comment-start) + (if (string-match "[a-zA-Z0-9]\\'" comment-start) "\\b")))) + + ;; Whitespace is required before a comment-start so m4 $# doesn't match + ;; when comment-start is "#". + ;; Only spaces or tabs match after, so newline isn't included in the + ;; font lock below. + (set (make-local-variable 'comment-start-skip) + (concat "\\(^\\|\\s-\\)" comment-regexp "[ \t]*")) + + ;; Comment fontification based on comment-start, matching through to the + ;; end of the line. + (add-to-list (make-local-variable 'gmpasm-font-lock-keywords) + (cons (concat + "\\(\\bdnl\\b\\|" comment-start-skip "\\).*$") + 'font-lock-comment-face)) + + (set (make-local-variable 'font-lock-defaults) + '(gmpasm-font-lock-keywords + t ; no syntactic fontification (of strings etc) + nil ; no case-fold + ((?_ . "w")) ; _ part of a word while fontifying + )) + + ;; Paragraphs are separated by blank lines, or lines with only dnl or + ;; comment-start. 
+ (set (make-local-variable 'paragraph-separate) + (concat "[ \t\f]*\\(\\(" comment-regexp "\\|dnl\\)[ \t]*\\)*$")) + (set (make-local-variable 'paragraph-start) + (concat "\f\\|" paragraph-separate)) + + ;; Adaptive fill gets dnl and comment-start as comment style prefixes on + ;; top of the standard regexp (which has # and ; already actually). + (set (make-local-variable 'adaptive-fill-regexp) + (concat "[ \t]*\\(\\(" + comment-regexp + "\\|dnl\\|[-|#;>*]+\\|(?[0-9]+[.)]\\)[ \t]*\\)*")) + (set (make-local-variable 'adaptive-fill-first-line-regexp) + "\\`\\([ \t]*dnl\\)?[ \t]*\\'") + + (when (fboundp 'filladapt-mode) + (when (not gmpasm-filladapt-token-table) + (setq gmpasm-filladapt-token-table + filladapt-token-table) + (setq gmpasm-filladapt-token-match-table + filladapt-token-match-table) + (setq gmpasm-filladapt-token-conversion-table + filladapt-token-conversion-table) + + ;; Numbered bullet points like "2.1" get matched at the start of a + ;; line when it's really something like "2.1 cycles/limb", so delete + ;; this from the list. The regexp for "1.", "2." etc is left + ;; though. + (gmpasm-delete-from-list 'gmpasm-filladapt-token-table + '("[0-9]+\\(\\.[0-9]+\\)+[ \t]" + bullet)) + + ;; "%" as a comment prefix interferes with x86 register names + ;; like %eax, so delete this. + (gmpasm-delete-from-list 'gmpasm-filladapt-token-table + '("%+" postscript-comment)) + + (add-to-list 'gmpasm-filladapt-token-match-table + '(gmpasm-comment gmpasm-comment)) + (add-to-list 'gmpasm-filladapt-token-conversion-table + '(gmpasm-comment . exact)) + ) + + (set (make-local-variable 'filladapt-token-table) + gmpasm-filladapt-token-table) + (set (make-local-variable 'filladapt-token-match-table) + gmpasm-filladapt-token-match-table) + (set (make-local-variable 'filladapt-token-conversion-table) + gmpasm-filladapt-token-conversion-table) + + ;; Add dnl and comment-start as fill prefixes. 
+ ;; Comments in filladapt.el say filladapt-token-table must begin + ;; with ("^" beginning-of-line), so put our addition second. + (gmpasm-add-to-list-second 'filladapt-token-table + (list (concat "dnl[ \t]\\|" comment-regexp) + 'gmpasm-comment)) + )) + + (run-hooks 'gmpasm-mode-hook)) + + +(defun gmpasm-comment-region-dnl (beg end &optional arg) + "(gmpasm-comment-region-dnl BEG END &optional ARG) + +Comment or uncomment each line in the region using `dnl'. +With \\[universal-argument] prefix arg, uncomment each line in region. +This is `comment-region', but using \"dnl\"." + + (interactive "r\nP") + (let ((comment-start "dnl") + (comment-end "")) + (comment-region beg end arg))) + + +(provide 'gmpasm-mode) + +;;; gmpasm-mode.el ends here diff --git a/ghc/rts/gmp/mpn/m68k/add_n.S b/ghc/rts/gmp/mpn/m68k/add_n.S index 7ca5b95..9e1d89d 100644 --- a/ghc/rts/gmp/mpn/m68k/add_n.S +++ b/ghc/rts/gmp/mpn/m68k/add_n.S @@ -1,21 +1,21 @@ -/* mc68020 __mpn_add_n -- Add two limb vectors of the same length > 0 and store +/* mc68020 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store sum in a third limb vector. -Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -28,15 +28,14 @@ MA 02111-1307, USA. */ size (sp + 12) */ -#include "sysdep.h" #include "asm-syntax.h" TEXT ALIGN - GLOBL C_SYMBOL_NAME(__mpn_add_n) + GLOBL C_SYMBOL_NAME(__gmpn_add_n) -C_SYMBOL_NAME(__mpn_add_n:) -PROLOG(__mpn_add_n) +C_SYMBOL_NAME(__gmpn_add_n:) +PROLOG(__gmpn_add_n) /* Save used registers on the stack. */ movel R(d2),MEM_PREDEC(sp) movel R(a2),MEM_PREDEC(sp) @@ -77,4 +76,4 @@ L(L2:) movel MEM_POSTINC(sp),R(d2) rts -EPILOG(__mpn_add_n) +EPILOG(__gmpn_add_n) diff --git a/ghc/rts/gmp/mpn/m68k/lshift.S b/ghc/rts/gmp/mpn/m68k/lshift.S index 77184d6..a539d5d 100644 --- a/ghc/rts/gmp/mpn/m68k/lshift.S +++ b/ghc/rts/gmp/mpn/m68k/lshift.S @@ -1,20 +1,20 @@ -/* mc68020 __mpn_lshift -- Shift left a low-level natural-number integer. +/* mc68020 __gmpn_lshift -- Shift left a low-level natural-number integer. -Copyright (C) 1996 Free Software Foundation, Inc. +Copyright (C) 1996, 1999, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -27,7 +27,6 @@ MA 02111-1307, USA. */ cnt (sp + 12) */ -#include "sysdep.h" #include "asm-syntax.h" #define res_ptr a1 @@ -37,10 +36,10 @@ MA 02111-1307, USA. */ TEXT ALIGN - GLOBL C_SYMBOL_NAME(__mpn_lshift) + GLOBL C_SYMBOL_NAME(__gmpn_lshift) -C_SYMBOL_NAME(__mpn_lshift:) -PROLOG(__mpn_lshift) +C_SYMBOL_NAME(__gmpn_lshift:) +PROLOG(__gmpn_lshift) /* Save used registers on the stack. */ moveml R(d2)-R(d6)/R(a2),MEM_PREDEC(sp) @@ -148,4 +147,4 @@ L(LLend:) /* Restore used registers from stack frame. */ moveml MEM_POSTINC(sp),R(d2)-R(d6)/R(a2) rts -EPILOG(__mpn_lshift) +EPILOG(__gmpn_lshift) diff --git a/ghc/rts/gmp/mpn/m68k/mc68020/addmul_1.S b/ghc/rts/gmp/mpn/m68k/mc68020/addmul_1.S index 4b99c21..6638115 100644 --- a/ghc/rts/gmp/mpn/m68k/mc68020/addmul_1.S +++ b/ghc/rts/gmp/mpn/m68k/mc68020/addmul_1.S @@ -1,21 +1,21 @@ -/* mc68020 __mpn_addmul_1 -- Multiply a limb vector with a limb and add +/* mc68020 __gmpn_addmul_1 -- Multiply a limb vector with a limb and add the result to a second limb vector. -Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -28,15 +28,14 @@ MA 02111-1307, USA. */ s2_limb (sp + 16) */ -#include "sysdep.h" #include "asm-syntax.h" TEXT ALIGN - GLOBL C_SYMBOL_NAME(__mpn_addmul_1) + GLOBL C_SYMBOL_NAME(__gmpn_addmul_1) -C_SYMBOL_NAME(__mpn_addmul_1:) -PROLOG(__mpn_addmul_1) +C_SYMBOL_NAME(__gmpn_addmul_1:) +PROLOG(__gmpn_addmul_1) #define res_ptr a0 #define s1_ptr a1 @@ -81,4 +80,4 @@ L(L1:) movel MEM_POSTINC(s1_ptr),R(d3) moveml MEM_POSTINC(sp),R(d2)-R(d5) rts -EPILOG(__mpn_addmul_1) +EPILOG(__gmpn_addmul_1) diff --git a/ghc/rts/gmp/mpn/m68k/mc68020/mul_1.S b/ghc/rts/gmp/mpn/m68k/mc68020/mul_1.S index ef7d937..fdd4c39 100644 --- a/ghc/rts/gmp/mpn/m68k/mc68020/mul_1.S +++ b/ghc/rts/gmp/mpn/m68k/mc68020/mul_1.S @@ -1,21 +1,21 @@ -/* mc68020 __mpn_mul_1 -- Multiply a limb vector with a limb and store +/* mc68020 __gmpn_mul_1 -- Multiply a limb vector with a limb and store the result in a second limb vector. 
-Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -28,15 +28,14 @@ MA 02111-1307, USA. 
*/ s2_limb (sp + 16) */ -#include "sysdep.h" #include "asm-syntax.h" TEXT ALIGN - GLOBL C_SYMBOL_NAME(__mpn_mul_1) + GLOBL C_SYMBOL_NAME(__gmpn_mul_1) -C_SYMBOL_NAME(__mpn_mul_1:) -PROLOG(__mpn_mul_1) +C_SYMBOL_NAME(__gmpn_mul_1:) +PROLOG(__gmpn_mul_1) #define res_ptr a0 #define s1_ptr a1 @@ -88,4 +87,4 @@ L(L1:) movel MEM_POSTINC(s1_ptr),R(d3) movel MEM_POSTINC(sp),R(d2) #endif rts -EPILOG(__mpn_mul_1) +EPILOG(__gmpn_mul_1) diff --git a/ghc/rts/gmp/mpn/m68k/mc68020/submul_1.S b/ghc/rts/gmp/mpn/m68k/mc68020/submul_1.S index 9770c6c..3c36b70 100644 --- a/ghc/rts/gmp/mpn/m68k/mc68020/submul_1.S +++ b/ghc/rts/gmp/mpn/m68k/mc68020/submul_1.S @@ -1,21 +1,21 @@ -/* mc68020 __mpn_submul_1 -- Multiply a limb vector with a limb and subtract +/* mc68020 __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract the result from a second limb vector. -Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -28,15 +28,14 @@ MA 02111-1307, USA. */ s2_limb (sp + 16) */ -#include "sysdep.h" #include "asm-syntax.h" TEXT ALIGN - GLOBL C_SYMBOL_NAME(__mpn_submul_1) + GLOBL C_SYMBOL_NAME(__gmpn_submul_1) -C_SYMBOL_NAME(__mpn_submul_1:) -PROLOG(__mpn_submul_1) +C_SYMBOL_NAME(__gmpn_submul_1:) +PROLOG(__gmpn_submul_1) #define res_ptr a0 #define s1_ptr a1 @@ -81,4 +80,4 @@ L(L1:) movel MEM_POSTINC(s1_ptr),R(d3) moveml MEM_POSTINC(sp),R(d2)-R(d5) rts -EPILOG(__mpn_submul_1) +EPILOG(__gmpn_submul_1) diff --git a/ghc/rts/gmp/mpn/m68k/mc68020/udiv.S b/ghc/rts/gmp/mpn/m68k/mc68020/udiv.S new file mode 100644 index 0000000..d00cf13 --- /dev/null +++ b/ghc/rts/gmp/mpn/m68k/mc68020/udiv.S @@ -0,0 +1,31 @@ +/* +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
+*/ + +.text + .even +.globl ___udiv_qrnnd +___udiv_qrnnd: + movel sp@(4),a0 + movel sp@(8),d1 + movel sp@(12),d0 + divul sp@(16),d1:d0 + movel d1,a0@ + rts diff --git a/ghc/rts/gmp/mpn/m68k/mc68020/umul.S b/ghc/rts/gmp/mpn/m68k/mc68020/umul.S new file mode 100644 index 0000000..a34ae6c --- /dev/null +++ b/ghc/rts/gmp/mpn/m68k/mc68020/umul.S @@ -0,0 +1,31 @@ +/* +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + +.text + .even +.globl ___umul_ppmm +___umul_ppmm: + movel sp@(4),a0 + movel sp@(8),d1 + movel sp@(12),d0 + mulul d0,d0:d1 + movel d1,a0@ + rts diff --git a/ghc/rts/gmp/mpn/m68k/rshift.S b/ghc/rts/gmp/mpn/m68k/rshift.S index 2ca5c79..b47a48e 100644 --- a/ghc/rts/gmp/mpn/m68k/rshift.S +++ b/ghc/rts/gmp/mpn/m68k/rshift.S @@ -1,20 +1,20 @@ -/* mc68020 __mpn_rshift -- Shift right a low-level natural-number integer. +/* mc68020 __gmpn_rshift -- Shift right a low-level natural-number integer. -Copyright (C) 1996 Free Software Foundation, Inc. +Copyright (C) 1996, 1999, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -27,7 +27,6 @@ MA 02111-1307, USA. */ cnt (sp + 12) */ -#include "sysdep.h" #include "asm-syntax.h" #define res_ptr a1 @@ -37,10 +36,10 @@ MA 02111-1307, USA. */ TEXT ALIGN - GLOBL C_SYMBOL_NAME(__mpn_rshift) + GLOBL C_SYMBOL_NAME(__gmpn_rshift) -C_SYMBOL_NAME(__mpn_rshift:) -PROLOG(__mpn_rshift) +C_SYMBOL_NAME(__gmpn_rshift:) +PROLOG(__gmpn_rshift) /* Save used registers on the stack. */ moveml R(d2)-R(d6)/R(a2),MEM_PREDEC(sp) @@ -147,4 +146,4 @@ L(LLend:) /* Restore used registers from stack frame. 
*/ moveml MEM_POSTINC(sp),R(d2)-R(d6)/R(a2) rts -EPILOG(__mpn_rshift) +EPILOG(__gmpn_rshift) diff --git a/ghc/rts/gmp/mpn/m68k/sub_n.S b/ghc/rts/gmp/mpn/m68k/sub_n.S index f94b0c7..ce45b24 100644 --- a/ghc/rts/gmp/mpn/m68k/sub_n.S +++ b/ghc/rts/gmp/mpn/m68k/sub_n.S @@ -1,21 +1,21 @@ -/* mc68020 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +/* mc68020 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and store difference in a third limb vector. -Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -28,15 +28,14 @@ MA 02111-1307, USA. 
*/ size (sp + 12) */ -#include "sysdep.h" #include "asm-syntax.h" TEXT ALIGN - GLOBL C_SYMBOL_NAME(__mpn_sub_n) + GLOBL C_SYMBOL_NAME(__gmpn_sub_n) -C_SYMBOL_NAME(__mpn_sub_n:) -PROLOG(__mpn_sub_n) +C_SYMBOL_NAME(__gmpn_sub_n:) +PROLOG(__gmpn_sub_n) /* Save used registers on the stack. */ movel R(d2),MEM_PREDEC(sp) movel R(a2),MEM_PREDEC(sp) @@ -77,4 +76,4 @@ L(L2:) movel MEM_POSTINC(sp),R(d2) rts -EPILOG(__mpn_sub_n) +EPILOG(__gmpn_sub_n) diff --git a/ghc/rts/gmp/mpn/m68k/syntax.h b/ghc/rts/gmp/mpn/m68k/syntax.h index 9d6f352..9eec279 100644 --- a/ghc/rts/gmp/mpn/m68k/syntax.h +++ b/ghc/rts/gmp/mpn/m68k/syntax.h @@ -5,16 +5,16 @@ Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ diff --git a/ghc/rts/gmp/mpn/m88k/add_n.s b/ghc/rts/gmp/mpn/m88k/add_n.s index 1b09cce..0b776c6 100644 --- a/ghc/rts/gmp/mpn/m88k/add_n.s +++ b/ghc/rts/gmp/mpn/m88k/add_n.s @@ -1,21 +1,21 @@ -; mc88100 __mpn_add -- Add two limb vectors of the same length > 0 and store +; mc88100 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store ; sum in a third limb vector. -; Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. +; Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. ; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA.
@@ -36,8 +36,8 @@ text align 16 - global ___mpn_add_n -___mpn_add_n: + global ___gmpn_add_n +___gmpn_add_n: ld r6,r3,0 ; read first limb from s1_ptr extu r10,r5,3 ld r7,r4,0 ; read first limb from s2_ptr diff --git a/ghc/rts/gmp/mpn/m88k/mc88110/add_n.S b/ghc/rts/gmp/mpn/m88k/mc88110/add_n.S index 39a44e5..843a50d 100644 --- a/ghc/rts/gmp/mpn/m88k/mc88110/add_n.S +++ b/ghc/rts/gmp/mpn/m88k/mc88110/add_n.S @@ -1,21 +1,21 @@ -; mc88110 __mpn_add_n -- Add two limb vectors of the same length > 0 and store +; mc88110 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store ; sum in a third limb vector. -; Copyright (C) 1995, 1996 Free Software Foundation, Inc. +; Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. ; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. 
@@ -31,8 +31,8 @@ text align 16 - global C_SYMBOL_NAME(__mpn_add_n) -C_SYMBOL_NAME(__mpn_add_n): + global C_SYMBOL_NAME(__gmpn_add_n) +C_SYMBOL_NAME(__gmpn_add_n): addu.co r0,r0,r0 ; clear cy flag xor r12,s2_ptr,res_ptr bb1 2,r12,L1 diff --git a/ghc/rts/gmp/mpn/m88k/mc88110/addmul_1.s b/ghc/rts/gmp/mpn/m88k/mc88110/addmul_1.s index 2bd6f21..7d97c87 100644 --- a/ghc/rts/gmp/mpn/m88k/mc88110/addmul_1.s +++ b/ghc/rts/gmp/mpn/m88k/mc88110/addmul_1.s @@ -1,21 +1,21 @@ -; mc88110 __mpn_addmul_1 -- Multiply a limb vector with a single limb and +; mc88110 __gmpn_addmul_1 -- Multiply a limb vector with a single limb and ; store the product in a second limb vector. -; Copyright (C) 1996 Free Software Foundation, Inc. +; Copyright (C) 1996, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. ; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. 
@@ -29,8 +29,8 @@ text align 16 - global ___mpn_addmul_1 -___mpn_addmul_1: + global ___gmpn_addmul_1 +___gmpn_addmul_1: lda r3,r3[r4] lda r8,r2[r4] ; RES_PTR in r8 since r2 is retval subu r4,r0,r4 diff --git a/ghc/rts/gmp/mpn/m88k/mc88110/mul_1.s b/ghc/rts/gmp/mpn/m88k/mc88110/mul_1.s index 1518900..b8483af 100644 --- a/ghc/rts/gmp/mpn/m88k/mc88110/mul_1.s +++ b/ghc/rts/gmp/mpn/m88k/mc88110/mul_1.s @@ -1,21 +1,21 @@ -; mc88110 __mpn_mul_1 -- Multiply a limb vector with a single limb and +; mc88110 __gmpn_mul_1 -- Multiply a limb vector with a single limb and ; store the product in a second limb vector. -; Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. +; Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. ; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. 
@@ -29,8 +29,8 @@ text align 16 - global ___mpn_mul_1 -___mpn_mul_1: + global ___gmpn_mul_1 +___gmpn_mul_1: ; Make S1_PTR and RES_PTR point at the end of their blocks ; and negate SIZE. lda r3,r3[r4] diff --git a/ghc/rts/gmp/mpn/m88k/mc88110/sub_n.S b/ghc/rts/gmp/mpn/m88k/mc88110/sub_n.S index 685f024..715a3fa 100644 --- a/ghc/rts/gmp/mpn/m88k/mc88110/sub_n.S +++ b/ghc/rts/gmp/mpn/m88k/mc88110/sub_n.S @@ -1,21 +1,21 @@ -; mc88110 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +; mc88110 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and ; store difference in a third limb vector. -; Copyright (C) 1995, 1996 Free Software Foundation, Inc. +; Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. ; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. 
@@ -31,8 +31,8 @@ text align 16 - global C_SYMBOL_NAME(__mpn_sub_n) -C_SYMBOL_NAME(__mpn_sub_n): + global C_SYMBOL_NAME(__gmpn_sub_n) +C_SYMBOL_NAME(__gmpn_sub_n): subu.co r0,r0,r0 ; set cy flag xor r12,s2_ptr,res_ptr bb1 2,r12,L1 diff --git a/ghc/rts/gmp/mpn/m88k/mul_1.s b/ghc/rts/gmp/mpn/m88k/mul_1.s index 26626bf..0637083 100644 --- a/ghc/rts/gmp/mpn/m88k/mul_1.s +++ b/ghc/rts/gmp/mpn/m88k/mul_1.s @@ -1,21 +1,21 @@ -; mc88100 __mpn_mul_1 -- Multiply a limb vector with a single limb and +; mc88100 __gmpn_mul_1 -- Multiply a limb vector with a single limb and ; store the product in a second limb vector. -; Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. +; Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. ; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA. 
@@ -50,8 +50,8 @@ text align 16 - global ___mpn_mul_1 -___mpn_mul_1: + global ___gmpn_mul_1 +___gmpn_mul_1: ; Make S1_PTR and RES_PTR point at the end of their blocks ; and negate SIZE. diff --git a/ghc/rts/gmp/mpn/m88k/sub_n.s b/ghc/rts/gmp/mpn/m88k/sub_n.s index 7dfffc9..2fd345a 100644 --- a/ghc/rts/gmp/mpn/m88k/sub_n.s +++ b/ghc/rts/gmp/mpn/m88k/sub_n.s @@ -1,21 +1,21 @@ -; mc88100 __mpn_sub -- Subtract two limb vectors of the same length > 0 and +; mc88100 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and ; store difference in a third limb vector. -; Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc. +; Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc. ; This file is part of the GNU MP Library. ; The GNU MP Library is free software; you can redistribute it and/or modify -; it under the terms of the GNU Library General Public License as published by -; the Free Software Foundation; either version 2 of the License, or (at your +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your ; option) any later version. ; The GNU MP Library is distributed in the hope that it will be useful, but ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ; License for more details. -; You should have received a copy of the GNU Library General Public License +; You should have received a copy of the GNU Lesser General Public License ; along with the GNU MP Library; see the file COPYING.LIB. If not, write to ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ; MA 02111-1307, USA.
@@ -36,8 +36,8 @@ text align 16 - global ___mpn_sub_n -___mpn_sub_n: + global ___gmpn_sub_n +___gmpn_sub_n: ld r6,r3,0 ; read first limb from s1_ptr extu r10,r5,3 ld r7,r4,0 ; read first limb from s2_ptr diff --git a/ghc/rts/gmp/mpn/mips2/add_n.s b/ghc/rts/gmp/mpn/mips2/add_n.s index f5525ce..5c3c7fc 100644 --- a/ghc/rts/gmp/mpn/mips2/add_n.s +++ b/ghc/rts/gmp/mpn/mips2/add_n.s @@ -1,21 +1,21 @@ - # MIPS2 __mpn_add_n -- Add two limb vectors of the same length > 0 and + # MIPS2 __gmpn_add_n -- Add two limb vectors of the same length > 0 and # store sum in a third limb vector. - # Copyright (C) 1995 Free Software Foundation, Inc. + # Copyright (C) 1995, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. - # You should have received a copy of the GNU Library General Public License + # You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. 
@@ -29,9 +29,9 @@ .text .align 2 - .globl __mpn_add_n - .ent __mpn_add_n -__mpn_add_n: + .globl __gmpn_add_n + .ent __gmpn_add_n +__gmpn_add_n: .set noreorder .set nomacro @@ -117,4 +117,4 @@ __mpn_add_n: j $31 or $2,$2,$8 - .end __mpn_add_n + .end __gmpn_add_n diff --git a/ghc/rts/gmp/mpn/mips2/addmul_1.s b/ghc/rts/gmp/mpn/mips2/addmul_1.s index 6145771..1e50377 100644 --- a/ghc/rts/gmp/mpn/mips2/addmul_1.s +++ b/ghc/rts/gmp/mpn/mips2/addmul_1.s @@ -1,21 +1,21 @@ - # MIPS __mpn_addmul_1 -- Multiply a limb vector with a single limb and + # MIPS __gmpn_addmul_1 -- Multiply a limb vector with a single limb and # add the product to a second limb vector. - # Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc. + # Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. - # You should have received a copy of the GNU Library General Public License + # You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. 
@@ -29,9 +29,9 @@ .text .align 4 - .globl __mpn_addmul_1 - .ent __mpn_addmul_1 -__mpn_addmul_1: + .globl __gmpn_addmul_1 + .ent __gmpn_addmul_1 +__gmpn_addmul_1: .set noreorder .set nomacro @@ -94,4 +94,4 @@ $LC0: lw $10,0($4) j $31 addu $2,$9,$2 # add high product limb and carry from addition - .end __mpn_addmul_1 + .end __gmpn_addmul_1 diff --git a/ghc/rts/gmp/mpn/mips2/lshift.s b/ghc/rts/gmp/mpn/mips2/lshift.s index ee92d79..2ca3a3c 100644 --- a/ghc/rts/gmp/mpn/mips2/lshift.s +++ b/ghc/rts/gmp/mpn/mips2/lshift.s @@ -1,20 +1,20 @@ - # MIPS2 __mpn_lshift -- + # MIPS2 __gmpn_lshift -- - # Copyright (C) 1995 Free Software Foundation, Inc. + # Copyright (C) 1995, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. - # You should have received a copy of the GNU Library General Public License + # You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. 
@@ -28,9 +28,9 @@ .text .align 2 - .globl __mpn_lshift - .ent __mpn_lshift -__mpn_lshift: + .globl __gmpn_lshift + .ent __gmpn_lshift +__gmpn_lshift: .set noreorder .set nomacro @@ -92,4 +92,4 @@ __mpn_lshift: .Lend: sll $8,$10,$7 j $31 sw $8,-4($4) - .end __mpn_lshift + .end __gmpn_lshift diff --git a/ghc/rts/gmp/mpn/mips2/mul_1.s b/ghc/rts/gmp/mpn/mips2/mul_1.s index d006fa1..ea8aa26 100644 --- a/ghc/rts/gmp/mpn/mips2/mul_1.s +++ b/ghc/rts/gmp/mpn/mips2/mul_1.s @@ -1,21 +1,21 @@ - # MIPS __mpn_mul_1 -- Multiply a limb vector with a single limb and + # MIPS __gmpn_mul_1 -- Multiply a limb vector with a single limb and # store the product in a second limb vector. - # Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc. + # Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. - # You should have received a copy of the GNU Library General Public License + # You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. 
@@ -29,9 +29,9 @@ .text .align 4 - .globl __mpn_mul_1 - .ent __mpn_mul_1 -__mpn_mul_1: + .globl __gmpn_mul_1 + .ent __gmpn_mul_1 +__gmpn_mul_1: .set noreorder .set nomacro @@ -82,4 +82,4 @@ $LC0: mflo $10 j $31 addu $2,$9,$2 # add high product limb and carry from addition - .end __mpn_mul_1 + .end __gmpn_mul_1 diff --git a/ghc/rts/gmp/mpn/mips2/rshift.s b/ghc/rts/gmp/mpn/mips2/rshift.s index a8beb40..37c8f39 100644 --- a/ghc/rts/gmp/mpn/mips2/rshift.s +++ b/ghc/rts/gmp/mpn/mips2/rshift.s @@ -1,20 +1,20 @@ - # MIPS2 __mpn_rshift -- + # MIPS2 __gmpn_rshift -- - # Copyright (C) 1995 Free Software Foundation, Inc. + # Copyright (C) 1995, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. - # You should have received a copy of the GNU Library General Public License + # You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. 
@@ -28,9 +28,9 @@ .text .align 2 - .globl __mpn_rshift - .ent __mpn_rshift -__mpn_rshift: + .globl __gmpn_rshift + .ent __gmpn_rshift +__gmpn_rshift: .set noreorder .set nomacro @@ -89,4 +89,4 @@ __mpn_rshift: .Lend: srl $8,$10,$7 j $31 sw $8,0($4) - .end __mpn_rshift + .end __gmpn_rshift diff --git a/ghc/rts/gmp/mpn/mips2/sub_n.s b/ghc/rts/gmp/mpn/mips2/sub_n.s index 3368ef2..51d34f3 100644 --- a/ghc/rts/gmp/mpn/mips2/sub_n.s +++ b/ghc/rts/gmp/mpn/mips2/sub_n.s @@ -1,21 +1,21 @@ - # MIPS2 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and + # MIPS2 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and # store difference in a third limb vector. - # Copyright (C) 1995 Free Software Foundation, Inc. + # Copyright (C) 1995, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. - # You should have received a copy of the GNU Library General Public License + # You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. 
@@ -29,9 +29,9 @@ .text .align 2 - .globl __mpn_sub_n - .ent __mpn_sub_n -__mpn_sub_n: + .globl __gmpn_sub_n + .ent __gmpn_sub_n +__gmpn_sub_n: .set noreorder .set nomacro @@ -117,4 +117,4 @@ __mpn_sub_n: j $31 or $2,$2,$8 - .end __mpn_sub_n + .end __gmpn_sub_n diff --git a/ghc/rts/gmp/mpn/mips2/submul_1.s b/ghc/rts/gmp/mpn/mips2/submul_1.s index 1324b66..495dea3 100644 --- a/ghc/rts/gmp/mpn/mips2/submul_1.s +++ b/ghc/rts/gmp/mpn/mips2/submul_1.s @@ -1,21 +1,21 @@ - # MIPS __mpn_submul_1 -- Multiply a limb vector with a single limb and + # MIPS __gmpn_submul_1 -- Multiply a limb vector with a single limb and # subtract the product from a second limb vector. - # Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc. + # Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. - # You should have received a copy of the GNU Library General Public License + # You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. 
@@ -29,9 +29,9 @@ .text .align 4 - .globl __mpn_submul_1 - .ent __mpn_submul_1 -__mpn_submul_1: + .globl __gmpn_submul_1 + .ent __gmpn_submul_1 +__gmpn_submul_1: .set noreorder .set nomacro @@ -94,4 +94,4 @@ $LC0: lw $10,0($4) j $31 addu $2,$9,$2 # add high product limb and carry from addition - .end __mpn_submul_1 + .end __gmpn_submul_1 diff --git a/ghc/rts/gmp/mpn/mips2/umul.s b/ghc/rts/gmp/mpn/mips2/umul.s new file mode 100644 index 0000000..40e8476 --- /dev/null +++ b/ghc/rts/gmp/mpn/mips2/umul.s @@ -0,0 +1,30 @@ + # Copyright (C) 1999 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. + + .text + .align 2 + .globl __umul_ppmm + .ent __umul_ppmm +__umul_ppmm: + multu $5,$6 + mflo $3 + mfhi $2 + sw $3,0($4) + j $31 + .end __umul_ppmm diff --git a/ghc/rts/gmp/mpn/mips3/add_n.s b/ghc/rts/gmp/mpn/mips3/add_n.s index 996a449..adad0be 100644 --- a/ghc/rts/gmp/mpn/mips3/add_n.s +++ b/ghc/rts/gmp/mpn/mips3/add_n.s @@ -1,21 +1,21 @@ - # MIPS3 __mpn_add_n -- Add two limb vectors of the same length > 0 and + # MIPS3 __gmpn_add_n -- Add two limb vectors of the same length > 0 and # store sum in a third limb vector. 
- # Copyright (C) 1995 Free Software Foundation, Inc. + # Copyright (C) 1995, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. - # You should have received a copy of the GNU Library General Public License + # You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. @@ -29,9 +29,9 @@ .text .align 2 - .globl __mpn_add_n - .ent __mpn_add_n -__mpn_add_n: + .globl __gmpn_add_n + .ent __gmpn_add_n +__gmpn_add_n: .set noreorder .set nomacro @@ -117,4 +117,4 @@ __mpn_add_n: j $31 or $2,$2,$8 - .end __mpn_add_n + .end __gmpn_add_n diff --git a/ghc/rts/gmp/mpn/mips3/addmul_1.s b/ghc/rts/gmp/mpn/mips3/addmul_1.s index cd75c18..d390e22 100644 --- a/ghc/rts/gmp/mpn/mips3/addmul_1.s +++ b/ghc/rts/gmp/mpn/mips3/addmul_1.s @@ -1,21 +1,21 @@ - # MIPS3 __mpn_addmul_1 -- Multiply a limb vector with a single limb and + # MIPS3 __gmpn_addmul_1 -- Multiply a limb vector with a single limb and # add the product to a second limb vector. - # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. 
+ # Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. - # You should have received a copy of the GNU Library General Public License + # You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. @@ -29,9 +29,9 @@ .text .align 4 - .globl __mpn_addmul_1 - .ent __mpn_addmul_1 -__mpn_addmul_1: + .globl __gmpn_addmul_1 + .ent __gmpn_addmul_1 +__gmpn_addmul_1: .set noreorder .set nomacro @@ -94,4 +94,4 @@ $LC0: ld $10,0($4) j $31 daddu $2,$9,$2 # add high product limb and carry from addition - .end __mpn_addmul_1 + .end __gmpn_addmul_1 diff --git a/ghc/rts/gmp/mpn/mips3/gmp-mparam.h b/ghc/rts/gmp/mpn/mips3/gmp-mparam.h index f3df7ff..656e90c 100644 --- a/ghc/rts/gmp/mpn/mips3/gmp-mparam.h +++ b/ghc/rts/gmp/mpn/mips3/gmp-mparam.h @@ -1,20 +1,20 @@ /* gmp-mparam.h -- Compiler/machine parameter header file. -Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -25,3 +25,34 @@ MA 02111-1307, USA. */ #define BITS_PER_INT 32 #define BITS_PER_SHORTINT 16 #define BITS_PER_CHAR 8 + +/* These values are for the R10000 using the system cc. */ +/* Generated by tuneup.c, 2000-07-25.
*/ +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 16 +#endif +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 32 +#endif + +/* Suppressed the TOOM3 values as they looked absolutely crazy + (698 and 21 respectively) */ + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 58 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 54 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 82 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 4 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 159 +#endif diff --git a/ghc/rts/gmp/mpn/mips3/lshift.s b/ghc/rts/gmp/mpn/mips3/lshift.s index 324a602..372606f 100644 --- a/ghc/rts/gmp/mpn/mips3/lshift.s +++ b/ghc/rts/gmp/mpn/mips3/lshift.s @@ -1,20 +1,20 @@ - # MIPS3 __mpn_lshift -- + # MIPS3 __gmpn_lshift -- - # Copyright (C) 1995 Free Software Foundation, Inc. + # Copyright (C) 1995, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. - # You should have received a copy of the GNU Library General Public License + # You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB.
If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. @@ -28,9 +28,9 @@ .text .align 2 - .globl __mpn_lshift - .ent __mpn_lshift -__mpn_lshift: + .globl __gmpn_lshift + .ent __gmpn_lshift +__gmpn_lshift: .set noreorder .set nomacro @@ -92,4 +92,4 @@ __mpn_lshift: .Lend: dsll $8,$10,$7 j $31 sd $8,-8($4) - .end __mpn_lshift + .end __gmpn_lshift diff --git a/ghc/rts/gmp/mpn/mips3/mul_1.s b/ghc/rts/gmp/mpn/mips3/mul_1.s index 281d057..6659e2b 100644 --- a/ghc/rts/gmp/mpn/mips3/mul_1.s +++ b/ghc/rts/gmp/mpn/mips3/mul_1.s @@ -1,21 +1,21 @@ - # MIPS3 __mpn_mul_1 -- Multiply a limb vector with a single limb and + # MIPS3 __gmpn_mul_1 -- Multiply a limb vector with a single limb and # store the product in a second limb vector. - # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + # Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. - # You should have received a copy of the GNU Library General Public License + # You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. @@ -29,9 +29,9 @@ .text .align 4 - .globl __mpn_mul_1 - .ent __mpn_mul_1 -__mpn_mul_1: + .globl __gmpn_mul_1 + .ent __gmpn_mul_1 +__gmpn_mul_1: .set noreorder .set nomacro @@ -82,4 +82,4 @@ $LC0: mflo $10 j $31 daddu $2,$9,$2 # add high product limb and carry from addition - .end __mpn_mul_1 + .end __gmpn_mul_1 diff --git a/ghc/rts/gmp/mpn/mips3/rshift.s b/ghc/rts/gmp/mpn/mips3/rshift.s index 9920e1a..59c7fd3 100644 --- a/ghc/rts/gmp/mpn/mips3/rshift.s +++ b/ghc/rts/gmp/mpn/mips3/rshift.s @@ -1,20 +1,20 @@ - # MIPS3 __mpn_rshift -- + # MIPS3 __gmpn_rshift -- - # Copyright (C) 1995 Free Software Foundation, Inc. + # Copyright (C) 1995, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. - # You should have received a copy of the GNU Library General Public License + # You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. 
@@ -28,9 +28,9 @@ .text .align 2 - .globl __mpn_rshift - .ent __mpn_rshift -__mpn_rshift: + .globl __gmpn_rshift + .ent __gmpn_rshift +__gmpn_rshift: .set noreorder .set nomacro @@ -89,4 +89,4 @@ __mpn_rshift: .Lend: dsrl $8,$10,$7 j $31 sd $8,0($4) - .end __mpn_rshift + .end __gmpn_rshift diff --git a/ghc/rts/gmp/mpn/mips3/sub_n.s b/ghc/rts/gmp/mpn/mips3/sub_n.s index 56c77d8..c57c824 100644 --- a/ghc/rts/gmp/mpn/mips3/sub_n.s +++ b/ghc/rts/gmp/mpn/mips3/sub_n.s @@ -1,21 +1,21 @@ - # MIPS3 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and + # MIPS3 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and # store difference in a third limb vector. - # Copyright (C) 1995 Free Software Foundation, Inc. + # Copyright (C) 1995, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. - # You should have received a copy of the GNU Library General Public License + # You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. 
@@ -29,9 +29,9 @@ .text .align 2 - .globl __mpn_sub_n - .ent __mpn_sub_n -__mpn_sub_n: + .globl __gmpn_sub_n + .ent __gmpn_sub_n +__gmpn_sub_n: .set noreorder .set nomacro @@ -117,4 +117,4 @@ __mpn_sub_n: j $31 or $2,$2,$8 - .end __mpn_sub_n + .end __gmpn_sub_n diff --git a/ghc/rts/gmp/mpn/mips3/submul_1.s b/ghc/rts/gmp/mpn/mips3/submul_1.s index a9c9fa2..531f970 100644 --- a/ghc/rts/gmp/mpn/mips3/submul_1.s +++ b/ghc/rts/gmp/mpn/mips3/submul_1.s @@ -1,21 +1,21 @@ - # MIPS3 __mpn_submul_1 -- Multiply a limb vector with a single limb and + # MIPS3 __gmpn_submul_1 -- Multiply a limb vector with a single limb and # subtract the product from a second limb vector. - # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + # Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Library General Public License as published by - # the Free Software Foundation; either version 2 of the License, or (at your + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. - # You should have received a copy of the GNU Library General Public License + # You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. 
@@ -29,9 +29,9 @@ .text .align 4 - .globl __mpn_submul_1 - .ent __mpn_submul_1 -__mpn_submul_1: + .globl __gmpn_submul_1 + .ent __gmpn_submul_1 +__gmpn_submul_1: .set noreorder .set nomacro @@ -94,4 +94,4 @@ $LC0: ld $10,0($4) j $31 daddu $2,$9,$2 # add high product limb and carry from addition - .end __mpn_submul_1 + .end __gmpn_submul_1 diff --git a/ghc/rts/gmp/mpn/mp_bases.c b/ghc/rts/gmp/mpn/mp_bases.c index bbe39b0..011c328 100644 --- a/ghc/rts/gmp/mpn/mp_bases.c +++ b/ghc/rts/gmp/mpn/mp_bases.c @@ -8,16 +8,16 @@ Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -25,265 +25,266 @@ MA 02111-1307, USA. 
*/ #include "gmp.h" #include "gmp-impl.h" + #if BITS_PER_MP_LIMB == 32 const struct bases __mp_bases[256] = { /* 0 */ {0, 0.0, 0, 0}, /* 1 */ {0, 1e38, 0, 0}, - /* 2 */ {32, 1.00000000, 0x1, 0x0}, - /* 3 */ {20, 0.63092975, 0xcfd41b91, 0x3b563c24}, - /* 4 */ {16, 0.50000000, 0x2, 0x0}, - /* 5 */ {13, 0.43067656, 0x48c27395, 0xc25c2684}, - /* 6 */ {12, 0.38685281, 0x81bf1000, 0xf91bd1b6}, - /* 7 */ {11, 0.35620719, 0x75db9c97, 0x1607a2cb}, - /* 8 */ {10, 0.33333333, 0x3, 0x0}, - /* 9 */ {10, 0.31546488, 0xcfd41b91, 0x3b563c24}, - /* 10 */ {9, 0.30103000, 0x3b9aca00, 0x12e0be82}, - /* 11 */ {9, 0.28906483, 0x8c8b6d2b, 0xd24cde04}, - /* 12 */ {8, 0.27894295, 0x19a10000, 0x3fa39ab5}, - /* 13 */ {8, 0.27023815, 0x309f1021, 0x50f8ac5f}, - /* 14 */ {8, 0.26264954, 0x57f6c100, 0x74843b1e}, - /* 15 */ {8, 0.25595802, 0x98c29b81, 0xad0326c2}, - /* 16 */ {8, 0.25000000, 0x4, 0x0}, - /* 17 */ {7, 0.24465054, 0x18754571, 0x4ef0b6bd}, - /* 18 */ {7, 0.23981247, 0x247dbc80, 0xc0fc48a1}, - /* 19 */ {7, 0.23540891, 0x3547667b, 0x33838942}, - /* 20 */ {7, 0.23137821, 0x4c4b4000, 0xad7f29ab}, - /* 21 */ {7, 0.22767025, 0x6b5a6e1d, 0x313c3d15}, - /* 22 */ {7, 0.22424382, 0x94ace180, 0xb8cca9e0}, - /* 23 */ {7, 0.22106473, 0xcaf18367, 0x42ed6de9}, - /* 24 */ {6, 0.21810429, 0xb640000, 0x67980e0b}, - /* 25 */ {6, 0.21533828, 0xe8d4a51, 0x19799812}, - /* 26 */ {6, 0.21274605, 0x1269ae40, 0xbce85396}, - /* 27 */ {6, 0.21030992, 0x17179149, 0x62c103a9}, - /* 28 */ {6, 0.20801460, 0x1cb91000, 0x1d353d43}, - /* 29 */ {6, 0.20584683, 0x23744899, 0xce1decea}, - /* 30 */ {6, 0.20379505, 0x2b73a840, 0x790fc511}, - /* 31 */ {6, 0.20184909, 0x34e63b41, 0x35b865a0}, - /* 32 */ {6, 0.20000000, 0x5, 0x0}, - /* 33 */ {6, 0.19823986, 0x4cfa3cc1, 0xa9aed1b3}, - /* 34 */ {6, 0.19656163, 0x5c13d840, 0x63dfc229}, - /* 35 */ {6, 0.19495902, 0x6d91b519, 0x2b0fee30}, - /* 36 */ {6, 0.19342640, 0x81bf1000, 0xf91bd1b6}, - /* 37 */ {6, 0.19195872, 0x98ede0c9, 0xac89c3a9}, - /* 38 */ {6, 0.19055141, 0xb3773e40, 
0x6d2c32fe}, - /* 39 */ {6, 0.18920036, 0xd1bbc4d1, 0x387907c9}, - /* 40 */ {6, 0.18790182, 0xf4240000, 0xc6f7a0b}, - /* 41 */ {5, 0.18665241, 0x6e7d349, 0x28928154}, - /* 42 */ {5, 0.18544902, 0x7ca30a0, 0x6e8629d}, - /* 43 */ {5, 0.18428883, 0x8c32bbb, 0xd373dca0}, - /* 44 */ {5, 0.18316925, 0x9d46c00, 0xa0b17895}, - /* 45 */ {5, 0.18208790, 0xaffacfd, 0x746811a5}, - /* 46 */ {5, 0.18104260, 0xc46bee0, 0x4da6500f}, - /* 47 */ {5, 0.18003133, 0xdab86ef, 0x2ba23582}, - /* 48 */ {5, 0.17905223, 0xf300000, 0xdb20a88}, - /* 49 */ {5, 0.17810359, 0x10d63af1, 0xe68d5ce4}, - /* 50 */ {5, 0.17718382, 0x12a05f20, 0xb7cdfd9d}, - /* 51 */ {5, 0.17629143, 0x1490aae3, 0x8e583933}, - /* 52 */ {5, 0.17542506, 0x16a97400, 0x697cc3ea}, - /* 53 */ {5, 0.17458343, 0x18ed2825, 0x48a5ca6c}, - /* 54 */ {5, 0.17376534, 0x1b5e4d60, 0x2b52db16}, - /* 55 */ {5, 0.17296969, 0x1dff8297, 0x111586a6}, - /* 56 */ {5, 0.17219543, 0x20d38000, 0xf31d2b36}, - /* 57 */ {5, 0.17144160, 0x23dd1799, 0xc8d76d19}, - /* 58 */ {5, 0.17070728, 0x271f35a0, 0xa2cb1eb4}, - /* 59 */ {5, 0.16999162, 0x2a9ce10b, 0x807c3ec3}, - /* 60 */ {5, 0.16929381, 0x2e593c00, 0x617ec8bf}, - /* 61 */ {5, 0.16861310, 0x3257844d, 0x45746cbe}, - /* 62 */ {5, 0.16794878, 0x369b13e0, 0x2c0aa273}, - /* 63 */ {5, 0.16730018, 0x3b27613f, 0x14f90805}, - /* 64 */ {5, 0.16666667, 0x6, 0x0}, - /* 65 */ {5, 0.16604765, 0x4528a141, 0xd9cf0829}, - /* 66 */ {5, 0.16544255, 0x4aa51420, 0xb6fc4841}, - /* 67 */ {5, 0.16485086, 0x50794633, 0x973054cb}, - /* 68 */ {5, 0.16427205, 0x56a94400, 0x7a1dbe4b}, - /* 69 */ {5, 0.16370566, 0x5d393975, 0x5f7fcd7f}, - /* 70 */ {5, 0.16315122, 0x642d7260, 0x47196c84}, - /* 71 */ {5, 0.16260831, 0x6b8a5ae7, 0x30b43635}, - /* 72 */ {5, 0.16207652, 0x73548000, 0x1c1fa5f6}, - /* 73 */ {5, 0.16155547, 0x7b908fe9, 0x930634a}, - /* 74 */ {5, 0.16104477, 0x84435aa0, 0xef7f4a3c}, - /* 75 */ {5, 0.16054409, 0x8d71d25b, 0xcf5552d2}, - /* 76 */ {5, 0.16005307, 0x97210c00, 0xb1a47c8e}, - /* 77 */ {5, 0.15957142, 
0xa1563f9d, 0x9634b43e}, - /* 78 */ {5, 0.15909881, 0xac16c8e0, 0x7cd3817d}, - /* 79 */ {5, 0.15863496, 0xb768278f, 0x65536761}, - /* 80 */ {5, 0.15817959, 0xc3500000, 0x4f8b588e}, - /* 81 */ {5, 0.15773244, 0xcfd41b91, 0x3b563c24}, - /* 82 */ {5, 0.15729325, 0xdcfa6920, 0x28928154}, - /* 83 */ {5, 0.15686177, 0xeac8fd83, 0x1721bfb0}, - /* 84 */ {5, 0.15643779, 0xf9461400, 0x6e8629d}, - /* 85 */ {4, 0.15602107, 0x31c84b1, 0x491cc17c}, - /* 86 */ {4, 0.15561139, 0x342ab10, 0x3a11d83b}, - /* 87 */ {4, 0.15520856, 0x36a2c21, 0x2be074cd}, - /* 88 */ {4, 0.15481238, 0x3931000, 0x1e7a02e7}, - /* 89 */ {4, 0.15442266, 0x3bd5ee1, 0x11d10edd}, - /* 90 */ {4, 0.15403922, 0x3e92110, 0x5d92c68}, - /* 91 */ {4, 0.15366189, 0x4165ef1, 0xf50dbfb2}, - /* 92 */ {4, 0.15329049, 0x4452100, 0xdf9f1316}, - /* 93 */ {4, 0.15292487, 0x4756fd1, 0xcb52a684}, - /* 94 */ {4, 0.15256487, 0x4a75410, 0xb8163e97}, - /* 95 */ {4, 0.15221035, 0x4dad681, 0xa5d8f269}, - /* 96 */ {4, 0.15186115, 0x5100000, 0x948b0fcd}, - /* 97 */ {4, 0.15151715, 0x546d981, 0x841e0215}, - /* 98 */ {4, 0.15117821, 0x57f6c10, 0x74843b1e}, - /* 99 */ {4, 0.15084420, 0x5b9c0d1, 0x65b11e6e}, - /* 100 */ {4, 0.15051500, 0x5f5e100, 0x5798ee23}, - /* 101 */ {4, 0.15019048, 0x633d5f1, 0x4a30b99b}, - /* 102 */ {4, 0.14987054, 0x673a910, 0x3d6e4d94}, - /* 103 */ {4, 0.14955506, 0x6b563e1, 0x314825b0}, - /* 104 */ {4, 0.14924394, 0x6f91000, 0x25b55f2e}, - /* 105 */ {4, 0.14893706, 0x73eb721, 0x1aadaccb}, - /* 106 */ {4, 0.14863434, 0x7866310, 0x10294ba2}, - /* 107 */ {4, 0.14833567, 0x7d01db1, 0x620f8f6}, - /* 108 */ {4, 0.14804096, 0x81bf100, 0xf91bd1b6}, - /* 109 */ {4, 0.14775011, 0x869e711, 0xe6d37b2a}, - /* 110 */ {4, 0.14746305, 0x8ba0a10, 0xd55cff6e}, - /* 111 */ {4, 0.14717969, 0x90c6441, 0xc4ad2db2}, - /* 112 */ {4, 0.14689994, 0x9610000, 0xb4b985cf}, - /* 113 */ {4, 0.14662372, 0x9b7e7c1, 0xa5782bef}, - /* 114 */ {4, 0.14635096, 0xa112610, 0x96dfdd2a}, - /* 115 */ {4, 0.14608158, 0xa6cc591, 0x88e7e509}, - /* 116 */ {4, 
0.14581551, 0xacad100, 0x7b8813d3}, - /* 117 */ {4, 0.14555268, 0xb2b5331, 0x6eb8b595}, - /* 118 */ {4, 0.14529302, 0xb8e5710, 0x627289db}, - /* 119 */ {4, 0.14503647, 0xbf3e7a1, 0x56aebc07}, - /* 120 */ {4, 0.14478295, 0xc5c1000, 0x4b66dc33}, - /* 121 */ {4, 0.14453241, 0xcc6db61, 0x4094d8a3}, - /* 122 */ {4, 0.14428479, 0xd345510, 0x3632f7a5}, - /* 123 */ {4, 0.14404003, 0xda48871, 0x2c3bd1f0}, - /* 124 */ {4, 0.14379807, 0xe178100, 0x22aa4d5f}, - /* 125 */ {4, 0.14355885, 0xe8d4a51, 0x19799812}, - /* 126 */ {4, 0.14332233, 0xf05f010, 0x10a523e5}, - /* 127 */ {4, 0.14308844, 0xf817e01, 0x828a237}, - /* 128 */ {4, 0.14285714, 0x7, 0x0}, - /* 129 */ {4, 0.14262838, 0x10818201, 0xf04ec452}, - /* 130 */ {4, 0.14240211, 0x11061010, 0xe136444a}, - /* 131 */ {4, 0.14217828, 0x118db651, 0xd2af9589}, - /* 132 */ {4, 0.14195685, 0x12188100, 0xc4b42a83}, - /* 133 */ {4, 0.14173777, 0x12a67c71, 0xb73dccf5}, - /* 134 */ {4, 0.14152100, 0x1337b510, 0xaa4698c5}, - /* 135 */ {4, 0.14130649, 0x13cc3761, 0x9dc8f729}, - /* 136 */ {4, 0.14109421, 0x14641000, 0x91bf9a30}, - /* 137 */ {4, 0.14088412, 0x14ff4ba1, 0x86257887}, - /* 138 */ {4, 0.14067617, 0x159df710, 0x7af5c98c}, - /* 139 */ {4, 0.14047033, 0x16401f31, 0x702c01a0}, - /* 140 */ {4, 0.14026656, 0x16e5d100, 0x65c3ceb1}, - /* 141 */ {4, 0.14006482, 0x178f1991, 0x5bb91502}, - /* 142 */ {4, 0.13986509, 0x183c0610, 0x5207ec23}, - /* 143 */ {4, 0.13966731, 0x18eca3c1, 0x48ac9c19}, - /* 144 */ {4, 0.13947147, 0x19a10000, 0x3fa39ab5}, - /* 145 */ {4, 0.13927753, 0x1a592841, 0x36e98912}, - /* 146 */ {4, 0.13908545, 0x1b152a10, 0x2e7b3140}, - /* 147 */ {4, 0.13889521, 0x1bd51311, 0x2655840b}, - /* 148 */ {4, 0.13870677, 0x1c98f100, 0x1e7596ea}, - /* 149 */ {4, 0.13852011, 0x1d60d1b1, 0x16d8a20d}, - /* 150 */ {4, 0.13833519, 0x1e2cc310, 0xf7bfe87}, - /* 151 */ {4, 0.13815199, 0x1efcd321, 0x85d2492}, - /* 152 */ {4, 0.13797047, 0x1fd11000, 0x179a9f4}, - /* 153 */ {4, 0.13779062, 0x20a987e1, 0xf59e80eb}, - /* 154 */ {4, 0.13761241, 
0x21864910, 0xe8b768db}, - /* 155 */ {4, 0.13743580, 0x226761f1, 0xdc39d6d5}, - /* 156 */ {4, 0.13726078, 0x234ce100, 0xd021c5d1}, - /* 157 */ {4, 0.13708732, 0x2436d4d1, 0xc46b5e37}, - /* 158 */ {4, 0.13691539, 0x25254c10, 0xb912f39c}, - /* 159 */ {4, 0.13674498, 0x26185581, 0xae150294}, - /* 160 */ {4, 0.13657605, 0x27100000, 0xa36e2eb1}, - /* 161 */ {4, 0.13640859, 0x280c5a81, 0x991b4094}, - /* 162 */ {4, 0.13624257, 0x290d7410, 0x8f19241e}, - /* 163 */ {4, 0.13607797, 0x2a135bd1, 0x8564e6b7}, - /* 164 */ {4, 0.13591477, 0x2b1e2100, 0x7bfbb5b4}, - /* 165 */ {4, 0.13575295, 0x2c2dd2f1, 0x72dadcc8}, - /* 166 */ {4, 0.13559250, 0x2d428110, 0x69ffc498}, - /* 167 */ {4, 0.13543338, 0x2e5c3ae1, 0x6167f154}, - /* 168 */ {4, 0.13527558, 0x2f7b1000, 0x5911016e}, - /* 169 */ {4, 0.13511908, 0x309f1021, 0x50f8ac5f}, - /* 170 */ {4, 0.13496386, 0x31c84b10, 0x491cc17c}, - /* 171 */ {4, 0.13480991, 0x32f6d0b1, 0x417b26d8}, - /* 172 */ {4, 0.13465720, 0x342ab100, 0x3a11d83b}, - /* 173 */ {4, 0.13450572, 0x3563fc11, 0x32dee622}, - /* 174 */ {4, 0.13435545, 0x36a2c210, 0x2be074cd}, - /* 175 */ {4, 0.13420637, 0x37e71341, 0x2514bb58}, - /* 176 */ {4, 0.13405847, 0x39310000, 0x1e7a02e7}, - /* 177 */ {4, 0.13391173, 0x3a8098c1, 0x180ea5d0}, - /* 178 */ {4, 0.13376614, 0x3bd5ee10, 0x11d10edd}, - /* 179 */ {4, 0.13362168, 0x3d311091, 0xbbfb88e}, - /* 180 */ {4, 0.13347832, 0x3e921100, 0x5d92c68}, - /* 181 */ {4, 0.13333607, 0x3ff90031, 0x1c024c}, - /* 182 */ {4, 0.13319491, 0x4165ef10, 0xf50dbfb2}, - /* 183 */ {4, 0.13305481, 0x42d8eea1, 0xea30efa3}, - /* 184 */ {4, 0.13291577, 0x44521000, 0xdf9f1316}, - /* 185 */ {4, 0.13277777, 0x45d16461, 0xd555c0c9}, - /* 186 */ {4, 0.13264079, 0x4756fd10, 0xcb52a684}, - /* 187 */ {4, 0.13250483, 0x48e2eb71, 0xc193881f}, - /* 188 */ {4, 0.13236988, 0x4a754100, 0xb8163e97}, - /* 189 */ {4, 0.13223591, 0x4c0e0f51, 0xaed8b724}, - /* 190 */ {4, 0.13210292, 0x4dad6810, 0xa5d8f269}, - /* 191 */ {4, 0.13197089, 0x4f535d01, 0x9d15039d}, - /* 192 */ {4, 
0.13183981, 0x51000000, 0x948b0fcd}, - /* 193 */ {4, 0.13170967, 0x52b36301, 0x8c394d1d}, - /* 194 */ {4, 0.13158046, 0x546d9810, 0x841e0215}, - /* 195 */ {4, 0.13145216, 0x562eb151, 0x7c3784f8}, - /* 196 */ {4, 0.13132477, 0x57f6c100, 0x74843b1e}, - /* 197 */ {4, 0.13119827, 0x59c5d971, 0x6d02985d}, - /* 198 */ {4, 0.13107265, 0x5b9c0d10, 0x65b11e6e}, - /* 199 */ {4, 0.13094791, 0x5d796e61, 0x5e8e5c64}, - /* 200 */ {4, 0.13082402, 0x5f5e1000, 0x5798ee23}, - /* 201 */ {4, 0.13070099, 0x614a04a1, 0x50cf7bde}, - /* 202 */ {4, 0.13057879, 0x633d5f10, 0x4a30b99b}, - /* 203 */ {4, 0.13045743, 0x65383231, 0x43bb66bd}, - /* 204 */ {4, 0.13033688, 0x673a9100, 0x3d6e4d94}, - /* 205 */ {4, 0.13021715, 0x69448e91, 0x374842ee}, - /* 206 */ {4, 0.13009822, 0x6b563e10, 0x314825b0}, - /* 207 */ {4, 0.12998007, 0x6d6fb2c1, 0x2b6cde75}, - /* 208 */ {4, 0.12986271, 0x6f910000, 0x25b55f2e}, - /* 209 */ {4, 0.12974613, 0x71ba3941, 0x2020a2c5}, - /* 210 */ {4, 0.12963031, 0x73eb7210, 0x1aadaccb}, - /* 211 */ {4, 0.12951524, 0x7624be11, 0x155b891f}, - /* 212 */ {4, 0.12940092, 0x78663100, 0x10294ba2}, - /* 213 */ {4, 0.12928734, 0x7aafdeb1, 0xb160fe9}, - /* 214 */ {4, 0.12917448, 0x7d01db10, 0x620f8f6}, - /* 215 */ {4, 0.12906235, 0x7f5c3a21, 0x14930ef}, - /* 216 */ {4, 0.12895094, 0x81bf1000, 0xf91bd1b6}, - /* 217 */ {4, 0.12884022, 0x842a70e1, 0xefdcb0c7}, - /* 218 */ {4, 0.12873021, 0x869e7110, 0xe6d37b2a}, - /* 219 */ {4, 0.12862089, 0x891b24f1, 0xddfeb94a}, - /* 220 */ {4, 0.12851224, 0x8ba0a100, 0xd55cff6e}, - /* 221 */ {4, 0.12840428, 0x8e2ef9d1, 0xcceced50}, - /* 222 */ {4, 0.12829698, 0x90c64410, 0xc4ad2db2}, - /* 223 */ {4, 0.12819034, 0x93669481, 0xbc9c75f9}, - /* 224 */ {4, 0.12808435, 0x96100000, 0xb4b985cf}, - /* 225 */ {4, 0.12797901, 0x98c29b81, 0xad0326c2}, - /* 226 */ {4, 0.12787431, 0x9b7e7c10, 0xa5782bef}, - /* 227 */ {4, 0.12777024, 0x9e43b6d1, 0x9e1771a9}, - /* 228 */ {4, 0.12766680, 0xa1126100, 0x96dfdd2a}, - /* 229 */ {4, 0.12756398, 0xa3ea8ff1, 0x8fd05c41}, - /* 
230 */ {4, 0.12746176, 0xa6cc5910, 0x88e7e509}, - /* 231 */ {4, 0.12736016, 0xa9b7d1e1, 0x8225759d}, - /* 232 */ {4, 0.12725915, 0xacad1000, 0x7b8813d3}, - /* 233 */ {4, 0.12715874, 0xafac2921, 0x750eccf9}, - /* 234 */ {4, 0.12705891, 0xb2b53310, 0x6eb8b595}, - /* 235 */ {4, 0.12695967, 0xb5c843b1, 0x6884e923}, - /* 236 */ {4, 0.12686100, 0xb8e57100, 0x627289db}, - /* 237 */ {4, 0.12676290, 0xbc0cd111, 0x5c80c07b}, - /* 238 */ {4, 0.12666537, 0xbf3e7a10, 0x56aebc07}, - /* 239 */ {4, 0.12656839, 0xc27a8241, 0x50fbb19b}, - /* 240 */ {4, 0.12647197, 0xc5c10000, 0x4b66dc33}, - /* 241 */ {4, 0.12637609, 0xc91209c1, 0x45ef7c7c}, - /* 242 */ {4, 0.12628075, 0xcc6db610, 0x4094d8a3}, - /* 243 */ {4, 0.12618595, 0xcfd41b91, 0x3b563c24}, - /* 244 */ {4, 0.12609168, 0xd3455100, 0x3632f7a5}, - /* 245 */ {4, 0.12599794, 0xd6c16d31, 0x312a60c3}, - /* 246 */ {4, 0.12590471, 0xda488710, 0x2c3bd1f0}, - /* 247 */ {4, 0.12581200, 0xdddab5a1, 0x2766aa45}, - /* 248 */ {4, 0.12571980, 0xe1781000, 0x22aa4d5f}, - /* 249 */ {4, 0.12562811, 0xe520ad61, 0x1e06233c}, - /* 250 */ {4, 0.12553692, 0xe8d4a510, 0x19799812}, - /* 251 */ {4, 0.12544622, 0xec940e71, 0x15041c33}, - /* 252 */ {4, 0.12535601, 0xf05f0100, 0x10a523e5}, - /* 253 */ {4, 0.12526629, 0xf4359451, 0xc5c2749}, - /* 254 */ {4, 0.12517705, 0xf817e010, 0x828a237}, - /* 255 */ {4, 0.12508829, 0xfc05fc01, 0x40a1423}, + /* 2 */ {32, 1.0000000000000000, 0x1, 0x0}, + /* 3 */ {20, 0.6309297535714575, 0xcfd41b91, 0x3b563c24}, + /* 4 */ {16, 0.5000000000000000, 0x2, 0x0}, + /* 5 */ {13, 0.4306765580733931, 0x48c27395, 0xc25c2684}, + /* 6 */ {12, 0.3868528072345416, 0x81bf1000, 0xf91bd1b6}, + /* 7 */ {11, 0.3562071871080222, 0x75db9c97, 0x1607a2cb}, + /* 8 */ {10, 0.3333333333333334, 0x3, 0x0}, + /* 9 */ {10, 0.3154648767857287, 0xcfd41b91, 0x3b563c24}, + /* 10 */ {9, 0.3010299956639811, 0x3b9aca00, 0x12e0be82}, + /* 11 */ {9, 0.2890648263178878, 0x8c8b6d2b, 0xd24cde04}, + /* 12 */ {8, 0.2789429456511298, 0x19a10000, 0x3fa39ab5}, + /* 13 */ 
{8, 0.2702381544273197, 0x309f1021, 0x50f8ac5f}, + /* 14 */ {8, 0.2626495350371936, 0x57f6c100, 0x74843b1e}, + /* 15 */ {8, 0.2559580248098155, 0x98c29b81, 0xad0326c2}, + /* 16 */ {8, 0.2500000000000000, 0x4, 0x0}, + /* 17 */ {7, 0.2446505421182260, 0x18754571, 0x4ef0b6bd}, + /* 18 */ {7, 0.2398124665681315, 0x247dbc80, 0xc0fc48a1}, + /* 19 */ {7, 0.2354089133666382, 0x3547667b, 0x33838942}, + /* 20 */ {7, 0.2313782131597592, 0x4c4b4000, 0xad7f29ab}, + /* 21 */ {7, 0.2276702486969530, 0x6b5a6e1d, 0x313c3d15}, + /* 22 */ {7, 0.2242438242175754, 0x94ace180, 0xb8cca9e0}, + /* 23 */ {7, 0.2210647294575037, 0xcaf18367, 0x42ed6de9}, + /* 24 */ {6, 0.2181042919855316, 0xb640000, 0x67980e0b}, + /* 25 */ {6, 0.2153382790366965, 0xe8d4a51, 0x19799812}, + /* 26 */ {6, 0.2127460535533632, 0x1269ae40, 0xbce85396}, + /* 27 */ {6, 0.2103099178571525, 0x17179149, 0x62c103a9}, + /* 28 */ {6, 0.2080145976765095, 0x1cb91000, 0x1d353d43}, + /* 29 */ {6, 0.2058468324604344, 0x23744899, 0xce1decea}, + /* 30 */ {6, 0.2037950470905062, 0x2b73a840, 0x790fc511}, + /* 31 */ {6, 0.2018490865820999, 0x34e63b41, 0x35b865a0}, + /* 32 */ {6, 0.2000000000000000, 0x5, 0x0}, + /* 33 */ {6, 0.1982398631705605, 0x4cfa3cc1, 0xa9aed1b3}, + /* 34 */ {6, 0.1965616322328226, 0x5c13d840, 0x63dfc229}, + /* 35 */ {6, 0.1949590218937863, 0x6d91b519, 0x2b0fee30}, + /* 36 */ {6, 0.1934264036172708, 0x81bf1000, 0xf91bd1b6}, + /* 37 */ {6, 0.1919587200065601, 0x98ede0c9, 0xac89c3a9}, + /* 38 */ {6, 0.1905514124267734, 0xb3773e40, 0x6d2c32fe}, + /* 39 */ {6, 0.1892003595168700, 0xd1bbc4d1, 0x387907c9}, + /* 40 */ {6, 0.1879018247091076, 0xf4240000, 0xc6f7a0b}, + /* 41 */ {5, 0.1866524112389434, 0x6e7d349, 0x28928154}, + /* 42 */ {5, 0.1854490234153689, 0x7ca30a0, 0x6e8629d}, + /* 43 */ {5, 0.1842888331487062, 0x8c32bbb, 0xd373dca0}, + /* 44 */ {5, 0.1831692509136336, 0x9d46c00, 0xa0b17895}, + /* 45 */ {5, 0.1820879004699383, 0xaffacfd, 0x746811a5}, + /* 46 */ {5, 0.1810425967800402, 0xc46bee0, 0x4da6500f}, + /* 47 
*/ {5, 0.1800313266566926, 0xdab86ef, 0x2ba23582}, + /* 48 */ {5, 0.1790522317510414, 0xf300000, 0xdb20a88}, + /* 49 */ {5, 0.1781035935540111, 0x10d63af1, 0xe68d5ce4}, + /* 50 */ {5, 0.1771838201355579, 0x12a05f20, 0xb7cdfd9d}, + /* 51 */ {5, 0.1762914343888821, 0x1490aae3, 0x8e583933}, + /* 52 */ {5, 0.1754250635819545, 0x16a97400, 0x697cc3ea}, + /* 53 */ {5, 0.1745834300480449, 0x18ed2825, 0x48a5ca6c}, + /* 54 */ {5, 0.1737653428714400, 0x1b5e4d60, 0x2b52db16}, + /* 55 */ {5, 0.1729696904450771, 0x1dff8297, 0x111586a6}, + /* 56 */ {5, 0.1721954337940981, 0x20d38000, 0xf31d2b36}, + /* 57 */ {5, 0.1714416005739134, 0x23dd1799, 0xc8d76d19}, + /* 58 */ {5, 0.1707072796637201, 0x271f35a0, 0xa2cb1eb4}, + /* 59 */ {5, 0.1699916162869140, 0x2a9ce10b, 0x807c3ec3}, + /* 60 */ {5, 0.1692938075987814, 0x2e593c00, 0x617ec8bf}, + /* 61 */ {5, 0.1686130986895011, 0x3257844d, 0x45746cbe}, + /* 62 */ {5, 0.1679487789570419, 0x369b13e0, 0x2c0aa273}, + /* 63 */ {5, 0.1673001788101741, 0x3b27613f, 0x14f90805}, + /* 64 */ {5, 0.1666666666666667, 0x6, 0x0}, + /* 65 */ {5, 0.1660476462159378, 0x4528a141, 0xd9cf0829}, + /* 66 */ {5, 0.1654425539190583, 0x4aa51420, 0xb6fc4841}, + /* 67 */ {5, 0.1648508567221604, 0x50794633, 0x973054cb}, + /* 68 */ {5, 0.1642720499620502, 0x56a94400, 0x7a1dbe4b}, + /* 69 */ {5, 0.1637056554452156, 0x5d393975, 0x5f7fcd7f}, + /* 70 */ {5, 0.1631512196835108, 0x642d7260, 0x47196c84}, + /* 71 */ {5, 0.1626083122716341, 0x6b8a5ae7, 0x30b43635}, + /* 72 */ {5, 0.1620765243931223, 0x73548000, 0x1c1fa5f6}, + /* 73 */ {5, 0.1615554674429964, 0x7b908fe9, 0x930634a}, + /* 74 */ {5, 0.1610447717564445, 0x84435aa0, 0xef7f4a3c}, + /* 75 */ {5, 0.1605440854340214, 0x8d71d25b, 0xcf5552d2}, + /* 76 */ {5, 0.1600530732548213, 0x97210c00, 0xb1a47c8e}, + /* 77 */ {5, 0.1595714156699382, 0xa1563f9d, 0x9634b43e}, + /* 78 */ {5, 0.1590988078692941, 0xac16c8e0, 0x7cd3817d}, + /* 79 */ {5, 0.1586349589155960, 0xb768278f, 0x65536761}, + /* 80 */ {5, 0.1581795909397823, 
0xc3500000, 0x4f8b588e}, + /* 81 */ {5, 0.1577324383928644, 0xcfd41b91, 0x3b563c24}, + /* 82 */ {5, 0.1572932473495469, 0xdcfa6920, 0x28928154}, + /* 83 */ {5, 0.1568617748594410, 0xeac8fd83, 0x1721bfb0}, + /* 84 */ {5, 0.1564377883420716, 0xf9461400, 0x6e8629d}, + /* 85 */ {4, 0.1560210650222250, 0x31c84b1, 0x491cc17c}, + /* 86 */ {4, 0.1556113914024940, 0x342ab10, 0x3a11d83b}, + /* 87 */ {4, 0.1552085627701551, 0x36a2c21, 0x2be074cd}, + /* 88 */ {4, 0.1548123827357682, 0x3931000, 0x1e7a02e7}, + /* 89 */ {4, 0.1544226628011101, 0x3bd5ee1, 0x11d10edd}, + /* 90 */ {4, 0.1540392219542636, 0x3e92110, 0x5d92c68}, + /* 91 */ {4, 0.1536618862898642, 0x4165ef1, 0xf50dbfb2}, + /* 92 */ {4, 0.1532904886526781, 0x4452100, 0xdf9f1316}, + /* 93 */ {4, 0.1529248683028321, 0x4756fd1, 0xcb52a684}, + /* 94 */ {4, 0.1525648706011593, 0x4a75410, 0xb8163e97}, + /* 95 */ {4, 0.1522103467132434, 0x4dad681, 0xa5d8f269}, + /* 96 */ {4, 0.1518611533308632, 0x5100000, 0x948b0fcd}, + /* 97 */ {4, 0.1515171524096389, 0x546d981, 0x841e0215}, + /* 98 */ {4, 0.1511782109217764, 0x57f6c10, 0x74843b1e}, + /* 99 */ {4, 0.1508442006228941, 0x5b9c0d1, 0x65b11e6e}, + /* 100 */ {4, 0.1505149978319906, 0x5f5e100, 0x5798ee23}, + /* 101 */ {4, 0.1501904832236879, 0x633d5f1, 0x4a30b99b}, + /* 102 */ {4, 0.1498705416319474, 0x673a910, 0x3d6e4d94}, + /* 103 */ {4, 0.1495550618645152, 0x6b563e1, 0x314825b0}, + /* 104 */ {4, 0.1492439365274121, 0x6f91000, 0x25b55f2e}, + /* 105 */ {4, 0.1489370618588283, 0x73eb721, 0x1aadaccb}, + /* 106 */ {4, 0.1486343375718350, 0x7866310, 0x10294ba2}, + /* 107 */ {4, 0.1483356667053617, 0x7d01db1, 0x620f8f6}, + /* 108 */ {4, 0.1480409554829326, 0x81bf100, 0xf91bd1b6}, + /* 109 */ {4, 0.1477501131786861, 0x869e711, 0xe6d37b2a}, + /* 110 */ {4, 0.1474630519902391, 0x8ba0a10, 0xd55cff6e}, + /* 111 */ {4, 0.1471796869179852, 0x90c6441, 0xc4ad2db2}, + /* 112 */ {4, 0.1468999356504447, 0x9610000, 0xb4b985cf}, + /* 113 */ {4, 0.1466237184553111, 0x9b7e7c1, 0xa5782bef}, + /* 114 */ 
{4, 0.1463509580758620, 0xa112610, 0x96dfdd2a}, + /* 115 */ {4, 0.1460815796324244, 0xa6cc591, 0x88e7e509}, + /* 116 */ {4, 0.1458155105286054, 0xacad100, 0x7b8813d3}, + /* 117 */ {4, 0.1455526803620167, 0xb2b5331, 0x6eb8b595}, + /* 118 */ {4, 0.1452930208392428, 0xb8e5710, 0x627289db}, + /* 119 */ {4, 0.1450364656948130, 0xbf3e7a1, 0x56aebc07}, + /* 120 */ {4, 0.1447829506139581, 0xc5c1000, 0x4b66dc33}, + /* 121 */ {4, 0.1445324131589439, 0xcc6db61, 0x4094d8a3}, + /* 122 */ {4, 0.1442847926987864, 0xd345510, 0x3632f7a5}, + /* 123 */ {4, 0.1440400303421672, 0xda48871, 0x2c3bd1f0}, + /* 124 */ {4, 0.1437980688733775, 0xe178100, 0x22aa4d5f}, + /* 125 */ {4, 0.1435588526911310, 0xe8d4a51, 0x19799812}, + /* 126 */ {4, 0.1433223277500932, 0xf05f010, 0x10a523e5}, + /* 127 */ {4, 0.1430884415049874, 0xf817e01, 0x828a237}, + /* 128 */ {4, 0.1428571428571428, 0x7, 0x0}, + /* 129 */ {4, 0.1426283821033600, 0x10818201, 0xf04ec452}, + /* 130 */ {4, 0.1424021108869747, 0x11061010, 0xe136444a}, + /* 131 */ {4, 0.1421782821510107, 0x118db651, 0xd2af9589}, + /* 132 */ {4, 0.1419568500933153, 0x12188100, 0xc4b42a83}, + /* 133 */ {4, 0.1417377701235801, 0x12a67c71, 0xb73dccf5}, + /* 134 */ {4, 0.1415209988221527, 0x1337b510, 0xaa4698c5}, + /* 135 */ {4, 0.1413064939005528, 0x13cc3761, 0x9dc8f729}, + /* 136 */ {4, 0.1410942141636095, 0x14641000, 0x91bf9a30}, + /* 137 */ {4, 0.1408841194731412, 0x14ff4ba1, 0x86257887}, + /* 138 */ {4, 0.1406761707131039, 0x159df710, 0x7af5c98c}, + /* 139 */ {4, 0.1404703297561400, 0x16401f31, 0x702c01a0}, + /* 140 */ {4, 0.1402665594314587, 0x16e5d100, 0x65c3ceb1}, + /* 141 */ {4, 0.1400648234939879, 0x178f1991, 0x5bb91502}, + /* 142 */ {4, 0.1398650865947379, 0x183c0610, 0x5207ec23}, + /* 143 */ {4, 0.1396673142523192, 0x18eca3c1, 0x48ac9c19}, + /* 144 */ {4, 0.1394714728255649, 0x19a10000, 0x3fa39ab5}, + /* 145 */ {4, 0.1392775294872041, 0x1a592841, 0x36e98912}, + /* 146 */ {4, 0.1390854521985406, 0x1b152a10, 0x2e7b3140}, + /* 147 */ {4, 
0.1388952096850913, 0x1bd51311, 0x2655840b}, + /* 148 */ {4, 0.1387067714131417, 0x1c98f100, 0x1e7596ea}, + /* 149 */ {4, 0.1385201075671774, 0x1d60d1b1, 0x16d8a20d}, + /* 150 */ {4, 0.1383351890281539, 0x1e2cc310, 0xf7bfe87}, + /* 151 */ {4, 0.1381519873525671, 0x1efcd321, 0x85d2492}, + /* 152 */ {4, 0.1379704747522905, 0x1fd11000, 0x179a9f4}, + /* 153 */ {4, 0.1377906240751463, 0x20a987e1, 0xf59e80eb}, + /* 154 */ {4, 0.1376124087861776, 0x21864910, 0xe8b768db}, + /* 155 */ {4, 0.1374358029495937, 0x226761f1, 0xdc39d6d5}, + /* 156 */ {4, 0.1372607812113589, 0x234ce100, 0xd021c5d1}, + /* 157 */ {4, 0.1370873187823978, 0x2436d4d1, 0xc46b5e37}, + /* 158 */ {4, 0.1369153914223921, 0x25254c10, 0xb912f39c}, + /* 159 */ {4, 0.1367449754241439, 0x26185581, 0xae150294}, + /* 160 */ {4, 0.1365760475984821, 0x27100000, 0xa36e2eb1}, + /* 161 */ {4, 0.1364085852596902, 0x280c5a81, 0x991b4094}, + /* 162 */ {4, 0.1362425662114337, 0x290d7410, 0x8f19241e}, + /* 163 */ {4, 0.1360779687331669, 0x2a135bd1, 0x8564e6b7}, + /* 164 */ {4, 0.1359147715670014, 0x2b1e2100, 0x7bfbb5b4}, + /* 165 */ {4, 0.1357529539050150, 0x2c2dd2f1, 0x72dadcc8}, + /* 166 */ {4, 0.1355924953769863, 0x2d428110, 0x69ffc498}, + /* 167 */ {4, 0.1354333760385373, 0x2e5c3ae1, 0x6167f154}, + /* 168 */ {4, 0.1352755763596663, 0x2f7b1000, 0x5911016e}, + /* 169 */ {4, 0.1351190772136599, 0x309f1021, 0x50f8ac5f}, + /* 170 */ {4, 0.1349638598663645, 0x31c84b10, 0x491cc17c}, + /* 171 */ {4, 0.1348099059658079, 0x32f6d0b1, 0x417b26d8}, + /* 172 */ {4, 0.1346571975321549, 0x342ab100, 0x3a11d83b}, + /* 173 */ {4, 0.1345057169479844, 0x3563fc11, 0x32dee622}, + /* 174 */ {4, 0.1343554469488779, 0x36a2c210, 0x2be074cd}, + /* 175 */ {4, 0.1342063706143054, 0x37e71341, 0x2514bb58}, + /* 176 */ {4, 0.1340584713587980, 0x39310000, 0x1e7a02e7}, + /* 177 */ {4, 0.1339117329233981, 0x3a8098c1, 0x180ea5d0}, + /* 178 */ {4, 0.1337661393673756, 0x3bd5ee10, 0x11d10edd}, + /* 179 */ {4, 0.1336216750601996, 0x3d311091, 0xbbfb88e}, + /* 
180 */ {4, 0.1334783246737591, 0x3e921100, 0x5d92c68}, + /* 181 */ {4, 0.1333360731748201, 0x3ff90031, 0x1c024c}, + /* 182 */ {4, 0.1331949058177136, 0x4165ef10, 0xf50dbfb2}, + /* 183 */ {4, 0.1330548081372441, 0x42d8eea1, 0xea30efa3}, + /* 184 */ {4, 0.1329157659418126, 0x44521000, 0xdf9f1316}, + /* 185 */ {4, 0.1327777653067443, 0x45d16461, 0xd555c0c9}, + /* 186 */ {4, 0.1326407925678156, 0x4756fd10, 0xcb52a684}, + /* 187 */ {4, 0.1325048343149731, 0x48e2eb71, 0xc193881f}, + /* 188 */ {4, 0.1323698773862368, 0x4a754100, 0xb8163e97}, + /* 189 */ {4, 0.1322359088617821, 0x4c0e0f51, 0xaed8b724}, + /* 190 */ {4, 0.1321029160581950, 0x4dad6810, 0xa5d8f269}, + /* 191 */ {4, 0.1319708865228925, 0x4f535d01, 0x9d15039d}, + /* 192 */ {4, 0.1318398080287045, 0x51000000, 0x948b0fcd}, + /* 193 */ {4, 0.1317096685686114, 0x52b36301, 0x8c394d1d}, + /* 194 */ {4, 0.1315804563506306, 0x546d9810, 0x841e0215}, + /* 195 */ {4, 0.1314521597928493, 0x562eb151, 0x7c3784f8}, + /* 196 */ {4, 0.1313247675185968, 0x57f6c100, 0x74843b1e}, + /* 197 */ {4, 0.1311982683517524, 0x59c5d971, 0x6d02985d}, + /* 198 */ {4, 0.1310726513121843, 0x5b9c0d10, 0x65b11e6e}, + /* 199 */ {4, 0.1309479056113158, 0x5d796e61, 0x5e8e5c64}, + /* 200 */ {4, 0.1308240206478128, 0x5f5e1000, 0x5798ee23}, + /* 201 */ {4, 0.1307009860033912, 0x614a04a1, 0x50cf7bde}, + /* 202 */ {4, 0.1305787914387386, 0x633d5f10, 0x4a30b99b}, + /* 203 */ {4, 0.1304574268895465, 0x65383231, 0x43bb66bd}, + /* 204 */ {4, 0.1303368824626505, 0x673a9100, 0x3d6e4d94}, + /* 205 */ {4, 0.1302171484322746, 0x69448e91, 0x374842ee}, + /* 206 */ {4, 0.1300982152363760, 0x6b563e10, 0x314825b0}, + /* 207 */ {4, 0.1299800734730872, 0x6d6fb2c1, 0x2b6cde75}, + /* 208 */ {4, 0.1298627138972530, 0x6f910000, 0x25b55f2e}, + /* 209 */ {4, 0.1297461274170591, 0x71ba3941, 0x2020a2c5}, + /* 210 */ {4, 0.1296303050907487, 0x73eb7210, 0x1aadaccb}, + /* 211 */ {4, 0.1295152381234257, 0x7624be11, 0x155b891f}, + /* 212 */ {4, 0.1294009178639407, 0x78663100, 
0x10294ba2}, + /* 213 */ {4, 0.1292873358018581, 0x7aafdeb1, 0xb160fe9}, + /* 214 */ {4, 0.1291744835645007, 0x7d01db10, 0x620f8f6}, + /* 215 */ {4, 0.1290623529140715, 0x7f5c3a21, 0x14930ef}, + /* 216 */ {4, 0.1289509357448472, 0x81bf1000, 0xf91bd1b6}, + /* 217 */ {4, 0.1288402240804449, 0x842a70e1, 0xefdcb0c7}, + /* 218 */ {4, 0.1287302100711567, 0x869e7110, 0xe6d37b2a}, + /* 219 */ {4, 0.1286208859913518, 0x891b24f1, 0xddfeb94a}, + /* 220 */ {4, 0.1285122442369443, 0x8ba0a100, 0xd55cff6e}, + /* 221 */ {4, 0.1284042773229231, 0x8e2ef9d1, 0xcceced50}, + /* 222 */ {4, 0.1282969778809442, 0x90c64410, 0xc4ad2db2}, + /* 223 */ {4, 0.1281903386569819, 0x93669481, 0xbc9c75f9}, + /* 224 */ {4, 0.1280843525090381, 0x96100000, 0xb4b985cf}, + /* 225 */ {4, 0.1279790124049077, 0x98c29b81, 0xad0326c2}, + /* 226 */ {4, 0.1278743114199984, 0x9b7e7c10, 0xa5782bef}, + /* 227 */ {4, 0.1277702427352035, 0x9e43b6d1, 0x9e1771a9}, + /* 228 */ {4, 0.1276667996348261, 0xa1126100, 0x96dfdd2a}, + /* 229 */ {4, 0.1275639755045533, 0xa3ea8ff1, 0x8fd05c41}, + /* 230 */ {4, 0.1274617638294791, 0xa6cc5910, 0x88e7e509}, + /* 231 */ {4, 0.1273601581921741, 0xa9b7d1e1, 0x8225759d}, + /* 232 */ {4, 0.1272591522708010, 0xacad1000, 0x7b8813d3}, + /* 233 */ {4, 0.1271587398372755, 0xafac2921, 0x750eccf9}, + /* 234 */ {4, 0.1270589147554692, 0xb2b53310, 0x6eb8b595}, + /* 235 */ {4, 0.1269596709794558, 0xb5c843b1, 0x6884e923}, + /* 236 */ {4, 0.1268610025517973, 0xb8e57100, 0x627289db}, + /* 237 */ {4, 0.1267629036018709, 0xbc0cd111, 0x5c80c07b}, + /* 238 */ {4, 0.1266653683442337, 0xbf3e7a10, 0x56aebc07}, + /* 239 */ {4, 0.1265683910770258, 0xc27a8241, 0x50fbb19b}, + /* 240 */ {4, 0.1264719661804097, 0xc5c10000, 0x4b66dc33}, + /* 241 */ {4, 0.1263760881150453, 0xc91209c1, 0x45ef7c7c}, + /* 242 */ {4, 0.1262807514205999, 0xcc6db610, 0x4094d8a3}, + /* 243 */ {4, 0.1261859507142915, 0xcfd41b91, 0x3b563c24}, + /* 244 */ {4, 0.1260916806894653, 0xd3455100, 0x3632f7a5}, + /* 245 */ {4, 0.1259979361142023, 
0xd6c16d31, 0x312a60c3}, + /* 246 */ {4, 0.1259047118299582, 0xda488710, 0x2c3bd1f0}, + /* 247 */ {4, 0.1258120027502338, 0xdddab5a1, 0x2766aa45}, + /* 248 */ {4, 0.1257198038592741, 0xe1781000, 0x22aa4d5f}, + /* 249 */ {4, 0.1256281102107963, 0xe520ad61, 0x1e06233c}, + /* 250 */ {4, 0.1255369169267456, 0xe8d4a510, 0x19799812}, + /* 251 */ {4, 0.1254462191960791, 0xec940e71, 0x15041c33}, + /* 252 */ {4, 0.1253560122735751, 0xf05f0100, 0x10a523e5}, + /* 253 */ {4, 0.1252662914786691, 0xf4359451, 0xc5c2749}, + /* 254 */ {4, 0.1251770521943144, 0xf817e010, 0x828a237}, + /* 255 */ {4, 0.1250882898658681, 0xfc05fc01, 0x40a1423}, }; #endif #if BITS_PER_MP_LIMB == 64 @@ -291,259 +292,259 @@ const struct bases __mp_bases[256] = { /* 0 */ {0, 0.0, 0, 0}, /* 1 */ {0, 1e38, 0, 0}, - /* 2 */ {64, 1.00000000, 0x1, 0x0}, - /* 3 */ {40, 0.63092975, 0xa8b8b452291fe821L, 0x846d550e37b5063dL}, - /* 4 */ {32, 0.50000000, 0x2L, 0x0L}, - /* 5 */ {27, 0.43067656, 0x6765c793fa10079dL, 0x3ce9a36f23c0fc90L}, - /* 6 */ {24, 0.38685281, 0x41c21cb8e1000000L, 0xf24f62335024a295L}, - /* 7 */ {22, 0.35620719, 0x3642798750226111L, 0x2df495ccaa57147bL}, - /* 8 */ {21, 0.33333333, 0x3L, 0x0L}, - /* 9 */ {20, 0.31546488, 0xa8b8b452291fe821L, 0x846d550e37b5063dL}, - /* 10 */ {19, 0.30103000, 0x8ac7230489e80000L, 0xd83c94fb6d2ac34aL}, - /* 11 */ {18, 0.28906483, 0x4d28cb56c33fa539L, 0xa8adf7ae45e7577bL}, - /* 12 */ {17, 0.27894295, 0x1eca170c00000000L, 0xa10c2bec5da8f8fL}, - /* 13 */ {17, 0.27023815, 0x780c7372621bd74dL, 0x10f4becafe412ec3L}, - /* 14 */ {16, 0.26264954, 0x1e39a5057d810000L, 0xf08480f672b4e86L}, - /* 15 */ {16, 0.25595802, 0x5b27ac993df97701L, 0x6779c7f90dc42f48L}, - /* 16 */ {16, 0.25000000, 0x4L, 0x0L}, - /* 17 */ {15, 0.24465054, 0x27b95e997e21d9f1L, 0x9c71e11bab279323L}, - /* 18 */ {15, 0.23981247, 0x5da0e1e53c5c8000L, 0x5dfaa697ec6f6a1cL}, - /* 19 */ {15, 0.23540891, 0xd2ae3299c1c4aedbL, 0x3711783f6be7e9ecL}, - /* 20 */ {14, 0.23137821, 0x16bcc41e90000000L, 0x6849b86a12b9b01eL}, - 
/* 21 */ {14, 0.22767025, 0x2d04b7fdd9c0ef49L, 0x6bf097ba5ca5e239L}, - /* 22 */ {14, 0.22424382, 0x5658597bcaa24000L, 0x7b8015c8d7af8f08L}, - /* 23 */ {14, 0.22106473, 0xa0e2073737609371L, 0x975a24b3a3151b38L}, - /* 24 */ {13, 0.21810429, 0xc29e98000000000L, 0x50bd367972689db1L}, - /* 25 */ {13, 0.21533828, 0x14adf4b7320334b9L, 0x8c240c4aecb13bb5L}, - /* 26 */ {13, 0.21274605, 0x226ed36478bfa000L, 0xdbd2e56854e118c9L}, - /* 27 */ {13, 0.21030992, 0x383d9170b85ff80bL, 0x2351ffcaa9c7c4aeL}, - /* 28 */ {13, 0.20801460, 0x5a3c23e39c000000L, 0x6b24188ca33b0636L}, - /* 29 */ {13, 0.20584683, 0x8e65137388122bcdL, 0xcc3dceaf2b8ba99dL}, - /* 30 */ {13, 0.20379505, 0xdd41bb36d259e000L, 0x2832e835c6c7d6b6L}, - /* 31 */ {12, 0.20184909, 0xaee5720ee830681L, 0x76b6aa272e1873c5L}, - /* 32 */ {12, 0.20000000, 0x5L, 0x0L}, - /* 33 */ {12, 0.19823986, 0x172588ad4f5f0981L, 0x61eaf5d402c7bf4fL}, - /* 34 */ {12, 0.19656163, 0x211e44f7d02c1000L, 0xeeb658123ffb27ecL}, - /* 35 */ {12, 0.19495902, 0x2ee56725f06e5c71L, 0x5d5e3762e6fdf509L}, - /* 36 */ {12, 0.19342640, 0x41c21cb8e1000000L, 0xf24f62335024a295L}, - /* 37 */ {12, 0.19195872, 0x5b5b57f8a98a5dd1L, 0x66ae7831762efb6fL}, - /* 38 */ {12, 0.19055141, 0x7dcff8986ea31000L, 0x47388865a00f544L}, - /* 39 */ {12, 0.18920036, 0xabd4211662a6b2a1L, 0x7d673c33a123b54cL}, - /* 40 */ {12, 0.18790182, 0xe8d4a51000000000L, 0x19799812dea11197L}, - /* 41 */ {11, 0.18665241, 0x7a32956ad081b79L, 0xc27e62e0686feaeL}, - /* 42 */ {11, 0.18544902, 0x9f49aaff0e86800L, 0x9b6e7507064ce7c7L}, - /* 43 */ {11, 0.18428883, 0xce583bb812d37b3L, 0x3d9ac2bf66cfed94L}, - /* 44 */ {11, 0.18316925, 0x109b79a654c00000L, 0xed46bc50ce59712aL}, - /* 45 */ {11, 0.18208790, 0x1543beff214c8b95L, 0x813d97e2c89b8d46L}, - /* 46 */ {11, 0.18104260, 0x1b149a79459a3800L, 0x2e81751956af8083L}, - /* 47 */ {11, 0.18003133, 0x224edfb5434a830fL, 0xdd8e0a95e30c0988L}, - /* 48 */ {11, 0.17905223, 0x2b3fb00000000000L, 0x7ad4dd48a0b5b167L}, - /* 49 */ {11, 0.17810359, 0x3642798750226111L, 
0x2df495ccaa57147bL}, - /* 50 */ {11, 0.17718382, 0x43c33c1937564800L, 0xe392010175ee5962L}, - /* 51 */ {11, 0.17629143, 0x54411b2441c3cd8bL, 0x84eaf11b2fe7738eL}, - /* 52 */ {11, 0.17542506, 0x6851455acd400000L, 0x3a1e3971e008995dL}, - /* 53 */ {11, 0.17458343, 0x80a23b117c8feb6dL, 0xfd7a462344ffce25L}, - /* 54 */ {11, 0.17376534, 0x9dff7d32d5dc1800L, 0x9eca40b40ebcef8aL}, - /* 55 */ {11, 0.17296969, 0xc155af6faeffe6a7L, 0x52fa161a4a48e43dL}, - /* 56 */ {11, 0.17219543, 0xebb7392e00000000L, 0x1607a2cbacf930c1L}, - /* 57 */ {10, 0.17144160, 0x50633659656d971L, 0x97a014f8e3be55f1L}, - /* 58 */ {10, 0.17070728, 0x5fa8624c7fba400L, 0x568df8b76cbf212cL}, - /* 59 */ {10, 0.16999162, 0x717d9faa73c5679L, 0x20ba7c4b4e6ef492L}, - /* 60 */ {10, 0.16929381, 0x86430aac6100000L, 0xe81ee46b9ef492f5L}, - /* 61 */ {10, 0.16861310, 0x9e64d9944b57f29L, 0x9dc0d10d51940416L}, - /* 62 */ {10, 0.16794878, 0xba5ca5392cb0400L, 0x5fa8ed2f450272a5L}, - /* 63 */ {10, 0.16730018, 0xdab2ce1d022cd81L, 0x2ba9eb8c5e04e641L}, - /* 64 */ {10, 0.16666667, 0x6L, 0x0L}, - /* 65 */ {10, 0.16604765, 0x12aeed5fd3e2d281L, 0xb67759cc00287bf1L}, - /* 66 */ {10, 0.16544255, 0x15c3da1572d50400L, 0x78621feeb7f4ed33L}, - /* 67 */ {10, 0.16485086, 0x194c05534f75ee29L, 0x43d55b5f72943bc0L}, - /* 68 */ {10, 0.16427205, 0x1d56299ada100000L, 0x173decb64d1d4409L}, - /* 69 */ {10, 0.16370566, 0x21f2a089a4ff4f79L, 0xe29fb54fd6b6074fL}, - /* 70 */ {10, 0.16315122, 0x2733896c68d9a400L, 0xa1f1f5c210d54e62L}, - /* 71 */ {10, 0.16260831, 0x2d2cf2c33b533c71L, 0x6aac7f9bfafd57b2L}, - /* 72 */ {10, 0.16207652, 0x33f506e440000000L, 0x3b563c2478b72ee2L}, - /* 73 */ {10, 0.16155547, 0x3ba43bec1d062211L, 0x12b536b574e92d1bL}, - /* 74 */ {10, 0.16104477, 0x4455872d8fd4e400L, 0xdf86c03020404fa5L}, - /* 75 */ {10, 0.16054409, 0x4e2694539f2f6c59L, 0xa34adf02234eea8eL}, - /* 76 */ {10, 0.16005307, 0x5938006c18900000L, 0x6f46eb8574eb59ddL}, - /* 77 */ {10, 0.15957142, 0x65ad9912474aa649L, 0x42459b481df47cecL}, - /* 78 */ {10, 
0.15909881, 0x73ae9ff4241ec400L, 0x1b424b95d80ca505L}, - /* 79 */ {10, 0.15863496, 0x836612ee9c4ce1e1L, 0xf2c1b982203a0dacL}, - /* 80 */ {10, 0.15817959, 0x9502f90000000000L, 0xb7cdfd9d7bdbab7dL}, - /* 81 */ {10, 0.15773244, 0xa8b8b452291fe821L, 0x846d550e37b5063dL}, - /* 82 */ {10, 0.15729325, 0xbebf59a07dab4400L, 0x57931eeaf85cf64fL}, - /* 83 */ {10, 0.15686177, 0xd7540d4093bc3109L, 0x305a944507c82f47L}, - /* 84 */ {10, 0.15643779, 0xf2b96616f1900000L, 0xe007ccc9c22781aL}, - /* 85 */ {9, 0.15602107, 0x336de62af2bca35L, 0x3e92c42e000eeed4L}, - /* 86 */ {9, 0.15561139, 0x39235ec33d49600L, 0x1ebe59130db2795eL}, - /* 87 */ {9, 0.15520856, 0x3f674e539585a17L, 0x268859e90f51b89L}, - /* 88 */ {9, 0.15481238, 0x4645b6958000000L, 0xd24cde0463108cfaL}, - /* 89 */ {9, 0.15442266, 0x4dcb74afbc49c19L, 0xa536009f37adc383L}, - /* 90 */ {9, 0.15403922, 0x56064e1d18d9a00L, 0x7cea06ce1c9ace10L}, - /* 91 */ {9, 0.15366189, 0x5f04fe2cd8a39fbL, 0x58db032e72e8ba43L}, - /* 92 */ {9, 0.15329049, 0x68d74421f5c0000L, 0x388cc17cae105447L}, - /* 93 */ {9, 0.15292487, 0x738df1f6ab4827dL, 0x1b92672857620ce0L}, - /* 94 */ {9, 0.15256487, 0x7f3afbc9cfb5e00L, 0x18c6a9575c2ade4L}, - /* 95 */ {9, 0.15221035, 0x8bf187fba88f35fL, 0xd44da7da8e44b24fL}, - /* 96 */ {9, 0.15186115, 0x99c600000000000L, 0xaa2f78f1b4cc6794L}, - /* 97 */ {9, 0.15151715, 0xa8ce21eb6531361L, 0x843c067d091ee4ccL}, - /* 98 */ {9, 0.15117821, 0xb92112c1a0b6200L, 0x62005e1e913356e3L}, - /* 99 */ {9, 0.15084420, 0xcad7718b8747c43L, 0x4316eed01dedd518L}, - /* 100 */ {9, 0.15051500, 0xde0b6b3a7640000L, 0x2725dd1d243aba0eL}, - /* 101 */ {9, 0.15019048, 0xf2d8cf5fe6d74c5L, 0xddd9057c24cb54fL}, - /* 102 */ {9, 0.14987054, 0x1095d25bfa712600L, 0xedeee175a736d2a1L}, - /* 103 */ {9, 0.14955506, 0x121b7c4c3698faa7L, 0xc4699f3df8b6b328L}, - /* 104 */ {9, 0.14924394, 0x13c09e8d68000000L, 0x9ebbe7d859cb5a7cL}, - /* 105 */ {9, 0.14893706, 0x15876ccb0b709ca9L, 0x7c828b9887eb2179L}, - /* 106 */ {9, 0.14863434, 0x17723c2976da2a00L, 
0x5d652ab99001adcfL}, - /* 107 */ {9, 0.14833567, 0x198384e9c259048bL, 0x4114f1754e5d7b32L}, - /* 108 */ {9, 0.14804096, 0x1bbde41dfeec0000L, 0x274b7c902f7e0188L}, - /* 109 */ {9, 0.14775011, 0x1e241d6e3337910dL, 0xfc9e0fbb32e210cL}, - /* 110 */ {9, 0.14746305, 0x20b91cee9901ee00L, 0xf4afa3e594f8ea1fL}, - /* 111 */ {9, 0.14717969, 0x237ff9079863dfefL, 0xcd85c32e9e4437b0L}, - /* 112 */ {9, 0.14689994, 0x267bf47000000000L, 0xa9bbb147e0dd92a8L}, - /* 113 */ {9, 0.14662372, 0x29b08039fbeda7f1L, 0x8900447b70e8eb82L}, - /* 114 */ {9, 0.14635096, 0x2d213df34f65f200L, 0x6b0a92adaad5848aL}, - /* 115 */ {9, 0.14608158, 0x30d201d957a7c2d3L, 0x4f990ad8740f0ee5L}, - /* 116 */ {9, 0.14581551, 0x34c6d52160f40000L, 0x3670a9663a8d3610L}, - /* 117 */ {9, 0.14555268, 0x3903f855d8f4c755L, 0x1f5c44188057be3cL}, - /* 118 */ {9, 0.14529302, 0x3d8de5c8ec59b600L, 0xa2bea956c4e4977L}, - /* 119 */ {9, 0.14503647, 0x4269541d1ff01337L, 0xed68b23033c3637eL}, - /* 120 */ {9, 0.14478295, 0x479b38e478000000L, 0xc99cf624e50549c5L}, - /* 121 */ {9, 0.14453241, 0x4d28cb56c33fa539L, 0xa8adf7ae45e7577bL}, - /* 122 */ {9, 0.14428479, 0x5317871fa13aba00L, 0x8a5bc740b1c113e5L}, - /* 123 */ {9, 0.14404003, 0x596d2f44de9fa71bL, 0x6e6c7efb81cfbb9bL}, - /* 124 */ {9, 0.14379807, 0x602fd125c47c0000L, 0x54aba5c5cada5f10L}, - /* 125 */ {9, 0.14355885, 0x6765c793fa10079dL, 0x3ce9a36f23c0fc90L}, - /* 126 */ {9, 0.14332233, 0x6f15be069b847e00L, 0x26fb43de2c8cd2a8L}, - /* 127 */ {9, 0.14308844, 0x7746b3e82a77047fL, 0x12b94793db8486a1L}, - /* 128 */ {9, 0.14285714, 0x7L, 0x0L}, - /* 129 */ {9, 0.14262838, 0x894953f7ea890481L, 0xdd5deca404c0156dL}, - /* 130 */ {9, 0.14240211, 0x932abffea4848200L, 0xbd51373330291de0L}, - /* 131 */ {9, 0.14217828, 0x9dacb687d3d6a163L, 0x9fa4025d66f23085L}, - /* 132 */ {9, 0.14195685, 0xa8d8102a44840000L, 0x842530ee2db4949dL}, - /* 133 */ {9, 0.14173777, 0xb4b60f9d140541e5L, 0x6aa7f2766b03dc25L}, - /* 134 */ {9, 0.14152100, 0xc15065d4856e4600L, 0x53035ba7ebf32e8dL}, - /* 135 */ {9, 
0.14130649, 0xceb1363f396d23c7L, 0x3d12091fc9fb4914L}, - /* 136 */ {9, 0.14109421, 0xdce31b2488000000L, 0x28b1cb81b1ef1849L}, - /* 137 */ {9, 0.14088412, 0xebf12a24bca135c9L, 0x15c35be67ae3e2c9L}, - /* 138 */ {9, 0.14067617, 0xfbe6f8dbf88f4a00L, 0x42a17bd09be1ff0L}, - /* 139 */ {8, 0.14047033, 0x1ef156c084ce761L, 0x8bf461f03cf0bbfL}, - /* 140 */ {8, 0.14026656, 0x20c4e3b94a10000L, 0xf3fbb43f68a32d05L}, - /* 141 */ {8, 0.14006482, 0x22b0695a08ba421L, 0xd84f44c48564dc19L}, - /* 142 */ {8, 0.13986509, 0x24b4f35d7a4c100L, 0xbe58ebcce7956abeL}, - /* 143 */ {8, 0.13966731, 0x26d397284975781L, 0xa5fac463c7c134b7L}, - /* 144 */ {8, 0.13947147, 0x290d74100000000L, 0x8f19241e28c7d757L}, - /* 145 */ {8, 0.13927753, 0x2b63b3a37866081L, 0x799a6d046c0ae1aeL}, - /* 146 */ {8, 0.13908545, 0x2dd789f4d894100L, 0x6566e37d746a9e40L}, - /* 147 */ {8, 0.13889521, 0x306a35e51b58721L, 0x526887dbfb5f788fL}, - /* 148 */ {8, 0.13870677, 0x331d01712e10000L, 0x408af3382b8efd3dL}, - /* 149 */ {8, 0.13852011, 0x35f14200a827c61L, 0x2fbb374806ec05f1L}, - /* 150 */ {8, 0.13833519, 0x38e858b62216100L, 0x1fe7c0f0afce87feL}, - /* 151 */ {8, 0.13815199, 0x3c03b2c13176a41L, 0x11003d517540d32eL}, - /* 152 */ {8, 0.13797047, 0x3f44c9b21000000L, 0x2f5810f98eff0dcL}, - /* 153 */ {8, 0.13779062, 0x42ad23cef3113c1L, 0xeb72e35e7840d910L}, - /* 154 */ {8, 0.13761241, 0x463e546b19a2100L, 0xd27de19593dc3614L}, - /* 155 */ {8, 0.13743580, 0x49f9fc3f96684e1L, 0xbaf391fd3e5e6fc2L}, - /* 156 */ {8, 0.13726078, 0x4de1c9c5dc10000L, 0xa4bd38c55228c81dL}, - /* 157 */ {8, 0.13708732, 0x51f77994116d2a1L, 0x8fc5a8de8e1de782L}, - /* 158 */ {8, 0.13691539, 0x563cd6bb3398100L, 0x7bf9265bea9d3a3bL}, - /* 159 */ {8, 0.13674498, 0x5ab3bb270beeb01L, 0x69454b325983dccdL}, - /* 160 */ {8, 0.13657605, 0x5f5e10000000000L, 0x5798ee2308c39df9L}, - /* 161 */ {8, 0.13640859, 0x643dce0ec16f501L, 0x46e40ba0fa66a753L}, - /* 162 */ {8, 0.13624257, 0x6954fe21e3e8100L, 0x3717b0870b0db3a7L}, - /* 163 */ {8, 0.13607797, 0x6ea5b9755f440a1L, 
0x2825e6775d11cdebL}, - /* 164 */ {8, 0.13591477, 0x74322a1c0410000L, 0x1a01a1c09d1b4dacL}, - /* 165 */ {8, 0.13575295, 0x79fc8b6ae8a46e1L, 0xc9eb0a8bebc8f3eL}, - /* 166 */ {8, 0.13559250, 0x80072a66d512100L, 0xffe357ff59e6a004L}, - /* 167 */ {8, 0.13543338, 0x86546633b42b9c1L, 0xe7dfd1be05fa61a8L}, - /* 168 */ {8, 0.13527558, 0x8ce6b0861000000L, 0xd11ed6fc78f760e5L}, - /* 169 */ {8, 0.13511908, 0x93c08e16a022441L, 0xbb8db609dd29ebfeL}, - /* 170 */ {8, 0.13496386, 0x9ae49717f026100L, 0xa71aec8d1813d532L}, - /* 171 */ {8, 0.13480991, 0xa25577ae24c1a61L, 0x93b612a9f20fbc02L}, - /* 172 */ {8, 0.13465720, 0xaa15f068e610000L, 0x814fc7b19a67d317L}, - /* 173 */ {8, 0.13450572, 0xb228d6bf7577921L, 0x6fd9a03f2e0a4b7cL}, - /* 174 */ {8, 0.13435545, 0xba91158ef5c4100L, 0x5f4615a38d0d316eL}, - /* 175 */ {8, 0.13420637, 0xc351ad9aec0b681L, 0x4f8876863479a286L}, - /* 176 */ {8, 0.13405847, 0xcc6db6100000000L, 0x4094d8a3041b60ebL}, - /* 177 */ {8, 0.13391173, 0xd5e85d09025c181L, 0x32600b8ed883a09bL}, - /* 178 */ {8, 0.13376614, 0xdfc4e816401c100L, 0x24df8c6eb4b6d1f1L}, - /* 179 */ {8, 0.13362168, 0xea06b4c72947221L, 0x18097a8ee151acefL}, - /* 180 */ {8, 0.13347832, 0xf4b139365210000L, 0xbd48cc8ec1cd8e3L}, - /* 181 */ {8, 0.13333607, 0xffc80497d520961L, 0x3807a8d67485fbL}, - /* 182 */ {8, 0.13319491, 0x10b4ebfca1dee100L, 0xea5768860b62e8d8L}, - /* 183 */ {8, 0.13305481, 0x117492de921fc141L, 0xd54faf5b635c5005L}, - /* 184 */ {8, 0.13291577, 0x123bb2ce41000000L, 0xc14a56233a377926L}, - /* 185 */ {8, 0.13277777, 0x130a8b6157bdecc1L, 0xae39a88db7cd329fL}, - /* 186 */ {8, 0.13264079, 0x13e15dede0e8a100L, 0x9c10bde69efa7ab6L}, - /* 187 */ {8, 0.13250483, 0x14c06d941c0ca7e1L, 0x8ac36c42a2836497L}, - /* 188 */ {8, 0.13236988, 0x15a7ff487a810000L, 0x7a463c8b84f5ef67L}, - /* 189 */ {8, 0.13223591, 0x169859ddc5c697a1L, 0x6a8e5f5ad090fd4bL}, - /* 190 */ {8, 0.13210292, 0x1791c60f6fed0100L, 0x5b91a2943596fc56L}, - /* 191 */ {8, 0.13197089, 0x18948e8c0e6fba01L, 0x4d4667b1c468e8f0L}, - /* 192 */ 
{8, 0.13183981, 0x19a1000000000000L, 0x3fa39ab547994dafL}, - /* 193 */ {8, 0.13170967, 0x1ab769203dafc601L, 0x32a0a9b2faee1e2aL}, - /* 194 */ {8, 0.13158046, 0x1bd81ab557f30100L, 0x26357ceac0e96962L}, - /* 195 */ {8, 0.13145216, 0x1d0367a69fed1ba1L, 0x1a5a6f65caa5859eL}, - /* 196 */ {8, 0.13132477, 0x1e39a5057d810000L, 0xf08480f672b4e86L}, - /* 197 */ {8, 0.13119827, 0x1f7b2a18f29ac3e1L, 0x4383340615612caL}, - /* 198 */ {8, 0.13107265, 0x20c850694c2aa100L, 0xf3c77969ee4be5a2L}, - /* 199 */ {8, 0.13094791, 0x222173cc014980c1L, 0xe00993cc187c5ec9L}, - /* 200 */ {8, 0.13082402, 0x2386f26fc1000000L, 0xcd2b297d889bc2b6L}, - /* 201 */ {8, 0.13070099, 0x24f92ce8af296d41L, 0xbb214d5064862b22L}, - /* 202 */ {8, 0.13057879, 0x2678863cd0ece100L, 0xa9e1a7ca7ea10e20L}, - /* 203 */ {8, 0.13045743, 0x280563f0a9472d61L, 0x99626e72b39ea0cfL}, - /* 204 */ {8, 0.13033688, 0x29a02e1406210000L, 0x899a5ba9c13fafd9L}, - /* 205 */ {8, 0.13021715, 0x2b494f4efe6d2e21L, 0x7a80a705391e96ffL}, - /* 206 */ {8, 0.13009822, 0x2d0134ef21cbc100L, 0x6c0cfe23de23042aL}, - /* 207 */ {8, 0.12998007, 0x2ec84ef4da2ef581L, 0x5e377df359c944ddL}, - /* 208 */ {8, 0.12986271, 0x309f102100000000L, 0x50f8ac5fc8f53985L}, - /* 209 */ {8, 0.12974613, 0x3285ee02a1420281L, 0x44497266278e35b7L}, - /* 210 */ {8, 0.12963031, 0x347d6104fc324100L, 0x382316831f7ee175L}, - /* 211 */ {8, 0.12951524, 0x3685e47dade53d21L, 0x2c7f377833b8946eL}, - /* 212 */ {8, 0.12940092, 0x389ff6bb15610000L, 0x2157c761ab4163efL}, - /* 213 */ {8, 0.12928734, 0x3acc1912ebb57661L, 0x16a7071803cc49a9L}, - /* 214 */ {8, 0.12917448, 0x3d0acff111946100L, 0xc6781d80f8224fcL}, - /* 215 */ {8, 0.12906235, 0x3f5ca2e692eaf841L, 0x294092d370a900bL}, - /* 216 */ {8, 0.12895094, 0x41c21cb8e1000000L, 0xf24f62335024a295L}, - /* 217 */ {8, 0.12884022, 0x443bcb714399a5c1L, 0xe03b98f103fad6d2L}, - /* 218 */ {8, 0.12873021, 0x46ca406c81af2100L, 0xcee3d32cad2a9049L}, - /* 219 */ {8, 0.12862089, 0x496e106ac22aaae1L, 0xbe3f9df9277fdadaL}, - /* 220 */ {8, 0.12851224, 
0x4c27d39fa5410000L, 0xae46f0d94c05e933L}, - /* 221 */ {8, 0.12840428, 0x4ef825c296e43ca1L, 0x9ef2280fb437a33dL}, - /* 222 */ {8, 0.12829698, 0x51dfa61f5ad88100L, 0x9039ff426d3f284bL}, - /* 223 */ {8, 0.12819034, 0x54def7a6d2f16901L, 0x82178c6d6b51f8f4L}, - /* 224 */ {8, 0.12808435, 0x57f6c10000000000L, 0x74843b1ee4c1e053L}, - /* 225 */ {8, 0.12797901, 0x5b27ac993df97701L, 0x6779c7f90dc42f48L}, - /* 226 */ {8, 0.12787431, 0x5e7268b9bbdf8100L, 0x5af23c74f9ad9fe9L}, - /* 227 */ {8, 0.12777024, 0x61d7a7932ff3d6a1L, 0x4ee7eae2acdc617eL}, - /* 228 */ {8, 0.12766680, 0x65581f53c8c10000L, 0x43556aa2ac262a0bL}, - /* 229 */ {8, 0.12756398, 0x68f48a385b8320e1L, 0x3835949593b8ddd1L}, - /* 230 */ {8, 0.12746176, 0x6cada69ed07c2100L, 0x2d837fbe78458762L}, - /* 231 */ {8, 0.12736016, 0x70843718cdbf27c1L, 0x233a7e150a54a555L}, - /* 232 */ {8, 0.12725915, 0x7479027ea1000000L, 0x19561984a50ff8feL}, - /* 233 */ {8, 0.12715874, 0x788cd40268f39641L, 0xfd211159fe3490fL}, - /* 234 */ {8, 0.12705891, 0x7cc07b437ecf6100L, 0x6aa563e655033e3L}, - /* 235 */ {8, 0.12695967, 0x8114cc6220762061L, 0xfbb614b3f2d3b14cL}, - /* 236 */ {8, 0.12686100, 0x858aa0135be10000L, 0xeac0f8837fb05773L}, - /* 237 */ {8, 0.12676290, 0x8a22d3b53c54c321L, 0xda6e4c10e8615ca5L}, - /* 238 */ {8, 0.12666537, 0x8ede496339f34100L, 0xcab755a8d01fa67fL}, - /* 239 */ {8, 0.12656839, 0x93bde80aec3a1481L, 0xbb95a9ae71aa3e0cL}, - /* 240 */ {8, 0.12647197, 0x98c29b8100000000L, 0xad0326c296b4f529L}, - /* 241 */ {8, 0.12637609, 0x9ded549671832381L, 0x9ef9f21eed31b7c1L}, - /* 242 */ {8, 0.12628075, 0xa33f092e0b1ac100L, 0x91747422be14b0b2L}, - /* 243 */ {8, 0.12618595, 0xa8b8b452291fe821L, 0x846d550e37b5063dL}, - /* 244 */ {8, 0.12609168, 0xae5b564ac3a10000L, 0x77df79e9a96c06f6L}, - /* 245 */ {8, 0.12599794, 0xb427f4b3be74c361L, 0x6bc6019636c7d0c2L}, - /* 246 */ {8, 0.12590471, 0xba1f9a938041e100L, 0x601c4205aebd9e47L}, - /* 247 */ {8, 0.12581200, 0xc0435871d1110f41L, 0x54ddc59756f05016L}, - /* 248 */ {8, 0.12571980, 
0xc694446f01000000L, 0x4a0648979c838c18L}, - /* 249 */ {8, 0.12562811, 0xcd137a5b57ac3ec1L, 0x3f91b6e0bb3a053dL}, - /* 250 */ {8, 0.12553692, 0xd3c21bcecceda100L, 0x357c299a88ea76a5L}, - /* 251 */ {8, 0.12544622, 0xdaa150410b788de1L, 0x2bc1e517aecc56e3L}, - /* 252 */ {8, 0.12535601, 0xe1b24521be010000L, 0x225f56ceb3da9f5dL}, - /* 253 */ {8, 0.12526629, 0xe8f62df12777c1a1L, 0x1951136d53ad63acL}, - /* 254 */ {8, 0.12517705, 0xf06e445906fc0100L, 0x1093d504b3cd7d93L}, - /* 255 */ {8, 0.12508829, 0xf81bc845c81bf801L, 0x824794d1ec1814fL}, + /* 2 */ {64, 1.0000000000000000, CNST_LIMB(0x1), CNST_LIMB(0x0)}, + /* 3 */ {40, 0.6309297535714574, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)}, + /* 4 */ {32, 0.5000000000000000, CNST_LIMB(0x2), CNST_LIMB(0x0)}, + /* 5 */ {27, 0.4306765580733931, CNST_LIMB(0x6765c793fa10079d), CNST_LIMB(0x3ce9a36f23c0fc90)}, + /* 6 */ {24, 0.3868528072345416, CNST_LIMB(0x41c21cb8e1000000), CNST_LIMB(0xf24f62335024a295)}, + /* 7 */ {22, 0.3562071871080222, CNST_LIMB(0x3642798750226111), CNST_LIMB(0x2df495ccaa57147b)}, + /* 8 */ {21, 0.3333333333333334, CNST_LIMB(0x3), CNST_LIMB(0x0)}, + /* 9 */ {20, 0.3154648767857287, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)}, + /* 10 */ {19, 0.3010299956639811, CNST_LIMB(0x8ac7230489e80000), CNST_LIMB(0xd83c94fb6d2ac34a)}, + /* 11 */ {18, 0.2890648263178878, CNST_LIMB(0x4d28cb56c33fa539), CNST_LIMB(0xa8adf7ae45e7577b)}, + /* 12 */ {17, 0.2789429456511298, CNST_LIMB(0x1eca170c00000000), CNST_LIMB(0xa10c2bec5da8f8f)}, + /* 13 */ {17, 0.2702381544273197, CNST_LIMB(0x780c7372621bd74d), CNST_LIMB(0x10f4becafe412ec3)}, + /* 14 */ {16, 0.2626495350371936, CNST_LIMB(0x1e39a5057d810000), CNST_LIMB(0xf08480f672b4e86)}, + /* 15 */ {16, 0.2559580248098155, CNST_LIMB(0x5b27ac993df97701), CNST_LIMB(0x6779c7f90dc42f48)}, + /* 16 */ {16, 0.2500000000000000, CNST_LIMB(0x4), CNST_LIMB(0x0)}, + /* 17 */ {15, 0.2446505421182260, CNST_LIMB(0x27b95e997e21d9f1), CNST_LIMB(0x9c71e11bab279323)}, + /* 
18 */ {15, 0.2398124665681315, CNST_LIMB(0x5da0e1e53c5c8000), CNST_LIMB(0x5dfaa697ec6f6a1c)}, + /* 19 */ {15, 0.2354089133666382, CNST_LIMB(0xd2ae3299c1c4aedb), CNST_LIMB(0x3711783f6be7e9ec)}, + /* 20 */ {14, 0.2313782131597592, CNST_LIMB(0x16bcc41e90000000), CNST_LIMB(0x6849b86a12b9b01e)}, + /* 21 */ {14, 0.2276702486969530, CNST_LIMB(0x2d04b7fdd9c0ef49), CNST_LIMB(0x6bf097ba5ca5e239)}, + /* 22 */ {14, 0.2242438242175754, CNST_LIMB(0x5658597bcaa24000), CNST_LIMB(0x7b8015c8d7af8f08)}, + /* 23 */ {14, 0.2210647294575037, CNST_LIMB(0xa0e2073737609371), CNST_LIMB(0x975a24b3a3151b38)}, + /* 24 */ {13, 0.2181042919855316, CNST_LIMB(0xc29e98000000000), CNST_LIMB(0x50bd367972689db1)}, + /* 25 */ {13, 0.2153382790366965, CNST_LIMB(0x14adf4b7320334b9), CNST_LIMB(0x8c240c4aecb13bb5)}, + /* 26 */ {13, 0.2127460535533632, CNST_LIMB(0x226ed36478bfa000), CNST_LIMB(0xdbd2e56854e118c9)}, + /* 27 */ {13, 0.2103099178571525, CNST_LIMB(0x383d9170b85ff80b), CNST_LIMB(0x2351ffcaa9c7c4ae)}, + /* 28 */ {13, 0.2080145976765095, CNST_LIMB(0x5a3c23e39c000000), CNST_LIMB(0x6b24188ca33b0636)}, + /* 29 */ {13, 0.2058468324604344, CNST_LIMB(0x8e65137388122bcd), CNST_LIMB(0xcc3dceaf2b8ba99d)}, + /* 30 */ {13, 0.2037950470905062, CNST_LIMB(0xdd41bb36d259e000), CNST_LIMB(0x2832e835c6c7d6b6)}, + /* 31 */ {12, 0.2018490865820999, CNST_LIMB(0xaee5720ee830681), CNST_LIMB(0x76b6aa272e1873c5)}, + /* 32 */ {12, 0.2000000000000000, CNST_LIMB(0x5), CNST_LIMB(0x0)}, + /* 33 */ {12, 0.1982398631705605, CNST_LIMB(0x172588ad4f5f0981), CNST_LIMB(0x61eaf5d402c7bf4f)}, + /* 34 */ {12, 0.1965616322328226, CNST_LIMB(0x211e44f7d02c1000), CNST_LIMB(0xeeb658123ffb27ec)}, + /* 35 */ {12, 0.1949590218937863, CNST_LIMB(0x2ee56725f06e5c71), CNST_LIMB(0x5d5e3762e6fdf509)}, + /* 36 */ {12, 0.1934264036172708, CNST_LIMB(0x41c21cb8e1000000), CNST_LIMB(0xf24f62335024a295)}, + /* 37 */ {12, 0.1919587200065601, CNST_LIMB(0x5b5b57f8a98a5dd1), CNST_LIMB(0x66ae7831762efb6f)}, + /* 38 */ {12, 0.1905514124267734, 
CNST_LIMB(0x7dcff8986ea31000), CNST_LIMB(0x47388865a00f544)}, + /* 39 */ {12, 0.1892003595168700, CNST_LIMB(0xabd4211662a6b2a1), CNST_LIMB(0x7d673c33a123b54c)}, + /* 40 */ {12, 0.1879018247091076, CNST_LIMB(0xe8d4a51000000000), CNST_LIMB(0x19799812dea11197)}, + /* 41 */ {11, 0.1866524112389434, CNST_LIMB(0x7a32956ad081b79), CNST_LIMB(0xc27e62e0686feae)}, + /* 42 */ {11, 0.1854490234153689, CNST_LIMB(0x9f49aaff0e86800), CNST_LIMB(0x9b6e7507064ce7c7)}, + /* 43 */ {11, 0.1842888331487062, CNST_LIMB(0xce583bb812d37b3), CNST_LIMB(0x3d9ac2bf66cfed94)}, + /* 44 */ {11, 0.1831692509136336, CNST_LIMB(0x109b79a654c00000), CNST_LIMB(0xed46bc50ce59712a)}, + /* 45 */ {11, 0.1820879004699383, CNST_LIMB(0x1543beff214c8b95), CNST_LIMB(0x813d97e2c89b8d46)}, + /* 46 */ {11, 0.1810425967800402, CNST_LIMB(0x1b149a79459a3800), CNST_LIMB(0x2e81751956af8083)}, + /* 47 */ {11, 0.1800313266566926, CNST_LIMB(0x224edfb5434a830f), CNST_LIMB(0xdd8e0a95e30c0988)}, + /* 48 */ {11, 0.1790522317510413, CNST_LIMB(0x2b3fb00000000000), CNST_LIMB(0x7ad4dd48a0b5b167)}, + /* 49 */ {11, 0.1781035935540111, CNST_LIMB(0x3642798750226111), CNST_LIMB(0x2df495ccaa57147b)}, + /* 50 */ {11, 0.1771838201355579, CNST_LIMB(0x43c33c1937564800), CNST_LIMB(0xe392010175ee5962)}, + /* 51 */ {11, 0.1762914343888821, CNST_LIMB(0x54411b2441c3cd8b), CNST_LIMB(0x84eaf11b2fe7738e)}, + /* 52 */ {11, 0.1754250635819545, CNST_LIMB(0x6851455acd400000), CNST_LIMB(0x3a1e3971e008995d)}, + /* 53 */ {11, 0.1745834300480449, CNST_LIMB(0x80a23b117c8feb6d), CNST_LIMB(0xfd7a462344ffce25)}, + /* 54 */ {11, 0.1737653428714400, CNST_LIMB(0x9dff7d32d5dc1800), CNST_LIMB(0x9eca40b40ebcef8a)}, + /* 55 */ {11, 0.1729696904450771, CNST_LIMB(0xc155af6faeffe6a7), CNST_LIMB(0x52fa161a4a48e43d)}, + /* 56 */ {11, 0.1721954337940981, CNST_LIMB(0xebb7392e00000000), CNST_LIMB(0x1607a2cbacf930c1)}, + /* 57 */ {10, 0.1714416005739134, CNST_LIMB(0x50633659656d971), CNST_LIMB(0x97a014f8e3be55f1)}, + /* 58 */ {10, 0.1707072796637201, 
CNST_LIMB(0x5fa8624c7fba400), CNST_LIMB(0x568df8b76cbf212c)}, + /* 59 */ {10, 0.1699916162869140, CNST_LIMB(0x717d9faa73c5679), CNST_LIMB(0x20ba7c4b4e6ef492)}, + /* 60 */ {10, 0.1692938075987814, CNST_LIMB(0x86430aac6100000), CNST_LIMB(0xe81ee46b9ef492f5)}, + /* 61 */ {10, 0.1686130986895011, CNST_LIMB(0x9e64d9944b57f29), CNST_LIMB(0x9dc0d10d51940416)}, + /* 62 */ {10, 0.1679487789570419, CNST_LIMB(0xba5ca5392cb0400), CNST_LIMB(0x5fa8ed2f450272a5)}, + /* 63 */ {10, 0.1673001788101741, CNST_LIMB(0xdab2ce1d022cd81), CNST_LIMB(0x2ba9eb8c5e04e641)}, + /* 64 */ {10, 0.1666666666666667, CNST_LIMB(0x6), CNST_LIMB(0x0)}, + /* 65 */ {10, 0.1660476462159378, CNST_LIMB(0x12aeed5fd3e2d281), CNST_LIMB(0xb67759cc00287bf1)}, + /* 66 */ {10, 0.1654425539190583, CNST_LIMB(0x15c3da1572d50400), CNST_LIMB(0x78621feeb7f4ed33)}, + /* 67 */ {10, 0.1648508567221604, CNST_LIMB(0x194c05534f75ee29), CNST_LIMB(0x43d55b5f72943bc0)}, + /* 68 */ {10, 0.1642720499620502, CNST_LIMB(0x1d56299ada100000), CNST_LIMB(0x173decb64d1d4409)}, + /* 69 */ {10, 0.1637056554452156, CNST_LIMB(0x21f2a089a4ff4f79), CNST_LIMB(0xe29fb54fd6b6074f)}, + /* 70 */ {10, 0.1631512196835108, CNST_LIMB(0x2733896c68d9a400), CNST_LIMB(0xa1f1f5c210d54e62)}, + /* 71 */ {10, 0.1626083122716341, CNST_LIMB(0x2d2cf2c33b533c71), CNST_LIMB(0x6aac7f9bfafd57b2)}, + /* 72 */ {10, 0.1620765243931223, CNST_LIMB(0x33f506e440000000), CNST_LIMB(0x3b563c2478b72ee2)}, + /* 73 */ {10, 0.1615554674429964, CNST_LIMB(0x3ba43bec1d062211), CNST_LIMB(0x12b536b574e92d1b)}, + /* 74 */ {10, 0.1610447717564444, CNST_LIMB(0x4455872d8fd4e400), CNST_LIMB(0xdf86c03020404fa5)}, + /* 75 */ {10, 0.1605440854340214, CNST_LIMB(0x4e2694539f2f6c59), CNST_LIMB(0xa34adf02234eea8e)}, + /* 76 */ {10, 0.1600530732548213, CNST_LIMB(0x5938006c18900000), CNST_LIMB(0x6f46eb8574eb59dd)}, + /* 77 */ {10, 0.1595714156699382, CNST_LIMB(0x65ad9912474aa649), CNST_LIMB(0x42459b481df47cec)}, + /* 78 */ {10, 0.1590988078692941, CNST_LIMB(0x73ae9ff4241ec400), 
CNST_LIMB(0x1b424b95d80ca505)}, + /* 79 */ {10, 0.1586349589155960, CNST_LIMB(0x836612ee9c4ce1e1), CNST_LIMB(0xf2c1b982203a0dac)}, + /* 80 */ {10, 0.1581795909397823, CNST_LIMB(0x9502f90000000000), CNST_LIMB(0xb7cdfd9d7bdbab7d)}, + /* 81 */ {10, 0.1577324383928644, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)}, + /* 82 */ {10, 0.1572932473495469, CNST_LIMB(0xbebf59a07dab4400), CNST_LIMB(0x57931eeaf85cf64f)}, + /* 83 */ {10, 0.1568617748594410, CNST_LIMB(0xd7540d4093bc3109), CNST_LIMB(0x305a944507c82f47)}, + /* 84 */ {10, 0.1564377883420716, CNST_LIMB(0xf2b96616f1900000), CNST_LIMB(0xe007ccc9c22781a)}, + /* 85 */ {9, 0.1560210650222250, CNST_LIMB(0x336de62af2bca35), CNST_LIMB(0x3e92c42e000eeed4)}, + /* 86 */ {9, 0.1556113914024940, CNST_LIMB(0x39235ec33d49600), CNST_LIMB(0x1ebe59130db2795e)}, + /* 87 */ {9, 0.1552085627701551, CNST_LIMB(0x3f674e539585a17), CNST_LIMB(0x268859e90f51b89)}, + /* 88 */ {9, 0.1548123827357682, CNST_LIMB(0x4645b6958000000), CNST_LIMB(0xd24cde0463108cfa)}, + /* 89 */ {9, 0.1544226628011101, CNST_LIMB(0x4dcb74afbc49c19), CNST_LIMB(0xa536009f37adc383)}, + /* 90 */ {9, 0.1540392219542636, CNST_LIMB(0x56064e1d18d9a00), CNST_LIMB(0x7cea06ce1c9ace10)}, + /* 91 */ {9, 0.1536618862898642, CNST_LIMB(0x5f04fe2cd8a39fb), CNST_LIMB(0x58db032e72e8ba43)}, + /* 92 */ {9, 0.1532904886526781, CNST_LIMB(0x68d74421f5c0000), CNST_LIMB(0x388cc17cae105447)}, + /* 93 */ {9, 0.1529248683028321, CNST_LIMB(0x738df1f6ab4827d), CNST_LIMB(0x1b92672857620ce0)}, + /* 94 */ {9, 0.1525648706011593, CNST_LIMB(0x7f3afbc9cfb5e00), CNST_LIMB(0x18c6a9575c2ade4)}, + /* 95 */ {9, 0.1522103467132434, CNST_LIMB(0x8bf187fba88f35f), CNST_LIMB(0xd44da7da8e44b24f)}, + /* 96 */ {9, 0.1518611533308632, CNST_LIMB(0x99c600000000000), CNST_LIMB(0xaa2f78f1b4cc6794)}, + /* 97 */ {9, 0.1515171524096389, CNST_LIMB(0xa8ce21eb6531361), CNST_LIMB(0x843c067d091ee4cc)}, + /* 98 */ {9, 0.1511782109217764, CNST_LIMB(0xb92112c1a0b6200), CNST_LIMB(0x62005e1e913356e3)}, + /* 99 */ {9, 
0.1508442006228941, CNST_LIMB(0xcad7718b8747c43), CNST_LIMB(0x4316eed01dedd518)}, + /* 100 */ {9, 0.1505149978319906, CNST_LIMB(0xde0b6b3a7640000), CNST_LIMB(0x2725dd1d243aba0e)}, + /* 101 */ {9, 0.1501904832236879, CNST_LIMB(0xf2d8cf5fe6d74c5), CNST_LIMB(0xddd9057c24cb54f)}, + /* 102 */ {9, 0.1498705416319474, CNST_LIMB(0x1095d25bfa712600), CNST_LIMB(0xedeee175a736d2a1)}, + /* 103 */ {9, 0.1495550618645152, CNST_LIMB(0x121b7c4c3698faa7), CNST_LIMB(0xc4699f3df8b6b328)}, + /* 104 */ {9, 0.1492439365274121, CNST_LIMB(0x13c09e8d68000000), CNST_LIMB(0x9ebbe7d859cb5a7c)}, + /* 105 */ {9, 0.1489370618588283, CNST_LIMB(0x15876ccb0b709ca9), CNST_LIMB(0x7c828b9887eb2179)}, + /* 106 */ {9, 0.1486343375718350, CNST_LIMB(0x17723c2976da2a00), CNST_LIMB(0x5d652ab99001adcf)}, + /* 107 */ {9, 0.1483356667053617, CNST_LIMB(0x198384e9c259048b), CNST_LIMB(0x4114f1754e5d7b32)}, + /* 108 */ {9, 0.1480409554829326, CNST_LIMB(0x1bbde41dfeec0000), CNST_LIMB(0x274b7c902f7e0188)}, + /* 109 */ {9, 0.1477501131786861, CNST_LIMB(0x1e241d6e3337910d), CNST_LIMB(0xfc9e0fbb32e210c)}, + /* 110 */ {9, 0.1474630519902391, CNST_LIMB(0x20b91cee9901ee00), CNST_LIMB(0xf4afa3e594f8ea1f)}, + /* 111 */ {9, 0.1471796869179852, CNST_LIMB(0x237ff9079863dfef), CNST_LIMB(0xcd85c32e9e4437b0)}, + /* 112 */ {9, 0.1468999356504447, CNST_LIMB(0x267bf47000000000), CNST_LIMB(0xa9bbb147e0dd92a8)}, + /* 113 */ {9, 0.1466237184553111, CNST_LIMB(0x29b08039fbeda7f1), CNST_LIMB(0x8900447b70e8eb82)}, + /* 114 */ {9, 0.1463509580758620, CNST_LIMB(0x2d213df34f65f200), CNST_LIMB(0x6b0a92adaad5848a)}, + /* 115 */ {9, 0.1460815796324244, CNST_LIMB(0x30d201d957a7c2d3), CNST_LIMB(0x4f990ad8740f0ee5)}, + /* 116 */ {9, 0.1458155105286054, CNST_LIMB(0x34c6d52160f40000), CNST_LIMB(0x3670a9663a8d3610)}, + /* 117 */ {9, 0.1455526803620167, CNST_LIMB(0x3903f855d8f4c755), CNST_LIMB(0x1f5c44188057be3c)}, + /* 118 */ {9, 0.1452930208392428, CNST_LIMB(0x3d8de5c8ec59b600), CNST_LIMB(0xa2bea956c4e4977)}, + /* 119 */ {9, 0.1450364656948130, 
CNST_LIMB(0x4269541d1ff01337), CNST_LIMB(0xed68b23033c3637e)}, + /* 120 */ {9, 0.1447829506139581, CNST_LIMB(0x479b38e478000000), CNST_LIMB(0xc99cf624e50549c5)}, + /* 121 */ {9, 0.1445324131589439, CNST_LIMB(0x4d28cb56c33fa539), CNST_LIMB(0xa8adf7ae45e7577b)}, + /* 122 */ {9, 0.1442847926987864, CNST_LIMB(0x5317871fa13aba00), CNST_LIMB(0x8a5bc740b1c113e5)}, + /* 123 */ {9, 0.1440400303421672, CNST_LIMB(0x596d2f44de9fa71b), CNST_LIMB(0x6e6c7efb81cfbb9b)}, + /* 124 */ {9, 0.1437980688733775, CNST_LIMB(0x602fd125c47c0000), CNST_LIMB(0x54aba5c5cada5f10)}, + /* 125 */ {9, 0.1435588526911310, CNST_LIMB(0x6765c793fa10079d), CNST_LIMB(0x3ce9a36f23c0fc90)}, + /* 126 */ {9, 0.1433223277500932, CNST_LIMB(0x6f15be069b847e00), CNST_LIMB(0x26fb43de2c8cd2a8)}, + /* 127 */ {9, 0.1430884415049874, CNST_LIMB(0x7746b3e82a77047f), CNST_LIMB(0x12b94793db8486a1)}, + /* 128 */ {9, 0.1428571428571428, CNST_LIMB(0x7), CNST_LIMB(0x0)}, + /* 129 */ {9, 0.1426283821033600, CNST_LIMB(0x894953f7ea890481), CNST_LIMB(0xdd5deca404c0156d)}, + /* 130 */ {9, 0.1424021108869747, CNST_LIMB(0x932abffea4848200), CNST_LIMB(0xbd51373330291de0)}, + /* 131 */ {9, 0.1421782821510107, CNST_LIMB(0x9dacb687d3d6a163), CNST_LIMB(0x9fa4025d66f23085)}, + /* 132 */ {9, 0.1419568500933153, CNST_LIMB(0xa8d8102a44840000), CNST_LIMB(0x842530ee2db4949d)}, + /* 133 */ {9, 0.1417377701235801, CNST_LIMB(0xb4b60f9d140541e5), CNST_LIMB(0x6aa7f2766b03dc25)}, + /* 134 */ {9, 0.1415209988221527, CNST_LIMB(0xc15065d4856e4600), CNST_LIMB(0x53035ba7ebf32e8d)}, + /* 135 */ {9, 0.1413064939005528, CNST_LIMB(0xceb1363f396d23c7), CNST_LIMB(0x3d12091fc9fb4914)}, + /* 136 */ {9, 0.1410942141636095, CNST_LIMB(0xdce31b2488000000), CNST_LIMB(0x28b1cb81b1ef1849)}, + /* 137 */ {9, 0.1408841194731412, CNST_LIMB(0xebf12a24bca135c9), CNST_LIMB(0x15c35be67ae3e2c9)}, + /* 138 */ {9, 0.1406761707131039, CNST_LIMB(0xfbe6f8dbf88f4a00), CNST_LIMB(0x42a17bd09be1ff0)}, + /* 139 */ {8, 0.1404703297561400, CNST_LIMB(0x1ef156c084ce761), 
CNST_LIMB(0x8bf461f03cf0bbf)}, + /* 140 */ {8, 0.1402665594314587, CNST_LIMB(0x20c4e3b94a10000), CNST_LIMB(0xf3fbb43f68a32d05)}, + /* 141 */ {8, 0.1400648234939879, CNST_LIMB(0x22b0695a08ba421), CNST_LIMB(0xd84f44c48564dc19)}, + /* 142 */ {8, 0.1398650865947379, CNST_LIMB(0x24b4f35d7a4c100), CNST_LIMB(0xbe58ebcce7956abe)}, + /* 143 */ {8, 0.1396673142523192, CNST_LIMB(0x26d397284975781), CNST_LIMB(0xa5fac463c7c134b7)}, + /* 144 */ {8, 0.1394714728255649, CNST_LIMB(0x290d74100000000), CNST_LIMB(0x8f19241e28c7d757)}, + /* 145 */ {8, 0.1392775294872041, CNST_LIMB(0x2b63b3a37866081), CNST_LIMB(0x799a6d046c0ae1ae)}, + /* 146 */ {8, 0.1390854521985406, CNST_LIMB(0x2dd789f4d894100), CNST_LIMB(0x6566e37d746a9e40)}, + /* 147 */ {8, 0.1388952096850913, CNST_LIMB(0x306a35e51b58721), CNST_LIMB(0x526887dbfb5f788f)}, + /* 148 */ {8, 0.1387067714131417, CNST_LIMB(0x331d01712e10000), CNST_LIMB(0x408af3382b8efd3d)}, + /* 149 */ {8, 0.1385201075671774, CNST_LIMB(0x35f14200a827c61), CNST_LIMB(0x2fbb374806ec05f1)}, + /* 150 */ {8, 0.1383351890281539, CNST_LIMB(0x38e858b62216100), CNST_LIMB(0x1fe7c0f0afce87fe)}, + /* 151 */ {8, 0.1381519873525671, CNST_LIMB(0x3c03b2c13176a41), CNST_LIMB(0x11003d517540d32e)}, + /* 152 */ {8, 0.1379704747522905, CNST_LIMB(0x3f44c9b21000000), CNST_LIMB(0x2f5810f98eff0dc)}, + /* 153 */ {8, 0.1377906240751463, CNST_LIMB(0x42ad23cef3113c1), CNST_LIMB(0xeb72e35e7840d910)}, + /* 154 */ {8, 0.1376124087861776, CNST_LIMB(0x463e546b19a2100), CNST_LIMB(0xd27de19593dc3614)}, + /* 155 */ {8, 0.1374358029495937, CNST_LIMB(0x49f9fc3f96684e1), CNST_LIMB(0xbaf391fd3e5e6fc2)}, + /* 156 */ {8, 0.1372607812113589, CNST_LIMB(0x4de1c9c5dc10000), CNST_LIMB(0xa4bd38c55228c81d)}, + /* 157 */ {8, 0.1370873187823978, CNST_LIMB(0x51f77994116d2a1), CNST_LIMB(0x8fc5a8de8e1de782)}, + /* 158 */ {8, 0.1369153914223921, CNST_LIMB(0x563cd6bb3398100), CNST_LIMB(0x7bf9265bea9d3a3b)}, + /* 159 */ {8, 0.1367449754241439, CNST_LIMB(0x5ab3bb270beeb01), CNST_LIMB(0x69454b325983dccd)}, + /* 160 
*/ {8, 0.1365760475984821, CNST_LIMB(0x5f5e10000000000), CNST_LIMB(0x5798ee2308c39df9)}, + /* 161 */ {8, 0.1364085852596902, CNST_LIMB(0x643dce0ec16f501), CNST_LIMB(0x46e40ba0fa66a753)}, + /* 162 */ {8, 0.1362425662114337, CNST_LIMB(0x6954fe21e3e8100), CNST_LIMB(0x3717b0870b0db3a7)}, + /* 163 */ {8, 0.1360779687331669, CNST_LIMB(0x6ea5b9755f440a1), CNST_LIMB(0x2825e6775d11cdeb)}, + /* 164 */ {8, 0.1359147715670014, CNST_LIMB(0x74322a1c0410000), CNST_LIMB(0x1a01a1c09d1b4dac)}, + /* 165 */ {8, 0.1357529539050150, CNST_LIMB(0x79fc8b6ae8a46e1), CNST_LIMB(0xc9eb0a8bebc8f3e)}, + /* 166 */ {8, 0.1355924953769863, CNST_LIMB(0x80072a66d512100), CNST_LIMB(0xffe357ff59e6a004)}, + /* 167 */ {8, 0.1354333760385373, CNST_LIMB(0x86546633b42b9c1), CNST_LIMB(0xe7dfd1be05fa61a8)}, + /* 168 */ {8, 0.1352755763596663, CNST_LIMB(0x8ce6b0861000000), CNST_LIMB(0xd11ed6fc78f760e5)}, + /* 169 */ {8, 0.1351190772136599, CNST_LIMB(0x93c08e16a022441), CNST_LIMB(0xbb8db609dd29ebfe)}, + /* 170 */ {8, 0.1349638598663645, CNST_LIMB(0x9ae49717f026100), CNST_LIMB(0xa71aec8d1813d532)}, + /* 171 */ {8, 0.1348099059658079, CNST_LIMB(0xa25577ae24c1a61), CNST_LIMB(0x93b612a9f20fbc02)}, + /* 172 */ {8, 0.1346571975321549, CNST_LIMB(0xaa15f068e610000), CNST_LIMB(0x814fc7b19a67d317)}, + /* 173 */ {8, 0.1345057169479844, CNST_LIMB(0xb228d6bf7577921), CNST_LIMB(0x6fd9a03f2e0a4b7c)}, + /* 174 */ {8, 0.1343554469488779, CNST_LIMB(0xba91158ef5c4100), CNST_LIMB(0x5f4615a38d0d316e)}, + /* 175 */ {8, 0.1342063706143054, CNST_LIMB(0xc351ad9aec0b681), CNST_LIMB(0x4f8876863479a286)}, + /* 176 */ {8, 0.1340584713587980, CNST_LIMB(0xcc6db6100000000), CNST_LIMB(0x4094d8a3041b60eb)}, + /* 177 */ {8, 0.1339117329233981, CNST_LIMB(0xd5e85d09025c181), CNST_LIMB(0x32600b8ed883a09b)}, + /* 178 */ {8, 0.1337661393673756, CNST_LIMB(0xdfc4e816401c100), CNST_LIMB(0x24df8c6eb4b6d1f1)}, + /* 179 */ {8, 0.1336216750601996, CNST_LIMB(0xea06b4c72947221), CNST_LIMB(0x18097a8ee151acef)}, + /* 180 */ {8, 0.1334783246737591, 
CNST_LIMB(0xf4b139365210000), CNST_LIMB(0xbd48cc8ec1cd8e3)}, + /* 181 */ {8, 0.1333360731748201, CNST_LIMB(0xffc80497d520961), CNST_LIMB(0x3807a8d67485fb)}, + /* 182 */ {8, 0.1331949058177136, CNST_LIMB(0x10b4ebfca1dee100), CNST_LIMB(0xea5768860b62e8d8)}, + /* 183 */ {8, 0.1330548081372441, CNST_LIMB(0x117492de921fc141), CNST_LIMB(0xd54faf5b635c5005)}, + /* 184 */ {8, 0.1329157659418126, CNST_LIMB(0x123bb2ce41000000), CNST_LIMB(0xc14a56233a377926)}, + /* 185 */ {8, 0.1327777653067443, CNST_LIMB(0x130a8b6157bdecc1), CNST_LIMB(0xae39a88db7cd329f)}, + /* 186 */ {8, 0.1326407925678156, CNST_LIMB(0x13e15dede0e8a100), CNST_LIMB(0x9c10bde69efa7ab6)}, + /* 187 */ {8, 0.1325048343149731, CNST_LIMB(0x14c06d941c0ca7e1), CNST_LIMB(0x8ac36c42a2836497)}, + /* 188 */ {8, 0.1323698773862368, CNST_LIMB(0x15a7ff487a810000), CNST_LIMB(0x7a463c8b84f5ef67)}, + /* 189 */ {8, 0.1322359088617821, CNST_LIMB(0x169859ddc5c697a1), CNST_LIMB(0x6a8e5f5ad090fd4b)}, + /* 190 */ {8, 0.1321029160581950, CNST_LIMB(0x1791c60f6fed0100), CNST_LIMB(0x5b91a2943596fc56)}, + /* 191 */ {8, 0.1319708865228925, CNST_LIMB(0x18948e8c0e6fba01), CNST_LIMB(0x4d4667b1c468e8f0)}, + /* 192 */ {8, 0.1318398080287045, CNST_LIMB(0x19a1000000000000), CNST_LIMB(0x3fa39ab547994daf)}, + /* 193 */ {8, 0.1317096685686114, CNST_LIMB(0x1ab769203dafc601), CNST_LIMB(0x32a0a9b2faee1e2a)}, + /* 194 */ {8, 0.1315804563506306, CNST_LIMB(0x1bd81ab557f30100), CNST_LIMB(0x26357ceac0e96962)}, + /* 195 */ {8, 0.1314521597928493, CNST_LIMB(0x1d0367a69fed1ba1), CNST_LIMB(0x1a5a6f65caa5859e)}, + /* 196 */ {8, 0.1313247675185968, CNST_LIMB(0x1e39a5057d810000), CNST_LIMB(0xf08480f672b4e86)}, + /* 197 */ {8, 0.1311982683517524, CNST_LIMB(0x1f7b2a18f29ac3e1), CNST_LIMB(0x4383340615612ca)}, + /* 198 */ {8, 0.1310726513121843, CNST_LIMB(0x20c850694c2aa100), CNST_LIMB(0xf3c77969ee4be5a2)}, + /* 199 */ {8, 0.1309479056113158, CNST_LIMB(0x222173cc014980c1), CNST_LIMB(0xe00993cc187c5ec9)}, + /* 200 */ {8, 0.1308240206478128, 
CNST_LIMB(0x2386f26fc1000000), CNST_LIMB(0xcd2b297d889bc2b6)}, + /* 201 */ {8, 0.1307009860033912, CNST_LIMB(0x24f92ce8af296d41), CNST_LIMB(0xbb214d5064862b22)}, + /* 202 */ {8, 0.1305787914387386, CNST_LIMB(0x2678863cd0ece100), CNST_LIMB(0xa9e1a7ca7ea10e20)}, + /* 203 */ {8, 0.1304574268895465, CNST_LIMB(0x280563f0a9472d61), CNST_LIMB(0x99626e72b39ea0cf)}, + /* 204 */ {8, 0.1303368824626505, CNST_LIMB(0x29a02e1406210000), CNST_LIMB(0x899a5ba9c13fafd9)}, + /* 205 */ {8, 0.1302171484322746, CNST_LIMB(0x2b494f4efe6d2e21), CNST_LIMB(0x7a80a705391e96ff)}, + /* 206 */ {8, 0.1300982152363760, CNST_LIMB(0x2d0134ef21cbc100), CNST_LIMB(0x6c0cfe23de23042a)}, + /* 207 */ {8, 0.1299800734730872, CNST_LIMB(0x2ec84ef4da2ef581), CNST_LIMB(0x5e377df359c944dd)}, + /* 208 */ {8, 0.1298627138972530, CNST_LIMB(0x309f102100000000), CNST_LIMB(0x50f8ac5fc8f53985)}, + /* 209 */ {8, 0.1297461274170591, CNST_LIMB(0x3285ee02a1420281), CNST_LIMB(0x44497266278e35b7)}, + /* 210 */ {8, 0.1296303050907487, CNST_LIMB(0x347d6104fc324100), CNST_LIMB(0x382316831f7ee175)}, + /* 211 */ {8, 0.1295152381234257, CNST_LIMB(0x3685e47dade53d21), CNST_LIMB(0x2c7f377833b8946e)}, + /* 212 */ {8, 0.1294009178639407, CNST_LIMB(0x389ff6bb15610000), CNST_LIMB(0x2157c761ab4163ef)}, + /* 213 */ {8, 0.1292873358018581, CNST_LIMB(0x3acc1912ebb57661), CNST_LIMB(0x16a7071803cc49a9)}, + /* 214 */ {8, 0.1291744835645007, CNST_LIMB(0x3d0acff111946100), CNST_LIMB(0xc6781d80f8224fc)}, + /* 215 */ {8, 0.1290623529140715, CNST_LIMB(0x3f5ca2e692eaf841), CNST_LIMB(0x294092d370a900b)}, + /* 216 */ {8, 0.1289509357448472, CNST_LIMB(0x41c21cb8e1000000), CNST_LIMB(0xf24f62335024a295)}, + /* 217 */ {8, 0.1288402240804449, CNST_LIMB(0x443bcb714399a5c1), CNST_LIMB(0xe03b98f103fad6d2)}, + /* 218 */ {8, 0.1287302100711567, CNST_LIMB(0x46ca406c81af2100), CNST_LIMB(0xcee3d32cad2a9049)}, + /* 219 */ {8, 0.1286208859913518, CNST_LIMB(0x496e106ac22aaae1), CNST_LIMB(0xbe3f9df9277fdada)}, + /* 220 */ {8, 0.1285122442369443, 
CNST_LIMB(0x4c27d39fa5410000), CNST_LIMB(0xae46f0d94c05e933)}, + /* 221 */ {8, 0.1284042773229231, CNST_LIMB(0x4ef825c296e43ca1), CNST_LIMB(0x9ef2280fb437a33d)}, + /* 222 */ {8, 0.1282969778809442, CNST_LIMB(0x51dfa61f5ad88100), CNST_LIMB(0x9039ff426d3f284b)}, + /* 223 */ {8, 0.1281903386569819, CNST_LIMB(0x54def7a6d2f16901), CNST_LIMB(0x82178c6d6b51f8f4)}, + /* 224 */ {8, 0.1280843525090381, CNST_LIMB(0x57f6c10000000000), CNST_LIMB(0x74843b1ee4c1e053)}, + /* 225 */ {8, 0.1279790124049077, CNST_LIMB(0x5b27ac993df97701), CNST_LIMB(0x6779c7f90dc42f48)}, + /* 226 */ {8, 0.1278743114199984, CNST_LIMB(0x5e7268b9bbdf8100), CNST_LIMB(0x5af23c74f9ad9fe9)}, + /* 227 */ {8, 0.1277702427352035, CNST_LIMB(0x61d7a7932ff3d6a1), CNST_LIMB(0x4ee7eae2acdc617e)}, + /* 228 */ {8, 0.1276667996348261, CNST_LIMB(0x65581f53c8c10000), CNST_LIMB(0x43556aa2ac262a0b)}, + /* 229 */ {8, 0.1275639755045533, CNST_LIMB(0x68f48a385b8320e1), CNST_LIMB(0x3835949593b8ddd1)}, + /* 230 */ {8, 0.1274617638294791, CNST_LIMB(0x6cada69ed07c2100), CNST_LIMB(0x2d837fbe78458762)}, + /* 231 */ {8, 0.1273601581921741, CNST_LIMB(0x70843718cdbf27c1), CNST_LIMB(0x233a7e150a54a555)}, + /* 232 */ {8, 0.1272591522708010, CNST_LIMB(0x7479027ea1000000), CNST_LIMB(0x19561984a50ff8fe)}, + /* 233 */ {8, 0.1271587398372755, CNST_LIMB(0x788cd40268f39641), CNST_LIMB(0xfd211159fe3490f)}, + /* 234 */ {8, 0.1270589147554692, CNST_LIMB(0x7cc07b437ecf6100), CNST_LIMB(0x6aa563e655033e3)}, + /* 235 */ {8, 0.1269596709794558, CNST_LIMB(0x8114cc6220762061), CNST_LIMB(0xfbb614b3f2d3b14c)}, + /* 236 */ {8, 0.1268610025517973, CNST_LIMB(0x858aa0135be10000), CNST_LIMB(0xeac0f8837fb05773)}, + /* 237 */ {8, 0.1267629036018709, CNST_LIMB(0x8a22d3b53c54c321), CNST_LIMB(0xda6e4c10e8615ca5)}, + /* 238 */ {8, 0.1266653683442337, CNST_LIMB(0x8ede496339f34100), CNST_LIMB(0xcab755a8d01fa67f)}, + /* 239 */ {8, 0.1265683910770258, CNST_LIMB(0x93bde80aec3a1481), CNST_LIMB(0xbb95a9ae71aa3e0c)}, + /* 240 */ {8, 0.1264719661804097, 
CNST_LIMB(0x98c29b8100000000), CNST_LIMB(0xad0326c296b4f529)}, + /* 241 */ {8, 0.1263760881150453, CNST_LIMB(0x9ded549671832381), CNST_LIMB(0x9ef9f21eed31b7c1)}, + /* 242 */ {8, 0.1262807514205999, CNST_LIMB(0xa33f092e0b1ac100), CNST_LIMB(0x91747422be14b0b2)}, + /* 243 */ {8, 0.1261859507142915, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)}, + /* 244 */ {8, 0.1260916806894653, CNST_LIMB(0xae5b564ac3a10000), CNST_LIMB(0x77df79e9a96c06f6)}, + /* 245 */ {8, 0.1259979361142023, CNST_LIMB(0xb427f4b3be74c361), CNST_LIMB(0x6bc6019636c7d0c2)}, + /* 246 */ {8, 0.1259047118299582, CNST_LIMB(0xba1f9a938041e100), CNST_LIMB(0x601c4205aebd9e47)}, + /* 247 */ {8, 0.1258120027502338, CNST_LIMB(0xc0435871d1110f41), CNST_LIMB(0x54ddc59756f05016)}, + /* 248 */ {8, 0.1257198038592741, CNST_LIMB(0xc694446f01000000), CNST_LIMB(0x4a0648979c838c18)}, + /* 249 */ {8, 0.1256281102107963, CNST_LIMB(0xcd137a5b57ac3ec1), CNST_LIMB(0x3f91b6e0bb3a053d)}, + /* 250 */ {8, 0.1255369169267456, CNST_LIMB(0xd3c21bcecceda100), CNST_LIMB(0x357c299a88ea76a5)}, + /* 251 */ {8, 0.1254462191960791, CNST_LIMB(0xdaa150410b788de1), CNST_LIMB(0x2bc1e517aecc56e3)}, + /* 252 */ {8, 0.1253560122735751, CNST_LIMB(0xe1b24521be010000), CNST_LIMB(0x225f56ceb3da9f5d)}, + /* 253 */ {8, 0.1252662914786691, CNST_LIMB(0xe8f62df12777c1a1), CNST_LIMB(0x1951136d53ad63ac)}, + /* 254 */ {8, 0.1251770521943144, CNST_LIMB(0xf06e445906fc0100), CNST_LIMB(0x1093d504b3cd7d93)}, + /* 255 */ {8, 0.1250882898658681, CNST_LIMB(0xf81bc845c81bf801), CNST_LIMB(0x824794d1ec1814f)}, }; #endif diff --git a/ghc/rts/gmp/mpn/ns32k/add_n.s b/ghc/rts/gmp/mpn/ns32k/add_n.s index dde2e15..bd063d0 100644 --- a/ghc/rts/gmp/mpn/ns32k/add_n.s +++ b/ghc/rts/gmp/mpn/ns32k/add_n.s @@ -1,29 +1,29 @@ -# ns32000 __mpn_add_n -- Add two limb vectors of the same length > 0 and store +# ns32000 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store # sum in a third limb vector. 
-# Copyright (C) 1992, 1994 Free Software Foundation, Inc. +# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. # The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Library General Public License as published by -# the Free Software Foundation; either version 2 of the License, or (at your +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. -# You should have received a copy of the GNU Library General Public License +# You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. .align 1 -.globl ___mpn_add_n -___mpn_add_n: +.globl ___gmpn_add_n +___gmpn_add_n: save [r3,r4,r5] negd 28(sp),r3 movd r3,r0 diff --git a/ghc/rts/gmp/mpn/ns32k/addmul_1.s b/ghc/rts/gmp/mpn/ns32k/addmul_1.s index 205bfe3..df0dcdd 100644 --- a/ghc/rts/gmp/mpn/ns32k/addmul_1.s +++ b/ghc/rts/gmp/mpn/ns32k/addmul_1.s @@ -1,29 +1,29 @@ -# ns32000 __mpn_addmul_1 -- Multiply a limb vector with a limb and add +# ns32000 __gmpn_addmul_1 -- Multiply a limb vector with a limb and add # the result to a second limb vector. -# Copyright (C) 1992, 1994 Free Software Foundation, Inc. +# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. 
# The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Library General Public License as published by -# the Free Software Foundation; either version 2 of the License, or (at your +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. -# You should have received a copy of the GNU Library General Public License +# You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. .align 1 -.globl ___mpn_addmul_1 -___mpn_addmul_1: +.globl ___gmpn_addmul_1 +___gmpn_addmul_1: save [r3,r4,r5,r6,r7] negd 24(sp),r4 movd r4,r0 diff --git a/ghc/rts/gmp/mpn/ns32k/mul_1.s b/ghc/rts/gmp/mpn/ns32k/mul_1.s index 64e4abb..0a77efb 100644 --- a/ghc/rts/gmp/mpn/ns32k/mul_1.s +++ b/ghc/rts/gmp/mpn/ns32k/mul_1.s @@ -1,29 +1,29 @@ -# ns32000 __mpn_mul_1 -- Multiply a limb vector with a limb and store +# ns32000 __gmpn_mul_1 -- Multiply a limb vector with a limb and store # the result in a second limb vector. -# Copyright (C) 1992, 1994 Free Software Foundation, Inc. +# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. 
# The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Library General Public License as published by -# the Free Software Foundation; either version 2 of the License, or (at your +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. -# You should have received a copy of the GNU Library General Public License +# You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. .align 1 -.globl ___mpn_mul_1 -___mpn_mul_1: +.globl ___gmpn_mul_1 +___gmpn_mul_1: save [r3,r4,r5,r6,r7] negd 24(sp),r4 movd r4,r0 diff --git a/ghc/rts/gmp/mpn/ns32k/sub_n.s b/ghc/rts/gmp/mpn/ns32k/sub_n.s index ef6c889..cd89f4f 100644 --- a/ghc/rts/gmp/mpn/ns32k/sub_n.s +++ b/ghc/rts/gmp/mpn/ns32k/sub_n.s @@ -1,29 +1,29 @@ -# ns32000 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +# ns32000 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and # store difference in a third limb vector. -# Copyright (C) 1992, 1994 Free Software Foundation, Inc. +# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. 
# The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Library General Public License as published by -# the Free Software Foundation; either version 2 of the License, or (at your +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. -# You should have received a copy of the GNU Library General Public License +# You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. .align 1 -.globl ___mpn_sub_n -___mpn_sub_n: +.globl ___gmpn_sub_n +___gmpn_sub_n: save [r3,r4,r5] negd 28(sp),r3 movd r3,r0 diff --git a/ghc/rts/gmp/mpn/ns32k/submul_1.s b/ghc/rts/gmp/mpn/ns32k/submul_1.s index 5093095..f811aed 100644 --- a/ghc/rts/gmp/mpn/ns32k/submul_1.s +++ b/ghc/rts/gmp/mpn/ns32k/submul_1.s @@ -1,29 +1,29 @@ -# ns32000 __mpn_submul_1 -- Multiply a limb vector with a limb and subtract +# ns32000 __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract # the result from a second limb vector. -# Copyright (C) 1992, 1994 Free Software Foundation, Inc. +# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. 
# The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Library General Public License as published by -# the Free Software Foundation; either version 2 of the License, or (at your +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. -# You should have received a copy of the GNU Library General Public License +# You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. .align 1 -.globl ___mpn_submul_1 -___mpn_submul_1: +.globl ___gmpn_submul_1 +___gmpn_submul_1: save [r3,r4,r5,r6,r7] negd 24(sp),r4 movd r4,r0 diff --git a/ghc/rts/gmp/mpn/pa64/README b/ghc/rts/gmp/mpn/pa64/README new file mode 100644 index 0000000..8d2976d --- /dev/null +++ b/ghc/rts/gmp/mpn/pa64/README @@ -0,0 +1,38 @@ +This directory contains mpn functions for 64-bit PA-RISC 2.0. + +RELEVANT OPTIMIZATION ISSUES + +The PA8000 has a multi-issue pipeline with large buffers for instructions +awaiting pending results. Therefore, no latency scheduling is necessary +(and might actually be harmful). + +Two 64-bit loads can be completed per cycle. One 64-bit store can be +completed per cycle. A store cannot complete in the same cycle as a load. 
+ +STATUS + +* mpn_lshift, mpn_rshift, mpn_add_n, mpn_sub_n are all well-tuned and run at + the peak cache bandwidth; 1.5 cycles/limb for shifting and 2.0 cycles/limb + for add/subtract. + +* The multiplication functions run at 11 cycles/limb. The cache bandwidth + allows 7.5 cycles/limb. Perhaps it would be possible, using unrolling or + better scheduling, to get closer to the cache bandwidth limit. + +* xaddmul_1.S contains a quicker method for forming the 128 bit product. It + uses somewhat fewer operations, and keeps the carry flag live across the loop + boundary. But it seems hard to make it run more than 1/4 cycle faster + than the old code. Perhaps we really ought to unroll this loop by 2x? + 2x should suffice since register latency scheduling is never needed, + but the unrolling would hide the store-load latency. Here is a sketch: + + 1. A multiply and store 64-bit products + 2. B sum 64-bit products 128-bit product + 3. B load 64-bit products to integer registers + 4. B multiply and store 64-bit products + 5. A sum 64-bit products 128-bit product + 6. A load 64-bit products to integer registers + 7. goto 1 + + In practice, adjacent groups (1 and 2, 2 and 3, etc) will be interleaved + for better instruction mix. diff --git a/ghc/rts/gmp/mpn/pa64/add_n.s b/ghc/rts/gmp/mpn/pa64/add_n.s new file mode 100644 index 0000000..22ff19c --- /dev/null +++ b/ghc/rts/gmp/mpn/pa64/add_n.s @@ -0,0 +1,90 @@ +; HP-PA 2.0 __gmpn_add_n -- Add two limb vectors of the same length > 0 and +; store sum in a third limb vector. + +; Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version.
+ +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; This runs at 2 cycles/limb on PA8000. + + .level 2.0n + .code + .export __gmpn_add_n,entry +__gmpn_add_n + .proc + .callinfo frame=0,args_saved + .entry + + sub %r0,%r23,%r22 + depw,z %r22,30,3,%r28 ; r28 = 2 * (-n & 7) + depw,z %r22,28,3,%r22 ; r22 = 8 * (-n & 7) + sub %r25,%r22,%r25 ; offset s1_ptr + sub %r24,%r22,%r24 ; offset s2_ptr + sub %r26,%r22,%r26 ; offset res_ptr + blr %r28,%r0 ; branch into loop + add %r0,%r0,%r0 ; reset carry + +L$loop ldd 0(%r25),%r20 + ldd 0(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,0(%r26) +L$7 ldd 8(%r25),%r21 + ldd 8(%r24),%r19 + add,dc %r21,%r19,%r21 + std %r21,8(%r26) +L$6 ldd 16(%r25),%r20 + ldd 16(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,16(%r26) +L$5 ldd 24(%r25),%r21 + ldd 24(%r24),%r19 + add,dc %r21,%r19,%r21 + std %r21,24(%r26) +L$4 ldd 32(%r25),%r20 + ldd 32(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,32(%r26) +L$3 ldd 40(%r25),%r21 + ldd 40(%r24),%r19 + add,dc %r21,%r19,%r21 + std %r21,40(%r26) +L$2 ldd 48(%r25),%r20 + ldd 48(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,48(%r26) +L$1 ldd 56(%r25),%r21 + ldo 64(%r25),%r25 + ldd 56(%r24),%r19 + add,dc %r21,%r19,%r21 + std %r21,56(%r26) + ldo 64(%r24),%r24 + addib,> -8,%r23,L$loop + ldo 64(%r26),%r26 + + add,dc %r0,%r0,%r29 + bve (%r2) + .exit + ldi 0,%r28 + .procend diff --git a/ghc/rts/gmp/mpn/pa64/addmul_1.S b/ghc/rts/gmp/mpn/pa64/addmul_1.S new file mode 100644 index 
0000000..b1885b4 --- /dev/null +++ b/ghc/rts/gmp/mpn/pa64/addmul_1.S @@ -0,0 +1,167 @@ +; HP-PA 2.0 64-bit __gmpn_addmul_1 -- Multiply a limb vector with a limb and +; add the result to a second limb vector. + +; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +; INPUT PARAMETERS +#define rptr %r26 +#define sptr %r25 +#define size %r24 +#define s2limb -56(%r30) + +; This runs at 11 cycles/limb on a PA8000. 
It might be possible to make +; it faster, but the PA8000 pipeline is not publicly documented and it +; is very complex to reverse engineer + +#define t1 %r19 +#define rlimb %r20 +#define hi %r21 +#define lo %r22 +#define m0 %r28 +#define m1 %r3 +#define cylimb %r29 +#define t3 %r4 +#define t2 %r6 +#define t5 %r23 +#define t4 %r31 + .level 2.0n + .code + .export __gmpn_addmul_1,entry +__gmpn_addmul_1 + .proc + .callinfo frame=128,no_calls + .entry + fldd -56(%r30),%fr5 ; s2limb passed on stack + ldo 128(%r30),%r30 + add %r0,%r0,cylimb ; clear cy and cylimb + + std %r3,-96(%r30) + std %r4,-88(%r30) + std %r5,-80(%r30) + std %r6,-72(%r30) + depdi,z 1,31,1,%r5 + + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd -128(%r30),lo ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + addib,= -1,%r24,L$end1 + nop + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + addib,= -1,%r24,L$end2 + nop +L$loop + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,rlimb,rlimb + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + add t4,rlimb,t3 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc %r0,cylimb,cylimb +
fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + std t3,0(rptr) + addib,<> -1,%r24,L$loop + ldo 8(rptr),rptr +L$end2 + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,rlimb,rlimb + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + add t4,rlimb,t3 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc %r0,cylimb,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr +L$end1 + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + add cylimb,rlimb,rlimb + add,dc t2,hi,cylimb + add t4,rlimb,t3 + add,dc %r0,cylimb,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr + + ldd -96(%r30),%r3 + ldd -88(%r30),%r4 + ldd -80(%r30),%r5 + ldd -72(%r30),%r6 + + extrd,u cylimb,31,32,%r28 + bve (%r2) + .exit + ldo -128(%r30),%r30 + .procend diff --git a/ghc/rts/gmp/mpn/pa64/gmp-mparam.h b/ghc/rts/gmp/mpn/pa64/gmp-mparam.h new file mode 100644 index 0000000..847735b --- /dev/null +++ b/ghc/rts/gmp/mpn/pa64/gmp-mparam.h @@ -0,0 +1,65 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library.
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 64 +#define BYTES_PER_MP_LIMB 8 +#define BITS_PER_LONGINT 64 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +/* These values were measured in a PA8000 using the system compiler version + A.10.32.30. Presumably the PA8200 and PA8500 have the same timing + characteristic, but GCC might give somewhat different results. */ +/* Generated by tuneup.c, 2000-07-25. 
*/ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 16 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 105 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 40 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 116 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 72 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 94 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 50 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 46 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 1 +#endif diff --git a/ghc/rts/gmp/mpn/pa64/lshift.s b/ghc/rts/gmp/mpn/pa64/lshift.s new file mode 100644 index 0000000..994bc1c --- /dev/null +++ b/ghc/rts/gmp/mpn/pa64/lshift.s @@ -0,0 +1,103 @@ +; HP-PA 2.0 __gmpn_lshift -- + +; Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; size gr24 +; cnt gr23 + +; This runs at 1.5 cycles/limb on PA8000. 
+ + .level 2.0n + .code + .export __gmpn_lshift,entry +__gmpn_lshift + .proc + .callinfo frame=0,args_saved + .entry + + shladd %r24,3,%r25,%r25 + shladd %r24,3,%r26,%r26 + subi 64,%r23,%r23 + mtsar %r23 + ldd -8(%r25),%r21 + addib,= -1,%r24,L$end + shrpd %r0,%r21,%sar,%r29 ; compute carry out limb + depw,z %r24,31,3,%r28 ; r28 = (size & 7) + sub %r0,%r24,%r22 + depw,z %r22,28,3,%r22 ; r22 = 8 * (-size & 7) + add %r25,%r22,%r25 ; offset s1_ptr + blr %r28,%r0 ; branch into jump table + add %r26,%r22,%r26 ; offset res_ptr + b L$0 + nop + b L$1 + copy %r21,%r20 + b L$2 + nop + b L$3 + copy %r21,%r20 + b L$4 + nop + b L$5 + copy %r21,%r20 + b L$6 + nop + b L$7 + copy %r21,%r20 + +L$loop +L$0 ldd -16(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-8(%r26) +L$7 ldd -24(%r25),%r21 + shrpd %r20,%r21,%sar,%r20 + std %r20,-16(%r26) +L$6 ldd -32(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-24(%r26) +L$5 ldd -40(%r25),%r21 + shrpd %r20,%r21,%sar,%r20 + std %r20,-32(%r26) +L$4 ldd -48(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-40(%r26) +L$3 ldd -56(%r25),%r21 + shrpd %r20,%r21,%sar,%r20 + std %r20,-48(%r26) +L$2 ldd -64(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-56(%r26) +L$1 ldd -72(%r25),%r21 + ldo -64(%r25),%r25 + shrpd %r20,%r21,%sar,%r20 + std %r20,-64(%r26) + addib,> -8,%r24,L$loop + ldo -64(%r26),%r26 + +L$end shrpd %r21,%r0,%sar,%r21 + std %r21,-8(%r26) + bve (%r2) + .exit + extrd,u %r29,31,32,%r28 + .procend diff --git a/ghc/rts/gmp/mpn/pa64/mul_1.S b/ghc/rts/gmp/mpn/pa64/mul_1.S new file mode 100644 index 0000000..ab310c1 --- /dev/null +++ b/ghc/rts/gmp/mpn/pa64/mul_1.S @@ -0,0 +1,158 @@ +; HP-PA 2.0 64-bit __gmpn_mul_1 -- Multiply a limb vector with a limb and +; store the result in a second limb vector. + +; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. 
+ +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +; INPUT PARAMETERS +#define rptr %r26 +#define sptr %r25 +#define size %r24 +#define s2limb -56(%r30) + +; This runs at 11 cycles/limb on a PA8000. It might be possible to make +; it faster, but the PA8000 pipeline is not publicly documented and it +; is very complex to reverse engineer + +#define t1 %r19 +#define rlimb %r20 +#define hi %r21 +#define lo %r22 +#define m0 %r28 +#define m1 %r3 +#define cylimb %r29 +#define t3 %r4 +#define t2 %r6 +#define t5 %r23 +#define t4 %r31 + .level 2.0n + .code + .export __gmpn_mul_1,entry +__gmpn_mul_1 + .proc + .callinfo frame=128,no_calls + .entry + fldd -56(%r30),%fr5 ; s2limb passed on stack + ldo 128(%r30),%r30 + add %r0,%r0,cylimb ; clear cy and cylimb + + std %r3,-96(%r30) + std %r4,-88(%r30) + std %r5,-80(%r30) + std %r6,-72(%r30) + depdi,z 1,31,1,%r5 + + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd -128(%r30),lo ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi
; hi = high 64 bit of product + addib,= -1,%r24,L$end1 + nop + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + addib,= -1,%r24,L$end2 + nop +L$loop + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t3 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + std t3,0(rptr) + addib,<> -1,%r24,L$loop + ldo 8(rptr),rptr +L$end2 + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t3 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + std t3,0(rptr) + ldo 8(rptr),rptr +L$end1 + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 +=
lo32(t1) + add cylimb,t4,t3 + add,dc t2,hi,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr + + ldd -96(%r30),%r3 + ldd -88(%r30),%r4 + ldd -80(%r30),%r5 + ldd -72(%r30),%r6 + + extrd,u cylimb,31,32,%r28 + bve (%r2) + .exit + ldo -128(%r30),%r30 + .procend diff --git a/ghc/rts/gmp/mpn/pa64/rshift.s b/ghc/rts/gmp/mpn/pa64/rshift.s new file mode 100644 index 0000000..f0730e2 --- /dev/null +++ b/ghc/rts/gmp/mpn/pa64/rshift.s @@ -0,0 +1,100 @@ +; HP-PA 2.0 __gmpn_rshift -- + +; Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; size gr24 +; cnt gr23 + +; This runs at 1.5 cycles/limb on PA8000. 
+ + .level 2.0n + .code + .export __gmpn_rshift,entry +__gmpn_rshift + .proc + .callinfo frame=0,args_saved + .entry + + mtsar %r23 + ldd 0(%r25),%r21 + addib,= -1,%r24,L$end + shrpd %r21,%r0,%sar,%r29 ; compute carry out limb + depw,z %r24,31,3,%r28 ; r28 = (size & 7) + sub %r0,%r24,%r22 + depw,z %r22,28,3,%r22 ; r22 = 8 * (-size & 7) + sub %r25,%r22,%r25 ; offset s1_ptr + blr %r28,%r0 ; branch into jump table + sub %r26,%r22,%r26 ; offset res_ptr + b L$0 + nop + b L$1 + copy %r21,%r20 + b L$2 + nop + b L$3 + copy %r21,%r20 + b L$4 + nop + b L$5 + copy %r21,%r20 + b L$6 + nop + b L$7 + copy %r21,%r20 + +L$loop +L$0 ldd 8(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,0(%r26) +L$7 ldd 16(%r25),%r21 + shrpd %r21,%r20,%sar,%r20 + std %r20,8(%r26) +L$6 ldd 24(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,16(%r26) +L$5 ldd 32(%r25),%r21 + shrpd %r21,%r20,%sar,%r20 + std %r20,24(%r26) +L$4 ldd 40(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,32(%r26) +L$3 ldd 48(%r25),%r21 + shrpd %r21,%r20,%sar,%r20 + std %r20,40(%r26) +L$2 ldd 56(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,48(%r26) +L$1 ldd 64(%r25),%r21 + ldo 64(%r25),%r25 + shrpd %r21,%r20,%sar,%r20 + std %r20,56(%r26) + addib,> -8,%r24,L$loop + ldo 64(%r26),%r26 + +L$end shrpd %r0,%r21,%sar,%r21 + std %r21,0(%r26) + bve (%r2) + .exit + extrd,u %r29,31,32,%r28 + .procend diff --git a/ghc/rts/gmp/mpn/pa64/sub_n.s b/ghc/rts/gmp/mpn/pa64/sub_n.s new file mode 100644 index 0000000..dda1f54 --- /dev/null +++ b/ghc/rts/gmp/mpn/pa64/sub_n.s @@ -0,0 +1,90 @@ +; HP-PA 2.0 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 +; and store difference in a third limb vector. + +; Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. 
+ +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; This runs at 2 cycles/limb on PA8000. + + .level 2.0n + .code + .export __gmpn_sub_n,entry +__gmpn_sub_n + .proc + .callinfo frame=0,args_saved + .entry + + sub %r0,%r23,%r22 + depw,z %r22,30,3,%r28 ; r28 = 2 * (-n & 7) + depw,z %r22,28,3,%r22 ; r22 = 8 * (-n & 7) + sub %r25,%r22,%r25 ; offset s1_ptr + sub %r24,%r22,%r24 ; offset s2_ptr + blr %r28,%r0 ; branch into loop + sub %r26,%r22,%r26 ; offset res_ptr and set carry + +L$loop ldd 0(%r25),%r20 + ldd 0(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,0(%r26) +L$7 ldd 8(%r25),%r21 + ldd 8(%r24),%r19 + sub,db %r21,%r19,%r21 + std %r21,8(%r26) +L$6 ldd 16(%r25),%r20 + ldd 16(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,16(%r26) +L$5 ldd 24(%r25),%r21 + ldd 24(%r24),%r19 + sub,db %r21,%r19,%r21 + std %r21,24(%r26) +L$4 ldd 32(%r25),%r20 + ldd 32(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,32(%r26) +L$3 ldd 40(%r25),%r21 + ldd 40(%r24),%r19 + sub,db %r21,%r19,%r21 + std %r21,40(%r26) +L$2 ldd 48(%r25),%r20 + ldd 48(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,48(%r26) +L$1 ldd 56(%r25),%r21 + ldo 64(%r25),%r25 + ldd 56(%r24),%r19 + sub,db %r21,%r19,%r21 + std 
%r21,56(%r26) + ldo 64(%r24),%r24 + addib,> -8,%r23,L$loop + ldo 64(%r26),%r26 + + add,dc %r0,%r0,%r29 + subi 1,%r29,%r29 + bve (%r2) + .exit + ldi 0,%r28 + .procend diff --git a/ghc/rts/gmp/mpn/pa64/submul_1.S b/ghc/rts/gmp/mpn/pa64/submul_1.S new file mode 100644 index 0000000..27666b9 --- /dev/null +++ b/ghc/rts/gmp/mpn/pa64/submul_1.S @@ -0,0 +1,170 @@ +; HP-PA 2.0 64-bit __gmpn_submul_1 -- Multiply a limb vector with a limb and +; subtract the result from a second limb vector. + +; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +; INPUT PARAMETERS +#define rptr %r26 +#define sptr %r25 +#define size %r24 +#define s2limb -56(%r30) + +; This runs at 11 cycles/limb on a PA8000. 
It might be possible to make +; it faster, but the PA8000 pipeline is not publicly documented and it +; is very complex to reverse engineer + +#define t1 %r19 +#define rlimb %r20 +#define hi %r21 +#define lo %r22 +#define m0 %r28 +#define m1 %r3 +#define cylimb %r29 +#define t3 %r4 +#define t2 %r6 +#define t5 %r23 +#define t4 %r31 + .level 2.0n + .code + .export __gmpn_submul_1,entry +__gmpn_submul_1 + .proc + .callinfo frame=128,no_calls + .entry + fldd -56(%r30),%fr5 ; s2limb passed on stack + ldo 128(%r30),%r30 + add %r0,%r0,cylimb ; clear cy and cylimb + + std %r3,-96(%r30) + std %r4,-88(%r30) + std %r5,-80(%r30) + std %r6,-72(%r30) + depdi,z 1,31,1,%r5 + + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd -128(%r30),lo ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + addib,= -1,%r24,L$end1 + nop + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + addib,= -1,%r24,L$end2 + nop +L$loop + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t4 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + sub rlimb,t4,t3 + add t4,t3,%r0 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc
%r0,cylimb,cylimb + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + std t3,0(rptr) + addib,<> -1,%r24,L$loop + ldo 8(rptr),rptr +L$end2 + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t4 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + sub rlimb,t4,t3 + add t4,t3,%r0 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc %r0,cylimb,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr +L$end1 + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + add cylimb,t4,t4 + add,dc t2,hi,cylimb + sub rlimb,t4,t3 + add t4,t3,%r0 + add,dc %r0,cylimb,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr + + ldd -96(%r30),%r3 + ldd -88(%r30),%r4 + ldd -80(%r30),%r5 + ldd -72(%r30),%r6 + + extrd,u cylimb,31,32,%r28 + bve (%r2) + .exit + ldo -128(%r30),%r30 + .procend diff --git a/ghc/rts/gmp/mpn/pa64/udiv_qrnnd.c b/ghc/rts/gmp/mpn/pa64/udiv_qrnnd.c new file mode 100644 index 0000000..1c9fe08 --- /dev/null +++ b/ghc/rts/gmp/mpn/pa64/udiv_qrnnd.c @@ -0,0 +1,111 @@ +/* +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library.
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +#define TWO64 18446744073709551616.0 + +mp_limb_t +#if __STDC__ +__MPN(udiv_qrnnd) (mp_limb_t n1, mp_limb_t n0, mp_limb_t d, mp_limb_t *r) +#else +__MPN(udiv_qrnnd) (n1, n0, d, r) + mp_limb_t n1; + mp_limb_t n0; + mp_limb_t d; + mp_limb_t *r; +#endif +{ + mp_limb_t q1, q2, q; + mp_limb_t p1, p0; + double di, dq; + + di = 1.0 / d; + + /* Generate upper 53 bits of quotient. Be careful here; the `double' + quotient may be rounded to 2^64 which we cannot safely convert back + to a 64-bit integer. */ + dq = (TWO64 * (double) n1 + (double) n0) * di; + if (dq >= TWO64) + q1 = 0xfffffffffffff800LL; + else + q1 = (mp_limb_t) dq; + + /* Multiply back in order to compare the product to the dividend. */ + umul_ppmm (p1, p0, q1, d); + + /* Was the 53-bit quotient greater than our sought quotient? Test the + sign of the partial remainder to find out. */ + if (n1 < p1 || (n1 == p1 && n0 < p0)) + { + /* 53-bit quotient too large. Partial remainder is negative. + Compute the absolute value of the remainder in n1,,n0. */ + n1 = p1 - (n1 + (p0 < n0)); + n0 = p0 - n0; + + /* Now use the partial remainder as new dividend to compute more bits of + quotient.
This is an adjustment for the one we got previously. */ + q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di); + umul_ppmm (p1, p0, q2, d); + + q = q1 - q2; + if (n1 < p1 || (n1 == p1 && n0 <= p0)) + { + n0 = p0 - n0; + } + else + { + n0 = p0 - n0; + n0 += d; + q--; + } + } + else + { + n1 = n1 - (p1 + (n0 < p0)); + n0 = n0 - p0; + + q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di); + umul_ppmm (p1, p0, q2, d); + + q = q1 + q2; + if (n1 < p1 || (n1 == p1 && n0 < p0)) + { + n0 = n0 - p0; + n0 += d; + q--; + } + else + { + n0 = n0 - p0; + if (n0 >= d) + { + n0 -= d; + q++; + } + } + } + + *r = n0; + return q; +} diff --git a/ghc/rts/gmp/mpn/pa64/umul_ppmm.S b/ghc/rts/gmp/mpn/pa64/umul_ppmm.S new file mode 100644 index 0000000..ceff2d7 --- /dev/null +++ b/ghc/rts/gmp/mpn/pa64/umul_ppmm.S @@ -0,0 +1,74 @@ +; Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. 
+ +#define p0 %r28 +#define p1 %r29 +#define t32 %r19 +#define t0 %r20 +#define t1 %r21 +#define x %r22 +#define m0 %r23 +#define m1 %r24 + .level 2.0n + .code + .export __gmpn_umul_ppmm,entry +__gmpn_umul_ppmm + .proc + .callinfo frame=128,no_calls + .entry + ldo 128(%r30),%r30 + depd %r25,31,32,%r26 + std %r26,-64(%r30) + depd %r23,31,32,%r24 + std %r24,-56(%r30) + + ldw -180(%r30),%r31 + + fldd -64(%r30),%fr4 + fldd -56(%r30),%fr5 + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + + depdi,z 1,31,1,t32 ; t32 = 2^32 + + ldd -128(%r30),p0 ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),p1 ; hi = high 64 bit of product + + add,l,*nuv m0,m1,x ; x = m1+m0 + add,l t32,p1,p1 ; propagate carry to mid of p1 + depd,z x,31,32,t0 ; lo32(m1+m0) + add t0,p0,p0 + extrd,u x,31,32,t1 ; hi32(m1+m0) + add,dc t1,p1,p1 + + std p0,0(%r31) ; store low half of product + extrd,u p1,31,32,%r28 ; return high half of product + bve (%r2) + .exit + ldo -128(%r30),%r30 + .procend diff --git a/ghc/rts/gmp/mpn/pa64w/README b/ghc/rts/gmp/mpn/pa64w/README new file mode 100644 index 0000000..cf590a7 --- /dev/null +++ b/ghc/rts/gmp/mpn/pa64w/README @@ -0,0 +1,2 @@ +This directory contains mpn functions for 64-bit PA-RISC 2.0 +using 64-bit pointers (2.0W). diff --git a/ghc/rts/gmp/mpn/pa64w/add_n.s b/ghc/rts/gmp/mpn/pa64w/add_n.s new file mode 100644 index 0000000..1bb9e8f --- /dev/null +++ b/ghc/rts/gmp/mpn/pa64w/add_n.s @@ -0,0 +1,90 @@ +; HP-PA 2.0 __gmpn_add_n -- Add two limb vectors of the same length > 0 and +; store sum in a third limb vector. + +; Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. 
+ +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; This runs at 2 cycles/limb on PA8000. + + .level 2.0w + .code + .export __gmpn_add_n,entry +__gmpn_add_n + .proc + .callinfo frame=0,args_saved + .entry + + sub %r0,%r23,%r22 + depw,z %r22,30,3,%r28 ; r28 = 2 * (-n & 7) + depw,z %r22,28,3,%r22 ; r22 = 8 * (-n & 7) + sub %r25,%r22,%r25 ; offset s1_ptr + sub %r24,%r22,%r24 ; offset s2_ptr + sub %r26,%r22,%r26 ; offset res_ptr + blr %r28,%r0 ; branch into loop + add %r0,%r0,%r0 ; reset carry + +L$loop ldd 0(%r25),%r20 + ldd 0(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,0(%r26) +L$7 ldd 8(%r25),%r21 + ldd 8(%r24),%r19 + add,dc %r21,%r19,%r21 + std %r21,8(%r26) +L$6 ldd 16(%r25),%r20 + ldd 16(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,16(%r26) +L$5 ldd 24(%r25),%r21 + ldd 24(%r24),%r19 + add,dc %r21,%r19,%r21 + std %r21,24(%r26) +L$4 ldd 32(%r25),%r20 + ldd 32(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,32(%r26) +L$3 ldd 40(%r25),%r21 + ldd 40(%r24),%r19 + add,dc %r21,%r19,%r21 + std %r21,40(%r26) +L$2 ldd 48(%r25),%r20 + ldd 48(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,48(%r26) +L$1 ldd 56(%r25),%r21 + ldo 64(%r25),%r25 + ldd 56(%r24),%r19 + add,dc 
%r21,%r19,%r21 + std %r21,56(%r26) + ldo 64(%r24),%r24 + addib,> -8,%r23,L$loop + ldo 64(%r26),%r26 + + add,dc %r0,%r0,%r29 + bve (%r2) + .exit + copy %r29,%r28 + .procend diff --git a/ghc/rts/gmp/mpn/pa64w/addmul_1.S b/ghc/rts/gmp/mpn/pa64w/addmul_1.S new file mode 100644 index 0000000..4799f90 --- /dev/null +++ b/ghc/rts/gmp/mpn/pa64w/addmul_1.S @@ -0,0 +1,168 @@ +; HP-PA 2.0 64-bit __gmpn_addmul_1 -- Multiply a limb vector with a limb and +; add the result to a second limb vector. + +; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +; INPUT PARAMETERS +#define rptr %r26 +#define sptr %r25 +#define size %r24 +#define s2limb %r23 + +; This runs at 11 cycles/limb on a PA8000. 
It might be possible to make +; it faster, but the PA8000 pipeline is not publicly documented and it +; is very complex to reverse engineer + +#define t1 %r19 +#define rlimb %r20 +#define hi %r21 +#define lo %r22 +#define m0 %r28 +#define m1 %r3 +#define cylimb %r29 +#define t3 %r4 +#define t2 %r6 +#define t5 %r23 +#define t4 %r31 + .level 2.0w + .code + .export __gmpn_addmul_1,entry +__gmpn_addmul_1 + .proc + .callinfo frame=128,no_calls + .entry + std s2limb,-56(%r30) + fldd -56(%r30),%fr5 + ldo 128(%r30),%r30 + add %r0,%r0,cylimb ; clear cy and cylimb + + std %r3,-96(%r30) + std %r4,-88(%r30) + std %r5,-80(%r30) + std %r6,-72(%r30) + depdi,z 1,31,1,%r5 + + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd -128(%r30),lo ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + addib,= -1,%r24,L$end1 + nop + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + addib,= -1,%r24,L$end2 + nop +L$loop + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,rlimb,rlimb + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + add t4,rlimb,t3 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc %r0,cylimb,cylimb +
fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + std t3,0(rptr) + addib,<> -1,%r24,L$loop + ldo 8(rptr),rptr +L$end2 + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,rlimb,rlimb + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + add t4,rlimb,t3 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc %r0,cylimb,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr +L$end1 + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + add cylimb,rlimb,rlimb + add,dc t2,hi,cylimb + add t4,rlimb,t3 + add,dc %r0,cylimb,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr + + ldd -96(%r30),%r3 + ldd -88(%r30),%r4 + ldd -80(%r30),%r5 + ldd -72(%r30),%r6 + + copy cylimb,%r28 + bve (%r2) + .exit + ldo -128(%r30),%r30 + .procend diff --git a/ghc/rts/gmp/mpn/pa64w/gmp-mparam.h b/ghc/rts/gmp/mpn/pa64w/gmp-mparam.h new file mode 100644 index 0000000..ee5a0a3 --- /dev/null +++ b/ghc/rts/gmp/mpn/pa64w/gmp-mparam.h @@ -0,0 +1,65 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library.
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 64 +#define BYTES_PER_MP_LIMB 8 +#define BITS_PER_LONGINT 64 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +/* These values were measured on a PA8500 using the system compiler version + A.11.01.02. Presumably the PA8000 and PA8200 have the same timing + characteristics, but GCC might give somewhat different results. */ +/* Generated by tuneup.c, 2000-07-25.
*/ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 18 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 105 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 46 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 83 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 58 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 134 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 56 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 26 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 1 +#endif diff --git a/ghc/rts/gmp/mpn/pa64w/lshift.s b/ghc/rts/gmp/mpn/pa64w/lshift.s new file mode 100644 index 0000000..84f925a --- /dev/null +++ b/ghc/rts/gmp/mpn/pa64w/lshift.s @@ -0,0 +1,103 @@ +; HP-PA 2.0 __gmpn_lshift -- + +; Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; size gr24 +; cnt gr23 + +; This runs at 1.5 cycles/limb on PA8000. 
+ + .level 2.0w + .code + .export __gmpn_lshift,entry +__gmpn_lshift + .proc + .callinfo frame=0,args_saved + .entry + + shladd %r24,3,%r25,%r25 + shladd %r24,3,%r26,%r26 + subi 64,%r23,%r23 + mtsar %r23 + ldd -8(%r25),%r21 + addib,= -1,%r24,L$end + shrpd %r0,%r21,%sar,%r29 ; compute carry out limb + depw,z %r24,31,3,%r28 ; r28 = (size & 7) + sub %r0,%r24,%r22 + depw,z %r22,28,3,%r22 ; r22 = 8 * (-size & 7) + add %r25,%r22,%r25 ; offset s1_ptr + blr %r28,%r0 ; branch into jump table + add %r26,%r22,%r26 ; offset res_ptr + b L$0 + nop + b L$1 + copy %r21,%r20 + b L$2 + nop + b L$3 + copy %r21,%r20 + b L$4 + nop + b L$5 + copy %r21,%r20 + b L$6 + nop + b L$7 + copy %r21,%r20 + +L$loop +L$0 ldd -16(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-8(%r26) +L$7 ldd -24(%r25),%r21 + shrpd %r20,%r21,%sar,%r20 + std %r20,-16(%r26) +L$6 ldd -32(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-24(%r26) +L$5 ldd -40(%r25),%r21 + shrpd %r20,%r21,%sar,%r20 + std %r20,-32(%r26) +L$4 ldd -48(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-40(%r26) +L$3 ldd -56(%r25),%r21 + shrpd %r20,%r21,%sar,%r20 + std %r20,-48(%r26) +L$2 ldd -64(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-56(%r26) +L$1 ldd -72(%r25),%r21 + ldo -64(%r25),%r25 + shrpd %r20,%r21,%sar,%r20 + std %r20,-64(%r26) + addib,> -8,%r24,L$loop + ldo -64(%r26),%r26 + +L$end shrpd %r21,%r0,%sar,%r21 + std %r21,-8(%r26) + bve (%r2) + .exit + copy %r29,%r28 + .procend diff --git a/ghc/rts/gmp/mpn/pa64w/mul_1.S b/ghc/rts/gmp/mpn/pa64w/mul_1.S new file mode 100644 index 0000000..48f13fb --- /dev/null +++ b/ghc/rts/gmp/mpn/pa64w/mul_1.S @@ -0,0 +1,159 @@ +; HP-PA 2.0 64-bit __gmpn_mul_1 -- Multiply a limb vector with a limb and +; store the result in a second limb vector. + +; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. 
+ +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +; INPUT PARAMETERS +#define rptr %r26 +#define sptr %r25 +#define size %r24 +#define s2limb %r23 + +; This runs at 11 cycles/limb on a PA8000. It might be possible to make +; it faster, but the PA8000 pipeline is not publicly documented and it +; is very complex to reverse engineer + +#define t1 %r19 +#define rlimb %r20 +#define hi %r21 +#define lo %r22 +#define m0 %r28 +#define m1 %r3 +#define cylimb %r29 +#define t3 %r4 +#define t2 %r6 +#define t5 %r23 +#define t4 %r31 + .level 2.0w + .code + .export __gmpn_mul_1,entry +__gmpn_mul_1 + .proc + .callinfo frame=128,no_calls + .entry + std s2limb,-56(%r30) + fldd -56(%r30),%fr5 + ldo 128(%r30),%r30 + add %r0,%r0,cylimb ; clear cy and cylimb + + std %r3,-96(%r30) + std %r4,-88(%r30) + std %r5,-80(%r30) + std %r6,-72(%r30) + depdi,z 1,31,1,%r5 + + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd -128(%r30),lo ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi =
high 64 bit of product + addib,= -1,%r24,L$end1 + nop + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + addib,= -1,%r24,L$end2 + nop +L$loop + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t3 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + std t3,0(rptr) + addib,<> -1,%r24,L$loop + ldo 8(rptr),rptr +L$end2 + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t3 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + std t3,0(rptr) + ldo 8(rptr),rptr +L$end1 + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) +
add cylimb,t4,t3 + add,dc t2,hi,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr + + ldd -96(%r30),%r3 + ldd -88(%r30),%r4 + ldd -80(%r30),%r5 + ldd -72(%r30),%r6 + + copy cylimb,%r28 + bve (%r2) + .exit + ldo -128(%r30),%r30 + .procend diff --git a/ghc/rts/gmp/mpn/pa64w/rshift.s b/ghc/rts/gmp/mpn/pa64w/rshift.s new file mode 100644 index 0000000..2517cb1 --- /dev/null +++ b/ghc/rts/gmp/mpn/pa64w/rshift.s @@ -0,0 +1,100 @@ +; HP-PA 2.0 __gmpn_rshift -- + +; Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; size gr24 +; cnt gr23 + +; This runs at 1.5 cycles/limb on PA8000. 
+ + .level 2.0w + .code + .export __gmpn_rshift,entry +__gmpn_rshift + .proc + .callinfo frame=0,args_saved + .entry + + mtsar %r23 + ldd 0(%r25),%r21 + addib,= -1,%r24,L$end + shrpd %r21,%r0,%sar,%r29 ; compute carry out limb + depw,z %r24,31,3,%r28 ; r28 = (size & 7) + sub %r0,%r24,%r22 + depw,z %r22,28,3,%r22 ; r22 = 8 * (-size & 7) + sub %r25,%r22,%r25 ; offset s1_ptr + blr %r28,%r0 ; branch into jump table + sub %r26,%r22,%r26 ; offset res_ptr + b L$0 + nop + b L$1 + copy %r21,%r20 + b L$2 + nop + b L$3 + copy %r21,%r20 + b L$4 + nop + b L$5 + copy %r21,%r20 + b L$6 + nop + b L$7 + copy %r21,%r20 + +L$loop +L$0 ldd 8(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,0(%r26) +L$7 ldd 16(%r25),%r21 + shrpd %r21,%r20,%sar,%r20 + std %r20,8(%r26) +L$6 ldd 24(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,16(%r26) +L$5 ldd 32(%r25),%r21 + shrpd %r21,%r20,%sar,%r20 + std %r20,24(%r26) +L$4 ldd 40(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,32(%r26) +L$3 ldd 48(%r25),%r21 + shrpd %r21,%r20,%sar,%r20 + std %r20,40(%r26) +L$2 ldd 56(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,48(%r26) +L$1 ldd 64(%r25),%r21 + ldo 64(%r25),%r25 + shrpd %r21,%r20,%sar,%r20 + std %r20,56(%r26) + addib,> -8,%r24,L$loop + ldo 64(%r26),%r26 + +L$end shrpd %r0,%r21,%sar,%r21 + std %r21,0(%r26) + bve (%r2) + .exit + copy %r29,%r28 + .procend diff --git a/ghc/rts/gmp/mpn/pa64w/sub_n.s b/ghc/rts/gmp/mpn/pa64w/sub_n.s new file mode 100644 index 0000000..ad01e24 --- /dev/null +++ b/ghc/rts/gmp/mpn/pa64w/sub_n.s @@ -0,0 +1,90 @@ +; HP-PA 2.0 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 +; and store difference in a third limb vector. + +; Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. 
+ +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; This runs at 2 cycles/limb on PA8000. + + .level 2.0w + .code + .export __gmpn_sub_n,entry +__gmpn_sub_n + .proc + .callinfo frame=0,args_saved + .entry + + sub %r0,%r23,%r22 + depw,z %r22,30,3,%r28 ; r28 = 2 * (-n & 7) + depw,z %r22,28,3,%r22 ; r22 = 8 * (-n & 7) + sub %r25,%r22,%r25 ; offset s1_ptr + sub %r24,%r22,%r24 ; offset s2_ptr + blr %r28,%r0 ; branch into loop + sub %r26,%r22,%r26 ; offset res_ptr and set carry + +L$loop ldd 0(%r25),%r20 + ldd 0(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,0(%r26) +L$7 ldd 8(%r25),%r21 + ldd 8(%r24),%r19 + sub,db %r21,%r19,%r21 + std %r21,8(%r26) +L$6 ldd 16(%r25),%r20 + ldd 16(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,16(%r26) +L$5 ldd 24(%r25),%r21 + ldd 24(%r24),%r19 + sub,db %r21,%r19,%r21 + std %r21,24(%r26) +L$4 ldd 32(%r25),%r20 + ldd 32(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,32(%r26) +L$3 ldd 40(%r25),%r21 + ldd 40(%r24),%r19 + sub,db %r21,%r19,%r21 + std %r21,40(%r26) +L$2 ldd 48(%r25),%r20 + ldd 48(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,48(%r26) +L$1 ldd 56(%r25),%r21 + ldo 64(%r25),%r25 + ldd 56(%r24),%r19 + sub,db %r21,%r19,%r21 + std 
%r21,56(%r26) + ldo 64(%r24),%r24 + addib,> -8,%r23,L$loop + ldo 64(%r26),%r26 + + add,dc %r0,%r0,%r29 + subi 1,%r29,%r29 + bve (%r2) + .exit + copy %r29,%r28 + .procend diff --git a/ghc/rts/gmp/mpn/pa64w/submul_1.S b/ghc/rts/gmp/mpn/pa64w/submul_1.S new file mode 100644 index 0000000..294f623 --- /dev/null +++ b/ghc/rts/gmp/mpn/pa64w/submul_1.S @@ -0,0 +1,171 @@ +; HP-PA 2.0 64-bit __gmpn_submul_1 -- Multiply a limb vector with a limb and +; subtract the result from a second limb vector. + +; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +; INPUT PARAMETERS +#define rptr %r26 +#define sptr %r25 +#define size %r24 +#define s2limb %r23 + +; This runs at 11 cycles/limb on a PA8000. 
It might be possible to make +; it faster, but the PA8000 pipeline is not publicly documented and it +; is very complex to reverse engineer + +#define t1 %r19 +#define rlimb %r20 +#define hi %r21 +#define lo %r22 +#define m0 %r28 +#define m1 %r3 +#define cylimb %r29 +#define t3 %r4 +#define t2 %r6 +#define t5 %r23 +#define t4 %r31 + .level 2.0w + .code + .export __gmpn_submul_1,entry +__gmpn_submul_1 + .proc + .callinfo frame=128,no_calls + .entry + std s2limb,-56(%r30) + fldd -56(%r30),%fr5 + ldo 128(%r30),%r30 + add %r0,%r0,cylimb ; clear cy and cylimb + + std %r3,-96(%r30) + std %r4,-88(%r30) + std %r5,-80(%r30) + std %r6,-72(%r30) + depdi,z 1,31,1,%r5 ; %r5 = 2^32, the carry added to hi + + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd -128(%r30),lo ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + addib,= -1,%r24,L$end1 + nop + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + addib,= -1,%r24,L$end2 + nop +L$loop + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t4 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + sub rlimb,t4,t3 + add t4,t3,%r0 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc 
%r0,cylimb,cylimb + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + std t3,0(rptr) + addib,<> -1,%r24,L$loop + ldo 8(rptr),rptr +L$end2 + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t4 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + sub rlimb,t4,t3 + add t4,t3,%r0 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc %r0,cylimb,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr +L$end1 + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + add cylimb,t4,t4 + add,dc t2,hi,cylimb + sub rlimb,t4,t3 + add t4,t3,%r0 + add,dc %r0,cylimb,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr + + ldd -96(%r30),%r3 + ldd -88(%r30),%r4 + ldd -80(%r30),%r5 + ldd -72(%r30),%r6 + + copy cylimb,%r28 + bve (%r2) + .exit + ldo -128(%r30),%r30 + .procend diff --git a/ghc/rts/gmp/mpn/pa64w/udiv_qrnnd.c b/ghc/rts/gmp/mpn/pa64w/udiv_qrnnd.c new file mode 100644 index 0000000..1852913 --- /dev/null +++ b/ghc/rts/gmp/mpn/pa64w/udiv_qrnnd.c @@ -0,0 +1,117 @@ +/* +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +#define TWO64 18446744073709551616.0 +#define TWO63 9223372036854775808.0 + +mp_limb_t +#if __STDC__ +__MPN(udiv_qrnnd) (mp_limb_t n1, mp_limb_t n0, mp_limb_t d, mp_limb_t *r) +#else +__MPN(udiv_qrnnd) (n1, n0, d, r) + mp_limb_t n1; + mp_limb_t n0; + mp_limb_t d; + mp_limb_t *r; +#endif +{ + mp_limb_t q1, q2, q; + mp_limb_t p1, p0; + double di, dq; + + di = 1.0 / d; + + /* Generate upper 53 bits of quotient. Be careful here; the `double' + quotient may be rounded to 2^64 which we cannot safely convert back + to a 64-bit integer. */ + dq = (TWO64 * (double) n1 + (double) n0) * di; + if (dq >= TWO64) + q1 = 0xfffffffffffff800L; +#ifndef __GNUC__ + /* Work around HP compiler bug. */ + else if (dq > TWO63) + q1 = (mp_limb_t) (dq - TWO63) + 0x8000000000000000L; +#endif + else + q1 = (mp_limb_t) dq; + + /* Multiply back in order to compare the product to the dividend. */ + umul_ppmm (p1, p0, q1, d); + + /* Was the 53-bit quotient greater than our sought quotient? Test the + sign of the partial remainder to find out. */ + if (n1 < p1 || (n1 == p1 && n0 < p0)) + { + /* 53-bit quotient too large. Partial remainder is negative. 
+ Compute the absolute value of the remainder in n1,,n0. */ + n1 = p1 - (n1 + (p0 < n0)); + n0 = p0 - n0; + + /* Now use the partial remainder as new dividend to compute more bits of + quotient. This is an adjustment for the one we got previously. */ + q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di); + umul_ppmm (p1, p0, q2, d); + + q = q1 - q2; + if (n1 < p1 || (n1 == p1 && n0 <= p0)) + { + n0 = p0 - n0; + } + else + { + n0 = p0 - n0; + n0 += d; + q--; + } + } + else + { + n1 = n1 - (p1 + (n0 < p0)); + n0 = n0 - p0; + + q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di); + umul_ppmm (p1, p0, q2, d); + + q = q1 + q2; + if (n1 < p1 || (n1 == p1 && n0 < p0)) + { + n0 = n0 - p0; + n0 += d; + q--; + } + else + { + n0 = n0 - p0; + if (n0 >= d) + { + n0 -= d; + q++; + } + } + } + + *r = n0; + return q; +} diff --git a/ghc/rts/gmp/mpn/pa64w/umul_ppmm.S b/ghc/rts/gmp/mpn/pa64w/umul_ppmm.S new file mode 100644 index 0000000..d9fb92b --- /dev/null +++ b/ghc/rts/gmp/mpn/pa64w/umul_ppmm.S @@ -0,0 +1,72 @@ +; Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. 
+ +#define p0 %r28 +#define p1 %r29 +#define t32 %r19 +#define t0 %r20 +#define t1 %r21 +#define x %r22 +#define m0 %r23 +#define m1 %r24 + .level 2.0w + .code + .export __gmpn_umul_ppmm,entry +__gmpn_umul_ppmm + .proc + .callinfo frame=128,no_calls + .entry + ldo 128(%r30),%r30 + std %r26,-64(%r30) + std %r25,-56(%r30) + + copy %r24,%r31 + + fldd -64(%r30),%fr4 + fldd -56(%r30),%fr5 + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + + depdi,z 1,31,1,t32 ; t32 = 2^32 + + ldd -128(%r30),p0 ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),p1 ; hi = high 64 bit of product + + add,l,*nuv m0,m1,x ; x = m1+m0 + add,l t32,p1,p1 ; propagate carry to mid of p1 + depd,z x,31,32,t0 ; lo32(m1+m0) + add t0,p0,p0 + extrd,u x,31,32,t1 ; hi32(m1+m0) + add,dc t1,p1,p1 + + std p0,0(%r31) ; store low half of product + copy p1,%r28 ; return high half of product + bve (%r2) + .exit + ldo -128(%r30),%r30 + .procend diff --git a/ghc/rts/gmp/mpn/power/add_n.s b/ghc/rts/gmp/mpn/power/add_n.s index 9e1c948..0f9f48f 100644 --- a/ghc/rts/gmp/mpn/power/add_n.s +++ b/ghc/rts/gmp/mpn/power/add_n.s @@ -1,20 +1,21 @@ -# IBM POWER __mpn_add_n -- Add two limb vectors of equal, non-zero length. +# IBM POWER __gmpn_add_n -- Add two limb vectors of equal, non-zero length. -# Copyright (C) 1992, 1994, 1995, 1996 Free Software Foundation, Inc. +# Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software Foundation, +# Inc. # This file is part of the GNU MP Library. 
# The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Library General Public License as published by -# the Free Software Foundation; either version 2 of the License, or (at your +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. -# You should have received a copy of the GNU Library General Public License +# You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. @@ -27,17 +28,14 @@ # size r6 .toc - .extern __mpn_add_n[DS] - .extern .__mpn_add_n -.csect [PR] - .align 2 - .globl __mpn_add_n - .globl .__mpn_add_n - .csect __mpn_add_n[DS] -__mpn_add_n: - .long .__mpn_add_n, TOC[tc0], 0 - .csect [PR] -.__mpn_add_n: + .globl __gmpn_add_n + .globl .__gmpn_add_n + .csect __gmpn_add_n[DS] +__gmpn_add_n: + .long .__gmpn_add_n, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.__gmpn_add_n: andil. 10,6,1 # odd or even number of limbs? l 8,0(4) # load least significant s1 limb l 0,0(5) # load least significant s2 limb @@ -49,7 +47,7 @@ __mpn_add_n: # We have an odd # of limbs. Add the first limbs separately. cmpi 1,10,0 # is count for unrolled loop zero? - bne 1,L1 # branch if not + bc 4,6,L1 # bne cr1,L1 (misassembled by gas) st 7,4(3) aze 3,10 # use the fact that r10 is zero... 
br # return diff --git a/ghc/rts/gmp/mpn/power/addmul_1.s b/ghc/rts/gmp/mpn/power/addmul_1.s index 2db6984..8ecc651 100644 --- a/ghc/rts/gmp/mpn/power/addmul_1.s +++ b/ghc/rts/gmp/mpn/power/addmul_1.s @@ -1,21 +1,21 @@ -# IBM POWER __mpn_addmul_1 -- Multiply a limb vector with a limb and add +# IBM POWER __gmpn_addmul_1 -- Multiply a limb vector with a limb and add # the result to a second limb vector. -# Copyright (C) 1992, 1994 Free Software Foundation, Inc. +# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. # The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Library General Public License as published by -# the Free Software Foundation; either version 2 of the License, or (at your +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. -# You should have received a copy of the GNU Library General Public License +# You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. @@ -27,26 +27,25 @@ # size r5 # s2_limb r6 -# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction. To -# obtain that operation, we have to use the 32x32->64 signed multiplication -# instruction, and add the appropriate compensation to the high limb of the -# result. 
We add the multiplicand if the multiplier has its most significant -# bit set, and we add the multiplier if the multiplicand has its most -# significant bit set. We need to preserve the carry flag between each +# The POWER architecture has no unsigned 32x32->64 bit multiplication +# instruction. To obtain that operation, we have to use the 32x32->64 signed +# multiplication instruction, and add the appropriate compensation to the high +# limb of the result. We add the multiplicand if the multiplier has its most +# significant bit set, and we add the multiplier if the multiplicand has its +# most significant bit set. We need to preserve the carry flag between each # iteration, so we have to compute the compensation carefully (the natural, -# srai+and doesn't work). Since the POWER architecture has a branch unit -# we can branch in zero cycles, so that's how we perform the additions. +# srai+and doesn't work). Since the POWER architecture has a branch unit we +# can branch in zero cycles, so that's how we perform the additions. .toc - .csect .__mpn_addmul_1[PR] - .align 2 - .globl __mpn_addmul_1 - .globl .__mpn_addmul_1 - .csect __mpn_addmul_1[DS] -__mpn_addmul_1: - .long .__mpn_addmul_1[PR], TOC[tc0], 0 - .csect .__mpn_addmul_1[PR] -.__mpn_addmul_1: + .globl __gmpn_addmul_1 + .globl .__gmpn_addmul_1 + .csect __gmpn_addmul_1[DS] +__gmpn_addmul_1: + .long .__gmpn_addmul_1, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.__gmpn_addmul_1: cal 3,-4(3) l 0,0(4) diff --git a/ghc/rts/gmp/mpn/power/lshift.s b/ghc/rts/gmp/mpn/power/lshift.s index 38169bf..ab71fb7 100644 --- a/ghc/rts/gmp/mpn/power/lshift.s +++ b/ghc/rts/gmp/mpn/power/lshift.s @@ -1,20 +1,20 @@ -# IBM POWER __mpn_lshift -- +# IBM POWER __gmpn_lshift -- -# Copyright (C) 1992, 1994 Free Software Foundation, Inc. +# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. 
# The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Library General Public License as published by -# the Free Software Foundation; either version 2 of the License, or (at your +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. -# You should have received a copy of the GNU Library General Public License +# You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. @@ -27,17 +27,14 @@ # cnt r6 .toc - .extern __mpn_lshift[DS] - .extern .__mpn_lshift -.csect [PR] - .align 2 - .globl __mpn_lshift - .globl .__mpn_lshift - .csect __mpn_lshift[DS] -__mpn_lshift: - .long .__mpn_lshift, TOC[tc0], 0 - .csect [PR] -.__mpn_lshift: + .globl __gmpn_lshift + .globl .__gmpn_lshift + .csect __gmpn_lshift[DS] +__gmpn_lshift: + .long .__gmpn_lshift, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.__gmpn_lshift: sli 0,5,2 cax 9,3,0 cax 4,4,0 diff --git a/ghc/rts/gmp/mpn/power/mul_1.s b/ghc/rts/gmp/mpn/power/mul_1.s index a72bce6..4e08ade 100644 --- a/ghc/rts/gmp/mpn/power/mul_1.s +++ b/ghc/rts/gmp/mpn/power/mul_1.s @@ -1,21 +1,21 @@ -# IBM POWER __mpn_mul_1 -- Multiply a limb vector with a limb and store +# IBM POWER __gmpn_mul_1 -- Multiply a limb vector with a limb and store # the result in a second limb vector. -# Copyright (C) 1992, 1994 Free Software Foundation, Inc. 
+# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. # The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Library General Public License as published by -# the Free Software Foundation; either version 2 of the License, or (at your +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. -# You should have received a copy of the GNU Library General Public License +# You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. @@ -27,26 +27,25 @@ # size r5 # s2_limb r6 -# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction. To -# obtain that operation, we have to use the 32x32->64 signed multiplication -# instruction, and add the appropriate compensation to the high limb of the -# result. We add the multiplicand if the multiplier has its most significant -# bit set, and we add the multiplier if the multiplicand has its most -# significant bit set. We need to preserve the carry flag between each +# The POWER architecture has no unsigned 32x32->64 bit multiplication +# instruction. To obtain that operation, we have to use the 32x32->64 signed +# multiplication instruction, and add the appropriate compensation to the high +# limb of the result. 
We add the multiplicand if the multiplier has its most +# significant bit set, and we add the multiplier if the multiplicand has its +# most significant bit set. We need to preserve the carry flag between each # iteration, so we have to compute the compensation carefully (the natural, -# srai+and doesn't work). Since the POWER architecture has a branch unit -# we can branch in zero cycles, so that's how we perform the additions. +# srai+and doesn't work). Since the POWER architecture has a branch unit we +# can branch in zero cycles, so that's how we perform the additions. .toc - .csect .__mpn_mul_1[PR] - .align 2 - .globl __mpn_mul_1 - .globl .__mpn_mul_1 - .csect __mpn_mul_1[DS] -__mpn_mul_1: - .long .__mpn_mul_1[PR], TOC[tc0], 0 - .csect .__mpn_mul_1[PR] -.__mpn_mul_1: + .globl __gmpn_mul_1 + .globl .__gmpn_mul_1 + .csect __gmpn_mul_1[DS] +__gmpn_mul_1: + .long .__gmpn_mul_1, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.__gmpn_mul_1: cal 3,-4(3) l 0,0(4) diff --git a/ghc/rts/gmp/mpn/power/rshift.s b/ghc/rts/gmp/mpn/power/rshift.s index 30d408a..65b3945 100644 --- a/ghc/rts/gmp/mpn/power/rshift.s +++ b/ghc/rts/gmp/mpn/power/rshift.s @@ -1,20 +1,20 @@ -# IBM POWER __mpn_rshift -- +# IBM POWER __gmpn_rshift -- -# Copyright (C) 1992, 1994 Free Software Foundation, Inc. +# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. # The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Library General Public License as published by -# the Free Software Foundation; either version 2 of the License, or (at your +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. 
# The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. -# You should have received a copy of the GNU Library General Public License +# You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. @@ -27,17 +27,14 @@ # cnt r6 .toc - .extern __mpn_rshift[DS] - .extern .__mpn_rshift -.csect [PR] - .align 2 - .globl __mpn_rshift - .globl .__mpn_rshift - .csect __mpn_rshift[DS] -__mpn_rshift: - .long .__mpn_rshift, TOC[tc0], 0 - .csect [PR] -.__mpn_rshift: + .globl __gmpn_rshift + .globl .__gmpn_rshift + .csect __gmpn_rshift[DS] +__gmpn_rshift: + .long .__gmpn_rshift, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.__gmpn_rshift: sfi 8,6,32 mtctr 5 # put limb count in CTR loop register l 0,0(4) # read least significant limb diff --git a/ghc/rts/gmp/mpn/power/sdiv.s b/ghc/rts/gmp/mpn/power/sdiv.s new file mode 100644 index 0000000..81da622 --- /dev/null +++ b/ghc/rts/gmp/mpn/power/sdiv.s @@ -0,0 +1,34 @@ +# Copyright (C) 1999 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. 
+ +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + .toc + .globl __sdiv_qrnnd + .globl .__sdiv_qrnnd + .csect __sdiv_qrnnd[DS] +__sdiv_qrnnd: + .long .__sdiv_qrnnd, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.__sdiv_qrnnd: + mtmq 5 + div 0,4,6 + mfmq 9 + st 9,0(3) + mr 3,0 + br diff --git a/ghc/rts/gmp/mpn/power/sub_n.s b/ghc/rts/gmp/mpn/power/sub_n.s index 30d4fee..aa09cf5 100644 --- a/ghc/rts/gmp/mpn/power/sub_n.s +++ b/ghc/rts/gmp/mpn/power/sub_n.s @@ -1,20 +1,21 @@ -# IBM POWER __mpn_sub_n -- Subtract two limb vectors of equal, non-zero length. +# IBM POWER __gmpn_sub_n -- Subtract two limb vectors of equal, non-zero length. -# Copyright (C) 1992, 1994, 1995, 1996 Free Software Foundation, Inc. +# Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software Foundation, +# Inc. # This file is part of the GNU MP Library. # The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Library General Public License as published by -# the Free Software Foundation; either version 2 of the License, or (at your +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. 
-# You should have received a copy of the GNU Library General Public License +# You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. @@ -27,17 +28,14 @@ # size r6 .toc - .extern __mpn_sub_n[DS] - .extern .__mpn_sub_n -.csect [PR] - .align 2 - .globl __mpn_sub_n - .globl .__mpn_sub_n - .csect __mpn_sub_n[DS] -__mpn_sub_n: - .long .__mpn_sub_n, TOC[tc0], 0 - .csect [PR] -.__mpn_sub_n: + .globl __gmpn_sub_n + .globl .__gmpn_sub_n + .csect __gmpn_sub_n[DS] +__gmpn_sub_n: + .long .__gmpn_sub_n, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.__gmpn_sub_n: andil. 10,6,1 # odd or even number of limbs? l 8,0(4) # load least significant s1 limb l 0,0(5) # load least significant s2 limb @@ -49,7 +47,7 @@ __mpn_sub_n: # We have an odd # of limbs. Add the first limbs separately. cmpi 1,10,0 # is count for unrolled loop zero? - bne 1,L1 # branch if not + bc 4,6,L1 # bne cr1,L1 (misassembled by gas) st 7,4(3) sfe 3,0,0 # load !cy into ... sfi 3,3,0 # ... return value register diff --git a/ghc/rts/gmp/mpn/power/submul_1.s b/ghc/rts/gmp/mpn/power/submul_1.s index 8e5946f..bc01b7c 100644 --- a/ghc/rts/gmp/mpn/power/submul_1.s +++ b/ghc/rts/gmp/mpn/power/submul_1.s @@ -1,21 +1,21 @@ -# IBM POWER __mpn_submul_1 -- Multiply a limb vector with a limb and subtract +# IBM POWER __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract # the result from a second limb vector. -# Copyright (C) 1992, 1994 Free Software Foundation, Inc. +# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. 
# The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Library General Public License as published by -# the Free Software Foundation; either version 2 of the License, or (at your +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. -# You should have received a copy of the GNU Library General Public License +# You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. @@ -27,26 +27,25 @@ # size r5 # s2_limb r6 -# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction. To -# obtain that operation, we have to use the 32x32->64 signed multiplication -# instruction, and add the appropriate compensation to the high limb of the -# result. We add the multiplicand if the multiplier has its most significant -# bit set, and we add the multiplier if the multiplicand has its most -# significant bit set. We need to preserve the carry flag between each +# The POWER architecture has no unsigned 32x32->64 bit multiplication +# instruction. To obtain that operation, we have to use the 32x32->64 signed +# multiplication instruction, and add the appropriate compensation to the high +# limb of the result. We add the multiplicand if the multiplier has its most +# significant bit set, and we add the multiplier if the multiplicand has its +# most significant bit set. 
We need to preserve the carry flag between each # iteration, so we have to compute the compensation carefully (the natural, -# srai+and doesn't work). Since the POWER architecture has a branch unit -# we can branch in zero cycles, so that's how we perform the additions. +# srai+and doesn't work). Since the POWER architecture has a branch unit we +# can branch in zero cycles, so that's how we perform the additions. .toc - .csect .__mpn_submul_1[PR] - .align 2 - .globl __mpn_submul_1 - .globl .__mpn_submul_1 - .csect __mpn_submul_1[DS] -__mpn_submul_1: - .long .__mpn_submul_1[PR], TOC[tc0], 0 - .csect .__mpn_submul_1[PR] -.__mpn_submul_1: + .globl __gmpn_submul_1 + .globl .__gmpn_submul_1 + .csect __gmpn_submul_1[DS] +__gmpn_submul_1: + .long .__gmpn_submul_1, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.__gmpn_submul_1: cal 3,-4(3) l 0,0(4) diff --git a/ghc/rts/gmp/mpn/power/umul.s b/ghc/rts/gmp/mpn/power/umul.s new file mode 100644 index 0000000..8c77496 --- /dev/null +++ b/ghc/rts/gmp/mpn/power/umul.s @@ -0,0 +1,38 @@ +# Copyright (C) 1999 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. 
+ + .toc + .globl __umul_ppmm + .globl .__umul_ppmm + .csect __umul_ppmm[DS] +__umul_ppmm: + .long .__umul_ppmm, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.__umul_ppmm: + mul 9,4,5 # signed 32x32->64 multiply: high word to r9, low word to MQ + srai 0,4,31 # r0 = all ones if r4 has its top bit set, else 0 + and 0,0,5 # r0 = r5 if r4 has its top bit set, else 0 + srai 5,5,31 # r5 = all ones if r5 has its top bit set, else 0 + and 5,5,4 # r5 = r4 if the old r5 had its top bit set, else 0 + cax 0,0,5 # r0 = sum of both compensation terms + mfmq 11 # r11 = low word of the product + st 11,0(3) # store low word at the address in r3 + cax 3,9,0 # return high word with compensation added + br diff --git a/ghc/rts/gmp/mpn/powerpc32/add_n.asm b/ghc/rts/gmp/mpn/powerpc32/add_n.asm new file mode 100644 index 0000000..81ed04b --- /dev/null +++ b/ghc/rts/gmp/mpn/powerpc32/add_n.asm @@ -0,0 +1,61 @@ +dnl PowerPC-32 mpn_add_n -- Add two limb vectors of the same length > 0 and +dnl store sum in a third limb vector. + +dnl Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA.
+ + +dnl INPUT PARAMETERS +dnl res_ptr r3 +dnl s1_ptr r4 +dnl s2_ptr r5 +dnl size r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_add_n) + mtctr r6 C copy size into CTR + addic r0,r0,0 C clear cy + lwz r8,0(r4) C load least significant s1 limb + lwz r0,0(r5) C load least significant s2 limb + addi r3,r3,-4 C offset res_ptr, it's updated before it's used + bdz .Lend C If done, skip loop +.Loop: lwz r9,4(r4) C load s1 limb + lwz r10,4(r5) C load s2 limb + adde r7,r0,r8 C add limbs with cy, set cy + stw r7,4(r3) C store result limb + bdz .Lexit C decrement CTR and exit if done + lwzu r8,8(r4) C load s1 limb and update s1_ptr + lwzu r0,8(r5) C load s2 limb and update s2_ptr + adde r7,r10,r9 C add limbs with cy, set cy + stwu r7,8(r3) C store result limb and update res_ptr + bdnz .Loop C decrement CTR and loop back + +.Lend: adde r7,r0,r8 + stw r7,4(r3) C store ultimate result limb + li r3,0 C load cy into ... + addze r3,r3 C ... return value register + blr +.Lexit: adde r7,r10,r9 + stw r7,8(r3) + li r3,0 C load cy into ... + addze r3,r3 C ... return value register + blr +EPILOGUE(mpn_add_n) diff --git a/ghc/rts/gmp/mpn/powerpc32/addmul_1.asm b/ghc/rts/gmp/mpn/powerpc32/addmul_1.asm new file mode 100644 index 0000000..3ef75b1 --- /dev/null +++ b/ghc/rts/gmp/mpn/powerpc32/addmul_1.asm @@ -0,0 +1,124 @@ +dnl PowerPC-32 mpn_addmul_1 -- Multiply a limb vector with a limb and add +dnl the result to a second limb vector. + +dnl Copyright (C) 1995, 1997, 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. 
+ +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +dnl INPUT PARAMETERS +dnl res_ptr r3 +dnl s1_ptr r4 +dnl size r5 +dnl s2_limb r6 + +dnl This is optimized for the PPC604. It has not been tested on PPC601, PPC603 +dnl or PPC750 since I don't have access to any such machines. + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_addmul_1) + cmpi cr0,r5,9 C more than 9 limbs? + bgt cr0,.Lbig C branch if more than 9 limbs + + mtctr r5 + lwz r0,0(r4) + mullw r7,r0,r6 + mulhwu r10,r0,r6 + lwz r9,0(r3) + addc r8,r7,r9 + addi r3,r3,-4 + bdz .Lend +.Lloop: + lwzu r0,4(r4) + stwu r8,4(r3) + mullw r8,r0,r6 + adde r7,r8,r10 + mulhwu r10,r0,r6 + lwz r9,4(r3) + addze r10,r10 + addc r8,r7,r9 + bdnz .Lloop +.Lend: stw r8,4(r3) + addze r3,r10 + blr + +.Lbig: stmw r30,-32(r1) + addi r5,r5,-1 + srwi r0,r5,2 + mtctr r0 + + lwz r7,0(r4) + mullw r8,r7,r6 + mulhwu r0,r7,r6 + lwz r7,0(r3) + addc r8,r8,r7 + stw r8,0(r3) + +.LloopU: + lwz r7,4(r4) + lwz r12,8(r4) + lwz r30,12(r4) + lwzu r31,16(r4) + mullw r8,r7,r6 + mullw r9,r12,r6 + mullw r10,r30,r6 + mullw r11,r31,r6 + adde r8,r8,r0 C add cy_limb + mulhwu r0,r7,r6 + lwz r7,4(r3) + adde r9,r9,r0 + mulhwu r0,r12,r6 + lwz r12,8(r3) + adde r10,r10,r0 + mulhwu r0,r30,r6 + lwz r30,12(r3) + adde r11,r11,r0 + mulhwu r0,r31,r6 + lwz r31,16(r3) + addze r0,r0 C new cy_limb + addc r8,r8,r7 + stw r8,4(r3) + adde r9,r9,r12 + stw r9,8(r3) + adde r10,r10,r30 + stw r10,12(r3) + adde r11,r11,r31 + stwu r11,16(r3) + bdnz .LloopU + + andi. 
r31,r5,3 + mtctr r31 + beq cr0,.Lendx + +.LloopE: + lwzu r7,4(r4) + mullw r8,r7,r6 + adde r8,r8,r0 C add cy_limb + mulhwu r0,r7,r6 + lwz r7,4(r3) + addze r0,r0 C new cy_limb + addc r8,r8,r7 + stwu r8,4(r3) + bdnz .LloopE +.Lendx: + addze r3,r0 + lmw r30,-32(r1) + blr +EPILOGUE(mpn_addmul_1) diff --git a/ghc/rts/gmp/mpn/powerpc32/aix.m4 b/ghc/rts/gmp/mpn/powerpc32/aix.m4 new file mode 100644 index 0000000..2bd8425 --- /dev/null +++ b/ghc/rts/gmp/mpn/powerpc32/aix.m4 @@ -0,0 +1,39 @@ +divert(-1) +dnl m4 macros for AIX 32-bit assembly. + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + +define(`ASM_START', + `.toc') + +define(`PROLOGUE', + ` + .globl $1 + .globl .$1 + .csect $1[DS],2 +$1: + .long .$1, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.$1:') + +define(`EPILOGUE', `') + +divert diff --git a/ghc/rts/gmp/mpn/powerpc32/gmp-mparam.h b/ghc/rts/gmp/mpn/powerpc32/gmp-mparam.h new file mode 100644 index 0000000..b283185 --- /dev/null +++ b/ghc/rts/gmp/mpn/powerpc32/gmp-mparam.h @@ -0,0 +1,66 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. 
+ +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +/* These values are for the 604. Presumably, these should be considerably + different for the 603 and 750 that have much slower multiply + instructions. */ + +/* Generated by tuneup.c, 2000-05-26. 
*/ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 26 /* tuneup says 20 */ +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 228 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 46 /* tuneup says 44 */ +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 262 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 52 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 86 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 23 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 7 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 53 +#endif diff --git a/ghc/rts/gmp/mpn/powerpc32/lshift.asm b/ghc/rts/gmp/mpn/powerpc32/lshift.asm new file mode 100644 index 0000000..73a8543 --- /dev/null +++ b/ghc/rts/gmp/mpn/powerpc32/lshift.asm @@ -0,0 +1,145 @@ +dnl PowerPC-32 mpn_lshift -- Shift a number left. + +dnl Copyright (C) 1995, 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
+ + +dnl INPUT PARAMETERS +dnl res_ptr r3 +dnl s1_ptr r4 +dnl size r5 +dnl cnt r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_lshift) + cmpi cr0,r5,12 C more than 12 limbs? + slwi r0,r5,2 + add r4,r4,r0 C make r4 point at end of s1 + add r7,r3,r0 C make r7 point at end of res + bgt .LBIG C branch if more than 12 limbs + + mtctr r5 C copy size into CTR + subfic r8,r6,32 + lwzu r11,-4(r4) C load first s1 limb + srw r3,r11,r8 C compute function return value + bdz .Lend1 + +.Loop: lwzu r10,-4(r4) + slw r9,r11,r6 + srw r12,r10,r8 + or r9,r9,r12 + stwu r9,-4(r7) + bdz .Lend2 + lwzu r11,-4(r4) + slw r9,r10,r6 + srw r12,r11,r8 + or r9,r9,r12 + stwu r9,-4(r7) + bdnz .Loop + +.Lend1: slw r0,r11,r6 + stw r0,-4(r7) + blr +.Lend2: slw r0,r10,r6 + stw r0,-4(r7) + blr + +.LBIG: + stmw r24,-32(r1) C save registers we are supposed to preserve + lwzu r9,-4(r4) + subfic r8,r6,32 + srw r3,r9,r8 C compute function return value + slw r0,r9,r6 + addi r5,r5,-1 + + andi. r10,r5,3 C count for spill loop + beq .Le + mtctr r10 + lwzu r28,-4(r4) + bdz .Lxe0 + +.Loop0: slw r12,r28,r6 + srw r24,r28,r8 + lwzu r28,-4(r4) + or r24,r0,r24 + stwu r24,-4(r7) + mr r0,r12 + bdnz .Loop0 C taken at most once! 
+ +.Lxe0: slw r12,r28,r6 + srw r24,r28,r8 + or r24,r0,r24 + stwu r24,-4(r7) + mr r0,r12 + +.Le: srwi r5,r5,2 C count for unrolled loop + addi r5,r5,-1 + mtctr r5 + lwz r28,-4(r4) + lwz r29,-8(r4) + lwz r30,-12(r4) + lwzu r31,-16(r4) + +.LoopU: slw r9,r28,r6 + srw r24,r28,r8 + lwz r28,-4(r4) + slw r10,r29,r6 + srw r25,r29,r8 + lwz r29,-8(r4) + slw r11,r30,r6 + srw r26,r30,r8 + lwz r30,-12(r4) + slw r12,r31,r6 + srw r27,r31,r8 + lwzu r31,-16(r4) + or r24,r0,r24 + stw r24,-4(r7) + or r25,r9,r25 + stw r25,-8(r7) + or r26,r10,r26 + stw r26,-12(r7) + or r27,r11,r27 + stwu r27,-16(r7) + mr r0,r12 + bdnz .LoopU + + slw r9,r28,r6 + srw r24,r28,r8 + slw r10,r29,r6 + srw r25,r29,r8 + slw r11,r30,r6 + srw r26,r30,r8 + slw r12,r31,r6 + srw r27,r31,r8 + or r24,r0,r24 + stw r24,-4(r7) + or r25,r9,r25 + stw r25,-8(r7) + or r26,r10,r26 + stw r26,-12(r7) + or r27,r11,r27 + stwu r27,-16(r7) + mr r0,r12 + + stw r0,-4(r7) + lmw r24,-32(r1) C restore registers + blr +EPILOGUE(mpn_lshift) diff --git a/ghc/rts/gmp/mpn/powerpc32/mul_1.asm b/ghc/rts/gmp/mpn/powerpc32/mul_1.asm new file mode 100644 index 0000000..ec878b5 --- /dev/null +++ b/ghc/rts/gmp/mpn/powerpc32/mul_1.asm @@ -0,0 +1,86 @@ +dnl PowerPC-32 mpn_mul_1 -- Multiply a limb vector with a limb and store +dnl the result in a second limb vector. + +dnl Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. 
+ +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +dnl INPUT PARAMETERS +dnl res_ptr r3 +dnl s1_ptr r4 +dnl size r5 +dnl s2_limb r6 + +dnl This is optimized for the PPC604 but it runs decently even on PPC601. It +dnl has not been tested on a PPC603 since I don't have access to any such +dnl machines. + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_mul_1) + mtctr r5 + addi r3,r3,-4 C adjust res_ptr, it's offset before it's used + li r12,0 C clear upper product reg + addic r0,r0,0 C clear cy +C Start software pipeline + lwz r8,0(r4) + bdz .Lend3 + stmw r30,-8(r1) C save registers we are supposed to preserve + lwzu r9,4(r4) + mullw r11,r8,r6 + mulhwu r0,r8,r6 + bdz .Lend1 +C Software pipelined main loop +.Loop: lwz r8,4(r4) + mullw r10,r9,r6 + adde r30,r11,r12 + mulhwu r12,r9,r6 + stw r30,4(r3) + bdz .Lend2 + lwzu r9,8(r4) + mullw r11,r8,r6 + adde r31,r10,r0 + mulhwu r0,r8,r6 + stwu r31,8(r3) + bdnz .Loop +C Finish software pipeline +.Lend1: mullw r10,r9,r6 + adde r30,r11,r12 + mulhwu r12,r9,r6 + stw r30,4(r3) + adde r31,r10,r0 + stwu r31,8(r3) + addze r3,r12 + lmw r30,-8(r1) C restore registers from stack + blr +.Lend2: mullw r11,r8,r6 + adde r31,r10,r0 + mulhwu r0,r8,r6 + stwu r31,8(r3) + adde r30,r11,r12 + stw r30,4(r3) + addze r3,r0 + lmw r30,-8(r1) C restore registers from stack + blr +.Lend3: mullw r11,r8,r6 + stw r11,4(r3) + mulhwu r3,r8,r6 + blr +EPILOGUE(mpn_mul_1) diff --git a/ghc/rts/gmp/mpn/powerpc32/regmap.m4 b/ghc/rts/gmp/mpn/powerpc32/regmap.m4 new file mode 100644 index 0000000..978f189 --- /dev/null +++ b/ghc/rts/gmp/mpn/powerpc32/regmap.m4 @@ -0,0 +1,34 @@ +divert(-1) + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +dnl Map register names r0, r1, etc, to just `0', `1', etc. +dnl This is needed on all systems but NeXT, Rhapsody, and MacOS-X +forloop(i,0,31, +`define(`r'i,i)' +) + +dnl Likewise for cr0, cr1, etc. +forloop(i,0,7, +`define(`cr'i,i)' +) + +divert diff --git a/ghc/rts/gmp/mpn/powerpc32/rshift.asm b/ghc/rts/gmp/mpn/powerpc32/rshift.asm new file mode 100644 index 0000000..a09ba04 --- /dev/null +++ b/ghc/rts/gmp/mpn/powerpc32/rshift.asm @@ -0,0 +1,60 @@ +dnl PowerPC-32 mpn_rshift -- Shift a number right. + +dnl Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. 
+ +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +dnl INPUT PARAMETERS +dnl res_ptr r3 +dnl s1_ptr r4 +dnl size r5 +dnl cnt r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_rshift) + mtctr r5 C copy size into CTR + addi r7,r3,-4 C move adjusted res_ptr to free return reg + subfic r8,r6,32 C r8 = 32 - cnt, the complementary shift count + lwz r11,0(r4) C load first s1 limb + slw r3,r11,r8 C compute function return value + bdz .Lend1 C decrement CTR, only one limb so skip loop + +.Loop: lwzu r10,4(r4) C load next s1 limb and update s1_ptr + srw r9,r11,r6 C current limb shifted right by cnt + slw r12,r10,r8 C next limb shifted left by 32 - cnt + or r9,r9,r12 C combine both parts into the result limb + stwu r9,4(r7) C store result limb and update res_ptr + bdz .Lend2 C decrement CTR and exit if done + lwzu r11,4(r4) C load next s1 limb and update s1_ptr + srw r9,r10,r6 C current limb shifted right by cnt + slw r12,r11,r8 C next limb shifted left by 32 - cnt + or r9,r9,r12 C combine both parts into the result limb + stwu r9,4(r7) C store result limb and update res_ptr + bdnz .Loop C decrement CTR and loop back + +.Lend1: srw r0,r11,r6 C last limb was loaded into r11 + stw r0,4(r7) C store most significant result limb + blr + +.Lend2: srw r0,r10,r6 C last limb was loaded into r10 + stw r0,4(r7) C store most significant result limb + blr +EPILOGUE(mpn_rshift) diff --git a/ghc/rts/gmp/mpn/powerpc32/sub_n.asm b/ghc/rts/gmp/mpn/powerpc32/sub_n.asm new file mode 100644 index 0000000..b04b419 --- /dev/null +++ b/ghc/rts/gmp/mpn/powerpc32/sub_n.asm @@ -0,0 +1,61 @@ +dnl PowerPC-32 mpn_sub_n -- Subtract two limb vectors of the same length > 0 +dnl and store difference in a third limb vector. + +dnl Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details.
+ +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +dnl INPUT PARAMETERS +dnl res_ptr r3 +dnl s1_ptr r4 +dnl s2_ptr r5 +dnl size r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_sub_n) + mtctr r6 C copy size into CTR + addic r0,r6,-1 C set cy + lwz r8,0(r4) C load least significant s1 limb + lwz r0,0(r5) C load least significant s2 limb + addi r3,r3,-4 C offset res_ptr, it's updated before it's used + bdz .Lend C If done, skip loop +.Loop: lwz r9,4(r4) C load s1 limb + lwz r10,4(r5) C load s2 limb + subfe r7,r0,r8 C subtract limbs with cy, set cy + stw r7,4(r3) C store result limb + bdz .Lexit C decrement CTR and exit if done + lwzu r8,8(r4) C load s1 limb and update s1_ptr + lwzu r0,8(r5) C load s2 limb and update s2_ptr + subfe r7,r10,r9 C subtract limbs with cy, set cy + stwu r7,8(r3) C store result limb and update res_ptr + bdnz .Loop C decrement CTR and loop back + +.Lend: subfe r7,r0,r8 + stw r7,4(r3) C store ultimate result limb + subfe r3,r0,r0 C load !cy into ... + subfic r3,r3,0 C ... return value register + blr +.Lexit: subfe r7,r10,r9 + stw r7,8(r3) + subfe r3,r0,r0 C load !cy into ... + subfic r3,r3,0 C ... return value register + blr +EPILOGUE(mpn_sub_n) diff --git a/ghc/rts/gmp/mpn/powerpc32/submul_1.asm b/ghc/rts/gmp/mpn/powerpc32/submul_1.asm new file mode 100644 index 0000000..a129e9f --- /dev/null +++ b/ghc/rts/gmp/mpn/powerpc32/submul_1.asm @@ -0,0 +1,130 @@ +dnl PowerPC-32 mpn_submul_1 -- Multiply a limb vector with a limb and subtract +dnl the result from a second limb vector. + +dnl Copyright (C) 1995, 1997, 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+ +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +dnl INPUT PARAMETERS +dnl res_ptr r3 +dnl s1_ptr r4 +dnl size r5 +dnl s2_limb r6 + +dnl This is optimized for the PPC604. It has not been tested on PPC601, PPC603 +dnl or PPC750 since I don't have access to any such machines. + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_submul_1) + cmpi cr0,r5,9 C more than 9 limbs? 
+ bgt cr0,.Lbig C branch if more than 9 limbs + + mtctr r5 + lwz r0,0(r4) + mullw r7,r0,r6 + mulhwu r10,r0,r6 + lwz r9,0(r3) + subfc r8,r7,r9 + addc r7,r7,r8 C invert cy (r7 is junk) + addi r3,r3,-4 + bdz .Lend +.Lloop: + lwzu r0,4(r4) + stwu r8,4(r3) + mullw r8,r0,r6 + adde r7,r8,r10 + mulhwu r10,r0,r6 + lwz r9,4(r3) + addze r10,r10 + subfc r8,r7,r9 + addc r7,r7,r8 C invert cy (r7 is junk) + bdnz .Lloop +.Lend: stw r8,4(r3) + addze r3,r10 + blr + +.Lbig: stmw r30,-32(r1) + addi r5,r5,-1 + srwi r0,r5,2 + mtctr r0 + + lwz r7,0(r4) + mullw r8,r7,r6 + mulhwu r0,r7,r6 + lwz r7,0(r3) + subfc r7,r8,r7 + addc r8,r8,r7 + stw r7,0(r3) + +.LloopU: + lwz r7,4(r4) + lwz r12,8(r4) + lwz r30,12(r4) + lwzu r31,16(r4) + mullw r8,r7,r6 + mullw r9,r12,r6 + mullw r10,r30,r6 + mullw r11,r31,r6 + adde r8,r8,r0 C add cy_limb + mulhwu r0,r7,r6 + lwz r7,4(r3) + adde r9,r9,r0 + mulhwu r0,r12,r6 + lwz r12,8(r3) + adde r10,r10,r0 + mulhwu r0,r30,r6 + lwz r30,12(r3) + adde r11,r11,r0 + mulhwu r0,r31,r6 + lwz r31,16(r3) + addze r0,r0 C new cy_limb + subfc r7,r8,r7 + stw r7,4(r3) + subfe r12,r9,r12 + stw r12,8(r3) + subfe r30,r10,r30 + stw r30,12(r3) + subfe r31,r11,r31 + stwu r31,16(r3) + subfe r11,r11,r11 C invert ... + addic r11,r11,1 C ... carry + bdnz .LloopU + + andi. r31,r5,3 + mtctr r31 + beq cr0,.Lendx + +.LloopE: + lwzu r7,4(r4) + mullw r8,r7,r6 + adde r8,r8,r0 C add cy_limb + mulhwu r0,r7,r6 + lwz r7,4(r3) + addze r0,r0 C new cy_limb + subfc r7,r8,r7 + addc r8,r8,r7 + stwu r7,4(r3) + bdnz .LloopE +.Lendx: + addze r3,r0 + lmw r30,-32(r1) + blr +EPILOGUE(mpn_submul_1) diff --git a/ghc/rts/gmp/mpn/powerpc32/umul.asm b/ghc/rts/gmp/mpn/powerpc32/umul.asm new file mode 100644 index 0000000..eeaa0a4 --- /dev/null +++ b/ghc/rts/gmp/mpn/powerpc32/umul.asm @@ -0,0 +1,32 @@ +dnl PowerPC-32 umul_ppmm -- support for longlong.h + +dnl Copyright (C) 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+ +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_umul_ppmm) + mullw r0,r4,r5 C low 32 bits of the product of r4 and r5 + mulhwu r9,r4,r5 C high 32 bits of the unsigned product + stw r0,0(r3) C store low word at the address in r3 + mr r3,r9 C return high word + blr +EPILOGUE(mpn_umul_ppmm) diff --git a/ghc/rts/gmp/mpn/powerpc64/README b/ghc/rts/gmp/mpn/powerpc64/README new file mode 100644 index 0000000..c779276 --- /dev/null +++ b/ghc/rts/gmp/mpn/powerpc64/README @@ -0,0 +1,36 @@ +PPC630 (aka Power3) pipeline information: + +Decoding is 4-way and issue is 8-way with some out-of-order capability. +LS1 - ld/st unit 1 +LS2 - ld/st unit 2 +FXU1 - integer unit 1, handles any simple integer instructions +FXU2 - integer unit 2, handles any simple integer instructions +FXU3 - integer unit 3, handles integer multiply and divide +FPU1 - floating-point unit 1 +FPU2 - floating-point unit 2 + +Memory: Any two memory operations can issue, but memory subsystem + can sustain just one store per cycle. +Simple integer: 2 operations (such as add, rl*) +Integer multiply: 1 operation every 9th cycle worst case; exact timing depends + on 2nd operand most significant bit position (10 bits per + cycle). Multiply unit is not pipelined, only one multiply + operation in progress is allowed. +Integer divide: ?
+Floating-point: Any 2 plain arithmetic instructions (such as fmul, fadd, fmadd) + Latency = 4. +Floating-point divide: + ? +Floating-point square root: + ? + +Best possible times for the main loops: +shift: 1.5 cycles, limited by integer unit contention. + With 63 special loops, one for each shift count, we could + reduce the needed integer instructions to 2, which would + reduce the best possible time to 1 cycle. +add/sub: 1.5 cycles, limited by ld/st unit contention. +mul: 18 cycles (average) unless floating-point operations are used, + but that would only help for multiplies of perhaps 10 or more + limbs. +addmul/submul: Same situation as for mul. diff --git a/ghc/rts/gmp/mpn/powerpc64/add_n.asm b/ghc/rts/gmp/mpn/powerpc64/add_n.asm new file mode 100644 index 0000000..c332537 --- /dev/null +++ b/ghc/rts/gmp/mpn/powerpc64/add_n.asm @@ -0,0 +1,61 @@ +# PowerPC-64 mpn_add_n -- Add two limb vectors of the same length > 0 and +# store sum in a third limb vector. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA.
+ + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# s2_ptr r5 +# size r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_add_n) + mtctr r6 # copy size into CTR + addic r0,r0,0 # clear cy + ld r8,0(r4) # load least significant s1 limb + ld r0,0(r5) # load least significant s2 limb + addi r3,r3,-8 # offset res_ptr, it's updated before it's used + bdz .Lend # If done, skip loop +.Loop: ld r9,8(r4) # load s1 limb + ld r10,8(r5) # load s2 limb + adde r7,r0,r8 # add limbs with cy, set cy + std r7,8(r3) # store result limb + bdz .Lexit # decrement CTR and exit if done + ldu r8,16(r4) # load s1 limb and update s1_ptr + ldu r0,16(r5) # load s2 limb and update s2_ptr + adde r7,r10,r9 # add limbs with cy, set cy + stdu r7,16(r3) # store result limb and update res_ptr + bdnz .Loop # decrement CTR and loop back + +.Lend: adde r7,r0,r8 + std r7,8(r3) # store ultimate result limb + li r3,0 # load cy into ... + addze r3,r3 # ... return value register + blr +.Lexit: adde r7,r10,r9 + std r7,16(r3) + li r3,0 # load cy into ... + addze r3,r3 # ... return value register + blr +EPILOGUE(mpn_add_n) diff --git a/ghc/rts/gmp/mpn/powerpc64/addmul_1.asm b/ghc/rts/gmp/mpn/powerpc64/addmul_1.asm new file mode 100644 index 0000000..8177448 --- /dev/null +++ b/ghc/rts/gmp/mpn/powerpc64/addmul_1.asm @@ -0,0 +1,52 @@ +# PowerPC-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add +# the result to a second limb vector. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. 
+ +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# s2_limb r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_addmul_1) + mtctr 5 + li 9,0 # cy_limb = 0 + addic 0,0,0 + cal 3,-8(3) + cal 4,-8(4) +.Loop: + ldu 0,8(4) + ld 10,8(3) + mulld 7,0,6 + adde 7,7,9 + mulhdu 9,0,6 + addze 9,9 + addc 7,7,10 + stdu 7,8(3) + bdnz .Loop + + addze 3,9 + blr +EPILOGUE(mpn_addmul_1) diff --git a/ghc/rts/gmp/mpn/powerpc64/addsub_n.asm b/ghc/rts/gmp/mpn/powerpc64/addsub_n.asm new file mode 100644 index 0000000..4ed40d7 --- /dev/null +++ b/ghc/rts/gmp/mpn/powerpc64/addsub_n.asm @@ -0,0 +1,107 @@ +# PowerPC-64 mpn_addsub_n -- Simultaneous add and sub. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptrs r3 (sums), r4 (differences) +# s1_ptr r5 +# s2_ptr r6 +# size r7 + +include(`asm-syntax.m4') + +define(SAVE_BORROW_RESTORE_CARRY, + `sldi $1,$1,63 + adde $1,$1,$1') +define(SAVE_CARRY_RESTORE_BORROW, + `sldi $1,$1,63 + adde $1,$1,$1') + +# 19991117 + +# This is just crafted for testing some ideas, and verifying that we can make +# it run fast. It runs at 2.55 cycles/limb on the 630, which is very good. +# We should play a little with the schedule. No time has been spent on that. + +# To finish this, the loop warm up and cool down code needs to be written, +# and the result needs to be tested. Also, the proper calling sequence should +# be used. + +# r1p r2p s1p s2p n +# Use reg r0, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12 + +ASM_START() +PROLOGUE(mpn_addsub_n) + std r14,-64(1) + std r15,-56(1) + std r16,-48(1) + std r17,-40(1) + std r18,-32(1) + std r19,-24(1) + + srdi r7,r7,2 + mtctr r7 # copy size into CTR + addic r0,r0,0 # clear cy + addi r3,r3,-8 # offset res_ptr, it's updated before it's used + addi r4,r4,-8 # offset second res_ptr, it's updated before it's used + +.Loop: + adde r12,r8,r9 + std r12,8(r3) + adde r12,r10,r11 + std r12,16(r3) + + SAVE_CARRY_RESTORE_BORROW(r0) + + subfe r12,r8,r9 + std r12,8(r4) + ld r8,8(r5) # s1 L 1 + ld r9,8(r6) # s2 L 1 + subfe r12,r10,r11 + std r12,16(r4) + ld r10,16(r5) # s1 L 2 + ld r11,16(r6) # s2 L 2 +# pair ------------------------- + subfe r12,r14,r15 + std r12,24(r4) + subfe r12,r16,r17 + stdu r12,32(r4) + + SAVE_BORROW_RESTORE_CARRY(r0) + + adde r12,r14,r15 + std r12,24(r3) + ld r14,24(r5) # s1 L 3 + ld r15,24(r6) # s2 L 3 + adde r12,r16,r17 + stdu r12,32(r3) + ldu r16,32(r5) # s1 L 4 + ldu r17,32(r6) # s2 L 4 + bdnz .Loop + + ld r14,-64(1) + ld r15,-56(1) + ld r16,-48(1) + ld r17,-40(1) + ld r18,-32(1) + ld r19,-24(1) + blr +EPILOGUE(mpn_addsub_n) diff --git a/ghc/rts/gmp/mpn/powerpc64/aix.m4
b/ghc/rts/gmp/mpn/powerpc64/aix.m4 new file mode 100644 index 0000000..aee9f1f --- /dev/null +++ b/ghc/rts/gmp/mpn/powerpc64/aix.m4 @@ -0,0 +1,40 @@ +divert(-1) +dnl m4 macros for AIX 64-bit assembly. + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + +define(`ASM_START', + `.machine "ppc64" + .toc') + +define(`PROLOGUE', + ` + .globl $1 + .globl .$1 + .csect $1[DS],3 +$1: + .llong .$1, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.$1:') + +define(`EPILOGUE', `') + +divert diff --git a/ghc/rts/gmp/mpn/powerpc64/copyd.asm b/ghc/rts/gmp/mpn/powerpc64/copyd.asm new file mode 100644 index 0000000..d06e8c2 --- /dev/null +++ b/ghc/rts/gmp/mpn/powerpc64/copyd.asm @@ -0,0 +1,45 @@ +# PowerPC-64 mpn_copyd -- Copy a limb vector. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. 
+ +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# rptr r3 +# sptr r4 +# n r5 + +include(`../config.m4') + +# Unrolling this analogous to sparc64/copyi.s doesn't help for any +# operand sizes. + +ASM_START() +PROLOGUE(mpn_copyd) + cmpdi cr0,r5,0 + mtctr r5 + sldi r5,r5,3 + add r4,r4,r5 + add r3,r3,r5 + beq cr0,.Lend +.Loop: ldu r0,-8(r4) + stdu r0,-8(r3) + bdnz .Loop +.Lend: blr +EPILOGUE(mpn_copyd) diff --git a/ghc/rts/gmp/mpn/powerpc64/copyi.asm b/ghc/rts/gmp/mpn/powerpc64/copyi.asm new file mode 100644 index 0000000..a1bedc4 --- /dev/null +++ b/ghc/rts/gmp/mpn/powerpc64/copyi.asm @@ -0,0 +1,44 @@ +# PowerPC-64 mpn_copyi -- Copy a limb vector. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# rptr r3 +# sptr r4 +# n r5 + +include(`../config.m4') + +# Unrolling this analogous to sparc64/copyi.s doesn't help for any +# operand sizes. + +ASM_START() +PROLOGUE(mpn_copyi) + cmpdi cr0,r5,0 + mtctr r5 + addi r4,r4,-8 + addi r3,r3,-8 + beq cr0,.Lend +.Loop: ldu r0,8(r4) + stdu r0,8(r3) + bdnz .Loop +.Lend: blr +EPILOGUE(mpn_copyi) diff --git a/ghc/rts/gmp/mpn/powerpc64/gmp-mparam.h b/ghc/rts/gmp/mpn/powerpc64/gmp-mparam.h index 48eb85d..6fefb96 100644 --- a/ghc/rts/gmp/mpn/powerpc64/gmp-mparam.h +++ b/ghc/rts/gmp/mpn/powerpc64/gmp-mparam.h @@ -1,20 +1,20 @@ /* gmp-mparam.h -- Compiler/machine parameter header file. -Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1995, 1999, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -25,3 +25,38 @@ MA 02111-1307, USA. */ #define BITS_PER_INT 32 #define BITS_PER_SHORTINT 16 #define BITS_PER_CHAR 8 + +/* Generated by tuneup.c, 2000-07-16. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 10 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 57 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 16 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 89 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 28 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 216 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 14 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 6 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 163 +#endif diff --git a/ghc/rts/gmp/mpn/powerpc64/lshift.asm b/ghc/rts/gmp/mpn/powerpc64/lshift.asm new file mode 100644 index 0000000..cef3a81 --- /dev/null +++ b/ghc/rts/gmp/mpn/powerpc64/lshift.asm @@ -0,0 +1,159 @@ +# PowerPC-64 mpn_lshift -- Shift a number left. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# cnt r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_lshift) + cmpdi cr0,r5,20 # more than 20 limbs? + sldi r0,r5,3 + add r4,r4,r0 # make r4 point at end of s1 + add r7,r3,r0 # make r7 point at end of res + bgt .LBIG # branch if more than 20 limbs + + mtctr r5 # copy size into CTR + subfic r8,r6,64 + ldu r11,-8(r4) # load first s1 limb + srd r3,r11,r8 # compute function return value + bdz .Lend1 + +.Loop: ldu r10,-8(r4) + sld r9,r11,r6 + srd r12,r10,r8 + or r9,r9,r12 + stdu r9,-8(r7) + bdz .Lend2 + ldu r11,-8(r4) + sld r9,r10,r6 + srd r12,r11,r8 + or r9,r9,r12 + stdu r9,-8(r7) + bdnz .Loop + +.Lend1: sld r0,r11,r6 + std r0,-8(r7) + blr +.Lend2: sld r0,r10,r6 + std r0,-8(r7) + blr + +.LBIG: + std r24,-64(1) + std r25,-56(1) + std r26,-48(1) + std r27,-40(1) + std r28,-32(1) + std r29,-24(1) + std r30,-16(1) + std r31,-8(1) + ldu r9,-8(r4) + subfic r8,r6,64 + srd r3,r9,r8 # compute function return value + sld r0,r9,r6 + addi r5,r5,-1 + + andi. r10,r5,3 # count for spill loop + beq .Le + mtctr r10 + ldu r28,-8(r4) + bdz .Lxe0 + +.Loop0: sld r12,r28,r6 + srd r24,r28,r8 + ldu r28,-8(r4) + or r24,r0,r24 + stdu r24,-8(r7) + mr r0,r12 + bdnz .Loop0 # taken at most once!
+ +.Lxe0: sld r12,r28,r6 + srd r24,r28,r8 + or r24,r0,r24 + stdu r24,-8(r7) + mr r0,r12 + +.Le: srdi r5,r5,2 # count for unrolled loop + addi r5,r5,-1 + mtctr r5 + ld r28,-8(r4) + ld r29,-16(r4) + ld r30,-24(r4) + ldu r31,-32(r4) + +.LoopU: sld r9,r28,r6 + srd r24,r28,r8 + ld r28,-8(r4) + sld r10,r29,r6 + srd r25,r29,r8 + ld r29,-16(r4) + sld r11,r30,r6 + srd r26,r30,r8 + ld r30,-24(r4) + sld r12,r31,r6 + srd r27,r31,r8 + ldu r31,-32(r4) + or r24,r0,r24 + std r24,-8(r7) + or r25,r9,r25 + std r25,-16(r7) + or r26,r10,r26 + std r26,-24(r7) + or r27,r11,r27 + stdu r27,-32(r7) + mr r0,r12 + bdnz .LoopU + + sld r9,r28,r6 + srd r24,r28,r8 + sld r10,r29,r6 + srd r25,r29,r8 + sld r11,r30,r6 + srd r26,r30,r8 + sld r12,r31,r6 + srd r27,r31,r8 + or r24,r0,r24 + std r24,-8(r7) + or r25,r9,r25 + std r25,-16(r7) + or r26,r10,r26 + std r26,-24(r7) + or r27,r11,r27 + stdu r27,-32(r7) + mr r0,r12 + + std r0,-8(r7) + ld r24,-64(1) + ld r25,-56(1) + ld r26,-48(1) + ld r27,-40(1) + ld r28,-32(1) + ld r29,-24(1) + ld r30,-16(1) + ld r31,-8(1) + blr +EPILOGUE(mpn_lshift) diff --git a/ghc/rts/gmp/mpn/powerpc64/mul_1.asm b/ghc/rts/gmp/mpn/powerpc64/mul_1.asm new file mode 100644 index 0000000..4759728 --- /dev/null +++ b/ghc/rts/gmp/mpn/powerpc64/mul_1.asm @@ -0,0 +1,49 @@ +# PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and store +# the result in a second limb vector. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# s2_limb r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_mul_1) + mtctr 5 + li 9,0 # cy_limb = 0 + addic 0,0,0 + cal 3,-8(3) + cal 4,-8(4) +.Loop: + ldu 0,8(4) + mulld 7,0,6 + adde 7,7,9 + mulhdu 9,0,6 + stdu 7,8(3) + bdnz .Loop + + addze 3,9 + blr +EPILOGUE(mpn_mul_1) diff --git a/ghc/rts/gmp/mpn/powerpc64/rshift.asm b/ghc/rts/gmp/mpn/powerpc64/rshift.asm new file mode 100644 index 0000000..88272c7 --- /dev/null +++ b/ghc/rts/gmp/mpn/powerpc64/rshift.asm @@ -0,0 +1,60 @@ +# PowerPC-64 mpn_rshift -- Shift a number right. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. 
+ + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# cnt r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_rshift) + mtctr r5 # copy size into CTR + addi r7,r3,-8 # move adjusted res_ptr to free return reg + subfic r8,r6,64 + ld r11,0(r4) # load first s1 limb + sld r3,r11,r8 # compute function return value + bdz .Lend1 + +.Loop: ldu r10,8(r4) + srd r9,r11,r6 + sld r12,r10,r8 + or r9,r9,r12 + stdu r9,8(r7) + bdz .Lend2 + ldu r11,8(r4) + srd r9,r10,r6 + sld r12,r11,r8 + or r9,r9,r12 + stdu r9,8(r7) + bdnz .Loop + +.Lend1: srd r0,r11,r6 + std r0,8(r7) + blr + +.Lend2: srd r0,r10,r6 + std r0,8(r7) + blr +EPILOGUE(mpn_rshift) diff --git a/ghc/rts/gmp/mpn/powerpc64/sub_n.asm b/ghc/rts/gmp/mpn/powerpc64/sub_n.asm new file mode 100644 index 0000000..4de3de6 --- /dev/null +++ b/ghc/rts/gmp/mpn/powerpc64/sub_n.asm @@ -0,0 +1,61 @@ +# PowerPC-64 mpn_sub_n -- Subtract two limb vectors of the same length > 0 +# and store difference in a third limb vector. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA.
+ + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# s2_ptr r5 +# size r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_sub_n) + mtctr r6 # copy size into CTR + addic r0,r6,-1 # set cy + ld r8,0(r4) # load least significant s1 limb + ld r0,0(r5) # load least significant s2 limb + addi r3,r3,-8 # offset res_ptr, it's updated before it's used + bdz .Lend # If done, skip loop +.Loop: ld r9,8(r4) # load s1 limb + ld r10,8(r5) # load s2 limb + subfe r7,r0,r8 # subtract limbs with cy, set cy + std r7,8(r3) # store result limb + bdz .Lexit # decrement CTR and exit if done + ldu r8,16(r4) # load s1 limb and update s1_ptr + ldu r0,16(r5) # load s2 limb and update s2_ptr + subfe r7,r10,r9 # subtract limbs with cy, set cy + stdu r7,16(r3) # store result limb and update res_ptr + bdnz .Loop # decrement CTR and loop back + +.Lend: subfe r7,r0,r8 + std r7,8(r3) # store ultimate result limb + subfe r3,r0,r0 # load !cy into ... + subfic r3,r3,0 # ... return value register + blr +.Lexit: subfe r7,r10,r9 + std r7,16(r3) + subfe r3,r0,r0 # load !cy into ... + subfic r3,r3,0 # ... return value register + blr +EPILOGUE(mpn_sub_n) diff --git a/ghc/rts/gmp/mpn/powerpc64/submul_1.asm b/ghc/rts/gmp/mpn/powerpc64/submul_1.asm new file mode 100644 index 0000000..17f6369 --- /dev/null +++ b/ghc/rts/gmp/mpn/powerpc64/submul_1.asm @@ -0,0 +1,54 @@ +# PowerPC-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract +# the result from a second limb vector. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. 
+ +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# s2_limb r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_submul_1) + mtctr 5 + li 9,0 # cy_limb = 0 + addic 0,0,0 + cal 3,-8(3) + cal 4,-8(4) +.Loop: + ldu 0,8(4) + ld 10,8(3) + mulld 7,0,6 + adde 7,7,9 + mulhdu 9,0,6 + addze 9,9 + subfc 7,7,10 + stdu 7,8(3) + subfe 11,11,11 # invert ... + addic 11,11,1 # ... carry + bdnz .Loop + + addze 3,9 + blr +EPILOGUE(mpn_submul_1) diff --git a/ghc/rts/gmp/mpn/pyr/add_n.s b/ghc/rts/gmp/mpn/pyr/add_n.s index 416c660..e1fc535 100644 --- a/ghc/rts/gmp/mpn/pyr/add_n.s +++ b/ghc/rts/gmp/mpn/pyr/add_n.s @@ -1,29 +1,29 @@ -# Pyramid __mpn_add_n -- Add two limb vectors of the same length > 0 and store +# Pyramid __gmpn_add_n -- Add two limb vectors of the same length > 0 and store # sum in a third limb vector. -# Copyright (C) 1995 Free Software Foundation, Inc. +# Copyright (C) 1995, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. # The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Library General Public License as published by -# the Free Software Foundation; either version 2 of the License, or (at your +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. 
# The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. -# You should have received a copy of the GNU Library General Public License +# You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. .text .align 2 -.globl ___mpn_add_n -___mpn_add_n: +.globl ___gmpn_add_n +___gmpn_add_n: movw $-1,tr0 # representation for carry clear movw pr3,tr2 diff --git a/ghc/rts/gmp/mpn/pyr/addmul_1.s b/ghc/rts/gmp/mpn/pyr/addmul_1.s index a1495ca..65c3f8f 100644 --- a/ghc/rts/gmp/mpn/pyr/addmul_1.s +++ b/ghc/rts/gmp/mpn/pyr/addmul_1.s @@ -1,29 +1,29 @@ -# Pyramid __mpn_addmul_1 -- Multiply a limb vector with a limb and add +# Pyramid __gmpn_addmul_1 -- Multiply a limb vector with a limb and add # the result to a second limb vector. -# Copyright (C) 1995 Free Software Foundation, Inc. +# Copyright (C) 1995, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. # The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Library General Public License as published by -# the Free Software Foundation; either version 2 of the License, or (at your +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Library General Public +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. -# You should have received a copy of the GNU Library General Public License +# You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. .text .align 2 -.globl ___mpn_addmul_1 -___mpn_addmul_1: +.globl ___gmpn_addmul_1 +___gmpn_addmul_1: mova (pr0)[pr2*4],pr0 mova (pr1)[pr2*4],pr1 mnegw pr2,pr2 diff --git a/ghc/rts/gmp/mpn/pyr/mul_1.s b/ghc/rts/gmp/mpn/pyr/mul_1.s index e6b9791..1272297 100644 --- a/ghc/rts/gmp/mpn/pyr/mul_1.s +++ b/ghc/rts/gmp/mpn/pyr/mul_1.s @@ -1,29 +1,29 @@ -# Pyramid __mpn_mul_1 -- Multiply a limb vector with a limb and store +# Pyramid __gmpn_mul_1 -- Multiply a limb vector with a limb and store # the result in a second limb vector. -# Copyright (C) 1995 Free Software Foundation, Inc. +# Copyright (C) 1995, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. # The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Library General Public License as published by -# the Free Software Foundation; either version 2 of the License, or (at your +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. 
-# You should have received a copy of the GNU Library General Public License +# You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. .text .align 2 -.globl ___mpn_mul_1 -___mpn_mul_1: +.globl ___gmpn_mul_1 +___gmpn_mul_1: mova (pr0)[pr2*4],pr0 mova (pr1)[pr2*4],pr1 mnegw pr2,pr2 diff --git a/ghc/rts/gmp/mpn/pyr/sub_n.s b/ghc/rts/gmp/mpn/pyr/sub_n.s index 5664859..1fd2eb0 100644 --- a/ghc/rts/gmp/mpn/pyr/sub_n.s +++ b/ghc/rts/gmp/mpn/pyr/sub_n.s @@ -1,29 +1,29 @@ -# Pyramid __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +# Pyramid __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and # store difference in a third limb vector. -# Copyright (C) 1995 Free Software Foundation, Inc. +# Copyright (C) 1995, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. # The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Library General Public License as published by -# the Free Software Foundation; either version 2 of the License, or (at your +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. -# You should have received a copy of the GNU Library General Public License +# You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. .text .align 2 -.globl ___mpn_sub_n -___mpn_sub_n: +.globl ___gmpn_sub_n +___gmpn_sub_n: movw $-1,tr0 # representation for carry clear movw pr3,tr2 diff --git a/ghc/rts/gmp/mpn/sh/add_n.s b/ghc/rts/gmp/mpn/sh/add_n.s index 93dad51..df388b3 100644 --- a/ghc/rts/gmp/mpn/sh/add_n.s +++ b/ghc/rts/gmp/mpn/sh/add_n.s @@ -1,21 +1,21 @@ -! SH __mpn_add_n -- Add two limb vectors of the same length > 0 and store +! SH __gmpn_add_n -- Add two limb vectors of the same length > 0 and store ! sum in a third limb vector. -! Copyright (C) 1995 Free Software Foundation, Inc. +! Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc. ! This file is part of the GNU MP Library. ! The GNU MP Library is free software; you can redistribute it and/or modify -! it under the terms of the GNU Library General Public License as published by -! the Free Software Foundation; either version 2 of the License, or (at your +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your ! option) any later version. ! The GNU MP Library is distributed in the hope that it will be useful, but ! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ! License for more details. -! You should have received a copy of the GNU Library General Public License +! You should have received a copy of the GNU Lesser General Public License ! along with the GNU MP Library; see the file COPYING.LIB. If not, write to ! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ! MA 02111-1307, USA. @@ -29,8 +29,8 @@ .text .align 2 - .global ___mpn_add_n -___mpn_add_n: + .global ___gmpn_add_n +___gmpn_add_n: mov #0,r3 ! 
clear cy save reg Loop: mov.l @r5+,r1 @@ -44,4 +44,4 @@ Loop: mov.l @r5+,r1 add #4,r4 rts - movt r0 ! return carry-out from most sign. limb + mov r3,r0 ! return carry-out from most sign. limb diff --git a/ghc/rts/gmp/mpn/sh/sh2/addmul_1.s b/ghc/rts/gmp/mpn/sh/sh2/addmul_1.s index 19d81da..f34a7f0 100644 --- a/ghc/rts/gmp/mpn/sh/sh2/addmul_1.s +++ b/ghc/rts/gmp/mpn/sh/sh2/addmul_1.s @@ -1,21 +1,21 @@ -! SH2 __mpn_addmul_1 -- Multiply a limb vector with a limb and add +! SH2 __gmpn_addmul_1 -- Multiply a limb vector with a limb and add ! the result to a second limb vector. -! Copyright (C) 1995 Free Software Foundation, Inc. +! Copyright (C) 1995, 2000 Free Software Foundation, Inc. ! This file is part of the GNU MP Library. ! The GNU MP Library is free software; you can redistribute it and/or modify -! it under the terms of the GNU Library General Public License as published by -! the Free Software Foundation; either version 2 of the License, or (at your +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your ! option) any later version. ! The GNU MP Library is distributed in the hope that it will be useful, but ! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ! License for more details. -! You should have received a copy of the GNU Library General Public License +! You should have received a copy of the GNU Lesser General Public License ! along with the GNU MP Library; see the file COPYING.LIB. If not, write to ! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ! MA 02111-1307, USA. @@ -29,8 +29,8 @@ .text .align 1 - .global ___mpn_addmul_1 -___mpn_addmul_1: + .global ___gmpn_addmul_1 +___gmpn_addmul_1: mov #0,r2 ! cy_limb = 0 mov #0,r0 ! 
Keep r0 = 0 for entire loop clrt diff --git a/ghc/rts/gmp/mpn/sh/sh2/mul_1.s b/ghc/rts/gmp/mpn/sh/sh2/mul_1.s index 7ca2756..2a117a3 100644 --- a/ghc/rts/gmp/mpn/sh/sh2/mul_1.s +++ b/ghc/rts/gmp/mpn/sh/sh2/mul_1.s @@ -1,21 +1,21 @@ -! SH2 __mpn_mul_1 -- Multiply a limb vector with a limb and store +! SH2 __gmpn_mul_1 -- Multiply a limb vector with a limb and store ! the result in a second limb vector. -! Copyright (C) 1995 Free Software Foundation, Inc. +! Copyright (C) 1995, 2000 Free Software Foundation, Inc. ! This file is part of the GNU MP Library. ! The GNU MP Library is free software; you can redistribute it and/or modify -! it under the terms of the GNU Library General Public License as published by -! the Free Software Foundation; either version 2 of the License, or (at your +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your ! option) any later version. ! The GNU MP Library is distributed in the hope that it will be useful, but ! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ! License for more details. -! You should have received a copy of the GNU Library General Public License +! You should have received a copy of the GNU Lesser General Public License ! along with the GNU MP Library; see the file COPYING.LIB. If not, write to ! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ! MA 02111-1307, USA. @@ -29,8 +29,8 @@ .text .align 1 - .global ___mpn_mul_1 -___mpn_mul_1: + .global ___gmpn_mul_1 +___gmpn_mul_1: mov #0,r2 ! cy_limb = 0 mov #0,r0 ! 
Keep r0 = 0 for entire loop clrt diff --git a/ghc/rts/gmp/mpn/sh/sh2/submul_1.s b/ghc/rts/gmp/mpn/sh/sh2/submul_1.s index 9ef380c..eb9a27d 100644 --- a/ghc/rts/gmp/mpn/sh/sh2/submul_1.s +++ b/ghc/rts/gmp/mpn/sh/sh2/submul_1.s @@ -1,21 +1,21 @@ -! SH2 __mpn_submul_1 -- Multiply a limb vector with a limb and subtract +! SH2 __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract ! the result from a second limb vector. -! Copyright (C) 1995 Free Software Foundation, Inc. +! Copyright (C) 1995, 2000 Free Software Foundation, Inc. ! This file is part of the GNU MP Library. ! The GNU MP Library is free software; you can redistribute it and/or modify -! it under the terms of the GNU Library General Public License as published by -! the Free Software Foundation; either version 2 of the License, or (at your +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your ! option) any later version. ! The GNU MP Library is distributed in the hope that it will be useful, but ! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ! License for more details. -! You should have received a copy of the GNU Library General Public License +! You should have received a copy of the GNU Lesser General Public License ! along with the GNU MP Library; see the file COPYING.LIB. If not, write to ! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ! MA 02111-1307, USA. @@ -29,8 +29,8 @@ .text .align 1 - .global ___mpn_submul_1 -___mpn_submul_1: + .global ___gmpn_submul_1 +___gmpn_submul_1: mov #0,r2 ! cy_limb = 0 mov #0,r0 ! 
Keep r0 = 0 for entire loop clrt diff --git a/ghc/rts/gmp/mpn/sh/sub_n.s b/ghc/rts/gmp/mpn/sh/sub_n.s index 6b201f6..5f818c9 100644 --- a/ghc/rts/gmp/mpn/sh/sub_n.s +++ b/ghc/rts/gmp/mpn/sh/sub_n.s @@ -1,21 +1,21 @@ -! SH __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and store +! SH __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and store ! difference in a third limb vector. -! Copyright (C) 1995 Free Software Foundation, Inc. +! Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc. ! This file is part of the GNU MP Library. ! The GNU MP Library is free software; you can redistribute it and/or modify -! it under the terms of the GNU Library General Public License as published by -! the Free Software Foundation; either version 2 of the License, or (at your +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your ! option) any later version. ! The GNU MP Library is distributed in the hope that it will be useful, but ! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ! License for more details. -! You should have received a copy of the GNU Library General Public License +! You should have received a copy of the GNU Lesser General Public License ! along with the GNU MP Library; see the file COPYING.LIB. If not, write to ! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ! MA 02111-1307, USA. @@ -29,8 +29,8 @@ .text .align 2 - .global ___mpn_sub_n -___mpn_sub_n: + .global ___gmpn_sub_n +___gmpn_sub_n: mov #0,r3 ! clear cy save reg Loop: mov.l @r5+,r1 @@ -44,4 +44,4 @@ Loop: mov.l @r5+,r1 add #4,r4 rts - movt r0 ! return carry-out from most sign. limb + mov r3,r0 ! return carry-out from most sign. 
limb diff --git a/ghc/rts/gmp/mpn/sparc32/add_n.asm b/ghc/rts/gmp/mpn/sparc32/add_n.asm new file mode 100644 index 0000000..5f1d00c --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc32/add_n.asm @@ -0,0 +1,236 @@ +dnl SPARC mpn_add_n -- Add two limb vectors of the same length > 0 and store +dnl sum in a third limb vector. + +dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +C INPUT PARAMETERS +define(res_ptr,%o0) +define(s1_ptr,%o1) +define(s2_ptr,%o2) +define(n,%o3) + +ASM_START() +PROLOGUE(mpn_add_n) + xor s2_ptr,res_ptr,%g1 + andcc %g1,4,%g0 + bne L(1) C branch if alignment differs + nop +C ** V1a ** +L(0): andcc res_ptr,4,%g0 C res_ptr unaligned? Side effect: cy=0 + be L(v1) C if no, branch + nop +C Add least significant limb separately to align res_ptr and s2_ptr + ld [s1_ptr],%g4 + add s1_ptr,4,s1_ptr + ld [s2_ptr],%g2 + add s2_ptr,4,s2_ptr + add n,-1,n + addcc %g4,%g2,%o4 + st %o4,[res_ptr] + add res_ptr,4,res_ptr +L(v1): addx %g0,%g0,%o4 C save cy in register + cmp n,2 C if n < 2 ... + bl L(end2) C ... 
branch to tail code + subcc %g0,%o4,%g0 C restore cy + + ld [s1_ptr+0],%g4 + addcc n,-10,n + ld [s1_ptr+4],%g1 + ldd [s2_ptr+0],%g2 + blt L(fin1) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 8 limbs until less than 8 limbs remain +L(loop1): + addxcc %g4,%g2,%o4 + ld [s1_ptr+8],%g4 + addxcc %g1,%g3,%o5 + ld [s1_ptr+12],%g1 + ldd [s2_ptr+8],%g2 + std %o4,[res_ptr+0] + addxcc %g4,%g2,%o4 + ld [s1_ptr+16],%g4 + addxcc %g1,%g3,%o5 + ld [s1_ptr+20],%g1 + ldd [s2_ptr+16],%g2 + std %o4,[res_ptr+8] + addxcc %g4,%g2,%o4 + ld [s1_ptr+24],%g4 + addxcc %g1,%g3,%o5 + ld [s1_ptr+28],%g1 + ldd [s2_ptr+24],%g2 + std %o4,[res_ptr+16] + addxcc %g4,%g2,%o4 + ld [s1_ptr+32],%g4 + addxcc %g1,%g3,%o5 + ld [s1_ptr+36],%g1 + ldd [s2_ptr+32],%g2 + std %o4,[res_ptr+24] + addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + add s1_ptr,32,s1_ptr + add s2_ptr,32,s2_ptr + add res_ptr,32,res_ptr + bge L(loop1) + subcc %g0,%o4,%g0 C restore cy + +L(fin1): + addcc n,8-2,n + blt L(end1) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 2 limbs until less than 2 limbs remain +L(loope1): + addxcc %g4,%g2,%o4 + ld [s1_ptr+8],%g4 + addxcc %g1,%g3,%o5 + ld [s1_ptr+12],%g1 + ldd [s2_ptr+8],%g2 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + addcc n,-2,n + add s1_ptr,8,s1_ptr + add s2_ptr,8,s2_ptr + add res_ptr,8,res_ptr + bge L(loope1) + subcc %g0,%o4,%g0 C restore cy +L(end1): + addxcc %g4,%g2,%o4 + addxcc %g1,%g3,%o5 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + + andcc n,1,%g0 + be L(ret1) + subcc %g0,%o4,%g0 C restore cy +C Add last limb + ld [s1_ptr+8],%g4 + ld [s2_ptr+8],%g2 + addxcc %g4,%g2,%o4 + st %o4,[res_ptr+8] + +L(ret1): + retl + addx %g0,%g0,%o0 C return carry-out from most sign. 
limb + +L(1): xor s1_ptr,res_ptr,%g1 + andcc %g1,4,%g0 + bne L(2) + nop +C ** V1b ** + mov s2_ptr,%g1 + mov s1_ptr,s2_ptr + b L(0) + mov %g1,s1_ptr + +C ** V2 ** +C If we come here, the alignment of s1_ptr and res_ptr as well as the +C alignment of s2_ptr and res_ptr differ. Since there are only two ways +C things can be aligned (that we care about) we now know that the alignment +C of s1_ptr and s2_ptr are the same. + +L(2): cmp n,1 + be L(jone) + nop + andcc s1_ptr,4,%g0 C s1_ptr unaligned? Side effect: cy=0 + be L(v2) C if no, branch + nop +C Add least significant limb separately to align s1_ptr and s2_ptr + ld [s1_ptr],%g4 + add s1_ptr,4,s1_ptr + ld [s2_ptr],%g2 + add s2_ptr,4,s2_ptr + add n,-1,n + addcc %g4,%g2,%o4 + st %o4,[res_ptr] + add res_ptr,4,res_ptr + +L(v2): addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + blt L(fin2) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 8 limbs until less than 8 limbs remain +L(loop2): + ldd [s1_ptr+0],%g2 + ldd [s2_ptr+0],%o4 + addxcc %g2,%o4,%g2 + st %g2,[res_ptr+0] + addxcc %g3,%o5,%g3 + st %g3,[res_ptr+4] + ldd [s1_ptr+8],%g2 + ldd [s2_ptr+8],%o4 + addxcc %g2,%o4,%g2 + st %g2,[res_ptr+8] + addxcc %g3,%o5,%g3 + st %g3,[res_ptr+12] + ldd [s1_ptr+16],%g2 + ldd [s2_ptr+16],%o4 + addxcc %g2,%o4,%g2 + st %g2,[res_ptr+16] + addxcc %g3,%o5,%g3 + st %g3,[res_ptr+20] + ldd [s1_ptr+24],%g2 + ldd [s2_ptr+24],%o4 + addxcc %g2,%o4,%g2 + st %g2,[res_ptr+24] + addxcc %g3,%o5,%g3 + st %g3,[res_ptr+28] + addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + add s1_ptr,32,s1_ptr + add s2_ptr,32,s2_ptr + add res_ptr,32,res_ptr + bge L(loop2) + subcc %g0,%o4,%g0 C restore cy + +L(fin2): + addcc n,8-2,n + blt L(end2) + subcc %g0,%o4,%g0 C restore cy +L(loope2): + ldd [s1_ptr+0],%g2 + ldd [s2_ptr+0],%o4 + addxcc %g2,%o4,%g2 + st %g2,[res_ptr+0] + addxcc %g3,%o5,%g3 + st %g3,[res_ptr+4] + addx %g0,%g0,%o4 C save cy in register + addcc n,-2,n + add s1_ptr,8,s1_ptr + add s2_ptr,8,s2_ptr + add res_ptr,8,res_ptr + bge L(loope2) + subcc 
%g0,%o4,%g0 C restore cy +L(end2): + andcc n,1,%g0 + be L(ret2) + subcc %g0,%o4,%g0 C restore cy +C Add last limb +L(jone): + ld [s1_ptr],%g4 + ld [s2_ptr],%g2 + addxcc %g4,%g2,%o4 + st %o4,[res_ptr] + +L(ret2): + retl + addx %g0,%g0,%o0 C return carry-out from most sign. limb +EPILOGUE(mpn_add_n) diff --git a/ghc/rts/gmp/mpn/sparc32/addmul_1.asm b/ghc/rts/gmp/mpn/sparc32/addmul_1.asm new file mode 100644 index 0000000..80c94e4 --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc32/addmul_1.asm @@ -0,0 +1,146 @@ +dnl SPARC mpn_addmul_1 -- Multiply a limb vector with a limb and add the +dnl result to a second limb vector. + +dnl Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_addmul_1) + C Make S1_PTR and RES_PTR point at the end of their blocks + C and put (- 4 x SIZE) in index/loop counter. 
+ sll %o2,2,%o2 + add %o0,%o2,%o4 C RES_PTR in o4 since o0 is retval + add %o1,%o2,%o1 + sub %g0,%o2,%o2 + + cmp %o3,0xfff + bgu L(large) + nop + + ld [%o1+%o2],%o5 + mov 0,%o0 + b L(0) + add %o4,-4,%o4 +L(loop0): + addcc %o5,%g1,%g1 + ld [%o1+%o2],%o5 + addx %o0,%g0,%o0 + st %g1,[%o4+%o2] +L(0): wr %g0,%o3,%y + sra %o5,31,%g2 + and %o3,%g2,%g2 + andcc %g1,0,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,0,%g1 + sra %g1,20,%g4 + sll %g1,12,%g1 + rd %y,%g3 + srl %g3,20,%g3 + or %g1,%g3,%g1 + + addcc %g1,%o0,%g1 + addx %g2,%g4,%o0 C add sign-compensation and cy to hi limb + addcc %o2,4,%o2 C loop counter + bne L(loop0) + ld [%o4+%o2],%o5 + + addcc %o5,%g1,%g1 + addx %o0,%g0,%o0 + retl + st %g1,[%o4+%o2] + +L(large): + ld [%o1+%o2],%o5 + mov 0,%o0 + sra %o3,31,%g4 C g4 = mask of ones iff S2_LIMB < 0 + b L(1) + add %o4,-4,%o4 +L(loop): + addcc %o5,%g3,%g3 + ld [%o1+%o2],%o5 + addx %o0,%g0,%o0 + st %g3,[%o4+%o2] +L(1): wr %g0,%o5,%y + and %o5,%g4,%g2 + andcc %g0,%g0,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%g0,%g1 + rd %y,%g3 + addcc %g3,%o0,%g3 + addx %g2,%g1,%o0 + addcc %o2,4,%o2 + bne L(loop) + ld [%o4+%o2],%o5 + 
+ addcc %o5,%g3,%g3 + addx %o0,%g0,%o0 + retl + st %g3,[%o4+%o2] +EPILOGUE(mpn_addmul_1) diff --git a/ghc/rts/gmp/mpn/sparc32/lshift.asm b/ghc/rts/gmp/mpn/sparc32/lshift.asm new file mode 100644 index 0000000..529733a --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc32/lshift.asm @@ -0,0 +1,97 @@ +dnl SPARC mpn_lshift -- Shift a number left. +dnl + +dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr %o0 +C src_ptr %o1 +C size %o2 +C cnt %o3 + +ASM_START() +PROLOGUE(mpn_lshift) + sll %o2,2,%g1 + add %o1,%g1,%o1 C make %o1 point at end of src + ld [%o1-4],%g2 C load first limb + sub %g0,%o3,%o5 C negate shift count + add %o0,%g1,%o0 C make %o0 point at end of res + add %o2,-1,%o2 + andcc %o2,4-1,%g4 C number of limbs in first loop + srl %g2,%o5,%g1 C compute function result + be L(0) C if multiple of 4 limbs, skip first loop + st %g1,[%sp+80] + + sub %o2,%g4,%o2 C adjust count for main loop + +L(loop0): + ld [%o1-8],%g3 + add %o0,-4,%o0 + add %o1,-4,%o1 + addcc %g4,-1,%g4 + sll %g2,%o3,%o4 + srl %g3,%o5,%g1 + mov %g3,%g2 + or %o4,%g1,%o4 + bne L(loop0) + st %o4,[%o0+0] + +L(0): tst %o2 + be L(end) + nop + +L(loop): + ld [%o1-8],%g3 + add %o0,-16,%o0 + addcc %o2,-4,%o2 + sll %g2,%o3,%o4 + srl %g3,%o5,%g1 + + ld [%o1-12],%g2 + sll %g3,%o3,%g4 + or %o4,%g1,%o4 + st %o4,[%o0+12] + srl %g2,%o5,%g1 + + ld [%o1-16],%g3 + sll %g2,%o3,%o4 + or %g4,%g1,%g4 + st %g4,[%o0+8] + srl %g3,%o5,%g1 + + ld [%o1-20],%g2 + sll %g3,%o3,%g4 + or %o4,%g1,%o4 + st %o4,[%o0+4] + srl %g2,%o5,%g1 + + add %o1,-16,%o1 + or %g4,%g1,%g4 + bne L(loop) + st %g4,[%o0+0] + +L(end): sll %g2,%o3,%g2 + st %g2,[%o0-4] + retl + ld [%sp+80],%o0 +EPILOGUE(mpn_lshift) diff --git a/ghc/rts/gmp/mpn/sparc32/mul_1.asm b/ghc/rts/gmp/mpn/sparc32/mul_1.asm new file mode 100644 index 0000000..e5fedea --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc32/mul_1.asm @@ -0,0 +1,137 @@ +dnl SPARC mpn_mul_1 -- Multiply a limb vector with a limb and store +dnl the result in a second limb vector. + +dnl Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+ +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_mul_1) + C Make S1_PTR and RES_PTR point at the end of their blocks + C and put (- 4 x SIZE) in index/loop counter. 
+ sll %o2,2,%o2 + add %o0,%o2,%o4 C RES_PTR in o4 since o0 is retval + add %o1,%o2,%o1 + sub %g0,%o2,%o2 + + cmp %o3,0xfff + bgu L(large) + nop + + ld [%o1+%o2],%o5 + mov 0,%o0 + b L(0) + add %o4,-4,%o4 +L(loop0): + st %g1,[%o4+%o2] +L(0): wr %g0,%o3,%y + sra %o5,31,%g2 + and %o3,%g2,%g2 + andcc %g1,0,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,0,%g1 + sra %g1,20,%g4 + sll %g1,12,%g1 + rd %y,%g3 + srl %g3,20,%g3 + or %g1,%g3,%g1 + + addcc %g1,%o0,%g1 + addx %g2,%g4,%o0 C add sign-compensation and cy to hi limb + addcc %o2,4,%o2 C loop counter + bne,a L(loop0) + ld [%o1+%o2],%o5 + + retl + st %g1,[%o4+%o2] + + +L(large): + ld [%o1+%o2],%o5 + mov 0,%o0 + sra %o3,31,%g4 C g4 = mask of ones iff S2_LIMB < 0 + b L(1) + add %o4,-4,%o4 +L(loop): + st %g3,[%o4+%o2] +L(1): wr %g0,%o5,%y + and %o5,%g4,%g2 C g2 = S1_LIMB iff S2_LIMB < 0, else 0 + andcc %g0,%g0,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%g0,%g1 + rd %y,%g3 + addcc %g3,%o0,%g3 + addx %g2,%g1,%o0 C add sign-compensation and cy to hi limb + addcc %o2,4,%o2 C loop counter + bne,a L(loop) + ld [%o1+%o2],%o5 + + retl + st %g3,[%o4+%o2] +EPILOGUE(mpn_mul_1) diff 
--git a/ghc/rts/gmp/mpn/sparc32/rshift.asm b/ghc/rts/gmp/mpn/sparc32/rshift.asm new file mode 100644 index 0000000..9187dba --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc32/rshift.asm @@ -0,0 +1,93 @@ +dnl SPARC mpn_rshift -- Shift a number right. + +dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr %o0 +C src_ptr %o1 +C size %o2 +C cnt %o3 + +ASM_START() +PROLOGUE(mpn_rshift) + ld [%o1],%g2 C load first limb + sub %g0,%o3,%o5 C negate shift count + add %o2,-1,%o2 + andcc %o2,4-1,%g4 C number of limbs in first loop + sll %g2,%o5,%g1 C compute function result + be L(0) C if multiple of 4 limbs, skip first loop + st %g1,[%sp+80] + + sub %o2,%g4,%o2 C adjust count for main loop + +L(loop0): + ld [%o1+4],%g3 + add %o0,4,%o0 + add %o1,4,%o1 + addcc %g4,-1,%g4 + srl %g2,%o3,%o4 + sll %g3,%o5,%g1 + mov %g3,%g2 + or %o4,%g1,%o4 + bne L(loop0) + st %o4,[%o0-4] + +L(0): tst %o2 + be L(end) + nop + +L(loop): + ld [%o1+4],%g3 + add %o0,16,%o0 + addcc %o2,-4,%o2 + srl %g2,%o3,%o4 + sll %g3,%o5,%g1 + + ld [%o1+8],%g2 + srl %g3,%o3,%g4 + or %o4,%g1,%o4 + st %o4,[%o0-16] + sll %g2,%o5,%g1 + + ld [%o1+12],%g3 + srl %g2,%o3,%o4 + or %g4,%g1,%g4 + st %g4,[%o0-12] + sll %g3,%o5,%g1 + + ld [%o1+16],%g2 + srl %g3,%o3,%g4 + or %o4,%g1,%o4 + st %o4,[%o0-8] + sll %g2,%o5,%g1 + + add %o1,16,%o1 + or %g4,%g1,%g4 + bne L(loop) + st %g4,[%o0-4] + +L(end): srl %g2,%o3,%g2 + st %g2,[%o0-0] + retl + ld [%sp+80],%o0 +EPILOGUE(mpn_rshift) diff --git a/ghc/rts/gmp/mpn/sparc32/sub_n.asm b/ghc/rts/gmp/mpn/sparc32/sub_n.asm new file mode 100644 index 0000000..071909a --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc32/sub_n.asm @@ -0,0 +1,326 @@ +dnl SPARC mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +dnl store difference in a third limb vector. + +dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. 
+ +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +C INPUT PARAMETERS +define(res_ptr,%o0) +define(s1_ptr,%o1) +define(s2_ptr,%o2) +define(n,%o3) + +ASM_START() +PROLOGUE(mpn_sub_n) + xor s2_ptr,res_ptr,%g1 + andcc %g1,4,%g0 + bne L(1) C branch if alignment differs + nop +C ** V1a ** + andcc res_ptr,4,%g0 C res_ptr unaligned? Side effect: cy=0 + be L(v1) C if no, branch + nop +C Subtract least significant limb separately to align res_ptr and s2_ptr + ld [s1_ptr],%g4 + add s1_ptr,4,s1_ptr + ld [s2_ptr],%g2 + add s2_ptr,4,s2_ptr + add n,-1,n + subcc %g4,%g2,%o4 + st %o4,[res_ptr] + add res_ptr,4,res_ptr +L(v1): addx %g0,%g0,%o4 C save cy in register + cmp n,2 C if n < 2 ... + bl L(end2) C ...
branch to tail code + subcc %g0,%o4,%g0 C restore cy + + ld [s1_ptr+0],%g4 + addcc n,-10,n + ld [s1_ptr+4],%g1 + ldd [s2_ptr+0],%g2 + blt L(fin1) + subcc %g0,%o4,%g0 C restore cy +C Subtract blocks of 8 limbs until less than 8 limbs remain +L(loop1): + subxcc %g4,%g2,%o4 + ld [s1_ptr+8],%g4 + subxcc %g1,%g3,%o5 + ld [s1_ptr+12],%g1 + ldd [s2_ptr+8],%g2 + std %o4,[res_ptr+0] + subxcc %g4,%g2,%o4 + ld [s1_ptr+16],%g4 + subxcc %g1,%g3,%o5 + ld [s1_ptr+20],%g1 + ldd [s2_ptr+16],%g2 + std %o4,[res_ptr+8] + subxcc %g4,%g2,%o4 + ld [s1_ptr+24],%g4 + subxcc %g1,%g3,%o5 + ld [s1_ptr+28],%g1 + ldd [s2_ptr+24],%g2 + std %o4,[res_ptr+16] + subxcc %g4,%g2,%o4 + ld [s1_ptr+32],%g4 + subxcc %g1,%g3,%o5 + ld [s1_ptr+36],%g1 + ldd [s2_ptr+32],%g2 + std %o4,[res_ptr+24] + addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + add s1_ptr,32,s1_ptr + add s2_ptr,32,s2_ptr + add res_ptr,32,res_ptr + bge L(loop1) + subcc %g0,%o4,%g0 C restore cy + +L(fin1): + addcc n,8-2,n + blt L(end1) + subcc %g0,%o4,%g0 C restore cy +C Subtract blocks of 2 limbs until less than 2 limbs remain +L(loope1): + subxcc %g4,%g2,%o4 + ld [s1_ptr+8],%g4 + subxcc %g1,%g3,%o5 + ld [s1_ptr+12],%g1 + ldd [s2_ptr+8],%g2 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + addcc n,-2,n + add s1_ptr,8,s1_ptr + add s2_ptr,8,s2_ptr + add res_ptr,8,res_ptr + bge L(loope1) + subcc %g0,%o4,%g0 C restore cy +L(end1): + subxcc %g4,%g2,%o4 + subxcc %g1,%g3,%o5 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + + andcc n,1,%g0 + be L(ret1) + subcc %g0,%o4,%g0 C restore cy +C Subtract last limb + ld [s1_ptr+8],%g4 + ld [s2_ptr+8],%g2 + subxcc %g4,%g2,%o4 + st %o4,[res_ptr+8] + +L(ret1): + retl + addx %g0,%g0,%o0 C return carry-out from most sign. limb + +L(1): xor s1_ptr,res_ptr,%g1 + andcc %g1,4,%g0 + bne L(2) + nop +C ** V1b ** + andcc res_ptr,4,%g0 C res_ptr unaligned?
Side effect: cy=0 + be L(v1b) C if no, branch + nop +C Subtract least significant limb separately to align res_ptr and s1_ptr + ld [s2_ptr],%g4 + add s2_ptr,4,s2_ptr + ld [s1_ptr],%g2 + add s1_ptr,4,s1_ptr + add n,-1,n + subcc %g2,%g4,%o4 + st %o4,[res_ptr] + add res_ptr,4,res_ptr +L(v1b): addx %g0,%g0,%o4 C save cy in register + cmp n,2 C if n < 2 ... + bl L(end2) C ... branch to tail code + subcc %g0,%o4,%g0 C restore cy + + ld [s2_ptr+0],%g4 + addcc n,-10,n + ld [s2_ptr+4],%g1 + ldd [s1_ptr+0],%g2 + blt L(fin1b) + subcc %g0,%o4,%g0 C restore cy +C Subtract blocks of 8 limbs until less than 8 limbs remain +L(loop1b): + subxcc %g2,%g4,%o4 + ld [s2_ptr+8],%g4 + subxcc %g3,%g1,%o5 + ld [s2_ptr+12],%g1 + ldd [s1_ptr+8],%g2 + std %o4,[res_ptr+0] + subxcc %g2,%g4,%o4 + ld [s2_ptr+16],%g4 + subxcc %g3,%g1,%o5 + ld [s2_ptr+20],%g1 + ldd [s1_ptr+16],%g2 + std %o4,[res_ptr+8] + subxcc %g2,%g4,%o4 + ld [s2_ptr+24],%g4 + subxcc %g3,%g1,%o5 + ld [s2_ptr+28],%g1 + ldd [s1_ptr+24],%g2 + std %o4,[res_ptr+16] + subxcc %g2,%g4,%o4 + ld [s2_ptr+32],%g4 + subxcc %g3,%g1,%o5 + ld [s2_ptr+36],%g1 + ldd [s1_ptr+32],%g2 + std %o4,[res_ptr+24] + addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + add s1_ptr,32,s1_ptr + add s2_ptr,32,s2_ptr + add res_ptr,32,res_ptr + bge L(loop1b) + subcc %g0,%o4,%g0 C restore cy + +L(fin1b): + addcc n,8-2,n + blt L(end1b) + subcc %g0,%o4,%g0 C restore cy +C Subtract blocks of 2 limbs until less than 2 limbs remain +L(loope1b): + subxcc %g2,%g4,%o4 + ld [s2_ptr+8],%g4 + subxcc %g3,%g1,%o5 + ld [s2_ptr+12],%g1 + ldd [s1_ptr+8],%g2 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + addcc n,-2,n + add s1_ptr,8,s1_ptr + add s2_ptr,8,s2_ptr + add res_ptr,8,res_ptr + bge L(loope1b) + subcc %g0,%o4,%g0 C restore cy +L(end1b): + subxcc %g2,%g4,%o4 + subxcc %g3,%g1,%o5 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + + andcc n,1,%g0 + be L(ret1b) + subcc %g0,%o4,%g0 C restore cy +C Subtract last limb + ld [s2_ptr+8],%g4 + ld [s1_ptr+8],%g2 + subxcc
%g2,%g4,%o4 + st %o4,[res_ptr+8] + +L(ret1b): + retl + addx %g0,%g0,%o0 C return carry-out from most sign. limb + +C ** V2 ** +C If we come here, the alignment of s1_ptr and res_ptr as well as the +C alignment of s2_ptr and res_ptr differ. Since there are only two ways +C things can be aligned (that we care about) we now know that the alignment +C of s1_ptr and s2_ptr are the same. + +L(2): cmp n,1 + be L(jone) + nop + andcc s1_ptr,4,%g0 C s1_ptr unaligned? Side effect: cy=0 + be L(v2) C if no, branch + nop +C Subtract least significant limb separately to align s1_ptr and s2_ptr + ld [s1_ptr],%g4 + add s1_ptr,4,s1_ptr + ld [s2_ptr],%g2 + add s2_ptr,4,s2_ptr + add n,-1,n + subcc %g4,%g2,%o4 + st %o4,[res_ptr] + add res_ptr,4,res_ptr + +L(v2): addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + blt L(fin2) + subcc %g0,%o4,%g0 C restore cy +C Subtract blocks of 8 limbs until less than 8 limbs remain +L(loop2): + ldd [s1_ptr+0],%g2 + ldd [s2_ptr+0],%o4 + subxcc %g2,%o4,%g2 + st %g2,[res_ptr+0] + subxcc %g3,%o5,%g3 + st %g3,[res_ptr+4] + ldd [s1_ptr+8],%g2 + ldd [s2_ptr+8],%o4 + subxcc %g2,%o4,%g2 + st %g2,[res_ptr+8] + subxcc %g3,%o5,%g3 + st %g3,[res_ptr+12] + ldd [s1_ptr+16],%g2 + ldd [s2_ptr+16],%o4 + subxcc %g2,%o4,%g2 + st %g2,[res_ptr+16] + subxcc %g3,%o5,%g3 + st %g3,[res_ptr+20] + ldd [s1_ptr+24],%g2 + ldd [s2_ptr+24],%o4 + subxcc %g2,%o4,%g2 + st %g2,[res_ptr+24] + subxcc %g3,%o5,%g3 + st %g3,[res_ptr+28] + addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + add s1_ptr,32,s1_ptr + add s2_ptr,32,s2_ptr + add res_ptr,32,res_ptr + bge L(loop2) + subcc %g0,%o4,%g0 C restore cy + +L(fin2): + addcc n,8-2,n + blt L(end2) + subcc %g0,%o4,%g0 C restore cy +L(loope2): + ldd [s1_ptr+0],%g2 + ldd [s2_ptr+0],%o4 + subxcc %g2,%o4,%g2 + st %g2,[res_ptr+0] + subxcc %g3,%o5,%g3 + st %g3,[res_ptr+4] + addx %g0,%g0,%o4 C save cy in register + addcc n,-2,n + add s1_ptr,8,s1_ptr + add s2_ptr,8,s2_ptr + add res_ptr,8,res_ptr + bge L(loope2) + subcc %g0,%o4,%g0 C restore cy +L(end2):
+ andcc n,1,%g0 + be L(ret2) + subcc %g0,%o4,%g0 C restore cy +C Subtract last limb +L(jone): + ld [s1_ptr],%g4 + ld [s2_ptr],%g2 + subxcc %g4,%g2,%o4 + st %o4,[res_ptr] + +L(ret2): + retl + addx %g0,%g0,%o0 C return carry-out from most sign. limb +EPILOGUE(mpn_sub_n) diff --git a/ghc/rts/gmp/mpn/sparc32/submul_1.asm b/ghc/rts/gmp/mpn/sparc32/submul_1.asm new file mode 100644 index 0000000..12abd84 --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc32/submul_1.asm @@ -0,0 +1,146 @@ +dnl SPARC mpn_submul_1 -- Multiply a limb vector with a limb and subtract +dnl the result from a second limb vector. + +dnl Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_submul_1) + C Make S1_PTR and RES_PTR point at the end of their blocks + C and put (- 4 x SIZE) in index/loop counter.
+ sll %o2,2,%o2 + add %o0,%o2,%o4 C RES_PTR in o4 since o0 is retval + add %o1,%o2,%o1 + sub %g0,%o2,%o2 + + cmp %o3,0xfff C does s2_limb fit in 12 bits? + bgu L(large) C no: take the 32-step multiply path + nop + + ld [%o1+%o2],%o5 + mov 0,%o0 C cy_limb = 0 + b L(0) + add %o4,-4,%o4 C delay slot: bias res pointer by one limb +L(loop0): + subcc %o5,%g1,%g1 + ld [%o1+%o2],%o5 + addx %o0,%g0,%o0 + st %g1,[%o4+%o2] +L(0): wr %g0,%o3,%y C Y = s2_limb, the multiplier on this path + sra %o5,31,%g2 + and %o3,%g2,%g2 C g2 = s2_limb if the s1 limb has its top bit set, else 0 + andcc %g1,0,%g1 C zero the partial product and the condition codes + mulscc %g1,%o5,%g1 C 12 multiply steps, one per multiplier bit + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,0,%g1 C final step adds nothing, it only realigns the result + sra %g1,20,%g4 + sll %g1,12,%g1 + rd %y,%g3 + srl %g3,20,%g3 + or %g1,%g3,%g1 C g1 = low limb of product, g4 = the bits above it + + addcc %g1,%o0,%g1 + addx %g2,%g4,%o0 C add sign-compensation and cy to hi limb + addcc %o2,4,%o2 C loop counter + bne L(loop0) + ld [%o4+%o2],%o5 C delay slot: res limb to subtract from + + subcc %o5,%g1,%g1 + addx %o0,%g0,%o0 + retl + st %g1,[%o4+%o2] C delay slot: store the last result limb + +L(large): + ld [%o1+%o2],%o5 + mov 0,%o0 + sra %o3,31,%g4 C g4 = mask of ones iff S2_LIMB < 0 + b L(1) + add %o4,-4,%o4 +L(loop): + subcc %o5,%g3,%g3 + ld [%o1+%o2],%o5 + addx %o0,%g0,%o0 + st %g3,[%o4+%o2] +L(1): wr %g0,%o5,%y C Y = s1 limb, the multiplier on this path + and %o5,%g4,%g2 + andcc %g0,%g0,%g1 + mulscc %g1,%o3,%g1 C 32 multiply steps + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%g0,%g1 C final step adds nothing, it only realigns the result + rd %y,%g3 C g3 = low limb of product + addcc %g3,%o0,%g3 + addx %g2,%g1,%o0 C hi limb plus sign-compensation plus carry + addcc %o2,4,%o2 + bne L(loop) + ld [%o4+%o2],%o5 +
+ subcc %o5,%g3,%g3 C last res limb minus low product limb + addx %o0,%g0,%o0 C count the borrow in the returned cy limb + retl + st %g3,[%o4+%o2] C delay slot: store the last result limb +EPILOGUE(mpn_submul_1) diff --git a/ghc/rts/gmp/mpn/sparc32/udiv_fp.asm b/ghc/rts/gmp/mpn/sparc32/udiv_fp.asm new file mode 100644 index 0000000..e340e14 --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc32/udiv_fp.asm @@ -0,0 +1,158 @@ +dnl SPARC v7 __udiv_qrnnd division support, used from longlong.h. +dnl This is for v7 CPUs with a floating-point unit. + +dnl Copyright (C) 1993, 1994, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA.
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C rem_ptr i0 +C n1 i1 +C n0 i2 +C d i3 + +ASM_START() + +ifdef(`PIC', +` TEXT +L(getpc): + retl + nop') + + TEXT + ALIGN(8) +L(C0): .double 0r4294967296 C 2^32 +L(C1): .double 0r2147483648 C 2^31 + +PROLOGUE(mpn_udiv_qrnnd) + save %sp,-104,%sp C new register window with a 104-byte frame + st %i1,[%fp-8] + ld [%fp-8],%f10 C n1 moved to f10 through the stack slot + +ifdef(`PIC', +`L(pc): call L(getpc) C put address of this insn in %o7 + ldd [%o7+L(C0)-L(pc)],%f8', +` sethi %hi(L(C0)),%o7 + ldd [%o7+%lo(L(C0))],%f8') + + fitod %f10,%f4 C f4 = n1 converted as a signed int + cmp %i1,0 + bge L(248) + mov %i0,%i5 C delay slot: keep rem_ptr in i5 + faddd %f4,%f8,%f4 C n1 was negative as signed: add 2^32 +L(248): + st %i2,[%fp-8] + ld [%fp-8],%f10 + fmuld %f4,%f8,%f6 C f6 = n1 times 2^32 + cmp %i2,0 + bge L(249) + fitod %f10,%f2 + faddd %f2,%f8,%f2 +L(249): + st %i3,[%fp-8] + faddd %f6,%f2,%f2 C f2 = n1 times 2^32 plus n0 + ld [%fp-8],%f10 + cmp %i3,0 + bge L(250) + fitod %f10,%f4 + faddd %f4,%f8,%f4 +L(250): + fdivd %f2,%f4,%f2 C f2 = quotient estimate as a double + +ifdef(`PIC', +` ldd [%o7+L(C1)-L(pc)],%f4', +` sethi %hi(L(C1)),%o7 + ldd [%o7+%lo(L(C1))],%f4') + + fcmped %f2,%f4 C compare the estimate with 2^31 + nop + fbge,a L(251) + fsubd %f2,%f4,%f2 C annulled delay slot, runs only when the branch is taken: subtract 2^31 + fdtoi %f2,%f2 + st %f2,[%fp-8] + b L(252) + ld [%fp-8],%i4 +L(251): + fdtoi %f2,%f2 + st %f2,[%fp-8] + ld [%fp-8],%i4 + sethi %hi(-2147483648),%g2 + xor %i4,%g2,%i4 C flip bit 31 to put the 2^31 back +L(252): + wr %g0,%i4,%y C Y = quotient estimate q + sra %i3,31,%g2 + and %i4,%g2,%g2 + andcc %g0,0,%g1 + mulscc %g1,%i3,%g1 C 32 multiply steps: q times d + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,0,%g1 + add %g1,%g2,%i0 C i0 = high word of q times d + rd %y,%g3 C g3 = low word of q times d + subcc %i2,%g3,%o7 C o7 = n0 minus the low word + subxcc %i1,%i0,%g0 C n1 minus the high word minus borrow, flags only
+ be L(253) C taken when that high-word difference is zero + cmp %o7,%i3 C delay slot: compare remainder candidate with d + + add %i4,-1,%i0 C high words differ: return q minus 1 + add %o7,%i3,%o7 C and remainder plus d + st %o7,[%i5] C store remainder through rem_ptr + ret + restore +L(253): + blu L(246) C remainder below d: q is final + mov %i4,%i0 C delay slot: return q + add %i4,1,%i0 C otherwise return q plus 1 + sub %o7,%i3,%o7 C and remainder minus d +L(246): + st %o7,[%i5] + ret + restore +EPILOGUE(mpn_udiv_qrnnd) diff --git a/ghc/rts/gmp/mpn/sparc32/udiv_nfp.asm b/ghc/rts/gmp/mpn/sparc32/udiv_nfp.asm new file mode 100644 index 0000000..ae19f4c --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc32/udiv_nfp.asm @@ -0,0 +1,193 @@ +dnl SPARC v7 __udiv_qrnnd division support, used from longlong.h. +dnl This is for v7 CPUs without a floating-point unit. + +dnl Copyright (C) 1993, 1994, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA.
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C rem_ptr o0 +C n1 o1 +C n0 o2 +C d o3 + +ASM_START() +PROLOGUE(mpn_udiv_qrnnd) + tst %o3 C is the top bit of d set? + bneg L(largedivisor) + mov 8,%g1 C delay slot: 8 passes of 4 unrolled steps + + b L(p1) + addxcc %o2,%o2,%o2 C delay slot: double n0, its top bit goes to carry + +L(plop): + bcc L(n1) + addxcc %o2,%o2,%o2 +L(p1): addx %o1,%o1,%o1 C double n1 and bring that bit in + subcc %o1,%o3,%o4 C trial subtract of d + bcc L(n2) + addxcc %o2,%o2,%o2 +L(p2): addx %o1,%o1,%o1 + subcc %o1,%o3,%o4 + bcc L(n3) + addxcc %o2,%o2,%o2 +L(p3): addx %o1,%o1,%o1 + subcc %o1,%o3,%o4 + bcc L(n4) + addxcc %o2,%o2,%o2 +L(p4): addx %o1,%o1,%o1 + addcc %g1,-1,%g1 + bne L(plop) + subcc %o1,%o3,%o4 + bcc L(n5) + addxcc %o2,%o2,%o2 +L(p5): st %o1,[%o0] C store remainder + retl + xnor %g0,%o2,%o0 C delay slot: return the complement of the bits collected in o2 + +L(nlop): + bcc L(p1) + addxcc %o2,%o2,%o2 +L(n1): addx %o4,%o4,%o4 + subcc %o4,%o3,%o1 + bcc L(p2) + addxcc %o2,%o2,%o2 +L(n2): addx %o4,%o4,%o4 + subcc %o4,%o3,%o1 + bcc L(p3) + addxcc %o2,%o2,%o2 +L(n3): addx %o4,%o4,%o4 + subcc %o4,%o3,%o1 + bcc L(p4) + addxcc %o2,%o2,%o2 +L(n4): addx %o4,%o4,%o4 + addcc %g1,-1,%g1 + bne L(nlop) + subcc %o4,%o3,%o1 + bcc L(p5) + addxcc %o2,%o2,%o2 +L(n5): st %o4,[%o0] + retl + xnor %g0,%o2,%o0 + +L(largedivisor): + and %o2,1,%o5 C %o5 = n0 & 1 + + srl %o2,1,%o2 + sll %o1,31,%g2 + or %g2,%o2,%o2 C %o2 = lo(n1n0 >> 1) + srl %o1,1,%o1 C %o1 = hi(n1n0 >> 1) + + and %o3,1,%g2 + srl %o3,1,%g3 C %g3 = floor(d / 2) + add %g3,%g2,%g3 C %g3 = ceil(d / 2) + + b L(Lp1) + addxcc %o2,%o2,%o2 + +L(Lplop): + bcc L(Ln1) + addxcc %o2,%o2,%o2 +L(Lp1): addx %o1,%o1,%o1 + subcc %o1,%g3,%o4 + bcc L(Ln2) + addxcc %o2,%o2,%o2 +L(Lp2): addx %o1,%o1,%o1 + subcc %o1,%g3,%o4 + bcc L(Ln3) + addxcc %o2,%o2,%o2 +L(Lp3): addx %o1,%o1,%o1 + subcc %o1,%g3,%o4 + bcc L(Ln4) + addxcc %o2,%o2,%o2 +L(Lp4): addx %o1,%o1,%o1 + addcc %g1,-1,%g1 + bne L(Lplop) + subcc %o1,%g3,%o4 + bcc L(Ln5) + addxcc %o2,%o2,%o2 +L(Lp5): add %o1,%o1,%o1 C << 1 + tst %g2 + bne L(oddp) + add %o5,%o1,%o1 + st %o1,[%o0] + retl + xnor %g0,%o2,%o0 + +L(Lnlop): + bcc L(Lp1) + addxcc %o2,%o2,%o2 +L(Ln1): addx %o4,%o4,%o4 + subcc %o4,%g3,%o1 + bcc L(Lp2) +
addxcc %o2,%o2,%o2 +L(Ln2): addx %o4,%o4,%o4 + subcc %o4,%g3,%o1 + bcc L(Lp3) + addxcc %o2,%o2,%o2 +L(Ln3): addx %o4,%o4,%o4 + subcc %o4,%g3,%o1 + bcc L(Lp4) + addxcc %o2,%o2,%o2 +L(Ln4): addx %o4,%o4,%o4 + addcc %g1,-1,%g1 + bne L(Lnlop) + subcc %o4,%g3,%o1 + bcc L(Lp5) + addxcc %o2,%o2,%o2 +L(Ln5): add %o4,%o4,%o4 C << 1 + tst %g2 C was d odd? + bne L(oddn) + add %o5,%o4,%o4 C delay slot: add back the low bit of n0 saved in o5 + st %o4,[%o0] + retl + xnor %g0,%o2,%o0 + +L(oddp): + xnor %g0,%o2,%o2 + C q' in %o2. r' in %o1 + addcc %o1,%o2,%o1 + bcc L(Lp6) + addx %o2,0,%o2 + sub %o1,%o3,%o1 +L(Lp6): subcc %o1,%o3,%g0 + bcs L(Lp7) + subx %o2,-1,%o2 + sub %o1,%o3,%o1 +L(Lp7): st %o1,[%o0] C store remainder + retl + mov %o2,%o0 C delay slot: return quotient + +L(oddn): + xnor %g0,%o2,%o2 + C q' in %o2. r' in %o4 + addcc %o4,%o2,%o4 + bcc L(Ln6) + addx %o2,0,%o2 + sub %o4,%o3,%o4 +L(Ln6): subcc %o4,%o3,%g0 + bcs L(Ln7) + subx %o2,-1,%o2 + sub %o4,%o3,%o4 +L(Ln7): st %o4,[%o0] C store remainder + retl + mov %o2,%o0 C delay slot: return quotient +EPILOGUE(mpn_udiv_qrnnd) diff --git a/ghc/rts/gmp/mpn/sparc32/umul.asm b/ghc/rts/gmp/mpn/sparc32/umul.asm new file mode 100644 index 0000000..efa5685 --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc32/umul.asm @@ -0,0 +1,68 @@ +dnl SPARC mpn_umul_ppmm -- support for longlong.h for non-gcc. + +dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB.
If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_umul_ppmm) + wr %g0,%o1,%y C Y = o1, the multiplier + sra %o2,31,%g2 C Don't move this insn + and %o1,%g2,%g2 C Don't move this insn + andcc %g0,0,%g1 C Don't move this insn + mulscc %g1,%o2,%g1 C 32 multiply steps with o2 as the multiplicand + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,0,%g1 C final step adds nothing, it only realigns the result + rd %y,%g3 C g3 = low word of the product + st %g3,[%o0] C store the low word through the pointer in o0 + retl + add %g1,%g2,%o0 C delay slot: return high word plus sign compensation +EPILOGUE(mpn_umul_ppmm) diff --git a/ghc/rts/gmp/mpn/sparc32/v8/addmul_1.asm b/ghc/rts/gmp/mpn/sparc32/v8/addmul_1.asm new file mode 100644 index 0000000..da44644 --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc32/v8/addmul_1.asm @@ -0,0 +1,122 @@ +dnl SPARC v8 mpn_addmul_1 -- Multiply a limb vector with a limb and +dnl add the result to a second limb vector. + +dnl Copyright (C) 1992, 1993, 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version.
+ +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_addmul_1) + orcc %g0,%g0,%g2 C g2 = 0 (the carry limb), carry flag cleared too + ld [%o1+0],%o4 C 1 + + sll %o2,4,%g1 + and %g1,(4-1)<<4,%g1 C g1 = 16 times (size mod 4), the byte offset of an entry stub below +ifdef(`PIC', +` mov %o7,%g4 C Save return address register +0: call 1f + add %o7,L(1)-0b,%g3 +1: mov %g4,%o7 C Restore return address register +', +` sethi %hi(L(1)),%g3 + or %g3,%lo(L(1)),%g3 +') + jmp %g3+%g1 C dispatch on size mod 4, each stub is four instructions + nop +L(1): +L(L00): add %o0,-4,%o0 + b L(loop00) C 4, 8, 12, ... + add %o1,-4,%o1 + nop +L(L01): b L(loop01) C 1, 5, 9, ... + nop + nop + nop +L(L10): add %o0,-12,%o0 C 2, 6, 10, ... + b L(loop10) + add %o1,4,%o1 + nop +L(L11): add %o0,-8,%o0 C 3, 7, 11, ...
+ b L(loop11) + add %o1,-8,%o1 + nop + +L(loop): + addcc %g3,%g2,%g3 C 1 + ld [%o1+4],%o4 C 2 + rd %y,%g2 C 1 + addx %g0,%g2,%g2 C g2 = high word plus carry + ld [%o0+0],%g1 C 2 + addcc %g1,%g3,%g3 + st %g3,[%o0+0] C 1 +L(loop00): + umul %o4,%o3,%g3 C 2 + ld [%o0+4],%g1 C 2 + addxcc %g3,%g2,%g3 C 2 + ld [%o1+8],%o4 C 3 + rd %y,%g2 C 2 + addx %g0,%g2,%g2 + nop + addcc %g1,%g3,%g3 + st %g3,[%o0+4] C 2 +L(loop11): + umul %o4,%o3,%g3 C 3 + addxcc %g3,%g2,%g3 C 3 + ld [%o1+12],%o4 C 4 + rd %y,%g2 C 3 + add %o1,16,%o1 + addx %g0,%g2,%g2 + ld [%o0+8],%g1 C 2 + addcc %g1,%g3,%g3 + st %g3,[%o0+8] C 3 +L(loop10): + umul %o4,%o3,%g3 C 4 + addxcc %g3,%g2,%g3 C 4 + ld [%o1+0],%o4 C 1 + rd %y,%g2 C 4 + addx %g0,%g2,%g2 + ld [%o0+12],%g1 C 2 + addcc %g1,%g3,%g3 + st %g3,[%o0+12] C 4 + add %o0,16,%o0 + addx %g0,%g2,%g2 +L(loop01): + addcc %o2,-4,%o2 C four limbs are handled per pass + bg L(loop) C continue while limbs remain + umul %o4,%o3,%g3 C 1 + + addcc %g3,%g2,%g3 C 4 + rd %y,%g2 C 4 + addx %g0,%g2,%g2 + ld [%o0+0],%g1 C 2 + addcc %g1,%g3,%g3 + st %g3,[%o0+0] C 4 + addx %g0,%g2,%o0 C return the final carry limb in o0 + + retl + nop +EPILOGUE(mpn_addmul_1) diff --git a/ghc/rts/gmp/mpn/sparc32/v8/mul_1.asm b/ghc/rts/gmp/mpn/sparc32/v8/mul_1.asm new file mode 100644 index 0000000..8012475 --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc32/v8/mul_1.asm @@ -0,0 +1,103 @@ +dnl SPARC v8 mpn_mul_1 -- Multiply a limb vector with a single limb and +dnl store the product in a second limb vector. + +dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_mul_1) + sll %o2,4,%g1 + and %g1,(4-1)<<4,%g1 C g1 = 16 times (size mod 4), a byte offset from the first stub +ifdef(`PIC', +` mov %o7,%g4 C Save return address register +0: call 1f + add %o7,L(1)-0b,%g3 +1: mov %g4,%o7 C Restore return address register +', +` sethi %hi(L(1)),%g3 + or %g3,%lo(L(1)),%g3 +') + jmp %g3+%g1 C dispatch on size mod 4 + ld [%o1+0],%o4 C 1 +L(1): +L(L00): add %o0,-4,%o0 + add %o1,-4,%o1 + b L(loop00) C 4, 8, 12, ... + orcc %g0,%g0,%g2 C delay slot: g2 = 0, carry flag cleared +L(L01): b L(loop01) C 1, 5, 9, ... + orcc %g0,%g0,%g2 + nop + nop +L(L10): add %o0,-12,%o0 C 2, 6, 10, ... + add %o1,4,%o1 + b L(loop10) + orcc %g0,%g0,%g2 + nop C byte offset 48: size mod 4 equal to 3 enters here and falls through to the next stub +L(L11): add %o0,-8,%o0 C 3, 7, 11, ...
+ add %o1,-8,%o1 + b L(loop11) + orcc %g0,%g0,%g2 + +L(loop): + addcc %g3,%g2,%g3 C 1 + ld [%o1+4],%o4 C 2 + st %g3,[%o0+0] C 1 + rd %y,%g2 C 1 +L(loop00): + umul %o4,%o3,%g3 C 2 + addxcc %g3,%g2,%g3 C 2 + ld [%o1+8],%o4 C 3 + st %g3,[%o0+4] C 2 + rd %y,%g2 C 2 +L(loop11): + umul %o4,%o3,%g3 C 3 + addxcc %g3,%g2,%g3 C 3 + ld [%o1+12],%o4 C 4 + add %o1,16,%o1 + st %g3,[%o0+8] C 3 + rd %y,%g2 C 3 +L(loop10): + umul %o4,%o3,%g3 C 4 + addxcc %g3,%g2,%g3 C 4 + ld [%o1+0],%o4 C 1 + st %g3,[%o0+12] C 4 + add %o0,16,%o0 + rd %y,%g2 C 4 + addx %g0,%g2,%g2 C fold the carry flag into the carry limb +L(loop01): + addcc %o2,-4,%o2 C four limbs are handled per pass + bg L(loop) C continue while limbs remain + umul %o4,%o3,%g3 C 1 + + addcc %g3,%g2,%g3 C 4 + st %g3,[%o0+0] C 4 + rd %y,%g2 C 4 + + retl + addx %g0,%g2,%o0 C delay slot: return high word plus carry +EPILOGUE(mpn_mul_1) diff --git a/ghc/rts/gmp/mpn/sparc32/v8/submul_1.asm b/ghc/rts/gmp/mpn/sparc32/v8/submul_1.asm new file mode 100644 index 0000000..9ed132f --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc32/v8/submul_1.asm @@ -0,0 +1,58 @@ +dnl SPARC v8 mpn_submul_1 -- Multiply a limb vector with a limb and +dnl subtract the result from a second limb vector. + +dnl Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA.
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_submul_1) + sub %g0,%o2,%o2 C negate ... + sll %o2,2,%o2 C ... and scale size + sub %o1,%o2,%o1 C o1 is offset s1_ptr + sub %o0,%o2,%g1 C g1 is offset res_ptr + + mov 0,%o0 C clear cy_limb + +L(loop): + ld [%o1+%o2],%o4 C next s1 limb + ld [%g1+%o2],%g2 C matching res limb + umul %o4,%o3,%o5 C o5 = low word of s1 limb times s2_limb + rd %y,%g3 C g3 = high word of that product + addcc %o5,%o0,%o5 C add the incoming cy_limb to the low word + addx %g3,0,%o0 C cy_limb = high word plus carry + subcc %g2,%o5,%g2 C res limb minus low word + addx %o0,0,%o0 C count the borrow in cy_limb + st %g2,[%g1+%o2] C write back the res limb + + addcc %o2,4,%o2 C step the negative byte offset toward zero + bne L(loop) + nop + + retl C cy_limb is returned in o0 + nop +EPILOGUE(mpn_submul_1) diff --git a/ghc/rts/gmp/mpn/sparc32/v8/supersparc/udiv.asm b/ghc/rts/gmp/mpn/sparc32/v8/supersparc/udiv.asm new file mode 100644 index 0000000..0d5e8d4 --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc32/v8/supersparc/udiv.asm @@ -0,0 +1,122 @@ +dnl SuperSPARC mpn_udiv_qrnnd division support, used from longlong.h. +dnl This is for SuperSPARC only, to compensate for its semi-functional +dnl udiv instruction. + +dnl Copyright (C) 1993, 1994, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA.
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C rem_ptr i0 +C n1 i1 +C n0 i2 +C d i3 + +ASM_START() + +ifdef(`PIC', +` TEXT +L(getpc): + retl + nop') + + TEXT + ALIGN(8) +L(C0): .double 0r4294967296 C 2^32 +L(C1): .double 0r2147483648 C 2^31 + +PROLOGUE(mpn_udiv_qrnnd) + save %sp,-104,%sp C new register window with a 104-byte frame + st %i1,[%fp-8] + ld [%fp-8],%f10 C n1 moved to f10 through the stack slot + +ifdef(`PIC', +`L(pc): call L(getpc) C put address of this insn in %o7 + ldd [%o7+L(C0)-L(pc)],%f8', +` sethi %hi(L(C0)),%o7 + ldd [%o7+%lo(L(C0))],%f8') + + fitod %f10,%f4 C f4 = n1 converted as a signed int + cmp %i1,0 + bge L(248) + mov %i0,%i5 C delay slot: keep rem_ptr in i5 + faddd %f4,%f8,%f4 C n1 was negative as signed: add 2^32 +L(248): + st %i2,[%fp-8] + ld [%fp-8],%f10 + fmuld %f4,%f8,%f6 C f6 = n1 times 2^32 + cmp %i2,0 + bge L(249) + fitod %f10,%f2 + faddd %f2,%f8,%f2 +L(249): + st %i3,[%fp-8] + faddd %f6,%f2,%f2 C f2 = n1 times 2^32 plus n0 + ld [%fp-8],%f10 + cmp %i3,0 + bge L(250) + fitod %f10,%f4 + faddd %f4,%f8,%f4 +L(250): + fdivd %f2,%f4,%f2 C f2 = quotient estimate as a double + +ifdef(`PIC', +` ldd [%o7+L(C1)-L(pc)],%f4', +` sethi %hi(L(C1)),%o7 + ldd [%o7+%lo(L(C1))],%f4') + + fcmped %f2,%f4 C compare the estimate with 2^31 + nop + fbge,a L(251) + fsubd %f2,%f4,%f2 C annulled delay slot, runs only when the branch is taken: subtract 2^31 + fdtoi %f2,%f2 + st %f2,[%fp-8] + b L(252) + ld [%fp-8],%i4 +L(251): + fdtoi %f2,%f2 + st %f2,[%fp-8] + ld [%fp-8],%i4 + sethi %hi(-2147483648),%g2 + xor %i4,%g2,%i4 C flip bit 31 to put the 2^31 back +L(252): + umul %i3,%i4,%g3 C g3 = low word of d times the quotient estimate + rd %y,%i0 C i0 = high word of that product + subcc %i2,%g3,%o7 C o7 = n0 minus the low word + subxcc %i1,%i0,%g0 C n1 minus the high word minus borrow, flags only + be L(253) C taken when that high-word difference is zero + cmp %o7,%i3 C delay slot: compare remainder candidate with d + + add %i4,-1,%i0 C high words differ: return q minus 1 + add %o7,%i3,%o7 C and remainder plus d + st %o7,[%i5] C store remainder through rem_ptr + ret + restore +L(253): + blu L(246) C remainder below d: q is final + mov %i4,%i0 C delay slot: return q + add %i4,1,%i0 C otherwise return q plus 1 + sub %o7,%i3,%o7 C and remainder minus d +L(246): + st %o7,[%i5] + ret + restore +EPILOGUE(mpn_udiv_qrnnd) diff --git a/ghc/rts/gmp/mpn/sparc32/v8/umul.asm b/ghc/rts/gmp/mpn/sparc32/v8/umul.asm new file mode 100644 index 0000000..ae8f692 --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc32/v8/umul.asm @@ -0,0 +1,31 @@ +dnl SPARC v8 mpn_umul_ppmm -- support for longlong.h for non-gcc. + +dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library.
+ +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_umul_ppmm) + umul %o1,%o2,%g2 + st %g2,[%o0] + retl + rd %y,%o0 +EPILOGUE(mpn_umul_ppmm) diff --git a/ghc/rts/gmp/mpn/sparc32/v9/README b/ghc/rts/gmp/mpn/sparc32/v9/README new file mode 100644 index 0000000..9b39713 --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc32/v9/README @@ -0,0 +1,4 @@ +Code for SPARC processors implementing version 9 of the SPARC architecture. +This code is for systems that don't preserve the full 64-bit contents of +integer registers at context switch. For other systems (such as Solaris 7 or +later) use the code in ../../sparc64. diff --git a/ghc/rts/gmp/mpn/sparc32/v9/addmul_1.asm b/ghc/rts/gmp/mpn/sparc32/v9/addmul_1.asm new file mode 100644 index 0000000..c1762cc --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc32/v9/addmul_1.asm @@ -0,0 +1,288 @@ +dnl SPARC v9 32-bit mpn_addmul_1 -- Multiply a limb vector with a limb and +dnl add the result to a second limb vector. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library.
+ +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr i0 +C s1_ptr i1 +C size i2 +C s2_limb i3 + +ASM_START() + + TEXT + ALIGN(4) +L(noll): + .word 0 C zero word, loaded into f10 below + +PROLOGUE(mpn_addmul_1) + save %sp,-256,%sp C new register window with a 256-byte frame + +ifdef(`PIC', +`L(pc): rd %pc,%o7 + ld [%o7+L(noll)-L(pc)],%f10', +` sethi %hi(L(noll)),%g1 + ld [%g1+%lo(L(noll))],%f10') + + sethi %hi(0xffff0000),%o0 + andn %i3,%o0,%o0 C o0 = low 16 bits of s2_limb + st %o0,[%fp-16] + ld [%fp-16],%f11 + fxtod %f10,%f6 C f6 = low half of s2_limb as a double (f10 is zero, f11 holds the value) + + srl %i3,16,%o0 C o0 = high 16 bits of s2_limb + st %o0,[%fp-16] + ld [%fp-16],%f11 + fxtod %f10,%f8 C f8 = high half of s2_limb as a double + + mov 0,%g3 C cy = 0 + + ld [%i1],%f11 + subcc %i2,1,%i2 + be,pn %icc,L(end1) + add %i1,4,%i1 C s1_ptr++ + + fxtod %f10,%f2 C f2 = current s1 limb as a double + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 C s1 limb times the high half (p16) + fmuld %f2,%f6,%f4 C s1 limb times the low half (p0) + fdtox %f16,%f14 + std %f14,[%fp-24] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,L(end2) + std %f12,[%fp-16] + + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-40] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,L(end3) + std %f12,[%fp-32] + + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + ld [%i0],%g5 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0
+ fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-24] + fdtox %f4,%f12 + add %i0,4,%i0 C res_ptr++ + subcc %i2,1,%i2 + be,pn %icc,L(end4) + std %f12,[%fp-16] + + b,a L(loopm) + + .align 16 +C BEGIN LOOP +L(loop): + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0],%g5 + srlx %g4,32,%g3 C new cy = bits 32 and up of the sum + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] C store the low 32 bits of the sum + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-24] + fdtox %f4,%f12 + std %f12,[%fp-16] + subcc %i2,1,%i2 + be,pn %icc,L(loope) + add %i0,4,%i0 C res_ptr++ +L(loopm): + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-32],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-40] + fdtox %f4,%f12 + std %f12,[%fp-32] + subcc %i2,1,%i2 + bne,pt %icc,L(loop) + add %i0,4,%i0 C res_ptr++ +C END LOOP + + fxtod %f10,%f2 + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + b,a L(xxx) +L(loope): +L(end4): + fxtod %f10,%f2 + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-32],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-40] + fdtox %f4,%f12 + std %f12,[%fp-32] + add %i0,4,%i0 C res_ptr++ + + add
%g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + b,a L(yyy) + +L(end3): + fxtod %f10,%f2 C tail entered from the prologue when size is 3 + ld [%i0],%g5 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 +L(xxx): fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-24] + fdtox %f4,%f12 + std %f12,[%fp-16] + add %i0,4,%i0 C res_ptr++ + + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + ldx [%fp-32],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + b,a L(ret) + +L(end2): + fxtod %f10,%f2 C tail entered from the prologue when size is 2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-40] + fdtox %f4,%f12 + std %f12,[%fp-32] + ld [%i0],%g5 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 +L(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + ldx [%fp-32],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + b,a L(ret) + +L(end1): + fxtod %f10,%f2 C tail entered from the prologue when size is 1 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-24] + fdtox %f4,%f12 + std %f12,[%fp-16] + + ld [%i0],%g5 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ +
+L(ret): add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 C final cy = bits 32 and up of the sum + st %g4,[%i0-4] C store the last result limb + + ret + restore %g0,%g3,%o0 C sideeffect: put cy in retreg +EPILOGUE(mpn_addmul_1) diff --git a/ghc/rts/gmp/mpn/sparc32/v9/gmp-mparam.h b/ghc/rts/gmp/mpn/sparc32/v9/gmp-mparam.h new file mode 100644 index 0000000..f946b90 --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc32/v9/gmp-mparam.h @@ -0,0 +1,69 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + + +/* These values are for UltraSPARC I, II, and IIi. It is bogus that + this file lives in v9, but that will do for now. */ + +/* Variations in addmul_1 speed make the multiply and square thresholds + doubtful. TOOM3_SQR_THRESHOLD had to be estimated here. */ + +/* Generated by tuneup.c, 2000-07-06.
*/ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 30 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 200 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 59 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 500 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 107 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 146 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 29 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 4 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 3 +#endif diff --git a/ghc/rts/gmp/mpn/sparc32/v9/mul_1.asm b/ghc/rts/gmp/mpn/sparc32/v9/mul_1.asm new file mode 100644 index 0000000..f8f0fdd --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc32/v9/mul_1.asm @@ -0,0 +1,267 @@ +dnl SPARC v9 32-bit mpn_mul_1 -- Multiply a limb vector with a limb and +dnl store the result in a second limb vector. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr i0 +C s1_ptr i1 +C size i2 +C s2_limb i3 + +ASM_START() + + TEXT + ALIGN(4) +L(noll): + .word 0 + +PROLOGUE(mpn_mul_1) + save %sp,-256,%sp + +ifdef(`PIC', +`L(pc): rd %pc,%o7 + ld [%o7+L(noll)-L(pc)],%f10', +` sethi %hi(L(noll)),%g1 + ld [%g1+%lo(L(noll))],%f10') + + sethi %hi(0xffff0000),%o0 + andn %i3,%o0,%o0 C low 16 bits of s2_limb + st %o0,[%fp-16] + ld [%fp-16],%f11 + fxtod %f10,%f6 C f6 = low 16 bits as a double (f10 holds 0) + + srl %i3,16,%o0 C high 16 bits of s2_limb + st %o0,[%fp-16] + ld [%fp-16],%f11 + fxtod %f10,%f8 C f8 = high 16 bits as a double + + mov 0,%g3 C cy = 0 + + ld [%i1],%f11 + subcc %i2,1,%i2 + be,pn %icc,L(end1) + add %i1,4,%i1 C s1_ptr++ + + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-24] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,L(end2) + std %f12,[%fp-16] + + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-40] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,L(end3) + std %f12,[%fp-32] + + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-24] + fdtox %f4,%f12 + add %i0,4,%i0 C res_ptr++ + subcc %i2,1,%i2 + be,pn %icc,L(end4) + std %f12,[%fp-16] + + b,a L(loopm) + + .align 16 +C BEGIN LOOP +L(loop): + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-24] + fdtox %f4,%f12 + std %f12,[%fp-16] + subcc %i2,1,%i2 + be,pn %icc,L(loope) + add %i0,4,%i0 C res_ptr++ +L(loopm): + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + srlx
%g4,32,%g3 + ldx [%fp-40],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-32],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-40] + fdtox %f4,%f12 + std %f12,[%fp-32] + subcc %i2,1,%i2 + bne,pt %icc,L(loop) + add %i0,4,%i0 C res_ptr++ +C END LOOP + + fxtod %f10,%f2 + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + b,a L(xxx) +L(loope): +L(end4): + fxtod %f10,%f2 + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-32],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-40] + fdtox %f4,%f12 + std %f12,[%fp-32] + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + b,a L(yyy) + +L(end3): + fxtod %f10,%f2 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 +L(xxx): fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-24] + fdtox %f4,%f12 + std %f12,[%fp-16] + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + ldx [%fp-32],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + b,a L(ret) + +L(end2): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-40] + fdtox %f4,%f12 + std %f12,[%fp-32] + ldx [%fp-24],%g2 C p16 + ldx
[%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 +L(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + ldx [%fp-32],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + b,a L(ret) + +L(end1): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-24] + fdtox %f4,%f12 + std %f12,[%fp-16] + + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + +L(ret): add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 C cy = bits 32 and up of p + st %g4,[%i0-4] C store the last result limb + + ret + restore %g0,%g3,%o0 C sideeffect: put cy in retreg +EPILOGUE(mpn_mul_1) diff --git a/ghc/rts/gmp/mpn/sparc32/v9/submul_1.asm b/ghc/rts/gmp/mpn/sparc32/v9/submul_1.asm new file mode 100644 index 0000000..6195ea8 --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc32/v9/submul_1.asm @@ -0,0 +1,291 @@ +dnl SPARC v9 32-bit mpn_submul_1 -- Multiply a limb vector with a limb and +dnl subtract the result from a second limb vector. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB.
If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr i0 +C s1_ptr i1 +C size i2 +C s2_limb i3 + +ASM_START() + + TEXT + ALIGN(4) +L(noll): + .word 0 + +PROLOGUE(mpn_submul_1) + save %sp,-256,%sp + +ifdef(`PIC', +`L(pc): rd %pc,%o7 + ld [%o7+L(noll)-L(pc)],%f10', +` sethi %hi(L(noll)),%g1 + ld [%g1+%lo(L(noll))],%f10') + + sethi %hi(0xffff0000),%o0 + andn %i3,%o0,%o0 C low 16 bits of s2_limb + st %o0,[%fp-16] + ld [%fp-16],%f11 + fxtod %f10,%f6 C f6 = low 16 bits as a double (f10 holds 0) + + srl %i3,16,%o0 C high 16 bits of s2_limb + st %o0,[%fp-16] + ld [%fp-16],%f11 + fxtod %f10,%f8 C f8 = high 16 bits as a double + + mov 0,%g3 C cy = 0 + + ld [%i1],%f11 + subcc %i2,1,%i2 + be,pn %icc,L(end1) + add %i1,4,%i1 C s1_ptr++ + + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-24] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,L(end2) + std %f12,[%fp-16] + + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-40] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,L(end3) + std %f12,[%fp-32] + + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + ld [%i0],%g5 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-24] + fdtox %f4,%f12 + add %i0,4,%i0 C res_ptr++ + subcc %i2,1,%i2 + be,pn %icc,L(end4) + std %f12,[%fp-16] + + b,a L(loopm) + + .align 16 +C BEGIN LOOP +L(loop): + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + subcc %g5,%g4,%l2 C subtract p from *res_ptr (ADD2) + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4] + addx %g3,0,%g3 C fold the borrow from subcc into cy + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std
%f14,[%fp-24] + fdtox %f4,%f12 + std %f12,[%fp-16] + subcc %i2,1,%i2 + be,pn %icc,L(loope) + add %i0,4,%i0 C res_ptr++ +L(loopm): + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + subcc %g5,%g4,%l2 C subtract p from *res_ptr (ADD2) + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-32],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4] + addx %g3,0,%g3 C fold the borrow from subcc into cy + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-40] + fdtox %f4,%f12 + std %f12,[%fp-32] + subcc %i2,1,%i2 + bne,pt %icc,L(loop) + add %i0,4,%i0 C res_ptr++ +C END LOOP + + fxtod %f10,%f2 + add %g3,%g1,%g4 C p += cy + subcc %g5,%g4,%l2 C subtract p from *res_ptr (ADD2) + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4] + b,a L(xxx) +L(loope): +L(end4): + fxtod %f10,%f2 + add %g3,%g1,%g4 C p += cy + subcc %g5,%g4,%l2 C subtract p from *res_ptr (ADD2) + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-32],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-40] + fdtox %f4,%f12 + std %f12,[%fp-32] + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C subtract p and borrow from *res_ptr (ADD2) + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4] + b,a L(yyy) + +L(end3): + fxtod %f10,%f2 + ld [%i0],%g5 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 +L(xxx): fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-24] + fdtox %f4,%f12 + std %f12,[%fp-16] + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C subtract p and borrow from *res_ptr (ADD2) + ld [%i0],%g5 + srlx
%g4,32,%g3 + ldx [%fp-40],%g2 C p16 + ldx [%fp-32],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C subtract p and borrow from *res_ptr (ADD2) + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + b,a L(ret) + +L(end2): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-40] + fdtox %f4,%f12 + std %f12,[%fp-32] + ld [%i0],%g5 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 +L(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C subtract p and borrow from *res_ptr (ADD2) + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + ldx [%fp-32],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + b,a L(ret) + +L(end1): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-24] + fdtox %f4,%f12 + std %f12,[%fp-16] + + ld [%i0],%g5 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + +L(ret): add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C subtract p and borrow from *res_ptr (ADD2) + srlx %g4,32,%g3 + st %l2,[%i0-4] + + addx %g3,%g0,%g3 C fold the final borrow into cy + ret + restore %g0,%g3,%o0 C sideeffect: put cy in retreg +EPILOGUE(mpn_submul_1) diff --git a/ghc/rts/gmp/mpn/sparc64/README b/ghc/rts/gmp/mpn/sparc64/README new file mode 100644 index 0000000..6923a13 --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc64/README @@ -0,0 +1,48 @@ +This directory contains mpn functions for 64-bit V9 SPARC + +RELEVANT OPTIMIZATION ISSUES + +The Ultra I/II pipeline executes up to two simple integer arithmetic operations +per cycle.
The 64-bit integer multiply instruction mulx takes from 5 cycles to +35 cycles, depending on the position of the most significant bit of the 1st +source operand. It cannot overlap with other instructions. For our use of +mulx, it will take from 5 to 20 cycles. + +Integer conditional move instructions cannot dual-issue with other integer +instructions. No conditional move can issue 1-5 cycles after a load. (Or +something similarly bizarre.) + +Integer branches can issue with two integer arithmetic instructions. Likewise +for integer loads. Four instructions may issue (arith, arith, ld/st, branch) +but only if the branch is last. + +(The V9 architecture manual recommends that the 2nd operand of a multiply +instruction be the smaller one. For UltraSPARC, they got things backwards and +optimize for the wrong operand! Really helpful, given that multiply +is incredibly slow on these CPUs!) + +STATUS + +There is new code in ~/prec/gmp-remote/sparc64. Not tested or completed, but +the pipelines are worked out. Here are the timings: + +* lshift, rshift: The code is well-optimized and runs at 2.0 cycles/limb. + +* add_n, sub_n: add3.s currently runs at 6 cycles/limb. We use a bizarre + scheme of compares and branches (with some nops and fnops to align things) + and carefully stay away from the instructions intended for this application + (i.e., movcs and movcc). + + Using movcc/movcs, even with deep unrolling, seems to get down to 7 + cycles/limb. + + The most promising approach is to split operands in 32-bit pieces using + srlx, then use two addccc, and finally combine the results with sllx+or. + The result could run at 5 cycles/limb, I think. It might be possible to + do without unrolling, or with minimal unrolling. + +* addmul_1/submul_1: Should optimize for when scalar operand < 2^32. +* addmul_1/submul_1: Since mulx is horrendously slow on UltraSPARC I/II, + Karatsuba's method should save up to 16 cycles (i.e. > 20%).
+* mul_1 (and possibly the other multiply functions): Handle carry in the + same tricky way as add_n,sub_n. diff --git a/ghc/rts/gmp/mpn/sparc64/add_n.asm b/ghc/rts/gmp/mpn/sparc64/add_n.asm new file mode 100644 index 0000000..72b3895 --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc64/add_n.asm @@ -0,0 +1,172 @@ +! SPARC v9 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store +! sum in a third limb vector. + +! Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr %o0 +! s1_ptr %o1 +! s2_ptr %o2 +! size %o3 + +include(`../config.m4') + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch +PROLOGUE(mpn_add_n) + +! 12 mem ops >= 12 cycles +! 8 shift insn >= 8 cycles +! 8 addccc, executing alone, +8 cycles +! Unrolling not mandatory...perhaps 2-way is best? +! Put one ldx/stx and one s?lx per issue tuple, fill with pointer arith and loop ctl +!
All in all, it runs at 5 cycles/limb + + save %sp,-160,%sp + + addcc %g0,%g0,%g0 ! clear the carry flag + + add %i3,-4,%i3 ! size - 4 + brlz,pn %i3,L(there) ! fewer than 4 limbs, skip the unrolled code + nop + + ldx [%i1+0],%l0 + ldx [%i2+0],%l4 + ldx [%i1+8],%l1 + ldx [%i2+8],%l5 + ldx [%i1+16],%l2 + ldx [%i2+16],%l6 + ldx [%i1+24],%l3 + ldx [%i2+24],%l7 + add %i1,32,%i1 + add %i2,32,%i2 + + add %i3,-4,%i3 + brlz,pn %i3,L(skip) + nop + b L(loop1) ! jump instead of executing many NOPs + nop + ALIGN(32) +!--------- Start main loop --------- +L(loop1): + addccc %l0,%l4,%g1 +!- + srlx %l0,32,%o0 + ldx [%i1+0],%l0 +!- + srlx %l4,32,%o4 + ldx [%i2+0],%l4 +!- + addccc %o0,%o4,%g0 +!- + addccc %l1,%l5,%g2 +!- + srlx %l1,32,%o1 + ldx [%i1+8],%l1 +!- + srlx %l5,32,%o5 + ldx [%i2+8],%l5 +!- + addccc %o1,%o5,%g0 +!- + addccc %l2,%l6,%g3 +!- + srlx %l2,32,%o2 + ldx [%i1+16],%l2 +!- + srlx %l6,32,%g5 ! asymmetry: %g5 stands in for %o6, the stack pointer + ldx [%i2+16],%l6 +!- + addccc %o2,%g5,%g0 +!- + addccc %l3,%l7,%g4 +!- + srlx %l3,32,%o3 + ldx [%i1+24],%l3 + add %i1,32,%i1 +!- + srlx %l7,32,%o7 + ldx [%i2+24],%l7 + add %i2,32,%i2 +!- + addccc %o3,%o7,%g0 +!- + stx %g1,[%i0+0] +!- + stx %g2,[%i0+8] +!- + stx %g3,[%i0+16] + add %i3,-4,%i3 +!- + stx %g4,[%i0+24] + add %i0,32,%i0 + + brgez,pt %i3,L(loop1) + nop +!--------- End main loop --------- +L(skip): + addccc %l0,%l4,%g1 + srlx %l0,32,%o0 + srlx %l4,32,%o4 + addccc %o0,%o4,%g0 + addccc %l1,%l5,%g2 + srlx %l1,32,%o1 + srlx %l5,32,%o5 + addccc %o1,%o5,%g0 + addccc %l2,%l6,%g3 + srlx %l2,32,%o2 + srlx %l6,32,%g5 !
asymmetry + addccc %o2,%g5,%g0 + addccc %l3,%l7,%g4 + srlx %l3,32,%o3 + srlx %l7,32,%o7 + addccc %o3,%o7,%g0 + stx %g1,[%i0+0] + stx %g2,[%i0+8] + stx %g3,[%i0+16] + stx %g4,[%i0+24] + add %i0,32,%i0 + +L(there): + add %i3,4,%i3 ! limbs still to do + brz,pt %i3,L(end) ! none left + nop + +L(loop2): + ldx [%i1+0],%l0 + add %i1,8,%i1 + ldx [%i2+0],%l4 + add %i2,8,%i2 + srlx %l0,32,%g2 ! high half of the s1 limb + srlx %l4,32,%g3 ! high half of the s2 limb + addccc %l0,%l4,%g1 ! sum limb, including the carry in + addccc %g2,%g3,%g0 ! add the high halves to form the carry out + stx %g1,[%i0+0] + add %i0,8,%i0 + add %i3,-1,%i3 + brgz,pt %i3,L(loop2) + nop + +L(end): addc %g0,%g0,%i0 ! return the final carry + ret + restore +EPILOGUE(mpn_add_n) diff --git a/ghc/rts/gmp/mpn/sparc64/addmul1h.asm b/ghc/rts/gmp/mpn/sparc64/addmul1h.asm new file mode 100644 index 0000000..96cb5f7 --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc64/addmul1h.asm @@ -0,0 +1,203 @@ +dnl SPARC 64-bit addmull/addmulu -- Helper for mpn_addmul_1 and mpn_mul_1. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA.
+ +ifdef(`LOWPART', +`addmull:', +`addmulu:') + save %sp,-256,%sp + + sethi %hi(0xffff0000),%o0 + andn %i3,%o0,%o0 C clear bits 16-31 of the multiplier in i3 + st %o0,[%fp-17] + ld [%fp-17],%f11 + fxtod %f10,%f6 C f6 = low 16 bits of multiplier as a double + + srl %i3,16,%o0 C bits 16-31 of the multiplier in i3 + st %o0,[%fp-17] + ld [%fp-17],%f11 + fxtod %f10,%f8 C f8 = high 16 bits of multiplier as a double + + mov 0,%g3 C cy = 0 + + ld [%i1+4],%f11 + subcc %i2,1,%i2 +dnl be,pn %icc,E(end1) + add %i1,4,%i1 C s1_ptr++ + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-25] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,E(end2) + std %f12,[%fp-17] + + fxtod %f10,%f2 + ld [%i1+4],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-41] + fdtox %f4,%f12 + subcc %i2,1,%i2 +dnl be,pn %icc,E(end3) + std %f12,[%fp-33] + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + ld [%i0+DLO],%g5 + ldx [%fp-25],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-17],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-25] + fdtox %f4,%f12 + add %i0,4,%i0 C res_ptr++ + subcc %i2,1,%i2 + be,pn %icc,E(end4) + std %f12,[%fp-17] + + b,a E(loop) + nop C nop is cheap to nullify + + ALIGN(16) +C BEGIN LOOP +E(loop): + fxtod %f10,%f2 + ld [%i1+4],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0+DHI],%g5 + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-33],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DLO] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + sub %i2,2,%i2 C two 32-bit words are handled per loop pass + add %i0,4,%i0 C res_ptr++ + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0+DLO],%g5 + srlx %g4,32,%g3 + ldx [%fp-25],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-17],%g1 C p0 + fmuld
%f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DHI] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-25] + fdtox %f4,%f12 + std %f12,[%fp-17] + brnz,pt %i2,E(loop) + add %i0,4,%i0 C res_ptr++ +C END LOOP +E(loope): +E(end4): + fxtod %f10,%f2 + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0+DHI],%g5 + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-33],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DLO] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + add %i0,4,%i0 C res_ptr++ + + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0+DLO],%g5 + srlx %g4,32,%g3 + ldx [%fp-25],%g2 C p16 + ldx [%fp-17],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DHI] + b,a E(yyy) + +E(end2): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + ld [%i0+DLO],%g5 + ldx [%fp-25],%g2 C p16 + ldx [%fp-17],%g1 C p0 + sllx %g2,16,%g2 C align p16 +E(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy +ifdef(`LOWPART', +` ld [%i0+DHI],%g5') + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + ldx [%fp-33],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DLO] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + +ifdef(`LOWPART', +` add %g5,%g1,%g1') C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy +ifdef(`LOWPART', +` st %g4,[%i0-4+DHI] + srlx %g4,32,%g4') + + ret + restore %g0,%g4,%o0 C sideeffect: put cy in retreg +ifdef(`LOWPART', +`EPILOGUE(addmull)', +`EPILOGUE(addmulu)') diff --git a/ghc/rts/gmp/mpn/sparc64/addmul_1.asm b/ghc/rts/gmp/mpn/sparc64/addmul_1.asm new file mode 100644 index 0000000..c3f04ce --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc64/addmul_1.asm @@
-0,0 +1,114 @@ +dnl SPARC 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and +dnl add the result to a second limb vector. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr i0 +C s1_ptr i1 +C size i2 +C s2_limb i3 + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch + +PROLOGUE(mpn_addmul_1) + save %sp,-256,%sp + +C We store 0.0 in f10 and keep it invariant across the two +C function calls below. Note that this is not ABI conformant, +C but since the functions are local, that's acceptable.
+ifdef(`PIC', +`L(pc): rd %pc,%o7 + ld [%o7+L(noll)-L(pc)],%f10', +` sethi %hh(L(noll)),%g2 + sethi %lm(L(noll)),%g1 + or %g2,%hm(L(noll)),%g2 + or %g1,%lo(L(noll)),%g1 + sllx %g2,32,%g2 + ld [%g1+%g2],%f10') + + sub %i1,%i0,%g1 + srlx %g1,3,%g1 + cmp %g1,%i2 + bcc,pt %xcc,L(nooverlap) + nop + + sllx %i2,3,%g2 C compute stack allocation byte count + add %g2,15,%o0 + and %o0,-16,%o0 + sub %sp,%o0,%sp + add %sp,2223,%o0 + + mov %i1,%o1 C copy s1_ptr to mpn_copyi's srcp + call mpn_copyi + mov %i2,%o2 C copy n to mpn_copyi's count parameter + + add %sp,2223,%i1 C s1_ptr now points at the stack copy + +L(nooverlap): +C First multiply-add with low 32 bits of s2_limb + mov %i0,%o0 + mov %i1,%o1 + add %i2,%i2,%o2 C pass 2*size as the count + call addmull + srl %i3,0,%o3 C low 32 bits of s2_limb + + mov %o0,%l0 C keep carry-out from addmull + +C Now multiply-add with high 32 bits of s2_limb, unless it is zero. + srlx %i3,32,%o3 C high 32 bits of s2_limb + brz,a,pn %o3,L(small) + mov %o0,%i0 + mov %i1,%o1 + add %i2,%i2,%o2 C pass 2*size as the count + call addmulu + add %i0,4,%o0 + + add %l0,%o0,%i0 C sum of the two carry-outs +L(small): + ret + restore %g0,%g0,%g0 +EPILOGUE(mpn_addmul_1) + +C Put a zero in the text segment to allow us to get the address +C quickly when compiling for PIC + TEXT + ALIGN(4) +L(noll): + .word 0 + +define(`LO',`(+4)') +define(`HI',`(-4)') + +define(`DLO',`(+4)') +define(`DHI',`(-4)') +define(`LOWPART') +define(`E',`L(l.$1)') +include_mpn(`sparc64/addmul1h.asm') + +define(`DLO',`(-4)') +define(`DHI',`(+4)') +undefine(`LOWPART') +define(`E',`L(u.$1)') +include_mpn(`sparc64/addmul1h.asm') diff --git a/ghc/rts/gmp/mpn/sparc64/copyi.asm b/ghc/rts/gmp/mpn/sparc64/copyi.asm new file mode 100644 index 0000000..d9957e3 --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc64/copyi.asm @@ -0,0 +1,79 @@ +! SPARC v9 __gmpn_copy -- Copy a limb vector. + +! Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +!
the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! rptr %o0 +! sptr %o1 +! n %o2 + +include(`../config.m4') + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch +PROLOGUE(mpn_copyi) + add %o2,-8,%o2 ! bias n by -8 for the 8-limb unrolled loop + brlz,pn %o2,L(skip) ! fewer than 8 limbs, use the single-limb loop + nop + b,a L(loop1) + nop + + ALIGN(16) +L(loop1): + ldx [%o1+0],%g1 + ldx [%o1+8],%g2 + ldx [%o1+16],%g3 + ldx [%o1+24],%g4 + ldx [%o1+32],%g5 + ldx [%o1+40],%o3 + ldx [%o1+48],%o4 + ldx [%o1+56],%o5 + add %o1,64,%o1 ! sptr advances by 8 limbs + stx %g1,[%o0+0] + stx %g2,[%o0+8] + stx %g3,[%o0+16] + stx %g4,[%o0+24] + stx %g5,[%o0+32] + stx %o3,[%o0+40] + stx %o4,[%o0+48] + stx %o5,[%o0+56] + add %o2,-8,%o2 + brgez,pt %o2,L(loop1) ! loop while at least 8 limbs remain + add %o0,64,%o0 ! rptr advances by 8 limbs, in the delay slot + +L(skip): + add %o2,8,%o2 ! undo the bias, leaving the limbs still to copy + brz,pt %o2,L(end) ! nothing left + nop + +L(loop2): + ldx [%o1],%g1 + add %o1,8,%o1 + add %o2,-1,%o2 + stx %g1,[%o0] + add %o0,8,%o0 + brgz,pt %o2,L(loop2) ! one limb per pass until the count reaches zero + nop + +L(end): retl + nop +EPILOGUE(mpn_copyi) diff --git a/ghc/rts/gmp/mpn/sparc64/gmp-mparam.h b/ghc/rts/gmp/mpn/sparc64/gmp-mparam.h index a3c6697..74f6166 100644 --- a/ghc/rts/gmp/mpn/sparc64/gmp-mparam.h +++ b/ghc/rts/gmp/mpn/sparc64/gmp-mparam.h @@ -1,20 +1,20 @@ -/* gmp-mparam.h -- Compiler/machine parameter header file. +/* Sparc64 gmp-mparam.h -- Compiler/machine parameter header file. -Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -25,3 +25,64 @@ MA 02111-1307, USA. */ #define BITS_PER_INT 32 #define BITS_PER_SHORTINT 16 #define BITS_PER_CHAR 8 + +/* Tell the toom3 multiply implementation to call low-level mpn + functions instead of open-coding operations in C. */ +#define USE_MORE_MPN 1 + + +/* Run on sun workshop cc. */ +/* Generated by tuneup.c, 2000-07-30. 
*/ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 12 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 95 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 33 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 125 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 27 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 107 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 12 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 4 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 199 +#endif + +#ifndef FFT_MUL_TABLE +#define FFT_MUL_TABLE { 304, 608, 1344, 2304, 7168, 20480, 49152, 0 } +#endif +#ifndef FFT_MODF_MUL_THRESHOLD +#define FFT_MODF_MUL_THRESHOLD 320 +#endif +#ifndef FFT_MUL_THRESHOLD +#define FFT_MUL_THRESHOLD 1664 +#endif + +#ifndef FFT_SQR_TABLE +#define FFT_SQR_TABLE { 304, 608, 1344, 2816, 7168, 20480, 49152, 0 } +#endif +#ifndef FFT_MODF_SQR_THRESHOLD +#define FFT_MODF_SQR_THRESHOLD 320 +#endif +#ifndef FFT_SQR_THRESHOLD +#define FFT_SQR_THRESHOLD 1664 +#endif diff --git a/ghc/rts/gmp/mpn/sparc64/lshift.asm b/ghc/rts/gmp/mpn/sparc64/lshift.asm new file mode 100644 index 0000000..2d2edc5 --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc64/lshift.asm @@ -0,0 +1,97 @@ +! SPARC v9 __gmpn_lshift -- + +! Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! 
You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr %o0 +! src_ptr %o1 +! size %o2 +! cnt %o3 + +include(`../config.m4') + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch +PROLOGUE(mpn_lshift) + sllx %o2,3,%g1 + add %o1,%g1,%o1 ! make %o1 point at end of src + ldx [%o1-8],%g2 ! load first limb + sub %g0,%o3,%o5 ! negate shift count + add %o0,%g1,%o0 ! make %o0 point at end of res + add %o2,-1,%o2 + and %o2,4-1,%g4 ! number of limbs in first loop + srlx %g2,%o5,%g1 ! compute function result + brz,pn %g4,L(0) ! if multiple of 4 limbs, skip first loop + mov %g1,%g5 + + sub %o2,%g4,%o2 ! adjust count for main loop + +L(loop0): + ldx [%o1-16],%g3 + add %o0,-8,%o0 + add %o1,-8,%o1 + add %g4,-1,%g4 + sllx %g2,%o3,%o4 + srlx %g3,%o5,%g1 + mov %g3,%g2 + or %o4,%g1,%o4 + brnz,pt %g4,L(loop0) + stx %o4,[%o0+0] + +L(0): brz,pn %o2,L(end) + nop + +L(loop1): + ldx [%o1-16],%g3 + add %o0,-32,%o0 + add %o2,-4,%o2 + sllx %g2,%o3,%o4 + srlx %g3,%o5,%g1 + + ldx [%o1-24],%g2 + sllx %g3,%o3,%g4 + or %o4,%g1,%o4 + stx %o4,[%o0+24] + srlx %g2,%o5,%g1 + + ldx [%o1-32],%g3 + sllx %g2,%o3,%o4 + or %g4,%g1,%g4 + stx %g4,[%o0+16] + srlx %g3,%o5,%g1 + + ldx [%o1-40],%g2 + sllx %g3,%o3,%g4 + or %o4,%g1,%o4 + stx %o4,[%o0+8] + srlx %g2,%o5,%g1 + + add %o1,-32,%o1 + or %g4,%g1,%g4 + brnz,pt %o2,L(loop1) + stx %g4,[%o0+0] + +L(end): sllx %g2,%o3,%g2 + stx %g2,[%o0-8] + retl + mov %g5,%o0 +EPILOGUE(mpn_lshift) diff --git a/ghc/rts/gmp/mpn/sparc64/mul_1.asm b/ghc/rts/gmp/mpn/sparc64/mul_1.asm new file mode 100644 index 0000000..f2f2821 --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc64/mul_1.asm @@ -0,0 +1,113 @@ +dnl SPARC 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and +dnl store the result to a second limb vector. 
+ +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr i0 +C s1_ptr i1 +C size i2 +C s2_limb i3 + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch + +PROLOGUE(mpn_mul_1) + save %sp,-256,%sp + +C We store 0.0 in f10 and keep it invariant across the two +C function calls below. Note that this is not ABI conformant, +C but since the functions are local, that's acceptable.
+ifdef(`PIC', +`L(pc): rd %pc,%o7 + ld [%o7+L(noll)-L(pc)],%f10', +` sethi %hh(L(noll)),%g2 + sethi %lm(L(noll)),%g1 + or %g2,%hm(L(noll)),%g2 + or %g1,%lo(L(noll)),%g1 + sllx %g2,32,%g2 + ld [%g1+%g2],%f10') + + sub %i1,%i0,%g1 + srlx %g1,3,%g1 + cmp %g1,%i2 + bcc,pt %xcc,L(nooverlap) + nop + + sllx %i2,3,%g2 C compute stack allocation byte count + add %g2,15,%o0 + and %o0,-16,%o0 + sub %sp,%o0,%sp + add %sp,2223,%o0 + + mov %i1,%o1 C copy s1_ptr to mpn_copyi's srcp + call mpn_copyi + mov %i2,%o2 C copy n to mpn_copyi's count parameter + + add %sp,2223,%i1 + +L(nooverlap): +C First multiply-add with low 32 bits of s2_limb + mov %i0,%o0 + mov %i1,%o1 + add %i2,%i2,%o2 + call mull + srl %i3,0,%o3 + + mov %o0,%l0 C keep carry-out from accmull + +C Now multiply-add with high 32 bits of s2_limb, unless it is zero. + srlx %i3,32,%o3 + brz,a,pn %o3,L(small) + mov %o0,%i0 + mov %i1,%o1 + add %i2,%i2,%o2 + call addmulu + add %i0,4,%o0 + + add %l0,%o0,%i0 +L(small): + ret + restore %g0,%g0,%g0 +EPILOGUE(mpn_mul_1) + +C Put a zero in the text segment to allow us to get the address +C quickly when compiling for PIC + TEXT + ALIGN(4) +L(noll): + .word 0 + +define(`LO',`(+4)') +define(`HI',`(-4)') + +define(`DLO',`(+4)') +define(`DHI',`(-4)') +define(`E',`L($1)') +include_mpn(`sparc64/mul_1h.asm') + +define(`DLO',`(-4)') +define(`DHI',`(+4)') +undefine(`LOWPART') +define(`E',`L(u.$1)') +include_mpn(`sparc64/addmul1h.asm') diff --git a/ghc/rts/gmp/mpn/sparc64/mul_1h.asm b/ghc/rts/gmp/mpn/sparc64/mul_1h.asm new file mode 100644 index 0000000..5078c01 --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc64/mul_1h.asm @@ -0,0 +1,183 @@ +dnl SPARC 64-bit mull -- Helper for mpn_mul_1. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library.
+ +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +mull: + save %sp,-256,%sp + + sethi %hi(0xffff0000),%o0 + andn %i3,%o0,%o0 + st %o0,[%fp-17] + ld [%fp-17],%f11 + fxtod %f10,%f6 + + srl %i3,16,%o0 + st %o0,[%fp-17] + ld [%fp-17],%f11 + fxtod %f10,%f8 + + mov 0,%g3 C cy = 0 + + ld [%i1+4],%f11 + subcc %i2,1,%i2 +dnl be,pn %icc,E(end1) + add %i1,4,%i1 C s1_ptr++ + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-25] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,E(end2) + std %f12,[%fp-17] + + fxtod %f10,%f2 + ld [%i1+4],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-41] + fdtox %f4,%f12 + subcc %i2,1,%i2 +dnl be,pn %icc,E(end3) + std %f12,[%fp-33] + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + ldx [%fp-25],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-17],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-25] + fdtox %f4,%f12 + add %i0,4,%i0 C res_ptr++ + subcc %i2,1,%i2 + be,pn %icc,E(end4) + std %f12,[%fp-17] + + b,a E(loop) + nop C nop is cheap to nullify + + ALIGN(16) +C 
BEGIN LOOP +E(loop): + fxtod %f10,%f2 + ld [%i1+4],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-33],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DLO] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + sub %i2,2,%i2 + add %i0,4,%i0 C res_ptr++ + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-25],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-17],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DHI] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-25] + fdtox %f4,%f12 + std %f12,[%fp-17] + brnz,pt %i2,E(loop) + add %i0,4,%i0 C res_ptr++ +C END LOOP +E(loope): +E(end4): + fxtod %f10,%f2 + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-33],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DLO] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-25],%g2 C p16 + ldx [%fp-17],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DHI] + b,a E(yyy) + +E(end2): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + ldx [%fp-25],%g2 C p16 + ldx [%fp-17],%g1 C p0 + sllx %g2,16,%g2 C align p16 +E(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + ldx [%fp-33],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DLO] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + st %g4,[%i0-4+DHI] + srlx %g4,32,%g4 + + ret + restore 
%g0,%g4,%o0 C sideeffect: put cy in retreg +EPILOGUE(mull) diff --git a/ghc/rts/gmp/mpn/sparc64/rshift.asm b/ghc/rts/gmp/mpn/sparc64/rshift.asm new file mode 100644 index 0000000..baf7920 --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc64/rshift.asm @@ -0,0 +1,94 @@ +! SPARC v9 __gmpn_rshift -- + +! Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr %o0 +! src_ptr %o1 +! size %o2 +! cnt %o3 + +include(`../config.m4') + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch +PROLOGUE(mpn_rshift) + ldx [%o1],%g2 ! load first limb + sub %g0,%o3,%o5 ! negate shift count + add %o2,-1,%o2 + and %o2,4-1,%g4 ! number of limbs in first loop + sllx %g2,%o5,%g1 ! compute function result + brz,pn %g4,L(0) ! if multiple of 4 limbs, skip first loop + mov %g1,%g5 + + sub %o2,%g4,%o2 ! 
adjust count for main loop + +L(loop0): + ldx [%o1+8],%g3 + add %o0,8,%o0 + add %o1,8,%o1 + add %g4,-1,%g4 + srlx %g2,%o3,%o4 + sllx %g3,%o5,%g1 + mov %g3,%g2 + or %o4,%g1,%o4 + brnz,pt %g4,L(loop0) + stx %o4,[%o0-8] + +L(0): brz,pn %o2,L(end) + nop + +L(loop1): + ldx [%o1+8],%g3 + add %o0,32,%o0 + add %o2,-4,%o2 + srlx %g2,%o3,%o4 + sllx %g3,%o5,%g1 + + ldx [%o1+16],%g2 + srlx %g3,%o3,%g4 + or %o4,%g1,%o4 + stx %o4,[%o0-32] + sllx %g2,%o5,%g1 + + ldx [%o1+24],%g3 + srlx %g2,%o3,%o4 + or %g4,%g1,%g4 + stx %g4,[%o0-24] + sllx %g3,%o5,%g1 + + ldx [%o1+32],%g2 + srlx %g3,%o3,%g4 + or %o4,%g1,%o4 + stx %o4,[%o0-16] + sllx %g2,%o5,%g1 + + add %o1,32,%o1 + or %g4,%g1,%g4 + brnz %o2,L(loop1) + stx %g4,[%o0-8] + +L(end): srlx %g2,%o3,%g2 + stx %g2,[%o0-0] + retl + mov %g5,%o0 +EPILOGUE(mpn_rshift) diff --git a/ghc/rts/gmp/mpn/sparc64/sub_n.asm b/ghc/rts/gmp/mpn/sparc64/sub_n.asm new file mode 100644 index 0000000..6154713 --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc64/sub_n.asm @@ -0,0 +1,172 @@ +! SPARC v9 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and +! store difference in a third limb vector. + +! Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! 
MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr %o0 +! s1_ptr %o1 +! s2_ptr %o2 +! size %o3 + +include(`../config.m4') + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch +PROLOGUE(mpn_sub_n) + +! 12 mem ops >= 12 cycles +! 8 shift insn >= 8 cycles +! 8 addccc, executing alone, +8 cycles +! Unrolling not mandatory...perhaps 2-way is best? +! Put one ldx/stx and one s?lx per issue tuple, fill with pointer arith and loop ctl +! All in all, it runs at 5 cycles/limb + + save %sp,-160,%sp + + addcc %g0,%g0,%g0 + + add %i3,-4,%i3 + brlz,pn %i3,L(there) + nop + + ldx [%i1+0],%l0 + ldx [%i2+0],%l4 + ldx [%i1+8],%l1 + ldx [%i2+8],%l5 + ldx [%i1+16],%l2 + ldx [%i2+16],%l6 + ldx [%i1+24],%l3 + ldx [%i2+24],%l7 + add %i1,32,%i1 + add %i2,32,%i2 + + add %i3,-4,%i3 + brlz,pn %i3,L(skip) + nop + b L(loop1) ! jump instead of executing many NOPs + nop + ALIGN(32) +!--------- Start main loop --------- +L(loop1): + subccc %l0,%l4,%g1 +!- + srlx %l0,32,%o0 + ldx [%i1+0],%l0 +!- + srlx %l4,32,%o4 + ldx [%i2+0],%l4 +!- + subccc %o0,%o4,%g0 +!- + subccc %l1,%l5,%g2 +!- + srlx %l1,32,%o1 + ldx [%i1+8],%l1 +!- + srlx %l5,32,%o5 + ldx [%i2+8],%l5 +!- + subccc %o1,%o5,%g0 +!- + subccc %l2,%l6,%g3 +!- + srlx %l2,32,%o2 + ldx [%i1+16],%l2 +!- + srlx %l6,32,%g5 ! asymmetry + ldx [%i2+16],%l6 +!- + subccc %o2,%g5,%g0 +!- + subccc %l3,%l7,%g4 +!- + srlx %l3,32,%o3 + ldx [%i1+24],%l3 + add %i1,32,%i1 +!- + srlx %l7,32,%o7 + ldx [%i2+24],%l7 + add %i2,32,%i2 +!- + subccc %o3,%o7,%g0 +!- + stx %g1,[%i0+0] +!- + stx %g2,[%i0+8] +!- + stx %g3,[%i0+16] + add %i3,-4,%i3 +!- + stx %g4,[%i0+24] + add %i0,32,%i0 + + brgez,pt %i3,L(loop1) + nop +!--------- End main loop --------- +L(skip): + subccc %l0,%l4,%g1 + srlx %l0,32,%o0 + srlx %l4,32,%o4 + subccc %o0,%o4,%g0 + subccc %l1,%l5,%g2 + srlx %l1,32,%o1 + srlx %l5,32,%o5 + subccc %o1,%o5,%g0 + subccc %l2,%l6,%g3 + srlx %l2,32,%o2 + srlx %l6,32,%g5 ! 
asymmetry + subccc %o2,%g5,%g0 + subccc %l3,%l7,%g4 + srlx %l3,32,%o3 + srlx %l7,32,%o7 + subccc %o3,%o7,%g0 + stx %g1,[%i0+0] + stx %g2,[%i0+8] + stx %g3,[%i0+16] + stx %g4,[%i0+24] + add %i0,32,%i0 + +L(there): + add %i3,4,%i3 + brz,pt %i3,L(end) + nop + +L(loop2): + ldx [%i1+0],%l0 + add %i1,8,%i1 + ldx [%i2+0],%l4 + add %i2,8,%i2 + srlx %l0,32,%g2 + srlx %l4,32,%g3 + subccc %l0,%l4,%g1 + subccc %g2,%g3,%g0 + stx %g1,[%i0+0] + add %i0,8,%i0 + add %i3,-1,%i3 + brgz,pt %i3,L(loop2) + nop + +L(end): addc %g0,%g0,%i0 + ret + restore +EPILOGUE(mpn_sub_n) diff --git a/ghc/rts/gmp/mpn/sparc64/submul1h.asm b/ghc/rts/gmp/mpn/sparc64/submul1h.asm new file mode 100644 index 0000000..7f51ba5 --- /dev/null +++ b/ghc/rts/gmp/mpn/sparc64/submul1h.asm @@ -0,0 +1,204 @@ +dnl SPARC 64-bit submull/submulu -- Helper for mpn_submul_1 and mpn_mul_1. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
+ +ifdef(`LOWPART', +`submull:', +`submulu:') + save %sp,-256,%sp + + sethi %hi(0xffff0000),%o0 + andn %i3,%o0,%o0 + st %o0,[%fp-17] + ld [%fp-17],%f11 + fxtod %f10,%f6 + + srl %i3,16,%o0 + st %o0,[%fp-17] + ld [%fp-17],%f11 + fxtod %f10,%f8 + + mov 0,%g3 C cy = 0 + + ld [%i1+4],%f11 + subcc %i2,1,%i2 +dnl be,pn %icc,E(end1) + add %i1,4,%i1 C s1_ptr++ + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-25] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,E(end2) + std %f12,[%fp-17] + + fxtod %f10,%f2 + ld [%i1+4],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-41] + fdtox %f4,%f12 + subcc %i2,1,%i2 +dnl be,pn %icc,E(end3) + std %f12,[%fp-33] + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + ld [%i0+DLO],%g5 + ldx [%fp-25],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-17],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-25] + fdtox %f4,%f12 + add %i0,4,%i0 C res_ptr++ + subcc %i2,1,%i2 + be,pn %icc,E(end4) + std %f12,[%fp-17] + + b,a E(loop) + nop C nop is cheap to nullify + + ALIGN(16) +C BEGIN LOOP +E(loop): + fxtod %f10,%f2 + ld [%i1+4],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0+DHI],%g5 + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-33],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4+DLO] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + sub %i2,2,%i2 + add %i0,4,%i0 C res_ptr++ + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0+DLO],%g5 + srlx %g4,32,%g3 + ldx [%fp-25],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-17],%g1 C p0 + fmuld 
%f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4+DHI] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-25] + fdtox %f4,%f12 + std %f12,[%fp-17] + brnz,pt %i2,E(loop) + add %i0,4,%i0 C res_ptr++ +C END LOOP +E(loope): +E(end4): + fxtod %f10,%f2 + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0+DHI],%g5 + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-33],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4+DLO] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0+DLO],%g5 + srlx %g4,32,%g3 + ldx [%fp-25],%g2 C p16 + ldx [%fp-17],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4+DHI] + b,a E(yyy) + +E(end2): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + ld [%i0+DLO],%g5 + ldx [%fp-25],%g2 C p16 + ldx [%fp-17],%g1 C p0 + sllx %g2,16,%g2 C align p16 +E(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) +ifdef(`LOWPART', +` ld [%i0+DHI],%g5') + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + ldx [%fp-33],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4+DLO] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy +ifdef(`LOWPART', +` subxcc %g5,%g4,%l2') C add *res_ptr to p0 (ADD2) +ifdef(`LOWPART', +` st %l2,[%i0-4+DHI] + srlx %g4,32,%g4') + + addx %g4,0,%g4 + ret + restore %g0,%g4,%o0 C sideeffect: put cy in retreg +ifdef(`LOWPART', +`EPILOGUE(submull)', +`EPILOGUE(submulu)') diff --git a/ghc/rts/gmp/mpn/sparc64/submul_1.asm b/ghc/rts/gmp/mpn/sparc64/submul_1.asm new file mode 100644 index 0000000..7c6af0a --- /dev/null +++ 
b/ghc/rts/gmp/mpn/sparc64/submul_1.asm @@ -0,0 +1,114 @@ +dnl SPARC 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and +dnl subtract the result from a second limb vector. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr i0 +C s1_ptr i1 +C size i2 +C s2_limb i3 + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch + +PROLOGUE(mpn_submul_1) + save %sp,-256,%sp + +C We store 0.0 in f10 and keep it invariant across the two +C function calls below. Note that this is not ABI conformant, +C but since the functions are local, that's acceptable.
+ifdef(`PIC', +`L(pc): rd %pc,%o7 + ld [%o7+L(noll)-L(pc)],%f10', +` sethi %hh(L(noll)),%g2 + sethi %lm(L(noll)),%g1 + or %g2,%hm(L(noll)),%g2 + or %g1,%lo(L(noll)),%g1 + sllx %g2,32,%g2 + ld [%g1+%g2],%f10') + + sub %i1,%i0,%g1 + srlx %g1,3,%g1 + cmp %g1,%i2 + bcc,pt %xcc,L(nooverlap) + nop + + sllx %i2,3,%g2 C compute stack allocation byte count + add %g2,15,%o0 + and %o0,-16,%o0 + sub %sp,%o0,%sp + add %sp,2223,%o0 + + mov %i1,%o1 C copy s1_ptr to mpn_copyi's srcp + call mpn_copyi + mov %i2,%o2 C copy n to mpn_copyi's count parameter + + add %sp,2223,%i1 + +L(nooverlap): +C First multiply-add with low 32 bits of s2_limb + mov %i0,%o0 + mov %i1,%o1 + add %i2,%i2,%o2 + call submull + srl %i3,0,%o3 + + mov %o0,%l0 C keep carry-out from accmull + +C Now multiply-add with high 32 bits of s2_limb, unless it is zero. + srlx %i3,32,%o3 + brz,a,pn %o3,L(small) + mov %o0,%i0 + mov %i1,%o1 + add %i2,%i2,%o2 + call submulu + add %i0,4,%o0 + + add %l0,%o0,%i0 +L(small): + ret + restore %g0,%g0,%g0 +EPILOGUE(mpn_submul_1) + +C Put a zero in the text segment to allow us to get the address +C quickly when compiling for PIC + TEXT + ALIGN(4) +L(noll): + .word 0 + +define(`LO',`(+4)') +define(`HI',`(-4)') + +define(`DLO',`(+4)') +define(`DHI',`(-4)') +define(`LOWPART') +define(`E',`L(l.$1)') +include_mpn(`sparc64/submul1h.asm') + +define(`DLO',`(-4)') +define(`DHI',`(+4)') +undefine(`LOWPART') +define(`E',`L(u.$1)') +include_mpn(`sparc64/submul1h.asm') diff --git a/ghc/rts/gmp/mpn/thumb/add_n.s b/ghc/rts/gmp/mpn/thumb/add_n.s new file mode 100644 index 0000000..c1eeb6c --- /dev/null +++ b/ghc/rts/gmp/mpn/thumb/add_n.s @@ -0,0 +1,50 @@ +@ ARM/Thumb __gmpn_add -- Add two limb vectors of the same length > 0 and store +@ sum in a third limb vector. + +@ Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +@ This file is part of the GNU MP Library.
+ +@ The GNU MP Library is free software; you can redistribute it and/or modify +@ it under the terms of the GNU Lesser General Public License as published by +@ the Free Software Foundation; either version 2.1 of the License, or (at your +@ option) any later version. + +@ The GNU MP Library is distributed in the hope that it will be useful, but +@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +@ License for more details. + +@ You should have received a copy of the GNU Lesser General Public License +@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to +@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +@ MA 02111-1307, USA. + + +@ INPUT PARAMETERS +@ RES_ptr r0 +@ S1_ptr r1 +@ S2_ptr r2 +@ SIZE r3 + +@ NOT TESTED CODE + + .text + .thumb + .align 0 + .global ___gmpn_add_n +___gmpn_add_n: + push {r4, r5, r6, lr} + mov r6, #1 @ init carry save register + +Loop: sub r6, #1 @ restore carry (set iff r6 was 0) + ldmia r1!, {r4} @ load next limb from S1 + ldmia r2!, {r5} @ load next limb from S2 + adc r4, r5 + stmia r0!, {r4} @ store result limb to RES + sbc r6, r6 @ save negated carry + sub r3, #1 + bge Loop @ loop back while remaining count >= 4 + + mov r0, r6 + pop {r4, r5, r6, pc} diff --git a/ghc/rts/gmp/mpn/thumb/sub_n.s b/ghc/rts/gmp/mpn/thumb/sub_n.s new file mode 100644 index 0000000..53c2923 --- /dev/null +++ b/ghc/rts/gmp/mpn/thumb/sub_n.s @@ -0,0 +1,50 @@ +@ ARM/Thumb __gmpn_sub -- Subtract two limb vectors of the same length > 0 and +@ store difference in a third limb vector. + +@ Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +@ This file is part of the GNU MP Library. 
+ +@ The GNU MP Library is free software; you can redistribute it and/or modify +@ it under the terms of the GNU Lesser General Public License as published by +@ the Free Software Foundation; either version 2.1 of the License, or (at your +@ option) any later version. + +@ The GNU MP Library is distributed in the hope that it will be useful, but +@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +@ License for more details. + +@ You should have received a copy of the GNU Lesser General Public License +@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to +@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +@ MA 02111-1307, USA. + + +@ INPUT PARAMETERS +@ RES_ptr r0 +@ S1_ptr r1 +@ S2_ptr r2 +@ SIZE r3 + +@ NOT TESTED CODE + + .text + .thumb + .align 0 + .global ___gmpn_sub_n +___gmpn_sub_n: + push {r4, r5, r6, lr} + mov r6, #1 @ init carry save register + +Loop: sub r6, #1 @ restore carry (set iff r6 was 0) + ldmia r1!, {r4} @ load next limb from S1 + ldmia r2!, {r5} @ load next limb from S2 + sbc r4, r5 + stmia r0!, {r4} @ store result limb to RES + sbc r6, r6 @ save negated carry + sub r3, #1 + bge Loop @ loop back while remaining count >= 4 + + mov r0, r6 + pop {r4, r5, r6, pc} diff --git a/ghc/rts/gmp/mpn/underscore.h b/ghc/rts/gmp/mpn/underscore.h new file mode 100644 index 0000000..240dae0 --- /dev/null +++ b/ghc/rts/gmp/mpn/underscore.h @@ -0,0 +1,26 @@ +/* +Copyright (C) 1999 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + +#if __STDC__ +#define C_SYMBOL_NAME(name) _##name +#else +#define C_SYMBOL_NAME(name) _/**/name +#endif diff --git a/ghc/rts/gmp/mpn/vax/add_n.s b/ghc/rts/gmp/mpn/vax/add_n.s index d4764e2..cf4060f 100644 --- a/ghc/rts/gmp/mpn/vax/add_n.s +++ b/ghc/rts/gmp/mpn/vax/add_n.s @@ -1,21 +1,21 @@ -# VAX __mpn_add_n -- Add two limb vectors of the same length > 0 and store +# VAX __gmpn_add_n -- Add two limb vectors of the same length > 0 and store # sum in a third limb vector. -# Copyright (C) 1992, 1994 Free Software Foundation, Inc. +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. # The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Library General Public License as published by -# the Free Software Foundation; either version 2 of the License, or (at your +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. 
-# You should have received a copy of the GNU Library General Public License +# You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. @@ -29,20 +29,33 @@ .text .align 1 -.globl ___mpn_add_n -___mpn_add_n: +.globl ___gmpn_add_n +___gmpn_add_n: .word 0x0 movl 16(ap),r0 movl 12(ap),r1 movl 8(ap),r2 movl 4(ap),r3 - subl2 r4,r4 - -Loop: + mnegl r0,r5 + addl2 $3,r0 + ashl $-2,r0,r0 # unroll loop count + bicl2 $-4,r5 # mask out low 2 bits + movaq (r5)[r5],r5 # 9x + jmp Loop(r5) + +Loop: movl (r2)+,r4 + adwc (r1)+,r4 + movl r4,(r3)+ + movl (r2)+,r4 + adwc (r1)+,r4 + movl r4,(r3)+ + movl (r2)+,r4 + adwc (r1)+,r4 + movl r4,(r3)+ movl (r2)+,r4 adwc (r1)+,r4 movl r4,(r3)+ - jsobgtr r0,Loop + sobgtr r0,Loop adwc r0,r0 ret diff --git a/ghc/rts/gmp/mpn/vax/addmul_1.s b/ghc/rts/gmp/mpn/vax/addmul_1.s index 746d95b..379061d 100644 --- a/ghc/rts/gmp/mpn/vax/addmul_1.s +++ b/ghc/rts/gmp/mpn/vax/addmul_1.s @@ -1,21 +1,21 @@ -# VAX __mpn_addmul_1 -- Multiply a limb vector with a limb and add +# VAX __gmpn_addmul_1 -- Multiply a limb vector with a limb and add # the result to a second limb vector. -# Copyright (C) 1992, 1994 Free Software Foundation, Inc. +# Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. # The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Library General Public License as published by -# the Free Software Foundation; either version 2 of the License, or (at your +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. 
# The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. -# You should have received a copy of the GNU Library General Public License +# You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. @@ -29,8 +29,8 @@ .text .align 1 -.globl ___mpn_addmul_1 -___mpn_addmul_1: +.globl ___gmpn_addmul_1 +___gmpn_addmul_1: .word 0xfc0 movl 12(ap),r4 movl 8(ap),r8 @@ -60,7 +60,7 @@ L1p1: emul r1,r6,$0,r10 addl2 r10,(r9)+ adwc $0,r11 - jsobgtr r7,Loop1 + sobgtr r7,Loop1 movl r11,r0 ret @@ -77,7 +77,7 @@ L1n1: emul r1,r6,$0,r10 addl2 r10,(r9)+ adwc $0,r11 - jsobgtr r7,Loop1 + sobgtr r7,Loop1 movl r11,r0 ret @@ -104,7 +104,7 @@ L2p1: emul r1,r6,$0,r10 addl2 r10,(r9)+ adwc $0,r11 - jsobgtr r7,Loop2 + sobgtr r7,Loop2 movl r11,r0 ret @@ -121,6 +121,6 @@ L2n1: emul r1,r6,$0,r10 addl2 r10,(r9)+ adwc r1,r11 - jsobgtr r7,Loop2 + sobgtr r7,Loop2 movl r11,r0 ret diff --git a/ghc/rts/gmp/mpn/vax/lshift.s b/ghc/rts/gmp/mpn/vax/lshift.s new file mode 100644 index 0000000..fd311a9 --- /dev/null +++ b/ghc/rts/gmp/mpn/vax/lshift.s @@ -0,0 +1,58 @@ +# VAX __gmpn_lshift -- left shift. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. 
+ +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# rptr (sp + 4) +# sptr (sp + 8) +# size (sp + 12) +# cnt (sp + 16) +# r0=retval r1=size r2,r3=itmp r4,r5=otmp call-used registers +# r6=sptr r7=rptr r8=cnt r9 r10 r11 call-saved registers + +.text + .align 1 +.globl ___gmpn_lshift +___gmpn_lshift: + .word 0x1c0 + movl 4(ap),r7 + movl 8(ap),r6 + movl 12(ap),r1 + movl 16(ap),r8 + + moval (r6)[r1],r6 + moval (r7)[r1],r7 + clrl r3 + movl -(r6),r2 + ashq r8,r2,r4 + movl r5,r0 + movl r2,r3 + decl r1 + jeql Lend + +Loop: movl -(r6),r2 + ashq r8,r2,r4 + movl r5,-(r7) + movl r2,r3 + jsobgtr r1,Loop + +Lend: movl r4,-4(r7) + ret diff --git a/ghc/rts/gmp/mpn/vax/mul_1.s b/ghc/rts/gmp/mpn/vax/mul_1.s index e2ff5a1..708e8ca 100644 --- a/ghc/rts/gmp/mpn/vax/mul_1.s +++ b/ghc/rts/gmp/mpn/vax/mul_1.s @@ -1,21 +1,21 @@ -# VAX __mpn_mul_1 -- Multiply a limb vector with a limb and store +# VAX __gmpn_mul_1 -- Multiply a limb vector with a limb and store # the result in a second limb vector. -# Copyright (C) 1992, 1994 Free Software Foundation, Inc. +# Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. 
# The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Library General Public License as published by -# the Free Software Foundation; either version 2 of the License, or (at your +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. -# You should have received a copy of the GNU Library General Public License +# You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. @@ -29,8 +29,8 @@ .text .align 1 -.globl ___mpn_mul_1 -___mpn_mul_1: +.globl ___gmpn_mul_1 +___gmpn_mul_1: .word 0xfc0 movl 12(ap),r4 movl 8(ap),r8 @@ -61,7 +61,7 @@ L1p1: emul r1,r6,$0,r10 adwc $0,r11 movl r10,(r9)+ - jsobgtr r7,Loop1 + sobgtr r7,Loop1 movl r11,r0 ret @@ -76,7 +76,7 @@ L1n1: emul r1,r6,$0,r10 adwc r6,r11 movl r10,(r9)+ - jsobgtr r7,Loop1 + sobgtr r7,Loop1 movl r11,r0 ret @@ -101,7 +101,7 @@ L2p1: emul r1,r6,$0,r10 adwc r1,r11 movl r10,(r9)+ - jsobgtr r7,Loop2 + sobgtr r7,Loop2 movl r11,r0 ret @@ -118,6 +118,6 @@ L2n1: emul r1,r6,$0,r10 adwc r6,r11 movl r10,(r9)+ - jsobgtr r7,Loop2 + sobgtr r7,Loop2 movl r11,r0 ret diff --git a/ghc/rts/gmp/mpn/vax/rshift.s b/ghc/rts/gmp/mpn/vax/rshift.s new file mode 100644 index 0000000..5158132 --- /dev/null +++ b/ghc/rts/gmp/mpn/vax/rshift.s @@ -0,0 +1,56 @@ +# VAX __gmpn_rshift -- right shift. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. 
+ +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# rptr (sp + 4) +# sptr (sp + 8) +# size (sp + 12) +# cnt (sp + 16) +# r0=retval r1=size r2,r3=itmp r4,r5=otmp call-used registers +# r6=sptr r7=rptr r8=cnt r9 r10 r11 call-saved registers + +.text + .align 1 +.globl ___gmpn_rshift +___gmpn_rshift: + .word 0x1c0 + movl 4(ap),r7 + movl 8(ap),r6 + movl 12(ap),r1 + movl 16(ap),r8 + + movl (r6)+,r2 + subl3 r8,$32,r8 + ashl r8,r2,r0 + decl r1 + jeql Lend + +Loop: movl (r6)+,r3 + ashq r8,r2,r4 + movl r5,(r7)+ + movl r3,r2 + jsobgtr r1,Loop + +Lend: clrl r3 + ashq r8,r2,r4 + movl r5,(r7) + ret diff --git a/ghc/rts/gmp/mpn/vax/sub_n.s b/ghc/rts/gmp/mpn/vax/sub_n.s index a891c44..eff4b1c 100644 --- a/ghc/rts/gmp/mpn/vax/sub_n.s +++ b/ghc/rts/gmp/mpn/vax/sub_n.s @@ -1,21 +1,21 @@ -# VAX __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and store +# VAX __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and store # difference in a third limb vector. -# Copyright (C) 1992, 1994 Free Software Foundation, Inc. +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. 
# The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Library General Public License as published by -# the Free Software Foundation; either version 2 of the License, or (at your +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. -# You should have received a copy of the GNU Library General Public License +# You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. 
@@ -29,20 +29,33 @@ .text .align 1 -.globl ___mpn_sub_n -___mpn_sub_n: +.globl ___gmpn_sub_n +___gmpn_sub_n: .word 0x0 movl 16(ap),r0 movl 12(ap),r1 movl 8(ap),r2 movl 4(ap),r3 - subl2 r4,r4 - -Loop: + mnegl r0,r5 + addl2 $3,r0 + ashl $-2,r0,r0 # unroll loop count + bicl2 $-4,r5 # mask out low 2 bits + movaq (r5)[r5],r5 # 9x + jmp Loop(r5) + +Loop: movl (r2)+,r4 + sbwc (r1)+,r4 + movl r4,(r3)+ + movl (r2)+,r4 + sbwc (r1)+,r4 + movl r4,(r3)+ + movl (r2)+,r4 + sbwc (r1)+,r4 + movl r4,(r3)+ movl (r2)+,r4 sbwc (r1)+,r4 movl r4,(r3)+ - jsobgtr r0,Loop + sobgtr r0,Loop adwc r0,r0 ret diff --git a/ghc/rts/gmp/mpn/vax/submul_1.s b/ghc/rts/gmp/mpn/vax/submul_1.s index c473937..be42286 100644 --- a/ghc/rts/gmp/mpn/vax/submul_1.s +++ b/ghc/rts/gmp/mpn/vax/submul_1.s @@ -1,21 +1,21 @@ -# VAX __mpn_submul_1 -- Multiply a limb vector with a limb and subtract +# VAX __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract # the result from a second limb vector. -# Copyright (C) 1992, 1994 Free Software Foundation, Inc. +# Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc. # This file is part of the GNU MP Library. # The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Library General Public License as published by -# the Free Software Foundation; either version 2 of the License, or (at your +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. 
-# You should have received a copy of the GNU Library General Public License +# You should have received a copy of the GNU Lesser General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. @@ -29,8 +29,8 @@ .text .align 1 -.globl ___mpn_submul_1 -___mpn_submul_1: +.globl ___gmpn_submul_1 +___gmpn_submul_1: .word 0xfc0 movl 12(ap),r4 movl 8(ap),r8 @@ -60,7 +60,7 @@ L1p1: emul r1,r6,$0,r10 subl2 r10,(r9)+ adwc $0,r11 - jsobgtr r7,Loop1 + sobgtr r7,Loop1 movl r11,r0 ret @@ -77,7 +77,7 @@ L1n1: emul r1,r6,$0,r10 subl2 r10,(r9)+ adwc $0,r11 - jsobgtr r7,Loop1 + sobgtr r7,Loop1 movl r11,r0 ret @@ -104,7 +104,7 @@ L2p1: emul r1,r6,$0,r10 subl2 r10,(r9)+ adwc $0,r11 - jsobgtr r7,Loop2 + sobgtr r7,Loop2 movl r11,r0 ret @@ -121,6 +121,6 @@ L2n1: emul r1,r6,$0,r10 subl2 r10,(r9)+ adwc r1,r11 - jsobgtr r7,Loop2 + sobgtr r7,Loop2 movl r11,r0 ret diff --git a/ghc/rts/gmp/mpn/x86/README b/ghc/rts/gmp/mpn/x86/README new file mode 100644 index 0000000..3507548 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/README @@ -0,0 +1,40 @@ + + X86 MPN SUBROUTINES + + +This directory contains mpn functions for various 80x86 chips. + + +CODE ORGANIZATION + + x86 i386, i486, generic + x86/pentium Intel Pentium (P5, P54) + x86/pentium/mmx Intel Pentium with MMX (P55) + x86/p6 Intel Pentium Pro + x86/p6/mmx Intel Pentium II, III + x86/p6/p3mmx Intel Pentium III + x86/k6 AMD K6, K6-2, K6-3 + x86/k6/mmx + x86/k6/k62mmx AMD K6-2 + x86/k7 AMD Athlon + x86/k7/mmx + + +The x86 directory is also the main support for P6 at the moment, and +is something of a blended style, meant to be reasonable on all x86s. + + + +STATUS + +The code is well-optimized for AMD and Intel chips, but not so well +optimized for Cyrix chips. 
+ + + +RELEVANT OPTIMIZATION ISSUES + +For implementations with slow double shift instructions (SHLD and +SHRD), it might be better to mimic their operation with SHL+SHR+OR. +(M2 is likely to benefit from that, but not Pentium due to its slow +plain SHL and SHR.) diff --git a/ghc/rts/gmp/mpn/x86/README.family b/ghc/rts/gmp/mpn/x86/README.family new file mode 100644 index 0000000..3bc73f5 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/README.family @@ -0,0 +1,333 @@ + + X86 CPU FAMILY MPN SUBROUTINES + + +This file has some notes on things common to all the x86 family code. + + + +ASM FILES + +The x86 .asm files are BSD style x86 assembler code, first put through m4 +for macro processing. The generic mpn/asm-defs.m4 is used, together with +mpn/x86/x86-defs.m4. Detailed notes are in those files. + +The code is meant for use with GNU "gas" or a system "as". There's no +support for assemblers that demand Intel style, and with gas freely +available and easy to use that shouldn't be a problem. + + + +STACK FRAME + +m4 macros are used to define the parameters passed on the stack, and these +act like comments on what the stack frame looks like too. For example, +mpn_mul_1() has the following. + + defframe(PARAM_MULTIPLIER, 16) + defframe(PARAM_SIZE, 12) + defframe(PARAM_SRC, 8) + defframe(PARAM_DST, 4) + +Here PARAM_MULTIPLIER gets defined as `FRAME+16(%esp)', and the others +similarly. The return address is at offset 0, but there's not normally any +need to access that. + +FRAME is redefined as necessary through the code so it's the number of bytes +pushed on the stack, and hence the offsets in the parameter macros stay +correct. At the start of a routine FRAME should be zero. + + deflit(`FRAME',0) + ... + deflit(`FRAME',4) + ... + deflit(`FRAME',8) + ... + +Helper macros FRAME_pushl(), FRAME_popl(), FRAME_addl_esp() and +FRAME_subl_esp() exist to adjust FRAME for the effect of those instructions, +and can be used instead of explicit definitions if preferred. 
+defframe_pushl() is a combination of FRAME_pushl() and defframe(). + +There's generally some slackness in redefining FRAME. If new values aren't +going to get used, then the redefinitions are omitted to keep from +cluttering up the code. This happens for instance at the end of a routine, +where there might be just four register pops and then a ret, so FRAME isn't +getting used. + +Local variables and saved registers can be similarly defined, with negative +offsets representing stack space below the initial stack pointer. For +example, + + defframe(SAVE_ESI, -4) + defframe(SAVE_EDI, -8) + defframe(VAR_COUNTER,-12) + + deflit(STACK_SPACE, 12) + +Here STACK_SPACE gets used in a "subl $STACK_SPACE, %esp" to allocate the +space, and that instruction must be followed by a redefinition of FRAME +(setting it equal to STACK_SPACE) to reflect the change in %esp. + +Definitions for pushed registers are only put in when they're going to be +used. If registers are just saved and restored with pushes and pops then +definitions aren't made. + + + +ASSEMBLER EXPRESSIONS + +Only addition and subtraction seem to be universally available, certainly +that's all the Solaris 8 "as" seems to accept. If expressions are wanted +then m4 eval() should be used. + +In particular note that a "/" anywhere in a line starts a comment in Solaris +"as", and in some configurations of gas too. + + addl $32/2, %eax <-- wrong + + addl $eval(32/2), %eax <-- right + +Binutils gas/config/tc-i386.c has a choice between "/" being a comment +anywhere in a line, or only at the start. FreeBSD patches 2.9.1 to select +the latter, and as of 2.9.5 it's the default for GNU/Linux too. + + + +ASSEMBLER COMMENTS + +Solaris "as" doesn't support "#" commenting, using /* */ instead, +unfortunately. For that reason "C" commenting is used (see asm-defs.m4) and +the intermediate ".s" files have no comments.
+ + + +ZERO DISPLACEMENTS + +In a couple of places addressing modes like 0(%ebx) with a byte-sized zero +displacement are wanted, rather than (%ebx) with no displacement. These are +either for computed jumps or to get desirable code alignment. Explicit +.byte sequences are used to ensure the assembler doesn't turn 0(%ebx) into +(%ebx). The Zdisp() macro in x86-defs.m4 is used for this. + +Current gas 2.9.5 or recent 2.9.1 leave 0(%ebx) as written, but old gas +1.92.3 changes it. In general changing would be the sort of "optimization" +an assembler might perform, hence explicit ".byte"s are used where +necessary. + + + +SHLD/SHRD INSTRUCTIONS + +The %cl count forms of double shift instructions like "shldl %cl,%eax,%ebx" +must be written "shldl %eax,%ebx" for some assemblers. gas takes either, +Solaris "as" doesn't allow %cl, gcc generates %cl for gas and NeXT (which is +gas), and omits %cl elsewhere. + +For GMP an autoconf test is used to determine whether %cl should be used and +the macros shldl, shrdl, shldw and shrdw in mpn/x86/x86-defs.m4 then pass +through or omit %cl as necessary. See comments with those macros for usage. + + + +DIRECTION FLAG + +The x86 calling conventions say that the direction flag should be clear at +function entry and exit. (See iBCS2 and SVR4 ABI books, references below.) + +Although this has been so since the year dot, it's not absolutely clear +whether it's universally respected. Since it's better to be safe than +sorry, gmp follows glibc and does a "cld" if it depends on the direction +flag being clear. This happens only in a few places. + + + +POSITION INDEPENDENT CODE + +Defining the symbol PIC in m4 processing selects position independent code. +This mainly affects computed jumps, and these are implemented in a +self-contained fashion (without using the global offset table). The few +calls from assembly code to global functions use the normal procedure +linkage table. 
+ +PIC is necessary for ELF shared libraries because they can be mapped into +different processes at different virtual addresses. Text relocations in +shared libraries are allowed, but that presumably means a page with such a +relocation isn't shared. The use of the PLT for PIC adds a fixed cost to +every function call, which is small but might be noticeable when working with +small operands. + +Calls from one library function to another don't need to go through the PLT, +since of course the call instruction uses a displacement, not an absolute +address, and the relative locations of object files are known when libgmp.so +is created. "ld -Bsymbolic" (or "gcc -Wl,-Bsymbolic") will resolve calls +this way, so that there's no jump through the PLT, but of course leaving +setups of the GOT address in %ebx that may be unnecessary. + +The %ebx setup could be avoided in assembly if a separate option controlled +PIC for calls as opposed to computed jumps etc. But there's only ever +likely to be a handful of calls out of assembler, and getting the same +optimization for C intra-library calls would be more important. There seems +no easy way to tell gcc that certain functions can be called non-PIC, and +unfortunately many gmp functions use the global memory allocation variables, +so they need the GOT anyway. Object files with no global data references +and only intra-library calls could go into the library as non-PIC under +-Bsymbolic. Integrating this into libtool and automake is left as an +exercise for the reader. + + + +SIMPLE LOOPS + +The overheads in setting up for an unrolled loop can mean that at small +sizes a simple loop is faster. Making small sizes go fast is important, +even if it adds a cycle or two to bigger sizes. To this end various +routines choose between a simple loop and an unrolled loop according to +operand size. The path to the simple loop, or to special case code for +small sizes, is always as fast as possible. 
+ +Adding a simple loop requires a conditional jump to choose between the +simple and unrolled code. The size of a branch misprediction penalty +affects whether a simple loop is worthwhile. + +The convention is for an m4 definition UNROLL_THRESHOLD to set the crossover +point, with sizes < UNROLL_THRESHOLD using the simple loop, sizes >= +UNROLL_THRESHOLD using the unrolled loop. If position independent code adds +a couple of cycles to an unrolled loop setup, the threshold will vary with +PIC or non-PIC. Something like the following is typical. + + ifdef(`PIC',` + deflit(UNROLL_THRESHOLD, 10) + ',` + deflit(UNROLL_THRESHOLD, 8) + ') + +There's no automated way to determine the threshold. Setting it to a small +value and then to a big value makes it possible to measure the simple and +unrolled loops each over a range of sizes, from which the crossover point +can be determined. Alternately, just adjust the threshold up or down until +there are no more speedups. + + + +UNROLLED LOOP CODING + +The x86 addressing modes allow a byte displacement of -128 to +127, making +it possible to access 256 bytes, which is 64 limbs, without adjusting +pointer registers within the loop. Dword sized displacements can be used +too, but they increase code size, and unrolling to 64 ought to be enough. + +When unrolling to the full 64 limbs/loop, the limb at the top of the loop +will have a displacement of -128, so pointers have to have a corresponding ++128 added before entering the loop. When unrolling to 32 limbs/loop +displacements 0 to 127 can be used with 0 at the top of the loop and no +adjustment needed to the pointers. + +Where 64 limbs/loop is supported, the +128 adjustment is done only when 64 +limbs/loop is selected. Usually the gain in speed using 64 instead of 32 or +16 is small, so support for 64 limbs/loop is generally only for comparison.
+ + + +COMPUTED JUMPS + +When working from least significant limb to most significant limb (most +routines) the computed jump and pointer calculations in preparation for an +unrolled loop are as follows. + + S = operand size in limbs + N = number of limbs per loop (UNROLL_COUNT) + L = log2 of unrolling (UNROLL_LOG2) + M = mask for unrolling (UNROLL_MASK) + C = code bytes per limb in the loop + B = bytes per limb (4 for x86) + + computed jump (-S & M) * C + entrypoint + subtract from pointers (-S & M) * B + initial loop counter (S-1) >> L + displacements 0 to B*(N-1) + +The loop counter is decremented at the end of each loop, and the looping +stops when the decrement takes the counter to -1. The displacements are for +the addressing accessing each limb, eg. a load with "movl disp(%ebx), %eax". + +Usually the multiply by "C" can be handled without an imul, using instead an +leal, or a shift and subtract. + +When working from most significant to least significant limb (eg. mpn_lshift +and mpn_copyd), the calculations change as follows. + + add to pointers (-S & M) * B + displacements 0 to -B*(N-1) + + + +OLD GAS 1.92.3 + +This version comes with FreeBSD 2.2.8 and has a couple of gremlins that +affect gmp code. + +Firstly, an expression involving two forward references to labels comes out +as zero. For example, + + addl $bar-foo, %eax + foo: + nop + bar: + +This should lead to "addl $1, %eax", but it comes out as "addl $0, %eax". +When only one forward reference is involved, it works correctly, as for +example, + + foo: + addl $bar-foo, %eax + nop + bar: + +Secondly, an expression involving two labels can't be used as the +displacement for an leal. For example, + + foo: + nop + bar: + leal bar-foo(%eax,%ebx,8), %ecx + +A slightly cryptic error is given, "Unimplemented segment type 0 in +parse_operand". 
When only one label is used it's ok, and the label can be a +forward reference too, as for example, + + leal foo(%eax,%ebx,8), %ecx + nop + foo: + +These problems only affect PIC computed jump calculations. The workarounds +are just to do an leal without a displacement and then an addl, and to make +sure the code is placed so that there's at most one forward reference in the +addl. + + + +REFERENCES + +"Intel Architecture Software Developer's Manual", volumes 1 to 3, 1999, +order numbers 243190, 243191 and 243192. Available on-line, + + ftp://download.intel.com/design/PentiumII/manuals/243190.htm + ftp://download.intel.com/design/PentiumII/manuals/243191.htm + ftp://download.intel.com/design/PentiumII/manuals/243192.htm + +"Intel386 Family Binary Compatibility Specification 2", Intel Corporation, +published by McGraw-Hill, 1991, ISBN 0-07-031219-2. + +"System V Application Binary Interface", Unix System Laboratories Inc, 1992, +published by Prentice Hall, ISBN 0-13-880410-9. And the "Intel386 Processor +Supplement", AT&T, 1991, ISBN 0-13-877689-X. (These have details of ELF +shared library PIC coding.) + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/ghc/rts/gmp/mpn/x86/addsub_n.S b/ghc/rts/gmp/mpn/x86/addsub_n.S new file mode 100644 index 0000000..fe6f648 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/addsub_n.S @@ -0,0 +1,174 @@ +/* Currently not working and not used. */ + +/* +Copyright (C) 1999 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + + +#define SAVE_BORROW_RESTORE_CARRY(r) adcl r,r; shll $31,r +#define SAVE_CARRY_RESTORE_BORROW(r) adcl r,r + + .globl mpn_addsub_n_0 + .globl mpn_addsub_n_1 + +/* Cute i386/i486/p6 addsub loop for the "full overlap" case r1==s2,r2==s1. + We let subtraction and addition alternate in being two limbs + ahead of the other, thereby avoiding some SAVE_RESTORE. */ +// r1 = r2 + r1 edi = esi + edi +// r2 = r2 - r1 esi = esi - edi +// s1 s2 +// r2 r1 +// eax,ebx,ecx,edx,esi,edi,ebp +mpn_addsub_n_0: + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl 20(%esp),%edi /* res_ptr */ + movl 24(%esp),%esi /* s1_ptr */ + movl 36(%esp),%ebp /* size */ + + shrl $2,%ebp + xorl %edx,%edx + .align 4 +Loop0: // L=load E=execute S=store + movl (%esi),%ebx // sub 0 L + movl 4(%esi),%ecx // sub 1 L + sbbl (%edi),%ebx // sub 0 LE + sbbl 4(%edi),%ecx // sub 1 LE +// SAVE_BORROW_RESTORE_CARRY(%edx) + movl (%esi),%eax // add 0 L + adcl %eax,(%edi) // add 0 LES + movl 4(%esi),%eax // add 1 L + adcl %eax,4(%edi) // add 1 LES + movl %ebx,(%esi) // sub 0 S + movl %ecx,4(%esi) // sub 1 S + movl 8(%esi),%ebx // add 2 L + adcl 8(%edi),%ebx // add 2 LE + movl 12(%esi),%ecx // add 3 L + adcl 12(%edi),%ecx // add 3 LE +// SAVE_CARRY_RESTORE_BORROW(%edx) + movl 8(%edi),%eax // sub 2 L + sbbl %eax,8(%esi) // sub 2 LES + movl 12(%edi),%eax // sub 3 L + sbbl %eax,12(%esi) // sub 3 LES + movl %ebx,8(%edi) // add 2 S + movl %ecx,12(%edi) // add 3 S + leal 16(%esi),%esi + leal 16(%edi),%edi + decl %ebp + jnz Loop0 + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +/* Cute i386/i486/p6 addsub loop for the "full overlap" case r1==s1,r2==s2. 
+ We let subtraction and addition alternate in being two limbs + ahead of the other, thereby avoiding some SAVE_RESTORE. */ +// r1 = r1 + r2 edi = edi + esi +// r2 = r1 - r2 esi = edi - esi +// s2 s1 +// r2 r1 +// eax,ebx,ecx,edx,esi,edi,ebp +mpn_addsub_n_1: + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl 20(%esp),%edi /* res_ptr */ + movl 24(%esp),%esi /* s1_ptr */ + movl 36(%esp),%ebp /* size */ + + shrl $2,%ebp + xorl %edx,%edx + .align 4 +Loop1: // L=load E=execute S=store + movl (%edi),%ebx // sub 0 L + sbbl (%esi),%ebx // sub 0 LE + movl 4(%edi),%ecx // sub 1 L + sbbl 4(%esi),%ecx // sub 1 LE +// SAVE_BORROW_RESTORE_CARRY(%edx) + movl (%esi),%eax // add 0 L + adcl %eax,(%edi) // add 0 LES + movl 4(%esi),%eax // add 1 L + adcl %eax,4(%edi) // add 1 LES + movl %ebx,(%esi) // sub 0 S + movl %ecx,4(%esi) // sub 1 S + movl 8(%esi),%ebx // add 2 L + adcl 8(%edi),%ebx // add 2 LE + movl 12(%esi),%ecx // add 3 L + adcl 12(%edi),%ecx // add 3 LE +// SAVE_CARRY_RESTORE_BORROW(%edx) + movl 8(%edi),%eax // sub 2 L + sbbl 8(%esi),%eax // sub 2 LES + movl %eax,8(%esi) // sub 2 S + movl 12(%edi),%eax // sub 3 L + sbbl 12(%esi),%eax // sub 3 LE + movl %eax,12(%esi) // sub 3 S + movl %ebx,8(%edi) // add 2 S + movl %ecx,12(%edi) // add 3 S + leal 16(%esi),%esi + leal 16(%edi),%edi + decl %ebp + jnz Loop1 + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + + .globl mpn_copy +mpn_copy: + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl 20(%esp),%edi /* res_ptr */ + movl 24(%esp),%esi /* s1_ptr */ + movl 28(%esp),%ebp /* size */ + + shrl $2,%ebp + .align 4 +Loop2: + movl (%esi),%eax + movl 4(%esi),%ebx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl 8(%esi),%eax + movl 12(%esi),%ebx + movl %eax,8(%edi) + movl %ebx,12(%edi) + leal 16(%esi),%esi + leal 16(%edi),%edi + decl %ebp + jnz Loop2 + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret diff --git a/ghc/rts/gmp/mpn/x86/aors_n.asm b/ghc/rts/gmp/mpn/x86/aors_n.asm new file mode 100644 index 
0000000..18ef816 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/aors_n.asm @@ -0,0 +1,187 @@ +dnl x86 mpn_add_n/mpn_sub_n -- mpn addition and subtraction. + +dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software +dnl Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
+ + +include(`../config.m4') + + +ifdef(`OPERATION_add_n',` + define(M4_inst, adcl) + define(M4_function_n, mpn_add_n) + define(M4_function_nc, mpn_add_nc) + +',`ifdef(`OPERATION_sub_n',` + define(M4_inst, sbbl) + define(M4_function_n, mpn_sub_n) + define(M4_function_nc, mpn_sub_nc) + +',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n +')')') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + + +C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size, mp_limb_t carry); + +defframe(PARAM_CARRY,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC2, 12) +defframe(PARAM_SRC1, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) + +PROLOGUE(M4_function_nc) +deflit(`FRAME',0) + + pushl %edi FRAME_pushl() + pushl %esi FRAME_pushl() + + movl PARAM_DST,%edi + movl PARAM_SRC1,%esi + movl PARAM_SRC2,%edx + movl PARAM_SIZE,%ecx + + movl %ecx,%eax + shrl $3,%ecx C compute count for unrolled loop + negl %eax + andl $7,%eax C get index where to start loop + jz LF(M4_function_n,oopgo) C necessary special case for 0 + incl %ecx C adjust loop count + shll $2,%eax C adjustment for pointers... + subl %eax,%edi C ... since they are offset ... + subl %eax,%esi C ... by a constant when we ... + subl %eax,%edx C ... enter the loop + shrl $2,%eax C restore previous value + +ifdef(`PIC',` + C Calculate start address in loop for PIC. Due to limitations in + C old gas, LF(M4_function_n,oop)-L(0a)-3 cannot be put into the leal + call L(0a) +L(0a): leal (%eax,%eax,8),%eax + addl (%esp),%eax + addl $LF(M4_function_n,oop)-L(0a)-3,%eax + addl $4,%esp +',` + C Calculate start address in loop for non-PIC. + leal LF(M4_function_n,oop)-3(%eax,%eax,8),%eax +') + + C These lines initialize carry from the 5th parameter. Should be + C possible to simplify. 
+ pushl %ebp FRAME_pushl() + movl PARAM_CARRY,%ebp + shrl $1,%ebp C shift bit 0 into carry + popl %ebp FRAME_popl() + + jmp *%eax C jump into loop + +EPILOGUE() + + + ALIGN(8) +PROLOGUE(M4_function_n) +deflit(`FRAME',0) + + pushl %edi FRAME_pushl() + pushl %esi FRAME_pushl() + + movl PARAM_DST,%edi + movl PARAM_SRC1,%esi + movl PARAM_SRC2,%edx + movl PARAM_SIZE,%ecx + + movl %ecx,%eax + shrl $3,%ecx C compute count for unrolled loop + negl %eax + andl $7,%eax C get index where to start loop + jz L(oop) C necessary special case for 0 + incl %ecx C adjust loop count + shll $2,%eax C adjustment for pointers... + subl %eax,%edi C ... since they are offset ... + subl %eax,%esi C ... by a constant when we ... + subl %eax,%edx C ... enter the loop + shrl $2,%eax C restore previous value + +ifdef(`PIC',` + C Calculate start address in loop for PIC. Due to limitations in + C some assemblers, L(oop)-L(0b)-3 cannot be put into the leal + call L(0b) +L(0b): leal (%eax,%eax,8),%eax + addl (%esp),%eax + addl $L(oop)-L(0b)-3,%eax + addl $4,%esp +',` + C Calculate start address in loop for non-PIC. 
+ leal L(oop)-3(%eax,%eax,8),%eax +') + jmp *%eax C jump into loop + +L(oopgo): + pushl %ebp FRAME_pushl() + movl PARAM_CARRY,%ebp + shrl $1,%ebp C shift bit 0 into carry + popl %ebp FRAME_popl() + + ALIGN(8) +L(oop): movl (%esi),%eax + M4_inst (%edx),%eax + movl %eax,(%edi) + movl 4(%esi),%eax + M4_inst 4(%edx),%eax + movl %eax,4(%edi) + movl 8(%esi),%eax + M4_inst 8(%edx),%eax + movl %eax,8(%edi) + movl 12(%esi),%eax + M4_inst 12(%edx),%eax + movl %eax,12(%edi) + movl 16(%esi),%eax + M4_inst 16(%edx),%eax + movl %eax,16(%edi) + movl 20(%esi),%eax + M4_inst 20(%edx),%eax + movl %eax,20(%edi) + movl 24(%esi),%eax + M4_inst 24(%edx),%eax + movl %eax,24(%edi) + movl 28(%esi),%eax + M4_inst 28(%edx),%eax + movl %eax,28(%edi) + leal 32(%edi),%edi + leal 32(%esi),%esi + leal 32(%edx),%edx + decl %ecx + jnz L(oop) + + sbbl %eax,%eax + negl %eax + + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/aorsmul_1.asm b/ghc/rts/gmp/mpn/x86/aorsmul_1.asm new file mode 100644 index 0000000..f32ad83 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/aorsmul_1.asm @@ -0,0 +1,134 @@ +dnl x86 __gmpn_addmul_1 (for 386 and 486) -- Multiply a limb vector with a +dnl limb and add the result to a second limb vector. + + +dnl Copyright (C) 1992, 1994, 1997, 1999, 2000 Free Software Foundation, +dnl Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. 
+dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +ifdef(`OPERATION_addmul_1',` + define(M4_inst, addl) + define(M4_function_1, mpn_addmul_1) + +',`ifdef(`OPERATION_submul_1',` + define(M4_inst, subl) + define(M4_function_1, mpn_submul_1) + +',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 +')')') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + + +C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); + +define(PARAM_MULTIPLIER, `FRAME+16(%esp)') +define(PARAM_SIZE, `FRAME+12(%esp)') +define(PARAM_SRC, `FRAME+8(%esp)') +define(PARAM_DST, `FRAME+4(%esp)') + + TEXT + ALIGN(8) + +PROLOGUE(M4_function_1) +deflit(`FRAME',0) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC,%esi + movl PARAM_SIZE,%ecx + + xorl %ebx,%ebx + andl $3,%ecx + jz L(end0) + +L(oop0): + movl (%esi),%eax + mull PARAM_MULTIPLIER + leal 4(%esi),%esi + addl %ebx,%eax + movl $0,%ebx + adcl %ebx,%edx + M4_inst %eax,(%edi) + adcl %edx,%ebx C propagate carry into cylimb + + leal 4(%edi),%edi + decl %ecx + jnz L(oop0) + +L(end0): + movl PARAM_SIZE,%ecx + shrl $2,%ecx + jz L(end) + + ALIGN(8) +L(oop): movl (%esi),%eax + mull PARAM_MULTIPLIER + addl %eax,%ebx + movl $0,%ebp + adcl %edx,%ebp + + movl 4(%esi),%eax + mull PARAM_MULTIPLIER + M4_inst %ebx,(%edi) + adcl %eax,%ebp C new lo + cylimb + movl $0,%ebx + adcl %edx,%ebx + + movl 8(%esi),%eax + mull PARAM_MULTIPLIER + M4_inst %ebp,4(%edi) + adcl %eax,%ebx C new lo + cylimb + movl $0,%ebp + adcl %edx,%ebp + + movl 12(%esi),%eax + mull PARAM_MULTIPLIER + M4_inst %ebx,8(%edi) + adcl %eax,%ebp C new lo + cylimb + movl $0,%ebx + adcl %edx,%ebx + + M4_inst %ebp,12(%edi) + adcl $0,%ebx C propagate carry into 
cylimb + + leal 16(%esi),%esi + leal 16(%edi),%edi + decl %ecx + jnz L(oop) + +L(end): movl %ebx,%eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/copyd.asm b/ghc/rts/gmp/mpn/x86/copyd.asm new file mode 100644 index 0000000..439640e --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/copyd.asm @@ -0,0 +1,80 @@ +dnl x86 mpn_copyd -- copy limb vector, decrementing. +dnl +dnl Future: On P6 an MMX loop should be able to go faster than this code. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Copy src,size to dst,size, working from high to low addresses. +C +C The code here is very generic and can be expected to be reasonable on all +C the x86 family. +C +C P5 - 1.0 cycles/limb. +C +C P6 - 2.4 cycles/limb, approx 40 cycles startup. 
+ +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + + .text + ALIGN(32) + +PROLOGUE(mpn_copyd) + C eax saved esi + C ebx + C ecx counter + C edx saved edi + C esi src + C edi dst + C ebp + + movl PARAM_SIZE, %ecx + movl %esi, %eax + + movl PARAM_SRC, %esi + movl %edi, %edx + + movl PARAM_DST, %edi + leal -4(%esi,%ecx,4), %esi + + leal -4(%edi,%ecx,4), %edi + + std + + rep + movsl + + cld + + movl %eax, %esi + movl %edx, %edi + + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/copyi.asm b/ghc/rts/gmp/mpn/x86/copyi.asm new file mode 100644 index 0000000..5bc4e36 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/copyi.asm @@ -0,0 +1,79 @@ +dnl x86 mpn_copyi -- copy limb vector, incrementing. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Copy src,size to dst,size, working from low to high addresses. +C +C The code here is very generic and can be expected to be reasonable on all +C the x86 family. +C +C P5 - 1.0 cycles/limb. +C +C P6 - 0.75 cycles/limb. 
An MMX based copy was tried, but was found to be +C slower than a rep movs in all cases. The fastest MMX found was 0.8 +C cycles/limb (when fully aligned). A rep movs seems to have a startup +C time of about 15 cycles, but doing something special for small sizes +C could lead to a branch misprediction that would destroy any saving. +C For now a plain rep movs seems ok for P6. + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + + .text + ALIGN(32) + + C eax saved esi + C ebx + C ecx counter + C edx saved edi + C esi src + C edi dst + C ebp + +PROLOGUE(mpn_copyi) + + movl PARAM_SIZE, %ecx + movl %esi, %eax + + movl PARAM_SRC, %esi + movl %edi, %edx + + movl PARAM_DST, %edi + + cld C better safe than sorry, see mpn/x86/README.family + + rep + movsl + + movl %eax, %esi + movl %edx, %edi + + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/diveby3.asm b/ghc/rts/gmp/mpn/x86/diveby3.asm new file mode 100644 index 0000000..df879da --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/diveby3.asm @@ -0,0 +1,115 @@ +dnl x86 mpn_divexact_by3 -- mpn division by 3, expecting no remainder. + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. 
If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +dnl The following all have their own optimized versions of this routine, +dnl but for reference the code here runs as follows. +dnl +dnl cycles/limb +dnl P54 18.0 +dnl P55 17.0 +dnl P6 14.5 +dnl K6 14.0 +dnl K7 10.0 + + +include(`../config.m4') + + +C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t carry); + +defframe(PARAM_CARRY,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl multiplicative inverse of 3, modulo 2^32 +deflit(INVERSE_3, 0xAAAAAAAB) + +dnl ceil(b/3) and ceil(b*2/3) where b=2^32 +deflit(ONE_THIRD_CEIL, 0x55555556) +deflit(TWO_THIRDS_CEIL, 0xAAAAAAAB) + + .text + ALIGN(8) + +PROLOGUE(mpn_divexact_by3c) +deflit(`FRAME',0) + + movl PARAM_SRC, %ecx + pushl %ebp FRAME_pushl() + + movl PARAM_SIZE, %ebp + pushl %edi FRAME_pushl() + + movl PARAM_DST, %edi + pushl %esi FRAME_pushl() + + movl $INVERSE_3, %esi + pushl %ebx FRAME_pushl() + + leal (%ecx,%ebp,4), %ecx + movl PARAM_CARRY, %ebx + + leal (%edi,%ebp,4), %edi + negl %ebp + + + ALIGN(8) +L(top): + C eax scratch, low product + C ebx carry limb (0 to 3) + C ecx &src[size] + C edx scratch, high product + C esi multiplier + C edi &dst[size] + C ebp counter, limbs, negative + + movl (%ecx,%ebp,4), %eax + + subl %ebx, %eax + + setc %bl + + imull %esi + + cmpl $ONE_THIRD_CEIL, %eax + movl %eax, (%edi,%ebp,4) + + sbbl $-1, %ebx C +1 if eax>=ceil(b/3) + cmpl $TWO_THIRDS_CEIL, %eax + + sbbl $-1, %ebx C +1 if eax>=ceil(b*2/3) + incl %ebp + + jnz L(top) + + + movl %ebx, %eax + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/divrem_1.asm b/ghc/rts/gmp/mpn/x86/divrem_1.asm new file mode 100644 index 0000000..12f1467 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/divrem_1.asm @@ -0,0 +1,232 @@ +dnl x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient. 
+ +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +dnl cycles/limb +dnl K6 20 +dnl P5 44 +dnl P6 39 +dnl 486 approx 43 maybe +dnl +dnl +dnl The following have their own optimized divrem_1 implementations, but +dnl for reference the code here runs as follows. +dnl +dnl cycles/limb +dnl P6MMX 39 +dnl K7 42 + + +include(`../config.m4') + + +C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, mp_limb_t divisor); +C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, mp_limb_t divisor, mp_limb_t carry); +C +C Divide src,size by divisor and store the quotient in dst+xsize,size. +C Extend the division to fractional quotient limbs in dst,xsize. Return the +C remainder. Either or both xsize and size can be 0. +C +C mpn_divrem_1c takes a carry parameter which is an initial high limb, +C effectively one extra limb at the top of src,size. Must have +C carry= b^2, which is u*b+v >= b^2-x*y, and +C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of +C x*y/b^2.
If x, y, u and v are random and uniformly distributed between 0 +C and b-1, then the total probability can be summed over x and y, +C +C 1 b-1 b-1 x*y 1 b*(b-1) b*(b-1) +C --- * sum sum --- = --- * ------- * ------- = 1/4 +C b^2 x=0 y=1 b^2 b^4 2 2 +C +C Actually it's a very tiny bit less than 1/4 of course. If y is fixed, +C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2. + + +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 9) +',` +deflit(UNROLL_THRESHOLD, 6) +') + +defframe(PARAM_CARRY, 20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(32) + +PROLOGUE(M4_function_1c) + pushl %esi +deflit(`FRAME',4) + movl PARAM_CARRY, %esi + jmp LF(M4_function_1,start_nc) +EPILOGUE() + +PROLOGUE(M4_function_1) + push %esi +deflit(`FRAME',4) + xorl %esi, %esi C initial carry + +L(start_nc): + movl PARAM_SIZE, %ecx + pushl %ebx +deflit(`FRAME',8) + + movl PARAM_SRC, %ebx + pushl %edi +deflit(`FRAME',12) + + cmpl $UNROLL_THRESHOLD, %ecx + movl PARAM_DST, %edi + + pushl %ebp +deflit(`FRAME',16) + jae L(unroll) + + + C simple loop + + movl PARAM_MULTIPLIER, %ebp + +L(simple): + C eax scratch + C ebx src + C ecx counter + C edx scratch + C esi carry + C edi dst + C ebp multiplier + + movl (%ebx), %eax + addl $4, %ebx + + mull %ebp + + addl $4, %edi + addl %esi, %eax + + adcl $0, %edx + + M4_inst %eax, -4(%edi) + + adcl $0, %edx + + movl %edx, %esi + loop L(simple) + + + popl %ebp + popl %edi + + popl %ebx + movl %esi, %eax + + popl %esi + ret + + + +C ----------------------------------------------------------------------------- +C The unrolled loop uses a "two carry limbs" scheme. At the top of the loop +C the carries are ecx=lo, esi=hi, then they swap for each limb processed. +C For the computed jump an odd size means they start one way around, an even +C size the other. 
+C +C VAR_JUMP holds the computed jump temporarily because there's not enough +C registers at the point of doing the mul for the initial two carry limbs. +C +C The add/adc for the initial carry in %esi is necessary only for the +C mpn_addmul/submul_1c entry points. Duplicating the startup code to +C eliminate this for the plain mpn_addmul/submul_1 doesn't seem like a good +C idea. + +dnl overlapping with parameters already fetched +define(VAR_COUNTER, `PARAM_SIZE') +define(VAR_JUMP, `PARAM_DST') + +L(unroll): + C eax + C ebx src + C ecx size + C edx + C esi initial carry + C edi dst + C ebp + + movl %ecx, %edx + decl %ecx + + subl $2, %edx + negl %ecx + + shrl $UNROLL_LOG2, %edx + andl $UNROLL_MASK, %ecx + + movl %edx, VAR_COUNTER + movl %ecx, %edx + + shll $4, %edx + negl %ecx + + C 15 code bytes per limb +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(entry) (%edx,%ecx,1), %edx +') + movl (%ebx), %eax C src low limb + + movl PARAM_MULTIPLIER, %ebp + movl %edx, VAR_JUMP + + mull %ebp + + addl %esi, %eax C initial carry (from _1c) + jadcl0( %edx) + + + leal 4(%ebx,%ecx,4), %ebx + movl %edx, %esi C high carry + + movl VAR_JUMP, %edx + leal (%edi,%ecx,4), %edi + + testl $1, %ecx + movl %eax, %ecx C low carry + + jz L(noswap) + movl %esi, %ecx C high,low carry other way around + + movl %eax, %esi +L(noswap): + + jmp *%edx + + +ifdef(`PIC',` +L(pic_calc): + C See README.family about old gas bugs + leal (%edx,%ecx,1), %edx + addl $L(entry)-L(here), %edx + addl (%esp), %edx + ret +') + + +C ----------------------------------------------------------- + ALIGN(32) +L(top): +deflit(`FRAME',16) + C eax scratch + C ebx src + C ecx carry lo + C edx scratch + C esi carry hi + C edi dst + C ebp multiplier + C + C 15 code bytes per limb + + leal UNROLL_BYTES(%edi), %edi + +L(entry): +forloop(`i', 0, UNROLL_COUNT/2-1, ` + deflit(`disp0', eval(2*i*4)) + deflit(`disp1', eval(disp0 + 4)) + +Zdisp( movl, disp0,(%ebx), %eax) + mull %ebp +Zdisp( M4_inst,%ecx, disp0,(%edi)) + adcl
%eax, %esi + movl %edx, %ecx + jadcl0( %ecx) + + movl disp1(%ebx), %eax + mull %ebp + M4_inst %esi, disp1(%edi) + adcl %eax, %ecx + movl %edx, %esi + jadcl0( %esi) +') + + decl VAR_COUNTER + leal UNROLL_BYTES(%ebx), %ebx + + jns L(top) + + + popl %ebp + M4_inst %ecx, UNROLL_BYTES(%edi) + + popl %edi + movl %esi, %eax + + popl %ebx + jadcl0( %eax) + + popl %esi + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/k6/cross.pl b/ghc/rts/gmp/mpn/x86/k6/cross.pl new file mode 100644 index 0000000..21734f3 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k6/cross.pl @@ -0,0 +1,141 @@ +#! /usr/bin/perl + +# Copyright (C) 2000 Free Software Foundation, Inc. +# +# This file is part of the GNU MP Library. +# +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published +# by the Free Software Foundation; either version 2.1 of the License, or (at +# your option) any later version. +# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# Usage: cross.pl [filename.o]... +# +# Produce an annotated disassembly of the given object files, indicating +# certain code alignment and addressing mode problems afflicting K6 chips. +# "ZZ" is used on all annotations, so this can be searched for. +# +# With no arguments, all .o files corresponding to .asm files are processed. +# This is good in the mpn object directory of a k6*-*-* build. 
+# +# As far as fixing problems goes, any cache line crossing problems in loops +# get attention, but as a rule it's too tedious to rearrange code or slip in +# nops to fix every problem in setup or finishup code. +# +# Bugs: +# +# Instructions without mod/rm bytes or which are already vector decoded are +# unaffected by cache line boundary crossing, but not all of these have yet +# been put in as exceptions. All that occur in practice in GMP are present +# though. +# +# There's no messages for using the vector decoded addressing mode (%esi), +# but that mode is easy to avoid when coding. + +use strict; + +sub disassemble { # echo objdump's listing of $file, adding "ZZ" warning lines + my ($file) = @_; + my ($addr,$b1,$b2,$b3, $prefix,$opcode,$modrm); + + open (IN, "objdump -Srfh $file |") + || die "Cannot open pipe from objdump\n"; + while (<IN>) { # one line of objdump output per iteration, in $_ + print; + + if (/^[ \t]*[0-9]+[ \t]+\.text[ \t]/ && /2\*\*([0-9]+)$/) { + if ($1 < 5) { + print "ZZ need at least 2**5 for predictable cache line crossing\n"; + } + } + + if (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)[ \t]+([0-9a-f]+)[ \t]+([0-9a-f]+)/) { + ($addr,$b1,$b2,$b3) = ($1,$2,$3,$4); + + } elsif (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)[ \t]+([0-9a-f]+)/) { + ($addr,$b1,$b2,$b3) = ($1,$2,$3,''); + + } elsif (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)/) { + ($addr,$b1,$b2,$b3) = ($1,$2,'',''); + + } else { + next; + } + + if ($b1 =~ /0f/) { + $prefix = $b1; + $opcode = $b2; + $modrm = $b3; + } else { + $prefix = ''; + $opcode = $b1; + $modrm = $b2; + } + + # modrm of the form 00-xxx-100 with an 0F prefix is the problem case + # for K6 and pre-CXT K6-2 + if ($prefix =~ /0f/ + && $opcode !~ /^8/ # jcond disp32 + && $modrm =~ /^[0-3][4c]/) { + print "ZZ ($file) >3 bytes to determine instruction length\n"; + } + + # with just an opcode, starting 1f mod 20h + if ($addr =~ /[13579bdf]f$/ + && $prefix !~ /0f/ + && $opcode !~ /1[012345]/ # adc + && $opcode !~ /1[89abcd]/ # sbb + && $opcode !~ /68/ # push $imm32 + && $opcode !~ /^7/ # jcond disp8 + && $opcode !~ /a[89]/ # test+imm + && $opcode
!~ /a[a-f]/ # stos/lods/scas + && $opcode !~ /b8/ # movl $imm32,%eax + && $opcode !~ /e[0123]/ # loop/loopz/loopnz/jcxz + && $opcode !~ /e[b9]/ # jmp disp8/disp32 + && $opcode !~ /f[89abcd]/ # clc,stc,cli,sti,cld,std + && !($opcode =~ /f[67]/ # grp 1 + && $modrm =~ /^[2367abef]/) # mul, imul, div, idiv + && $modrm !~ /^$/) { + print "ZZ ($file) opcode/modrm cross 32-byte boundary\n"; + } + + # with an 0F prefix, anything starting at 1f mod 20h + if ($addr =~ /[13579bdf][f]$/ + && $prefix =~ /0f/) { + print "ZZ ($file) prefix/opcode cross 32-byte boundary\n"; + } + + # with an 0F prefix, anything with mod/rm starting at 1e mod 20h + if ($addr =~ /[13579bdf][e]$/ + && $prefix =~ /0f/ + && $opcode !~ /^8/ # jcond disp32 + && $modrm !~ /^$/) { + print "ZZ ($file) prefix/opcode/modrm cross 32-byte boundary\n"; + } + } + close IN || die "Error from objdump (or objdump not available)\n"; +} + + +my @files; +if ($#ARGV >= 0) { + @files = @ARGV; +} else { + @files = glob "*.asm"; + map {s/\.asm$/.o/} @files; # foo.asm -> foo.o, only the trailing suffix +} + +foreach (@files) { + disassemble($_); +} diff --git a/ghc/rts/gmp/mpn/x86/k6/diveby3.asm b/ghc/rts/gmp/mpn/x86/k6/diveby3.asm new file mode 100644 index 0000000..ffb97bc --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k6/diveby3.asm @@ -0,0 +1,110 @@ +dnl AMD K6 mpn_divexact_by3 -- mpn division by 3, expecting no remainder. +dnl +dnl K6: 11.0 cycles/limb + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t carry); +C +C Using %esi in (%esi,%ecx,4) or 0(%esi,%ecx,4) addressing modes doesn't +C lead to vector decoding, unlike plain (%esi) does. + +defframe(PARAM_CARRY,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl multiplicative inverse of 3, modulo 2^32 +deflit(INVERSE_3, 0xAAAAAAAB) + + .text + ALIGN(32) + +PROLOGUE(mpn_divexact_by3c) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + pushl %esi defframe_pushl(SAVE_ESI) + + movl PARAM_SRC, %esi + pushl %edi defframe_pushl(SAVE_EDI) + + movl PARAM_DST, %edi + pushl %ebx defframe_pushl(SAVE_EBX) + + movl PARAM_CARRY, %ebx + leal (%esi,%ecx,4), %esi + + pushl $3 defframe_pushl(VAR_THREE) + leal (%edi,%ecx,4), %edi + + negl %ecx + + + C Need 32 alignment for claimed speed, to avoid the movl store + C opcode/modrm crossing a cache line boundary + + ALIGN(32) +L(top): + C eax scratch, low product + C ebx carry limb (0 to 3) + C ecx counter, limbs, negative + C edx scratch, high product + C esi &src[size] + C edi &dst[size] + C ebp + C + C The 0(%esi,%ecx,4) form pads so the finishup "movl %ebx, %eax" + C doesn't cross a 32 byte boundary, saving a couple of cycles + C (that's a fixed couple, not per loop). 
+ +Zdisp( movl, 0,(%esi,%ecx,4), %eax) + subl %ebx, %eax + + setc %bl + + imull $INVERSE_3, %eax + + movl %eax, (%edi,%ecx,4) + addl $2, %ecx + + mull VAR_THREE + + addl %edx, %ebx + loop L(top) + + + movl SAVE_ESI, %esi + movl %ebx, %eax + + movl SAVE_EBX, %ebx + + movl SAVE_EDI, %edi + addl $FRAME, %esp + + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/k6/gmp-mparam.h b/ghc/rts/gmp/mpn/x86/k6/gmp-mparam.h new file mode 100644 index 0000000..77f3948 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k6/gmp-mparam.h @@ -0,0 +1,97 @@ +/* AMD K6 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + + +#ifndef UMUL_TIME +#define UMUL_TIME 3 /* cycles */ +#endif + +#ifndef UDIV_TIME +#define UDIV_TIME 20 /* cycles */ +#endif + +/* bsfl takes 12-27 cycles, put an average for uniform random numbers */ +#ifndef COUNT_TRAILING_ZEROS_TIME +#define COUNT_TRAILING_ZEROS_TIME 14 /* cycles */ +#endif + + +/* Generated by tuneup.c, 2000-07-04. 
*/ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 18 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 130 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 34 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 116 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 68 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 98 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 13 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 4 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 67 +#endif + +#ifndef FFT_MUL_TABLE +#define FFT_MUL_TABLE { 528, 1184, 2176, 5632, 14336, 40960, 0 } +#endif +#ifndef FFT_MODF_MUL_THRESHOLD +#define FFT_MODF_MUL_THRESHOLD 472 +#endif +#ifndef FFT_MUL_THRESHOLD +#define FFT_MUL_THRESHOLD 4352 +#endif + +#ifndef FFT_SQR_TABLE +#define FFT_SQR_TABLE { 528, 1184, 2176, 5632, 14336, 40960, 0 } +#endif +#ifndef FFT_MODF_SQR_THRESHOLD +#define FFT_MODF_SQR_THRESHOLD 544 +#endif +#ifndef FFT_SQR_THRESHOLD +#define FFT_SQR_THRESHOLD 4352 +#endif diff --git a/ghc/rts/gmp/mpn/x86/k6/k62mmx/copyd.asm b/ghc/rts/gmp/mpn/x86/k6/k62mmx/copyd.asm new file mode 100644 index 0000000..20a33e6 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k6/k62mmx/copyd.asm @@ -0,0 +1,179 @@ +dnl AMD K6-2 mpn_copyd -- copy limb vector, decrementing. +dnl +dnl K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data +dnl alignment. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl K6-2 aligned: +dnl UNROLL_COUNT cycles/limb +dnl 8 0.75 +dnl 16 0.625 +dnl 32 0.5625 +dnl 64 0.53 +dnl Maximum possible with the current code is 64, the minimum is 2. + +deflit(UNROLL_COUNT, 32) + + +C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Copy src,size to dst,size, processing limbs from high to low addresses. +C +C The comments in copyi.asm apply here too. + + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + + .text + ALIGN(32) + +PROLOGUE(mpn_copyd) + movl PARAM_SIZE, %ecx + movl %esi, %eax + + movl PARAM_SRC, %esi + movl %edi, %edx + + std + + movl PARAM_DST, %edi + cmpl $UNROLL_COUNT, %ecx + + leal -4(%esi,%ecx,4), %esi + + leal -4(%edi,%ecx,4), %edi + ja L(unroll) + +L(simple): + rep + movsl + + cld + + movl %eax, %esi + movl %edx, %edi + + ret + + +L(unroll): + C if src and dst are different alignments mod8, then use rep movs + C if src and dst are both 4mod8 then process one limb to get 0mod8 + + pushl %ebx + leal (%esi,%edi), %ebx + + testb $4, %bl + popl %ebx + + jnz L(simple) + testl $4, %esi + + leal -UNROLL_COUNT(%ecx), %ecx + jnz L(already_aligned) + + movsl + + decl %ecx +L(already_aligned): + + +ifelse(UNROLL_BYTES,256,` + subl $128, %esi + subl $128, %edi +') + + C offset 0x3D here, but gets full speed without further alignment +L(top): + C eax saved esi + C ebx + C ecx counter, limbs + C edx saved edi + C esi src,
decrementing + C edi dst, decrementing + C ebp + C + C `disp' is never 0, so don't need to force 0(%esi). + +deflit(CHUNK_COUNT, 2) +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128))) + movq disp(%esi), %mm0 + movq %mm0, disp(%edi) +') + + leal -UNROLL_BYTES(%esi), %esi + subl $UNROLL_COUNT, %ecx + + leal -UNROLL_BYTES(%edi), %edi + jns L(top) + + + C now %ecx is -UNROLL_COUNT to -1 representing respectively 0 to + C UNROLL_COUNT-1 limbs remaining + + testb $eval(UNROLL_COUNT/2), %cl + + leal UNROLL_COUNT(%ecx), %ecx + jz L(not_half) + + + C at an unroll count of 32 this block of code is 16 cycles faster than + C the rep movs, less 3 or 4 to test whether to do it + +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, ` + deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128))) + movq disp(%esi), %mm0 + movq %mm0, disp(%edi) +') + + subl $eval(UNROLL_BYTES/2), %esi + subl $eval(UNROLL_BYTES/2), %edi + + subl $eval(UNROLL_COUNT/2), %ecx +L(not_half): + + +ifelse(UNROLL_BYTES,256,` + addl $128, %esi + addl $128, %edi +') + + rep + movsl + + cld + + movl %eax, %esi + movl %edx, %edi + + femms + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/k6/k62mmx/copyi.asm b/ghc/rts/gmp/mpn/x86/k6/k62mmx/copyi.asm new file mode 100644 index 0000000..215d805 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k6/k62mmx/copyi.asm @@ -0,0 +1,196 @@ +dnl AMD K6-2 mpn_copyi -- copy limb vector, incrementing. +dnl +dnl K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data +dnl alignment. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version.
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl K6-2 aligned: +dnl UNROLL_COUNT cycles/limb +dnl 8 0.75 +dnl 16 0.625 +dnl 32 0.5625 +dnl 64 0.53 +dnl Maximum possible with the current code is 64, the minimum is 2. + +deflit(UNROLL_COUNT, 32) + + +C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C The MMX loop is faster than a rep movs when src and dst are both 0mod8. +C With one 0mod8 and one 4mod8 it's 1.056 c/l and the rep movs at 1.0 c/l is +C used instead. +C +C mod8 +C src dst +C 0 0 both aligned, use mmx +C 0 4 unaligned, use rep movs +C 4 0 unaligned, use rep movs +C 4 4 do one movs, then both aligned, use mmx +C +C The MMX code on aligned data is 0.5 c/l, plus loop overhead of 2 +C cycles/loop, which is 0.0625 c/l at 32 limbs/loop. +C +C A pattern of two movq loads and two movq stores (or four and four) was +C tried, but found to be the same speed as just one of each. +C +C Note that this code only suits K6-2 and K6-3. Plain K6 does only one mmx +C instruction per cycle, so "movq"s are no faster than the simple 1 c/l rep +C movs. +C +C Enhancement: +C +C Addressing modes like disp(%esi,%ecx,4) aren't currently used. They'd +C make it possible to avoid incrementing %esi and %edi in the loop and hence +C get loop overhead down to 1 cycle. Care would be needed to avoid bad +C cache line crossings since the "movq"s would then be 5 code bytes rather +C than 4. 
+ + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + + .text + ALIGN(32) + +PROLOGUE(mpn_copyi) + movl PARAM_SIZE, %ecx + movl %esi, %eax + + movl PARAM_SRC, %esi + movl %edi, %edx + + cld + + movl PARAM_DST, %edi + cmpl $UNROLL_COUNT, %ecx + + ja L(unroll) + +L(simple): + rep + movsl + + movl %eax, %esi + movl %edx, %edi + + ret + + +L(unroll): + C if src and dst are different alignments mod8, then use rep movs + C if src and dst are both 4mod8 then process one limb to get 0mod8 + + pushl %ebx + leal (%esi,%edi), %ebx + + testb $4, %bl + popl %ebx + + jnz L(simple) + testl $4, %esi + + leal -UNROLL_COUNT(%ecx), %ecx + jz L(already_aligned) + + decl %ecx + + movsl +L(already_aligned): + + +ifelse(UNROLL_BYTES,256,` + addl $128, %esi + addl $128, %edi +') + + C this is offset 0x34, no alignment needed +L(top): + C eax saved esi + C ebx + C ecx counter, limbs + C edx saved edi + C esi src, incrementing + C edi dst, incrementing + C ebp + C + C Zdisp gets 0(%esi) left that way to avoid vector decode, and with + C 0(%edi) keeps code aligned to 16 byte boundaries. 
+ +deflit(CHUNK_COUNT, 2) +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) +Zdisp( movq, disp,(%esi), %mm0) +Zdisp( movq, %mm0, disp,(%edi)) +') + + addl $UNROLL_BYTES, %esi + subl $UNROLL_COUNT, %ecx + + leal UNROLL_BYTES(%edi), %edi + jns L(top) + + + C now %ecx is -UNROLL_COUNT to -1 representing respectively 0 to + C UNROLL_COUNT-1 limbs remaining + + testb $eval(UNROLL_COUNT/2), %cl + + leal UNROLL_COUNT(%ecx), %ecx + jz L(not_half) + + C at an unroll count of 32 this block of code is 16 cycles faster than + C the rep movs, less 3 or 4 to test whether to do it + +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, ` + deflit(`disp', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) + movq disp(%esi), %mm0 + movq %mm0, disp(%edi) +') + addl $eval(UNROLL_BYTES/2), %esi + addl $eval(UNROLL_BYTES/2), %edi + + subl $eval(UNROLL_COUNT/2), %ecx +L(not_half): + + +ifelse(UNROLL_BYTES,256,` + subl $128, %esi + subl $128, %edi +') + + rep + movsl + + movl %eax, %esi + movl %edx, %edi + + femms + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/k6/k62mmx/lshift.asm b/ghc/rts/gmp/mpn/x86/k6/k62mmx/lshift.asm new file mode 100644 index 0000000..f6d54f9 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k6/k62mmx/lshift.asm @@ -0,0 +1,286 @@ +dnl AMD K6-2 mpn_lshift -- mpn left shift. +dnl +dnl K6-2: 1.75 cycles/limb + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +dnl used after src has been fetched +define(VAR_RETVAL,`PARAM_SRC') + +dnl minimum 9, because unrolled loop can't handle less +deflit(UNROLL_THRESHOLD, 9) + + .text + ALIGN(32) + +PROLOGUE(mpn_lshift) +deflit(`FRAME',0) + + C The 1 limb case can be done without the push %ebx, but it's then + C still the same speed. The push is left as a free helping hand for + C the two_or_more code. + + movl PARAM_SIZE, %eax + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + decl %eax + + movl PARAM_SHIFT, %ecx + jnz L(two_or_more) + + movl (%ebx), %edx C src limb + movl PARAM_DST, %ebx + + shldl( %cl, %edx, %eax) C return value + + shll %cl, %edx + + movl %edx, (%ebx) C dst limb + popl %ebx + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) C avoid offset 0x1f +L(two_or_more): + C eax size-1 + C ebx src + C ecx shift + C edx + + movl (%ebx,%eax,4), %edx C src high limb + negl %ecx + + movd PARAM_SHIFT, %mm6 + addl $32, %ecx C 32-shift + + shrl %cl, %edx + cmpl $UNROLL_THRESHOLD-1, %eax + + movl %edx, VAR_RETVAL + jae L(unroll) + + + movd %ecx, %mm7 + movl %eax, %ecx + + movl PARAM_DST, %eax + +L(simple): + C eax dst + C ebx src + C ecx counter, size-1 to 1 + C edx retval + C + C mm0 scratch + C mm6 shift + C mm7 32-shift + + movq -4(%ebx,%ecx,4), %mm0 + + psrlq %mm7, %mm0 + +Zdisp( movd, %mm0, 0,(%eax,%ecx,4)) + loop L(simple) + + + movd (%ebx), %mm0 + popl 
%ebx + + psllq %mm6, %mm0 + + movd %mm0, (%eax) + movl %edx, %eax + + femms + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll): + C eax size-1 + C ebx src + C ecx 32-shift + C edx retval (but instead VAR_RETVAL is used) + C + C mm6 shift + + addl $32, %ecx + movl PARAM_DST, %edx + + movd %ecx, %mm7 + subl $7, %eax C size-8 + + leal (%edx,%eax,4), %ecx C alignment of dst + + movq 32-8(%ebx,%eax,4), %mm2 C src high qword + testb $4, %cl + + jz L(dst_aligned) + psllq %mm6, %mm2 + + psrlq $32, %mm2 + decl %eax + + movd %mm2, 32(%edx,%eax,4) C dst high limb + movq 32-8(%ebx,%eax,4), %mm2 C new src high qword +L(dst_aligned): + + movq 32-16(%ebx,%eax,4), %mm0 C src second highest qword + + + C This loop is the important bit, the rest is just support for it. + C Four src limbs are held at the start, and four more will be read. + C Four dst limbs will be written. This schedule seems necessary for + C full speed. + C + C The use of size-8 lets the loop stop when %eax goes negative and + C leaves -4 to -1 which can be tested with test $1 and $2. + +L(top): + C eax counter, size-8 step by -4 until <0 + C ebx src + C ecx + C edx dst + C + C mm0 src next qword + C mm1 scratch + C mm2 src prev qword + C mm6 shift + C mm7 64-shift + + psllq %mm6, %mm2 + subl $4, %eax + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + por %mm0, %mm2 + movq 24(%ebx,%eax,4), %mm0 + + psllq %mm6, %mm1 + movq %mm2, 40(%edx,%eax,4) + + movq %mm0, %mm2 + psrlq %mm7, %mm0 + + por %mm0, %mm1 + movq 16(%ebx,%eax,4), %mm0 + + movq %mm1, 32(%edx,%eax,4) + jnc L(top) + + + C Now have four limbs in mm2 (prev) and mm0 (next), plus eax mod 4. + C + C 8(%ebx) is the next source, and 24(%edx) is the next destination. + C %eax is between -4 and -1, representing respectively 0 to 3 extra + C limbs that must be read. 
+ + + testl $2, %eax C testl to avoid bad cache line crossing + jz L(finish_nottwo) + + C Two more limbs: lshift mm2, OR it with rshifted mm0, mm0 becomes + C new mm2 and a new mm0 is loaded. + + psllq %mm6, %mm2 + movq %mm0, %mm1 + + psrlq %mm7, %mm0 + subl $2, %eax + + por %mm0, %mm2 + movq 16(%ebx,%eax,4), %mm0 + + movq %mm2, 32(%edx,%eax,4) + movq %mm1, %mm2 +L(finish_nottwo): + + + C lshift mm2, OR with rshifted mm0, mm1 becomes lshifted mm0 + + testb $1, %al + psllq %mm6, %mm2 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + por %mm0, %mm2 + psllq %mm6, %mm1 + + movq %mm2, 24(%edx,%eax,4) + jz L(finish_even) + + + C Size is odd, so mm1 and one extra limb to process. + + movd (%ebx), %mm0 C src[0] + popl %ebx +deflit(`FRAME',0) + + movq %mm0, %mm2 + psllq $32, %mm0 + + psrlq %mm7, %mm0 + + psllq %mm6, %mm2 + por %mm0, %mm1 + + movq %mm1, 4(%edx) C dst[1,2] + movd %mm2, (%edx) C dst[0] + + movl VAR_RETVAL, %eax + + femms + ret + + + nop C avoid bad cache line crossing +L(finish_even): +deflit(`FRAME',4) + C Size is even, so only mm1 left to process. + + movq %mm1, (%edx) C dst[0,1] + movl VAR_RETVAL, %eax + + popl %ebx + femms + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/k6/k62mmx/rshift.asm b/ghc/rts/gmp/mpn/x86/k6/k62mmx/rshift.asm new file mode 100644 index 0000000..8a8c144 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k6/k62mmx/rshift.asm @@ -0,0 +1,285 @@ +dnl AMD K6-2 mpn_rshift -- mpn right shift. +dnl +dnl K6-2: 1.75 cycles/limb + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +dnl Minimum 9, because the unrolled loop can't handle less. +dnl +deflit(UNROLL_THRESHOLD, 9) + + .text + ALIGN(32) + +PROLOGUE(mpn_rshift) +deflit(`FRAME',0) + + C The 1 limb case can be done without the push %ebx, but it's then + C still the same speed. The push is left as a free helping hand for + C the two_or_more code. 
+ + movl PARAM_SIZE, %eax + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + decl %eax + + movl PARAM_SHIFT, %ecx + jnz L(two_or_more) + + movl (%ebx), %edx C src limb + movl PARAM_DST, %ebx + + shrdl( %cl, %edx, %eax) C return value + + shrl %cl, %edx + + movl %edx, (%ebx) C dst limb + popl %ebx + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) C avoid offset 0x1f +L(two_or_more): + C eax size-1 + C ebx src + C ecx shift + C edx + + movl (%ebx), %edx C src low limb + negl %ecx + + addl $32, %ecx + movd PARAM_SHIFT, %mm6 + + shll %cl, %edx + cmpl $UNROLL_THRESHOLD-1, %eax + + jae L(unroll) + + + C eax size-1 + C ebx src + C ecx 32-shift + C edx retval + C + C mm6 shift + + movl PARAM_DST, %ecx + leal (%ebx,%eax,4), %ebx + + leal -4(%ecx,%eax,4), %ecx + negl %eax + + C This loop runs at about 3 cycles/limb, which is the amount of + C decoding, and this is despite every second access being unaligned. + +L(simple): + C eax counter, -(size-1) to -1 + C ebx &src[size-1] + C ecx &dst[size-1] + C edx retval + C + C mm0 scratch + C mm6 shift + +Zdisp( movq, 0,(%ebx,%eax,4), %mm0) + incl %eax + + psrlq %mm6, %mm0 + +Zdisp( movd, %mm0, 0,(%ecx,%eax,4)) + jnz L(simple) + + + movq %mm0, (%ecx) + movl %edx, %eax + + popl %ebx + + femms + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll): + C eax size-1 + C ebx src + C ecx 32-shift + C edx retval + C + C mm6 shift + + addl $32, %ecx + subl $7, %eax C size-8 + + movd %ecx, %mm7 + movl PARAM_DST, %ecx + + movq (%ebx), %mm2 C src low qword + leal (%ebx,%eax,4), %ebx C src end - 32 + + testb $4, %cl + leal (%ecx,%eax,4), %ecx C dst end - 32 + + notl %eax C -(size-7) + jz L(dst_aligned) + + psrlq %mm6, %mm2 + incl %eax + +Zdisp( movd, %mm2, 0,(%ecx,%eax,4)) C dst low limb + movq 4(%ebx,%eax,4), %mm2 C new src low qword +L(dst_aligned): + + movq 12(%ebx,%eax,4), %mm0 C src second lowest qword + nop C avoid bad cache 
line crossing + + + C This loop is the important bit, the rest is just support for it. + C Four src limbs are held at the start, and four more will be read. + C Four dst limbs will be written. This schedule seems necessary for + C full speed. + C + C The use of -(size-7) lets the loop stop when %eax becomes >= 0 and + C leaves 0 to 3 which can be tested with test $1 and $2. + +L(top): + C eax counter, -(size-7) step by +4 until >=0 + C ebx src end - 32 + C ecx dst end - 32 + C edx retval + C + C mm0 src next qword + C mm1 scratch + C mm2 src prev qword + C mm6 shift + C mm7 64-shift + + psrlq %mm6, %mm2 + addl $4, %eax + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + por %mm0, %mm2 + movq 4(%ebx,%eax,4), %mm0 + + psrlq %mm6, %mm1 + movq %mm2, -12(%ecx,%eax,4) + + movq %mm0, %mm2 + psllq %mm7, %mm0 + + por %mm0, %mm1 + movq 12(%ebx,%eax,4), %mm0 + + movq %mm1, -4(%ecx,%eax,4) + ja L(top) C jump if no carry and not zero + + + + C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0 + C to 3 representing respectively 3 to 0 further limbs. + + testl $2, %eax C testl to avoid bad cache line crossings + jnz L(finish_nottwo) + + C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0 + C becomes new mm2 and a new mm0 is loaded. 
+ + psrlq %mm6, %mm2 + movq %mm0, %mm1 + + psllq %mm7, %mm0 + addl $2, %eax + + por %mm0, %mm2 + movq 12(%ebx,%eax,4), %mm0 + + movq %mm2, -4(%ecx,%eax,4) + movq %mm1, %mm2 +L(finish_nottwo): + + + testb $1, %al + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + por %mm0, %mm2 + psrlq %mm6, %mm1 + + movq %mm2, 4(%ecx,%eax,4) + jnz L(finish_even) + + + C one further extra limb to process + + movd 32-4(%ebx), %mm0 C src[size-1], most significant limb + popl %ebx + + movq %mm0, %mm2 + psllq %mm7, %mm0 + + por %mm0, %mm1 + psrlq %mm6, %mm2 + + movq %mm1, 32-12(%ecx) C dst[size-3,size-2] + movd %mm2, 32-4(%ecx) C dst[size-1] + + movl %edx, %eax C retval + + femms + ret + + + nop C avoid bad cache line crossing +L(finish_even): + C no further extra limbs + + movq %mm1, 32-8(%ecx) C dst[size-2,size-1] + movl %edx, %eax C retval + + popl %ebx + + femms + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/k6/mmx/com_n.asm b/ghc/rts/gmp/mpn/x86/k6/mmx/com_n.asm new file mode 100644 index 0000000..8915080 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k6/mmx/com_n.asm @@ -0,0 +1,91 @@ +dnl AMD K6-2 mpn_com_n -- mpn bitwise one's complement. +dnl +dnl alignment dst/src, A=0mod8 N=4mod8 +dnl A/A A/N N/A N/N +dnl K6-2 1.0 1.18 1.18 1.18 cycles/limb +dnl K6 1.5 1.85 1.75 1.85 + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. 
+dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C void mpn_com_n (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Take the bitwise ones-complement of src,size and write it to dst,size. + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(32) +PROLOGUE(mpn_com_n) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl PARAM_SRC, %eax + movl PARAM_DST, %edx + shrl %ecx + jnz L(two_or_more) + + movl (%eax), %eax + notl %eax + movl %eax, (%edx) + ret + + +L(two_or_more): + pushl %ebx +FRAME_pushl() + movl %ecx, %ebx + + pcmpeqd %mm7, %mm7 C all ones + + + ALIGN(16) +L(top): + C eax src + C ebx floor(size/2) + C ecx counter + C edx dst + C esi + C edi + C ebp + + movq -8(%eax,%ecx,8), %mm0 + pxor %mm7, %mm0 + movq %mm0, -8(%edx,%ecx,8) + loop L(top) + + + jnc L(no_extra) + movl (%eax,%ebx,8), %eax + notl %eax + movl %eax, (%edx,%ebx,8) +L(no_extra): + + popl %ebx + emms_or_femms + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/k6/mmx/logops_n.asm b/ghc/rts/gmp/mpn/x86/k6/mmx/logops_n.asm new file mode 100644 index 0000000..46cb3b7 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k6/mmx/logops_n.asm @@ -0,0 +1,212 @@ +dnl AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n, +dnl mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations. 
+dnl +dnl alignment dst/src1/src2, A=0mod8, N=4mod8 +dnl A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N +dnl +dnl K6-2 1.2 1.5 1.5 1.2 1.2 1.5 1.5 1.2 and,andn,ior,xor +dnl K6-2 1.5 1.75 2.0 1.75 1.75 2.0 1.75 1.5 iorn,xnor +dnl K6-2 1.75 2.0 2.0 2.0 2.0 2.0 2.0 1.75 nand,nior +dnl +dnl K6 1.5 1.68 1.75 1.2 1.75 1.75 1.68 1.5 and,andn,ior,xor +dnl K6 2.0 2.0 2.25 2.25 2.25 2.25 2.0 2.0 iorn,xnor +dnl K6 2.0 2.25 2.25 2.25 2.25 2.25 2.25 2.0 nand,nior + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
+ + +include(`../config.m4') + + +dnl M4_p and M4_i are the MMX and integer instructions +dnl M4_*_neg_dst means whether to negate the final result before writing +dnl M4_*_neg_src2 means whether to negate the src2 values before using them + +define(M4_choose_op, +m4_assert_numargs(7) +`ifdef(`OPERATION_$1',` +define(`M4_function', `mpn_$1') +define(`M4_operation', `$1') +define(`M4_p', `$2') +define(`M4_p_neg_dst', `$3') +define(`M4_p_neg_src2',`$4') +define(`M4_i', `$5') +define(`M4_i_neg_dst', `$6') +define(`M4_i_neg_src2',`$7') +')') + +dnl xnor is done in "iorn" style because it's a touch faster than "nior" +dnl style (the two are equivalent for xor). + +M4_choose_op( and_n, pand,0,0, andl,0,0) +M4_choose_op( andn_n, pandn,0,0, andl,0,1) +M4_choose_op( nand_n, pand,1,0, andl,1,0) +M4_choose_op( ior_n, por,0,0, orl,0,0) +M4_choose_op( iorn_n, por,0,1, orl,0,1) +M4_choose_op( nior_n, por,1,0, orl,1,0) +M4_choose_op( xor_n, pxor,0,0, xorl,0,0) +M4_choose_op( xnor_n, pxor,0,1, xorl,0,1) + +ifdef(`M4_function',, +`m4_error(`Unrecognised or undefined OPERATION symbol +')') + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + + +C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C +C Do src1,size M4_operation src2,size, storing the result in dst,size. +C +C Unaligned movq loads and stores are a bit slower than aligned ones. The +C test at the start of the routine checks the alignment of src1 and if +C necessary processes one limb separately at the low end to make it aligned. +C +C The raw speeds without this alignment switch are as follows. +C +C alignment dst/src1/src2, A=0mod8, N=4mod8 +C A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N +C +C K6 1.5 2.0 1.5 2.0 and,andn,ior,xor +C K6 1.75 2.2 2.0 2.28 iorn,xnor +C K6 2.0 2.25 2.35 2.28 nand,nior +C +C +C Future: +C +C K6 can do one 64-bit load per cycle so each of these routines should be +C able to approach 1.0 c/l, if aligned. 
The basic and/andn/ior/xor might be +C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs. +C The others are 4 instructions per 2 limbs, and so can only approach 1.0 +C because there's nowhere to hide some loop control. + +defframe(PARAM_SIZE,16) +defframe(PARAM_SRC2,12) +defframe(PARAM_SRC1,8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + + .text + ALIGN(32) +PROLOGUE(M4_function) + movl PARAM_SIZE, %ecx + pushl %ebx + FRAME_pushl() + movl PARAM_SRC1, %eax + movl PARAM_SRC2, %ebx + cmpl $1, %ecx + movl PARAM_DST, %edx + ja L(two_or_more) + + + movl (%ebx), %ecx + popl %ebx +ifelse(M4_i_neg_src2,1,`notl %ecx') + M4_i (%eax), %ecx +ifelse(M4_i_neg_dst,1,` notl %ecx') + movl %ecx, (%edx) + + ret + + +L(two_or_more): + C eax src1 + C ebx src2 + C ecx size + C edx dst + C esi + C edi + C ebp + C + C carry bit is low of size + + pushl %esi + FRAME_pushl() + testl $4, %eax + jz L(alignment_ok) + + movl (%ebx), %esi + addl $4, %ebx +ifelse(M4_i_neg_src2,1,`notl %esi') + M4_i (%eax), %esi + addl $4, %eax +ifelse(M4_i_neg_dst,1,` notl %esi') + movl %esi, (%edx) + addl $4, %edx + decl %ecx + +L(alignment_ok): + movl %ecx, %esi + shrl %ecx + jnz L(still_two_or_more) + + movl (%ebx), %ecx + popl %esi +ifelse(M4_i_neg_src2,1,`notl %ecx') + M4_i (%eax), %ecx +ifelse(M4_i_neg_dst,1,` notl %ecx') + popl %ebx + movl %ecx, (%edx) + ret + + +L(still_two_or_more): +ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,` + pcmpeqd %mm7, %mm7 C all ones +') + + ALIGN(16) +L(top): + C eax src1 + C ebx src2 + C ecx counter + C edx dst + C esi + C edi + C ebp + C + C carry bit is low of size + + movq -8(%ebx,%ecx,8), %mm0 +ifelse(M4_p_neg_src2,1,`pxor %mm7, %mm0') + M4_p -8(%eax,%ecx,8), %mm0 +ifelse(M4_p_neg_dst,1,` pxor %mm7, %mm0') + movq %mm0, -8(%edx,%ecx,8) + + loop L(top) + + + jnc L(no_extra) + + movl -4(%ebx,%esi,4), %ebx +ifelse(M4_i_neg_src2,1,`notl %ebx') + M4_i -4(%eax,%esi,4), %ebx +ifelse(M4_i_neg_dst,1,` notl %ebx') + movl %ebx, -4(%edx,%esi,4) +L(no_extra): + 
+ popl %esi + popl %ebx + emms_or_femms + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/k6/mmx/lshift.asm b/ghc/rts/gmp/mpn/x86/k6/mmx/lshift.asm new file mode 100644 index 0000000..f1dc83d --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k6/mmx/lshift.asm @@ -0,0 +1,122 @@ +dnl AMD K6 mpn_lshift -- mpn left shift. +dnl +dnl K6: 3.0 cycles/limb + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx +C instructions. This is despite every second fetch being unaligned. + + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(32) + +PROLOGUE(mpn_lshift) +deflit(`FRAME',0) + + C The 1 limb case can be done without the push %ebx, but it's then + C still the same speed. The push is left as a free helping hand for + C the two_or_more code. 
+ + movl PARAM_SIZE, %eax + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + decl %eax + + movl PARAM_SHIFT, %ecx + jnz L(two_or_more) + + movl (%ebx), %edx C src limb + movl PARAM_DST, %ebx + + shldl( %cl, %edx, %eax) C return value + + shll %cl, %edx + + movl %edx, (%ebx) C dst limb + popl %ebx + + ret + + + ALIGN(16) C avoid offset 0x1f + nop C avoid bad cache line crossing +L(two_or_more): + C eax size-1 + C ebx src + C ecx shift + C edx + + movl (%ebx,%eax,4), %edx C src high limb + negl %ecx + + movd PARAM_SHIFT, %mm6 + addl $32, %ecx C 32-shift + + shrl %cl, %edx + + movd %ecx, %mm7 + movl PARAM_DST, %ecx + +L(top): + C eax counter, size-1 to 1 + C ebx src + C ecx dst + C edx retval + C + C mm0 scratch + C mm6 shift + C mm7 32-shift + + movq -4(%ebx,%eax,4), %mm0 + decl %eax + + psrlq %mm7, %mm0 + + movd %mm0, 4(%ecx,%eax,4) + jnz L(top) + + + movd (%ebx), %mm0 + popl %ebx + + psllq %mm6, %mm0 + movl %edx, %eax + + movd %mm0, (%ecx) + + emms + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/k6/mmx/popham.asm b/ghc/rts/gmp/mpn/x86/k6/mmx/popham.asm new file mode 100644 index 0000000..70efb80 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k6/mmx/popham.asm @@ -0,0 +1,238 @@ +dnl AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and +dnl hamming distance. +dnl +dnl popcount hamdist +dnl K6-2: 9.0 11.5 cycles/limb +dnl K6: 12.5 13.0 + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size); +C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size); +C +C The code here isn't optimal, but it's already a 2x speedup over the plain +C integer mpn/generic/popcount.c,hamdist.c. + + +ifdef(`OPERATION_popcount',, +`ifdef(`OPERATION_hamdist',, +`m4_error(`Need OPERATION_popcount or OPERATION_hamdist +')m4exit(1)')') + +define(HAM, +m4_assert_numargs(1) +`ifdef(`OPERATION_hamdist',`$1')') + +define(POP, +m4_assert_numargs(1) +`ifdef(`OPERATION_popcount',`$1')') + +HAM(` +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC2, 8) +defframe(PARAM_SRC, 4) +define(M4_function,mpn_hamdist) +') +POP(` +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) +define(M4_function,mpn_popcount) +') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) + + +ifdef(`PIC',,` + dnl non-PIC + + .section .rodata + ALIGN(8) + +define(LS, +m4_assert_numargs(1) +`LF(M4_function,`$1')') + +LS(rodata_AAAAAAAAAAAAAAAA): + .long 0xAAAAAAAA + .long 0xAAAAAAAA + +LS(rodata_3333333333333333): + .long 0x33333333 + .long 0x33333333 + +LS(rodata_0F0F0F0F0F0F0F0F): + .long 0x0F0F0F0F + .long 0x0F0F0F0F + +LS(rodata_000000FF000000FF): + .long 0x000000FF + .long 0x000000FF +') + + .text + ALIGN(32) + +POP(`ifdef(`PIC', ` + C avoid shrl crossing a 32-byte boundary + nop')') + +PROLOGUE(M4_function) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + orl %ecx, %ecx + jz L(zero) + +ifdef(`PIC',` + movl $0xAAAAAAAA, %eax + movl $0x33333333, %edx + + movd %eax, %mm7 + movd %edx, %mm6 + + movl $0x0F0F0F0F, %eax + movl $0x000000FF, %edx + + punpckldq %mm7, %mm7 + punpckldq %mm6, %mm6 + + movd 
%eax, %mm5 + movd %edx, %mm4 + + punpckldq %mm5, %mm5 + punpckldq %mm4, %mm4 +',` + + movq LS(rodata_AAAAAAAAAAAAAAAA), %mm7 + movq LS(rodata_3333333333333333), %mm6 + movq LS(rodata_0F0F0F0F0F0F0F0F), %mm5 + movq LS(rodata_000000FF000000FF), %mm4 +') + +define(REG_AAAAAAAAAAAAAAAA, %mm7) +define(REG_3333333333333333, %mm6) +define(REG_0F0F0F0F0F0F0F0F, %mm5) +define(REG_000000FF000000FF, %mm4) + + + movl PARAM_SRC, %eax +HAM(` movl PARAM_SRC2, %edx') + + pxor %mm2, %mm2 C total + + shrl %ecx + jnc L(top) + +Zdisp( movd, 0,(%eax,%ecx,8), %mm1) + +HAM(` +Zdisp( movd, 0,(%edx,%ecx,8), %mm0) + pxor %mm0, %mm1 +') + + incl %ecx + jmp L(loaded) + + + ALIGN(16) +POP(` nop C alignment to avoid crossing 32-byte boundaries') + +L(top): + C eax src + C ebx + C ecx counter, qwords, decrementing + C edx [hamdist] src2 + C + C mm0 (scratch) + C mm1 (scratch) + C mm2 total (low dword) + C mm3 + C mm4 \ + C mm5 | special constants + C mm6 | + C mm7 / + + movq -8(%eax,%ecx,8), %mm1 +HAM(` pxor -8(%edx,%ecx,8), %mm1') + +L(loaded): + movq %mm1, %mm0 + pand REG_AAAAAAAAAAAAAAAA, %mm1 + + psrlq $1, %mm1 +HAM(` nop C code alignment') + + psubd %mm1, %mm0 C bit pairs +HAM(` nop C code alignment') + + + movq %mm0, %mm1 + psrlq $2, %mm0 + + pand REG_3333333333333333, %mm0 + pand REG_3333333333333333, %mm1 + + paddd %mm1, %mm0 C nibbles + + + movq %mm0, %mm1 + psrlq $4, %mm0 + + pand REG_0F0F0F0F0F0F0F0F, %mm0 + pand REG_0F0F0F0F0F0F0F0F, %mm1 + + paddd %mm1, %mm0 C bytes + + movq %mm0, %mm1 + psrlq $8, %mm0 + + + paddb %mm1, %mm0 C words + + + movq %mm0, %mm1 + psrlq $16, %mm0 + + paddd %mm1, %mm0 C dwords + + pand REG_000000FF000000FF, %mm0 + + paddd %mm0, %mm2 C low to total + psrlq $32, %mm0 + + paddd %mm0, %mm2 C high to total + loop L(top) + + + + movd %mm2, %eax + emms_or_femms + ret + +L(zero): + movl $0, %eax + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/k6/mmx/rshift.asm b/ghc/rts/gmp/mpn/x86/k6/mmx/rshift.asm new file mode 100644 index 0000000..cc5948f --- /dev/null +++ 
b/ghc/rts/gmp/mpn/x86/k6/mmx/rshift.asm @@ -0,0 +1,122 @@ +dnl AMD K6 mpn_rshift -- mpn right shift. +dnl +dnl K6: 3.0 cycles/limb + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx +C instructions. This is despite every second fetch being unaligned. + + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + + .text + ALIGN(32) + +PROLOGUE(mpn_rshift) +deflit(`FRAME',0) + + C The 1 limb case can be done without the push %ebx, but it's then + C still the same speed. The push is left as a free helping hand for + C the two_or_more code. 
+ + movl PARAM_SIZE, %eax + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + decl %eax + + movl PARAM_SHIFT, %ecx + jnz L(two_or_more) + + movl (%ebx), %edx C src limb + movl PARAM_DST, %ebx + + shrdl( %cl, %edx, %eax) C return value + + shrl %cl, %edx + + movl %edx, (%ebx) C dst limb + popl %ebx + + ret + + + ALIGN(16) C avoid offset 0x1f +L(two_or_more): + C eax size-1 + C ebx src + C ecx shift + C edx + + movl (%ebx), %edx C src low limb + negl %ecx + + addl $32, %ecx C 32-shift + movd PARAM_SHIFT, %mm6 + + shll %cl, %edx C retval + movl PARAM_DST, %ecx + + leal (%ebx,%eax,4), %ebx + + leal -4(%ecx,%eax,4), %ecx + negl %eax + + +L(simple): + C eax counter (negative) + C ebx &src[size-1] + C ecx &dst[size-2] + C edx retval + C + C mm0 scratch + C mm6 shift + +Zdisp( movq, 0,(%ebx,%eax,4), %mm0) + incl %eax + + psrlq %mm6, %mm0 + +Zdisp( movd, %mm0, 0,(%ecx,%eax,4)) + jnz L(simple) + + + movq %mm0, (%ecx) + movl %edx, %eax + + popl %ebx + + emms + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/k6/mul_1.asm b/ghc/rts/gmp/mpn/x86/k6/mul_1.asm new file mode 100644 index 0000000..c2220fe --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k6/mul_1.asm @@ -0,0 +1,272 @@ +dnl AMD K6 mpn_mul_1 -- mpn by limb multiply. +dnl +dnl K6: 6.25 cycles/limb. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details.
+dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier); +C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier, mp_limb_t carry); +C +C Multiply src,size by mult and store the result in dst,size. +C Return the carry limb from the top of the result. +C +C mpn_mul_1c() accepts an initial carry for the calculation, it's added into +C the low limb of the result. + +defframe(PARAM_CARRY, 20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl minimum 5 because the unrolled code can't handle less +deflit(UNROLL_THRESHOLD, 5) + + .text + ALIGN(32) + +PROLOGUE(mpn_mul_1c) + pushl %esi +deflit(`FRAME',4) + movl PARAM_CARRY, %esi + jmp LF(mpn_mul_1,start_nc) +EPILOGUE() + + +PROLOGUE(mpn_mul_1) + push %esi +deflit(`FRAME',4) + xorl %esi, %esi C initial carry + +L(start_nc): + mov PARAM_SIZE, %ecx + push %ebx +FRAME_pushl() + + movl PARAM_SRC, %ebx + push %edi +FRAME_pushl() + + movl PARAM_DST, %edi + pushl %ebp +FRAME_pushl() + + cmpl $UNROLL_THRESHOLD, %ecx + movl PARAM_MULTIPLIER, %ebp + + jae L(unroll) + + + C code offset 0x22 here, close enough to aligned +L(simple): + C eax scratch + C ebx src + C ecx counter + C edx scratch + C esi carry + C edi dst + C ebp multiplier + C + C this loop 8 cycles/limb + + movl (%ebx), %eax + addl $4, %ebx + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, (%edi) + addl $4, %edi + + loop L(simple) + + + popl %ebp + + popl %edi + popl %ebx + + movl %esi, %eax + popl %esi + + ret + + +C ----------------------------------------------------------------------------- +C The code for each 
limb is 6 cycles, with instruction decoding being the +C limiting factor. At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25 +C cycles/limb in total. +C +C The secret ingredient to get 6.25 is to start the loop with the mul and +C have the load/store pair at the end. Rotating the load/store to the top +C is an 0.5 c/l slowdown. (Some address generation effect probably.) +C +C The whole unrolled loop fits nicely in exactly 80 bytes. + + + ALIGN(16) C already aligned to 16 here actually +L(unroll): + movl (%ebx), %eax + leal -16(%ebx,%ecx,4), %ebx + + leal -16(%edi,%ecx,4), %edi + subl $4, %ecx + + negl %ecx + + + ALIGN(16) C one byte nop for this alignment +L(top): + C eax scratch + C ebx &src[size-4] + C ecx counter + C edx scratch + C esi carry + C edi &dst[size-4] + C ebp multiplier + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, (%edi,%ecx,4) + movl 4(%ebx,%ecx,4), %eax + + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, 4(%edi,%ecx,4) + movl 8(%ebx,%ecx,4), %eax + + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, 8(%edi,%ecx,4) + movl 12(%ebx,%ecx,4), %eax + + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, 12(%edi,%ecx,4) + movl 16(%ebx,%ecx,4), %eax + + + addl $4, %ecx + js L(top) + + + + C eax next src limb + C ebx &src[size-4] + C ecx 0 to 3 representing respectively 4 to 1 further limbs + C edx + C esi carry + C edi &dst[size-4] + + testb $2, %cl + jnz L(finish_not_two) + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, (%edi,%ecx,4) + movl 4(%ebx,%ecx,4), %eax + + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, 4(%edi,%ecx,4) + movl 8(%ebx,%ecx,4), %eax + + addl $2, %ecx +L(finish_not_two): + + + testb $1, %cl + jnz L(finish_not_one) + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, 8(%edi) + movl 12(%ebx), 
%eax +L(finish_not_one): + + + mull %ebp + + addl %esi, %eax + popl %ebp + + adcl $0, %edx + + movl %eax, 12(%edi) + popl %edi + + popl %ebx + movl %edx, %eax + + popl %esi + + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/k6/mul_basecase.asm b/ghc/rts/gmp/mpn/x86/k6/mul_basecase.asm new file mode 100644 index 0000000..1f5a3a4 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k6/mul_basecase.asm @@ -0,0 +1,600 @@ +dnl AMD K6 mpn_mul_basecase -- multiply two mpn numbers. +dnl +dnl K6: approx 9.0 cycles per cross product on 30x30 limbs (with 16 limbs/loop +dnl unrolling). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl K6: UNROLL_COUNT cycles/product (approx) +dnl 8 9.75 +dnl 16 9.3 +dnl 32 9.3 +dnl Maximum possible with the current code is 32. +dnl +dnl With 16 the inner unrolled loop fits exactly in a 256 byte block, which +dnl might explain its good performance.
+ +deflit(UNROLL_COUNT, 16) + + +C void mpn_mul_basecase (mp_ptr wp, +C mp_srcptr xp, mp_size_t xsize, +C mp_srcptr yp, mp_size_t ysize); +C +C Calculate xp,xsize multiplied by yp,ysize, storing the result in +C wp,xsize+ysize. +C +C This routine is essentially the same as mpn/generic/mul_basecase.c, but +C it's faster because it does most of the mpn_addmul_1() entry code only +C once. The saving is about 10-20% on typical sizes coming from the +C Karatsuba multiply code. +C +C Future: +C +C The unrolled loop could be shared by mpn_addmul_1, with some extra stack +C setups and maybe 2 or 3 wasted cycles at the end. Code saving would be +C 256 bytes. + +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 8) +',` +deflit(UNROLL_THRESHOLD, 8) +') + +defframe(PARAM_YSIZE,20) +defframe(PARAM_YP, 16) +defframe(PARAM_XSIZE,12) +defframe(PARAM_XP, 8) +defframe(PARAM_WP, 4) + + .text + ALIGN(32) +PROLOGUE(mpn_mul_basecase) +deflit(`FRAME',0) + + movl PARAM_XSIZE, %ecx + movl PARAM_YP, %eax + + movl PARAM_XP, %edx + movl (%eax), %eax C yp low limb + + cmpl $2, %ecx + ja L(xsize_more_than_two_limbs) + je L(two_by_something) + + + C one limb by one limb + + movl (%edx), %edx C xp low limb + movl PARAM_WP, %ecx + + mull %edx + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + ret + + +C ----------------------------------------------------------------------------- +L(two_by_something): + decl PARAM_YSIZE + pushl %ebx +deflit(`FRAME',4) + + movl PARAM_WP, %ebx + pushl %esi +deflit(`FRAME',8) + + movl %eax, %ecx C yp low limb + movl (%edx), %eax C xp low limb + + movl %edx, %esi C xp + jnz L(two_by_two) + + + C two limbs by one limb + + mull %ecx + + movl %eax, (%ebx) + movl 4(%esi), %eax + + movl %edx, %esi C carry + + mull %ecx + + addl %eax, %esi + movl %esi, 4(%ebx) + + adcl $0, %edx + + movl %edx, 8(%ebx) + popl %esi + + popl %ebx + ret + + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(two_by_two): + C eax xp low limb + C ebx wp + C ecx yp low 
limb + C edx + C esi xp + C edi + C ebp +deflit(`FRAME',8) + + mull %ecx C xp[0] * yp[0] + + push %edi +deflit(`FRAME',12) + movl %eax, (%ebx) + + movl 4(%esi), %eax + movl %edx, %edi C carry, for wp[1] + + mull %ecx C xp[1] * yp[0] + + addl %eax, %edi + movl PARAM_YP, %ecx + + adcl $0, %edx + + movl %edi, 4(%ebx) + movl 4(%ecx), %ecx C yp[1] + + movl 4(%esi), %eax C xp[1] + movl %edx, %edi C carry, for wp[2] + + mull %ecx C xp[1] * yp[1] + + addl %eax, %edi + + adcl $0, %edx + + movl (%esi), %eax C xp[0] + movl %edx, %esi C carry, for wp[3] + + mull %ecx C xp[0] * yp[1] + + addl %eax, 4(%ebx) + adcl %edx, %edi + adcl $0, %esi + + movl %edi, 8(%ebx) + popl %edi + + movl %esi, 12(%ebx) + popl %esi + + popl %ebx + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(xsize_more_than_two_limbs): + +C The first limb of yp is processed with a simple mpn_mul_1 style loop +C inline. Unrolling this doesn't seem worthwhile since it's only run once +C (whereas the addmul below is run ysize-1 many times). A call to the +C actual mpn_mul_1 will be slowed down by the call and parameter pushing and +C popping, and doesn't seem likely to be worthwhile on the typical 10-20 +C limb operations the Karatsuba code calls here with. 
+ + C eax yp[0] + C ebx + C ecx xsize + C edx xp + C esi + C edi + C ebp +deflit(`FRAME',0) + + pushl %edi defframe_pushl(SAVE_EDI) + pushl %ebp defframe_pushl(SAVE_EBP) + + movl PARAM_WP, %edi + pushl %esi defframe_pushl(SAVE_ESI) + + movl %eax, %ebp + pushl %ebx defframe_pushl(SAVE_EBX) + + leal (%edx,%ecx,4), %ebx C xp end + xorl %esi, %esi + + leal (%edi,%ecx,4), %edi C wp end of mul1 + negl %ecx + + +L(mul1): + C eax scratch + C ebx xp end + C ecx counter, negative + C edx scratch + C esi carry + C edi wp end of mul1 + C ebp multiplier + + movl (%ebx,%ecx,4), %eax + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, (%edi,%ecx,4) + incl %ecx + + jnz L(mul1) + + + movl PARAM_YSIZE, %edx + movl %esi, (%edi) C final carry + + movl PARAM_XSIZE, %ecx + decl %edx + + jnz L(ysize_more_than_one_limb) + + popl %ebx + popl %esi + popl %ebp + popl %edi + ret + + +L(ysize_more_than_one_limb): + cmpl $UNROLL_THRESHOLD, %ecx + movl PARAM_YP, %eax + + jae L(unroll) + + +C ----------------------------------------------------------------------------- +C Simple addmul loop. +C +C Using ebx and edi pointing at the ends of their respective locations saves +C a couple of instructions in the outer loop. The inner loop is still 11 +C cycles, the same as the simple loop in aorsmul_1.asm. 
+ + C eax yp + C ebx xp end + C ecx xsize + C edx ysize-1 + C esi + C edi wp end of mul1 + C ebp + + movl 4(%eax), %ebp C multiplier + negl %ecx + + movl %ecx, PARAM_XSIZE C -xsize + xorl %esi, %esi C initial carry + + leal 4(%eax,%edx,4), %eax C yp end + negl %edx + + movl %eax, PARAM_YP + movl %edx, PARAM_YSIZE + + jmp L(simple_outer_entry) + + + C aligning here saves a couple of cycles + ALIGN(16) +L(simple_outer_top): + C edx ysize counter, negative + + movl PARAM_YP, %eax C yp end + xorl %esi, %esi C carry + + movl PARAM_XSIZE, %ecx C -xsize + movl %edx, PARAM_YSIZE + + movl (%eax,%edx,4), %ebp C yp limb multiplier +L(simple_outer_entry): + addl $4, %edi + + +L(simple_inner): + C eax scratch + C ebx xp end + C ecx counter, negative + C edx scratch + C esi carry + C edi wp end of this addmul + C ebp multiplier + + movl (%ebx,%ecx,4), %eax + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl $0, %edx + addl %eax, (%edi,%ecx,4) + adcl %edx, %esi + + incl %ecx + jnz L(simple_inner) + + + movl PARAM_YSIZE, %edx + movl %esi, (%edi) + + incl %edx + jnz L(simple_outer_top) + + + popl %ebx + popl %esi + popl %ebp + popl %edi + ret + + +C ----------------------------------------------------------------------------- +C Unrolled loop. +C +C The unrolled inner loop is the same as in aorsmul_1.asm, see that code for +C some comments. +C +C VAR_COUNTER is for the inner loop, running from VAR_COUNTER_INIT down to +C 0, inclusive. +C +C VAR_JMP is the computed jump into the unrolled loop. +C +C PARAM_XP and PARAM_WP get offset appropriately for where the unrolled loop +C is entered. +C +C VAR_XP_LOW is the least significant limb of xp, which is needed at the +C start of the unrolled loop. This can't just be fetched through the xp +C pointer because of the offset applied to it. +C +C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1, +C inclusive. 
+C +C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be +C added to give the location of the next limb of yp, which is the multiplier +C in the unrolled loop. +C +C PARAM_WP is similarly offset so that the PARAM_YSIZE counter can be added +C to give the starting point in the destination for each unrolled loop (this +C point is one limb upwards for each limb of yp processed). +C +C Having PARAM_YSIZE count negative to zero means it's not necessary to +C store new values of PARAM_YP and PARAM_WP on each loop. Those values on +C the stack remain constant and on each loop an leal adjusts them with the +C PARAM_YSIZE counter value. + + +defframe(VAR_COUNTER, -20) +defframe(VAR_COUNTER_INIT, -24) +defframe(VAR_JMP, -28) +defframe(VAR_XP_LOW, -32) +deflit(VAR_STACK_SPACE, 16) + +dnl For some strange reason using (%esp) instead of 0(%esp) is a touch +dnl slower in this code, hence the defframe empty-if-zero feature is +dnl disabled. +dnl +dnl If VAR_COUNTER is at (%esp), the effect is worse. In this case the +dnl unrolled loop is 255 instead of 256 bytes, but quite how this affects +dnl anything isn't clear. 
+dnl +define(`defframe_empty_if_zero_disabled',1) + +L(unroll): + C eax yp (not used) + C ebx xp end (not used) + C ecx xsize + C edx ysize-1 + C esi + C edi wp end of mul1 (not used) + C ebp +deflit(`FRAME', 16) + + leal -2(%ecx), %ebp C one limb processed at start, + decl %ecx C and ebp is one less + + shrl $UNROLL_LOG2, %ebp + negl %ecx + + subl $VAR_STACK_SPACE, %esp +deflit(`FRAME', 16+VAR_STACK_SPACE) + andl $UNROLL_MASK, %ecx + + movl %ecx, %esi + shll $4, %ecx + + movl %ebp, VAR_COUNTER_INIT + negl %esi + + C 15 code bytes per limb +ifdef(`PIC',` + call L(pic_calc) +L(unroll_here): +',` + leal L(unroll_entry) (%ecx,%esi,1), %ecx +') + + movl PARAM_XP, %ebx + movl %ebp, VAR_COUNTER + + movl PARAM_WP, %edi + movl %ecx, VAR_JMP + + movl (%ebx), %eax + leal 4(%edi,%esi,4), %edi C wp adjust for unrolling and mul1 + + leal (%ebx,%esi,4), %ebx C xp adjust for unrolling + + movl %eax, VAR_XP_LOW + + movl %ebx, PARAM_XP + movl PARAM_YP, %ebx + + leal (%edi,%edx,4), %ecx C wp adjust for ysize indexing + movl 4(%ebx), %ebp C multiplier (yp second limb) + + leal 4(%ebx,%edx,4), %ebx C yp adjust for ysize indexing + + movl %ecx, PARAM_WP + + leal 1(%esi), %ecx C adjust parity for decl %ecx above + + movl %ebx, PARAM_YP + negl %edx + + movl %edx, PARAM_YSIZE + jmp L(unroll_outer_entry) + + +ifdef(`PIC',` +L(pic_calc): + C See README.family about old gas bugs + leal (%ecx,%esi,1), %ecx + addl $L(unroll_entry)-L(unroll_here), %ecx + addl (%esp), %ecx + ret +') + + +C ----------------------------------------------------------------------------- + C Aligning here saves a couple of cycles per loop. Using 32 doesn't + C cost any extra space, since the inner unrolled loop below is + C aligned to 32. 
+ ALIGN(32) +L(unroll_outer_top): + C edx ysize + + movl PARAM_YP, %eax + movl %edx, PARAM_YSIZE C incremented ysize counter + + movl PARAM_WP, %edi + + movl VAR_COUNTER_INIT, %ebx + movl (%eax,%edx,4), %ebp C next multiplier + + movl PARAM_XSIZE, %ecx + leal (%edi,%edx,4), %edi C adjust wp for where we are in yp + + movl VAR_XP_LOW, %eax + movl %ebx, VAR_COUNTER + +L(unroll_outer_entry): + mull %ebp + + C using testb is a tiny bit faster than testl + testb $1, %cl + + movl %eax, %ecx C low carry + movl VAR_JMP, %eax + + movl %edx, %esi C high carry + movl PARAM_XP, %ebx + + jnz L(unroll_noswap) + movl %ecx, %esi C high,low carry other way around + + movl %edx, %ecx +L(unroll_noswap): + + jmp *%eax + + + +C ----------------------------------------------------------------------------- + ALIGN(32) +L(unroll_top): + C eax scratch + C ebx xp + C ecx carry low + C edx scratch + C esi carry high + C edi wp + C ebp multiplier + C VAR_COUNTER loop counter + C + C 15 code bytes each limb + + leal UNROLL_BYTES(%edi), %edi + +L(unroll_entry): +deflit(CHUNK_COUNT,2) +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*CHUNK_COUNT*4)) + deflit(`disp1', eval(disp0 + 4)) + deflit(`disp2', eval(disp1 + 4)) + + movl disp1(%ebx), %eax + mull %ebp +Zdisp( addl, %ecx, disp0,(%edi)) + adcl %eax, %esi + movl %edx, %ecx + jadcl0( %ecx) + + movl disp2(%ebx), %eax + mull %ebp + addl %esi, disp1(%edi) + adcl %eax, %ecx + movl %edx, %esi + jadcl0( %esi) +') + + decl VAR_COUNTER + leal UNROLL_BYTES(%ebx), %ebx + + jns L(unroll_top) + + + movl PARAM_YSIZE, %edx + addl %ecx, UNROLL_BYTES(%edi) + + adcl $0, %esi + + incl %edx + movl %esi, UNROLL_BYTES+4(%edi) + + jnz L(unroll_outer_top) + + + movl SAVE_ESI, %esi + movl SAVE_EBP, %ebp + movl SAVE_EDI, %edi + movl SAVE_EBX, %ebx + + addl $FRAME, %esp + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/k6/sqr_basecase.asm b/ghc/rts/gmp/mpn/x86/k6/sqr_basecase.asm new file mode 100644 index 0000000..70d49b3 --- /dev/null +++ 
b/ghc/rts/gmp/mpn/x86/k6/sqr_basecase.asm @@ -0,0 +1,672 @@ +dnl AMD K6 mpn_sqr_basecase -- square an mpn number. +dnl +dnl K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular +dnl product (measured on the speed difference between 17 and 33 limbs, +dnl which is roughly the Karatsuba recursing range). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl KARATSUBA_SQR_THRESHOLD_MAX is the maximum KARATSUBA_SQR_THRESHOLD this +dnl code supports. This value is used only by the tune program to know +dnl what it can go up to. (An attempt to compile with a bigger value will +dnl trigger some m4_assert()s in the code, making the build fail.) +dnl +dnl The value is determined by requiring the displacements in the unrolled +dnl addmul to fit in single bytes. This means a maximum UNROLL_COUNT of +dnl 63, giving a maximum KARATSUBA_SQR_THRESHOLD of 66. + +deflit(KARATSUBA_SQR_THRESHOLD_MAX, 66) + + +dnl Allow a value from the tune program to override config.m4. 
+ +ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE', +`define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)') + + +dnl UNROLL_COUNT is the number of code chunks in the unrolled addmul. The +dnl number required is determined by KARATSUBA_SQR_THRESHOLD, since +dnl mpn_sqr_basecase only needs to handle sizes < KARATSUBA_SQR_THRESHOLD. +dnl +dnl The first addmul is the biggest, and this takes the second least +dnl significant limb and multiplies it by the third least significant and +dnl up. Hence for a maximum operand size of KARATSUBA_SQR_THRESHOLD-1 +dnl limbs, UNROLL_COUNT needs to be KARATSUBA_SQR_THRESHOLD-3. + +m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD') +deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3)) + + +C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a +C lot of function call overheads are avoided, especially when the given size +C is small. +C +C The code size might look a bit excessive, but not all of it is executed +C and so won't fill up the code cache. The 1x1, 2x2 and 3x3 special cases +C clearly apply only to those sizes; mid sizes like 10x10 only need part of +C the unrolled addmul; and big sizes like 35x35 that do need all of it will +C at least be getting value for money, because 35x35 spends something like +C 5780 cycles here. +C +C Different values of UNROLL_COUNT give slightly different speeds, between +C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs. +C This isn't a big difference, but it's presumably some alignment effect +C which if understood could give a simple speedup. 
+ +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(32) +PROLOGUE(mpn_sqr_basecase) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl PARAM_SRC, %eax + + cmpl $2, %ecx + je L(two_limbs) + + movl PARAM_DST, %edx + ja L(three_or_more) + + +C ----------------------------------------------------------------------------- +C one limb only + C eax src + C ebx + C ecx size + C edx dst + + movl (%eax), %eax + movl %edx, %ecx + + mull %eax + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(two_limbs): + C eax src + C ebx + C ecx size + C edx (dst not loaded yet) + + pushl %ebx + movl %eax, %ebx C src +deflit(`FRAME',4) + + movl (%ebx), %eax + movl PARAM_DST, %ecx + + mull %eax C src[0]^2 + + movl %eax, (%ecx) + movl 4(%ebx), %eax + + movl %edx, 4(%ecx) + + mull %eax C src[1]^2 + + movl %eax, 8(%ecx) + movl (%ebx), %eax + + movl %edx, 12(%ecx) + movl 4(%ebx), %edx + + mull %edx C src[0]*src[1] + + addl %eax, 4(%ecx) + + adcl %edx, 8(%ecx) + adcl $0, 12(%ecx) + + popl %ebx + addl %eax, 4(%ecx) + + adcl %edx, 8(%ecx) + adcl $0, 12(%ecx) + + ret + + +C ----------------------------------------------------------------------------- +L(three_or_more): +deflit(`FRAME',0) + cmpl $4, %ecx + jae L(four_or_more) + + +C ----------------------------------------------------------------------------- +C three limbs + C eax src + C ecx size + C edx dst + + pushl %ebx + movl %eax, %ebx C src + + movl (%ebx), %eax + movl %edx, %ecx C dst + + mull %eax C src[0] ^ 2 + + movl %eax, (%ecx) + movl 4(%ebx), %eax + + movl %edx, 4(%ecx) + pushl %esi + + mull %eax C src[1] ^ 2 + + movl %eax, 8(%ecx) + movl 8(%ebx), %eax + + movl %edx, 12(%ecx) + pushl %edi + + mull %eax C src[2] ^ 2 + + movl %eax, 16(%ecx) + movl (%ebx), %eax + + movl %edx, 20(%ecx) + movl 4(%ebx), %edx + + mull %edx C src[0] * src[1] + + movl %eax, %esi + movl (%ebx), %eax + + movl %edx, %edi + movl 8(%ebx), %edx +
+ pushl %ebp + xorl %ebp, %ebp + + mull %edx C src[0] * src[2] + + addl %eax, %edi + movl 4(%ebx), %eax + + adcl %edx, %ebp + + movl 8(%ebx), %edx + + mull %edx C src[1] * src[2] + + addl %eax, %ebp + + adcl $0, %edx + + + C eax will be dst[5] + C ebx + C ecx dst + C edx dst[4] + C esi dst[1] + C edi dst[2] + C ebp dst[3] + + xorl %eax, %eax + addl %esi, %esi + adcl %edi, %edi + adcl %ebp, %ebp + adcl %edx, %edx + adcl $0, %eax + + addl %esi, 4(%ecx) + adcl %edi, 8(%ecx) + adcl %ebp, 12(%ecx) + + popl %ebp + popl %edi + + adcl %edx, 16(%ecx) + + popl %esi + popl %ebx + + adcl %eax, 20(%ecx) + ASSERT(nc) + + ret + + +C ----------------------------------------------------------------------------- + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) +defframe(VAR_COUNTER,-20) +defframe(VAR_JMP, -24) +deflit(STACK_SPACE, 24) + + ALIGN(16) +L(four_or_more): + + C eax src + C ebx + C ecx size + C edx dst + C esi + C edi + C ebp + +C First multiply src[0]*src[1..size-1] and store at dst[1..size]. +C +C A test was done calling mpn_mul_1 here to get the benefit of its unrolled +C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off +C a 5780 cycle operation, which is not surprising since the loop here is 8 +C c/l and mpn_mul_1 is 6.25 c/l. 
+ + subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE) + + movl %edi, SAVE_EDI + leal 4(%edx), %edi + + movl %ebx, SAVE_EBX + leal 4(%eax), %ebx + + movl %esi, SAVE_ESI + xorl %esi, %esi + + movl %ebp, SAVE_EBP + + C eax + C ebx src+4 + C ecx size + C edx + C esi + C edi dst+4 + C ebp + + movl (%eax), %ebp C multiplier + leal -1(%ecx), %ecx C size-1, and pad to a 16 byte boundary + + + ALIGN(16) +L(mul_1): + C eax scratch + C ebx src ptr + C ecx counter + C edx scratch + C esi carry + C edi dst ptr + C ebp multiplier + + movl (%ebx), %eax + addl $4, %ebx + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, (%edi) + addl $4, %edi + + loop L(mul_1) + + +C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2. +C +C The last two addmuls, which are the bottom right corner of the product +C triangle, are left to the end. These are src[size-3]*src[size-2,size-1] +C and src[size-2]*src[size-1]. If size is 4 then it's only these corner +C cases that need to be done. +C +C The unrolled code is the same as mpn_addmul_1(), see that routine for some +C comments. +C +C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive. +C +C VAR_JMP is the computed jump into the unrolled code, stepped by one code +C chunk each outer loop. +C +C K6 doesn't do any branch prediction on indirect jumps, which is good +C actually because it's a different target each time. The unrolled addmul +C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of +C the indirect jump is quickly recovered. + + +dnl This value is also implicitly encoded in a shift and add. +dnl +deflit(CODE_BYTES_PER_LIMB, 15) + +dnl With the unmodified &src[size] and &dst[size] pointers, the +dnl displacements in the unrolled code fit in a byte for UNROLL_COUNT +dnl values up to 31. Above that an offset must be added to them. 
+dnl +deflit(OFFSET, +ifelse(eval(UNROLL_COUNT>31),1, +eval((UNROLL_COUNT-31)*4), +0)) + + C eax + C ebx &src[size] + C ecx + C edx + C esi carry + C edi &dst[size] + C ebp + + movl PARAM_SIZE, %ecx + movl %esi, (%edi) + + subl $4, %ecx + jz L(corner) + + movl %ecx, %edx +ifelse(OFFSET,0,, +` subl $OFFSET, %ebx') + + shll $4, %ecx +ifelse(OFFSET,0,, +` subl $OFFSET, %edi') + + negl %ecx + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx +') + negl %edx + + + C The calculated jump mustn't be before the start of the available + C code. This is the limitation UNROLL_COUNT puts on the src operand + C size, but checked here using the jump address directly. + C + ASSERT(ae,` + movl_text_address( L(unroll_inner_start), %eax) + cmpl %eax, %ecx + ') + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll_outer_top): + C eax + C ebx &src[size], constant + C ecx VAR_JMP + C edx VAR_COUNTER, limbs, negative + C esi high limb to store + C edi dst ptr, high of last addmul + C ebp + + movl -12+OFFSET(%ebx,%edx,4), %ebp C multiplier + movl %edx, VAR_COUNTER + + movl -8+OFFSET(%ebx,%edx,4), %eax C first limb of multiplicand + + mull %ebp + + testb $1, %cl + + movl %edx, %esi C high carry + movl %ecx, %edx C jump + + movl %eax, %ecx C low carry + leal CODE_BYTES_PER_LIMB(%edx), %edx + + movl %edx, VAR_JMP + leal 4(%edi), %edi + + C A branch-free version of this using some xors was found to be a + C touch slower than just a conditional jump, despite the jump + C switching between taken and not taken on every loop. + +ifelse(eval(UNROLL_COUNT%2),0, + jz,jnz) L(unroll_noswap) + movl %esi, %eax C high,low carry other way around + + movl %ecx, %esi + movl %eax, %ecx +L(unroll_noswap): + + jmp *%edx + + + C Must be on an even address here so the low bit of the jump address + C will indicate which way around ecx/esi should start. 
+ C + C An attempt was made at padding here to get the end of the unrolled + C code to come out on a good alignment, to save padding before + C L(corner). This worked, but turned out to run slower than just an + C ALIGN(2). The reason for this is not clear, it might be related + C to the different speeds on different UNROLL_COUNTs noted above. + + ALIGN(2) + +L(unroll_inner_start): + C eax scratch + C ebx src + C ecx carry low + C edx scratch + C esi carry high + C edi dst + C ebp multiplier + C + C 15 code bytes each limb + C ecx/esi swapped on each chunk + +forloop(`i', UNROLL_COUNT, 1, ` + deflit(`disp_src', eval(-i*4 + OFFSET)) + deflit(`disp_dst', eval(disp_src - 4)) + + m4_assert(`disp_src>=-128 && disp_src<128') + m4_assert(`disp_dst>=-128 && disp_dst<128') + +ifelse(eval(i%2),0,` +Zdisp( movl, disp_src,(%ebx), %eax) + mull %ebp +Zdisp( addl, %esi, disp_dst,(%edi)) + adcl %eax, %ecx + movl %edx, %esi + jadcl0( %esi) +',` + dnl this one comes out last +Zdisp( movl, disp_src,(%ebx), %eax) + mull %ebp +Zdisp( addl, %ecx, disp_dst,(%edi)) + adcl %eax, %esi + movl %edx, %ecx + jadcl0( %ecx) +') +') +L(unroll_inner_end): + + addl %esi, -4+OFFSET(%edi) + + movl VAR_COUNTER, %edx + jadcl0( %ecx) + + movl %ecx, m4_empty_if_zero(OFFSET)(%edi) + movl VAR_JMP, %ecx + + incl %edx + jnz L(unroll_outer_top) + + +ifelse(OFFSET,0,,` + addl $OFFSET, %ebx + addl $OFFSET, %edi +') + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(corner): + C ebx &src[size] + C edi &dst[2*size-4] + + movl -12(%ebx), %ebp + + movl -8(%ebx), %eax + movl %eax, %ecx + + mull %ebp + + addl %eax, -4(%edi) + adcl $0, %edx + + movl -4(%ebx), %eax + movl %edx, %esi + movl %eax, %ebx + + mull %ebp + + addl %esi, %eax + adcl $0, %edx + + addl %eax, (%edi) + adcl $0, %edx + + movl %edx, %esi + movl %ebx, %eax + + mull %ecx + + addl %esi, %eax + movl %eax, 4(%edi) + + adcl $0, %edx + + movl %edx, 8(%edi) + + +C
----------------------------------------------------------------------------- +C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1]. +C The loop measures about 6 cycles/iteration, though it looks like it should +C decode in 5. + +L(lshift_start): + movl PARAM_SIZE, %ecx + + movl PARAM_DST, %edi + subl $1, %ecx C size-1 and clear carry + + movl PARAM_SRC, %ebx + movl %ecx, %edx + + xorl %eax, %eax C ready for adcl + + + ALIGN(16) +L(lshift): + C eax + C ebx src (for later use) + C ecx counter, decrementing + C edx size-1 (for later use) + C esi + C edi dst, incrementing + C ebp + + rcll 4(%edi) + rcll 8(%edi) + leal 8(%edi), %edi + loop L(lshift) + + + adcl %eax, %eax + + movl %eax, 4(%edi) C dst most significant limb + movl (%ebx), %eax C src[0] + + leal 4(%ebx,%edx,4), %ebx C &src[size] + subl %edx, %ecx C -(size-1) + + +C ----------------------------------------------------------------------------- +C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ..., +C src[size-1]^2. dst[0] hasn't been set at all yet, and just gets the +C low limb of src[0]^2.
+ + + mull %eax + + movl %eax, (%edi,%ecx,8) C dst[0] + + + ALIGN(16) +L(diag): + C eax scratch + C ebx &src[size] + C ecx counter, negative + C edx carry + C esi scratch + C edi dst[2*size-2] + C ebp + + movl (%ebx,%ecx,4), %eax + movl %edx, %esi + + mull %eax + + addl %esi, 4(%edi,%ecx,8) + adcl %eax, 8(%edi,%ecx,8) + adcl $0, %edx + + incl %ecx + jnz L(diag) + + + movl SAVE_EBX, %ebx + movl SAVE_ESI, %esi + + addl %edx, 4(%edi) C dst most significant limb + + movl SAVE_EDI, %edi + movl SAVE_EBP, %ebp + addl $FRAME, %esp + ret + + + +C ----------------------------------------------------------------------------- +ifdef(`PIC',` +L(pic_calc): + C See README.family about old gas bugs + addl (%esp), %ecx + addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx + addl %edx, %ecx + ret +') + + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/k7/README b/ghc/rts/gmp/mpn/x86/k7/README new file mode 100644 index 0000000..c34315c --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k7/README @@ -0,0 +1,145 @@ + + AMD K7 MPN SUBROUTINES + + +This directory contains code optimized for the AMD Athlon CPU. + +The mmx subdirectory has routines using MMX instructions. All Athlons have +MMX, the separate directory is just so that configure can omit it if the +assembler doesn't support MMX. + + + +STATUS + +Times for the loops, with all code and data in L1 cache. + + cycles/limb + mpn_add/sub_n 1.6 + + mpn_copyi 0.75 or 1.0 \ varying with data alignment + mpn_copyd 0.75 or 1.0 / + + mpn_divrem_1 17.0 integer part, 15.0 fractional part + mpn_mod_1 17.0 + mpn_divexact_by3 8.0 + + mpn_l/rshift 1.2 + + mpn_mul_1 3.4 + mpn_addmul/submul_1 3.9 + + mpn_mul_basecase 4.42 cycles/crossproduct (approx) + + mpn_popcount 5.0 + mpn_hamdist 6.0 + +Prefetching of sources hasn't yet been tried. + + + +NOTES + +cmov, MMX, 3DNow and some extensions to MMX and 3DNow are available. + +Write-allocate L1 data cache means prefetching of destinations is unnecessary. 
+ +Floating point multiplications can be done in parallel with integer +multiplications, but there doesn't seem to be any way to make use of this. + +Unsigned "mul"s can be issued every 3 cycles. This suggests 3 is a limit on +the speed of the multiplication routines. The documentation shows mul +executing in IEU0 (or maybe in IEU0 and IEU1 together), so it might be that, +to get near 3 cycles code has to be arranged so that nothing else is issued +to IEU0. A busy IEU0 could explain why some code takes 4 cycles and other +apparently equivalent code takes 5. + + + +OPTIMIZATIONS + +Unrolled loops are used to reduce looping overhead. The unrolling is +configurable up to 32 limbs/loop for most routines and up to 64 for some. +The K7 has 64k L1 code cache so quite big unrolling is allowable. + +Computed jumps into the unrolling are used to handle sizes not a multiple of +the unrolling. An attractive feature of this is that times increase +smoothly with operand size, but it may be that some routines should just +have simple loops to finish up, especially when PIC adds between 2 and 16 +cycles to get %eip. + +Position independent code is implemented using a call to get %eip for the +computed jumps and a ret is always done, rather than an addl $4,%esp or a +popl, so the CPU return address branch prediction stack stays synchronised +with the actual stack in memory. + +Branch prediction, in absence of any history, will guess forward jumps are +not taken and backward jumps are taken. Where possible it's arranged that +the less likely or less important case is under a taken forward jump. + + + +CODING + +Instructions in general code have been shown grouped if they can execute +together, which means up to three direct-path instructions which have no +successive dependencies. K7 always decodes three and has out-of-order +execution, but the groupings show what slots might be available and what +dependency chains exist. 
+ +When there are vector-path instructions an effort is made to get triplets of +direct-path instructions in between them, even if there are dependencies, +since this maximizes decoding throughput and might save a cycle or two if +decoding is the limiting factor. + + + +INSTRUCTIONS + +adcl direct +divl 39 cycles back-to-back +lodsl,etc vector +loop 1 cycle vector (decl/jnz opens up one decode slot) +movd reg vector +movd mem direct +mull issue every 3 cycles, latency 4 cycles low word, 6 cycles high word +popl vector (use movl for more than one pop) +pushl direct, will pair with a load +shrdl %cl vector, 3 cycles, seems to be 3 decode too +xorl r,r false read dependency recognised + + + +REFERENCES + +"AMD Athlon Processor X86 Code Optimization Guide", AMD publication number +22007, revision E, November 1999. Available on-line, + + http://www.amd.com/products/cpg/athlon/techdocs/pdf/22007.pdf + +"3DNow Technology Manual", AMD publication number 21928F/0-August 1999. +This describes the femms and prefetch instructions. Available on-line, + + http://www.amd.com/K6/k6docs/pdf/21928.pdf + +"AMD Extensions to the 3DNow and MMX Instruction Sets Manual", AMD +publication number 22466, revision B, August 1999. This describes +instructions added in the Athlon processor, such as pswapd and the extra +prefetch forms. Available on-line, + + http://www.amd.com/products/cpg/athlon/techdocs/pdf/22466.pdf + +"3DNow Instruction Porting Guide", AMD publication number 22621, revision B, +August 1999. This has some notes on general Athlon optimizations as well as +3DNow. Available on-line, + + http://www.amd.com/products/cpg/athlon/techdocs/pdf/22621.pdf + + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/ghc/rts/gmp/mpn/x86/k7/aors_n.asm b/ghc/rts/gmp/mpn/x86/k7/aors_n.asm new file mode 100644 index 0000000..85fa9d3 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k7/aors_n.asm @@ -0,0 +1,250 @@ +dnl AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract. 
+dnl +dnl K7: 1.64 cycles/limb (at 16 limb/loop). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl K7: UNROLL_COUNT cycles/limb +dnl 8 1.9 +dnl 16 1.64 +dnl 32 1.7 +dnl 64 2.0 +dnl Maximum possible with the current code is 64. + +deflit(UNROLL_COUNT, 16) + + +ifdef(`OPERATION_add_n', ` + define(M4_inst, adcl) + define(M4_function_n, mpn_add_n) + define(M4_function_nc, mpn_add_nc) + define(M4_description, add) +',`ifdef(`OPERATION_sub_n', ` + define(M4_inst, sbbl) + define(M4_function_n, mpn_sub_n) + define(M4_function_nc, mpn_sub_nc) + define(M4_description, subtract) +',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n +')')') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + + +C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size, mp_limb_t carry); +C +C Calculate src1,size M4_description src2,size, and store the result in +C dst,size. The return value is the carry bit from the top of the result (1 +C or 0). 
+C +C The _nc version accepts 1 or 0 for an initial carry into the low limb of +C the calculation. Note values other than 1 or 0 here will lead to garbage +C results. +C +C This code runs at 1.64 cycles/limb, which is probably the best possible +C with plain integer operations. Each limb is 2 loads and 1 store, and in +C one cycle the K7 can do two loads, or a load and a store, leading to 1.5 +C c/l. + +dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1. +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 8) +',` +deflit(UNROLL_THRESHOLD, 8) +') + +defframe(PARAM_CARRY,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC2, 12) +defframe(PARAM_SRC1, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBP, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EBX, -12) +defframe(SAVE_EDI, -16) +deflit(STACK_SPACE, 16) + + .text + ALIGN(32) +deflit(`FRAME',0) + +PROLOGUE(M4_function_nc) + movl PARAM_CARRY, %eax + jmp LF(M4_function_n,start) +EPILOGUE() + +PROLOGUE(M4_function_n) + + xorl %eax, %eax C carry +L(start): + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %edi, SAVE_EDI + movl %ebx, SAVE_EBX + cmpl $UNROLL_THRESHOLD, %ecx + + movl PARAM_SRC2, %edx + movl PARAM_SRC1, %ebx + jae L(unroll) + + movl PARAM_DST, %edi + leal (%ebx,%ecx,4), %ebx + leal (%edx,%ecx,4), %edx + + leal (%edi,%ecx,4), %edi + negl %ecx + shrl %eax + + C This loop is in a single 16 byte code block already, so no + C alignment necessary. +L(simple): + C eax scratch + C ebx src1 + C ecx counter + C edx src2 + C esi + C edi dst + C ebp + + movl (%ebx,%ecx,4), %eax + M4_inst (%edx,%ecx,4), %eax + movl %eax, (%edi,%ecx,4) + incl %ecx + jnz L(simple) + + movl $0, %eax + movl SAVE_EDI, %edi + + movl SAVE_EBX, %ebx + setc %al + addl $STACK_SPACE, %esp + + ret + + +C ----------------------------------------------------------------------------- + C This is at 0x55, close enough to aligned. 
+L(unroll): +deflit(`FRAME',STACK_SPACE) + movl %ebp, SAVE_EBP + andl $-2, %ecx C size low bit masked out + andl $1, PARAM_SIZE C size low bit kept + + movl %ecx, %edi + decl %ecx + movl PARAM_DST, %ebp + + shrl $UNROLL_LOG2, %ecx + negl %edi + movl %esi, SAVE_ESI + + andl $UNROLL_MASK, %edi + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(entry) (%edi,%edi,8), %esi C 9 bytes per +') + negl %edi + shrl %eax + + leal ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx + leal ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx + leal ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi + + jmp *%esi + + +ifdef(`PIC',` +L(pic_calc): + C See README.family about old gas bugs + leal (%edi,%edi,8), %esi + addl $L(entry)-L(here), %esi + addl (%esp), %esi + ret +') + + +C ----------------------------------------------------------------------------- + ALIGN(32) +L(top): + C eax zero + C ebx src1 + C ecx counter + C edx src2 + C esi scratch (was computed jump) + C edi dst + C ebp scratch + + leal UNROLL_BYTES(%edx), %edx + +L(entry): +deflit(CHUNK_COUNT, 2) +forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 + 4)) + +Zdisp( movl, disp0,(%ebx), %esi) + movl disp1(%ebx), %ebp +Zdisp( M4_inst,disp0,(%edx), %esi) +Zdisp( movl, %esi, disp0,(%edi)) + M4_inst disp1(%edx), %ebp + movl %ebp, disp1(%edi) +') + + decl %ecx + leal UNROLL_BYTES(%ebx), %ebx + leal UNROLL_BYTES(%edi), %edi + jns L(top) + + + mov PARAM_SIZE, %esi + movl SAVE_EBP, %ebp + movl $0, %eax + + decl %esi + js L(even) + + movl (%ebx), %ecx + M4_inst UNROLL_BYTES(%edx), %ecx + movl %ecx, (%edi) +L(even): + + movl SAVE_EDI, %edi + movl SAVE_EBX, %ebx + setc %al + + movl SAVE_ESI, %esi + addl $STACK_SPACE, %esp + + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/k7/aorsmul_1.asm b/ghc/rts/gmp/mpn/x86/k7/aorsmul_1.asm new file mode 100644 index 0000000..9f9c3da --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k7/aorsmul_1.asm @@ -0,0 
+1,364 @@ +dnl AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple. +dnl +dnl K7: 3.9 cycles/limb. +dnl +dnl Future: It should be possible to avoid the separate mul after the +dnl unrolled loop by moving the movl/adcl to the top. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl K7: UNROLL_COUNT cycles/limb +dnl 4 4.42 +dnl 8 4.16 +dnl 16 3.9 +dnl 32 3.9 +dnl 64 3.87 +dnl Maximum possible with the current code is 64. 
+ +deflit(UNROLL_COUNT, 16) + + +ifdef(`OPERATION_addmul_1',` + define(M4_inst, addl) + define(M4_function_1, mpn_addmul_1) + define(M4_function_1c, mpn_addmul_1c) + define(M4_description, add it to) + define(M4_desc_retval, carry) +',`ifdef(`OPERATION_submul_1',` + define(M4_inst, subl) + define(M4_function_1, mpn_submul_1) + define(M4_function_1c, mpn_submul_1c) + define(M4_description, subtract it from) + define(M4_desc_retval, borrow) +',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 +')')') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) + + +C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); +C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult, mp_limb_t carry); +C +C Calculate src,size multiplied by mult and M4_description dst,size. +C Return the M4_desc_retval limb from the top of the result. + +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 9) +',` +deflit(UNROLL_THRESHOLD, 6) +') + +defframe(PARAM_CARRY, 20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) +deflit(SAVE_SIZE, 16) + + .text + ALIGN(32) +PROLOGUE(M4_function_1) + movl PARAM_SIZE, %edx + movl PARAM_SRC, %eax + xorl %ecx, %ecx + + decl %edx + jnz LF(M4_function_1c,start_1) + + movl (%eax), %eax + movl PARAM_DST, %ecx + + mull PARAM_MULTIPLIER + + M4_inst %eax, (%ecx) + adcl $0, %edx + movl %edx, %eax + + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(M4_function_1c) + movl PARAM_SIZE, %edx + movl PARAM_SRC, %eax + + decl %edx + jnz L(more_than_one_limb) + + movl (%eax), %eax + movl PARAM_DST, %ecx + + mull PARAM_MULTIPLIER + + addl PARAM_CARRY, %eax + + adcl $0, %edx + M4_inst %eax, (%ecx) + + adcl $0, %edx + movl %edx, %eax + + ret + + + C offset 0x44 so close enough to aligned +L(more_than_one_limb): + movl PARAM_CARRY, 
%ecx +L(start_1): + C eax src + C ecx initial carry + C edx size-1 + subl $SAVE_SIZE, %esp +deflit(`FRAME',16) + + movl %ebx, SAVE_EBX + movl %esi, SAVE_ESI + movl %edx, %ebx C size-1 + + movl PARAM_SRC, %esi + movl %ebp, SAVE_EBP + cmpl $UNROLL_THRESHOLD, %edx + + movl PARAM_MULTIPLIER, %ebp + movl %edi, SAVE_EDI + + movl (%esi), %eax C src low limb + movl PARAM_DST, %edi + ja L(unroll) + + + C simple loop + + leal 4(%esi,%ebx,4), %esi C point one limb past last + leal (%edi,%ebx,4), %edi C point at last limb + negl %ebx + + C The movl to load the next source limb is done well ahead of the + C mul. This is necessary for full speed, and leads to one limb + C handled separately at the end. + +L(simple): + C eax src limb + C ebx loop counter + C ecx carry limb + C edx scratch + C esi src + C edi dst + C ebp multiplier + + mull %ebp + + addl %eax, %ecx + adcl $0, %edx + + M4_inst %ecx, (%edi,%ebx,4) + movl (%esi,%ebx,4), %eax + adcl $0, %edx + + incl %ebx + movl %edx, %ecx + jnz L(simple) + + + mull %ebp + + movl SAVE_EBX, %ebx + movl SAVE_ESI, %esi + movl SAVE_EBP, %ebp + + addl %eax, %ecx + adcl $0, %edx + + M4_inst %ecx, (%edi) + adcl $0, %edx + movl SAVE_EDI, %edi + + addl $SAVE_SIZE, %esp + movl %edx, %eax + ret + + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll): + C eax src low limb + C ebx size-1 + C ecx carry + C edx size-1 + C esi src + C edi dst + C ebp multiplier + +dnl overlapping with parameters no longer needed +define(VAR_COUNTER,`PARAM_SIZE') +define(VAR_JUMP, `PARAM_MULTIPLIER') + + subl $2, %ebx C (size-2)-1 + decl %edx C size-2 + + shrl $UNROLL_LOG2, %ebx + negl %edx + + movl %ebx, VAR_COUNTER + andl $UNROLL_MASK, %edx + + movl %edx, %ebx + shll $4, %edx + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(entry) (%edx,%ebx,1), %edx +') + negl %ebx + movl %edx, VAR_JUMP + + mull %ebp + + addl %eax, %ecx C initial carry, becomes low carry + adcl $0, %edx + testb $1, %bl + + movl 
4(%esi), %eax C src second limb + leal ifelse(UNROLL_BYTES,256,128+) 8(%esi,%ebx,4), %esi + leal ifelse(UNROLL_BYTES,256,128) (%edi,%ebx,4), %edi + + movl %edx, %ebx C high carry + cmovnz( %ecx, %ebx) C high,low carry other way around + cmovnz( %edx, %ecx) + + jmp *VAR_JUMP + + +ifdef(`PIC',` +L(pic_calc): + C See README.family about old gas bugs + leal (%edx,%ebx,1), %edx + addl $L(entry)-L(here), %edx + addl (%esp), %edx + ret +') + + +C ----------------------------------------------------------------------------- +C This code uses a "two carry limbs" scheme. At the top of the loop the +C carries are ecx=lo, ebx=hi, then they swap for each limb processed. For +C the computed jump an odd size means they start one way around, an even +C size the other. Either way one limb is handled separately at the start of +C the loop. +C +C The positioning of the movl to load the next source limb is important. +C Moving it after the adcl with a view to avoiding a separate mul at the end +C of the loop slows the code down. 
+ + ALIGN(32) +L(top): + C eax src limb + C ebx carry high + C ecx carry low + C edx scratch + C esi src+8 + C edi dst + C ebp multiplier + C + C VAR_COUNTER loop counter + C + C 17 bytes each limb + +L(entry): +deflit(CHUNK_COUNT,2) +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 + 4)) + + mull %ebp + +Zdisp( M4_inst,%ecx, disp0,(%edi)) + movl $0, %ecx + + adcl %eax, %ebx + +Zdisp( movl, disp0,(%esi), %eax) + adcl %edx, %ecx + + + mull %ebp + + M4_inst %ebx, disp1(%edi) + movl $0, %ebx + + adcl %eax, %ecx + + movl disp1(%esi), %eax + adcl %edx, %ebx +') + + decl VAR_COUNTER + leal UNROLL_BYTES(%esi), %esi + leal UNROLL_BYTES(%edi), %edi + + jns L(top) + + + C eax src limb + C ebx carry high + C ecx carry low + C edx + C esi + C edi dst (points at second last limb) + C ebp multiplier +deflit(`disp0', ifelse(UNROLL_BYTES,256,-128)) +deflit(`disp1', eval(disp0-0 + 4)) + + mull %ebp + + M4_inst %ecx, disp0(%edi) + movl SAVE_EBP, %ebp + + adcl %ebx, %eax + movl SAVE_EBX, %ebx + movl SAVE_ESI, %esi + + adcl $0, %edx + M4_inst %eax, disp1(%edi) + movl SAVE_EDI, %edi + + adcl $0, %edx + addl $SAVE_SIZE, %esp + + movl %edx, %eax + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/k7/diveby3.asm b/ghc/rts/gmp/mpn/x86/k7/diveby3.asm new file mode 100644 index 0000000..5768495 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k7/diveby3.asm @@ -0,0 +1,131 @@ +dnl AMD K7 mpn_divexact_by3 -- mpn division by 3, expecting no remainder. +dnl +dnl K7: 8.0 cycles/limb + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t carry); + +defframe(PARAM_CARRY,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl multiplicative inverse of 3, modulo 2^32 +deflit(INVERSE_3, 0xAAAAAAAB) + +dnl ceil(b/3) and floor(b*2/3) where b=2^32 +deflit(ONE_THIRD_CEIL, 0x55555556) +deflit(TWO_THIRDS_FLOOR, 0xAAAAAAAA) + + .text + ALIGN(32) + +PROLOGUE(mpn_divexact_by3c) +deflit(`FRAME',0) + + movl PARAM_SRC, %ecx + pushl %ebx defframe_pushl(SAVE_EBX) + + movl PARAM_CARRY, %ebx + pushl %ebp defframe_pushl(SAVE_EBP) + + movl PARAM_SIZE, %ebp + pushl %edi defframe_pushl(SAVE_EDI) + + movl (%ecx), %eax C src low limb + pushl %esi defframe_pushl(SAVE_ESI) + + movl PARAM_DST, %edi + movl $TWO_THIRDS_FLOOR, %esi + leal -4(%ecx,%ebp,4), %ecx C &src[size-1] + + subl %ebx, %eax + + setc %bl + decl %ebp + jz L(last) + + leal (%edi,%ebp,4), %edi C &dst[size-1] + negl %ebp + + + ALIGN(16) +L(top): + C eax src limb, carry subtracted + C ebx carry limb (0 or 1) + C ecx &src[size-1] + C edx scratch + C esi TWO_THIRDS_FLOOR + C edi &dst[size-1] + C ebp counter, limbs, negative + + imull $INVERSE_3, %eax, %edx + + movl 4(%ecx,%ebp,4), %eax C next src limb + cmpl $ONE_THIRD_CEIL, %edx + + sbbl $-1, %ebx C +1 if result>=ceil(b/3) + cmpl %edx, %esi + + sbbl %ebx, %eax C and further 1 if result>=ceil(b*2/3) + movl %edx, (%edi,%ebp,4) + incl %ebp + + 
setc %bl C new carry + jnz L(top) + + + +L(last): + C eax src limb, carry subtracted + C ebx carry limb (0 or 1) + C ecx &src[size-1] + C edx scratch + C esi TWO_THIRDS_FLOOR + C edi &dst[size-1] + C ebp + + imull $INVERSE_3, %eax + + cmpl $ONE_THIRD_CEIL, %eax + movl %eax, (%edi) + movl SAVE_EBP, %ebp + + sbbl $-1, %ebx C +1 if eax>=ceil(b/3) + cmpl %eax, %esi + movl $0, %eax + + adcl %ebx, %eax C further +1 if eax>=ceil(b*2/3) + movl SAVE_EDI, %edi + movl SAVE_ESI, %esi + + movl SAVE_EBX, %ebx + addl $FRAME, %esp + + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/k7/gmp-mparam.h b/ghc/rts/gmp/mpn/x86/k7/gmp-mparam.h new file mode 100644 index 0000000..c3bba0a --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k7/gmp-mparam.h @@ -0,0 +1,100 @@ +/* AMD K7 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + + +/* the low limb is ready after 4 cycles, but normally it's the high limb + which is of interest, and that comes out after 6 cycles */ +#ifndef UMUL_TIME +#define UMUL_TIME 6 /* cycles */ +#endif + +/* AMD doco says 40, but it measures 39 back-to-back */ +#ifndef UDIV_TIME +#define UDIV_TIME 39 /* cycles */ +#endif + +/* using bsf */ +#ifndef COUNT_TRAILING_ZEROS_TIME +#define COUNT_TRAILING_ZEROS_TIME 7 /* cycles */ +#endif + + +/* Generated by tuneup.c, 2000-07-06. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 26 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 177 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 52 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 173 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 76 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 114 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 34 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 5 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 54 +#endif + +#ifndef FFT_MUL_TABLE +#define FFT_MUL_TABLE { 720, 1440, 2944, 7680, 18432, 57344, 0 } +#endif +#ifndef FFT_MODF_MUL_THRESHOLD +#define FFT_MODF_MUL_THRESHOLD 736 +#endif +#ifndef FFT_MUL_THRESHOLD +#define FFT_MUL_THRESHOLD 6912 +#endif + +#ifndef FFT_SQR_TABLE +#define FFT_SQR_TABLE { 784, 1696, 3200, 7680, 18432, 57344, 0 } +#endif +#ifndef FFT_MODF_SQR_THRESHOLD +#define FFT_MODF_SQR_THRESHOLD 800 +#endif +#ifndef FFT_SQR_THRESHOLD +#define FFT_SQR_THRESHOLD 8448 +#endif diff --git a/ghc/rts/gmp/mpn/x86/k7/mmx/copyd.asm b/ghc/rts/gmp/mpn/x86/k7/mmx/copyd.asm new file mode 100644 index 0000000..33214da --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k7/mmx/copyd.asm @@ -0,0 +1,136 @@ +dnl AMD K7 mpn_copyd -- copy limb vector, 
decrementing. +dnl +dnl alignment dst/src, A=0mod8 N=4mod8 +dnl A/A A/N N/A N/N +dnl K7 0.75 1.0 1.0 0.75 + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C The various comments in mpn/x86/k7/copyi.asm apply here too. 
+ +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +dnl parameter space reused +define(SAVE_EBX,`PARAM_SIZE') +define(SAVE_ESI,`PARAM_SRC') + +dnl minimum 5 since the unrolled code can't handle less than 5 +deflit(UNROLL_THRESHOLD, 5) + + .text + ALIGN(32) +PROLOGUE(mpn_copyd) + + movl PARAM_SIZE, %ecx + movl %ebx, SAVE_EBX + + movl PARAM_SRC, %eax + movl PARAM_DST, %edx + + cmpl $UNROLL_THRESHOLD, %ecx + jae L(unroll) + + orl %ecx, %ecx + jz L(simple_done) + +L(simple): + C eax src + C ebx scratch + C ecx counter + C edx dst + C + C this loop is 2 cycles/limb + + movl -4(%eax,%ecx,4), %ebx + movl %ebx, -4(%edx,%ecx,4) + decl %ecx + jnz L(simple) + +L(simple_done): + movl SAVE_EBX, %ebx + ret + + +L(unroll): + movl %esi, SAVE_ESI + leal (%eax,%ecx,4), %ebx + leal (%edx,%ecx,4), %esi + + andl %esi, %ebx + movl SAVE_ESI, %esi + subl $4, %ecx C size-4 + + testl $4, %ebx C testl to pad code closer to 16 bytes for L(top) + jz L(aligned) + + C both src and dst unaligned, process one limb to align them + movl 12(%eax,%ecx,4), %ebx + movl %ebx, 12(%edx,%ecx,4) + decl %ecx +L(aligned): + + + ALIGN(16) +L(top): + C eax src + C ebx + C ecx counter, limbs + C edx dst + + movq 8(%eax,%ecx,4), %mm0 + movq (%eax,%ecx,4), %mm1 + subl $4, %ecx + movq %mm0, 16+8(%edx,%ecx,4) + movq %mm1, 16(%edx,%ecx,4) + jns L(top) + + + C now %ecx is -4 to -1 representing respectively 0 to 3 limbs remaining + + testb $2, %cl + jz L(finish_not_two) + + movq 8(%eax,%ecx,4), %mm0 + movq %mm0, 8(%edx,%ecx,4) +L(finish_not_two): + + testb $1, %cl + jz L(done) + + movl (%eax), %ebx + movl %ebx, (%edx) + +L(done): + movl SAVE_EBX, %ebx + emms + ret + + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/k7/mmx/copyi.asm b/ghc/rts/gmp/mpn/x86/k7/mmx/copyi.asm new file mode 100644 index 0000000..b234a16 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k7/mmx/copyi.asm @@ -0,0 +1,147 @@ +dnl AMD K7 mpn_copyi -- copy limb vector, incrementing. 
+dnl +dnl alignment dst/src, A=0mod8 N=4mod8 +dnl A/A A/N N/A N/N +dnl K7 0.75 1.0 1.0 0.75 + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Copy src,size to dst,size. +C +C This code at 0.75 or 1.0 c/l is always faster than a plain rep movsl at +C 1.33 c/l. +C +C The K7 can do two loads, or two stores, or a load and a store, in one +C cycle, so if those are 64-bit operations then 0.5 c/l should be possible, +C however nothing under 0.7 c/l is known. +C +C If both source and destination are unaligned then one limb is processed at +C the start to make them aligned and so get 0.75 c/l, whereas if they'd been +C used unaligned it would be 1.5 c/l. 
+ +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl parameter space reused +define(SAVE_EBX,`PARAM_SIZE') + +dnl minimum 5 since the unrolled code can't handle less than 5 +deflit(UNROLL_THRESHOLD, 5) + + .text + ALIGN(32) +PROLOGUE(mpn_copyi) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl %ebx, SAVE_EBX + + movl PARAM_SRC, %eax + movl PARAM_DST, %edx + + cmpl $UNROLL_THRESHOLD, %ecx + jae L(unroll) + + orl %ecx, %ecx + jz L(simple_done) + +L(simple): + C eax src, incrementing + C ebx scratch + C ecx counter + C edx dst, incrementing + C + C this loop is 2 cycles/limb + + movl (%eax), %ebx + movl %ebx, (%edx) + decl %ecx + leal 4(%eax), %eax + leal 4(%edx), %edx + jnz L(simple) + +L(simple_done): + movl SAVE_EBX, %ebx + ret + + +L(unroll): + movl %eax, %ebx + leal -12(%eax,%ecx,4), %eax C src end - 12 + subl $3, %ecx C size-3 + + andl %edx, %ebx + leal (%edx,%ecx,4), %edx C dst end - 12 + negl %ecx + + testl $4, %ebx C testl to pad code closer to 16 bytes for L(top) + jz L(aligned) + + C both src and dst unaligned, process one limb to align them + movl (%eax,%ecx,4), %ebx + movl %ebx, (%edx,%ecx,4) + incl %ecx +L(aligned): + + + ALIGN(16) +L(top): + C eax src end - 12 + C ebx + C ecx counter, negative, limbs + C edx dst end - 12 + + movq (%eax,%ecx,4), %mm0 + movq 8(%eax,%ecx,4), %mm1 + addl $4, %ecx + movq %mm0, -16(%edx,%ecx,4) + movq %mm1, -16+8(%edx,%ecx,4) + ja L(top) C jump no carry and not zero + + + C now %ecx is 0 to 3 representing respectively 3 to 0 limbs remaining + + testb $2, %cl + jnz L(finish_not_two) + + movq (%eax,%ecx,4), %mm0 + movq %mm0, (%edx,%ecx,4) +L(finish_not_two): + + testb $1, %cl + jnz L(done) + + movl 8(%eax), %ebx + movl %ebx, 8(%edx) + +L(done): + movl SAVE_EBX, %ebx + emms + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/k7/mmx/divrem_1.asm b/ghc/rts/gmp/mpn/x86/k7/mmx/divrem_1.asm new file mode 100644 index 0000000..483ad6a --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k7/mmx/divrem_1.asm @@ -0,0 
+1,718 @@ +dnl AMD K7 mpn_divrem_1 -- mpn by limb division. +dnl +dnl K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t carry); +C +C The method and nomenclature follow part 8 of "Division by Invariant +C Integers using Multiplication" by Granlund and Montgomery, reference in +C gmp.texi. +C +C The "and"s shown in the paper are done here with "cmov"s. "m" is written +C for m', and "d" for d_norm, which won't cause any confusion since it's +C only the normalized divisor that's of any use in the code. "b" is written +C for 2^N, the size of a limb, N being 32 here. +C +C mpn_divrem_1 avoids one division if the src high limb is less than the +C divisor. mpn_divrem_1c doesn't check for a zero carry, since in normal +C circumstances that will be a very rare event. 
+C +C There's a small bias towards expecting xsize==0, by having code for +C xsize==0 in a straight line and xsize!=0 under forward jumps. + + +dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by +dnl inverse method is used, rather than plain "divl"s. Minimum value 1. +dnl +dnl The inverse takes about 50 cycles to calculate, but after that the +dnl multiply is 17 c/l versus division at 42 c/l. +dnl +dnl At 3 limbs the mul is a touch faster than div on the integer part, and +dnl even more so on the fractional part. + +deflit(MUL_THRESHOLD, 3) + + +defframe(PARAM_CARRY, 24) +defframe(PARAM_DIVISOR,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC, 12) +defframe(PARAM_XSIZE, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) + +defframe(VAR_NORM, -20) +defframe(VAR_INVERSE, -24) +defframe(VAR_SRC, -28) +defframe(VAR_DST, -32) +defframe(VAR_DST_STOP,-36) + +deflit(STACK_SPACE, 36) + + .text + ALIGN(32) + +PROLOGUE(mpn_divrem_1c) +deflit(`FRAME',0) + movl PARAM_CARRY, %edx + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %ebx, SAVE_EBX + movl PARAM_XSIZE, %ebx + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + leal -4(%edi,%ebx,4), %edi + jmp LF(mpn_divrem_1,start_1c) + +EPILOGUE() + + + C offset 0x31, close enough to aligned +PROLOGUE(mpn_divrem_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl $0, %edx C initial carry (if can't skip a div) + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %ebx, SAVE_EBX + movl PARAM_XSIZE, %ebx + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + orl %ecx, %ecx + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + leal -4(%edi,%ebx,4), %edi C &dst[xsize-1] + + jz L(no_skip_div) + movl -4(%esi,%ecx,4), %eax C src high limb + + cmpl %ebp, %eax C one 
less div if high=MUL_THRESHOLD, so with size==0 then + C must have xsize!=0 + jmp L(fraction_some) + + + +C ----------------------------------------------------------------------------- +C +C The multiply by inverse loop is 17 cycles, and relies on some out-of-order +C execution. The instruction scheduling is important, with various +C apparently equivalent forms running 1 to 5 cycles slower. +C +C A lower bound for the time would seem to be 16 cycles, based on the +C following successive dependencies. +C +C cycles +C n2+n1 1 +C mul 6 +C q1+1 1 +C mul 6 +C sub 1 +C addback 1 +C --- +C 16 +C +C This chain is what the loop has already, but 16 cycles isn't achieved. +C K7 has enough decode, and probably enough execute (depending maybe on what +C a mul actually consumes), but nothing running under 17 has been found. +C +C In theory n2+n1 could be done in the sub and addback stages (by +C calculating both n2 and n2+n1 there), but lack of registers makes this an +C unlikely proposition. +C +C The jz in the loop keeps the q1+1 stage to 1 cycle. Handling an overflow +C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent +C chain, and nothing better than 18 cycles has been found when using it. +C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will +C be an extremely rare event. +C +C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but +C if some special data is coming out with this always, the q1_ff special +C case actually runs at 15 c/l. 0x2FFF...FFFD divided by 3 is a good way to +C induce the q1_ff case, for speed measurements or testing. Note that +C 0xFFF...FFF divided by 1 or 2 doesn't induce it. +C +C The instruction groupings and empty comments show the cycles for a naive +C in-order view of the code (conveniently ignoring the load latency on +C VAR_INVERSE). This shows some of where the time is going, but is nonsense +C to the extent that out-of-order execution rearranges it.
In this case +C there's 19 cycles shown, but it executes at 17. + + ALIGN(16) +L(integer_top): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (src, dst) + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 scratch (src qword) + C mm7 rshift for normalization + + cmpl $0x80000000, %esi C n1 as 0=c, 1=nc + movl %edi, %eax C n2 + movl VAR_SRC, %ecx + + leal (%ebp,%esi), %ebx + cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow + sbbl $-1, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movq (%ecx), %mm0 C next limb and the one below it + subl $4, %ecx + + movl %ecx, VAR_SRC + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + C + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + jz L(q1_ff) + movl VAR_DST, %ecx + + mull %ebx C (q1+1)*d + + psrlq %mm7, %mm0 + + leal -4(%ecx), %ecx + + C + + subl %eax, %esi + movl VAR_DST_STOP, %eax + + C + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + movd %mm0, %esi + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + sbbl $0, %ebx C q + cmpl %eax, %ecx + + movl %ebx, (%ecx) + movl %ecx, VAR_DST + jne L(integer_top) + + +L(integer_loop_done): + + +C ----------------------------------------------------------------------------- +C +C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz +C q1_ff special case. This makes the code a bit smaller and simpler, and +C costs only 1 cycle (each).
+ +L(integer_two_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (src, dst) + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 src limb, shifted + C mm7 rshift + + cmpl $0x80000000, %esi C n1 as 0=c, 1=nc + movl %edi, %eax C n2 + movl PARAM_SRC, %ecx + + leal (%ebp,%esi), %ebx + cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow + sbbl $-1, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movd (%ecx), %mm0 C src low limb + + movl VAR_DST_STOP, %ecx + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + + mull %ebx C (q1+1)*d + + psllq $32, %mm0 + + psrlq %mm7, %mm0 + + C + + subl %eax, %esi + + C + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + movd %mm0, %esi + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + sbbl $0, %ebx C q + + movl %ebx, -4(%ecx) + + +C ----------------------------------------------------------------------------- +L(integer_one_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx dst + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 src limb, shifted + C mm7 rshift + + movl VAR_DST_STOP, %ecx + cmpl $0x80000000, %esi C n1 as 0=c, 1=nc + movl %edi, %eax C n2 + + leal (%ebp,%esi), %ebx + cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow + sbbl $-1, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + C + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + C + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx C q1 if q1+1 overflowed + + mull %ebx + + C + + C + + C + + subl %eax, %esi + + C + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if 
underflow from using q1+1 + sbbl $0, %ebx C q + + movl %ebx, -8(%ecx) + subl $8, %ecx + + + +L(integer_none): + cmpl $0, PARAM_XSIZE + jne L(fraction_some) + + movl %edi, %eax +L(fraction_done): + movl VAR_NORM, %ecx + movl SAVE_EBP, %ebp + + movl SAVE_EDI, %edi + movl SAVE_ESI, %esi + + movl SAVE_EBX, %ebx + addl $STACK_SPACE, %esp + + shrl %cl, %eax + emms + + ret + + +C ----------------------------------------------------------------------------- +C +C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword +C of q*d is simply -d and the remainder n-q*d = n10+d + +L(q1_ff): + C eax (divisor) + C ebx (q1+1 == 0) + C ecx + C edx + C esi n10 + C edi n2 + C ebp divisor + + movl VAR_DST, %ecx + movl VAR_DST_STOP, %edx + subl $4, %ecx + + psrlq %mm7, %mm0 + leal (%ebp,%esi), %edi C n-q*d remainder -> next n2 + movl %ecx, VAR_DST + + movd %mm0, %esi C next n10 + + movl $-1, (%ecx) + cmpl %ecx, %edx + jne L(integer_top) + + jmp L(integer_loop_done) + + + +C ----------------------------------------------------------------------------- +C +C Being the fractional part, the "source" limbs are all zero, meaning +C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated. +C +C The loop runs at 15 cycles. The dependent chain is the same as the +C general case above, but without the n2+n1 stage (due to n1==0), so 15 +C would seem to be the lower bound. +C +C A not entirely obvious simplification is that q1+1 never overflows a limb, +C and so there's no need for the sbbl $0 or jz q1_ff from the general case. +C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always. +C rnd() means rounding down to a multiple of d. 
+C +C m*n2 + b*n2 <= m*(d-1) + b*(d-1) +C = m*d + b*d - m - b +C = floor((b(b-d)-1)/d)*d + b*d - m - b +C = rnd(b(b-d)-1) + b*d - m - b +C = rnd(b(b-d)-1 + b*d) - m - b +C = rnd(b*b-1) - m - b +C <= (b-2)*b +C +C Unchanged from the general case is that the final quotient limb q can be +C either q1 or q1+1, and the q1+1 case occurs often. This can be seen from +C equation 8.4 of the paper which simplifies as follows when n1==0 and +C n0==0. +C +C n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b +C +C As before, the instruction groupings and empty comments show a naive +C in-order view of the code, which is made a nonsense by out of order +C execution. There's 17 cycles shown, but it executes at 15. +C +C Rotating the store q and remainder->n2 instructions up to the top of the +C loop gets the run time down from 16 to 15. + + ALIGN(16) +L(fraction_some): + C eax + C ebx + C ecx + C edx + C esi + C edi carry + C ebp divisor + + movl PARAM_DST, %esi + movl VAR_DST_STOP, %ecx + movl %edi, %eax + + subl $8, %ecx + + jmp L(fraction_entry) + + + ALIGN(16) +L(fraction_top): + C eax n2 carry, then scratch + C ebx scratch (nadj, q1) + C ecx dst, decrementing + C edx scratch + C esi dst stop point + C edi (will be n2) + C ebp divisor + + movl %ebx, (%ecx) C previous q + movl %eax, %edi C remainder->n2 + +L(fraction_entry): + mull VAR_INVERSE C m*n2 + + movl %ebp, %eax C d + subl $4, %ecx C dst + leal 1(%edi), %ebx + + C + + C + + C + + C + + addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1 + + mull %ebx C (q1+1)*d + + C + + C + + C + + negl %eax C low of n - (q1+1)*d + + C + + sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry + leal (%ebp,%eax), %edx + + cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1 + sbbl $0, %ebx C q + cmpl %esi, %ecx + + jne L(fraction_top) + + + movl %ebx, (%ecx) + jmp L(fraction_done) + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/k7/mmx/lshift.asm b/ghc/rts/gmp/mpn/x86/k7/mmx/lshift.asm new file mode 100644 index 0000000..4d17c88 --- 
/dev/null +++ b/ghc/rts/gmp/mpn/x86/k7/mmx/lshift.asm @@ -0,0 +1,472 @@ +dnl AMD K7 mpn_lshift -- mpn left shift. +dnl +dnl K7: 1.21 cycles/limb (at 16 limbs/loop). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl K7: UNROLL_COUNT cycles/limb +dnl 4 1.51 +dnl 8 1.26 +dnl 16 1.21 +dnl 32 1.2 +dnl Maximum possible with the current code is 64. + +deflit(UNROLL_COUNT, 16) + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C Shift src,size left by shift many bits and store the result in dst,size. +C Zeros are shifted in at the right. The bits shifted out at the left are +C the return value. +C +C The comments in mpn_rshift apply here too. 
+ +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 10) +',` +deflit(UNROLL_THRESHOLD, 10) +') + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EDI, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EBX, -12) +deflit(SAVE_SIZE, 12) + + .text + ALIGN(32) + +PROLOGUE(mpn_lshift) +deflit(`FRAME',0) + + movl PARAM_SIZE, %eax + movl PARAM_SRC, %edx + subl $SAVE_SIZE, %esp +deflit(`FRAME',SAVE_SIZE) + + movl PARAM_SHIFT, %ecx + movl %edi, SAVE_EDI + + movl PARAM_DST, %edi + decl %eax + jnz L(more_than_one_limb) + + movl (%edx), %edx + + shldl( %cl, %edx, %eax) C eax was decremented to zero + + shll %cl, %edx + + movl %edx, (%edi) + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + + ret + + +C ----------------------------------------------------------------------------- +L(more_than_one_limb): + C eax size-1 + C ebx + C ecx shift + C edx src + C esi + C edi dst + C ebp + + movd PARAM_SHIFT, %mm6 + movd (%edx,%eax,4), %mm5 C src high limb + cmp $UNROLL_THRESHOLD-1, %eax + + jae L(unroll) + negl %ecx + movd (%edx), %mm4 C src low limb + + addl $32, %ecx + + movd %ecx, %mm7 + +L(simple_top): + C eax loop counter, limbs + C ebx + C ecx + C edx src + C esi + C edi dst + C ebp + C + C mm0 scratch + C mm4 src low limb + C mm5 src high limb + C mm6 shift + C mm7 32-shift + + movq -4(%edx,%eax,4), %mm0 + decl %eax + + psrlq %mm7, %mm0 + + movd %mm0, 4(%edi,%eax,4) + jnz L(simple_top) + + + psllq %mm6, %mm5 + psllq %mm6, %mm4 + + psrlq $32, %mm5 + movd %mm4, (%edi) C dst low limb + + movd %mm5, %eax C return value + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll): + C eax size-1 + C ebx (saved) + C ecx shift + C edx src + C esi + C edi dst + C ebp + C + C mm5 src high limb, for return value + C mm6 lshift + + movl %esi, SAVE_ESI + movl %ebx, SAVE_EBX + leal -4(%edx,%eax,4), %edx C &src[size-2] + + testb $4, %dl + 
movq (%edx), %mm1 C src high qword + + jz L(start_src_aligned) + + + C src isn't aligned, process high limb (marked xxx) separately to + C make it so + C + C source -4(edx,%eax,4) + C | + C +-------+-------+-------+-- + C | xxx | + C +-------+-------+-------+-- + C 0mod8 4mod8 0mod8 + C + C dest -4(edi,%eax,4) + C | + C +-------+-------+-- + C | xxx | | + C +-------+-------+-- + + psllq %mm6, %mm1 + subl $4, %edx + movl %eax, PARAM_SIZE C size-1 + + psrlq $32, %mm1 + decl %eax C size-2 is new size-1 + + movd %mm1, 4(%edi,%eax,4) + movq (%edx), %mm1 C new src high qword +L(start_src_aligned): + + + leal -4(%edi,%eax,4), %edi C &dst[size-2] + psllq %mm6, %mm5 + + testl $4, %edi + psrlq $32, %mm5 C return value + + jz L(start_dst_aligned) + + + C dst isn't aligned, subtract 4 bytes to make it so, and pretend the + C shift is 32 bits extra. High limb of dst (marked xxx) handled + C here separately. + C + C source %edx + C +-------+-------+-- + C | mm1 | + C +-------+-------+-- + C 0mod8 4mod8 + C + C dest %edi + C +-------+-------+-------+-- + C | xxx | + C +-------+-------+-------+-- + C 0mod8 4mod8 0mod8 + + movq %mm1, %mm0 + psllq %mm6, %mm1 + addl $32, %ecx C shift+32 + + psrlq $32, %mm1 + + movd %mm1, 4(%edi) + movq %mm0, %mm1 + subl $4, %edi + + movd %ecx, %mm6 C new lshift +L(start_dst_aligned): + + decl %eax C size-2, two last limbs handled at end + movq %mm1, %mm2 C copy of src high qword + negl %ecx + + andl $-2, %eax C round size down to even + addl $64, %ecx + + movl %eax, %ebx + negl %eax + + andl $UNROLL_MASK, %eax + decl %ebx + + shll %eax + + movd %ecx, %mm7 C rshift = 64-lshift + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(entry) (%eax,%eax,4), %esi +') + shrl $UNROLL_LOG2, %ebx C loop counter + + leal ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx + leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi + movl PARAM_SIZE, %eax C for use at end + jmp *%esi + + +ifdef(`PIC',` +L(pic_calc): + C See README.family about old gas bugs + leal 
(%eax,%eax,4), %esi + addl $L(entry)-L(here), %esi + addl (%esp), %esi + + ret +') + + +C ----------------------------------------------------------------------------- + ALIGN(32) +L(top): + C eax size (for use at end) + C ebx loop counter + C ecx rshift + C edx src + C esi computed jump + C edi dst + C ebp + C + C mm0 scratch + C mm1 \ carry (alternating, mm2 first) + C mm2 / + C mm6 lshift + C mm7 rshift + C + C 10 code bytes/limb + C + C The two chunks differ in whether mm1 or mm2 hold the carry. + C The computed jump puts the initial carry in both mm1 and mm2. + +L(entry): +deflit(CHUNK_COUNT, 4) +forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 - 8)) + + movq disp0(%edx), %mm0 + psllq %mm6, %mm2 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + por %mm2, %mm0 + movq %mm0, disp0(%edi) + + + movq disp1(%edx), %mm0 + psllq %mm6, %mm1 + + movq %mm0, %mm2 + psrlq %mm7, %mm0 + + por %mm1, %mm0 + movq %mm0, disp1(%edi) +') + + subl $UNROLL_BYTES, %edx + subl $UNROLL_BYTES, %edi + decl %ebx + + jns L(top) + + + +define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))') + +L(end): + testb $1, %al + movl SAVE_EBX, %ebx + psllq %mm6, %mm2 C wanted left shifted in all cases below + + movd %mm5, %eax + + movl SAVE_ESI, %esi + jz L(end_even) + + +L(end_odd): + + C Size odd, destination was aligned. + C + C source edx+8 edx+4 + C --+---------------+-------+ + C | mm2 | | + C --+---------------+-------+ + C + C dest edi + C --+---------------+---------------+-------+ + C | written | | | + C --+---------------+---------------+-------+ + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C Size odd, destination was unaligned. 
+ C + C source edx+8 edx+4 + C --+---------------+-------+ + C | mm2 | | + C --+---------------+-------+ + C + C dest edi + C --+---------------+---------------+ + C | written | | + C --+---------------+---------------+ + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C In both cases there's one extra limb of src to fetch and combine + C with mm2 to make a qword at (%edi), and in the aligned case + C there's an extra limb of dst to be formed from that extra src limb + C left shifted. + + movd disp(4) (%edx), %mm0 + testb $32, %cl + + movq %mm0, %mm1 + psllq $32, %mm0 + + psrlq %mm7, %mm0 + psllq %mm6, %mm1 + + por %mm2, %mm0 + + movq %mm0, disp(0) (%edi) + jz L(end_odd_unaligned) + movd %mm1, disp(-4) (%edi) +L(end_odd_unaligned): + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + + +L(end_even): + + C Size even, destination was aligned. + C + C source edx+8 + C --+---------------+ + C | mm2 | + C --+---------------+ + C + C dest edi + C --+---------------+---------------+ + C | written | | + C --+---------------+---------------+ + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C Size even, destination was unaligned. + C + C source edx+8 + C --+---------------+ + C | mm2 | + C --+---------------+ + C + C dest edi+4 + C --+---------------+-------+ + C | written | | + C --+---------------+-------+ + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C The movq for the aligned case overwrites the movd for the + C unaligned case. + + movq %mm2, %mm0 + psrlq $32, %mm2 + + testb $32, %cl + movd %mm2, disp(4) (%edi) + + jz L(end_even_unaligned) + movq %mm0, disp(0) (%edi) +L(end_even_unaligned): + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/k7/mmx/mod_1.asm b/ghc/rts/gmp/mpn/x86/k7/mmx/mod_1.asm new file mode 100644 index 0000000..545ca56 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k7/mmx/mod_1.asm @@ -0,0 +1,457 @@ +dnl AMD K7 mpn_mod_1 -- mpn by limb remainder. 
+dnl +dnl K7: 17.0 cycles/limb. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor); +C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C mp_limb_t carry); +C +C The code here is the same as mpn_divrem_1, but with the quotient +C discarded. See mpn/x86/k7/mmx/divrem_1.asm for some comments. + + +dnl MUL_THRESHOLD is the size at which the multiply by inverse method is +dnl used, rather than plain "divl"s. Minimum value 2. +dnl +dnl The inverse takes about 50 cycles to calculate, but after that the +dnl multiply is 17 c/l versus division at 41 c/l. +dnl +dnl Using mul or div is about the same speed at 3 limbs, so the threshold +dnl is set to 4 to get the smaller div code used at 3.
+ +deflit(MUL_THRESHOLD, 4) + + +defframe(PARAM_CARRY, 16) +defframe(PARAM_DIVISOR,12) +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) + +defframe(VAR_NORM, -20) +defframe(VAR_INVERSE, -24) +defframe(VAR_SRC_STOP,-28) + +deflit(STACK_SPACE, 28) + + .text + ALIGN(32) + +PROLOGUE(mpn_mod_1c) +deflit(`FRAME',0) + movl PARAM_CARRY, %edx + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + jmp LF(mpn_mod_1,start_1c) + +EPILOGUE() + + + ALIGN(32) +PROLOGUE(mpn_mod_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl $0, %edx C initial carry (if can't skip a div) + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + orl %ecx, %ecx + jz L(divide_done) + + movl -4(%esi,%ecx,4), %eax C src high limb + + cmpl %ebp, %eax C carry flag if high n2 + leal (%ebp,%esi), %edx + + movd %mm0, %esi + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + cmpl %eax, %ecx + jne L(inverse_top) + + +L(inverse_loop_done): + + +C ----------------------------------------------------------------------------- + +L(inverse_two_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx &src[-1] + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 scratch (src dword) + C mm7 rshift + + cmpl $0x80000000, %esi C n1 as 0=c, 1=nc + movl %edi, %eax C n2 + + leal (%ebp,%esi), %ebx + cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow + sbbl $-1, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movd 4(%ecx), %mm0 C src low limb + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + 
+ mull %ebx C (q1+1)*d + + psllq $32, %mm0 + + psrlq %mm7, %mm0 + + C + + subl %eax, %esi + + C + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + movd %mm0, %esi + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + + +C One limb left + + C eax scratch + C ebx scratch (nadj, q1) + C ecx + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 src limb, shifted + C mm7 rshift + + cmpl $0x80000000, %esi C n1 as 0=c, 1=nc + movl %edi, %eax C n2 + + leal (%ebp,%esi), %ebx + cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow + sbbl $-1, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movl VAR_NORM, %ecx C for final denorm + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + C + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + + mull %ebx C (q1+1)*d + + movl SAVE_EBX, %ebx + + C + + C + + subl %eax, %esi + + movl %esi, %eax C remainder + movl SAVE_ESI, %esi + + sbbl %edx, %edi C n - (q1+1)*d + leal (%ebp,%eax), %edx + movl SAVE_EBP, %ebp + + cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1 + movl SAVE_EDI, %edi + + shrl %cl, %eax C denorm remainder + addl $STACK_SPACE, %esp + emms + + ret + + +C ----------------------------------------------------------------------------- +C +C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword +C of q*d is simply -d and the remainder n-q*d = n10+d + +L(q1_ff): + C eax (divisor) + C ebx (q1+1 == 0) + C ecx src pointer + C edx + C esi n10 + C edi (n2) + C ebp divisor + + movl VAR_SRC_STOP, %edx + leal (%ebp,%esi), %edi C n-q*d remainder -> next n2 + psrlq %mm7, %mm0 + + movd %mm0, %esi C next n10 + + cmpl %ecx, %edx + jne L(inverse_top) + jmp L(inverse_loop_done) + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/k7/mmx/popham.asm b/ghc/rts/gmp/mpn/x86/k7/mmx/popham.asm new file mode 100644 index 
0000000..68e049a --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k7/mmx/popham.asm @@ -0,0 +1,239 @@ +dnl AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming +dnl distance. +dnl +dnl K7: popcount 5.0 cycles/limb, hamdist 6.0 cycles/limb + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl Only recent versions of gas know psadbw, in particular gas 2.9.1 on +dnl FreeBSD 3.3 and 3.4 doesn't recognise it. 
+ +define(psadbw_mm4_mm0, +`ifelse(m4_ifdef_anyof_p(`HAVE_TARGET_CPU_athlon', + `HAVE_TARGET_CPU_pentium3'),1, + `.byte 0x0f,0xf6,0xc4 C psadbw %mm4, %mm0', + +`m4_warning(`warning, using simulated and only partly functional psadbw, use for testing only +') C this works enough for the sum of bytes done below, making it + C possible to test on an older cpu + leal -8(%esp), %esp + movq %mm4, (%esp) + movq %mm0, %mm4 +forloop(i,1,7, +` psrlq $ 8, %mm4 + paddb %mm4, %mm0 +') + pushl $ 0 + pushl $ 0xFF + pand (%esp), %mm0 + movq 8(%esp), %mm4 + leal 16(%esp), %esp +')') + + +C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size); +C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size); +C +C The code here is almost certainly not optimal, but is already a 3x speedup +C over the generic C code. The main improvement would be to interleave +C processing of two qwords in the loop so as to fully exploit the available +C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs). +C +C The loop is based on the example "Efficient 64-bit population count using +C MMX instructions" in the Athlon Optimization Guide, AMD document 22007, +C page 158 of rev E (reference in mpn/x86/k7/README). 
+ +ifdef(`OPERATION_popcount',, +`ifdef(`OPERATION_hamdist',, +`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined +')')') + +define(HAM, +m4_assert_numargs(1) +`ifdef(`OPERATION_hamdist',`$1')') + +define(POP, +m4_assert_numargs(1) +`ifdef(`OPERATION_popcount',`$1')') + +HAM(` +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC2, 8) +defframe(PARAM_SRC, 4) +define(M4_function,mpn_hamdist) +') +POP(` +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) +define(M4_function,mpn_popcount) +') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) + + +ifdef(`PIC',,` + dnl non-PIC + + .section .rodata + ALIGN(8) + +define(LS, +m4_assert_numargs(1) +`LF(M4_function,`$1')') + +LS(rodata_AAAAAAAAAAAAAAAA): + .long 0xAAAAAAAA + .long 0xAAAAAAAA + +LS(rodata_3333333333333333): + .long 0x33333333 + .long 0x33333333 + +LS(rodata_0F0F0F0F0F0F0F0F): + .long 0x0F0F0F0F + .long 0x0F0F0F0F +') + + .text + ALIGN(32) + +PROLOGUE(M4_function) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + orl %ecx, %ecx + jz L(zero) + +ifdef(`PIC',` + movl $0xAAAAAAAA, %eax + movl $0x33333333, %edx + + movd %eax, %mm7 + movd %edx, %mm6 + + movl $0x0F0F0F0F, %eax + + punpckldq %mm7, %mm7 + punpckldq %mm6, %mm6 + + movd %eax, %mm5 + movd %edx, %mm4 C value unused, mm4 is zeroed by the pxor below + + punpckldq %mm5, %mm5 + +',` + movq LS(rodata_AAAAAAAAAAAAAAAA), %mm7 + movq LS(rodata_3333333333333333), %mm6 + movq LS(rodata_0F0F0F0F0F0F0F0F), %mm5 +') + pxor %mm4, %mm4 + +define(REG_AAAAAAAAAAAAAAAA,%mm7) +define(REG_3333333333333333,%mm6) +define(REG_0F0F0F0F0F0F0F0F,%mm5) +define(REG_0000000000000000,%mm4) + + + movl PARAM_SRC, %eax +HAM(` movl PARAM_SRC2, %edx') + + pxor %mm2, %mm2 C total + + shrl %ecx C count of whole qwords, carry set if size is odd + jnc L(top) + + movd (%eax,%ecx,8), %mm1 C odd size, the high limb is processed first + +HAM(` movd 0(%edx,%ecx,8), %mm0 + pxor %mm0, %mm1 +') + orl %ecx, %ecx C set flags for the jnz at the end of the loop + jmp L(loaded) + + + ALIGN(16) +L(top): + C eax src + C ebx + C ecx counter, qwords, decrementing + C edx [hamdist] src2 + C + C mm0 (scratch) + C mm1 (scratch) + C mm2 total (low dword) + C mm3 + C mm4 \ + C mm5 | special constants + C 
mm6 | + C mm7 / + + movq -8(%eax,%ecx,8), %mm1 + +HAM(` pxor -8(%edx,%ecx,8), %mm1') + decl %ecx C flags from this are tested by the jnz below + +L(loaded): + movq %mm1, %mm0 + pand REG_AAAAAAAAAAAAAAAA, %mm1 + + psrlq $1, %mm1 + + psubd %mm1, %mm0 C bit pairs + + + movq %mm0, %mm1 + psrlq $2, %mm0 + + pand REG_3333333333333333, %mm0 + pand REG_3333333333333333, %mm1 + + paddd %mm1, %mm0 C nibbles + + + movq %mm0, %mm1 + psrlq $4, %mm0 + + pand REG_0F0F0F0F0F0F0F0F, %mm0 + pand REG_0F0F0F0F0F0F0F0F, %mm1 + + paddd %mm1, %mm0 C bytes + + + psadbw_mm4_mm0 + + paddd %mm0, %mm2 C add to total + jnz L(top) C flags still from the decl or orl of ecx above + + + movd %mm2, %eax + emms + ret + + +L(zero): + movl $0, %eax C size zero, no MMX register touched yet so no emms + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/k7/mmx/rshift.asm b/ghc/rts/gmp/mpn/x86/k7/mmx/rshift.asm new file mode 100644 index 0000000..abb546c --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k7/mmx/rshift.asm @@ -0,0 +1,471 @@ +dnl AMD K7 mpn_rshift -- mpn right shift. +dnl +dnl K7: 1.21 cycles/limb (at 16 limbs/loop). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
+ + +include(`../config.m4') + + +dnl K7: UNROLL_COUNT cycles/limb +dnl 4 1.51 +dnl 8 1.26 +dnl 16 1.21 +dnl 32 1.2 +dnl Maximum possible with the current code is 64. + +deflit(UNROLL_COUNT, 16) + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C Shift src,size right by shift many bits and store the result in dst,size. +C Zeros are shifted in at the left. The bits shifted out at the right are +C the return value. +C +C This code uses 64-bit MMX operations, which makes it possible to handle +C two limbs at a time, for a theoretical 1.0 cycles/limb. Plain integer +C code, on the other hand, suffers from shrd being a vector path decode and +C running at 3 cycles back-to-back. +C +C Full speed depends on source and destination being aligned, and some hairy +C setups and finish-ups are done to arrange this for the loop. + +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 10) +',` +deflit(UNROLL_THRESHOLD, 10) +') + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EDI, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EBX, -12) +deflit(SAVE_SIZE, 12) + + .text + ALIGN(32) + +PROLOGUE(mpn_rshift) +deflit(`FRAME',0) + + movl PARAM_SIZE, %eax + movl PARAM_SRC, %edx + subl $SAVE_SIZE, %esp +deflit(`FRAME',SAVE_SIZE) + + movl PARAM_SHIFT, %ecx + movl %edi, SAVE_EDI + + movl PARAM_DST, %edi + decl %eax + jnz L(more_than_one_limb) + + movl (%edx), %edx C src limb + + shrdl( %cl, %edx, %eax) C eax was decremented to zero + + shrl %cl, %edx + + movl %edx, (%edi) C dst limb + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + + ret C return value in eax, no MMX used on this path so no emms + + +C ----------------------------------------------------------------------------- +L(more_than_one_limb): + C eax size-1 + C ebx + C ecx shift + C edx src + C esi + C edi dst + C ebp + + movd PARAM_SHIFT, %mm6 C rshift + movd (%edx), %mm5 C src low limb + cmp $UNROLL_THRESHOLD-1, %eax + + jae L(unroll) + leal (%edx,%eax,4), %edx C &src[size-1] + leal 
-4(%edi,%eax,4), %edi C &dst[size-2] + + movd (%edx), %mm4 C src high limb + negl %eax + + +L(simple_top): + C eax loop counter, limbs, negative + C ebx + C ecx shift + C edx &src[size-1] + C esi + C edi &dst[size-2] + C ebp + C + C mm0 scratch + C mm4 src high limb + C mm5 src low limb + C mm6 shift + + movq (%edx,%eax,4), %mm0 + incl %eax C sets flags for the jnz below + + psrlq %mm6, %mm0 + + movd %mm0, (%edi,%eax,4) + jnz L(simple_top) + + + psllq $32, %mm5 C low limb moved up to the high dword + psrlq %mm6, %mm4 + + psrlq %mm6, %mm5 C bits shifted out land in the low dword + movd %mm4, 4(%edi) C dst high limb + + movd %mm5, %eax C return value + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll): + C eax size-1 + C ebx + C ecx shift + C edx src + C esi + C edi dst + C ebp + C + C mm5 src low limb + C mm6 rshift + + testb $4, %dl + movl %esi, SAVE_ESI + movl %ebx, SAVE_EBX + + psllq $32, %mm5 + jz L(start_src_aligned) + + + C src isn't aligned, process low limb separately (marked xxx) and + C step src and dst by one limb, making src aligned. + C + C source edx + C --+-------+-------+-------+ + C | xxx | + C --+-------+-------+-------+ + C 4mod8 0mod8 4mod8 + C + C dest edi + C --+-------+-------+ + C | | xxx | + C --+-------+-------+ + + movq (%edx), %mm0 C src low two limbs + addl $4, %edx + movl %eax, PARAM_SIZE C size-1 + + addl $4, %edi + decl %eax C size-2 is new size-1 + + psrlq %mm6, %mm0 + movl %edi, PARAM_DST C new dst + + movd %mm0, -4(%edi) +L(start_src_aligned): + + + movq (%edx), %mm1 C src low two limbs + decl %eax C size-2, two last limbs handled at end + testl $4, %edi + + psrlq %mm6, %mm5 + jz L(start_dst_aligned) + + + C dst isn't aligned, add 4 to make it so, and pretend the shift is + C 32 bits extra. Low limb of dst (marked xxx) handled here separately. 
+ C + C source edx + C --+-------+-------+ + C | mm1 | + C --+-------+-------+ + C 4mod8 0mod8 + C + C dest edi + C --+-------+-------+-------+ + C | xxx | + C --+-------+-------+-------+ + C 4mod8 0mod8 4mod8 + + movq %mm1, %mm0 + psrlq %mm6, %mm1 + addl $32, %ecx C shift+32 + + movd %mm1, (%edi) + movq %mm0, %mm1 + addl $4, %edi C new dst + + movd %ecx, %mm6 +L(start_dst_aligned): + + + movq %mm1, %mm2 C copy of src low two limbs + negl %ecx + andl $-2, %eax C round size down to even + + movl %eax, %ebx + negl %eax + addl $64, %ecx + + andl $UNROLL_MASK, %eax + decl %ebx + + shll %eax C doubled here, times 5 in the leal gives 10 code bytes per limb + + movd %ecx, %mm7 C lshift = 64-rshift + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(entry) (%eax,%eax,4), %esi + negl %eax +') + shrl $UNROLL_LOG2, %ebx C loop counter + + leal ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx + leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi + movl PARAM_SIZE, %eax C for use at end + + jmp *%esi + + +ifdef(`PIC',` +L(pic_calc): + C See README.family about old gas bugs + leal (%eax,%eax,4), %esi + addl $L(entry)-L(here), %esi + addl (%esp), %esi + negl %eax + + ret +') + + +C ----------------------------------------------------------------------------- + ALIGN(64) +L(top): + C eax size, for use at end + C ebx loop counter + C ecx lshift + C edx src + C esi was computed jump + C edi dst + C ebp + C + C mm0 scratch + C mm1 \ carry (alternating) + C mm2 / + C mm6 rshift + C mm7 lshift + C + C 10 code bytes/limb + C + C The two chunks differ in whether mm1 or mm2 hold the carry. + C The computed jump puts the initial carry in both mm1 and mm2. 
+ +L(entry): +deflit(CHUNK_COUNT, 4) +forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 + 8)) + + movq disp0(%edx), %mm0 + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + por %mm2, %mm0 + movq %mm0, disp0(%edi) + + + movq disp1(%edx), %mm0 + psrlq %mm6, %mm1 + + movq %mm0, %mm2 + psllq %mm7, %mm0 + + por %mm1, %mm0 + movq %mm0, disp1(%edi) +') + + addl $UNROLL_BYTES, %edx + addl $UNROLL_BYTES, %edi + decl %ebx + + jns L(top) + + +deflit(`disp0', ifelse(UNROLL_BYTES,256,-128)) +deflit(`disp1', eval(disp0-0 + 8)) + + testb $1, %al C flags kept for the jz below, nothing in between changes them + psrlq %mm6, %mm2 C wanted rshifted in all cases below + movl SAVE_ESI, %esi + + movd %mm5, %eax C return value + + movl SAVE_EBX, %ebx + jz L(end_even) + + + C Size odd, destination was aligned. + C + C source + C edx + C +-------+---------------+-- + C | | mm2 | + C +-------+---------------+-- + C + C dest edi + C +-------+---------------+---------------+-- + C | | | written | + C +-------+---------------+---------------+-- + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C Size odd, destination was unaligned. + C + C source + C edx + C +-------+---------------+-- + C | | mm2 | + C +-------+---------------+-- + C + C dest edi + C +---------------+---------------+-- + C | | written | + C +---------------+---------------+-- + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C In both cases there's one extra limb of src to fetch and combine + C with mm2 to make a qword to store, and in the aligned case there's + C a further extra limb of dst to be formed. + + + movd disp0(%edx), %mm0 + movq %mm0, %mm1 + + psllq %mm7, %mm0 + testb $32, %cl C bit 5 of ecx is set only in the aligned case (assuming shift is 1 to 31) + + por %mm2, %mm0 + psrlq %mm6, %mm1 + + movq %mm0, disp0(%edi) + jz L(finish_odd_unaligned) + + movd %mm1, disp1(%edi) +L(finish_odd_unaligned): + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + + +L(end_even): + + C Size even, destination was aligned. 
+ C + C source + C +---------------+-- + C | mm2 | + C +---------------+-- + C + C dest edi + C +---------------+---------------+-- + C | | mm2 | + C +---------------+---------------+-- + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C Size even, destination was unaligned. + C + C source + C +---------------+-- + C | mm2 | + C +---------------+-- + C + C dest edi + C +-------+---------------+-- + C | | mm2 | + C +-------+---------------+-- + C + C mm6 = shift+32 + C mm7 = 64-(shift+32) + + + C The movd for the unaligned case is the same data as the movq for + C the aligned case, it's just a choice between whether one or two + C limbs should be written. + + + testb $32, %cl C bit 5 of ecx is set only in the aligned case (assuming shift is 1 to 31) + movd %mm2, disp0(%edi) + + jz L(end_even_unaligned) + + movq %mm2, disp0(%edi) +L(end_even_unaligned): + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/k7/mul_1.asm b/ghc/rts/gmp/mpn/x86/k7/mul_1.asm new file mode 100644 index 0000000..07f7085 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k7/mul_1.asm @@ -0,0 +1,265 @@ +dnl AMD K7 mpn_mul_1 -- mpn by limb multiply. +dnl +dnl K7: 3.4 cycles/limb (at 16 limbs/loop). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. 
If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl K7: UNROLL_COUNT cycles/limb +dnl 8 3.9 +dnl 16 3.4 +dnl 32 3.4 +dnl 64 3.35 +dnl Maximum possible with the current code is 64. + +deflit(UNROLL_COUNT, 16) + + +C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier); +C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier, mp_limb_t carry); +C +C Multiply src,size by mult and store the result in dst,size. +C Return the carry limb from the top of the result. +C +C mpn_mul_1c() accepts an initial carry for the calculation, it's added into +C the low limb of the destination. +C +C Variations on the unrolled loop have been tried, with the current +C registers or with the counter on the stack to free up ecx. The current +C code is the fastest found. +C +C An interesting effect is that removing the stores "movl %ebx, disp0(%edi)" +C from the unrolled loop actually slows it down to 5.0 cycles/limb. Code +C with this change can be tested on sizes of the form UNROLL_COUNT*n+1 +C without having to change the computed jump. There's obviously something +C fishy going on, perhaps with what execution units the mul needs. + +defframe(PARAM_CARRY, 20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBP, -4) +defframe(SAVE_EDI, -8) +defframe(SAVE_ESI, -12) +defframe(SAVE_EBX, -16) +deflit(STACK_SPACE, 16) + +dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1. 
+ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 7) +',` +deflit(UNROLL_THRESHOLD, 5) +') + + .text + ALIGN(32) +PROLOGUE(mpn_mul_1c) +deflit(`FRAME',0) + movl PARAM_CARRY, %edx + jmp LF(mpn_mul_1,start_nc) C join mpn_mul_1 with the initial carry in edx +EPILOGUE() + + +PROLOGUE(mpn_mul_1) +deflit(`FRAME',0) + xorl %edx, %edx C initial carry +L(start_nc): + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME', STACK_SPACE) + + movl %edi, SAVE_EDI + movl %ebx, SAVE_EBX + movl %edx, %ebx C carry is kept in ebx from here on + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + cmpl $UNROLL_THRESHOLD, %ecx + + movl PARAM_DST, %edi + movl %ebp, SAVE_EBP + jae L(unroll) + + leal (%esi,%ecx,4), %esi + leal (%edi,%ecx,4), %edi + negl %ecx + + movl PARAM_MULTIPLIER, %ebp + +L(simple): + C eax scratch + C ebx carry + C ecx counter (negative) + C edx scratch + C esi src + C edi dst + C ebp multiplier + + movl (%esi,%ecx,4), %eax + + mull %ebp + + addl %ebx, %eax + movl %eax, (%edi,%ecx,4) + movl $0, %ebx C movl rather than xorl so the carry flag survives for the adcl + + adcl %edx, %ebx + incl %ecx + jnz L(simple) + + movl %ebx, %eax + movl SAVE_EBX, %ebx + movl SAVE_ESI, %esi + + movl SAVE_EDI, %edi + movl SAVE_EBP, %ebp + addl $STACK_SPACE, %esp + + ret + + +C ----------------------------------------------------------------------------- +C The mov to load the next source limb is done well ahead of the mul, this +C is necessary for full speed. It leads to one limb handled separately +C after the loop. +C +C When unrolling to 32 or more, an offset of +4 is used on the src pointer, +C to avoid having an 0x80 displacement in the code for the last limb in the +C unrolled loop. This is for a fair comparison between 16 and 32 unrolling. 
+ +ifelse(eval(UNROLL_COUNT >= 32),1,` +deflit(SRC_OFFSET,4) +',` +deflit(SRC_OFFSET,) +') + + C this is offset 0x62, so close enough to aligned +L(unroll): + C eax + C ebx initial carry + C ecx size + C edx + C esi src + C edi dst + C ebp +deflit(`FRAME', STACK_SPACE) + + leal -1(%ecx), %edx C one limb handled at end + leal -2(%ecx), %ecx C and ecx is one less than edx + movl %ebp, SAVE_EBP C repeats the save already done before the jae + + negl %edx + shrl $UNROLL_LOG2, %ecx C unrolled loop counter + movl (%esi), %eax C src low limb + + andl $UNROLL_MASK, %edx + movl PARAM_DST, %edi C edi already holds dst, reloaded here + + movl %edx, %ebp + shll $4, %edx C 16 times, plus the copy in ebp gives 17 bytes per limb + + C 17 code bytes per limb +ifdef(`PIC',` + call L(add_eip_to_edx) +L(here): +',` + leal L(entry) (%edx,%ebp), %edx +') + negl %ebp + + leal ifelse(UNROLL_BYTES,256,128+) SRC_OFFSET(%esi,%ebp,4), %esi + leal ifelse(UNROLL_BYTES,256,128) (%edi,%ebp,4), %edi + movl PARAM_MULTIPLIER, %ebp + + jmp *%edx + + +ifdef(`PIC',` +L(add_eip_to_edx): + C See README.family about old gas bugs + leal (%edx,%ebp), %edx + addl $L(entry)-L(here), %edx + addl (%esp), %edx + ret +') + + +C ---------------------------------------------------------------------------- + ALIGN(32) +L(top): + C eax next src limb + C ebx carry + C ecx counter + C edx scratch + C esi src+4 + C edi dst + C ebp multiplier + C + C 17 code bytes per limb processed + +L(entry): +forloop(i, 0, UNROLL_COUNT-1, ` + deflit(`disp_dst', eval(i*4 ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp_src', eval(disp_dst + 4-(SRC_OFFSET-0))) + + mull %ebp + + addl %eax, %ebx +Zdisp( movl, disp_src,(%esi), %eax) +Zdisp( movl, %ebx, disp_dst,(%edi)) + + movl $0, %ebx + adcl %edx, %ebx +') + + decl %ecx C flags for the jns below, leal does not change them + + leal UNROLL_BYTES(%esi), %esi + leal UNROLL_BYTES(%edi), %edi + jns L(top) + + +deflit(`disp0', ifelse(UNROLL_BYTES,256,-128)) + + mull %ebp C the one limb left over after the loop + + addl %eax, %ebx + movl $0, %eax + movl SAVE_ESI, %esi + + movl %ebx, disp0(%edi) + movl SAVE_EBX, %ebx + movl SAVE_EDI, %edi + + adcl %edx, %eax C return value, carry limb + movl SAVE_EBP, %ebp + addl $STACK_SPACE, %esp + + ret + +EPILOGUE() 
diff --git a/ghc/rts/gmp/mpn/x86/k7/mul_basecase.asm b/ghc/rts/gmp/mpn/x86/k7/mul_basecase.asm new file mode 100644 index 0000000..c4be62e --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k7/mul_basecase.asm @@ -0,0 +1,593 @@ +dnl AMD K7 mpn_mul_basecase -- multiply two mpn numbers. +dnl +dnl K7: approx 4.42 cycles per cross product at around 20x20 limbs (16 +dnl limbs/loop unrolling). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl K7 UNROLL_COUNT cycles/product (at around 20x20) +dnl 8 4.67 +dnl 16 4.59 +dnl 32 4.42 +dnl Maximum possible with the current code is 32. +dnl +dnl At 32 the typical 13-26 limb sizes from the karatsuba code will get +dnl done with a straight run through a block of code, no inner loop. Using +dnl 32 gives 1k of code, but the k7 has a 64k L1 code cache. + +deflit(UNROLL_COUNT, 32) + + +C void mpn_mul_basecase (mp_ptr wp, +C mp_srcptr xp, mp_size_t xsize, +C mp_srcptr yp, mp_size_t ysize); +C +C Calculate xp,xsize multiplied by yp,ysize, storing the result in +C wp,xsize+ysize. 
+C +C This routine is essentially the same as mpn/generic/mul_basecase.c, but +C it's faster because it does most of the mpn_addmul_1() startup +C calculations only once. The saving is 15-25% on typical sizes coming from +C the Karatsuba multiply code. + +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 5) +',` +deflit(UNROLL_THRESHOLD, 5) +') + +defframe(PARAM_YSIZE,20) +defframe(PARAM_YP, 16) +defframe(PARAM_XSIZE,12) +defframe(PARAM_XP, 8) +defframe(PARAM_WP, 4) + + .text + ALIGN(32) +PROLOGUE(mpn_mul_basecase) +deflit(`FRAME',0) + + movl PARAM_XSIZE, %ecx + movl PARAM_YP, %eax + + movl PARAM_XP, %edx + movl (%eax), %eax C yp low limb + + cmpl $2, %ecx + ja L(xsize_more_than_two) + je L(two_by_something) + + + C one limb by one limb + + mull (%edx) + + movl PARAM_WP, %ecx + movl %eax, (%ecx) + movl %edx, 4(%ecx) + ret + + +C ----------------------------------------------------------------------------- +L(two_by_something): +deflit(`FRAME',0) + decl PARAM_YSIZE C flags are tested by the jnz further down + pushl %ebx defframe_pushl(`SAVE_EBX') + movl %eax, %ecx C yp low limb + + movl PARAM_WP, %ebx + pushl %esi defframe_pushl(`SAVE_ESI') + movl %edx, %esi C xp + + movl (%edx), %eax C xp low limb + jnz L(two_by_two) + + + C two limbs by one limb + + mull %ecx + + movl %eax, (%ebx) + movl 4(%esi), %eax + movl %edx, %esi C carry + + mull %ecx + + addl %eax, %esi + + movl %esi, 4(%ebx) + movl SAVE_ESI, %esi + + adcl $0, %edx + + movl %edx, 8(%ebx) + movl SAVE_EBX, %ebx + addl $FRAME, %esp + + ret + + + +C ----------------------------------------------------------------------------- +C Could load yp earlier into another register. 
+ + ALIGN(16) +L(two_by_two): + C eax xp low limb + C ebx wp + C ecx yp low limb + C edx + C esi xp + C edi + C ebp + +dnl FRAME carries on from previous + + mull %ecx C xp[0] * yp[0] + + push %edi defframe_pushl(`SAVE_EDI') + movl %edx, %edi C carry, for wp[1] + + movl %eax, (%ebx) + movl 4(%esi), %eax + + mull %ecx C xp[1] * yp[0] + + addl %eax, %edi + movl PARAM_YP, %ecx + + adcl $0, %edx + movl 4(%ecx), %ecx C yp[1] + movl %edi, 4(%ebx) + + movl 4(%esi), %eax C xp[1] + movl %edx, %edi C carry, for wp[2] + + mull %ecx C xp[1] * yp[1] + + addl %eax, %edi + + adcl $0, %edx + movl (%esi), %eax C xp[0] + + movl %edx, %esi C carry, for wp[3] + + mull %ecx C xp[0] * yp[1] + + addl %eax, 4(%ebx) + adcl %edx, %edi + movl %edi, 8(%ebx) + + adcl $0, %esi + movl SAVE_EDI, %edi + movl %esi, 12(%ebx) + + movl SAVE_ESI, %esi + movl SAVE_EBX, %ebx + addl $FRAME, %esp + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(xsize_more_than_two): + +C The first limb of yp is processed with a simple mpn_mul_1 style loop +C inline. Unrolling this doesn't seem worthwhile since it's only run once +C (whereas the addmul below is run ysize-1 many times). A call to the +C actual mpn_mul_1 will be slowed down by the call and parameter pushing and +C popping, and doesn't seem likely to be worthwhile on the typical 13-26 +C limb operations the Karatsuba code calls here with. 
+ + C eax yp[0] + C ebx + C ecx xsize + C edx xp + C esi + C edi + C ebp + +dnl FRAME doesn't carry on from previous, no pushes yet here +defframe(`SAVE_EBX',-4) +defframe(`SAVE_ESI',-8) +defframe(`SAVE_EDI',-12) +defframe(`SAVE_EBP',-16) +deflit(`FRAME',0) + + subl $16, %esp +deflit(`FRAME',16) + + movl %edi, SAVE_EDI + movl PARAM_WP, %edi + + movl %ebx, SAVE_EBX + movl %ebp, SAVE_EBP + movl %eax, %ebp C multiplier, yp[0] + + movl %esi, SAVE_ESI + xorl %ebx, %ebx C initial carry + leal (%edx,%ecx,4), %esi C xp end + + leal (%edi,%ecx,4), %edi C wp end of mul1 + negl %ecx + + +L(mul1): + C eax scratch + C ebx carry + C ecx counter, negative + C edx scratch + C esi xp end + C edi wp end of mul1 + C ebp multiplier + + movl (%esi,%ecx,4), %eax + + mull %ebp + + addl %ebx, %eax + movl %eax, (%edi,%ecx,4) + movl $0, %ebx + + adcl %edx, %ebx + incl %ecx + jnz L(mul1) + + + movl PARAM_YSIZE, %edx + movl PARAM_XSIZE, %ecx + + movl %ebx, (%edi) C final carry + decl %edx + + jnz L(ysize_more_than_one) + + + movl SAVE_EDI, %edi + movl SAVE_EBX, %ebx + + movl SAVE_EBP, %ebp + movl SAVE_ESI, %esi + addl $FRAME, %esp + + ret + + +L(ysize_more_than_one): + cmpl $UNROLL_THRESHOLD, %ecx + movl PARAM_YP, %eax + + jae L(unroll) + + +C ----------------------------------------------------------------------------- + C simple addmul looping + C + C eax yp + C ebx + C ecx xsize + C edx ysize-1 + C esi xp end + C edi wp end of mul1 + C ebp + + leal 4(%eax,%edx,4), %ebp C yp end + negl %ecx + negl %edx + + movl (%esi,%ecx,4), %eax C xp low limb + movl %edx, PARAM_YSIZE C -(ysize-1) + incl %ecx + + xorl %ebx, %ebx C initial carry + movl %ecx, PARAM_XSIZE C -(xsize-1) + movl %ebp, PARAM_YP + + movl (%ebp,%edx,4), %ebp C yp second lowest limb - multiplier + jmp L(simple_outer_entry) + + + C this is offset 0x121 so close enough to aligned +L(simple_outer_top): + C ebp ysize counter, negative + + movl PARAM_YP, %edx + movl PARAM_XSIZE, %ecx C -(xsize-1) + xorl %ebx, %ebx C carry + + movl %ebp, PARAM_YSIZE + addl $4, %edi C next 
position in wp + + movl (%edx,%ebp,4), %ebp C yp limb - multiplier + movl -4(%esi,%ecx,4), %eax C xp low limb + + +L(simple_outer_entry): + +L(simple_inner): + C eax xp limb + C ebx carry limb + C ecx loop counter (negative) + C edx scratch + C esi xp end + C edi wp end + C ebp multiplier + + mull %ebp + + addl %eax, %ebx + adcl $0, %edx + + addl %ebx, (%edi,%ecx,4) + movl (%esi,%ecx,4), %eax + adcl $0, %edx + + incl %ecx + movl %edx, %ebx + jnz L(simple_inner) + + + mull %ebp + + movl PARAM_YSIZE, %ebp + addl %eax, %ebx + + adcl $0, %edx + addl %ebx, (%edi) + + adcl $0, %edx + incl %ebp C flags are tested by the jnz below + + movl %edx, 4(%edi) + jnz L(simple_outer_top) + + + movl SAVE_EBX, %ebx + movl SAVE_ESI, %esi + + movl SAVE_EDI, %edi + movl SAVE_EBP, %ebp + addl $FRAME, %esp + + ret + + + +C ----------------------------------------------------------------------------- +C +C The unrolled loop is the same as in mpn_addmul_1(), see that code for some +C comments. +C +C VAR_ADJUST is the negative of how many limbs the leals in the inner loop +C increment xp and wp. This is used to adjust back xp and wp, and rshifted +C to give an initial VAR_COUNTER at the top of the outer loop. +C +C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT +C up to -1, inclusive. +C +C VAR_JMP is the computed jump into the unrolled loop. +C +C VAR_XP_LOW is the least significant limb of xp, which is needed at the +C start of the unrolled loop. +C +C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1, +C inclusive. +C +C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be +C added to give the location of the next limb of yp, which is the multiplier +C in the unrolled loop. +C +C The trick with VAR_ADJUST means it's only necessary to do one fetch in the +C outer loop to take care of xp, wp and the inner loop counter. 
+ +defframe(VAR_COUNTER, -20) +defframe(VAR_ADJUST, -24) +defframe(VAR_JMP, -28) +defframe(VAR_XP_LOW, -32) +deflit(VAR_EXTRA_SPACE, 16) + + +L(unroll): + C eax yp + C ebx + C ecx xsize + C edx ysize-1 + C esi xp end + C edi wp end of mul1 + C ebp + + movl PARAM_XP, %esi + movl 4(%eax), %ebp C multiplier (yp second limb) + leal 4(%eax,%edx,4), %eax C yp adjust for ysize indexing + + movl PARAM_WP, %edi + movl %eax, PARAM_YP + negl %edx + + movl %edx, PARAM_YSIZE + leal UNROLL_COUNT-2(%ecx), %ebx C (xsize-1)+UNROLL_COUNT-1 + decl %ecx C xsize-1 + + movl (%esi), %eax C xp low limb + andl $-UNROLL_MASK-1, %ebx + negl %ecx + + subl $VAR_EXTRA_SPACE, %esp +deflit(`FRAME',16+VAR_EXTRA_SPACE) + negl %ebx + andl $UNROLL_MASK, %ecx + + movl %ebx, VAR_ADJUST + movl %ecx, %edx + shll $4, %ecx C 16 times, plus the copy in edx gives 17 bytes per limb + + sarl $UNROLL_LOG2, %ebx + + C 17 code bytes per limb +ifdef(`PIC',` + call L(pic_calc) +L(unroll_here): +',` + leal L(unroll_entry) (%ecx,%edx,1), %ecx +') + negl %edx + + movl %eax, VAR_XP_LOW + movl %ecx, VAR_JMP + leal 4(%edi,%edx,4), %edi C wp and xp, adjust for unrolling, + leal 4(%esi,%edx,4), %esi C and start at second limb + jmp L(unroll_outer_entry) + + +ifdef(`PIC',` +L(pic_calc): + C See README.family about old gas bugs + leal (%ecx,%edx,1), %ecx + addl $L(unroll_entry)-L(unroll_here), %ecx + addl (%esp), %ecx + ret +') + + +C -------------------------------------------------------------------------- + ALIGN(32) +L(unroll_outer_top): + C ebp ysize counter, negative + + movl VAR_ADJUST, %ebx + movl PARAM_YP, %edx + + movl VAR_XP_LOW, %eax + movl %ebp, PARAM_YSIZE C store incremented ysize counter + + leal 4(%edi,%ebx,4), %edi + leal (%esi,%ebx,4), %esi + sarl $UNROLL_LOG2, %ebx + + movl (%edx,%ebp,4), %ebp C yp next multiplier + movl VAR_JMP, %ecx + +L(unroll_outer_entry): + mull %ebp + + testb $1, %cl C and clear carry bit + movl %ebx, VAR_COUNTER + movl $0, %ebx + + movl $0, %ecx + cmovz( %eax, %ecx) C eax into low carry, zero into high carry limb + cmovnz( %eax, %ebx) + 
+ C Extra fetch of VAR_JMP is bad, but registers are tight + jmp *VAR_JMP + + +C ----------------------------------------------------------------------------- + ALIGN(32) +L(unroll_top): + C eax xp limb + C ebx carry high + C ecx carry low + C edx scratch + C esi xp+8 + C edi wp + C ebp yp multiplier limb + C + C VAR_COUNTER loop counter, negative + C + C 17 bytes each limb + +L(unroll_entry): + +deflit(CHUNK_COUNT,2) +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 + 4)) + +Zdisp( movl, disp0,(%esi), %eax) + adcl %edx, %ebx + + mull %ebp + +Zdisp( addl, %ecx, disp0,(%edi)) + movl $0, %ecx + + adcl %eax, %ebx + + + movl disp1(%esi), %eax + adcl %edx, %ecx + + mull %ebp + + addl %ebx, disp1(%edi) + movl $0, %ebx + + adcl %eax, %ecx +') + + + incl VAR_COUNTER + leal UNROLL_BYTES(%esi), %esi + leal UNROLL_BYTES(%edi), %edi + + jnz L(unroll_top) + + + C eax + C ebx zero + C ecx low + C edx high + C esi + C edi wp, pointing at second last limb + C ebp + C + C carry flag to be added to high + +deflit(`disp0', ifelse(UNROLL_BYTES,256,-128)) +deflit(`disp1', eval(disp0-0 + 4)) + + movl PARAM_YSIZE, %ebp + adcl $0, %edx + addl %ecx, disp0(%edi) + + adcl $0, %edx + incl %ebp C flags are tested by the jnz below + + movl %edx, disp1(%edi) + jnz L(unroll_outer_top) + + + movl SAVE_ESI, %esi + movl SAVE_EBP, %ebp + + movl SAVE_EDI, %edi + movl SAVE_EBX, %ebx + addl $FRAME, %esp + + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/k7/sqr_basecase.asm b/ghc/rts/gmp/mpn/x86/k7/sqr_basecase.asm new file mode 100644 index 0000000..84861ea --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/k7/sqr_basecase.asm @@ -0,0 +1,627 @@ +dnl AMD K7 mpn_sqr_basecase -- square an mpn number. +dnl +dnl K7: approx 2.3 cycles/crossproduct, or 4.55 cycles/triangular product +dnl (measured on the speed difference between 25 and 50 limbs, which is +dnl roughly the Karatsuba recursing range). 
+ + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl These are the same as mpn/x86/k6/sqr_basecase.asm, see that code for +dnl some comments. + +deflit(KARATSUBA_SQR_THRESHOLD_MAX, 66) + +ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE', +`define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)') + +m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD') +deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3)) + + +C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C With a KARATSUBA_SQR_THRESHOLD around 50 this code is about 1500 bytes, +C which is quite a bit, but is considered good value since squares big +C enough to use most of the code will be spending quite a few cycles in it. 
+ + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(32) +PROLOGUE(mpn_sqr_basecase) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl PARAM_SRC, %eax + cmpl $2, %ecx + + movl PARAM_DST, %edx + je L(two_limbs) + ja L(three_or_more) + + +C------------------------------------------------------------------------------ +C one limb only + C eax src + C ecx size + C edx dst + + movl (%eax), %eax + movl %edx, %ecx + + mull %eax + + movl %edx, 4(%ecx) + movl %eax, (%ecx) + ret + + +C------------------------------------------------------------------------------ +C +C Using the read/modify/write "add"s seems to be faster than saving and +C restoring registers. Perhaps the loads for the first set hide under the +C mul latency and the second gets store to load forwarding. + + ALIGN(16) +L(two_limbs): + C eax src + C ebx + C ecx size + C edx dst +deflit(`FRAME',0) + + pushl %ebx FRAME_pushl() + movl %eax, %ebx C src + movl (%eax), %eax + + movl %edx, %ecx C dst + + mull %eax C src[0]^2 + + movl %eax, (%ecx) C dst[0] + movl 4(%ebx), %eax + + movl %edx, 4(%ecx) C dst[1] + + mull %eax C src[1]^2 + + movl %eax, 8(%ecx) C dst[2] + movl (%ebx), %eax + + movl %edx, 12(%ecx) C dst[3] + + mull 4(%ebx) C src[0]*src[1] + + popl %ebx + + addl %eax, 4(%ecx) + adcl %edx, 8(%ecx) + adcl $0, 12(%ecx) + ASSERT(nc) + + addl %eax, 4(%ecx) + adcl %edx, 8(%ecx) + adcl $0, 12(%ecx) + ASSERT(nc) + + ret + + +C------------------------------------------------------------------------------ +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) +deflit(STACK_SPACE, 16) + +L(three_or_more): + subl $STACK_SPACE, %esp + cmpl $4, %ecx + jae L(four_or_more) +deflit(`FRAME',STACK_SPACE) + + +C------------------------------------------------------------------------------ +C Three limbs +C +C Writing out the loads and stores separately at the end of this code comes +C out about 10 cycles faster than using adcls to memory. 
+ + C eax src + C ecx size + C edx dst + + movl %ebx, SAVE_EBX + movl %eax, %ebx C src + movl (%eax), %eax + + movl %edx, %ecx C dst + movl %esi, SAVE_ESI + movl %edi, SAVE_EDI + + mull %eax C src[0] ^ 2 + + movl %eax, (%ecx) + movl 4(%ebx), %eax + movl %edx, 4(%ecx) + + mull %eax C src[1] ^ 2 + + movl %eax, 8(%ecx) + movl 8(%ebx), %eax + movl %edx, 12(%ecx) + + mull %eax C src[2] ^ 2 + + movl %eax, 16(%ecx) + movl (%ebx), %eax + movl %edx, 20(%ecx) + + mull 4(%ebx) C src[0] * src[1] + + movl %eax, %esi + movl (%ebx), %eax + movl %edx, %edi + + mull 8(%ebx) C src[0] * src[2] + + addl %eax, %edi + movl %ebp, SAVE_EBP + movl $0, %ebp + + movl 4(%ebx), %eax + adcl %edx, %ebp + + mull 8(%ebx) C src[1] * src[2] + + xorl %ebx, %ebx + addl %eax, %ebp + + adcl $0, %edx + + C eax + C ebx zero, will be dst[5] + C ecx dst + C edx dst[4] + C esi dst[1] + C edi dst[2] + C ebp dst[3] + + adcl $0, %edx + addl %esi, %esi + + adcl %edi, %edi + movl 4(%ecx), %eax + + adcl %ebp, %ebp + + adcl %edx, %edx + + adcl $0, %ebx + addl %eax, %esi + movl 8(%ecx), %eax + + adcl %eax, %edi + movl 12(%ecx), %eax + movl %esi, 4(%ecx) + + adcl %eax, %ebp + movl 16(%ecx), %eax + movl %edi, 8(%ecx) + + movl SAVE_ESI, %esi + movl SAVE_EDI, %edi + + adcl %eax, %edx + movl 20(%ecx), %eax + movl %ebp, 12(%ecx) + + adcl %ebx, %eax + ASSERT(nc) + movl SAVE_EBX, %ebx + movl SAVE_EBP, %ebp + + movl %edx, 16(%ecx) + movl %eax, 20(%ecx) + addl $FRAME, %esp + + ret + + +C------------------------------------------------------------------------------ +L(four_or_more): + +C First multiply src[0]*src[1..size-1] and store at dst[1..size]. +C Further products are added in rather than stored. 
+ + C eax src + C ebx + C ecx size + C edx dst + C esi + C edi + C ebp + +defframe(`VAR_COUNTER',-20) +defframe(`VAR_JMP', -24) +deflit(EXTRA_STACK_SPACE, 8) + + movl %ebx, SAVE_EBX + movl %edi, SAVE_EDI + leal (%edx,%ecx,4), %edi C &dst[size] + + movl %esi, SAVE_ESI + movl %ebp, SAVE_EBP + leal (%eax,%ecx,4), %esi C &src[size] + + movl (%eax), %ebp C multiplier + movl $0, %ebx + decl %ecx + + negl %ecx + subl $EXTRA_STACK_SPACE, %esp +FRAME_subl_esp(EXTRA_STACK_SPACE) + +L(mul_1): + C eax scratch + C ebx carry + C ecx counter + C edx scratch + C esi &src[size] + C edi &dst[size] + C ebp multiplier + + movl (%esi,%ecx,4), %eax + + mull %ebp + + addl %ebx, %eax + movl %eax, (%edi,%ecx,4) + movl $0, %ebx + + adcl %edx, %ebx + incl %ecx + jnz L(mul_1) + + +C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2. +C +C The last two products, which are the bottom right corner of the product +C triangle, are left to the end. These are src[size-3]*src[size-2,size-1] +C and src[size-2]*src[size-1]. If size is 4 then it's only these corner +C cases that need to be done. +C +C The unrolled code is the same as in mpn_addmul_1, see that routine for +C some comments. +C +C VAR_COUNTER is the outer loop, running from -size+4 to -1, inclusive. +C +C VAR_JMP is the computed jump into the unrolled code, stepped by one code +C chunk each outer loop. +C +C K7 does branch prediction on indirect jumps, which is bad since it's a +C different target each time. There seems no way to avoid this. + +dnl This value also hard coded in some shifts and adds +deflit(CODE_BYTES_PER_LIMB, 17) + +dnl With the unmodified &src[size] and &dst[size] pointers, the +dnl displacements in the unrolled code fit in a byte for UNROLL_COUNT +dnl values up to 31, but above that an offset must be added to them. 
+ +deflit(OFFSET, +ifelse(eval(UNROLL_COUNT>31),1, +eval((UNROLL_COUNT-31)*4), +0)) + +dnl Because the last chunk of code is generated differently, a label placed +dnl at the end doesn't work. Instead calculate the implied end using the +dnl start and how many chunks of code there are. + +deflit(UNROLL_INNER_END, +`L(unroll_inner_start)+eval(UNROLL_COUNT*CODE_BYTES_PER_LIMB)') + + C eax + C ebx carry + C ecx + C edx + C esi &src[size] + C edi &dst[size] + C ebp + + movl PARAM_SIZE, %ecx + movl %ebx, (%edi) + + subl $4, %ecx + jz L(corner) + + negl %ecx +ifelse(OFFSET,0,,`subl $OFFSET, %edi') +ifelse(OFFSET,0,,`subl $OFFSET, %esi') + + movl %ecx, %edx + shll $4, %ecx + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx +') + + + C The calculated jump mustn't come out to before the start of the + C code available. This is the limit UNROLL_COUNT puts on the src + C operand size, but checked here directly using the jump address. 
+ ASSERT(ae, + `movl_text_address(L(unroll_inner_start), %eax) + cmpl %eax, %ecx') + + +C------------------------------------------------------------------------------ + ALIGN(16) +L(unroll_outer_top): + C eax + C ebx high limb to store + C ecx VAR_JMP + C edx VAR_COUNTER, limbs, negative + C esi &src[size], constant + C edi dst ptr, high of last addmul + C ebp + + movl -12+OFFSET(%esi,%edx,4), %ebp C next multiplier + movl -8+OFFSET(%esi,%edx,4), %eax C first of multiplicand + + movl %edx, VAR_COUNTER + + mull %ebp + +define(cmovX,`ifelse(eval(UNROLL_COUNT%2),0,`cmovz($@)',`cmovnz($@)')') + + testb $1, %cl + movl %edx, %ebx C high carry + movl %ecx, %edx C jump + + movl %eax, %ecx C low carry + cmovX( %ebx, %ecx) C high carry reverse + cmovX( %eax, %ebx) C low carry reverse + + leal CODE_BYTES_PER_LIMB(%edx), %eax + xorl %edx, %edx + leal 4(%edi), %edi + + movl %eax, VAR_JMP + + jmp *%eax + + +ifdef(`PIC',` +L(pic_calc): + addl (%esp), %ecx + addl $UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)-L(here), %ecx + addl %edx, %ecx + ret +') + + + C Must be an even address to preserve the significance of the low + C bit of the jump address indicating which way around ecx/ebx should + C start. 
+ ALIGN(2) + +L(unroll_inner_start): + C eax next limb + C ebx carry high + C ecx carry low + C edx scratch + C esi src + C edi dst + C ebp multiplier + +forloop(`i', UNROLL_COUNT, 1, ` + deflit(`disp_src', eval(-i*4 + OFFSET)) + deflit(`disp_dst', eval(disp_src - 4)) + + m4_assert(`disp_src>=-128 && disp_src<128') + m4_assert(`disp_dst>=-128 && disp_dst<128') + +ifelse(eval(i%2),0,` +Zdisp( movl, disp_src,(%esi), %eax) + adcl %edx, %ebx + + mull %ebp + +Zdisp( addl, %ecx, disp_dst,(%edi)) + movl $0, %ecx + + adcl %eax, %ebx + +',` + dnl this bit comes out last +Zdisp( movl, disp_src,(%esi), %eax) + adcl %edx, %ecx + + mull %ebp + +dnl Zdisp( addl %ebx, disp_src,(%edi)) + addl %ebx, disp_dst(%edi) +ifelse(forloop_last,0, +` movl $0, %ebx') + + adcl %eax, %ecx +') +') + + C eax next limb + C ebx carry high + C ecx carry low + C edx scratch + C esi src + C edi dst + C ebp multiplier + + adcl $0, %edx + addl %ecx, -4+OFFSET(%edi) + movl VAR_JMP, %ecx + + adcl $0, %edx + + movl %edx, m4_empty_if_zero(OFFSET) (%edi) + movl VAR_COUNTER, %edx + + incl %edx + jnz L(unroll_outer_top) + + +ifelse(OFFSET,0,,` + addl $OFFSET, %esi + addl $OFFSET, %edi +') + + +C------------------------------------------------------------------------------ +L(corner): + C esi &src[size] + C edi &dst[2*size-5] + + movl -12(%esi), %ebp + movl -8(%esi), %eax + movl %eax, %ecx + + mull %ebp + + addl %eax, -4(%edi) + movl -4(%esi), %eax + + adcl $0, %edx + movl %edx, %ebx + movl %eax, %esi + + mull %ebp + + addl %ebx, %eax + + adcl $0, %edx + addl %eax, (%edi) + movl %esi, %eax + + adcl $0, %edx + movl %edx, %ebx + + mull %ecx + + addl %ebx, %eax + movl %eax, 4(%edi) + + adcl $0, %edx + movl %edx, 8(%edi) + + + +C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1]. 
+ +L(lshift_start): + movl PARAM_SIZE, %eax + movl PARAM_DST, %edi + xorl %ecx, %ecx C clear carry + + leal (%edi,%eax,8), %edi + notl %eax C -size-1, preserve carry + + leal 2(%eax), %eax C -(size-1) + +L(lshift): + C eax counter, negative + C ebx + C ecx + C edx + C esi + C edi dst, pointing just after last limb + C ebp + + rcll -4(%edi,%eax,8) + rcll (%edi,%eax,8) + incl %eax + jnz L(lshift) + + setc %al + + movl PARAM_SRC, %esi + movl %eax, -4(%edi) C dst most significant limb + + movl PARAM_SIZE, %ecx + + +C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ..., +C src[size-1]^2. dst[0] hasn't been set at all yet, and just gets the +C low limb of src[0]^2. + + movl (%esi), %eax C src[0] + + mull %eax + + leal (%esi,%ecx,4), %esi C src point just after last limb + negl %ecx + + movl %eax, (%edi,%ecx,8) C dst[0] + incl %ecx + +L(diag): + C eax scratch + C ebx scratch + C ecx counter, negative + C edx carry + C esi src just after last limb + C edi dst just after last limb + C ebp + + movl (%esi,%ecx,4), %eax + movl %edx, %ebx + + mull %eax + + addl %ebx, -4(%edi,%ecx,8) + adcl %eax, (%edi,%ecx,8) + adcl $0, %edx + + incl %ecx + jnz L(diag) + + + movl SAVE_ESI, %esi + movl SAVE_EBX, %ebx + + addl %edx, -4(%edi) C dst most significant limb + movl SAVE_EDI, %edi + + movl SAVE_EBP, %ebp + addl $FRAME, %esp + + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/lshift.asm b/ghc/rts/gmp/mpn/x86/lshift.asm new file mode 100644 index 0000000..4735335 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/lshift.asm @@ -0,0 +1,90 @@ +dnl x86 mpn_lshift -- mpn left shift. + +dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, +dnl Inc. +dnl +dnl This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) +PROLOGUE(mpn_lshift) + + pushl %edi + pushl %esi + pushl %ebx +deflit(`FRAME',12) + + movl PARAM_DST,%edi + movl PARAM_SRC,%esi + movl PARAM_SIZE,%edx + movl PARAM_SHIFT,%ecx + + subl $4,%esi C adjust src + + movl (%esi,%edx,4),%ebx C read most significant limb + xorl %eax,%eax + shldl( %cl, %ebx, %eax) C compute carry limb + decl %edx + jz L(end) + pushl %eax C push carry limb onto stack + testb $1,%dl + jnz L(1) C enter loop in the middle + movl %ebx,%eax + + ALIGN(8) +L(oop): movl (%esi,%edx,4),%ebx C load next lower limb + shldl( %cl, %ebx, %eax) C compute result limb + movl %eax,(%edi,%edx,4) C store it + decl %edx +L(1): movl (%esi,%edx,4),%eax + shldl( %cl, %eax, %ebx) + movl %ebx,(%edi,%edx,4) + decl %edx + jnz L(oop) + + shll %cl,%eax C compute least significant limb + movl %eax,(%edi) C store it + + popl %eax C pop carry limb + + popl %ebx + popl %esi + popl %edi + ret + +L(end): shll %cl,%ebx C compute least 
significant limb + movl %ebx,(%edi) C store it + + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/mod_1.asm b/ghc/rts/gmp/mpn/x86/mod_1.asm new file mode 100644 index 0000000..3908161 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/mod_1.asm @@ -0,0 +1,141 @@ +dnl x86 mpn_mod_1 -- mpn by limb remainder. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +dnl cycles/limb +dnl K6 20 +dnl P5 44 +dnl P6 39 +dnl 486 approx 42 maybe +dnl +dnl The following have their own optimized mod_1 implementations, but for +dnl reference the code here runs as follows. +dnl +dnl P6MMX 39 +dnl K7 41 + + +include(`../config.m4') + + +C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor); +C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C mp_limb_t carry); +C +C Divide src,size by divisor and return the remainder. The quotient is +C discarded. +C +C See mpn/x86/divrem_1.asm for some comments. 
+ +defframe(PARAM_CARRY, 16) +defframe(PARAM_DIVISOR,12) +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + + .text + ALIGN(16) + +PROLOGUE(mpn_mod_1c) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + pushl %esi FRAME_pushl() + + movl PARAM_DIVISOR, %esi + orl %ecx, %ecx + + movl PARAM_CARRY, %edx + jnz LF(mpn_mod_1,top) + + popl %esi + movl %edx, %eax + + popl %ebx + + ret + +EPILOGUE() + + +PROLOGUE(mpn_mod_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + pushl %esi FRAME_pushl() + + orl %ecx, %ecx + jz L(done_zero) + + movl PARAM_DIVISOR, %esi + movl -4(%ebx,%ecx,4), %eax C src high limb + + cmpl %esi, %eax + + sbbl %edx, %edx C -1 if high 4-fold for that +C part of the function, but since it is not very large, that would be +C acceptable. +C +C The mul loop (at L(oopM)) might need some tweaking. Its current speed is +C unknown. + +defframe(PARAM_YSIZE,20) +defframe(PARAM_YP, 16) +defframe(PARAM_XSIZE,12) +defframe(PARAM_XP, 8) +defframe(PARAM_WP, 4) + +defframe(VAR_MULTIPLIER, -4) +defframe(VAR_COUNTER, -8) +deflit(VAR_STACK_SPACE, 8) + + .text + ALIGN(8) + +PROLOGUE(mpn_mul_basecase) +deflit(`FRAME',0) + + subl $VAR_STACK_SPACE,%esp + pushl %esi + pushl %ebp + pushl %edi +deflit(`FRAME',eval(VAR_STACK_SPACE+12)) + + movl PARAM_XP,%esi + movl PARAM_WP,%edi + movl PARAM_YP,%ebp + + movl (%esi),%eax C load xp[0] + mull (%ebp) C multiply by yp[0] + movl %eax,(%edi) C store to wp[0] + movl PARAM_XSIZE,%ecx C xsize + decl %ecx C If xsize = 1, ysize = 1 too + jz L(done) + + pushl %ebx +FRAME_pushl() + movl %edx,%ebx + + leal 4(%esi),%esi + leal 4(%edi),%edi + +L(oopM): + movl (%esi),%eax C load next limb at xp[j] + leal 4(%esi),%esi + mull (%ebp) + addl %ebx,%eax + movl %edx,%ebx + adcl $0,%ebx + movl %eax,(%edi) + leal 4(%edi),%edi + decl %ecx + jnz L(oopM) + + movl %ebx,(%edi) C most significant limb of product + addl $4,%edi C increment wp + movl
PARAM_XSIZE,%eax + shll $2,%eax + subl %eax,%edi + subl %eax,%esi + + movl PARAM_YSIZE,%eax C ysize + decl %eax + jz L(skip) + movl %eax,VAR_COUNTER C set index i to ysize + +L(outer): + movl PARAM_YP,%ebp C yp + addl $4,%ebp C make ebp point to next v limb + movl %ebp,PARAM_YP + movl (%ebp),%eax C copy y limb ... + movl %eax,VAR_MULTIPLIER C ... to stack slot + movl PARAM_XSIZE,%ecx + + xorl %ebx,%ebx + andl $3,%ecx + jz L(end0) + +L(oop0): + movl (%esi),%eax + mull VAR_MULTIPLIER + leal 4(%esi),%esi + addl %ebx,%eax + movl $0,%ebx + adcl %ebx,%edx + addl %eax,(%edi) + adcl %edx,%ebx C propagate carry into cylimb + + leal 4(%edi),%edi + decl %ecx + jnz L(oop0) + +L(end0): + movl PARAM_XSIZE,%ecx + shrl $2,%ecx + jz L(endX) + + ALIGN(8) +L(oopX): + movl (%esi),%eax + mull VAR_MULTIPLIER + addl %eax,%ebx + movl $0,%ebp + adcl %edx,%ebp + + movl 4(%esi),%eax + mull VAR_MULTIPLIER + addl %ebx,(%edi) + adcl %eax,%ebp C new lo + cylimb + movl $0,%ebx + adcl %edx,%ebx + + movl 8(%esi),%eax + mull VAR_MULTIPLIER + addl %ebp,4(%edi) + adcl %eax,%ebx C new lo + cylimb + movl $0,%ebp + adcl %edx,%ebp + + movl 12(%esi),%eax + mull VAR_MULTIPLIER + addl %ebx,8(%edi) + adcl %eax,%ebp C new lo + cylimb + movl $0,%ebx + adcl %edx,%ebx + + addl %ebp,12(%edi) + adcl $0,%ebx C propagate carry into cylimb + + leal 16(%esi),%esi + leal 16(%edi),%edi + decl %ecx + jnz L(oopX) + +L(endX): + movl %ebx,(%edi) + addl $4,%edi + + C we incremented wp and xp in the loop above; compensate + movl PARAM_XSIZE,%eax + shll $2,%eax + subl %eax,%edi + subl %eax,%esi + + movl VAR_COUNTER,%eax + decl %eax + movl %eax,VAR_COUNTER + jnz L(outer) + +L(skip): + popl %ebx + popl %edi + popl %ebp + popl %esi + addl $8,%esp + ret + +L(done): + movl %edx,4(%edi) C store to wp[1] + popl %edi + popl %ebp + popl %esi + addl $8,%esp + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/p6/README b/ghc/rts/gmp/mpn/x86/p6/README new file mode 100644 index 0000000..7dbc905 --- /dev/null +++ 
b/ghc/rts/gmp/mpn/x86/p6/README @@ -0,0 +1,95 @@ + + INTEL P6 MPN SUBROUTINES + + + +This directory contains code optimized for Intel P6 class CPUs, meaning +PentiumPro, Pentium II and Pentium III. The mmx and p3mmx subdirectories +have routines using MMX instructions. + + + +STATUS + +Times for the loops, with all code and data in L1 cache, are as follows. +Some of these might be able to be improved. + + cycles/limb + + mpn_add_n/sub_n 3.7 + + mpn_copyi 0.75 + mpn_copyd 2.4 + + mpn_divrem_1 39.0 + mpn_mod_1 39.0 + mpn_divexact_by3 8.5 + + mpn_mul_1 5.5 + mpn_addmul/submul_1 6.35 + + mpn_l/rshift 2.5 + + mpn_mul_basecase 8.2 cycles/crossproduct (approx) + mpn_sqr_basecase 4.0 cycles/crossproduct (approx) + or 7.75 cycles/triangleproduct (approx) + +Pentium II and III have MMX and get the following improvements. + + mpn_divrem_1 25.0 integer part, 17.5 fractional part + mpn_mod_1 24.0 + + mpn_l/rshift 1.75 + + + + +NOTES + +Write-allocate L1 data cache means prefetching of destinations is unnecessary. + +Mispredicted branches have a penalty of between 9 and 15 cycles, and even up +to 26 cycles depending on how far speculative execution has gone. The 9 cycle +minimum penalty comes from the issue pipeline being 9 stages. + +A copy with rep movs seems to copy 16 bytes at a time, since speeds for 4, +5, 6 or 7 limb operations are all the same. The 0.75 cycles/limb would be 3 +cycles per 16 byte block. + + + + +CODING + +Instructions in general code have been shown grouped if they can execute +together, which means up to three instructions with no successive +dependencies, and with only the first being a multiple micro-op. + +P6 has out-of-order execution, so the groupings are really only showing +dependent paths where some shuffling might allow some latencies to be +hidden. + + + + +REFERENCES + +"Intel Architecture Optimization Reference Manual", 1999, revision 001 dated +02/99, order number 245127 (order number 730795-001 is in the document too).
+Available on-line: + + http://download.intel.com/design/PentiumII/manuals/245127.htm + +"Intel Architecture Optimization Manual", 1997, order number 242816. This +is an older document mostly about P5 and not as good as the above. +Available on-line: + + http://download.intel.com/design/PentiumII/manuals/242816.htm + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/ghc/rts/gmp/mpn/x86/p6/aorsmul_1.asm b/ghc/rts/gmp/mpn/x86/p6/aorsmul_1.asm new file mode 100644 index 0000000..feb364e --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/p6/aorsmul_1.asm @@ -0,0 +1,300 @@ +dnl Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple. +dnl +dnl P6: 6.35 cycles/limb (at 16 limbs/loop). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl P6 UNROLL_COUNT cycles/limb +dnl 8 6.7 +dnl 16 6.35 +dnl 32 6.3 +dnl 64 6.3 +dnl Maximum possible with the current code is 64. 
+ +deflit(UNROLL_COUNT, 16) + + +ifdef(`OPERATION_addmul_1', ` + define(M4_inst, addl) + define(M4_function_1, mpn_addmul_1) + define(M4_function_1c, mpn_addmul_1c) + define(M4_description, add it to) + define(M4_desc_retval, carry) +',`ifdef(`OPERATION_submul_1', ` + define(M4_inst, subl) + define(M4_function_1, mpn_submul_1) + define(M4_function_1c, mpn_submul_1c) + define(M4_description, subtract it from) + define(M4_desc_retval, borrow) +',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 +')')') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) + + +C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); +C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult, mp_limb_t carry); +C +C Calculate src,size multiplied by mult and M4_description dst,size. +C Return the M4_desc_retval limb from the top of the result. +C +C This code is pretty much the same as the K6 code. The unrolled loop is +C the same, but there's just a few scheduling tweaks in the setups and the +C simple loop. +C +C A number of variations have been tried for the unrolled loop, with one or +C two carries, and with loads scheduled earlier, but nothing faster than 6 +C cycles/limb has been found. 
+ +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 5) +',` +deflit(UNROLL_THRESHOLD, 5) +') + +defframe(PARAM_CARRY, 20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(32) + +PROLOGUE(M4_function_1c) + pushl %ebx +deflit(`FRAME',4) + movl PARAM_CARRY, %ebx + jmp LF(M4_function_1,start_nc) +EPILOGUE() + +PROLOGUE(M4_function_1) + push %ebx +deflit(`FRAME',4) + xorl %ebx, %ebx C initial carry + +L(start_nc): + movl PARAM_SIZE, %ecx + pushl %esi +deflit(`FRAME',8) + + movl PARAM_SRC, %esi + pushl %edi +deflit(`FRAME',12) + + movl PARAM_DST, %edi + pushl %ebp +deflit(`FRAME',16) + cmpl $UNROLL_THRESHOLD, %ecx + + movl PARAM_MULTIPLIER, %ebp + jae L(unroll) + + + C simple loop + C this is offset 0x22, so close enough to aligned +L(simple): + C eax scratch + C ebx carry + C ecx counter + C edx scratch + C esi src + C edi dst + C ebp multiplier + + movl (%esi), %eax + addl $4, %edi + + mull %ebp + + addl %ebx, %eax + adcl $0, %edx + + M4_inst %eax, -4(%edi) + movl %edx, %ebx + + adcl $0, %ebx + decl %ecx + + leal 4(%esi), %esi + jnz L(simple) + + + popl %ebp + popl %edi + + popl %esi + movl %ebx, %eax + + popl %ebx + ret + + + +C------------------------------------------------------------------------------ +C VAR_JUMP holds the computed jump temporarily because there are not enough +C registers when doing the mul for the initial two carry limbs. +C +C The add/adc for the initial carry in %ebx is necessary only for the +C mpn_add/submul_1c entry points. Duplicating the startup code to +C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good +C idea.
+ +dnl overlapping with parameters already fetched +define(VAR_COUNTER,`PARAM_SIZE') +define(VAR_JUMP, `PARAM_DST') + + C this is offset 0x43, so close enough to aligned +L(unroll): + C eax + C ebx initial carry + C ecx size + C edx + C esi src + C edi dst + C ebp + + movl %ecx, %edx + decl %ecx + + subl $2, %edx + negl %ecx + + shrl $UNROLL_LOG2, %edx + andl $UNROLL_MASK, %ecx + + movl %edx, VAR_COUNTER + movl %ecx, %edx + + C 15 code bytes per limb +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + shll $4, %edx + negl %ecx + + leal L(entry) (%edx,%ecx,1), %edx +') + movl (%esi), %eax C src low limb + + movl %edx, VAR_JUMP + leal ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi + + mull %ebp + + addl %ebx, %eax C initial carry (from _1c) + adcl $0, %edx + + movl %edx, %ebx C high carry + leal ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi + + movl VAR_JUMP, %edx + testl $1, %ecx + movl %eax, %ecx C low carry + + cmovnz( %ebx, %ecx) C high,low carry other way around + cmovnz( %eax, %ebx) + + jmp *%edx + + +ifdef(`PIC',` +L(pic_calc): + shll $4, %edx + negl %ecx + + C See README.family about old gas bugs + leal (%edx,%ecx,1), %edx + addl $L(entry)-L(here), %edx + + addl (%esp), %edx + + ret +') + + +C ----------------------------------------------------------- + ALIGN(32) +L(top): +deflit(`FRAME',16) + C eax scratch + C ebx carry hi + C ecx carry lo + C edx scratch + C esi src + C edi dst + C ebp multiplier + C + C VAR_COUNTER loop counter + C + C 15 code bytes per limb + + addl $UNROLL_BYTES, %edi + +L(entry): +deflit(CHUNK_COUNT,2) +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 + 4)) + +Zdisp( movl, disp0,(%esi), %eax) + mull %ebp +Zdisp( M4_inst,%ecx, disp0,(%edi)) + adcl %eax, %ebx + movl %edx, %ecx + adcl $0, %ecx + + movl disp1(%esi), %eax + mull %ebp + M4_inst %ebx, disp1(%edi) + adcl %eax, %ecx + movl %edx, %ebx + adcl $0, %ebx +') + + decl VAR_COUNTER + 
leal UNROLL_BYTES(%esi), %esi + + jns L(top) + + +deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128))) + + M4_inst %ecx, disp0(%edi) + movl %ebx, %eax + + popl %ebp + popl %edi + + popl %esi + popl %ebx + adcl $0, %eax + + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/p6/diveby3.asm b/ghc/rts/gmp/mpn/x86/p6/diveby3.asm new file mode 100644 index 0000000..a77703e --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/p6/diveby3.asm @@ -0,0 +1,37 @@ +dnl Intel P6 mpn_divexact_by3 -- mpn division by 3, expecting no remainder. +dnl +dnl P6: 8.5 cycles/limb + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +dnl The P5 code runs well on P6, in fact better than anything else found so +dnl far. An imul is 4 cycles, meaning the two cmp/sbbl pairs on the +dnl dependent path are taking 4.5 cycles. +dnl +dnl The destination cache line prefetching is unnecessary on P6, but +dnl removing it is a 2 cycle slowdown (approx), so it must be inducing +dnl something good in the out of order execution. 
+ +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_divexact_by3c) +include_mpn(`x86/pentium/diveby3.asm') diff --git a/ghc/rts/gmp/mpn/x86/p6/gmp-mparam.h b/ghc/rts/gmp/mpn/x86/p6/gmp-mparam.h new file mode 100644 index 0000000..d7bfb6d --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/p6/gmp-mparam.h @@ -0,0 +1,96 @@ +/* Intel P6 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + + +#ifndef UMUL_TIME +#define UMUL_TIME 5 /* cycles */ +#endif +#ifndef UDIV_TIME +#define UDIV_TIME 39 /* cycles */ +#endif + +#ifndef COUNT_TRAILING_ZEROS_TIME +#define COUNT_TRAILING_ZEROS_TIME 2 /* cycles */ +#endif + + +/* Generated by tuneup.c, 2000-07-06. 
*/ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 23 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 139 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 52 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 166 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 116 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 66 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 20 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 4 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 54 +#endif + +#ifndef FFT_MUL_TABLE +#define FFT_MUL_TABLE { 592, 1440, 2688, 5632, 14336, 40960, 0 } +#endif +#ifndef FFT_MODF_MUL_THRESHOLD +#define FFT_MODF_MUL_THRESHOLD 608 +#endif +#ifndef FFT_MUL_THRESHOLD +#define FFT_MUL_THRESHOLD 5888 +#endif + +#ifndef FFT_SQR_TABLE +#define FFT_SQR_TABLE { 656, 1504, 2944, 6656, 18432, 57344, 0 } +#endif +#ifndef FFT_MODF_SQR_THRESHOLD +#define FFT_MODF_SQR_THRESHOLD 672 +#endif +#ifndef FFT_SQR_THRESHOLD +#define FFT_SQR_THRESHOLD 5888 +#endif diff --git a/ghc/rts/gmp/mpn/x86/p6/mmx/divrem_1.asm b/ghc/rts/gmp/mpn/x86/p6/mmx/divrem_1.asm new file mode 100644 index 0000000..f1b011b --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/p6/mmx/divrem_1.asm @@ -0,0 +1,677 @@ +dnl Intel Pentium-II mpn_divrem_1 -- mpn by limb division. +dnl +dnl P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t carry); +C +C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm, +C see that file for some comments. It's likely what's here can be improved. + + +dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by +dnl inverse method is used, rather than plain "divl"s. Minimum value 1. +dnl +dnl The different speeds of the integer and fraction parts mean that using +dnl xsize+size isn't quite right. The threshold wants to be a bit higher +dnl for the integer part and a bit lower for the fraction part. (Or what's +dnl really wanted is to speed up the integer part!) +dnl +dnl The threshold is set to make the integer part right. At 4 limbs the +dnl div and mul are about the same there, but on the fractional part the +dnl mul is much faster.
+ +deflit(MUL_THRESHOLD, 4) + + +defframe(PARAM_CARRY, 24) +defframe(PARAM_DIVISOR,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC, 12) +defframe(PARAM_XSIZE, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) + +defframe(VAR_NORM, -20) +defframe(VAR_INVERSE, -24) +defframe(VAR_SRC, -28) +defframe(VAR_DST, -32) +defframe(VAR_DST_STOP,-36) + +deflit(STACK_SPACE, 36) + + .text + ALIGN(16) + +PROLOGUE(mpn_divrem_1c) +deflit(`FRAME',0) + movl PARAM_CARRY, %edx + + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %ebx, SAVE_EBX + movl PARAM_XSIZE, %ebx + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + leal -4(%edi,%ebx,4), %edi + jmp LF(mpn_divrem_1,start_1c) + +EPILOGUE() + + + C offset 0x31, close enough to aligned +PROLOGUE(mpn_divrem_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl $0, %edx C initial carry (if can't skip a div) + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %ebx, SAVE_EBX + movl PARAM_XSIZE, %ebx + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + orl %ecx, %ecx + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + leal -4(%edi,%ebx,4), %edi C &dst[xsize-1] + jz L(no_skip_div) + + movl -4(%esi,%ecx,4), %eax C src high limb + cmpl %ebp, %eax C one less div if high=MUL_THRESHOLD, so with size==0 then + C must have xsize!=0 + jmp L(fraction_some) + + + +C ----------------------------------------------------------------------------- +C +C This loop runs at about 25 cycles, which is probably sub-optimal, and +C certainly more than the dependent chain would suggest. A better loop, or +C a better rough analysis of what's possible, would be welcomed. +C +C In the current implementation, the following successively dependent +C micro-ops seem to exist. 
+C +C uops +C n2+n1 1 (addl) +C mul 5 +C q1+1 3 (addl/adcl) +C mul 5 +C sub 3 (subl/sbbl) +C addback 2 (cmov) +C --- +C 19 +C +C Lack of registers hinders explicit scheduling and it might be that the +C normal out of order execution isn't able to hide enough under the mul +C latencies. +C +C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than +C cmov (and takes one uop off the dependent chain). A sarl/andl/addl +C combination was tried for the addback (despite the fact it would lengthen +C the dependent chain) but found to be no faster. + + + ALIGN(16) +L(integer_top): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (src, dst) + C edx scratch + C esi n10 + C edi n2 + C ebp d + C + C mm0 scratch (src qword) + C mm7 rshift for normalization + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + movl VAR_SRC, %ecx + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + movq (%ecx), %mm0 C next src limb and the one below it + + mull VAR_INVERSE C m*(n2+n1) + + subl $4, %ecx + + movl %ecx, VAR_SRC + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + movl %ebp, %eax C d + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + jz L(q1_ff) + + mull %ebx C (q1+1)*d + + movl VAR_DST, %ecx + psrlq %mm7, %mm0 + + C + + C + + C + + subl %eax, %esi + movl VAR_DST_STOP, %eax + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + movd %mm0, %esi + + sbbl $0, %ebx C q + subl $4, %ecx + + movl %ebx, (%ecx) + cmpl %eax, %ecx + + movl %ecx, VAR_DST + jne L(integer_top) + + +L(integer_loop_done): + + +C ----------------------------------------------------------------------------- +C +C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz +C q1_ff special case. 
This makes the code a bit smaller and simpler, and +C costs only 2 cycles (each). + +L(integer_two_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (src, dst) + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 src limb, shifted + C mm7 rshift + + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + movl PARAM_SRC, %ecx + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movd (%ecx), %mm0 C src low limb + + movl VAR_DST_STOP, %ecx + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + + mull %ebx C (q1+1)*d + + psllq $32, %mm0 + + psrlq %mm7, %mm0 + + C + + C + + subl %eax, %esi + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + movd %mm0, %esi + + sbbl $0, %ebx C q + + movl %ebx, -4(%ecx) + + +C ----------------------------------------------------------------------------- +L(integer_one_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (dst) + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 src limb, shifted + C mm7 rshift + + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + movl VAR_DST_STOP, %ecx + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + C + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + C + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx C q1 if q1+1 overflowed + + mull %ebx + + C + + C + + C + + C + + subl %eax, %esi + movl
PARAM_XSIZE, %eax + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + + sbbl $0, %ebx C q + + movl %ebx, -8(%ecx) + subl $8, %ecx + + + + orl %eax, %eax C xsize + jnz L(fraction_some) + + movl %edi, %eax +L(fraction_done): + movl VAR_NORM, %ecx + movl SAVE_EBP, %ebp + + movl SAVE_EDI, %edi + + movl SAVE_ESI, %esi + + movl SAVE_EBX, %ebx + addl $STACK_SPACE, %esp + + shrl %cl, %eax + emms + + ret + + +C ----------------------------------------------------------------------------- +C +C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword +C of q*d is simply -d and the remainder n-q*d = n10+d + +L(q1_ff): + C eax (divisor) + C ebx (q1+1 == 0) + C ecx + C edx + C esi n10 + C edi n2 + C ebp divisor + + movl VAR_DST, %ecx + movl VAR_DST_STOP, %edx + subl $4, %ecx + + movl %ecx, VAR_DST + psrlq %mm7, %mm0 + leal (%ebp,%esi), %edi C n-q*d remainder -> next n2 + + movl $-1, (%ecx) + movd %mm0, %esi C next n10 + + cmpl %ecx, %edx + jne L(integer_top) + + jmp L(integer_loop_done) + + + +C ----------------------------------------------------------------------------- +C +C In the current implementation, the following successively dependent +C micro-ops seem to exist. +C +C uops +C mul 5 +C q1+1 1 (addl) +C mul 5 +C sub 3 (negl/sbbl) +C addback 2 (cmov) +C --- +C 16 +C +C The loop in fact runs at about 17.5 cycles. Using a sarl/andl/addl for +C the addback was found to be a touch slower. 
+ + + ALIGN(16) +L(fraction_some): + C eax + C ebx + C ecx + C edx + C esi + C edi carry + C ebp divisor + + movl PARAM_DST, %esi + movl VAR_DST_STOP, %ecx + movl %edi, %eax + + subl $8, %ecx + + + ALIGN(16) +L(fraction_top): + C eax n2, then scratch + C ebx scratch (nadj, q1) + C ecx dst, decrementing + C edx scratch + C esi dst stop point + C edi n2 + C ebp divisor + + mull VAR_INVERSE C m*n2 + + movl %ebp, %eax C d + subl $4, %ecx C dst + leal 1(%edi), %ebx + + C + + C + + C + + addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1 + + mull %ebx C (q1+1)*d + + C + + C + + C + + C + + negl %eax C low of n - (q1+1)*d + + sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry + leal (%ebp,%eax), %edx + + cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1 + + sbbl $0, %ebx C q + movl %eax, %edi C remainder->n2 + cmpl %esi, %ecx + + movl %ebx, (%ecx) C previous q + jne L(fraction_top) + + + jmp L(fraction_done) + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/p6/mmx/mod_1.asm b/ghc/rts/gmp/mpn/x86/p6/mmx/mod_1.asm new file mode 100644 index 0000000..e7d8d94 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/p6/mmx/mod_1.asm @@ -0,0 +1,444 @@ +dnl Intel Pentium-II mpn_mod_1 -- mpn by limb remainder. +dnl +dnl P6MMX: 24.0 cycles/limb. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. 
+dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor); +C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C mp_limb_t carry); +C +C The code here is very similar to mpn_divrem_1, but with the quotient +C discarded. What's here probably isn't optimal. +C +C See mpn/x86/p6/mmx/divrem_1.asm and mpn/x86/k7/mmx/mod_1.asm for some +C comments. + + +dnl MUL_THRESHOLD is the size at which the multiply by inverse method is +dnl used, rather than plain "divl"s. Minimum value 2. + +deflit(MUL_THRESHOLD, 4) + + +defframe(PARAM_CARRY, 16) +defframe(PARAM_DIVISOR,12) +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) + +defframe(VAR_NORM, -20) +defframe(VAR_INVERSE, -24) +defframe(VAR_SRC_STOP,-28) + +deflit(STACK_SPACE, 28) + + .text + ALIGN(16) + +PROLOGUE(mpn_mod_1c) +deflit(`FRAME',0) + movl PARAM_CARRY, %edx + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + jmp LF(mpn_mod_1,start_1c) + +EPILOGUE() + + + ALIGN(16) +PROLOGUE(mpn_mod_1) +deflit(`FRAME',0) + + movl $0, %edx C initial carry (if can't skip a div) + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + orl %ecx, %ecx + jz L(divide_done) + + movl -4(%esi,%ecx,4), %eax C src high limb + + cmpl %ebp, %eax C carry flag if high n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + movd %mm0, %esi + cmpl
%ebx, %ecx + + jne L(inverse_top) + + +L(inverse_loop_done): + + +C ----------------------------------------------------------------------------- + +L(inverse_two_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx &src[-1] + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 scratch (src dword) + C mm7 rshift + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movd 4(%ecx), %mm0 C src low limb + + C + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + movl %ebp, %eax C d + + mull %ebx C (q1+1)*d + + psllq $32, %mm0 + + psrlq %mm7, %mm0 + + C + + C + + subl %eax, %esi + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + movd %mm0, %esi + + +C One limb left + + C eax scratch + C ebx scratch (nadj, q1) + C ecx + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 src limb, shifted + C mm7 rshift + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movl VAR_NORM, %ecx C for final denorm + + C + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + movl %ebp, %eax C d + + mull %ebx C (q1+1)*d + + movl SAVE_EBX, %ebx + + C + + C + + C + + subl %eax, %esi + + sbbl %edx, %edi C n - (q1+1)*d + leal (%ebp,%esi), %edx + movl SAVE_EBP, %ebp + + movl %esi, %eax C remainder + movl 
SAVE_ESI, %esi + + cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1 + movl SAVE_EDI, %edi + + shrl %cl, %eax C denorm remainder + addl $STACK_SPACE, %esp + emms + + ret + + +C ----------------------------------------------------------------------------- +C +C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword +C of q*d is simply -d and the remainder n-q*d = n10+d + +L(q1_ff): + C eax (divisor) + C ebx (q1+1 == 0) + C ecx src pointer + C edx + C esi n10 + C edi (n2) + C ebp divisor + + leal (%ebp,%esi), %edi C n-q*d remainder -> next n2 + movl VAR_SRC_STOP, %edx + psrlq %mm7, %mm0 + + movd %mm0, %esi C next n10 + cmpl %ecx, %edx + jne L(inverse_top) + + jmp L(inverse_loop_done) + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/p6/mmx/popham.asm b/ghc/rts/gmp/mpn/x86/p6/mmx/popham.asm new file mode 100644 index 0000000..50f9a11 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/p6/mmx/popham.asm @@ -0,0 +1,31 @@ +dnl Intel Pentium-II mpn_popcount, mpn_hamdist -- population count and +dnl hamming distance. +dnl +dnl P6MMX: popcount 11 cycles/limb (approx), hamdist 11.5 cycles/limb +dnl (approx) + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. 
If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) +include_mpn(`x86/k6/mmx/popham.asm') diff --git a/ghc/rts/gmp/mpn/x86/p6/p3mmx/popham.asm b/ghc/rts/gmp/mpn/x86/p6/p3mmx/popham.asm new file mode 100644 index 0000000..e63fbf3 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/p6/p3mmx/popham.asm @@ -0,0 +1,30 @@ +dnl Intel Pentium-III mpn_popcount, mpn_hamdist -- population count and +dnl hamming distance. + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +dnl Haven't actually measured it, but the K7 code with the psadbw should be +dnl good on P-III. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) +include_mpn(`x86/k7/mmx/popham.asm') diff --git a/ghc/rts/gmp/mpn/x86/p6/sqr_basecase.asm b/ghc/rts/gmp/mpn/x86/p6/sqr_basecase.asm new file mode 100644 index 0000000..174c784 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/p6/sqr_basecase.asm @@ -0,0 +1,641 @@ +dnl Intel P6 mpn_sqr_basecase -- square an mpn number. 
+dnl +dnl P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular +dnl product (measured on the speed difference between 20 and 40 limbs, +dnl which is the Karatsuba recursing range). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for +dnl a description. The only difference here is that UNROLL_COUNT can go up +dnl to 64 (not 63) making KARATSUBA_SQR_THRESHOLD_MAX 67. + +deflit(KARATSUBA_SQR_THRESHOLD_MAX, 67) + +ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE', +`define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)') + +m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD') +deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3)) + + +C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a +C lot of function call overheads are avoided, especially when the given size +C is small. 
+C +C The code size might look a bit excessive, but not all of it is executed so +C it won't all get into the code cache. The 1x1, 2x2 and 3x3 special cases +C clearly apply only to those sizes; mid sizes like 10x10 only need part of +C the unrolled addmul; and big sizes like 40x40 that do use the full +C unrolling will at least be making good use of it, because 40x40 will take +C something like 7000 cycles. + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(32) +PROLOGUE(mpn_sqr_basecase) +deflit(`FRAME',0) + + movl PARAM_SIZE, %edx + + movl PARAM_SRC, %eax + + cmpl $2, %edx + movl PARAM_DST, %ecx + je L(two_limbs) + + movl (%eax), %eax + ja L(three_or_more) + + +C ----------------------------------------------------------------------------- +C one limb only + C eax src limb + C ebx + C ecx dst + C edx + + mull %eax + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + + ret + + +C ----------------------------------------------------------------------------- +L(two_limbs): + C eax src + C ebx + C ecx dst + C edx + +defframe(SAVE_ESI, -4) +defframe(SAVE_EBX, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) +deflit(`STACK_SPACE',16) + + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %esi, SAVE_ESI + movl %eax, %esi + movl (%eax), %eax + + mull %eax C src[0]^2 + + movl %eax, (%ecx) C dst[0] + movl 4(%esi), %eax + + movl %ebx, SAVE_EBX + movl %edx, %ebx C dst[1] + + mull %eax C src[1]^2 + + movl %edi, SAVE_EDI + movl %eax, %edi C dst[2] + movl (%esi), %eax + + movl %ebp, SAVE_EBP + movl %edx, %ebp C dst[3] + + mull 4(%esi) C src[0]*src[1] + + addl %eax, %ebx + movl SAVE_ESI, %esi + + adcl %edx, %edi + + adcl $0, %ebp + addl %ebx, %eax + movl SAVE_EBX, %ebx + + adcl %edi, %edx + movl SAVE_EDI, %edi + + adcl $0, %ebp + + movl %eax, 4(%ecx) + + movl %ebp, 12(%ecx) + movl SAVE_EBP, %ebp + + movl %edx, 8(%ecx) + addl $FRAME, %esp + + ret + + +C -----------------------------------------------------------------------------
+L(three_or_more): + C eax src low limb + C ebx + C ecx dst + C edx size +deflit(`FRAME',0) + + pushl %esi defframe_pushl(`SAVE_ESI') + cmpl $4, %edx + + movl PARAM_SRC, %esi + jae L(four_or_more) + + +C ----------------------------------------------------------------------------- +C three limbs + + C eax src low limb + C ebx + C ecx dst + C edx + C esi src + C edi + C ebp + + pushl %ebp defframe_pushl(`SAVE_EBP') + pushl %edi defframe_pushl(`SAVE_EDI') + + mull %eax C src[0] ^ 2 + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + + movl 4(%esi), %eax + xorl %ebp, %ebp + + mull %eax C src[1] ^ 2 + + movl %eax, 8(%ecx) + movl %edx, 12(%ecx) + movl 8(%esi), %eax + + pushl %ebx defframe_pushl(`SAVE_EBX') + + mull %eax C src[2] ^ 2 + + movl %eax, 16(%ecx) + movl %edx, 20(%ecx) + + movl (%esi), %eax + + mull 4(%esi) C src[0] * src[1] + + movl %eax, %ebx + movl %edx, %edi + + movl (%esi), %eax + + mull 8(%esi) C src[0] * src[2] + + addl %eax, %edi + movl %edx, %ebp + + adcl $0, %ebp + movl 4(%esi), %eax + + mull 8(%esi) C src[1] * src[2] + + xorl %esi, %esi + addl %eax, %ebp + + C eax + C ebx dst[1] + C ecx dst + C edx dst[4] + C esi zero, will be dst[5] + C edi dst[2] + C ebp dst[3] + + adcl $0, %edx + addl %ebx, %ebx + + adcl %edi, %edi + + adcl %ebp, %ebp + + adcl %edx, %edx + movl 4(%ecx), %eax + + adcl $0, %esi + addl %ebx, %eax + + movl %eax, 4(%ecx) + movl 8(%ecx), %eax + + adcl %edi, %eax + movl 12(%ecx), %ebx + + adcl %ebp, %ebx + movl 16(%ecx), %edi + + movl %eax, 8(%ecx) + movl SAVE_EBP, %ebp + + movl %ebx, 12(%ecx) + movl SAVE_EBX, %ebx + + adcl %edx, %edi + movl 20(%ecx), %eax + + movl %edi, 16(%ecx) + movl SAVE_EDI, %edi + + adcl %esi, %eax C no carry out of this + movl SAVE_ESI, %esi + + movl %eax, 20(%ecx) + addl $FRAME, %esp + + ret + + + +C ----------------------------------------------------------------------------- +defframe(VAR_COUNTER,-20) +defframe(VAR_JMP, -24) +deflit(`STACK_SPACE',24) + +L(four_or_more): + C eax src low limb + C ebx + C ecx + C edx 
size + C esi src + C edi + C ebp +deflit(`FRAME',4) dnl %esi already pushed + +C First multiply src[0]*src[1..size-1] and store at dst[1..size]. + + subl $STACK_SPACE-FRAME, %esp +deflit(`FRAME',STACK_SPACE) + movl $1, %ecx + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + movl %ebx, SAVE_EBX + subl %edx, %ecx C -(size-1) + + movl %ebp, SAVE_EBP + movl $0, %ebx C initial carry + + leal (%esi,%edx,4), %esi C &src[size] + movl %eax, %ebp C multiplier + + leal -4(%edi,%edx,4), %edi C &dst[size-1] + + +C This loop runs at just over 6 c/l. + +L(mul_1): + C eax scratch + C ebx carry + C ecx counter, limbs, negative, -(size-1) to -1 + C edx scratch + C esi &src[size] + C edi &dst[size-1] + C ebp multiplier + + movl %ebp, %eax + + mull (%esi,%ecx,4) + + addl %ebx, %eax + movl $0, %ebx + + adcl %edx, %ebx + movl %eax, 4(%edi,%ecx,4) + + incl %ecx + jnz L(mul_1) + + + movl %ebx, 4(%edi) + + +C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2. +C +C The last two addmuls, which are the bottom right corner of the product +C triangle, are left to the end. These are src[size-3]*src[size-2,size-1] +C and src[size-2]*src[size-1]. If size is 4 then it's only these corner +C cases that need to be done. +C +C The unrolled code is the same as mpn_addmul_1(), see that routine for some +C comments. +C +C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive. +C +C VAR_JMP is the computed jump into the unrolled code, stepped by one code +C chunk each outer loop. + +dnl This is also hard-coded in the address calculation below. +deflit(CODE_BYTES_PER_LIMB, 15) + +dnl With &src[size] and &dst[size-1] pointers, the displacements in the +dnl unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above +dnl that an offset must be added to them. 
+deflit(OFFSET, +ifelse(eval(UNROLL_COUNT>32),1, +eval((UNROLL_COUNT-32)*4), +0)) + + C eax + C ebx carry + C ecx + C edx + C esi &src[size] + C edi &dst[size-1] + C ebp + + movl PARAM_SIZE, %ecx + + subl $4, %ecx + jz L(corner) + + movl %ecx, %edx + negl %ecx + + shll $4, %ecx +ifelse(OFFSET,0,,`subl $OFFSET, %esi') + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx +') + negl %edx + +ifelse(OFFSET,0,,`subl $OFFSET, %edi') + + C The calculated jump mustn't be before the start of the available + C code. This is the limit that UNROLL_COUNT puts on the src operand + C size, but checked here using the jump address directly. + + ASSERT(ae, + `movl_text_address( L(unroll_inner_start), %eax) + cmpl %eax, %ecx') + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll_outer_top): + C eax + C ebx high limb to store + C ecx VAR_JMP + C edx VAR_COUNTER, limbs, negative + C esi &src[size], constant + C edi dst ptr, second highest limb of last addmul + C ebp + + movl -12+OFFSET(%esi,%edx,4), %ebp C multiplier + movl %edx, VAR_COUNTER + + movl -8+OFFSET(%esi,%edx,4), %eax C first limb of multiplicand + + mull %ebp + +define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')') + + testb $1, %cl + + movl %edx, %ebx C high carry + leal 4(%edi), %edi + + movl %ecx, %edx C jump + + movl %eax, %ecx C low carry + leal CODE_BYTES_PER_LIMB(%edx), %edx + + cmovX( %ebx, %ecx) C high carry reverse + cmovX( %eax, %ebx) C low carry reverse + movl %edx, VAR_JMP + jmp *%edx + + + C Must be on an even address here so the low bit of the jump address + C will indicate which way around ecx/ebx should start. 
+ + ALIGN(2) + +L(unroll_inner_start): + C eax scratch + C ebx carry high + C ecx carry low + C edx scratch + C esi src pointer + C edi dst pointer + C ebp multiplier + C + C 15 code bytes each limb + C ecx/ebx reversed on each chunk + +forloop(`i', UNROLL_COUNT, 1, ` + deflit(`disp_src', eval(-i*4 + OFFSET)) + deflit(`disp_dst', eval(disp_src)) + + m4_assert(`disp_src>=-128 && disp_src<128') + m4_assert(`disp_dst>=-128 && disp_dst<128') + +ifelse(eval(i%2),0,` +Zdisp( movl, disp_src,(%esi), %eax) + mull %ebp +Zdisp( addl, %ebx, disp_dst,(%edi)) + adcl %eax, %ecx + movl %edx, %ebx + adcl $0, %ebx +',` + dnl this one comes out last +Zdisp( movl, disp_src,(%esi), %eax) + mull %ebp +Zdisp( addl, %ecx, disp_dst,(%edi)) + adcl %eax, %ebx + movl %edx, %ecx + adcl $0, %ecx +') +') +L(unroll_inner_end): + + addl %ebx, m4_empty_if_zero(OFFSET)(%edi) + + movl VAR_COUNTER, %edx + adcl $0, %ecx + + movl %ecx, m4_empty_if_zero(OFFSET+4)(%edi) + movl VAR_JMP, %ecx + + incl %edx + jnz L(unroll_outer_top) + + +ifelse(OFFSET,0,,` + addl $OFFSET, %esi + addl $OFFSET, %edi +') + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(corner): + C eax + C ebx + C ecx + C edx + C esi &src[size] + C edi &dst[2*size-5] + C ebp + + movl -12(%esi), %eax + + mull -8(%esi) + + addl %eax, (%edi) + movl -12(%esi), %eax + movl $0, %ebx + + adcl %edx, %ebx + + mull -4(%esi) + + addl %eax, %ebx + movl -8(%esi), %eax + + adcl $0, %edx + + addl %ebx, 4(%edi) + movl $0, %ebx + + adcl %edx, %ebx + + mull -4(%esi) + + movl PARAM_SIZE, %ecx + addl %ebx, %eax + + adcl $0, %edx + + movl %eax, 8(%edi) + + movl %edx, 12(%edi) + movl PARAM_DST, %edi + + +C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1]. 
+ + subl $1, %ecx C size-1 + xorl %eax, %eax C ready for final adcl, and clear carry + + movl %ecx, %edx + movl PARAM_SRC, %esi + + +L(lshift): + C eax + C ebx + C ecx counter, size-1 to 1 + C edx size-1 (for later use) + C esi src (for later use) + C edi dst, incrementing + C ebp + + rcll 4(%edi) + rcll 8(%edi) + + leal 8(%edi), %edi + decl %ecx + jnz L(lshift) + + + adcl %eax, %eax + + movl %eax, 4(%edi) C dst most significant limb + movl (%esi), %eax C src[0] + + leal 4(%esi,%edx,4), %esi C &src[size] + subl %edx, %ecx C -(size-1) + + +C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ..., +C src[size-1]^2. dst[0] hasn't been set at all yet, and just gets the +C low limb of src[0]^2. + + + mull %eax + + movl %eax, (%edi,%ecx,8) C dst[0] + + +L(diag): + C eax scratch + C ebx scratch + C ecx counter, negative + C edx carry + C esi &src[size] + C edi dst[2*size-2] + C ebp + + movl (%esi,%ecx,4), %eax + movl %edx, %ebx + + mull %eax + + addl %ebx, 4(%edi,%ecx,8) + adcl %eax, 8(%edi,%ecx,8) + adcl $0, %edx + + incl %ecx + jnz L(diag) + + + movl SAVE_ESI, %esi + movl SAVE_EBX, %ebx + + addl %edx, 4(%edi) C dst most significant limb + + movl SAVE_EDI, %edi + movl SAVE_EBP, %ebp + addl $FRAME, %esp + ret + + + +C ----------------------------------------------------------------------------- +ifdef(`PIC',` +L(pic_calc): + addl (%esp), %ecx + addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx + addl %edx, %ecx + ret +') + + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/pentium/README b/ghc/rts/gmp/mpn/x86/pentium/README index d73b082..3b9ec8a 100644 --- a/ghc/rts/gmp/mpn/x86/pentium/README +++ b/ghc/rts/gmp/mpn/x86/pentium/README @@ -1,5 +1,51 @@ -This directory contains mpn functions optimized for Intel Pentium -processors. + + INTEL PENTIUM P5 MPN SUBROUTINES + + +This directory contains mpn functions optimized for Intel Pentium (P5,P54) +processors. The mmx subdirectory has code for Pentium with MMX (P55).
+ + +STATUS + + cycles/limb + + mpn_add_n/sub_n 2.375 + + mpn_copyi/copyd 1.0 + + mpn_divrem_1 44.0 + mpn_mod_1 44.0 + mpn_divexact_by3 15.0 + + mpn_l/rshift 5.375 normal (6.0 on P54) + 1.875 special shift by 1 bit + + mpn_mul_1 13.0 + mpn_add/submul_1 14.0 + + mpn_mul_basecase 14.2 cycles/crossproduct (approx) + + mpn_sqr_basecase 8 cycles/crossproduct (approx) + or 15.5 cycles/triangleproduct (approx) + +Pentium MMX gets the following improvements + + mpn_l/rshift 1.75 + + +1. mpn_lshift and mpn_rshift run at about 6 cycles/limb on P5 and P54, but the +documentation indicates that they should take only 43/8 = 5.375 cycles/limb, +or 5 cycles/limb asymptotically. The P55 runs them at the expected speed. + +2. mpn_add_n and mpn_sub_n run at asymptotically 2 cycles/limb. Due to loop +overhead and other delays (cache refill?), they run at or near 2.5 cycles/limb. + +3. mpn_mul_1, mpn_addmul_1, mpn_submul_1 all run 1 cycle faster than they +should. Intel documentation says a mul instruction is 10 cycles, but it +measures 9 and the routines using it run with it as 9. + + RELEVANT OPTIMIZATION ISSUES @@ -13,14 +59,19 @@ to different cache banks. The simplest way to insure this is to read/write two words from the same object. If we make operations on different objects, they might or might not be to the same cache bank. -STATUS -1. mpn_lshift and mpn_rshift run at about 6 cycles/limb, but the Pentium -documentation indicates that they should take only 43/8 = 5.375 cycles/limb, -or 5 cycles/limb asymptotically. -2. mpn_add_n and mpn_sub_n run at asymptotically 2 cycles/limb. Due to loop -overhead and other delays (cache refill?), they run at or near 2.5 cycles/limb. +REFERENCES -3. mpn_mul_1, mpn_addmul_1, mpn_submul_1 all run 1 cycle faster than they -should... +"Intel Architecture Optimization Manual", 1997, order number 242816. This +is mostly about P5, the parts about P6 aren't relevant. 
Available on-line: + + http://download.intel.com/design/PentiumII/manuals/242816.htm + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/ghc/rts/gmp/mpn/x86/pentium/aors_n.asm b/ghc/rts/gmp/mpn/x86/pentium/aors_n.asm new file mode 100644 index 0000000..a61082a --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/pentium/aors_n.asm @@ -0,0 +1,196 @@ +dnl Intel Pentium mpn_add_n/mpn_sub_n -- mpn addition and subtraction. +dnl +dnl P5: 2.375 cycles/limb + + +dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software +dnl Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
+ + +include(`../config.m4') + + +ifdef(`OPERATION_add_n',` + define(M4_inst, adcl) + define(M4_function_n, mpn_add_n) + define(M4_function_nc, mpn_add_nc) + +',`ifdef(`OPERATION_sub_n',` + define(M4_inst, sbbl) + define(M4_function_n, mpn_sub_n) + define(M4_function_nc, mpn_sub_nc) + +',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n +')')') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + + +C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size, mp_limb_t carry); + +defframe(PARAM_CARRY,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC2, 12) +defframe(PARAM_SRC1, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) +PROLOGUE(M4_function_nc) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC1,%esi + movl PARAM_SRC2,%ebp + movl PARAM_SIZE,%ecx + + movl (%ebp),%ebx + + decl %ecx + movl %ecx,%edx + shrl $3,%ecx + andl $7,%edx + testl %ecx,%ecx C zero carry flag + jz L(endgo) + + pushl %edx +FRAME_pushl() + movl PARAM_CARRY,%eax + shrl $1,%eax C shift bit 0 into carry + jmp LF(M4_function_n,oop) + +L(endgo): +deflit(`FRAME',16) + movl PARAM_CARRY,%eax + shrl $1,%eax C shift bit 0 into carry + jmp LF(M4_function_n,end) + +EPILOGUE() + + + ALIGN(8) +PROLOGUE(M4_function_n) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC1,%esi + movl PARAM_SRC2,%ebp + movl PARAM_SIZE,%ecx + + movl (%ebp),%ebx + + decl %ecx + movl %ecx,%edx + shrl $3,%ecx + andl $7,%edx + testl %ecx,%ecx C zero carry flag + jz L(end) + pushl %edx +FRAME_pushl() + + ALIGN(8) +L(oop): movl 28(%edi),%eax C fetch destination cache line + leal 32(%edi),%edi + +L(1): movl (%esi),%eax + movl 4(%esi),%edx + M4_inst %ebx,%eax + movl 4(%ebp),%ebx + M4_inst %ebx,%edx + movl 8(%ebp),%ebx + movl %eax,-32(%edi) + movl %edx,-28(%edi) + +L(2): movl 
8(%esi),%eax + movl 12(%esi),%edx + M4_inst %ebx,%eax + movl 12(%ebp),%ebx + M4_inst %ebx,%edx + movl 16(%ebp),%ebx + movl %eax,-24(%edi) + movl %edx,-20(%edi) + +L(3): movl 16(%esi),%eax + movl 20(%esi),%edx + M4_inst %ebx,%eax + movl 20(%ebp),%ebx + M4_inst %ebx,%edx + movl 24(%ebp),%ebx + movl %eax,-16(%edi) + movl %edx,-12(%edi) + +L(4): movl 24(%esi),%eax + movl 28(%esi),%edx + M4_inst %ebx,%eax + movl 28(%ebp),%ebx + M4_inst %ebx,%edx + movl 32(%ebp),%ebx + movl %eax,-8(%edi) + movl %edx,-4(%edi) + + leal 32(%esi),%esi + leal 32(%ebp),%ebp + decl %ecx + jnz L(oop) + + popl %edx +FRAME_popl() +L(end): + decl %edx C test %edx w/o clobbering carry + js L(end2) + incl %edx +L(oop2): + leal 4(%edi),%edi + movl (%esi),%eax + M4_inst %ebx,%eax + movl 4(%ebp),%ebx + movl %eax,-4(%edi) + leal 4(%esi),%esi + leal 4(%ebp),%ebp + decl %edx + jnz L(oop2) +L(end2): + movl (%esi),%eax + M4_inst %ebx,%eax + movl %eax,(%edi) + + sbbl %eax,%eax + negl %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/pentium/aorsmul_1.asm b/ghc/rts/gmp/mpn/x86/pentium/aorsmul_1.asm new file mode 100644 index 0000000..147b556 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/pentium/aorsmul_1.asm @@ -0,0 +1,99 @@ +dnl Intel Pentium mpn_addmul_1 -- mpn by limb multiplication. +dnl +dnl P5: 14.0 cycles/limb + + +dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, +dnl Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. */ + + +include(`../config.m4') + + +ifdef(`OPERATION_addmul_1', ` + define(M4_inst, addl) + define(M4_function_1, mpn_addmul_1) + +',`ifdef(`OPERATION_submul_1', ` + define(M4_inst, subl) + define(M4_function_1, mpn_submul_1) + +',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 +')')') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + + +C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); + +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) + +PROLOGUE(M4_function_1) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST, %edi + movl PARAM_SRC, %esi + movl PARAM_SIZE, %ecx + movl PARAM_MULTIPLIER, %ebp + + leal (%edi,%ecx,4), %edi + leal (%esi,%ecx,4), %esi + negl %ecx + xorl %ebx, %ebx + ALIGN(8) + +L(oop): adcl $0, %ebx + movl (%esi,%ecx,4), %eax + + mull %ebp + + addl %ebx, %eax + movl (%edi,%ecx,4), %ebx + + adcl $0, %edx + M4_inst %eax, %ebx + + movl %ebx, (%edi,%ecx,4) + incl %ecx + + movl %edx, %ebx + jnz L(oop) + + adcl $0, %ebx + movl %ebx, %eax + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/pentium/diveby3.asm b/ghc/rts/gmp/mpn/x86/pentium/diveby3.asm new file mode 100644 index 0000000..dbac816 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/pentium/diveby3.asm @@ -0,0 +1,183 @@ +dnl Intel P5 mpn_divexact_by3 -- mpn division by 3, expecting no remainder. +dnl +dnl P5: 15.0 cycles/limb + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t carry); + +defframe(PARAM_CARRY,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl multiplicative inverse of 3, modulo 2^32 +deflit(INVERSE_3, 0xAAAAAAAB) + +dnl ceil(b/3), ceil(b*2/3) and floor(b*2/3) where b=2^32 +deflit(ONE_THIRD_CEIL, 0x55555556) +deflit(TWO_THIRDS_CEIL, 0xAAAAAAAB) +deflit(TWO_THIRDS_FLOOR, 0xAAAAAAAA) + + .text + ALIGN(8) + +PROLOGUE(mpn_divexact_by3c) +deflit(`FRAME',0) + + movl PARAM_SRC, %ecx + movl PARAM_SIZE, %edx + + decl %edx + jnz L(two_or_more) + + movl (%ecx), %edx + movl PARAM_CARRY, %eax C risk of cache bank clash here + + movl PARAM_DST, %ecx + subl %eax, %edx + + sbbl %eax, %eax C 0 or -1 + + imull $INVERSE_3, %edx, %edx + + negl %eax C 0 or 1 + cmpl $ONE_THIRD_CEIL, %edx + + sbbl $-1, %eax C +1 if edx>=ceil(b/3) + cmpl $TWO_THIRDS_CEIL, %edx + + sbbl $-1, %eax C +1 if edx>=ceil(b*2/3) + movl %edx, (%ecx) + + ret + + +L(two_or_more): + C eax + C ebx + C ecx src + C edx size-1 + C esi + C edi + C ebp + + pushl %ebx FRAME_pushl() + pushl %esi FRAME_pushl() + + pushl %edi 
FRAME_pushl() + pushl %ebp FRAME_pushl() + + movl PARAM_DST, %edi + movl PARAM_CARRY, %esi + + movl (%ecx), %eax C src low limb + xorl %ebx, %ebx + + sub %esi, %eax + movl $TWO_THIRDS_FLOOR, %esi + + leal (%ecx,%edx,4), %ecx C &src[size-1] + leal (%edi,%edx,4), %edi C &dst[size-1] + + adcl $0, %ebx C carry, 0 or 1 + negl %edx C -(size-1) + + +C The loop needs a source limb ready at the top, which leads to one limb +C handled separately at the end, and the special case above for size==1. +C There doesn't seem to be any scheduling that would keep the speed but move +C the source load and carry subtract up to the top. +C +C The destination cache line prefetching adds 1 cycle to the loop but is +C considered worthwhile. The slowdown is a factor of 1.07, but will prevent +C repeated write-throughs if the destination isn't in L1. A version using +C an outer loop to prefetch only every 8 limbs (a cache line) proved to be +C no faster, due to unavoidable branch mispredictions in the inner loop. +C +C setc is 2 cycles on P54, so an adcl is used instead. If the movl $0,%ebx +C could be avoided then the src limb fetch could pair up and save a cycle. +C This would probably mean going to a two limb loop with the carry limb +C alternately positive or negative, since an sbbl %ebx,%ebx will leave a +C value which is in the opposite sense to the preceding sbbl/adcl %ebx,%eax. +C +C A register is used for TWO_THIRDS_FLOOR because a cmp can't be done as +C "cmpl %edx, $n" with the immediate as the second operand. +C +C The "4" source displacement is in the loop rather than the setup because +C this gets L(top) aligned to 8 bytes at no cost.
+ + ALIGN(8) +L(top): + C eax source limb, carry subtracted + C ebx carry (0 or 1) + C ecx &src[size-1] + C edx counter, limbs, negative + C esi TWO_THIRDS_FLOOR + C edi &dst[size-1] + C ebp scratch (result limb) + + imull $INVERSE_3, %eax, %ebp + + cmpl $ONE_THIRD_CEIL, %ebp + movl (%edi,%edx,4), %eax C dst cache line prefetch + + sbbl $-1, %ebx C +1 if ebp>=ceil(b/3) + cmpl %ebp, %esi + + movl 4(%ecx,%edx,4), %eax C next src limb + + sbbl %ebx, %eax C and further -1 if ebp>=ceil(b*2/3) + movl $0, %ebx + + adcl $0, %ebx C new carry + movl %ebp, (%edi,%edx,4) + + incl %edx + jnz L(top) + + + + imull $INVERSE_3, %eax, %edx + + cmpl $ONE_THIRD_CEIL, %edx + movl %edx, (%edi) + + sbbl $-1, %ebx C +1 if edx>=ceil(b/3) + cmpl $TWO_THIRDS_CEIL, %edx + + sbbl $-1, %ebx C +1 if edx>=ceil(b*2/3) + popl %ebp + + movl %ebx, %eax + popl %edi + + popl %esi + popl %ebx + + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/pentium/gmp-mparam.h b/ghc/rts/gmp/mpn/x86/pentium/gmp-mparam.h new file mode 100644 index 0000000..d3ed3d7 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/pentium/gmp-mparam.h @@ -0,0 +1,97 @@ +/* Intel P54 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + + +#ifndef UMUL_TIME +#define UMUL_TIME 9 /* cycles */ +#endif +#ifndef UDIV_TIME +#define UDIV_TIME 41 /* cycles */ +#endif + +/* bsf takes 18-42 cycles, put an average for uniform random numbers */ +#ifndef COUNT_TRAILING_ZEROS_TIME +#define COUNT_TRAILING_ZEROS_TIME 20 /* cycles */ +#endif + + +/* Generated by tuneup.c, 2000-07-06. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 14 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 179 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 22 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 153 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 46 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 110 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 13 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 4 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 25 +#endif + +#ifndef FFT_MUL_TABLE +#define FFT_MUL_TABLE { 496, 928, 1920, 4608, 14336, 40960, 0 } +#endif +#ifndef FFT_MODF_MUL_THRESHOLD +#define FFT_MODF_MUL_THRESHOLD 512 +#endif +#ifndef FFT_MUL_THRESHOLD +#define FFT_MUL_THRESHOLD 3840 +#endif + +#ifndef FFT_SQR_TABLE +#define FFT_SQR_TABLE { 496, 1184, 1920, 5632, 14336, 40960, 0 } +#endif +#ifndef FFT_MODF_SQR_THRESHOLD +#define FFT_MODF_SQR_THRESHOLD 512 +#endif +#ifndef FFT_SQR_THRESHOLD +#define FFT_SQR_THRESHOLD 3840 +#endif diff --git a/ghc/rts/gmp/mpn/x86/pentium/lshift.asm b/ghc/rts/gmp/mpn/x86/pentium/lshift.asm new file mode 100644 index 0000000..e1e35d4 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/pentium/lshift.asm @@ -0,0 +1,236 @@ +dnl Intel Pentium mpn_lshift -- mpn left shift. 
+dnl +dnl cycles/limb +dnl P5,P54: 6.0 +dnl P55: 5.375 + + +dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software +dnl Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does, +C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere. + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) +PROLOGUE(mpn_lshift) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC,%esi + movl PARAM_SIZE,%ebp + movl PARAM_SHIFT,%ecx + +C We can use faster code for shift-by-1 under certain conditions. 
+ cmp $1,%ecx + jne L(normal) + leal 4(%esi),%eax + cmpl %edi,%eax + jnc L(special) C jump if s_ptr + 1 >= res_ptr + leal (%esi,%ebp,4),%eax + cmpl %eax,%edi + jnc L(special) C jump if res_ptr >= s_ptr + size + +L(normal): + leal -4(%edi,%ebp,4),%edi + leal -4(%esi,%ebp,4),%esi + + movl (%esi),%edx + subl $4,%esi + xorl %eax,%eax + shldl( %cl, %edx, %eax) C compute carry limb + pushl %eax C push carry limb onto stack + + decl %ebp + pushl %ebp + shrl $3,%ebp + jz L(end) + + movl (%edi),%eax C fetch destination cache line + + ALIGN(4) +L(oop): movl -28(%edi),%eax C fetch destination cache line + movl %edx,%ebx + + movl (%esi),%eax + movl -4(%esi),%edx + shldl( %cl, %eax, %ebx) + shldl( %cl, %edx, %eax) + movl %ebx,(%edi) + movl %eax,-4(%edi) + + movl -8(%esi),%ebx + movl -12(%esi),%eax + shldl( %cl, %ebx, %edx) + shldl( %cl, %eax, %ebx) + movl %edx,-8(%edi) + movl %ebx,-12(%edi) + + movl -16(%esi),%edx + movl -20(%esi),%ebx + shldl( %cl, %edx, %eax) + shldl( %cl, %ebx, %edx) + movl %eax,-16(%edi) + movl %edx,-20(%edi) + + movl -24(%esi),%eax + movl -28(%esi),%edx + shldl( %cl, %eax, %ebx) + shldl( %cl, %edx, %eax) + movl %ebx,-24(%edi) + movl %eax,-28(%edi) + + subl $32,%esi + subl $32,%edi + decl %ebp + jnz L(oop) + +L(end): popl %ebp + andl $7,%ebp + jz L(end2) +L(oop2): + movl (%esi),%eax + shldl( %cl,%eax,%edx) + movl %edx,(%edi) + movl %eax,%edx + subl $4,%esi + subl $4,%edi + decl %ebp + jnz L(oop2) + +L(end2): + shll %cl,%edx C compute least significant limb + movl %edx,(%edi) C store it + + popl %eax C pop carry limb + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + + +C We loop from least significant end of the arrays, which is only +C permissible if the source and destination don't overlap, since the +C function is documented to work for overlapping source and destination.
+ +L(special): + movl (%esi),%edx + addl $4,%esi + + decl %ebp + pushl %ebp + shrl $3,%ebp + + addl %edx,%edx + incl %ebp + decl %ebp + jz L(Lend) + + movl (%edi),%eax C fetch destination cache line + + ALIGN(4) +L(Loop): + movl 28(%edi),%eax C fetch destination cache line + movl %edx,%ebx + + movl (%esi),%eax + movl 4(%esi),%edx + adcl %eax,%eax + movl %ebx,(%edi) + adcl %edx,%edx + movl %eax,4(%edi) + + movl 8(%esi),%ebx + movl 12(%esi),%eax + adcl %ebx,%ebx + movl %edx,8(%edi) + adcl %eax,%eax + movl %ebx,12(%edi) + + movl 16(%esi),%edx + movl 20(%esi),%ebx + adcl %edx,%edx + movl %eax,16(%edi) + adcl %ebx,%ebx + movl %edx,20(%edi) + + movl 24(%esi),%eax + movl 28(%esi),%edx + adcl %eax,%eax + movl %ebx,24(%edi) + adcl %edx,%edx + movl %eax,28(%edi) + + leal 32(%esi),%esi C use leal not to clobber carry + leal 32(%edi),%edi + decl %ebp + jnz L(Loop) + +L(Lend): + popl %ebp + sbbl %eax,%eax C save carry in %eax + andl $7,%ebp + jz L(Lend2) + addl %eax,%eax C restore carry from eax +L(Loop2): + movl %edx,%ebx + movl (%esi),%edx + adcl %edx,%edx + movl %ebx,(%edi) + + leal 4(%esi),%esi C use leal not to clobber carry + leal 4(%edi),%edi + decl %ebp + jnz L(Loop2) + + jmp L(L1) +L(Lend2): + addl %eax,%eax C restore carry from eax +L(L1): movl %edx,(%edi) C store last limb + + sbbl %eax,%eax + negl %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h b/ghc/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h new file mode 100644 index 0000000..2379077 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h @@ -0,0 +1,97 @@ +/* Intel P55 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + + +#ifndef UMUL_TIME +#define UMUL_TIME 9 /* cycles */ +#endif +#ifndef UDIV_TIME +#define UDIV_TIME 41 /* cycles */ +#endif + +/* bsf takes 18-42 cycles, put an average for uniform random numbers */ +#ifndef COUNT_TRAILING_ZEROS_TIME +#define COUNT_TRAILING_ZEROS_TIME 20 /* cycles */ +#endif + + +/* Generated by tuneup.c, 2000-07-06. 
*/ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 14 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 99 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 22 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 89 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 40 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 98 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 13 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 5 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 25 +#endif + +#ifndef FFT_MUL_TABLE +#define FFT_MUL_TABLE { 496, 1056, 1920, 4608, 14336, 40960, 0 } +#endif +#ifndef FFT_MODF_MUL_THRESHOLD +#define FFT_MODF_MUL_THRESHOLD 512 +#endif +#ifndef FFT_MUL_THRESHOLD +#define FFT_MUL_THRESHOLD 3840 +#endif + +#ifndef FFT_SQR_TABLE +#define FFT_SQR_TABLE { 496, 1184, 2176, 5632, 14336, 40960, 0 } +#endif +#ifndef FFT_MODF_SQR_THRESHOLD +#define FFT_MODF_SQR_THRESHOLD 512 +#endif +#ifndef FFT_SQR_THRESHOLD +#define FFT_SQR_THRESHOLD 4352 +#endif diff --git a/ghc/rts/gmp/mpn/x86/pentium/mmx/lshift.asm b/ghc/rts/gmp/mpn/x86/pentium/mmx/lshift.asm new file mode 100644 index 0000000..2225438 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/pentium/mmx/lshift.asm @@ -0,0 +1,455 @@ +dnl Intel P5 mpn_lshift -- mpn left shift. +dnl +dnl P5: 1.75 cycles/limb. + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C Shift src,size left by shift many bits and store the result in dst,size. +C Zeros are shifted in at the right. Return the bits shifted out at the +C left. +C +C The comments in mpn_rshift apply here too. + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +dnl minimum 5, because the unrolled loop can't handle less +deflit(UNROLL_THRESHOLD, 5) + + .text + ALIGN(8) + +PROLOGUE(mpn_lshift) + + pushl %ebx + pushl %edi +deflit(`FRAME',8) + + movl PARAM_SIZE, %eax + movl PARAM_DST, %edx + + movl PARAM_SRC, %ebx + movl PARAM_SHIFT, %ecx + + cmp $UNROLL_THRESHOLD, %eax + jae L(unroll) + + movl -4(%ebx,%eax,4), %edi C src high limb + decl %eax + + jnz L(simple) + + shldl( %cl, %edi, %eax) C eax was decremented to zero + + shll %cl, %edi + + movl %edi, (%edx) C dst low limb + popl %edi C risk of data cache bank clash + + popl %ebx + + ret + + +C ----------------------------------------------------------------------------- +L(simple): + C eax size-1 + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd (%ebx,%eax,4), %mm5 C src high limb + + movd %ecx, %mm6 C lshift + negl %ecx + + psllq %mm6, %mm5 + addl $32, %ecx + + movd %ecx, %mm7 + psrlq $32, %mm5 C retval + + +L(simple_top): + C eax counter, limbs, negative + C ebx src + C ecx + C edx dst + C esi + C edi + C + C mm0 scratch + C mm5 return value + C mm6 shift + C mm7 32-shift + + movq -4(%ebx,%eax,4), %mm0 + decl %eax + + psrlq %mm7, %mm0 + + C + + movd %mm0, 
4(%edx,%eax,4) + jnz L(simple_top) + + + movd (%ebx), %mm0 + + movd %mm5, %eax + psllq %mm6, %mm0 + + popl %edi + popl %ebx + + movd %mm0, (%edx) + + emms + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(unroll): + C eax size + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd -4(%ebx,%eax,4), %mm5 C src high limb + leal (%ebx,%eax,4), %edi + + movd %ecx, %mm6 C lshift + andl $4, %edi + + psllq %mm6, %mm5 + jz L(start_src_aligned) + + + C src isn't aligned, process high limb separately (marked xxx) to + C make it so. + C + C source -8(ebx,%eax,4) + C | + C +-------+-------+-------+-- + C | | + C +-------+-------+-------+-- + C 0mod8 4mod8 0mod8 + C + C dest + C -4(edx,%eax,4) + C | + C +-------+-------+-- + C | xxx | | + C +-------+-------+-- + + movq -8(%ebx,%eax,4), %mm0 C unaligned load + + psllq %mm6, %mm0 + decl %eax + + psrlq $32, %mm0 + + C + + movd %mm0, (%edx,%eax,4) +L(start_src_aligned): + + movq -8(%ebx,%eax,4), %mm1 C src high qword + leal (%edx,%eax,4), %edi + + andl $4, %edi + psrlq $32, %mm5 C return value + + movq -16(%ebx,%eax,4), %mm3 C src second highest qword + jz L(start_dst_aligned) + + C dst isn't aligned, subtract 4 to make it so, and pretend the shift + C is 32 bits extra. High limb of dst (marked xxx) handled here + C separately. 
+ C + C source -8(ebx,%eax,4) + C | + C +-------+-------+-- + C | mm1 | + C +-------+-------+-- + C 0mod8 4mod8 + C + C dest + C -4(edx,%eax,4) + C | + C +-------+-------+-------+-- + C | xxx | | + C +-------+-------+-------+-- + C 0mod8 4mod8 0mod8 + + movq %mm1, %mm0 + addl $32, %ecx C new shift + + psllq %mm6, %mm0 + + movd %ecx, %mm6 + psrlq $32, %mm0 + + C wasted cycle here waiting for %mm0 + + movd %mm0, -4(%edx,%eax,4) + subl $4, %edx +L(start_dst_aligned): + + + psllq %mm6, %mm1 + negl %ecx C -shift + + addl $64, %ecx C 64-shift + movq %mm3, %mm2 + + movd %ecx, %mm7 + subl $8, %eax C size-8 + + psrlq %mm7, %mm3 + + por %mm1, %mm3 C mm3 ready to store + jc L(finish) + + + C The comments in mpn_rshift apply here too. + + ALIGN(8) +L(unroll_loop): + C eax counter, limbs + C ebx src + C ecx + C edx dst + C esi + C edi + C + C mm0 + C mm1 + C mm2 src qword from 48(%ebx,%eax,4) + C mm3 dst qword ready to store to 56(%edx,%eax,4) + C + C mm5 return value + C mm6 lshift + C mm7 rshift + + movq 8(%ebx,%eax,4), %mm0 + psllq %mm6, %mm2 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + movq %mm3, 24(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq (%ebx,%eax,4), %mm3 C + psllq %mm6, %mm1 C + + movq %mm0, 16(%edx,%eax,4) + movq %mm3, %mm2 C + + psrlq %mm7, %mm3 C + subl $4, %eax + + por %mm1, %mm3 C + jnc L(unroll_loop) + + + +L(finish): + C eax -4 to -1 representing respectively 0 to 3 limbs remaining + + testb $2, %al + + jz L(finish_no_two) + + movq 8(%ebx,%eax,4), %mm0 + psllq %mm6, %mm2 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + movq %mm3, 24(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq %mm1, %mm2 + movq %mm0, %mm3 + + subl $2, %eax +L(finish_no_two): + + + C eax -4 or -3 representing respectively 0 or 1 limbs remaining + C + C mm2 src prev qword, from 48(%ebx,%eax,4) + C mm3 dst qword, for 56(%edx,%eax,4) + + testb $1, %al + movd %mm5, %eax C retval + + popl %edi + jz L(finish_zero) + + + C One extra src limb, destination was aligned. 
+ C + C source ebx + C --+---------------+-------+ + C | mm2 | | + C --+---------------+-------+ + C + C dest edx+12 edx+4 edx + C --+---------------+---------------+-------+ + C | mm3 | | | + C --+---------------+---------------+-------+ + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C One extra src limb, destination was unaligned. + C + C source ebx + C --+---------------+-------+ + C | mm2 | | + C --+---------------+-------+ + C + C dest edx+12 edx+4 + C --+---------------+---------------+ + C | mm3 | | + C --+---------------+---------------+ + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C In both cases there's one extra limb of src to fetch and combine + C with mm2 to make a qword at 4(%edx), and in the aligned case + C there's an extra limb of dst to be formed from that extra src limb + C left shifted. + + + movd (%ebx), %mm0 + psllq %mm6, %mm2 + + movq %mm3, 12(%edx) + psllq $32, %mm0 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + por %mm2, %mm0 + psllq %mm6, %mm1 + + movq %mm0, 4(%edx) + psrlq $32, %mm1 + + andl $32, %ecx + popl %ebx + + jz L(finish_one_unaligned) + + movd %mm1, (%edx) +L(finish_one_unaligned): + + emms + + ret + + +L(finish_zero): + + C No extra src limbs, destination was aligned. + C + C source ebx + C --+---------------+ + C | mm2 | + C --+---------------+ + C + C dest edx+8 edx + C --+---------------+---------------+ + C | mm3 | | + C --+---------------+---------------+ + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C No extra src limbs, destination was unaligned. + C + C source ebx + C --+---------------+ + C | mm2 | + C --+---------------+ + C + C dest edx+8 edx+4 + C --+---------------+-------+ + C | mm3 | | + C --+---------------+-------+ + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C The movd for the unaligned case writes the same data to 4(%edx) + C that the movq does for the aligned case. 
+ + + movq %mm3, 8(%edx) + andl $32, %ecx + + psllq %mm6, %mm2 + jz L(finish_zero_unaligned) + + movq %mm2, (%edx) +L(finish_zero_unaligned): + + psrlq $32, %mm2 + popl %ebx + + movd %mm5, %eax C retval + + movd %mm2, 4(%edx) + + emms + + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/pentium/mmx/popham.asm b/ghc/rts/gmp/mpn/x86/pentium/mmx/popham.asm new file mode 100644 index 0000000..587a07a --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/pentium/mmx/popham.asm @@ -0,0 +1,30 @@ +dnl Intel P55 mpn_popcount, mpn_hamdist -- population count and hamming +dnl distance. +dnl +dnl P55: popcount 11.5 cycles/limb, hamdist 12.0 cycles/limb + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) +include_mpn(`x86/k6/mmx/popham.asm') diff --git a/ghc/rts/gmp/mpn/x86/pentium/mmx/rshift.asm b/ghc/rts/gmp/mpn/x86/pentium/mmx/rshift.asm new file mode 100644 index 0000000..7672630 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/pentium/mmx/rshift.asm @@ -0,0 +1,460 @@ +dnl Intel P5 mpn_rshift -- mpn right shift. +dnl +dnl P5: 1.75 cycles/limb. 
+ + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C Shift src,size right by shift many bits and store the result in dst,size. +C Zeros are shifted in at the left. Return the bits shifted out at the +C right. +C +C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb, +C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l. +C +C Full speed depends on source and destination being aligned. Unaligned mmx +C loads and stores on P5 don't pair and have a 2 cycle penalty. Some hairy +C setups and finish-ups are done to ensure alignment for the loop. +C +C MMX shifts work out a bit faster even for the simple loop. + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +dnl Minimum 5, because the unrolled loop can't handle less. 
+deflit(UNROLL_THRESHOLD, 5) + + .text + ALIGN(8) + +PROLOGUE(mpn_rshift) + + pushl %ebx + pushl %edi +deflit(`FRAME',8) + + movl PARAM_SIZE, %eax + movl PARAM_DST, %edx + + movl PARAM_SRC, %ebx + movl PARAM_SHIFT, %ecx + + cmp $UNROLL_THRESHOLD, %eax + jae L(unroll) C size >= UNROLL_THRESHOLD + + decl %eax + movl (%ebx), %edi C src low limb + + jnz L(simple) C size-1 != 0 (flags still from decl) + + shrdl( %cl, %edi, %eax) C eax was decremented to zero + + shrl %cl, %edi + + movl %edi, (%edx) C dst low limb + popl %edi C risk of data cache bank clash + + popl %ebx + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(simple): + C eax size-1 + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd (%ebx), %mm5 C src[0] + leal (%ebx,%eax,4), %ebx C &src[size-1] + + movd %ecx, %mm6 C rshift + leal -4(%edx,%eax,4), %edx C &dst[size-2] + + psllq $32, %mm5 + negl %eax + + +C This loop is 5 or 8 cycles, with every second load unaligned and a wasted +C cycle waiting for the mm0 result to be ready. For comparison a shrdl is 4 +C cycles and would be 8 in a simple loop. Using mmx helps the return value +C and last limb calculations too.
+ +L(simple_top): + C eax counter, limbs, negative + C ebx &src[size-1] + C ecx shift + C edx &dst[size-2] + C + C mm0 scratch + C mm5 return value + C mm6 shift + + movq (%ebx,%eax,4), %mm0 + incl %eax + + psrlq %mm6, %mm0 + + movd %mm0, (%edx,%eax,4) + jnz L(simple_top) C flags still from incl + + + movd (%ebx), %mm0 + psrlq %mm6, %mm5 C return value + + psrlq %mm6, %mm0 + popl %edi + + movd %mm5, %eax + popl %ebx + + movd %mm0, 4(%edx) + + emms + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(unroll): + C eax size + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd (%ebx), %mm5 C src[0] + movl $4, %edi + + movd %ecx, %mm6 C rshift + testl %edi, %ebx C edi = 4, test src for 4mod8 + + psllq $32, %mm5 + jz L(start_src_aligned) + + + C src isn't aligned, process low limb separately (marked xxx) and + C step src and dst by one limb, making src aligned. + C + C source ebx + C --+-------+-------+-------+ + C | xxx | + C --+-------+-------+-------+ + C 4mod8 0mod8 4mod8 + C + C dest edx + C --+-------+-------+ + C | | xxx | + C --+-------+-------+ + + movq (%ebx), %mm0 C unaligned load + + psrlq %mm6, %mm0 + addl $4, %ebx + + decl %eax + + movd %mm0, (%edx) + addl $4, %edx +L(start_src_aligned): + + + movq (%ebx), %mm1 + testl %edi, %edx C test dst for 4mod8 + + psrlq %mm6, %mm5 C retval + jz L(start_dst_aligned) + + C dst isn't aligned, add 4 to make it so, and pretend the shift is + C 32 bits extra. Low limb of dst (marked xxx) handled here + C separately.
+ C + C source ebx + C --+-------+-------+ + C | mm1 | + C --+-------+-------+ + C 4mod8 0mod8 + C + C dest edx + C --+-------+-------+-------+ + C | xxx | + C --+-------+-------+-------+ + C 4mod8 0mod8 4mod8 + + movq %mm1, %mm0 + addl $32, %ecx C new shift + + psrlq %mm6, %mm0 + + movd %ecx, %mm6 + + movd %mm0, (%edx) + addl $4, %edx +L(start_dst_aligned): + + + movq 8(%ebx), %mm3 + negl %ecx + + movq %mm3, %mm2 C mm2 src qword + addl $64, %ecx + + movd %ecx, %mm7 + psrlq %mm6, %mm1 + + leal -12(%ebx,%eax,4), %ebx + leal -20(%edx,%eax,4), %edx + + psllq %mm7, %mm3 + subl $7, %eax C size-7 + + por %mm1, %mm3 C mm3 ready to store + negl %eax C -(size-7) + + jns L(finish) + + + C This loop is the important bit, the rest is just support. Careful + C instruction scheduling achieves the claimed 1.75 c/l. The + C relevant parts of the pairing rules are: + C + C - mmx loads and stores execute only in the U pipe + C - only one mmx shift in a pair + C - wait one cycle before storing an mmx register result + C - the usual address generation interlock + C + C Two qword calculations are slightly interleaved. The instructions + C marked "C" belong to the second qword, and the "C prev" one is for + C the second qword from the previous iteration.
+ + ALIGN(8) +L(unroll_loop): + C eax counter, limbs, negative + C ebx &src[size-12] + C ecx + C edx &dst[size-12] + C esi + C edi + C + C mm0 + C mm1 + C mm2 src qword from -8(%ebx,%eax,4) + C mm3 dst qword ready to store to -8(%edx,%eax,4) + C + C mm5 return value + C mm6 rshift + C mm7 lshift + + movq (%ebx,%eax,4), %mm0 + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + movq %mm3, -8(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq 8(%ebx,%eax,4), %mm3 C + psrlq %mm6, %mm1 C + + movq %mm0, (%edx,%eax,4) + movq %mm3, %mm2 C + + psllq %mm7, %mm3 C + addl $4, %eax + + por %mm1, %mm3 C + js L(unroll_loop) + + +L(finish): + C eax 0 to 3 representing respectively 3 to 0 limbs remaining + + testb $2, %al + + jnz L(finish_no_two) + + movq (%ebx,%eax,4), %mm0 + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + movq %mm3, -8(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq %mm1, %mm2 + movq %mm0, %mm3 + + addl $2, %eax +L(finish_no_two): + + + C eax 2 or 3 representing respectively 1 or 0 limbs remaining + C + C mm2 src prev qword, from -8(%ebx,%eax,4) + C mm3 dst qword, for -8(%edx,%eax,4) + + testb $1, %al + popl %edi + + movd %mm5, %eax C retval + jnz L(finish_zero) + + + C One extra limb, destination was aligned. + C + C source ebx + C +-------+---------------+-- + C | | mm2 | + C +-------+---------------+-- + C + C dest edx + C +-------+---------------+---------------+-- + C | | | mm3 | + C +-------+---------------+---------------+-- + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C One extra limb, destination was unaligned.
+ C + C source ebx + C +-------+---------------+-- + C | | mm2 | + C +-------+---------------+-- + C + C dest edx + C +---------------+---------------+-- + C | | mm3 | + C +---------------+---------------+-- + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C In both cases there's one extra limb of src to fetch and combine + C with mm2 to make a qword at 8(%edx), and in the aligned case + C there's a further extra limb of dst to be formed. + + + movd 8(%ebx), %mm0 + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + movq %mm3, (%edx) + por %mm2, %mm0 + + psrlq %mm6, %mm1 + andl $32, %ecx + + popl %ebx + jz L(finish_one_unaligned) + + C dst was aligned, must store one extra limb + movd %mm1, 16(%edx) +L(finish_one_unaligned): + + movq %mm0, 8(%edx) + + emms + + ret + + +L(finish_zero): + + C No extra limbs, destination was aligned. + C + C source ebx + C +---------------+-- + C | mm2 | + C +---------------+-- + C + C dest edx+4 + C +---------------+---------------+-- + C | | mm3 | + C +---------------+---------------+-- + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C No extra limbs, destination was unaligned. + C + C source ebx + C +---------------+-- + C | mm2 | + C +---------------+-- + C + C dest edx+4 + C +-------+---------------+-- + C | | mm3 | + C +-------+---------------+-- + C + C mm6 = shift+32 + C mm7 = 64-(shift+32) + + + C The movd for the unaligned case is clearly the same data as the + C movq for the aligned case, it's just a choice between whether one + C or two limbs should be written.
+ + + movq %mm3, 4(%edx) + psrlq %mm6, %mm2 + + movd %mm2, 12(%edx) + andl $32, %ecx + + popl %ebx + jz L(finish_zero_unaligned) + + movq %mm2, 12(%edx) +L(finish_zero_unaligned): + + emms + + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/pentium/mul_1.asm b/ghc/rts/gmp/mpn/x86/pentium/mul_1.asm new file mode 100644 index 0000000..08639ec --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/pentium/mul_1.asm @@ -0,0 +1,79 @@ +dnl Intel Pentium mpn_mul_1 -- mpn by limb multiplication. +dnl +dnl P5: 13.0 cycles/limb + +dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, +dnl Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA.
*/ + + +include(`../config.m4') + + +C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier); +C Multiply src,size by multiplier, store the low limbs at dst,size and return the final carry limb. +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) +PROLOGUE(mpn_mul_1) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST, %edi + movl PARAM_SRC, %esi + movl PARAM_SIZE, %ecx + movl PARAM_MULTIPLIER, %ebp + + leal (%edi,%ecx,4), %edi C &dst[size] + leal (%esi,%ecx,4), %esi C &src[size] + negl %ecx C -size, counts up to zero + xorl %ebx, %ebx C carry limb = 0, carry flag clear + ALIGN(8) + +L(oop): adcl $0, %ebx C fold carry flag of the previous addl into the carry limb + movl (%esi,%ecx,4), %eax + + mull %ebp C edx:eax = src limb * multiplier + + addl %eax, %ebx C low product + carry limb + + movl %ebx, (%edi,%ecx,4) + incl %ecx C incl leaves the carry flag alone + + movl %edx, %ebx C high product is the next carry limb + jnz L(oop) + + adcl $0, %ebx + movl %ebx, %eax C return the final carry limb + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/pentium/mul_basecase.asm b/ghc/rts/gmp/mpn/x86/pentium/mul_basecase.asm new file mode 100644 index 0000000..d9f79a0 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/pentium/mul_basecase.asm @@ -0,0 +1,135 @@ +dnl Intel Pentium mpn_mul_basecase -- mpn by mpn multiplication. +dnl +dnl P5: 14.2 cycles/crossproduct (approx) + + +dnl Copyright (C) 1996, 1998, 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB.
If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C void mpn_mul_basecase (mp_ptr wp, +C mp_srcptr xp, mp_size_t xsize, +C mp_srcptr yp, mp_size_t ysize); + +defframe(PARAM_YSIZE, 20) +defframe(PARAM_YP, 16) +defframe(PARAM_XSIZE, 12) +defframe(PARAM_XP, 8) +defframe(PARAM_WP, 4) + +defframe(VAR_COUNTER, -4) + + .text + ALIGN(8) +PROLOGUE(mpn_mul_basecase) + + pushl %eax C dummy push for allocating stack slot + pushl %esi + pushl %ebp + pushl %edi +deflit(`FRAME',16) + + movl PARAM_XP,%esi + movl PARAM_WP,%edi + movl PARAM_YP,%ebp + + movl (%esi),%eax C load xp[0] + mull (%ebp) C multiply by yp[0] + movl %eax,(%edi) C store to wp[0] + movl PARAM_XSIZE,%ecx C xsize + decl %ecx C If xsize = 1, ysize = 1 too + jz L(done) + + movl PARAM_XSIZE,%eax + pushl %ebx +FRAME_pushl() + movl %edx,%ebx C high limb of xp[0]*yp[0] is the initial carry limb + leal (%esi,%eax,4),%esi C make xp point at end + leal (%edi,%eax,4),%edi C offset wp by xsize + negl %ecx C negate j size/index for inner loop + xorl %eax,%eax C clear carry + + ALIGN(8) +L(oop1): adcl $0,%ebx C fold carry flag of the previous addl into the carry limb + movl (%esi,%ecx,4),%eax C load next limb at xp[j] + mull (%ebp) + addl %ebx,%eax + movl %eax,(%edi,%ecx,4) + incl %ecx + movl %edx,%ebx + jnz L(oop1) + + adcl $0,%ebx + movl PARAM_YSIZE,%eax + movl %ebx,(%edi) C most significant limb of product + addl $4,%edi C increment wp + decl %eax + jz L(skip) C ysize = 1, no further rows to add + movl %eax,VAR_COUNTER C set index i to ysize-1 + +L(outer): + addl $4,%ebp C make ebp point to next y limb + movl PARAM_XSIZE,%ecx + negl %ecx + xorl %ebx,%ebx + + C code at 0x61 here, close enough to aligned +L(oop2): + adcl $0,%ebx + movl (%esi,%ecx,4),%eax + mull (%ebp) + addl %ebx,%eax + movl (%edi,%ecx,4),%ebx + adcl $0,%edx + addl %eax,%ebx + movl %ebx,(%edi,%ecx,4) + incl %ecx + movl %edx,%ebx + jnz L(oop2) + + adcl $0,%ebx + + movl %ebx,(%edi) + addl $4,%edi + movl VAR_COUNTER,%eax + decl %eax + movl %eax,VAR_COUNTER + jnz L(outer) + +L(skip): + popl %ebx + popl %edi
+ popl %ebp + popl %esi + addl $4,%esp C drop the VAR_COUNTER slot + ret + +L(done): + movl %edx,4(%edi) C store to wp[1] + popl %edi + popl %ebp + popl %esi + popl %eax C dummy pop for deallocating stack slot + ret + +EPILOGUE() + + diff --git a/ghc/rts/gmp/mpn/x86/pentium/rshift.asm b/ghc/rts/gmp/mpn/x86/pentium/rshift.asm new file mode 100644 index 0000000..e8f5ae8 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/pentium/rshift.asm @@ -0,0 +1,236 @@ +dnl Intel Pentium mpn_rshift -- mpn right shift. +dnl +dnl cycles/limb +dnl P5,P54: 6.0 +dnl P55: 5.375 + + +dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software +dnl Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does, +C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
+ +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) +PROLOGUE(mpn_rshift) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC,%esi + movl PARAM_SIZE,%ebp + movl PARAM_SHIFT,%ecx + +C We can use faster code for shift-by-1 under certain conditions. + cmp $1,%ecx + jne L(normal) + leal 4(%edi),%eax + cmpl %esi,%eax + jnc L(special) C jump if res_ptr + 1 >= s_ptr + leal (%edi,%ebp,4),%eax + cmpl %eax,%esi + jnc L(special) C jump if s_ptr >= res_ptr + size + +L(normal): + movl (%esi),%edx + addl $4,%esi + xorl %eax,%eax + shrdl( %cl, %edx, %eax) C compute carry limb + pushl %eax C push carry limb onto stack + + decl %ebp + pushl %ebp C save size-1 for the remainder count at L(end) + shrl $3,%ebp C number of 8-limb blocks + jz L(end) + + movl (%edi),%eax C fetch destination cache line + + ALIGN(4) +L(oop): movl 28(%edi),%eax C fetch destination cache line + movl %edx,%ebx + + movl (%esi),%eax + movl 4(%esi),%edx + shrdl( %cl, %eax, %ebx) + shrdl( %cl, %edx, %eax) + movl %ebx,(%edi) + movl %eax,4(%edi) + + movl 8(%esi),%ebx + movl 12(%esi),%eax + shrdl( %cl, %ebx, %edx) + shrdl( %cl, %eax, %ebx) + movl %edx,8(%edi) + movl %ebx,12(%edi) + + movl 16(%esi),%edx + movl 20(%esi),%ebx + shrdl( %cl, %edx, %eax) + shrdl( %cl, %ebx, %edx) + movl %eax,16(%edi) + movl %edx,20(%edi) + + movl 24(%esi),%eax + movl 28(%esi),%edx + shrdl( %cl, %eax, %ebx) + shrdl( %cl, %edx, %eax) + movl %ebx,24(%edi) + movl %eax,28(%edi) + + addl $32,%esi + addl $32,%edi + decl %ebp + jnz L(oop) + +L(end): popl %ebp + andl $7,%ebp C limbs left over after the 8-limb blocks + jz L(end2) +L(oop2): + movl (%esi),%eax + shrdl( %cl,%eax,%edx) C compute result limb + movl %edx,(%edi) + movl %eax,%edx + addl $4,%esi + addl $4,%edi + decl %ebp + jnz L(oop2) + +L(end2): + shrl %cl,%edx C compute most significant limb + movl %edx,(%edi) C store it + + popl %eax C pop carry limb + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + + +C We loop from most significant end of the arrays, which
is only +C permissible if the source and destination don't overlap, since the +C function is documented to work for overlapping source and destination. + +L(special): + leal -4(%edi,%ebp,4),%edi C &dst[size-1] + leal -4(%esi,%ebp,4),%esi C &src[size-1] + + movl (%esi),%edx + subl $4,%esi + + decl %ebp + pushl %ebp + shrl $3,%ebp + + shrl %edx C shift the top limb, its low bit goes to the carry flag + incl %ebp C incl/decl pair tests ebp for zero without touching the carry flag + decl %ebp + jz L(Lend) + + movl (%edi),%eax C fetch destination cache line + + ALIGN(4) +L(Loop): + movl -28(%edi),%eax C fetch destination cache line + movl %edx,%ebx + + movl (%esi),%eax + movl -4(%esi),%edx + rcrl %eax + movl %ebx,(%edi) + rcrl %edx + movl %eax,-4(%edi) + + movl -8(%esi),%ebx + movl -12(%esi),%eax + rcrl %ebx + movl %edx,-8(%edi) + rcrl %eax + movl %ebx,-12(%edi) + + movl -16(%esi),%edx + movl -20(%esi),%ebx + rcrl %edx + movl %eax,-16(%edi) + rcrl %ebx + movl %edx,-20(%edi) + + movl -24(%esi),%eax + movl -28(%esi),%edx + rcrl %eax + movl %ebx,-24(%edi) + rcrl %edx + movl %eax,-28(%edi) + + leal -32(%esi),%esi C use leal not to clobber carry + leal -32(%edi),%edi + decl %ebp + jnz L(Loop) + +L(Lend): + popl %ebp + sbbl %eax,%eax C save carry in %eax + andl $7,%ebp + jz L(Lend2) + addl %eax,%eax C restore carry from eax +L(Loop2): + movl %edx,%ebx + movl (%esi),%edx + rcrl %edx + movl %ebx,(%edi) + + leal -4(%esi),%esi C use leal not to clobber carry + leal -4(%edi),%edi + decl %ebp + jnz L(Loop2) + + jmp L(L1) +L(Lend2): + addl %eax,%eax C restore carry from eax +L(L1): movl %edx,(%edi) C store last limb + + movl $0,%eax C movl, not xorl, so the carry flag survives + rcrl %eax C return value: the shifted-out bit in the top bit of eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/pentium/sqr_basecase.asm b/ghc/rts/gmp/mpn/x86/pentium/sqr_basecase.asm new file mode 100644 index 0000000..c8584df --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/pentium/sqr_basecase.asm @@ -0,0 +1,520 @@ +dnl Intel P5 mpn_sqr_basecase -- square an mpn number. +dnl +dnl P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular +dnl product at around 20x20 limbs.
+ + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Calculate src,size squared, storing the result in dst,2*size. +C +C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a +C lot of function call overheads are avoided, especially when the size is +C small. 
+ +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) +PROLOGUE(mpn_sqr_basecase) +deflit(`FRAME',0) + + movl PARAM_SIZE, %edx + movl PARAM_SRC, %eax + + cmpl $2, %edx + movl PARAM_DST, %ecx + + je L(two_limbs) + + movl (%eax), %eax + ja L(three_or_more) C flags still from the cmpl, movl doesn't change them + +C ----------------------------------------------------------------------------- +C one limb only + C eax src low limb + C ebx + C ecx dst + C edx + + mull %eax + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + + ret + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(two_limbs): + C eax src + C ebx + C ecx dst + C edx size + + pushl %ebp + pushl %edi + + pushl %esi + pushl %ebx + + movl %eax, %ebx + movl (%eax), %eax + + mull %eax C src[0]^2 + + movl %eax, (%ecx) C dst[0] + movl %edx, %esi C dst[1] + + movl 4(%ebx), %eax + + mull %eax C src[1]^2 + + movl %eax, %edi C dst[2] + movl %edx, %ebp C dst[3] + + movl (%ebx), %eax + + mull 4(%ebx) C src[0]*src[1] + + addl %eax, %esi + popl %ebx + + adcl %edx, %edi + + adcl $0, %ebp + addl %esi, %eax + + adcl %edi, %edx + movl %eax, 4(%ecx) + + adcl $0, %ebp + popl %esi + + movl %edx, 8(%ecx) + movl %ebp, 12(%ecx) + + popl %edi + popl %ebp + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(three_or_more): + C eax src low limb + C ebx + C ecx dst + C edx size + + cmpl $4, %edx + pushl %ebx +deflit(`FRAME',4) + + movl PARAM_SRC, %ebx + jae L(four_or_more) + + +C ----------------------------------------------------------------------------- +C three limbs + C eax src low limb + C ebx src + C ecx dst + C edx size + + pushl %ebp + pushl %edi + + mull %eax C src[0] ^ 2 + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + + movl 4(%ebx), %eax + xorl %ebp, %ebp + + mull %eax C src[1] ^ 2 + + movl %eax, 8(%ecx) + movl %edx, 12(%ecx) + + movl 8(%ebx), %eax + pushl %esi C risk of cache bank clash + + mull %eax C src[2] ^ 2 + + movl %eax, 16(%ecx) + movl %edx,
20(%ecx) + + movl (%ebx), %eax + + mull 4(%ebx) C src[0] * src[1] + + movl %eax, %esi + movl %edx, %edi + + movl (%ebx), %eax + + mull 8(%ebx) C src[0] * src[2] + + addl %eax, %edi + movl %edx, %ebp + + adcl $0, %ebp + movl 4(%ebx), %eax + + mull 8(%ebx) C src[1] * src[2] + + xorl %ebx, %ebx + addl %eax, %ebp + + C eax + C ebx zero, will be dst[5] + C ecx dst + C edx dst[4] + C esi dst[1] + C edi dst[2] + C ebp dst[3] + + adcl $0, %edx + addl %esi, %esi + + adcl %edi, %edi + + adcl %ebp, %ebp + + adcl %edx, %edx + movl 4(%ecx), %eax + + adcl $0, %ebx + addl %esi, %eax + + movl %eax, 4(%ecx) + movl 8(%ecx), %eax + + adcl %edi, %eax + movl 12(%ecx), %esi + + adcl %ebp, %esi + movl 16(%ecx), %edi + + movl %eax, 8(%ecx) + movl %esi, 12(%ecx) + + adcl %edx, %edi + popl %esi + + movl 20(%ecx), %eax + movl %edi, 16(%ecx) + + popl %edi + popl %ebp + + adcl %ebx, %eax C no carry out of this + popl %ebx + + movl %eax, 20(%ecx) + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(four_or_more): + C eax src low limb + C ebx src + C ecx dst + C edx size + C esi + C edi + C ebp + C + C First multiply src[0]*src[1..size-1] and store at dst[1..size]. + +deflit(`FRAME',4) + + pushl %edi +FRAME_pushl() + pushl %esi +FRAME_pushl() + + pushl %ebp +FRAME_pushl() + leal (%ecx,%edx,4), %edi C dst end of this mul1 + + leal (%ebx,%edx,4), %esi C src end + movl %ebx, %ebp C src + + negl %edx C -size + xorl %ebx, %ebx C clear carry limb and carry flag + + leal 1(%edx), %ecx C -(size-1) + +L(mul1): + C eax scratch + C ebx carry + C ecx counter, negative + C edx scratch + C esi &src[size] + C edi &dst[size] + C ebp src + + adcl $0, %ebx + movl (%esi,%ecx,4), %eax + + mull (%ebp) + + addl %eax, %ebx + + movl %ebx, (%edi,%ecx,4) + incl %ecx + + movl %edx, %ebx + jnz L(mul1) + + + C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for + C n=1..size-2.
+ C + C The last two products, which are the end corner of the product + C triangle, are handled separately to save looping overhead. These + C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1]. + C If size is 4 then it's only these that need to be done. + C + C In the outer loop %esi is a constant, and %edi just advances by 1 + C limb each time. The size of the operation decreases by 1 limb + C each time. + + C eax + C ebx carry (needing carry flag added) + C ecx + C edx + C esi &src[size] + C edi &dst[size] + C ebp + + adcl $0, %ebx + movl PARAM_SIZE, %edx + + movl %ebx, (%edi) + subl $4, %edx + + negl %edx + jz L(corner) + + +L(outer): + C ebx previous carry limb to store + C edx outer loop counter (negative) + C esi &src[size] + C edi dst, pointing at stored carry limb of previous loop + + pushl %edx C new outer loop counter + leal -2(%edx), %ecx + + movl %ebx, (%edi) + addl $4, %edi + + addl $4, %ebp + xorl %ebx, %ebx C initial carry limb, clear carry flag + +L(inner): + C eax scratch + C ebx carry (needing carry flag added) + C ecx counter, negative + C edx scratch + C esi &src[size] + C edi dst end of this addmul + C ebp &src[j] + + adcl $0, %ebx + movl (%esi,%ecx,4), %eax + + mull (%ebp) + + addl %ebx, %eax + movl (%edi,%ecx,4), %ebx + + adcl $0, %edx + addl %eax, %ebx + + movl %ebx, (%edi,%ecx,4) + incl %ecx + + movl %edx, %ebx + jnz L(inner) + + + adcl $0, %ebx + popl %edx C outer loop counter + + incl %edx + jnz L(outer) + + + movl %ebx, (%edi) + +L(corner): + C esi &src[size] + C edi &dst[2*size-4] + + movl -8(%esi), %eax + movl -4(%edi), %ebx C risk of data cache bank clash here + + mull -12(%esi) C src[size-2]*src[size-3] + + addl %eax, %ebx + movl %edx, %ecx + + adcl $0, %ecx + movl -4(%esi), %eax + + mull -12(%esi) C src[size-1]*src[size-3] + + addl %ecx, %eax + movl (%edi), %ecx + + adcl $0, %edx + movl %ebx, -4(%edi) + + addl %eax, %ecx + movl %edx, %ebx + + adcl $0, %ebx + movl -4(%esi), %eax + + mull -8(%esi) C
src[size-1]*src[size-2] + + movl %ecx, 0(%edi) + addl %eax, %ebx + + adcl $0, %edx + movl PARAM_SIZE, %eax + + negl %eax + movl %ebx, 4(%edi) + + addl $1, %eax C -(size-1) and clear carry + movl %edx, 8(%edi) + + +C ----------------------------------------------------------------------------- +C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1]. + +L(lshift): + C eax counter, negative + C ebx next limb + C ecx + C edx + C esi + C edi &dst[2*size-4] + C ebp + + movl 12(%edi,%eax,8), %ebx + + rcll %ebx + movl 16(%edi,%eax,8), %ecx + + rcll %ecx + movl %ebx, 12(%edi,%eax,8) + + movl %ecx, 16(%edi,%eax,8) + incl %eax + + jnz L(lshift) + + + adcl %eax, %eax C high bit out + movl PARAM_SRC, %esi + + movl PARAM_SIZE, %ecx C risk of cache bank clash + movl %eax, 12(%edi) C dst most significant limb + + +C ----------------------------------------------------------------------------- +C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ..., +C src[size-1]^2. dst[0] hasn't been set at all yet, and just gets the +C low limb of src[0]^2.
+ + movl (%esi), %eax C src[0] + leal (%esi,%ecx,4), %esi C src end + + negl %ecx + + mull %eax + + movl %eax, 16(%edi,%ecx,8) C dst[0] + movl %edx, %ebx + + addl $1, %ecx C -(size-1) and clear carry + +L(diag): + C eax scratch (low product) + C ebx carry limb + C ecx counter, negative + C edx scratch (high product) + C esi &src[size] + C edi &dst[2*size-4] + C ebp scratch (fetched dst limbs) + + movl (%esi,%ecx,4), %eax + adcl $0, %ebx + + mull %eax + + movl 16-4(%edi,%ecx,8), %ebp + + addl %ebp, %ebx + movl 16(%edi,%ecx,8), %ebp + + adcl %eax, %ebp + movl %ebx, 16-4(%edi,%ecx,8) + + movl %ebp, 16(%edi,%ecx,8) + incl %ecx + + movl %edx, %ebx + jnz L(diag) + + + adcl $0, %edx + movl 16-4(%edi), %eax C dst most significant limb + + addl %eax, %edx + popl %ebp + + movl %edx, 16-4(%edi) + popl %esi C risk of cache bank clash + + popl %edi + popl %ebx + + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/rshift.asm b/ghc/rts/gmp/mpn/x86/rshift.asm new file mode 100644 index 0000000..c9881fd --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/rshift.asm @@ -0,0 +1,92 @@ +dnl x86 mpn_rshift -- mpn right shift. + +dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, +dnl Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB.
If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C Shift src,size right by shift bits into dst,size; the bits shifted out are returned in the high end of eax. +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) +PROLOGUE(mpn_rshift) + + pushl %edi + pushl %esi + pushl %ebx +deflit(`FRAME',12) + + movl PARAM_DST,%edi + movl PARAM_SRC,%esi + movl PARAM_SIZE,%edx + movl PARAM_SHIFT,%ecx + + leal -4(%edi,%edx,4),%edi C &dst[size-1] + leal (%esi,%edx,4),%esi C &src[size] + negl %edx C -size, counts up to zero + + movl (%esi,%edx,4),%ebx C read least significant limb + xorl %eax,%eax + shrdl( %cl, %ebx, %eax) C compute carry limb + incl %edx + jz L(end) C single limb, carry limb stays in eax + pushl %eax C push carry limb onto stack + testb $1,%dl + jnz L(1) C enter loop in the middle + movl %ebx,%eax + + ALIGN(8) +L(oop): movl (%esi,%edx,4),%ebx C load next higher limb + shrdl( %cl, %ebx, %eax) C compute result limb + movl %eax,(%edi,%edx,4) C store it + incl %edx +L(1): movl (%esi,%edx,4),%eax + shrdl( %cl, %eax, %ebx) + movl %ebx,(%edi,%edx,4) + incl %edx + jnz L(oop) + + shrl %cl,%eax C compute most significant limb + movl %eax,(%edi) C store it + + popl %eax C pop carry limb + + popl %ebx + popl %esi + popl %edi + ret + +L(end): shrl %cl,%ebx C compute most significant limb + movl %ebx,(%edi) C store it + + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/udiv.asm b/ghc/rts/gmp/mpn/x86/udiv.asm new file mode 100644 index 0000000..9fe022b --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/udiv.asm @@ -0,0 +1,44 @@ +dnl x86 mpn_udiv_qrnnd -- 2 by 1 limb division + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_udiv_qrnnd (mp_limb_t *remptr, mp_limb_t high, mp_limb_t low, +C mp_limb_t divisor); + +defframe(PARAM_DIVISOR, 16) +defframe(PARAM_LOW, 12) +defframe(PARAM_HIGH, 8) +defframe(PARAM_REMPTR, 4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_udiv_qrnnd) +deflit(`FRAME',0) + movl PARAM_LOW, %eax C eax = low limb of the dividend + movl PARAM_HIGH, %edx C edx = high limb of the dividend + divl PARAM_DIVISOR C edx:eax / divisor, quotient to eax, remainder to edx + movl PARAM_REMPTR, %ecx + movl %edx, (%ecx) C store the remainder through remptr + ret C quotient is left in eax +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/umul.asm b/ghc/rts/gmp/mpn/x86/umul.asm new file mode 100644 index 0000000..3d289d1 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/umul.asm @@ -0,0 +1,43 @@ +dnl mpn_umul_ppmm -- 1x1->2 limb multiplication + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_umul_ppmm (mp_limb_t *lowptr, mp_limb_t m1, mp_limb_t m2); +C Store the low limb of m1*m2 at *lowptr; the high limb is left in eax. + +defframe(PARAM_M2, 12) +defframe(PARAM_M1, 8) +defframe(PARAM_LOWPTR, 4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_umul_ppmm) +deflit(`FRAME',0) + movl PARAM_LOWPTR, %ecx + movl PARAM_M1, %eax + mull PARAM_M2 C edx:eax = m1 * m2 + movl %eax, (%ecx) C store the low limb through lowptr + movl %edx, %eax C high limb is left in eax + ret +EPILOGUE() diff --git a/ghc/rts/gmp/mpn/x86/x86-defs.m4 b/ghc/rts/gmp/mpn/x86/x86-defs.m4 new file mode 100644 index 0000000..2dad698 --- /dev/null +++ b/ghc/rts/gmp/mpn/x86/x86-defs.m4 @@ -0,0 +1,713 @@ +divert(-1) + +dnl m4 macros for x86 assembler. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. 
If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +dnl Notes: +dnl +dnl m4 isn't perfect for processing BSD style x86 assembler code, the main +dnl problems are, +dnl +dnl 1. Doing define(foo,123) and then using foo in an addressing mode like +dnl foo(%ebx) expands as a macro rather than a constant. This is worked +dnl around by using deflit() from asm-defs.m4, instead of define(). +dnl +dnl 2. Immediates in macro definitions need a space or `' to stop the $ +dnl looking like a macro parameter. For example, +dnl +dnl define(foo, `mov $ 123, %eax') +dnl +dnl This is only a problem in macro definitions, not in ordinary text, +dnl nor in macro parameters like text passed to forloop() or ifdef(). + + +deflit(BYTES_PER_MP_LIMB, 4) + + +dnl -------------------------------------------------------------------------- +dnl Replacement PROLOGUE/EPILOGUE with more sophisticated error checking. +dnl Nesting and overlapping not allowed. +dnl + + +dnl Usage: PROLOGUE(functionname) +dnl +dnl Generate a function prologue. functionname gets GSYM_PREFIX added. +dnl Examples, +dnl +dnl PROLOGUE(mpn_add_n) +dnl PROLOGUE(somefun) + +define(`PROLOGUE', +m4_assert_numargs(1) +m4_assert_defined(`PROLOGUE_cpu') +`ifdef(`PROLOGUE_current_function', +`m4_error(`PROLOGUE'(`PROLOGUE_current_function') needs an `EPILOGUE'() before `PROLOGUE'($1) +)')dnl +m4_file_seen()dnl +define(`PROLOGUE_current_function',`$1')dnl +PROLOGUE_cpu(GSYM_PREFIX`'$1)') + + +dnl Usage: EPILOGUE() +dnl +dnl Notice the function name is passed to EPILOGUE_cpu(), letting it use $1 +dnl instead of the long PROLOGUE_current_function symbol. 
+ +define(`EPILOGUE', +m4_assert_numargs(0) +m4_assert_defined(`EPILOGUE_cpu') +`ifdef(`PROLOGUE_current_function',, +`m4_error(`EPILOGUE'() with no `PROLOGUE'() +)')dnl +EPILOGUE_cpu(GSYM_PREFIX`'PROLOGUE_current_function)`'dnl +undefine(`PROLOGUE_current_function')') + +m4wrap_prepend( +`ifdef(`PROLOGUE_current_function', +`m4_error(`EPILOGUE() for PROLOGUE('PROLOGUE_current_function`) never seen +')')') + + +dnl Usage: PROLOGUE_assert_inside() +dnl +dnl Use this unquoted on a line on its own at the start of a macro +dnl definition to add some code to check the macro is only used inside a +dnl PROLOGUE/EPILOGUE pair, and that hence PROLOGUE_current_function is +dnl defined. + +define(PROLOGUE_assert_inside, +m4_assert_numargs(0) +``PROLOGUE_assert_inside_internal'(m4_doublequote($`'0))`dnl '') + +define(PROLOGUE_assert_inside_internal, +m4_assert_numargs(1) +`ifdef(`PROLOGUE_current_function',, +`m4_error(`$1 used outside a PROLOGUE / EPILOGUE pair +')')') + + +dnl Usage: L(labelname) +dnl LF(functionname,labelname) +dnl +dnl Generate a local label in the current or given function. For LF(), +dnl functionname gets GSYM_PREFIX added, the same as with PROLOGUE(). +dnl +dnl For example, in a function mpn_add_n (and with MPN_PREFIX __gmpn), +dnl +dnl L(bar) => L__gmpn_add_n__bar +dnl LF(somefun,bar) => Lsomefun__bar +dnl +dnl The function name and label name get two underscores between them rather +dnl than one to guard against clashing with a separate external symbol that +dnl happened to be called functionname_labelname. (Though this would only +dnl happen if the local label prefix is empty.) Underscores are used so +dnl the whole label will still be a valid C identifier and so can be easily +dnl used in gdb. + +dnl LSYM_PREFIX can be L$, so defn() is used to prevent L expanding as the +dnl L macro and making an infinite recursion. 
+define(LF, +m4_assert_numargs(2) +m4_assert_defined(`LSYM_PREFIX') +`defn(`LSYM_PREFIX')GSYM_PREFIX`'$1`'__$2') + +define(`L', +m4_assert_numargs(1) +PROLOGUE_assert_inside() +`LF(PROLOGUE_current_function,`$1')') + + +dnl Called: PROLOGUE_cpu(gsym) +dnl EPILOGUE_cpu(gsym) + +define(PROLOGUE_cpu, +m4_assert_numargs(1) + `GLOBL $1 + TYPE($1,`function') +$1:') + +define(EPILOGUE_cpu, +m4_assert_numargs(1) +` SIZE($1,.-$1)') + + + +dnl -------------------------------------------------------------------------- +dnl Various x86 macros. +dnl + + +dnl Usage: ALIGN_OFFSET(bytes,offset) +dnl +dnl Align to `offset' away from a multiple of `bytes'. +dnl +dnl This is useful for testing, for example align to something very strict +dnl and see what effect offsets from it have, "ALIGN_OFFSET(256,32)". +dnl +dnl Generally you wouldn't execute across the padding, but it's done with +dnl nop's so it'll work. + +define(ALIGN_OFFSET, +m4_assert_numargs(2) +`ALIGN($1) +forloop(`i',1,$2,` nop +')') + + +dnl Usage: defframe(name,offset) +dnl +dnl Make a definition like the following with which to access a parameter +dnl or variable on the stack. +dnl +dnl define(name,`FRAME+offset(%esp)') +dnl +dnl Actually m4_empty_if_zero(FRAME+offset) is used, which will save one +dnl byte if FRAME+offset is zero, by putting (%esp) rather than 0(%esp). +dnl Use define(`defframe_empty_if_zero_disabled',1) if for some reason the +dnl zero offset is wanted. +dnl +dnl The new macro also gets a check that when it's used FRAME is actually +dnl defined, and that the final %esp offset isn't negative, which would +dnl mean an attempt to access something below the current %esp. +dnl +dnl deflit() is used rather than a plain define(), so the new macro won't +dnl delete any following parenthesized expression. name(%edi) will come +dnl out say as 16(%esp)(%edi). This isn't valid assembler and should +dnl provoke an error, which is better than silently giving just 16(%esp). 
+dnl +dnl See README.family for more on the suggested way to access the stack +dnl frame. + +define(defframe, +m4_assert_numargs(2) +`deflit(`$1', +m4_assert_defined(`FRAME') +`defframe_check_notbelow(`$1',$2,FRAME)dnl +defframe_empty_if_zero(FRAME+($2))(%esp)')') + +dnl Called: defframe_empty_if_zero(expression) +define(defframe_empty_if_zero, +`ifelse(defframe_empty_if_zero_disabled,1, +`eval($1)', +`m4_empty_if_zero($1)')') + +dnl Called: defframe_check_notbelow(`name',offset,FRAME) +define(defframe_check_notbelow, +m4_assert_numargs(3) +`ifelse(eval(($3)+($2)<0),1, +`m4_error(`$1 at frame offset $2 used when FRAME is only $3 bytes +')')') + + +dnl Usage: FRAME_pushl() +dnl FRAME_popl() +dnl FRAME_addl_esp(n) +dnl FRAME_subl_esp(n) +dnl +dnl Adjust FRAME appropriately for a pushl or popl, or for an addl or subl +dnl %esp of n bytes. +dnl +dnl Using these macros is completely optional. Sometimes it makes more +dnl sense to put explicit deflit(`FRAME',N) forms, especially when there's +dnl jumps and different sequences of FRAME values need to be used in +dnl different places. + +define(FRAME_pushl, +m4_assert_numargs(0) +m4_assert_defined(`FRAME') +`deflit(`FRAME',eval(FRAME+4))') + +define(FRAME_popl, +m4_assert_numargs(0) +m4_assert_defined(`FRAME') +`deflit(`FRAME',eval(FRAME-4))') + +define(FRAME_addl_esp, +m4_assert_numargs(1) +m4_assert_defined(`FRAME') +`deflit(`FRAME',eval(FRAME-($1)))') + +define(FRAME_subl_esp, +m4_assert_numargs(1) +m4_assert_defined(`FRAME') +`deflit(`FRAME',eval(FRAME+($1)))') + + +dnl Usage: defframe_pushl(name) +dnl +dnl Do a combination of a FRAME_pushl() and a defframe() to name the stack +dnl location just pushed. This should come after a pushl instruction. +dnl Putting it on the same line works and avoids lengthening the code. For +dnl example, +dnl +dnl pushl %eax defframe_pushl(VAR_COUNTER) +dnl +dnl Notice the defframe() is done with an unquoted -FRAME thus giving its +dnl current value without tracking future changes. 
+ +define(defframe_pushl, +`FRAME_pushl()defframe(`$1',-FRAME)') + + +dnl -------------------------------------------------------------------------- +dnl Assembler instruction macros. +dnl + + +dnl Usage: emms_or_femms +dnl femms_available_p +dnl +dnl femms_available_p expands to 1 or 0 according to whether the AMD 3DNow +dnl femms instruction is available. emms_or_femms expands to femms if +dnl available, or emms if not. +dnl +dnl emms_or_femms is meant for use in the K6 directory where plain K6 +dnl (without femms) and K6-2 and K6-3 (with a slightly faster femms) are +dnl supported together. +dnl +dnl On K7 femms is no longer faster and is just an alias for emms, so plain +dnl emms may as well be used. + +define(femms_available_p, +m4_assert_numargs(-1) +`m4_ifdef_anyof_p( + `HAVE_TARGET_CPU_k62', + `HAVE_TARGET_CPU_k63', + `HAVE_TARGET_CPU_athlon')') + +define(emms_or_femms, +m4_assert_numargs(-1) +`ifelse(femms_available_p,1,`femms',`emms')') + + +dnl Usage: femms +dnl +dnl The gas 2.9.1 that comes with FreeBSD 3.4 doesn't support femms, so the +dnl following is a replacement using .byte. +dnl +dnl If femms isn't available, an emms is generated instead, for convenience +dnl when testing on a machine without femms. + +define(femms, +m4_assert_numargs(-1) +`ifelse(femms_available_p,1, +`.byte 15,14 C AMD 3DNow femms', +`emms`'dnl +m4_warning(`warning, using emms in place of femms, use for testing only +')')') + + +dnl Usage: jadcl0(op) +dnl +dnl Issue a jnc/incl as a substitute for adcl $0,op. This isn't an exact +dnl replacement, since it doesn't set the flags like adcl does. +dnl +dnl This finds a use in K6 mpn_addmul_1, mpn_submul_1, mpn_mul_basecase and +dnl mpn_sqr_basecase because on K6 an adcl is slow, the branch +dnl misprediction penalty is small, and the multiply algorithm used leads +dnl to a carry bit on average only 1/4 of the time. +dnl +dnl jadcl0_disabled can be set to 1 to instead issue an ordinary adcl for +dnl comparison. 
For example, +dnl +dnl define(`jadcl0_disabled',1) +dnl +dnl When using a register operand, eg. "jadcl0(%edx)", the jnc/incl code is +dnl the same size as an adcl. This makes it possible to use the exact same +dnl computed jump code when testing the relative speed of jnc/incl and adcl +dnl with jadcl0_disabled. + +define(jadcl0, +m4_assert_numargs(1) +`ifelse(jadcl0_disabled,1, + `adcl $`'0, $1', + `jnc 1f + incl $1 +1:dnl')') + + +dnl Usage: cmov_available_p +dnl +dnl Expand to 1 if cmov is available, 0 if not. + +define(cmov_available_p, +`m4_ifdef_anyof_p( + `HAVE_TARGET_CPU_pentiumpro', + `HAVE_TARGET_CPU_pentium2', + `HAVE_TARGET_CPU_pentium3', + `HAVE_TARGET_CPU_athlon')') + + +dnl Usage: x86_lookup(target, key,value, key,value, ...) +dnl x86_lookup_p(target, key,value, key,value, ...) +dnl +dnl Look for `target' among the `key' parameters. +dnl +dnl x86_lookup expands to the corresponding `value', or generates an error +dnl if `target' isn't found. +dnl +dnl x86_lookup_p expands to 1 if `target' is found, or 0 if not. + +define(x86_lookup, +`ifelse(eval($#<3),1, +`m4_error(`unrecognised part of x86 instruction: $1 +')', +`ifelse(`$1',`$2', `$3', +`x86_lookup(`$1',shift(shift(shift($@))))')')') + +define(x86_lookup_p, +`ifelse(eval($#<3),1, `0', +`ifelse(`$1',`$2', `1', +`x86_lookup_p(`$1',shift(shift(shift($@))))')')') + + +dnl Usage: x86_opcode_reg32(reg) +dnl x86_opcode_reg32_p(reg) +dnl +dnl x86_opcode_reg32 expands to the standard 3 bit encoding for the given +dnl 32-bit register, eg. `%ebp' turns into 5. +dnl +dnl x86_opcode_reg32_p expands to 1 if reg is a valid 32-bit register, or 0 +dnl if not. 
+ +define(x86_opcode_reg32, +m4_assert_numargs(1) +`x86_lookup(`$1',x86_opcode_reg32_list)') + +define(x86_opcode_reg32_p, +m4_assert_onearg() +`x86_lookup_p(`$1',x86_opcode_reg32_list)') + +define(x86_opcode_reg32_list, +``%eax',0, +`%ecx',1, +`%edx',2, +`%ebx',3, +`%esp',4, +`%ebp',5, +`%esi',6, +`%edi',7') + + +dnl Usage: x86_opcode_tttn(cond) +dnl +dnl Expand to the 4-bit "tttn" field value for the given x86 branch +dnl condition (like `c', `ae', etc), looked up in x86_opcode_tttn_list. + +define(x86_opcode_tttn, +m4_assert_numargs(1) +`x86_lookup(`$1',x86_opcode_tttn_list)') + +define(x86_opcode_tttn_list, +``o', 0, +`no', 1, +`b', 2, `c', 2, `nae',2, +`nb', 3, `nc', 3, `ae', 3, +`e', 4, `z', 4, +`ne', 5, `nz', 5, +`be', 6, `na', 6, +`nbe', 7, `a', 7, +`s', 8, +`ns', 9, +`p', 10, `pe', 10, `npo',10, +`np', 11, `npe',11, `po', 11, +`l', 12, `nge',12, +`nl', 13, `ge', 13, +`le', 14, `ng', 14, +`nle',15, `g', 15') + + +dnl Usage: cmovCC(srcreg,dstreg) +dnl +dnl Generate a cmov instruction if the target supports cmov, or simulate it +dnl with a conditional jump if not (the latter being meant only for +dnl testing). For example, +dnl +dnl cmovz( %eax, %ebx) +dnl +dnl cmov instructions are generated using .byte sequences, since only +dnl recent versions of gas know cmov. +dnl +dnl The source operand can only be a plain register. (m4 code implementing +dnl full memory addressing modes exists, believe it or not, but isn't +dnl currently needed and isn't included.) +dnl +dnl All the standard conditions are defined. Attempting to use one without +dnl the macro parentheses, such as just "cmovbe %eax, %ebx", will provoke +dnl an error. This ensures the necessary .byte sequences aren't +dnl accidentally missed. + +dnl Called: define_cmov_many(cond,tttn,cond,tttn,...) 
+define(define_cmov_many, +`ifelse(m4_length(`$1'),0,, +`define_cmov(`$1',`$2')define_cmov_many(shift(shift($@)))')') + +dnl Called: define_cmov(cond,tttn) +define(define_cmov, +m4_assert_numargs(2) +`define(`cmov$1', +m4_instruction_wrapper() +m4_assert_numargs(2) +`cmov_internal'(m4_doublequote($`'0),``$1',`$2'',dnl +m4_doublequote($`'1),m4_doublequote($`'2)))') + +define_cmov_many(x86_opcode_tttn_list) + + +dnl Called: cmov_internal(name,cond,tttn,src,dst) +define(cmov_internal, +m4_assert_numargs(5) +`ifelse(cmov_available_p,1, +`cmov_bytes_tttn(`$1',`$3',`$4',`$5')', +`m4_warning(`warning, simulating cmov with jump, use for testing only +')cmov_simulate(`$2',`$4',`$5')')') + +dnl Called: cmov_simulate(cond,src,dst) +dnl If this is going to be used with memory operands for the source it will +dnl need to be changed to do a fetch even if the condition is false, so as +dnl to trigger exceptions the same way a real cmov does. +define(cmov_simulate, +m4_assert_numargs(3) + `j$1 1f C cmov$1 $2, $3 + jmp 2f +1: movl $2, $3 +2:') + +dnl Called: cmov_bytes_tttn(name,tttn,src,dst) +define(cmov_bytes_tttn, +m4_assert_numargs(4) +`.byte dnl +15, dnl +eval(64+$2), dnl +eval(192+8*x86_opcode_reg32(`$4')+x86_opcode_reg32(`$3')) dnl + C `$1 $3, $4'') + + +dnl Usage: loop_or_decljnz label +dnl +dnl Generate either a "loop" instruction or a "decl %ecx / jnz", whichever +dnl is better. "loop" is better on K6 and probably on 386, on other chips +dnl separate decl/jnz is better. +dnl +dnl This macro is just for mpn/x86/divrem_1.asm and mpn/x86/mod_1.asm where +dnl this loop_or_decljnz variation is enough to let the code be shared by +dnl all chips. 
+ +define(loop_or_decljnz, +`ifelse(loop_is_better_p,1, + `loop', + `decl %ecx + jnz')') + +define(loop_is_better_p, +`m4_ifdef_anyof_p(`HAVE_TARGET_CPU_k6', + `HAVE_TARGET_CPU_k62', + `HAVE_TARGET_CPU_k63', + `HAVE_TARGET_CPU_i386')') + + +dnl Usage: Zdisp(inst,op,op,op) +dnl +dnl Generate explicit .byte sequences if necessary to force a byte-sized +dnl zero displacement on an instruction. For example, +dnl +dnl Zdisp( movl, 0,(%esi), %eax) +dnl +dnl expands to +dnl +dnl .byte 139,70,0 C movl 0(%esi), %eax +dnl +dnl If the displacement given isn't 0, then normal assembler code is +dnl generated. For example, +dnl +dnl Zdisp( movl, 4,(%esi), %eax) +dnl +dnl expands to +dnl +dnl movl 4(%esi), %eax +dnl +dnl This means a single Zdisp() form can be used with an expression for the +dnl displacement, and .byte will be used only if necessary. The +dnl displacement argument is eval()ed. +dnl +dnl Because there aren't many places a 0(reg) form is wanted, Zdisp is +dnl implemented with a table of instructions and encodings. A new entry is +dnl needed for any different operation or registers. 
+ +define(Zdisp, +`define(`Zdisp_found',0)dnl +Zdisp_match( movl, %eax, 0,(%edi), `137,71,0', $@)`'dnl +Zdisp_match( movl, %ebx, 0,(%edi), `137,95,0', $@)`'dnl +Zdisp_match( movl, %esi, 0,(%edi), `137,119,0', $@)`'dnl +Zdisp_match( movl, 0,(%ebx), %eax, `139,67,0', $@)`'dnl +Zdisp_match( movl, 0,(%ebx), %esi, `139,115,0', $@)`'dnl +Zdisp_match( movl, 0,(%esi), %eax, `139,70,0', $@)`'dnl +Zdisp_match( movl, 0,(%esi,%ecx,4), %eax, `0x8b,0x44,0x8e,0x00', $@)`'dnl +Zdisp_match( addl, %ebx, 0,(%edi), `1,95,0', $@)`'dnl +Zdisp_match( addl, %ecx, 0,(%edi), `1,79,0', $@)`'dnl +Zdisp_match( addl, %esi, 0,(%edi), `1,119,0', $@)`'dnl +Zdisp_match( subl, %ecx, 0,(%edi), `41,79,0', $@)`'dnl +Zdisp_match( adcl, 0,(%edx), %esi, `19,114,0', $@)`'dnl +Zdisp_match( sbbl, 0,(%edx), %esi, `27,114,0', $@)`'dnl +Zdisp_match( movq, 0,(%eax,%ecx,8), %mm0, `0x0f,0x6f,0x44,0xc8,0x00', $@)`'dnl +Zdisp_match( movq, 0,(%ebx,%eax,4), %mm0, `0x0f,0x6f,0x44,0x83,0x00', $@)`'dnl +Zdisp_match( movq, 0,(%ebx,%eax,4), %mm2, `0x0f,0x6f,0x54,0x83,0x00', $@)`'dnl +Zdisp_match( movq, 0,(%esi), %mm0, `15,111,70,0', $@)`'dnl +Zdisp_match( movq, %mm0, 0,(%edi), `15,127,71,0', $@)`'dnl +Zdisp_match( movq, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7f,0x54,0x81,0x00', $@)`'dnl +Zdisp_match( movq, %mm2, 0,(%edx,%eax,4), `0x0f,0x7f,0x54,0x82,0x00', $@)`'dnl +Zdisp_match( movq, %mm0, 0,(%edx,%ecx,8), `0x0f,0x7f,0x44,0xca,0x00', $@)`'dnl +Zdisp_match( movd, 0,(%eax,%ecx,8), %mm1, `0x0f,0x6e,0x4c,0xc8,0x00', $@)`'dnl +Zdisp_match( movd, 0,(%edx,%ecx,8), %mm0, `0x0f,0x6e,0x44,0xca,0x00', $@)`'dnl +Zdisp_match( movd, %mm0, 0,(%eax,%ecx,4), `0x0f,0x7e,0x44,0x88,0x00', $@)`'dnl +Zdisp_match( movd, %mm0, 0,(%ecx,%eax,4), `0x0f,0x7e,0x44,0x81,0x00', $@)`'dnl +Zdisp_match( movd, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7e,0x54,0x81,0x00', $@)`'dnl +ifelse(Zdisp_found,0, +`m4_error(`unrecognised instruction in Zdisp: $1 $2 $3 $4 +')')') + +define(Zdisp_match, +`ifelse(eval(m4_stringequal_p(`$1',`$6') + && m4_stringequal_p(`$2',0) + && 
m4_stringequal_p(`$3',`$8') + && m4_stringequal_p(`$4',`$9')),1, +`define(`Zdisp_found',1)dnl +ifelse(eval(`$7'),0, +` .byte $5 C `$1 0$3, $4'', +` $6 $7$8, $9')', + +`ifelse(eval(m4_stringequal_p(`$1',`$6') + && m4_stringequal_p(`$2',`$7') + && m4_stringequal_p(`$3',0) + && m4_stringequal_p(`$4',`$9')),1, +`define(`Zdisp_found',1)dnl +ifelse(eval(`$8'),0, +` .byte $5 C `$1 $2, 0$4'', +` $6 $7, $8$9')')')') + + +dnl Usage: shldl(count,src,dst) +dnl shrdl(count,src,dst) +dnl shldw(count,src,dst) +dnl shrdw(count,src,dst) +dnl +dnl Generate a double-shift instruction, possibly omitting a %cl count +dnl parameter if that's what the assembler requires, as indicated by +dnl WANT_SHLDL_CL in config.m4. For example, +dnl +dnl shldl( %cl, %eax, %ebx) +dnl +dnl turns into either +dnl +dnl shldl %cl, %eax, %ebx +dnl or +dnl shldl %eax, %ebx +dnl +dnl Immediate counts are always passed through unchanged. For example, +dnl +dnl shrdl( $2, %esi, %edi) +dnl becomes +dnl shrdl $2, %esi, %edi +dnl +dnl +dnl If you forget to use the macro form "shldl( ...)" and instead write +dnl just a plain "shldl ...", an error results. This ensures the necessary +dnl variant treatment of %cl isn't accidentally bypassed. 
+ +define(define_shd_instruction, +`define($1, +m4_instruction_wrapper() +m4_assert_numargs(3) +`shd_instruction'(m4_doublequote($`'0),m4_doublequote($`'1),dnl +m4_doublequote($`'2),m4_doublequote($`'3)))') + +dnl Effectively: define(shldl,`shd_instruction(`$0',`$1',`$2',`$3')') etc +define_shd_instruction(shldl) +define_shd_instruction(shrdl) +define_shd_instruction(shldw) +define_shd_instruction(shrdw) + +dnl Called: shd_instruction(op,count,src,dst) +define(shd_instruction, +m4_assert_numargs(4) +m4_assert_defined(`WANT_SHLDL_CL') +`ifelse(eval(m4_stringequal_p(`$2',`%cl') && !WANT_SHLDL_CL),1, +``$1' `$3', `$4'', +``$1' `$2', `$3', `$4'')') + + +dnl Usage: ASSERT(cond, instructions) +dnl +dnl If WANT_ASSERT is 1, output the given instructions and expect the given +dnl flags condition to then be satisfied. For example, +dnl +dnl ASSERT(ne, `cmpl %eax, %ebx') +dnl +dnl The instructions can be omitted to just assert a flags condition with +dnl no extra calculation. For example, +dnl +dnl ASSERT(nc) +dnl +dnl When `instructions' is not empty, a pushf/popf is added to preserve the +dnl flags, but the instructions themselves must preserve any registers that +dnl matter. FRAME is adjusted for the push and pop, so the instructions +dnl given can use defframe() stack variables. + +define(ASSERT, +m4_assert_numargs_range(1,2) +`ifelse(WANT_ASSERT,1, + `C ASSERT +ifelse(`$2',,,` pushf ifdef(`FRAME',`FRAME_pushl()')') + $2 + j`$1' 1f + ud2 C assertion failed +1: +ifelse(`$2',,,` popf ifdef(`FRAME',`FRAME_popl()')') +')') + + +dnl Usage: movl_text_address(label,register) +dnl +dnl Get the address of a text segment label, using either a plain movl or a +dnl position-independent calculation, as necessary. For example, +dnl +dnl movl_text_address(L(foo),%eax) +dnl +dnl This macro is only meant for use in ASSERT()s or when testing, since +dnl the PIC sequence it generates will want to be done with a ret balancing +dnl the call on CPUs with return address branch prediction. 
+dnl +dnl The addl generated here has a backward reference to 1b, and so won't +dnl suffer from the two forwards references bug in old gas (described in +dnl mpn/x86/README.family). + +define(movl_text_address, +`ifdef(`PIC', + `call 1f +1: popl $2 C %eip + addl `$'$1-1b, $2', + `movl `$'$1, $2')') + + +divert`'dnl diff --git a/ghc/rts/gmp/mpn/z8000/add_n.s b/ghc/rts/gmp/mpn/z8000/add_n.s index a50fc3e..3a13610 100644 --- a/ghc/rts/gmp/mpn/z8000/add_n.s +++ b/ghc/rts/gmp/mpn/z8000/add_n.s @@ -1,20 +1,20 @@ -! Z8000 __mpn_add_n -- Add two limb vectors of equal, non-zero length. +! Z8000 __gmpn_add_n -- Add two limb vectors of equal, non-zero length. -! Copyright (C) 1993, 1994 Free Software Foundation, Inc. +! Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc. ! This file is part of the GNU MP Library. ! The GNU MP Library is free software; you can redistribute it and/or modify -! it under the terms of the GNU Library General Public License as published by -! the Free Software Foundation; either version 2 of the License, or (at your +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your ! option) any later version. ! The GNU MP Library is distributed in the hope that it will be useful, but ! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ! License for more details. -! You should have received a copy of the GNU Library General Public License +! You should have received a copy of the GNU Lesser General Public License ! along with the GNU MP Library; see the file COPYING.LIB. If not, write to ! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ! MA 02111-1307, USA. 
@@ -33,8 +33,8 @@ unseg .text even - global ___mpn_add_n -___mpn_add_n: + global ___gmpn_add_n +___gmpn_add_n: pop r0,@r6 pop r1,@r5 add r0,r1 diff --git a/ghc/rts/gmp/mpn/z8000/gmp-mparam.h b/ghc/rts/gmp/mpn/z8000/gmp-mparam.h index e0a303e..4216df6 100644 --- a/ghc/rts/gmp/mpn/z8000/gmp-mparam.h +++ b/ghc/rts/gmp/mpn/z8000/gmp-mparam.h @@ -5,16 +5,16 @@ Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpn/z8000/mul_1.s b/ghc/rts/gmp/mpn/z8000/mul_1.s index f1126b5..20fadd3 100644 --- a/ghc/rts/gmp/mpn/z8000/mul_1.s +++ b/ghc/rts/gmp/mpn/z8000/mul_1.s @@ -1,21 +1,21 @@ -! Z8000 __mpn_mul_1 -- Multiply a limb vector with a limb and store +! Z8000 __gmpn_mul_1 -- Multiply a limb vector with a limb and store ! the result in a second limb vector. -! Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc. +! 
Copyright (C) 1993, 1994, 1995, 2000 Free Software Foundation, Inc. ! This file is part of the GNU MP Library. ! The GNU MP Library is free software; you can redistribute it and/or modify -! it under the terms of the GNU Library General Public License as published by -! the Free Software Foundation; either version 2 of the License, or (at your +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your ! option) any later version. ! The GNU MP Library is distributed in the hope that it will be useful, but ! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ! License for more details. -! You should have received a copy of the GNU Library General Public License +! You should have received a copy of the GNU Lesser General Public License ! along with the GNU MP Library; see the file COPYING.LIB. If not, write to ! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ! MA 02111-1307, USA. @@ -30,8 +30,8 @@ unseg .text even - global ___mpn_mul_1 -___mpn_mul_1: + global ___gmpn_mul_1 +___gmpn_mul_1: sub r2,r2 ! zero carry limb and r4,r4 jr mi,Lneg diff --git a/ghc/rts/gmp/mpn/z8000/sub_n.s b/ghc/rts/gmp/mpn/z8000/sub_n.s index 272c671..bd9a7ad 100644 --- a/ghc/rts/gmp/mpn/z8000/sub_n.s +++ b/ghc/rts/gmp/mpn/z8000/sub_n.s @@ -1,21 +1,21 @@ -! Z8000 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +! Z8000 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and ! store difference in a third limb vector. -! Copyright (C) 1993, 1994 Free Software Foundation, Inc. +! Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc. ! This file is part of the GNU MP Library. ! The GNU MP Library is free software; you can redistribute it and/or modify -! 
it under the terms of the GNU Library General Public License as published by -! the Free Software Foundation; either version 2 of the License, or (at your +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your ! option) any later version. ! The GNU MP Library is distributed in the hope that it will be useful, but ! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ! License for more details. -! You should have received a copy of the GNU Library General Public License +! You should have received a copy of the GNU Lesser General Public License ! along with the GNU MP Library; see the file COPYING.LIB. If not, write to ! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ! MA 02111-1307, USA. @@ -34,8 +34,8 @@ unseg .text even - global ___mpn_sub_n -___mpn_sub_n: + global ___gmpn_sub_n +___gmpn_sub_n: pop r0,@r6 pop r1,@r5 sub r0,r1 diff --git a/ghc/rts/gmp/mpn/z8000x/add_n.s b/ghc/rts/gmp/mpn/z8000x/add_n.s index c5c0d42..7f13078 100644 --- a/ghc/rts/gmp/mpn/z8000x/add_n.s +++ b/ghc/rts/gmp/mpn/z8000x/add_n.s @@ -1,21 +1,21 @@ -! Z8000 (32 bit limb version) __mpn_add_n -- Add two limb vectors of equal, +! Z8000 (32 bit limb version) __gmpn_add_n -- Add two limb vectors of equal, ! non-zero length. -! Copyright (C) 1993, 1994 Free Software Foundation, Inc. +! Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc. ! This file is part of the GNU MP Library. ! The GNU MP Library is free software; you can redistribute it and/or modify -! it under the terms of the GNU Library General Public License as published by -! the Free Software Foundation; either version 2 of the License, or (at your +! it under the terms of the GNU Lesser General Public License as published by +! 
the Free Software Foundation; either version 2.1 of the License, or (at your ! option) any later version. ! The GNU MP Library is distributed in the hope that it will be useful, but ! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ! License for more details. -! You should have received a copy of the GNU Library General Public License +! You should have received a copy of the GNU Lesser General Public License ! along with the GNU MP Library; see the file COPYING.LIB. If not, write to ! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ! MA 02111-1307, USA. @@ -34,8 +34,8 @@ segm .text even - global ___mpn_add_n -___mpn_add_n: + global ___gmpn_add_n +___gmpn_add_n: popl rr0,@r6 popl rr8,@r5 addl rr0,rr8 diff --git a/ghc/rts/gmp/mpn/z8000x/sub_n.s b/ghc/rts/gmp/mpn/z8000x/sub_n.s index 9eeece6..f416d1d 100644 --- a/ghc/rts/gmp/mpn/z8000x/sub_n.s +++ b/ghc/rts/gmp/mpn/z8000x/sub_n.s @@ -1,21 +1,21 @@ -! Z8000 (32 bit limb version) __mpn_sub_n -- Subtract two limb vectors of the +! Z8000 (32 bit limb version) __gmpn_sub_n -- Subtract two limb vectors of the ! same length > 0 and store difference in a third limb vector. -! Copyright (C) 1993, 1994 Free Software Foundation, Inc. +! Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc. ! This file is part of the GNU MP Library. ! The GNU MP Library is free software; you can redistribute it and/or modify -! it under the terms of the GNU Library General Public License as published by -! the Free Software Foundation; either version 2 of the License, or (at your +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your ! option) any later version. ! The GNU MP Library is distributed in the hope that it will be useful, but ! 
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ! License for more details. -! You should have received a copy of the GNU Library General Public License +! You should have received a copy of the GNU Lesser General Public License ! along with the GNU MP Library; see the file COPYING.LIB. If not, write to ! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, ! MA 02111-1307, USA. @@ -34,8 +34,8 @@ segm .text even - global ___mpn_sub_n -___mpn_sub_n: + global ___gmpn_sub_n +___gmpn_sub_n: popl rr0,@r6 popl rr8,@r5 subl rr0,rr8 diff --git a/ghc/rts/gmp/mpz/Makefile.am b/ghc/rts/gmp/mpz/Makefile.am new file mode 100644 index 0000000..cd6fec4 --- /dev/null +++ b/ghc/rts/gmp/mpz/Makefile.am @@ -0,0 +1,58 @@ +## Process this file with automake to generate Makefile.in + +# Copyright (C) 1996, 1998, 1999, 2000 Free Software Foundation, Inc. +# +# This file is part of the GNU MP Library. +# +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. +# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. 
+ + +AUTOMAKE_OPTIONS = gnu no-dependencies + +SUBDIRS = tests + +INCLUDES = -I$(top_srcdir) -DOPERATION_$* + +noinst_LTLIBRARIES = libmpz.la +libmpz_la_SOURCES = \ + abs.c add.c add_ui.c addmul_ui.c and.c array_init.c \ + bin_ui.c bin_uiui.c cdiv_q.c \ + cdiv_q_ui.c cdiv_qr.c cdiv_qr_ui.c cdiv_r.c cdiv_r_ui.c cdiv_ui.c \ + clear.c clrbit.c cmp.c cmp_si.c cmp_ui.c cmpabs.c cmpabs_ui.c com.c \ + divexact.c dump.c fac_ui.c fdiv_q.c fdiv_q_2exp.c fdiv_q_ui.c \ + fdiv_qr.c fdiv_qr_ui.c fdiv_r.c fdiv_r_2exp.c fdiv_r_ui.c fdiv_ui.c \ + fib_ui.c fits_sint_p.c fits_slong_p.c fits_sshort_p.c fits_uint_p.c \ + fits_ulong_p.c fits_ushort_p.c gcd.c gcd_ui.c gcdext.c get_d.c get_si.c \ + get_str.c get_ui.c getlimbn.c hamdist.c init.c inp_raw.c inp_str.c \ + invert.c ior.c iset.c iset_d.c iset_si.c iset_str.c iset_ui.c \ + jacobi.c kronsz.c kronuz.c kronzs.c kronzu.c \ + lcm.c legendre.c mod.c mul.c mul_2exp.c neg.c nextprime.c \ + out_raw.c out_str.c perfpow.c perfsqr.c popcount.c pow_ui.c powm.c \ + powm_ui.c pprime_p.c random.c random2.c realloc.c remove.c root.c rrandomb.c \ + scan0.c scan1.c set.c set_d.c set_f.c set_q.c set_si.c set_str.c \ + set_ui.c setbit.c size.c sizeinbase.c sqrt.c sqrtrem.c sub.c \ + sub_ui.c swap.c tdiv_ui.c tdiv_q.c tdiv_q_2exp.c tdiv_q_ui.c tdiv_qr.c \ + tdiv_qr_ui.c tdiv_r.c tdiv_r_2exp.c tdiv_r_ui.c tstbit.c ui_pow_ui.c \ + urandomb.c urandomm.c xor.c + +EXTRA_DIST = mul_siui.c +nodist_libmpz_la_SOURCES = mul_si.c mul_ui.c +CLEANFILES = $(nodist_libmpz_la_SOURCES) + +mul_si.c: $(srcdir)/mul_siui.c + cp $(srcdir)/mul_siui.c mul_si.c +mul_ui.c: $(srcdir)/mul_siui.c + cp $(srcdir)/mul_siui.c mul_ui.c diff --git a/ghc/rts/gmp/mpz/README b/ghc/rts/gmp/mpz/README new file mode 100644 index 0000000..06b481d --- /dev/null +++ b/ghc/rts/gmp/mpz/README @@ -0,0 +1,23 @@ +This directory contains functions for GMP's integer function layer. + +In this version of GMP, integers are represented like in the figure below. 
+(Please note that the format might change between every version, and that +depending on the internal format in any way is a bad idea.) + + most least +significant significant + limb limb + + _mp_d + / + / + \/ + ____ ____ ____ ____ ____ + |____|____|____|____|____| + + <------- _mp_size -------> + + +The most significant limb will be non-zero. The _mp_size field's sign +reflects the sign of the number. Its absolute value is the count of limbs +in the number. diff --git a/ghc/rts/gmp/mpz/abs.c b/ghc/rts/gmp/mpz/abs.c index 080cac6..0b5eab1 100644 --- a/ghc/rts/gmp/mpz/abs.c +++ b/ghc/rts/gmp/mpz/abs.c @@ -5,16 +5,16 @@ Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/add.c b/ghc/rts/gmp/mpz/add.c index 10dd970..a22c377 100644 --- a/ghc/rts/gmp/mpz/add.c +++ b/ghc/rts/gmp/mpz/add.c @@ -1,26 +1,29 @@ /* mpz_add -- Add two integers. 
-Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1996, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include "gmp.h" #include "gmp-impl.h" +#ifdef BERKELEY_MP +#include "mp.h" +#endif #ifndef BERKELEY_MP void @@ -58,9 +61,9 @@ madd (u, v, w) if (abs_usize < abs_vsize) { /* Swap U and V. */ - {const __mpz_struct *t = u; u = v; v = t;} - {mp_size_t t = usize; usize = vsize; vsize = t;} - {mp_size_t t = abs_usize; abs_usize = abs_vsize; abs_vsize = t;} + MPZ_SRCPTR_SWAP (u, v); + MP_SIZE_T_SWAP (usize, vsize); + MP_SIZE_T_SWAP (abs_usize, abs_vsize); } /* True: ABS_USIZE >= ABS_VSIZE. */ diff --git a/ghc/rts/gmp/mpz/add_ui.c b/ghc/rts/gmp/mpz/add_ui.c index a1e4306..28dbd71 100644 --- a/ghc/rts/gmp/mpz/add_ui.c +++ b/ghc/rts/gmp/mpz/add_ui.c @@ -1,20 +1,20 @@ /* mpz_add_ui -- Add an mpz_t and an unsigned one-word integer. 
-Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1996, 1999 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -59,7 +59,7 @@ mpz_add_ui (w, u, v) if (usize >= 0) { mp_limb_t cy; - cy = mpn_add_1 (wp, up, abs_usize, v); + cy = mpn_add_1 (wp, up, abs_usize, (mp_limb_t) v); wp[abs_usize] = cy; wsize = abs_usize + cy; } @@ -74,7 +74,7 @@ mpz_add_ui (w, u, v) } else { - mpn_sub_1 (wp, up, abs_usize, v); + mpn_sub_1 (wp, up, abs_usize, (mp_limb_t) v); /* Size can decrease with at most one limb. */ wsize = -(abs_usize - (wp[abs_usize - 1] == 0)); } diff --git a/ghc/rts/gmp/mpz/addmul_ui.c b/ghc/rts/gmp/mpz/addmul_ui.c new file mode 100644 index 0000000..7b38d36 --- /dev/null +++ b/ghc/rts/gmp/mpz/addmul_ui.c @@ -0,0 +1,214 @@ +/* mpz_addmul_ui(prodsum, multiplier, small_multiplicand) -- + Add MULTIPLIER times SMALL_MULTIPLICAND to PRODSUM.
+ +Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +static mp_limb_t mpn_neg1 _PROTO ((mp_ptr, mp_size_t)); + +#if 0 +#undef MPN_NORMALIZE +#define MPN_NORMALIZE(DST, NLIMBS) \ + do { \ + while (--(NLIMBS) >= 0 && (DST)[NLIMBS] == 0) \ + ; \ + (NLIMBS)++; \ + } while (0) +#undef MPN_NORMALIZE_NOT_ZERO +#define MPN_NORMALIZE_NOT_ZERO(DST, NLIMBS) \ + do { \ + while ((DST)[--(NLIMBS)] == 0) \ + ; \ + (NLIMBS)++; \ + } while (0) +#endif + +void +#if __STDC__ +mpz_addmul_ui (mpz_ptr rz, mpz_srcptr az, unsigned long int bu) +#else +mpz_addmul_ui (rz, az, bu) + mpz_ptr rz; + mpz_srcptr az; + unsigned long int bu; +#endif +{ + mp_size_t rn, an; + mp_ptr rp, ap; + + an = SIZ (az); + + /* If either multiplier is zero, result is unaffected. */ + if (bu == 0 || an == 0) + return; + + rn = SIZ (rz); + + if (rn == 0) + { + mp_limb_t cy; + + an = ABS (an); + if (ALLOC (rz) <= an) + _mpz_realloc (rz, an + 1); + rp = PTR (rz); + ap = PTR (az); + cy = mpn_mul_1 (rp, ap, an, (mp_limb_t) bu); + rp[an] = cy; + an += cy != 0; + SIZ (rz) = SIZ (az) >= 0 ? 
an : -an; + return; + } + + if ((an ^ rn) >= 0) + { + /* Sign of operands are the same--really add. */ + an = ABS (an); + rn = ABS (rn); + if (rn > an) + { + mp_limb_t cy; + if (ALLOC (rz) <= rn) + _mpz_realloc (rz, rn + 1); + rp = PTR (rz); + ap = PTR (az); + cy = mpn_addmul_1 (rp, ap, an, (mp_limb_t) bu); + cy = mpn_add_1 (rp + an, rp + an, rn - an, cy); + rp[rn] = cy; + rn += cy != 0; + SIZ (rz) = SIZ (rz) >= 0 ? rn : -rn; + return; + } + else + { + mp_limb_t cy; + if (ALLOC (rz) <= an) + _mpz_realloc (rz, an + 1); + rp = PTR (rz); + ap = PTR (az); + cy = mpn_addmul_1 (rp, ap, rn, (mp_limb_t) bu); + if (an != rn) + { + mp_limb_t cy2; + cy2 = mpn_mul_1 (rp + rn, ap + rn, an - rn, (mp_limb_t) bu); + cy = cy2 + mpn_add_1 (rp + rn, rp + rn, an - rn, cy); + } + rn = an; + rp[rn] = cy; + rn += cy != 0; + SIZ (rz) = SIZ (rz) >= 0 ? rn : -rn; + return; + } + } + else + { + /* Sign of operands are different--actually subtract. */ + an = ABS (an); + rn = ABS (rn); + if (rn > an) + { + mp_limb_t cy; + rp = PTR (rz); + ap = PTR (az); + cy = mpn_submul_1 (rp, ap, an, (mp_limb_t) bu); + cy = mpn_sub_1 (rp + an, rp + an, rn - an, cy); + if (cy != 0) + { + mpn_neg1 (rp, rn); + MPN_NORMALIZE_NOT_ZERO (rp, rn); + } + else + { + MPN_NORMALIZE (rp, rn); + rn = -rn; + } + + SIZ (rz) = SIZ (rz) >= 0 ? -rn : rn; + return; + } + else + { + /* Tricky case. We need to subtract an operand that might be larger + than the minuend. To avoid allocating temporary space, we compute + a*b-r instead of r-a*b and then negate. 
*/ + mp_limb_t cy; + if (ALLOC (rz) <= an) + _mpz_realloc (rz, an + 1); + rp = PTR (rz); + ap = PTR (az); + cy = mpn_submul_1 (rp, ap, rn, (mp_limb_t) bu); + if (an != rn) + { + mp_limb_t cy2; + cy -= mpn_neg1 (rp, rn); + cy2 = mpn_mul_1 (rp + rn, ap + rn, an - rn, (mp_limb_t) bu); + if (cy == ~(mp_limb_t) 0) + cy = cy2 - mpn_sub_1 (rp + rn, rp + rn, an - rn, (mp_limb_t) 1); + else + cy = cy2 + mpn_add_1 (rp + rn, rp + rn, an - rn, cy); + rp[an] = cy; + rn = an + (cy != 0); + rn -= rp[rn - 1] == 0; + } + else if (cy != 0) + { + cy -= mpn_neg1 (rp, rn); + rp[an] = cy; + rn = an + 1; + MPN_NORMALIZE_NOT_ZERO (rp, rn); + } + else + { + rn = an; + MPN_NORMALIZE (rp, rn); + rn = -rn; + } + + SIZ (rz) = SIZ (rz) >= 0 ? -rn : rn; + return; + } + } +} + +static mp_limb_t +#if __STDC__ +mpn_neg1 (mp_ptr rp, mp_size_t rn) +#else +mpn_neg1 (rp, rn) + mp_ptr rp; + mp_size_t rn; +#endif +{ + mp_size_t i; + + while (rn != 0 && rp[0] == 0) + rp++, rn--; + + if (rn != 0) + { + rp[0] = -rp[0]; + for (i = 1; i < rn; i++) + rp[i] = ~rp[i]; + return 1; + } + return 0; +} diff --git a/ghc/rts/gmp/mpz/and.c b/ghc/rts/gmp/mpz/and.c index 838d4b1..354e945 100644 --- a/ghc/rts/gmp/mpz/and.c +++ b/ghc/rts/gmp/mpz/and.c @@ -1,20 +1,21 @@ /* mpz_and -- Logical and. -Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation, +Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. 
The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -116,7 +117,7 @@ mpz_and (res, op1, op2) _mpz_realloc (res, res_alloc); res_ptr = res->_mp_d; /* Don't re-read OP1_PTR and OP2_PTR. They point to - temporary space--never to the space RES->_mp_D used + temporary space--never to the space RES->_mp_d used to point to before reallocation. */ } @@ -152,9 +153,8 @@ mpz_and (res, op1, op2) { /* We should compute -OP1 & OP2. Swap OP1 and OP2 and fall through to the code that handles OP1 & -OP2. */ - {mpz_srcptr t = op1; op1 = op2; op2 = t;} - {mp_srcptr t = op1_ptr; op1_ptr = op2_ptr; op2_ptr = t;} - {mp_size_t t = op1_size; op1_size = op2_size; op2_size = t;} + MPZ_SRCPTR_SWAP (op1, op2); + MPN_SRCPTR_SWAP (op1_ptr,op1_size, op2_ptr,op2_size); } } @@ -173,7 +173,7 @@ mpz_and (res, op1, op2) operand as the result for those limbs is going to become zero anyway. */ - /* Scan for the least significant. non-zero OP2 limb, and zero the + /* Scan for the least significant non-zero OP2 limb, and zero the result meanwhile for those limb positions. (We will surely find a non-zero limb, so we can write the loop with one termination condition only.) */ @@ -237,7 +237,7 @@ mpz_and (res, op1, op2) res_ptr = res->_mp_d; op1_ptr = op1->_mp_d; /* Don't re-read OP2_PTR. It points to temporary space--never - to the space RES->_mp_D used to point to before reallocation. 
*/ + to the space RES->_mp_d used to point to before reallocation. */ } MPN_COPY (res_ptr + op2_size, op1_ptr + op2_size, @@ -264,7 +264,7 @@ mpz_and (res, op1, op2) res_ptr = res->_mp_d; op1_ptr = op1->_mp_d; /* Don't re-read OP2_PTR. It points to temporary space--never - to the space RES->_mp_D used to point to before reallocation. */ + to the space RES->_mp_d used to point to before reallocation. */ } for (i = res_size - 1; i >= 0; i--) diff --git a/ghc/rts/gmp/mpz/array_init.c b/ghc/rts/gmp/mpz/array_init.c index 8b2e85c..1c22046 100644 --- a/ghc/rts/gmp/mpz/array_init.c +++ b/ghc/rts/gmp/mpz/array_init.c @@ -5,16 +5,16 @@ Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ diff --git a/ghc/rts/gmp/mpz/bin_ui.c b/ghc/rts/gmp/mpz/bin_ui.c new file mode 100644 index 0000000..a7a6c98 --- /dev/null +++ b/ghc/rts/gmp/mpz/bin_ui.c @@ -0,0 +1,141 @@ +/* mpz_bin_ui - compute n over k. + +Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + + +/* This is a poor implementation. Look at bin_uiui.c for improvement ideas. + In fact consider calling mpz_bin_uiui() when the arguments fit, leaving + the code here only for big n. + + The identity bin(n,k) = (-1)^k * bin(-n+k-1,k) can be found in Knuth vol + 1 section 1.2.6 part G.
*/ + + +/* Enhancement: use mpn_divexact_1 when it exists */ +#define DIVIDE() \ + ASSERT (SIZ(r) > 0); \ + ASSERT_NOCARRY (mpn_divrem_1 (PTR(r), (mp_size_t) 0, \ + PTR(r), SIZ(r), kacc)); \ + SIZ(r) -= (PTR(r)[SIZ(r)-1] == 0); + +void +#if __STDC__ +mpz_bin_ui (mpz_ptr r, mpz_srcptr n, unsigned long int k) +#else +mpz_bin_ui (r, n, k) + mpz_ptr r; + mpz_srcptr n; + unsigned long int k; +#endif +{ + mpz_t ni; + mp_limb_t i; + mpz_t nacc; + mp_limb_t kacc; + mp_size_t negate; + + if (mpz_sgn (n) < 0) + { + /* bin(n,k) = (-1)^k * bin(-n+k-1,k), and set ni = -n+k-1 - k = -n-1 */ + mpz_init (ni); + mpz_neg (ni, n); + mpz_sub_ui (ni, ni, 1L); + negate = (k & 1); /* (-1)^k */ + } + else + { + /* bin(n,k) == 0 if k>n + (no test for this under the n<0 case, since -n+k-1 >= k there) */ + if (mpz_cmp_ui (n, k) < 0) + { + mpz_set_ui (r, 0L); + return; + } + + /* set ni = n-k */ + mpz_init (ni); + mpz_sub_ui (ni, n, k); + negate = 0; + } + + /* Now wanting bin(ni+k,k), with ni positive, and "negate" is the sign (0 + for positive, 1 for negative). */ + mpz_set_ui (r, 1L); + + /* Rewrite bin(n,k) as bin(n,n-k) if that is smaller. In this case it's + whether ni+k-k < k meaning ni>= 1; + nacclow >>= 1; + } + mpz_div_2exp (nacc, nacc, c); +#endif + + mpz_add_ui (ni, ni, 1); + mpz_mul (nacc, nacc, ni); + umul_ppmm (k1, k0, kacc, i); + if (k1 != 0) + { + /* Accumulator overflow. Perform bignum step. */ + mpz_mul (r, r, nacc); + mpz_set_ui (nacc, 1); + DIVIDE (); + kacc = i; + } + else + { + /* Save new products in accumulators to keep accumulating. */ + kacc = k0; + } + } + + mpz_mul (r, r, nacc); + DIVIDE (); + SIZ(r) = (SIZ(r) ^ -negate) + negate; + + mpz_clear (nacc); + mpz_clear (ni); +} diff --git a/ghc/rts/gmp/mpz/bin_uiui.c b/ghc/rts/gmp/mpz/bin_uiui.c new file mode 100644 index 0000000..b37541b --- /dev/null +++ b/ghc/rts/gmp/mpz/bin_uiui.c @@ -0,0 +1,120 @@ +/* mpz_bin_uiui - compute n over k. + +Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. 
+ +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + + +/* Avoid reallocs by rounding up any new size */ +#define ROUNDUP_MASK 15 + +/* Enhancement: use mpn_divexact_1 when it exists */ +#define MULDIV() \ + MPZ_REALLOC (r, (SIZ(r)+1)|ROUNDUP_MASK); \ + PTR(r)[SIZ(r)] = mpn_mul_1 (PTR(r), PTR(r), SIZ(r), nacc); \ + ASSERT_NOCARRY (mpn_divrem_1 (PTR(r), (mp_size_t) 0, \ + PTR(r), SIZ(r)+1, kacc)); \ + SIZ(r) += (PTR(r)[SIZ(r)] != 0); + +void +#if __STDC__ +mpz_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k) +#else +mpz_bin_uiui (r, n, k) + mpz_ptr r; + unsigned long int n; + unsigned long int k; +#endif +{ + unsigned long int i, j; + mp_limb_t nacc, kacc; + unsigned long int cnt; + + /* bin(n,k) = 0 if k>n. */ + if (n < k) + { + mpz_set_ui (r, 0); + return; + } + + /* Rewrite bin(n,k) as bin(n,n-k) if that is smaller. */ + k = MIN (k, n-k); + + /* bin(n,0) = 1 */ + if (k == 0) + { + mpz_set_ui (r, 1); + return; + } + + j = n - k + 1; + mpz_set_ui (r, j); + + /* Initialize accumulators. */ + nacc = 1; + kacc = 1; + + cnt = 0; + for (i = 2; i <= k; i++) + { + mp_limb_t n1, n0, k1, k0; + + j++; +#if 0 + /* Remove common multiples of 2. 
This will allow us to accumulate + more in nacc and kacc before we need a bignum step. It would make + sense to cancel factors of 3, 5, etc too, but this would be best + handled by sieving out factors. Alternatively, we could perform a + gcd of the accumulators just as they have overflown, and keep + accumulating until the gcd doesn't remove a significant factor. */ + while (((nacc | kacc) & 1) == 0) + { + nacc >>= 1; + kacc >>= 1; + } +#else + cnt = ((nacc | kacc) & 1) ^ 1; + nacc >>= cnt; + kacc >>= cnt; +#endif + /* Accumulate next multiples. */ + umul_ppmm (n1, n0, nacc, j); + umul_ppmm (k1, k0, kacc, i); + if (n1 != 0) + { + /* Accumulator overflow. Perform bignum step. */ + MULDIV (); + nacc = j; + kacc = i; + } + else + { + if (k1 != 0) abort (); + /* Save new products in accumulators to keep accumulating. */ + nacc = n0; + kacc = k0; + } + } + + /* Take care of whatever is left in accumulators. */ + MULDIV (); +} diff --git a/ghc/rts/gmp/mpz/cdiv_q.c b/ghc/rts/gmp/mpz/cdiv_q.c index 860a232..b15ba8a 100644 --- a/ghc/rts/gmp/mpz/cdiv_q.c +++ b/ghc/rts/gmp/mpz/cdiv_q.c @@ -1,21 +1,21 @@ /* mpz_cdiv_q -- Division rounding the quotient towards +infinity. The remainder gets the opposite sign as the denominator. -Copyright (C) 1994, 1995, 1996 Free Software Foundation, Inc. +Copyright (C) 1994, 1995, 1996, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. 
The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -40,7 +40,7 @@ mpz_cdiv_q (quot, dividend, divisor) TMP_MARK (marker); - MPZ_TMP_INIT (rem, 1 + ABS (dividend_size)); + MPZ_TMP_INIT (rem, ABS (divisor_size)); mpz_tdiv_qr (quot, rem, dividend, divisor); diff --git a/ghc/rts/gmp/mpz/cdiv_q_ui.c b/ghc/rts/gmp/mpz/cdiv_q_ui.c index 7b6cfd7..74f3a90 100644 --- a/ghc/rts/gmp/mpz/cdiv_q_ui.c +++ b/ghc/rts/gmp/mpz/cdiv_q_ui.c @@ -3,21 +3,21 @@ always fit into the return type, the negative of the true remainder is returned. -Copyright (C) 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1994, 1996, 1999 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -40,6 +40,9 @@ mpz_cdiv_q_ui (quot, dividend, divisor) mp_ptr quot_ptr; mp_limb_t remainder_limb; + if (divisor == 0) + DIVIDE_BY_ZERO; + dividend_size = dividend->_mp_size; size = ABS (dividend_size); @@ -53,7 +56,7 @@ mpz_cdiv_q_ui (quot, dividend, divisor) if (remainder_limb != 0 && dividend_size >= 0) { - mpn_add_1 (quot_ptr, quot_ptr, size, (mp_limb_t) 1); + mpn_incr_u (quot_ptr, (mp_limb_t) 1); remainder_limb = divisor - remainder_limb; } diff --git a/ghc/rts/gmp/mpz/cdiv_qr.c b/ghc/rts/gmp/mpz/cdiv_qr.c index bf7d6da..29c7c41 100644 --- a/ghc/rts/gmp/mpz/cdiv_qr.c +++ b/ghc/rts/gmp/mpz/cdiv_qr.c @@ -1,21 +1,21 @@ /* mpz_cdiv_qr -- Division rounding the quotient towards +infinity. The remainder gets the opposite sign as the denominator. -Copyright (C) 1994, 1995, 1996 Free Software Foundation, Inc. +Copyright (C) 1994, 1995, 1996, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -35,6 +35,7 @@ mpz_cdiv_qr (quot, rem, dividend, divisor) #endif { mp_size_t divisor_size = divisor->_mp_size; + mp_size_t xsize; mpz_t temp_divisor; /* N.B.: lives until function returns! */ TMP_DECL (marker); @@ -50,9 +51,10 @@ mpz_cdiv_qr (quot, rem, dividend, divisor) divisor = temp_divisor; } + xsize = dividend->_mp_size ^ divisor_size;; mpz_tdiv_qr (quot, rem, dividend, divisor); - if ((divisor_size ^ dividend->_mp_size) >= 0 && rem->_mp_size != 0) + if (xsize >= 0 && rem->_mp_size != 0) { mpz_add_ui (quot, quot, 1L); mpz_sub (rem, rem, divisor); diff --git a/ghc/rts/gmp/mpz/cdiv_qr_ui.c b/ghc/rts/gmp/mpz/cdiv_qr_ui.c index a780e77..a7873c6 100644 --- a/ghc/rts/gmp/mpz/cdiv_qr_ui.c +++ b/ghc/rts/gmp/mpz/cdiv_qr_ui.c @@ -3,21 +3,21 @@ always fit into the return type, the negative of the true remainder is returned. -Copyright (C) 1994, 1995, 1996 Free Software Foundation, Inc. +Copyright (C) 1994, 1995, 1996, 1999 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -41,6 +41,9 @@ mpz_cdiv_qr_ui (quot, rem, dividend, divisor) mp_ptr quot_ptr; mp_limb_t remainder_limb; + if (divisor == 0) + DIVIDE_BY_ZERO; + dividend_size = dividend->_mp_size; size = ABS (dividend_size); @@ -50,11 +53,11 @@ mpz_cdiv_qr_ui (quot, rem, dividend, divisor) quot_ptr = quot->_mp_d; remainder_limb = mpn_divmod_1 (quot_ptr, dividend->_mp_d, size, - (mp_limb_t) divisor); + (mp_limb_t) divisor); if (remainder_limb != 0 && dividend_size >= 0) { - mpn_add_1 (quot_ptr, quot_ptr, size, (mp_limb_t) 1); + mpn_incr_u (quot_ptr, (mp_limb_t) 1); remainder_limb = divisor - remainder_limb; } diff --git a/ghc/rts/gmp/mpz/cdiv_r.c b/ghc/rts/gmp/mpz/cdiv_r.c index d34d138..e96ce7e 100644 --- a/ghc/rts/gmp/mpz/cdiv_r.c +++ b/ghc/rts/gmp/mpz/cdiv_r.c @@ -6,16 +6,16 @@ Copyright (C) 1994, 1995, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/cdiv_r_ui.c b/ghc/rts/gmp/mpz/cdiv_r_ui.c index 757a3f5..e17e238 100644 --- a/ghc/rts/gmp/mpz/cdiv_r_ui.c +++ b/ghc/rts/gmp/mpz/cdiv_r_ui.c @@ -8,16 +8,16 @@ Copyright (C) 1994, 1995, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ @@ -39,6 +39,9 @@ mpz_cdiv_r_ui (rem, dividend, divisor) mp_size_t size; mp_limb_t remainder_limb; + if (divisor == 0) + DIVIDE_BY_ZERO; + dividend_size = dividend->_mp_size; size = ABS (dividend_size); diff --git a/ghc/rts/gmp/mpz/cdiv_ui.c b/ghc/rts/gmp/mpz/cdiv_ui.c index df841ed..63547a7 100644 --- a/ghc/rts/gmp/mpz/cdiv_ui.c +++ b/ghc/rts/gmp/mpz/cdiv_ui.c @@ -8,16 +8,16 @@ Copyright (C) 1994, 1995, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/clear.c b/ghc/rts/gmp/mpz/clear.c index 00f3cfd..5224553 100644 --- a/ghc/rts/gmp/mpz/clear.c +++ b/ghc/rts/gmp/mpz/clear.c @@ -6,16 +6,16 @@ Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/clrbit.c b/ghc/rts/gmp/mpz/clrbit.c index 59d9565..865d849 100644 --- a/ghc/rts/gmp/mpz/clrbit.c +++ b/ghc/rts/gmp/mpz/clrbit.c @@ -5,16 +5,16 @@ Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/cmp.c b/ghc/rts/gmp/mpz/cmp.c index 37be334..6062834 100644 --- a/ghc/rts/gmp/mpz/cmp.c +++ b/ghc/rts/gmp/mpz/cmp.c @@ -6,16 +6,16 @@ Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ diff --git a/ghc/rts/gmp/mpz/cmp_si.c b/ghc/rts/gmp/mpz/cmp_si.c index 8063fd3..0c2212f 100644 --- a/ghc/rts/gmp/mpz/cmp_si.c +++ b/ghc/rts/gmp/mpz/cmp_si.c @@ -1,21 +1,22 @@ /* mpz_cmp_si(u,v) -- Compare an integer U with a single-word int V. Return positive, zero, or negative based on if U > V, U == V, or U < V. -Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1995, 1996, 2000 Free Software Foundation, +Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -23,14 +24,11 @@ MA 02111-1307, USA. */ #include "gmp.h" #include "gmp-impl.h" -/* gmp.h defines a macro for mpz_cmp_si. 
*/ -#undef mpz_cmp_si - int #if __STDC__ -mpz_cmp_si (mpz_srcptr u, signed long int v_digit) +_mpz_cmp_si (mpz_srcptr u, signed long int v_digit) #else -mpz_cmp_si (u, v_digit) +_mpz_cmp_si (u, v_digit) mpz_srcptr u; signed long int v_digit; #endif @@ -56,10 +54,10 @@ mpz_cmp_si (u, v_digit) u_digit = u->_mp_d[0]; - if (u_digit == v_digit) + if (u_digit == (mp_limb_t) (unsigned long) v_digit) return 0; - if (u_digit > v_digit) + if (u_digit > (mp_limb_t) (unsigned long) v_digit) return usize; else return -usize; diff --git a/ghc/rts/gmp/mpz/cmp_ui.c b/ghc/rts/gmp/mpz/cmp_ui.c index 1a50b96..fd84f30 100644 --- a/ghc/rts/gmp/mpz/cmp_ui.c +++ b/ghc/rts/gmp/mpz/cmp_ui.c @@ -1,21 +1,21 @@ /* mpz_cmp_ui.c -- Compare a mpz_t a with an mp_limb_t b. Return positive, zero, or negative based on if a > b, a == b, or a < b. -Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1995, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -23,14 +23,11 @@ MA 02111-1307, USA. */ #include "gmp.h" #include "gmp-impl.h" -/* gmp.h defines a macro for mpz_cmp_ui. */ -#undef mpz_cmp_ui - int #if __STDC__ -mpz_cmp_ui (mpz_srcptr u, unsigned long int v_digit) +_mpz_cmp_ui (mpz_srcptr u, unsigned long int v_digit) #else -mpz_cmp_ui (u, v_digit) +_mpz_cmp_ui (u, v_digit) mpz_srcptr u; unsigned long int v_digit; #endif diff --git a/ghc/rts/gmp/mpz/cmpabs.c b/ghc/rts/gmp/mpz/cmpabs.c new file mode 100644 index 0000000..037d7a9 --- /dev/null +++ b/ghc/rts/gmp/mpz/cmpabs.c @@ -0,0 +1,57 @@ +/* mpz_cmpabs(u,v) -- Compare U, V. Return postive, zero, or negative + based on if U > V, U == V, or U < V. + +Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" + +int +#if __STDC__ +mpz_cmpabs (mpz_srcptr u, mpz_srcptr v) +#else +mpz_cmpabs (u, v) + mpz_srcptr u; + mpz_srcptr v; +#endif +{ + mp_size_t usize = u->_mp_size; + mp_size_t vsize = v->_mp_size; + mp_size_t size; + mp_srcptr up, vp; + int cmp; + + usize = ABS (usize); + vsize = ABS (vsize); + + if (usize != vsize) + return usize - vsize; + + if (usize == 0) + return 0; + + up = u->_mp_d; + vp = v->_mp_d; + + cmp = mpn_cmp (up, vp, usize); + + return cmp; +} diff --git a/ghc/rts/gmp/mpz/cmpabs_ui.c b/ghc/rts/gmp/mpz/cmpabs_ui.c new file mode 100644 index 0000000..db816b5 --- /dev/null +++ b/ghc/rts/gmp/mpz/cmpabs_ui.c @@ -0,0 +1,56 @@ +/* mpz_cmpabs_ui.c -- Compare a mpz_t a with an mp_limb_t b. Return positive, + zero, or negative based on if a > b, a == b, or a < b. + +Copyright (C) 1991, 1993, 1994, 1995, 1997, 2000 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" + +int +#if __STDC__ +mpz_cmpabs_ui (mpz_srcptr u, unsigned long int v_digit) +#else +mpz_cmpabs_ui (u, v_digit) + mpz_srcptr u; + unsigned long int v_digit; +#endif +{ + mp_size_t usize = u->_mp_size; + + if (usize == 0) + return -(v_digit != 0); + + usize = ABS (usize); + + if (usize == 1) + { + mp_limb_t u_digit; + + u_digit = u->_mp_d[0]; + if (u_digit > v_digit) + return 1; + if (u_digit < v_digit) + return -1; + return 0; + } + + return 1; +} diff --git a/ghc/rts/gmp/mpz/com.c b/ghc/rts/gmp/mpz/com.c index 559f2b6..18d6427 100644 --- a/ghc/rts/gmp/mpz/com.c +++ b/ghc/rts/gmp/mpz/com.c @@ -6,16 +6,16 @@ Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ diff --git a/ghc/rts/gmp/mpz/divexact.c b/ghc/rts/gmp/mpz/divexact.c index b897448..c297045 100644 --- a/ghc/rts/gmp/mpz/divexact.c +++ b/ghc/rts/gmp/mpz/divexact.c @@ -1,20 +1,21 @@ /* mpz_divexact -- finds quotient when known that quot * den == num && den != 0. -Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 1998, 2000 Free Software +Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -27,8 +28,8 @@ MA 02111-1307, USA. */ de Matema'tica at Universidade Federal do Rio Grande do Sul (UFRGS). References: - T. Jebelean, An algorithm for exact division, Journal of Symbolic - Computation, v. 15, 1993, pp. 169-180. */ + T. Jebelean, An algorithm for exact division, Journal of Symbolic + Computation, v. 15, 1993, pp. 169-180. 
*/ #include "gmp.h" #include "gmp-impl.h" @@ -46,30 +47,42 @@ mpz_divexact (quot, num, den) { mp_ptr qp, tp; mp_size_t qsize, tsize; - - mp_srcptr np = num->_mp_d; - mp_srcptr dp = den->_mp_d; - mp_size_t nsize = ABS (num->_mp_size); - mp_size_t dsize = ABS (den->_mp_size); + mp_srcptr np, dp; + mp_size_t nsize, dsize; TMP_DECL (marker); - /* Generate divide-by-zero error if dsize == 0. */ - if (dsize == 0) - { - quot->_mp_size = 1 / dsize; - return; - } + nsize = ABS (num->_mp_size); + dsize = ABS (den->_mp_size); + + qsize = nsize - dsize + 1; + if (quot->_mp_alloc < qsize) + _mpz_realloc (quot, qsize); + + np = num->_mp_d; + dp = den->_mp_d; + qp = quot->_mp_d; if (nsize == 0) { + if (dsize == 0) + DIVIDE_BY_ZERO; quot->_mp_size = 0; return; } - qsize = nsize - dsize + 1; - if (quot->_mp_alloc < qsize) - _mpz_realloc (quot, qsize); - qp = quot->_mp_d; + if (dsize <= 1) + { + if (dsize == 1) + { + mpn_divmod_1 (qp, np, nsize, dp[0]); + qsize -= qp[qsize - 1] == 0; + quot->_mp_size = (num->_mp_size ^ den->_mp_size) >= 0 ? qsize : -qsize; + return; + } + + /* Generate divide-by-zero error since dsize == 0. */ + DIVIDE_BY_ZERO; + } TMP_MARK (marker); @@ -77,36 +90,36 @@ mpz_divexact (quot, num, den) while (dp[0] == 0) np += 1, nsize -= 1, dp += 1, dsize -= 1; tsize = MIN (qsize, dsize); - if (dp[0] & 1) + if ((dp[0] & 1) != 0) { - if (qp != dp) - MPN_COPY (qp, np, qsize); - if (qp == dp) /* QUOT and DEN overlap. */ + if (quot == den) /* QUOT and DEN overlap. 
*/ { - tp = (mp_ptr) TMP_ALLOC (sizeof (mp_limb_t) * tsize); + tp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB); MPN_COPY (tp, dp, tsize); } else tp = (mp_ptr) dp; + if (qp != np) + MPN_COPY_INCR (qp, np, qsize); } else { - unsigned long int r; - tp = (mp_ptr) TMP_ALLOC (sizeof (mp_limb_t) * tsize); + unsigned int r; + tp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB); count_trailing_zeros (r, dp[0]); mpn_rshift (tp, dp, tsize, r); if (dsize > tsize) - tp[tsize-1] |= dp[tsize] << (BITS_PER_MP_LIMB - r); + tp[tsize - 1] |= dp[tsize] << (BITS_PER_MP_LIMB - r); mpn_rshift (qp, np, qsize, r); if (nsize > qsize) - qp[qsize-1] |= np[qsize] << (BITS_PER_MP_LIMB - r); + qp[qsize - 1] |= np[qsize] << (BITS_PER_MP_LIMB - r); } /* Now QUOT <-- QUOT/T. */ mpn_bdivmod (qp, qp, qsize, tp, tsize, qsize * BITS_PER_MP_LIMB); MPN_NORMALIZE (qp, qsize); - quot->_mp_size = (num->_mp_size < 0) == (den->_mp_size < 0) ? qsize : -qsize; + quot->_mp_size = (num->_mp_size ^ den->_mp_size) >= 0 ? qsize : -qsize; TMP_FREE (marker); } diff --git a/ghc/rts/gmp/mpz/dump.c b/ghc/rts/gmp/mpz/dump.c new file mode 100644 index 0000000..dc318ac --- /dev/null +++ b/ghc/rts/gmp/mpz/dump.c @@ -0,0 +1,44 @@ +/* mpz_dump - Dump an integer to stdout. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS NOT SAFE TO + CALL THIS FUNCTION DIRECTLY. IN FACT, IT IS ALMOST GUARANTEED THAT THIS + FUNCTION WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + + +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include +#include "gmp.h" +#include "gmp-impl.h" + +void +#if __STDC__ +mpz_dump (mpz_srcptr u) +#else +mpz_dump (u) + mpz_srcptr u; +#endif +{ + char *str; + + str = mpz_get_str (0, 10, u); + printf ("%s\n", str); + (*_mp_free_func) (str, 0);/* ??? broken alloc interface, pass what size ??? */ +} diff --git a/ghc/rts/gmp/mpz/fac_ui.c b/ghc/rts/gmp/mpz/fac_ui.c index a170060..85f40f2 100644 --- a/ghc/rts/gmp/mpz/fac_ui.c +++ b/ghc/rts/gmp/mpz/fac_ui.c @@ -5,16 +5,16 @@ Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
-You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/fdiv_q.c b/ghc/rts/gmp/mpz/fdiv_q.c index 3da943a..9d75ca3 100644 --- a/ghc/rts/gmp/mpz/fdiv_q.c +++ b/ghc/rts/gmp/mpz/fdiv_q.c @@ -1,21 +1,21 @@ /* mpz_fdiv_q -- Division rounding the quotient towards -infinity. The remainder gets the same sign as the denominator. -Copyright (C) 1994, 1995, 1996 Free Software Foundation, Inc. +Copyright (C) 1994, 1995, 1996, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ @@ -40,7 +40,7 @@ mpz_fdiv_q (quot, dividend, divisor) TMP_MARK (marker); - MPZ_TMP_INIT (rem, 1 + ABS (dividend_size)); + MPZ_TMP_INIT (rem, ABS (divisor_size)); mpz_tdiv_qr (quot, rem, dividend, divisor); diff --git a/ghc/rts/gmp/mpz/fdiv_q_2exp.c b/ghc/rts/gmp/mpz/fdiv_q_2exp.c index 3f56baf..8e02180 100644 --- a/ghc/rts/gmp/mpz/fdiv_q_2exp.c +++ b/ghc/rts/gmp/mpz/fdiv_q_2exp.c @@ -1,21 +1,22 @@ /* mpz_fdiv_q_2exp -- Divide an integer by 2**CNT. Round the quotient towards -infinity. -Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1996, 1998, 1999 Free Software Foundation, +Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ @@ -86,9 +87,18 @@ mpz_fdiv_q_2exp (w, u, cnt) if (usize < 0 && round != 0) { mp_limb_t cy; - cy = mpn_add_1 (wp, wp, wsize, 1); - wp[wsize] = cy; - wsize += cy; + if (wsize != 0) + { + cy = mpn_add_1 (wp, wp, wsize, (mp_limb_t) 1); + wp[wsize] = cy; + wsize += cy; + } + else + { + /* We shifted something negative to zero. The result is -1. */ + wp[0] = 1; + wsize = 1; + } } w->_mp_size = usize >= 0 ? wsize : -wsize; } diff --git a/ghc/rts/gmp/mpz/fdiv_q_ui.c b/ghc/rts/gmp/mpz/fdiv_q_ui.c index 3d6825d..55d2498 100644 --- a/ghc/rts/gmp/mpz/fdiv_q_ui.c +++ b/ghc/rts/gmp/mpz/fdiv_q_ui.c @@ -1,21 +1,21 @@ /* mpz_fdiv_q_ui -- Division rounding the quotient towards -infinity. The remainder gets the same sign as the denominator. -Copyright (C) 1994, 1995, 1996 Free Software Foundation, Inc. +Copyright (C) 1994, 1995, 1996, 1999 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ @@ -38,6 +38,9 @@ mpz_fdiv_q_ui (quot, dividend, divisor) mp_ptr quot_ptr; mp_limb_t remainder_limb; + if (divisor == 0) + DIVIDE_BY_ZERO; + dividend_size = dividend->_mp_size; size = ABS (dividend_size); @@ -51,7 +54,7 @@ mpz_fdiv_q_ui (quot, dividend, divisor) if (remainder_limb != 0 && dividend_size < 0) { - mpn_add_1 (quot_ptr, quot_ptr, size, (mp_limb_t) 1); + mpn_incr_u (quot_ptr, (mp_limb_t) 1); remainder_limb = divisor - remainder_limb; } diff --git a/ghc/rts/gmp/mpz/fdiv_qr.c b/ghc/rts/gmp/mpz/fdiv_qr.c index 2abb16c..06ce506 100644 --- a/ghc/rts/gmp/mpz/fdiv_qr.c +++ b/ghc/rts/gmp/mpz/fdiv_qr.c @@ -1,21 +1,21 @@ /* mpz_fdiv_qr -- Division rounding the quotient towards -infinity. The remainder gets the same sign as the denominator. -Copyright (C) 1994, 1995, 1996 Free Software Foundation, Inc. +Copyright (C) 1994, 1995, 1996, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ @@ -35,6 +35,7 @@ mpz_fdiv_qr (quot, rem, dividend, divisor) #endif { mp_size_t divisor_size = divisor->_mp_size; + mp_size_t xsize; mpz_t temp_divisor; /* N.B.: lives until function returns! */ TMP_DECL (marker); @@ -50,9 +51,10 @@ mpz_fdiv_qr (quot, rem, dividend, divisor) divisor = temp_divisor; } + xsize = dividend->_mp_size ^ divisor_size;; mpz_tdiv_qr (quot, rem, dividend, divisor); - if ((divisor_size ^ dividend->_mp_size) < 0 && rem->_mp_size != 0) + if (xsize < 0 && rem->_mp_size != 0) { mpz_sub_ui (quot, quot, 1L); mpz_add (rem, rem, divisor); diff --git a/ghc/rts/gmp/mpz/fdiv_qr_ui.c b/ghc/rts/gmp/mpz/fdiv_qr_ui.c index a22b702..600c0da 100644 --- a/ghc/rts/gmp/mpz/fdiv_qr_ui.c +++ b/ghc/rts/gmp/mpz/fdiv_qr_ui.c @@ -1,21 +1,21 @@ /* mpz_fdiv_qr_ui -- Division rounding the quotient towards -infinity. The remainder gets the same sign as the denominator. -Copyright (C) 1994, 1995, 1996 Free Software Foundation, Inc. +Copyright (C) 1994, 1995, 1996, 1999 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -39,6 +39,9 @@ mpz_fdiv_qr_ui (quot, rem, dividend, divisor) mp_ptr quot_ptr; mp_limb_t remainder_limb; + if (divisor == 0) + DIVIDE_BY_ZERO; + dividend_size = dividend->_mp_size; size = ABS (dividend_size); @@ -48,11 +51,11 @@ mpz_fdiv_qr_ui (quot, rem, dividend, divisor) quot_ptr = quot->_mp_d; remainder_limb = mpn_divmod_1 (quot_ptr, dividend->_mp_d, size, - (mp_limb_t) divisor); + (mp_limb_t) divisor); if (remainder_limb != 0 && dividend_size < 0) { - mpn_add_1 (quot_ptr, quot_ptr, size, (mp_limb_t) 1); + mpn_incr_u (quot_ptr, (mp_limb_t) 1); remainder_limb = divisor - remainder_limb; } diff --git a/ghc/rts/gmp/mpz/fdiv_r.c b/ghc/rts/gmp/mpz/fdiv_r.c index 14e045b..a365283 100644 --- a/ghc/rts/gmp/mpz/fdiv_r.c +++ b/ghc/rts/gmp/mpz/fdiv_r.c @@ -6,16 +6,16 @@ Copyright (C) 1994, 1995, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/fdiv_r_2exp.c b/ghc/rts/gmp/mpz/fdiv_r_2exp.c index 04190b1..d0cd039 100644 --- a/ghc/rts/gmp/mpz/fdiv_r_2exp.c +++ b/ghc/rts/gmp/mpz/fdiv_r_2exp.c @@ -1,20 +1,21 @@ /* mpz_fdiv_r_2exp -- Divide a integer by 2**CNT and produce a remainder. -Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1995, 1998, 1999 Free Software Foundation, +Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -65,7 +66,8 @@ mpz_fdiv_r_2exp (res, in, cnt) else { /* The input operand is smaller than 2**CNT. We perform a no-op, - apart from that we might need to copy IN to RES. */ + apart from that we might need to copy IN to RES, and may need + to round the result. 
*/ res_size = in_size; if (res->_mp_alloc < res_size) _mpz_realloc (res, res_size); @@ -75,14 +77,77 @@ mpz_fdiv_r_2exp (res, in, cnt) if (res != in) MPN_COPY (res->_mp_d, in->_mp_d, limb_cnt); + in_size = in->_mp_size; res->_mp_size = res_size; - if (in->_mp_size < 0 && res_size != 0) + if (in_size < 0 && res_size != 0) { /* Result should be 2^CNT - RES */ mpz_t tmp; - MPZ_TMP_INIT (tmp, limb_cnt + 1); + MPZ_TMP_INIT (tmp, cnt/BITS_PER_MP_LIMB + 2); mpz_set_ui (tmp, 1L); mpz_mul_2exp (tmp, tmp, cnt); mpz_sub (res, tmp, res); } } + +/* This is an alternative ending of the above function using just low-level + functions. Tested, but perhaps excessive? */ +#if 0 + if (in->_mp_size < 0 && res_size != 0) + { + /* Result should be 2^CNT - RES */ + + mp_ptr rp; + + limb_cnt = cnt / BITS_PER_MP_LIMB; + + if (res->_mp_alloc <= limb_cnt) + _mpz_realloc (res, limb_cnt + 1); + rp = PTR(res); + if (res_size > limb_cnt) + { + mpn_nz_neg (rp, rp, res_size); + rp[limb_cnt] &= ~(~(mp_limb_t) 0 << cnt % BITS_PER_MP_LIMB); + MPN_NORMALIZE_NOT_ZERO (rp, res_size); + } + else + { + mp_size_t i; + mpn_nz_neg (rp, rp, res_size); + for (i = res_size; i < limb_cnt; i++) + rp[i] = ~ (mp_limb_t) 0; + res_size = limb_cnt; + if (cnt % BITS_PER_MP_LIMB != 0) + { + rp[res_size] = ((mp_limb_t) 1 << (cnt % BITS_PER_MP_LIMB)) - 1; + res_size++; + } + else + MPN_NORMALIZE_NOT_ZERO (rp, res_size); + } + } + SIZ(res) = res_size; +} + +static void +mpn_nz_neg (rp, sp, n) + mp_ptr rp, sp; + mp_size_t n; +{ + mp_size_t i; + mp_limb_t x; + + x = sp[0]; + rp[0] = -x; + for (i = 1; x == 0; i++) + { + x = sp[i]; + rp[i] = -x; + } + + for (; i < n; i++) + { + rp[i] = ~sp[i]; + } +} +#endif diff --git a/ghc/rts/gmp/mpz/fdiv_r_ui.c b/ghc/rts/gmp/mpz/fdiv_r_ui.c index c4c3749..dd5c743 100644 --- a/ghc/rts/gmp/mpz/fdiv_r_ui.c +++ b/ghc/rts/gmp/mpz/fdiv_r_ui.c @@ -6,16 +6,16 @@ Copyright (C) 1994, 1995, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -37,6 +37,9 @@ mpz_fdiv_r_ui (rem, dividend, divisor) mp_size_t size; mp_limb_t remainder_limb; + if (divisor == 0) + DIVIDE_BY_ZERO; + dividend_size = dividend->_mp_size; size = ABS (dividend_size); diff --git a/ghc/rts/gmp/mpz/fdiv_ui.c b/ghc/rts/gmp/mpz/fdiv_ui.c index 4d018a2..f937b5f 100644 --- a/ghc/rts/gmp/mpz/fdiv_ui.c +++ b/ghc/rts/gmp/mpz/fdiv_ui.c @@ -6,16 +6,16 @@ Copyright (C) 1994, 1995, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. 
The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/fib_ui.c b/ghc/rts/gmp/mpz/fib_ui.c new file mode 100644 index 0000000..4bebb80 --- /dev/null +++ b/ghc/rts/gmp/mpz/fib_ui.c @@ -0,0 +1,165 @@ +/* mpz_fib_ui(result, n) -- Set RESULT to the Nth Fibonacci number. + +Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +/* This is fast, but could be made somewhat faster and neater. + The timing is somewhat fluctuating for even/odd sizes because + of the extra hair used to save variables and operations. 
Here + are a few things one might want to address: + 1. Avoid using 4 intermediate variables in mpz_fib_bigcase. + 2. Call mpn functions directly. Straightforward for these functions. + 3. Merge the three functions into one. + +Said by Kevin: + Consider using the Lucas numbers L[n] as an auxiliary sequence, making + it possible to do the "doubling" operation in mpz_fib_bigcase with two + squares rather than two multiplies. The formulas are a little more + complicated, something like the following (untested). + + F[2n] = ((F[n]+L[n])^2 - 6*F[n]^2 - 4*(-1)^n) / 2 + L[2n] = 5*F[n]^2 + 2*(-1)^n + + F[2n+1] = (F[2n] + L[2n]) / 2 + L[2n+1] = (5*F[2n] + L[2n]) / 2 + + The Lucas number that comes for free here could even be returned. + + Maybe there's formulas with two squares using just F[n], but I don't + know of any. +*/ + +/* Determine the needed storage for Fib(n). */ +#define FIB_SIZE(n) (((mp_size_t) ((n)*0.695)) / BITS_PER_MP_LIMB + 2) + +static void mpz_fib_bigcase _PROTO ((mpz_t, mpz_t, unsigned long int)); +static void mpz_fib_basecase _PROTO ((mpz_t, mpz_t, unsigned long int)); + + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 60 +#endif + +void +#if __STDC__ +mpz_fib_ui (mpz_t r, unsigned long int n) +#else +mpz_fib_ui (r, n) + mpz_t r; + unsigned long int n; +#endif +{ + if (n == 0) + mpz_set_ui (r, 0); + else + { + mpz_t t1; + mpz_init (t1); + if (n < FIB_THRESHOLD) + mpz_fib_basecase (t1, r, n); + else + mpz_fib_bigcase (t1, r, n); + mpz_clear (t1); + } +} + +static void +#if __STDC__ +mpz_fib_basecase (mpz_t t1, mpz_t t2, unsigned long int n) +#else +mpz_fib_basecase (t1, t2, n) + mpz_t t1; + mpz_t t2; + unsigned long int n; +#endif +{ + unsigned long int m, i; + + mpz_set_ui (t1, 0); + mpz_set_ui (t2, 1); + m = n/2; + for (i = 0; i < m; i++) + { + mpz_add (t1, t1, t2); + mpz_add (t2, t1, t2); + } + if ((n & 1) == 0) + { + mpz_sub (t1, t2, t1); + mpz_sub (t2, t2, t1); /* trick: recover t1 value just overwritten */ + } +} + +static void +#if __STDC__ 
+mpz_fib_bigcase (mpz_t t1, mpz_t t2, unsigned long int n) +#else +mpz_fib_bigcase (t1, t2, n) + mpz_t t1; + mpz_t t2; + unsigned long int n; +#endif +{ + unsigned long int n2; + int ni, i; + mpz_t x1, x2, u1, u2; + + ni = 0; + for (n2 = n; n2 >= FIB_THRESHOLD; n2 /= 2) + ni++; + + mpz_fib_basecase (t1, t2, n2); + + mpz_init (x1); + mpz_init (x2); + mpz_init (u1); + mpz_init (u2); + + for (i = ni - 1; i >= 0; i--) + { + mpz_mul_2exp (x1, t1, 1); + mpz_mul_2exp (x2, t2, 1); + + mpz_add (x1, x1, t2); + mpz_sub (x2, x2, t1); + + mpz_mul (u1, t2, x1); + mpz_mul (u2, t1, x2); + + if (((n >> i) & 1) == 0) + { + mpz_sub (t1, u1, u2); + mpz_set (t2, u1); + } + else + { + mpz_set (t1, u1); + mpz_mul_2exp (t2, u1, 1); + mpz_sub (t2, t2, u2); + } + } + + mpz_clear (x1); + mpz_clear (x2); + mpz_clear (u1); + mpz_clear (u2); +} diff --git a/ghc/rts/gmp/mpz/fits_sint_p.c b/ghc/rts/gmp/mpz/fits_sint_p.c new file mode 100644 index 0000000..82e32a2 --- /dev/null +++ b/ghc/rts/gmp/mpz/fits_sint_p.c @@ -0,0 +1,50 @@ +/* int mpz_fits_X_p (mpz_t src) -- Return whether src fits the C type X. + +Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" + +int +#if __STDC__ +mpz_fits_sint_p (mpz_srcptr src) +#else +mpz_fits_sint_p (src) + mpz_srcptr src; +#endif +{ + mp_size_t size; + mp_limb_t mpl; + + mpl = PTR(src)[0]; + size = SIZ(src); + if (size > 0) + { + if (size > 1) + return 0; + return mpl < ~((~(unsigned int) 0) >> 1); + } + else + { + if (size < -1) + return 0; + return mpl <= ~((~(unsigned int) 0) >> 1); + } +} diff --git a/ghc/rts/gmp/mpz/fits_slong_p.c b/ghc/rts/gmp/mpz/fits_slong_p.c new file mode 100644 index 0000000..e0669b5 --- /dev/null +++ b/ghc/rts/gmp/mpz/fits_slong_p.c @@ -0,0 +1,50 @@ +/* int mpz_fits_X_p (mpz_t src) -- Return whether src fits the C type X. + +Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" + +int +#if __STDC__ +mpz_fits_slong_p (mpz_srcptr src) +#else +mpz_fits_slong_p (src) + mpz_srcptr src; +#endif +{ + mp_size_t size; + mp_limb_t mpl; + + mpl = PTR(src)[0]; + size = SIZ(src); + if (size > 0) + { + if (size > 1) + return 0; + return mpl < ~((~(unsigned long int) 0) >> 1); + } + else + { + if (size < -1) + return 0; + return mpl <= ~((~(unsigned long int) 0) >> 1); + } +} diff --git a/ghc/rts/gmp/mpz/fits_sshort_p.c b/ghc/rts/gmp/mpz/fits_sshort_p.c new file mode 100644 index 0000000..5b8e31a --- /dev/null +++ b/ghc/rts/gmp/mpz/fits_sshort_p.c @@ -0,0 +1,50 @@ +/* int mpz_fits_X_p (mpz_t src) -- Return whether src fits the C type X. + +Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" + +int +#if __STDC__ +mpz_fits_sshort_p (mpz_srcptr src) +#else +mpz_fits_sshort_p (src) + mpz_srcptr src; +#endif +{ + mp_size_t size; + mp_limb_t mpl; + + mpl = PTR(src)[0]; + size = SIZ(src); + if (size > 0) + { + if (size > 1) + return 0; + return mpl <= (((unsigned short int) ~(unsigned int) 0) >> 1); + } + else + { + if (size < -1) + return 0; + return mpl <= (((unsigned short int) ~(unsigned int) 0) >> 1) + 1; + } +} diff --git a/ghc/rts/gmp/mpz/fits_uint_p.c b/ghc/rts/gmp/mpz/fits_uint_p.c new file mode 100644 index 0000000..72f62fa --- /dev/null +++ b/ghc/rts/gmp/mpz/fits_uint_p.c @@ -0,0 +1,41 @@ +/* int mpz_fits_X_p (mpz_t src) -- Return whether src fits the C type X. + +Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" + +int +#if __STDC__ +mpz_fits_uint_p (mpz_srcptr src) +#else +mpz_fits_uint_p (src) + mpz_srcptr src; +#endif +{ + mp_size_t size; + mp_limb_t mpl; + + mpl = PTR(src)[0]; + size = SIZ(src); + if (size < 0 || size > 1) + return 0; + return mpl <= (~(unsigned int) 0); +} diff --git a/ghc/rts/gmp/mpz/fits_ulong_p.c b/ghc/rts/gmp/mpz/fits_ulong_p.c new file mode 100644 index 0000000..92eb42e --- /dev/null +++ b/ghc/rts/gmp/mpz/fits_ulong_p.c @@ -0,0 +1,41 @@ +/* int mpz_fits_X_p (mpz_t src) -- Return whether src fits the C type X. + +Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" + +int +#if __STDC__ +mpz_fits_ulong_p (mpz_srcptr src) +#else +mpz_fits_ulong_p (src) + mpz_srcptr src; +#endif +{ + mp_size_t size; + mp_limb_t mpl; + + mpl = PTR(src)[0]; + size = SIZ(src); + if (size < 0 || size > 1) + return 0; + return mpl <= (~(unsigned long int) 0); +} diff --git a/ghc/rts/gmp/mpz/fits_ushort_p.c b/ghc/rts/gmp/mpz/fits_ushort_p.c new file mode 100644 index 0000000..bde0eda --- /dev/null +++ b/ghc/rts/gmp/mpz/fits_ushort_p.c @@ -0,0 +1,41 @@ +/* int mpz_fits_X_p (mpz_t src) -- Return whether src fits the C type X. + +Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" + +int +#if __STDC__ +mpz_fits_ushort_p (mpz_srcptr src) +#else +mpz_fits_ushort_p (src) + mpz_srcptr src; +#endif +{ + mp_size_t size; + mp_limb_t mpl; + + mpl = PTR(src)[0]; + size = SIZ(src); + if (size < 0 || size > 1) + return 0; + return mpl <= ((unsigned short int) ~(unsigned int) 0); +} diff --git a/ghc/rts/gmp/mpz/gcd.c b/ghc/rts/gmp/mpz/gcd.c index f93030c..0d950dd 100644 --- a/ghc/rts/gmp/mpz/gcd.c +++ b/ghc/rts/gmp/mpz/gcd.c @@ -1,20 +1,20 @@ /* mpz/gcd.c: Calculate the greatest common divisor of two integers. -Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1996, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -22,8 +22,10 @@ MA 02111-1307, USA. 
*/ #include "gmp.h" #include "gmp-impl.h" #include "longlong.h" +#ifdef BERKELEY_MP +#include "mp.h" +#endif -void *_mpz_realloc (); #ifndef BERKELEY_MP void @@ -145,10 +147,10 @@ gcd (u, v, g) g_zero_bits = MIN (u_zero_bits, v_zero_bits); } - /* Call mpn_gcd. The 1st argument must not have more bits than the 2nd. */ + /* Call mpn_gcd. The 2nd argument must not have more bits than the 1st. */ vsize = (usize < vsize || (usize == vsize && up[usize-1] < vp[vsize-1])) - ? mpn_gcd (vp, up, usize, vp, vsize) - : mpn_gcd (vp, vp, vsize, up, usize); + ? mpn_gcd (vp, vp, vsize, up, usize) + : mpn_gcd (vp, up, usize, vp, vsize); /* Here G <-- V << (g_zero_limbs*BITS_PER_MP_LIMB + g_zero_bits). */ gsize = vsize + g_zero_limbs; diff --git a/ghc/rts/gmp/mpz/gcd_ui.c b/ghc/rts/gmp/mpz/gcd_ui.c index 388ab05..f3bec58 100644 --- a/ghc/rts/gmp/mpz/gcd_ui.c +++ b/ghc/rts/gmp/mpz/gcd_ui.c @@ -1,24 +1,25 @@ /* mpz_gcd_ui -- Calculate the greatest common divisior of two integers. -Copyright (C) 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1994, 1996, 1999, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
-You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ +#include <stdio.h> /* for NULL */ #include "gmp.h" #include "gmp-impl.h" @@ -53,7 +54,7 @@ mpz_gcd_ui (w, u, v) return size > 1 ? 0 : w->_mp_d[0]; } else - res = mpn_gcd_1 (u->_mp_d, size, v); + res = mpn_gcd_1 (u->_mp_d, size, (mp_limb_t) v); if (w != NULL) { diff --git a/ghc/rts/gmp/mpz/gcdext.c b/ghc/rts/gmp/mpz/gcdext.c index adf66b0..3ba04c8 100644 --- a/ghc/rts/gmp/mpz/gcdext.c +++ b/ghc/rts/gmp/mpz/gcdext.c @@ -1,30 +1,30 @@ /* mpz_gcdext(g, s, t, a, b) -- Set G to gcd(a, b), and S and T such that g = as + bt. -Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 2000 Free Software +Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB.
If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ +#include <stdio.h> /* for NULL */ #include "gmp.h" #include "gmp-impl.h" -/* Botch: SLOW! */ - void #if __STDC__ mpz_gcdext (mpz_ptr g, mpz_ptr s, mpz_ptr t, mpz_srcptr a, mpz_srcptr b) @@ -37,52 +37,101 @@ mpz_gcdext (g, s, t, a, b) mpz_srcptr b; #endif { - mpz_t s0, s1, q, r, x, d0, d1; + mp_size_t asize, bsize, usize, vsize; + mp_srcptr ap, bp; + mp_ptr up, vp; + mp_size_t gsize, ssize, tmp_ssize; + mp_ptr gp, sp, tmp_gp, tmp_sp; + mpz_srcptr u, v; + mpz_ptr ss, tt; + __mpz_struct stmp, gtmp; + TMP_DECL (marker); + + TMP_MARK (marker); + + /* mpn_gcdext requires that U >= V. Therefore, we often have to swap U and + V. This in turn leads to a lot of complications. The computed cofactor + will be the wrong one, so we have to fix that up at the end. */ + + asize = ABS (SIZ (a)); + bsize = ABS (SIZ (b)); + ap = PTR (a); + bp = PTR (b); + if (asize > bsize || (asize == bsize && mpn_cmp (ap, bp, asize) > 0)) + { + usize = asize; + vsize = bsize; + up = (mp_ptr) TMP_ALLOC ((usize + 1) * BYTES_PER_MP_LIMB); + vp = (mp_ptr) TMP_ALLOC ((vsize + 1) * BYTES_PER_MP_LIMB); + MPN_COPY (up, ap, usize); + MPN_COPY (vp, bp, vsize); + u = a; + v = b; + ss = s; + tt = t; + } + else + { + usize = bsize; + vsize = asize; + up = (mp_ptr) TMP_ALLOC ((usize + 1) * BYTES_PER_MP_LIMB); + vp = (mp_ptr) TMP_ALLOC ((vsize + 1) * BYTES_PER_MP_LIMB); + MPN_COPY (up, bp, usize); + MPN_COPY (vp, ap, vsize); + u = b; + v = a; + ss = t; + tt = s; + } - mpz_init_set_ui (s0, 1L); - mpz_init_set_ui (s1, 0L); - mpz_init (q); - mpz_init (r); - mpz_init (x); - mpz_init_set (d0, a); - mpz_init_set (d1, b); + tmp_gp = (mp_ptr) TMP_ALLOC ((usize + 1) * BYTES_PER_MP_LIMB); + tmp_sp = (mp_ptr) TMP_ALLOC ((usize + 1) * BYTES_PER_MP_LIMB); - while (d1->_mp_size != 0) + if (vsize == 0) { - mpz_tdiv_qr (q, r, d0, d1); - mpz_set (d0, d1); - mpz_set (d1, r); - - mpz_mul (x, s1, q); - mpz_sub (x, s0, x); -
mpz_set (s0, s1); - mpz_set (s1, x); + tmp_sp[0] = 1; + tmp_ssize = 1; + MPN_COPY (tmp_gp, up, usize); + gsize = usize; } + else + gsize = mpn_gcdext (tmp_gp, tmp_sp, &tmp_ssize, up, usize, vp, vsize); + ssize = ABS (tmp_ssize); + + PTR (&gtmp) = tmp_gp; + SIZ (&gtmp) = gsize; - if (t != NULL) + PTR (&stmp) = tmp_sp; + SIZ (&stmp) = (tmp_ssize ^ SIZ (u)) >= 0 ? ssize : -ssize; + + if (tt != NULL) { - mpz_mul (x, s0, a); - mpz_sub (x, d0, x); - if (b->_mp_size == 0) - t->_mp_size = 0; + if (SIZ (v) == 0) + SIZ (tt) = 0; else - mpz_tdiv_q (t, x, b); + { + mpz_t x; + MPZ_TMP_INIT (x, ssize + usize + 1); + mpz_mul (x, &stmp, u); + mpz_sub (x, &gtmp, x); + mpz_tdiv_q (tt, x, v); + } } - mpz_set (s, s0); - mpz_set (g, d0); - if (g->_mp_size < 0) + + if (ss != NULL) { - g->_mp_size = -g->_mp_size; - s->_mp_size = -s->_mp_size; - if (t != NULL) - t->_mp_size = -t->_mp_size; + if (ALLOC (ss) < ssize) + _mpz_realloc (ss, ssize); + sp = PTR (ss); + MPN_COPY (sp, tmp_sp, ssize); + SIZ (ss) = SIZ (&stmp); } - mpz_clear (s0); - mpz_clear (s1); - mpz_clear (q); - mpz_clear (r); - mpz_clear (x); - mpz_clear (d0); - mpz_clear (d1); + if (ALLOC (g) < gsize) + _mpz_realloc (g, gsize); + gp = PTR (g); + MPN_COPY (gp, tmp_gp, gsize); + SIZ (g) = gsize; + + TMP_FREE (marker); } diff --git a/ghc/rts/gmp/mpz/get_d.c b/ghc/rts/gmp/mpz/get_d.c index 0fd7916..6a7c585 100644 --- a/ghc/rts/gmp/mpz/get_d.c +++ b/ghc/rts/gmp/mpz/get_d.c @@ -1,26 +1,49 @@ /* double mpz_get_d (mpz_t src) -- Return the double approximation to SRC. -Copyright (C) 1996 Free Software Foundation, Inc. +Copyright (C) 1996, 1997, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ #include "gmp.h" #include "gmp-impl.h" +#include "longlong.h" + + +static int +#if __STDC__ +mpn_zero_p (mp_ptr p, mp_size_t n) +#else +mpn_zero_p (p, n) + mp_ptr p; + mp_size_t n; +#endif +{ + mp_size_t i; + + for (i = 0; i < n; i++) + { + if (p[i] != 0) + return 0; + } + + return 1; +} + double #if __STDC__ @@ -31,9 +54,11 @@ mpz_get_d (src) #endif { double res; - mp_size_t size, i, n_limbs_to_use; + mp_size_t size; int negative; mp_ptr qp; + mp_limb_t hz, lz; + int cnt; size = SIZ(src); if (size == 0) @@ -43,12 +68,61 @@ mpz_get_d (src) size = ABS (size); qp = PTR(src); - res = qp[size - 1]; - n_limbs_to_use = MIN (LIMBS_PER_DOUBLE, size); - for (i = 2; i <= n_limbs_to_use; i++) - res = res * MP_BASE_AS_DOUBLE + qp[size - i]; + if (size == 1) + { + res = qp[size - 1]; + } + else if (size == 2) + { + res = MP_BASE_AS_DOUBLE * qp[size - 1] + qp[size - 2]; + } + else + { + count_leading_zeros (cnt, qp[size - 1]); - res = __gmp_scale2 (res, (size - n_limbs_to_use) * BITS_PER_MP_LIMB); +#if BITS_PER_MP_LIMB == 32 + if (cnt == 0) + { + hz = qp[size - 1]; + lz = qp[size - 2]; + } + else + { + hz = (qp[size - 1] << cnt) | (qp[size - 2] >> BITS_PER_MP_LIMB - cnt); + lz = (qp[size - 2] << cnt) | (qp[size - 3] >> BITS_PER_MP_LIMB - cnt); + } +#if _GMP_IEEE_FLOATS + /* Take bits from less significant limbs, but only if they may affect + the result. */ + if ((lz & 0x7ff) == 0x400) + { + if (cnt != 0) + lz += ((qp[size - 3] << cnt) != 0 || ! mpn_zero_p (qp, size - 3)); + else + lz += (! mpn_zero_p (qp, size - 2)); + } +#endif + res = MP_BASE_AS_DOUBLE * hz + lz; + res = __gmp_scale2 (res, (size - 2) * BITS_PER_MP_LIMB - cnt); +#endif +#if BITS_PER_MP_LIMB == 64 + if (cnt == 0) + hz = qp[size - 1]; + else + hz = (qp[size - 1] << cnt) | (qp[size - 2] >> BITS_PER_MP_LIMB - cnt); +#if _GMP_IEEE_FLOATS + if ((hz & 0x7ff) == 0x400) + { + if (cnt != 0) + hz += ((qp[size - 2] << cnt) != 0 || ! mpn_zero_p (qp, size - 2)); + else + hz += (! 
mpn_zero_p (qp, size - 1)); + } +#endif + res = hz; + res = __gmp_scale2 (res, (size - 1) * BITS_PER_MP_LIMB - cnt); +#endif + } return negative ? -res : res; } diff --git a/ghc/rts/gmp/mpz/get_si.c b/ghc/rts/gmp/mpz/get_si.c index 45e0e5a..8a5d0e4 100644 --- a/ghc/rts/gmp/mpz/get_si.c +++ b/ghc/rts/gmp/mpz/get_si.c @@ -5,16 +5,16 @@ Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/get_str.c b/ghc/rts/gmp/mpz/get_str.c index 8ccf3ef..c7278af 100644 --- a/ghc/rts/gmp/mpz/get_str.c +++ b/ghc/rts/gmp/mpz/get_str.c @@ -9,16 +9,16 @@ Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/get_ui.c b/ghc/rts/gmp/mpz/get_ui.c index 4bfb5e1..a8ec9e0 100644 --- a/ghc/rts/gmp/mpz/get_ui.c +++ b/ghc/rts/gmp/mpz/get_ui.c @@ -5,16 +5,16 @@ Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/getlimbn.c b/ghc/rts/gmp/mpz/getlimbn.c index c7a234b..b772ed0 100644 --- a/ghc/rts/gmp/mpz/getlimbn.c +++ b/ghc/rts/gmp/mpz/getlimbn.c @@ -1,20 +1,20 @@ /* mpz_getlimbn(integer,n) -- Return the N:th limb from INTEGER. -Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc. +Copyright (C) 1993, 1994, 1995, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ @@ -31,7 +31,7 @@ mpz_getlimbn (integer, n) mp_size_t n; #endif { - if (integer->_mp_size <= n || n < 0) + if (ABS (integer->_mp_size) <= n || n < 0) return 0; else return integer->_mp_d[n]; diff --git a/ghc/rts/gmp/mpz/hamdist.c b/ghc/rts/gmp/mpz/hamdist.c index 58c9273..b039a65 100644 --- a/ghc/rts/gmp/mpz/hamdist.c +++ b/ghc/rts/gmp/mpz/hamdist.c @@ -8,16 +8,16 @@ Copyright (C) 1994, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/init.c b/ghc/rts/gmp/mpz/init.c index f8d8e20..2e8e4d2 100644 --- a/ghc/rts/gmp/mpz/init.c +++ b/ghc/rts/gmp/mpz/init.c @@ -5,16 +5,16 @@ Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/inp_raw.c b/ghc/rts/gmp/mpz/inp_raw.c index e1cec1d..15e6012 100644 --- a/ghc/rts/gmp/mpz/inp_raw.c +++ b/ghc/rts/gmp/mpz/inp_raw.c @@ -6,16 +6,16 @@ Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/inp_str.c b/ghc/rts/gmp/mpz/inp_str.c index 7159062..7aa5e1f 100644 --- a/ghc/rts/gmp/mpz/inp_str.c +++ b/ghc/rts/gmp/mpz/inp_str.c @@ -1,21 +1,21 @@ /* mpz_inp_str(dest_integer, stream, base) -- Input a number in base BASE from stdio stream STREAM and store the result in DEST_INTEGER. -Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1996, 1998, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -26,9 +26,13 @@ MA 02111-1307, USA. */ #include "gmp-impl.h" static int +#if __STDC__ +digit_value_in_base (int c, int base) +#else digit_value_in_base (c, base) int c; int base; +#endif { int digit; @@ -48,10 +52,10 @@ digit_value_in_base (c, base) size_t #if __STDC__ -mpz_inp_str (mpz_ptr dest, FILE *stream, int base) +mpz_inp_str (mpz_ptr x, FILE *stream, int base) #else -mpz_inp_str (dest, stream, base) - mpz_ptr dest; +mpz_inp_str (x, stream, base) + mpz_ptr x; FILE *stream; int base; #endif @@ -60,15 +64,12 @@ mpz_inp_str (dest, stream, base) size_t alloc_size, str_size; int c; int negative; - mp_size_t dest_size; + mp_size_t xsize; size_t nread; if (stream == 0) stream = stdin; - alloc_size = 100; - str = (char *) (*_mp_allocate_func) (alloc_size); - str_size = 0; nread = 0; /* Skip whitespace. */ @@ -84,6 +85,7 @@ mpz_inp_str (dest, stream, base) { negative = 1; c = getc (stream); + nread++; } if (digit_value_in_base (c, base == 0 ? 10 : base) < 0) @@ -105,9 +107,26 @@ mpz_inp_str (dest, stream, base) c = getc (stream); nread++; } + else if (c == 'b' || c == 'B') + { + base = 2; + c = getc (stream); + nread++; + } } } + /* Skip leading zeros. */ + while (c == '0') + { + c = getc (stream); + nread++; + } + + alloc_size = 100; + str = (char *) (*_mp_allocate_func) (alloc_size); + str_size = 0; + for (;;) { int dig; @@ -126,12 +145,22 @@ mpz_inp_str (dest, stream, base) ungetc (c, stream); - dest_size = str_size / __mp_bases[base].chars_per_limb + 1; - if (dest->_mp_alloc < dest_size) - _mpz_realloc (dest, dest_size); + /* Make sure the string is not empty, mpn_set_str would fail. 
*/ + if (str_size == 0) + { + x->_mp_size = 0; + (*_mp_free_func) (str, alloc_size); + return nread; + } + + xsize = (((mp_size_t) (str_size / __mp_bases[base].chars_per_bit_exactly)) + / BITS_PER_MP_LIMB + 2); + if (x->_mp_alloc < xsize) + _mpz_realloc (x, xsize); - dest_size = mpn_set_str (dest->_mp_d, (unsigned char *) str, str_size, base); - dest->_mp_size = negative ? -dest_size : dest_size; + /* Convert the byte array in base BASE to our bignum format. */ + xsize = mpn_set_str (x->_mp_d, (unsigned char *) str, str_size, base); + x->_mp_size = negative ? -xsize : xsize; (*_mp_free_func) (str, alloc_size); return str_size + nread; diff --git a/ghc/rts/gmp/mpz/invert.c b/ghc/rts/gmp/mpz/invert.c index ff1d6d9..749a096 100644 --- a/ghc/rts/gmp/mpz/invert.c +++ b/ghc/rts/gmp/mpz/invert.c @@ -1,27 +1,28 @@ /* mpz_invert (inv, x, n). Find multiplicative inverse of X in Z(N). If X has an inverse, return non-zero and store inverse in INVERSE, - otherwise, return 0 and put garbage in X. + otherwise, return 0 and put garbage in INVERSE. -Copyright (C) 1996 Free Software Foundation, Inc. +Copyright (C) 1996, 1997, 1998, 1999, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
-You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include "gmp.h" +#include "gmp-impl.h" int #if __STDC__ @@ -32,12 +33,45 @@ mpz_invert (inverse, x, n) mpz_srcptr x, n; #endif { - mpz_t gcd; - int rv; - - mpz_init (gcd); - mpz_gcdext (gcd, inverse, (mpz_ptr) 0, x, n); - rv = gcd->_mp_size == 1 && (gcd->_mp_d)[0] == 1; - mpz_clear (gcd); - return rv; + mpz_t gcd, tmp; + mp_size_t xsize, nsize, size; + TMP_DECL (marker); + + xsize = SIZ (x); + nsize = SIZ (n); + xsize = ABS (xsize); + nsize = ABS (nsize); + size = MAX (xsize, nsize) + 1; + + /* No inverse exists if the leftside operand is 0. Likewise, no + inverse exists if the mod operand is 1. */ + if (xsize == 0 || (nsize == 1 && (PTR (n))[0] == 1)) + return 0; + + TMP_MARK (marker); + + MPZ_TMP_INIT (gcd, size); + MPZ_TMP_INIT (tmp, size); + mpz_gcdext (gcd, tmp, (mpz_ptr) 0, x, n); + + /* If no inverse existed, return with an indication of that. */ + if (gcd->_mp_size != 1 || (gcd->_mp_d)[0] != 1) + { + TMP_FREE (marker); + return 0; + } + + /* Make sure we return a positive inverse. */ + if (SIZ (tmp) < 0) + { + if (SIZ (n) < 0) + mpz_sub (inverse, tmp, n); + else + mpz_add (inverse, tmp, n); + } + else + mpz_set (inverse, tmp); + + TMP_FREE (marker); + return 1; } diff --git a/ghc/rts/gmp/mpz/ior.c b/ghc/rts/gmp/mpz/ior.c index 77facfd..0bb5a80 100644 --- a/ghc/rts/gmp/mpz/ior.c +++ b/ghc/rts/gmp/mpz/ior.c @@ -1,20 +1,21 @@ /* mpz_ior -- Logical inclusive or. -Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation, +Inc. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -126,7 +127,7 @@ mpz_ior (res, op1, op2) _mpz_realloc (res, res_size); res_ptr = res->_mp_d; /* Don't re-read OP1_PTR and OP2_PTR. They point to - temporary space--never to the space RES->_mp_D used + temporary space--never to the space RES->_mp_d used to point to before reallocation. */ } @@ -163,9 +164,8 @@ mpz_ior (res, op1, op2) { /* We should compute -OP1 | OP2. Swap OP1 and OP2 and fall through to the code that handles OP1 | -OP2. 
*/ - {mpz_srcptr t = op1; op1 = op2; op2 = t;} - {mp_srcptr t = op1_ptr; op1_ptr = op2_ptr; op2_ptr = t;} - {mp_size_t t = op1_size; op1_size = op2_size; op2_size = t;} + MPZ_SRCPTR_SWAP (op1, op2); + MPN_SRCPTR_SWAP (op1_ptr,op1_size, op2_ptr,op2_size); } } @@ -187,6 +187,7 @@ mpz_ior (res, op1, op2) opx = (mp_ptr) TMP_ALLOC (op2_size * BYTES_PER_MP_LIMB); mpn_sub_1 (opx, op2_ptr, op2_size, (mp_limb_t) 1); op2_ptr = opx; + op2_size -= op2_ptr[op2_size - 1] == 0; if (res->_mp_alloc < res_alloc) { @@ -194,7 +195,7 @@ mpz_ior (res, op1, op2) op1_ptr = op1->_mp_d; res_ptr = res->_mp_d; /* Don't re-read OP2_PTR. It points to temporary space--never - to the space RES->_mp_D used to point to before reallocation. */ + to the space RES->_mp_d used to point to before reallocation. */ } if (op1_size >= op2_size) diff --git a/ghc/rts/gmp/mpz/iset.c b/ghc/rts/gmp/mpz/iset.c index c8a17dc..114bc2d 100644 --- a/ghc/rts/gmp/mpz/iset.c +++ b/ghc/rts/gmp/mpz/iset.c @@ -6,16 +6,16 @@ Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
-You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/iset_d.c b/ghc/rts/gmp/mpz/iset_d.c index 41e5c4f..502a893 100644 --- a/ghc/rts/gmp/mpz/iset_d.c +++ b/ghc/rts/gmp/mpz/iset_d.c @@ -6,16 +6,16 @@ Copyright (C) 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/iset_si.c b/ghc/rts/gmp/mpz/iset_si.c index af51f05..842db14 100644 --- a/ghc/rts/gmp/mpz/iset_si.c +++ b/ghc/rts/gmp/mpz/iset_si.c @@ -1,21 +1,21 @@ /* mpz_init_set_si(val) -- Make a new multiple precision number with value val. -Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. 
+Copyright (C) 1991, 1993, 1994, 1995, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -41,7 +41,7 @@ mpz_init_set_si (x, val) } else if (val < 0) { - x->_mp_d[0] = -val; + x->_mp_d[0] = (unsigned long) -val; x->_mp_size = -1; } else diff --git a/ghc/rts/gmp/mpz/iset_str.c b/ghc/rts/gmp/mpz/iset_str.c index e04ad5d..dfb8c6b 100644 --- a/ghc/rts/gmp/mpz/iset_str.c +++ b/ghc/rts/gmp/mpz/iset_str.c @@ -5,21 +5,21 @@ i.e. 0xhh...h means base 16, 0oo...o means base 8, otherwise assume base 10. -Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1995, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -40,5 +40,8 @@ mpz_init_set_str (x, str, base) x->_mp_alloc = 1; x->_mp_d = (mp_ptr) (*_mp_allocate_func) (BYTES_PER_MP_LIMB); + /* if str has no digits mpz_set_str leaves x->_mp_size unset */ + x->_mp_size = 0; + return mpz_set_str (x, str, base); } diff --git a/ghc/rts/gmp/mpz/iset_ui.c b/ghc/rts/gmp/mpz/iset_ui.c index dc39f59..759182c 100644 --- a/ghc/rts/gmp/mpz/iset_ui.c +++ b/ghc/rts/gmp/mpz/iset_ui.c @@ -6,16 +6,16 @@ Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/jacobi.c b/ghc/rts/gmp/mpz/jacobi.c index 409f622..9d49e1d 100644 --- a/ghc/rts/gmp/mpz/jacobi.c +++ b/ghc/rts/gmp/mpz/jacobi.c @@ -6,16 +6,16 @@ Copyright (C) 1991, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/kronsz.c b/ghc/rts/gmp/mpz/kronsz.c new file mode 100644 index 0000000..c8c6752 --- /dev/null +++ b/ghc/rts/gmp/mpz/kronsz.c @@ -0,0 +1,126 @@ +/* mpz_si_kronecker -- Kronecker/Jacobi symbol. */ + +/* +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
+*/ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + + +int +#if __STDC__ +mpz_si_kronecker (long a, mpz_srcptr b) +#else +mpz_si_kronecker (a, b) + long a; + mpz_srcptr b; +#endif +{ + int b_abs_size; + mp_srcptr b_ptr; + mp_limb_t b_low; + int twos; + int result_bit1; + + b_abs_size = ABSIZ (b); + if (b_abs_size == 0) + return JACOBI_S0 (a); /* (a/0) */ + + b_ptr = PTR(b); + b_low = b_ptr[0]; + + /* (0/b) = 1 if b=+/-1, 0 otherwise */ + if (a == 0) + return (b_abs_size == 1) & (b_low == 1); + + /* account for the effect of the sign of b, so can then ignore it */ + result_bit1 = JACOBI_BSGN_SZ_BIT1 (a, b); + + if ((b_low & 1) == 0) + { + /* b even */ + + if ((a & 1) == 0) + return 0; /* (a/b)=0 if both a,b even */ + + /* Require BITS_PER_MP_LIMB even, so that (a/2)^BITS_PER_MP_LIMB = 1, + and so that therefore there's no need to account for how many zero + limbs are stripped. */ + ASSERT ((BITS_PER_MP_LIMB & 1) == 0); + + MPN_STRIP_LOW_ZEROS_NOT_ZERO (b_ptr, b_abs_size); + b_low = b_ptr[0]; + + if ((b_low & 1) == 0) + { + /* odd a, even b */ + + mp_limb_t b_shl_bit1; + + count_trailing_zeros (twos, b_low); + + /* b_shl_bit1 is b>>twos, but with only bit 1 guaranteed */ + if (twos == BITS_PER_MP_LIMB-1) + b_shl_bit1 = (b_abs_size == 1) ? 0 : (b_ptr[1] << 1); + else + b_shl_bit1 = (b_low >> twos); + + result_bit1 ^= JACOBI_ASGN_SU_BIT1 (a, b_shl_bit1); + a = ABS(a); + + if (a == 1) + return JACOBI_BIT1_TO_PN (result_bit1); /* (1/b)=1 */ + + /* twos (a/2), reciprocity to (b/a), and (b/a) = (b mod a / a) */ + return mpn_jacobi_base (mpn_mod_1_rshift (b_ptr, b_abs_size, + twos, a), + a, + result_bit1 + ^ JACOBI_TWOS_U_BIT1 (twos, a) + ^ JACOBI_RECIP_UU_BIT1 (a, b_shl_bit1)); + } + } + + /* b odd */ + + result_bit1 ^= JACOBI_ASGN_SU_BIT1 (a, b_low); + a = ABS(a); + + /* (a/1) = 1 for any a */ + if (b_abs_size == 1 && b_low == 1) + return JACOBI_BIT1_TO_PN (result_bit1); + + /* Note a is cast to unsigned because 0x80..00 doesn't fit in a signed.
*/ + if ((a & 1) == 0) + { + count_trailing_zeros (twos, a); + a = ((unsigned long) a) >> twos; + result_bit1 ^= JACOBI_TWOS_U_BIT1 (twos, b_low); + } + + if (a == 1) + return JACOBI_BIT1_TO_PN (result_bit1); /* (1/b)=1 */ + + /* reciprocity to (b/a), and (b/a) == (b mod a / a) */ + return mpn_jacobi_base (mpn_mod_1 (b_ptr, b_abs_size, a), a, + result_bit1 ^ JACOBI_RECIP_UU_BIT1 (a, b_low)); +} diff --git a/ghc/rts/gmp/mpz/kronuz.c b/ghc/rts/gmp/mpz/kronuz.c new file mode 100644 index 0000000..b877e6f --- /dev/null +++ b/ghc/rts/gmp/mpz/kronuz.c @@ -0,0 +1,115 @@ +/* mpz_ui_kronecker -- Kronecker/Jacobi symbol. */ + +/* +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
+*/ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + + +int +#if __STDC__ +mpz_ui_kronecker (unsigned long a, mpz_srcptr b) +#else +mpz_ui_kronecker (a, b) + unsigned long a; + mpz_srcptr b; +#endif +{ + int b_abs_size; + mp_srcptr b_ptr; + mp_limb_t b_low; + int twos; + int result_bit1; + + /* (a/0) */ + b_abs_size = ABSIZ (b); + if (b_abs_size == 0) + return JACOBI_U0 (a); + + /* (a/-1)=1 when a>=0, so the sign of b is ignored */ + b_ptr = PTR(b); + b_low = b_ptr[0]; + + /* (0/1)=1; (0/-1)=1; (0/b)=0 for b!=+/-1 + (1/b)=1, for any b */ + if (a <= 1) + return (a == 1) | ((b_abs_size == 1) & (b_low == 1)); + + if (b_low & 1) + { + /* (a/1) = 1 for any a */ + if (b_abs_size == 1 && b_low == 1) + return 1; + + count_trailing_zeros (twos, a); + a >>= twos; + if (a == 1) + return JACOBI_TWOS_U (twos, b_low); /* powers of (2/b) only */ + + /* powers of (2/b); reciprocity to (b/a); (b/a) == (b mod a / a) */ + return mpn_jacobi_base (mpn_mod_1 (b_ptr, b_abs_size, a), + a, + JACOBI_TWOS_U_BIT1 (twos, b_low) + ^ JACOBI_RECIP_UU_BIT1 (b_low, a)); + } + + /* b is even; (a/2)=0 if a is even */ + if ((a & 1) == 0) + return 0; + + /* Require BITS_PER_MP_LIMB even, so (a/2)^BITS_PER_MP_LIMB = 1, and so we + don't have to pay attention to how many trailing zero limbs are + stripped.
*/ + ASSERT ((BITS_PER_MP_LIMB & 1) == 0); + + MPN_STRIP_LOW_ZEROS_NOT_ZERO (b_ptr, b_abs_size); + b_low = b_ptr[0]; + + if (b_low & 1) + /* reciprocity to (b/a); (b/a) == (b mod a / a) */ + return mpn_jacobi_base (mpn_mod_1 (b_ptr, b_abs_size, a), + a, + JACOBI_RECIP_UU_BIT1 (b_low, a)); + + count_trailing_zeros (twos, b_low); + + /* reciprocity to get (b/a) */ + if (twos == BITS_PER_MP_LIMB-1) + { + if (b_abs_size == 1) + { + /* b==0x800...00, one limb high bit only, so (a/2)^(BPML-1) */ + return JACOBI_TWOS_U (BITS_PER_MP_LIMB-1, a); + } + + /* b_abs_size > 1 */ + result_bit1 = JACOBI_RECIP_UU_BIT1 (a, b_ptr[1] << 1); + } + else + result_bit1 = JACOBI_RECIP_UU_BIT1 (a, b_low >> twos); + + /* powers of (a/2); reciprocity to (b/a); (b/a) == (b mod a / a) */ + return mpn_jacobi_base (mpn_mod_1_rshift (b_ptr, b_abs_size, twos, a), + a, + JACOBI_TWOS_U_BIT1 (twos, a) ^ result_bit1); +} diff --git a/ghc/rts/gmp/mpz/kronzs.c b/ghc/rts/gmp/mpz/kronzs.c new file mode 100644 index 0000000..edfb465 --- /dev/null +++ b/ghc/rts/gmp/mpz/kronzs.c @@ -0,0 +1,74 @@ +/* mpz_kronecker_si -- Kronecker/Jacobi symbol. */ + +/* +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
+*/ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + + +/* This function is expected to be often used with b odd, so there's a test + for this before invoking count_trailing_zeros(). + + After the absolute value of b is established it's treated as an unsigned + long, because 0x80..00 doesn't fit in a signed long. */ + +int +#if __STDC__ +mpz_kronecker_si (mpz_srcptr a, long b) +#else +mpz_kronecker_si (a, b) + mpz_srcptr a; + long b; +#endif +{ + int result_bit1; + int twos; + + if (b == 0) + return JACOBI_Z0 (a); + + result_bit1 = JACOBI_BSGN_ZS_BIT1(a, b); + b = ABS (b); + + if (b == 1) + return JACOBI_BIT1_TO_PN (result_bit1); /* (a/1) = 1 for any a */ + + if (b & 1) + return mpn_jacobi_base (mpz_fdiv_ui (a, b), b, result_bit1); + + /* result 0 if both a,b even */ + if (mpz_even_p (a)) + return 0; + + /* (a/2)=(2/a) when a odd */ + count_trailing_zeros (twos, b); + result_bit1 ^= JACOBI_TWOS_U_BIT1 (twos, PTR(a)[0]); + + b = ((unsigned long) b) >> twos; + if (b == 1) + return JACOBI_BIT1_TO_PN (result_bit1); + else + return mpn_jacobi_base (mpz_fdiv_ui (a, b), b, result_bit1); +} + + diff --git a/ghc/rts/gmp/mpz/kronzu.c b/ghc/rts/gmp/mpz/kronzu.c new file mode 100644 index 0000000..749be5d --- /dev/null +++ b/ghc/rts/gmp/mpz/kronzu.c @@ -0,0 +1,66 @@ +/* mpz_kronecker_ui -- Kronecker/Jacobi symbol. */ + +/* +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. 
+ +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + + +/* This function is expected to be often used with b an odd prime, so the + code for odd b is nice and short. */ + +int +#if __STDC__ +mpz_kronecker_ui (mpz_srcptr a, unsigned long b) +#else +mpz_kronecker_ui (a, b) + mpz_srcptr a; + unsigned long b; +#endif +{ + int twos; + + if (b & 1) + { + if (b != 1) + return mpn_jacobi_base (mpz_fdiv_ui (a, b), b, 0); + else + return 1; /* (a/1)=1 for any a */ + } + + if (b == 0) + return JACOBI_Z0 (a); + + /* (a/2)=0 if a even */ + if (mpz_even_p (a)) + return 0; + + /* (a/2)=(2/a) when a odd */ + count_trailing_zeros (twos, b); + b >>= twos; + if (b == 1) + return JACOBI_TWOS_U (twos, PTR(a)[0]); + + return mpn_jacobi_base (mpz_fdiv_ui (a, b), b, + JACOBI_TWOS_U_BIT1(twos, PTR(a)[0])); +} diff --git a/ghc/rts/gmp/mpz/lcm.c b/ghc/rts/gmp/mpz/lcm.c new file mode 100644 index 0000000..ca8c15c --- /dev/null +++ b/ghc/rts/gmp/mpz/lcm.c @@ -0,0 +1,56 @@ +/* mpz/lcm.c: Calculate the least common multiple of two integers. + +Copyright (C) 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. 
+ +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +void *_mpz_realloc (); + +void +#if __STDC__ +mpz_lcm (mpz_ptr r, mpz_srcptr u, mpz_srcptr v) +#else +mpz_lcm (r, u, v) + mpz_ptr r; + mpz_srcptr u; + mpz_srcptr v; +#endif +{ + mpz_t g; + mp_size_t usize, vsize, size; + + usize = ABS (SIZ (u)); + vsize = ABS (SIZ (v)); + + if (usize == 0 || vsize == 0) + { + SIZ (r) = 0; + return; + } + + size = MAX (usize, vsize); + MPZ_TMP_INIT (g, size); + + mpz_gcd (g, u, v); + mpz_divexact (g, u, g); + mpz_mul (r, g, v); +} diff --git a/ghc/rts/gmp/mpz/legendre.c b/ghc/rts/gmp/mpz/legendre.c index 4de16a6..ab665f7 100644 --- a/ghc/rts/gmp/mpz/legendre.c +++ b/ghc/rts/gmp/mpz/legendre.c @@ -6,16 +6,16 @@ Copyright (C) 1992, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
-You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/mod.c b/ghc/rts/gmp/mpz/mod.c index b2b8b39..87033b3 100644 --- a/ghc/rts/gmp/mpz/mod.c +++ b/ghc/rts/gmp/mpz/mod.c @@ -5,16 +5,16 @@ Copyright (C) 1991, 1993, 1994, 1995, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/mul.c b/ghc/rts/gmp/mpz/mul.c index 47ce8e3..7854788 100644 --- a/ghc/rts/gmp/mpz/mul.c +++ b/ghc/rts/gmp/mpz/mul.c @@ -1,26 +1,30 @@ /* mpz_mul -- Multiply two integers. -Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1996, 2000 Free Software Foundation, Inc. 
This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ +#include /* for NULL */ #include "gmp.h" #include "gmp-impl.h" +#ifdef BERKELEY_MP +#include "mp.h" +#endif #ifndef BERKELEY_MP void diff --git a/ghc/rts/gmp/mpz/mul_2exp.c b/ghc/rts/gmp/mpz/mul_2exp.c index 4d66a98..abea5fe 100644 --- a/ghc/rts/gmp/mpz/mul_2exp.c +++ b/ghc/rts/gmp/mpz/mul_2exp.c @@ -5,16 +5,16 @@ Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. 
The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/mul_siui.c b/ghc/rts/gmp/mpz/mul_siui.c new file mode 100644 index 0000000..9849cd4 --- /dev/null +++ b/ghc/rts/gmp/mpz/mul_siui.c @@ -0,0 +1,81 @@ +/* mpz_mul_ui/si (product, multiplier, small_multiplicand) -- Set PRODUCT to + MULTIPLIER times SMALL_MULTIPLICAND. + +Copyright (C) 1991, 1993, 1994, 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA.
*/ + +#include "gmp.h" +#include "gmp-impl.h" + + +#ifdef OPERATION_mul_ui +#define FUNCTION mpz_mul_ui +#define MULTIPLICAND_UNSIGNED unsigned +#define MULTIPLICAND_ABS(x) x +#else +#ifdef OPERATION_mul_si +#define FUNCTION mpz_mul_si +#define MULTIPLICAND_UNSIGNED +#define MULTIPLICAND_ABS(x) ABS(x) +#else +Error, error, unrecognised OPERATION +#endif +#endif + + +void +#if __STDC__ +FUNCTION (mpz_ptr prod, mpz_srcptr mult, + MULTIPLICAND_UNSIGNED long int small_mult) +#else +FUNCTION (prod, mult, small_mult) + mpz_ptr prod; + mpz_srcptr mult; + MULTIPLICAND_UNSIGNED long int small_mult; +#endif +{ + mp_size_t size = mult->_mp_size; + mp_size_t sign_product = size; + mp_limb_t cy; + mp_size_t prod_size; + mp_ptr prod_ptr; + + if (size == 0 || small_mult == 0) + { + prod->_mp_size = 0; + return; + } + size = ABS (size); + + prod_size = size + 1; + if (prod->_mp_alloc < prod_size) + _mpz_realloc (prod, prod_size); + + prod_ptr = prod->_mp_d; + + cy = mpn_mul_1 (prod_ptr, mult->_mp_d, size, + (mp_limb_t) MULTIPLICAND_ABS (small_mult)); + if (cy != 0) + { + prod_ptr[size] = cy; + size++; + } + + prod->_mp_size = ((sign_product < 0) ^ (small_mult < 0)) ? -size : size; +} diff --git a/ghc/rts/gmp/mpz/neg.c b/ghc/rts/gmp/mpz/neg.c index 0b48e5c..566c3a9 100644 --- a/ghc/rts/gmp/mpz/neg.c +++ b/ghc/rts/gmp/mpz/neg.c @@ -5,16 +5,16 @@ Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. 
The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/nextprime.c b/ghc/rts/gmp/mpz/nextprime.c new file mode 100644 index 0000000..f024dd1 --- /dev/null +++ b/ghc/rts/gmp/mpz/nextprime.c @@ -0,0 +1,120 @@ +/* mpz_nextprime(p,t) - compute the next prime > t and store that in p. + +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +void +#if __STDC__ +mpz_nextprime (mpz_ptr p, mpz_srcptr t) +#else +mpz_nextprime (p, t) + mpz_ptr p; + mpz_srcptr t; +#endif +{ + mpz_add_ui (p, t, 1L); + while (! 
mpz_probab_prime_p (p, 5)) + mpz_add_ui (p, p, 1L); +} + +#if 0 +/* This code is not yet tested. Will be enabled in 3.1. */ + +status unsigned short primes[] = +{ +3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97, +101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181, +191,193,197,199,211,223,227,229,233,239,241,251,257,263,269,271,277, +281,283,293,307,311,313,317,331,337,347,349,353,359,367,373,379,383, +389,397,401,409,419,421,431,433,439,443,449,457,461,463,467,479,487, +491,499,503,509,521,523,541,547,557,563,569,571,577,587,593,599,601, +607,613,617,619,631,641,643,647,653,659,661,673,677,683,691,701,709, +719,727,733,739,743,751,757,761,769,773,787,797,809,811,821,823,827, +829,839,853,857,859,863,877,881,883,887,907,911,919,929,937,941,947, +953,967,971,977,983,991,997 +}; + +#define NUMBER_OF_PRIMES 167 + +void +#if __STDC__ +mpz_nextprime (mpz_ptr p, mpz_srcptr n) +#else +mpz_nextprime (p, n) + mpz_ptr p; + mpz_srcptr n; +#endif +{ + mpz_t tmp; + unsigned short *moduli; + unsigned long difference; + int i; + int composite; + + /* First handle tiny numbers */ + if (mpz_cmp_ui (n, 2) < 0) + { + mpz_set_ui (p, 2); + return; + } + mpz_add_ui (p, n, 1); + mpz_setbit (p, 0); + + if (mpz_cmp_ui (p, 7) <= 0) + return; + + prime_limit = NUMBER_OF_PRIMES - 1; + if (mpz_cmp_ui (p, primes[prime_limit]) <= 0) + /* Just use first three entries (3,5,7) of table for small numbers */ + prime_limit = 3; + if (prime_limit) + { + /* Compute residues modulo small odd primes */ + moduli = (unsigned short *) TMP_ALLOC (prime_limit * sizeof moduli[0]); + for (i = 0; i < prime_limit; i++) + moduli[i] = mpz_fdiv_ui (p, primes[i]); + } + for (difference = 0; ; difference += 2) + { + composite = 0; + + /* First check residues */ + for (i = 0; i < prime_limit; i++) + { + int acc, pr; + composite |= (moduli[i] == 0); + acc = moduli[i] + 2; + pr = primes[i]; + moduli[i] = acc >= pr ? 
acc - pr : acc; + } + if (composite) + continue; + + mpz_add_ui (p, p, difference); + difference = 0; + + /* Miller-Rabin test */ + if (mpz_millerrabin (p, 2)) + break; + } +} +#endif diff --git a/ghc/rts/gmp/mpz/out_raw.c b/ghc/rts/gmp/mpz/out_raw.c index 35d311b..6270947 100644 --- a/ghc/rts/gmp/mpz/out_raw.c +++ b/ghc/rts/gmp/mpz/out_raw.c @@ -6,16 +6,16 @@ Copyright (C) 1995 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/out_str.c b/ghc/rts/gmp/mpz/out_str.c index 909f533..bf971b0 100644 --- a/ghc/rts/gmp/mpz/out_str.c +++ b/ghc/rts/gmp/mpz/out_str.c @@ -6,16 +6,16 @@ Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/perfpow.c b/ghc/rts/gmp/mpz/perfpow.c new file mode 100644 index 0000000..e71670a --- /dev/null +++ b/ghc/rts/gmp/mpz/perfpow.c @@ -0,0 +1,272 @@ +/* mpz_perfect_power_p(arg) -- Return non-zero if ARG is a perfect power, + zero otherwise. + +Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. 
+ +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +/* + We are to determine if c is a perfect power, c = a ^ b. + Assume c is divisible by 2^n and that codd = c/2^n is odd. + Assume a is divisible by 2^m and that aodd = a/2^m is odd. + It is always true that m divides n. + + * If n is prime, either 1) a is 2*aodd and b = n + or 2) a = c and b = 1. + So for n prime, we readily have a solution. + * If n is factorable into the non-trivial factors p1,p2,... + Since m divides n, m has a subset of n's factors and b = n / m. + + BUG: Should handle negative numbers, since they can be odd perfect powers. +*/ + +/* This is a naive approach to recognizing perfect powers. + Many things can be improved. In particular, we should use p-adic + arithmetic for computing possible roots. */ + +#include <stdio.h> /* for NULL */ +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +static unsigned long int gcd _PROTO ((unsigned long int a, unsigned long int b)); +static int isprime _PROTO ((unsigned long int t)); + +static const unsigned short primes[] = +{ 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, + 59, 61, 67, 71, 73, 79, 83, 89, 97,101,103,107,109,113,127,131, + 137,139,149,151,157,163,167,173,179,181,191,193,197,199,211,223, + 227,229,233,239,241,251,257,263,269,271,277,281,283,293,307,311, + 313,317,331,337,347,349,353,359,367,373,379,383,389,397,401,409, + 419,421,431,433,439,443,449,457,461,463,467,479,487,491,499,503, + 509,521,523,541,547,557,563,569,571,577,587,593,599,601,607,613, + 617,619,631,641,643,647,653,659,661,673,677,683,691,701,709,719, + 727,733,739,743,751,757,761,769,773,787,797,809,811,821,823,827, + 829,839,853,857,859,863,877,881,883,887,907,911,919,929,937,941, + 947,953,967,971,977,983,991,997,0 +}; +#define SMALLEST_OMITTED_PRIME 1009 + +
+int +#if __STDC__ +mpz_perfect_power_p (mpz_srcptr u) +#else +mpz_perfect_power_p (u) + mpz_srcptr u; +#endif +{ + unsigned long int prime; + unsigned long int n, n2; + int i; + unsigned long int rem; + mpz_t u2, q; + int exact; + mp_size_t uns; + TMP_DECL (marker); + + if (mpz_cmp_ui (u, 1) <= 0) + return 0; + + n2 = mpz_scan1 (u, 0); + if (n2 == 1) + return 0; + + TMP_MARK (marker); + + uns = ABSIZ (u) - n2 / BITS_PER_MP_LIMB; + MPZ_TMP_INIT (q, uns); + MPZ_TMP_INIT (u2, uns); + + mpz_tdiv_q_2exp (u2, u, n2); + + if (isprime (n2)) + goto n2prime; + + for (i = 1; primes[i] != 0; i++) + { + prime = primes[i]; + rem = mpz_tdiv_ui (u2, prime); + if (rem == 0) /* divisible? */ + { + rem = mpz_tdiv_q_ui (q, u2, prime * prime); + if (rem != 0) + { + TMP_FREE (marker); + return 0; + } + mpz_swap (q, u2); + for (n = 2;;) + { + rem = mpz_tdiv_q_ui (q, u2, prime); + if (rem != 0) + break; + mpz_swap (q, u2); + n++; + } + + n2 = gcd (n2, n); + if (n2 == 1) + { + TMP_FREE (marker); + return 0; + } + + /* As soon as n2 becomes a prime number, stop factoring. + Either we have u=x^n2 or u is not a perfect power. */ + if (isprime (n2)) + goto n2prime; + } + } + + if (mpz_cmp_ui (u2, 1) == 0) + { + TMP_FREE (marker); + return 1; + } + + if (n2 == 0) + { + unsigned long int nth; + /* We did not find any factors above. We have to consider all values + of n. */ + for (nth = 2;; nth++) + { + if (! isprime (nth)) + continue; +#if 0 + exact = mpz_padic_root (q, u2, nth, PTH); + if (exact) +#endif + exact = mpz_root (q, u2, nth); + if (exact) + { + TMP_FREE (marker); + return 1; + } + if (mpz_cmp_ui (q, SMALLEST_OMITTED_PRIME) < 0) + { + TMP_FREE (marker); + return 0; + } + } + } + else + { + unsigned long int nth; + /* We found some factors above. We just need to consider values of n + that divide n2. */ + for (nth = 2; nth <= n2; nth++) + { + if (! 
isprime (nth)) + continue; + if (n2 % nth != 0) + continue; +#if 0 + exact = mpz_padic_root (q, u2, nth, PTH); + if (exact) +#endif + exact = mpz_root (q, u2, nth); + if (exact) + { + TMP_FREE (marker); + return 1; + } + if (mpz_cmp_ui (q, SMALLEST_OMITTED_PRIME) < 0) + { + TMP_FREE (marker); + return 0; + } + } + + TMP_FREE (marker); + return 0; + } + +n2prime: + exact = mpz_root (NULL, u2, n2); + TMP_FREE (marker); + return exact; +} + +static unsigned long int +#if __STDC__ +gcd (unsigned long int a, unsigned long int b) +#else +gcd (a, b) + unsigned long int a, b; +#endif +{ + int an2, bn2, n2; + + if (a == 0) + return b; + if (b == 0) + return a; + + count_trailing_zeros (an2, a); + a >>= an2; + + count_trailing_zeros (bn2, b); + b >>= bn2; + + n2 = MIN (an2, bn2); + + while (a != b) + { + if (a > b) + { + a -= b; + do + a >>= 1; + while ((a & 1) == 0); + } + else /* b > a. */ + { + b -= a; + do + b >>= 1; + while ((b & 1) == 0); + } + } + + return a << n2; +} + +static int +#if __STDC__ +isprime (unsigned long int t) +#else +isprime (t) + unsigned long int t; +#endif +{ + unsigned long int q, r, d; + + if (t < 3 || (t & 1) == 0) + return t == 2; + + for (d = 3, r = 1; r != 0; d += 2) + { + q = t / d; + r = t - q * d; + if (q < d) + return 1; + } + return 0; +} diff --git a/ghc/rts/gmp/mpz/perfsqr.c b/ghc/rts/gmp/mpz/perfsqr.c index cdf1b5a..92e8d08 100644 --- a/ghc/rts/gmp/mpz/perfsqr.c +++ b/ghc/rts/gmp/mpz/perfsqr.c @@ -1,21 +1,21 @@ /* mpz_perfect_square_p(arg) -- Return non-zero if ARG is a perfect square, zero otherwise. -Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1996, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -37,5 +37,9 @@ mpz_perfect_square_p (a) if (asize < 0) return 0; + /* Zero is a perfect square. */ + if (asize == 0) + return 1; + return mpn_perfect_square_p (a->_mp_d, asize); } diff --git a/ghc/rts/gmp/mpz/popcount.c b/ghc/rts/gmp/mpz/popcount.c index a979380..3105258 100644 --- a/ghc/rts/gmp/mpz/popcount.c +++ b/ghc/rts/gmp/mpz/popcount.c @@ -6,16 +6,16 @@ Copyright (C) 1994, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. 
The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/pow_ui.c b/ghc/rts/gmp/mpz/pow_ui.c index d8cf7a6..96ca114 100644 --- a/ghc/rts/gmp/mpz/pow_ui.c +++ b/ghc/rts/gmp/mpz/pow_ui.c @@ -1,20 +1,20 @@ /* mpz_pow_ui(res, base, exp) -- Set RES to BASE**EXP. -Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1996, 1997 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -49,7 +49,7 @@ rpow (b, e, r) #endif /* BERKELEY_MP */ { mp_ptr rp, bp, tp, xp; - mp_size_t rsize, bsize; + mp_size_t ralloc, rsize, bsize; int cnt, i; mp_limb_t blimb; TMP_DECL (marker); @@ -82,13 +82,13 @@ rpow (b, e, r) /* Estimate space requirements accurately. Using the code from the `else' path would over-estimate space requirements wildly. */ float lb = __mp_bases[blimb].chars_per_bit_exactly; - rsize = 2 + ((mp_size_t) (e / lb) / BITS_PER_MP_LIMB); + ralloc = 3 + ((mp_size_t) (e / lb) / BITS_PER_MP_LIMB); } else { /* Over-estimate space requirements somewhat. */ count_leading_zeros (cnt, blimb); - rsize = bsize * e - cnt * e / BITS_PER_MP_LIMB + 1; + ralloc = bsize * e - cnt * e / BITS_PER_MP_LIMB + 2; } TMP_MARK (marker); @@ -97,8 +97,8 @@ rpow (b, e, r) product for mpn_mul. (This scheme is used to fulfill the requirements of mpn_mul; that the product space may not be the same as any of the input operands.) */ - rp = (mp_ptr) TMP_ALLOC (rsize * BYTES_PER_MP_LIMB); - tp = (mp_ptr) TMP_ALLOC (rsize * BYTES_PER_MP_LIMB); + rp = (mp_ptr) TMP_ALLOC (ralloc * BYTES_PER_MP_LIMB); + tp = (mp_ptr) TMP_ALLOC (ralloc * BYTES_PER_MP_LIMB); MPN_COPY (rp, bp, bsize); rsize = bsize; diff --git a/ghc/rts/gmp/mpz/powm.c b/ghc/rts/gmp/mpz/powm.c index 5dcd1b1..e6af855 100644 --- a/ghc/rts/gmp/mpz/powm.c +++ b/ghc/rts/gmp/mpz/powm.c @@ -1,20 +1,21 @@ /* mpz_powm(res,base,exp,mod) -- Set RES to (base**exp) mod MOD. -Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation, Inc. +Contributed by Paul Zimmermann. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -22,255 +23,342 @@ MA 02111-1307, USA. 
*/ #include "gmp.h" #include "gmp-impl.h" #include "longlong.h" +#ifdef BERKELEY_MP +#include "mp.h" +#endif + + +/* set c <- (a*b)/R^n mod m c has to have at least (2n) allocated limbs */ +static void +#if __STDC__ +mpz_redc (mpz_ptr c, mpz_srcptr a, mpz_srcptr b, mpz_srcptr m, mp_limb_t Nprim) +#else +mpz_redc (c, a, b, m, Nprim) + mpz_ptr c; + mpz_srcptr a; + mpz_srcptr b; + mpz_srcptr m; + mp_limb_t Nprim; +#endif +{ + mp_ptr cp, mp = PTR (m); + mp_limb_t cy, cout = 0; + mp_limb_t q; + size_t j, n = ABSIZ (m); + + ASSERT (ALLOC (c) >= 2 * n); + + mpz_mul (c, a, b); + cp = PTR (c); + j = ABSIZ (c); + MPN_ZERO (cp + j, 2 * n - j); + for (j = 0; j < n; j++) + { + q = cp[0] * Nprim; + cy = mpn_addmul_1 (cp, mp, n, q); + cout += mpn_add_1 (cp + n, cp + n, n - j, cy); + cp++; + } + cp -= n; + if (cout) + { + cy = cout - mpn_sub_n (cp, cp + n, mp, n); + while (cy) + cy -= mpn_sub_n (cp, cp, mp, n); + } + else + MPN_COPY (cp, cp + n, n); + MPN_NORMALIZE (cp, n); + SIZ (c) = SIZ (c) < 0 ? -n : n; +} +/* average number of calls to redc for an exponent of n bits + with the sliding window algorithm of base 2^k: the optimal is + obtained for the value of k which minimizes 2^(k-1)+n/(k+1): + + n\k 4 5 6 7 8 + 128 156* 159 171 200 261 + 256 309 307* 316 343 403 + 512 617 607* 610 632 688 + 1024 1231 1204 1195* 1207 1256 + 2048 2461 2399 2366 2360* 2396 + 4096 4918 4787 4707 4665* 4670 +*/ + #ifndef BERKELEY_MP void #if __STDC__ -mpz_powm (mpz_ptr res, mpz_srcptr base, mpz_srcptr exp, mpz_srcptr mod) +mpz_powm (mpz_ptr res, mpz_srcptr base, mpz_srcptr e, mpz_srcptr mod) #else -mpz_powm (res, base, exp, mod) +mpz_powm (res, base, e, mod) mpz_ptr res; mpz_srcptr base; - mpz_srcptr exp; + mpz_srcptr e; mpz_srcptr mod; #endif #else /* BERKELEY_MP */ void #if __STDC__ -pow (mpz_srcptr base, mpz_srcptr exp, mpz_srcptr mod, mpz_ptr res) +pow (mpz_srcptr base, mpz_srcptr e, mpz_srcptr mod, mpz_ptr res) #else -pow (base, exp, mod, res) +pow (base, e, mod, res) mpz_srcptr base; - 
mpz_srcptr exp; + mpz_srcptr e; mpz_srcptr mod; mpz_ptr res; #endif #endif /* BERKELEY_MP */ { - mp_ptr rp, ep, mp, bp; - mp_size_t esize, msize, bsize, rsize; - mp_size_t size; - int mod_shift_cnt; - int negative_result; - mp_limb_t *free_me = NULL; - size_t free_me_size; - TMP_DECL (marker); - - esize = ABS (exp->_mp_size); - msize = ABS (mod->_mp_size); - size = 2 * msize; + mp_limb_t invm, *ep, c, mask; + mpz_t xx, *g; + mp_size_t n, i, K, j, l, k; + int sh; + int use_redc; + +#ifdef POWM_DEBUG + mpz_t exp; + mpz_init (exp); +#endif - rp = res->_mp_d; - ep = exp->_mp_d; + n = ABSIZ (mod); - if (msize == 0) - msize = 1 / msize; /* provoke a signal */ + if (n == 0) + DIVIDE_BY_ZERO; - if (esize == 0) + if (SIZ (e) == 0) { /* Exponent is zero, result is 1 mod MOD, i.e., 1 or 0 - depending on if MOD equals 1. */ - rp[0] = 1; - res->_mp_size = (msize == 1 && (mod->_mp_d)[0] == 1) ? 0 : 1; + depending on if MOD equals 1. */ + SIZ(res) = (ABSIZ (mod) == 1 && (PTR(mod))[0] == 1) ? 0 : 1; + PTR(res)[0] = 1; return; } - TMP_MARK (marker); - - /* Normalize MOD (i.e. make its most significant bit set) as required by - mpn_divmod. This will make the intermediate values in the calculation - slightly larger, but the correct result is obtained after a final - reduction using the original MOD value. */ + /* Use REDC instead of usual reduction for sizes < POWM_THRESHOLD. + In REDC each modular multiplication costs about 2*n^2 limbs operations, + whereas using usual reduction it costs 3*K(n), where K(n) is the cost of a + multiplication using Karatsuba, and a division is assumed to cost 2*K(n), + for example using Burnikel-Ziegler's algorithm. This gives a theoretical + threshold of a*KARATSUBA_SQR_THRESHOLD, with a=(3/2)^(1/(2-ln(3)/ln(2))) ~ + 2.66. */ + /* For now, also disable REDC when MOD is even, as the inverse can't + handle that. 
*/ + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD ((8 * KARATSUBA_SQR_THRESHOLD) / 3) +#endif - mp = (mp_ptr) TMP_ALLOC (msize * BYTES_PER_MP_LIMB); - count_leading_zeros (mod_shift_cnt, mod->_mp_d[msize - 1]); - if (mod_shift_cnt != 0) - mpn_lshift (mp, mod->_mp_d, msize, mod_shift_cnt); - else - MPN_COPY (mp, mod->_mp_d, msize); + use_redc = (n < POWM_THRESHOLD && PTR(mod)[0] % 2 != 0); + if (use_redc) + { + /* invm = -1/m mod 2^BITS_PER_MP_LIMB, must have m odd */ + modlimb_invert (invm, PTR(mod)[0]); + invm = -invm; + } - bsize = ABS (base->_mp_size); - if (bsize > msize) + /* determines optimal value of k */ + l = ABSIZ (e) * BITS_PER_MP_LIMB; /* number of bits of exponent */ + k = 1; + K = 2; + while (2 * l > K * (2 + k * (3 + k))) { - /* The base is larger than the module. Reduce it. */ + k++; + K *= 2; + } - /* Allocate (BSIZE + 1) with space for remainder and quotient. - (The quotient is (bsize - msize + 1) limbs.) */ - bp = (mp_ptr) TMP_ALLOC ((bsize + 1) * BYTES_PER_MP_LIMB); - MPN_COPY (bp, base->_mp_d, bsize); - /* We don't care about the quotient, store it above the remainder, - at BP + MSIZE. */ - mpn_divmod (bp + msize, bp, bsize, mp, msize); - bsize = msize; - /* Canonicalize the base, since we are going to multiply with it - quite a few times. */ - MPN_NORMALIZE (bp, bsize); + g = (mpz_t *) (*_mp_allocate_func) (K / 2 * sizeof (mpz_t)); + /* compute x*R^n where R=2^BITS_PER_MP_LIMB */ + mpz_init (g[0]); + if (use_redc) + { + mpz_mul_2exp (g[0], base, n * BITS_PER_MP_LIMB); + mpz_mod (g[0], g[0], mod); } else - bp = base->_mp_d; + mpz_mod (g[0], base, mod); - if (bsize == 0) + /* compute xx^g for odd g < 2^k */ + mpz_init (xx); + if (use_redc) { - res->_mp_size = 0; - TMP_FREE (marker); - return; + _mpz_realloc (xx, 2 * n); + mpz_redc (xx, g[0], g[0], mod, invm); /* xx = x^2*R^n */ } - - if (res->_mp_alloc < size) + else { - /* We have to allocate more space for RES. 
If any of the input - parameters are identical to RES, defer deallocation of the old - space. */ - - if (rp == ep || rp == mp || rp == bp) + mpz_mul (xx, g[0], g[0]); + mpz_mod (xx, xx, mod); + } + for (i = 1; i < K / 2; i++) + { + mpz_init (g[i]); + if (use_redc) { - free_me = rp; - free_me_size = res->_mp_alloc; + _mpz_realloc (g[i], 2 * n); + mpz_redc (g[i], g[i - 1], xx, mod, invm); /* g[i] = x^(2i+1)*R^n */ } else - (*_mp_free_func) (rp, res->_mp_alloc * BYTES_PER_MP_LIMB); - - rp = (mp_ptr) (*_mp_allocate_func) (size * BYTES_PER_MP_LIMB); - res->_mp_alloc = size; - res->_mp_d = rp; - } - else - { - /* Make BASE, EXP and MOD not overlap with RES. */ - if (rp == bp) { - /* RES and BASE are identical. Allocate temp. space for BASE. */ - bp = (mp_ptr) TMP_ALLOC (bsize * BYTES_PER_MP_LIMB); - MPN_COPY (bp, rp, bsize); + mpz_mul (g[i], g[i - 1], xx); + mpz_mod (g[i], g[i], mod); } - if (rp == ep) + } + + /* now starts the real stuff */ + mask = (mp_limb_t) ((1< 0) { - /* RES and EXP are identical. Allocate temp. space for EXP. */ - ep = (mp_ptr) TMP_ALLOC (esize * BYTES_PER_MP_LIMB); - MPN_COPY (ep, rp, esize); + i--; + c = (c << (-sh)) | (ep[i] >> (BITS_PER_MP_LIMB + sh)); + sh += BITS_PER_MP_LIMB; } - if (rp == mp) + } + else + c = c >> sh; +#ifdef POWM_DEBUG + printf ("-1/m mod 2^%u = %lu\n", BITS_PER_MP_LIMB, invm); + mpz_set_ui (exp, c); +#endif + j=0; + while (c % 2 == 0) + { + j++; + c = (c >> 1); + } + mpz_set (xx, g[c >> 1]); + while (j--) + { + if (use_redc) + mpz_redc (xx, xx, xx, mod, invm); + else { - /* RES and MOD are identical. Allocate temporary space for MOD. 
*/ - mp = (mp_ptr) TMP_ALLOC (msize * BYTES_PER_MP_LIMB); - MPN_COPY (mp, rp, msize); + mpz_mul (xx, xx, xx); + mpz_mod (xx, xx, mod); } } - MPN_COPY (rp, bp, bsize); - rsize = bsize; - - { - mp_size_t i; - mp_ptr xp = (mp_ptr) TMP_ALLOC (2 * (msize + 1) * BYTES_PER_MP_LIMB); - int c; - mp_limb_t e; - mp_limb_t carry_limb; - - negative_result = (ep[0] & 1) && base->_mp_size < 0; - - i = esize - 1; - e = ep[i]; - count_leading_zeros (c, e); - e = (e << c) << 1; /* shift the exp bits to the left, lose msb */ - c = BITS_PER_MP_LIMB - 1 - c; - - /* Main loop. - - Make the result be pointed to alternately by XP and RP. This - helps us avoid block copying, which would otherwise be necessary - with the overlap restrictions of mpn_divmod. With 50% probability - the result after this loop will be in the area originally pointed - by RP (==RES->_mp_d), and with 50% probability in the area originally - pointed to by XP. */ +#ifdef POWM_DEBUG + printf ("x^"); mpz_out_str (0, 10, exp); + printf ("*2^%u mod m = ", n * BITS_PER_MP_LIMB); mpz_out_str (0, 10, xx); + putchar ('\n'); +#endif - for (;;) - { - while (c != 0) - { - mp_ptr tp; - mp_size_t xsize; + while (i > 0 || sh > 0) + { + c = ep[i]; + sh -= k; + l = k; /* number of bits treated */ + if (sh < 0) + { + if (i > 0) + { + i--; + c = (c << (-sh)) | (ep[i] >> (BITS_PER_MP_LIMB + sh)); + sh += BITS_PER_MP_LIMB; + } + else + { + l += sh; /* may be less bits than k here */ + c = c & ((1<> sh; + c = c & mask; - mpn_mul_n (xp, rp, rp, rsize); - xsize = 2 * rsize; - if (xsize > msize) - { - mpn_divmod (xp + msize, xp, xsize, mp, msize); - xsize = msize; - } + /* this while loop implements the sliding window improvement */ + while ((c & (1 << (k - 1))) == 0 && (i > 0 || sh > 0)) + { + if (use_redc) mpz_redc (xx, xx, xx, mod, invm); + else + { + mpz_mul (xx, xx, xx); + mpz_mod (xx, xx, mod); + } + if (sh) + { + sh--; + c = (c<<1) + ((ep[i]>>sh) & 1); + } + else + { + i--; + sh = BITS_PER_MP_LIMB - 1; + c = (c<<1) + (ep[i]>>sh); + } 
+ } - tp = rp; rp = xp; xp = tp; - rsize = xsize; +#ifdef POWM_DEBUG + printf ("l=%u c=%lu\n", l, c); + mpz_mul_2exp (exp, exp, k); + mpz_add_ui (exp, exp, c); +#endif - if ((mp_limb_signed_t) e < 0) + /* now replace xx by xx^(2^k)*x^c */ + if (c != 0) + { + j = 0; + while (c % 2 == 0) + { + j++; + c = c >> 1; + } + /* c0 = c * 2^j, i.e. xx^(2^k)*x^c = (A^(2^(k - j))*c)^(2^j) */ + l -= j; + while (l--) + if (use_redc) mpz_redc (xx, xx, xx, mod, invm); + else { - mpn_mul (xp, rp, rsize, bp, bsize); - xsize = rsize + bsize; - if (xsize > msize) - { - mpn_divmod (xp + msize, xp, xsize, mp, msize); - xsize = msize; - } - - tp = rp; rp = xp; xp = tp; - rsize = xsize; + mpz_mul (xx, xx, xx); + mpz_mod (xx, xx, mod); } - e <<= 1; - c--; - } - - i--; - if (i < 0) - break; - e = ep[i]; - c = BITS_PER_MP_LIMB; - } - - /* We shifted MOD, the modulo reduction argument, left MOD_SHIFT_CNT - steps. Adjust the result by reducing it with the original MOD. - - Also make sure the result is put in RES->_mp_d (where it already - might be, see above). */ - - if (mod_shift_cnt != 0) - { - carry_limb = mpn_lshift (res->_mp_d, rp, rsize, mod_shift_cnt); - rp = res->_mp_d; - if (carry_limb != 0) - { - rp[rsize] = carry_limb; - rsize++; - } - } - else - { - MPN_COPY (res->_mp_d, rp, rsize); - rp = res->_mp_d; - } - - if (rsize >= msize) - { - mpn_divmod (rp + msize, rp, rsize, mp, msize); - rsize = msize; - } - - /* Remove any leading zero words from the result. 
*/ - if (mod_shift_cnt != 0) - mpn_rshift (rp, rp, rsize, mod_shift_cnt); - MPN_NORMALIZE (rp, rsize); - } + if (use_redc) + mpz_redc (xx, xx, g[c >> 1], mod, invm); + else + { + mpz_mul (xx, xx, g[c >> 1]); + mpz_mod (xx, xx, mod); + } + } + else + j = l; /* case c=0 */ + while (j--) + { + if (use_redc) + mpz_redc (xx, xx, xx, mod, invm); + else + { + mpz_mul (xx, xx, xx); + mpz_mod (xx, xx, mod); + } + } +#ifdef POWM_DEBUG + printf ("x^"); mpz_out_str (0, 10, exp); + printf ("*2^%u mod m = ", n * BITS_PER_MP_LIMB); mpz_out_str (0, 10, xx); + putchar ('\n'); +#endif + } - if (negative_result && rsize != 0) + /* now convert back xx to xx/R^n */ + if (use_redc) { - if (mod_shift_cnt != 0) - mpn_rshift (mp, mp, msize, mod_shift_cnt); - mpn_sub (rp, mp, msize, rp, rsize); - rsize = msize; - MPN_NORMALIZE (rp, rsize); + mpz_set_ui (g[0], 1); + mpz_redc (xx, xx, g[0], mod, invm); + if (mpz_cmp (xx, mod) >= 0) + mpz_sub (xx, xx, mod); } - res->_mp_size = rsize; + mpz_set (res, xx); - if (free_me != NULL) - (*_mp_free_func) (free_me, free_me_size * BYTES_PER_MP_LIMB); - TMP_FREE (marker); + mpz_clear (xx); + for (i = 0; i < K / 2; i++) + mpz_clear (g[i]); + (*_mp_free_func) (g, K / 2 * sizeof (mpz_t)); } diff --git a/ghc/rts/gmp/mpz/powm_ui.c b/ghc/rts/gmp/mpz/powm_ui.c index 596815a..00f70bd 100644 --- a/ghc/rts/gmp/mpz/powm_ui.c +++ b/ghc/rts/gmp/mpz/powm_ui.c @@ -1,24 +1,26 @@ /* mpz_powm_ui(res,base,exp,mod) -- Set RES to (base**exp) mod MOD. -Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation, +Inc. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ +#include <stdio.h> /* for NULL */ #include "gmp.h" #include "gmp-impl.h" #include "longlong.h" @@ -49,12 +51,14 @@ mpz_powm_ui (res, base, exp, mod) rp = res->_mp_d; if (msize == 0) - msize = 1 / msize; /* provoke a signal */ + DIVIDE_BY_ZERO; if (exp == 0) { - rp[0] = 1; + /* Exponent is zero, result is 1 mod MOD, i.e., 1 or 0 + depending on if MOD equals 1. */ res->_mp_size = (msize == 1 && (mod->_mp_d)[0] == 1) ? 
0 : 1; + rp[0] = 1; return; } @@ -166,6 +170,7 @@ mpz_powm_ui (res, base, exp, mod) mpn_mul_n (xp, rp, rp, rsize); xsize = 2 * rsize; + xsize -= xp[xsize - 1] == 0; if (xsize > msize) { mpn_divmod (xp + msize, xp, xsize, mp, msize); @@ -179,6 +184,7 @@ mpz_powm_ui (res, base, exp, mod) { mpn_mul (xp, rp, rsize, bp, bsize); xsize = rsize + bsize; + xsize -= xp[xsize - 1] == 0; if (xsize > msize) { mpn_divmod (xp + msize, xp, xsize, mp, msize); @@ -226,7 +232,15 @@ mpz_powm_ui (res, base, exp, mod) MPN_NORMALIZE (rp, rsize); } - res->_mp_size = negative_result == 0 ? rsize : -rsize; + if (negative_result && rsize != 0) + { + if (mod_shift_cnt != 0) + mpn_rshift (mp, mp, msize, mod_shift_cnt); + mpn_sub (rp, mp, msize, rp, rsize); + rsize = msize; + MPN_NORMALIZE (rp, rsize); + } + res->_mp_size = rsize; if (free_me != NULL) (*_mp_free_func) (free_me, free_me_size * BYTES_PER_MP_LIMB); diff --git a/ghc/rts/gmp/mpz/pprime_p.c b/ghc/rts/gmp/mpz/pprime_p.c index 494de14..0217d8f 100644 --- a/ghc/rts/gmp/mpz/pprime_p.c +++ b/ghc/rts/gmp/mpz/pprime_p.c @@ -2,114 +2,241 @@ An implementation of the probabilistic primality test found in Knuth's Seminumerical Algorithms book. If the function mpz_probab_prime_p() returns 0 then n is not prime. If it returns 1, then n is 'probably' - prime. The probability of a false positive is (1/4)**reps, where - reps is the number of internal passes of the probabilistic algorithm. - Knuth indicates that 25 passes are reasonable. + prime. If it returns 2, n is surely prime. The probability of a false + positive is (1/4)**reps, where reps is the number of internal passes of the + probabilistic algorithm. Knuth indicates that 25 passes are reasonable. -Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc. -Contributed by John Amanatides. +Copyright (C) 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software +Foundation, Inc. Miller-Rabin code contributed by John Amanatides. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" -static int -possibly_prime (n, n_minus_1, x, y, q, k) +static int isprime _PROTO ((unsigned long int t)); +static int mpz_millerrabin _PROTO ((mpz_srcptr n, int reps)); + +int +#if __STDC__ +mpz_probab_prime_p (mpz_srcptr n, int reps) +#else +mpz_probab_prime_p (n, reps) mpz_srcptr n; - mpz_srcptr n_minus_1; - mpz_ptr x; - mpz_ptr y; - mpz_srcptr q; - unsigned long int k; + int reps; +#endif { - unsigned long int i; + mp_limb_t r; - /* find random x s.t. 1 < x < n */ - do + /* Handle small and negative n. */ + if (mpz_cmp_ui (n, 1000000L) <= 0) { - mpz_random (x, mpz_size (n)); - mpz_mmod (x, x, n); + int is_prime; + if (mpz_sgn (n) < 0) + { + /* Negative number. Negate and call ourselves. 
*/ + mpz_t n2; + mpz_init (n2); + mpz_neg (n2, n); + is_prime = mpz_probab_prime_p (n2, reps); + mpz_clear (n2); + return is_prime; + } + is_prime = isprime (mpz_get_ui (n)); + return is_prime ? 2 : 0; } - while (mpz_cmp_ui (x, 1L) <= 0); - mpz_powm (y, x, q, n); + /* If n is now even, it is not a prime. */ + if ((mpz_get_ui (n) & 1) == 0) + return 0; + + /* Check if n has small factors. */ + if (UDIV_TIME > (2 * UMUL_TIME + 6)) + r = mpn_preinv_mod_1 (PTR(n), SIZ(n), (mp_limb_t) PP, (mp_limb_t) PP_INVERTED); + else + r = mpn_mod_1 (PTR(n), SIZ(n), (mp_limb_t) PP); + if (r % 3 == 0 || r % 5 == 0 || r % 7 == 0 || r % 11 == 0 || r % 13 == 0 + || r % 17 == 0 || r % 19 == 0 || r % 23 == 0 || r % 29 == 0 +#if BITS_PER_MP_LIMB == 64 + || r % 31 == 0 || r % 37 == 0 || r % 41 == 0 || r % 43 == 0 + || r % 47 == 0 || r % 53 == 0 +#endif + ) + { + return 0; + } - if (mpz_cmp_ui (y, 1L) == 0 || mpz_cmp (y, n_minus_1) == 0) - return 1; + /* Do more dividing. We collect small primes, using umul_ppmm, until we + overflow a single limb. We divide our number by the small primes product, + and look for factors in the remainder. */ + { + unsigned long int ln2; + unsigned long int q; + mp_limb_t p1, p0, p; + unsigned int primes[15]; + int nprimes; + + nprimes = 0; + p = 1; + ln2 = mpz_sizeinbase (n, 2) / 30; ln2 = ln2 * ln2; + for (q = BITS_PER_MP_LIMB == 64 ? 59 : 31; q < ln2; q += 2) + { + if (isprime (q)) + { + umul_ppmm (p1, p0, p, q); + if (p1 != 0) + { + r = mpn_mod_1 (PTR(n), SIZ(n), p); + while (--nprimes >= 0) + if (r % primes[nprimes] == 0) + { + if (mpn_mod_1 (PTR(n), SIZ(n), (mp_limb_t) primes[nprimes]) != 0) + abort (); + return 0; + } + p = q; + nprimes = 0; + } + else + { + p = p0; + } + primes[nprimes++] = q; + } + } + } + + /* Perform a number of Miller-Rabin tests. 
*/ + return mpz_millerrabin (n, reps); +} - for (i = 1; i < k; i++) +static int +#if __STDC__ +isprime (unsigned long int t) +#else +isprime (t) + unsigned long int t; +#endif +{ + unsigned long int q, r, d; + + if (t < 3 || (t & 1) == 0) + return t == 2; + + for (d = 3, r = 1; r != 0; d += 2) { - mpz_powm_ui (y, y, 2L, n); - if (mpz_cmp (y, n_minus_1) == 0) + q = t / d; + r = t - q * d; + if (q < d) return 1; - if (mpz_cmp_ui (y, 1L) == 0) - return 0; } return 0; } -int +static int millerrabin _PROTO ((mpz_srcptr n, mpz_srcptr nm1, + mpz_ptr x, mpz_ptr y, + mpz_srcptr q, unsigned long int k)); + +static int #if __STDC__ -mpz_probab_prime_p (mpz_srcptr m, int reps) +mpz_millerrabin (mpz_srcptr n, int reps) #else -mpz_probab_prime_p (m, reps) - mpz_srcptr m; +mpz_millerrabin (n, reps) + mpz_srcptr n; int reps; #endif { - mpz_t n, n_minus_1, x, y, q; - int i, is_prime; + int r; + mpz_t nm1, x, y, q; unsigned long int k; + gmp_randstate_t rstate; + int is_prime; + TMP_DECL (marker); + TMP_MARK (marker); - mpz_init (n); - /* Take the absolute value of M, to handle positive and negative primes. */ - mpz_abs (n, m); + MPZ_TMP_INIT (nm1, SIZ (n) + 1); + mpz_sub_ui (nm1, n, 1L); - if (mpz_cmp_ui (n, 3L) <= 0) - { - mpz_clear (n); - return mpz_cmp_ui (n, 1L) > 0; - } + MPZ_TMP_INIT (x, SIZ (n)); + MPZ_TMP_INIT (y, 2 * SIZ (n)); /* mpz_powm_ui needs excessive memory!!! */ - if ((mpz_get_ui (n) & 1) == 0) + /* Perform a Fermat test. */ + mpz_set_ui (x, 210L); + mpz_powm (y, x, nm1, n); + if (mpz_cmp_ui (y, 1L) != 0) { - mpz_clear (n); - return 0; /* even */ + return 0; + TMP_FREE (marker); } - mpz_init (n_minus_1); - mpz_sub_ui (n_minus_1, n, 1L); - mpz_init (x); - mpz_init (y); + MPZ_TMP_INIT (q, SIZ (n)); + + /* Find q and k, where q is odd and n = 1 + 2**k * q. */ + k = mpz_scan1 (nm1, 0L); + mpz_tdiv_q_2exp (q, nm1, k); - /* find q and k, s.t. 
n = 1 + 2**k * q */ - mpz_init_set (q, n_minus_1); - k = mpz_scan1 (q, 0); - mpz_tdiv_q_2exp (q, q, k); + gmp_randinit (rstate, GMP_RAND_ALG_DEFAULT, 32L); is_prime = 1; - for (i = 0; i < reps && is_prime; i++) - is_prime &= possibly_prime (n, n_minus_1, x, y, q, k); - - mpz_clear (n_minus_1); - mpz_clear (n); - mpz_clear (x); - mpz_clear (y); - mpz_clear (q); + for (r = 0; r < reps && is_prime; r++) + { + do + mpz_urandomb (x, rstate, mpz_sizeinbase (n, 2) - 1); + while (mpz_cmp_ui (x, 1L) <= 0); + + is_prime = millerrabin (n, nm1, x, y, q, k); + } + + gmp_randclear (rstate); + + TMP_FREE (marker); return is_prime; } + +static int +#if __STDC__ +millerrabin (mpz_srcptr n, mpz_srcptr nm1, mpz_ptr x, mpz_ptr y, + mpz_srcptr q, unsigned long int k) +#else +millerrabin (n, nm1, x, y, q, k) + mpz_srcptr n; + mpz_srcptr nm1; + mpz_ptr x; + mpz_ptr y; + mpz_srcptr q; + unsigned long int k; +#endif +{ + unsigned long int i; + + mpz_powm (y, x, q, n); + + if (mpz_cmp_ui (y, 1L) == 0 || mpz_cmp (y, nm1) == 0) + return 1; + + for (i = 1; i < k; i++) + { + mpz_powm_ui (y, y, 2L, n); + if (mpz_cmp (y, nm1) == 0) + return 1; + if (mpz_cmp_ui (y, 1L) == 0) + return 0; + } + return 0; +} diff --git a/ghc/rts/gmp/mpz/random.c b/ghc/rts/gmp/mpz/random.c index ab41eef..60d9113 100644 --- a/ghc/rts/gmp/mpz/random.c +++ b/ghc/rts/gmp/mpz/random.c @@ -6,16 +6,16 @@ Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. 
The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/random2.c b/ghc/rts/gmp/mpz/random2.c index 725a8b4..a90af11 100644 --- a/ghc/rts/gmp/mpz/random2.c +++ b/ghc/rts/gmp/mpz/random2.c @@ -7,16 +7,16 @@ Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ diff --git a/ghc/rts/gmp/mpz/realloc.c b/ghc/rts/gmp/mpz/realloc.c index 2c2a5da..0b9e447 100644 --- a/ghc/rts/gmp/mpz/realloc.c +++ b/ghc/rts/gmp/mpz/realloc.c @@ -5,16 +5,16 @@ Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/remove.c b/ghc/rts/gmp/mpz/remove.c new file mode 100644 index 0000000..bc6675f --- /dev/null +++ b/ghc/rts/gmp/mpz/remove.c @@ -0,0 +1,93 @@ +/* mpz_remove -- divide out a factor and return its multiplicity. + +Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +unsigned long int +#if __STDC__ +mpz_remove (mpz_ptr dest, mpz_srcptr src, mpz_srcptr f) +#else +mpz_remove (dest, src, f) + mpz_ptr dest; + mpz_srcptr src; + mpz_srcptr f; +#endif +{ + mpz_t fpow[40]; /* inexhaustible...until year 2020 or so */ + mpz_t x, rem; + unsigned long int pwr; + int p; + + if (mpz_cmp_ui (f, 1) <= 0 || mpz_sgn (src) == 0) + DIVIDE_BY_ZERO; + if (mpz_cmp_ui (f, 2) == 0) + { + unsigned long int s0; + s0 = mpz_scan1 (src, 0); + mpz_div_2exp (dest, src, s0); + return s0; + } + + /* We could perhaps compute mpz_scan1(src,0)/mpz_scan1(f,0). It is an + upper bound of the result we're seeking. We could also shift down the + operands so that they become odd, to make intermediate values smaller. */ + + mpz_init (rem); + mpz_init (x); + + pwr = 0; + mpz_init (fpow[0]); + mpz_set (fpow[0], f); + mpz_set (dest, src); + + /* Divide by f, f^2, ..., f^(2^k) until we get a remainder for f^(2^k). */ + for (p = 0;; p++) + { + mpz_tdiv_qr (x, rem, dest, fpow[p]); + if (SIZ (rem) != 0) + break; + mpz_init (fpow[p + 1]); + mpz_mul (fpow[p + 1], fpow[p], fpow[p]); + mpz_set (dest, x); + } + + pwr = (1 << p) - 1; + + mpz_clear (fpow[p]); + + /* Divide by f^(2^(k-1)), f^(2^(k-2)), ..., f for all divisors that give a + zero remainder. 
*/ + while (--p >= 0) + { + mpz_tdiv_qr (x, rem, dest, fpow[p]); + if (SIZ (rem) == 0) + { + pwr += 1 << p; + mpz_set (dest, x); + } + mpz_clear (fpow[p]); + } + + mpz_clear (x); + mpz_clear (rem); + return pwr; +} diff --git a/ghc/rts/gmp/mpz/root.c b/ghc/rts/gmp/mpz/root.c new file mode 100644 index 0000000..0920bf2 --- /dev/null +++ b/ghc/rts/gmp/mpz/root.c @@ -0,0 +1,183 @@ +/* mpz_root(root, u, nth) -- Set ROOT to floor(U^(1/nth)). + Return an indication if the result is exact. + +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +/* Naive implementation of nth root extraction. It would probably be a + better idea to use a division-free Newton iteration. It is insane + to use full precision from iteration 1. The mpz_scan1 trick compensates + to some extent. It would be natural to avoid representing the low zero + bits mpz_scan1 is counting, and at the same time call mpn directly. 
*/ + +#include <stdio.h> /* for NULL */ +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +int +#if __STDC__ +mpz_root (mpz_ptr r, mpz_srcptr c, unsigned long int nth) +#else +mpz_root (r, c, nth) + mpz_ptr r; + mpz_srcptr c; + unsigned long int nth; +#endif +{ + mpz_t x, t0, t1, t2; + __mpz_struct ccs, *cc = &ccs; + unsigned long int nbits; + int bit; + int exact; + int i; + unsigned long int lowz; + unsigned long int rl; + + /* even roots of negatives provoke an exception */ + if (mpz_sgn (c) < 0 && (nth & 1) == 0) + SQRT_OF_NEGATIVE; + + /* root extraction interpreted as c^(1/nth) means a zeroth root should + provoke a divide by zero, do this even if c==0 */ + if (nth == 0) + DIVIDE_BY_ZERO; + + if (mpz_sgn (c) == 0) + { + if (r != NULL) + mpz_set_ui (r, 0); + return 1; /* exact result */ + } + + PTR(cc) = PTR(c); + SIZ(cc) = ABSIZ(c); + + nbits = (mpz_sizeinbase (cc, 2) - 1) / nth; + if (nbits == 0) + { + if (r != NULL) + mpz_set_ui (r, 1); + if (mpz_sgn (c) < 0) + { + if (r != NULL) + SIZ(r) = -SIZ(r); + return mpz_cmp_si (c, -1L) == 0; + } + return mpz_cmp_ui (c, 1L) == 0; + } + + mpz_init (x); + mpz_init (t0); + mpz_init (t1); + mpz_init (t2); + + /* Create a one-bit approximation. */ + mpz_set_ui (x, 0); + mpz_setbit (x, nbits); + + /* Make the approximation better, one bit at a time. This odd-looking + termination criteria makes large nth get better initial approximation, + which avoids slow convergence for such values. */ + bit = nbits - 1; + for (i = 1; (nth >> i) != 0; i++) + { + mpz_setbit (x, bit); + mpz_tdiv_q_2exp (t0, x, bit); + mpz_pow_ui (t1, t0, nth); + mpz_mul_2exp (t1, t1, bit * nth); + if (mpz_cmp (cc, t1) < 0) + mpz_clrbit (x, bit); + + bit--; /* check/set next bit */ + if (bit < 0) + { + /* We're done. */ + mpz_pow_ui (t1, x, nth); + goto done; + } + } + mpz_setbit (x, bit); + mpz_set_ui (t2, 0); mpz_setbit (t2, bit); mpz_add (x, x, t2); + +#if DEBUG + /* Check that the starting approximation is >= than the root.
*/ + mpz_pow_ui (t1, x, nth); + if (mpz_cmp (cc, t1) >= 0) + abort (); +#endif + + mpz_add_ui (x, x, 1); + + /* Main loop */ + do + { + lowz = mpz_scan1 (x, 0); + mpz_tdiv_q_2exp (t0, x, lowz); + mpz_pow_ui (t1, t0, nth - 1); + mpz_mul_2exp (t1, t1, lowz * (nth - 1)); + mpz_tdiv_q (t2, cc, t1); + mpz_sub (t2, x, t2); + rl = mpz_tdiv_q_ui (t2, t2, nth); + mpz_sub (x, x, t2); + } + while (mpz_sgn (t2) != 0); + + /* If we got a non-zero remainder in the last division, we know our root + is too large. */ + mpz_sub_ui (x, x, (mp_limb_t) (rl != 0)); + + /* Adjustment loop. If we spend more care on rounding in the loop above, + we could probably get rid of this, or greatly simplify it. */ + { + int bad = 0; + lowz = mpz_scan1 (x, 0); + mpz_tdiv_q_2exp (t0, x, lowz); + mpz_pow_ui (t1, t0, nth); + mpz_mul_2exp (t1, t1, lowz * nth); + while (mpz_cmp (cc, t1) < 0) + { + bad++; + if (bad > 2) + abort (); /* abort if our root is far off */ + mpz_sub_ui (x, x, 1); + lowz = mpz_scan1 (x, 0); + mpz_tdiv_q_2exp (t0, x, lowz); + mpz_pow_ui (t1, t0, nth); + mpz_mul_2exp (t1, t1, lowz * nth); + } + } + + done: + exact = mpz_cmp (t1, cc) == 0; + + if (r != NULL) + { + mpz_set (r, x); + if (mpz_sgn (c) < 0) + SIZ(r) = -SIZ(r); + } + + mpz_clear (t2); + mpz_clear (t1); + mpz_clear (t0); + mpz_clear (x); + + return exact; +} diff --git a/ghc/rts/gmp/mpz/rrandomb.c b/ghc/rts/gmp/mpz/rrandomb.c new file mode 100644 index 0000000..7d78243 --- /dev/null +++ b/ghc/rts/gmp/mpz/rrandomb.c @@ -0,0 +1,117 @@ +/* mpz_rrandomb -- Generate a positive random mpz_t of specified bit size, with + long runs of consecutive ones and zeros in the binary representation. + Meant for testing of other MP routines. + +Copyright (C) 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +static void gmp_rrandomb _PROTO ((mp_ptr rp, gmp_randstate_t rstate, unsigned long int nbits)); + +void +#if __STDC__ +mpz_rrandomb (mpz_ptr x, gmp_randstate_t rstate, unsigned long int nbits) +#else +mpz_rrandomb (x, rstate, nbits) + mpz_ptr x; + gmp_randstate_t rstate; + unsigned long int nbits; +#endif +{ + mp_size_t nl = 0; + + if (nbits != 0) + { + mp_ptr xp; + nl = (nbits + BITS_PER_MP_LIMB - 1) / BITS_PER_MP_LIMB; + if (x->_mp_alloc < nl) + _mpz_realloc (x, nl); + + xp = PTR(x); + gmp_rrandomb (xp, rstate, nbits); + MPN_NORMALIZE (xp, nl); + } + + SIZ(x) = nl; +} + +#define BITS_PER_CHUNK 4 + +static void +#if __STDC__ +gmp_rrandomb (mp_ptr rp, gmp_randstate_t rstate, unsigned long int nbits) +#else +gmp_rrandomb (rp, rstate, nbits) + mp_ptr rp; + gmp_randstate_t rstate; + unsigned long int nbits; +#endif +{ + int nb; + int bit_pos; + mp_size_t limb_pos; + mp_limb_t ran, ranm; + mp_limb_t acc; + mp_size_t n; + + bit_pos = nbits % BITS_PER_MP_LIMB; + limb_pos = nbits / BITS_PER_MP_LIMB; + if (bit_pos == 0) + { + bit_pos = BITS_PER_MP_LIMB; + limb_pos--; + } + + acc = 0; + while (limb_pos >= 0) + { + _gmp_rand (&ranm, rstate, BITS_PER_CHUNK + 1); + ran = ranm; + nb = 
(ran >> 1) + 1; + if ((ran & 1) != 0) + { + /* Generate a string of ones. */ + if (nb > bit_pos) + { + rp[limb_pos--] = acc | ((((mp_limb_t) 1) << bit_pos) - 1); + bit_pos += BITS_PER_MP_LIMB; + bit_pos -= nb; + acc = (~(mp_limb_t) 0) << bit_pos; + } + else + { + bit_pos -= nb; + acc |= ((((mp_limb_t) 1) << nb) - 1) << bit_pos; + } + } + else + { + /* Generate a string of zeroes. */ + if (nb > bit_pos) + { + rp[limb_pos--] = acc; + acc = 0; + bit_pos += BITS_PER_MP_LIMB; + } + bit_pos -= nb; + } + } +} diff --git a/ghc/rts/gmp/mpz/scan0.c b/ghc/rts/gmp/mpz/scan0.c index 8e45aa3..6c59cf8 100644 --- a/ghc/rts/gmp/mpz/scan0.c +++ b/ghc/rts/gmp/mpz/scan0.c @@ -5,16 +5,16 @@ Copyright (C) 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ diff --git a/ghc/rts/gmp/mpz/scan1.c b/ghc/rts/gmp/mpz/scan1.c index 6ecb0aa..3b84e34 100644 --- a/ghc/rts/gmp/mpz/scan1.c +++ b/ghc/rts/gmp/mpz/scan1.c @@ -5,16 +5,16 @@ Copyright (C) 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/set.c b/ghc/rts/gmp/mpz/set.c index d94ab74..06b2eef 100644 --- a/ghc/rts/gmp/mpz/set.c +++ b/ghc/rts/gmp/mpz/set.c @@ -5,16 +5,16 @@ Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. 
The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/set_d.c b/ghc/rts/gmp/mpz/set_d.c index c09b1d9..e90ed9b 100644 --- a/ghc/rts/gmp/mpz/set_d.c +++ b/ghc/rts/gmp/mpz/set_d.c @@ -1,20 +1,20 @@ /* mpz_set_d(integer, val) -- Assign INTEGER with a double value VAL. -Copyright (C) 1995 Free Software Foundation, Inc. +Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -32,9 +32,9 @@ mpz_set_d (r, d) #endif { int negative; - mp_size_t size; mp_limb_t tp[3]; mp_ptr rp; + mp_size_t rn; negative = d < 0; d = ABS (d); @@ -49,19 +49,20 @@ mpz_set_d (r, d) return; } - size = __gmp_extract_double (tp, d); + rn = __gmp_extract_double (tp, d); - if (ALLOC(r) < size) - _mpz_realloc (r, size); + if (ALLOC(r) < rn) + _mpz_realloc (r, rn); rp = PTR (r); #if BITS_PER_MP_LIMB == 32 - switch (size) + switch (rn) { default: - MPN_ZERO (rp, size - 3); - rp += size - 3; + MPN_ZERO (rp, rn - 3); + rp += rn - 3; + /* fall through */ case 3: rp[2] = tp[2]; rp[1] = tp[1]; @@ -72,22 +73,24 @@ mpz_set_d (r, d) rp[0] = tp[1]; break; case 1: + /* handled in "small arguments" case above */ abort (); } #else - switch (size) + switch (rn) { default: - MPN_ZERO (rp, size - 2); - rp += size - 2; + MPN_ZERO (rp, rn - 2); + rp += rn - 2; + /* fall through */ case 2: - rp[1] = tp[1]; - rp[0] = tp[0]; + rp[1] = tp[1], rp[0] = tp[0]; break; case 1: + /* handled in "small arguments" case above */ abort (); } #endif - SIZ(r) = negative ? -size : size; + SIZ(r) = negative ? -rn : rn; } diff --git a/ghc/rts/gmp/mpz/set_f.c b/ghc/rts/gmp/mpz/set_f.c index 9547907..2273953 100644 --- a/ghc/rts/gmp/mpz/set_f.c +++ b/ghc/rts/gmp/mpz/set_f.c @@ -5,16 +5,16 @@ Copyright (C) 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/set_q.c b/ghc/rts/gmp/mpz/set_q.c index 61bd5c7..72d3222 100644 --- a/ghc/rts/gmp/mpz/set_q.c +++ b/ghc/rts/gmp/mpz/set_q.c @@ -6,16 +6,16 @@ Copyright (C) 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ diff --git a/ghc/rts/gmp/mpz/set_si.c b/ghc/rts/gmp/mpz/set_si.c index 82a90a3..9ba2fba 100644 --- a/ghc/rts/gmp/mpz/set_si.c +++ b/ghc/rts/gmp/mpz/set_si.c @@ -1,20 +1,20 @@ /* mpz_set_si(integer, val) -- Assign INTEGER with a small value VAL. -Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1995, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -40,7 +40,7 @@ mpz_set_si (dest, val) } else if (val < 0) { - dest->_mp_d[0] = -val; + dest->_mp_d[0] = (unsigned long) -val; dest->_mp_size = -1; } else diff --git a/ghc/rts/gmp/mpz/set_str.c b/ghc/rts/gmp/mpz/set_str.c index d1334b1..3ab79c0 100644 --- a/ghc/rts/gmp/mpz/set_str.c +++ b/ghc/rts/gmp/mpz/set_str.c @@ -4,34 +4,40 @@ the base in the C standard way, i.e. 0xhh...h means base 16, 0oo...o means base 8, otherwise assume base 10. -Copyright (C) 1991, 1993, 1994, Free Software Foundation, Inc. 
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 1998, 2000 Free Software +Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ +#include <string.h> #include <ctype.h> #include "gmp.h" #include "gmp-impl.h" #include "longlong.h" static int +#if __STDC__ +digit_value_in_base (int c, int base) +#else digit_value_in_base (c, base) int c; int base; +#endif { int digit; @@ -96,13 +102,30 @@ mpz_set_str (x, str, base) base = 16; c = *str++; } + else if (c == 'b' || c == 'B') + { + base = 2; + c = *str++; + } } } + /* Skip leading zeros. */ + while (c == '0') + c = *str++; + /* Make sure the string does not become empty, mpn_set_str would fail. */ + if (c == 0) + { + x->_mp_size = 0; + return 0; + } + TMP_MARK (marker); str_size = strlen (str - 1); s = begs = (char *) TMP_ALLOC (str_size + 1); + /* Remove spaces from the string and convert the result from ASCII to a + byte array.
*/ for (i = 0; i < str_size; i++) { if (!isspace (c)) @@ -120,10 +143,12 @@ mpz_set_str (x, str, base) str_size = s - begs; - xsize = str_size / __mp_bases[base].chars_per_limb + 1; + xsize = (((mp_size_t) (str_size / __mp_bases[base].chars_per_bit_exactly)) + / BITS_PER_MP_LIMB + 2); if (x->_mp_alloc < xsize) _mpz_realloc (x, xsize); + /* Convert the byte array in base BASE to our bignum format. */ xsize = mpn_set_str (x->_mp_d, (unsigned char *) begs, str_size, base); x->_mp_size = negative ? -xsize : xsize; diff --git a/ghc/rts/gmp/mpz/set_ui.c b/ghc/rts/gmp/mpz/set_ui.c index 73f6cf5..d6097c1 100644 --- a/ghc/rts/gmp/mpz/set_ui.c +++ b/ghc/rts/gmp/mpz/set_ui.c @@ -5,16 +5,16 @@ Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ diff --git a/ghc/rts/gmp/mpz/setbit.c b/ghc/rts/gmp/mpz/setbit.c index af59e2c..d4249a4 100644 --- a/ghc/rts/gmp/mpz/setbit.c +++ b/ghc/rts/gmp/mpz/setbit.c @@ -1,20 +1,21 @@ /* mpz_setbit -- set a specified bit. -Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1995, 1997, 1999 Free Software Foundation, +Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -63,7 +64,7 @@ mpz_setbit (d, bit_index) /* Simulate two's complement arithmetic, i.e. simulate 1. Set OP = ~(OP - 1) [with infinitely many leading ones]. - 2. set the bit. + 2. Set the bit. 3. Set OP = ~OP + 1. 
*/ dsize = -dsize; @@ -108,6 +109,11 @@ mpz_setbit (d, bit_index) } } else - ; + { + mpn_decr_u (dp + limb_index, + (mp_limb_t) 1 << (bit_index % BITS_PER_MP_LIMB)); + dsize -= dp[dsize - 1] == 0; + d->_mp_size = -dsize; + } } } diff --git a/ghc/rts/gmp/mpz/size.c b/ghc/rts/gmp/mpz/size.c index 0b09fbe..6574756 100644 --- a/ghc/rts/gmp/mpz/size.c +++ b/ghc/rts/gmp/mpz/size.c @@ -6,16 +6,16 @@ Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/sizeinbase.c b/ghc/rts/gmp/mpz/sizeinbase.c index 51bd555..734f9c4 100644 --- a/ghc/rts/gmp/mpz/sizeinbase.c +++ b/ghc/rts/gmp/mpz/sizeinbase.c @@ -7,16 +7,16 @@ Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/sqrt.c b/ghc/rts/gmp/mpz/sqrt.c index 44c554e..fe82fe4 100644 --- a/ghc/rts/gmp/mpz/sqrt.c +++ b/ghc/rts/gmp/mpz/sqrt.c @@ -1,24 +1,25 @@ /* mpz_sqrt(root, u) -- Set ROOT to floor(sqrt(U)). -Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1996, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. 
The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ +#include /* for NULL */ #include "gmp.h" #include "gmp-impl.h" @@ -40,7 +41,7 @@ mpz_sqrt (root, op) TMP_MARK (marker); op_size = op->_mp_size; if (op_size < 0) - op_size = 1 / (op_size > 0); /* Divide by zero for negative OP. */ + SQRT_OF_NEGATIVE; /* The size of the root is accurate after this simple calculation. */ root_size = (op_size + 1) / 2; diff --git a/ghc/rts/gmp/mpz/sqrtrem.c b/ghc/rts/gmp/mpz/sqrtrem.c index 757cc5d..99a6453 100644 --- a/ghc/rts/gmp/mpz/sqrtrem.c +++ b/ghc/rts/gmp/mpz/sqrtrem.c @@ -1,27 +1,31 @@ /* mpz_sqrtrem(root,rem,x) -- Set ROOT to floor(sqrt(X)) and REM to the remainder, i.e. X - ROOT**2. -Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1996, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. 
The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ +#include /* for NULL */ #include "gmp.h" #include "gmp-impl.h" +#ifdef BERKELEY_MP +#include "mp.h" +#endif #ifndef BERKELEY_MP void @@ -54,7 +58,7 @@ msqrt (op, root, rem) TMP_MARK (marker); op_size = op->_mp_size; if (op_size < 0) - op_size = 1 / (op_size > 0); /* Divide by zero for negative OP. */ + SQRT_OF_NEGATIVE; if (rem->_mp_alloc < op_size) _mpz_realloc (rem, op_size); diff --git a/ghc/rts/gmp/mpz/sub.c b/ghc/rts/gmp/mpz/sub.c index 56ef1a1..f3ae7c2 100644 --- a/ghc/rts/gmp/mpz/sub.c +++ b/ghc/rts/gmp/mpz/sub.c @@ -1,26 +1,29 @@ /* mpz_sub -- Subtract two integers. -Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1996, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. 
The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include "gmp.h" #include "gmp-impl.h" +#ifdef BERKELEY_MP +#include "mp.h" +#endif #ifndef BERKELEY_MP void @@ -58,9 +61,9 @@ msub (u, v, w) if (abs_usize < abs_vsize) { /* Swap U and V. */ - {const __mpz_struct *t = u; u = v; v = t;} - {mp_size_t t = usize; usize = vsize; vsize = t;} - {mp_size_t t = abs_usize; abs_usize = abs_vsize; abs_vsize = t;} + MPZ_SRCPTR_SWAP (u, v); + MP_SIZE_T_SWAP (usize, vsize); + MP_SIZE_T_SWAP (abs_usize, abs_vsize); } /* True: ABS_USIZE >= ABS_VSIZE. */ diff --git a/ghc/rts/gmp/mpz/sub_ui.c b/ghc/rts/gmp/mpz/sub_ui.c index 7dea4b6..327add8 100644 --- a/ghc/rts/gmp/mpz/sub_ui.c +++ b/ghc/rts/gmp/mpz/sub_ui.c @@ -1,20 +1,20 @@ /* mpz_sub_ui -- Subtract an unsigned one-word integer from an MP_INT. -Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1996, 1999 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. 
The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -59,7 +59,7 @@ mpz_sub_ui (w, u, v) if (usize < 0) { mp_limb_t cy; - cy = mpn_add_1 (wp, up, abs_usize, v); + cy = mpn_add_1 (wp, up, abs_usize, (mp_limb_t) v); wp[abs_usize] = cy; wsize = -(abs_usize + cy); } @@ -74,7 +74,7 @@ mpz_sub_ui (w, u, v) } else { - mpn_sub_1 (wp, up, abs_usize, v); + mpn_sub_1 (wp, up, abs_usize, (mp_limb_t) v); /* Size can decrease with at most one limb. */ wsize = abs_usize - (wp[abs_usize - 1] == 0); } diff --git a/ghc/rts/gmp/mpz/swap.c b/ghc/rts/gmp/mpz/swap.c new file mode 100644 index 0000000..0070d6f --- /dev/null +++ b/ghc/rts/gmp/mpz/swap.c @@ -0,0 +1,52 @@ +/* mpz_swap (dest_integer, src_integer) -- Swap U and V. + +Copyright (C) 1997, 1998 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. 
+ +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +void +#if __STDC__ +mpz_swap (mpz_ptr u, mpz_ptr v) +#else +mpz_swap (u, v) + mpz_ptr u; + mpz_ptr v; +#endif +{ + mp_ptr up, vp; + mp_size_t usize, vsize; + mp_size_t ualloc, valloc; + + ualloc = u->_mp_alloc; + valloc = v->_mp_alloc; + v->_mp_alloc = ualloc; + u->_mp_alloc = valloc; + + usize = u->_mp_size; + vsize = v->_mp_size; + v->_mp_size = usize; + u->_mp_size = vsize; + + up = u->_mp_d; + vp = v->_mp_d; + v->_mp_d = up; + u->_mp_d = vp; +} diff --git a/ghc/rts/gmp/mpz/tdiv_q.c b/ghc/rts/gmp/mpz/tdiv_q.c index b4d3636..21db4ab 100644 --- a/ghc/rts/gmp/mpz/tdiv_q.c +++ b/ghc/rts/gmp/mpz/tdiv_q.c @@ -1,20 +1,20 @@ /* mpz_tdiv_q -- divide two integers and produce a quotient. -Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1996, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
-You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -33,101 +33,59 @@ mpz_tdiv_q (quot, num, den) mpz_srcptr den; #endif { - mp_srcptr np, dp; - mp_ptr qp, rp; - mp_size_t nsize = num->_mp_size; - mp_size_t dsize = den->_mp_size; - mp_size_t qsize, rsize; - mp_size_t sign_quotient = nsize ^ dsize; - unsigned normalization_steps; - mp_limb_t q_limb; + mp_size_t ql; + mp_size_t ns, ds, nl, dl; + mp_ptr np, dp, qp, rp; TMP_DECL (marker); - nsize = ABS (nsize); - dsize = ABS (dsize); + ns = SIZ (num); + ds = SIZ (den); + nl = ABS (ns); + dl = ABS (ds); + ql = nl - dl + 1; - /* Ensure space is enough for quotient. */ + if (dl == 0) + DIVIDE_BY_ZERO; - qsize = nsize - dsize + 1; /* qsize cannot be bigger than this. */ - if (qsize <= 0) + if (ql <= 0) { - quot->_mp_size = 0; + SIZ (quot) = 0; return; } - if (quot->_mp_alloc < qsize) - _mpz_realloc (quot, qsize); - - qp = quot->_mp_d; - np = num->_mp_d; - dp = den->_mp_d; - - /* Optimize division by a single-limb divisor. */ - if (dsize == 1) - { - mpn_divmod_1 (qp, np, nsize, dp[0]); - qsize -= qp[qsize - 1] == 0; - quot->_mp_size = sign_quotient >= 0 ? qsize : -qsize; - return; - } + MPZ_REALLOC (quot, ql); TMP_MARK (marker); + qp = PTR (quot); + rp = (mp_ptr) TMP_ALLOC (dl * BYTES_PER_MP_LIMB); + np = PTR (num); + dp = PTR (den); - rp = (mp_ptr) TMP_ALLOC ((nsize + 1) * BYTES_PER_MP_LIMB); - - count_leading_zeros (normalization_steps, dp[dsize - 1]); + /* FIXME: We should think about how to handle the temporary allocation. + Perhaps mpn_tdiv_qr should handle it, since it anyway often needs to + allocate temp space. */ - /* Normalize the denominator, i.e. make its most significant bit set by - shifting it NORMALIZATION_STEPS bits to the left. 
Also shift the - numerator the same number of steps (to keep the quotient the same!). */ - if (normalization_steps != 0) + /* Copy denominator to temporary space if it overlaps with the quotient. */ + if (dp == qp) { mp_ptr tp; - mp_limb_t nlimb; - - /* Shift up the denominator setting the most significant bit of - the most significant word. Use temporary storage not to clobber - the original contents of the denominator. */ - tp = (mp_ptr) TMP_ALLOC (dsize * BYTES_PER_MP_LIMB); - mpn_lshift (tp, dp, dsize, normalization_steps); + tp = (mp_ptr) TMP_ALLOC (dl * BYTES_PER_MP_LIMB); + MPN_COPY (tp, dp, dl); dp = tp; - - /* Shift up the numerator, possibly introducing a new most - significant word. Move the shifted numerator in the remainder - meanwhile. */ - nlimb = mpn_lshift (rp, np, nsize, normalization_steps); - if (nlimb != 0) - { - rp[nsize] = nlimb; - rsize = nsize + 1; - } - else - rsize = nsize; } - else + /* Copy numerator to temporary space if it overlaps with the quotient. */ + if (np == qp) { - /* The denominator is already normalized, as required. Copy it to - temporary space if it overlaps with the quotient. */ - if (dp == qp) - { - dp = (mp_ptr) TMP_ALLOC (dsize * BYTES_PER_MP_LIMB); - MPN_COPY ((mp_ptr) dp, qp, dsize); - } - - /* Move the numerator to the remainder. */ - MPN_COPY (rp, np, nsize); - rsize = nsize; + mp_ptr tp; + tp = (mp_ptr) TMP_ALLOC (nl * BYTES_PER_MP_LIMB); + MPN_COPY (tp, np, nl); + np = tp; } - q_limb = mpn_divmod (qp, rp, rsize, dp, dsize); + mpn_tdiv_qr (qp, rp, 0L, np, nl, dp, dl); - qsize = rsize - dsize; - if (q_limb) - { - qp[qsize] = q_limb; - qsize += 1; - } + ql -= qp[ql - 1] == 0; - quot->_mp_size = sign_quotient >= 0 ? qsize : -qsize; + SIZ (quot) = (ns ^ ds) >= 0 ? 
ql : -ql; TMP_FREE (marker); } diff --git a/ghc/rts/gmp/mpz/tdiv_q_2exp.c b/ghc/rts/gmp/mpz/tdiv_q_2exp.c index e70d810..03d1e01 100644 --- a/ghc/rts/gmp/mpz/tdiv_q_2exp.c +++ b/ghc/rts/gmp/mpz/tdiv_q_2exp.c @@ -6,16 +6,16 @@ Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ diff --git a/ghc/rts/gmp/mpz/tdiv_q_ui.c b/ghc/rts/gmp/mpz/tdiv_q_ui.c index 9048e0a..a2e3462 100644 --- a/ghc/rts/gmp/mpz/tdiv_q_ui.c +++ b/ghc/rts/gmp/mpz/tdiv_q_ui.c @@ -1,21 +1,21 @@ /* mpz_tdiv_q_ui(quot, dividend, divisor_limb) -- Divide DIVIDEND by DIVISOR_LIMB and store the result in QUOT. -Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1996, 1998 Free Software Foundation, Inc. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -23,7 +23,7 @@ MA 02111-1307, USA. */ #include "gmp.h" #include "gmp-impl.h" -void +unsigned long int #if __STDC__ mpz_tdiv_q_ui (mpz_ptr quot, mpz_srcptr dividend, unsigned long int divisor) #else @@ -36,16 +36,14 @@ mpz_tdiv_q_ui (quot, dividend, divisor) mp_size_t dividend_size; mp_size_t size; mp_ptr quot_ptr; + mp_limb_t remainder_limb; + + if (divisor == 0) + DIVIDE_BY_ZERO; dividend_size = dividend->_mp_size; size = ABS (dividend_size); - if (size == 0) - { - quot->_mp_size = 0; - return; - } - /* No need for temporary allocation and copying if QUOT == DIVIDEND as the divisor is just one limb, and thus no intermediate remainders need to be stored. 
*/ @@ -55,9 +53,12 @@ mpz_tdiv_q_ui (quot, dividend, divisor) quot_ptr = quot->_mp_d; - mpn_divmod_1 (quot_ptr, dividend->_mp_d, size, (mp_limb_t) divisor); + remainder_limb + = mpn_divmod_1 (quot_ptr, dividend->_mp_d, size, (mp_limb_t) divisor); /* The quotient is SIZE limbs, but the most significant might be zero. */ - size -= quot_ptr[size - 1] == 0; + size -= size != 0 && quot_ptr[size - 1] == 0; quot->_mp_size = dividend_size >= 0 ? size : -size; + + return remainder_limb; } diff --git a/ghc/rts/gmp/mpz/tdiv_qr.c b/ghc/rts/gmp/mpz/tdiv_qr.c index 500e199..d66f57d 100644 --- a/ghc/rts/gmp/mpz/tdiv_qr.c +++ b/ghc/rts/gmp/mpz/tdiv_qr.c @@ -1,21 +1,21 @@ /* mpz_tdiv_qr(quot,rem,dividend,divisor) -- Set QUOT to DIVIDEND/DIVISOR, and REM to DIVIDEND mod DIVISOR. -Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ @@ -23,6 +23,12 @@ MA 02111-1307, USA. */ #include "gmp.h" #include "gmp-impl.h" #include "longlong.h" +#ifdef BERKELEY_MP +#include "mp.h" +#endif + + +#ifndef BERKELEY_MP void #if __STDC__ @@ -35,5 +41,90 @@ mpz_tdiv_qr (quot, rem, num, den) mpz_srcptr den; #endif -#define COMPUTE_QUOTIENT -#include "dmincl.c" +#else /* BERKELEY_MP */ + +void +#if __STDC__ +mdiv (mpz_srcptr num, mpz_srcptr den, mpz_ptr quot, mpz_ptr rem) +#else +mdiv (num, den, quot, rem) + mpz_srcptr num; + mpz_srcptr den; + mpz_ptr quot; + mpz_ptr rem; +#endif + +#endif /* BERKELEY_MP */ +{ + mp_size_t ql; + mp_size_t ns, ds, nl, dl; + mp_ptr np, dp, qp, rp; + TMP_DECL (marker); + + ns = SIZ (num); + ds = SIZ (den); + nl = ABS (ns); + dl = ABS (ds); + ql = nl - dl + 1; + + if (dl == 0) + DIVIDE_BY_ZERO; + + MPZ_REALLOC (rem, dl); + + if (ql <= 0) + { + if (num != rem) + { + mp_ptr np, rp; + np = PTR (num); + rp = PTR (rem); + MPN_COPY (rp, np, nl); + SIZ (rem) = SIZ (num); + } + /* This needs to follow the assignment to rem, in case the + numerator and quotient are the same. */ + SIZ (quot) = 0; + return; + } + + MPZ_REALLOC (quot, ql); + + TMP_MARK (marker); + qp = PTR (quot); + rp = PTR (rem); + np = PTR (num); + dp = PTR (den); + + /* FIXME: We should think about how to handle the temporary allocation. + Perhaps mpn_tdiv_qr should handle it, since it anyway often needs to + allocate temp space. */ + + /* Copy denominator to temporary space if it overlaps with the quotient + or remainder. */ + if (dp == rp || dp == qp) + { + mp_ptr tp; + tp = (mp_ptr) TMP_ALLOC (dl * BYTES_PER_MP_LIMB); + MPN_COPY (tp, dp, dl); + dp = tp; + } + /* Copy numerator to temporary space if it overlaps with the quotient or + remainder. */ + if (np == rp || np == qp) + { + mp_ptr tp; + tp = (mp_ptr) TMP_ALLOC (nl * BYTES_PER_MP_LIMB); + MPN_COPY (tp, np, nl); + np = tp; + } + + mpn_tdiv_qr (qp, rp, 0L, np, nl, dp, dl); + + ql -= qp[ql - 1] == 0; + MPN_NORMALIZE (rp, dl); + + SIZ (quot) = (ns ^ ds) >= 0 ? 
ql : -ql; + SIZ (rem) = ns >= 0 ? dl : -dl; + TMP_FREE (marker); +} diff --git a/ghc/rts/gmp/mpz/tdiv_qr_ui.c b/ghc/rts/gmp/mpz/tdiv_qr_ui.c index cb5041c..10368cd 100644 --- a/ghc/rts/gmp/mpz/tdiv_qr_ui.c +++ b/ghc/rts/gmp/mpz/tdiv_qr_ui.c @@ -2,21 +2,21 @@ Set QUOT to DIVIDEND / SHORT_DIVISOR and REM to DIVIDEND mod SHORT_DIVISOR. -Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1996, 1998 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -24,7 +24,7 @@ MA 02111-1307, USA. 
*/ #include "gmp.h" #include "gmp-impl.h" -void +unsigned long int #if __STDC__ mpz_tdiv_qr_ui (mpz_ptr quot, mpz_ptr rem, mpz_srcptr dividend, unsigned long int divisor) #else @@ -40,16 +40,12 @@ mpz_tdiv_qr_ui (quot, rem, dividend, divisor) mp_ptr quot_ptr; mp_limb_t remainder_limb; + if (divisor == 0) + DIVIDE_BY_ZERO; + dividend_size = dividend->_mp_size; size = ABS (dividend_size); - if (size == 0) - { - quot->_mp_size = 0; - rem->_mp_size = 0; - return; - } - /* No need for temporary allocation and copying if QUOT == DIVIDEND as the divisor is just one limb, and thus no intermediate remainders need to be stored. */ @@ -60,7 +56,7 @@ mpz_tdiv_qr_ui (quot, rem, dividend, divisor) quot_ptr = quot->_mp_d; remainder_limb = mpn_divmod_1 (quot_ptr, dividend->_mp_d, size, - (mp_limb_t) divisor); + (mp_limb_t) divisor); if (remainder_limb == 0) rem->_mp_size = 0; @@ -73,6 +69,8 @@ mpz_tdiv_qr_ui (quot, rem, dividend, divisor) } /* The quotient is SIZE limbs, but the most significant might be zero. */ - size -= quot_ptr[size - 1] == 0; + size -= size != 0 && quot_ptr[size - 1] == 0; quot->_mp_size = dividend_size >= 0 ? size : -size; + + return remainder_limb; } diff --git a/ghc/rts/gmp/mpz/tdiv_r.c b/ghc/rts/gmp/mpz/tdiv_r.c index 813a0d4..9eb87df 100644 --- a/ghc/rts/gmp/mpz/tdiv_r.c +++ b/ghc/rts/gmp/mpz/tdiv_r.c @@ -1,20 +1,20 @@ /* mpz_tdiv_r(rem, dividend, divisor) -- Set REM to DIVIDEND mod DIVISOR. -Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. 
The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -32,6 +32,67 @@ mpz_tdiv_r (rem, num, den) mpz_srcptr num; mpz_srcptr den; #endif +{ + mp_size_t ql; + mp_size_t ns, ds, nl, dl; + mp_ptr np, dp, qp, rp; + TMP_DECL (marker); -#undef COMPUTE_QUOTIENT -#include "dmincl.c" + ns = SIZ (num); + ds = SIZ (den); + nl = ABS (ns); + dl = ABS (ds); + ql = nl - dl + 1; + + if (dl == 0) + DIVIDE_BY_ZERO; + + MPZ_REALLOC (rem, dl); + + if (ql <= 0) + { + if (num != rem) + { + mp_ptr np, rp; + np = PTR (num); + rp = PTR (rem); + MPN_COPY (rp, np, nl); + SIZ (rem) = SIZ (num); + } + return; + } + + TMP_MARK (marker); + qp = (mp_ptr) TMP_ALLOC (ql * BYTES_PER_MP_LIMB); + rp = PTR (rem); + np = PTR (num); + dp = PTR (den); + + /* FIXME: We should think about how to handle the temporary allocation. + Perhaps mpn_tdiv_qr should handle it, since it anyway often needs to + allocate temp space. */ + + /* Copy denominator to temporary space if it overlaps with the remainder. 
*/ + if (dp == rp) + { + mp_ptr tp; + tp = (mp_ptr) TMP_ALLOC (dl * BYTES_PER_MP_LIMB); + MPN_COPY (tp, dp, dl); + dp = tp; + } + /* Copy numerator to temporary space if it overlaps with the remainder. */ + if (np == rp) + { + mp_ptr tp; + tp = (mp_ptr) TMP_ALLOC (nl * BYTES_PER_MP_LIMB); + MPN_COPY (tp, np, nl); + np = tp; + } + + mpn_tdiv_qr (qp, rp, 0L, np, nl, dp, dl); + + MPN_NORMALIZE (rp, dl); + + SIZ (rem) = ns >= 0 ? dl : -dl; + TMP_FREE (marker); +} diff --git a/ghc/rts/gmp/mpz/tdiv_r_2exp.c b/ghc/rts/gmp/mpz/tdiv_r_2exp.c index 99d617e..91de170 100644 --- a/ghc/rts/gmp/mpz/tdiv_r_2exp.c +++ b/ghc/rts/gmp/mpz/tdiv_r_2exp.c @@ -5,16 +5,16 @@ Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ diff --git a/ghc/rts/gmp/mpz/tdiv_r_ui.c b/ghc/rts/gmp/mpz/tdiv_r_ui.c index 0428b52..2ea411f 100644 --- a/ghc/rts/gmp/mpz/tdiv_r_ui.c +++ b/ghc/rts/gmp/mpz/tdiv_r_ui.c @@ -1,21 +1,21 @@ /* mpz_tdiv_r_ui(rem, dividend, divisor_limb) -- Set REM to DIVDEND mod DIVISOR_LIMB. -Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1996, 1998 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -23,7 +23,7 @@ MA 02111-1307, USA. 
*/ #include "gmp.h" #include "gmp-impl.h" -void +unsigned long int #if __STDC__ mpz_tdiv_r_ui (mpz_ptr rem, mpz_srcptr dividend, unsigned long int divisor) #else @@ -37,15 +37,12 @@ mpz_tdiv_r_ui (rem, dividend, divisor) mp_size_t size; mp_limb_t remainder_limb; + if (divisor == 0) + DIVIDE_BY_ZERO; + dividend_size = dividend->_mp_size; size = ABS (dividend_size); - if (size == 0) - { - rem->_mp_size = 0; - return; - } - /* No need for temporary allocation and copying if QUOT == DIVIDEND as the divisor is just one limb, and thus no intermediate remainders need to be stored. */ @@ -61,4 +58,6 @@ mpz_tdiv_r_ui (rem, dividend, divisor) rem->_mp_size = dividend_size >= 0 ? 1 : -1; rem->_mp_d[0] = remainder_limb; } + + return remainder_limb; } diff --git a/ghc/rts/gmp/mpz/tdiv_ui.c b/ghc/rts/gmp/mpz/tdiv_ui.c new file mode 100644 index 0000000..7a40a6a --- /dev/null +++ b/ghc/rts/gmp/mpz/tdiv_ui.c @@ -0,0 +1,53 @@ +/* mpz_tdiv_ui(dividend, divisor_limb) + -- Return DIVDEND mod DIVISOR_LIMB. + +Copyright (C) 1991, 1993, 1994, 1996, 1997, 1998 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" + +unsigned long int +#if __STDC__ +mpz_tdiv_ui (mpz_srcptr dividend, unsigned long int divisor) +#else +mpz_tdiv_ui (dividend, divisor) + mpz_srcptr dividend; + unsigned long int divisor; +#endif +{ + mp_size_t dividend_size; + mp_size_t size; + mp_limb_t remainder_limb; + + if (divisor == 0) + DIVIDE_BY_ZERO; + + dividend_size = dividend->_mp_size; + size = ABS (dividend_size); + + /* No need for temporary allocation and copying if QUOT == DIVIDEND as + the divisor is just one limb, and thus no intermediate remainders + need to be stored. */ + + remainder_limb = mpn_mod_1 (dividend->_mp_d, size, (mp_limb_t) divisor); + + return remainder_limb; +} diff --git a/ghc/rts/gmp/mpz/tstbit.c b/ghc/rts/gmp/mpz/tstbit.c new file mode 100644 index 0000000..b0a8b0b --- /dev/null +++ b/ghc/rts/gmp/mpz/tstbit.c @@ -0,0 +1,70 @@ +/* mpz_tstbit -- test a specified bit. Simulate 2's complement representation. + +Copyright (C) 1997 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" + +int +#if __STDC__ +mpz_tstbit (mpz_srcptr d, unsigned long int bit_index) +#else +mpz_tstbit (d, bit_index) + mpz_srcptr d; + unsigned long int bit_index; +#endif +{ + mp_size_t dsize = d->_mp_size; + mp_ptr dp = d->_mp_d; + mp_size_t limb_index; + + limb_index = bit_index / BITS_PER_MP_LIMB; + if (dsize >= 0) + { + if (limb_index < dsize) + return (dp[limb_index] >> (bit_index % BITS_PER_MP_LIMB)) & 1; + else + /* Testing a bit outside of a positive number. */ + return 0; + } + else + { + mp_size_t zero_bound; + + dsize = -dsize; + + /* Locate the least significant non-zero limb. */ + for (zero_bound = 0; dp[zero_bound] == 0; zero_bound++) + ; + + if (limb_index > zero_bound) + { + if (limb_index < dsize) + return (~dp[limb_index] >> (bit_index % BITS_PER_MP_LIMB)) & 1; + else + /* Testing a bit outside of a negative number. */ + return 1; + } + else if (limb_index == zero_bound) + return (-dp[limb_index] >> (bit_index % BITS_PER_MP_LIMB)) & 1; + else + return 0; + } +} diff --git a/ghc/rts/gmp/mpz/ui_pow_ui.c b/ghc/rts/gmp/mpz/ui_pow_ui.c index 19baca1..edd2dee 100644 --- a/ghc/rts/gmp/mpz/ui_pow_ui.c +++ b/ghc/rts/gmp/mpz/ui_pow_ui.c @@ -1,20 +1,21 @@ /* mpz_ui_pow_ui(res, base, exp) -- Set RES to BASE**EXP. -Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation, +Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. 
The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -23,6 +24,9 @@ MA 02111-1307, USA. */ #include "gmp-impl.h" #include "longlong.h" + +static void mpz_pow2 _PROTO ((mpz_ptr r, mp_limb_t blimb, unsigned long int e, mp_limb_t rl)); + void #if __STDC__ mpz_ui_pow_ui (mpz_ptr r, unsigned long int b, unsigned long int e) @@ -33,79 +37,103 @@ mpz_ui_pow_ui (r, b, e) unsigned long int e; #endif { - mp_ptr rp, tp, xp; - mp_size_t rsize; - int cnt, i; mp_limb_t blimb = b; - TMP_DECL (marker); + mp_limb_t rl; - /* Single out cases that give result == 0 or 1. These tests are here - to simplify the general code below, not to optimize. */ if (e == 0) { + /* For x^0 we return 1, even if x is 0. */ r->_mp_d[0] = 1; r->_mp_size = 1; return; } - if (blimb == 0) - { - r->_mp_size = 0; - return; - } - if (blimb < 0x100) + /* Compute b^e as (b^n)^(e div n) * b^(e mod n), where n is chosen such that + the latter factor is the largest number small enough to fit in a limb. */ + + rl = 1; + while (e != 0 && blimb < ((mp_limb_t) 1 << BITS_PER_MP_LIMB/2)) { - /* Estimate space requirements accurately. Using the code from the - `else' path would over-estimate space requirements wildly. 
*/ - float lb = __mp_bases[blimb].chars_per_bit_exactly; - rsize = 2 + ((mp_size_t) (e / lb) / BITS_PER_MP_LIMB); + if ((e & 1) != 0) + rl = rl * blimb; + blimb = blimb * blimb; + e = e >> 1; } - else + + /* rl is now b^(e mod n). (I.e., the latter factor above.) */ + + if (e == 0) { - /* Over-estimate space requirements somewhat. */ - count_leading_zeros (cnt, blimb); - rsize = e - cnt * e / BITS_PER_MP_LIMB + 1; + r->_mp_d[0] = rl; + r->_mp_size = rl != 0; + return; } + mpz_pow2 (r, blimb, e, rl); +} + +/* Multi-precision part of expontialization code. */ +static void +#if __STDC__ +mpz_pow2 (mpz_ptr r, mp_limb_t blimb, unsigned long int e, mp_limb_t rl) +#else +mpz_pow2 (r, blimb, e, rl) + mpz_ptr r; + mp_limb_t blimb; + unsigned long int e; + mp_limb_t rl; +#endif +{ + mp_ptr rp, tp; + mp_size_t ralloc, rsize; + int cnt, i; + TMP_DECL (marker); + TMP_MARK (marker); - /* The two areas are used to alternatingly hold the input and recieve the - product for mpn_mul. (This scheme is used to fulfill the requirements - of mpn_mul; that the product space may not be the same as any of the - input operands.) */ - rp = (mp_ptr) TMP_ALLOC (rsize * BYTES_PER_MP_LIMB); - tp = (mp_ptr) TMP_ALLOC (rsize * BYTES_PER_MP_LIMB); + /* Over-estimate temporary space requirements somewhat. */ + count_leading_zeros (cnt, blimb); + ralloc = e - cnt * e / BITS_PER_MP_LIMB + 1; + + /* The two areas are used to alternatingly hold the input and receive the + product for mpn_mul. (Needed since mpn_mul_n requires that the product + is distinct from either input operand.) 
*/ + rp = (mp_ptr) TMP_ALLOC (ralloc * BYTES_PER_MP_LIMB); + tp = (mp_ptr) TMP_ALLOC (ralloc * BYTES_PER_MP_LIMB); rp[0] = blimb; rsize = 1; - count_leading_zeros (cnt, e); + count_leading_zeros (cnt, e); for (i = BITS_PER_MP_LIMB - cnt - 2; i >= 0; i--) { mpn_mul_n (tp, rp, rp, rsize); rsize = 2 * rsize; rsize -= tp[rsize - 1] == 0; - xp = tp; tp = rp; rp = xp; + MP_PTR_SWAP (rp, tp); if ((e & ((mp_limb_t) 1 << i)) != 0) { mp_limb_t cy; - cy = mpn_mul_1 (tp, rp, rsize, blimb); - if (cy != 0) - { - tp[rsize] = cy; - rsize++; - } - xp = tp; tp = rp; rp = xp; + cy = mpn_mul_1 (rp, rp, rsize, blimb); + rp[rsize] = cy; + rsize += cy != 0; } } - /* Now then we know the exact space requirements, reallocate if - necessary. */ - if (r->_mp_alloc < rsize) - _mpz_realloc (r, rsize); + /* We will need rsize or rsize+1 limbs for the result. */ + if (r->_mp_alloc <= rsize) + _mpz_realloc (r, rsize + 1); + + /* Multiply the two factors (in rp,rsize and rl) and put the final result + in place. */ + { + mp_limb_t cy; + cy = mpn_mul_1 (r->_mp_d, rp, rsize, rl); + (r->_mp_d)[rsize] = cy; + rsize += cy != 0; + } - MPN_COPY (r->_mp_d, rp, rsize); r->_mp_size = rsize; TMP_FREE (marker); } diff --git a/ghc/rts/gmp/mpz/urandomb.c b/ghc/rts/gmp/mpz/urandomb.c new file mode 100644 index 0000000..caca086 --- /dev/null +++ b/ghc/rts/gmp/mpz/urandomb.c @@ -0,0 +1,49 @@ +/* mpz_urandomb (rop, state, n) -- Generate a uniform pseudorandom + integer in the range 0 to 2^N - 1, inclusive, using STATE as the + random state previously initialized by a call to gmp_randinit(). + +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +void +#if __STDC__ +mpz_urandomb (mpz_t rop, gmp_randstate_t rstate, unsigned long int nbits) +#else +mpz_urandomb (rop, rstate, nbits) + mpz_t rop; + gmp_randstate_t rstate; + unsigned long int nbits; +#endif +{ + mp_ptr rp; + mp_size_t size; + + size = (nbits + BITS_PER_MP_LIMB - 1) / BITS_PER_MP_LIMB; + if (ALLOC (rop) < size) + _mpz_realloc (rop, size); + + rp = PTR (rop); + + _gmp_rand (rp, rstate, nbits); + MPN_NORMALIZE (rp, size); + SIZ (rop) = size; +} diff --git a/ghc/rts/gmp/mpz/urandomm.c b/ghc/rts/gmp/mpz/urandomm.c new file mode 100644 index 0000000..aa57784 --- /dev/null +++ b/ghc/rts/gmp/mpz/urandomm.c @@ -0,0 +1,73 @@ +/* mpz_urandomm (rop, state, n) -- Generate a uniform pseudorandom + integer in the range 0 to N-1, using STATE as the random state + previously initialized by a call to gmp_randinit(). + +Copyright (C) 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. 
+ +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +void +#if __STDC__ +mpz_urandomm (mpz_t rop, gmp_randstate_t rstate, mpz_t n) +#else +mpz_urandomm (rop, rstate, n) + mpz_t rop; + gmp_randstate_t rstate; + mpz_t n; +#endif +{ + mpz_t t, p, m; + mp_ptr tp; + mp_size_t nbits, size; + int count; + + /* FIXME: Should check for n == 0 and report error */ + + size = SIZ (n); + count_leading_zeros (count, PTR (n)[size - 1]); + nbits = size * BITS_PER_MP_LIMB - count; + + /* Allocate enough for any mpz function called since a realloc of + these will fail. */ + MPZ_TMP_INIT (t, size); + MPZ_TMP_INIT (m, size + 1); + MPZ_TMP_INIT (p, size + 1); + + /* Let m = highest possible random number plus 1. */ + mpz_set_ui (m, 0); + mpz_setbit (m, nbits); + + /* Let p = floor(m / n) * n. */ + mpz_fdiv_q (p, m, n); + mpz_mul (p, p, n); + + tp = PTR (t); + do + { + _gmp_rand (tp, rstate, nbits); + MPN_NORMALIZE (tp, size); /* FIXME: Really necessary? */ + SIZ (t) = size; + } + while (mpz_cmp (t, p) >= 0); + + mpz_mod (rop, t, n); +} diff --git a/ghc/rts/gmp/mpz/xor.c b/ghc/rts/gmp/mpz/xor.c new file mode 100644 index 0000000..69898d1 --- /dev/null +++ b/ghc/rts/gmp/mpz/xor.c @@ -0,0 +1,217 @@ +/* mpz_xor -- Logical xor. + +Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +void +#if __STDC__ +mpz_xor (mpz_ptr res, mpz_srcptr op1, mpz_srcptr op2) +#else +mpz_xor (res, op1, op2) + mpz_ptr res; + mpz_srcptr op1; + mpz_srcptr op2; +#endif +{ + mp_srcptr op1_ptr, op2_ptr; + mp_size_t op1_size, op2_size; + mp_ptr res_ptr; + mp_size_t res_size, res_alloc; + mp_size_t i; + TMP_DECL (marker); + + TMP_MARK (marker); + op1_size = op1->_mp_size; + op2_size = op2->_mp_size; + + op1_ptr = op1->_mp_d; + op2_ptr = op2->_mp_d; + res_ptr = res->_mp_d; + + if (op1_size >= 0) + { + if (op2_size >= 0) + { + if (op1_size >= op2_size) + { + if (res->_mp_alloc < op1_size) + { + _mpz_realloc (res, op1_size); + op1_ptr = op1->_mp_d; + op2_ptr = op2->_mp_d; + res_ptr = res->_mp_d; + } + + if (res_ptr != op1_ptr) + MPN_COPY (res_ptr + op2_size, op1_ptr + op2_size, + op1_size - op2_size); + for (i = op2_size - 1; i >= 0; i--) + res_ptr[i] = op1_ptr[i] ^ op2_ptr[i]; + res_size = op1_size; + } + else + { + if (res->_mp_alloc < op2_size) + { + _mpz_realloc (res, op2_size); + op1_ptr = op1->_mp_d; + op2_ptr = op2->_mp_d; + res_ptr = res->_mp_d; + } + + if (res_ptr != op2_ptr) + MPN_COPY (res_ptr + op1_size, op2_ptr + op1_size, + op2_size - op1_size); + for (i = op1_size - 1; i >= 0; i--) + res_ptr[i] = op1_ptr[i] ^ op2_ptr[i]; + res_size = op2_size; + } + + MPN_NORMALIZE (res_ptr, res_size); + res->_mp_size = res_size; + return; + } + else /* op2_size < 0 */ + { + /* Fall through to the code at the end of the function. 
*/ + } + } + else + { + if (op2_size < 0) + { + mp_ptr opx; + mp_limb_t cy; + + /* Both operands are negative, the result will be positive. + (-OP1) ^ (-OP2) = + = ~(OP1 - 1) ^ ~(OP2 - 1) = + = (OP1 - 1) ^ (OP2 - 1) */ + + op1_size = -op1_size; + op2_size = -op2_size; + + /* Possible optimization: Decrease mpn_sub precision, + as we won't use the entire res of both. */ + opx = (mp_ptr) TMP_ALLOC (op1_size * BYTES_PER_MP_LIMB); + mpn_sub_1 (opx, op1_ptr, op1_size, (mp_limb_t) 1); + op1_ptr = opx; + + opx = (mp_ptr) TMP_ALLOC (op2_size * BYTES_PER_MP_LIMB); + mpn_sub_1 (opx, op2_ptr, op2_size, (mp_limb_t) 1); + op2_ptr = opx; + + res_alloc = MAX (op1_size, op2_size); + if (res->_mp_alloc < res_alloc) + { + _mpz_realloc (res, res_alloc); + res_ptr = res->_mp_d; + /* Don't re-read OP1_PTR and OP2_PTR. They point to + temporary space--never to the space RES->_mp_d used + to point to before reallocation. */ + } + + if (op1_size > op2_size) + { + MPN_COPY (res_ptr + op2_size, op1_ptr + op2_size, + op1_size - op2_size); + for (i = op2_size - 1; i >= 0; i--) + res_ptr[i] = op1_ptr[i] ^ op2_ptr[i]; + res_size = op1_size; + } + else + { + MPN_COPY (res_ptr + op1_size, op2_ptr + op1_size, + op2_size - op1_size); + for (i = op1_size - 1; i >= 0; i--) + res_ptr[i] = op1_ptr[i] ^ op2_ptr[i]; + res_size = op2_size; + } + + MPN_NORMALIZE (res_ptr, res_size); + res->_mp_size = res_size; + TMP_FREE (marker); + return; + } + else + { + /* We should compute -OP1 ^ OP2. Swap OP1 and OP2 and fall + through to the code that handles OP1 ^ -OP2. */ + MPZ_SRCPTR_SWAP (op1, op2); + MPN_SRCPTR_SWAP (op1_ptr,op1_size, op2_ptr,op2_size); + } + } + + { + mp_ptr opx; + mp_limb_t cy; + mp_size_t count; + + /* Operand 2 negative, so will be the result. 
+ -(OP1 ^ (-OP2)) = -(OP1 ^ ~(OP2 - 1)) = + = ~(OP1 ^ ~(OP2 - 1)) + 1 = + = (OP1 ^ (OP2 - 1)) + 1 */ + + op2_size = -op2_size; + + opx = (mp_ptr) TMP_ALLOC (op2_size * BYTES_PER_MP_LIMB); + mpn_sub_1 (opx, op2_ptr, op2_size, (mp_limb_t) 1); + op2_ptr = opx; + + res_alloc = MAX (op1_size, op2_size) + 1; + if (res->_mp_alloc < res_alloc) + { + _mpz_realloc (res, res_alloc); + op1_ptr = op1->_mp_d; + res_ptr = res->_mp_d; + /* Don't re-read OP2_PTR. It points to temporary space--never + to the space RES->_mp_d used to point to before reallocation. */ + } + + if (op1_size > op2_size) + { + MPN_COPY (res_ptr + op2_size, op1_ptr + op2_size, op1_size - op2_size); + for (i = op2_size - 1; i >= 0; i--) + res_ptr[i] = op1_ptr[i] ^ op2_ptr[i]; + res_size = op1_size; + } + else + { + MPN_COPY (res_ptr + op1_size, op2_ptr + op1_size, op2_size - op1_size); + for (i = op1_size - 1; i >= 0; i--) + res_ptr[i] = op1_ptr[i] ^ op2_ptr[i]; + res_size = op2_size; + } + + cy = mpn_add_1 (res_ptr, res_ptr, res_size, (mp_limb_t) 1); + if (cy) + { + res_ptr[res_size] = cy; + res_size++; + } + + MPN_NORMALIZE (res_ptr, res_size); + res->_mp_size = -res_size; + TMP_FREE (marker); + } +} diff --git a/ghc/rts/gmp/rand.c b/ghc/rts/gmp/rand.c new file mode 100644 index 0000000..d1f9354 --- /dev/null +++ b/ghc/rts/gmp/rand.c @@ -0,0 +1,171 @@ +/* gmp_randinit (state, algorithm, ...) -- Initialize a random state. + +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include /* for NULL */ +#if __STDC__ +# include +#else +# include +#endif + +#include "gmp.h" +#include "gmp-impl.h" + +/* Array of CL-schemes, ordered in increasing order of the first + member (the 'm2exp' value). The end of the array is indicated with + an entry containing all zeros. */ + +/* All multipliers are in the range 0.01*m and 0.99*m, and are +congruent to 5 (mod 8). +They all pass the spectral test with Vt >= 2^(30/t) and merit >= 1. +(Up to and including 196 bits, merit is >= 3.) */ + +struct __gmp_rand_lc_scheme_struct +{ + unsigned long int m2exp; /* Modulus is 2 ^ m2exp. */ + char *astr; /* Multiplier in string form. */ + unsigned long int c; /* Adder. */ +}; + +struct __gmp_rand_lc_scheme_struct __gmp_rand_lc_scheme[] = +{ + {32, "43840821", 1}, + {33, "85943917", 1}, + {34, "171799469", 1}, + {35, "343825285", 1}, + {36, "687285701", 1}, + {37, "1374564613", 1}, + {38, "2749193437", 1}, + {39, "5497652029", 1}, + {40, "10995212661", 1}, + {56, "47988680294711517", 1}, + {64, "13469374875402548381", 1}, + {100, "203786806069096950756900463357", 1}, + {128, "96573135900076068624591706046897650309", 1}, + {156, "43051576988660538262511726153887323360449035333", 1}, + {196, "1611627857640767981443524165616850972435303571524033586421", 1}, + {200, "491824250216153841876046962368396460896019632211283945747141", 1}, + {256, "79336254595106925775099152154558630917988041692672147726148065355845551082677", 1}, + {0, NULL, 0} /* End of array. */ +}; + +void +#if __STDC__ +gmp_randinit (gmp_randstate_t rstate, + gmp_randalg_t alg, + ...) 
+#else +gmp_randinit (va_alist) + va_dcl +#endif +{ + va_list ap; +#if __STDC__ +#else + __gmp_randstate_struct *rstate; + gmp_randalg_t alg; +#endif + +#if __STDC__ + va_start (ap, alg); +#else + va_start (ap); + + rstate = va_arg (ap, __gmp_randstate_struct *); + alg = va_arg (ap, gmp_randalg_t); +#endif + + switch (alg) + { + case GMP_RAND_ALG_LC: /* Linear congruential. */ + { + unsigned long int size; + struct __gmp_rand_lc_scheme_struct *sp; + mpz_t a; + + size = va_arg (ap, unsigned long int); + + /* Pick a scheme. */ + for (sp = __gmp_rand_lc_scheme; sp->m2exp != 0; sp++) + if (sp->m2exp / 2 >= size) + break; + + if (sp->m2exp == 0) /* Nothing big enough found. */ + { + gmp_errno |= GMP_ERROR_INVALID_ARGUMENT; + return; + } + + /* Install scheme. */ + mpz_init_set_str (a, sp->astr, 0); + gmp_randinit_lc_2exp (rstate, a, sp->c, sp->m2exp); + mpz_clear (a); + break; + } + +#if 0 + case GMP_RAND_ALG_BBS: /* Blum, Blum, and Shub. */ + { + mpz_t p, q; + mpz_t ztmp; + + /* FIXME: Generate p and q. They must be ``large'' primes, + congruent to 3 mod 4. Should we ensure that they meet some + of the criterias for being ``hard primes''?*/ + + /* These are around 128 bits. */ + mpz_init_set_str (p, "148028650191182616877187862194899201391", 10); + mpz_init_set_str (q, "315270837425234199477225845240496832591", 10); + + /* Allocate algorithm specific data. */ + rstate->data.bbs = (__gmp_rand_data_bbs *) + (*_mp_allocate_func) (sizeof (__gmp_rand_data_bbs)); + + mpz_init (rstate->data.bbs->bi); /* The Blum integer. */ + mpz_mul (rstate->data.bbs->bi, p, q); + + /* Find a seed, x, with gcd (x, bi) == 1. */ + mpz_init (ztmp); + while (1) + { + mpz_gcd (ztmp, seed, rstate->data.bbs->bi); + if (!mpz_cmp_ui (ztmp, 1)) + break; + mpz_add_ui (seed, seed, 1); + } + + rstate->alg = alg; + rstate->size = size; /* FIXME: Remove. */ + mpz_set (rstate->seed, seed); + + mpz_clear (p); + mpz_clear (q); + mpz_clear (ztmp); + break; + } +#endif /* 0 */ + + default: /* Bad choice. 
*/ + gmp_errno |= GMP_ERROR_UNSUPPORTED_ARGUMENT; + } + + va_end (ap); +} diff --git a/ghc/rts/gmp/randclr.c b/ghc/rts/gmp/randclr.c new file mode 100644 index 0000000..5cb0291 --- /dev/null +++ b/ghc/rts/gmp/randclr.c @@ -0,0 +1,54 @@ +/* gmp_randclear (state) -- Clear and deallocate random state STATE. + +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" + +void +#if __STDC__ +gmp_randclear (gmp_randstate_t rstate) +#else +gmp_randclear (rstate) + gmp_randstate_t rstate; +#endif +{ + mpz_clear (rstate->seed); + + switch (rstate->alg) + { + case GMP_RAND_ALG_LC: + mpz_clear (rstate->algdata.lc->a); + if (rstate->algdata.lc->m2exp == 0) + mpz_clear (rstate->algdata.lc->m); + (*_mp_free_func) (rstate->algdata.lc, sizeof (*rstate->algdata.lc)); + break; + +#if 0 + case GMP_RAND_ALG_BBS: + mpz_clear (rstate->algdata.bbs->bi); + (*_mp_free_func) (rstate->algdata.bbs, sizeof (*rstate->algdata.bbs)); + break; +#endif /* 0 */ + + default: + gmp_errno |= GMP_ERROR_UNSUPPORTED_ARGUMENT; + } +} diff --git a/ghc/rts/gmp/randlc.c b/ghc/rts/gmp/randlc.c new file mode 100644 index 0000000..7079db8 --- /dev/null +++ b/ghc/rts/gmp/randlc.c @@ -0,0 +1,56 @@ +/* gmp_randinit_lc (state, a, c, m) -- Initialize a random state for a + linear congruential generator with multiplier A, adder C, and + modulus M. + +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" + +void +#if __STDC__ +gmp_randinit_lc (gmp_randstate_t rstate, + mpz_t a, + unsigned long int c, + mpz_t m) +#else +gmp_randinit_lc (rstate, a, c, m) + gmp_randstate_t rstate; + mpz_t a; + unsigned long int c; + mpz_t m; +#endif +{ + /* FIXME: Not finished. We don't handle this in _gmp_rand() yet. */ + abort (); + + mpz_init_set_ui (rstate->seed, 1); + _mpz_realloc (rstate->seed, ABSIZ (m)); + + /* Allocate algorithm specific data. */ + rstate->algdata.lc = (__gmp_randata_lc *) + (*_mp_allocate_func) (sizeof (__gmp_randata_lc)); + + mpz_init_set (rstate->algdata.lc->a, a); + rstate->algdata.lc->c = c; + mpz_init_set (rstate->algdata.lc->m, m); + + rstate->alg = GMP_RAND_ALG_LC; +} diff --git a/ghc/rts/gmp/randlc2x.c b/ghc/rts/gmp/randlc2x.c new file mode 100644 index 0000000..dbd5f04 --- /dev/null +++ b/ghc/rts/gmp/randlc2x.c @@ -0,0 +1,59 @@ +/* gmp_randinit_lc_2exp (state, a, c, m2exp) -- Initialize random + state STATE for a linear congruential generator with multiplier A, + adder C, and modulus 2 ^ M2EXP. + +Copyright (C) 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/
+
+#include "gmp.h"
+#include "gmp-impl.h"
+/* Set up RSTATE for X = (A * X + C) mod 2 ^ M2EXP, with the seed preset to 1. */
+void
+#if __STDC__
+gmp_randinit_lc_2exp (gmp_randstate_t rstate,
+                      mpz_t a,
+                      unsigned long int c,
+                      unsigned long int m2exp)
+#else
+gmp_randinit_lc_2exp (rstate, a, c, m2exp)
+     gmp_randstate_t rstate;
+     mpz_t a;
+     unsigned long int c;
+     unsigned long int m2exp;
+#endif
+{
+  mpz_init_set_ui (rstate->seed, 1);    /* The seed gets room for M2EXP bits next. */
+  _mpz_realloc (rstate->seed, m2exp / BITS_PER_MP_LIMB + (m2exp % BITS_PER_MP_LIMB != 0));
+
+  /* Allocate algorithm specific data. */
+  rstate->algdata.lc = (__gmp_randata_lc *)
+    (*_mp_allocate_func) (sizeof (__gmp_randata_lc));
+
+  mpz_init_set (rstate->algdata.lc->a, a);
+  rstate->algdata.lc->c = c;
+
+  /* Cover weird case where m2exp is 0, which means that m is used
+     instead of m2exp. */
+  if (m2exp == 0)
+    mpz_init_set_ui (rstate->algdata.lc->m, 0);
+  rstate->algdata.lc->m2exp = m2exp;    /* M is not initialised here unless this is 0. */
+
+  rstate->alg = GMP_RAND_ALG_LC;
+}
diff --git a/ghc/rts/gmp/randraw.c b/ghc/rts/gmp/randraw.c
new file mode 100644
index 0000000..c0c3889
--- /dev/null
+++ b/ghc/rts/gmp/randraw.c
@@ -0,0 +1,360 @@
+/* _gmp_rand (rp, state, nbits) -- Generate a random bitstream of
+   length NBITS in RP. RP must have enough space allocated to hold
+   NBITS.
+
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB.
If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* For linear congruential (LC), we use one of algorithms (1) or (2). + (gmp-3.0 uses algorithm (1) with 'm' as a power of 2.) + +LC algorithm (1). + + X = (aX + c) mod m + +[D. Knuth, "The Art of Computer Programming: Volume 2, Seminumerical Algorithms", +Third Edition, Addison Wesley, 1998, pp. 184-185.] + + X is the seed and the result + a is chosen so that + a mod 8 = 5 [3.2.1.2] and [3.2.1.3] + .01m < a < .99m + its binary or decimal digits is not a simple, regular pattern + it has no large quotients when Euclid's algorithm is used to find + gcd(a, m) [3.3.3] + it passes the spectral test [3.3.4] + it passes several tests of [3.3.2] + c has no factor in common with m (c=1 or c=a can be good) + m is large (2^30) + is a power of 2 [3.2.1.1] + +The least significant digits of the generated number are not very +random. It should be regarded as a random fraction X/m. To get a +random integer between 0 and n-1, multiply X/m by n and truncate. +(Don't use X/n [ex 3.4.1-3]) + +The ``accuracy'' in t dimensions is one part in ``the t'th root of m'' [3.3.4]. + +Don't generate more than about m/1000 numbers without changing a, c, or m. + +The sequence length depends on chosen a,c,m. + + +LC algorithm (2). + + X = a * (X mod q) - r * (long) (X/q) + if X<0 then X+=m + +[Knuth, pp. 185-186.] + + X is the seed and the result + as a seed is nonzero and less than m + a is a primitive root of m (which means that a^2 <= m) + q is (long) m / a + r is m mod a + m is a prime number near the largest easily computed integer + +which gives + + X = a * (X % ((long) m / a)) - + (M % a) * ((long) (X / ((long) m / a))) + +Since m is prime, the least-significant bits of X are just as random as +the most-significant bits. */ + +/* Blum, Blum, and Shub. 
+ + [Bruce Schneier, "Applied Cryptography", Second Edition, John Wiley + & Sons, Inc., 1996, pp. 417-418.] + + "Find two large prime numbers, p and q, which are congruent to 3 + modulo 4. The product of those numbers, n, is a blum integer. + Choose another random integer, x, which is relatively prime to n. + Compute + x[0] = x^2 mod n + That's the seed for the generator." + + To generate a random bit, compute + x[i] = x[i-1]^2 mod n + The least significant bit of x[i] is the one we want. + + We can use more than one bit from x[i], namely the + log2(bitlength of x[i]) + least significant bits of x[i]. + + So, for a 32-bit seed we get 5 bits per computation. + + The non-predictability of this generator is based on the difficulty + of factoring n. + */ + +/* -------------------------------------------------- */ + +/* lc (rp, state) -- Generate next number in LC sequence. Return the + number of valid bits in the result. NOTE: If 'm' is a power of 2 + (m2exp != 0), discard the lower half of the result. */ + +static +unsigned long int +#if __STDC__ +lc (mp_ptr rp, gmp_randstate_t rstate) +#else +lc (rp, rstate) + mp_ptr rp; + gmp_randstate_t rstate; +#endif +{ + mp_ptr tp, seedp, ap; + mp_size_t ta; + mp_size_t tn, seedn, an; + mp_size_t retval; + int shiftcount = 0; + unsigned long int m2exp; + mp_limb_t c; + TMP_DECL (mark); + + m2exp = rstate->algdata.lc->m2exp; + c = (mp_limb_t) rstate->algdata.lc->c; + + seedp = PTR (rstate->seed); + seedn = SIZ (rstate->seed); + + if (seedn == 0) + { + /* Seed is 0. Result is C % M. */ + *rp = c; + + if (m2exp != 0) + { + /* M is a power of 2. */ + if (m2exp < BITS_PER_MP_LIMB) + { + /* Only necessary when M may be smaller than C. */ + *rp &= (((mp_limb_t) 1 << m2exp) - 1); + } + } + else + { + /* M is not a power of 2. */ + abort (); /* FIXME. */ + } + + /* Save result as next seed. 
*/ + *seedp = *rp; + SIZ (rstate->seed) = 1; + return BITS_PER_MP_LIMB; + } + + ap = PTR (rstate->algdata.lc->a); + an = SIZ (rstate->algdata.lc->a); + + /* Allocate temporary storage. Let there be room for calculation of + (A * seed + C) % M, or M if bigger than that. */ + + ASSERT_ALWAYS (m2exp != 0); /* FIXME. */ + + TMP_MARK (mark); + ta = an + seedn + 1; + tp = (mp_ptr) TMP_ALLOC (ta * BYTES_PER_MP_LIMB); + MPN_ZERO (tp, ta); + + /* t = a * seed */ + if (seedn >= an) + mpn_mul_basecase (tp, seedp, seedn, ap, an); + else + mpn_mul_basecase (tp, ap, an, seedp, seedn); + tn = an + seedn; + + /* t = t + c */ + mpn_incr_u (tp, c); + + /* t = t % m */ + if (m2exp != 0) + { + /* M is a power of 2. The mod operation is trivial. */ + + tp[m2exp / BITS_PER_MP_LIMB] &= ((mp_limb_t) 1 << m2exp % BITS_PER_MP_LIMB) - 1; + tn = (m2exp + BITS_PER_MP_LIMB - 1) / BITS_PER_MP_LIMB; + } + else + { + abort (); /* FIXME. */ + } + + /* Save result as next seed. */ + MPN_COPY (PTR (rstate->seed), tp, tn); + SIZ (rstate->seed) = tn; + + if (m2exp != 0) + { + /* Discard the lower half of the result. */ + unsigned long int discardb = m2exp / 2; + mp_size_t discardl = discardb / BITS_PER_MP_LIMB; + + tn -= discardl; + if (tn > 0) + { + if (discardb % BITS_PER_MP_LIMB != 0) + { + mpn_rshift (tp, tp + discardl, tn, discardb % BITS_PER_MP_LIMB); + MPN_COPY (rp, tp, (discardb + BITS_PER_MP_LIMB -1) / BITS_PER_MP_LIMB); + } + else /* Even limb boundary. */ + MPN_COPY_INCR (rp, tp + discardl, tn); + } + } + else + { + MPN_COPY (rp, tp, tn); + } + + TMP_FREE (mark); + + /* Return number of valid bits in the result. */ + if (m2exp != 0) + retval = (m2exp + 1) / 2; + else + retval = SIZ (rstate->algdata.lc->m) * BITS_PER_MP_LIMB - shiftcount; + return retval; +} + +#ifdef RAWRANDEBUG +/* Set even bits to EVENBITS and odd bits to ! EVENBITS in RP. + Number of bits is m2exp in state. */ +/* FIXME: Remove. 
*/
+unsigned long int
+lc_test (mp_ptr rp, gmp_randstate_t s, const int evenbits)
+{
+  unsigned long int rn, nbits;
+  int f;
+
+  nbits = s->algdata.lc->m2exp / 2;     /* Same chunk size as _gmp_rand uses. */
+  rn = nbits / BITS_PER_MP_LIMB + (nbits % BITS_PER_MP_LIMB != 0);
+  MPN_ZERO (rp, rn);
+  /* Shift in one bit per iteration so that set and clear bits alternate. */
+  for (f = 0; f < nbits; f++)
+    {
+      mpn_lshift (rp, rp, rn, 1);
+      if (f % 2 == ! evenbits)
+        rp[0] += 1;
+    }
+
+  return nbits;
+}
+#endif /* RAWRANDEBUG */
+/* Fill RP with NBITS bits from RSTATE, taking m2exp / 2 bits per lc () call. */
+void
+#if __STDC__
+_gmp_rand (mp_ptr rp, gmp_randstate_t rstate, unsigned long int nbits)
+#else
+_gmp_rand (rp, rstate, nbits)
+     mp_ptr rp;
+     gmp_randstate_t rstate;
+     unsigned long int nbits;
+#endif
+{
+  mp_size_t rn;                 /* Size of R. */
+
+  rn = (nbits + BITS_PER_MP_LIMB - 1) / BITS_PER_MP_LIMB;       /* NOTE(review): RN is not read again below. */
+
+  switch (rstate->alg)
+    {
+    case GMP_RAND_ALG_LC:
+      {
+        unsigned long int rbitpos;
+        int chunk_nbits;
+        mp_ptr tp;
+        mp_size_t tn;
+        TMP_DECL (lcmark);
+
+        TMP_MARK (lcmark);
+        /* Bits consumed per lc () call; lc () itself reports (m2exp + 1) / 2. */
+        chunk_nbits = rstate->algdata.lc->m2exp / 2;
+        tn = (chunk_nbits + BITS_PER_MP_LIMB - 1) / BITS_PER_MP_LIMB;
+
+        tp = (mp_ptr) TMP_ALLOC (tn * BYTES_PER_MP_LIMB);
+        /* NOTE(review): if chunk_nbits is 0 (m2exp < 2) RBITPOS never advances. */
+        rbitpos = 0;
+        while (rbitpos + chunk_nbits <= nbits)
+          {
+            mp_ptr r2p = rp + rbitpos / BITS_PER_MP_LIMB;
+
+            if (rbitpos % BITS_PER_MP_LIMB != 0)
+              {
+                mp_limb_t savelimb, rcy;
+                /* Target of new chunk is not limb aligned. Use temp space
+                   and align things by shifting it up. */
+                lc (tp, rstate);
+                savelimb = r2p[0];      /* Bits already stored in the partly filled limb. */
+                rcy = mpn_lshift (r2p, tp, tn, rbitpos % BITS_PER_MP_LIMB);
+                r2p[0] |= savelimb;
+/* bogus */     if ((chunk_nbits % BITS_PER_MP_LIMB + rbitpos % BITS_PER_MP_LIMB)
+                    > BITS_PER_MP_LIMB)
+                  r2p[tn] = rcy;
+              }
+            else
+              {
+                /* Target of new chunk is limb aligned. Let `lc' put bits
+                   directly into our target variable. */
+                lc (r2p, rstate);
+              }
+            rbitpos += chunk_nbits;
+          }
+
+        /* Handle last [0..chunk_nbits) bits. */
+        if (rbitpos != nbits)
+          {
+            mp_ptr r2p = rp + rbitpos / BITS_PER_MP_LIMB;
+            int last_nbits = nbits - rbitpos;
+            tn = (last_nbits + BITS_PER_MP_LIMB - 1) / BITS_PER_MP_LIMB;
+            lc (tp, rstate);    /* Always generated into TP, even when aligned. */
+            if (rbitpos % BITS_PER_MP_LIMB != 0)
+              {
+                mp_limb_t savelimb, rcy;
+                /* Target of new chunk is not limb aligned. Use temp space
+                   and align things by shifting it up. */
+                savelimb = r2p[0];
+                rcy = mpn_lshift (r2p, tp, tn, rbitpos % BITS_PER_MP_LIMB);
+                r2p[0] |= savelimb;
+                if (rbitpos + tn * BITS_PER_MP_LIMB - rbitpos % BITS_PER_MP_LIMB < nbits)
+                  r2p[tn] = rcy;
+              }
+            else
+              {
+                MPN_COPY (r2p, tp, tn);
+              }
+            /* Mask off top bits if needed. */
+            if (nbits % BITS_PER_MP_LIMB != 0)
+              rp[nbits / BITS_PER_MP_LIMB]
+                &= ~ ((~(mp_limb_t) 0) << nbits % BITS_PER_MP_LIMB);
+          }
+
+        TMP_FREE (lcmark);
+        break;
+      }
+
+    default:                    /* Any other alg value is reported as an error. */
+      gmp_errno |= GMP_ERROR_UNSUPPORTED_ARGUMENT;
+      break;
+    }
+}
diff --git a/ghc/rts/gmp/randsd.c b/ghc/rts/gmp/randsd.c
new file mode 100644
index 0000000..3bed14b
--- /dev/null
+++ b/ghc/rts/gmp/randsd.c
@@ -0,0 +1,37 @@
+/* gmp_randseed (state, seed) -- Set initial seed SEED in random state
+   STATE.
+
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
*/ + +#include "gmp.h" +#include "gmp-impl.h" + +void +#if __STDC__ +gmp_randseed (gmp_randstate_t rstate, + mpz_t seed) +#else +gmp_randseed (rstate, seed) + gmp_randstate_t rstate; + mpz_t seed; +#endif +{ + mpz_set (rstate->seed, seed); +} diff --git a/ghc/rts/gmp/randsdui.c b/ghc/rts/gmp/randsdui.c new file mode 100644 index 0000000..92f412f --- /dev/null +++ b/ghc/rts/gmp/randsdui.c @@ -0,0 +1,37 @@ +/* gmp_randseed_ui (state, seed) -- Set initial seed SEED in random + state STATE. + +Copyright (C) 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +void +#if __STDC__ +gmp_randseed_ui (gmp_randstate_t rstate, + unsigned long int seed) +#else +gmp_randseed_ui (rstate, seed) + gmp_randstate_t rstate; + mpz_t seed; +#endif +{ + mpz_set_ui (rstate->seed, seed); +} diff --git a/ghc/rts/gmp/stack-alloc.c b/ghc/rts/gmp/stack-alloc.c index d9619f6..9ab98fe 100644 --- a/ghc/rts/gmp/stack-alloc.c +++ b/ghc/rts/gmp/stack-alloc.c @@ -1,43 +1,63 @@ /* Stack allocation routines. This is intended for machines without support for the `alloca' function. -Copyright (C) 1996 Free Software Foundation, Inc. 
+Copyright (C) 1996, 1997, 1999, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include "stack-alloc.h" +#define __need_size_t +#include +#undef __need_size_t + +/* gmp-impl.h and stack-alloc.h conflict when not USE_STACK_ALLOC, so these + declarations are copied here */ +#if __STDC__ +extern void * (*__gmp_allocate_func) (size_t); +extern void (*__gmp_free_func) (void *, size_t); +#else +extern void * (*__gmp_allocate_func) (); +extern void (*__gmp_free_func) (); +#endif + typedef struct tmp_stack tmp_stack; -void *malloc (); static unsigned long max_total_allocation = 0; static unsigned long current_total_allocation = 0; static tmp_stack xxx = {&xxx, &xxx, 0}; static tmp_stack *current = &xxx; +/* The rounded size of the header of each allocation block. */ +#define HSIZ ((sizeof (tmp_stack) + __TMP_ALIGN - 1) & -__TMP_ALIGN) + /* Allocate a block of exactly bytes. 
This should only be called through the TMP_ALLOC macro, which takes care of rounding/alignment. */ void * -__tmp_alloc (size) +#if __STDC__ +__gmp_tmp_alloc (unsigned long size) +#else +__gmp_tmp_alloc (size) unsigned long size; +#endif { - void *this; + void *that; if (size > (char *) current->end - (char *) current->alloc_point) { @@ -56,43 +76,52 @@ __tmp_alloc (size) /* We need more temporary memory than ever before. Increase for future needs. */ now = now * 3 / 2; - chunk_size = now - current_total_allocation + sizeof (tmp_stack); + chunk_size = now - current_total_allocation + HSIZ; current_total_allocation = now; max_total_allocation = current_total_allocation; } else { - chunk_size = max_total_allocation - current_total_allocation + sizeof (tmp_stack); + chunk_size = max_total_allocation - current_total_allocation + HSIZ; current_total_allocation = max_total_allocation; } - chunk = malloc (chunk_size); - header = chunk; + chunk = (*__gmp_allocate_func) (chunk_size); + header = (tmp_stack *) chunk; header->end = (char *) chunk + chunk_size; - header->alloc_point = (char *) chunk + sizeof (tmp_stack); + header->alloc_point = (char *) chunk + HSIZ; header->prev = current; current = header; } - this = current->alloc_point; - current->alloc_point = (char *) this + size; - return this; + that = current->alloc_point; + current->alloc_point = (char *) that + size; + return that; } -/* Typically called at function entry. is assigned so that __tmp_free - can later be used to reclaim all subsecuently allocated storage. */ +/* Typically called at function entry. is assigned so that + __gmp_tmp_free can later be used to reclaim all subsequently allocated + storage. 
*/ void -__tmp_mark (mark) +#if __STDC__ +__gmp_tmp_mark (tmp_marker *mark) +#else +__gmp_tmp_mark (mark) tmp_marker *mark; +#endif { mark->which_chunk = current; mark->alloc_point = current->alloc_point; } -/* Free everything allocated since was assigned by __tmp_mark */ +/* Free everything allocated since was assigned by __gmp_tmp_mark */ void -__tmp_free (mark) +#if __STDC__ +__gmp_tmp_free (tmp_marker *mark) +#else +__gmp_tmp_free (mark) tmp_marker *mark; +#endif { while (mark->which_chunk != current) { @@ -100,9 +129,8 @@ __tmp_free (mark) tmp = current; current = tmp->prev; - current_total_allocation -= (((char *) (tmp->end) - (char *) tmp) - - sizeof (tmp_stack)); - free (tmp); + current_total_allocation -= (((char *) (tmp->end) - (char *) tmp) - HSIZ); + (*__gmp_free_func) (tmp, (char *) tmp->end - (char *) tmp); } current->alloc_point = mark->alloc_point; } diff --git a/ghc/rts/gmp/stack-alloc.h b/ghc/rts/gmp/stack-alloc.h index a84eeff..f59beec 100644 --- a/ghc/rts/gmp/stack-alloc.h +++ b/ghc/rts/gmp/stack-alloc.h @@ -1,21 +1,21 @@ /* Stack allocation routines. This is intended for machines without support for the `alloca' function. -Copyright (C) 1996 Free Software Foundation, Inc. +Copyright (C) 1996, 2000 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -35,14 +35,22 @@ struct tmp_marker typedef struct tmp_marker tmp_marker; +#if defined (__cplusplus) +extern "C" { +#endif + #if __STDC__ -void *__tmp_alloc (unsigned long); -void __tmp_mark (tmp_marker *); -void __tmp_free (tmp_marker *); +void *__gmp_tmp_alloc (unsigned long); +void __gmp_tmp_mark (tmp_marker *); +void __gmp_tmp_free (tmp_marker *); #else -void *__tmp_alloc (); -void __tmp_mark (); -void __tmp_free (); +void *__gmp_tmp_alloc (); +void __gmp_tmp_mark (); +void __gmp_tmp_free (); +#endif + +#if defined (__cplusplus) +} #endif #ifndef __TMP_ALIGN @@ -51,6 +59,6 @@ void __tmp_free (); #define TMP_DECL(marker) tmp_marker marker #define TMP_ALLOC(size) \ - __tmp_alloc (((unsigned long) (size) + __TMP_ALIGN - 1) & -__TMP_ALIGN) -#define TMP_MARK(marker) __tmp_mark (&marker) -#define TMP_FREE(marker) __tmp_free (&marker) + __gmp_tmp_alloc (((unsigned long) (size) + __TMP_ALIGN - 1) & -__TMP_ALIGN) +#define TMP_MARK(marker) __gmp_tmp_mark (&marker) +#define TMP_FREE(marker) __gmp_tmp_free (&marker) diff --git a/ghc/rts/gmp/stamp-vti b/ghc/rts/gmp/stamp-vti new file mode 100644 index 0000000..5ac4cb8 --- /dev/null +++ b/ghc/rts/gmp/stamp-vti @@ -0,0 +1,3 @@ +@set UPDATED 3 August 2000 +@set EDITION 3.1 +@set VERSION 3.1 diff --git a/ghc/rts/gmp/version.c b/ghc/rts/gmp/version.c index 7050239..9d544ee 100644 --- a/ghc/rts/gmp/version.c +++ b/ghc/rts/gmp/version.c @@ -1 +1,26 @@ -static char *gmp_version = "2.0.1"; +/* gmp_version -- version number compiled into the library */ + +/* +Copyright (C) 1996, 1999, 2000 Free Software Foundation, Inc. 
+ +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +const char *gmp_version = VERSION; diff --git a/ghc/rts/gmp/version.texi b/ghc/rts/gmp/version.texi new file mode 100644 index 0000000..5ac4cb8 --- /dev/null +++ b/ghc/rts/gmp/version.texi @@ -0,0 +1,3 @@ +@set UPDATED 3 August 2000 +@set EDITION 3.1 +@set VERSION 3.1 -- 1.7.10.4