X-Git-Url: http://git.megacz.com/?a=blobdiff_plain;f=rts%2FPrimOps.cmm;h=9ebe8fb2c8a902a07f85205ce7b6a8afc97aa710;hb=96fff1ba94bd3a21da3fa797816bd6e82e4148ba;hp=dc60ff2457ec14267485584ca7886be6f35686ff;hpb=8a317b223ac681f99e75f9a4a89380c92d283049;p=ghc-hetmet.git diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm index dc60ff2..9ebe8fb 100644 --- a/rts/PrimOps.cmm +++ b/rts/PrimOps.cmm @@ -83,32 +83,65 @@ newByteArrayzh_fast RET_P(p); } +#define BA_ALIGN 16 +#define BA_MASK (BA_ALIGN-1) + newPinnedByteArrayzh_fast { - W_ words, payload_words, n, p; + W_ words, bytes, payload_words, p; MAYBE_GC(NO_PTRS,newPinnedByteArrayzh_fast); - n = R1; - payload_words = ROUNDUP_BYTES_TO_WDS(n); + bytes = R1; + /* payload_words is what we will tell the profiler we had to allocate */ + payload_words = ROUNDUP_BYTES_TO_WDS(bytes); + /* When we actually allocate memory, we need to allow space for the + header: */ + bytes = bytes + SIZEOF_StgArrWords; + /* And we want to align to BA_ALIGN bytes, so we need to allow space + to shift up to BA_ALIGN - 1 bytes: */ + bytes = bytes + BA_ALIGN - 1; + /* Now we convert to a number of words: */ + words = ROUNDUP_BYTES_TO_WDS(bytes); - // We want an 8-byte aligned array. allocatePinned() gives us - // 8-byte aligned memory by default, but we want to align the - // *goods* inside the ArrWords object, so we have to check the - // size of the ArrWords header and adjust our size accordingly. - words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words; - if ((SIZEOF_StgArrWords & 7) != 0) { - words = words + 1; - } + ("ptr" p) = foreign "C" allocatePinned(words) []; + TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0); + + /* Now we need to move p forward so that the payload is aligned + to BA_ALIGN bytes: */ + p = p + ((-p - SIZEOF_StgArrWords) & BA_MASK); + + SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]); + StgArrWords_words(p) = payload_words; + RET_P(p); +} + +newAlignedPinnedByteArrayzh_fast +{ + W_ words, bytes, payload_words, p, alignment; + + MAYBE_GC(NO_PTRS,newAlignedPinnedByteArrayzh_fast); + bytes = R1; + alignment = R2; + + /* payload_words is what we will tell the profiler we had to allocate */ + payload_words = ROUNDUP_BYTES_TO_WDS(bytes); + + /* When we actually allocate memory, we need to allow space for the + header: */ + bytes = bytes + SIZEOF_StgArrWords; + /* And we want to align to bytes, so we need to allow space + to shift up to bytes: */ + bytes = bytes + alignment - 1; + /* Now we convert to a number of words: */ + words = ROUNDUP_BYTES_TO_WDS(bytes); ("ptr" p) = foreign "C" allocatePinned(words) []; TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0); - // Again, if the ArrWords header isn't a multiple of 8 bytes, we - // have to push the object forward one word so that the goods - // fall on an 8-byte boundary. - if ((SIZEOF_StgArrWords & 7) != 0) { - p = p + WDS(1); - } + /* Now we need to move p forward so that the payload is aligned + to bytes. Note that we are assuming that + is a power of 2, which is technically not guaranteed */ + p = p + ((-p - SIZEOF_StgArrWords) & (alignment - 1)); SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]); StgArrWords_words(p) = payload_words; @@ -334,7 +367,7 @@ mkWeakForeignEnvzh_fast flag = R5; eptr = R6; - ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR & R3_PTR, mkWeakForeignEnvzh_fast ); + ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR, mkWeakForeignEnvzh_fast ); w = Hp - SIZEOF_StgWeak + WDS(1); SET_HDR(w, stg_WEAK_info, W_[CCCS]); @@ -611,60 +644,32 @@ word64ToIntegerzh_fast RET_NP(s,p); } - - #endif /* SUPPORT_LONG_LONGS */ -/* ToDo: this is shockingly inefficient */ - -#ifndef THREADED_RTS -section "bss" { - mp_tmp1: - bits8 [SIZEOF_MP_INT]; -} - -section "bss" { - mp_tmp2: - bits8 [SIZEOF_MP_INT]; -} - -section "bss" { - mp_result1: - bits8 [SIZEOF_MP_INT]; -} - -section "bss" { - mp_result2: - bits8 [SIZEOF_MP_INT]; -} -#endif - -#ifdef THREADED_RTS -#define FETCH_MP_TEMP(X) \ -W_ X; \ -X = BaseReg + (OFFSET_StgRegTable_r ## X); -#else -#define FETCH_MP_TEMP(X) /* Nothing */ -#endif - #define GMP_TAKE2_RET1(name,mp_fun) \ name \ { \ CInt s1, s2; \ W_ d1, d2; \ - FETCH_MP_TEMP(mp_tmp1); \ - FETCH_MP_TEMP(mp_tmp2); \ - FETCH_MP_TEMP(mp_result1) \ - FETCH_MP_TEMP(mp_result2); \ + W_ mp_tmp1; \ + W_ mp_tmp2; \ + W_ mp_result1; \ + W_ mp_result2; \ \ /* call doYouWantToGC() */ \ MAYBE_GC(R2_PTR & R4_PTR, name); \ \ + STK_CHK_GEN( 4 * SIZEOF_MP_INT, R2_PTR & R4_PTR, name ); \ + \ s1 = W_TO_INT(R1); \ d1 = R2; \ s2 = W_TO_INT(R3); \ d2 = R4; \ \ + mp_tmp1 = Sp - 1 * SIZEOF_MP_INT; \ + mp_tmp2 = Sp - 2 * SIZEOF_MP_INT; \ + mp_result1 = Sp - 3 * SIZEOF_MP_INT; \ + mp_result2 = Sp - 4 * SIZEOF_MP_INT; \ MP_INT__mp_alloc(mp_tmp1) = W_TO_INT(StgArrWords_words(d1)); \ MP_INT__mp_size(mp_tmp1) = (s1); \ MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(d1); \ @@ -686,15 +691,19 @@ name \ { \ CInt s1; \ W_ d1; \ - FETCH_MP_TEMP(mp_tmp1); \ - FETCH_MP_TEMP(mp_result1) \ + W_ mp_tmp1; \ + W_ mp_result1; \ \ /* call doYouWantToGC() */ \ MAYBE_GC(R2_PTR, name); \ \ + STK_CHK_GEN( 2 * SIZEOF_MP_INT, R2_PTR, name ); \ + \ d1 = R2; \ s1 = W_TO_INT(R1); \ \ + mp_tmp1 = Sp - 1 * SIZEOF_MP_INT; \ + mp_result1 = Sp - 2 * SIZEOF_MP_INT; \ MP_INT__mp_alloc(mp_tmp1) = W_TO_INT(StgArrWords_words(d1)); \ MP_INT__mp_size(mp_tmp1) = (s1); \ MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(d1); \ @@ -713,19 +722,25 @@ name { \ CInt s1, s2; \ W_ d1, d2; \ - FETCH_MP_TEMP(mp_tmp1); \ - FETCH_MP_TEMP(mp_tmp2); \ - FETCH_MP_TEMP(mp_result1) \ - FETCH_MP_TEMP(mp_result2) \ + W_ mp_tmp1; \ + W_ mp_tmp2; \ + W_ mp_result1; \ + W_ mp_result2; \ \ /* call doYouWantToGC() */ \ MAYBE_GC(R2_PTR & R4_PTR, name); \ \ + STK_CHK_GEN( 4 * SIZEOF_MP_INT, R2_PTR & R4_PTR, name ); \ + \ s1 = W_TO_INT(R1); \ d1 = R2; \ s2 = W_TO_INT(R3); \ d2 = R4; \ \ + mp_tmp1 = Sp - 1 * SIZEOF_MP_INT; \ + mp_tmp2 = Sp - 2 * SIZEOF_MP_INT; \ + mp_result1 = Sp - 3 * SIZEOF_MP_INT; \ + mp_result2 = Sp - 4 * SIZEOF_MP_INT; \ MP_INT__mp_alloc(mp_tmp1) = W_TO_INT(StgArrWords_words(d1)); \ MP_INT__mp_size(mp_tmp1) = (s1); \ MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(d1); \ @@ -760,17 +775,15 @@ GMP_TAKE1_RET1(complementIntegerzh_fast, __gmpz_com) GMP_TAKE2_RET2(quotRemIntegerzh_fast, __gmpz_tdiv_qr) GMP_TAKE2_RET2(divModIntegerzh_fast, __gmpz_fdiv_qr) -#ifndef THREADED_RTS -section "bss" { - mp_tmp_w: W_; // NB. mp_tmp_w is really an here mp_limb_t -} -#endif - gcdIntzh_fast { /* R1 = the first Int#; R2 = the second Int# */ W_ r; - FETCH_MP_TEMP(mp_tmp_w); + W_ mp_tmp_w; + + STK_CHK_GEN( 1 * SIZEOF_MP_INT, NO_PTRS, gcdIntzh_fast ); + + mp_tmp_w = Sp - 1 * SIZEOF_MP_INT; W_[mp_tmp_w] = R1; (r) = foreign "C" __gmpn_gcd_1(mp_tmp_w "ptr", 1, R2) []; @@ -921,38 +934,17 @@ integer2Wordzh_fast jump %ENTRY_CODE(Sp(0)); } -decodeFloatzh_fast -{ - W_ p; - F_ arg; - FETCH_MP_TEMP(mp_tmp1); - FETCH_MP_TEMP(mp_tmp_w); - - /* arguments: F1 = Float# */ - arg = F1; - - ALLOC_PRIM( SIZEOF_StgArrWords + WDS(1), NO_PTRS, decodeFloatzh_fast ); - - /* Be prepared to tell Lennart-coded __decodeFloat - where mantissa._mp_d can be put (it does not care about the rest) */ - p = Hp - SIZEOF_StgArrWords; - SET_HDR(p,stg_ARR_WORDS_info,W_[CCCS]); - StgArrWords_words(p) = 1; - MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(p); - - /* Perform the operation */ - foreign "C" __decodeFloat(mp_tmp1 "ptr",mp_tmp_w "ptr" ,arg) []; - - /* returns: (Int# (expn), Int#, ByteArray#) */ - RET_NNP(W_[mp_tmp_w], TO_W_(MP_INT__mp_size(mp_tmp1)), p); -} - decodeFloatzuIntzh_fast { W_ p; F_ arg; - FETCH_MP_TEMP(mp_tmp1); - FETCH_MP_TEMP(mp_tmp_w); + W_ mp_tmp1; + W_ mp_tmp_w; + + STK_CHK_GEN( WDS(2), NO_PTRS, decodeFloatzuIntzh_fast ); + + mp_tmp1 = Sp - WDS(1); + mp_tmp_w = Sp - WDS(2); /* arguments: F1 = Float# */ arg = F1; @@ -971,8 +963,13 @@ decodeDoublezh_fast { D_ arg; W_ p; - FETCH_MP_TEMP(mp_tmp1); - FETCH_MP_TEMP(mp_tmp_w); + W_ mp_tmp1; + W_ mp_tmp_w; + + STK_CHK_GEN( 2 * SIZEOF_MP_INT, NO_PTRS, decodeDoublezh_fast ); + + mp_tmp1 = Sp - 1 * SIZEOF_MP_INT; + mp_tmp_w = Sp - 2 * SIZEOF_MP_INT; /* arguments: D1 = Double# */ arg = D1; @@ -997,10 +994,17 @@ decodeDoublezu2Intzh_fast { D_ arg; W_ p; - FETCH_MP_TEMP(mp_tmp1); - FETCH_MP_TEMP(mp_tmp2); - FETCH_MP_TEMP(mp_result1); - FETCH_MP_TEMP(mp_result2); + W_ mp_tmp1; + W_ mp_tmp2; + W_ mp_result1; + W_ mp_result2; + + STK_CHK_GEN( WDS(4), NO_PTRS, decodeDoublezu2Intzh_fast ); + + mp_tmp1 = Sp - WDS(1); + mp_tmp2 = Sp - WDS(2); + mp_result1 = Sp - WDS(3); + mp_result2 = Sp - WDS(4); /* arguments: D1 = Double# */ arg = D1; @@ -1040,7 +1044,8 @@ forkzh_fast foreign "C" scheduleThread(MyCapability() "ptr", threadid "ptr") []; - // switch at the earliest opportunity + // context switch soon, but not immediately: we don't want every + // forkIO to force a context-switch. Capability_context_switch(MyCapability()) = 1 :: CInt; RET_P(threadid); @@ -1069,7 +1074,8 @@ forkOnzh_fast foreign "C" scheduleThreadOn(MyCapability() "ptr", cpu, threadid "ptr") []; - // switch at the earliest opportunity + // context switch soon, but not immediately: we don't want every + // forkIO to force a context-switch. Capability_context_switch(MyCapability()) = 1 :: CInt; RET_P(threadid); @@ -2362,6 +2368,26 @@ getApStackValzh_fast RET_NP(ok,val); } +/* ----------------------------------------------------------------------------- + Misc. primitives + -------------------------------------------------------------------------- */ + +// Write the cost center stack of the first argument on stderr; return +// the second. Possibly only makes sense for already evaluated +// things? +traceCcszh_fast +{ + W_ ccs; + +#ifdef PROFILING + ccs = StgHeader_ccs(UNTAG(R1)); + foreign "C" fprintCCS_stderr(ccs "ptr") [R2]; +#endif + + R1 = R2; + ENTER(); +} + getSparkzh_fast { W_ spark;