Do not link ghc stage1 using -threaded, only for stage2 or 3

[ghc-hetmet.git] / rts / PrimOps.cmm
diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm

index f2ce415..121102c 100644 (file)
--- a/rts/PrimOps.cmm
+++ b/rts/PrimOps.cmm
@@ -52,6 +52,7 @@ import pthread_mutex_unlock;
  import base_ControlziExceptionziBase_nestedAtomically_closure;
  import EnterCriticalSection;
  import LeaveCriticalSection;
+import ghczmprim_GHCziBool_False_closure;
  
  /*-----------------------------------------------------------------------------
    Array Primitives
@@ -82,32 +83,65 @@ newByteArrayzh_fast
      RET_P(p);
  }
  
+#define BA_ALIGN 16
+#define BA_MASK  (BA_ALIGN-1)
+
  newPinnedByteArrayzh_fast
  {
+    /* Allocate a byte array of R1 bytes via allocatePinned(), arrange
+       for its payload to start on a BA_ALIGN-byte boundary, and return
+       the array with RET_P. */
-    W_ words, payload_words, n, p;
+    W_ words, bytes, payload_words, p;
  
      MAYBE_GC(NO_PTRS,newPinnedByteArrayzh_fast);
-    n = R1;
-    payload_words = ROUNDUP_BYTES_TO_WDS(n);
+    bytes = R1;
+    /* payload_words is what we will tell the profiler we had to allocate */
+    payload_words = ROUNDUP_BYTES_TO_WDS(bytes);
+    /* When we actually allocate memory, we need to allow space for the
+       header: */
+    bytes = bytes + SIZEOF_StgArrWords;
+    /* And we want to align to BA_ALIGN bytes, so we need to allow space
+       to shift up to BA_ALIGN - 1 bytes: */
+    bytes = bytes + BA_ALIGN - 1;
+    /* Now we convert to a number of words: */
+    words = ROUNDUP_BYTES_TO_WDS(bytes);
  
-    // We want an 8-byte aligned array.  allocatePinned() gives us
-    // 8-byte aligned memory by default, but we want to align the
-    // *goods* inside the ArrWords object, so we have to check the
-    // size of the ArrWords header and adjust our size accordingly.
-    words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
-    if ((SIZEOF_StgArrWords & 7) != 0) {
-       words = words + 1;
-    }
+    ("ptr" p) = foreign "C" allocatePinned(words) [];
+    TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
+
+    /* Now we need to move p forward so that the payload is aligned
+       to BA_ALIGN bytes: */
+    /* (-p - SIZEOF_StgArrWords) & BA_MASK is the smallest shift d such
+       that p + d + SIZEOF_StgArrWords is a multiple of BA_ALIGN; this
+       relies on BA_ALIGN being a power of 2. */
+    p = p + ((-p - SIZEOF_StgArrWords) & BA_MASK);
+
+    SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
+    StgArrWords_words(p) = payload_words;
+    RET_P(p);
+}
+
+newAlignedPinnedByteArrayzh_fast
+{
+    W_ words, bytes, payload_words, p, alignment;
+
+    MAYBE_GC(NO_PTRS,newAlignedPinnedByteArrayzh_fast);
+    bytes = R1;
+    alignment = R2;
+
+    /* payload_words is what we will tell the profiler we had to allocate */
+    payload_words = ROUNDUP_BYTES_TO_WDS(bytes);
+
+    /* When we actually allocate memory, we need to allow space for the
+       header: */
+    bytes = bytes + SIZEOF_StgArrWords;
+    /* And we want to align to <alignment> bytes, so we need to allow space
+       to shift up to <alignment - 1> bytes: */
+    bytes = bytes + alignment - 1;
+    /* Now we convert to a number of words: */
+    words = ROUNDUP_BYTES_TO_WDS(bytes);
  
      ("ptr" p) = foreign "C" allocatePinned(words) [];
      TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
  
-    // Again, if the ArrWords header isn't a multiple of 8 bytes, we
-    // have to push the object forward one word so that the goods
-    // fall on an 8-byte boundary.
-    if ((SIZEOF_StgArrWords & 7) != 0) {
-       p = p + WDS(1);
-    }
+    /* Now we need to move p forward so that the payload is aligned
+       to <alignment> bytes. Note that we are assuming that
+       <alignment> is a power of 2, which is technically not guaranteed */
+    p = p + ((-p - SIZEOF_StgArrWords) & (alignment - 1));
  
      SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
      StgArrWords_words(p) = payload_words;
@@ -193,7 +227,7 @@ newMutVarzh_fast
  
  atomicModifyMutVarzh_fast
  {
-    W_ mv, z, x, y, r;
+    W_ mv, f, z, x, y, r, h;
      /* Args: R1 :: MutVar#,  R2 :: a -> (a,b) */
  
      /* If x is the current contents of the MutVar#, then 
@@ -232,19 +266,15 @@ atomicModifyMutVarzh_fast
  
     HP_CHK_GEN_TICKY(SIZE, R1_PTR & R2_PTR, atomicModifyMutVarzh_fast);
  
-#if defined(THREADED_RTS)
-    ACQUIRE_LOCK(atomic_modify_mutvar_mutex "ptr") [R1,R2];
-#endif
-
-   x = StgMutVar_var(R1);
+   mv = R1;
+   f = R2;
  
     TICK_ALLOC_THUNK_2();
     CCCS_ALLOC(THUNK_2_SIZE);
     z = Hp - THUNK_2_SIZE + WDS(1);
     SET_HDR(z, stg_ap_2_upd_info, W_[CCCS]);
     LDV_RECORD_CREATE(z);
-   StgThunk_payload(z,0) = R2;
-   StgThunk_payload(z,1) = x;
+   StgThunk_payload(z,0) = f;
  
     TICK_ALLOC_THUNK_1();
     CCCS_ALLOC(THUNK_1_SIZE);
@@ -253,9 +283,6 @@ atomicModifyMutVarzh_fast
     LDV_RECORD_CREATE(y);
     StgThunk_payload(y,0) = z;
  
-   StgMutVar_var(R1) = y;
-   foreign "C" dirty_MUT_VAR(BaseReg "ptr", R1 "ptr") [R1];
-
     TICK_ALLOC_THUNK_1();
     CCCS_ALLOC(THUNK_1_SIZE);
     r = y - THUNK_1_SIZE;
@@ -263,10 +290,20 @@ atomicModifyMutVarzh_fast
     LDV_RECORD_CREATE(r);
     StgThunk_payload(r,0) = z;
  
-#if defined(THREADED_RTS)
-    RELEASE_LOCK(atomic_modify_mutvar_mutex "ptr") [];
+ retry:
+   x = StgMutVar_var(mv);
+   StgThunk_payload(z,1) = x;
+#ifdef THREADED_RTS
+   (h) = foreign "C" cas(mv + SIZEOF_StgHeader + OFFSET_StgMutVar_var, x, y) [];
+   if (h != x) { goto retry; }
+#else
+   StgMutVar_var(mv) = y;
  #endif
  
+   if (GET_INFO(mv) == stg_MUT_VAR_CLEAN_info) {
+     foreign "C" dirty_MUT_VAR(BaseReg "ptr", mv "ptr") [];
+   }
+
     RET_P(r);
  }
  
@@ -293,9 +330,14 @@ mkWeakzh_fast
    w = Hp - SIZEOF_StgWeak + WDS(1);
    SET_HDR(w, stg_WEAK_info, W_[CCCS]);
  
-  StgWeak_key(w)       = R1;
-  StgWeak_value(w)     = R2;
-  StgWeak_finalizer(w) = R3;
+  // We don't care about cfinalizer here.
+  // Should StgWeak_cfinalizer(w) be stg_NO_FINALIZER_closure or
+  // something else?
+
+  StgWeak_key(w)        = R1;
+  StgWeak_value(w)      = R2;
+  StgWeak_finalizer(w)  = R3;
+  StgWeak_cfinalizer(w) = stg_NO_FINALIZER_closure;
  
    StgWeak_link(w)      = W_[weak_ptr_list];
    W_[weak_ptr_list]    = w;
@@ -305,12 +347,65 @@ mkWeakzh_fast
    RET_P(w);
  }
  
+mkWeakForeignEnvzh_fast
+{
+  /* Create a weak pointer whose finalizer is a C function rather than a
+     Haskell closure.
+
+     R1 = key
+     R2 = value
+     R3 = finalizer (C function pointer)
+     R4 = pointer
+     R5 = has environment (0 or 1)
+     R6 = environment
+
+     The four C-finalizer words (R3..R6) are stored as raw data in a
+     freshly allocated ARR_WORDS object, which is hung off the weak
+     pointer's cfinalizer field.  Returns the new weak pointer.
+  */
+  W_ w, payload_words, words, p;
+
+  W_ key, val, fptr, ptr, flag, eptr;
+
+  key  = R1;
+  val  = R2;
+  fptr = R3;
+  ptr  = R4;
+  flag = R5;
+  eptr = R6;
+
+  /* Only R1 (key) and R2 (value) are heap closures.  R3 is a C function
+     pointer that is only ever stored as raw data below, so it must not
+     be included in the pointer liveness mask. */
+  ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR, mkWeakForeignEnvzh_fast );
+
+  w = Hp - SIZEOF_StgWeak + WDS(1);
+  SET_HDR(w, stg_WEAK_info, W_[CCCS]);
+
+  /* Four payload words: fptr, ptr, eptr, flag (in that order). */
+  payload_words = 4;
+  words         = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
+  ("ptr" p)     = foreign "C" allocateLocal(MyCapability() "ptr", words) [];
+
+  TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
+  SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
+
+  StgArrWords_words(p)     = payload_words;
+  StgArrWords_payload(p,0) = fptr;
+  StgArrWords_payload(p,1) = ptr;
+  StgArrWords_payload(p,2) = eptr;
+  StgArrWords_payload(p,3) = flag;
+
+  // This weak pointer carries only a C finalizer, so the Haskell
+  // finalizer slot is filled with stg_NO_FINALIZER_closure.
+
+  StgWeak_key(w)        = key;
+  StgWeak_value(w)      = val;
+  StgWeak_finalizer(w)  = stg_NO_FINALIZER_closure;
+  StgWeak_cfinalizer(w) = p;
+
+  /* Push the new weak pointer onto the front of weak_ptr_list. */
+  StgWeak_link(w)   = W_[weak_ptr_list];
+  W_[weak_ptr_list] = w;
+
+  IF_DEBUG(weak, foreign "C" debugBelch(stg_weak_msg,w) []);
+
+  RET_P(w);
+}
  
  finalizzeWeakzh_fast
  {
    /* R1 = weak ptr
     */
-  W_ w, f;
+  W_ w, f, arr;
  
    w = R1;
  
@@ -338,9 +433,18 @@ finalizzeWeakzh_fast
    SET_INFO(w,stg_DEAD_WEAK_info);
    LDV_RECORD_CREATE(w);
  
-  f = StgWeak_finalizer(w);
+  f   = StgWeak_finalizer(w);
+  arr = StgWeak_cfinalizer(w);
+
    StgDeadWeak_link(w) = StgWeak_link(w);
  
+  if (arr != stg_NO_FINALIZER_closure) {
+    foreign "C" runCFinalizer(StgArrWords_payload(arr,0),
+                              StgArrWords_payload(arr,1),
+                              StgArrWords_payload(arr,2),
+                              StgArrWords_payload(arr,3)) [];
+  }
+
    /* return the finalizer */
    if (f == stg_NO_FINALIZER_closure) {
        RET_NP(0,stg_NO_FINALIZER_closure);
@@ -969,8 +1073,9 @@ forkzh_fast
  
    foreign "C" scheduleThread(MyCapability() "ptr", threadid "ptr") [];
  
-  // switch at the earliest opportunity
-  CInt[context_switch] = 1 :: CInt;
+  // context switch soon, but not immediately: we don't want every
+  // forkIO to force a context-switch.
+  Capability_context_switch(MyCapability()) = 1 :: CInt;
    
    RET_P(threadid);
  }
@@ -998,8 +1103,9 @@ forkOnzh_fast
  
    foreign "C" scheduleThreadOn(MyCapability() "ptr", cpu, threadid "ptr") [];
  
-  // switch at the earliest opportunity
-  CInt[context_switch] = 1 :: CInt;
+  // context switch soon, but not immediately: we don't want every
+  // forkIO to force a context-switch.
+  Capability_context_switch(MyCapability()) = 1 :: CInt;
    
    RET_P(threadid);
  }
@@ -1080,7 +1186,7 @@ INFO_TABLE_RET(stg_catch_retry_frame, CATCH_RETRY_FRAME,
  #if defined(PROFILING)
    W_ unused1, W_ unused2,
  #endif
-  W_ unused3, "ptr" W_ unused4, "ptr" W_ unused5)
+  W_ unused3, P_ unused4, P_ unused5)
  {
     W_ r, frame, trec, outer;
  
@@ -1114,7 +1220,7 @@ INFO_TABLE_RET(stg_atomically_frame, ATOMICALLY_FRAME,
  #if defined(PROFILING)
    W_ unused1, W_ unused2,
  #endif
-  "ptr" W_ unused3, "ptr" W_ unused4)
+  P_ unused3, P_ unused4)
  {
    W_ frame, trec, valid, next_invariant, q, outer;
  
@@ -1176,7 +1282,7 @@ INFO_TABLE_RET(stg_atomically_waiting_frame, ATOMICALLY_FRAME,
  #if defined(PROFILING)
    W_ unused1, W_ unused2,
  #endif
-  "ptr" W_ unused3, "ptr" W_ unused4)
+  P_ unused3, P_ unused4)
  {
    W_ frame, trec, valid;
  
@@ -1210,7 +1316,7 @@ INFO_TABLE_RET(stg_catch_stm_frame, CATCH_STM_FRAME,
  #if defined(PROFILING)
    W_ unused1, W_ unused2,
  #endif
-  "ptr" W_ unused3, "ptr" W_ unused4)
+  P_ unused3, P_ unused4)
     {
        W_ r, frame, trec, outer;
        frame = Sp;
@@ -1452,6 +1558,17 @@ readTVarzh_fast
    RET_P(result);
  }
  
+readTVarIOzh_fast
+{
+    W_ current;
+
+    /* Keep re-reading the TVar in R1 until the value we see does not
+       carry the TRec header info pointer, then return that value. */
+reread:
+    current = StgTVar_current_value(R1);
+    if (%INFO_PTR(current) != stg_TREC_HEADER_info) {
+        RET_P(current);
+    }
+    goto reread;
+}
  
  writeTVarzh_fast
  {
@@ -1569,8 +1686,11 @@ takeMVarzh_fast
                                     CurrentTSO) [];
         }
         StgTSO__link(CurrentTSO)       = stg_END_TSO_QUEUE_closure;
-       StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
         StgTSO_block_info(CurrentTSO)  = mvar;
+        // write barrier for throwTo(), which looks at block_info
+        // if why_blocked==BlockedOnMVar.
+        prim %write_barrier() [];
+       StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
         StgMVar_tail(mvar) = CurrentTSO;
         
          R1 = mvar;
@@ -1726,8 +1846,11 @@ putMVarzh_fast
                                     CurrentTSO) [];
         }
         StgTSO__link(CurrentTSO)       = stg_END_TSO_QUEUE_closure;
-       StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
         StgTSO_block_info(CurrentTSO)  = mvar;
+        // write barrier for throwTo(), which looks at block_info
+        // if why_blocked==BlockedOnMVar.
+        prim %write_barrier() [];
+       StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
         StgMVar_tail(mvar) = CurrentTSO;
         
          R1 = mvar;
@@ -2273,3 +2396,39 @@ getApStackValzh_fast
     }
     RET_NP(ok,val);
  }
+
+/* -----------------------------------------------------------------------------
+   Misc. primitives
+   -------------------------------------------------------------------------- */
+
+// Print the cost center stack recorded in the header of the first
+// argument (R1) on stderr, then enter the second argument (R2).  The
+// printing step exists only in PROFILING builds.  It is unclear whether
+// this is meaningful for things that are not already evaluated.
+traceCcszh_fast
+{
+    W_ stack;
+
+#if defined(PROFILING)
+    stack = StgHeader_ccs(UNTAG(R1));
+    foreign "C" fprintCCS_stderr(stack "ptr") [R2];
+#endif
+
+    R1 = R2;
+    ENTER();
+}
+
+getSparkzh_fast
+{
+   /* Ask findSpark() for a spark on the current capability.  Returns
+      (1, spark) when one is found, otherwise (0, False).  Without
+      THREADED_RTS the answer is always (0, False). */
+   W_ found;
+
+#if defined(THREADED_RTS)
+   (found) = foreign "C" findSpark(MyCapability());
+   if (found == 0) {
+      RET_NP(0,ghczmprim_GHCziBool_False_closure);
+   } else {
+      RET_NP(1,found);
+   }
+#else
+   RET_NP(0,ghczmprim_GHCziBool_False_closure);
+#endif
+}