re-instate counting of sparks converted
[ghc-hetmet.git] / rts / PrimOps.cmm
1 /* -----------------------------------------------------------------------------
2  *
3  * (c) The GHC Team, 1998-2004
4  *
5  * Out-of-line primitive operations
6  *
7  * This file contains the implementations of all the primitive
8  * operations ("primops") which are not expanded inline.  See
9  * ghc/compiler/prelude/primops.txt.pp for a list of all the primops;
10  * this file contains code for most of those with the attribute
11  * out_of_line=True.
12  *
13  * Entry convention: the entry convention for a primop is that all the
14  * args are in Stg registers (R1, R2, etc.).  This is to make writing
15  * the primops easier.  (see compiler/codeGen/CgCallConv.hs).
16  *
17  * Return convention: results from a primop are generally returned
18  * using the ordinary unboxed tuple return convention.  The C-- parser
19  * implements the RET_xxxx() macros to perform unboxed-tuple returns
20  * based on the prevailing return convention.
21  *
22  * This file is written in a subset of C--, extended with various
23  * features specific to GHC.  It is compiled by GHC directly.  For the
24  * syntax of .cmm files, see the parser in ghc/compiler/cmm/CmmParse.y.
25  *
26  * ---------------------------------------------------------------------------*/
27
28 #include "Cmm.h"
29
30 #ifdef __PIC__
31 #ifndef mingw32_HOST_OS
32 import __gmpz_init;
33 import __gmpz_add;
34 import __gmpz_sub;
35 import __gmpz_mul;
36 import __gmpz_gcd;
37 import __gmpn_gcd_1;
38 import __gmpn_cmp;
39 import __gmpz_tdiv_q;
40 import __gmpz_tdiv_r;
41 import __gmpz_tdiv_qr;
42 import __gmpz_fdiv_qr;
43 import __gmpz_divexact;
44 import __gmpz_and;
45 import __gmpz_xor;
46 import __gmpz_ior;
47 import __gmpz_com;
48 #endif
49 import pthread_mutex_lock;
50 import pthread_mutex_unlock;
51 #endif
52 import base_ControlziExceptionziBase_nestedAtomically_closure;
53 import EnterCriticalSection;
54 import LeaveCriticalSection;
55
56 /*-----------------------------------------------------------------------------
57   Array Primitives
58
59   Basically just new*Array - the others are all inline macros.
60
61   The size arg is always passed in R1, and the result returned in R1.
62
63   The slow entry point is for returning from a heap check, the saved
64   size argument must be re-loaded from the stack.
65   -------------------------------------------------------------------------- */
66
67 /* for objects that are *less* than the size of a word, make sure we
68  * round up to the nearest word for the size of the array.
69  */
70
/* newByteArray# :: Int# -> State# s -> (# State# s, MutableByteArray# s #)
 * args: R1 = size in bytes; returns the new array in R1.
 */
newByteArrayzh_fast
{
    W_ words, payload_words, n, p;
    MAYBE_GC(NO_PTRS,newByteArrayzh_fast);
    n = R1;
    /* round the byte count up to a whole number of words */
    payload_words = ROUNDUP_BYTES_TO_WDS(n);
    words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
    ("ptr" p) = foreign "C" allocateLocal(MyCapability() "ptr",words) [];
    TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
    SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
    StgArrWords_words(p) = payload_words;
    RET_P(p);
}
84
/* newPinnedByteArray# :: Int# -> State# s -> (# State# s, MutableByteArray# s #)
 * args: R1 = size in bytes.  The array is allocated pinned (never moved
 * by the GC) and its payload is 8-byte aligned.
 */
newPinnedByteArrayzh_fast
{
    W_ words, payload_words, n, p;

    MAYBE_GC(NO_PTRS,newPinnedByteArrayzh_fast);
    n = R1;
    payload_words = ROUNDUP_BYTES_TO_WDS(n);

    // We want an 8-byte aligned array.  allocatePinned() gives us
    // 8-byte aligned memory by default, but we want to align the
    // *goods* inside the ArrWords object, so we have to check the
    // size of the ArrWords header and adjust our size accordingly.
    words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
    if ((SIZEOF_StgArrWords & 7) != 0) {
        // header is an odd number of words: reserve one extra word
        words = words + 1;
    }

    ("ptr" p) = foreign "C" allocatePinned(words) [];
    TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);

    // Again, if the ArrWords header isn't a multiple of 8 bytes, we
    // have to push the object forward one word so that the goods
    // fall on an 8-byte boundary.
    if ((SIZEOF_StgArrWords & 7) != 0) {
        p = p + WDS(1);
    }

    SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
    StgArrWords_words(p) = payload_words;
    RET_P(p);
}
116
/* newArray# :: Int# -> a -> State# s -> (# State# s, MutableArray# s a #)
 * Allocates a mutable pointer array and fills every slot with the
 * initialisation value.
 */
newArrayzh_fast
{
    W_ words, n, init, arr, p;
    /* Args: R1 = words, R2 = initialisation value */

    n = R1;
    MAYBE_GC(R2_PTR,newArrayzh_fast);

    words = BYTES_TO_WDS(SIZEOF_StgMutArrPtrs) + n;
    ("ptr" arr) = foreign "C" allocateLocal(MyCapability() "ptr",words) [R2];
    TICK_ALLOC_PRIM(SIZEOF_StgMutArrPtrs, WDS(n), 0);

    // DIRTY: a freshly allocated mutable array starts on the mut_list
    SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, W_[CCCS]);
    StgMutArrPtrs_ptrs(arr) = n;

    // Initialise all elements of the array with the value in R2
    init = R2;
    p = arr + SIZEOF_StgMutArrPtrs;
  for:
    if (p < arr + WDS(words)) {
        W_[p] = init;
        p = p + WDS(1);
        goto for;
    }

    RET_P(arr);
}
144
/* unsafeThawArray# :: Array# a -> State# s -> (# State# s, MutableArray# s a #)
 * args: R1 = the frozen array.  Converts in place; returns R1.
 */
unsafeThawArrayzh_fast
{
  // SUBTLETY TO DO WITH THE OLD GEN MUTABLE LIST
  //
  // A MUT_ARR_PTRS lives on the mutable list, but a MUT_ARR_PTRS_FROZEN 
  // normally doesn't.  However, when we freeze a MUT_ARR_PTRS, we leave
  // it on the mutable list for the GC to remove (removing something from
  // the mutable list is not easy, because the mut_list is only singly-linked).
  // 
  // So that we can tell whether a MUT_ARR_PTRS_FROZEN is on the mutable list,
  // when we freeze it we set the info ptr to be MUT_ARR_PTRS_FROZEN0
  // to indicate that it is still on the mutable list.
  //
  // So, when we thaw a MUT_ARR_PTRS_FROZEN, we must cope with two cases:
  // either it is on a mut_list, or it isn't.  We adopt the convention that
  // the closure type is MUT_ARR_PTRS_FROZEN0 if it is on the mutable list,
  // and MUT_ARR_PTRS_FROZEN otherwise.  In fact it wouldn't matter if
  // we put it on the mutable list more than once, but it would get scavenged
  // multiple times during GC, which would be unnecessarily slow.
  //
  if (StgHeader_info(R1) != stg_MUT_ARR_PTRS_FROZEN0_info) {
        // not currently on a mut_list: record it now
        SET_INFO(R1,stg_MUT_ARR_PTRS_DIRTY_info);
        recordMutable(R1, R1);
        // must be done after SET_INFO, because it ASSERTs closure_MUTABLE()
        RET_P(R1);
  } else {
        // already on a mut_list: just flip the info pointer back
        SET_INFO(R1,stg_MUT_ARR_PTRS_DIRTY_info);
        RET_P(R1);
  }
}
175
176 /* -----------------------------------------------------------------------------
177    MutVar primitives
178    -------------------------------------------------------------------------- */
179
/* newMutVar# :: a -> State# s -> (# State# s, MutVar# s a #)
 * Allocates a fresh MutVar holding the given value.
 */
newMutVarzh_fast
{
    W_ mv;
    /* Args: R1 = initialisation value */

    ALLOC_PRIM( SIZEOF_StgMutVar, R1_PTR, newMutVarzh_fast);

    // carve the MutVar out of the just-bumped nursery space
    mv = Hp - SIZEOF_StgMutVar + WDS(1);
    SET_HDR(mv,stg_MUT_VAR_DIRTY_info,W_[CCCS]);
    StgMutVar_var(mv) = R1;

    RET_P(mv);
}
193
/* atomicModifyMutVar# :: MutVar# s a -> (a -> (a,b)) -> State# s -> (# State# s, b #)
 * Lazily applies f to the contents; the new contents and the result are
 * thunks, so the update itself is a single pointer swap (CAS when threaded).
 */
atomicModifyMutVarzh_fast
{
    W_ mv, f, z, x, y, r, h;
    /* Args: R1 :: MutVar#,  R2 :: a -> (a,b) */

    /* If x is the current contents of the MutVar#, then
       we want to make the new contents point to

         (sel_0 (f x))

       and the return value is

         (sel_1 (f x))

        obviously we can share (f x).

         z = [stg_ap_2 f x]  (max (HS + 2) MIN_UPD_SIZE)
         y = [stg_sel_0 z]   (max (HS + 1) MIN_UPD_SIZE)
         r = [stg_sel_1 z]   (max (HS + 1) MIN_UPD_SIZE)
    */

/* Thunks must be at least MIN_UPD_SIZE words so they can be updated
   in place; pad the allocation (and the ticky accounting) accordingly. */
#if MIN_UPD_SIZE > 1
#define THUNK_1_SIZE (SIZEOF_StgThunkHeader + WDS(MIN_UPD_SIZE))
#define TICK_ALLOC_THUNK_1() TICK_ALLOC_UP_THK(WDS(1),WDS(MIN_UPD_SIZE-1))
#else
#define THUNK_1_SIZE (SIZEOF_StgThunkHeader + WDS(1))
#define TICK_ALLOC_THUNK_1() TICK_ALLOC_UP_THK(WDS(1),0)
#endif

#if MIN_UPD_SIZE > 2
#define THUNK_2_SIZE (SIZEOF_StgThunkHeader + WDS(MIN_UPD_SIZE))
#define TICK_ALLOC_THUNK_2() TICK_ALLOC_UP_THK(WDS(2),WDS(MIN_UPD_SIZE-2))
#else
#define THUNK_2_SIZE (SIZEOF_StgThunkHeader + WDS(2))
#define TICK_ALLOC_THUNK_2() TICK_ALLOC_UP_THK(WDS(2),0)
#endif

#define SIZE (THUNK_2_SIZE + THUNK_1_SIZE + THUNK_1_SIZE)

   HP_CHK_GEN_TICKY(SIZE, R1_PTR & R2_PTR, atomicModifyMutVarzh_fast);

   mv = R1;
   f = R2;

   /* z = f `ap` x  -- x is filled in inside the retry loop below */
   TICK_ALLOC_THUNK_2();
   CCCS_ALLOC(THUNK_2_SIZE);
   z = Hp - THUNK_2_SIZE + WDS(1);
   SET_HDR(z, stg_ap_2_upd_info, W_[CCCS]);
   LDV_RECORD_CREATE(z);
   StgThunk_payload(z,0) = f;

   /* y = sel_0 z  -- the new contents of the MutVar */
   TICK_ALLOC_THUNK_1();
   CCCS_ALLOC(THUNK_1_SIZE);
   y = z - THUNK_1_SIZE;
   SET_HDR(y, stg_sel_0_upd_info, W_[CCCS]);
   LDV_RECORD_CREATE(y);
   StgThunk_payload(y,0) = z;

   /* r = sel_1 z  -- the value returned to the caller */
   TICK_ALLOC_THUNK_1();
   CCCS_ALLOC(THUNK_1_SIZE);
   r = y - THUNK_1_SIZE;
   SET_HDR(r, stg_sel_1_upd_info, W_[CCCS]);
   LDV_RECORD_CREATE(r);
   StgThunk_payload(r,0) = z;

 retry:
   /* snapshot the current contents into z's second argument, then try
      to swing the MutVar from x to y; on THREADED, CAS and loop until
      we win the race, otherwise a plain store suffices. */
   x = StgMutVar_var(mv);
   StgThunk_payload(z,1) = x;
#ifdef THREADED_RTS
   (h) = foreign "C" cas(mv + SIZEOF_StgHeader + OFFSET_StgMutVar_var, x, y) [];
   if (h != x) { goto retry; }
#else
   StgMutVar_var(mv) = y;
#endif

   /* a clean (old-gen) MutVar must be put back on the mutable list */
   if (GET_INFO(mv) == stg_MUT_VAR_CLEAN_info) {
     foreign "C" dirty_MUT_VAR(BaseReg "ptr", mv "ptr") [];
   }

   RET_P(r);
}
275
276 /* -----------------------------------------------------------------------------
277    Weak Pointer Primitives
278    -------------------------------------------------------------------------- */
279
STRING(stg_weak_msg,"New weak pointer at %p\n")

/* mkWeak# :: k -> v -> f -> State# RealWorld -> (# State# RealWorld, Weak# v #)
 * Builds a weak pointer and links it onto the global weak_ptr_list.
 */
mkWeakzh_fast
{
  /* R1 = key
     R2 = value
     R3 = finalizer (or NULL)
  */
  W_ w;

  // normalise a NULL finalizer to the shared no-op closure
  if (R3 == NULL) {
    R3 = stg_NO_FINALIZER_closure;
  }

  ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR & R3_PTR, mkWeakzh_fast );

  w = Hp - SIZEOF_StgWeak + WDS(1);
  SET_HDR(w, stg_WEAK_info, W_[CCCS]);

  StgWeak_key(w)       = R1;
  StgWeak_value(w)     = R2;
  StgWeak_finalizer(w) = R3;

  // cons onto the global list the GC traverses for weak pointers
  StgWeak_link(w)       = W_[weak_ptr_list];
  W_[weak_ptr_list]     = w;

  IF_DEBUG(weak, foreign "C" debugBelch(stg_weak_msg,w) []);

  RET_P(w);
}
310
311
/* finalizeWeak# :: Weak# a -> State# RealWorld
 *                  -> (# State# RealWorld, Int#, IO () #)
 * Marks the weak pointer dead and returns (1, finalizer) if there is a
 * finalizer to run, or (0, noFinalizer) otherwise.
 */
finalizzeWeakzh_fast
{
  /* R1 = weak ptr
   */
  W_ w, f;

  w = R1;

  // already dead?
  if (GET_INFO(w) == stg_DEAD_WEAK_info) {
      RET_NP(0,stg_NO_FINALIZER_closure);
  }

  // kill it
#ifdef PROFILING
  // @LDV profiling
  // A weak pointer is inherently used, so we do not need to call
  // LDV_recordDead_FILL_SLOP_DYNAMIC():
  //    LDV_recordDead_FILL_SLOP_DYNAMIC((StgClosure *)w);
  // or, LDV_recordDead():
  //    LDV_recordDead((StgClosure *)w, sizeofW(StgWeak) - sizeofW(StgProfHeader));
  // Furthermore, when PROFILING is turned on, dead weak pointers are exactly as 
  // large as weak pointers, so there is no need to fill the slop, either.
  // See stg_DEAD_WEAK_info in StgMiscClosures.hc.
#endif

  //
  // Todo: maybe use SET_HDR() and remove LDV_recordCreate()?
  //
  SET_INFO(w,stg_DEAD_WEAK_info);
  LDV_RECORD_CREATE(w);

  // save the finalizer before overwriting the payload with the link
  f = StgWeak_finalizer(w);
  StgDeadWeak_link(w) = StgWeak_link(w);

  /* return the finalizer */
  if (f == stg_NO_FINALIZER_closure) {
      RET_NP(0,stg_NO_FINALIZER_closure);
  } else {
      RET_NP(1,f);
  }
}
354
/* deRefWeak# :: Weak# a -> State# RealWorld -> (# State# RealWorld, Int#, a #)
 * Returns (1, value) while the weak pointer is alive, (0, w) once it
 * has been finalized.
 */
deRefWeakzh_fast
{
  /* R1 = weak ptr */
  W_ wptr, alive, result;

  wptr = R1;

  /* A closure still carrying the stg_WEAK_info header is alive;
     anything else has been killed off. */
  if (GET_INFO(wptr) != stg_WEAK_info) {
    alive  = 0;
    result = wptr;
  } else {
    alive  = 1;
    result = StgWeak_value(wptr);
  }

  RET_NP(alive,result);
}
370
371 /* -----------------------------------------------------------------------------
372    Arbitrary-precision Integer operations.
373
374    There are some assumptions in this code that mp_limb_t == W_.  This is
375    the case for all the platforms that GHC supports, currently.
376    -------------------------------------------------------------------------- */
377
/* int2Integer# :: Int# -> (# Int#, ByteArray# #)
 * Converts a machine Int# to the (sign/size, limbs) representation of
 * Integer: size is -1, 0 or 1 and the single limb holds the magnitude.
 */
int2Integerzh_fast
{
   /* arguments: R1 = Int# */

   W_ val, s, p;        /* to avoid aliasing */

   val = R1;
   ALLOC_PRIM( SIZEOF_StgArrWords + WDS(1), NO_PTRS, int2Integerzh_fast );

   p = Hp - SIZEOF_StgArrWords;
   SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
   StgArrWords_words(p) = 1;

   /* mpz_set_si is inlined here, makes things simpler */
   if (%lt(val,0)) {
        s  = -1;
        Hp(0) = -val;   /* store the magnitude, sign lives in s */
   } else {
     if (%gt(val,0)) {
        s = 1;
        Hp(0) = val;
     } else {
        s = 0;          /* zero: empty magnitude */
     }
  }

   /* returns (# size  :: Int#, 
                 data  :: ByteArray# 
               #)
   */
   RET_NP(s,p);
}
410
/* word2Integer# :: Word# -> (# Int#, ByteArray# #)
 * Converts a machine Word# to the (size, limbs) Integer representation:
 * size is 0 for zero, otherwise 1 with the word as the single limb.
 */
word2Integerzh_fast
{
   /* arguments: R1 = Word# */

   W_ w, sz, arr;       /* to avoid aliasing */

   w = R1;

   ALLOC_PRIM( SIZEOF_StgArrWords + WDS(1), NO_PTRS, word2Integerzh_fast);

   arr = Hp - SIZEOF_StgArrWords;
   SET_HDR(arr, stg_ARR_WORDS_info, W_[CCCS]);
   StgArrWords_words(arr) = 1;

   if (w == 0) {
        sz = 0;         /* zero: empty magnitude */
   } else {
        sz = 1;
        Hp(0) = w;
   }

   /* returns (# size  :: Int#, 
                 data  :: ByteArray# #)
   */
   RET_NP(sz,arr);
}
437
438
439 /*
440  * 'long long' primops for converting to/from Integers.
441  */
442
443 #ifdef SUPPORT_LONG_LONGS
444
/* int64ToInteger# :: Int64# -> (# Int#, ByteArray# #)
 * Converts a 64-bit Int to the (signed size, limbs) Integer
 * representation on a platform with 32-bit words.
 */
int64ToIntegerzh_fast
{
   /* arguments: L1 = Int64# */

   L_ val;
   W_ hi, lo, s, neg, words_needed, p;

   val = L1;
   neg = 0;

   hi = TO_W_(val >> 32);
   lo = TO_W_(val);

   /* one limb suffices when the high word is pure sign extension */
   if ( hi == 0 || (hi == 0xFFFFFFFF && lo != 0) )  {
       // minimum is one word
       words_needed = 1;
   } else {
       words_needed = 2;
   }

   ALLOC_PRIM( SIZEOF_StgArrWords + WDS(words_needed),
               NO_PTRS, int64ToIntegerzh_fast );

   p = Hp - SIZEOF_StgArrWords - WDS(words_needed) + WDS(1);
   SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
   StgArrWords_words(p) = words_needed;

   /* negative: take the two's-complement magnitude, with carry from
      the low word into the high word */
   if ( %lt(hi,0) ) {
     neg = 1;
     lo = -lo;
     if(lo == 0) {
       hi = -hi;
     } else {
       hi = -hi - 1;
     }
   }

   if ( words_needed == 2 )  {
      s = 2;
      Hp(-1) = lo;      /* least-significant limb first */
      Hp(0) = hi;
   } else {
       if ( lo != 0 ) {
           s = 1;
           Hp(0) = lo;
       } else /* val==0 */  {
           s = 0;
       }
   }
   /* the sign of the Integer is carried in the sign of the size */
   if ( neg != 0 ) {
        s = -s;
   }

   /* returns (# size  :: Int#, 
                 data  :: ByteArray# #)
   */
   RET_NP(s,p);
}
/* word64ToInteger# :: Word64# -> (# Int#, ByteArray# #)
 * Converts a 64-bit Word to the (size, limbs) Integer representation
 * on a platform with 32-bit words; size is 0, 1 or 2.
 */
word64ToIntegerzh_fast
{
   /* arguments: L1 = Word64# */

   L_ val;
   W_ hi, lo, s, words_needed, p;

   val = L1;
   hi = TO_W_(val >> 32);
   lo = TO_W_(val);

   if ( hi != 0 ) {
      words_needed = 2;
   } else {
      words_needed = 1;
   }

   ALLOC_PRIM( SIZEOF_StgArrWords + WDS(words_needed),
               NO_PTRS, word64ToIntegerzh_fast );

   p = Hp - SIZEOF_StgArrWords - WDS(words_needed) + WDS(1);
   SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
   StgArrWords_words(p) = words_needed;

   if ( hi != 0 ) {
     s = 2;
     Hp(-1) = lo;       /* least-significant limb first */
     Hp(0)  = hi;
   } else {
      if ( lo != 0 ) {
        s = 1;
        Hp(0) = lo;
      } else /* val==0 */  {
        s = 0;
      }
   }

   /* returns (# size  :: Int#, 
                 data  :: ByteArray# #)
   */
   RET_NP(s,p);
}
545
546
547
548 #endif /* SUPPORT_LONG_LONGS */
549
550 /* ToDo: this is shockingly inefficient */
551
/* Scratch mpz_t storage for the GMP wrapper macros below.  In the
 * non-threaded RTS these are static BSS buffers; in the threaded RTS
 * each Capability keeps its own copies in its register table (reached
 * via BaseReg), which FETCH_MP_TEMP binds to a local name so the
 * wrappers stay re-entrant. */
#ifndef THREADED_RTS
section "bss" {
  mp_tmp1:
    bits8 [SIZEOF_MP_INT];
}

section "bss" {
  mp_tmp2:
    bits8 [SIZEOF_MP_INT];
}

section "bss" {
  mp_result1:
    bits8 [SIZEOF_MP_INT];
}

section "bss" {
  mp_result2:
    bits8 [SIZEOF_MP_INT];
}
#endif

#ifdef THREADED_RTS
#define FETCH_MP_TEMP(X) \
W_ X; \
X = BaseReg + (OFFSET_StgRegTable_r ## X);
#else
#define FETCH_MP_TEMP(X) /* Nothing */
#endif
581
/* GMP_TAKE2_RET1(name,mp_fun): defines primop `name`, which takes two
 * Integers (sizes in R1/R3, limb ByteArray#s in R2/R4), applies the
 * 2-in/1-out libgmp function `mp_fun`, and returns one Integer as
 * (# size, ByteArray# #).  The `mp_d - SIZEOF_StgArrWords` arithmetic
 * recovers the ByteArray# closure from the limb pointer -- this relies
 * on the RTS-installed GMP allocator putting limbs inside ByteArray
 * payloads (presumably; confirm against the RTS's GMP glue). */
#define GMP_TAKE2_RET1(name,mp_fun)                                     \
name                                                                    \
{                                                                       \
  CInt s1, s2;                                                          \
  W_ d1, d2;                                                            \
  FETCH_MP_TEMP(mp_tmp1);                                               \
  FETCH_MP_TEMP(mp_tmp2);                                               \
  FETCH_MP_TEMP(mp_result1)                                             \
  FETCH_MP_TEMP(mp_result2);                                            \
                                                                        \
  /* call doYouWantToGC() */                                            \
  MAYBE_GC(R2_PTR & R4_PTR, name);                                      \
                                                                        \
  s1 = W_TO_INT(R1);                                                    \
  d1 = R2;                                                              \
  s2 = W_TO_INT(R3);                                                    \
  d2 = R4;                                                              \
                                                                        \
  MP_INT__mp_alloc(mp_tmp1) = W_TO_INT(StgArrWords_words(d1));          \
  MP_INT__mp_size(mp_tmp1)  = (s1);                                     \
  MP_INT__mp_d(mp_tmp1)     = BYTE_ARR_CTS(d1);                         \
  MP_INT__mp_alloc(mp_tmp2) = W_TO_INT(StgArrWords_words(d2));          \
  MP_INT__mp_size(mp_tmp2)  = (s2);                                     \
  MP_INT__mp_d(mp_tmp2)     = BYTE_ARR_CTS(d2);                         \
                                                                        \
  foreign "C" __gmpz_init(mp_result1 "ptr") [];                            \
                                                                        \
  /* Perform the operation */                                           \
  foreign "C" mp_fun(mp_result1 "ptr",mp_tmp1  "ptr",mp_tmp2  "ptr") []; \
                                                                        \
  RET_NP(TO_W_(MP_INT__mp_size(mp_result1)),                            \
         MP_INT__mp_d(mp_result1) - SIZEOF_StgArrWords);                \
}
615
/* GMP_TAKE1_RET1(name,mp_fun): like GMP_TAKE2_RET1, but for a unary
 * libgmp function: one Integer in (size R1, limbs R2), one Integer
 * result as (# size, ByteArray# #). */
#define GMP_TAKE1_RET1(name,mp_fun)                                     \
name                                                                    \
{                                                                       \
  CInt s1;                                                              \
  W_ d1;                                                                \
  FETCH_MP_TEMP(mp_tmp1);                                               \
  FETCH_MP_TEMP(mp_result1)                                             \
                                                                        \
  /* call doYouWantToGC() */                                            \
  MAYBE_GC(R2_PTR, name);                                               \
                                                                        \
  d1 = R2;                                                              \
  s1 = W_TO_INT(R1);                                                    \
                                                                        \
  MP_INT__mp_alloc(mp_tmp1)     = W_TO_INT(StgArrWords_words(d1));      \
  MP_INT__mp_size(mp_tmp1)      = (s1);                                 \
  MP_INT__mp_d(mp_tmp1)         = BYTE_ARR_CTS(d1);                     \
                                                                        \
  foreign "C" __gmpz_init(mp_result1 "ptr") [];                            \
                                                                        \
  /* Perform the operation */                                           \
  foreign "C" mp_fun(mp_result1 "ptr",mp_tmp1 "ptr") [];                \
                                                                        \
  RET_NP(TO_W_(MP_INT__mp_size(mp_result1)),                            \
         MP_INT__mp_d(mp_result1) - SIZEOF_StgArrWords);                \
}
642
/* GMP_TAKE2_RET2(name,mp_fun): like GMP_TAKE2_RET1, but for a libgmp
 * function producing two results (e.g. quotient and remainder); the
 * primop returns (# size1, limbs1, size2, limbs2 #). */
#define GMP_TAKE2_RET2(name,mp_fun)                                                     \
name                                                                                    \
{                                                                                       \
  CInt s1, s2;                                                                          \
  W_ d1, d2;                                                                            \
  FETCH_MP_TEMP(mp_tmp1);                                                               \
  FETCH_MP_TEMP(mp_tmp2);                                                               \
  FETCH_MP_TEMP(mp_result1)                                                             \
  FETCH_MP_TEMP(mp_result2)                                                             \
                                                                                        \
  /* call doYouWantToGC() */                                                            \
  MAYBE_GC(R2_PTR & R4_PTR, name);                                                      \
                                                                                        \
  s1 = W_TO_INT(R1);                                                                    \
  d1 = R2;                                                                              \
  s2 = W_TO_INT(R3);                                                                    \
  d2 = R4;                                                                              \
                                                                                        \
  MP_INT__mp_alloc(mp_tmp1)     = W_TO_INT(StgArrWords_words(d1));                      \
  MP_INT__mp_size(mp_tmp1)      = (s1);                                                 \
  MP_INT__mp_d(mp_tmp1)         = BYTE_ARR_CTS(d1);                                     \
  MP_INT__mp_alloc(mp_tmp2)     = W_TO_INT(StgArrWords_words(d2));                      \
  MP_INT__mp_size(mp_tmp2)      = (s2);                                                 \
  MP_INT__mp_d(mp_tmp2)         = BYTE_ARR_CTS(d2);                                     \
                                                                                        \
  foreign "C" __gmpz_init(mp_result1 "ptr") [];                                               \
  foreign "C" __gmpz_init(mp_result2 "ptr") [];                                               \
                                                                                        \
  /* Perform the operation */                                                           \
  foreign "C" mp_fun(mp_result1 "ptr",mp_result2 "ptr",mp_tmp1 "ptr",mp_tmp2 "ptr") [];    \
                                                                                        \
  RET_NPNP(TO_W_(MP_INT__mp_size(mp_result1)),                                          \
           MP_INT__mp_d(mp_result1) - SIZEOF_StgArrWords,                               \
           TO_W_(MP_INT__mp_size(mp_result2)),                                          \
           MP_INT__mp_d(mp_result2) - SIZEOF_StgArrWords);                              \
}
679
/* Instantiate the wrappers: one out-of-line Integer primop per libgmp
 * entry point. */
GMP_TAKE2_RET1(plusIntegerzh_fast,     __gmpz_add)
GMP_TAKE2_RET1(minusIntegerzh_fast,    __gmpz_sub)
GMP_TAKE2_RET1(timesIntegerzh_fast,    __gmpz_mul)
GMP_TAKE2_RET1(gcdIntegerzh_fast,      __gmpz_gcd)
GMP_TAKE2_RET1(quotIntegerzh_fast,     __gmpz_tdiv_q)
GMP_TAKE2_RET1(remIntegerzh_fast,      __gmpz_tdiv_r)
GMP_TAKE2_RET1(divExactIntegerzh_fast, __gmpz_divexact)
GMP_TAKE2_RET1(andIntegerzh_fast,      __gmpz_and)
GMP_TAKE2_RET1(orIntegerzh_fast,       __gmpz_ior)
GMP_TAKE2_RET1(xorIntegerzh_fast,      __gmpz_xor)
GMP_TAKE1_RET1(complementIntegerzh_fast, __gmpz_com)

GMP_TAKE2_RET2(quotRemIntegerzh_fast, __gmpz_tdiv_qr)
GMP_TAKE2_RET2(divModIntegerzh_fast,  __gmpz_fdiv_qr)

#ifndef THREADED_RTS
section "bss" {
  mp_tmp_w:  W_; // NB. mp_tmp_w is really an mp_limb_t here
}
#endif
700
/* gcdInt# :: Int# -> Int# -> Int#
 * gcd of two machine ints via mpn_gcd_1, treating the first as a
 * one-limb number stored in the mp_tmp_w scratch word.
 */
gcdIntzh_fast
{
    /* R1 = the first Int#; R2 = the second Int# */
    W_ r;
    FETCH_MP_TEMP(mp_tmp_w);

    W_[mp_tmp_w] = R1;
    (r) = foreign "C" __gmpn_gcd_1(mp_tmp_w "ptr", 1, R2) [];

    R1 = r;
    /* Result parked in R1, return via info-pointer at TOS */
    jump %ENTRY_CODE(Sp(0));
}
714
715
/* gcdIntegerInt# :: Int# -> ByteArray# -> Int# -> Int#
 * gcd of an Integer magnitude (s1 limbs at d1) and a single machine
 * int, computed by mpn_gcd_1.
 */
gcdIntegerIntzh_fast
{
    /* R1 = s1; R2 = d1; R3 = the int */
    W_ g;

    (g) = foreign "C" __gmpn_gcd_1( BYTE_ARR_CTS(R2) "ptr", R1, R3) [];

    /* Result parked in R1, return via info-pointer at TOS */
    R1 = g;
    jump %ENTRY_CODE(Sp(0));
}
726
727
/* cmpIntegerInt# :: Int# -> ByteArray# -> Int# -> Int#
 * Compares an Integer (size s1, limbs d1) with a machine int; returns
 * <0, 0 or >0 in R1 like a C comparator.
 */
cmpIntegerIntzh_fast
{
    /* R1 = s1; R2 = d1; R3 = the int */
    W_ usize, vsize, v_digit, u_digit;

    usize = R1;
    vsize = 0;
    v_digit = R3;

    // paraphrased from __gmpz_cmp_si() in the GMP sources
    // normalise the int to (sign, magnitude) form
    if (%gt(v_digit,0)) {
        vsize = 1;
    } else {
        if (%lt(v_digit,0)) {
            vsize = -1;
            v_digit = -v_digit;
        }
    }

    // different signed sizes decide the ordering outright
    if (usize != vsize) {
        R1 = usize - vsize;
        jump %ENTRY_CODE(Sp(0));
    }

    // both zero
    if (usize == 0) {
        R1 = 0;
        jump %ENTRY_CODE(Sp(0));
    }

    // same sign, one limb each: compare the magnitudes
    u_digit = W_[BYTE_ARR_CTS(R2)];

    if (u_digit == v_digit) {
        R1 = 0;
        jump %ENTRY_CODE(Sp(0));
    }

    if (%gtu(u_digit,v_digit)) { // NB. unsigned: these are mp_limb_t's
        R1 = usize;              // sign of the result follows the sign of u
    } else {
        R1 = -usize;
    }

    jump %ENTRY_CODE(Sp(0));
}
772
/* cmpInteger# :: Int# -> ByteArray# -> Int# -> ByteArray# -> Int#
 * Compares two Integers in (signed size, limbs) form; returns <0, 0
 * or >0 in R1 like a C comparator.
 */
cmpIntegerzh_fast
{
    /* R1 = s1; R2 = d1; R3 = s2; R4 = d2 */
    W_ usize, vsize, size, up, vp;
    CInt cmp;

    // paraphrased from __gmpz_cmp() in the GMP sources
    usize = R1;
    vsize = R3;

    // different signed sizes decide the ordering outright
    if (usize != vsize) {
        R1 = usize - vsize;
        jump %ENTRY_CODE(Sp(0));
    }

    // both zero
    if (usize == 0) {
        R1 = 0;
        jump %ENTRY_CODE(Sp(0));
    }

    if (%lt(usize,0)) { // NB. not <, which is unsigned
        size = -usize;
    } else {
        size = usize;
    }

    // same sign and limb count: compare magnitudes limb-by-limb
    up = BYTE_ARR_CTS(R2);
    vp = BYTE_ARR_CTS(R4);

    (cmp) = foreign "C" __gmpn_cmp(up "ptr", vp "ptr", size) [];

    if (cmp == 0 :: CInt) {
        R1 = 0;
        jump %ENTRY_CODE(Sp(0));
    }

    // flip the magnitude comparison when both operands are negative
    if (%lt(cmp,0 :: CInt) == %lt(usize,0)) {
        R1 = 1;
    } else {
        R1 = (-1);
    }
    /* Result parked in R1, return via info-pointer at TOS */
    jump %ENTRY_CODE(Sp(0));
}
817
/* integer2Int# :: Int# -> ByteArray# -> Int#
 * Truncates an Integer in (signed size, limbs) form to a machine Int#:
 * the low limb of the magnitude, negated when the size is negative.
 */
integer2Intzh_fast
{
    /* R1 = s; R2 = d */
    W_ sign, result;

    sign = R1;

    /* zero is represented by an empty magnitude */
    if (sign == 0) {
        R1 = 0;
        jump %ENTRY_CODE(Sp(0));
    }

    result = W_[R2 + SIZEOF_StgArrWords];   /* least-significant limb */
    if (%lt(sign,0)) {
        result = -result;
    }

    /* Result parked in R1, return via info-pointer at TOS */
    R1 = result;
    jump %ENTRY_CODE(Sp(0));
}
836
/* integer2Word# :: Int# -> ByteArray# -> Word#
 * Truncates an Integer in (signed size, limbs) form to a machine Word#;
 * identical logic to integer2Intzh_fast, only the Haskell result type
 * differs.
 */
integer2Wordzh_fast
{
  /* R1 = s; R2 = d */
  W_ sign, result;

  sign = R1;

  /* zero is represented by an empty magnitude */
  if (sign == 0) {
    R1 = 0;
    jump %ENTRY_CODE(Sp(0));
  }

  result = W_[R2 + SIZEOF_StgArrWords];     /* least-significant limb */
  if (%lt(sign,0)) {
      result = -result;
  }

  /* Result parked in R1, return via info-pointer at TOS */
  R1 = result;
  jump %ENTRY_CODE(Sp(0));
}
855
/* decodeFloat# :: Float# -> (# Int#, Int#, ByteArray# #)
 * Splits a Float# into exponent and mantissa via __decodeFloat.
 * NOTE: the opening brace was missing in this copy of the file (the
 * matching close brace was present); restored.
 */
decodeFloatzh_fast
{
    W_ p;
    F_ arg;
    FETCH_MP_TEMP(mp_tmp1);
    FETCH_MP_TEMP(mp_tmp_w);

    /* arguments: F1 = Float# */
    arg = F1;

    ALLOC_PRIM( SIZEOF_StgArrWords + WDS(1), NO_PTRS, decodeFloatzh_fast );

    /* Be prepared to tell Lennart-coded __decodeFloat
       where mantissa._mp_d can be put (it does not care about the rest) */
    p = Hp - SIZEOF_StgArrWords;
    SET_HDR(p,stg_ARR_WORDS_info,W_[CCCS]);
    StgArrWords_words(p) = 1;
    MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(p);

    /* Perform the operation */
    foreign "C" __decodeFloat(mp_tmp1 "ptr",mp_tmp_w "ptr" ,arg) [];

    /* returns: (Int# (expn), Int#, ByteArray#) */
    RET_NNP(W_[mp_tmp_w], TO_W_(MP_INT__mp_size(mp_tmp1)), p);
}
881
/* decodeFloat_Int# :: Float# -> (# Int#, Int# #)
 * Splits a Float# into (mantissa, exponent) machine ints via
 * __decodeFloat_Int; no heap allocation required.
 * NOTE: the opening brace was missing in this copy of the file (the
 * matching close brace was present); restored.
 */
decodeFloatzuIntzh_fast
{
    W_ p;
    F_ arg;
    FETCH_MP_TEMP(mp_tmp1);
    FETCH_MP_TEMP(mp_tmp_w);

    /* arguments: F1 = Float# */
    arg = F1;

    /* Perform the operation */
    foreign "C" __decodeFloat_Int(mp_tmp1 "ptr", mp_tmp_w "ptr", arg) [];

    /* returns: (Int# (mantissa), Int# (exponent)) */
    RET_NN(W_[mp_tmp1], W_[mp_tmp_w]);
}
898
#define DOUBLE_MANTISSA_SIZE SIZEOF_DOUBLE
#define ARR_SIZE (SIZEOF_StgArrWords + DOUBLE_MANTISSA_SIZE)

/* decodeDouble# :: Double# -> (# Int#, Int#, ByteArray# #)
 * Splits a Double# into exponent and mantissa via __decodeDouble; the
 * mantissa limbs are written into a freshly allocated ByteArray#.
 * NOTE: the opening brace was missing in this copy of the file (the
 * matching close brace was present); restored.
 */
decodeDoublezh_fast
{
    D_ arg;
    W_ p;
    FETCH_MP_TEMP(mp_tmp1);
    FETCH_MP_TEMP(mp_tmp_w);

    /* arguments: D1 = Double# */
    arg = D1;

    ALLOC_PRIM( ARR_SIZE, NO_PTRS, decodeDoublezh_fast );

    /* Be prepared to tell Lennart-coded __decodeDouble
       where mantissa.d can be put (it does not care about the rest) */
    p = Hp - ARR_SIZE + WDS(1);
    SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
    StgArrWords_words(p) = BYTES_TO_WDS(DOUBLE_MANTISSA_SIZE);
    MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(p);

    /* Perform the operation */
    foreign "C" __decodeDouble(mp_tmp1 "ptr", mp_tmp_w "ptr",arg) [];

    /* returns: (Int# (expn), Int#, ByteArray#) */
    RET_NNP(W_[mp_tmp_w], TO_W_(MP_INT__mp_size(mp_tmp1)), p);
}
927
/* decodeDouble_2Int# :: Double# -> (# Int#, Word#, Word#, Int# #)
 * Splits a Double# into mantissa sign, mantissa high/low words and
 * exponent via __decodeDouble_2Int; no heap allocation required.
 * NOTE: the opening brace was missing in this copy of the file (the
 * matching close brace was present); restored.
 */
decodeDoublezu2Intzh_fast
{
    D_ arg;
    W_ p;
    FETCH_MP_TEMP(mp_tmp1);
    FETCH_MP_TEMP(mp_tmp2);
    FETCH_MP_TEMP(mp_result1);
    FETCH_MP_TEMP(mp_result2);

    /* arguments: D1 = Double# */
    arg = D1;

    /* Perform the operation */
    foreign "C" __decodeDouble_2Int(mp_tmp1 "ptr", mp_tmp2 "ptr",
                                    mp_result1 "ptr", mp_result2 "ptr",
                                    arg) [];

    /* returns:
       (Int# (mant sign), Word# (mant high), Word# (mant low), Int# (expn)) */
    RET_NNNN(W_[mp_tmp1], W_[mp_tmp2], W_[mp_result1], W_[mp_result2]);
}
949
950 /* -----------------------------------------------------------------------------
951  * Concurrency primitives
952  * -------------------------------------------------------------------------- */
953
/* fork# primop.
 *
 * Creates a new IO thread running the closure in R1, inheriting the
 * current thread's blocked-exceptions state, schedules it on this
 * capability, and returns the new ThreadId#.
 */
forkzh_fast
{
  /* args: R1 = closure to run in the new thread */

  MAYBE_GC(R1_PTR, forkzh_fast);

  W_ closure;
  W_ threadid;
  closure = R1;

  /* allocate a TSO with the default initial stack size */
  ("ptr" threadid) = foreign "C" createIOThread( MyCapability() "ptr", 
                                RtsFlags_GcFlags_initialStkSize(RtsFlags), 
                                closure "ptr") [];

  /* start blocked if the current thread is blocked */
  StgTSO_flags(threadid) = 
     StgTSO_flags(threadid) |  (StgTSO_flags(CurrentTSO) & 
                                (TSO_BLOCKEX::I32 | TSO_INTERRUPTIBLE::I32));

  foreign "C" scheduleThread(MyCapability() "ptr", threadid "ptr") [];

  // switch at the earliest opportunity
  Capability_context_switch(MyCapability()) = 1 :: CInt;
  
  RET_P(threadid);
}
980
/* forkOn# primop.
 *
 * Like fork#, but pins the new thread to the capability whose number
 * is given in R1 (via scheduleThreadOn rather than scheduleThread).
 */
forkOnzh_fast
{
  /* args: R1 = cpu (capability number), R2 = closure to run */

  MAYBE_GC(R2_PTR, forkOnzh_fast);

  W_ cpu;
  W_ closure;
  W_ threadid;
  cpu = R1;
  closure = R2;

  /* allocate a TSO with the default initial stack size */
  ("ptr" threadid) = foreign "C" createIOThread( MyCapability() "ptr", 
                                RtsFlags_GcFlags_initialStkSize(RtsFlags), 
                                closure "ptr") [];

  /* start blocked if the current thread is blocked */
  StgTSO_flags(threadid) = 
     StgTSO_flags(threadid) |  (StgTSO_flags(CurrentTSO) & 
                                (TSO_BLOCKEX::I32 | TSO_INTERRUPTIBLE::I32));

  foreign "C" scheduleThreadOn(MyCapability() "ptr", cpu, threadid "ptr") [];

  // switch at the earliest opportunity
  Capability_context_switch(MyCapability()) = 1 :: CInt;
  
  RET_P(threadid);
}
1009
/* yield# primop: give up the CPU voluntarily by jumping to the
 * scheduler's no-registers-live yield entry point. */
yieldzh_fast
{
  jump stg_yield_noregs;
}
1014
/* myThreadId# primop: the current TSO pointer *is* the ThreadId#. */
myThreadIdzh_fast
{
  /* no args. */
  RET_P(CurrentTSO);
}
1020
/* labelThread# primop.
 *
 * Attaches a debug label to a thread.  Only does anything in DEBUG
 * builds; otherwise it is a no-op that returns straight away.
 */
labelThreadzh_fast
{
  /* args: 
        R1 = ThreadId#
        R2 = Addr# */
#ifdef DEBUG
  foreign "C" labelThread(R1 "ptr", R2 "ptr") [];
#endif
  jump %ENTRY_CODE(Sp(0));
}
1031
/* isCurrentThreadBound# primop: ask the RTS whether the current
 * thread is bound to an OS thread; returns the C result as an Int#. */
isCurrentThreadBoundzh_fast
{
  /* no args */
  W_ r;
  (r) = foreign "C" isThreadBound(CurrentTSO) [];
  RET_N(r);
}
1039
/* threadStatus# primop.
 *
 * Returns a status word for the given thread: 16 for completed,
 * 17 for killed, otherwise the raw why_blocked value.  The numbers
 * must match the decoding in GHC.Conc.threadStatus.
 */
threadStatuszh_fast
{
    /* args: R1 :: ThreadId# */
    W_ tso;
    W_ why_blocked;
    W_ what_next;
    W_ ret;

    tso = R1;
    loop:
      /* follow forwarding pointers left by a stack-relocating GC */
      if (TO_W_(StgTSO_what_next(tso)) == ThreadRelocated) {
          tso = StgTSO__link(tso);
          goto loop;
      }

    what_next   = TO_W_(StgTSO_what_next(tso));
    why_blocked = TO_W_(StgTSO_why_blocked(tso));
    // Note: these two reads are not atomic, so they might end up
    // being inconsistent.  It doesn't matter, since we
    // only return one or the other.  If we wanted to return the
    // contents of block_info too, then we'd have to do some synchronisation.

    if (what_next == ThreadComplete) {
        ret = 16;  // NB. magic, matches up with GHC.Conc.threadStatus
    } else {
        if (what_next == ThreadKilled) {
            ret = 17;
        } else {
            ret = why_blocked;
        }
    }
    RET_N(ret);
}
1073
1074 /* -----------------------------------------------------------------------------
1075  * TVar primitives
1076  * -------------------------------------------------------------------------- */
1077
1078 #define SP_OFF 0
1079
1080 // Catch retry frame ------------------------------------------------------------
1081
/* Return into a catchRetry# frame.
 *
 * Try to commit the nested transaction of whichever branch just
 * finished.  On success, pop the frame and return; on failure (the
 * nested tx conflicted), start a fresh nested transaction and re-run
 * the branch recorded in running_alt_code.
 */
INFO_TABLE_RET(stg_catch_retry_frame, CATCH_RETRY_FRAME,
#if defined(PROFILING)
  W_ unused1, W_ unused2,
#endif
  W_ unused3, "ptr" W_ unused4, "ptr" W_ unused5)
{
   W_ r, frame, trec, outer;

   frame = Sp;
   trec = StgTSO_trec(CurrentTSO);
   ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
   (r) = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", trec "ptr") [];
   if (r != 0) {
     /* Succeeded (either first branch or second branch) */
     StgTSO_trec(CurrentTSO) = outer;
     Sp = Sp + SIZEOF_StgCatchRetryFrame;
     jump %ENTRY_CODE(Sp(SP_OFF));
   } else {
     /* Did not commit: re-execute */
     W_ new_trec;
     ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
     StgTSO_trec(CurrentTSO) = new_trec;
     /* re-run whichever branch we were in when the conflict hit */
     if (StgCatchRetryFrame_running_alt_code(frame) != 0::I32) {
       R1 = StgCatchRetryFrame_alt_code(frame);
     } else {
       R1 = StgCatchRetryFrame_first_code(frame);
     }
     jump stg_ap_v_fast;
   }
}
1112
1113
1114 // Atomically frame ------------------------------------------------------------
1115
/* Return into an atomically# frame.
 *
 * First visit (outer == NO_TREC): collect the invariants this
 * transaction must check.  Subsequent visits: we have just finished
 * running one invariant's check in its own nested tx; abort that tx
 * and move to the next invariant.  When no invariants remain, attempt
 * to commit; on conflict, restart the whole transaction from scratch.
 */
INFO_TABLE_RET(stg_atomically_frame, ATOMICALLY_FRAME,
#if defined(PROFILING)
  W_ unused1, W_ unused2,
#endif
  "ptr" W_ unused3, "ptr" W_ unused4)
{
  W_ frame, trec, valid, next_invariant, q, outer;

  frame = Sp;
  trec = StgTSO_trec(CurrentTSO);
  ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];

  if (outer == NO_TREC) {
    /* First time back at the atomically frame -- pick up invariants */
    ("ptr" q) = foreign "C" stmGetInvariantsToCheck(MyCapability() "ptr", trec "ptr") [];
    StgAtomicallyFrame_next_invariant_to_check(frame) = q;

  } else {
    /* Second/subsequent time back at the atomically frame -- abort the
     * tx that's checking the invariant and move on to the next one */
    StgTSO_trec(CurrentTSO) = outer;
    q = StgAtomicallyFrame_next_invariant_to_check(frame);
    StgInvariantCheckQueue_my_execution(q) = trec;
    foreign "C" stmAbortTransaction(MyCapability() "ptr", trec "ptr") [];
    /* Don't free trec -- it's linked from q and will be stashed in the
     * invariant if we eventually commit. */
    q = StgInvariantCheckQueue_next_queue_entry(q);
    StgAtomicallyFrame_next_invariant_to_check(frame) = q;
    trec = outer;
  }

  q = StgAtomicallyFrame_next_invariant_to_check(frame);

  if (q != END_INVARIANT_CHECK_QUEUE) {
    /* We can't commit yet: another invariant to check */
    ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", trec "ptr") [];
    StgTSO_trec(CurrentTSO) = trec;

    /* run the invariant's code in the fresh nested transaction */
    next_invariant = StgInvariantCheckQueue_invariant(q);
    R1 = StgAtomicInvariant_code(next_invariant);
    jump stg_ap_v_fast;

  } else {

    /* We've got no more invariants to check, try to commit */
    (valid) = foreign "C" stmCommitTransaction(MyCapability() "ptr", trec "ptr") [];
    if (valid != 0) {
      /* Transaction was valid: commit succeeded */
      StgTSO_trec(CurrentTSO) = NO_TREC;
      Sp = Sp + SIZEOF_StgAtomicallyFrame;
      jump %ENTRY_CODE(Sp(SP_OFF));
    } else {
      /* Transaction was not valid: try again */
      ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr") [];
      StgTSO_trec(CurrentTSO) = trec;
      StgAtomicallyFrame_next_invariant_to_check(frame) = END_INVARIANT_CHECK_QUEUE;
      R1 = StgAtomicallyFrame_code(frame);
      jump stg_ap_v_fast;
    }
  }
}
1177
/* Return into an atomically# frame while the TSO is blocked on TVars.
 *
 * Woken-up path for a retried transaction: if the watched TVars are
 * still unchanged (stmReWait succeeds) go back to sleep; otherwise
 * flip the frame back to stg_atomically_frame and re-run the
 * transaction from the start.
 */
INFO_TABLE_RET(stg_atomically_waiting_frame, ATOMICALLY_FRAME,
#if defined(PROFILING)
  W_ unused1, W_ unused2,
#endif
  "ptr" W_ unused3, "ptr" W_ unused4)
{
  W_ frame, trec, valid;

  frame = Sp;

  /* The TSO is currently waiting: should we stop waiting? */
  (valid) = foreign "C" stmReWait(MyCapability() "ptr", CurrentTSO "ptr") [];
  if (valid != 0) {
    /* Previous attempt is still valid: no point trying again yet */
    jump stg_block_noregs;
  } else {
    /* Previous attempt is no longer valid: try again */
    ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr") [];
    StgTSO_trec(CurrentTSO) = trec;
    /* revert to the normal atomically frame before re-running */
    StgHeader_info(frame) = stg_atomically_frame_info;
    R1 = StgAtomicallyFrame_code(frame);
    jump stg_ap_v_fast;
  }
}
1202
1203 // STM catch frame --------------------------------------------------------------
1204
1205 #define SP_OFF 0
1206
1207 /* Catch frames are very similar to update frames, but when entering
1208  * one we just pop the frame off the stack and perform the correct
1209  * kind of return to the activation record underneath us on the stack.
1210  */
1211
/* Return into a catchSTM# frame.
 *
 * Commit the nested transaction started by catchSTMzh_fast.  On
 * success pop the frame; on conflict start a fresh nested tx and
 * re-run the protected code.
 */
INFO_TABLE_RET(stg_catch_stm_frame, CATCH_STM_FRAME,
#if defined(PROFILING)
  W_ unused1, W_ unused2,
#endif
  "ptr" W_ unused3, "ptr" W_ unused4)
   {
      W_ r, frame, trec, outer;
      frame = Sp;
      trec = StgTSO_trec(CurrentTSO);
      ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
      (r) = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", trec "ptr") [];
      if (r != 0) {
        /* Commit succeeded */
        StgTSO_trec(CurrentTSO) = outer;
        Sp = Sp + SIZEOF_StgCatchSTMFrame;
        jump Sp(SP_OFF);
      } else {
        /* Commit failed */
        W_ new_trec;
        ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
        StgTSO_trec(CurrentTSO) = new_trec;
        R1 = StgCatchSTMFrame_code(frame);
        jump stg_ap_v_fast;
      }
   }
1237
1238
1239 // Primop definition ------------------------------------------------------------
1240
/* atomically# primop.
 *
 * Rejects nested atomically (raises nestedAtomically), pushes an
 * atomically frame, starts a top-level transaction, and applies the
 * STM action in R1.
 */
atomicallyzh_fast
{
  W_ frame;
  W_ old_trec;
  W_ new_trec;
  
  // stmStartTransaction may allocate
  MAYBE_GC (R1_PTR, atomicallyzh_fast); 

  /* Args: R1 = m :: STM a */
  STK_CHK_GEN(SIZEOF_StgAtomicallyFrame + WDS(1), R1_PTR, atomicallyzh_fast);

  old_trec = StgTSO_trec(CurrentTSO);

  /* Nested transactions are not allowed; raise an exception */
  if (old_trec != NO_TREC) {
     R1 = base_ControlziExceptionziBase_nestedAtomically_closure;
     jump raisezh_fast;
  }

  /* Set up the atomically frame */
  Sp = Sp - SIZEOF_StgAtomicallyFrame;
  frame = Sp;

  SET_HDR(frame,stg_atomically_frame_info, W_[CCCS]);
  StgAtomicallyFrame_code(frame) = R1;
  StgAtomicallyFrame_next_invariant_to_check(frame) = END_INVARIANT_CHECK_QUEUE;

  /* Start the memory transaction */
  ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", old_trec "ptr") [R1];
  StgTSO_trec(CurrentTSO) = new_trec;

  /* Apply R1 to the realworld token */
  jump stg_ap_v_fast;
}
1276
1277
/* catchSTM# primop.
 *
 * Pushes a catch-STM frame holding the handler, starts a nested
 * transaction, and runs the protected STM action in R1.
 */
catchSTMzh_fast
{
  W_ frame;
  
  /* Args: R1 :: STM a */
  /* Args: R2 :: Exception -> STM a */
  STK_CHK_GEN(SIZEOF_StgCatchSTMFrame + WDS(1), R1_PTR & R2_PTR, catchSTMzh_fast);

  /* Set up the catch frame */
  Sp = Sp - SIZEOF_StgCatchSTMFrame;
  frame = Sp;

  SET_HDR(frame, stg_catch_stm_frame_info, W_[CCCS]);
  StgCatchSTMFrame_handler(frame) = R2;
  StgCatchSTMFrame_code(frame) = R1;

  /* Start a nested transaction to run the body of the try block in */
  W_ cur_trec;  
  W_ new_trec;
  cur_trec = StgTSO_trec(CurrentTSO);
  /* NOTE(review): unlike the other stmStartTransaction call sites in
   * this file, this one carries no '[]' liveness annotation -- confirm
   * this is intentional. */
  ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", cur_trec "ptr");
  StgTSO_trec(CurrentTSO) = new_trec;

  /* Apply R1 to the realworld token */
  jump stg_ap_v_fast;
}
1304
1305
/* catchRetry# primop.
 *
 * Starts a nested transaction, pushes a catch-retry frame recording
 * both alternatives (running_alt_code = false), and runs the first
 * alternative (R1).
 */
catchRetryzh_fast
{
  W_ frame;
  W_ new_trec;
  W_ trec;

  // stmStartTransaction may allocate
  MAYBE_GC (R1_PTR & R2_PTR, catchRetryzh_fast); 

  /* Args: R1 :: STM a */
  /* Args: R2 :: STM a */
  STK_CHK_GEN(SIZEOF_StgCatchRetryFrame + WDS(1), R1_PTR & R2_PTR, catchRetryzh_fast);

  /* Start a nested transaction within which to run the first code */
  trec = StgTSO_trec(CurrentTSO);
  ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", trec "ptr") [R1,R2];
  StgTSO_trec(CurrentTSO) = new_trec;

  /* Set up the catch-retry frame */
  Sp = Sp - SIZEOF_StgCatchRetryFrame;
  frame = Sp;
  
  SET_HDR(frame, stg_catch_retry_frame_info, W_[CCCS]);
  StgCatchRetryFrame_running_alt_code(frame) = 0 :: CInt; // false;
  StgCatchRetryFrame_first_code(frame) = R1;
  StgCatchRetryFrame_alt_code(frame) = R2;

  /* Apply R1 to the realworld token */
  jump stg_ap_v_fast;
}
1336
1337
/* retry# primop.
 *
 * Walks the stack for the nearest CATCH_RETRY_FRAME or
 * ATOMICALLY_FRAME.  At a catch-retry frame: if we were in the first
 * branch, abort it and run the alternative; if already in the
 * alternative, propagate the retry outwards.  At the atomically
 * frame: wait on the TVars read so far (stmWait), or restart
 * immediately if the read set is already invalid.
 */
retryzh_fast
{
  W_ frame_type;
  W_ frame;
  W_ trec;
  W_ outer;
  W_ r;

  MAYBE_GC (NO_PTRS, retryzh_fast); // STM operations may allocate

  // Find the enclosing ATOMICALLY_FRAME or CATCH_RETRY_FRAME
retry_pop_stack:
  StgTSO_sp(CurrentTSO) = Sp;
  (frame_type) = foreign "C" findRetryFrameHelper(CurrentTSO "ptr") [];
  Sp = StgTSO_sp(CurrentTSO);
  frame = Sp;
  trec = StgTSO_trec(CurrentTSO);
  ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];

  if (frame_type == CATCH_RETRY_FRAME) {
    // The retry reaches a CATCH_RETRY_FRAME before the atomic frame
    ASSERT(outer != NO_TREC);
    // Abort the transaction attempting the current branch
    foreign "C" stmAbortTransaction(MyCapability() "ptr", trec "ptr") [];
    foreign "C" stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr") [];
    /* (!flag) != 0 tests flag == 0, i.e. we were in the first branch */
    if (!StgCatchRetryFrame_running_alt_code(frame) != 0::I32) {
      // Retry in the first branch: try the alternative
      ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
      StgTSO_trec(CurrentTSO) = trec;
      StgCatchRetryFrame_running_alt_code(frame) = 1 :: CInt; // true;
      R1 = StgCatchRetryFrame_alt_code(frame);
      jump stg_ap_v_fast;
    } else {
      // Retry in the alternative code: propagate the retry
      StgTSO_trec(CurrentTSO) = outer;
      Sp = Sp + SIZEOF_StgCatchRetryFrame;
      goto retry_pop_stack;
    }
  }

  // We've reached the ATOMICALLY_FRAME: attempt to wait 
  ASSERT(frame_type == ATOMICALLY_FRAME);
  if (outer != NO_TREC) {
    // We called retry while checking invariants, so abort the current
    // invariant check (merging its TVar accesses into the parent's read
    // set so we'll wait on them)
    foreign "C" stmAbortTransaction(MyCapability() "ptr", trec "ptr") [];
    foreign "C" stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr") [];
    trec = outer;
    StgTSO_trec(CurrentTSO) = trec;
    ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
  }
  ASSERT(outer == NO_TREC);

  (r) = foreign "C" stmWait(MyCapability() "ptr", CurrentTSO "ptr", trec "ptr") [];
  if (r != 0) {
    // Transaction was valid: stmWait put us on the TVars' queues, we now block
    StgHeader_info(frame) = stg_atomically_waiting_frame_info;
    Sp = frame;
    // Fix up the stack in the unregisterised case: the return convention is different.
    R3 = trec; // passing to stmWaitUnblock()
    jump stg_block_stmwait;
  } else {
    // Transaction was not valid: retry immediately
    ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
    StgTSO_trec(CurrentTSO) = trec;
    R1 = StgAtomicallyFrame_code(frame);
    Sp = frame;
    jump stg_ap_v_fast;
  }
}
1409
1410
/* check# primop: register an invariant closure to be checked when the
 * current transaction commits. */
checkzh_fast
{
  W_ trec, closure;

  /* Args: R1 = invariant closure */
  MAYBE_GC (R1_PTR, checkzh_fast); 

  trec = StgTSO_trec(CurrentTSO);
  closure = R1;
  foreign "C" stmAddInvariantToCheck(MyCapability() "ptr", 
                                     trec "ptr",
                                     closure "ptr") [];

  jump %ENTRY_CODE(Sp(0));
}
1426
1427
/* newTVar# primop: allocate a TVar holding the given initial value. */
newTVarzh_fast
{
  W_ tv;
  W_ new_value;

  /* Args: R1 = initialisation value */

  MAYBE_GC (R1_PTR, newTVarzh_fast); 
  new_value = R1;
  ("ptr" tv) = foreign "C" stmNewTVar(MyCapability() "ptr", new_value "ptr") [];
  RET_P(tv);
}
1440
1441
/* readTVar# primop: read a TVar inside the current transaction,
 * recording the access in its TRec. */
readTVarzh_fast
{
  W_ trec;
  W_ tvar;
  W_ result;

  /* Args: R1 = TVar closure */

  MAYBE_GC (R1_PTR, readTVarzh_fast); // Call to stmReadTVar may allocate
  trec = StgTSO_trec(CurrentTSO);
  tvar = R1;
  ("ptr" result) = foreign "C" stmReadTVar(MyCapability() "ptr", trec "ptr", tvar "ptr") [];

  RET_P(result);
}
1457
/* readTVarIO# primop: read a TVar outside any transaction.
 *
 * Re-reads until the current value is not a TREC_HEADER -- presumably
 * a committing transaction can transiently leave its TRec header in
 * the slot (NOTE(review): confirm against STM.c's commit protocol).
 */
readTVarIOzh_fast
{
    W_ result;

again:
    result = StgTVar_current_value(R1);
    if (%INFO_PTR(result) == stg_TREC_HEADER_info) {
        goto again;
    }
    RET_P(result);
}
1469
/* writeTVar# primop: record a write to a TVar in the current
 * transaction's TRec (visible to others only on commit). */
writeTVarzh_fast
{
  W_ trec;
  W_ tvar;
  W_ new_value;
  
  /* Args: R1 = TVar closure */
  /*       R2 = New value    */

  MAYBE_GC (R1_PTR & R2_PTR, writeTVarzh_fast); // Call to stmWriteTVar may allocate
  trec = StgTSO_trec(CurrentTSO);
  tvar = R1;
  new_value = R2;
  foreign "C" stmWriteTVar(MyCapability() "ptr", trec "ptr", tvar "ptr", new_value "ptr") [];

  jump %ENTRY_CODE(Sp(0));
}
1487
1488
1489 /* -----------------------------------------------------------------------------
1490  * MVar primitives
1491  *
1492  * take & putMVar work as follows.  Firstly, an important invariant:
1493  *
1494  *    If the MVar is full, then the blocking queue contains only
1495  *    threads blocked on putMVar, and if the MVar is empty then the
1496  *    blocking queue contains only threads blocked on takeMVar.
1497  *
 * takeMVar:
1499  *    MVar empty : then add ourselves to the blocking queue
1500  *    MVar full  : remove the value from the MVar, and
1501  *                 blocking queue empty     : return
1502  *                 blocking queue non-empty : perform the first blocked putMVar
1503  *                                            from the queue, and wake up the
1504  *                                            thread (MVar is now full again)
1505  *
1506  * putMVar is just the dual of the above algorithm.
1507  *
1508  * How do we "perform a putMVar"?  Well, we have to fiddle around with
1509  * the stack of the thread waiting to do the putMVar.  See
1510  * stg_block_putmvar and stg_block_takemvar in HeapStackCheck.c for
1511  * the stack layout, and the PerformPut and PerformTake macros below.
1512  *
1513  * It is important that a blocked take or put is woken up with the
1514  * take/put already performed, because otherwise there would be a
1515  * small window of vulnerability where the thread could receive an
1516  * exception and never perform its take or put, and we'd end up with a
1517  * deadlock.
1518  *
1519  * -------------------------------------------------------------------------- */
1520
/* isEmptyMVar# primop: 1 if the MVar holds no value, else 0.
 * Unsynchronised read -- the answer may be stale by the time the
 * caller acts on it. */
isEmptyMVarzh_fast
{
    /* args: R1 = MVar closure */

    if (StgMVar_value(R1) == stg_END_TSO_QUEUE_closure) {
        RET_N(1);
    } else {
        RET_N(0);
    }
}
1531
/* newMVar# primop: allocate an empty MVar (value and both queue ends
 * set to the END_TSO_QUEUE sentinel). */
newMVarzh_fast
{
    /* args: none */
    W_ mvar;

    ALLOC_PRIM ( SIZEOF_StgMVar, NO_PTRS, newMVarzh_fast );
  
    mvar = Hp - SIZEOF_StgMVar + WDS(1);
    SET_HDR(mvar,stg_MVAR_DIRTY_info,W_[CCCS]);
        // MVARs start dirty: generation 0 has no mutable list
    StgMVar_head(mvar)  = stg_END_TSO_QUEUE_closure;
    StgMVar_tail(mvar)  = stg_END_TSO_QUEUE_closure;
    StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
    RET_P(mvar);
}
1547
1548
/* Complete a blocked takeMVar on `tso`'s behalf: write `value` and a
 * stg_gc_unpt_r1_info frame onto its stack so it returns the value
 * when resumed (stack layout: see stg_block_takemvar). */
#define PerformTake(tso, value)                         \
    W_[StgTSO_sp(tso) + WDS(1)] = value;                \
    W_[StgTSO_sp(tso) + WDS(0)] = stg_gc_unpt_r1_info;

/* Complete a blocked putMVar on `tso`'s behalf: pop its 3-word
 * putMVar frame and fetch the value it was putting into `lval`
 * (stack layout: see stg_block_putmvar). */
#define PerformPut(tso,lval)                    \
    StgTSO_sp(tso) = StgTSO_sp(tso) + WDS(3);   \
    lval = W_[StgTSO_sp(tso) - WDS(1)];
1556
/* takeMVar# primop.
 *
 * If the MVar is empty, enqueue the current thread on its blocking
 * queue and block.  Otherwise take the value; if putMVars are queued,
 * perform the first one on the blocked thread's stack and wake it,
 * keeping the MVar full; otherwise leave the MVar empty.
 */
takeMVarzh_fast
{
    W_ mvar, val, info, tso;

    /* args: R1 = MVar closure */
    mvar = R1;

#if defined(THREADED_RTS)
    /* lock the MVar closure against other capabilities */
    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [];
#else
    info = GET_INFO(mvar);
#endif
        
    if (info == stg_MVAR_CLEAN_info) {
        /* about to mutate: put the MVar on the mutable list */
        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr") [];
    }

    /* If the MVar is empty, put ourselves on its blocking queue,
     * and wait until we're woken up.
     */
    if (StgMVar_value(mvar) == stg_END_TSO_QUEUE_closure) {
        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_head(mvar) = CurrentTSO;
        } else {
            foreign "C" setTSOLink(MyCapability() "ptr", 
                                   StgMVar_tail(mvar) "ptr",
                                   CurrentTSO) [];
        }
        StgTSO__link(CurrentTSO)       = stg_END_TSO_QUEUE_closure;
        StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
        StgTSO_block_info(CurrentTSO)  = mvar;
        StgMVar_tail(mvar) = CurrentTSO;
        
        R1 = mvar;
        jump stg_block_takemvar;
  }

  /* we got the value... */
  val = StgMVar_value(mvar);

  if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure)
  {
      /* There are putMVar(s) waiting... 
       * wake up the first thread on the queue
       */
      ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);

      /* actually perform the putMVar for the thread that we just woke up */
      tso = StgMVar_head(mvar);
      PerformPut(tso,StgMVar_value(mvar));

      /* NOTE(review): relies on C-- giving '&' higher precedence than
       * '==' (unlike C), i.e. (flags & TSO_DIRTY) == 0 -- confirm
       * against CmmParse.y. */
      if (TO_W_(StgTSO_flags(tso)) & TSO_DIRTY == 0) {
          foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") [];
      }

      ("ptr" tso) = foreign "C" unblockOne_(MyCapability() "ptr", 
                                            StgMVar_head(mvar) "ptr", 1) [];
      StgMVar_head(mvar) = tso;

      if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
          StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
      }

#if defined(THREADED_RTS)
      unlockClosure(mvar, stg_MVAR_DIRTY_info);
#else
      SET_INFO(mvar,stg_MVAR_DIRTY_info);
#endif
      RET_P(val);
  } 
  else
  {
      /* No further putMVars, MVar is now empty */
      StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
 
#if defined(THREADED_RTS)
      unlockClosure(mvar, stg_MVAR_DIRTY_info);
#else
      SET_INFO(mvar,stg_MVAR_DIRTY_info);
#endif

      RET_P(val);
  }
}
1641
1642
/* tryTakeMVar# primop.
 *
 * Non-blocking takeMVar: returns (0, dummy) if the MVar is empty,
 * otherwise (1, value), waking one queued putMVar if present.
 */
tryTakeMVarzh_fast
{
    W_ mvar, val, info, tso;

    /* args: R1 = MVar closure */

    mvar = R1;

#if defined(THREADED_RTS)
    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [];
#else
    info = GET_INFO(mvar);
#endif

    if (StgMVar_value(mvar) == stg_END_TSO_QUEUE_closure) {
#if defined(THREADED_RTS)
        /* nothing to take: restore the original header and bail */
        unlockClosure(mvar, info);
#endif
        /* HACK: we need a pointer to pass back, 
         * so we abuse NO_FINALIZER_closure
         */
        RET_NP(0, stg_NO_FINALIZER_closure);
    }

    if (info == stg_MVAR_CLEAN_info) {
        /* about to mutate: put the MVar on the mutable list */
        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr");
    }

    /* we got the value... */
    val = StgMVar_value(mvar);

    if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {

        /* There are putMVar(s) waiting... 
         * wake up the first thread on the queue
         */
        ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);

        /* actually perform the putMVar for the thread that we just woke up */
        tso = StgMVar_head(mvar);
        PerformPut(tso,StgMVar_value(mvar));
        /* (flags & TSO_DIRTY) == 0 under C-- precedence (see takeMVar) */
        if (TO_W_(StgTSO_flags(tso)) & TSO_DIRTY == 0) {
            foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") [];
        }

        ("ptr" tso) = foreign "C" unblockOne_(MyCapability() "ptr", 
                                              StgMVar_head(mvar) "ptr", 1) [];
        StgMVar_head(mvar) = tso;

        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
        }
#if defined(THREADED_RTS)
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
#else
        SET_INFO(mvar,stg_MVAR_DIRTY_info);
#endif
    }
    else 
    {
        /* No further putMVars, MVar is now empty */
        StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
#if defined(THREADED_RTS)
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
#else
        SET_INFO(mvar,stg_MVAR_DIRTY_info);
#endif
    }
    
    RET_NP(1, val);
}
1714
1715
/* putMVar# primop.
 *
 * If the MVar is full, enqueue the current thread and block.
 * Otherwise, if takeMVars are queued, perform the first one directly
 * on the blocked thread's stack and wake it (MVar stays empty);
 * otherwise store the value, making the MVar full.
 */
putMVarzh_fast
{
    W_ mvar, val, info, tso;

    /* args: R1 = MVar, R2 = value */
    mvar = R1;
    val  = R2;

#if defined(THREADED_RTS)
    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [];
#else
    info = GET_INFO(mvar);
#endif

    if (info == stg_MVAR_CLEAN_info) {
        /* about to mutate: put the MVar on the mutable list */
        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr");
    }

    /* full MVar: block the current thread on its queue */
    if (StgMVar_value(mvar) != stg_END_TSO_QUEUE_closure) {
        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_head(mvar) = CurrentTSO;
        } else {
            foreign "C" setTSOLink(MyCapability() "ptr", 
                                   StgMVar_tail(mvar) "ptr",
                                   CurrentTSO) [];
        }
        StgTSO__link(CurrentTSO)       = stg_END_TSO_QUEUE_closure;
        StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
        StgTSO_block_info(CurrentTSO)  = mvar;
        StgMVar_tail(mvar) = CurrentTSO;
        
        R1 = mvar;
        R2 = val;
        jump stg_block_putmvar;
    }
  
    if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {

        /* There are takeMVar(s) waiting: wake up the first one
         */
        ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);

        /* actually perform the takeMVar */
        tso = StgMVar_head(mvar);
        PerformTake(tso, val);
        /* (flags & TSO_DIRTY) == 0 under C-- precedence (see takeMVar) */
        if (TO_W_(StgTSO_flags(tso)) & TSO_DIRTY == 0) {
            foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") [];
        }
      
        ("ptr" tso) = foreign "C" unblockOne_(MyCapability() "ptr", 
                                              StgMVar_head(mvar) "ptr", 1) [];
        StgMVar_head(mvar) = tso;

        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
        }

#if defined(THREADED_RTS)
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
#else
        SET_INFO(mvar,stg_MVAR_DIRTY_info);
#endif
        jump %ENTRY_CODE(Sp(0));
    }
    else
    {
        /* No further takes, the MVar is now full. */
        StgMVar_value(mvar) = val;

#if defined(THREADED_RTS)
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
#else
        SET_INFO(mvar,stg_MVAR_DIRTY_info);
#endif
        jump %ENTRY_CODE(Sp(0));
    }
    
    /* ToDo: yield afterward for better communication performance? */
}
1795
1796
/* tryPutMVar# primop.
 *
 * Non-blocking putMVar: returns 0 if the MVar is already full,
 * otherwise stores/hands off the value (waking one queued takeMVar
 * if present) and returns 1.
 */
tryPutMVarzh_fast
{
    W_ mvar, info, tso;

    /* args: R1 = MVar, R2 = value */
    mvar = R1;

#if defined(THREADED_RTS)
    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [R2];
#else
    info = GET_INFO(mvar);
#endif

    if (StgMVar_value(mvar) != stg_END_TSO_QUEUE_closure) {
#if defined(THREADED_RTS)
        /* MVar full: restore the original header and bail */
        unlockClosure(mvar, info);
#endif
        RET_N(0);
    }
  
    if (info == stg_MVAR_CLEAN_info) {
        /* about to mutate: put the MVar on the mutable list */
        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr");
    }

    if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {

        /* There are takeMVar(s) waiting: wake up the first one
         */
        ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);
        
        /* actually perform the takeMVar */
        tso = StgMVar_head(mvar);
        PerformTake(tso, R2);
        /* (flags & TSO_DIRTY) == 0 under C-- precedence (see takeMVar) */
        if (TO_W_(StgTSO_flags(tso)) & TSO_DIRTY == 0) {
            foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") [];
        }
      
        ("ptr" tso) = foreign "C" unblockOne_(MyCapability() "ptr", 
                                              StgMVar_head(mvar) "ptr", 1) [];
        StgMVar_head(mvar) = tso;

        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
        }

#if defined(THREADED_RTS)
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
#else
        SET_INFO(mvar,stg_MVAR_DIRTY_info);
#endif
    }
    else
    {
        /* No further takes, the MVar is now full. */
        StgMVar_value(mvar) = R2;

#if defined(THREADED_RTS)
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
#else
        SET_INFO(mvar,stg_MVAR_DIRTY_info);
#endif
    }
    
    RET_N(1);
    /* ToDo: yield afterward for better communication performance? */
}
1863
1864
1865 /* -----------------------------------------------------------------------------
1866    Stable pointer primitives
1867    -------------------------------------------------------------------------  */
1868
/* makeStableName# primop.
 *
 * Looks up (or creates) the stable-name table index for the closure
 * in R1 and returns its StableName heap object, allocating one only
 * if this index has no StableName object yet.
 */
makeStableNamezh_fast
{
    W_ index, sn_obj;

    ALLOC_PRIM( SIZEOF_StgStableName, R1_PTR, makeStableNamezh_fast );
  
    (index) = foreign "C" lookupStableName(R1 "ptr") [];

    /* Is there already a StableName for this heap object?
     *  stable_ptr_table is a pointer to an array of snEntry structs.
     */
    if ( snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry) == NULL ) {
        /* no: build one in the space ALLOC_PRIM reserved and cache it */
        sn_obj = Hp - SIZEOF_StgStableName + WDS(1);
        SET_HDR(sn_obj, stg_STABLE_NAME_info, W_[CCCS]);
        StgStableName_sn(sn_obj) = index;
        snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry) = sn_obj;
    } else {
        /* yes: reuse the cached StableName object */
        sn_obj = snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry);
    }
    
    RET_P(sn_obj);
}
1891
1892
/* makeStablePtr# primop: register R1 in the stable pointer table and
 * return the StablePtr# (a table index, hence RET_N). */
makeStablePtrzh_fast
{
    /* Args: R1 = a */
    W_ sp;
    MAYBE_GC(R1_PTR, makeStablePtrzh_fast);
    ("ptr" sp) = foreign "C" getStablePtr(R1 "ptr") [];
    RET_N(sp);
}
1901
/* deRefStablePtr# primop: index the stable pointer table with the
 * StablePtr# and return the referenced closure. */
deRefStablePtrzh_fast
{
    /* Args: R1 = the stable ptr */
    W_ r, sp;
    sp = R1;
    r = snEntry_addr(W_[stable_ptr_table] + sp*SIZEOF_snEntry);
    RET_P(r);
}
1910
1911 /* -----------------------------------------------------------------------------
1912    Bytecode object primitives
1913    -------------------------------------------------------------------------  */
1914
/* newBCO# — allocate a new bytecode object (BCO).
 * The BCO is variable-sized: the header fields are followed by an inline
 * copy of the bitmap array's payload (the arity/bitmap info used when the
 * BCO is applied).
 */
newBCOzh_fast
{
    /* R1 = instrs
       R2 = literals
       R3 = ptrs
       R4 = arity
       R5 = bitmap array
    */
    W_ bco, bitmap_arr, bytes, words;
    
    bitmap_arr = R5;

    // Total size = fixed StgBCO header + one word per bitmap-array word.
    words = BYTES_TO_WDS(SIZEOF_StgBCO) + StgArrWords_words(bitmap_arr);
    bytes = WDS(words);

    // May GC: all pointer args (R1,R2,R3,R5) are roots; R4 is a plain int.
    ALLOC_PRIM( bytes, R1_PTR&R2_PTR&R3_PTR&R5_PTR, newBCOzh_fast );

    bco = Hp - bytes + WDS(1);
    SET_HDR(bco, stg_BCO_info, W_[CCCS]);
    
    StgBCO_instrs(bco)     = R1;
    StgBCO_literals(bco)   = R2;
    StgBCO_ptrs(bco)       = R3;
    StgBCO_arity(bco)      = HALF_W_(R4);   // arity/size are half-words
    StgBCO_size(bco)       = HALF_W_(words);
    
    // Copy the arity/bitmap info into the BCO
    W_ i;
    i = 0;
for:
    if (i < StgArrWords_words(bitmap_arr)) {
        StgBCO_bitmap(bco,i) = StgArrWords_payload(bitmap_arr,i);
        i = i + 1;
        goto for;
    }
    
    RET_P(bco);
}
1953
1954
/* mkApUpd0# — wrap a zero-arity BCO in an updatable AP thunk.
 * Returns the new AP closure; the BCO itself is unchanged.
 */
mkApUpd0zh_fast
{
    // R1 = the BCO# for the AP
    //  
    W_ ap;

    // This function is *only* used to wrap zero-arity BCOs in an
    // updatable wrapper (see ByteCodeLink.lhs).  An AP thunk is always
    // saturated and always points directly to a FUN or BCO.
    ASSERT(%INFO_TYPE(%GET_STD_INFO(R1)) == HALF_W_(BCO) &&
           StgBCO_arity(R1) == HALF_W_(0));

    // Heap check with ticky accounting; R1 is a GC root across the check.
    HP_CHK_GEN_TICKY(SIZEOF_StgAP, R1_PTR, mkApUpd0zh_fast);
    TICK_ALLOC_UP_THK(0, 0);
    CCCS_ALLOC(SIZEOF_StgAP);

    ap = Hp - SIZEOF_StgAP + WDS(1);
    SET_HDR(ap, stg_AP_info, W_[CCCS]);
    
    // Zero arguments: the AP's payload is just the function (the BCO).
    StgAP_n_args(ap) = HALF_W_(0);
    StgAP_fun(ap) = R1;
    
    RET_P(ap);
}
1979
/* unpackClosure# — reflect on a closure's layout.
 * Returns (info ptr, array of pointer fields, array of non-pointer words).
 * The pointer fields are copied into a frozen MUT_ARR_PTRS and the
 * non-pointer words into an ARR_WORDS, so the caller gets stable copies.
 */
unpackClosurezh_fast
{
/* args: R1 = closure to analyze */
// TODO: Consider the absence of ptrs or nonptrs as a special case ?

    W_ info, ptrs, nptrs, p, ptrs_arr, nptrs_arr;
    info  = %GET_STD_INFO(UNTAG(R1));

    // Some closures have non-standard layout, so we omit those here.
    W_ type;
    type = TO_W_(%INFO_TYPE(info));
    switch [0 .. N_CLOSURE_TYPES] type {
    case THUNK_SELECTOR : {
        // A selector's single pointer (the selectee) is exposed; no nonptrs.
        ptrs = 1;
        nptrs = 0;
        goto out;
    }
    case THUNK, THUNK_1_0, THUNK_0_1, THUNK_2_0, THUNK_1_1, 
         THUNK_0_2, THUNK_STATIC, AP, PAP, AP_STACK, BCO : {
        // Non-standard layouts: report no fields rather than mis-read them.
        ptrs = 0;
        nptrs = 0;
        goto out;
    }
    default: {
        // Standard layout: counts come straight from the info table.
        ptrs  = TO_W_(%INFO_PTRS(info)); 
        nptrs = TO_W_(%INFO_NPTRS(info));
        goto out;
    }}
out:

    // Allocate both result arrays in one heap check; R1 is a GC root.
    W_ ptrs_arr_sz, nptrs_arr_sz;
    nptrs_arr_sz = SIZEOF_StgArrWords   + WDS(nptrs);
    ptrs_arr_sz  = SIZEOF_StgMutArrPtrs + WDS(ptrs);

    ALLOC_PRIM (ptrs_arr_sz + nptrs_arr_sz, R1_PTR, unpackClosurezh_fast);

    W_ clos;
    clos = UNTAG(R1);

    // Carve the two arrays out of the freshly allocated space:
    // [ ptrs_arr | nptrs_arr ] ending at Hp.
    ptrs_arr  = Hp - nptrs_arr_sz - ptrs_arr_sz + WDS(1);
    nptrs_arr = Hp - nptrs_arr_sz + WDS(1);

    SET_HDR(ptrs_arr, stg_MUT_ARR_PTRS_FROZEN_info, W_[CCCS]);
    StgMutArrPtrs_ptrs(ptrs_arr) = ptrs;
    p = 0;
    // Copy the pointer fields (payload words 0..ptrs-1).
for:
    if(p < ptrs) {
         W_[ptrs_arr + SIZEOF_StgMutArrPtrs + WDS(p)] = StgClosure_payload(clos,p);
         p = p + 1;
         goto for;
    }
    
    SET_HDR(nptrs_arr, stg_ARR_WORDS_info, W_[CCCS]);
    StgArrWords_words(nptrs_arr) = nptrs;
    p = 0;
    // Copy the non-pointer words, which follow the pointers in the payload.
for2:
    if(p < nptrs) {
         W_[BYTE_ARR_CTS(nptrs_arr) + WDS(p)] = StgClosure_payload(clos, p+ptrs);
         p = p + 1;
         goto for2;
    }
    RET_NPP(info, ptrs_arr, nptrs_arr);
}
2043
2044 /* -----------------------------------------------------------------------------
2045    Thread I/O blocking primitives
2046    -------------------------------------------------------------------------- */
2047
/* Add a thread to the end of the blocked queue. (C-- version of the C
 * macro in Schedule.h).
 *
 * Precondition: tso's link field is already END_TSO_QUEUE.  If the queue
 * is empty, tso becomes the head; otherwise it is linked after the current
 * tail via setTSOLink.  Either way tso becomes the new tail.  Only used by
 * the non-threaded RTS paths below, so no locking is needed.
 */
#define APPEND_TO_BLOCKED_QUEUE(tso)                    \
    ASSERT(StgTSO__link(tso) == END_TSO_QUEUE);         \
    if (W_[blocked_queue_hd] == END_TSO_QUEUE) {        \
      W_[blocked_queue_hd] = tso;                       \
    } else {                                            \
      foreign "C" setTSOLink(MyCapability() "ptr", W_[blocked_queue_tl] "ptr", tso) []; \
    }                                                   \
    W_[blocked_queue_tl] = tso;
2059
/* waitRead# — block the current thread until the given fd is readable.
 * Only supported by the non-threaded RTS (the threaded RTS uses the I/O
 * manager instead); calling it there is a fatal error.
 */
waitReadzh_fast
{
    /* args: R1 */
#ifdef THREADED_RTS
    foreign "C" barf("waitRead# on threaded RTS") never returns;
#else

    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
    StgTSO_why_blocked(CurrentTSO) = BlockedOnRead::I16;
    StgTSO_block_info(CurrentTSO) = R1;   // remember which fd we wait on
    // No locking - we're not going to use this interface in the
    // threaded RTS anyway.
    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
    jump stg_block_noregs;
#endif
}
2076
/* waitWrite# — block the current thread until the given fd is writable.
 * Non-threaded RTS only; mirror image of waitReadzh_fast above.
 */
waitWritezh_fast
{
    /* args: R1 */
#ifdef THREADED_RTS
    foreign "C" barf("waitWrite# on threaded RTS") never returns;
#else

    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
    StgTSO_why_blocked(CurrentTSO) = BlockedOnWrite::I16;
    StgTSO_block_info(CurrentTSO) = R1;   // remember which fd we wait on
    // No locking - we're not going to use this interface in the
    // threaded RTS anyway.
    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
    jump stg_block_noregs;
#endif
}
2093
2094
STRING(stg_delayzh_malloc_str, "delayzh_fast")
/* delay# — block the current thread for (at least) R1 microseconds.
 * Non-threaded RTS only.  On mingw32 the delay becomes an async request
 * handled by the Win32 I/O layer; elsewhere the thread is inserted into
 * the sleeping_queue, kept sorted by wake-up target time in ticks.
 */
delayzh_fast
{
#ifdef mingw32_HOST_OS
    W_ ares;
    CInt reqID;
#else
    W_ t, prev, target;
#endif

#ifdef THREADED_RTS
    foreign "C" barf("delay# on threaded RTS") never returns;
#else

    /* args: R1 (microsecond delay amount) */
    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
    StgTSO_why_blocked(CurrentTSO) = BlockedOnDelay::I16;

#ifdef mingw32_HOST_OS

    /* could probably allocate this on the heap instead */
    ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
                                            stg_delayzh_malloc_str);
    (reqID) = foreign "C" addDelayRequest(R1);
    StgAsyncIOResult_reqID(ares)   = reqID;
    StgAsyncIOResult_len(ares)     = 0;
    StgAsyncIOResult_errCode(ares) = 0;
    StgTSO_block_info(CurrentTSO)  = ares;

    /* Having all async-blocked threads reside on the blocked_queue
     * simplifies matters, so change the status to OnDoProc and put the
     * delayed thread on the blocked_queue.
     */
    StgTSO_why_blocked(CurrentTSO) = BlockedOnDoProc::I16;
    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
    jump stg_block_async_void;

#else

    /* Convert the microsecond delay into a target time in ticks.
     * divisor = tick interval in microseconds (tickInterval is in ms;
     * a 0 setting falls back to the default of 50 ms). */
    W_ time;
    W_ divisor;
    (time) = foreign "C" getourtimeofday() [R1];
    divisor = TO_W_(RtsFlags_MiscFlags_tickInterval(RtsFlags));
    if (divisor == 0) {
        divisor = 50;
    }
    divisor = divisor * 1000;
    target = ((R1 + divisor - 1) / divisor) /* divide rounding up */
           + time + 1; /* Add 1 as getourtimeofday rounds down */
    StgTSO_block_info(CurrentTSO) = target;

    /* Insert the new thread in the sleeping queue. */
    // Walk to the first entry whose wake-up time is >= ours, keeping the
    // queue sorted by target time.
    prev = NULL;
    t = W_[sleeping_queue];
while:
    if (t != END_TSO_QUEUE && StgTSO_block_info(t) < target) {
        prev = t;
        t = StgTSO__link(t);
        goto while;
    }

    // Splice CurrentTSO in between prev and t (or at the head).
    StgTSO__link(CurrentTSO) = t;
    if (prev == NULL) {
        W_[sleeping_queue] = CurrentTSO;
    } else {
        foreign "C" setTSOLink(MyCapability() "ptr", prev "ptr", CurrentTSO) [];
    }
    jump stg_block_noregs;
#endif
#endif /* !THREADED_RTS */
}
2166
2167
2168 #ifdef mingw32_HOST_OS
STRING(stg_asyncReadzh_malloc_str, "asyncReadzh_fast")
/* asyncRead# — issue an asynchronous read request (mingw32, non-threaded
 * RTS only) and block the current thread until it completes.  The malloc'd
 * StgAsyncIOResult carries the request id and, later, the result/error.
 */
asyncReadzh_fast
{
    W_ ares;
    CInt reqID;

#ifdef THREADED_RTS
    foreign "C" barf("asyncRead# on threaded RTS") never returns;
#else

    /* args: R1 = fd, R2 = isSock, R3 = len, R4 = buf */
    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
    StgTSO_why_blocked(CurrentTSO) = BlockedOnRead::I16;

    /* could probably allocate this on the heap instead */
    ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
                                            stg_asyncReadzh_malloc_str)
                        [R1,R2,R3,R4];
    // 0/*FALSE*/ = this is a read request, not a write.
    (reqID) = foreign "C" addIORequest(R1, 0/*FALSE*/,R2,R3,R4 "ptr") [];
    StgAsyncIOResult_reqID(ares)   = reqID;
    StgAsyncIOResult_len(ares)     = 0;
    StgAsyncIOResult_errCode(ares) = 0;
    StgTSO_block_info(CurrentTSO)  = ares;
    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
    jump stg_block_async;
#endif
}
2196
STRING(stg_asyncWritezh_malloc_str, "asyncWritezh_fast")
/* asyncWrite# — issue an asynchronous write request (mingw32, non-threaded
 * RTS only) and block the current thread until it completes.  Mirror image
 * of asyncReadzh_fast above.
 */
asyncWritezh_fast
{
    W_ ares;
    CInt reqID;

#ifdef THREADED_RTS
    foreign "C" barf("asyncWrite# on threaded RTS") never returns;
#else

    /* args: R1 = fd, R2 = isSock, R3 = len, R4 = buf */
    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
    StgTSO_why_blocked(CurrentTSO) = BlockedOnWrite::I16;

    ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
                                            stg_asyncWritezh_malloc_str)
                        [R1,R2,R3,R4];
    // 1/*TRUE*/ = this is a write request.
    (reqID) = foreign "C" addIORequest(R1, 1/*TRUE*/,R2,R3,R4 "ptr") [];

    StgAsyncIOResult_reqID(ares)   = reqID;
    StgAsyncIOResult_len(ares)     = 0;
    StgAsyncIOResult_errCode(ares) = 0;
    StgTSO_block_info(CurrentTSO)  = ares;
    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
    jump stg_block_async;
#endif
}
2224
STRING(stg_asyncDoProczh_malloc_str, "asyncDoProczh_fast")
/* asyncDoProc# — run a procedure asynchronously via the Win32 I/O worker
 * threads (mingw32, non-threaded RTS only) and block the current thread
 * until it finishes.
 */
asyncDoProczh_fast
{
    W_ ares;
    CInt reqID;

#ifdef THREADED_RTS
    foreign "C" barf("asyncDoProc# on threaded RTS") never returns;
#else

    /* args: R1 = proc, R2 = param */
    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
    StgTSO_why_blocked(CurrentTSO) = BlockedOnDoProc::I16;

    /* could probably allocate this on the heap instead */
    ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
                                            stg_asyncDoProczh_malloc_str) 
                                [R1,R2];
    (reqID) = foreign "C" addDoProcRequest(R1 "ptr",R2 "ptr") [];
    StgAsyncIOResult_reqID(ares)   = reqID;
    StgAsyncIOResult_len(ares)     = 0;
    StgAsyncIOResult_errCode(ares) = 0;
    StgTSO_block_info(CurrentTSO) = ares;
    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
    jump stg_block_async;
#endif
}
2252 #endif
2253
2254 // noDuplicate# tries to ensure that none of the thunks under
2255 // evaluation by the current thread are also under evaluation by
2256 // another thread.  It relies on *both* threads doing noDuplicate#;
2257 // the second one will get blocked if they are duplicating some work.
noDuplicatezh_fast
{
    // Save the thread state, then hand the TSO to threadPaused, which may
    // suspend the current computation (or kill the thread) if another
    // thread is already evaluating the same thunks.
    SAVE_THREAD_STATE();
    ASSERT(StgTSO_what_next(CurrentTSO) == ThreadRunGHC::I16);
    foreign "C" threadPaused (MyCapability() "ptr", CurrentTSO "ptr") [];
    
    if (StgTSO_what_next(CurrentTSO) == ThreadKilled::I16) {
        // threadPaused decided this thread should die.
        jump stg_threadFinished;
    } else {
        // Otherwise resume exactly where we left off.
        LOAD_THREAD_STATE();
        ASSERT(StgTSO_what_next(CurrentTSO) == ThreadRunGHC::I16);
        jump %ENTRY_CODE(Sp(0));
    }
}
2272
/* getApStackVal# — fetch a word from an AP_STACK closure's payload.
 * Returns (1, value) if R1 really is an AP_STACK, otherwise (0, R1)
 * so the caller can tell the lookup did not apply.
 */
getApStackValzh_fast
{
   W_ ap_stack, offset, val, ok;

   /* args: R1 = AP_STACK, R2 = offset */
   ap_stack = R1;
   offset   = R2;

   if (%INFO_PTR(ap_stack) == stg_AP_STACK_info) {
        ok = 1;
        val = StgAP_STACK_payload(ap_stack,offset); 
   } else {
        // Not an AP_STACK (e.g. it has been updated); report failure.
        ok = 0;
        val = R1;
   }
   RET_NP(ok,val);
}
2290
/* getSpark# — try to take a spark from this capability's spark pool.
 * Returns (1, spark) on success, or (0, False) if there is no spark
 * (always the latter on the non-threaded RTS, which has no spark pools).
 * NOTE(review): the "sparks converted" statistic is presumably bumped
 * inside findSpark() itself — confirm against rts/Sparks.c.
 */
getSparkzh_fast
{
   W_ spark;

#ifndef THREADED_RTS
   RET_NP(0,ghczmprim_GHCziBool_False_closure);
#else
   (spark) = foreign "C" findSpark(MyCapability());
   if (spark != 0) {
      RET_NP(1,spark);
   } else {
      // No spark available; False stands in for the (unused) second slot.
      RET_NP(0,ghczmprim_GHCziBool_False_closure);
   }
#endif
}