Partial fix for #2917
[ghc-hetmet.git] / rts / PrimOps.cmm
1 /* -----------------------------------------------------------------------------
2  *
3  * (c) The GHC Team, 1998-2004
4  *
5  * Out-of-line primitive operations
6  *
7  * This file contains the implementations of all the primitive
8  * operations ("primops") which are not expanded inline.  See
9  * ghc/compiler/prelude/primops.txt.pp for a list of all the primops;
10  * this file contains code for most of those with the attribute
11  * out_of_line=True.
12  *
13  * Entry convention: the entry convention for a primop is that all the
14  * args are in Stg registers (R1, R2, etc.).  This is to make writing
15  * the primops easier.  (see compiler/codeGen/CgCallConv.hs).
16  *
17  * Return convention: results from a primop are generally returned
18  * using the ordinary unboxed tuple return convention.  The C-- parser
19  * implements the RET_xxxx() macros to perform unboxed-tuple returns
20  * based on the prevailing return convention.
21  *
22  * This file is written in a subset of C--, extended with various
23  * features specific to GHC.  It is compiled by GHC directly.  For the
24  * syntax of .cmm files, see the parser in ghc/compiler/cmm/CmmParse.y.
25  *
26  * ---------------------------------------------------------------------------*/
27
28 #include "Cmm.h"
29
30 #ifdef __PIC__
31 #ifndef mingw32_HOST_OS
32 import __gmpz_init;
33 import __gmpz_add;
34 import __gmpz_sub;
35 import __gmpz_mul;
36 import __gmpz_gcd;
37 import __gmpn_gcd_1;
38 import __gmpn_cmp;
39 import __gmpz_tdiv_q;
40 import __gmpz_tdiv_r;
41 import __gmpz_tdiv_qr;
42 import __gmpz_fdiv_qr;
43 import __gmpz_divexact;
44 import __gmpz_and;
45 import __gmpz_xor;
46 import __gmpz_ior;
47 import __gmpz_com;
48 #endif
49 import pthread_mutex_lock;
50 import pthread_mutex_unlock;
51 #endif
52 import base_ControlziExceptionziBase_nestedAtomically_closure;
53 import EnterCriticalSection;
54 import LeaveCriticalSection;
55 import ghczmprim_GHCziBool_False_closure;
56
57 /*-----------------------------------------------------------------------------
58   Array Primitives
59
60   Basically just new*Array - the others are all inline macros.
61
62   The size arg is always passed in R1, and the result returned in R1.
63
64   The slow entry point is for returning from a heap check, the saved
65   size argument must be re-loaded from the stack.
66   -------------------------------------------------------------------------- */
67
68 /* for objects that are *less* than the size of a word, make sure we
69  * round up to the nearest word for the size of the array.
70  */
71
/* newByteArray#: allocate a MutableByteArray# of R1 bytes.
 * The byte count is rounded up to a whole number of words; returns the
 * ArrWords closure in R1. */
72 newByteArrayzh_fast
73 {
74     W_ words, payload_words, n, p;
75     MAYBE_GC(NO_PTRS,newByteArrayzh_fast);
76     n = R1;
77     payload_words = ROUNDUP_BYTES_TO_WDS(n);
78     words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
79     ("ptr" p) = foreign "C" allocateLocal(MyCapability() "ptr",words) [];
80     TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
81     SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
82     StgArrWords_words(p) = payload_words;
83     RET_P(p);
84 }
85
86 #define BA_ALIGN 16
87 #define BA_MASK  (BA_ALIGN-1)
88
/* newPinnedByteArray#: like newByteArray#, but the object is allocated
 * pinned (never moved by the GC) and the payload is BA_ALIGN-aligned. */
89 newPinnedByteArrayzh_fast
90 {
91     W_ words, payload_words, n, p;
92
93     MAYBE_GC(NO_PTRS,newPinnedByteArrayzh_fast);
94     n = R1;
95     payload_words = ROUNDUP_BYTES_TO_WDS(n);
96
    // NOTE(review): SIZEOF_StgArrWords is a *byte* count but is added to a
    // *word* count here; this over-allocates (never under-allocates), which
    // looks unintended -- TODO confirm.
97     words = payload_words + ((SIZEOF_StgArrWords + BA_MASK) & ~BA_MASK);
98
99     ("ptr" p) = foreign "C" allocatePinned(words) [];
100     TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
101
102     // This bumps p forwards so that the payload falls on a BA_ALIGN-byte boundary.
103     p = p + ((-p - SIZEOF_StgArrWords) & BA_MASK);
104
105     SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
106     StgArrWords_words(p) = payload_words;
107     RET_P(p);
108 }
109
/* newAlignedPinnedByteArray#: R1 = size in bytes, R2 = alignment in bytes.
 * Presumably R2 is a power of two (the mask arithmetic below requires it)
 * -- TODO confirm against the caller. */
110 newAlignedPinnedByteArrayzh_fast
111 {
112     W_ words, payload_words, n, p, mask;
113
114     MAYBE_GC(NO_PTRS,newAlignedPinnedByteArrayzh_fast);
115     n = R1;
116
    // Alignments no larger than a word need no adjustment: the allocator
    // already returns word-aligned memory.
117     if (R2 > SIZEOF_W) {
118         mask = R2 - 1;
119     } else {
120         mask = 0;
121     }
122
123     payload_words = ROUNDUP_BYTES_TO_WDS(n);
124
125     // We want an <align>-byte aligned array.  allocatePinned() gives us
126     // 8-byte aligned memory by default, but we want to align the
127     // *goods* inside the ArrWords object, so we have to check the
128     // size of the ArrWords header and adjust our size accordingly.
129     words = payload_words + ((SIZEOF_StgArrWords + mask) & ~mask);
130
131     ("ptr" p) = foreign "C" allocatePinned(words) [];
132     TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
133
134     // This bumps p forwards so that the payload falls on an R2-byte boundary.
135     p = p + ((-p - SIZEOF_StgArrWords) & mask);
136
137     SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
138     StgArrWords_words(p) = payload_words;
139     RET_P(p);
140 }
141
/* newArray#: allocate a MutArrPtrs of R1 elements, every element
 * initialised to the closure in R2.  Allocated DIRTY (already on the
 * mutable list). */
142 newArrayzh_fast
143 {
144     W_ words, n, init, arr, p;
145     /* Args: R1 = words, R2 = initialisation value */
146
147     n = R1;
148     MAYBE_GC(R2_PTR,newArrayzh_fast);
149
150     words = BYTES_TO_WDS(SIZEOF_StgMutArrPtrs) + n;
151     ("ptr" arr) = foreign "C" allocateLocal(MyCapability() "ptr",words) [R2];
152     TICK_ALLOC_PRIM(SIZEOF_StgMutArrPtrs, WDS(n), 0);
153
154     SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, W_[CCCS]);
155     StgMutArrPtrs_ptrs(arr) = n;
156
157     // Initialise all elements of the array with the value in R2
158     init = R2;
159     p = arr + SIZEOF_StgMutArrPtrs;
160   for:
161     if (p < arr + WDS(words)) {
162         W_[p] = init;
163         p = p + WDS(1);
164         goto for;
165     }
166
167     RET_P(arr);
168 }
169
/* unsafeThawArray#: flip a frozen MUT_ARR_PTRS back to mutable in place.
 * The FROZEN0/FROZEN distinction below tells us whether it is already on
 * the mutable list. */
170 unsafeThawArrayzh_fast
171 {
172   // SUBTLETY TO DO WITH THE OLD GEN MUTABLE LIST
173   //
174   // A MUT_ARR_PTRS lives on the mutable list, but a MUT_ARR_PTRS_FROZEN 
175   // normally doesn't.  However, when we freeze a MUT_ARR_PTRS, we leave
176   // it on the mutable list for the GC to remove (removing something from
177   // the mutable list is not easy, because the mut_list is only singly-linked).
178   // 
179   // So that we can tell whether a MUT_ARR_PTRS_FROZEN is on the mutable list,
180   // when we freeze it we set the info ptr to be MUT_ARR_PTRS_FROZEN0
181   // to indicate that it is still on the mutable list.
182   //
183   // So, when we thaw a MUT_ARR_PTRS_FROZEN, we must cope with two cases:
184   // either it is on a mut_list, or it isn't.  We adopt the convention that
185   // the closure type is MUT_ARR_PTRS_FROZEN0 if it is on the mutable list,
186   // and MUT_ARR_PTRS_FROZEN otherwise.  In fact it wouldn't matter if
187   // we put it on the mutable list more than once, but it would get scavenged
188   // multiple times during GC, which would be unnecessarily slow.
189   //
190   if (StgHeader_info(R1) != stg_MUT_ARR_PTRS_FROZEN0_info) {
191         SET_INFO(R1,stg_MUT_ARR_PTRS_DIRTY_info);
192         recordMutable(R1, R1);
193         // must be done after SET_INFO, because it ASSERTs closure_MUTABLE()
194         RET_P(R1);
195   } else {
196         SET_INFO(R1,stg_MUT_ARR_PTRS_DIRTY_info);
197         RET_P(R1);
198   }
199 }
200
201 /* -----------------------------------------------------------------------------
202    MutVar primitives
203    -------------------------------------------------------------------------- */
204
/* newMutVar#: allocate a MutVar# whose contents are the closure in R1.
 * Allocated DIRTY (already on the mutable list). */
205 newMutVarzh_fast
206 {
207     W_ mv;
208     /* Args: R1 = initialisation value */
209
210     ALLOC_PRIM( SIZEOF_StgMutVar, R1_PTR, newMutVarzh_fast);
211
212     mv = Hp - SIZEOF_StgMutVar + WDS(1);
213     SET_HDR(mv,stg_MUT_VAR_DIRTY_info,W_[CCCS]);
214     StgMutVar_var(mv) = R1;
215     
216     RET_P(mv);
217 }
218
/* atomicModifyMutVar#: atomically replace the contents x of a MutVar#
 * with a thunk for (fst (f x)), returning a thunk for (snd (f x)). */
219 atomicModifyMutVarzh_fast
220 {
221     W_ mv, f, z, x, y, r, h;
222     /* Args: R1 :: MutVar#,  R2 :: a -> (a,b) */
223
224     /* If x is the current contents of the MutVar#, then 
225        We want to make the new contents point to
226
227          (sel_0 (f x))
228  
229        and the return value is
230          
231          (sel_1 (f x))
232
233         obviously we can share (f x).
234
235          z = [stg_ap_2 f x]  (max (HS + 2) MIN_UPD_SIZE)
236          y = [stg_sel_0 z]   (max (HS + 1) MIN_UPD_SIZE)
237          r = [stg_sel_1 z]   (max (HS + 1) MIN_UPD_SIZE)
238     */
239
240 #if MIN_UPD_SIZE > 1
241 #define THUNK_1_SIZE (SIZEOF_StgThunkHeader + WDS(MIN_UPD_SIZE))
242 #define TICK_ALLOC_THUNK_1() TICK_ALLOC_UP_THK(WDS(1),WDS(MIN_UPD_SIZE-1))
243 #else
244 #define THUNK_1_SIZE (SIZEOF_StgThunkHeader + WDS(1))
245 #define TICK_ALLOC_THUNK_1() TICK_ALLOC_UP_THK(WDS(1),0)
246 #endif
247
248 #if MIN_UPD_SIZE > 2
249 #define THUNK_2_SIZE (SIZEOF_StgThunkHeader + WDS(MIN_UPD_SIZE))
250 #define TICK_ALLOC_THUNK_2() TICK_ALLOC_UP_THK(WDS(2),WDS(MIN_UPD_SIZE-2))
251 #else
252 #define THUNK_2_SIZE (SIZEOF_StgThunkHeader + WDS(2))
253 #define TICK_ALLOC_THUNK_2() TICK_ALLOC_UP_THK(WDS(2),0)
254 #endif
255
256 #define SIZE (THUNK_2_SIZE + THUNK_1_SIZE + THUNK_1_SIZE)
257
258    HP_CHK_GEN_TICKY(SIZE, R1_PTR & R2_PTR, atomicModifyMutVarzh_fast);
259
260    mv = R1;
261    f = R2;
262
   /* z = f `ap` x -- x is filled in inside the retry loop below */
263    TICK_ALLOC_THUNK_2();
264    CCCS_ALLOC(THUNK_2_SIZE);
265    z = Hp - THUNK_2_SIZE + WDS(1);
266    SET_HDR(z, stg_ap_2_upd_info, W_[CCCS]);
267    LDV_RECORD_CREATE(z);
268    StgThunk_payload(z,0) = f;
269
   /* y = sel_0 z -- becomes the new contents of the MutVar */
270    TICK_ALLOC_THUNK_1();
271    CCCS_ALLOC(THUNK_1_SIZE);
272    y = z - THUNK_1_SIZE;
273    SET_HDR(y, stg_sel_0_upd_info, W_[CCCS]);
274    LDV_RECORD_CREATE(y);
275    StgThunk_payload(y,0) = z;
276
   /* r = sel_1 z -- returned to the caller */
277    TICK_ALLOC_THUNK_1();
278    CCCS_ALLOC(THUNK_1_SIZE);
279    r = y - THUNK_1_SIZE;
280    SET_HDR(r, stg_sel_1_upd_info, W_[CCCS]);
281    LDV_RECORD_CREATE(r);
282    StgThunk_payload(r,0) = z;
283
   /* Read the current contents x, plug it into z, and CAS y into the
      MutVar; if another thread changed it in the meantime, retry with the
      value it installed (THREADED_RTS only; otherwise a plain store). */
284  retry:
285    x = StgMutVar_var(mv);
286    StgThunk_payload(z,1) = x;
287 #ifdef THREADED_RTS
288    (h) = foreign "C" cas(mv + SIZEOF_StgHeader + OFFSET_StgMutVar_var, x, y) [];
289    if (h != x) { goto retry; }
290 #else
291    StgMutVar_var(mv) = y;
292 #endif
293
   /* write barrier: a clean MutVar must be reported dirty to the GC */
294    if (GET_INFO(mv) == stg_MUT_VAR_CLEAN_info) {
295      foreign "C" dirty_MUT_VAR(BaseReg "ptr", mv "ptr") [];
296    }
297
298    RET_P(r);
299 }
300
301 /* -----------------------------------------------------------------------------
302    Weak Pointer Primitives
303    -------------------------------------------------------------------------- */
304
305 STRING(stg_weak_msg,"New weak pointer at %p\n")
306
/* mkWeak#: build a weak pointer from key R1 to value R2 with Haskell
 * finalizer R3 (NULL means none), and link it onto weak_ptr_list. */
307 mkWeakzh_fast
308 {
309   /* R1 = key
310      R2 = value
311      R3 = finalizer (or NULL)
312   */
313   W_ w;
314
315   if (R3 == NULL) {
316     R3 = stg_NO_FINALIZER_closure;
317   }
318
319   ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR & R3_PTR, mkWeakzh_fast );
320
321   w = Hp - SIZEOF_StgWeak + WDS(1);
322   SET_HDR(w, stg_WEAK_info, W_[CCCS]);
323
324   // We don't care about cfinalizer here.
325   // Should StgWeak_cfinalizer(w) be stg_NO_FINALIZER_closure or
326   // something else?
327
328   StgWeak_key(w)        = R1;
329   StgWeak_value(w)      = R2;
330   StgWeak_finalizer(w)  = R3;
331   StgWeak_cfinalizer(w) = stg_NO_FINALIZER_closure;
332
333   StgWeak_link(w)       = W_[weak_ptr_list];
334   W_[weak_ptr_list]     = w;
335
336   IF_DEBUG(weak, foreign "C" debugBelch(stg_weak_msg,w) []);
337
338   RET_P(w);
339 }
340
/* mkWeakForeignEnv#: like mkWeak#, but with a C finalizer instead of a
 * Haskell one.  The function pointer, its argument, the environment
 * pointer and the has-environment flag are packed into a 4-word ArrWords
 * hung off StgWeak_cfinalizer (consumed by finalizzeWeakzh_fast below). */
341 mkWeakForeignEnvzh_fast
342 {
343   /* R1 = key
344      R2 = value
345      R3 = finalizer
346      R4 = pointer
347      R5 = has environment (0 or 1)
348      R6 = environment
349   */
350   W_ w, payload_words, words, p;
351
352   W_ key, val, fptr, ptr, flag, eptr;
353
354   key  = R1;
355   val  = R2;
356   fptr = R3;
357   ptr  = R4;
358   flag = R5;
359   eptr = R6;
360
361   ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR & R3_PTR, mkWeakForeignEnvzh_fast );
362
363   w = Hp - SIZEOF_StgWeak + WDS(1);
364   SET_HDR(w, stg_WEAK_info, W_[CCCS]);
365
366   payload_words = 4;
367   words         = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
368   ("ptr" p)     = foreign "C" allocateLocal(MyCapability() "ptr", words) [];
369
370   TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
371   SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
372
373   StgArrWords_words(p)     = payload_words;
374   StgArrWords_payload(p,0) = fptr;
375   StgArrWords_payload(p,1) = ptr;
376   StgArrWords_payload(p,2) = eptr;
377   StgArrWords_payload(p,3) = flag;
378
379   // We don't care about the value here.
380   // Should StgWeak_value(w) be stg_NO_FINALIZER_closure or something else?
381
382   StgWeak_key(w)        = key;
383   StgWeak_value(w)      = val;
384   StgWeak_finalizer(w)  = stg_NO_FINALIZER_closure;
385   StgWeak_cfinalizer(w) = p;
386
387   StgWeak_link(w)   = W_[weak_ptr_list];
388   W_[weak_ptr_list] = w;
389
390   IF_DEBUG(weak, foreign "C" debugBelch(stg_weak_msg,w) []);
391
392   RET_P(w);
393 }
394
/* finalizeWeak#: kill a weak pointer.  Runs any C finalizer immediately;
 * returns (# 0, noFinalizer #) if the weak ptr was already dead or has no
 * Haskell finalizer, otherwise (# 1, finalizer #) for the caller to run. */
395 finalizzeWeakzh_fast
396 {
397   /* R1 = weak ptr
398    */
399   W_ w, f, arr;
400
401   w = R1;
402
403   // already dead?
404   if (GET_INFO(w) == stg_DEAD_WEAK_info) {
405       RET_NP(0,stg_NO_FINALIZER_closure);
406   }
407
408   // kill it
409 #ifdef PROFILING
410   // @LDV profiling
411   // A weak pointer is inherently used, so we do not need to call
412   // LDV_recordDead_FILL_SLOP_DYNAMIC():
413   //    LDV_recordDead_FILL_SLOP_DYNAMIC((StgClosure *)w);
414   // or, LDV_recordDead():
415   //    LDV_recordDead((StgClosure *)w, sizeofW(StgWeak) - sizeofW(StgProfHeader));
416   // Furthermore, when PROFILING is turned on, dead weak pointers are exactly as 
417   // large as weak pointers, so there is no need to fill the slop, either.
418   // See stg_DEAD_WEAK_info in StgMiscClosures.hc.
419 #endif
420
421   //
422   // Todo: maybe use SET_HDR() and remove LDV_recordCreate()?
423   //
424   SET_INFO(w,stg_DEAD_WEAK_info);
425   LDV_RECORD_CREATE(w);
426
427   f   = StgWeak_finalizer(w);
428   arr = StgWeak_cfinalizer(w);
429
430   StgDeadWeak_link(w) = StgWeak_link(w);
431
   // Unpack (fptr, ptr, eptr, flag) written by mkWeakForeignEnvzh_fast
   // and run the C finalizer right away.
432   if (arr != stg_NO_FINALIZER_closure) {
433     foreign "C" runCFinalizer(StgArrWords_payload(arr,0),
434                               StgArrWords_payload(arr,1),
435                               StgArrWords_payload(arr,2),
436                               StgArrWords_payload(arr,3)) [];
437   }
438
439   /* return the finalizer */
440   if (f == stg_NO_FINALIZER_closure) {
441       RET_NP(0,stg_NO_FINALIZER_closure);
442   } else {
443       RET_NP(1,f);
444   }
445 }
446
/* deRefWeak#: dereference a weak pointer.
 * Returns (# 1, value #) while the weak ptr is alive, (# 0, w #) once it
 * has been finalized (its info ptr is no longer stg_WEAK_info). */
447 deRefWeakzh_fast
448 {
449   /* R1 = weak ptr */
450   W_ wp, flag, result;
451
452   wp = R1;
453   /* assume dead: (0, wp) */
454   flag   = 0;
455   result = wp;
456   if (GET_INFO(wp) == stg_WEAK_info) {
457       /* still alive: (1, value) */
458       flag   = 1;
459       result = StgWeak_value(wp);
460   }
461   RET_NP(flag,result);
462 }
462
463 /* -----------------------------------------------------------------------------
464    Arbitrary-precision Integer operations.
465
466    There are some assumptions in this code that mp_limb_t == W_.  This is
467    the case for all the platforms that GHC supports, currently.
468    -------------------------------------------------------------------------- */
469
/* int2Integer#: convert an Int# to a GMP-style (# sign/size, limb array #).
 * s is -1/0/1; the single limb holds the magnitude. */
470 int2Integerzh_fast
471 {
472    /* arguments: R1 = Int# */
473
474    W_ val, s, p;        /* to avoid aliasing */
475
476    val = R1;
477    ALLOC_PRIM( SIZEOF_StgArrWords + WDS(1), NO_PTRS, int2Integerzh_fast );
478
479    p = Hp - SIZEOF_StgArrWords;
480    SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
481    StgArrWords_words(p) = 1;
482
483    /* mpz_set_si is inlined here, makes things simpler */
484    if (%lt(val,0)) { 
485         s  = -1;
486         Hp(0) = -val;
487    } else { 
488      if (%gt(val,0)) {
489         s = 1;
490         Hp(0) = val;
491      } else {
492         s = 0;
493      }
494   }
495
496    /* returns (# size  :: Int#, 
497                  data  :: ByteArray# 
498                #)
499    */
500    RET_NP(s,p);
501 }
502
/* word2Integer#: convert a Word# to (# sign/size, limb array #).
 * s is 0 or 1 (never negative). */
503 word2Integerzh_fast
504 {
505    /* arguments: R1 = Word# */
506
507    W_ val, s, p;        /* to avoid aliasing */
508
509    val = R1;
510
511    ALLOC_PRIM( SIZEOF_StgArrWords + WDS(1), NO_PTRS, word2Integerzh_fast);
512
513    p = Hp - SIZEOF_StgArrWords;
514    SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
515    StgArrWords_words(p) = 1;
516
517    if (val != 0) {
518         s = 1;
519         W_[Hp] = val;
520    } else {
521         s = 0;
522    }
523
524    /* returns (# size  :: Int#, 
525                  data  :: ByteArray# #)
526    */
527    RET_NP(s,p);
528 }
529
530
531 /*
532  * 'long long' primops for converting to/from Integers.
533  */
534
535 #ifdef SUPPORT_LONG_LONGS
536
/* int64ToInteger#: build a 1- or 2-limb Integer from an Int64#.
 * Only compiled when SUPPORT_LONG_LONGS is defined (32-bit platforms,
 * per the surrounding #ifdef), hence the explicit hi/lo word split. */
537 int64ToIntegerzh_fast
538 {
539    /* arguments: L1 = Int64# */
540
541    L_ val;
542    W_ hi, lo, s, neg, words_needed, p;
543
544    val = L1;
545    neg = 0;
546
547    hi = TO_W_(val >> 32);
548    lo = TO_W_(val);
549
   // one limb suffices when hi is all zeros, or all ones with lo /= 0
   // (i.e. the magnitude fits in a single word)
550    if ( hi == 0 || (hi == 0xFFFFFFFF && lo != 0) )  {
551        // minimum is one word
552        words_needed = 1;
553    } else { 
554        words_needed = 2;
555    }
556
557    ALLOC_PRIM( SIZEOF_StgArrWords + WDS(words_needed),
558                NO_PTRS, int64ToIntegerzh_fast );
559
560    p = Hp - SIZEOF_StgArrWords - WDS(words_needed) + WDS(1);
561    SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
562    StgArrWords_words(p) = words_needed;
563
   // negate a two's-complement hi/lo pair to get the magnitude
564    if ( %lt(hi,0) ) {
565      neg = 1;
566      lo = -lo;
567      if(lo == 0) {
568        hi = -hi;
569      } else {
570        hi = -hi - 1;
571      }
572    }
573
574    if ( words_needed == 2 )  { 
575       s = 2;
576       Hp(-1) = lo;
577       Hp(0) = hi;
578    } else { 
579        if ( lo != 0 ) {
580            s = 1;
581            Hp(0) = lo;
582        } else /* val==0 */  {
583            s = 0;
584        }
585    }
586    if ( neg != 0 ) {
587         s = -s;
588    }
589
590    /* returns (# size  :: Int#, 
591                  data  :: ByteArray# #)
592    */
593    RET_NP(s,p);
594 }
/* word64ToInteger#: build a 1- or 2-limb Integer from a Word64#.
 * Only compiled when SUPPORT_LONG_LONGS is defined (32-bit platforms).
 * s is never negative here. */
595 word64ToIntegerzh_fast
596 {
597    /* arguments: L1 = Word64# */
598
599    L_ val;
600    W_ hi, lo, s, words_needed, p;
601
602    val = L1;
603    hi = TO_W_(val >> 32);
604    lo = TO_W_(val);
605
606    if ( hi != 0 ) {
607       words_needed = 2;
608    } else {
609       words_needed = 1;
610    }
611
612    ALLOC_PRIM( SIZEOF_StgArrWords + WDS(words_needed),
613                NO_PTRS, word64ToIntegerzh_fast );
614
615    p = Hp - SIZEOF_StgArrWords - WDS(words_needed) + WDS(1);
616    SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
617    StgArrWords_words(p) = words_needed;
618
619    if ( hi != 0 ) { 
620      s = 2;
621      Hp(-1) = lo;
622      Hp(0)  = hi;
623    } else {
624       if ( lo != 0 ) {
625         s = 1;
626         Hp(0) = lo;
627      } else /* val==0 */  {
628       s = 0;
629      }
630   }
631
632    /* returns (# size  :: Int#, 
633                  data  :: ByteArray# #)
634    */
635    RET_NP(s,p);
636 }
637
638
639
640 #endif /* SUPPORT_LONG_LONGS */
641
642 /* ToDo: this is shockingly inefficient */
643
644 #ifndef THREADED_RTS
/* Static mpz_t workspace for the GMP wrappers below.  Safe without
 * -threaded: only one Haskell execution context runs at a time. */
645 section "bss" {
646   mp_tmp1:
647     bits8 [SIZEOF_MP_INT];
648 }
649
650 section "bss" {
651   mp_tmp2:
652     bits8 [SIZEOF_MP_INT];
653 }
654
655 section "bss" {
656   mp_result1:
657     bits8 [SIZEOF_MP_INT];
658 }
659
660 section "bss" {
661   mp_result2:
662     bits8 [SIZEOF_MP_INT];
663 }
664 #endif
665
666 #ifdef THREADED_RTS
/* With -threaded, each Capability carries its own temps in the
 * StgRegTable: FETCH_MP_TEMP declares X and points it at the slot
 * reached via BaseReg.  Without -threaded the static bss symbols above
 * are referenced directly, so the macro expands to nothing. */
667 #define FETCH_MP_TEMP(X) \
668 W_ X; \
669 X = BaseReg + (OFFSET_StgRegTable_r ## X);
670 #else
671 #define FETCH_MP_TEMP(X) /* Nothing */
672 #endif
673
/* GMP_TAKE2_RET1(name, mp_fun): define primop `name` taking two Integers
 * (R1/R2 = size/limbs, R3/R4 = size/limbs), applying mp_fun, and
 * returning one (# size, data #) pair.  The returned data pointer is the
 * result mpz's _mp_d rebased to its enclosing ArrWords closure -- this
 * relies on GHC's GMP allocator putting limbs inside ArrWords objects. */
674 #define GMP_TAKE2_RET1(name,mp_fun)                                     \
675 name                                                                    \
676 {                                                                       \
677   CInt s1, s2;                                                          \
678   W_ d1, d2;                                                            \
679   FETCH_MP_TEMP(mp_tmp1);                                               \
680   FETCH_MP_TEMP(mp_tmp2);                                               \
681   FETCH_MP_TEMP(mp_result1)                                             \
682   FETCH_MP_TEMP(mp_result2);                                            \
683                                                                         \
684   /* call doYouWantToGC() */                                            \
685   MAYBE_GC(R2_PTR & R4_PTR, name);                                      \
686                                                                         \
687   s1 = W_TO_INT(R1);                                                    \
688   d1 = R2;                                                              \
689   s2 = W_TO_INT(R3);                                                    \
690   d2 = R4;                                                              \
691                                                                         \
692   MP_INT__mp_alloc(mp_tmp1) = W_TO_INT(StgArrWords_words(d1));          \
693   MP_INT__mp_size(mp_tmp1)  = (s1);                                     \
694   MP_INT__mp_d(mp_tmp1)     = BYTE_ARR_CTS(d1);                         \
695   MP_INT__mp_alloc(mp_tmp2) = W_TO_INT(StgArrWords_words(d2));          \
696   MP_INT__mp_size(mp_tmp2)  = (s2);                                     \
697   MP_INT__mp_d(mp_tmp2)     = BYTE_ARR_CTS(d2);                         \
698                                                                         \
699   foreign "C" __gmpz_init(mp_result1 "ptr") [];                            \
700                                                                         \
701   /* Perform the operation */                                           \
702   foreign "C" mp_fun(mp_result1 "ptr",mp_tmp1  "ptr",mp_tmp2  "ptr") []; \
703                                                                         \
704   RET_NP(TO_W_(MP_INT__mp_size(mp_result1)),                            \
705          MP_INT__mp_d(mp_result1) - SIZEOF_StgArrWords);                \
706 }
707
/* GMP_TAKE1_RET1(name, mp_fun): define primop `name` taking one Integer
 * (R1/R2 = size/limbs), applying the unary mp_fun, and returning one
 * (# size, data #) pair (same rebasing trick as GMP_TAKE2_RET1). */
708 #define GMP_TAKE1_RET1(name,mp_fun)                                     \
709 name                                                                    \
710 {                                                                       \
711   CInt s1;                                                              \
712   W_ d1;                                                                \
713   FETCH_MP_TEMP(mp_tmp1);                                               \
714   FETCH_MP_TEMP(mp_result1)                                             \
715                                                                         \
716   /* call doYouWantToGC() */                                            \
717   MAYBE_GC(R2_PTR, name);                                               \
718                                                                         \
719   d1 = R2;                                                              \
720   s1 = W_TO_INT(R1);                                                    \
721                                                                         \
722   MP_INT__mp_alloc(mp_tmp1)     = W_TO_INT(StgArrWords_words(d1));      \
723   MP_INT__mp_size(mp_tmp1)      = (s1);                                 \
724   MP_INT__mp_d(mp_tmp1)         = BYTE_ARR_CTS(d1);                     \
725                                                                         \
726   foreign "C" __gmpz_init(mp_result1 "ptr") [];                            \
727                                                                         \
728   /* Perform the operation */                                           \
729   foreign "C" mp_fun(mp_result1 "ptr",mp_tmp1 "ptr") [];                \
730                                                                         \
731   RET_NP(TO_W_(MP_INT__mp_size(mp_result1)),                            \
732          MP_INT__mp_d(mp_result1) - SIZEOF_StgArrWords);                \
733 }
734
/* GMP_TAKE2_RET2(name, mp_fun): like GMP_TAKE2_RET1 but mp_fun yields two
 * results (e.g. quotient and remainder), returned as two (# size, data #)
 * pairs via RET_NPNP. */
735 #define GMP_TAKE2_RET2(name,mp_fun)                                                     \
736 name                                                                                    \
737 {                                                                                       \
738   CInt s1, s2;                                                                          \
739   W_ d1, d2;                                                                            \
740   FETCH_MP_TEMP(mp_tmp1);                                                               \
741   FETCH_MP_TEMP(mp_tmp2);                                                               \
742   FETCH_MP_TEMP(mp_result1)                                                             \
743   FETCH_MP_TEMP(mp_result2)                                                             \
744                                                                                         \
745   /* call doYouWantToGC() */                                                            \
746   MAYBE_GC(R2_PTR & R4_PTR, name);                                                      \
747                                                                                         \
748   s1 = W_TO_INT(R1);                                                                    \
749   d1 = R2;                                                                              \
750   s2 = W_TO_INT(R3);                                                                    \
751   d2 = R4;                                                                              \
752                                                                                         \
753   MP_INT__mp_alloc(mp_tmp1)     = W_TO_INT(StgArrWords_words(d1));                      \
754   MP_INT__mp_size(mp_tmp1)      = (s1);                                                 \
755   MP_INT__mp_d(mp_tmp1)         = BYTE_ARR_CTS(d1);                                     \
756   MP_INT__mp_alloc(mp_tmp2)     = W_TO_INT(StgArrWords_words(d2));                      \
757   MP_INT__mp_size(mp_tmp2)      = (s2);                                                 \
758   MP_INT__mp_d(mp_tmp2)         = BYTE_ARR_CTS(d2);                                     \
759                                                                                         \
760   foreign "C" __gmpz_init(mp_result1 "ptr") [];                                               \
761   foreign "C" __gmpz_init(mp_result2 "ptr") [];                                               \
762                                                                                         \
763   /* Perform the operation */                                                           \
764   foreign "C" mp_fun(mp_result1 "ptr",mp_result2 "ptr",mp_tmp1 "ptr",mp_tmp2 "ptr") [];    \
765                                                                                         \
766   RET_NPNP(TO_W_(MP_INT__mp_size(mp_result1)),                                          \
767            MP_INT__mp_d(mp_result1) - SIZEOF_StgArrWords,                               \
768            TO_W_(MP_INT__mp_size(mp_result2)),                                          \
769            MP_INT__mp_d(mp_result2) - SIZEOF_StgArrWords);                              \
770 }
771
/* Instantiate the wrappers above to define the Integer arithmetic,
 * bitwise, and division primops. */
772 GMP_TAKE2_RET1(plusIntegerzh_fast,     __gmpz_add)
773 GMP_TAKE2_RET1(minusIntegerzh_fast,    __gmpz_sub)
774 GMP_TAKE2_RET1(timesIntegerzh_fast,    __gmpz_mul)
775 GMP_TAKE2_RET1(gcdIntegerzh_fast,      __gmpz_gcd)
776 GMP_TAKE2_RET1(quotIntegerzh_fast,     __gmpz_tdiv_q)
777 GMP_TAKE2_RET1(remIntegerzh_fast,      __gmpz_tdiv_r)
778 GMP_TAKE2_RET1(divExactIntegerzh_fast, __gmpz_divexact)
779 GMP_TAKE2_RET1(andIntegerzh_fast,      __gmpz_and)
780 GMP_TAKE2_RET1(orIntegerzh_fast,       __gmpz_ior)
781 GMP_TAKE2_RET1(xorIntegerzh_fast,      __gmpz_xor)
782 GMP_TAKE1_RET1(complementIntegerzh_fast, __gmpz_com)
783
784 GMP_TAKE2_RET2(quotRemIntegerzh_fast, __gmpz_tdiv_qr)
785 GMP_TAKE2_RET2(divModIntegerzh_fast,  __gmpz_fdiv_qr)
786
787 #ifndef THREADED_RTS
788 section "bss" {
789   mp_tmp_w:  W_; // NB. mp_tmp_w is really an mp_limb_t here
790 }
791 #endif
792
/* gcdInt#: gcd of two Int#s via GMP's mpn_gcd_1, treating R1 as a
 * one-limb number stored in the mp_tmp_w scratch slot. */
793 gcdIntzh_fast
794 {
795     /* R1 = the first Int#; R2 = the second Int# */
796     W_ r; 
797     FETCH_MP_TEMP(mp_tmp_w);
798
799     W_[mp_tmp_w] = R1;
800     (r) = foreign "C" __gmpn_gcd_1(mp_tmp_w "ptr", 1, R2) [];
801
802     R1 = r;
803     /* Result parked in R1, return via info-pointer at TOS */
804     jump %ENTRY_CODE(Sp(0));
805 }
806
807
/* gcdIntegerInt#: gcd of an Integer (R1 = size, R2 = limb array) and an
 * Int# (R3), via mpn_gcd_1 directly on the limb array. */
808 gcdIntegerIntzh_fast
809 {
810     /* R1 = s1; R2 = d1; R3 = the int */
811     W_ s1;
812     (s1) = foreign "C" __gmpn_gcd_1( BYTE_ARR_CTS(R2) "ptr", R1, R3) [];
813     R1 = s1;
814     
815     /* Result parked in R1, return via info-pointer at TOS */
816     jump %ENTRY_CODE(Sp(0));
817 }
818
819
/* cmpIntegerInt#: compare an Integer (R1 = size, R2 = limbs) against an
 * Int# (R3); R1 := negative / 0 / positive for LT / EQ / GT. */
820 cmpIntegerIntzh_fast
821 {
822     /* R1 = s1; R2 = d1; R3 = the int */
823     W_ usize, vsize, v_digit, u_digit;
824
825     usize = R1;
826     vsize = 0;
827     v_digit = R3;
828
829     // paraphrased from __gmpz_cmp_si() in the GMP sources
830     if (%gt(v_digit,0)) {
831         vsize = 1;
832     } else { 
833         if (%lt(v_digit,0)) {
834             vsize = -1;
835             v_digit = -v_digit;
836         }
837     }
838
    // different signs/sizes decide the comparison immediately
839     if (usize != vsize) {
840         R1 = usize - vsize; 
841         jump %ENTRY_CODE(Sp(0));
842     }
843
844     if (usize == 0) {
845         R1 = 0; 
846         jump %ENTRY_CODE(Sp(0));
847     }
848
849     u_digit = W_[BYTE_ARR_CTS(R2)];
850
851     if (u_digit == v_digit) {
852         R1 = 0; 
853         jump %ENTRY_CODE(Sp(0));
854     }
855
856     if (%gtu(u_digit,v_digit)) { // NB. unsigned: these are mp_limb_t's
857         R1 = usize; 
858     } else {
859         R1 = -usize; 
860     }
861
862     jump %ENTRY_CODE(Sp(0));
863 }
864
/* cmpInteger#: compare two Integers (R1/R2 vs R3/R4 as size/limbs);
 * R1 := -1 / 0 / 1 (or the raw size difference when the signs differ). */
865 cmpIntegerzh_fast
866 {
867     /* R1 = s1; R2 = d1; R3 = s2; R4 = d2 */
868     W_ usize, vsize, size, up, vp;
869     CInt cmp;
870
871     // paraphrased from __gmpz_cmp() in the GMP sources
872     usize = R1;
873     vsize = R3;
874
875     if (usize != vsize) {
876         R1 = usize - vsize; 
877         jump %ENTRY_CODE(Sp(0));
878     }
879
880     if (usize == 0) {
881         R1 = 0; 
882         jump %ENTRY_CODE(Sp(0));
883     }
884
885     if (%lt(usize,0)) { // NB. not <, which is unsigned
886         size = -usize;
887     } else {
888         size = usize;
889     }
890
891     up = BYTE_ARR_CTS(R2);
892     vp = BYTE_ARR_CTS(R4);
893
894     (cmp) = foreign "C" __gmpn_cmp(up "ptr", vp "ptr", size) [];
895
896     if (cmp == 0 :: CInt) {
897         R1 = 0; 
898         jump %ENTRY_CODE(Sp(0));
899     }
900
    // same magnitude ordering and sign agree => u > v; otherwise u < v
901     if (%lt(cmp,0 :: CInt) == %lt(usize,0)) {
902         R1 = 1;
903     } else {
904         R1 = (-1); 
905     }
906     /* Result parked in R1, return via info-pointer at TOS */
907     jump %ENTRY_CODE(Sp(0));
908 }
909
/* integer2Int#: low limb of the Integer, negated if the size is
 * negative; 0 for a zero-sized Integer. */
910 integer2Intzh_fast
911 {
912     /* R1 = s; R2 = d */
913     W_ r, s;
914
915     s = R1;
916     if (s == 0) {
917         r = 0;
918     } else {
919         r = W_[R2 + SIZEOF_StgArrWords];
920         if (%lt(s,0)) {
921             r = -r;
922         }
923     }
924     /* Result parked in R1, return via info-pointer at TOS */
925     R1 = r;
926     jump %ENTRY_CODE(Sp(0));
927 }
928
/* integer2Word#: low limb of the Integer, negated (mod word size) when
 * the sign/size is negative; 0 for a zero-sized Integer. */
929 integer2Wordzh_fast
930 {
931   /* R1 = s; R2 = d */
932   W_ sign, result;
933
934   sign = R1;
935   if (sign == 0) {
936       result = 0;
937   } else {
938       /* least-significant limb lives just past the ArrWords header */
939       result = W_[R2 + SIZEOF_StgArrWords];
940       if (%lt(sign,0)) {
941           result = -result;
942       }
943   }
944   /* Result parked in R1, return via info-pointer at TOS */
945   R1 = result;
946   jump %ENTRY_CODE(Sp(0));
947 }
947
/* decodeFloat#: split F1 into (# exponent, mantissa sign/size, mantissa
 * ByteArray# #) via the C helper __decodeFloat, which fills in the mpz
 * temp whose _mp_d we point at a freshly allocated 1-word ArrWords.
 * Fix: the opening brace of the function body was missing (dropped line
 * in the source -- note the jump in the original numbering, 948 to 950);
 * restored here. */
948 decodeFloatzh_fast
949 {
950     W_ p;
951     F_ arg;
952     FETCH_MP_TEMP(mp_tmp1);
953     FETCH_MP_TEMP(mp_tmp_w);
954     
955     /* arguments: F1 = Float# */
956     arg = F1;
957     
958     ALLOC_PRIM( SIZEOF_StgArrWords + WDS(1), NO_PTRS, decodeFloatzh_fast );
959     
960     /* Be prepared to tell Lennart-coded __decodeFloat
961        where mantissa._mp_d can be put (it does not care about the rest) */
962     p = Hp - SIZEOF_StgArrWords;
963     SET_HDR(p,stg_ARR_WORDS_info,W_[CCCS]);
964     StgArrWords_words(p) = 1;
965     MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(p);
966     
967     /* Perform the operation */
968     foreign "C" __decodeFloat(mp_tmp1 "ptr",mp_tmp_w "ptr" ,arg) [];
969     
970     /* returns: (Int# (expn), Int#, ByteArray#) */
971     RET_NNP(W_[mp_tmp_w], TO_W_(MP_INT__mp_size(mp_tmp1)), p);
972 }
973
decodeFloatzuIntzh_fast
{   /* NB. opening brace was missing after the entry label -- restored. */

    /* decodeFloat_Int# primop: split a Float# into an Int# mantissa and
     * an Int# exponent; no heap allocation needed. */
    F_ arg;
    FETCH_MP_TEMP(mp_tmp1);
    FETCH_MP_TEMP(mp_tmp_w);
    
    /* arguments: F1 = Float# */
    arg = F1;
    
    /* Perform the operation; results land in the two MP temporaries */
    foreign "C" __decodeFloat_Int(mp_tmp1 "ptr", mp_tmp_w "ptr", arg) [];
    
    /* returns: (Int# (mantissa), Int# (exponent)) */
    RET_NN(W_[mp_tmp1], W_[mp_tmp_w]);
}
990
991 #define DOUBLE_MANTISSA_SIZE SIZEOF_DOUBLE
992 #define ARR_SIZE (SIZEOF_StgArrWords + DOUBLE_MANTISSA_SIZE)
993
decodeDoublezh_fast
{   /* NB. opening brace was missing after the entry label -- restored. */

    /* decodeDouble# primop: split a Double# into (exponent, mantissa size,
     * mantissa ByteArray#). */
    D_ arg;
    W_ p;
    FETCH_MP_TEMP(mp_tmp1);
    FETCH_MP_TEMP(mp_tmp_w);

    /* arguments: D1 = Double# */
    arg = D1;

    /* allocate a ByteArray# big enough for the double's mantissa */
    ALLOC_PRIM( ARR_SIZE, NO_PTRS, decodeDoublezh_fast );
    
    /* Be prepared to tell Lennart-coded __decodeDouble
       where mantissa.d can be put (it does not care about the rest) */
    p = Hp - ARR_SIZE + WDS(1);
    SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
    StgArrWords_words(p) = BYTES_TO_WDS(DOUBLE_MANTISSA_SIZE);
    MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(p);

    /* Perform the operation */
    foreign "C" __decodeDouble(mp_tmp1 "ptr", mp_tmp_w "ptr",arg) [];
    
    /* returns: (Int# (expn), Int#, ByteArray#) */
    RET_NNP(W_[mp_tmp_w], TO_W_(MP_INT__mp_size(mp_tmp1)), p);
}
1019
decodeDoublezu2Intzh_fast
{   /* NB. opening brace was missing after the entry label -- restored. */

    /* decodeDouble_2Int# primop: split a Double# into a sign, a two-word
     * mantissa (high/low) and an exponent, all unboxed. */
    D_ arg;
    FETCH_MP_TEMP(mp_tmp1);
    FETCH_MP_TEMP(mp_tmp2);
    FETCH_MP_TEMP(mp_result1);
    FETCH_MP_TEMP(mp_result2);

    /* arguments: D1 = Double# */
    arg = D1;

    /* Perform the operation; the four MP temporaries receive the pieces */
    foreign "C" __decodeDouble_2Int(mp_tmp1 "ptr", mp_tmp2 "ptr",
                                    mp_result1 "ptr", mp_result2 "ptr",
                                    arg) [];

    /* returns:
       (Int# (mant sign), Word# (mant high), Word# (mant low), Int# (expn)) */
    RET_NNNN(W_[mp_tmp1], W_[mp_tmp2], W_[mp_result1], W_[mp_result2]);
}
1041
1042 /* -----------------------------------------------------------------------------
1043  * Concurrency primitives
1044  * -------------------------------------------------------------------------- */
1045
forkzh_fast
{
  /* fork# primop: create and schedule a new IO thread running the
   * closure in R1; returns the new ThreadId#. */
  /* args: R1 = closure to spark */

  // GC check up front; R1_PTR keeps the closure alive across a GC here
  MAYBE_GC(R1_PTR, forkzh_fast);

  W_ closure;
  W_ threadid;
  closure = R1;

  ("ptr" threadid) = foreign "C" createIOThread( MyCapability() "ptr", 
                                RtsFlags_GcFlags_initialStkSize(RtsFlags), 
                                closure "ptr") [];

  /* start blocked if the current thread is blocked */
  // i.e. the child inherits the parent's BLOCKEX/INTERRUPTIBLE mask bits
  StgTSO_flags(threadid) = 
     StgTSO_flags(threadid) |  (StgTSO_flags(CurrentTSO) & 
                                (TSO_BLOCKEX::I32 | TSO_INTERRUPTIBLE::I32));

  foreign "C" scheduleThread(MyCapability() "ptr", threadid "ptr") [];

  // switch at the earliest opportunity
  Capability_context_switch(MyCapability()) = 1 :: CInt;
  
  RET_P(threadid);
}
1072
forkOnzh_fast
{
  /* forkOn# primop: like fork#, but schedule the new thread on the
   * capability numbered by R1. */
  /* args: R1 = cpu, R2 = closure to spark */

  // GC check up front; R2_PTR keeps the closure alive across a GC here
  MAYBE_GC(R2_PTR, forkOnzh_fast);

  W_ cpu;
  W_ closure;
  W_ threadid;
  cpu = R1;
  closure = R2;

  ("ptr" threadid) = foreign "C" createIOThread( MyCapability() "ptr", 
                                RtsFlags_GcFlags_initialStkSize(RtsFlags), 
                                closure "ptr") [];

  /* start blocked if the current thread is blocked */
  // i.e. the child inherits the parent's BLOCKEX/INTERRUPTIBLE mask bits
  StgTSO_flags(threadid) = 
     StgTSO_flags(threadid) |  (StgTSO_flags(CurrentTSO) & 
                                (TSO_BLOCKEX::I32 | TSO_INTERRUPTIBLE::I32));

  foreign "C" scheduleThreadOn(MyCapability() "ptr", cpu, threadid "ptr") [];

  // switch at the earliest opportunity
  Capability_context_switch(MyCapability()) = 1 :: CInt;
  
  RET_P(threadid);
}
1101
yieldzh_fast
{
  /* yield# primop: give up the capability; stg_yield_noregs saves the
   * thread state and returns to the scheduler. No args, no results. */
  jump stg_yield_noregs;
}
1106
myThreadIdzh_fast
{
  /* myThreadId# primop: a ThreadId# is just the TSO pointer itself */
  /* no args. */
  RET_P(CurrentTSO);
}
1112
labelThreadzh_fast
{
  /* labelThread# primop: attach a debug label (C string at R2) to the
   * thread R1.  Only does anything in DEBUG builds; otherwise a no-op. */
  /* args: 
        R1 = ThreadId#
        R2 = Addr# */
#ifdef DEBUG
  foreign "C" labelThread(R1 "ptr", R2 "ptr") [];
#endif
  jump %ENTRY_CODE(Sp(0));
}
1123
isCurrentThreadBoundzh_fast
{
  /* isCurrentThreadBound# primop: returns nonzero iff the current TSO
   * is bound to an OS thread (per the RTS isThreadBound() check). */
  /* no args */
  W_ r;
  (r) = foreign "C" isThreadBound(CurrentTSO) [];
  RET_N(r);
}
1131
threadStatuszh_fast
{
    /* threadStatus# primop: report a thread's scheduler status as an
     * Int# understood by GHC.Conc.threadStatus. */
    /* args: R1 :: ThreadId# */
    W_ t;
    W_ blocked_reason;
    W_ next_state;
    W_ status;

    t = R1;
chase:
    /* a relocated TSO forwards to its new incarnation via its _link field */
    if (TO_W_(StgTSO_what_next(t)) == ThreadRelocated) {
        t = StgTSO__link(t);
        goto chase;
    }

    next_state     = TO_W_(StgTSO_what_next(t));
    blocked_reason = TO_W_(StgTSO_why_blocked(t));
    // Note: these two reads are not atomic, so they might end up
    // being inconsistent.  It doesn't matter, since we
    // only return one or the other.  If we wanted to return the
    // contents of block_info too, then we'd have to do some synchronisation.

    /* magic numbers match up with GHC.Conc.threadStatus */
    status = blocked_reason;
    if (next_state == ThreadComplete) {
        status = 16;
    }
    if (next_state == ThreadKilled) {
        status = 17;
    }
    RET_N(status);
}
1165
1166 /* -----------------------------------------------------------------------------
1167  * TVar primitives
1168  * -------------------------------------------------------------------------- */
1169
1170 #define SP_OFF 0
1171
1172 // Catch retry frame ------------------------------------------------------------
1173
INFO_TABLE_RET(stg_catch_retry_frame, CATCH_RETRY_FRAME,
#if defined(PROFILING)
  W_ unused1, W_ unused2,
#endif
  W_ unused3, P_ unused4, P_ unused5)
{
   // Return address for a branch run by catchRetry#: try to commit the
   // nested transaction that executed the branch.
   W_ r, frame, trec, outer;

   frame = Sp;
   trec = StgTSO_trec(CurrentTSO);
   ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
   (r) = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", trec "ptr") [];
   if (r != 0) {
     /* Succeeded (either first branch or second branch) */
     StgTSO_trec(CurrentTSO) = outer;
     Sp = Sp + SIZEOF_StgCatchRetryFrame;
     jump %ENTRY_CODE(Sp(SP_OFF));
   } else {
     /* Did not commit: re-execute */
     W_ new_trec;
     ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
     StgTSO_trec(CurrentTSO) = new_trec;
     /* re-run whichever branch was in progress when validation failed */
     if (StgCatchRetryFrame_running_alt_code(frame) != 0::I32) {
       R1 = StgCatchRetryFrame_alt_code(frame);
     } else {
       R1 = StgCatchRetryFrame_first_code(frame);
     }
     jump stg_ap_v_fast;
   }
}
1204
1205
1206 // Atomically frame ------------------------------------------------------------
1207
INFO_TABLE_RET(stg_atomically_frame, ATOMICALLY_FRAME,
#if defined(PROFILING)
  W_ unused1, W_ unused2,
#endif
  P_ unused3, P_ unused4)
{
  // Return address at the root of an atomically block: run any pending
  // invariant checks (each in its own nested transaction), then try to
  // commit the whole transaction.
  W_ frame, trec, valid, next_invariant, q, outer;

  frame = Sp;
  trec = StgTSO_trec(CurrentTSO);
  ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];

  if (outer == NO_TREC) {
    /* First time back at the atomically frame -- pick up invariants */
    ("ptr" q) = foreign "C" stmGetInvariantsToCheck(MyCapability() "ptr", trec "ptr") [];
    StgAtomicallyFrame_next_invariant_to_check(frame) = q;

  } else {
    /* Second/subsequent time back at the atomically frame -- abort the
     * tx that's checking the invariant and move on to the next one */
    StgTSO_trec(CurrentTSO) = outer;
    q = StgAtomicallyFrame_next_invariant_to_check(frame);
    StgInvariantCheckQueue_my_execution(q) = trec;
    foreign "C" stmAbortTransaction(MyCapability() "ptr", trec "ptr") [];
    /* Don't free trec -- it's linked from q and will be stashed in the
     * invariant if we eventually commit. */
    q = StgInvariantCheckQueue_next_queue_entry(q);
    StgAtomicallyFrame_next_invariant_to_check(frame) = q;
    trec = outer;
  }

  q = StgAtomicallyFrame_next_invariant_to_check(frame);

  if (q != END_INVARIANT_CHECK_QUEUE) {
    /* We can't commit yet: another invariant to check */
    ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", trec "ptr") [];
    StgTSO_trec(CurrentTSO) = trec;

    /* run the invariant's code in the fresh nested transaction */
    next_invariant = StgInvariantCheckQueue_invariant(q);
    R1 = StgAtomicInvariant_code(next_invariant);
    jump stg_ap_v_fast;

  } else {

    /* We've got no more invariants to check, try to commit */
    (valid) = foreign "C" stmCommitTransaction(MyCapability() "ptr", trec "ptr") [];
    if (valid != 0) {
      /* Transaction was valid: commit succeeded */
      StgTSO_trec(CurrentTSO) = NO_TREC;
      Sp = Sp + SIZEOF_StgAtomicallyFrame;
      jump %ENTRY_CODE(Sp(SP_OFF));
    } else {
      /* Transaction was not valid: try again */
      ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr") [];
      StgTSO_trec(CurrentTSO) = trec;
      StgAtomicallyFrame_next_invariant_to_check(frame) = END_INVARIANT_CHECK_QUEUE;
      R1 = StgAtomicallyFrame_code(frame);
      jump stg_ap_v_fast;
    }
  }
}
1269
INFO_TABLE_RET(stg_atomically_waiting_frame, ATOMICALLY_FRAME,
#if defined(PROFILING)
  W_ unused1, W_ unused2,
#endif
  P_ unused3, P_ unused4)
{
  // Return address used while a transaction is blocked after retry#:
  // decide whether to keep sleeping or to re-run the atomically block.
  W_ frame, trec, valid;

  frame = Sp;

  /* The TSO is currently waiting: should we stop waiting? */
  (valid) = foreign "C" stmReWait(MyCapability() "ptr", CurrentTSO "ptr") [];
  if (valid != 0) {
    /* Previous attempt is still valid: no point trying again yet */
    jump stg_block_noregs;
  } else {
    /* Previous attempt is no longer valid: try again */
    ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr") [];
    StgTSO_trec(CurrentTSO) = trec;
    /* revert to the ordinary atomically frame before re-running the code */
    StgHeader_info(frame) = stg_atomically_frame_info;
    R1 = StgAtomicallyFrame_code(frame);
    jump stg_ap_v_fast;
  }
}
1294
1295 // STM catch frame --------------------------------------------------------------
1296
1297 #define SP_OFF 0
1298
1299 /* Catch frames are very similar to update frames, but when entering
1300  * one we just pop the frame off the stack and perform the correct
1301  * kind of return to the activation record underneath us on the stack.
1302  */
1303
INFO_TABLE_RET(stg_catch_stm_frame, CATCH_STM_FRAME,
#if defined(PROFILING)
  W_ unused1, W_ unused2,
#endif
  P_ unused3, P_ unused4)
   {
      // Return address for the body of catchSTM#: the body finished
      // without raising, so try to commit its nested transaction.
      W_ r, frame, trec, outer;
      frame = Sp;
      trec = StgTSO_trec(CurrentTSO);
      ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
      (r) = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", trec "ptr") [];
      if (r != 0) {
        /* Commit succeeded */
        StgTSO_trec(CurrentTSO) = outer;
        Sp = Sp + SIZEOF_StgCatchSTMFrame;
        /* Return through %ENTRY_CODE like the other STM frames above;
         * a bare 'jump Sp(SP_OFF)' is not the right return convention
         * for unregisterised builds (where ENTRY_CODE is not identity). */
        jump %ENTRY_CODE(Sp(SP_OFF));
      } else {
        /* Commit failed: re-run the protected code in a fresh nested tx */
        W_ new_trec;
        ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
        StgTSO_trec(CurrentTSO) = new_trec;
        R1 = StgCatchSTMFrame_code(frame);
        jump stg_ap_v_fast;
      }
   }
1329
1330
1331 // Primop definition ------------------------------------------------------------
1332
atomicallyzh_fast
{
  /* atomically# primop: run the STM action in R1 as a top-level memory
   * transaction, pushing an atomically frame to commit on return. */
  W_ frame;
  W_ old_trec;
  W_ new_trec;
  
  // stmStartTransaction may allocate
  MAYBE_GC (R1_PTR, atomicallyzh_fast); 

  /* Args: R1 = m :: STM a */
  STK_CHK_GEN(SIZEOF_StgAtomicallyFrame + WDS(1), R1_PTR, atomicallyzh_fast);

  old_trec = StgTSO_trec(CurrentTSO);

  /* Nested transactions are not allowed; raise an exception */
  if (old_trec != NO_TREC) {
     R1 = base_ControlziExceptionziBase_nestedAtomically_closure;
     jump raisezh_fast;
  }

  /* Set up the atomically frame */
  Sp = Sp - SIZEOF_StgAtomicallyFrame;
  frame = Sp;

  SET_HDR(frame,stg_atomically_frame_info, W_[CCCS]);
  StgAtomicallyFrame_code(frame) = R1;
  StgAtomicallyFrame_next_invariant_to_check(frame) = END_INVARIANT_CHECK_QUEUE;

  /* Start the memory transaction */
  ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", old_trec "ptr") [R1];
  StgTSO_trec(CurrentTSO) = new_trec;

  /* Apply R1 to the realworld token */
  jump stg_ap_v_fast;
}
1368
1369
catchSTMzh_fast
{
  /* catchSTM# primop: run the STM action in R1 inside a nested
   * transaction, with R2 installed as the exception handler. */
  W_ frame;
  
  /* Args: R1 :: STM a */
  /* Args: R2 :: Exception -> STM a */
  STK_CHK_GEN(SIZEOF_StgCatchSTMFrame + WDS(1), R1_PTR & R2_PTR, catchSTMzh_fast);

  /* Set up the catch frame */
  Sp = Sp - SIZEOF_StgCatchSTMFrame;
  frame = Sp;

  SET_HDR(frame, stg_catch_stm_frame_info, W_[CCCS]);
  StgCatchSTMFrame_handler(frame) = R2;
  StgCatchSTMFrame_code(frame) = R1;

  /* Start a nested transaction to run the body of the try block in */
  W_ cur_trec;  
  W_ new_trec;
  cur_trec = StgTSO_trec(CurrentTSO);
  /* NOTE(review): unlike the other stmStartTransaction call sites, this
   * one carries no live-register annotation ('[]'/'[R1]') -- confirm
   * this is intentional. */
  ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", cur_trec "ptr");
  StgTSO_trec(CurrentTSO) = new_trec;

  /* Apply R1 to the realworld token */
  jump stg_ap_v_fast;
}
1396
1397
catchRetryzh_fast
{
  /* catchRetry# primop: run R1 in a nested transaction; if it calls
   * retry#, run the alternative R2 instead (see stg_catch_retry_frame). */
  W_ frame;
  W_ new_trec;
  W_ trec;

  // stmStartTransaction may allocate
  MAYBE_GC (R1_PTR & R2_PTR, catchRetryzh_fast); 

  /* Args: R1 :: STM a */
  /* Args: R2 :: STM a */
  STK_CHK_GEN(SIZEOF_StgCatchRetryFrame + WDS(1), R1_PTR & R2_PTR, catchRetryzh_fast);

  /* Start a nested transaction within which to run the first code */
  trec = StgTSO_trec(CurrentTSO);
  ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", trec "ptr") [R1,R2];
  StgTSO_trec(CurrentTSO) = new_trec;

  /* Set up the catch-retry frame */
  Sp = Sp - SIZEOF_StgCatchRetryFrame;
  frame = Sp;
  
  SET_HDR(frame, stg_catch_retry_frame_info, W_[CCCS]);
  StgCatchRetryFrame_running_alt_code(frame) = 0 :: CInt; // false;
  StgCatchRetryFrame_first_code(frame) = R1;
  StgCatchRetryFrame_alt_code(frame) = R2;

  /* Apply R1 to the realworld token */
  jump stg_ap_v_fast;
}
1428
1429
retryzh_fast
{
  /* retry# primop: abandon the current transaction attempt.  Walk the
   * stack for the nearest CATCH_RETRY_FRAME (to try its alternative) or
   * the enclosing ATOMICALLY_FRAME (to block until a read TVar changes). */
  W_ frame_type;
  W_ frame;
  W_ trec;
  W_ outer;
  W_ r;

  MAYBE_GC (NO_PTRS, retryzh_fast); // STM operations may allocate

  // Find the enclosing ATOMICALLY_FRAME or CATCH_RETRY_FRAME
retry_pop_stack:
  StgTSO_sp(CurrentTSO) = Sp;
  (frame_type) = foreign "C" findRetryFrameHelper(CurrentTSO "ptr") [];
  Sp = StgTSO_sp(CurrentTSO);
  frame = Sp;
  trec = StgTSO_trec(CurrentTSO);
  ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];

  if (frame_type == CATCH_RETRY_FRAME) {
    // The retry reaches a CATCH_RETRY_FRAME before the atomic frame
    ASSERT(outer != NO_TREC);
    // Abort the transaction attempting the current branch
    foreign "C" stmAbortTransaction(MyCapability() "ptr", trec "ptr") [];
    foreign "C" stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr") [];
    // NB. '!x != 0' groups as '(!x) != 0', i.e. true exactly when
    // x == 0: we were still running the first branch.
    if (!StgCatchRetryFrame_running_alt_code(frame) != 0::I32) {
      // Retry in the first branch: try the alternative
      ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
      StgTSO_trec(CurrentTSO) = trec;
      StgCatchRetryFrame_running_alt_code(frame) = 1 :: CInt; // true;
      R1 = StgCatchRetryFrame_alt_code(frame);
      jump stg_ap_v_fast;
    } else {
      // Retry in the alternative code: propagate the retry
      StgTSO_trec(CurrentTSO) = outer;
      Sp = Sp + SIZEOF_StgCatchRetryFrame;
      goto retry_pop_stack;
    }
  }

  // We've reached the ATOMICALLY_FRAME: attempt to wait 
  ASSERT(frame_type == ATOMICALLY_FRAME);
  if (outer != NO_TREC) {
    // We called retry while checking invariants, so abort the current
    // invariant check (merging its TVar accesses into the parent's read
    // set so we'll wait on them)
    foreign "C" stmAbortTransaction(MyCapability() "ptr", trec "ptr") [];
    foreign "C" stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr") [];
    trec = outer;
    StgTSO_trec(CurrentTSO) = trec;
    ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
  }
  ASSERT(outer == NO_TREC);

  (r) = foreign "C" stmWait(MyCapability() "ptr", CurrentTSO "ptr", trec "ptr") [];
  if (r != 0) {
    // Transaction was valid: stmWait put us on the TVars' queues, we now block
    StgHeader_info(frame) = stg_atomically_waiting_frame_info;
    Sp = frame;
    // Fix up the stack in the unregisterised case: the return convention is different.
    R3 = trec; // passing to stmWaitUnblock()
    jump stg_block_stmwait;
  } else {
    // Transaction was not valid: retry immediately
    ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
    StgTSO_trec(CurrentTSO) = trec;
    R1 = StgAtomicallyFrame_code(frame);
    Sp = frame;
    jump stg_ap_v_fast;
  }
}
1501
1502
checkzh_fast
{
  /* check# primop: queue the invariant closure in R1 on the current
   * transaction; it is examined when the atomically block commits. */
  W_ cur_trec, inv;

  /* Args: R1 = invariant closure */
  MAYBE_GC (R1_PTR, checkzh_fast);

  cur_trec = StgTSO_trec(CurrentTSO);
  inv = R1;
  foreign "C" stmAddInvariantToCheck(MyCapability() "ptr",
                                     cur_trec "ptr",
                                     inv "ptr") [];

  jump %ENTRY_CODE(Sp(0));
}
1518
1519
newTVarzh_fast
{
  /* newTVar# primop: allocate a fresh TVar holding the value in R1. */
  W_ tvar;
  W_ initval;

  /* Args: R1 = initialisation value */

  MAYBE_GC (R1_PTR, newTVarzh_fast);   // stmNewTVar() allocates
  initval = R1;
  ("ptr" tvar) = foreign "C" stmNewTVar(MyCapability() "ptr", initval "ptr") [];
  RET_P(tvar);
}
1532
1533
readTVarzh_fast
{
  /* readTVar# primop: read a TVar inside the current transaction. */
  W_ cur_trec;
  W_ tv;
  W_ val;

  /* Args: R1 = TVar closure */

  MAYBE_GC (R1_PTR, readTVarzh_fast); // Call to stmReadTVar may allocate
  cur_trec = StgTSO_trec(CurrentTSO);
  tv = R1;
  ("ptr" val) = foreign "C" stmReadTVar(MyCapability() "ptr", cur_trec "ptr", tv "ptr") [];

  RET_P(val);
}
1549
readTVarIOzh_fast
{
    /* readTVarIO# primop: read a TVar outside any transaction.
     * args: R1 = TVar closure */
    W_ val;

retry_read:
    /* spin while a committing transaction has parked a TRec header in
       the TVar's current_value slot */
    val = StgTVar_current_value(R1);
    if (%INFO_PTR(val) == stg_TREC_HEADER_info) {
        goto retry_read;
    }
    RET_P(val);
}
1561
writeTVarzh_fast
{
  /* writeTVar# primop: record a TVar write in the current transaction. */
  W_ cur_trec;
  W_ tv;
  W_ newval;

  /* Args: R1 = TVar closure */
  /*       R2 = New value    */

  MAYBE_GC (R1_PTR & R2_PTR, writeTVarzh_fast); // Call to stmWriteTVar may allocate
  cur_trec = StgTSO_trec(CurrentTSO);
  tv = R1;
  newval = R2;
  foreign "C" stmWriteTVar(MyCapability() "ptr", cur_trec "ptr", tv "ptr", newval "ptr") [];

  jump %ENTRY_CODE(Sp(0));
}
1579
1580
1581 /* -----------------------------------------------------------------------------
1582  * MVar primitives
1583  *
1584  * take & putMVar work as follows.  Firstly, an important invariant:
1585  *
1586  *    If the MVar is full, then the blocking queue contains only
1587  *    threads blocked on putMVar, and if the MVar is empty then the
1588  *    blocking queue contains only threads blocked on takeMVar.
1589  *
 * takeMVar:
1591  *    MVar empty : then add ourselves to the blocking queue
1592  *    MVar full  : remove the value from the MVar, and
1593  *                 blocking queue empty     : return
1594  *                 blocking queue non-empty : perform the first blocked putMVar
1595  *                                            from the queue, and wake up the
1596  *                                            thread (MVar is now full again)
1597  *
1598  * putMVar is just the dual of the above algorithm.
1599  *
1600  * How do we "perform a putMVar"?  Well, we have to fiddle around with
1601  * the stack of the thread waiting to do the putMVar.  See
1602  * stg_block_putmvar and stg_block_takemvar in HeapStackCheck.c for
1603  * the stack layout, and the PerformPut and PerformTake macros below.
1604  *
1605  * It is important that a blocked take or put is woken up with the
1606  * take/put already performed, because otherwise there would be a
1607  * small window of vulnerability where the thread could receive an
1608  * exception and never perform its take or put, and we'd end up with a
1609  * deadlock.
1610  *
1611  * -------------------------------------------------------------------------- */
1612
isEmptyMVarzh_fast
{
    /* isEmptyMVar# primop: an MVar is empty iff its value slot holds
     * the end-of-queue marker. */
    /* args: R1 = MVar closure */

    if (StgMVar_value(R1) != stg_END_TSO_QUEUE_closure) {
        RET_N(0);
    } else {
        RET_N(1);
    }
}
1623
newMVarzh_fast
{
    /* newMVar# primop: allocate an empty MVar. */
    /* args: none */
    W_ mv;

    ALLOC_PRIM ( SIZEOF_StgMVar, NO_PTRS, newMVarzh_fast );

    /* carve the MVar out of the freshly bumped heap */
    mv = Hp - SIZEOF_StgMVar + WDS(1);
    SET_HDR(mv,stg_MVAR_DIRTY_info,W_[CCCS]);
        // MVARs start dirty: generation 0 has no mutable list
    StgMVar_value(mv) = stg_END_TSO_QUEUE_closure;
    StgMVar_head(mv)  = stg_END_TSO_QUEUE_closure;
    StgMVar_tail(mv)  = stg_END_TSO_QUEUE_closure;
    RET_P(mv);
}
1639
1640
/* Push the taken value onto a blocked taker's stack, together with a
 * frame that unpacks R1 when the thread resumes (see stg_block_takemvar
 * in HeapStackCheck.c for the stack layout). */
#define PerformTake(tso, value)                         \
    W_[StgTSO_sp(tso) + WDS(1)] = value;                \
    W_[StgTSO_sp(tso) + WDS(0)] = stg_gc_unpt_r1_info;

/* Complete a blocked putter's pending put: advance its stack pointer
 * past the blocking frame and read out the value it was trying to put
 * (see stg_block_putmvar in HeapStackCheck.c for the stack layout). */
#define PerformPut(tso,lval)                    \
    StgTSO_sp(tso) = StgTSO_sp(tso) + WDS(3);   \
    lval = W_[StgTSO_sp(tso) - WDS(1)];
1648
takeMVarzh_fast
{
    /* takeMVar# primop: take the value, blocking if the MVar is empty;
     * if putters are queued, complete the first pending put. */
    W_ mvar, val, info, tso;

    /* args: R1 = MVar closure */
    mvar = R1;

#if defined(THREADED_RTS)
    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [];
#else
    info = GET_INFO(mvar);
#endif

    /* mark the MVar dirty on first mutation since it was cleaned */
    if (info == stg_MVAR_CLEAN_info) {
        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr") [];
    }

    /* If the MVar is empty, put ourselves on its blocking queue,
     * and wait until we're woken up.
     */
    if (StgMVar_value(mvar) == stg_END_TSO_QUEUE_closure) {
        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_head(mvar) = CurrentTSO;
        } else {
            foreign "C" setTSOLink(MyCapability() "ptr", 
                                   StgMVar_tail(mvar) "ptr",
                                   CurrentTSO) [];
        }
        StgTSO__link(CurrentTSO)       = stg_END_TSO_QUEUE_closure;
        StgTSO_block_info(CurrentTSO)  = mvar;
        // write barrier for throwTo(), which looks at block_info
        // if why_blocked==BlockedOnMVar.
        prim %write_barrier() [];
        StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
        StgMVar_tail(mvar) = CurrentTSO;

        R1 = mvar;
        jump stg_block_takemvar;
    }

    /* we got the value... */
    val = StgMVar_value(mvar);

    if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure)
    {
        /* There are putMVar(s) waiting... 
         * wake up the first thread on the queue
         */
        ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);

        /* actually perform the putMVar for the thread that we just woke up */
        tso = StgMVar_head(mvar);
        PerformPut(tso,StgMVar_value(mvar));

        /* NB. parenthesised: '==' binds more tightly than '&', so the
         * unparenthesised 'flags & TSO_DIRTY == 0' grouped as
         * 'flags & (TSO_DIRTY == 0)'. */
        if ((TO_W_(StgTSO_flags(tso)) & TSO_DIRTY) == 0) {
            foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") [];
        }

        ("ptr" tso) = foreign "C" unblockOne_(MyCapability() "ptr", 
                                              StgMVar_head(mvar) "ptr", 1) [];
        StgMVar_head(mvar) = tso;

        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
        }

#if defined(THREADED_RTS)
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
#else
        SET_INFO(mvar,stg_MVAR_DIRTY_info);
#endif
        RET_P(val);
    }
    else
    {
        /* No further putMVars, MVar is now empty */
        StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;

#if defined(THREADED_RTS)
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
#else
        SET_INFO(mvar,stg_MVAR_DIRTY_info);
#endif

        RET_P(val);
    }
}
1736
1737
tryTakeMVarzh_fast
{
    /* tryTakeMVar# primop: non-blocking take.  Returns (0, dummy) when
     * the MVar is empty, (1, value) when the take succeeds. */
    W_ mvar, val, info, tso;

    /* args: R1 = MVar closure */

    mvar = R1;

#if defined(THREADED_RTS)
    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [];
#else
    info = GET_INFO(mvar);
#endif

    /* empty MVar: fail immediately rather than block */
    if (StgMVar_value(mvar) == stg_END_TSO_QUEUE_closure) {
#if defined(THREADED_RTS)
        unlockClosure(mvar, info);
#endif
        /* HACK: we need a pointer to pass back, 
         * so we abuse NO_FINALIZER_closure
         */
        RET_NP(0, stg_NO_FINALIZER_closure);
    }

    if (info == stg_MVAR_CLEAN_info) {
        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr");
    }

    /* we got the value... */
    val = StgMVar_value(mvar);

    if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {

        /* There are putMVar(s) waiting... 
         * wake up the first thread on the queue
         */
        ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);

        /* actually perform the putMVar for the thread that we just woke up */
        tso = StgMVar_head(mvar);
        PerformPut(tso,StgMVar_value(mvar));
        /* NB. parenthesised: '==' binds more tightly than '&'; the
         * unparenthesised form grouped as 'flags & (TSO_DIRTY == 0)'. */
        if ((TO_W_(StgTSO_flags(tso)) & TSO_DIRTY) == 0) {
            foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") [];
        }

        ("ptr" tso) = foreign "C" unblockOne_(MyCapability() "ptr", 
                                              StgMVar_head(mvar) "ptr", 1) [];
        StgMVar_head(mvar) = tso;

        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
        }
#if defined(THREADED_RTS)
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
#else
        SET_INFO(mvar,stg_MVAR_DIRTY_info);
#endif
    }
    else 
    {
        /* No further putMVars, MVar is now empty */
        StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
#if defined(THREADED_RTS)
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
#else
        SET_INFO(mvar,stg_MVAR_DIRTY_info);
#endif
    }

    RET_NP(1, val);
}
1809
1810
putMVarzh_fast
{
    /* putMVar# primop: store a value, blocking if the MVar is full;
     * if takers are queued, hand the value straight to the first one. */
    W_ mvar, val, info, tso;

    /* args: R1 = MVar, R2 = value */
    mvar = R1;
    val  = R2;

#if defined(THREADED_RTS)
    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [];
#else
    info = GET_INFO(mvar);
#endif

    if (info == stg_MVAR_CLEAN_info) {
        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr");
    }

    /* full MVar: join the blocking queue and wait to be woken */
    if (StgMVar_value(mvar) != stg_END_TSO_QUEUE_closure) {
        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_head(mvar) = CurrentTSO;
        } else {
            foreign "C" setTSOLink(MyCapability() "ptr", 
                                   StgMVar_tail(mvar) "ptr",
                                   CurrentTSO) [];
        }
        StgTSO__link(CurrentTSO)       = stg_END_TSO_QUEUE_closure;
        StgTSO_block_info(CurrentTSO)  = mvar;
        // write barrier for throwTo(), which looks at block_info
        // if why_blocked==BlockedOnMVar.
        prim %write_barrier() [];
        StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
        StgMVar_tail(mvar) = CurrentTSO;

        R1 = mvar;
        R2 = val;
        jump stg_block_putmvar;
    }

    if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {

        /* There are takeMVar(s) waiting: wake up the first one
         */
        ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);

        /* actually perform the takeMVar */
        tso = StgMVar_head(mvar);
        PerformTake(tso, val);
        /* NB. parenthesised: '==' binds more tightly than '&'; the
         * unparenthesised form grouped as 'flags & (TSO_DIRTY == 0)'. */
        if ((TO_W_(StgTSO_flags(tso)) & TSO_DIRTY) == 0) {
            foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") [];
        }

        ("ptr" tso) = foreign "C" unblockOne_(MyCapability() "ptr", 
                                              StgMVar_head(mvar) "ptr", 1) [];
        StgMVar_head(mvar) = tso;

        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
        }

#if defined(THREADED_RTS)
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
#else
        SET_INFO(mvar,stg_MVAR_DIRTY_info);
#endif
        jump %ENTRY_CODE(Sp(0));
    }
    else
    {
        /* No further takes, the MVar is now full. */
        StgMVar_value(mvar) = val;

#if defined(THREADED_RTS)
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
#else
        SET_INFO(mvar,stg_MVAR_DIRTY_info);
#endif
        jump %ENTRY_CODE(Sp(0));
    }

    /* ToDo: yield afterward for better communication performance? */
}
1893
1894
tryPutMVarzh_fast
{
    /* tryPutMVar# primop: non-blocking put.  Returns 0 when the MVar is
     * already full, 1 when the put succeeds. */
    W_ mvar, info, tso;

    /* args: R1 = MVar, R2 = value */
    mvar = R1;

#if defined(THREADED_RTS)
    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [R2];
#else
    info = GET_INFO(mvar);
#endif

    /* full MVar: fail immediately rather than block */
    if (StgMVar_value(mvar) != stg_END_TSO_QUEUE_closure) {
#if defined(THREADED_RTS)
        unlockClosure(mvar, info);
#endif
        RET_N(0);
    }

    if (info == stg_MVAR_CLEAN_info) {
        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr");
    }

    if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {

        /* There are takeMVar(s) waiting: wake up the first one
         */
        ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);

        /* actually perform the takeMVar */
        tso = StgMVar_head(mvar);
        PerformTake(tso, R2);
        /* NB. parenthesised: '==' binds more tightly than '&'; the
         * unparenthesised form grouped as 'flags & (TSO_DIRTY == 0)'. */
        if ((TO_W_(StgTSO_flags(tso)) & TSO_DIRTY) == 0) {
            foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") [];
        }

        ("ptr" tso) = foreign "C" unblockOne_(MyCapability() "ptr", 
                                              StgMVar_head(mvar) "ptr", 1) [];
        StgMVar_head(mvar) = tso;

        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
        }

#if defined(THREADED_RTS)
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
#else
        SET_INFO(mvar,stg_MVAR_DIRTY_info);
#endif
    }
    else
    {
        /* No further takes, the MVar is now full. */
        StgMVar_value(mvar) = R2;

#if defined(THREADED_RTS)
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
#else
        SET_INFO(mvar,stg_MVAR_DIRTY_info);
#endif
    }

    RET_N(1);
    /* ToDo: yield afterward for better communication performance? */
}
1961
1962
1963 /* -----------------------------------------------------------------------------
1964    Stable pointer primitives
1965    -------------------------------------------------------------------------  */
1966
/* makeStableName# :: a -> State# RealWorld -> (# State# RealWorld, StableName# a #)
 *
 * args: R1 = the heap object to name.  Returns an StgStableName closure;
 * naming the same object again returns the same closure.
 */
makeStableNamezh_fast
{
    W_ index, sn_obj;

    /* May GC, so R1 must be flagged as a live pointer. */
    ALLOC_PRIM( SIZEOF_StgStableName, R1_PTR, makeStableNamezh_fast );
  
    (index) = foreign "C" lookupStableName(R1 "ptr") [];

    /* Is there already a StableName for this heap object?
     *  stable_ptr_table is a pointer to an array of snEntry structs.
     */
    if ( snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry) == NULL ) {
        /* No: initialise a fresh StgStableName in the space allocated
         * above and record it in the table entry for this index. */
        sn_obj = Hp - SIZEOF_StgStableName + WDS(1);
        SET_HDR(sn_obj, stg_STABLE_NAME_info, W_[CCCS]);
        StgStableName_sn(sn_obj) = index;
        snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry) = sn_obj;
    } else {
        /* Yes: return the existing StableName object. */
        sn_obj = snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry);
    }
    
    RET_P(sn_obj);
}
1989
1990
makeStablePtrzh_fast
{
    /* makeStablePtr# :: a -> State# RealWorld -> (# State# RealWorld, StablePtr# a #)
     *
     * Args: R1 = the closure to pin
     */
    W_ stable_ptr;

    /* Heap check first; R1 is the only live pointer across a GC. */
    MAYBE_GC(R1_PTR, makeStablePtrzh_fast);

    ("ptr" stable_ptr) = foreign "C" getStablePtr(R1 "ptr") [];
    RET_N(stable_ptr);
}
1999
deRefStablePtrzh_fast
{
    /* deRefStablePtr# :: StablePtr# a -> State# RealWorld -> (# State# RealWorld, a #)
     *
     * Args: R1 = the stable ptr (used as an index into stable_ptr_table)
     */
    W_ entry, closure;

    entry   = W_[stable_ptr_table] + R1*SIZEOF_snEntry;
    closure = snEntry_addr(entry);
    RET_P(closure);
}
2008
2009 /* -----------------------------------------------------------------------------
2010    Bytecode object primitives
2011    -------------------------------------------------------------------------  */
2012
/* newBCO# : build a new bytecode object.
 * The bitmap array's contents are copied inline into the tail of the BCO.
 */
newBCOzh_fast
{
    /* R1 = instrs
       R2 = literals
       R3 = ptrs
       R4 = arity
       R5 = bitmap array
    */
    W_ bco, bitmap_arr, bytes, words;
    
    bitmap_arr = R5;

    /* Total size = fixed StgBCO header + the bitmap's word count
     * (the bitmap lives inline at the end of the BCO). */
    words = BYTES_TO_WDS(SIZEOF_StgBCO) + StgArrWords_words(bitmap_arr);
    bytes = WDS(words);

    /* R1,R2,R3,R5 are all heap pointers that must survive a GC. */
    ALLOC_PRIM( bytes, R1_PTR&R2_PTR&R3_PTR&R5_PTR, newBCOzh_fast );

    bco = Hp - bytes + WDS(1);
    SET_HDR(bco, stg_BCO_info, W_[CCCS]);
    
    StgBCO_instrs(bco)     = R1;
    StgBCO_literals(bco)   = R2;
    StgBCO_ptrs(bco)       = R3;
    StgBCO_arity(bco)      = HALF_W_(R4);
    StgBCO_size(bco)       = HALF_W_(words);
    
    // Copy the arity/bitmap info into the BCO, one word at a time.
    W_ i;
    i = 0;
for:
    if (i < StgArrWords_words(bitmap_arr)) {
        StgBCO_bitmap(bco,i) = StgArrWords_payload(bitmap_arr,i);
        i = i + 1;
        goto for;
    }
    
    RET_P(bco);
}
2051
2052
/* mkApUpd0# : wrap a zero-arity BCO in an updatable AP thunk. */
mkApUpd0zh_fast
{
    // R1 = the BCO# for the AP
    //  
    W_ ap;

    // This function is *only* used to wrap zero-arity BCOs in an
    // updatable wrapper (see ByteCodeLink.lhs).  An AP thunk is always
    // saturated and always points directly to a FUN or BCO.
    ASSERT(%INFO_TYPE(%GET_STD_INFO(R1)) == HALF_W_(BCO) &&
           StgBCO_arity(R1) == HALF_W_(0));

    // Allocate the AP; R1 is a live pointer across a possible GC.
    HP_CHK_GEN_TICKY(SIZEOF_StgAP, R1_PTR, mkApUpd0zh_fast);
    TICK_ALLOC_UP_THK(0, 0);
    CCCS_ALLOC(SIZEOF_StgAP);

    ap = Hp - SIZEOF_StgAP + WDS(1);
    SET_HDR(ap, stg_AP_info, W_[CCCS]);
    
    // Zero arguments: the AP is just a header plus the function field.
    StgAP_n_args(ap) = HALF_W_(0);
    StgAP_fun(ap) = R1;
    
    RET_P(ap);
}
2077
/* unpackClosure# : return (info ptr, array of pointer fields,
 * array of non-pointer words) for an arbitrary closure. */
unpackClosurezh_fast
{
/* args: R1 = closure to analyze */
// TODO: Consider the absence of ptrs or nonptrs as a special case ?

    W_ info, ptrs, nptrs, p, ptrs_arr, nptrs_arr;
    info  = %GET_STD_INFO(UNTAG(R1));

    // Some closures have non-standard layout, so we omit those here.
    W_ type;
    type = TO_W_(%INFO_TYPE(info));
    switch [0 .. N_CLOSURE_TYPES] type {
    case THUNK_SELECTOR : {
        ptrs = 1;
        nptrs = 0;
        goto out;
    }
    case THUNK, THUNK_1_0, THUNK_0_1, THUNK_2_0, THUNK_1_1, 
         THUNK_0_2, THUNK_STATIC, AP, PAP, AP_STACK, BCO : {
        // Thunks and applications: expose no fields at all.
        ptrs = 0;
        nptrs = 0;
        goto out;
    }
    default: {
        // Standard layout: counts come straight from the info table.
        ptrs  = TO_W_(%INFO_PTRS(info)); 
        nptrs = TO_W_(%INFO_NPTRS(info));
        goto out;
    }}
out:

    // Allocate both result arrays with a single heap check.
    W_ ptrs_arr_sz, nptrs_arr_sz;
    nptrs_arr_sz = SIZEOF_StgArrWords   + WDS(nptrs);
    ptrs_arr_sz  = SIZEOF_StgMutArrPtrs + WDS(ptrs);

    ALLOC_PRIM (ptrs_arr_sz + nptrs_arr_sz, R1_PTR, unpackClosurezh_fast);

    // Re-read and untag R1 only after the possible GC above.
    W_ clos;
    clos = UNTAG(R1);

    // Layout of the fresh allocation: pointer array first, then the
    // non-pointer array, ending at Hp.
    ptrs_arr  = Hp - nptrs_arr_sz - ptrs_arr_sz + WDS(1);
    nptrs_arr = Hp - nptrs_arr_sz + WDS(1);

    SET_HDR(ptrs_arr, stg_MUT_ARR_PTRS_FROZEN_info, W_[CCCS]);
    StgMutArrPtrs_ptrs(ptrs_arr) = ptrs;
    p = 0;
for:
    if(p < ptrs) {
         W_[ptrs_arr + SIZEOF_StgMutArrPtrs + WDS(p)] = StgClosure_payload(clos,p);
         p = p + 1;
         goto for;
    }
    
    SET_HDR(nptrs_arr, stg_ARR_WORDS_info, W_[CCCS]);
    StgArrWords_words(nptrs_arr) = nptrs;
    p = 0;
for2:
    if(p < nptrs) {
         // Non-pointer words follow the pointer fields in the payload.
         W_[BYTE_ARR_CTS(nptrs_arr) + WDS(p)] = StgClosure_payload(clos, p+ptrs);
         p = p + 1;
         goto for2;
    }
    RET_NPP(info, ptrs_arr, nptrs_arr);
}
2141
2142 /* -----------------------------------------------------------------------------
2143    Thread I/O blocking primitives
2144    -------------------------------------------------------------------------- */
2145
/* Add a thread to the end of the blocked queue. (C-- version of the C
 * macro in Schedule.h).  The thread must not already be on any queue:
 * its link field is required to be END_TSO_QUEUE (see the ASSERT).
 * setTSOLink is used for the interior link so the write barrier fires.
 */
#define APPEND_TO_BLOCKED_QUEUE(tso)                    \
    ASSERT(StgTSO__link(tso) == END_TSO_QUEUE);         \
    if (W_[blocked_queue_hd] == END_TSO_QUEUE) {        \
      W_[blocked_queue_hd] = tso;                       \
    } else {                                            \
      foreign "C" setTSOLink(MyCapability() "ptr", W_[blocked_queue_tl] "ptr", tso) []; \
    }                                                   \
    W_[blocked_queue_tl] = tso;
2157
waitReadzh_fast
{
    /* args: R1 = the descriptor to wait on (stored in block_info) */
#ifdef THREADED_RTS
    foreign "C" barf("waitRead# on threaded RTS") never returns;
#else
    /* Non-threaded RTS only: record what this thread is waiting for
     * and why, park it on the blocked queue, and suspend it. */
    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
    StgTSO_block_info(CurrentTSO) = R1;
    StgTSO_why_blocked(CurrentTSO) = BlockedOnRead::I16;
    // No locking needed: this code path is never reached in the
    // threaded RTS (see the barf above).
    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
    jump stg_block_noregs;
#endif
}
2174
waitWritezh_fast
{
    /* args: R1 = the descriptor to wait on (stored in block_info) */
#ifdef THREADED_RTS
    foreign "C" barf("waitWrite# on threaded RTS") never returns;
#else
    /* Non-threaded RTS only: record what this thread is waiting for
     * and why, park it on the blocked queue, and suspend it. */
    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
    StgTSO_block_info(CurrentTSO) = R1;
    StgTSO_why_blocked(CurrentTSO) = BlockedOnWrite::I16;
    // No locking needed: this code path is never reached in the
    // threaded RTS (see the barf above).
    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
    jump stg_block_noregs;
#endif
}
2191
2192
STRING(stg_delayzh_malloc_str, "delayzh_fast")
/* delay# :: Int# -> State# s -> State# s
 * args: R1 = microsecond delay amount.  Non-threaded RTS only.
 */
delayzh_fast
{
#ifdef mingw32_HOST_OS
    W_ ares;
    CInt reqID;
#else
    W_ t, prev, target;
#endif

#ifdef THREADED_RTS
    foreign "C" barf("delay# on threaded RTS") never returns;
#else

    /* args: R1 (microsecond delay amount) */
    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
    StgTSO_why_blocked(CurrentTSO) = BlockedOnDelay::I16;

#ifdef mingw32_HOST_OS

    /* could probably allocate this on the heap instead */
    ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
                                            stg_delayzh_malloc_str);
    (reqID) = foreign "C" addDelayRequest(R1);
    StgAsyncIOResult_reqID(ares)   = reqID;
    StgAsyncIOResult_len(ares)     = 0;
    StgAsyncIOResult_errCode(ares) = 0;
    StgTSO_block_info(CurrentTSO)  = ares;

    /* Having all async-blocked threads reside on the blocked_queue
     * simplifies matters, so change the status to BlockedOnDoProc and
     * put the delayed thread on the blocked_queue.
     */
    StgTSO_why_blocked(CurrentTSO) = BlockedOnDoProc::I16;
    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
    jump stg_block_async_void;

#else

    /* Convert the requested delay (microseconds) into scheduler ticks
     * and compute the absolute tick at which to wake up. */
    W_ time;
    W_ divisor;
    (time) = foreign "C" getourtimeofday() [R1];
    divisor = TO_W_(RtsFlags_MiscFlags_tickInterval(RtsFlags));
    if (divisor == 0) {
        /* NOTE(review): 50 looks like a fallback tick interval for
         * when RtsFlags supplies none -- confirm against RtsFlags. */
        divisor = 50;
    }
    divisor = divisor * 1000;
    target = ((R1 + divisor - 1) / divisor) /* divide rounding up */
           + time + 1; /* Add 1 as getourtimeofday rounds down */
    StgTSO_block_info(CurrentTSO) = target;

    /* Insert the new thread in the sleeping queue, which is kept
     * ordered by ascending wake-up time (block_info). */
    prev = NULL;
    t = W_[sleeping_queue];
while:
    if (t != END_TSO_QUEUE && StgTSO_block_info(t) < target) {
        prev = t;
        t = StgTSO__link(t);
        goto while;
    }

    StgTSO__link(CurrentTSO) = t;
    if (prev == NULL) {
        W_[sleeping_queue] = CurrentTSO;
    } else {
        /* Interior link: go through setTSOLink for the write barrier. */
        foreign "C" setTSOLink(MyCapability() "ptr", prev "ptr", CurrentTSO) [];
    }
    jump stg_block_noregs;
#endif
#endif /* !THREADED_RTS */
}
2264
2265
2266 #ifdef mingw32_HOST_OS
STRING(stg_asyncReadzh_malloc_str, "asyncReadzh_fast")
/* asyncRead# (Windows, non-threaded RTS only): queue an asynchronous
 * read request with the I/O manager and block the current thread. */
asyncReadzh_fast
{
    W_ ares;
    CInt reqID;

#ifdef THREADED_RTS
    foreign "C" barf("asyncRead# on threaded RTS") never returns;
#else

    /* args: R1 = fd, R2 = isSock, R3 = len, R4 = buf */
    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
    StgTSO_why_blocked(CurrentTSO) = BlockedOnRead::I16;

    /* could probably allocate this on the heap instead */
    ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
                                            stg_asyncReadzh_malloc_str)
                        [R1,R2,R3,R4];
    /* Second argument 0 = FALSE: this is a read, not a write. */
    (reqID) = foreign "C" addIORequest(R1, 0/*FALSE*/,R2,R3,R4 "ptr") [];
    StgAsyncIOResult_reqID(ares)   = reqID;
    StgAsyncIOResult_len(ares)     = 0;
    StgAsyncIOResult_errCode(ares) = 0;
    StgTSO_block_info(CurrentTSO)  = ares;
    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
    jump stg_block_async;
#endif
}
2294
STRING(stg_asyncWritezh_malloc_str, "asyncWritezh_fast")
/* asyncWrite# (Windows, non-threaded RTS only): queue an asynchronous
 * write request with the I/O manager and block the current thread. */
asyncWritezh_fast
{
    W_ ares;
    CInt reqID;

#ifdef THREADED_RTS
    foreign "C" barf("asyncWrite# on threaded RTS") never returns;
#else

    /* args: R1 = fd, R2 = isSock, R3 = len, R4 = buf */
    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
    StgTSO_why_blocked(CurrentTSO) = BlockedOnWrite::I16;

    ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
                                            stg_asyncWritezh_malloc_str)
                        [R1,R2,R3,R4];
    /* Second argument 1 = TRUE: this is a write. */
    (reqID) = foreign "C" addIORequest(R1, 1/*TRUE*/,R2,R3,R4 "ptr") [];

    StgAsyncIOResult_reqID(ares)   = reqID;
    StgAsyncIOResult_len(ares)     = 0;
    StgAsyncIOResult_errCode(ares) = 0;
    StgTSO_block_info(CurrentTSO)  = ares;
    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
    jump stg_block_async;
#endif
}
2322
STRING(stg_asyncDoProczh_malloc_str, "asyncDoProczh_fast")
/* asyncDoProc# (Windows, non-threaded RTS only): queue a C procedure
 * to run asynchronously and block the current thread on it. */
asyncDoProczh_fast
{
    W_ ares;
    CInt reqID;

#ifdef THREADED_RTS
    foreign "C" barf("asyncDoProc# on threaded RTS") never returns;
#else

    /* args: R1 = proc, R2 = param */
    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
    StgTSO_why_blocked(CurrentTSO) = BlockedOnDoProc::I16;

    /* could probably allocate this on the heap instead */
    ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
                                            stg_asyncDoProczh_malloc_str) 
                                [R1,R2];
    (reqID) = foreign "C" addDoProcRequest(R1 "ptr",R2 "ptr") [];
    StgAsyncIOResult_reqID(ares)   = reqID;
    StgAsyncIOResult_len(ares)     = 0;
    StgAsyncIOResult_errCode(ares) = 0;
    StgTSO_block_info(CurrentTSO) = ares;
    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
    jump stg_block_async;
#endif
}
2350 #endif
2351
2352 // noDuplicate# tries to ensure that none of the thunks under
2353 // evaluation by the current thread are also under evaluation by
2354 // another thread.  It relies on *both* threads doing noDuplicate#;
2355 // the second one will get blocked if they are duplicating some work.
noDuplicatezh_fast
{
    /* Suspend the current thread via threadPaused (saving its state
     * first so the stack is walkable). */
    SAVE_THREAD_STATE();
    ASSERT(StgTSO_what_next(CurrentTSO) == ThreadRunGHC::I16);
    foreign "C" threadPaused (MyCapability() "ptr", CurrentTSO "ptr") [];

    /* threadPaused may have decided this thread should die (its
     * what_next field becomes ThreadKilled). */
    if (StgTSO_what_next(CurrentTSO) != ThreadKilled::I16) {
        /* Still alive: restore state and resume execution. */
        LOAD_THREAD_STATE();
        ASSERT(StgTSO_what_next(CurrentTSO) == ThreadRunGHC::I16);
        jump %ENTRY_CODE(Sp(0));
    } else {
        jump stg_threadFinished;
    }
}
2370
getApStackValzh_fast
{
   /* args: R1 = suspected AP_STACK closure, R2 = payload offset.
    * Returns (ok, val): ok == 1 with the payload word at the offset
    * when R1 really is an AP_STACK; otherwise ok == 0 and R1 itself.
    */
   W_ stack, idx, result, is_ap_stack;

   stack = R1;
   idx   = R2;

   if (%INFO_PTR(stack) != stg_AP_STACK_info) {
        is_ap_stack = 0;
        result = R1;
   } else {
        is_ap_stack = 1;
        result = StgAP_STACK_payload(stack,idx);
   }
   RET_NP(is_ap_stack,result);
}
2388
2389 /* -----------------------------------------------------------------------------
2390    Misc. primitives
2391    -------------------------------------------------------------------------- */
2392
2393 // Write the cost center stack of the first argument on stderr; return
2394 // the second.  Possibly only makes sense for already evaluated
2395 // things?
traceCcszh_fast
{
    /* args: R1 = closure whose cost-centre stack to print,
     *       R2 = the value to return (entered below).
     */
    W_ cost_centre_stack;

#ifdef PROFILING
    /* Only profiled builds carry a CCS in the closure header;
     * keep R2 alive across the print. */
    cost_centre_stack = StgHeader_ccs(UNTAG(R1));
    foreign "C" fprintCCS_stderr(cost_centre_stack "ptr") [R2];
#endif

    /* Tail-call into the second argument. */
    R1 = R2;
    ENTER();
}
2408
/* getSpark# : try to take a spark from this Capability's spark pool.
 * Returns (1, spark) on success; (0, False) when the pool is empty,
 * and always on the non-threaded RTS, which has no spark pool.
 */
getSparkzh_fast
{
   W_ spark;

#ifndef THREADED_RTS
   RET_NP(0,ghczmprim_GHCziBool_False_closure);
#else
   /* Annotate the pointer argument/result and declare no live STG
    * registers across the call, consistent with every other foreign
    * call in this file (the "ptr" hint matters for the via-C backend). */
   ("ptr" spark) = foreign "C" findSpark(MyCapability() "ptr") [];
   if (spark != 0) {
      RET_NP(1,spark);
   } else {
      RET_NP(0,ghczmprim_GHCziBool_False_closure);
   }
#endif
}