Convert the gmp cmm primops to use local stack allocation
[ghc-hetmet.git] / rts / PrimOps.cmm
1 /* -----------------------------------------------------------------------------
2  *
3  * (c) The GHC Team, 1998-2004
4  *
5  * Out-of-line primitive operations
6  *
7  * This file contains the implementations of all the primitive
8  * operations ("primops") which are not expanded inline.  See
9  * ghc/compiler/prelude/primops.txt.pp for a list of all the primops;
10  * this file contains code for most of those with the attribute
11  * out_of_line=True.
12  *
13  * Entry convention: the entry convention for a primop is that all the
14  * args are in Stg registers (R1, R2, etc.).  This is to make writing
15  * the primops easier.  (see compiler/codeGen/CgCallConv.hs).
16  *
17  * Return convention: results from a primop are generally returned
18  * using the ordinary unboxed tuple return convention.  The C-- parser
19  * implements the RET_xxxx() macros to perform unboxed-tuple returns
20  * based on the prevailing return convention.
21  *
22  * This file is written in a subset of C--, extended with various
23  * features specific to GHC.  It is compiled by GHC directly.  For the
24  * syntax of .cmm files, see the parser in ghc/compiler/cmm/CmmParse.y.
25  *
26  * ---------------------------------------------------------------------------*/
27
28 #include "Cmm.h"
29
30 #ifdef __PIC__
31 #ifndef mingw32_HOST_OS
32 import __gmpz_init;
33 import __gmpz_add;
34 import __gmpz_sub;
35 import __gmpz_mul;
36 import __gmpz_gcd;
37 import __gmpn_gcd_1;
38 import __gmpn_cmp;
39 import __gmpz_tdiv_q;
40 import __gmpz_tdiv_r;
41 import __gmpz_tdiv_qr;
42 import __gmpz_fdiv_qr;
43 import __gmpz_divexact;
44 import __gmpz_and;
45 import __gmpz_xor;
46 import __gmpz_ior;
47 import __gmpz_com;
48 #endif
49 import pthread_mutex_lock;
50 import pthread_mutex_unlock;
51 #endif
52 import base_ControlziExceptionziBase_nestedAtomically_closure;
53 import EnterCriticalSection;
54 import LeaveCriticalSection;
55 import ghczmprim_GHCziBool_False_closure;
56
57 /*-----------------------------------------------------------------------------
58   Array Primitives
59
60   Basically just new*Array - the others are all inline macros.
61
62   The size arg is always passed in R1, and the result returned in R1.
63
64   The slow entry point is for returning from a heap check, the saved
65   size argument must be re-loaded from the stack.
66   -------------------------------------------------------------------------- */
67
68 /* for objects that are *less* than the size of a word, make sure we
69  * round up to the nearest word for the size of the array.
70  */
71
/* newByteArray# primop.  Arg: R1 = requested size in bytes.
 * Allocates an ARR_WORDS closure from the current capability's
 * nursery and returns it via RET_P.  Sizes smaller than a word are
 * rounded up to a whole word (see ROUNDUP_BYTES_TO_WDS). */
newByteArrayzh_fast
{
    W_ words, payload_words, n, p;
    /* may trigger a GC; no pointer arguments need saving */
    MAYBE_GC(NO_PTRS,newByteArrayzh_fast);
    n = R1;
    payload_words = ROUNDUP_BYTES_TO_WDS(n);
    /* total size = header + payload, in words */
    words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
    ("ptr" p) = foreign "C" allocateLocal(MyCapability() "ptr",words) [];
    TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
    SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
    StgArrWords_words(p) = payload_words;
    RET_P(p);
}
85
/* Pinned byte arrays are aligned to BA_ALIGN bytes. */
#define BA_ALIGN 16
#define BA_MASK  (BA_ALIGN-1)

/* newPinnedByteArray# primop.  Arg: R1 = size in bytes.
 * Allocates via allocatePinned() (so the GC will not move it) and
 * shifts the returned pointer so the *payload* is BA_ALIGN-aligned. */
newPinnedByteArrayzh_fast
{
    W_ words, bytes, payload_words, p;

    MAYBE_GC(NO_PTRS,newPinnedByteArrayzh_fast);
    bytes = R1;
    /* payload_words is what we will tell the profiler we had to allocate */
    payload_words = ROUNDUP_BYTES_TO_WDS(bytes);
    /* When we actually allocate memory, we need to allow space for the
       header: */
    bytes = bytes + SIZEOF_StgArrWords;
    /* And we want to align to BA_ALIGN bytes, so we need to allow space
       to shift up to BA_ALIGN - 1 bytes: */
    bytes = bytes + BA_ALIGN - 1;
    /* Now we convert to a number of words: */
    words = ROUNDUP_BYTES_TO_WDS(bytes);

    ("ptr" p) = foreign "C" allocatePinned(words) [];
    TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);

    /* Now we need to move p forward so that the payload is aligned
       to BA_ALIGN bytes: */
    p = p + ((-p - SIZEOF_StgArrWords) & BA_MASK);

    SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
    StgArrWords_words(p) = payload_words;
    RET_P(p);
}
117
/* newAlignedPinnedByteArray# primop.  Args: R1 = size in bytes,
 * R2 = requested payload alignment in bytes.  Same scheme as
 * newPinnedByteArrayzh_fast but with a caller-chosen alignment. */
newAlignedPinnedByteArrayzh_fast
{
    W_ words, bytes, payload_words, p, alignment;

    MAYBE_GC(NO_PTRS,newAlignedPinnedByteArrayzh_fast);
    bytes = R1;
    alignment = R2;

    /* payload_words is what we will tell the profiler we had to allocate */
    payload_words = ROUNDUP_BYTES_TO_WDS(bytes);

    /* When we actually allocate memory, we need to allow space for the
       header: */
    bytes = bytes + SIZEOF_StgArrWords;
    /* And we want to align to <alignment> bytes, so we need to allow space
       to shift up to <alignment - 1> bytes: */
    bytes = bytes + alignment - 1;
    /* Now we convert to a number of words: */
    words = ROUNDUP_BYTES_TO_WDS(bytes);

    ("ptr" p) = foreign "C" allocatePinned(words) [];
    TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);

    /* Now we need to move p forward so that the payload is aligned
       to <alignment> bytes. Note that we are assuming that
       <alignment> is a power of 2, which is technically not guaranteed */
    p = p + ((-p - SIZEOF_StgArrWords) & (alignment - 1));

    SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
    StgArrWords_words(p) = payload_words;
    RET_P(p);
}
150
/* newArray# primop.  Args: R1 = number of elements (words),
 * R2 = initial value for every slot.  Returns a freshly allocated
 * MUT_ARR_PTRS_DIRTY closure with all elements set to R2. */
newArrayzh_fast
{
    W_ words, n, init, arr, p;
    /* Args: R1 = words, R2 = initialisation value */

    n = R1;
    /* R2 is a heap pointer, so it must be saved across a possible GC */
    MAYBE_GC(R2_PTR,newArrayzh_fast);

    words = BYTES_TO_WDS(SIZEOF_StgMutArrPtrs) + n;
    ("ptr" arr) = foreign "C" allocateLocal(MyCapability() "ptr",words) [R2];
    TICK_ALLOC_PRIM(SIZEOF_StgMutArrPtrs, WDS(n), 0);

    SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, W_[CCCS]);
    StgMutArrPtrs_ptrs(arr) = n;

    // Initialise all elements of the array with the value in R2
    init = R2;
    p = arr + SIZEOF_StgMutArrPtrs;
  for:
    if (p < arr + WDS(words)) {
        W_[p] = init;
        p = p + WDS(1);
        goto for;
    }

    RET_P(arr);
}
178
/* unsafeThawArray# primop.  Arg: R1 = a frozen MUT_ARR_PTRS_FROZEN[0]
 * closure; flips it back to MUT_ARR_PTRS_DIRTY, adding it to the
 * mutable list only if it is not already there (see below). */
unsafeThawArrayzh_fast
{
  // SUBTLETY TO DO WITH THE OLD GEN MUTABLE LIST
  //
  // A MUT_ARR_PTRS lives on the mutable list, but a MUT_ARR_PTRS_FROZEN 
  // normally doesn't.  However, when we freeze a MUT_ARR_PTRS, we leave
  // it on the mutable list for the GC to remove (removing something from
  // the mutable list is not easy, because the mut_list is only singly-linked).
  // 
  // So that we can tell whether a MUT_ARR_PTRS_FROZEN is on the mutable list,
  // when we freeze it we set the info ptr to be MUT_ARR_PTRS_FROZEN0
  // to indicate that it is still on the mutable list.
  //
  // So, when we thaw a MUT_ARR_PTRS_FROZEN, we must cope with two cases:
  // either it is on a mut_list, or it isn't.  We adopt the convention that
  // the closure type is MUT_ARR_PTRS_FROZEN0 if it is on the mutable list,
  // and MUT_ARR_PTRS_FROZEN otherwise.  In fact it wouldn't matter if
  // we put it on the mutable list more than once, but it would get scavenged
  // multiple times during GC, which would be unnecessarily slow.
  //
  if (StgHeader_info(R1) != stg_MUT_ARR_PTRS_FROZEN0_info) {
        SET_INFO(R1,stg_MUT_ARR_PTRS_DIRTY_info);
        recordMutable(R1, R1);
        // must be done after SET_INFO, because it ASSERTs closure_MUTABLE()
        RET_P(R1);
  } else {
        // already on a mut_list (FROZEN0): just flip the info pointer
        SET_INFO(R1,stg_MUT_ARR_PTRS_DIRTY_info);
        RET_P(R1);
  }
}
209
210 /* -----------------------------------------------------------------------------
211    MutVar primitives
212    -------------------------------------------------------------------------- */
213
/* newMutVar# primop.  Arg: R1 = initial contents (a pointer).
 * Allocates a MUT_VAR_DIRTY closure at the top of the nursery. */
newMutVarzh_fast
{
    W_ mv;
    /* Args: R1 = initialisation value */

    ALLOC_PRIM( SIZEOF_StgMutVar, R1_PTR, newMutVarzh_fast);

    /* ALLOC_PRIM bumped Hp; the new closure starts just below it */
    mv = Hp - SIZEOF_StgMutVar + WDS(1);
    SET_HDR(mv,stg_MUT_VAR_DIRTY_info,W_[CCCS]);
    StgMutVar_var(mv) = R1;
    
    RET_P(mv);
}
227
/* atomicModifyMutVar# primop.  Args: R1 = MutVar#, R2 = f :: a -> (a,b).
 * Atomically replaces the contents x with a thunk for (sel_0 (f x)) and
 * returns a thunk for (sel_1 (f x)); all three thunks are allocated
 * up-front with a single heap check so the swap itself cannot GC. */
atomicModifyMutVarzh_fast
{
    W_ mv, f, z, x, y, r, h;
    /* Args: R1 :: MutVar#,  R2 :: a -> (a,b) */

    /* If x is the current contents of the MutVar#, then 
       We want to make the new contents point to

         (sel_0 (f x))
 
       and the return value is
         
         (sel_1 (f x))

        obviously we can share (f x).

         z = [stg_ap_2 f x]  (max (HS + 2) MIN_UPD_SIZE)
         y = [stg_sel_0 z]   (max (HS + 1) MIN_UPD_SIZE)
         r = [stg_sel_1 z]   (max (HS + 1) MIN_UPD_SIZE)
    */

/* Thunk sizes are padded up to MIN_UPD_SIZE words (so they can later
 * be overwritten on update); the profiler tick accounts for the pad. */
#if MIN_UPD_SIZE > 1
#define THUNK_1_SIZE (SIZEOF_StgThunkHeader + WDS(MIN_UPD_SIZE))
#define TICK_ALLOC_THUNK_1() TICK_ALLOC_UP_THK(WDS(1),WDS(MIN_UPD_SIZE-1))
#else
#define THUNK_1_SIZE (SIZEOF_StgThunkHeader + WDS(1))
#define TICK_ALLOC_THUNK_1() TICK_ALLOC_UP_THK(WDS(1),0)
#endif

#if MIN_UPD_SIZE > 2
#define THUNK_2_SIZE (SIZEOF_StgThunkHeader + WDS(MIN_UPD_SIZE))
#define TICK_ALLOC_THUNK_2() TICK_ALLOC_UP_THK(WDS(2),WDS(MIN_UPD_SIZE-2))
#else
#define THUNK_2_SIZE (SIZEOF_StgThunkHeader + WDS(2))
#define TICK_ALLOC_THUNK_2() TICK_ALLOC_UP_THK(WDS(2),0)
#endif

#define SIZE (THUNK_2_SIZE + THUNK_1_SIZE + THUNK_1_SIZE)

   /* one heap check covers all three thunks (z, y, r) */
   HP_CHK_GEN_TICKY(SIZE, R1_PTR & R2_PTR, atomicModifyMutVarzh_fast);

   mv = R1;
   f = R2;

   /* z = [stg_ap_2 f x]; x is filled in later, inside the retry loop */
   TICK_ALLOC_THUNK_2();
   CCCS_ALLOC(THUNK_2_SIZE);
   z = Hp - THUNK_2_SIZE + WDS(1);
   SET_HDR(z, stg_ap_2_upd_info, W_[CCCS]);
   LDV_RECORD_CREATE(z);
   StgThunk_payload(z,0) = f;

   /* y = [stg_sel_0 z] -- the new MutVar contents */
   TICK_ALLOC_THUNK_1();
   CCCS_ALLOC(THUNK_1_SIZE);
   y = z - THUNK_1_SIZE;
   SET_HDR(y, stg_sel_0_upd_info, W_[CCCS]);
   LDV_RECORD_CREATE(y);
   StgThunk_payload(y,0) = z;

   /* r = [stg_sel_1 z] -- the value returned to the caller */
   TICK_ALLOC_THUNK_1();
   CCCS_ALLOC(THUNK_1_SIZE);
   r = y - THUNK_1_SIZE;
   SET_HDR(r, stg_sel_1_upd_info, W_[CCCS]);
   LDV_RECORD_CREATE(r);
   StgThunk_payload(r,0) = z;

 retry:
   /* snapshot the current contents into z's second argument, then try
      to swing the MutVar to y; under THREADED_RTS this is a CAS loop */
   x = StgMutVar_var(mv);
   StgThunk_payload(z,1) = x;
#ifdef THREADED_RTS
   (h) = foreign "C" cas(mv + SIZEOF_StgHeader + OFFSET_StgMutVar_var, x, y) [];
   if (h != x) { goto retry; }
#else
   StgMutVar_var(mv) = y;
#endif

   /* if the MutVar was clean, mark it dirty via dirty_MUT_VAR */
   if (GET_INFO(mv) == stg_MUT_VAR_CLEAN_info) {
     foreign "C" dirty_MUT_VAR(BaseReg "ptr", mv "ptr") [];
   }

   RET_P(r);
}
309
310 /* -----------------------------------------------------------------------------
311    Weak Pointer Primitives
312    -------------------------------------------------------------------------- */
313
314 STRING(stg_weak_msg,"New weak pointer at %p\n")
315
/* mkWeak# primop.  Allocates a WEAK closure and links it onto the
 * global weak_ptr_list.  A NULL finalizer is replaced by the
 * stg_NO_FINALIZER_closure sentinel. */
mkWeakzh_fast
{
  /* R1 = key
     R2 = value
     R3 = finalizer (or NULL)
  */
  W_ w;

  if (R3 == NULL) {
    R3 = stg_NO_FINALIZER_closure;
  }

  ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR & R3_PTR, mkWeakzh_fast );

  w = Hp - SIZEOF_StgWeak + WDS(1);
  SET_HDR(w, stg_WEAK_info, W_[CCCS]);

  // We don't care about cfinalizer here.
  // Should StgWeak_cfinalizer(w) be stg_NO_FINALIZER_closure or
  // something else?

  StgWeak_key(w)        = R1;
  StgWeak_value(w)      = R2;
  StgWeak_finalizer(w)  = R3;
  StgWeak_cfinalizer(w) = stg_NO_FINALIZER_closure;

  /* push onto the front of the global weak pointer list */
  StgWeak_link(w)       = W_[weak_ptr_list];
  W_[weak_ptr_list]     = w;

  IF_DEBUG(weak, foreign "C" debugBelch(stg_weak_msg,w) []);

  RET_P(w);
}
349
/* mkWeakForeignEnv# primop.  Like mkWeak#, but the finalizer is a C
 * function: its (fptr, ptr, eptr, flag) quadruple is packed into a
 * 4-word ARR_WORDS stored in StgWeak_cfinalizer, in the exact order
 * that finalizzeWeakzh_fast later passes to runCFinalizer. */
mkWeakForeignEnvzh_fast
{
  /* R1 = key
     R2 = value
     R3 = finalizer
     R4 = pointer
     R5 = has environment (0 or 1)
     R6 = environment
  */
  W_ w, payload_words, words, p;

  W_ key, val, fptr, ptr, flag, eptr;

  key  = R1;
  val  = R2;
  fptr = R3;
  ptr  = R4;
  flag = R5;
  eptr = R6;

  ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR, mkWeakForeignEnvzh_fast );

  w = Hp - SIZEOF_StgWeak + WDS(1);
  SET_HDR(w, stg_WEAK_info, W_[CCCS]);

  /* allocate the 4-word payload array holding the C finalizer data */
  payload_words = 4;
  words         = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
  ("ptr" p)     = foreign "C" allocateLocal(MyCapability() "ptr", words) [];

  TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
  SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);

  StgArrWords_words(p)     = payload_words;
  StgArrWords_payload(p,0) = fptr;
  StgArrWords_payload(p,1) = ptr;
  StgArrWords_payload(p,2) = eptr;
  StgArrWords_payload(p,3) = flag;

  // We don't care about the value here.
  // Should StgWeak_value(w) be stg_NO_FINALIZER_closure or something else?

  StgWeak_key(w)        = key;
  StgWeak_value(w)      = val;
  StgWeak_finalizer(w)  = stg_NO_FINALIZER_closure;
  StgWeak_cfinalizer(w) = p;

  /* push onto the front of the global weak pointer list */
  StgWeak_link(w)   = W_[weak_ptr_list];
  W_[weak_ptr_list] = w;

  IF_DEBUG(weak, foreign "C" debugBelch(stg_weak_msg,w) []);

  RET_P(w);
}
403
/* finalizeWeak# primop.  Arg: R1 = weak ptr.  Kills the weak pointer
 * (turning it into DEAD_WEAK), runs any attached C finalizer, and
 * returns (# 0, noFinalizer #) or (# 1, haskellFinalizer #). */
finalizzeWeakzh_fast
{
  /* R1 = weak ptr
   */
  W_ w, f, arr;

  w = R1;

  // already dead?
  if (GET_INFO(w) == stg_DEAD_WEAK_info) {
      RET_NP(0,stg_NO_FINALIZER_closure);
  }

  // kill it
#ifdef PROFILING
  // @LDV profiling
  // A weak pointer is inherently used, so we do not need to call
  // LDV_recordDead_FILL_SLOP_DYNAMIC():
  //    LDV_recordDead_FILL_SLOP_DYNAMIC((StgClosure *)w);
  // or, LDV_recordDead():
  //    LDV_recordDead((StgClosure *)w, sizeofW(StgWeak) - sizeofW(StgProfHeader));
  // Furthermore, when PROFILING is turned on, dead weak pointers are exactly as 
  // large as weak pointers, so there is no need to fill the slop, either.
  // See stg_DEAD_WEAK_info in StgMiscClosures.hc.
#endif

  //
  // Todo: maybe use SET_HDR() and remove LDV_recordCreate()?
  //
  SET_INFO(w,stg_DEAD_WEAK_info);
  LDV_RECORD_CREATE(w);

  /* read the finalizer fields before the closure is reinterpreted
     as a DEAD_WEAK */
  f   = StgWeak_finalizer(w);
  arr = StgWeak_cfinalizer(w);

  StgDeadWeak_link(w) = StgWeak_link(w);

  /* run the C finalizer, if one was attached by mkWeakForeignEnv#;
     the payload layout is (fptr, ptr, eptr, flag) */
  if (arr != stg_NO_FINALIZER_closure) {
    foreign "C" runCFinalizer(StgArrWords_payload(arr,0),
                              StgArrWords_payload(arr,1),
                              StgArrWords_payload(arr,2),
                              StgArrWords_payload(arr,3)) [];
  }

  /* return the finalizer */
  if (f == stg_NO_FINALIZER_closure) {
      RET_NP(0,stg_NO_FINALIZER_closure);
  } else {
      RET_NP(1,f);
  }
}
455
/* deRefWeak# primop.  Arg: R1 = weak ptr.  Returns (# 1, value #)
 * while the weak pointer is alive (info is still WEAK), otherwise
 * (# 0, w #) -- the second component is not meaningful in that case. */
deRefWeakzh_fast
{
  /* R1 = weak ptr */
  W_ w, code, val;

  w = R1;
  if (GET_INFO(w) == stg_WEAK_info) {
    code = 1;
    val = StgWeak_value(w);
  } else {
    code = 0;
    val = w;
  }
  RET_NP(code,val);
}
471
472 /* -----------------------------------------------------------------------------
473    Arbitrary-precision Integer operations.
474
475    There are some assumptions in this code that mp_limb_t == W_.  This is
476    the case for all the platforms that GHC supports, currently.
477    -------------------------------------------------------------------------- */
478
/* int2Integer# primop.  Arg: R1 = Int#.  Returns (# s, d #) where s is
 * the GMP-style sign/size (-1, 0, or 1) and d is a one-limb ByteArray#
 * holding the magnitude (unused when s == 0). */
int2Integerzh_fast
{
   /* arguments: R1 = Int# */

   W_ val, s, p;        /* to avoid aliasing */

   val = R1;
   ALLOC_PRIM( SIZEOF_StgArrWords + WDS(1), NO_PTRS, int2Integerzh_fast );

   /* p is the ARR_WORDS header; its single payload word sits at Hp */
   p = Hp - SIZEOF_StgArrWords;
   SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
   StgArrWords_words(p) = 1;

   /* mpz_set_si is inlined here, makes things simpler */
   if (%lt(val,0)) { 
        s  = -1;
        Hp(0) = -val;
   } else { 
     if (%gt(val,0)) {
        s = 1;
        Hp(0) = val;
     } else {
        s = 0;
     }
  }

   /* returns (# size  :: Int#, 
                 data  :: ByteArray# 
               #)
   */
   RET_NP(s,p);
}
511
/* word2Integer# primop.  Arg: R1 = Word#.  Returns (# s, d #) with
 * s = 0 or 1 (never negative) and d a one-limb ByteArray#. */
word2Integerzh_fast
{
   /* arguments: R1 = Word# */

   W_ val, s, p;        /* to avoid aliasing */

   val = R1;

   ALLOC_PRIM( SIZEOF_StgArrWords + WDS(1), NO_PTRS, word2Integerzh_fast);

   /* p is the ARR_WORDS header; its single payload word sits at Hp */
   p = Hp - SIZEOF_StgArrWords;
   SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
   StgArrWords_words(p) = 1;

   if (val != 0) {
        s = 1;
        W_[Hp] = val;
   } else {
        s = 0;
   }

   /* returns (# size  :: Int#, 
                 data  :: ByteArray# #)
   */
   RET_NP(s,p);
}
538
539
540 /*
541  * 'long long' primops for converting to/from Integers.
542  */
543
544 #ifdef SUPPORT_LONG_LONGS
545
/* int64ToInteger# primop (only compiled when SUPPORT_LONG_LONGS, i.e.
 * on platforms where W_ is 32 bits and Int64# needs two limbs).
 * Arg: L1 = Int64#.  Returns (# s, d #): s in {-2,-1,0,1,2} giving
 * sign and limb count, d a ByteArray# holding the magnitude. */
int64ToIntegerzh_fast
{
   /* arguments: L1 = Int64# */

   L_ val;
   W_ hi, lo, s, neg, words_needed, p;

   val = L1;
   neg = 0;

   hi = TO_W_(val >> 32);
   lo = TO_W_(val);

   /* one limb suffices for small positives (hi == 0) and for small
      negatives (hi == all-ones with a non-zero low word) */
   if ( hi == 0 || (hi == 0xFFFFFFFF && lo != 0) )  {
       // minimum is one word
       words_needed = 1;
   } else { 
       words_needed = 2;
   }

   ALLOC_PRIM( SIZEOF_StgArrWords + WDS(words_needed),
               NO_PTRS, int64ToIntegerzh_fast );

   p = Hp - SIZEOF_StgArrWords - WDS(words_needed) + WDS(1);
   SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
   StgArrWords_words(p) = words_needed;

   /* take the absolute value, tracking the sign in neg; the two-word
      negation borrows from hi when lo wraps to zero */
   if ( %lt(hi,0) ) {
     neg = 1;
     lo = -lo;
     if(lo == 0) {
       hi = -hi;
     } else {
       hi = -hi - 1;
     }
   }

   /* store limbs little-endian-by-word: lo below hi */
   if ( words_needed == 2 )  { 
      s = 2;
      Hp(-1) = lo;
      Hp(0) = hi;
   } else { 
       if ( lo != 0 ) {
           s = 1;
           Hp(0) = lo;
       } else /* val==0 */  {
           s = 0;
       }
   }
   if ( neg != 0 ) {
        s = -s;
   }

   /* returns (# size  :: Int#, 
                 data  :: ByteArray# #)
   */
   RET_NP(s,p);
}
/* word64ToInteger# primop (SUPPORT_LONG_LONGS only, see above).
 * Arg: L1 = Word64#.  Returns (# s, d #) with s in {0,1,2} (never
 * negative) and d a ByteArray# of s limbs. */
word64ToIntegerzh_fast
{
   /* arguments: L1 = Word64# */

   L_ val;
   W_ hi, lo, s, words_needed, p;

   val = L1;
   hi = TO_W_(val >> 32);
   lo = TO_W_(val);

   if ( hi != 0 ) {
      words_needed = 2;
   } else {
      words_needed = 1;
   }

   ALLOC_PRIM( SIZEOF_StgArrWords + WDS(words_needed),
               NO_PTRS, word64ToIntegerzh_fast );

   p = Hp - SIZEOF_StgArrWords - WDS(words_needed) + WDS(1);
   SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
   StgArrWords_words(p) = words_needed;

   /* store limbs low word first, as in int64ToIntegerzh_fast */
   if ( hi != 0 ) { 
     s = 2;
     Hp(-1) = lo;
     Hp(0)  = hi;
   } else {
      if ( lo != 0 ) {
        s = 1;
        Hp(0) = lo;
     } else /* val==0 */  {
      s = 0;
     }
  }

   /* returns (# size  :: Int#, 
                 data  :: ByteArray# #)
   */
   RET_NP(s,p);
}
646
647 #endif /* SUPPORT_LONG_LONGS */
648
/* GMP_TAKE2_RET1(name, mp_fun): out-of-line primop wrapper for a
 * binary mpz operation.  Args: R1/R2 = (size, data) of the first
 * Integer, R3/R4 = (size, data) of the second.  The temporary mpz_t
 * structs are built in the spare stack space below Sp (hence
 * STK_CHK_GEN), aliasing each ByteArray#'s payload as the limb data.
 * The returned data pointer is mp_d - SIZEOF_StgArrWords: this relies
 * on GMP's allocation being routed into the GHC heap so that limb
 * data is the payload of an ARR_WORDS closure.
 * NOTE(review): mp_result2 is declared and a fourth MP_INT slot is
 * reserved but never used here; 3 * SIZEOF_MP_INT would appear to
 * suffice -- confirm whether the extra slot is deliberate. */
#define GMP_TAKE2_RET1(name,mp_fun)                                     \
name                                                                    \
{                                                                       \
  CInt s1, s2;                                                          \
  W_ d1, d2;                                                            \
  W_ mp_tmp1;                                                           \
  W_ mp_tmp2;                                                           \
  W_ mp_result1;                                                        \
  W_ mp_result2;                                                        \
                                                                        \
  /* call doYouWantToGC() */                                            \
  MAYBE_GC(R2_PTR & R4_PTR, name);                                      \
                                                                        \
  STK_CHK_GEN( 4 * SIZEOF_MP_INT, R2_PTR & R4_PTR, name );              \
                                                                        \
  s1 = W_TO_INT(R1);                                                    \
  d1 = R2;                                                              \
  s2 = W_TO_INT(R3);                                                    \
  d2 = R4;                                                              \
                                                                        \
  mp_tmp1    = Sp - 1 * SIZEOF_MP_INT;                                  \
  mp_tmp2    = Sp - 2 * SIZEOF_MP_INT;                                  \
  mp_result1 = Sp - 3 * SIZEOF_MP_INT;                                  \
  mp_result2 = Sp - 4 * SIZEOF_MP_INT;                                  \
  MP_INT__mp_alloc(mp_tmp1) = W_TO_INT(StgArrWords_words(d1));          \
  MP_INT__mp_size(mp_tmp1)  = (s1);                                     \
  MP_INT__mp_d(mp_tmp1)     = BYTE_ARR_CTS(d1);                         \
  MP_INT__mp_alloc(mp_tmp2) = W_TO_INT(StgArrWords_words(d2));          \
  MP_INT__mp_size(mp_tmp2)  = (s2);                                     \
  MP_INT__mp_d(mp_tmp2)     = BYTE_ARR_CTS(d2);                         \
                                                                        \
  foreign "C" __gmpz_init(mp_result1 "ptr") [];                            \
                                                                        \
  /* Perform the operation */                                           \
  foreign "C" mp_fun(mp_result1 "ptr",mp_tmp1  "ptr",mp_tmp2  "ptr") []; \
                                                                        \
  RET_NP(TO_W_(MP_INT__mp_size(mp_result1)),                            \
         MP_INT__mp_d(mp_result1) - SIZEOF_StgArrWords);                \
}
688
/* GMP_TAKE1_RET1(name, mp_fun): as GMP_TAKE2_RET1 but for a unary
 * mpz operation.  Args: R1/R2 = (size, data) of the Integer.  Two
 * MP_INT slots are carved out of the stack: the argument and the
 * result.  See GMP_TAKE2_RET1 for the ARR_WORDS/mp_d trick used to
 * return the result ByteArray#. */
#define GMP_TAKE1_RET1(name,mp_fun)                                     \
name                                                                    \
{                                                                       \
  CInt s1;                                                              \
  W_ d1;                                                                \
  W_ mp_tmp1;                                                           \
  W_ mp_result1;                                                        \
                                                                        \
  /* call doYouWantToGC() */                                            \
  MAYBE_GC(R2_PTR, name);                                               \
                                                                        \
  STK_CHK_GEN( 2 * SIZEOF_MP_INT, R2_PTR, name );                       \
                                                                        \
  d1 = R2;                                                              \
  s1 = W_TO_INT(R1);                                                    \
                                                                        \
  mp_tmp1    = Sp - 1 * SIZEOF_MP_INT;                                  \
  mp_result1 = Sp - 2 * SIZEOF_MP_INT;                                  \
  MP_INT__mp_alloc(mp_tmp1)     = W_TO_INT(StgArrWords_words(d1));      \
  MP_INT__mp_size(mp_tmp1)      = (s1);                                 \
  MP_INT__mp_d(mp_tmp1)         = BYTE_ARR_CTS(d1);                     \
                                                                        \
  foreign "C" __gmpz_init(mp_result1 "ptr") [];                            \
                                                                        \
  /* Perform the operation */                                           \
  foreign "C" mp_fun(mp_result1 "ptr",mp_tmp1 "ptr") [];                \
                                                                        \
  RET_NP(TO_W_(MP_INT__mp_size(mp_result1)),                            \
         MP_INT__mp_d(mp_result1) - SIZEOF_StgArrWords);                \
}
719
/* GMP_TAKE2_RET2(name, mp_fun): as GMP_TAKE2_RET1 but for binary mpz
 * operations producing TWO results (e.g. __gmpz_tdiv_qr: quotient and
 * remainder).  All four MP_INT slots below Sp are used here, and both
 * results are returned as (size, data) pairs via RET_NPNP. */
#define GMP_TAKE2_RET2(name,mp_fun)                                                     \
name                                                                                    \
{                                                                                       \
  CInt s1, s2;                                                                          \
  W_ d1, d2;                                                                            \
  W_ mp_tmp1;                                                                           \
  W_ mp_tmp2;                                                                           \
  W_ mp_result1;                                                                        \
  W_ mp_result2;                                                                        \
                                                                                        \
  /* call doYouWantToGC() */                                                            \
  MAYBE_GC(R2_PTR & R4_PTR, name);                                                      \
                                                                                        \
  STK_CHK_GEN( 4 * SIZEOF_MP_INT, R2_PTR & R4_PTR, name );                              \
                                                                                        \
  s1 = W_TO_INT(R1);                                                                    \
  d1 = R2;                                                                              \
  s2 = W_TO_INT(R3);                                                                    \
  d2 = R4;                                                                              \
                                                                                        \
  mp_tmp1    = Sp - 1 * SIZEOF_MP_INT;                                                  \
  mp_tmp2    = Sp - 2 * SIZEOF_MP_INT;                                                  \
  mp_result1 = Sp - 3 * SIZEOF_MP_INT;                                                  \
  mp_result2 = Sp - 4 * SIZEOF_MP_INT;                                                  \
  MP_INT__mp_alloc(mp_tmp1)     = W_TO_INT(StgArrWords_words(d1));                      \
  MP_INT__mp_size(mp_tmp1)      = (s1);                                                 \
  MP_INT__mp_d(mp_tmp1)         = BYTE_ARR_CTS(d1);                                     \
  MP_INT__mp_alloc(mp_tmp2)     = W_TO_INT(StgArrWords_words(d2));                      \
  MP_INT__mp_size(mp_tmp2)      = (s2);                                                 \
  MP_INT__mp_d(mp_tmp2)         = BYTE_ARR_CTS(d2);                                     \
                                                                                        \
  foreign "C" __gmpz_init(mp_result1 "ptr") [];                                               \
  foreign "C" __gmpz_init(mp_result2 "ptr") [];                                               \
                                                                                        \
  /* Perform the operation */                                                           \
  foreign "C" mp_fun(mp_result1 "ptr",mp_result2 "ptr",mp_tmp1 "ptr",mp_tmp2 "ptr") [];    \
                                                                                        \
  RET_NPNP(TO_W_(MP_INT__mp_size(mp_result1)),                                          \
           MP_INT__mp_d(mp_result1) - SIZEOF_StgArrWords,                               \
           TO_W_(MP_INT__mp_size(mp_result2)),                                          \
           MP_INT__mp_d(mp_result2) - SIZEOF_StgArrWords);                              \
}
762
/* Instantiate the wrappers above, one out-of-line primop per GMP
 * entry point: binary ops returning one Integer, one unary op
 * (complement), and two binary ops returning (quotient, remainder)
 * pairs with truncating (quot/rem) vs. flooring (div/mod) rounding. */
GMP_TAKE2_RET1(plusIntegerzh_fast,     __gmpz_add)
GMP_TAKE2_RET1(minusIntegerzh_fast,    __gmpz_sub)
GMP_TAKE2_RET1(timesIntegerzh_fast,    __gmpz_mul)
GMP_TAKE2_RET1(gcdIntegerzh_fast,      __gmpz_gcd)
GMP_TAKE2_RET1(quotIntegerzh_fast,     __gmpz_tdiv_q)
GMP_TAKE2_RET1(remIntegerzh_fast,      __gmpz_tdiv_r)
GMP_TAKE2_RET1(divExactIntegerzh_fast, __gmpz_divexact)
GMP_TAKE2_RET1(andIntegerzh_fast,      __gmpz_and)
GMP_TAKE2_RET1(orIntegerzh_fast,       __gmpz_ior)
GMP_TAKE2_RET1(xorIntegerzh_fast,      __gmpz_xor)
GMP_TAKE1_RET1(complementIntegerzh_fast, __gmpz_com)

GMP_TAKE2_RET2(quotRemIntegerzh_fast, __gmpz_tdiv_qr)
GMP_TAKE2_RET2(divModIntegerzh_fast,  __gmpz_fdiv_qr)
777
/* gcdInt# primop.  Args: R1, R2 = Int#s.  Uses GMP's low-level
 * __gmpn_gcd_1, which takes a limb array, so R1 is first copied into
 * a one-limb buffer in the spare stack space below Sp. */
gcdIntzh_fast
{
    /* R1 = the first Int#; R2 = the second Int# */
    W_ r; 
    W_ mp_tmp_w;

    STK_CHK_GEN( 1 * SIZEOF_MP_INT, NO_PTRS, gcdIntzh_fast );

    mp_tmp_w = Sp - 1 * SIZEOF_MP_INT;

    W_[mp_tmp_w] = R1;
    /* __gmpn_gcd_1(limbs, nlimbs, limb) */
    (r) = foreign "C" __gmpn_gcd_1(mp_tmp_w "ptr", 1, R2) [];

    R1 = r;
    /* Result parked in R1, return via info-pointer at TOS */
    jump %ENTRY_CODE(Sp(0));
}
795
796
/* gcdIntegerInt# primop.  Args: R1 = size of the Integer, R2 = its
 * limb ByteArray#, R3 = the Int#.  The ByteArray# payload is passed
 * directly to __gmpn_gcd_1 as the limb array. */
gcdIntegerIntzh_fast
{
    /* R1 = s1; R2 = d1; R3 = the int */
    W_ s1;
    (s1) = foreign "C" __gmpn_gcd_1( BYTE_ARR_CTS(R2) "ptr", R1, R3) [];
    R1 = s1;
    
    /* Result parked in R1, return via info-pointer at TOS */
    jump %ENTRY_CODE(Sp(0));
}
807
808
/* cmpIntegerInt# primop.  Args: R1 = size of the Integer, R2 = its
 * limb ByteArray#, R3 = the Int#.  Returns in R1 a value whose sign
 * gives the comparison result (negative / zero / positive). */
cmpIntegerIntzh_fast
{
    /* R1 = s1; R2 = d1; R3 = the int */
    W_ usize, vsize, v_digit, u_digit;

    usize = R1;
    vsize = 0;
    v_digit = R3;

    // paraphrased from __gmpz_cmp_si() in the GMP sources
    /* derive the sign/size of the int, taking its magnitude */
    if (%gt(v_digit,0)) {
        vsize = 1;
    } else { 
        if (%lt(v_digit,0)) {
            vsize = -1;
            v_digit = -v_digit;
        }
    }

    /* different signs/sizes decide the comparison immediately */
    if (usize != vsize) {
        R1 = usize - vsize; 
        jump %ENTRY_CODE(Sp(0));
    }

    if (usize == 0) {
        R1 = 0; 
        jump %ENTRY_CODE(Sp(0));
    }

    /* same sign, one limb each: compare the limbs */
    u_digit = W_[BYTE_ARR_CTS(R2)];

    if (u_digit == v_digit) {
        R1 = 0; 
        jump %ENTRY_CODE(Sp(0));
    }

    if (%gtu(u_digit,v_digit)) { // NB. unsigned: these are mp_limb_t's
        R1 = usize; 
    } else {
        R1 = -usize; 
    }

    jump %ENTRY_CODE(Sp(0));
}
853
/* cmpInteger# primop.  Args: R1/R2 = (size, data) of the first
 * Integer, R3/R4 = (size, data) of the second.  Returns in R1 a value
 * whose sign gives the comparison result. */
cmpIntegerzh_fast
{
    /* R1 = s1; R2 = d1; R3 = s2; R4 = d2 */
    W_ usize, vsize, size, up, vp;
    CInt cmp;

    // paraphrased from __gmpz_cmp() in the GMP sources
    usize = R1;
    vsize = R3;

    /* different signs/sizes decide the comparison immediately */
    if (usize != vsize) {
        R1 = usize - vsize; 
        jump %ENTRY_CODE(Sp(0));
    }

    if (usize == 0) {
        R1 = 0; 
        jump %ENTRY_CODE(Sp(0));
    }

    /* same sign: compare magnitudes limb-wise with __gmpn_cmp */
    if (%lt(usize,0)) { // NB. not <, which is unsigned
        size = -usize;
    } else {
        size = usize;
    }

    up = BYTE_ARR_CTS(R2);
    vp = BYTE_ARR_CTS(R4);

    (cmp) = foreign "C" __gmpn_cmp(up "ptr", vp "ptr", size) [];

    if (cmp == 0 :: CInt) {
        R1 = 0; 
        jump %ENTRY_CODE(Sp(0));
    }

    /* flip the magnitude comparison when both operands are negative */
    if (%lt(cmp,0 :: CInt) == %lt(usize,0)) {
        R1 = 1;
    } else {
        R1 = (-1); 
    }
    /* Result parked in R1, return via info-pointer at TOS */
    jump %ENTRY_CODE(Sp(0));
}
898
/* Truncate an Integer (sign+size R1, limbs R2) to an Int#: take the least
 * significant limb and negate it if the Integer is negative. */
integer2Intzh_fast
{
    /* R1 = s; R2 = d */
    W_ r, s;

    s = R1;
    if (s == 0) {
        r = 0;
    } else {
        // First limb of the ByteArray# payload.
        r = W_[R2 + SIZEOF_StgArrWords];
        if (%lt(s,0)) {
            r = -r;
        }
    }
    /* Result parked in R1, return via info-pointer at TOS */
    R1 = r;
    jump %ENTRY_CODE(Sp(0));
}
917
/* Truncate an Integer to a Word#.  Same code as integer2Intzh_fast: the
 * sign/negation only differ in interpretation at the Haskell level. */
integer2Wordzh_fast
{
  /* R1 = s; R2 = d */
  W_ r, s;

  s = R1;
  if (s == 0) {
    r = 0;
  } else {
    // First limb of the ByteArray# payload.
    r = W_[R2 + SIZEOF_StgArrWords];
    if (%lt(s,0)) {
        r = -r;
    }
  }
  /* Result parked in R1, return via info-pointer at TOS */
  R1 = r;
  jump %ENTRY_CODE(Sp(0));
}
936
/* Decode a Float# into (mantissa :: Int#, exponent :: Int#).
 * Uses two words of scratch space just below Sp as out-parameters for the
 * C helper __decodeFloat_Int; STK_CHK_GEN guarantees that space exists. */
decodeFloatzuIntzh_fast
{   // FIX: the opening brace was missing, making the file unparseable;
    // restored to match every other primop in this file.  The unused
    // local 'p' has also been dropped.
    F_ arg;
    W_ mp_tmp1;
    W_ mp_tmp_w;

    STK_CHK_GEN( WDS(2), NO_PTRS, decodeFloatzuIntzh_fast );

    // Scratch slots below Sp: mantissa word and exponent word.
    mp_tmp1  = Sp - WDS(1);
    mp_tmp_w = Sp - WDS(2);
    
    /* arguments: F1 = Float# */
    arg = F1;
    
    /* Perform the operation */
    foreign "C" __decodeFloat_Int(mp_tmp1 "ptr", mp_tmp_w "ptr", arg) [];
    
    /* returns: (Int# (mantissa), Int# (exponent)) */
    RET_NN(W_[mp_tmp1], W_[mp_tmp_w]);
}
958
/* Size in bytes of a Double's mantissa, and of the StgArrWords object used
 * to box that mantissa in decodeDoublezh_fast below. */
#define DOUBLE_MANTISSA_SIZE SIZEOF_DOUBLE
#define ARR_SIZE (SIZEOF_StgArrWords + DOUBLE_MANTISSA_SIZE)
961
/* Decode a Double# into (exponent :: Int#, size :: Int#, mantissa :: ByteArray#).
 * An MP_INT structure is kept in scratch space below Sp; a fresh ArrWords
 * object is heap-allocated to hold the mantissa limbs. */
decodeDoublezh_fast
{   // FIX: the opening brace was missing, making the file unparseable;
    // restored to match every other primop in this file.
    D_ arg;
    W_ p;
    W_ mp_tmp1;
    W_ mp_tmp_w;

    STK_CHK_GEN( 2 * SIZEOF_MP_INT, NO_PTRS, decodeDoublezh_fast );

    // Scratch space below Sp: an MP_INT for the mantissa and a slot for
    // the exponent (over-reserved to MP_INT size for simplicity).
    mp_tmp1  = Sp - 1 * SIZEOF_MP_INT;
    mp_tmp_w = Sp - 2 * SIZEOF_MP_INT;

    /* arguments: D1 = Double# */
    arg = D1;

    ALLOC_PRIM( ARR_SIZE, NO_PTRS, decodeDoublezh_fast );
    
    /* Be prepared to tell Lennart-coded __decodeDouble
       where mantissa.d can be put (it does not care about the rest) */
    p = Hp - ARR_SIZE + WDS(1);
    SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
    StgArrWords_words(p) = BYTES_TO_WDS(DOUBLE_MANTISSA_SIZE);
    MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(p);

    /* Perform the operation */
    foreign "C" __decodeDouble(mp_tmp1 "ptr", mp_tmp_w "ptr",arg) [];
    
    /* returns: (Int# (expn), Int#, ByteArray#) */
    RET_NNP(W_[mp_tmp_w], TO_W_(MP_INT__mp_size(mp_tmp1)), p);
}
992
/* Decode a Double# into
 * (mantissa sign :: Int#, mantissa high :: Word#, mantissa low :: Word#,
 *  exponent :: Int#), using four scratch words below Sp as out-parameters
 * for the C helper __decodeDouble_2Int. */
decodeDoublezu2Intzh_fast
{   // FIX: the opening brace was missing, making the file unparseable;
    // restored to match every other primop in this file.  The unused
    // local 'p' has also been dropped.
    D_ arg;
    W_ mp_tmp1;
    W_ mp_tmp2;
    W_ mp_result1;
    W_ mp_result2;

    STK_CHK_GEN( WDS(4), NO_PTRS, decodeDoublezu2Intzh_fast );

    mp_tmp1    = Sp - WDS(1);
    mp_tmp2    = Sp - WDS(2);
    mp_result1 = Sp - WDS(3);
    mp_result2 = Sp - WDS(4);

    /* arguments: D1 = Double# */
    arg = D1;

    /* Perform the operation */
    foreign "C" __decodeDouble_2Int(mp_tmp1 "ptr", mp_tmp2 "ptr",
                                    mp_result1 "ptr", mp_result2 "ptr",
                                    arg) [];

    /* returns:
       (Int# (mant sign), Word# (mant high), Word# (mant low), Int# (expn)) */
    RET_NNNN(W_[mp_tmp1], W_[mp_tmp2], W_[mp_result1], W_[mp_result2]);
}
1021
1022 /* -----------------------------------------------------------------------------
1023  * Concurrency primitives
1024  * -------------------------------------------------------------------------- */
1025
/* fork#: create a new IO thread running the closure in R1 and schedule it
 * on the current capability.  Returns the new ThreadId#. */
forkzh_fast
{
  /* args: R1 = closure to spark */

  MAYBE_GC(R1_PTR, forkzh_fast);

  W_ closure;
  W_ threadid;
  closure = R1;

  ("ptr" threadid) = foreign "C" createIOThread( MyCapability() "ptr", 
                                RtsFlags_GcFlags_initialStkSize(RtsFlags), 
                                closure "ptr") [];

  /* start blocked if the current thread is blocked */
  // i.e. inherit the parent's blocked-exceptions / interruptible flags.
  StgTSO_flags(threadid) = 
     StgTSO_flags(threadid) |  (StgTSO_flags(CurrentTSO) & 
                                (TSO_BLOCKEX::I32 | TSO_INTERRUPTIBLE::I32));

  foreign "C" scheduleThread(MyCapability() "ptr", threadid "ptr") [];

  // context switch soon, but not immediately: we don't want every
  // forkIO to force a context-switch.
  Capability_context_switch(MyCapability()) = 1 :: CInt;
  
  RET_P(threadid);
}
1053
/* forkOn#: like fork#, but pins the new thread to the capability number
 * given in R1 via scheduleThreadOn. */
forkOnzh_fast
{
  /* args: R1 = cpu, R2 = closure to spark */

  MAYBE_GC(R2_PTR, forkOnzh_fast);

  W_ cpu;
  W_ closure;
  W_ threadid;
  cpu = R1;
  closure = R2;

  ("ptr" threadid) = foreign "C" createIOThread( MyCapability() "ptr", 
                                RtsFlags_GcFlags_initialStkSize(RtsFlags), 
                                closure "ptr") [];

  /* start blocked if the current thread is blocked */
  // i.e. inherit the parent's blocked-exceptions / interruptible flags.
  StgTSO_flags(threadid) = 
     StgTSO_flags(threadid) |  (StgTSO_flags(CurrentTSO) & 
                                (TSO_BLOCKEX::I32 | TSO_INTERRUPTIBLE::I32));

  foreign "C" scheduleThreadOn(MyCapability() "ptr", cpu, threadid "ptr") [];

  // context switch soon, but not immediately: we don't want every
  // forkIO to force a context-switch.
  Capability_context_switch(MyCapability()) = 1 :: CInt;
  
  RET_P(threadid);
}
1083
/* yield#: give up the capability; all the work happens in stg_yield_noregs. */
yieldzh_fast
{
  jump stg_yield_noregs;
}
1088
/* myThreadId#: the current TSO closure is itself the ThreadId#. */
myThreadIdzh_fast
{
  /* no args. */
  RET_P(CurrentTSO);
}
1094
/* labelThread#: attach a debug label (C string in R2) to a thread.
 * A no-op unless the RTS was built with DEBUG. */
labelThreadzh_fast
{
  /* args: 
        R1 = ThreadId#
        R2 = Addr# */
#ifdef DEBUG
  foreign "C" labelThread(R1 "ptr", R2 "ptr") [];
#endif
  jump %ENTRY_CODE(Sp(0));
}
1105
/* isCurrentThreadBound#: ask the RTS whether the current thread is bound
 * to an OS thread.  Returns 0 or 1 as an Int#. */
isCurrentThreadBoundzh_fast
{
  /* no args */
  W_ r;
  (r) = foreign "C" isThreadBound(CurrentTSO) [];
  RET_N(r);
}
1113
/* threadStatus#: report a thread's status as an Int#, following any
 * ThreadRelocated indirections first.  Encoding: 16 = complete,
 * 17 = killed, otherwise the raw why_blocked value. */
threadStatuszh_fast
{
    /* args: R1 :: ThreadId# */
    W_ tso;
    W_ why_blocked;
    W_ what_next;
    W_ ret;

    tso = R1;
    loop:
      // Chase relocated TSOs to the live copy.
      if (TO_W_(StgTSO_what_next(tso)) == ThreadRelocated) {
          tso = StgTSO__link(tso);
          goto loop;
      }

    what_next   = TO_W_(StgTSO_what_next(tso));
    why_blocked = TO_W_(StgTSO_why_blocked(tso));
    // Note: these two reads are not atomic, so they might end up
    // being inconsistent.  It doesn't matter, since we
    // only return one or the other.  If we wanted to return the
    // contents of block_info too, then we'd have to do some synchronisation.

    if (what_next == ThreadComplete) {
        ret = 16;  // NB. magic, matches up with GHC.Conc.threadStatus
    } else {
        if (what_next == ThreadKilled) {
            ret = 17;
        } else {
            ret = why_blocked;
        }
    }
    RET_N(ret);
}
1147
1148 /* -----------------------------------------------------------------------------
1149  * TVar primitives
1150  * -------------------------------------------------------------------------- */
1151
1152 #define SP_OFF 0
1153
1154 // Catch retry frame ------------------------------------------------------------
1155
/* Return to a catch-retry frame: try to commit the nested transaction that
 * ran one of the two orElse branches.  On success, pop the frame and return;
 * on failure, start a fresh nested transaction and re-run whichever branch
 * was executing (recorded in running_alt_code). */
INFO_TABLE_RET(stg_catch_retry_frame, CATCH_RETRY_FRAME,
#if defined(PROFILING)
  W_ unused1, W_ unused2,
#endif
  W_ unused3, P_ unused4, P_ unused5)
{
   W_ r, frame, trec, outer;

   frame = Sp;
   trec = StgTSO_trec(CurrentTSO);
   ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
   (r) = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", trec "ptr") [];
   if (r != 0) {
     /* Succeeded (either first branch or second branch) */
     StgTSO_trec(CurrentTSO) = outer;
     Sp = Sp + SIZEOF_StgCatchRetryFrame;
     jump %ENTRY_CODE(Sp(SP_OFF));
   } else {
     /* Did not commit: re-execute */
     W_ new_trec;
     ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
     StgTSO_trec(CurrentTSO) = new_trec;
     if (StgCatchRetryFrame_running_alt_code(frame) != 0::I32) {
       R1 = StgCatchRetryFrame_alt_code(frame);
     } else {
       R1 = StgCatchRetryFrame_first_code(frame);
     }
     jump stg_ap_v_fast;
   }
}
1186
1187
1188 // Atomically frame ------------------------------------------------------------
1189
/* Return to an atomically frame.  First visit: collect the invariants to
 * check.  Subsequent visits: an invariant check just finished, so abort its
 * transaction and move to the next invariant.  When no invariants remain,
 * attempt the top-level commit; if that fails, restart the whole block. */
INFO_TABLE_RET(stg_atomically_frame, ATOMICALLY_FRAME,
#if defined(PROFILING)
  W_ unused1, W_ unused2,
#endif
  P_ unused3, P_ unused4)
{
  W_ frame, trec, valid, next_invariant, q, outer;

  frame = Sp;
  trec = StgTSO_trec(CurrentTSO);
  ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];

  if (outer == NO_TREC) {
    /* First time back at the atomically frame -- pick up invariants */
    ("ptr" q) = foreign "C" stmGetInvariantsToCheck(MyCapability() "ptr", trec "ptr") [];
    StgAtomicallyFrame_next_invariant_to_check(frame) = q;

  } else {
    /* Second/subsequent time back at the atomically frame -- abort the
     * tx that's checking the invariant and move on to the next one */
    StgTSO_trec(CurrentTSO) = outer;
    q = StgAtomicallyFrame_next_invariant_to_check(frame);
    StgInvariantCheckQueue_my_execution(q) = trec;
    foreign "C" stmAbortTransaction(MyCapability() "ptr", trec "ptr") [];
    /* Don't free trec -- it's linked from q and will be stashed in the
     * invariant if we eventually commit. */
    q = StgInvariantCheckQueue_next_queue_entry(q);
    StgAtomicallyFrame_next_invariant_to_check(frame) = q;
    trec = outer;
  }

  q = StgAtomicallyFrame_next_invariant_to_check(frame);

  if (q != END_INVARIANT_CHECK_QUEUE) {
    /* We can't commit yet: another invariant to check */
    // Run the invariant's code inside its own nested transaction.
    ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", trec "ptr") [];
    StgTSO_trec(CurrentTSO) = trec;

    next_invariant = StgInvariantCheckQueue_invariant(q);
    R1 = StgAtomicInvariant_code(next_invariant);
    jump stg_ap_v_fast;

  } else {

    /* We've got no more invariants to check, try to commit */
    (valid) = foreign "C" stmCommitTransaction(MyCapability() "ptr", trec "ptr") [];
    if (valid != 0) {
      /* Transaction was valid: commit succeeded */
      StgTSO_trec(CurrentTSO) = NO_TREC;
      Sp = Sp + SIZEOF_StgAtomicallyFrame;
      jump %ENTRY_CODE(Sp(SP_OFF));
    } else {
      /* Transaction was not valid: try again */
      ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr") [];
      StgTSO_trec(CurrentTSO) = trec;
      StgAtomicallyFrame_next_invariant_to_check(frame) = END_INVARIANT_CHECK_QUEUE;
      R1 = StgAtomicallyFrame_code(frame);
      jump stg_ap_v_fast;
    }
  }
}
1251
/* Return to an atomically frame while the TSO is blocked on TVars (after a
 * retry).  If the reads the failed attempt made are still valid there is no
 * point re-running, so block again; otherwise restart the transaction. */
INFO_TABLE_RET(stg_atomically_waiting_frame, ATOMICALLY_FRAME,
#if defined(PROFILING)
  W_ unused1, W_ unused2,
#endif
  P_ unused3, P_ unused4)
{
  W_ frame, trec, valid;

  frame = Sp;

  /* The TSO is currently waiting: should we stop waiting? */
  (valid) = foreign "C" stmReWait(MyCapability() "ptr", CurrentTSO "ptr") [];
  if (valid != 0) {
    /* Previous attempt is still valid: no point trying again yet */
    jump stg_block_noregs;
  } else {
    /* Previous attempt is no longer valid: try again */
    ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr") [];
    StgTSO_trec(CurrentTSO) = trec;
    // Revert the frame to the normal atomically frame before re-running.
    StgHeader_info(frame) = stg_atomically_frame_info;
    R1 = StgAtomicallyFrame_code(frame);
    jump stg_ap_v_fast;
  }
}
1276
1277 // STM catch frame --------------------------------------------------------------
1278
1279 #define SP_OFF 0
1280
1281 /* Catch frames are very similar to update frames, but when entering
1282  * one we just pop the frame off the stack and perform the correct
1283  * kind of return to the activation record underneath us on the stack.
1284  */
1285
/* Return to a catchSTM frame: commit the nested transaction that ran the
 * protected body.  On success pop the frame; on failure re-run the body in
 * a fresh nested transaction. */
INFO_TABLE_RET(stg_catch_stm_frame, CATCH_STM_FRAME,
#if defined(PROFILING)
  W_ unused1, W_ unused2,
#endif
  P_ unused3, P_ unused4)
   {
      W_ r, frame, trec, outer;
      frame = Sp;
      trec = StgTSO_trec(CurrentTSO);
      ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
      (r) = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", trec "ptr") [];
      if (r != 0) {
        /* Commit succeeded */
        StgTSO_trec(CurrentTSO) = outer;
        Sp = Sp + SIZEOF_StgCatchSTMFrame;
        // NOTE(review): the sibling frames return via %ENTRY_CODE(Sp(SP_OFF));
        // confirm this direct jump is intended for all build variants.
        jump Sp(SP_OFF);
      } else {
        /* Commit failed */
        W_ new_trec;
        ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
        StgTSO_trec(CurrentTSO) = new_trec;
        R1 = StgCatchSTMFrame_code(frame);
        jump stg_ap_v_fast;
      }
   }
1311
1312
1313 // Primop definition ------------------------------------------------------------
1314
/* atomically#: push an atomically frame, start a top-level transaction and
 * run the STM action in R1.  Nested atomically is rejected with an
 * exception. */
atomicallyzh_fast
{
  W_ frame;
  W_ old_trec;
  W_ new_trec;
  
  // stmStartTransaction may allocate
  MAYBE_GC (R1_PTR, atomicallyzh_fast); 

  /* Args: R1 = m :: STM a */
  STK_CHK_GEN(SIZEOF_StgAtomicallyFrame + WDS(1), R1_PTR, atomicallyzh_fast);

  old_trec = StgTSO_trec(CurrentTSO);

  /* Nested transactions are not allowed; raise an exception */
  if (old_trec != NO_TREC) {
     R1 = base_ControlziExceptionziBase_nestedAtomically_closure;
     jump raisezh_fast;
  }

  /* Set up the atomically frame */
  Sp = Sp - SIZEOF_StgAtomicallyFrame;
  frame = Sp;

  SET_HDR(frame,stg_atomically_frame_info, W_[CCCS]);
  StgAtomicallyFrame_code(frame) = R1;
  StgAtomicallyFrame_next_invariant_to_check(frame) = END_INVARIANT_CHECK_QUEUE;

  /* Start the memory transaction */
  ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", old_trec "ptr") [R1];
  StgTSO_trec(CurrentTSO) = new_trec;

  /* Apply R1 to the realworld token */
  jump stg_ap_v_fast;
}
1350
1351
/* catchSTM#: push a catch frame holding the handler (R2) and the body (R1),
 * start a nested transaction for the body, and run it. */
catchSTMzh_fast
{
  W_ frame;
  
  /* Args: R1 :: STM a */
  /* Args: R2 :: Exception -> STM a */
  STK_CHK_GEN(SIZEOF_StgCatchSTMFrame + WDS(1), R1_PTR & R2_PTR, catchSTMzh_fast);

  /* Set up the catch frame */
  Sp = Sp - SIZEOF_StgCatchSTMFrame;
  frame = Sp;

  SET_HDR(frame, stg_catch_stm_frame_info, W_[CCCS]);
  StgCatchSTMFrame_handler(frame) = R2;
  StgCatchSTMFrame_code(frame) = R1;

  /* Start a nested transaction to run the body of the try block in */
  W_ cur_trec;  
  W_ new_trec;
  cur_trec = StgTSO_trec(CurrentTSO);
  // FIX: this call was missing the live-register annotation '[]' that every
  // other stmStartTransaction call in this file carries; added for
  // consistency (R1/R2 are already saved in the frame at this point).
  ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", cur_trec "ptr") [];
  StgTSO_trec(CurrentTSO) = new_trec;

  /* Apply R1 to the realworld token */
  jump stg_ap_v_fast;
}
1378
1379
/* catchRetry# (orElse): start a nested transaction for the first branch
 * (R1) and push a catch-retry frame remembering both branches, so that a
 * retry in R1 falls through to R2. */
catchRetryzh_fast
{
  W_ frame;
  W_ new_trec;
  W_ trec;

  // stmStartTransaction may allocate
  MAYBE_GC (R1_PTR & R2_PTR, catchRetryzh_fast); 

  /* Args: R1 :: STM a */
  /* Args: R2 :: STM a */
  STK_CHK_GEN(SIZEOF_StgCatchRetryFrame + WDS(1), R1_PTR & R2_PTR, catchRetryzh_fast);

  /* Start a nested transaction within which to run the first code */
  trec = StgTSO_trec(CurrentTSO);
  ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", trec "ptr") [R1,R2];
  StgTSO_trec(CurrentTSO) = new_trec;

  /* Set up the catch-retry frame */
  Sp = Sp - SIZEOF_StgCatchRetryFrame;
  frame = Sp;
  
  SET_HDR(frame, stg_catch_retry_frame_info, W_[CCCS]);
  StgCatchRetryFrame_running_alt_code(frame) = 0 :: CInt; // false;
  StgCatchRetryFrame_first_code(frame) = R1;
  StgCatchRetryFrame_alt_code(frame) = R2;

  /* Apply R1 to the realworld token */
  jump stg_ap_v_fast;
}
1410
1411
/* retry#: unwind the stack to the nearest CATCH_RETRY_FRAME or
 * ATOMICALLY_FRAME.  At a catch-retry frame, switch to (or propagate past)
 * the alternative branch; at the atomically frame, try to block the thread
 * on the TVars it read. */
retryzh_fast
{
  W_ frame_type;
  W_ frame;
  W_ trec;
  W_ outer;
  W_ r;

  MAYBE_GC (NO_PTRS, retryzh_fast); // STM operations may allocate

  // Find the enclosing ATOMICALLY_FRAME or CATCH_RETRY_FRAME
retry_pop_stack:
  StgTSO_sp(CurrentTSO) = Sp;
  (frame_type) = foreign "C" findRetryFrameHelper(CurrentTSO "ptr") [];
  Sp = StgTSO_sp(CurrentTSO);
  frame = Sp;
  trec = StgTSO_trec(CurrentTSO);
  ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];

  if (frame_type == CATCH_RETRY_FRAME) {
    // The retry reaches a CATCH_RETRY_FRAME before the atomic frame
    ASSERT(outer != NO_TREC);
    // Abort the transaction attempting the current branch
    foreign "C" stmAbortTransaction(MyCapability() "ptr", trec "ptr") [];
    foreign "C" stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr") [];
    // NB. '!x != 0::I32' parses as '(!x) != 0', i.e. it tests
    // running_alt_code == 0: we were still in the first branch.
    if (!StgCatchRetryFrame_running_alt_code(frame) != 0::I32) {
      // Retry in the first branch: try the alternative
      ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
      StgTSO_trec(CurrentTSO) = trec;
      StgCatchRetryFrame_running_alt_code(frame) = 1 :: CInt; // true;
      R1 = StgCatchRetryFrame_alt_code(frame);
      jump stg_ap_v_fast;
    } else {
      // Retry in the alternative code: propagate the retry
      StgTSO_trec(CurrentTSO) = outer;
      Sp = Sp + SIZEOF_StgCatchRetryFrame;
      goto retry_pop_stack;
    }
  }

  // We've reached the ATOMICALLY_FRAME: attempt to wait 
  ASSERT(frame_type == ATOMICALLY_FRAME);
  if (outer != NO_TREC) {
    // We called retry while checking invariants, so abort the current
    // invariant check (merging its TVar accesses into the parent's read
    // set so we'll wait on them)
    foreign "C" stmAbortTransaction(MyCapability() "ptr", trec "ptr") [];
    foreign "C" stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr") [];
    trec = outer;
    StgTSO_trec(CurrentTSO) = trec;
    ("ptr" outer) = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
  }
  ASSERT(outer == NO_TREC);

  (r) = foreign "C" stmWait(MyCapability() "ptr", CurrentTSO "ptr", trec "ptr") [];
  if (r != 0) {
    // Transaction was valid: stmWait put us on the TVars' queues, we now block
    StgHeader_info(frame) = stg_atomically_waiting_frame_info;
    Sp = frame;
    // Fix up the stack in the unregisterised case: the return convention is different.
    R3 = trec; // passing to stmWaitUnblock()
    jump stg_block_stmwait;
  } else {
    // Transaction was not valid: retry immediately
    ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
    StgTSO_trec(CurrentTSO) = trec;
    R1 = StgAtomicallyFrame_code(frame);
    Sp = frame;
    jump stg_ap_v_fast;
  }
}
1483
1484
/* check#: register an STM invariant (closure in R1) to be verified when the
 * enclosing transaction commits. */
checkzh_fast
{
  W_ trec, closure;

  /* Args: R1 = invariant closure */
  MAYBE_GC (R1_PTR, checkzh_fast); 

  trec = StgTSO_trec(CurrentTSO);
  closure = R1;
  foreign "C" stmAddInvariantToCheck(MyCapability() "ptr", 
                                     trec "ptr",
                                     closure "ptr") [];

  jump %ENTRY_CODE(Sp(0));
}
1500
1501
/* newTVar#: allocate a fresh TVar holding the value in R1. */
newTVarzh_fast
{
  W_ tv;
  W_ new_value;

  /* Args: R1 = initialisation value */

  MAYBE_GC (R1_PTR, newTVarzh_fast); 
  new_value = R1;
  ("ptr" tv) = foreign "C" stmNewTVar(MyCapability() "ptr", new_value "ptr") [];
  RET_P(tv);
}
1514
1515
/* readTVar#: read a TVar (R1) inside the current transaction, recording the
 * read in the thread's transaction record. */
readTVarzh_fast
{
  W_ trec;
  W_ tvar;
  W_ result;

  /* Args: R1 = TVar closure */

  MAYBE_GC (R1_PTR, readTVarzh_fast); // Call to stmReadTVar may allocate
  trec = StgTSO_trec(CurrentTSO);
  tvar = R1;
  ("ptr" result) = foreign "C" stmReadTVar(MyCapability() "ptr", trec "ptr", tvar "ptr") [];

  RET_P(result);
}
1531
/* readTVarIO#: read a TVar's current value outside any transaction.
 * Spins while the slot holds a TRec header (i.e. a commit is in progress). */
readTVarIOzh_fast
{
    W_ result;

again:
    result = StgTVar_current_value(R1);
    if (%INFO_PTR(result) == stg_TREC_HEADER_info) {
        goto again;
    }
    RET_P(result);
}
1543
/* writeTVar#: record a write of R2 to the TVar in R1 in the current
 * transaction's record (the TVar itself is updated at commit time). */
writeTVarzh_fast
{
  W_ trec;
  W_ tvar;
  W_ new_value;
  
  /* Args: R1 = TVar closure */
  /*       R2 = New value    */

  MAYBE_GC (R1_PTR & R2_PTR, writeTVarzh_fast); // Call to stmWriteTVar may allocate
  trec = StgTSO_trec(CurrentTSO);
  tvar = R1;
  new_value = R2;
  foreign "C" stmWriteTVar(MyCapability() "ptr", trec "ptr", tvar "ptr", new_value "ptr") [];

  jump %ENTRY_CODE(Sp(0));
}
1561
1562
1563 /* -----------------------------------------------------------------------------
1564  * MVar primitives
1565  *
1566  * take & putMVar work as follows.  Firstly, an important invariant:
1567  *
1568  *    If the MVar is full, then the blocking queue contains only
1569  *    threads blocked on putMVar, and if the MVar is empty then the
1570  *    blocking queue contains only threads blocked on takeMVar.
1571  *
1572  * takeMVar:
1573  *    MVar empty : then add ourselves to the blocking queue
1574  *    MVar full  : remove the value from the MVar, and
1575  *                 blocking queue empty     : return
1576  *                 blocking queue non-empty : perform the first blocked putMVar
1577  *                                            from the queue, and wake up the
1578  *                                            thread (MVar is now full again)
1579  *
1580  * putMVar is just the dual of the above algorithm.
1581  *
1582  * How do we "perform a putMVar"?  Well, we have to fiddle around with
1583  * the stack of the thread waiting to do the putMVar.  See
1584  * stg_block_putmvar and stg_block_takemvar in HeapStackCheck.c for
1585  * the stack layout, and the PerformPut and PerformTake macros below.
1586  *
1587  * It is important that a blocked take or put is woken up with the
1588  * take/put already performed, because otherwise there would be a
1589  * small window of vulnerability where the thread could receive an
1590  * exception and never perform its take or put, and we'd end up with a
1591  * deadlock.
1592  *
1593  * -------------------------------------------------------------------------- */
1594
/* isEmptyMVar#: 1 if the MVar holds no value, 0 otherwise.  An unlocked
 * snapshot -- the answer can be stale by the time the caller sees it. */
isEmptyMVarzh_fast
{
    /* args: R1 = MVar closure */

    if (StgMVar_value(R1) == stg_END_TSO_QUEUE_closure) {
        RET_N(1);
    } else {
        RET_N(0);
    }
}
1605
/* newMVar#: allocate an empty MVar (no value, empty blocking queue). */
newMVarzh_fast
{
    /* args: none */
    W_ mvar;

    ALLOC_PRIM ( SIZEOF_StgMVar, NO_PTRS, newMVarzh_fast );
  
    mvar = Hp - SIZEOF_StgMVar + WDS(1);
    SET_HDR(mvar,stg_MVAR_DIRTY_info,W_[CCCS]);
        // MVARs start dirty: generation 0 has no mutable list
    StgMVar_head(mvar)  = stg_END_TSO_QUEUE_closure;
    StgMVar_tail(mvar)  = stg_END_TSO_QUEUE_closure;
    StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
    RET_P(mvar);
}
1621
1622
/* Complete a takeMVar on a blocked TSO's behalf: write the taken value and a
 * return frame onto its stack so the value arrives in R1 when it resumes. */
#define PerformTake(tso, value)                         \
    W_[StgTSO_sp(tso) + WDS(1)] = value;                \
    W_[StgTSO_sp(tso) + WDS(0)] = stg_gc_unpt_r1_info;

/* Complete a putMVar on a blocked TSO's behalf: pop its 3-word putMVar
 * frame and read out the value it was putting (see stg_block_putmvar). */
#define PerformPut(tso,lval)                    \
    StgTSO_sp(tso) = StgTSO_sp(tso) + WDS(3);   \
    lval = W_[StgTSO_sp(tso) - WDS(1)];
1630
/* takeMVar#: take the value from the MVar in R1, blocking if it is empty.
 * If a putMVar is queued, perform that put on the blocked thread's behalf
 * and wake it, keeping the MVar full. */
takeMVarzh_fast
{
    W_ mvar, val, info, tso;

    /* args: R1 = MVar closure */
    mvar = R1;

#if defined(THREADED_RTS)
    // Lock the MVar against other capabilities for the duration.
    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [];
#else
    info = GET_INFO(mvar);
#endif
        
    if (info == stg_MVAR_CLEAN_info) {
        // First mutation since the last GC: put the MVar on the mutable list.
        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr") [];
    }

    /* If the MVar is empty, put ourselves on its blocking queue,
     * and wait until we're woken up.
     */
    if (StgMVar_value(mvar) == stg_END_TSO_QUEUE_closure) {
        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_head(mvar) = CurrentTSO;
        } else {
            foreign "C" setTSOLink(MyCapability() "ptr", 
                                   StgMVar_tail(mvar) "ptr",
                                   CurrentTSO) [];
        }
        StgTSO__link(CurrentTSO)       = stg_END_TSO_QUEUE_closure;
        StgTSO_block_info(CurrentTSO)  = mvar;
        // write barrier for throwTo(), which looks at block_info
        // if why_blocked==BlockedOnMVar.
        prim %write_barrier() [];
        StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
        StgMVar_tail(mvar) = CurrentTSO;
        
        R1 = mvar;
        jump stg_block_takemvar;
  }

  /* we got the value... */
  val = StgMVar_value(mvar);

  if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure)
  {
      /* There are putMVar(s) waiting... 
       * wake up the first thread on the queue
       */
      ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);

      /* actually perform the putMVar for the thread that we just woke up */
      tso = StgMVar_head(mvar);
      PerformPut(tso,StgMVar_value(mvar));

      // FIX: was 'flags & TSO_DIRTY == 0', which (C precedence) parses as
      // 'flags & (TSO_DIRTY == 0)' and is always false, so a clean woken
      // TSO was never marked dirty here.  Parenthesise the mask.
      if ((TO_W_(StgTSO_flags(tso)) & TSO_DIRTY) == 0) {
          foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") [];
      }

      ("ptr" tso) = foreign "C" unblockOne_(MyCapability() "ptr", 
                                            StgMVar_head(mvar) "ptr", 1) [];
      StgMVar_head(mvar) = tso;

      if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
          StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
      }

#if defined(THREADED_RTS)
      unlockClosure(mvar, stg_MVAR_DIRTY_info);
#else
      SET_INFO(mvar,stg_MVAR_DIRTY_info);
#endif
      RET_P(val);
  } 
  else
  {
      /* No further putMVars, MVar is now empty */
      StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
 
#if defined(THREADED_RTS)
      unlockClosure(mvar, stg_MVAR_DIRTY_info);
#else
      SET_INFO(mvar,stg_MVAR_DIRTY_info);
#endif

      RET_P(val);
  }
}
1718
1719
/* tryTakeMVar#: non-blocking takeMVar.  Returns (0, dummy) if the MVar is
 * empty, (1, value) otherwise; a queued putMVar is performed on the blocked
 * thread's behalf just as in takeMVarzh_fast. */
tryTakeMVarzh_fast
{
    W_ mvar, val, info, tso;

    /* args: R1 = MVar closure */

    mvar = R1;

#if defined(THREADED_RTS)
    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [];
#else
    info = GET_INFO(mvar);
#endif

    if (StgMVar_value(mvar) == stg_END_TSO_QUEUE_closure) {
#if defined(THREADED_RTS)
        unlockClosure(mvar, info);
#endif
        /* HACK: we need a pointer to pass back, 
         * so we abuse NO_FINALIZER_closure
         */
        RET_NP(0, stg_NO_FINALIZER_closure);
    }

    if (info == stg_MVAR_CLEAN_info) {
        // FIX: added the '[]' live-register annotation for consistency with
        // the identical call in takeMVarzh_fast.
        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr") [];
    }

    /* we got the value... */
    val = StgMVar_value(mvar);

    if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {

        /* There are putMVar(s) waiting... 
         * wake up the first thread on the queue
         */
        ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);

        /* actually perform the putMVar for the thread that we just woke up */
        tso = StgMVar_head(mvar);
        PerformPut(tso,StgMVar_value(mvar));
        // FIX: was 'flags & TSO_DIRTY == 0', which (C precedence) parses as
        // 'flags & (TSO_DIRTY == 0)' and is always false.  Parenthesised.
        if ((TO_W_(StgTSO_flags(tso)) & TSO_DIRTY) == 0) {
            foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") [];
        }

        ("ptr" tso) = foreign "C" unblockOne_(MyCapability() "ptr", 
                                              StgMVar_head(mvar) "ptr", 1) [];
        StgMVar_head(mvar) = tso;

        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
        }
#if defined(THREADED_RTS)
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
#else
        SET_INFO(mvar,stg_MVAR_DIRTY_info);
#endif
    }
    else 
    {
        /* No further putMVars, MVar is now empty */
        StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
#if defined(THREADED_RTS)
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
#else
        SET_INFO(mvar,stg_MVAR_DIRTY_info);
#endif
    }
    
    RET_NP(1, val);
}
1791
1792
/* putMVar#: put R2 into the MVar in R1, blocking if it is full.  If a
 * takeMVar is queued, perform that take on the blocked thread's behalf and
 * wake it, leaving the MVar empty. */
putMVarzh_fast
{
    W_ mvar, val, info, tso;

    /* args: R1 = MVar, R2 = value */
    mvar = R1;
    val  = R2;

#if defined(THREADED_RTS)
    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [];
#else
    info = GET_INFO(mvar);
#endif

    if (info == stg_MVAR_CLEAN_info) {
        // FIX: added the '[]' live-register annotation for consistency with
        // the identical call in takeMVarzh_fast.
        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr") [];
    }

    if (StgMVar_value(mvar) != stg_END_TSO_QUEUE_closure) {
        // MVar full: enqueue ourselves and block.
        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_head(mvar) = CurrentTSO;
        } else {
            foreign "C" setTSOLink(MyCapability() "ptr", 
                                   StgMVar_tail(mvar) "ptr",
                                   CurrentTSO) [];
        }
        StgTSO__link(CurrentTSO)       = stg_END_TSO_QUEUE_closure;
        StgTSO_block_info(CurrentTSO)  = mvar;
        // write barrier for throwTo(), which looks at block_info
        // if why_blocked==BlockedOnMVar.
        prim %write_barrier() [];
        StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
        StgMVar_tail(mvar) = CurrentTSO;
        
        R1 = mvar;
        R2 = val;
        jump stg_block_putmvar;
    }
  
    if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {

        /* There are takeMVar(s) waiting: wake up the first one
         */
        ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);

        /* actually perform the takeMVar */
        tso = StgMVar_head(mvar);
        PerformTake(tso, val);
        // FIX: was 'flags & TSO_DIRTY == 0', which (C precedence) parses as
        // 'flags & (TSO_DIRTY == 0)' and is always false.  Parenthesised.
        if ((TO_W_(StgTSO_flags(tso)) & TSO_DIRTY) == 0) {
            foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") [];
        }
      
        ("ptr" tso) = foreign "C" unblockOne_(MyCapability() "ptr", 
                                              StgMVar_head(mvar) "ptr", 1) [];
        StgMVar_head(mvar) = tso;

        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
        }

#if defined(THREADED_RTS)
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
#else
        SET_INFO(mvar,stg_MVAR_DIRTY_info);
#endif
        jump %ENTRY_CODE(Sp(0));
    }
    else
    {
        /* No further takes, the MVar is now full. */
        StgMVar_value(mvar) = val;

#if defined(THREADED_RTS)
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
#else
        SET_INFO(mvar,stg_MVAR_DIRTY_info);
#endif
        jump %ENTRY_CODE(Sp(0));
    }
    
    /* ToDo: yield afterward for better communication performance? */
}
1875
1876
/* tryPutMVar# : non-blocking variant of putMVar#.
 *
 * args: R1 = MVar closure, R2 = the value to put
 * returns: RET_N(0) if the MVar was already full (nothing changed),
 *          RET_N(1) if the value was put (either handed directly to a
 *          blocked takeMVar, or stored in the MVar).
 */
tryPutMVarzh_fast
{
    W_ mvar, info, tso;

    /* args: R1 = MVar, R2 = value */
    mvar = R1;

#if defined(THREADED_RTS)
    // Lock the MVar closure; R2 is still live across the call.
    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [R2];
#else
    info = GET_INFO(mvar);
#endif

    if (StgMVar_value(mvar) != stg_END_TSO_QUEUE_closure) {
        /* Already full: fail without blocking. */
#if defined(THREADED_RTS)
        unlockClosure(mvar, info);
#endif
        RET_N(0);
    }
  
    if (info == stg_MVAR_CLEAN_info) {
        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr");
    }

    if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {

        /* There are takeMVar(s) waiting: wake up the first one
         */
        ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);
        
        /* actually perform the takeMVar */
        tso = StgMVar_head(mvar);
        PerformTake(tso, R2);
        // PerformTake wrote into the TSO's stack, so mark it dirty if it
        // isn't already.  Parenthesised explicitly: C-- gives '&' higher
        // precedence than '==' (the opposite of C), so the parens make
        // the intended reading unambiguous without changing it.
        if ((TO_W_(StgTSO_flags(tso)) & TSO_DIRTY) == 0) {
            foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") [];
        }
      
        ("ptr" tso) = foreign "C" unblockOne_(MyCapability() "ptr", 
                                              StgMVar_head(mvar) "ptr", 1) [];
        StgMVar_head(mvar) = tso;

        if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
            // Queue is now empty: reset the tail as well.
            StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
        }

#if defined(THREADED_RTS)
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
#else
        SET_INFO(mvar,stg_MVAR_DIRTY_info);
#endif
    }
    else
    {
        /* No further takes, the MVar is now full. */
        StgMVar_value(mvar) = R2;

#if defined(THREADED_RTS)
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
#else
        SET_INFO(mvar,stg_MVAR_DIRTY_info);
#endif
    }
    
    RET_N(1);
    /* ToDo: yield afterward for better communication performance? */
}
1943
1944
1945 /* -----------------------------------------------------------------------------
1946    Stable pointer primitives
1947    -------------------------------------------------------------------------  */
1948
makeStableNamezh_fast
{
    /* makeStableName# : create (or fetch) the StableName object for
     * the heap object in R1.  StableNames are interned: at most one
     * StableName object exists per entry in stable_ptr_table.
     *
     * returns: RET_P(the StableName closure)
     */
    W_ index, sn_obj;

    // Reserve space up front; may GC, so R1 is declared as a live pointer.
    ALLOC_PRIM( SIZEOF_StgStableName, R1_PTR, makeStableNamezh_fast );
  
    (index) = foreign "C" lookupStableName(R1 "ptr") [];

    /* Is there already a StableName for this heap object?
     *  stable_ptr_table is a pointer to an array of snEntry structs.
     */
    if ( snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry) == NULL ) {
        // No: build a fresh StableName in the space just allocated and
        // record it in the table so later calls return the same object.
        sn_obj = Hp - SIZEOF_StgStableName + WDS(1);
        SET_HDR(sn_obj, stg_STABLE_NAME_info, W_[CCCS]);
        StgStableName_sn(sn_obj) = index;
        snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry) = sn_obj;
    } else {
        // Yes: return the interned StableName; the reserved space goes
        // unused in this branch.
        sn_obj = snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry);
    }
    
    RET_P(sn_obj);
}
1971
1972
/* makeStablePtr# : wrap the object in R1 in a fresh StablePtr. */
makeStablePtrzh_fast
{
    /* Args: R1 = a */
    W_ stable_ptr;

    // Give the GC a chance to run before calling into the RTS;
    // R1 stays live as a pointer across the check.
    MAYBE_GC(R1_PTR, makeStablePtrzh_fast);
    ("ptr" stable_ptr) = foreign "C" getStablePtr(R1 "ptr") [];
    RET_N(stable_ptr);
}
1981
/* deRefStablePtr# : look up the object a StablePtr refers to. */
deRefStablePtrzh_fast
{
    /* Args: R1 = the stable ptr */
    W_ result;

    // A StablePtr is an index into stable_ptr_table; fetch the addr
    // field of the corresponding snEntry.
    result = snEntry_addr(W_[stable_ptr_table] + R1*SIZEOF_snEntry);
    RET_P(result);
}
1990
1991 /* -----------------------------------------------------------------------------
1992    Bytecode object primitives
1993    -------------------------------------------------------------------------  */
1994
newBCOzh_fast
{
    /* newBCO# : allocate and initialise a byte-code object.
     *
     * R1 = instrs
     * R2 = literals
     * R3 = ptrs
     * R4 = arity
     * R5 = bitmap array
     *
     * returns: RET_P(the new BCO)
     */
    W_ bco, bitmap_arr, bytes, words;
    
    bitmap_arr = R5;

    // The bitmap is stored inline at the end of the BCO, so the size of
    // the allocation depends on the size of the bitmap array.
    words = BYTES_TO_WDS(SIZEOF_StgBCO) + StgArrWords_words(bitmap_arr);
    bytes = WDS(words);

    // May GC: R1-R3 and R5 are live pointers (R4 is a bare word).
    ALLOC_PRIM( bytes, R1_PTR&R2_PTR&R3_PTR&R5_PTR, newBCOzh_fast );

    bco = Hp - bytes + WDS(1);
    SET_HDR(bco, stg_BCO_info, W_[CCCS]);
    
    StgBCO_instrs(bco)     = R1;
    StgBCO_literals(bco)   = R2;
    StgBCO_ptrs(bco)       = R3;
    StgBCO_arity(bco)      = HALF_W_(R4);
    StgBCO_size(bco)       = HALF_W_(words);
    
    // Copy the arity/bitmap info into the BCO
    W_ i;
    i = 0;
for:
    if (i < StgArrWords_words(bitmap_arr)) {
        StgBCO_bitmap(bco,i) = StgArrWords_payload(bitmap_arr,i);
        i = i + 1;
        goto for;
    }
    
    RET_P(bco);
}
2033
2034
mkApUpd0zh_fast
{
    // R1 = the BCO# for the AP
    //
    // returns: RET_P(a fresh updatable AP thunk wrapping the BCO)
    W_ ap;

    // This function is *only* used to wrap zero-arity BCOs in an
    // updatable wrapper (see ByteCodeLink.lhs).  An AP thunk is always
    // saturated and always points directly to a FUN or BCO.
    ASSERT(%INFO_TYPE(%GET_STD_INFO(R1)) == HALF_W_(BCO) &&
           StgBCO_arity(R1) == HALF_W_(0));

    // Allocate the AP; may GC, so R1 is declared as a live pointer.
    HP_CHK_GEN_TICKY(SIZEOF_StgAP, R1_PTR, mkApUpd0zh_fast);
    TICK_ALLOC_UP_THK(0, 0);
    CCCS_ALLOC(SIZEOF_StgAP);

    ap = Hp - SIZEOF_StgAP + WDS(1);
    SET_HDR(ap, stg_AP_info, W_[CCCS]);
    
    // Zero arguments: the AP is just an updatable wrapper for the BCO.
    StgAP_n_args(ap) = HALF_W_(0);
    StgAP_fun(ap) = R1;
    
    RET_P(ap);
}
2059
unpackClosurezh_fast
{
/* args: R1 = closure to analyze */
// TODO: Consider the absence of ptrs or nonptrs as a special case ?
// returns: (info pointer,
//           frozen array holding the closure's pointer fields,
//           byte array holding the closure's non-pointer words)

    W_ info, ptrs, nptrs, p, ptrs_arr, nptrs_arr;
    info  = %GET_STD_INFO(UNTAG(R1));

    // Some closures have non-standard layout, so we omit those here.
    W_ type;
    type = TO_W_(%INFO_TYPE(info));
    switch [0 .. N_CLOSURE_TYPES] type {
    case THUNK_SELECTOR : {
        ptrs = 1;
        nptrs = 0;
        goto out;
    }
    case THUNK, THUNK_1_0, THUNK_0_1, THUNK_2_0, THUNK_1_1, 
         THUNK_0_2, THUNK_STATIC, AP, PAP, AP_STACK, BCO : {
        // Non-standard layout: expose no payload at all.
        ptrs = 0;
        nptrs = 0;
        goto out;
    }
    default: {
        // Standard layout: field counts come straight from the info table.
        ptrs  = TO_W_(%INFO_PTRS(info)); 
        nptrs = TO_W_(%INFO_NPTRS(info));
        goto out;
    }}
out:

    W_ ptrs_arr_sz, nptrs_arr_sz;
    nptrs_arr_sz = SIZEOF_StgArrWords   + WDS(nptrs);
    ptrs_arr_sz  = SIZEOF_StgMutArrPtrs + WDS(ptrs);

    // Allocate both result arrays in one go; may GC, R1 is live.
    ALLOC_PRIM (ptrs_arr_sz + nptrs_arr_sz, R1_PTR, unpackClosurezh_fast);

    W_ clos;
    clos = UNTAG(R1);

    // Carve the two arrays out of the freshly bumped allocation:
    // the pointer array sits below the non-pointer array.
    ptrs_arr  = Hp - nptrs_arr_sz - ptrs_arr_sz + WDS(1);
    nptrs_arr = Hp - nptrs_arr_sz + WDS(1);

    SET_HDR(ptrs_arr, stg_MUT_ARR_PTRS_FROZEN_info, W_[CCCS]);
    StgMutArrPtrs_ptrs(ptrs_arr) = ptrs;
    p = 0;
for:
    // Copy the closure's pointer fields into the frozen array.
    if(p < ptrs) {
         W_[ptrs_arr + SIZEOF_StgMutArrPtrs + WDS(p)] = StgClosure_payload(clos,p);
         p = p + 1;
         goto for;
    }
    
    SET_HDR(nptrs_arr, stg_ARR_WORDS_info, W_[CCCS]);
    StgArrWords_words(nptrs_arr) = nptrs;
    p = 0;
for2:
    // Non-pointer words follow the pointers in the closure payload,
    // hence the p+ptrs offset.
    if(p < nptrs) {
         W_[BYTE_ARR_CTS(nptrs_arr) + WDS(p)] = StgClosure_payload(clos, p+ptrs);
         p = p + 1;
         goto for2;
    }
    RET_NPP(info, ptrs_arr, nptrs_arr);
}
2123
2124 /* -----------------------------------------------------------------------------
2125    Thread I/O blocking primitives
2126    -------------------------------------------------------------------------- */
2127
2128 /* Add a thread to the end of the blocked queue. (C-- version of the C
2129  * macro in Schedule.h).
2130  */
/* NOTE: no locking is done here -- every use below is compiled only
 * when !THREADED_RTS (including the mingw32 async-I/O primops).
 */
#define APPEND_TO_BLOCKED_QUEUE(tso)                    \
    ASSERT(StgTSO__link(tso) == END_TSO_QUEUE);         \
    if (W_[blocked_queue_hd] == END_TSO_QUEUE) {        \
      W_[blocked_queue_hd] = tso;                       \
    } else {                                            \
      foreign "C" setTSOLink(MyCapability() "ptr", W_[blocked_queue_tl] "ptr", tso) []; \
    }                                                   \
    W_[blocked_queue_tl] = tso;
2139
waitReadzh_fast
{
    /* waitRead# : block the current thread until the given file
     * descriptor (R1) is readable.  Non-threaded RTS only.
     */
    /* args: R1 */
#ifdef THREADED_RTS
    foreign "C" barf("waitRead# on threaded RTS") never returns;
#else

    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
    StgTSO_why_blocked(CurrentTSO) = BlockedOnRead::I16;
    // block_info records the fd we are waiting on.
    StgTSO_block_info(CurrentTSO) = R1;
    // No locking - we're not going to use this interface in the
    // threaded RTS anyway.
    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
    jump stg_block_noregs;
#endif
}
2156
waitWritezh_fast
{
    /* waitWrite# : block the current thread until the given file
     * descriptor (R1) is writable.  Non-threaded RTS only.
     */
    /* args: R1 */
#ifdef THREADED_RTS
    foreign "C" barf("waitWrite# on threaded RTS") never returns;
#else

    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
    StgTSO_why_blocked(CurrentTSO) = BlockedOnWrite::I16;
    // block_info records the fd we are waiting on.
    StgTSO_block_info(CurrentTSO) = R1;
    // No locking - we're not going to use this interface in the
    // threaded RTS anyway.
    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
    jump stg_block_noregs;
#endif
}
2173
2174
STRING(stg_delayzh_malloc_str, "delayzh_fast")
delayzh_fast
{
    /* delay# : block the current thread for (at least) R1 microseconds.
     * Non-threaded RTS only.  On mingw32 the wait is handed to the
     * async-I/O machinery; elsewhere the TSO is inserted into the
     * sleeping queue, kept sorted by wake-up target time.
     */
#ifdef mingw32_HOST_OS
    W_ ares;
    CInt reqID;
#else
    W_ t, prev, target;
#endif

#ifdef THREADED_RTS
    foreign "C" barf("delay# on threaded RTS") never returns;
#else

    /* args: R1 (microsecond delay amount) */
    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
    StgTSO_why_blocked(CurrentTSO) = BlockedOnDelay::I16;

#ifdef mingw32_HOST_OS

    /* could probably allocate this on the heap instead */
    ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
                                            stg_delayzh_malloc_str);
    (reqID) = foreign "C" addDelayRequest(R1);
    StgAsyncIOResult_reqID(ares)   = reqID;
    StgAsyncIOResult_len(ares)     = 0;
    StgAsyncIOResult_errCode(ares) = 0;
    StgTSO_block_info(CurrentTSO)  = ares;

    /* Having all async-blocked threads reside on the blocked_queue
     * simplifies matters, so change the status to OnDoProc put the
     * delayed thread on the blocked_queue.
     */
    StgTSO_why_blocked(CurrentTSO) = BlockedOnDoProc::I16;
    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
    jump stg_block_async_void;

#else

    W_ time;
    W_ divisor;
    // Convert the microsecond delay into a wake-up target expressed in
    // scheduler ticks.  tickInterval is scaled by 1000 below, so it is
    // presumably in milliseconds -- TODO confirm against RtsFlags.
    (time) = foreign "C" getourtimeofday() [R1];
    divisor = TO_W_(RtsFlags_MiscFlags_tickInterval(RtsFlags));
    if (divisor == 0) {
        // Guard against a zero tick interval; fall back to 50.
        divisor = 50;
    }
    divisor = divisor * 1000;
    target = ((R1 + divisor - 1) / divisor) /* divide rounding up */
           + time + 1; /* Add 1 as getourtimeofday rounds down */
    StgTSO_block_info(CurrentTSO) = target;

    /* Insert the new thread in the sleeping queue. */
    // Walk the queue until we find the first TSO that wakes up later
    // than us, then link ourselves in just before it.
    prev = NULL;
    t = W_[sleeping_queue];
while:
    if (t != END_TSO_QUEUE && StgTSO_block_info(t) < target) {
        prev = t;
        t = StgTSO__link(t);
        goto while;
    }

    StgTSO__link(CurrentTSO) = t;
    if (prev == NULL) {
        W_[sleeping_queue] = CurrentTSO;
    } else {
        foreign "C" setTSOLink(MyCapability() "ptr", prev "ptr", CurrentTSO) [];
    }
    jump stg_block_noregs;
#endif
#endif /* !THREADED_RTS */
}
2246
2247
2248 #ifdef mingw32_HOST_OS
STRING(stg_asyncReadzh_malloc_str, "asyncReadzh_fast")
asyncReadzh_fast
{
    /* asyncRead# : issue an asynchronous read request via the mingw32
     * I/O manager and block the current thread until it completes.
     */
    W_ ares;
    CInt reqID;

#ifdef THREADED_RTS
    foreign "C" barf("asyncRead# on threaded RTS") never returns;
#else

    /* args: R1 = fd, R2 = isSock, R3 = len, R4 = buf */
    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
    StgTSO_why_blocked(CurrentTSO) = BlockedOnRead::I16;

    /* could probably allocate this on the heap instead */
    ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
                                            stg_asyncReadzh_malloc_str)
                        [R1,R2,R3,R4];
    // 0 = read request (cf. asyncWritezh_fast, which passes 1).
    (reqID) = foreign "C" addIORequest(R1, 0/*FALSE*/,R2,R3,R4 "ptr") [];
    StgAsyncIOResult_reqID(ares)   = reqID;
    StgAsyncIOResult_len(ares)     = 0;
    StgAsyncIOResult_errCode(ares) = 0;
    // block_info points at the malloc'd result record while blocked.
    StgTSO_block_info(CurrentTSO)  = ares;
    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
    jump stg_block_async;
#endif
}
2276
STRING(stg_asyncWritezh_malloc_str, "asyncWritezh_fast")
asyncWritezh_fast
{
    /* asyncWrite# : issue an asynchronous write request via the
     * mingw32 I/O manager and block the current thread until it
     * completes.
     */
    W_ ares;
    CInt reqID;

#ifdef THREADED_RTS
    foreign "C" barf("asyncWrite# on threaded RTS") never returns;
#else

    /* args: R1 = fd, R2 = isSock, R3 = len, R4 = buf */
    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
    StgTSO_why_blocked(CurrentTSO) = BlockedOnWrite::I16;

    /* could probably allocate this on the heap instead */
    ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
                                            stg_asyncWritezh_malloc_str)
                        [R1,R2,R3,R4];
    // 1 = write request (cf. asyncReadzh_fast, which passes 0).
    (reqID) = foreign "C" addIORequest(R1, 1/*TRUE*/,R2,R3,R4 "ptr") [];

    StgAsyncIOResult_reqID(ares)   = reqID;
    StgAsyncIOResult_len(ares)     = 0;
    StgAsyncIOResult_errCode(ares) = 0;
    // block_info points at the malloc'd result record while blocked.
    StgTSO_block_info(CurrentTSO)  = ares;
    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
    jump stg_block_async;
#endif
}
2304
STRING(stg_asyncDoProczh_malloc_str, "asyncDoProczh_fast")
asyncDoProczh_fast
{
    /* asyncDoProc# : run the C procedure R1 with parameter R2 through
     * the mingw32 async-I/O machinery, blocking the current thread
     * until it completes.
     */
    W_ ares;
    CInt reqID;

#ifdef THREADED_RTS
    foreign "C" barf("asyncDoProc# on threaded RTS") never returns;
#else

    /* args: R1 = proc, R2 = param */
    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
    StgTSO_why_blocked(CurrentTSO) = BlockedOnDoProc::I16;

    /* could probably allocate this on the heap instead */
    ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
                                            stg_asyncDoProczh_malloc_str) 
                                [R1,R2];
    (reqID) = foreign "C" addDoProcRequest(R1 "ptr",R2 "ptr") [];
    StgAsyncIOResult_reqID(ares)   = reqID;
    StgAsyncIOResult_len(ares)     = 0;
    StgAsyncIOResult_errCode(ares) = 0;
    // block_info points at the malloc'd result record while blocked.
    StgTSO_block_info(CurrentTSO) = ares;
    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
    jump stg_block_async;
#endif
}
2332 #endif
2333
2334 // noDuplicate# tries to ensure that none of the thunks under
2335 // evaluation by the current thread are also under evaluation by
2336 // another thread.  It relies on *both* threads doing noDuplicate#;
2337 // the second one will get blocked if they are duplicating some work.
noDuplicatezh_fast
{
    SAVE_THREAD_STATE();
    ASSERT(StgTSO_what_next(CurrentTSO) == ThreadRunGHC::I16);
    // threadPaused suspends the current computation; it may also decide
    // this thread should die (what_next becomes ThreadKilled below).
    foreign "C" threadPaused (MyCapability() "ptr", CurrentTSO "ptr") [];
    
    if (StgTSO_what_next(CurrentTSO) == ThreadKilled::I16) {
        jump stg_threadFinished;
    } else {
        // Still alive: restore our registers and resume at the return
        // address on top of the stack.
        LOAD_THREAD_STATE();
        ASSERT(StgTSO_what_next(CurrentTSO) == ThreadRunGHC::I16);
        jump %ENTRY_CODE(Sp(0));
    }
}
2352
/* getApStackVal# : fetch payload word R2 from an AP_STACK closure.
 *
 * args: R1 = the (alleged) AP_STACK, R2 = payload offset
 * returns: (1, payload word) when R1 really is an AP_STACK,
 *          (0, R1 itself) otherwise.
 */
getApStackValzh_fast
{
   W_ stack, idx, valid, result;

   /* args: R1 = AP_STACK, R2 = offset */
   stack = R1;
   idx   = R2;

   if (%INFO_PTR(stack) != stg_AP_STACK_info) {
        /* not an AP_STACK: hand the closure itself back, flagged 0 */
        valid  = 0;
        result = R1;
   } else {
        valid  = 1;
        result = StgAP_STACK_payload(stack,idx);
   }
   RET_NP(valid,result);
}
2370
2371 /* -----------------------------------------------------------------------------
2372    Misc. primitives
2373    -------------------------------------------------------------------------- */
2374
2375 // Write the cost center stack of the first argument on stderr; return
2376 // the second.  Possibly only makes sense for already evaluated
2377 // things?
traceCcszh_fast
{
    /* args: R1 = closure whose cost-centre stack to print,
     *       R2 = the value to return (entered below) */
    W_ ccs;

#ifdef PROFILING
    // Only profiled builds carry a ccs field in the closure header.
    ccs = StgHeader_ccs(UNTAG(R1));
    foreign "C" fprintCCS_stderr(ccs "ptr") [R2];
#endif

    // Return the second argument by entering it.
    R1 = R2;
    ENTER();
}
2390
/* getSpark# : try to take a spark from the local spark pool.
 * returns: (1, spark) on success; (0, False) when the pool is empty,
 * and always on the non-threaded RTS, which has no spark pool.
 */
getSparkzh_fast
{
   W_ spark;

#ifndef THREADED_RTS
   RET_NP(0,ghczmprim_GHCziBool_False_closure);
#else
   (spark) = foreign "C" findSpark(MyCapability());
   if (spark == 0) {
      /* pool empty: nothing to hand back */
      RET_NP(0,ghczmprim_GHCziBool_False_closure);
   } else {
      RET_NP(1,spark);
   }
#endif
}