Interruptible FFI calls with pthread_kill and CancelSynchronousIO. v4

[ghc-hetmet.git] / rts / Interpreter.c
diff --git a/rts/Interpreter.c b/rts/Interpreter.c

index 81d4e38..da7ee21 100644 (file)
--- a/rts/Interpreter.c
+++ b/rts/Interpreter.c
@@ -7,27 +7,35 @@
  #include "PosixSource.h"
  #include "Rts.h"
  #include "RtsAPI.h"
+#include "rts/Bytecodes.h"
+
+// internal headers
+#include "sm/Storage.h"
+#include "sm/Sanity.h"
  #include "RtsUtils.h"
-#include "Closures.h"
-#include "TSO.h"
  #include "Schedule.h"
-#include "RtsFlags.h"
-#include "Storage.h"
-#include "LdvProfile.h"
  #include "Updates.h"
-#include "Sanity.h"
-#include "Liveness.h"
-
-#include "Bytecodes.h"
+#include "Prelude.h"
+#include "Stable.h"
  #include "Printer.h"
  #include "Disassembler.h"
  #include "Interpreter.h"
+#include "ThreadPaused.h"
+#include "Threads.h"
  
  #include <string.h>     /* for memcpy */
  #ifdef HAVE_ERRNO_H
  #include <errno.h>
  #endif
  
+// When building the RTS in the non-dyn way on Windows, we don't
+//     want declspec(__dllimport__) on the front of function prototypes
+//     from libffi.
+#if defined(mingw32_HOST_OS) && !defined(__PIC__)
+# define LIBFFI_NOT_DLL
+#endif
+
+#include "ffi.h"
  
  /* --------------------------------------------------------------------------
   * The bytecode interpreter
@@ -42,9 +50,19 @@
  /* Sp points to the lowest live word on the stack. */
  
  #define BCO_NEXT      instrs[bciPtr++]
+#define BCO_NEXT_32   (bciPtr += 2, (((StgWord) instrs[bciPtr-2]) << 16) + ((StgWord) instrs[bciPtr-1]))
+#define BCO_NEXT_64   (bciPtr += 4, (((StgWord) instrs[bciPtr-4]) << 48) + (((StgWord) instrs[bciPtr-3]) << 32) + (((StgWord) instrs[bciPtr-2]) << 16) + ((StgWord) instrs[bciPtr-1]))
+#if WORD_SIZE_IN_BITS == 32
+#define BCO_NEXT_WORD BCO_NEXT_32
+#elif WORD_SIZE_IN_BITS == 64
+#define BCO_NEXT_WORD BCO_NEXT_64
+#else
+#error Cannot cope with WORD_SIZE_IN_BITS being nether 32 nor 64
+#endif
+#define BCO_GET_LARGE_ARG ((bci & bci_FLAG_LARGE_ARGS) ? BCO_NEXT_WORD : BCO_NEXT)
+
  #define BCO_PTR(n)    (W_)ptrs[n]
  #define BCO_LIT(n)    literals[n]
-#define BCO_ITBL(n)   itbls[n]
  
  #define LOAD_STACK_POINTERS                                    \
      Sp = cap->r.rCurrentTSO->sp;                               \
@@ -52,6 +70,7 @@
      SpLim = cap->r.rCurrentTSO->stack + RESERVED_STACK_WORDS;
  
  #define SAVE_STACK_POINTERS                    \
+    ASSERT(Sp > SpLim); \
      cap->r.rCurrentTSO->sp = Sp
  
  #define RETURN_TO_SCHEDULER(todo,retcode)      \
@@ -69,11 +88,13 @@
  
  
  STATIC_INLINE StgPtr
-allocate_NONUPD (int n_words)
+allocate_NONUPD (Capability *cap, int n_words)
  {
-    return allocate(stg_max(sizeofW(StgHeader)+MIN_PAYLOAD_SIZE, n_words));
+    return allocate(cap, stg_max(sizeofW(StgHeader)+MIN_PAYLOAD_SIZE, n_words));
  }
  
+int rts_stop_next_breakpoint = 0;
+int rts_stop_on_exception = 0;
  
  #ifdef INTERP_STATS
  
@@ -94,6 +115,7 @@ int it_ofreq[27];
  int it_oofreq[27][27];
  int it_lastopc;
  
+
  #define INTERP_TICK(n) (n)++
  
  void interp_startup ( void )
@@ -166,6 +188,9 @@ static StgWord app_ptrs_itbl[] = {
      (W_)&stg_ap_pppppp_info,
  };
  
+HsStablePtr rts_breakpoint_io_action; // points to the IO action which is executed on a breakpoint
+                                // it is set in main/GHC.hs:runStmt
+
  Capability *
  interpretBCO (Capability* cap)
  {
@@ -173,11 +198,14 @@ interpretBCO (Capability* cap)
      // that these entities are non-aliasable.
      register StgPtr       Sp;    // local state -- stack pointer
      register StgPtr       SpLim; // local state -- stack lim pointer
-    register StgClosure*  obj;
+    register StgClosure   *tagged_obj = 0, *obj;
      nat n, m;
  
      LOAD_STACK_POINTERS;
  
+    cap->r.rHpLim = (P_)1; // HpLim is the context-switch flag; when it
+                           // goes to zero we must return to the scheduler.
+
      // ------------------------------------------------------------------------
      // Case 1:
      // 
@@ -189,8 +217,8 @@ interpretBCO (Capability* cap)
      //         +---------------+
      //       
      if (Sp[0] == (W_)&stg_enter_info) {
-       Sp++;
-       goto eval;
+       Sp++;
+       goto eval;
      }
  
      // ------------------------------------------------------------------------
@@ -208,7 +236,7 @@ interpretBCO (Capability* cap)
      //         +---------------+
      //       
      else if (Sp[0] == (W_)&stg_apply_interp_info) {
-       obj = (StgClosure *)Sp[1];
+       obj = UNTAG_CLOSURE((StgClosure *)Sp[1]);
         Sp += 2;
         goto run_BCO_fun;
      }
@@ -225,9 +253,10 @@ interpretBCO (Capability* cap)
  
      // Evaluate the object on top of the stack.
  eval:
-    obj = (StgClosure*)Sp[0]; Sp++;
+    tagged_obj = (StgClosure*)Sp[0]; Sp++;
  
  eval_obj:
+    obj = UNTAG_CLOSURE(tagged_obj);
      INTERP_TICK(it_total_evals);
  
      IF_DEBUG(interpreter,
@@ -241,17 +270,16 @@ eval_obj:
               debugBelch("\n\n");
              );
  
-    IF_DEBUG(sanity,checkStackChunk(Sp, cap->r.rCurrentTSO->stack+cap->r.rCurrentTSO->stack_size));
+//    IF_DEBUG(sanity,checkStackChunk(Sp, cap->r.rCurrentTSO->stack+cap->r.rCurrentTSO->stack_size));
+    IF_DEBUG(sanity,checkStackFrame(Sp));
  
      switch ( get_itbl(obj)->type ) {
  
      case IND:
-    case IND_OLDGEN:
      case IND_PERM:
-    case IND_OLDGEN_PERM:
      case IND_STATIC:
      { 
-       obj = ((StgInd*)obj)->indirectee;
+       tagged_obj = ((StgInd*)obj)->indirectee;
         goto eval_obj;
      }
      
@@ -275,8 +303,10 @@ eval_obj:
         break;
         
      case BCO:
+    {
         ASSERT(((StgBCO *)obj)->arity > 0);
         break;
+    }
  
      case AP:   /* Copied from stg_AP_entry. */
      {
@@ -289,7 +319,7 @@ eval_obj:
         // Stack check
         if (Sp - (words+sizeofW(StgUpdateFrame)) < SpLim) {
             Sp -= 2;
-           Sp[1] = (W_)obj;
+           Sp[1] = (W_)tagged_obj;
             Sp[0] = (W_)&stg_enter_info;
             RETURN_TO_SCHEDULER(ThreadInterpret, StackOverflow);
         }
@@ -309,7 +339,7 @@ eval_obj:
             Sp[i] = (W_)ap->payload[i];
         }
  
-       obj = (StgClosure*)ap->fun;
+       obj = UNTAG_CLOSURE((StgClosure*)ap->fun);
         ASSERT(get_itbl(obj)->type == BCO);
         goto run_BCO_fun;
      }
@@ -332,16 +362,17 @@ eval_obj:
                  printObj(obj);
             );
         Sp -= 2;
-       Sp[1] = (W_)obj;
+       Sp[1] = (W_)tagged_obj;
         Sp[0] = (W_)&stg_enter_info;
         RETURN_TO_SCHEDULER_NO_PAUSE(ThreadRunGHC, ThreadYielding);
      }
      }
  
      // ------------------------------------------------------------------------
-    // We now have an evaluated object (obj).  The next thing to
+    // We now have an evaluated object (tagged_obj).  The next thing to
      // do is return it to the stack frame on top of the stack.
  do_return:
+    obj = UNTAG_CLOSURE(tagged_obj);
      ASSERT(closure_HNF(obj));
  
      IF_DEBUG(interpreter,
@@ -402,8 +433,17 @@ do_return:
      case UPDATE_FRAME:
         // Returning to an update frame: do the update, pop the update
         // frame, and continue with the next stack frame.
+        //
+        // NB. we must update with the *tagged* pointer.  Some tags
+        // are not optional, and if we omit the tag bits when updating
+        // then bad things can happen (albeit very rarely).  See #1925.
+        // What happened was an indirection was created with an
+        // untagged pointer, and this untagged pointer was propagated
+        // to a PAP by the GC, violating the invariant that PAPs
+        // always contain a tagged pointer to the function.
         INTERP_TICK(it_retto_UPDATE);
-       UPD_IND(((StgUpdateFrame *)Sp)->updatee, obj); 
+        updateThunk(cap, cap->r.rCurrentTSO, 
+                    ((StgUpdateFrame *)Sp)->updatee, tagged_obj);
         Sp += sizeofW(StgUpdateFrame);
         goto do_return;
  
@@ -413,6 +453,8 @@ do_return:
         INTERP_TICK(it_retto_BCO);
         Sp--;
         Sp[0] = (W_)obj;
+        // NB. return the untagged object; the bytecode expects it to
+        // be untagged.  XXX this doesn't seem right.
         obj = (StgClosure*)Sp[2];
         ASSERT(get_itbl(obj)->type == BCO);
         goto run_BCO_return;
@@ -427,7 +469,7 @@ do_return:
                  printStackChunk(Sp,cap->r.rCurrentTSO->stack+cap->r.rCurrentTSO->stack_size);
             );
         Sp -= 2;
-       Sp[1] = (W_)obj;
+       Sp[1] = (W_)tagged_obj;
         Sp[0] = (W_)&stg_enter_info;
         RETURN_TO_SCHEDULER_NO_PAUSE(ThreadRunGHC, ThreadYielding);
      }
@@ -500,6 +542,7 @@ do_return_unboxed:
      // Application...
  
  do_apply:
+    ASSERT(obj == UNTAG_CLOSURE(tagged_obj));
      // we have a function to apply (obj), and n arguments taking up m
      // words on the stack.  The info table (stg_ap_pp_info or whatever)
      // is on top of the arguments on the stack.
@@ -513,10 +556,20 @@ do_apply:
             pap = (StgPAP *)obj;
  
             // we only cope with PAPs whose function is a BCO
-           if (get_itbl(pap->fun)->type != BCO) {
+           if (get_itbl(UNTAG_CLOSURE(pap->fun))->type != BCO) {
                 goto defer_apply_to_sched;
             }
  
+            // Stack check: we're about to unpack the PAP onto the
+            // stack.  The (+1) is for the (arity < n) case, where we
+            // also need space for an extra info pointer.
+            if (Sp - (pap->n_args + 1) < SpLim) {
+                Sp -= 2;
+                Sp[1] = (W_)tagged_obj;
+                Sp[0] = (W_)&stg_enter_info;
+                RETURN_TO_SCHEDULER(ThreadInterpret, StackOverflow);
+            }
+
             Sp++;
             arity = pap->arity;
             ASSERT(arity > 0);
@@ -538,7 +591,7 @@ do_apply:
                 for (i = 0; i < pap->n_args; i++) {
                     Sp[i] = (W_)pap->payload[i];
                 }
-               obj = pap->fun;
+               obj = UNTAG_CLOSURE(pap->fun);
                 goto run_BCO_fun;
             } 
             else if (arity == n) {
@@ -546,13 +599,13 @@ do_apply:
                 for (i = 0; i < pap->n_args; i++) {
                     Sp[i] = (W_)pap->payload[i];
                 }
-               obj = pap->fun;
+               obj = UNTAG_CLOSURE(pap->fun);
                 goto run_BCO_fun;
             } 
             else /* arity > n */ {
                 // build a new PAP and return it.
                 StgPAP *new_pap;
-               new_pap = (StgPAP *)allocate(PAP_sizeW(pap->n_args + m));
+               new_pap = (StgPAP *)allocate(cap, PAP_sizeW(pap->n_args + m));
                 SET_HDR(new_pap,&stg_PAP_info,CCCS);
                 new_pap->arity = pap->arity - n;
                 new_pap->n_args = pap->n_args + m;
@@ -563,7 +616,7 @@ do_apply:
                 for (i = 0; i < m; i++) {
                     new_pap->payload[pap->n_args + i] = (StgClosure *)Sp[i];
                 }
-               obj = (StgClosure *)new_pap;
+               tagged_obj = (StgClosure *)new_pap;
                 Sp += m;
                 goto do_return;
             }
@@ -597,7 +650,7 @@ do_apply:
                 // build a PAP and return it.
                 StgPAP *pap;
                 nat i;
-               pap = (StgPAP *)allocate(PAP_sizeW(m));
+               pap = (StgPAP *)allocate(cap, PAP_sizeW(m));
                 SET_HDR(pap, &stg_PAP_info,CCCS);
                 pap->arity = arity - n;
                 pap->fun = obj;
@@ -605,7 +658,7 @@ do_apply:
                 for (i = 0; i < m; i++) {
                     pap->payload[i] = (StgClosure *)Sp[i];
                 }
-               obj = (StgClosure *)pap;
+               tagged_obj = (StgClosure *)pap;
                 Sp += m;
                 goto do_return;
             }
@@ -615,7 +668,7 @@ do_apply:
         default:
         defer_apply_to_sched:
             Sp -= 2;
-           Sp[1] = (W_)obj;
+           Sp[1] = (W_)tagged_obj;
             Sp[0] = (W_)&stg_enter_info;
             RETURN_TO_SCHEDULER_NO_PAUSE(ThreadRunGHC, ThreadYielding);
      }
@@ -663,23 +716,26 @@ do_apply:
      // Sadly we have three different kinds of stack/heap/cswitch check
      // to do:
  
+
  run_BCO_return:
      // Heap check
-    if (doYouWantToGC()) {
+    if (doYouWantToGC(cap)) {
         Sp--; Sp[0] = (W_)&stg_enter_info;
         RETURN_TO_SCHEDULER(ThreadInterpret, HeapOverflow);
      }
      // Stack checks aren't necessary at return points, the stack use
      // is aggregated into the enclosing function entry point.
+
      goto run_BCO;
      
  run_BCO_return_unboxed:
      // Heap check
-    if (doYouWantToGC()) {
+    if (doYouWantToGC(cap)) {
         RETURN_TO_SCHEDULER(ThreadInterpret, HeapOverflow);
      }
      // Stack checks aren't necessary at return points, the stack use
      // is aggregated into the enclosing function entry point.
+
      goto run_BCO;
      
  run_BCO_fun:
@@ -692,7 +748,7 @@ run_BCO_fun:
         );
  
      // Heap check
-    if (doYouWantToGC()) {
+    if (doYouWantToGC(cap)) {
         Sp -= 2; 
         Sp[1] = (W_)obj; 
         Sp[0] = (W_)&stg_apply_interp_info; // placeholder, really
@@ -706,6 +762,7 @@ run_BCO_fun:
         Sp[0] = (W_)&stg_apply_interp_info; // placeholder, really
         RETURN_TO_SCHEDULER(ThreadInterpret, StackOverflow);
      }
+
      goto run_BCO;
      
      // Now, actually interpret the BCO... (no returning to the
@@ -713,20 +770,22 @@ run_BCO_fun:
  run_BCO:
      INTERP_TICK(it_BCO_entries);
      {
-       register int       bciPtr     = 1; /* instruction pointer */
+       register int       bciPtr = 0; /* instruction pointer */
+        register StgWord16 bci;
         register StgBCO*   bco        = (StgBCO*)obj;
         register StgWord16* instrs    = (StgWord16*)(bco->instrs->payload);
         register StgWord*  literals   = (StgWord*)(&bco->literals->payload[0]);
         register StgPtr*   ptrs       = (StgPtr*)(&bco->ptrs->payload[0]);
-       register StgInfoTable** itbls = (StgInfoTable**)
-           (&bco->itbls->payload[0]);
+       int bcoSize;
+    bcoSize = BCO_NEXT_WORD;
+       IF_DEBUG(interpreter,debugBelch("bcoSize = %d\n", bcoSize));
  
  #ifdef INTERP_STATS
         it_lastopc = 0; /* no opcode */
  #endif
  
      nextInsn:
-       ASSERT(bciPtr <= instrs[0]);
+       ASSERT(bciPtr < bcoSize);
         IF_DEBUG(interpreter,
                  //if (do_print_stack) {
                  //debugBelch("\n-- BEGIN stack\n");
@@ -745,6 +804,7 @@ run_BCO:
                  //if (do_print_stack) checkStack(Sp,cap->r.rCurrentTSO->stack+cap->r.rCurrentTSO->stack_size,iSu);
             );
  
+
         INTERP_TICK(it_insns);
  
  #ifdef INTERP_STATS
@@ -754,13 +814,109 @@ run_BCO:
         it_lastopc = (int)instrs[bciPtr];
  #endif
  
-       switch (BCO_NEXT) {
+       bci = BCO_NEXT;
+    /* We use the high 8 bits for flags, only the highest of which is
+     * currently allocated */
+    ASSERT((bci & 0xFF00) == (bci & 0x8000));
+
+    switch (bci & 0xFF) {
+
+        /* check for a breakpoint on the beginning of a let binding */
+        case bci_BRK_FUN: 
+        {
+            int arg1_brk_array, arg2_array_index, arg3_freeVars;
+            StgArrWords *breakPoints;
+            int returning_from_break;     // are we resuming execution from a breakpoint?
+                                          //  if yes, then don't break this time around
+            StgClosure *ioAction;         // the io action to run at a breakpoint
+
+            StgAP_STACK *new_aps;         // a closure to save the top stack frame on the heap
+            int i;
+            int size_words;
+
+            arg1_brk_array      = BCO_NEXT;  // 1st arg of break instruction
+            arg2_array_index    = BCO_NEXT;  // 2nd arg of break instruction
+            arg3_freeVars       = BCO_NEXT;  // 3rd arg of break instruction
+
+            // check if we are returning from a breakpoint - this info
+            // is stored in the flags field of the current TSO
+            returning_from_break = cap->r.rCurrentTSO->flags & TSO_STOPPED_ON_BREAKPOINT; 
+
+            // if we are returning from a break then skip this section
+            // and continue executing
+            if (!returning_from_break)
+            {
+               breakPoints = (StgArrWords *) BCO_PTR(arg1_brk_array);
+
+               // stop the current thread if either the
+               // "rts_stop_next_breakpoint" flag is true OR if the
+               // breakpoint flag for this particular expression is
+               // true
+               if (rts_stop_next_breakpoint == rtsTrue || 
+                   breakPoints->payload[arg2_array_index] == rtsTrue)
+               {
+                  // make sure we don't automatically stop at the
+                  // next breakpoint
+                  rts_stop_next_breakpoint = rtsFalse;
+
+                  // allocate memory for a new AP_STACK, enough to
+                  // store the top stack frame plus an
+                  // stg_apply_interp_info pointer and a pointer to
+                  // the BCO
+                  size_words = BCO_BITMAP_SIZE(obj) + 2;
+                  new_aps = (StgAP_STACK *) allocate(cap, AP_STACK_sizeW(size_words));
+                  SET_HDR(new_aps,&stg_AP_STACK_info,CCS_SYSTEM); 
+                  new_aps->size = size_words;
+                  new_aps->fun = &stg_dummy_ret_closure; 
+
+                  // fill in the payload of the AP_STACK 
+                  new_aps->payload[0] = (StgClosure *)&stg_apply_interp_info;
+                  new_aps->payload[1] = (StgClosure *)obj;
+
+                  // copy the contents of the top stack frame into the AP_STACK
+                  for (i = 2; i < size_words; i++)
+                  {
+                     new_aps->payload[i] = (StgClosure *)Sp[i-2];
+                  }
+
+                  // prepare the stack so that we can call the
+                  // rts_breakpoint_io_action and ensure that the stack is
+                  // in a reasonable state for the GC and so that
+                  // execution of this BCO can continue when we resume
+                  ioAction = (StgClosure *) deRefStablePtr (rts_breakpoint_io_action);
+                  Sp -= 8;
+                  Sp[7] = (W_)obj;
+                  Sp[6] = (W_)&stg_apply_interp_info;
+                  Sp[5] = (W_)new_aps;                 // the AP_STACK
+                  Sp[4] = (W_)BCO_PTR(arg3_freeVars);  // the info about local vars of the breakpoint
+                  Sp[3] = (W_)False_closure;            // True <=> a breakpoint
+                  Sp[2] = (W_)&stg_ap_pppv_info;
+                  Sp[1] = (W_)ioAction;                // apply the IO action to its two arguments above
+                  Sp[0] = (W_)&stg_enter_info;         // get ready to run the IO action
+                  // set the flag in the TSO to say that we are now
+                  // stopping at a breakpoint so that when we resume
+                  // we don't stop on the same breakpoint that we
+                  // already stopped at just now
+                  cap->r.rCurrentTSO->flags |= TSO_STOPPED_ON_BREAKPOINT;
+
+                  // stop this thread and return to the scheduler -
+                  // eventually we will come back and the IO action on
+                  // the top of the stack will be executed
+                  RETURN_TO_SCHEDULER_NO_PAUSE(ThreadRunGHC, ThreadYielding);
+               }
+            }
+            // record that this thread is not stopped at a breakpoint anymore
+            cap->r.rCurrentTSO->flags &= ~TSO_STOPPED_ON_BREAKPOINT;
+
+            // continue normal execution of the byte code instructions
+           goto nextInsn;
+        }
  
         case bci_STKCHECK: {
             // Explicit stack check at the beginning of a function
             // *only* (stack checks in case alternatives are
             // propagated to the enclosing function).
-           int stk_words_reqd = BCO_NEXT + 1;
+           StgWord stk_words_reqd = BCO_GET_LARGE_ARG + 1;
             if (Sp - stk_words_reqd < SpLim) {
                 Sp -= 2; 
                 Sp[1] = (W_)obj; 
@@ -921,7 +1077,7 @@ run_BCO:
         case bci_ALLOC_AP: {
             StgAP* ap; 
             int n_payload = BCO_NEXT;
-           ap = (StgAP*)allocate(AP_sizeW(n_payload));
+           ap = (StgAP*)allocate(cap, AP_sizeW(n_payload));
             Sp[-1] = (W_)ap;
             ap->n_args = n_payload;
             SET_HDR(ap, &stg_AP_info, CCS_SYSTEM/*ToDo*/)
@@ -929,11 +1085,22 @@ run_BCO:
             goto nextInsn;
         }
  
+       case bci_ALLOC_AP_NOUPD: {
+           StgAP* ap; 
+           int n_payload = BCO_NEXT;
+           ap = (StgAP*)allocate(cap, AP_sizeW(n_payload));
+           Sp[-1] = (W_)ap;
+           ap->n_args = n_payload;
+           SET_HDR(ap, &stg_AP_NOUPD_info, CCS_SYSTEM/*ToDo*/)
+           Sp --;
+           goto nextInsn;
+       }
+
         case bci_ALLOC_PAP: {
             StgPAP* pap; 
             int arity = BCO_NEXT;
             int n_payload = BCO_NEXT;
-           pap = (StgPAP*)allocate(PAP_sizeW(n_payload));
+           pap = (StgPAP*)allocate(cap, PAP_sizeW(n_payload));
             Sp[-1] = (W_)pap;
             pap->n_args = n_payload;
             pap->arity = arity;
@@ -1002,12 +1169,12 @@ run_BCO:
             int i;
             int o_itbl         = BCO_NEXT;
             int n_words        = BCO_NEXT;
-           StgInfoTable* itbl = INFO_PTR_TO_STRUCT(BCO_ITBL(o_itbl));
+           StgInfoTable* itbl = INFO_PTR_TO_STRUCT(BCO_LIT(o_itbl));
             int request        = CONSTR_sizeW( itbl->layout.payload.ptrs, 
                                                itbl->layout.payload.nptrs );
-           StgClosure* con = (StgClosure*)allocate_NONUPD(request);
+           StgClosure* con = (StgClosure*)allocate_NONUPD(cap,request);
             ASSERT( itbl->layout.payload.ptrs + itbl->layout.payload.nptrs > 0);
-           SET_HDR(con, BCO_ITBL(o_itbl), CCS_SYSTEM/*ToDo*/);
+           SET_HDR(con, (StgInfoTable*)BCO_LIT(o_itbl), CCS_SYSTEM/*ToDo*/);
             for (i = 0; i < n_words; i++) {
                 con->payload[i] = (StgClosure*)Sp[i];
             }
@@ -1023,7 +1190,7 @@ run_BCO:
  
         case bci_TESTLT_P: {
             unsigned int discr  = BCO_NEXT;
-           int failto = BCO_NEXT;
+           int failto = BCO_GET_LARGE_ARG;
             StgClosure* con = (StgClosure*)Sp[0];
             if (GET_TAG(con) >= discr) {
                 bciPtr = failto;
@@ -1033,7 +1200,7 @@ run_BCO:
  
         case bci_TESTEQ_P: {
             unsigned int discr  = BCO_NEXT;
-           int failto = BCO_NEXT;
+           int failto = BCO_GET_LARGE_ARG;
             StgClosure* con = (StgClosure*)Sp[0];
             if (GET_TAG(con) != discr) {
                 bciPtr = failto;
@@ -1044,7 +1211,7 @@ run_BCO:
         case bci_TESTLT_I: {
             // There should be an Int at Sp[1], and an info table at Sp[0].
             int discr   = BCO_NEXT;
-           int failto  = BCO_NEXT;
+           int failto  = BCO_GET_LARGE_ARG;
             I_ stackInt = (I_)Sp[1];
             if (stackInt >= (I_)BCO_LIT(discr))
                 bciPtr = failto;
@@ -1054,7 +1221,7 @@ run_BCO:
         case bci_TESTEQ_I: {
             // There should be an Int at Sp[1], and an info table at Sp[0].
             int discr   = BCO_NEXT;
-           int failto  = BCO_NEXT;
+           int failto  = BCO_GET_LARGE_ARG;
             I_ stackInt = (I_)Sp[1];
             if (stackInt != (I_)BCO_LIT(discr)) {
                 bciPtr = failto;
@@ -1062,10 +1229,31 @@ run_BCO:
             goto nextInsn;
         }
  
+       case bci_TESTLT_W: {
+           // There should be an Int at Sp[1], and an info table at Sp[0].
+           int discr   = BCO_NEXT;
+           int failto  = BCO_GET_LARGE_ARG;
+           W_ stackWord = (W_)Sp[1];
+           if (stackWord >= (W_)BCO_LIT(discr))
+               bciPtr = failto;
+           goto nextInsn;
+       }
+
+       case bci_TESTEQ_W: {
+           // There should be an Int at Sp[1], and an info table at Sp[0].
+           int discr   = BCO_NEXT;
+           int failto  = BCO_GET_LARGE_ARG;
+           W_ stackWord = (W_)Sp[1];
+           if (stackWord != (W_)BCO_LIT(discr)) {
+               bciPtr = failto;
+           }
+           goto nextInsn;
+       }
+
         case bci_TESTLT_D: {
             // There should be a Double at Sp[1], and an info table at Sp[0].
             int discr   = BCO_NEXT;
-           int failto  = BCO_NEXT;
+           int failto  = BCO_GET_LARGE_ARG;
             StgDouble stackDbl, discrDbl;
             stackDbl = PK_DBL( & Sp[1] );
             discrDbl = PK_DBL( & BCO_LIT(discr) );
@@ -1078,7 +1266,7 @@ run_BCO:
         case bci_TESTEQ_D: {
             // There should be a Double at Sp[1], and an info table at Sp[0].
             int discr   = BCO_NEXT;
-           int failto  = BCO_NEXT;
+           int failto  = BCO_GET_LARGE_ARG;
             StgDouble stackDbl, discrDbl;
             stackDbl = PK_DBL( & Sp[1] );
             discrDbl = PK_DBL( & BCO_LIT(discr) );
@@ -1091,7 +1279,7 @@ run_BCO:
         case bci_TESTLT_F: {
             // There should be a Float at Sp[1], and an info table at Sp[0].
             int discr   = BCO_NEXT;
-           int failto  = BCO_NEXT;
+           int failto  = BCO_GET_LARGE_ARG;
             StgFloat stackFlt, discrFlt;
             stackFlt = PK_FLT( & Sp[1] );
             discrFlt = PK_FLT( & BCO_LIT(discr) );
@@ -1104,7 +1292,7 @@ run_BCO:
         case bci_TESTEQ_F: {
             // There should be a Float at Sp[1], and an info table at Sp[0].
             int discr   = BCO_NEXT;
-           int failto  = BCO_NEXT;
+           int failto  = BCO_GET_LARGE_ARG;
             StgFloat stackFlt, discrFlt;
             stackFlt = PK_FLT( & Sp[1] );
             discrFlt = PK_FLT( & BCO_LIT(discr) );
@@ -1121,14 +1309,14 @@ run_BCO:
             // context switching: sometimes the scheduler can invoke
             // the interpreter with context_switch == 1, particularly
             // if the -C0 flag has been given on the cmd line.
-           if (context_switch) {
+           if (cap->r.rHpLim == NULL) {
                 Sp--; Sp[0] = (W_)&stg_enter_info;
                 RETURN_TO_SCHEDULER(ThreadInterpret, ThreadYielding);
             }
             goto eval;
  
         case bci_RETURN:
-           obj = (StgClosure *)Sp[0];
+           tagged_obj = (StgClosure *)Sp[0];
             Sp++;
             goto do_return;
  
@@ -1168,21 +1356,71 @@ run_BCO:
             void *tok;
             int stk_offset            = BCO_NEXT;
             int o_itbl                = BCO_NEXT;
+           int interruptible         = BCO_NEXT;
             void(*marshall_fn)(void*) = (void (*)(void*))BCO_LIT(o_itbl);
             int ret_dyn_size = 
                 RET_DYN_BITMAP_SIZE + RET_DYN_NONPTR_REGS_SIZE
                 + sizeofW(StgRetDyn);
  
-#ifdef THREADED_RTS
-           // Threaded RTS:
-           // Arguments on the TSO stack are not good, because garbage
-           // collection might move the TSO as soon as we call
-           // suspendThread below.
-
-           W_ arguments[stk_offset];
-           
-           memcpy(arguments, Sp, sizeof(W_) * stk_offset);
-#endif
+            /* the stack looks like this:
+               
+               |             |  <- Sp + stk_offset
+               +-------------+  
+               |             |
+               |    args     |
+               |             |  <- Sp + ret_size + 1
+               +-------------+
+               |    C fun    |  <- Sp + ret_size
+               +-------------+
+               |     ret     |  <- Sp
+               +-------------+
+
+               ret is a placeholder for the return address, and may be
+               up to 2 words.
+
+               We need to copy the args out of the TSO, because when
+               we call suspendThread() we no longer own the TSO stack,
+               and it may move at any time - indeed suspendThread()
+               itself may do stack squeezing and move our args.
+               So we make a copy of the argument block.
+            */
+
+#define ROUND_UP_WDS(p)  ((((StgWord)(p)) + sizeof(W_)-1)/sizeof(W_))
+
+            ffi_cif *cif = (ffi_cif *)marshall_fn;
+            nat nargs = cif->nargs;
+            nat ret_size;
+            nat i;
+            StgPtr p;
+            W_ ret[2];                  // max needed
+           W_ *arguments[stk_offset];  // max needed
+            void *argptrs[nargs];
+            void (*fn)(void);
+
+            if (cif->rtype->type == FFI_TYPE_VOID) {
+                // necessary because cif->rtype->size == 1 for void,
+                // but the bytecode generator has not pushed a
+                // placeholder in this case.
+                ret_size = 0;
+            } else {
+                ret_size = ROUND_UP_WDS(cif->rtype->size);
+            }
+
+           memcpy(arguments, Sp+ret_size+1, 
+                   sizeof(W_) * (stk_offset-1-ret_size));
+            
+            // libffi expects the args as an array of pointers to
+            // values, so we have to construct this array before making
+            // the call.
+            p = (StgPtr)arguments;
+            for (i = 0; i < nargs; i++) {
+                argptrs[i] = (void *)p;
+                // get the size from the cif
+                p += ROUND_UP_WDS(cif->arg_types[i]->size);
+            }
+
+            // this is the function we're going to call
+            fn = (void(*)(void))Sp[ret_size];
  
             // Restore the Haskell thread's current value of errno
             errno = cap->r.rCurrentTSO->saved_errno;
@@ -1198,58 +1436,59 @@ run_BCO:
             // on the stack frame to describe this chunk of stack.
             //
             Sp -= ret_dyn_size;
-           ((StgRetDyn *)Sp)->liveness = NO_PTRS | N_NONPTRS(stk_offset);
+           ((StgRetDyn *)Sp)->liveness = R1_PTR | N_NONPTRS(stk_offset);
             ((StgRetDyn *)Sp)->info = (StgInfoTable *)&stg_gc_gen_info;
  
-           SAVE_STACK_POINTERS;
-           tok = suspendThread(&cap->r);
+            // save obj (pointer to the current BCO), since this
+            // might move during the call.  We use the R1 slot in the
+            // RET_DYN frame for this, hence R1_PTR above.
+            ((StgRetDyn *)Sp)->payload[0] = (StgClosure *)obj;
  
-#ifndef THREADED_RTS
-           // Careful:
-           // suspendThread might have shifted the stack
-           // around (stack squeezing), so we have to grab the real
-           // Sp out of the TSO to find the ccall args again.
+           SAVE_STACK_POINTERS;
+           tok = suspendThread(&cap->r, interruptible ? rtsTrue : rtsFalse);
  
-           marshall_fn ( (void*)(cap->r.rCurrentTSO->sp + ret_dyn_size) );
-#else
-           // Threaded RTS:
             // We already made a copy of the arguments above.
-
-           marshall_fn ( arguments );
-#endif
+            ffi_call(cif, fn, ret, argptrs);
  
             // And restart the thread again, popping the RET_DYN frame.
-           cap = (Capability *)((void *)((unsigned char*)resumeThread(tok) - sizeof(StgFunTable)));
+           cap = (Capability *)((void *)((unsigned char*)resumeThread(tok) - STG_FIELD_OFFSET(Capability,r)));
             LOAD_STACK_POINTERS;
+
+            // Re-load the pointer to the BCO from the RET_DYN frame,
+            // it might have moved during the call.  Also reload the
+            // pointers to the components of the BCO.
+            obj        = ((StgRetDyn *)Sp)->payload[0];
+            bco        = (StgBCO*)obj;
+            instrs     = (StgWord16*)(bco->instrs->payload);
+            literals   = (StgWord*)(&bco->literals->payload[0]);
+            ptrs       = (StgPtr*)(&bco->ptrs->payload[0]);
+
             Sp += ret_dyn_size;
             
             // Save the Haskell thread's current value of errno
             cap->r.rCurrentTSO->saved_errno = errno;
                 
-#ifdef THREADED_RTS
-           // Threaded RTS:
-           // Copy the "arguments", which might include a return value,
-           // back to the TSO stack. It would of course be enough to
-           // just copy the return value, but we don't know the offset.
-           memcpy(Sp, arguments, sizeof(W_) * stk_offset);
-#endif
+           // Copy the return value back to the TSO stack.  It is at
+            // most 2 words large, and resides at arguments[0].
+            memcpy(Sp, ret, sizeof(W_) * stg_min(stk_offset,ret_size));
  
             goto nextInsn;
         }
  
         case bci_JMP: {
             /* BCO_NEXT modifies bciPtr, so be conservative. */
-           int nextpc = BCO_NEXT;
+           int nextpc = BCO_GET_LARGE_ARG;
             bciPtr     = nextpc;
             goto nextInsn;
         }
-
+ 
         case bci_CASEFAIL:
             barf("interpretBCO: hit a CASEFAIL");
             
             // Errors
         default: 
-           barf("interpretBCO: unknown or unimplemented opcode");
+           barf("interpretBCO: unknown or unimplemented opcode %d",
+                 (int)(bci & 0xFF));
  
         } /* switch on opcode */
      }