compiler/ghci/ByteCodeFFI.lhs

   1 %
   2 % (c) The University of Glasgow 2001-2006
   3 %
   4
   5 ByteCodeGen: Generate machine-code sequences for foreign import
   6
   7 \begin{code}
   8 {-# OPTIONS -w #-}
   9 -- The above warning supression flag is a temporary kludge.
  10 -- While working on this module you are encouraged to remove it and fix
  11 -- any warnings in the module. See
  12 --     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
  13 -- for details
  14
  15 #ifdef USE_LIBFFI
  16
  17 module ByteCodeFFI ( moan64, newExec ) where
  18
  19 import Outputable
  20 import System.IO
  21 import Foreign
  22 import Foreign.C
  23
  24 #else
  25
  26 module ByteCodeFFI ( mkMarshalCode, moan64, newExec ) where
  27
  28 #include "HsVersions.h"
  29
  30 import TyCon
  31 import Outputable
  32 import SMRep
  33 import ForeignCall
  34 import Panic
  35
  36 -- DON'T remove apparently unused imports here ..
  37 -- there is ifdeffery below
  38 import Control.Exception ( throwDyn )
  39 import Data.Bits        ( Bits(..), shiftR, shiftL )
  40 import Data.List        ( mapAccumL )
  41
  42 import Data.Word        ( Word8, Word32 )
  43 import Foreign          ( Ptr, FunPtr, castPtrToFunPtr,
  44                           Storable, sizeOf, pokeArray )
  45 import Foreign.C        ( CUInt )
  46 import System.IO.Unsafe ( unsafePerformIO )
  47 import System.IO        ( hPutStrLn, stderr )
  48 -- import Debug.Trace   ( trace )
  49 \end{code}
  50
  51 %************************************************************************
  52 %*                                                                      *
  53 \subsection{The platform-dependent marshall-code-generator.}
  54 %*                                                                      *
  55 %************************************************************************
  56
  57 \begin{code}
  58
  59 -- For sparc_TARGET_ARCH, i386_TARGET_ARCH, etc.
  60 #include "nativeGen/NCG.h"
  61
  62 {-
  63 Make a piece of code which expects to see the Haskell stack
  64 looking like this.  It is given a pointer to the lowest word in
  65 the stack -- presumably the tag of the placeholder.
  66
  67                   <arg_n>
  68                   ...
  69                   <arg_1>
  70                   Addr# address_of_C_fn
  71                   <placeholder-for-result#> (must be an unboxed type)
  72
  73 We cope with both ccall and stdcall for the C fn.  However, this code
  74 itself expects only to be called using the ccall convention -- that is,
  75 we don't clear our own (single) arg off the C stack.
  76 -}
  77 mkMarshalCode :: CCallConv
  78               -> (Int, PrimRep) -> Int -> [(Int, PrimRep)]
  79               -> IO (FunPtr ())
  80 mkMarshalCode cconv (r_offW, r_rep) addr_offW arg_offs_n_reps
  81    = let bytes = mkMarshalCode_wrk cconv (r_offW, r_rep)
  82                                    addr_offW arg_offs_n_reps
  83      in  newExec bytes
  84
  85 mkMarshalCode_wrk :: CCallConv
  86                   -> (Int, PrimRep) -> Int -> [(Int, PrimRep)]
  87                   -> [Word8]
  88
  89 mkMarshalCode_wrk cconv (r_offW, r_rep) addr_offW arg_offs_n_reps
  90
  91 #if i386_TARGET_ARCH
  92
  93    = let -- Don't change this without first consulting Intel Corp :-)
  94          bytes_per_word = 4
  95
  96          offsets_to_pushW
  97             = concat
  98               [   -- reversed because x86 is little-endian
  99                   reverse [a_offW .. a_offW + primRepSizeW a_rep - 1]
 100
 101                 -- reversed because args are pushed L -> R onto C stack
 102                 | (a_offW, a_rep) <- reverse arg_offs_n_reps
 103               ]
 104
 105          arguments_size = bytes_per_word * length offsets_to_pushW
 106 #if darwin_TARGET_OS
 107              -- Darwin: align stack frame size to a multiple of 16 bytes
 108          stack_frame_size = (arguments_size + 15) .&. complement 15
 109          stack_frame_pad = stack_frame_size - arguments_size
 110 #else
 111          stack_frame_size = arguments_size
 112 #endif
 113
 114          -- some helpers to assemble x86 insns.
 115          movl_offespmem_esi offB        -- movl   offB(%esp), %esi
 116             = [0x8B, 0xB4, 0x24] ++ lit32 offB
 117          movl_offesimem_ecx offB        -- movl   offB(%esi), %ecx
 118             = [0x8B, 0x8E] ++ lit32 offB
 119          save_regs                      -- pushl  all intregs except %esp
 120             = [0x50, 0x53, 0x51, 0x52, 0x56, 0x57, 0x55]
 121          restore_regs                   -- popl   ditto
 122             = [0x5D, 0x5F, 0x5E, 0x5A, 0x59, 0x5B, 0x58]
 123          pushl_ecx                      -- pushl  %ecx
 124             = [0x51]
 125          call_star_ecx                  -- call   * %ecx
 126             = [0xFF, 0xD1]
 127          add_lit_esp lit                -- addl   $lit, %esp
 128             = [0x81, 0xC4] ++ lit32 lit
 129          movl_eax_offesimem offB        -- movl   %eax, offB(%esi)
 130             = [0x89, 0x86] ++ lit32 offB
 131          movl_edx_offesimem offB        -- movl   %edx, offB(%esi)
 132             = [0x89, 0x96] ++ lit32 offB
 133          ret                            -- ret
 134             = [0xC3]
 135          fstpl_offesimem offB           -- fstpl   offB(%esi)
 136             = [0xDD, 0x9E] ++ lit32 offB
 137          fstps_offesimem offB           -- fstps   offB(%esi)
 138             = [0xD9, 0x9E] ++ lit32 offB
 139          {-
 140              2 0000 8BB42478    movl    0x12345678(%esp), %esi
 141              2      563412
 142              3 0007 8B8E7856    movl    0x12345678(%esi), %ecx
 143              3      3412
 144              4
 145              5 000d 50535152    pushl %eax ; pushl %ebx ; pushl %ecx ; pushl  %edx
 146              6 0011 565755      pushl %esi ; pushl %edi ; pushl %ebp
 147              7
 148              8 0014 5D5F5E      popl %ebp ; popl %edi ; popl %esi
 149              9 0017 5A595B58    popl %edx ; popl %ecx ; popl %ebx ; popl %eax
 150             10
 151             11 001b 51          pushl %ecx
 152             12 001c FFD1        call * %ecx
 153             13
 154             14 001e 81C47856    addl $0x12345678, %esp
 155             14      3412
 156             15 0024 89867856    movl %eax, 0x12345678(%esi)
 157             15      3412
 158             16 002a 89967856    movl %edx, 0x12345678(%esi)
 159             16      3412
 160             17
 161             18 0030 DD967856    fstl    0x12345678(%esi)
 162             18      3412
 163             19 0036 DD9E7856    fstpl   0x12345678(%esi)
 164             19      3412
 165             20 003c D9967856    fsts    0x12345678(%esi)
 166             20      3412
 167             21 0042 D99E7856    fstps   0x12345678(%esi)
 168             18
 169             19 0030 C3          ret
 170             20
 171
 172          -}
 173
 174      in
 175      --trace (show (map fst arg_offs_n_reps))
 176      (
 177      {- On entry, top of C stack 0(%esp) is the RA and 4(%esp) is
 178         arg passed from the interpreter.
 179
 180         Push all callee saved regs.  Push all of them anyway ...
 181            pushl       %eax
 182            pushl       %ebx
 183            pushl       %ecx
 184            pushl       %edx
 185            pushl       %esi
 186            pushl       %edi
 187            pushl       %ebp
 188      -}
 189      save_regs
 190
 191      {- Now 28+0(%esp) is RA and 28+4(%esp) is the arg (the H stack ptr).
 192         We'll use %esi as a temporary to point at the H stack, and
 193         %ecx as a temporary to copy via.
 194
 195            movl        28+4(%esp), %esi
 196      -}
 197      ++ movl_offespmem_esi 32
 198
 199 #if darwin_TARGET_OS
 200      {- On Darwin, add some padding so that the stack stays aligned. -}
 201      ++ (if stack_frame_pad /= 0
 202             then add_lit_esp (-stack_frame_pad)
 203             else [])
 204 #endif
 205
 206      {- For each arg in args_offs_n_reps, examine the associated
 207         CgRep to determine how many words there are.  This gives a
 208         bunch of offsets on the H stack to copy to the C stack:
 209
 210            movl        off1(%esi), %ecx
 211            pushl       %ecx
 212      -}
 213      ++ concatMap (\offW -> movl_offesimem_ecx (bytes_per_word * offW)
 214                             ++ pushl_ecx)
 215                   offsets_to_pushW
 216
 217      {- Get the addr to call into %ecx, bearing in mind that there's
 218         an Addr# tag at the indicated location, and do the call:
 219
 220            movl        4*(1 /*tag*/ +addr_offW)(%esi), %ecx
 221            call        * %ecx
 222      -}
 223      ++ movl_offesimem_ecx (bytes_per_word * addr_offW)
 224      ++ call_star_ecx
 225
 226      {- Nuke the args just pushed and re-establish %esi at the
 227         H-stack ptr:
 228
 229            addl        $4*number_of_args_pushed, %esp (ccall only)
 230            movl        28+4(%esp), %esi
 231      -}
 232      ++ (if   cconv /= StdCallConv
 233          then add_lit_esp stack_frame_size
 234          else [])
 235      ++ movl_offespmem_esi 32
 236
 237      {- Depending on what the return type is, get the result
 238         from %eax or %edx:%eax or %st(0).
 239
 240            movl        %eax, 4(%esi)        -- assuming tagged result
 241         or
 242            movl        %edx, 4(%esi)
 243            movl        %eax, 8(%esi)
 244         or
 245            fstpl       4(%esi)
 246         or
 247            fstps       4(%esi)
 248      -}
 249      ++ let i32 = movl_eax_offesimem 0
 250             i64 = movl_eax_offesimem 0 ++ movl_edx_offesimem 4
 251             f32 = fstps_offesimem 0
 252             f64 = fstpl_offesimem 0
 253         in
 254         case r_rep of
 255            VoidRep   -> []
 256            IntRep    -> i32
 257            WordRep   -> i32
 258            Int64Rep  -> i64
 259            Word64Rep -> i64
 260            AddrRep   -> i32
 261            FloatRep  -> f32
 262            DoubleRep -> f64
 263            other     -> moan64 "ByteCodeFFI.mkMarshalCode_wrk(x86)"
 264                                (ppr r_rep)
 265
 266      {- Restore all the pushed regs and go home.
 267
 268            pushl        %ebp
 269            pushl        %edi
 270            pushl        %esi
 271            pushl        %edx
 272            pushl        %ecx
 273            pushl        %ebx
 274            pushl        %eax
 275
 276            ret
 277      -}
 278      ++ restore_regs
 279      ++ ret
 280      )
 281
 282 #elif x86_64_TARGET_ARCH
 283
 284    =
 285      -- the address of the H stack is in %rdi.  We need to move it out, so
 286      -- we can use %rdi as an arg reg for the following call:
 287     pushq_rbp ++
 288     movq_rdi_rbp ++
 289
 290      -- ####### load / push the args
 291
 292      let
 293         (stack_args, fregs_unused, reg_loads) =
 294            load_arg_regs arg_offs_n_reps int_loads float_loads []
 295
 296         tot_arg_size = bytes_per_word * length stack_args
 297
 298         -- On entry to the called function, %rsp should be aligned
 299         -- on a 16-byte boundary +8 (i.e. the first stack arg after
 300         -- the return address is 16-byte aligned).  In STG land
 301         -- %rsp is kept 16-byte aligned (see StgCRun.c), so we just
 302         -- need to make sure we push a multiple of 16-bytes of args,
 303         -- plus the return address, to get the correct alignment.
 304         (real_size, adjust_rsp)
 305           | tot_arg_size `rem` 16 == 0    = (tot_arg_size, [])
 306           | otherwise                     = (tot_arg_size + 8, subq_lit_rsp 8)
 307
 308         (stack_pushes, stack_words) =
 309                 push_args stack_args [] 0
 310
 311         -- we need to know the number of SSE regs used in the call, see later
 312         n_sse_regs_used = length float_loads - length fregs_unused
 313      in
 314         concat reg_loads
 315      ++ adjust_rsp
 316      ++ concat stack_pushes -- push in reverse order
 317
 318      -- ####### make the call
 319
 320         -- use %r10 to make the call, because we don't have to save it.
 321         --      movq 8*addr_offW(%rbp), %r10
 322      ++ movq_rbpoff_r10 (bytes_per_word * addr_offW)
 323
 324         -- The x86_64 ABI requires us to set %al to the number of SSE
 325         -- registers that contain arguments, if the called routine
 326         -- is a varargs function.  We don't know whether it's a
 327         -- varargs function or not, so we have to assume it is.
 328         --
 329         -- It's not safe to omit this assignment, even if the number
 330         -- of SSE regs in use is zero.  If %al is larger than 8
 331         -- on entry to a varargs function, seg faults ensue.
 332      ++ movq_lit_rax n_sse_regs_used
 333      ++ call_star_r10
 334
 335         -- pop the args from the stack, only in ccall mode
 336         -- (in stdcall the callee does it).
 337      ++ (if   cconv /= StdCallConv
 338          then addq_lit_rsp real_size
 339          else [])
 340
 341      -- ####### place the result in the right place and return
 342
 343      ++ assign_result
 344      ++ popq_rbp
 345      ++ ret
 346
 347   where
 348      bytes_per_word = 8
 349
 350      -- int arg regs: rdi,rsi,rdx,rcx,r8,r9
 351      -- flt arg regs: xmm0..xmm7
 352      int_loads   = [ movq_rbpoff_rdi, movq_rbpoff_rsi, movq_rbpoff_rdx,
 353                      movq_rbpoff_rcx, movq_rbpoff_r8,  movq_rbpoff_r9 ]
 354      float_loads = [ 0..7 ]
 355
 356      load_arg_regs args [] [] code     =  (args, [], code)
 357      load_arg_regs [] iregs fregs code =  ([], fregs, code)
 358      load_arg_regs ((off,rep):args) iregs fregs code
 359         | FloatRep  <- rep =
 360             case fregs of
 361               [] -> push_this_arg
 362               n : frest ->
 363                 load_arg_regs args iregs frest
 364                       (mov_f32_rbpoff_xmm n (bytes_per_word * off) : code)
 365         | DoubleRep <- rep =
 366             case fregs of
 367               [] -> push_this_arg
 368               n : frest ->
 369                 load_arg_regs args iregs frest
 370                        (mov_f64_rbpoff_xmm n (bytes_per_word * off) : code)
 371         | (mov_reg:irest) <- iregs =
 372                 load_arg_regs args irest fregs (mov_reg (bytes_per_word * off) : code)
 373         | otherwise =
 374                  push_this_arg
 375         where
 376            push_this_arg = ((off,rep):args',fregs', code')
 377                 where (args',fregs',code') = load_arg_regs args iregs fregs code
 378
 379      push_args [] code pushed_words = (code, pushed_words)
 380      push_args ((off,rep):args) code pushed_words
 381         | FloatRep  <- rep =
 382                 push_args args (push_f32_rbpoff (bytes_per_word * off) : code)
 383                         (pushed_words+1)
 384         | DoubleRep <- rep =
 385                 push_args args (push_f64_rbpoff (bytes_per_word * off) : code)
 386                         (pushed_words+1)
 387         | otherwise =
 388                 push_args args (pushq_rbpoff (bytes_per_word * off) : code)
 389                         (pushed_words+1)
 390
 391
 392      assign_result =
 393         case r_rep of
 394           DoubleRep -> f64
 395           FloatRep  -> f32
 396           VoidRep   -> []
 397           _other    -> i64
 398         where
 399           i64 = movq_rax_rbpoff 0
 400           f32 = mov_f32_xmm0_rbpoff 0
 401           f64 = mov_f64_xmm0_rbpoff 0
 402
 403 --    ######### x86_64 machine code:
 404
 405 --   0:   48 89 fd                mov    %rdi,%rbp
 406 --   3:   48 8b bd 78 56 34 12    mov    0x12345678(%rbp),%rdi
 407 --   a:   48 8b b5 78 56 34 12    mov    0x12345678(%rbp),%rsi
 408 --  11:   48 8b 95 78 56 34 12    mov    0x12345678(%rbp),%rdx
 409 --  18:   48 8b 8d 78 56 34 12    mov    0x12345678(%rbp),%rcx
 410 --  1f:   4c 8b 85 78 56 34 12    mov    0x12345678(%rbp),%r8
 411 --  26:   4c 8b 8d 78 56 34 12    mov    0x12345678(%rbp),%r9
 412 --  2d:   4c 8b 95 78 56 34 12    mov    0x12345678(%rbp),%r10
 413 --  34:   48 c7 c0 78 56 34 12    mov    $0x12345678,%rax
 414 --  3b:   48 89 85 78 56 34 12    mov    %rax,0x12345678(%rbp)
 415 --  42:   f3 0f 10 bd 78 56 34 12 movss  0x12345678(%rbp),%xmm7
 416 --  4a:   f2 0f 10 9d 78 56 34 12 movsd  0x12345678(%rbp),%xmm3
 417 --  52:   f2 44 0f 10 85 78 56 34 12 movsd  0x12345678(%rbp),%xmm8
 418 --  5b:   f3 0f 11 9d 78 56 34 12 movss  %xmm3,0x12345678(%rbp)
 419 --  63:   f2 0f 11 9d 78 56 34 12 movsd  %xmm3,0x12345678(%rbp)
 420 --  6b:   f2 44 0f 11 85 78 56 34 12 movsd  %xmm8,0x12345678(%rbp)
 421 --  74:   ff b5 78 56 34 12       pushq  0x12345678(%rbp)
 422 --  7a:   f3 44 0f 11 04 24       movss  %xmm8,(%rsp)
 423 --  80:   f2 44 0f 11 04 24       movsd  %xmm8,(%rsp)
 424 --  86:   48 81 ec 78 56 34 12    sub    $0x12345678,%rsp
 425 --  8d:   48 81 c4 78 56 34 12    add    $0x12345678,%rsp
 426 --  94:   41 ff d2                callq  *%r10
 427 --  97:   55                      push   %rbp
 428 --  98:   5d                      pop    %rbp
 429 --  99:   c3                      retq
 430
 431      movq_rdi_rbp         = [0x48,0x89,0xfd]
 432      movq_rbpoff_rdi  off = [0x48, 0x8b, 0xbd] ++ lit32 off
 433      movq_rbpoff_rsi  off = [0x48, 0x8b, 0xb5] ++ lit32 off
 434      movq_rbpoff_rdx  off = [0x48, 0x8b, 0x95] ++ lit32 off
 435      movq_rbpoff_rcx  off = [0x48, 0x8b, 0x8d] ++ lit32 off
 436      movq_rbpoff_r8   off = [0x4c, 0x8b, 0x85] ++ lit32 off
 437      movq_rbpoff_r9   off = [0x4c, 0x8b, 0x8d] ++ lit32 off
 438      movq_rbpoff_r10  off = [0x4c, 0x8b, 0x95] ++ lit32 off
 439      movq_lit_rax     lit = [0x48, 0xc7, 0xc0] ++ lit32 lit
 440      movq_rax_rbpoff  off = [0x48, 0x89, 0x85] ++ lit32 off
 441      mov_f32_rbpoff_xmm n off
 442          = 0xf3 : if n >= 8 then 0x44 : rest else rest
 443          where rest = [0x0f, 0x10, 0x85 + (n.&.7)`shiftL`3] ++ lit32 off
 444      mov_f64_rbpoff_xmm n off
 445          = 0xf2 : if n >= 8 then 0x44 : rest else rest
 446          where rest = [0x0f, 0x10, 0x85 + (n.&.7)`shiftL`3] ++ lit32 off
 447      mov_f32_xmm0_rbpoff  off = [0xf3, 0x0f, 0x11, 0x85] ++ lit32 off
 448      mov_f64_xmm0_rbpoff  off = [0xf2, 0x0f, 0x11, 0x85] ++ lit32 off
 449      pushq_rbpoff     off = [0xff, 0xb5] ++ lit32 off
 450      push_f32_rbpoff  off =
 451         subq_lit_rsp 8 ++                        -- subq $8, %rsp
 452         mov_f32_rbpoff_xmm 8 off ++              -- movss off(%rbp), %xmm8
 453         [0xf3, 0x44, 0x0f, 0x11, 0x04, 0x24]     -- movss %xmm8, (%rsp)
 454      push_f64_rbpoff  off =
 455         subq_lit_rsp 8 ++                        -- subq $8, %rsp
 456         mov_f64_rbpoff_xmm 8 off ++              -- movsd off(%rbp), %xmm8
 457         [0xf2, 0x44, 0x0f, 0x11, 0x04, 0x24]     -- movsd %xmm8, (%rsp)
 458      subq_lit_rsp     lit = [0x48, 0x81, 0xec] ++ lit32 lit
 459      addq_lit_rsp     lit = [0x48, 0x81, 0xc4] ++ lit32 lit
 460      call_star_r10 = [0x41,0xff,0xd2]
 461      ret = [0xc3]
 462      pushq_rbp = [0x55]
 463      popq_rbp = [0x5d]
 464
 465 #elif sparc_TARGET_ARCH
 466
 467    = let -- At least for sparc V8
 468          bytes_per_word = 4
 469
 470          -- speaks for itself
 471          w32_to_w8s_bigEndian :: Word32 -> [Word8]
 472          w32_to_w8s_bigEndian w
 473             =  [fromIntegral (0xFF .&. (w `shiftR` 24)),
 474                 fromIntegral (0xFF .&. (w `shiftR` 16)),
 475                 fromIntegral (0xFF .&. (w `shiftR` 8)),
 476                 fromIntegral (0xFF .&. w)]
 477
 478          offsets_to_pushW
 479             = concat
 480               [  [a_offW .. a_offW + primRepSizeW a_rep - 1]
 481
 482                 | (a_offW, a_rep) <- arg_offs_n_reps
 483               ]
 484
 485          total_argWs    = length offsets_to_pushW
 486          argWs_on_stack = if total_argWs > 6 then total_argWs - 6
 487                                              else 0
 488
 489          -- The stack pointer must be kept 8-byte aligned, which means
 490          -- we need to calculate this quantity too
 491          argWs_on_stack_ROUNDED_UP
 492             | odd argWs_on_stack = 1 + argWs_on_stack
 493             | otherwise          = argWs_on_stack
 494
 495          -- some helpers to assemble sparc insns.
 496          -- REGS
 497          iReg, oReg, gReg, fReg :: Int -> Word32
 498          iReg = fromIntegral . (+ 24)
 499          oReg = fromIntegral . (+ 8)
 500          gReg = fromIntegral . (+ 0)
 501          fReg = fromIntegral
 502
 503          sp = oReg 6
 504          i0 = iReg 0
 505          i7 = iReg 7
 506          o0 = oReg 0
 507          o1 = oReg 1
 508          o7 = oReg 7
 509          g0 = gReg 0
 510          g1 = gReg 1
 511          f0 = fReg 0
 512          f1 = fReg 1
 513
 514          -- INSN templates
 515          insn_r_r_i :: Word32 -> Word32 -> Word32 -> Int -> Word32
 516          insn_r_r_i op3 rs1 rd imm13
 517             = (3 `shiftL` 30)
 518               .|. (rs1 `shiftL` 25)
 519               .|. (op3 `shiftL` 19)
 520               .|. (rd `shiftL` 14)
 521               .|. (1 `shiftL` 13)
 522               .|. mkSimm13 imm13
 523
 524          insn_r_i_r :: Word32 -> Word32 -> Int -> Word32 -> Word32
 525          insn_r_i_r op3 rs1 imm13 rd
 526             = (2 `shiftL` 30)
 527               .|. (rd `shiftL` 25)
 528               .|. (op3 `shiftL` 19)
 529               .|. (rs1 `shiftL` 14)
 530               .|. (1 `shiftL` 13)
 531               .|. mkSimm13 imm13
 532
 533          mkSimm13 :: Int -> Word32
 534          mkSimm13 imm13
 535             = let imm13w = (fromIntegral imm13) :: Word32
 536               in  imm13w .&. 0x1FFF
 537
 538          -- REAL (non-synthetic) insns
 539          -- or %rs1, %rs2, %rd
 540          mkOR :: Word32 -> Word32 -> Word32 -> Word32
 541          mkOR rs1 rs2 rd
 542             = (2 `shiftL` 30)
 543               .|. (rd `shiftL` 25)
 544               .|. (op3_OR `shiftL` 19)
 545               .|. (rs1 `shiftL` 14)
 546               .|. (0 `shiftL` 13)
 547               .|. rs2
 548               where op3_OR = 2 :: Word32
 549
 550          -- ld(int)   [%rs + imm13], %rd
 551          mkLD rs1 imm13 rd = insn_r_r_i 0x00{-op3_LD-} rd rs1 imm13
 552
 553          -- st(int)   %rs, [%rd + imm13]
 554          mkST   = insn_r_r_i 0x04 -- op3_ST
 555
 556          -- st(float) %rs, [%rd + imm13]
 557          mkSTF  = insn_r_r_i 0x24 -- op3_STF
 558
 559          -- jmpl     %rs + imm13, %rd
 560          mkJMPL = insn_r_i_r 0x38 -- op3_JMPL
 561
 562          -- save     %rs + imm13, %rd
 563          mkSAVE = insn_r_i_r 0x3C -- op3_SAVE
 564
 565          -- restore  %rs + imm13, %rd
 566          mkRESTORE = insn_r_i_r 0x3D -- op3_RESTORE
 567
 568          -- SYNTHETIC insns
 569          mkNOP             = mkOR g0 g0 g0
 570          mkCALL reg        = mkJMPL reg 0 o7
 571          mkRET             = mkJMPL i7 8 g0
 572          mkRESTORE_TRIVIAL = mkRESTORE g0 0 g0
 573      in
 574      --trace (show (map fst arg_offs_n_reps))
 575      concatMap w32_to_w8s_bigEndian (
 576
 577      {- On entry, %o0 is the arg passed from the interpreter.  After
 578         the initial save insn, it will be in %i0.  Studying the sparc
 579         docs one would have thought that the minimum frame size is 92
 580         bytes, but gcc always uses at least 112, and indeed there are
 581         segfaults a-plenty with 92.  So I use 112 here as well.  I
 582         don't understand why, tho.
 583      -}
 584      [mkSAVE sp (- ({-92-}112 + 4*argWs_on_stack_ROUNDED_UP)) sp]
 585
 586      {- For each arg in args_offs_n_reps, examine the associated
 587         CgRep to determine how many words there are.  This gives a
 588         bunch of offsets on the H stack.  Move the first 6 words into
 589         %o0 .. %o5 and the rest on the stack, starting at [%sp+92].
 590         Use %g1 as a temp.
 591      -}
 592      ++ let doArgW (offW, wordNo)
 593               | wordNo < 6
 594               = [mkLD i0 (bytes_per_word * offW) (oReg wordNo)]
 595               | otherwise
 596               = [mkLD i0 (bytes_per_word * offW) g1,
 597                  mkST g1 sp (92 + bytes_per_word * (wordNo - 6))]
 598         in
 599             concatMap doArgW (zip offsets_to_pushW [0 ..])
 600
 601      {- Get the addr to call into %g1, bearing in mind that there's
 602         an Addr# tag at the indicated location, and do the call:
 603
 604            ld     [4*(1 /*tag*/ +addr_offW) + %i0], %g1
 605            call   %g1
 606      -}
 607      ++ [mkLD i0 (bytes_per_word * addr_offW) g1,
 608          mkCALL g1,
 609          mkNOP]
 610
 611      {- Depending on what the return type is, get the result
 612         from %o0 or %o1:%o0 or %f0 or %f1:%f0.
 613
 614            st          %o0, [%i0 + 4]        -- 32 bit int
 615         or
 616            st          %o0, [%i0 + 4]        -- 64 bit int
 617            st          %o1, [%i0 + 8]        -- or the other way round?
 618         or
 619            st          %f0, [%i0 + 4]        -- 32 bit float
 620         or
 621            st          %f0, [%i0 + 4]        -- 64 bit float
 622            st          %f1, [%i0 + 8]        -- or the other way round?
 623
 624      -}
 625      ++ let i32 = [mkST o0 i0 0]
 626             i64 = [mkST o0 i0 0, mkST o1 i0 4]
 627             f32 = [mkSTF f0 i0 0]
 628             f64 = [mkSTF f0 i0 0, mkSTF f1 i0 4]
 629         in
 630             case r_rep of
 631                VoidRep   -> []
 632                IntRep    -> i32
 633                WordRep   -> i32
 634                AddrRep   -> i32
 635                FloatRep  -> f32
 636                DoubleRep -> f64
 637                other     -> moan64 "ByteCodeFFI.mkMarshalCode_wrk(sparc)"
 638                                    (ppr r_rep)
 639
 640      ++ [mkRET,
 641          mkRESTORE_TRIVIAL]  -- this is in the delay slot of the RET
 642      )
 643 #elif powerpc_TARGET_ARCH && darwin_TARGET_OS
 644
 645    = let
 646          bytes_per_word = 4
 647
 648          -- speaks for itself
 649          w32_to_w8s_bigEndian :: Word32 -> [Word8]
 650          w32_to_w8s_bigEndian w
 651             =  [fromIntegral (0xFF .&. (w `shiftR` 24)),
 652                 fromIntegral (0xFF .&. (w `shiftR` 16)),
 653                 fromIntegral (0xFF .&. (w `shiftR` 8)),
 654                 fromIntegral (0xFF .&. w)]
 655
 656          -- addr and result bits offsetsW
 657          a_off = addr_offW * bytes_per_word
 658          result_off  = r_offW * bytes_per_word
 659
 660          linkageArea = 24
 661          parameterArea = sum [ primRepSizeW a_rep * bytes_per_word
 662                         | (_, a_rep) <- arg_offs_n_reps ]
 663          savedRegisterArea = 4
 664          frameSize = padTo16 (linkageArea + max parameterArea 32 + savedRegisterArea)
 665          padTo16 x = case x `mod` 16 of
 666             0 -> x
 667             y -> x - y + 16
 668
 669          pass_parameters [] _ _ = []
 670          pass_parameters ((a_offW, a_rep):args) nextFPR offsetW =
 671             let
 672                haskellArgOffset = a_offW * bytes_per_word
 673                offsetW' = offsetW + primRepSizeW a_rep
 674
 675                pass_word w
 676                    | offsetW + w < 8 =
 677                       [0x801f0000    -- lwz rX, src(r31)
 678                         .|. (fromIntegral src .&. 0xFFFF)
 679                         .|. (fromIntegral (offsetW+w+3) `shiftL` 21)]
 680                    | otherwise =
 681                       [0x801f0000    -- lwz r0, src(r31)
 682                         .|. (fromIntegral src .&. 0xFFFF),
 683                        0x90010000    -- stw r0, dst(r1)
 684                         .|. (fromIntegral dst .&. 0xFFFF)]
 685                   where
 686                      src = haskellArgOffset + w*bytes_per_word
 687                      dst = linkageArea + (offsetW+w) * bytes_per_word
 688             in
 689                case a_rep of
 690                   FloatRep | nextFPR < 14 ->
 691                       (0xc01f0000    -- lfs fX, haskellArgOffset(r31)
 692                         .|. (fromIntegral haskellArgOffset .&. 0xFFFF)
 693                         .|. (fromIntegral nextFPR `shiftL` 21))
 694                       : pass_parameters args (nextFPR+1) offsetW'
 695                   DoubleRep | nextFPR < 14 ->
 696                       (0xc81f0000    -- lfd fX, haskellArgOffset(r31)
 697                         .|. (fromIntegral haskellArgOffset .&. 0xFFFF)
 698                         .|. (fromIntegral nextFPR `shiftL` 21))
 699                       : pass_parameters args (nextFPR+1) offsetW'
 700                   _ ->
 701                       concatMap pass_word [0 .. primRepSizeW a_rep - 1]
 702                       ++ pass_parameters args nextFPR offsetW'
 703
 704          gather_result = case r_rep of
 705             VoidRep -> []
 706             FloatRep ->
 707                [0xd03f0000 .|. (fromIntegral result_off .&. 0xFFFF)]
 708                -- stfs f1, result_off(r31)
 709             DoubleRep ->
 710                [0xd83f0000 .|. (fromIntegral result_off .&. 0xFFFF)]
 711                -- stfd f1, result_off(r31)
 712             _ | primRepSizeW r_rep == 2 ->
 713                [0x907f0000 .|. (fromIntegral result_off .&. 0xFFFF),
 714                 0x909f0000 .|. (fromIntegral (result_off+4) .&. 0xFFFF)]
 715                -- stw r3, result_off(r31)
 716                -- stw r4, result_off+4(r31)
 717             _ | primRepSizeW r_rep == 1 ->
 718                [0x907f0000 .|. (fromIntegral result_off .&. 0xFFFF)]
 719                -- stw r3, result_off(r31)
 720      in
 721          concatMap w32_to_w8s_bigEndian $ [
 722             0x7c0802a6,         -- mflr r0
 723             0x93e1fffc,         -- stw r31,-4(r1)
 724             0x90010008,         -- stw r0,8(r1)
 725             0x94210000 .|. (fromIntegral (-frameSize) .&. 0xFFFF),
 726                                 -- stwu r1, -frameSize(r1)
 727             0x7c7f1b78          -- mr r31, r3
 728          ] ++ pass_parameters arg_offs_n_reps 1 0 ++ [
 729             0x819f0000 .|. (fromIntegral a_off .&. 0xFFFF),
 730                                 -- lwz r12, a_off(r31)
 731             0x7d8903a6,         -- mtctr r12
 732             0x4e800421          -- bctrl
 733          ] ++ gather_result ++ [
 734             0x80210000,         -- lwz r1, 0(r1)
 735             0x83e1fffc,         -- lwz r31, -4(r1)
 736             0x80010008,         -- lwz r0, 8(r1)
 737             0x7c0803a6,         -- mtlr r0
 738             0x4e800020          -- blr
 739          ]
 740
 741 #elif powerpc_TARGET_ARCH && linux_TARGET_OS
 742
 743    -- All offsets here are measured in Words (not bytes).  This includes
 744    -- arguments to the load/store machine code generators, alignment numbers
 745    -- and the final 'framesize' among others.
 746
 747    = concatMap w32_to_w8s_bigEndian $ [
 748             0x7c0802a6,                         -- mflr r0
 749             0x93e1fffc,                         -- stw r31,-4(r1)
 750             0x90010008,                         -- stw r0,8(r1)
 751             0x94210000 .|. offset (-framesize), -- stwu r1, -frameSize(r1)
 752             0x7c7f1b78                          -- mr r31, r3
 753             ] ++ pass_parameters ++             -- pass the parameters
 754             loadWord 12 addr_offW ++ [          -- lwz r12, a_off(r31)
 755             0x7d8903a6,                         -- mtctr r12
 756             0x4e800421                          -- bctrl
 757             ] ++ gather_result ++ [             -- save the return value
 758             0x80210000,                         -- lwz r1, 0(r1)
 759             0x83e1fffc,                         -- lwz r31, -4(r1)
 760             0x80010008,                         -- lwz r0, 8(r1)
 761             0x7c0803a6,                         -- mtlr r0
 762             0x4e800020                          -- blr
 763          ]
 764
 765    where
 766      gather_result :: [Word32]
 767      gather_result = case r_rep of
 768        VoidRep   -> []
 769        FloatRep  -> storeFloat  1 r_offW
 770        DoubleRep -> storeDouble 1 r_offW
 771        Int64Rep  -> storeLong   3 r_offW
 772        Word64Rep -> storeLong   3 r_offW
 773        _         -> storeWord   3 r_offW
 774
 775      pass_parameters :: [Word32]
 776      pass_parameters = concat params
 777
 778      -- vector aligned (4 word = 16 bytes) with 8 extra words of buffer space
 779      framesize = alignedTo 4 (argsize + 8)
 780
 781      ((_,_,argsize), params) = mapAccumL loadparam (3,1,2) arg_offs_n_reps
 782
 783      -- handle one argument, returning machine code and the updated state
 784      loadparam :: (Int, Int, Int) -> (Int, PrimRep) ->
 785                   ((Int, Int, Int), [Word32])
 786
 787      loadparam (gpr, fpr, stack) (ofs, rep) = case rep of
 788        FloatRep | fpr <= 8  -> ( (gpr, fpr + 1, stack),  loadFloat fpr ofs )
 789        FloatRep             -> ( (gpr, fpr, stack + 1),  stackWord stack ofs )
 790
 791        DoubleRep | fpr <= 8 -> ( (gpr, fpr + 1, stack),  loadDouble fpr ofs )
 792        DoubleRep            -> ( (gpr, fpr, astack + 2), stackLong astack ofs )
 793
 794        r | is64 r && even gpr  -> loadparam (gpr + 1, fpr, stack) (ofs, rep)
 795        r | is64 r && gpr <= 9  -> ( (gpr + 2, fpr, stack),  loadLong gpr ofs )
 796        r | is64 r              -> ( (gpr, fpr, astack + 2), stackLong astack ofs )
 797        _ | gpr <= 10        -> ( (gpr + 1, fpr, stack),  loadWord gpr ofs )
 798        _                    -> ( (gpr, fpr, stack + 1),  stackWord stack ofs )
 799       where astack = alignedTo 2 stack
 800
 801             is64 Int64Rep = True
 802             is64 Word64Rep = True
 803             is64 _ = False
 804
 805      alignedTo :: Int -> Int -> Int
 806      alignedTo alignment x = case x `mod` alignment of
 807                                0 -> x
 808                                y -> x - y + alignment
 809
 810      -- convenience macros to do multiple-instruction data moves
 811      stackWord dst src = loadWord 0 src ++ storeWordC 0 dst
 812      stackLong dst src = stackWord dst src ++ stackWord (dst + 1) (src + 1)
 813      loadLong  dst src = loadWord dst src ++ loadWord (dst + 1) (src + 1)
 814      storeLong dst src = storeWord dst src ++ storeWord (dst + 1) (src + 1)
 815
 816      -- load data from the Haskell stack (relative to r31)
 817      loadFloat   = loadstoreInstr 0xc01f0000 -- lfs fpr, ofs(r31)
 818      loadDouble  = loadstoreInstr 0xc81f0000 -- lfd fpr, ofs(r31)
 819      loadWord    = loadstoreInstr 0x801f0000 -- lwz gpr, ofs(r31)
 820
 821      -- store data to the Haskell stack (relative to r31)
 822      storeFloat  = loadstoreInstr 0xd01f0000 -- stfs fpr, ofs(r31)
 823      storeDouble = loadstoreInstr 0xd81f0000 -- stfd fpr, ofs(r31)
 824      storeWord   = loadstoreInstr 0x901f0000 -- stw gpr, ofs(r31)
 825
 826      -- store data to the C stack (relative to r1)
 827      storeWordC  = loadstoreInstr 0x90010000 -- stw gpr, ofs(r1)
 828
 829      -- machine code building blocks
 830      loadstoreInstr :: Word32 -> Int -> Int -> [Word32]
 831      loadstoreInstr code reg ofs = [ code .|. register reg .|. offset ofs ]
 832
 833      register :: Int -> Word32
 834      register reg = fromIntegral reg `shiftL` 21
 835
 836      offset :: Int -> Word32
 837      offset ofs   = fromIntegral (ofs * 4) .&. 0xFFFF
 838
 839      -- speaks for itself
 840      w32_to_w8s_bigEndian :: Word32 -> [Word8]
 841      w32_to_w8s_bigEndian w =  [fromIntegral (0xFF .&. (w `shiftR` 24)),
 842                                 fromIntegral (0xFF .&. (w `shiftR` 16)),
 843                                 fromIntegral (0xFF .&. (w `shiftR` 8)),
 844                                 fromIntegral (0xFF .&. w)]
 845
 846 #else
 847
 848    = throwDyn (InstallationError "foreign import is not implemented for GHCi on this platform.")
 849
 850 #endif
 851
 852 #if i386_TARGET_ARCH || x86_64_TARGET_ARCH
 853 lit32 :: Int -> [Word8]
 854 lit32 i = let w32 = (fromIntegral i) :: Word32
 855           in  map (fromIntegral . ( .&. 0xFF))
 856                   [w32, w32 `shiftR` 8,
 857                    w32 `shiftR` 16,  w32 `shiftR` 24]
 858 #endif
 859
 860 #endif /* !USE_LIBFFI */
 861
 862 moan64 :: String -> SDoc -> a
 863 moan64 msg pp_rep
 864    = unsafePerformIO (
 865         hPutStrLn stderr (
 866         "\nGHCi's bytecode generation machinery can't handle 64-bit\n" ++
 867         "code properly yet.  You can work around this for the time being\n" ++
 868         "by compiling this module and all those it imports to object code,\n" ++
 869         "and re-starting your GHCi session.  The panic below contains information,\n" ++
 870         "intended for the GHC implementors, about the exact place where GHC gave up.\n"
 871         )
 872      )
 873      `seq`
 874      pprPanic msg pp_rep
 875
 876 newExec :: Storable a => [a] -> IO (FunPtr ())
 877 newExec code
 878    = do ptr <- _allocateExec (fromIntegral $ codeSize undefined code)
 879         pokeArray ptr code
 880         return (castPtrToFunPtr ptr)
 881    where
 882    codeSize :: Storable a => a -> [a] -> Int
 883    codeSize dummy array = sizeOf(dummy) * length array
 884
 885 foreign import ccall unsafe "allocateExec"
 886   _allocateExec :: CUInt -> IO (Ptr a)
 887 \end{code}
 888