compiler/ghci/ByteCodeFFI.lhs

   1 %
   2 % (c) The University of Glasgow 2001-2006
   3 %
   4
   5 ByteCodeGen: Generate machine-code sequences for foreign import
   6
   7 \begin{code}
   8 {-# OPTIONS -w #-}
   9 -- The above warning supression flag is a temporary kludge.
  10 -- While working on this module you are encouraged to remove it and fix
  11 -- any warnings in the module. See
  12 --     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
  13 -- for details
  14
  15 module ByteCodeFFI ( mkMarshalCode, moan64, newExec ) where
  16
  17 #include "HsVersions.h"
  18
  19 import Outputable
  20 import SMRep
  21 import ForeignCall
  22 import Panic
  23
  24 -- DON'T remove apparently unused imports here ..
  25 -- there is ifdeffery below
  26 import Control.Exception ( throwDyn )
  27 import Data.Bits        ( Bits(..), shiftR, shiftL )
  28 import Data.List        ( mapAccumL )
  29
  30 import Data.Word        ( Word8, Word32 )
  31 import Foreign          ( Ptr, FunPtr, castPtrToFunPtr,
  32                           Storable, sizeOf, pokeArray )
  33 import Foreign.C        ( CUInt )
  34 import System.IO.Unsafe ( unsafePerformIO )
  35 import System.IO        ( hPutStrLn, stderr )
  36 -- import Debug.Trace   ( trace )
  37 \end{code}
  38
  39 %************************************************************************
  40 %*                                                                      *
  41 \subsection{The platform-dependent marshall-code-generator.}
  42 %*                                                                      *
  43 %************************************************************************
  44
  45 \begin{code}
  46
  47 moan64 :: String -> SDoc -> a
  48 moan64 msg pp_rep
  49    = unsafePerformIO (
  50         hPutStrLn stderr (
  51         "\nGHCi's bytecode generation machinery can't handle 64-bit\n" ++
  52         "code properly yet.  You can work around this for the time being\n" ++
  53         "by compiling this module and all those it imports to object code,\n" ++
  54         "and re-starting your GHCi session.  The panic below contains information,\n" ++
  55         "intended for the GHC implementors, about the exact place where GHC gave up.\n"
  56         )
  57      )
  58      `seq`
  59      pprPanic msg pp_rep
  60
  61
  62 -- For sparc_TARGET_ARCH, i386_TARGET_ARCH, etc.
  63 #include "nativeGen/NCG.h"
  64
  65 {-
  66 Make a piece of code which expects to see the Haskell stack
  67 looking like this.  It is given a pointer to the lowest word in
  68 the stack -- presumably the tag of the placeholder.
  69
  70                   <arg_n>
  71                   ...
  72                   <arg_1>
  73                   Addr# address_of_C_fn
  74                   <placeholder-for-result#> (must be an unboxed type)
  75
  76 We cope with both ccall and stdcall for the C fn.  However, this code
  77 itself expects only to be called using the ccall convention -- that is,
  78 we don't clear our own (single) arg off the C stack.
  79 -}
  80 mkMarshalCode :: CCallConv
  81               -> (Int, CgRep) -> Int -> [(Int, CgRep)]
  82               -> IO (FunPtr ())
  83 mkMarshalCode cconv (r_offW, r_rep) addr_offW arg_offs_n_reps
  84    = let bytes = mkMarshalCode_wrk cconv (r_offW, r_rep)
  85                                    addr_offW arg_offs_n_reps
  86      in  newExec bytes
  87
  88 newExec :: Storable a => [a] -> IO (FunPtr ())
  89 newExec code
  90    = do ptr <- _allocateExec (fromIntegral $ codeSize undefined code)
  91         pokeArray ptr code
  92         return (castPtrToFunPtr ptr)
  93    where
  94    codeSize :: Storable a => a -> [a] -> Int
  95    codeSize dummy array = sizeOf(dummy) * length array
  96
  97 foreign import ccall unsafe "allocateExec"
  98   _allocateExec :: CUInt -> IO (Ptr a)
  99
 100 mkMarshalCode_wrk :: CCallConv
 101                   -> (Int, CgRep) -> Int -> [(Int, CgRep)]
 102                   -> [Word8]
 103
 104 mkMarshalCode_wrk cconv (r_offW, r_rep) addr_offW arg_offs_n_reps
 105
 106 #if i386_TARGET_ARCH
 107
 108    = let -- Don't change this without first consulting Intel Corp :-)
 109          bytes_per_word = 4
 110
 111          offsets_to_pushW
 112             = concat
 113               [   -- reversed because x86 is little-endian
 114                   reverse [a_offW .. a_offW + cgRepSizeW a_rep - 1]
 115
 116                 -- reversed because args are pushed L -> R onto C stack
 117                 | (a_offW, a_rep) <- reverse arg_offs_n_reps
 118               ]
 119
 120          arguments_size = bytes_per_word * length offsets_to_pushW
 121 #if darwin_TARGET_OS
 122              -- Darwin: align stack frame size to a multiple of 16 bytes
 123          stack_frame_size = (arguments_size + 15) .&. complement 15
 124          stack_frame_pad = stack_frame_size - arguments_size
 125 #else
 126          stack_frame_size = arguments_size
 127 #endif
 128
 129          -- some helpers to assemble x86 insns.
 130          movl_offespmem_esi offB        -- movl   offB(%esp), %esi
 131             = [0x8B, 0xB4, 0x24] ++ lit32 offB
 132          movl_offesimem_ecx offB        -- movl   offB(%esi), %ecx
 133             = [0x8B, 0x8E] ++ lit32 offB
 134          save_regs                      -- pushl  all intregs except %esp
 135             = [0x50, 0x53, 0x51, 0x52, 0x56, 0x57, 0x55]
 136          restore_regs                   -- popl   ditto
 137             = [0x5D, 0x5F, 0x5E, 0x5A, 0x59, 0x5B, 0x58]
 138          pushl_ecx                      -- pushl  %ecx
 139             = [0x51]
 140          call_star_ecx                  -- call   * %ecx
 141             = [0xFF, 0xD1]
 142          add_lit_esp lit                -- addl   $lit, %esp
 143             = [0x81, 0xC4] ++ lit32 lit
 144          movl_eax_offesimem offB        -- movl   %eax, offB(%esi)
 145             = [0x89, 0x86] ++ lit32 offB
 146          movl_edx_offesimem offB        -- movl   %edx, offB(%esi)
 147             = [0x89, 0x96] ++ lit32 offB
 148          ret                            -- ret
 149             = [0xC3]
 150          fstpl_offesimem offB           -- fstpl   offB(%esi)
 151             = [0xDD, 0x9E] ++ lit32 offB
 152          fstps_offesimem offB           -- fstps   offB(%esi)
 153             = [0xD9, 0x9E] ++ lit32 offB
 154          {-
 155              2 0000 8BB42478    movl    0x12345678(%esp), %esi
 156              2      563412
 157              3 0007 8B8E7856    movl    0x12345678(%esi), %ecx
 158              3      3412
 159              4
 160              5 000d 50535152    pushl %eax ; pushl %ebx ; pushl %ecx ; pushl  %edx
 161              6 0011 565755      pushl %esi ; pushl %edi ; pushl %ebp
 162              7
 163              8 0014 5D5F5E      popl %ebp ; popl %edi ; popl %esi
 164              9 0017 5A595B58    popl %edx ; popl %ecx ; popl %ebx ; popl %eax
 165             10
 166             11 001b 51          pushl %ecx
 167             12 001c FFD1        call * %ecx
 168             13
 169             14 001e 81C47856    addl $0x12345678, %esp
 170             14      3412
 171             15 0024 89867856    movl %eax, 0x12345678(%esi)
 172             15      3412
 173             16 002a 89967856    movl %edx, 0x12345678(%esi)
 174             16      3412
 175             17
 176             18 0030 DD967856    fstl    0x12345678(%esi)
 177             18      3412
 178             19 0036 DD9E7856    fstpl   0x12345678(%esi)
 179             19      3412
 180             20 003c D9967856    fsts    0x12345678(%esi)
 181             20      3412
 182             21 0042 D99E7856    fstps   0x12345678(%esi)
 183             18
 184             19 0030 C3          ret
 185             20
 186
 187          -}
 188
 189      in
 190      --trace (show (map fst arg_offs_n_reps))
 191      (
 192      {- On entry, top of C stack 0(%esp) is the RA and 4(%esp) is
 193         arg passed from the interpreter.
 194
 195         Push all callee saved regs.  Push all of them anyway ...
 196            pushl       %eax
 197            pushl       %ebx
 198            pushl       %ecx
 199            pushl       %edx
 200            pushl       %esi
 201            pushl       %edi
 202            pushl       %ebp
 203      -}
 204      save_regs
 205
 206      {- Now 28+0(%esp) is RA and 28+4(%esp) is the arg (the H stack ptr).
 207         We'll use %esi as a temporary to point at the H stack, and
 208         %ecx as a temporary to copy via.
 209
 210            movl        28+4(%esp), %esi
 211      -}
 212      ++ movl_offespmem_esi 32
 213
 214 #if darwin_TARGET_OS
 215      {- On Darwin, add some padding so that the stack stays aligned. -}
 216      ++ (if stack_frame_pad /= 0
 217             then add_lit_esp (-stack_frame_pad)
 218             else [])
 219 #endif
 220
 221      {- For each arg in args_offs_n_reps, examine the associated
 222         CgRep to determine how many words there are.  This gives a
 223         bunch of offsets on the H stack to copy to the C stack:
 224
 225            movl        off1(%esi), %ecx
 226            pushl       %ecx
 227      -}
 228      ++ concatMap (\offW -> movl_offesimem_ecx (bytes_per_word * offW)
 229                             ++ pushl_ecx)
 230                   offsets_to_pushW
 231
 232      {- Get the addr to call into %ecx, bearing in mind that there's
 233         an Addr# tag at the indicated location, and do the call:
 234
 235            movl        4*(1 /*tag*/ +addr_offW)(%esi), %ecx
 236            call        * %ecx
 237      -}
 238      ++ movl_offesimem_ecx (bytes_per_word * addr_offW)
 239      ++ call_star_ecx
 240
 241      {- Nuke the args just pushed and re-establish %esi at the
 242         H-stack ptr:
 243
 244            addl        $4*number_of_args_pushed, %esp (ccall only)
 245            movl        28+4(%esp), %esi
 246      -}
 247      ++ (if   cconv /= StdCallConv
 248          then add_lit_esp stack_frame_size
 249          else [])
 250      ++ movl_offespmem_esi 32
 251
 252      {- Depending on what the return type is, get the result
 253         from %eax or %edx:%eax or %st(0).
 254
 255            movl        %eax, 4(%esi)        -- assuming tagged result
 256         or
 257            movl        %edx, 4(%esi)
 258            movl        %eax, 8(%esi)
 259         or
 260            fstpl       4(%esi)
 261         or
 262            fstps       4(%esi)
 263      -}
 264      ++ let i32 = movl_eax_offesimem 0
 265             i64 = movl_eax_offesimem 0 ++ movl_edx_offesimem 4
 266             f32 = fstps_offesimem 0
 267             f64 = fstpl_offesimem 0
 268         in
 269         case r_rep of
 270            NonPtrArg -> i32
 271            DoubleArg -> f64
 272            FloatArg  -> f32
 273            LongArg   -> i64
 274            VoidArg   -> []
 275            other     -> moan64 "ByteCodeFFI.mkMarshalCode_wrk(x86)"
 276                                (ppr r_rep)
 277
 278      {- Restore all the pushed regs and go home.
 279
 280            pushl        %ebp
 281            pushl        %edi
 282            pushl        %esi
 283            pushl        %edx
 284            pushl        %ecx
 285            pushl        %ebx
 286            pushl        %eax
 287
 288            ret
 289      -}
 290      ++ restore_regs
 291      ++ ret
 292      )
 293
 294 #elif x86_64_TARGET_ARCH
 295
 296    =
 297      -- the address of the H stack is in %rdi.  We need to move it out, so
 298      -- we can use %rdi as an arg reg for the following call:
 299     pushq_rbp ++
 300     movq_rdi_rbp ++
 301
 302      -- ####### load / push the args
 303
 304      let
 305         (stack_args, fregs_unused, reg_loads) =
 306            load_arg_regs arg_offs_n_reps int_loads float_loads []
 307
 308         tot_arg_size = bytes_per_word * length stack_args
 309
 310         -- On entry to the called function, %rsp should be aligned
 311         -- on a 16-byte boundary +8 (i.e. the first stack arg after
 312         -- the return address is 16-byte aligned).  In STG land
 313         -- %rsp is kept 16-byte aligned (see StgCRun.c), so we just
 314         -- need to make sure we push a multiple of 16-bytes of args,
 315         -- plus the return address, to get the correct alignment.
 316         (real_size, adjust_rsp)
 317           | tot_arg_size `rem` 16 == 0    = (tot_arg_size, [])
 318           | otherwise                     = (tot_arg_size + 8, subq_lit_rsp 8)
 319
 320         (stack_pushes, stack_words) =
 321                 push_args stack_args [] 0
 322
 323         -- we need to know the number of SSE regs used in the call, see later
 324         n_sse_regs_used = length float_loads - length fregs_unused
 325      in
 326         concat reg_loads
 327      ++ adjust_rsp
 328      ++ concat stack_pushes -- push in reverse order
 329
 330      -- ####### make the call
 331
 332         -- use %r10 to make the call, because we don't have to save it.
 333         --      movq 8*addr_offW(%rbp), %r10
 334      ++ movq_rbpoff_r10 (bytes_per_word * addr_offW)
 335
 336         -- The x86_64 ABI requires us to set %al to the number of SSE
 337         -- registers that contain arguments, if the called routine
 338         -- is a varargs function.  We don't know whether it's a
 339         -- varargs function or not, so we have to assume it is.
 340         --
 341         -- It's not safe to omit this assignment, even if the number
 342         -- of SSE regs in use is zero.  If %al is larger than 8
 343         -- on entry to a varargs function, seg faults ensue.
 344      ++ movq_lit_rax n_sse_regs_used
 345      ++ call_star_r10
 346
 347         -- pop the args from the stack, only in ccall mode
 348         -- (in stdcall the callee does it).
 349      ++ (if   cconv /= StdCallConv
 350          then addq_lit_rsp real_size
 351          else [])
 352
 353      -- ####### place the result in the right place and return
 354
 355      ++ assign_result
 356      ++ popq_rbp
 357      ++ ret
 358
 359   where
 360      bytes_per_word = 8
 361
 362      -- int arg regs: rdi,rsi,rdx,rcx,r8,r9
 363      -- flt arg regs: xmm0..xmm7
 364      int_loads   = [ movq_rbpoff_rdi, movq_rbpoff_rsi, movq_rbpoff_rdx,
 365                      movq_rbpoff_rcx, movq_rbpoff_r8,  movq_rbpoff_r9 ]
 366      float_loads = [ 0..7 ]
 367
 368      load_arg_regs args [] [] code     =  (args, [], code)
 369      load_arg_regs [] iregs fregs code =  ([], fregs, code)
 370      load_arg_regs ((off,rep):args) iregs fregs code
 371         | FloatArg  <- rep =
 372             case fregs of
 373               [] -> push_this_arg
 374               n : frest ->
 375                 load_arg_regs args iregs frest
 376                       (mov_f32_rbpoff_xmm n (bytes_per_word * off) : code)
 377         | DoubleArg <- rep =
 378             case fregs of
 379               [] -> push_this_arg
 380               n : frest ->
 381                 load_arg_regs args iregs frest
 382                        (mov_f64_rbpoff_xmm n (bytes_per_word * off) : code)
 383         | (mov_reg:irest) <- iregs =
 384                 load_arg_regs args irest fregs (mov_reg (bytes_per_word * off) : code)
 385         | otherwise =
 386                  push_this_arg
 387         where
 388            push_this_arg = ((off,rep):args',fregs', code')
 389                 where (args',fregs',code') = load_arg_regs args iregs fregs code
 390
 391      push_args [] code pushed_words = (code, pushed_words)
 392      push_args ((off,rep):args) code pushed_words
 393         | FloatArg  <- rep =
 394                 push_args args (push_f32_rbpoff (bytes_per_word * off) : code)
 395                         (pushed_words+1)
 396         | DoubleArg <- rep =
 397                 push_args args (push_f64_rbpoff (bytes_per_word * off) : code)
 398                         (pushed_words+1)
 399         | otherwise =
 400                 push_args args (pushq_rbpoff (bytes_per_word * off) : code)
 401                         (pushed_words+1)
 402
 403
 404      assign_result =
 405         case r_rep of
 406           DoubleArg -> f64
 407           FloatArg  -> f32
 408           VoidArg   -> []
 409           _other    -> i64
 410         where
 411           i64 = movq_rax_rbpoff 0
 412           f32 = mov_f32_xmm0_rbpoff 0
 413           f64 = mov_f64_xmm0_rbpoff 0
 414
 415 -- ######### x86_64 machine code:
 416
 417 --   0:   48 89 fd                mov    %rdi,%rbp
 418 --   3:   48 8b bd 78 56 34 12    mov    0x12345678(%rbp),%rdi
 419 --   a:   48 8b b5 78 56 34 12    mov    0x12345678(%rbp),%rsi
 420 --  11:   48 8b 95 78 56 34 12    mov    0x12345678(%rbp),%rdx
 421 --  18:   48 8b 8d 78 56 34 12    mov    0x12345678(%rbp),%rcx
 422 --  1f:   4c 8b 85 78 56 34 12    mov    0x12345678(%rbp),%r8
 423 --  26:   4c 8b 8d 78 56 34 12    mov    0x12345678(%rbp),%r9
 424 --  2d:   4c 8b 95 78 56 34 12    mov    0x12345678(%rbp),%r10
 425 --  34:   48 c7 c0 78 56 34 12    mov    $0x12345678,%rax
 426 --  3b:   48 89 85 78 56 34 12    mov    %rax,0x12345678(%rbp)
 427 --  42:   f3 0f 10 bd 78 56 34 12 movss  0x12345678(%rbp),%xmm7
 428 --  4a:   f2 0f 10 9d 78 56 34 12 movsd  0x12345678(%rbp),%xmm3
 429 --  52:   f2 44 0f 10 85 78 56 34 12 movsd  0x12345678(%rbp),%xmm8
 430 --  5b:   f3 0f 11 9d 78 56 34 12 movss  %xmm3,0x12345678(%rbp)
 431 --  63:   f2 0f 11 9d 78 56 34 12 movsd  %xmm3,0x12345678(%rbp)
 432 --  6b:   f2 44 0f 11 85 78 56 34 12 movsd  %xmm8,0x12345678(%rbp)
 433 --  74:   ff b5 78 56 34 12       pushq  0x12345678(%rbp)
 434 --  7a:   f3 44 0f 11 04 24       movss  %xmm8,(%rsp)
 435 --  80:   f2 44 0f 11 04 24       movsd  %xmm8,(%rsp)
 436 --  86:   48 81 ec 78 56 34 12    sub    $0x12345678,%rsp
 437 --  8d:   48 81 c4 78 56 34 12    add    $0x12345678,%rsp
 438 --  94:   41 ff d2                callq  *%r10
 439 --  97:   55                      push   %rbp
 440 --  98:   5d                      pop    %rbp
 441 --  99:   c3                      retq
 442
 443      movq_rdi_rbp         = [0x48,0x89,0xfd]
 444      movq_rbpoff_rdi  off = [0x48, 0x8b, 0xbd] ++ lit32 off
 445      movq_rbpoff_rsi  off = [0x48, 0x8b, 0xb5] ++ lit32 off
 446      movq_rbpoff_rdx  off = [0x48, 0x8b, 0x95] ++ lit32 off
 447      movq_rbpoff_rcx  off = [0x48, 0x8b, 0x8d] ++ lit32 off
 448      movq_rbpoff_r8   off = [0x4c, 0x8b, 0x85] ++ lit32 off
 449      movq_rbpoff_r9   off = [0x4c, 0x8b, 0x8d] ++ lit32 off
 450      movq_rbpoff_r10  off = [0x4c, 0x8b, 0x95] ++ lit32 off
 451      movq_lit_rax     lit = [0x48, 0xc7, 0xc0] ++ lit32 lit
 452      movq_rax_rbpoff  off = [0x48, 0x89, 0x85] ++ lit32 off
 453      mov_f32_rbpoff_xmm n off
 454          = 0xf3 : if n >= 8 then 0x44 : rest else rest
 455          where rest = [0x0f, 0x10, 0x85 + (n.&.7)`shiftL`3] ++ lit32 off
 456      mov_f64_rbpoff_xmm n off
 457          = 0xf2 : if n >= 8 then 0x44 : rest else rest
 458          where rest = [0x0f, 0x10, 0x85 + (n.&.7)`shiftL`3] ++ lit32 off
 459      mov_f32_xmm0_rbpoff  off = [0xf3, 0x0f, 0x11, 0x85] ++ lit32 off
 460      mov_f64_xmm0_rbpoff  off = [0xf2, 0x0f, 0x11, 0x85] ++ lit32 off
 461      pushq_rbpoff     off = [0xff, 0xb5] ++ lit32 off
 462      push_f32_rbpoff  off =
 463         subq_lit_rsp 8 ++                        -- subq $8, %rsp
 464         mov_f32_rbpoff_xmm 8 off ++              -- movss off(%rbp), %xmm8
 465         [0xf3, 0x44, 0x0f, 0x11, 0x04, 0x24]     -- movss %xmm8, (%rsp)
 466      push_f64_rbpoff  off =
 467         subq_lit_rsp 8 ++                        -- subq $8, %rsp
 468         mov_f64_rbpoff_xmm 8 off ++              -- movsd off(%rbp), %xmm8
 469         [0xf2, 0x44, 0x0f, 0x11, 0x04, 0x24]     -- movsd %xmm8, (%rsp)
 470      subq_lit_rsp     lit = [0x48, 0x81, 0xec] ++ lit32 lit
 471      addq_lit_rsp     lit = [0x48, 0x81, 0xc4] ++ lit32 lit
 472      call_star_r10 = [0x41,0xff,0xd2]
 473      ret = [0xc3]
 474      pushq_rbp = [0x55]
 475      popq_rbp = [0x5d]
 476
 477 #elif sparc_TARGET_ARCH
 478
 479    = let -- At least for sparc V8
 480          bytes_per_word = 4
 481
 482          -- speaks for itself
 483          w32_to_w8s_bigEndian :: Word32 -> [Word8]
 484          w32_to_w8s_bigEndian w
 485             =  [fromIntegral (0xFF .&. (w `shiftR` 24)),
 486                 fromIntegral (0xFF .&. (w `shiftR` 16)),
 487                 fromIntegral (0xFF .&. (w `shiftR` 8)),
 488                 fromIntegral (0xFF .&. w)]
 489
 490          offsets_to_pushW
 491             = concat
 492               [  [a_offW .. a_offW + cgRepSizeW a_rep - 1]
 493
 494                 | (a_offW, a_rep) <- arg_offs_n_reps
 495               ]
 496
 497          total_argWs    = length offsets_to_pushW
 498          argWs_on_stack = if total_argWs > 6 then total_argWs - 6
 499                                              else 0
 500
 501          -- The stack pointer must be kept 8-byte aligned, which means
 502          -- we need to calculate this quantity too
 503          argWs_on_stack_ROUNDED_UP
 504             | odd argWs_on_stack = 1 + argWs_on_stack
 505             | otherwise          = argWs_on_stack
 506
 507          -- some helpers to assemble sparc insns.
 508          -- REGS
 509          iReg, oReg, gReg, fReg :: Int -> Word32
 510          iReg = fromIntegral . (+ 24)
 511          oReg = fromIntegral . (+ 8)
 512          gReg = fromIntegral . (+ 0)
 513          fReg = fromIntegral
 514
 515          sp = oReg 6
 516          i0 = iReg 0
 517          i7 = iReg 7
 518          o0 = oReg 0
 519          o1 = oReg 1
 520          o7 = oReg 7
 521          g0 = gReg 0
 522          g1 = gReg 1
 523          f0 = fReg 0
 524          f1 = fReg 1
 525
 526          -- INSN templates
 527          insn_r_r_i :: Word32 -> Word32 -> Word32 -> Int -> Word32
 528          insn_r_r_i op3 rs1 rd imm13
 529             = (3 `shiftL` 30)
 530               .|. (rs1 `shiftL` 25)
 531               .|. (op3 `shiftL` 19)
 532               .|. (rd `shiftL` 14)
 533               .|. (1 `shiftL` 13)
 534               .|. mkSimm13 imm13
 535
 536          insn_r_i_r :: Word32 -> Word32 -> Int -> Word32 -> Word32
 537          insn_r_i_r op3 rs1 imm13 rd
 538             = (2 `shiftL` 30)
 539               .|. (rd `shiftL` 25)
 540               .|. (op3 `shiftL` 19)
 541               .|. (rs1 `shiftL` 14)
 542               .|. (1 `shiftL` 13)
 543               .|. mkSimm13 imm13
 544
 545          mkSimm13 :: Int -> Word32
 546          mkSimm13 imm13
 547             = let imm13w = (fromIntegral imm13) :: Word32
 548               in  imm13w .&. 0x1FFF
 549
 550          -- REAL (non-synthetic) insns
 551          -- or %rs1, %rs2, %rd
 552          mkOR :: Word32 -> Word32 -> Word32 -> Word32
 553          mkOR rs1 rs2 rd
 554             = (2 `shiftL` 30)
 555               .|. (rd `shiftL` 25)
 556               .|. (op3_OR `shiftL` 19)
 557               .|. (rs1 `shiftL` 14)
 558               .|. (0 `shiftL` 13)
 559               .|. rs2
 560               where op3_OR = 2 :: Word32
 561
 562          -- ld(int)   [%rs + imm13], %rd
 563          mkLD rs1 imm13 rd = insn_r_r_i 0x00{-op3_LD-} rd rs1 imm13
 564
 565          -- st(int)   %rs, [%rd + imm13]
 566          mkST   = insn_r_r_i 0x04 -- op3_ST
 567
 568          -- st(float) %rs, [%rd + imm13]
 569          mkSTF  = insn_r_r_i 0x24 -- op3_STF
 570
 571          -- jmpl     %rs + imm13, %rd
 572          mkJMPL = insn_r_i_r 0x38 -- op3_JMPL
 573
 574          -- save     %rs + imm13, %rd
 575          mkSAVE = insn_r_i_r 0x3C -- op3_SAVE
 576
 577          -- restore  %rs + imm13, %rd
 578          mkRESTORE = insn_r_i_r 0x3D -- op3_RESTORE
 579
 580          -- SYNTHETIC insns
 581          mkNOP             = mkOR g0 g0 g0
 582          mkCALL reg        = mkJMPL reg 0 o7
 583          mkRET             = mkJMPL i7 8 g0
 584          mkRESTORE_TRIVIAL = mkRESTORE g0 0 g0
 585      in
 586      --trace (show (map fst arg_offs_n_reps))
 587      concatMap w32_to_w8s_bigEndian (
 588
 589      {- On entry, %o0 is the arg passed from the interpreter.  After
 590         the initial save insn, it will be in %i0.  Studying the sparc
 591         docs one would have thought that the minimum frame size is 92
 592         bytes, but gcc always uses at least 112, and indeed there are
 593         segfaults a-plenty with 92.  So I use 112 here as well.  I
 594         don't understand why, tho.
 595      -}
 596      [mkSAVE sp (- ({-92-}112 + 4*argWs_on_stack_ROUNDED_UP)) sp]
 597
 598      {- For each arg in args_offs_n_reps, examine the associated
 599         CgRep to determine how many words there are.  This gives a
 600         bunch of offsets on the H stack.  Move the first 6 words into
 601         %o0 .. %o5 and the rest on the stack, starting at [%sp+92].
 602         Use %g1 as a temp.
 603      -}
 604      ++ let doArgW (offW, wordNo)
 605               | wordNo < 6
 606               = [mkLD i0 (bytes_per_word * offW) (oReg wordNo)]
 607               | otherwise
 608               = [mkLD i0 (bytes_per_word * offW) g1,
 609                  mkST g1 sp (92 + bytes_per_word * (wordNo - 6))]
 610         in
 611             concatMap doArgW (zip offsets_to_pushW [0 ..])
 612
 613      {- Get the addr to call into %g1, bearing in mind that there's
 614         an Addr# tag at the indicated location, and do the call:
 615
 616            ld     [4*(1 /*tag*/ +addr_offW) + %i0], %g1
 617            call   %g1
 618      -}
 619      ++ [mkLD i0 (bytes_per_word * addr_offW) g1,
 620          mkCALL g1,
 621          mkNOP]
 622
 623      {- Depending on what the return type is, get the result
 624         from %o0 or %o1:%o0 or %f0 or %f1:%f0.
 625
 626            st          %o0, [%i0 + 4]        -- 32 bit int
 627         or
 628            st          %o0, [%i0 + 4]        -- 64 bit int
 629            st          %o1, [%i0 + 8]        -- or the other way round?
 630         or
 631            st          %f0, [%i0 + 4]        -- 32 bit float
 632         or
 633            st          %f0, [%i0 + 4]        -- 64 bit float
 634            st          %f1, [%i0 + 8]        -- or the other way round?
 635
 636      -}
 637      ++ let i32 = [mkST o0 i0 0]
 638             i64 = [mkST o0 i0 0, mkST o1 i0 4]
 639             f32 = [mkSTF f0 i0 0]
 640             f64 = [mkSTF f0 i0 0, mkSTF f1 i0 4]
 641         in
 642             case r_rep of
 643                NonPtrArg -> i32
 644                DoubleArg -> f64
 645                FloatArg  -> f32
 646                VoidArg   -> []
 647                other     -> moan64 "ByteCodeFFI.mkMarshalCode_wrk(sparc)"
 648                                    (ppr r_rep)
 649
 650      ++ [mkRET,
 651          mkRESTORE_TRIVIAL]  -- this is in the delay slot of the RET
 652      )
 653 #elif powerpc_TARGET_ARCH && darwin_TARGET_OS
 654
 655    = let
 656          bytes_per_word = 4
 657
 658          -- speaks for itself
 659          w32_to_w8s_bigEndian :: Word32 -> [Word8]
 660          w32_to_w8s_bigEndian w
 661             =  [fromIntegral (0xFF .&. (w `shiftR` 24)),
 662                 fromIntegral (0xFF .&. (w `shiftR` 16)),
 663                 fromIntegral (0xFF .&. (w `shiftR` 8)),
 664                 fromIntegral (0xFF .&. w)]
 665
 666          -- addr and result bits offsetsW
 667          a_off = addr_offW * bytes_per_word
 668          result_off  = r_offW * bytes_per_word
 669
 670          linkageArea = 24
 671          parameterArea = sum [ cgRepSizeW a_rep * bytes_per_word
 672                         | (_, a_rep) <- arg_offs_n_reps ]
 673          savedRegisterArea = 4
 674          frameSize = padTo16 (linkageArea + max parameterArea 32 + savedRegisterArea)
 675          padTo16 x = case x `mod` 16 of
 676             0 -> x
 677             y -> x - y + 16
 678
 679          pass_parameters [] _ _ = []
 680          pass_parameters ((a_offW, a_rep):args) nextFPR offsetW =
 681             let
 682                haskellArgOffset = a_offW * bytes_per_word
 683                offsetW' = offsetW + cgRepSizeW a_rep
 684
 685                pass_word w
 686                    | offsetW + w < 8 =
 687                       [0x801f0000    -- lwz rX, src(r31)
 688                         .|. (fromIntegral src .&. 0xFFFF)
 689                         .|. (fromIntegral (offsetW+w+3) `shiftL` 21)]
 690                    | otherwise =
 691                       [0x801f0000    -- lwz r0, src(r31)
 692                         .|. (fromIntegral src .&. 0xFFFF),
 693                        0x90010000    -- stw r0, dst(r1)
 694                         .|. (fromIntegral dst .&. 0xFFFF)]
 695                   where
 696                      src = haskellArgOffset + w*bytes_per_word
 697                      dst = linkageArea + (offsetW+w) * bytes_per_word
 698             in
 699                case a_rep of
 700                   FloatArg | nextFPR < 14 ->
 701                       (0xc01f0000    -- lfs fX, haskellArgOffset(r31)
 702                         .|. (fromIntegral haskellArgOffset .&. 0xFFFF)
 703                         .|. (fromIntegral nextFPR `shiftL` 21))
 704                       : pass_parameters args (nextFPR+1) offsetW'
 705                   DoubleArg | nextFPR < 14 ->
 706                       (0xc81f0000    -- lfd fX, haskellArgOffset(r31)
 707                         .|. (fromIntegral haskellArgOffset .&. 0xFFFF)
 708                         .|. (fromIntegral nextFPR `shiftL` 21))
 709                       : pass_parameters args (nextFPR+1) offsetW'
 710                   _ ->
 711                       concatMap pass_word [0 .. cgRepSizeW a_rep - 1]
 712                       ++ pass_parameters args nextFPR offsetW'
 713
 714          gather_result = case r_rep of
 715             VoidArg -> []
 716             FloatArg ->
 717                [0xd03f0000 .|. (fromIntegral result_off .&. 0xFFFF)]
 718                -- stfs f1, result_off(r31)
 719             DoubleArg ->
 720                [0xd83f0000 .|. (fromIntegral result_off .&. 0xFFFF)]
 721                -- stfd f1, result_off(r31)
 722             _ | cgRepSizeW r_rep == 2 ->
 723                [0x907f0000 .|. (fromIntegral result_off .&. 0xFFFF),
 724                 0x909f0000 .|. (fromIntegral (result_off+4) .&. 0xFFFF)]
 725                -- stw r3, result_off(r31)
 726                -- stw r4, result_off+4(r31)
 727             _ | cgRepSizeW r_rep == 1 ->
 728                [0x907f0000 .|. (fromIntegral result_off .&. 0xFFFF)]
 729                -- stw r3, result_off(r31)
 730      in
 731          concatMap w32_to_w8s_bigEndian $ [
 732             0x7c0802a6,         -- mflr r0
 733             0x93e1fffc,         -- stw r31,-4(r1)
 734             0x90010008,         -- stw r0,8(r1)
 735             0x94210000 .|. (fromIntegral (-frameSize) .&. 0xFFFF),
 736                                 -- stwu r1, -frameSize(r1)
 737             0x7c7f1b78          -- mr r31, r3
 738          ] ++ pass_parameters arg_offs_n_reps 1 0 ++ [
 739             0x819f0000 .|. (fromIntegral a_off .&. 0xFFFF),
 740                                 -- lwz r12, a_off(r31)
 741             0x7d8903a6,         -- mtctr r12
 742             0x4e800421          -- bctrl
 743          ] ++ gather_result ++ [
 744             0x80210000,         -- lwz r1, 0(r1)
 745             0x83e1fffc,         -- lwz r31, -4(r1)
 746             0x80010008,         -- lwz r0, 8(r1)
 747             0x7c0803a6,         -- mtlr r0
 748             0x4e800020          -- blr
 749          ]
 750
 751 #elif powerpc_TARGET_ARCH && linux_TARGET_OS
 752
 753    -- All offsets here are measured in Words (not bytes).  This includes
 754    -- arguments to the load/store machine code generators, alignment numbers
 755    -- and the final 'framesize' among others.
 756
 757    = concatMap w32_to_w8s_bigEndian $ [
 758             0x7c0802a6,                         -- mflr r0
 759             0x93e1fffc,                         -- stw r31,-4(r1)
 760             0x90010008,                         -- stw r0,8(r1)
 761             0x94210000 .|. offset (-framesize), -- stwu r1, -frameSize(r1)
 762             0x7c7f1b78                          -- mr r31, r3
 763             ] ++ pass_parameters ++             -- pass the parameters
 764             loadWord 12 addr_offW ++ [          -- lwz r12, a_off(r31)
 765             0x7d8903a6,                         -- mtctr r12
 766             0x4e800421                          -- bctrl
 767             ] ++ gather_result ++ [             -- save the return value
 768             0x80210000,                         -- lwz r1, 0(r1)
 769             0x83e1fffc,                         -- lwz r31, -4(r1)
 770             0x80010008,                         -- lwz r0, 8(r1)
 771             0x7c0803a6,                         -- mtlr r0
 772             0x4e800020                          -- blr
 773          ]
 774
 775    where
 776      gather_result :: [Word32]
 777      gather_result = case r_rep of
 778        VoidArg   -> []
 779        FloatArg  -> storeFloat  1 r_offW
 780        DoubleArg -> storeDouble 1 r_offW
 781        LongArg   -> storeLong   3 r_offW
 782        _         -> storeWord   3 r_offW
 783
 784      pass_parameters :: [Word32]
 785      pass_parameters = concat params
 786
 787      -- vector aligned (4 word = 16 bytes) with 8 extra words of buffer space
 788      framesize = alignedTo 4 (argsize + 8)
 789
 790      ((_,_,argsize), params) = mapAccumL loadparam (3,1,2) arg_offs_n_reps
 791
 792      -- handle one argument, returning machine code and the updated state
 793      loadparam :: (Int, Int, Int) -> (Int, CgRep) ->
 794                   ((Int, Int, Int), [Word32])
 795
 796      loadparam (gpr, fpr, stack) (ofs, rep) = case rep of
 797        FloatArg | fpr <= 8  -> ( (gpr, fpr + 1, stack),  loadFloat fpr ofs )
 798        FloatArg             -> ( (gpr, fpr, stack + 1),  stackWord stack ofs )
 799
 800        DoubleArg | fpr <= 8 -> ( (gpr, fpr + 1, stack),  loadDouble fpr ofs )
 801        DoubleArg            -> ( (gpr, fpr, astack + 2), stackLong astack ofs )
 802
 803        LongArg | even gpr   -> loadparam (gpr + 1, fpr, stack) (ofs, rep)
 804        LongArg | gpr <= 9   -> ( (gpr + 2, fpr, stack),  loadLong gpr ofs )
 805        LongArg              -> ( (gpr, fpr, astack + 2), stackLong astack ofs )
 806
 807        _ | gpr <= 10        -> ( (gpr + 1, fpr, stack),  loadWord gpr ofs )
 808        _                    -> ( (gpr, fpr, stack + 1),  stackWord stack ofs )
 809       where astack = alignedTo 2 stack
 810
 811      alignedTo :: Int -> Int -> Int
 812      alignedTo alignment x = case x `mod` alignment of
 813                                0 -> x
 814                                y -> x - y + alignment
 815
 816      -- convenience macros to do multiple-instruction data moves
 817      stackWord dst src = loadWord 0 src ++ storeWordC 0 dst
 818      stackLong dst src = stackWord dst src ++ stackWord (dst + 1) (src + 1)
 819      loadLong  dst src = loadWord dst src ++ loadWord (dst + 1) (src + 1)
 820      storeLong dst src = storeWord dst src ++ storeWord (dst + 1) (src + 1)
 821
 822      -- load data from the Haskell stack (relative to r31)
 823      loadFloat   = loadstoreInstr 0xc01f0000 -- lfs fpr, ofs(r31)
 824      loadDouble  = loadstoreInstr 0xc81f0000 -- lfd fpr, ofs(r31)
 825      loadWord    = loadstoreInstr 0x801f0000 -- lwz gpr, ofs(r31)
 826
 827      -- store data to the Haskell stack (relative to r31)
 828      storeFloat  = loadstoreInstr 0xd01f0000 -- stfs fpr, ofs(r31)
 829      storeDouble = loadstoreInstr 0xd81f0000 -- stfd fpr, ofs(r31)
 830      storeWord   = loadstoreInstr 0x901f0000 -- stw gpr, ofs(r31)
 831
 832      -- store data to the C stack (relative to r1)
 833      storeWordC  = loadstoreInstr 0x90010000 -- stw gpr, ofs(r1)
 834
 835      -- machine code building blocks
 836      loadstoreInstr :: Word32 -> Int -> Int -> [Word32]
 837      loadstoreInstr code reg ofs = [ code .|. register reg .|. offset ofs ]
 838
 839      register :: Int -> Word32
 840      register reg = fromIntegral reg `shiftL` 21
 841
 842      offset :: Int -> Word32
 843      offset ofs   = fromIntegral (ofs * 4) .&. 0xFFFF
 844
 845      -- speaks for itself
 846      w32_to_w8s_bigEndian :: Word32 -> [Word8]
 847      w32_to_w8s_bigEndian w =  [fromIntegral (0xFF .&. (w `shiftR` 24)),
 848                                 fromIntegral (0xFF .&. (w `shiftR` 16)),
 849                                 fromIntegral (0xFF .&. (w `shiftR` 8)),
 850                                 fromIntegral (0xFF .&. w)]
 851
 852 #else
 853
 854    = throwDyn (InstallationError "foreign import is not implemented for GHCi on this platform.")
 855
 856 #endif
 857
 858 #if i386_TARGET_ARCH || x86_64_TARGET_ARCH
 859 lit32 :: Int -> [Word8]
 860 lit32 i = let w32 = (fromIntegral i) :: Word32
 861           in  map (fromIntegral . ( .&. 0xFF))
 862                   [w32, w32 `shiftR` 8,
 863                    w32 `shiftR` 16,  w32 `shiftR` 24]
 864 #endif
 865 \end{code}
 866