compiler/ghci/ByteCodeFFI.lhs

   1 %
   2 % (c) The University of Glasgow 2001-2006
   3 %
   4
   5 ByteCodeGen: Generate machine-code sequences for foreign import
   6
   7 \begin{code}
   8 module ByteCodeFFI ( mkMarshalCode, moan64, newExec ) where
   9
  10 #include "HsVersions.h"
  11
  12 import Outputable
  13 import SMRep
  14 import ForeignCall
  15 import Panic
  16
  17 -- DON'T remove apparently unused imports here ..
  18 -- there is ifdeffery below
  19 import Control.Exception ( throwDyn )
  20 import Data.Bits        ( Bits(..), shiftR, shiftL )
  21 import Data.List        ( mapAccumL )
  22
  23 import Data.Word        ( Word8, Word32 )
  24 import Foreign          ( Ptr, FunPtr, castPtrToFunPtr,
  25                           Storable, sizeOf, pokeArray )
  26 import Foreign.C        ( CUInt )
  27 import System.IO.Unsafe ( unsafePerformIO )
  28 import System.IO        ( hPutStrLn, stderr )
  29 -- import Debug.Trace   ( trace )
  30 \end{code}
  31
  32 %************************************************************************
  33 %*                                                                      *
  34 \subsection{The platform-dependent marshall-code-generator.}
  35 %*                                                                      *
  36 %************************************************************************
  37
  38 \begin{code}
  39
  40 moan64 :: String -> SDoc -> a
  41 moan64 msg pp_rep
  42    = unsafePerformIO (
  43         hPutStrLn stderr (
  44         "\nGHCi's bytecode generation machinery can't handle 64-bit\n" ++
  45         "code properly yet.  You can work around this for the time being\n" ++
  46         "by compiling this module and all those it imports to object code,\n" ++
  47         "and re-starting your GHCi session.  The panic below contains information,\n" ++
  48         "intended for the GHC implementors, about the exact place where GHC gave up.\n"
  49         )
  50      )
  51      `seq`
  52      pprPanic msg pp_rep
  53
  54
  55 -- For sparc_TARGET_ARCH, i386_TARGET_ARCH, etc.
  56 #include "nativeGen/NCG.h"
  57
  58 {-
  59 Make a piece of code which expects to see the Haskell stack
  60 looking like this.  It is given a pointer to the lowest word in
  61 the stack -- presumably the tag of the placeholder.
  62
  63                   <arg_n>
  64                   ...
  65                   <arg_1>
  66                   Addr# address_of_C_fn
  67                   <placeholder-for-result#> (must be an unboxed type)
  68
  69 We cope with both ccall and stdcall for the C fn.  However, this code
  70 itself expects only to be called using the ccall convention -- that is,
  71 we don't clear our own (single) arg off the C stack.
  72 -}
  73 mkMarshalCode :: CCallConv
  74               -> (Int, CgRep) -> Int -> [(Int, CgRep)]
  75               -> IO (FunPtr ())
  76 mkMarshalCode cconv (r_offW, r_rep) addr_offW arg_offs_n_reps
  77    = let bytes = mkMarshalCode_wrk cconv (r_offW, r_rep)
  78                                    addr_offW arg_offs_n_reps
  79      in  newExec bytes
  80
  81 newExec :: Storable a => [a] -> IO (FunPtr ())
  82 newExec code
  83    = do ptr <- _allocateExec (fromIntegral $ codeSize undefined code)
  84         pokeArray ptr code
  85         return (castPtrToFunPtr ptr)
  86    where
  87    codeSize :: Storable a => a -> [a] -> Int
  88    codeSize dummy array = sizeOf(dummy) * length array
  89
  90 foreign import ccall unsafe "allocateExec"
  91   _allocateExec :: CUInt -> IO (Ptr a)
  92
  93 mkMarshalCode_wrk :: CCallConv
  94                   -> (Int, CgRep) -> Int -> [(Int, CgRep)]
  95                   -> [Word8]
  96
  97 mkMarshalCode_wrk cconv (r_offW, r_rep) addr_offW arg_offs_n_reps
  98
  99 #if i386_TARGET_ARCH
 100
 101    = let -- Don't change this without first consulting Intel Corp :-)
 102          bytes_per_word = 4
 103
 104          offsets_to_pushW
 105             = concat
 106               [   -- reversed because x86 is little-endian
 107                   reverse [a_offW .. a_offW + cgRepSizeW a_rep - 1]
 108
 109                 -- reversed because args are pushed L -> R onto C stack
 110                 | (a_offW, a_rep) <- reverse arg_offs_n_reps
 111               ]
 112
 113          arguments_size = bytes_per_word * length offsets_to_pushW
 114 #if darwin_TARGET_OS
 115              -- Darwin: align stack frame size to a multiple of 16 bytes
 116          stack_frame_size = (arguments_size + 15) .&. complement 15
 117          stack_frame_pad = stack_frame_size - arguments_size
 118 #else
 119          stack_frame_size = arguments_size
 120 #endif
 121
 122          -- some helpers to assemble x86 insns.
 123          movl_offespmem_esi offB        -- movl   offB(%esp), %esi
 124             = [0x8B, 0xB4, 0x24] ++ lit32 offB
 125          movl_offesimem_ecx offB        -- movl   offB(%esi), %ecx
 126             = [0x8B, 0x8E] ++ lit32 offB
 127          save_regs                      -- pushl  all intregs except %esp
 128             = [0x50, 0x53, 0x51, 0x52, 0x56, 0x57, 0x55]
 129          restore_regs                   -- popl   ditto
 130             = [0x5D, 0x5F, 0x5E, 0x5A, 0x59, 0x5B, 0x58]
 131          pushl_ecx                      -- pushl  %ecx
 132             = [0x51]
 133          call_star_ecx                  -- call   * %ecx
 134             = [0xFF, 0xD1]
 135          add_lit_esp lit                -- addl   $lit, %esp
 136             = [0x81, 0xC4] ++ lit32 lit
 137          movl_eax_offesimem offB        -- movl   %eax, offB(%esi)
 138             = [0x89, 0x86] ++ lit32 offB
 139          movl_edx_offesimem offB        -- movl   %edx, offB(%esi)
 140             = [0x89, 0x96] ++ lit32 offB
 141          ret                            -- ret
 142             = [0xC3]
 143          fstpl_offesimem offB           -- fstpl   offB(%esi)
 144             = [0xDD, 0x9E] ++ lit32 offB
 145          fstps_offesimem offB           -- fstps   offB(%esi)
 146             = [0xD9, 0x9E] ++ lit32 offB
 147          {-
 148              2 0000 8BB42478    movl    0x12345678(%esp), %esi
 149              2      563412
 150              3 0007 8B8E7856    movl    0x12345678(%esi), %ecx
 151              3      3412
 152              4
 153              5 000d 50535152    pushl %eax ; pushl %ebx ; pushl %ecx ; pushl  %edx
 154              6 0011 565755      pushl %esi ; pushl %edi ; pushl %ebp
 155              7
 156              8 0014 5D5F5E      popl %ebp ; popl %edi ; popl %esi
 157              9 0017 5A595B58    popl %edx ; popl %ecx ; popl %ebx ; popl %eax
 158             10
 159             11 001b 51          pushl %ecx
 160             12 001c FFD1        call * %ecx
 161             13
 162             14 001e 81C47856    addl $0x12345678, %esp
 163             14      3412
 164             15 0024 89867856    movl %eax, 0x12345678(%esi)
 165             15      3412
 166             16 002a 89967856    movl %edx, 0x12345678(%esi)
 167             16      3412
 168             17
 169             18 0030 DD967856    fstl    0x12345678(%esi)
 170             18      3412
 171             19 0036 DD9E7856    fstpl   0x12345678(%esi)
 172             19      3412
 173             20 003c D9967856    fsts    0x12345678(%esi)
 174             20      3412
 175             21 0042 D99E7856    fstps   0x12345678(%esi)
 176             18
 177             19 0030 C3          ret
 178             20
 179
 180          -}
 181
 182      in
 183      --trace (show (map fst arg_offs_n_reps))
 184      (
 185      {- On entry, top of C stack 0(%esp) is the RA and 4(%esp) is
 186         arg passed from the interpreter.
 187
 188         Push all callee saved regs.  Push all of them anyway ...
 189            pushl       %eax
 190            pushl       %ebx
 191            pushl       %ecx
 192            pushl       %edx
 193            pushl       %esi
 194            pushl       %edi
 195            pushl       %ebp
 196      -}
 197      save_regs
 198
 199      {- Now 28+0(%esp) is RA and 28+4(%esp) is the arg (the H stack ptr).
 200         We'll use %esi as a temporary to point at the H stack, and
 201         %ecx as a temporary to copy via.
 202
 203            movl        28+4(%esp), %esi
 204      -}
 205      ++ movl_offespmem_esi 32
 206
 207 #if darwin_TARGET_OS
 208      {- On Darwin, add some padding so that the stack stays aligned. -}
 209      ++ (if stack_frame_pad /= 0
 210             then add_lit_esp (-stack_frame_pad)
 211             else [])
 212 #endif
 213
 214      {- For each arg in args_offs_n_reps, examine the associated
 215         CgRep to determine how many words there are.  This gives a
 216         bunch of offsets on the H stack to copy to the C stack:
 217
 218            movl        off1(%esi), %ecx
 219            pushl       %ecx
 220      -}
 221      ++ concatMap (\offW -> movl_offesimem_ecx (bytes_per_word * offW)
 222                             ++ pushl_ecx)
 223                   offsets_to_pushW
 224
 225      {- Get the addr to call into %ecx, bearing in mind that there's
 226         an Addr# tag at the indicated location, and do the call:
 227
 228            movl        4*(1 /*tag*/ +addr_offW)(%esi), %ecx
 229            call        * %ecx
 230      -}
 231      ++ movl_offesimem_ecx (bytes_per_word * addr_offW)
 232      ++ call_star_ecx
 233
 234      {- Nuke the args just pushed and re-establish %esi at the
 235         H-stack ptr:
 236
 237            addl        $4*number_of_args_pushed, %esp (ccall only)
 238            movl        28+4(%esp), %esi
 239      -}
 240      ++ (if   cconv /= StdCallConv
 241          then add_lit_esp stack_frame_size
 242          else [])
 243      ++ movl_offespmem_esi 32
 244
 245      {- Depending on what the return type is, get the result
 246         from %eax or %edx:%eax or %st(0).
 247
 248            movl        %eax, 4(%esi)        -- assuming tagged result
 249         or
 250            movl        %edx, 4(%esi)
 251            movl        %eax, 8(%esi)
 252         or
 253            fstpl       4(%esi)
 254         or
 255            fstps       4(%esi)
 256      -}
 257      ++ let i32 = movl_eax_offesimem 0
 258             i64 = movl_eax_offesimem 0 ++ movl_edx_offesimem 4
 259             f32 = fstps_offesimem 0
 260             f64 = fstpl_offesimem 0
 261         in
 262         case r_rep of
 263            NonPtrArg -> i32
 264            DoubleArg -> f64
 265            FloatArg  -> f32
 266            LongArg   -> i64
 267            VoidArg   -> []
 268            other     -> moan64 "ByteCodeFFI.mkMarshalCode_wrk(x86)"
 269                                (ppr r_rep)
 270
 271      {- Restore all the pushed regs and go home.
 272
 273            pushl        %ebp
 274            pushl        %edi
 275            pushl        %esi
 276            pushl        %edx
 277            pushl        %ecx
 278            pushl        %ebx
 279            pushl        %eax
 280
 281            ret
 282      -}
 283      ++ restore_regs
 284      ++ ret
 285      )
 286
 287 #elif x86_64_TARGET_ARCH
 288
 289    =
 290      -- the address of the H stack is in %rdi.  We need to move it out, so
 291      -- we can use %rdi as an arg reg for the following call:
 292     pushq_rbp ++
 293     movq_rdi_rbp ++
 294
 295      -- ####### load / push the args
 296
 297      let
 298         (stack_args, fregs_unused, reg_loads) =
 299            load_arg_regs arg_offs_n_reps int_loads float_loads []
 300
 301         tot_arg_size = bytes_per_word * length stack_args
 302
 303         -- On entry to the called function, %rsp should be aligned
 304         -- on a 16-byte boundary +8 (i.e. the first stack arg after
 305         -- the return address is 16-byte aligned).  In STG land
 306         -- %rsp is kept 16-byte aligned (see StgCRun.c), so we just
 307         -- need to make sure we push a multiple of 16-bytes of args,
 308         -- plus the return address, to get the correct alignment.
 309         (real_size, adjust_rsp)
 310           | tot_arg_size `rem` 16 == 0    = (tot_arg_size, [])
 311           | otherwise                     = (tot_arg_size + 8, subq_lit_rsp 8)
 312
 313         (stack_pushes, stack_words) =
 314                 push_args stack_args [] 0
 315
 316         -- we need to know the number of SSE regs used in the call, see later
 317         n_sse_regs_used = length float_loads - length fregs_unused
 318      in
 319         concat reg_loads
 320      ++ adjust_rsp
 321      ++ concat stack_pushes -- push in reverse order
 322
 323      -- ####### make the call
 324
 325         -- use %r10 to make the call, because we don't have to save it.
 326         --      movq 8*addr_offW(%rbp), %r10
 327      ++ movq_rbpoff_r10 (bytes_per_word * addr_offW)
 328
 329         -- The x86_64 ABI requires us to set %al to the number of SSE
 330         -- registers that contain arguments, if the called routine
 331         -- is a varargs function.  We don't know whether it's a
 332         -- varargs function or not, so we have to assume it is.
 333         --
 334         -- It's not safe to omit this assignment, even if the number
 335         -- of SSE regs in use is zero.  If %al is larger than 8
 336         -- on entry to a varargs function, seg faults ensue.
 337      ++ movq_lit_rax n_sse_regs_used
 338      ++ call_star_r10
 339
 340         -- pop the args from the stack, only in ccall mode
 341         -- (in stdcall the callee does it).
 342      ++ (if   cconv /= StdCallConv
 343          then addq_lit_rsp real_size
 344          else [])
 345
 346      -- ####### place the result in the right place and return
 347
 348      ++ assign_result
 349      ++ popq_rbp
 350      ++ ret
 351
 352   where
 353      bytes_per_word = 8
 354
 355      -- int arg regs: rdi,rsi,rdx,rcx,r8,r9
 356      -- flt arg regs: xmm0..xmm7
 357      int_loads   = [ movq_rbpoff_rdi, movq_rbpoff_rsi, movq_rbpoff_rdx,
 358                      movq_rbpoff_rcx, movq_rbpoff_r8,  movq_rbpoff_r9 ]
 359      float_loads = [ 0..7 ]
 360
 361      load_arg_regs args [] [] code     =  (args, [], code)
 362      load_arg_regs [] iregs fregs code =  ([], fregs, code)
 363      load_arg_regs ((off,rep):args) iregs fregs code
 364         | FloatArg  <- rep =
 365             case fregs of
 366               [] -> push_this_arg
 367               n : frest ->
 368                 load_arg_regs args iregs frest
 369                       (mov_f32_rbpoff_xmm n (bytes_per_word * off) : code)
 370         | DoubleArg <- rep =
 371             case fregs of
 372               [] -> push_this_arg
 373               n : frest ->
 374                 load_arg_regs args iregs frest
 375                        (mov_f64_rbpoff_xmm n (bytes_per_word * off) : code)
 376         | (mov_reg:irest) <- iregs =
 377                 load_arg_regs args irest fregs (mov_reg (bytes_per_word * off) : code)
 378         | otherwise =
 379                  push_this_arg
 380         where
 381            push_this_arg = ((off,rep):args',fregs', code')
 382                 where (args',fregs',code') = load_arg_regs args iregs fregs code
 383
 384      push_args [] code pushed_words = (code, pushed_words)
 385      push_args ((off,rep):args) code pushed_words
 386         | FloatArg  <- rep =
 387                 push_args args (push_f32_rbpoff (bytes_per_word * off) : code)
 388                         (pushed_words+1)
 389         | DoubleArg <- rep =
 390                 push_args args (push_f64_rbpoff (bytes_per_word * off) : code)
 391                         (pushed_words+1)
 392         | otherwise =
 393                 push_args args (pushq_rbpoff (bytes_per_word * off) : code)
 394                         (pushed_words+1)
 395
 396
 397      assign_result =
 398         case r_rep of
 399           DoubleArg -> f64
 400           FloatArg  -> f32
 401           VoidArg   -> []
 402           _other    -> i64
 403         where
 404           i64 = movq_rax_rbpoff 0
 405           f32 = mov_f32_xmm0_rbpoff 0
 406           f64 = mov_f64_xmm0_rbpoff 0
 407
 408 -- ######### x86_64 machine code:
 409
 410 --   0:   48 89 fd                mov    %rdi,%rbp
 411 --   3:   48 8b bd 78 56 34 12    mov    0x12345678(%rbp),%rdi
 412 --   a:   48 8b b5 78 56 34 12    mov    0x12345678(%rbp),%rsi
 413 --  11:   48 8b 95 78 56 34 12    mov    0x12345678(%rbp),%rdx
 414 --  18:   48 8b 8d 78 56 34 12    mov    0x12345678(%rbp),%rcx
 415 --  1f:   4c 8b 85 78 56 34 12    mov    0x12345678(%rbp),%r8
 416 --  26:   4c 8b 8d 78 56 34 12    mov    0x12345678(%rbp),%r9
 417 --  2d:   4c 8b 95 78 56 34 12    mov    0x12345678(%rbp),%r10
 418 --  34:   48 c7 c0 78 56 34 12    mov    $0x12345678,%rax
 419 --  3b:   48 89 85 78 56 34 12    mov    %rax,0x12345678(%rbp)
 420 --  42:   f3 0f 10 bd 78 56 34 12 movss  0x12345678(%rbp),%xmm7
 421 --  4a:   f2 0f 10 9d 78 56 34 12 movsd  0x12345678(%rbp),%xmm3
 422 --  52:   f2 44 0f 10 85 78 56 34 12 movsd  0x12345678(%rbp),%xmm8
 423 --  5b:   f3 0f 11 9d 78 56 34 12 movss  %xmm3,0x12345678(%rbp)
 424 --  63:   f2 0f 11 9d 78 56 34 12 movsd  %xmm3,0x12345678(%rbp)
 425 --  6b:   f2 44 0f 11 85 78 56 34 12 movsd  %xmm8,0x12345678(%rbp)
 426 --  74:   ff b5 78 56 34 12       pushq  0x12345678(%rbp)
 427 --  7a:   f3 44 0f 11 04 24       movss  %xmm8,(%rsp)
 428 --  80:   f2 44 0f 11 04 24       movsd  %xmm8,(%rsp)
 429 --  86:   48 81 ec 78 56 34 12    sub    $0x12345678,%rsp
 430 --  8d:   48 81 c4 78 56 34 12    add    $0x12345678,%rsp
 431 --  94:   41 ff d2                callq  *%r10
 432 --  97:   55                      push   %rbp
 433 --  98:   5d                      pop    %rbp
 434 --  99:   c3                      retq
 435
 436      movq_rdi_rbp         = [0x48,0x89,0xfd]
 437      movq_rbpoff_rdi  off = [0x48, 0x8b, 0xbd] ++ lit32 off
 438      movq_rbpoff_rsi  off = [0x48, 0x8b, 0xb5] ++ lit32 off
 439      movq_rbpoff_rdx  off = [0x48, 0x8b, 0x95] ++ lit32 off
 440      movq_rbpoff_rcx  off = [0x48, 0x8b, 0x8d] ++ lit32 off
 441      movq_rbpoff_r8   off = [0x4c, 0x8b, 0x85] ++ lit32 off
 442      movq_rbpoff_r9   off = [0x4c, 0x8b, 0x8d] ++ lit32 off
 443      movq_rbpoff_r10  off = [0x4c, 0x8b, 0x95] ++ lit32 off
 444      movq_lit_rax     lit = [0x48, 0xc7, 0xc0] ++ lit32 lit
 445      movq_rax_rbpoff  off = [0x48, 0x89, 0x85] ++ lit32 off
 446      mov_f32_rbpoff_xmm n off
 447          = 0xf3 : if n >= 8 then 0x44 : rest else rest
 448          where rest = [0x0f, 0x10, 0x85 + (n.&.7)`shiftL`3] ++ lit32 off
 449      mov_f64_rbpoff_xmm n off
 450          = 0xf2 : if n >= 8 then 0x44 : rest else rest
 451          where rest = [0x0f, 0x10, 0x85 + (n.&.7)`shiftL`3] ++ lit32 off
 452      mov_f32_xmm0_rbpoff  off = [0xf3, 0x0f, 0x11, 0x85] ++ lit32 off
 453      mov_f64_xmm0_rbpoff  off = [0xf2, 0x0f, 0x11, 0x85] ++ lit32 off
 454      pushq_rbpoff     off = [0xff, 0xb5] ++ lit32 off
 455      push_f32_rbpoff  off =
 456         subq_lit_rsp 8 ++                        -- subq $8, %rsp
 457         mov_f32_rbpoff_xmm 8 off ++              -- movss off(%rbp), %xmm8
 458         [0xf3, 0x44, 0x0f, 0x11, 0x04, 0x24]     -- movss %xmm8, (%rsp)
 459      push_f64_rbpoff  off =
 460         subq_lit_rsp 8 ++                        -- subq $8, %rsp
 461         mov_f64_rbpoff_xmm 8 off ++              -- movsd off(%rbp), %xmm8
 462         [0xf2, 0x44, 0x0f, 0x11, 0x04, 0x24]     -- movsd %xmm8, (%rsp)
 463      subq_lit_rsp     lit = [0x48, 0x81, 0xec] ++ lit32 lit
 464      addq_lit_rsp     lit = [0x48, 0x81, 0xc4] ++ lit32 lit
 465      call_star_r10 = [0x41,0xff,0xd2]
 466      ret = [0xc3]
 467      pushq_rbp = [0x55]
 468      popq_rbp = [0x5d]
 469
 470 #elif sparc_TARGET_ARCH
 471
 472    = let -- At least for sparc V8
 473          bytes_per_word = 4
 474
 475          -- speaks for itself
 476          w32_to_w8s_bigEndian :: Word32 -> [Word8]
 477          w32_to_w8s_bigEndian w
 478             =  [fromIntegral (0xFF .&. (w `shiftR` 24)),
 479                 fromIntegral (0xFF .&. (w `shiftR` 16)),
 480                 fromIntegral (0xFF .&. (w `shiftR` 8)),
 481                 fromIntegral (0xFF .&. w)]
 482
 483          offsets_to_pushW
 484             = concat
 485               [  [a_offW .. a_offW + cgRepSizeW a_rep - 1]
 486
 487                 | (a_offW, a_rep) <- arg_offs_n_reps
 488               ]
 489
 490          total_argWs    = length offsets_to_pushW
 491          argWs_on_stack = if total_argWs > 6 then total_argWs - 6
 492                                              else 0
 493
 494          -- The stack pointer must be kept 8-byte aligned, which means
 495          -- we need to calculate this quantity too
 496          argWs_on_stack_ROUNDED_UP
 497             | odd argWs_on_stack = 1 + argWs_on_stack
 498             | otherwise          = argWs_on_stack
 499
 500          -- some helpers to assemble sparc insns.
 501          -- REGS
 502          iReg, oReg, gReg, fReg :: Int -> Word32
 503          iReg = fromIntegral . (+ 24)
 504          oReg = fromIntegral . (+ 8)
 505          gReg = fromIntegral . (+ 0)
 506          fReg = fromIntegral
 507
 508          sp = oReg 6
 509          i0 = iReg 0
 510          i7 = iReg 7
 511          o0 = oReg 0
 512          o1 = oReg 1
 513          o7 = oReg 7
 514          g0 = gReg 0
 515          g1 = gReg 1
 516          f0 = fReg 0
 517          f1 = fReg 1
 518
 519          -- INSN templates
 520          insn_r_r_i :: Word32 -> Word32 -> Word32 -> Int -> Word32
 521          insn_r_r_i op3 rs1 rd imm13
 522             = (3 `shiftL` 30)
 523               .|. (rs1 `shiftL` 25)
 524               .|. (op3 `shiftL` 19)
 525               .|. (rd `shiftL` 14)
 526               .|. (1 `shiftL` 13)
 527               .|. mkSimm13 imm13
 528
 529          insn_r_i_r :: Word32 -> Word32 -> Int -> Word32 -> Word32
 530          insn_r_i_r op3 rs1 imm13 rd
 531             = (2 `shiftL` 30)
 532               .|. (rd `shiftL` 25)
 533               .|. (op3 `shiftL` 19)
 534               .|. (rs1 `shiftL` 14)
 535               .|. (1 `shiftL` 13)
 536               .|. mkSimm13 imm13
 537
 538          mkSimm13 :: Int -> Word32
 539          mkSimm13 imm13
 540             = let imm13w = (fromIntegral imm13) :: Word32
 541               in  imm13w .&. 0x1FFF
 542
 543          -- REAL (non-synthetic) insns
 544          -- or %rs1, %rs2, %rd
 545          mkOR :: Word32 -> Word32 -> Word32 -> Word32
 546          mkOR rs1 rs2 rd
 547             = (2 `shiftL` 30)
 548               .|. (rd `shiftL` 25)
 549               .|. (op3_OR `shiftL` 19)
 550               .|. (rs1 `shiftL` 14)
 551               .|. (0 `shiftL` 13)
 552               .|. rs2
 553               where op3_OR = 2 :: Word32
 554
 555          -- ld(int)   [%rs + imm13], %rd
 556          mkLD rs1 imm13 rd = insn_r_r_i 0x00{-op3_LD-} rd rs1 imm13
 557
 558          -- st(int)   %rs, [%rd + imm13]
 559          mkST   = insn_r_r_i 0x04 -- op3_ST
 560
 561          -- st(float) %rs, [%rd + imm13]
 562          mkSTF  = insn_r_r_i 0x24 -- op3_STF
 563
 564          -- jmpl     %rs + imm13, %rd
 565          mkJMPL = insn_r_i_r 0x38 -- op3_JMPL
 566
 567          -- save     %rs + imm13, %rd
 568          mkSAVE = insn_r_i_r 0x3C -- op3_SAVE
 569
 570          -- restore  %rs + imm13, %rd
 571          mkRESTORE = insn_r_i_r 0x3D -- op3_RESTORE
 572
 573          -- SYNTHETIC insns
 574          mkNOP             = mkOR g0 g0 g0
 575          mkCALL reg        = mkJMPL reg 0 o7
 576          mkRET             = mkJMPL i7 8 g0
 577          mkRESTORE_TRIVIAL = mkRESTORE g0 0 g0
 578      in
 579      --trace (show (map fst arg_offs_n_reps))
 580      concatMap w32_to_w8s_bigEndian (
 581
 582      {- On entry, %o0 is the arg passed from the interpreter.  After
 583         the initial save insn, it will be in %i0.  Studying the sparc
 584         docs one would have thought that the minimum frame size is 92
 585         bytes, but gcc always uses at least 112, and indeed there are
 586         segfaults a-plenty with 92.  So I use 112 here as well.  I
 587         don't understand why, tho.
 588      -}
 589      [mkSAVE sp (- ({-92-}112 + 4*argWs_on_stack_ROUNDED_UP)) sp]
 590
 591      {- For each arg in args_offs_n_reps, examine the associated
 592         CgRep to determine how many words there are.  This gives a
 593         bunch of offsets on the H stack.  Move the first 6 words into
 594         %o0 .. %o5 and the rest on the stack, starting at [%sp+92].
 595         Use %g1 as a temp.
 596      -}
 597      ++ let doArgW (offW, wordNo)
 598               | wordNo < 6
 599               = [mkLD i0 (bytes_per_word * offW) (oReg wordNo)]
 600               | otherwise
 601               = [mkLD i0 (bytes_per_word * offW) g1,
 602                  mkST g1 sp (92 + bytes_per_word * (wordNo - 6))]
 603         in
 604             concatMap doArgW (zip offsets_to_pushW [0 ..])
 605
 606      {- Get the addr to call into %g1, bearing in mind that there's
 607         an Addr# tag at the indicated location, and do the call:
 608
 609            ld     [4*(1 /*tag*/ +addr_offW) + %i0], %g1
 610            call   %g1
 611      -}
 612      ++ [mkLD i0 (bytes_per_word * addr_offW) g1,
 613          mkCALL g1,
 614          mkNOP]
 615
 616      {- Depending on what the return type is, get the result
 617         from %o0 or %o1:%o0 or %f0 or %f1:%f0.
 618
 619            st          %o0, [%i0 + 4]        -- 32 bit int
 620         or
 621            st          %o0, [%i0 + 4]        -- 64 bit int
 622            st          %o1, [%i0 + 8]        -- or the other way round?
 623         or
 624            st          %f0, [%i0 + 4]        -- 32 bit float
 625         or
 626            st          %f0, [%i0 + 4]        -- 64 bit float
 627            st          %f1, [%i0 + 8]        -- or the other way round?
 628
 629      -}
 630      ++ let i32 = [mkST o0 i0 0]
 631             i64 = [mkST o0 i0 0, mkST o1 i0 4]
 632             f32 = [mkSTF f0 i0 0]
 633             f64 = [mkSTF f0 i0 0, mkSTF f1 i0 4]
 634         in
 635             case r_rep of
 636                NonPtrArg -> i32
 637                DoubleArg -> f64
 638                FloatArg  -> f32
 639                VoidArg   -> []
 640                other     -> moan64 "ByteCodeFFI.mkMarshalCode_wrk(sparc)"
 641                                    (ppr r_rep)
 642
 643      ++ [mkRET,
 644          mkRESTORE_TRIVIAL]  -- this is in the delay slot of the RET
 645      )
 646 #elif powerpc_TARGET_ARCH && darwin_TARGET_OS
 647
 648    = let
 649          bytes_per_word = 4
 650
 651          -- speaks for itself
 652          w32_to_w8s_bigEndian :: Word32 -> [Word8]
 653          w32_to_w8s_bigEndian w
 654             =  [fromIntegral (0xFF .&. (w `shiftR` 24)),
 655                 fromIntegral (0xFF .&. (w `shiftR` 16)),
 656                 fromIntegral (0xFF .&. (w `shiftR` 8)),
 657                 fromIntegral (0xFF .&. w)]
 658
 659          -- addr and result bits offsetsW
 660          a_off = addr_offW * bytes_per_word
 661          result_off  = r_offW * bytes_per_word
 662
 663          linkageArea = 24
 664          parameterArea = sum [ cgRepSizeW a_rep * bytes_per_word
 665                         | (_, a_rep) <- arg_offs_n_reps ]
 666          savedRegisterArea = 4
 667          frameSize = padTo16 (linkageArea + max parameterArea 32 + savedRegisterArea)
 668          padTo16 x = case x `mod` 16 of
 669             0 -> x
 670             y -> x - y + 16
 671
 672          pass_parameters [] _ _ = []
 673          pass_parameters ((a_offW, a_rep):args) nextFPR offsetW =
 674             let
 675                haskellArgOffset = a_offW * bytes_per_word
 676                offsetW' = offsetW + cgRepSizeW a_rep
 677
 678                pass_word w
 679                    | offsetW + w < 8 =
 680                       [0x801f0000    -- lwz rX, src(r31)
 681                         .|. (fromIntegral src .&. 0xFFFF)
 682                         .|. (fromIntegral (offsetW+w+3) `shiftL` 21)]
 683                    | otherwise =
 684                       [0x801f0000    -- lwz r0, src(r31)
 685                         .|. (fromIntegral src .&. 0xFFFF),
 686                        0x90010000    -- stw r0, dst(r1)
 687                         .|. (fromIntegral dst .&. 0xFFFF)]
 688                   where
 689                      src = haskellArgOffset + w*bytes_per_word
 690                      dst = linkageArea + (offsetW+w) * bytes_per_word
 691             in
 692                case a_rep of
 693                   FloatArg | nextFPR < 14 ->
 694                       (0xc01f0000    -- lfs fX, haskellArgOffset(r31)
 695                         .|. (fromIntegral haskellArgOffset .&. 0xFFFF)
 696                         .|. (fromIntegral nextFPR `shiftL` 21))
 697                       : pass_parameters args (nextFPR+1) offsetW'
 698                   DoubleArg | nextFPR < 14 ->
 699                       (0xc81f0000    -- lfd fX, haskellArgOffset(r31)
 700                         .|. (fromIntegral haskellArgOffset .&. 0xFFFF)
 701                         .|. (fromIntegral nextFPR `shiftL` 21))
 702                       : pass_parameters args (nextFPR+1) offsetW'
 703                   _ ->
 704                       concatMap pass_word [0 .. cgRepSizeW a_rep - 1]
 705                       ++ pass_parameters args nextFPR offsetW'
 706
 707          gather_result = case r_rep of
 708             VoidArg -> []
 709             FloatArg ->
 710                [0xd03f0000 .|. (fromIntegral result_off .&. 0xFFFF)]
 711                -- stfs f1, result_off(r31)
 712             DoubleArg ->
 713                [0xd83f0000 .|. (fromIntegral result_off .&. 0xFFFF)]
 714                -- stfd f1, result_off(r31)
 715             _ | cgRepSizeW r_rep == 2 ->
 716                [0x907f0000 .|. (fromIntegral result_off .&. 0xFFFF),
 717                 0x909f0000 .|. (fromIntegral (result_off+4) .&. 0xFFFF)]
 718                -- stw r3, result_off(r31)
 719                -- stw r4, result_off+4(r31)
 720             _ | cgRepSizeW r_rep == 1 ->
 721                [0x907f0000 .|. (fromIntegral result_off .&. 0xFFFF)]
 722                -- stw r3, result_off(r31)
 723      in
 724          concatMap w32_to_w8s_bigEndian $ [
 725             0x7c0802a6,         -- mflr r0
 726             0x93e1fffc,         -- stw r31,-4(r1)
 727             0x90010008,         -- stw r0,8(r1)
 728             0x94210000 .|. (fromIntegral (-frameSize) .&. 0xFFFF),
 729                                 -- stwu r1, -frameSize(r1)
 730             0x7c7f1b78          -- mr r31, r3
 731          ] ++ pass_parameters arg_offs_n_reps 1 0 ++ [
 732             0x819f0000 .|. (fromIntegral a_off .&. 0xFFFF),
 733                                 -- lwz r12, a_off(r31)
 734             0x7d8903a6,         -- mtctr r12
 735             0x4e800421          -- bctrl
 736          ] ++ gather_result ++ [
 737             0x80210000,         -- lwz r1, 0(r1)
 738             0x83e1fffc,         -- lwz r31, -4(r1)
 739             0x80010008,         -- lwz r0, 8(r1)
 740             0x7c0803a6,         -- mtlr r0
 741             0x4e800020          -- blr
 742          ]
 743
 744 #elif powerpc_TARGET_ARCH && linux_TARGET_OS
 745
 746    -- All offsets here are measured in Words (not bytes).  This includes
 747    -- arguments to the load/store machine code generators, alignment numbers
 748    -- and the final 'framesize' among others.
 749
 750    = concatMap w32_to_w8s_bigEndian $ [
 751             0x7c0802a6,                         -- mflr r0
 752             0x93e1fffc,                         -- stw r31,-4(r1)
 753             0x90010008,                         -- stw r0,8(r1)
 754             0x94210000 .|. offset (-framesize), -- stwu r1, -frameSize(r1)
 755             0x7c7f1b78                          -- mr r31, r3
 756             ] ++ pass_parameters ++             -- pass the parameters
 757             loadWord 12 addr_offW ++ [          -- lwz r12, a_off(r31)
 758             0x7d8903a6,                         -- mtctr r12
 759             0x4e800421                          -- bctrl
 760             ] ++ gather_result ++ [             -- save the return value
 761             0x80210000,                         -- lwz r1, 0(r1)
 762             0x83e1fffc,                         -- lwz r31, -4(r1)
 763             0x80010008,                         -- lwz r0, 8(r1)
 764             0x7c0803a6,                         -- mtlr r0
 765             0x4e800020                          -- blr
 766          ]
 767
 768    where
 769      gather_result :: [Word32]
 770      gather_result = case r_rep of
 771        VoidArg   -> []
 772        FloatArg  -> storeFloat  1 r_offW
 773        DoubleArg -> storeDouble 1 r_offW
 774        LongArg   -> storeLong   3 r_offW
 775        _         -> storeWord   3 r_offW
 776
 777      pass_parameters :: [Word32]
 778      pass_parameters = concat params
 779
 780      -- vector aligned (4 word = 16 bytes) with 8 extra words of buffer space
 781      framesize = alignedTo 4 (argsize + 8)
 782
 783      ((_,_,argsize), params) = mapAccumL loadparam (3,1,2) arg_offs_n_reps
 784
 785      -- handle one argument, returning machine code and the updated state
 786      loadparam :: (Int, Int, Int) -> (Int, CgRep) ->
 787                   ((Int, Int, Int), [Word32])
 788
 789      loadparam (gpr, fpr, stack) (ofs, rep) = case rep of
 790        FloatArg | fpr <= 8  -> ( (gpr, fpr + 1, stack),  loadFloat fpr ofs )
 791        FloatArg             -> ( (gpr, fpr, stack + 1),  stackWord stack ofs )
 792
 793        DoubleArg | fpr <= 8 -> ( (gpr, fpr + 1, stack),  loadDouble fpr ofs )
 794        DoubleArg            -> ( (gpr, fpr, astack + 2), stackLong astack ofs )
 795
 796        LongArg | even gpr   -> loadparam (gpr + 1, fpr, stack) (ofs, rep)
 797        LongArg | gpr <= 9   -> ( (gpr + 2, fpr, stack),  loadLong gpr ofs )
 798        LongArg              -> ( (gpr, fpr, astack + 2), stackLong astack ofs )
 799
 800        _ | gpr <= 10        -> ( (gpr + 1, fpr, stack),  loadWord gpr ofs )
 801        _                    -> ( (gpr, fpr, stack + 1),  stackWord stack ofs )
 802       where astack = alignedTo 2 stack
 803
 804      alignedTo :: Int -> Int -> Int
 805      alignedTo alignment x = case x `mod` alignment of
 806                                0 -> x
 807                                y -> x - y + alignment
 808
 809      -- convenience macros to do multiple-instruction data moves
 810      stackWord dst src = loadWord 0 src ++ storeWordC 0 dst
 811      stackLong dst src = stackWord dst src ++ stackWord (dst + 1) (src + 1)
 812      loadLong  dst src = loadWord dst src ++ loadWord (dst + 1) (src + 1)
 813      storeLong dst src = storeWord dst src ++ storeWord (dst + 1) (src + 1)
 814
 815      -- load data from the Haskell stack (relative to r31)
 816      loadFloat   = loadstoreInstr 0xc01f0000 -- lfs fpr, ofs(r31)
 817      loadDouble  = loadstoreInstr 0xc81f0000 -- lfd fpr, ofs(r31)
 818      loadWord    = loadstoreInstr 0x801f0000 -- lwz gpr, ofs(r31)
 819
 820      -- store data to the Haskell stack (relative to r31)
 821      storeFloat  = loadstoreInstr 0xd01f0000 -- stfs fpr, ofs(r31)
 822      storeDouble = loadstoreInstr 0xd81f0000 -- stfd fpr, ofs(r31)
 823      storeWord   = loadstoreInstr 0x901f0000 -- stw gpr, ofs(r31)
 824
 825      -- store data to the C stack (relative to r1)
 826      storeWordC  = loadstoreInstr 0x90010000 -- stw gpr, ofs(r1)
 827
 828      -- machine code building blocks
 829      loadstoreInstr :: Word32 -> Int -> Int -> [Word32]
 830      loadstoreInstr code reg ofs = [ code .|. register reg .|. offset ofs ]
 831
 832      register :: Int -> Word32
 833      register reg = fromIntegral reg `shiftL` 21
 834
 835      offset :: Int -> Word32
 836      offset ofs   = fromIntegral (ofs * 4) .&. 0xFFFF
 837
 838      -- speaks for itself
 839      w32_to_w8s_bigEndian :: Word32 -> [Word8]
 840      w32_to_w8s_bigEndian w =  [fromIntegral (0xFF .&. (w `shiftR` 24)),
 841                                 fromIntegral (0xFF .&. (w `shiftR` 16)),
 842                                 fromIntegral (0xFF .&. (w `shiftR` 8)),
 843                                 fromIntegral (0xFF .&. w)]
 844
 845 #else
 846
 847    = throwDyn (InstallationError "foreign import is not implemented for GHCi on this platform.")
 848
 849 #endif
 850
 851 #if i386_TARGET_ARCH || x86_64_TARGET_ARCH
 852 lit32 :: Int -> [Word8]
 853 lit32 i = let w32 = (fromIntegral i) :: Word32
 854           in  map (fromIntegral . ( .&. 0xFF))
 855                   [w32, w32 `shiftR` 8,
 856                    w32 `shiftR` 16,  w32 `shiftR` 24]
 857 #endif
 858 \end{code}
 859