compiler/ghci/ByteCodeFFI.lhs

   1 %
   2 % (c) The University of Glasgow 2001-2006
   3 %
   4
   5 ByteCodeGen: Generate machine-code sequences for foreign import
   6
   7 \begin{code}
   8 module ByteCodeFFI ( mkMarshalCode, moan64, newExec ) where
   9
  10 #include "HsVersions.h"
  11
  12 import Outputable
  13 import SMRep
  14 import ForeignCall
  15 import Panic
  16
  17 -- DON'T remove apparently unused imports here ..
  18 -- there is ifdeffery below
  19 import Control.Exception ( throwDyn )
  20 import Data.Bits        ( Bits(..), shiftR, shiftL )
  21 import Data.List        ( mapAccumL )
  22
  23 import Data.Word        ( Word8, Word32 )
  24 import Foreign          ( Ptr, FunPtr, castPtrToFunPtr,
  25                           Storable, sizeOf, pokeArray )
  26 import Foreign.C        ( CUInt )
  27 import System.IO.Unsafe ( unsafePerformIO )
  28 import System.IO        ( hPutStrLn, stderr )
  29 -- import Debug.Trace   ( trace )
  30 \end{code}
  31
  32 %************************************************************************
  33 %*                                                                      *
  34 \subsection{The platform-dependent marshall-code-generator.}
  35 %*                                                                      *
  36 %************************************************************************
  37
  38 \begin{code}
  39
  40 moan64 :: String -> SDoc -> a
  41 moan64 msg pp_rep
  42    = unsafePerformIO (
  43         hPutStrLn stderr (
  44         "\nGHCi's bytecode generation machinery can't handle 64-bit\n" ++
  45         "code properly yet.  You can work around this for the time being\n" ++
  46         "by compiling this module and all those it imports to object code,\n" ++
  47         "and re-starting your GHCi session.  The panic below contains information,\n" ++
  48         "intended for the GHC implementors, about the exact place where GHC gave up.\n"
  49         )
  50      )
  51      `seq`
  52      pprPanic msg pp_rep
  53
  54
  55 -- For sparc_TARGET_ARCH, i386_TARGET_ARCH, etc.
  56 #include "nativeGen/NCG.h"
  57
  58 {-
  59 Make a piece of code which expects to see the Haskell stack
  60 looking like this.  It is given a pointer to the lowest word in
  61 the stack -- presumably the tag of the placeholder.
  62
  63                   <arg_n>
  64                   ...
  65                   <arg_1>
  66                   Addr# address_of_C_fn
  67                   <placeholder-for-result#> (must be an unboxed type)
  68
  69 We cope with both ccall and stdcall for the C fn.  However, this code
  70 itself expects only to be called using the ccall convention -- that is,
  71 we don't clear our own (single) arg off the C stack.
  72 -}
  73 mkMarshalCode :: CCallConv
  74               -> (Int, CgRep) -> Int -> [(Int, CgRep)]
  75               -> IO (FunPtr ())
  76 mkMarshalCode cconv (r_offW, r_rep) addr_offW arg_offs_n_reps
  77    = let bytes = mkMarshalCode_wrk cconv (r_offW, r_rep)
  78                                    addr_offW arg_offs_n_reps
  79      in  newExec bytes
  80
  81 newExec :: Storable a => [a] -> IO (FunPtr ())
  82 newExec code
  83    = do ptr <- _allocateExec (fromIntegral $ codeSize undefined code)
  84         pokeArray ptr code
  85         return (castPtrToFunPtr ptr)
  86    where
  87    codeSize :: Storable a => a -> [a] -> Int
  88    codeSize dummy array = sizeOf(dummy) * length array
  89
  90 foreign import ccall unsafe "allocateExec"
  91   _allocateExec :: CUInt -> IO (Ptr a)
  92
  93 mkMarshalCode_wrk :: CCallConv
  94                   -> (Int, CgRep) -> Int -> [(Int, CgRep)]
  95                   -> [Word8]
  96
  97 mkMarshalCode_wrk cconv (r_offW, r_rep) addr_offW arg_offs_n_reps
  98
  99 #if i386_TARGET_ARCH
 100
 101    = let -- Don't change this without first consulting Intel Corp :-)
 102          bytes_per_word = 4
 103
 104          offsets_to_pushW
 105             = concat
 106               [   -- reversed because x86 is little-endian
 107                   reverse [a_offW .. a_offW + cgRepSizeW a_rep - 1]
 108
 109                 -- reversed because args are pushed L -> R onto C stack
 110                 | (a_offW, a_rep) <- reverse arg_offs_n_reps
 111               ]
 112
 113          arguments_size = bytes_per_word * length offsets_to_pushW
 114 #if darwin_TARGET_OS
 115              -- Darwin: align stack frame size to a multiple of 16 bytes
 116          stack_frame_size = (arguments_size + 15) .&. complement 15
 117          stack_frame_pad = stack_frame_size - arguments_size
 118 #else
 119          stack_frame_size = arguments_size
 120 #endif
 121
 122          -- some helpers to assemble x86 insns.
 123          movl_offespmem_esi offB        -- movl   offB(%esp), %esi
 124             = [0x8B, 0xB4, 0x24] ++ lit32 offB
 125          movl_offesimem_ecx offB        -- movl   offB(%esi), %ecx
 126             = [0x8B, 0x8E] ++ lit32 offB
 127          save_regs                      -- pushl  all intregs except %esp
 128             = [0x50, 0x53, 0x51, 0x52, 0x56, 0x57, 0x55]
 129          restore_regs                   -- popl   ditto
 130             = [0x5D, 0x5F, 0x5E, 0x5A, 0x59, 0x5B, 0x58]
 131          pushl_ecx                      -- pushl  %ecx
 132             = [0x51]
 133          call_star_ecx                  -- call   * %ecx
 134             = [0xFF, 0xD1]
 135          add_lit_esp lit                -- addl   $lit, %esp
 136             = [0x81, 0xC4] ++ lit32 lit
 137          movl_eax_offesimem offB        -- movl   %eax, offB(%esi)
 138             = [0x89, 0x86] ++ lit32 offB
 139          movl_edx_offesimem offB        -- movl   %edx, offB(%esi)
 140             = [0x89, 0x96] ++ lit32 offB
 141          ret                            -- ret
 142             = [0xC3]
 143          fstpl_offesimem offB           -- fstpl   offB(%esi)
 144             = [0xDD, 0x9E] ++ lit32 offB
 145          fstps_offesimem offB           -- fstps   offB(%esi)
 146             = [0xD9, 0x9E] ++ lit32 offB
 147          {-
 148              2 0000 8BB42478    movl    0x12345678(%esp), %esi
 149              2      563412
 150              3 0007 8B8E7856    movl    0x12345678(%esi), %ecx
 151              3      3412
 152              4
 153              5 000d 50535152    pushl %eax ; pushl %ebx ; pushl %ecx ; pushl  %edx
 154              6 0011 565755      pushl %esi ; pushl %edi ; pushl %ebp
 155              7
 156              8 0014 5D5F5E      popl %ebp ; popl %edi ; popl %esi
 157              9 0017 5A595B58    popl %edx ; popl %ecx ; popl %ebx ; popl %eax
 158             10
 159             11 001b 51          pushl %ecx
 160             12 001c FFD1        call * %ecx
 161             13
 162             14 001e 81C47856    addl $0x12345678, %esp
 163             14      3412
 164             15 0024 89867856    movl %eax, 0x12345678(%esi)
 165             15      3412
 166             16 002a 89967856    movl %edx, 0x12345678(%esi)
 167             16      3412
 168             17
 169             18 0030 DD967856    fstl    0x12345678(%esi)
 170             18      3412
 171             19 0036 DD9E7856    fstpl   0x12345678(%esi)
 172             19      3412
 173             20 003c D9967856    fsts    0x12345678(%esi)
 174             20      3412
 175             21 0042 D99E7856    fstps   0x12345678(%esi)
 176             18
 177             19 0030 C3          ret
 178             20
 179
 180          -}
 181
 182      in
 183      --trace (show (map fst arg_offs_n_reps))
 184      (
 185      {- On entry, top of C stack 0(%esp) is the RA and 4(%esp) is
 186         arg passed from the interpreter.
 187
 188         Push all callee saved regs.  Push all of them anyway ...
 189            pushl       %eax
 190            pushl       %ebx
 191            pushl       %ecx
 192            pushl       %edx
 193            pushl       %esi
 194            pushl       %edi
 195            pushl       %ebp
 196      -}
 197      save_regs
 198
 199      {- Now 28+0(%esp) is RA and 28+4(%esp) is the arg (the H stack ptr).
 200         We'll use %esi as a temporary to point at the H stack, and
 201         %ecx as a temporary to copy via.
 202
 203            movl        28+4(%esp), %esi
 204      -}
 205      ++ movl_offespmem_esi 32
 206
 207 #if darwin_TARGET_OS
 208      {- On Darwin, add some padding so that the stack stays aligned. -}
 209      ++ (if stack_frame_pad /= 0
 210             then add_lit_esp (-stack_frame_pad)
 211             else [])
 212 #endif
 213
 214      {- For each arg in args_offs_n_reps, examine the associated
 215         CgRep to determine how many words there are.  This gives a
 216         bunch of offsets on the H stack to copy to the C stack:
 217
 218            movl        off1(%esi), %ecx
 219            pushl       %ecx
 220      -}
 221      ++ concatMap (\offW -> movl_offesimem_ecx (bytes_per_word * offW)
 222                             ++ pushl_ecx)
 223                   offsets_to_pushW
 224
 225      {- Get the addr to call into %ecx, bearing in mind that there's
 226         an Addr# tag at the indicated location, and do the call:
 227
 228            movl        4*(1 /*tag*/ +addr_offW)(%esi), %ecx
 229            call        * %ecx
 230      -}
 231      ++ movl_offesimem_ecx (bytes_per_word * addr_offW)
 232      ++ call_star_ecx
 233
 234      {- Nuke the args just pushed and re-establish %esi at the
 235         H-stack ptr:
 236
 237            addl        $4*number_of_args_pushed, %esp (ccall only)
 238            movl        28+4(%esp), %esi
 239      -}
 240      ++ (if   cconv /= StdCallConv
 241          then add_lit_esp stack_frame_size
 242          else [])
 243      ++ movl_offespmem_esi 32
 244
 245      {- Depending on what the return type is, get the result
 246         from %eax or %edx:%eax or %st(0).
 247
 248            movl        %eax, 4(%esi)        -- assuming tagged result
 249         or
 250            movl        %edx, 4(%esi)
 251            movl        %eax, 8(%esi)
 252         or
 253            fstpl       4(%esi)
 254         or
 255            fstps       4(%esi)
 256      -}
 257      ++ let i32 = movl_eax_offesimem 0
 258             i64 = movl_eax_offesimem 0 ++ movl_edx_offesimem 4
 259             f32 = fstps_offesimem 0
 260             f64 = fstpl_offesimem 0
 261         in
 262         case r_rep of
 263            NonPtrArg -> i32
 264            DoubleArg -> f64
 265            FloatArg  -> f32
 266            LongArg   -> i64
 267            VoidArg   -> []
 268            other     -> moan64 "ByteCodeFFI.mkMarshalCode_wrk(x86)"
 269                                (ppr r_rep)
 270
 271      {- Restore all the pushed regs and go home.
 272
 273            pushl        %ebp
 274            pushl        %edi
 275            pushl        %esi
 276            pushl        %edx
 277            pushl        %ecx
 278            pushl        %ebx
 279            pushl        %eax
 280
 281            ret
 282      -}
 283      ++ restore_regs
 284      ++ ret
 285      )
 286
 287 #elif x86_64_TARGET_ARCH
 288
 289    =
 290      -- the address of the H stack is in %rdi.  We need to move it out, so
 291      -- we can use %rdi as an arg reg for the following call:
 292     pushq_rbp ++
 293     movq_rdi_rbp ++
 294
 295      -- ####### load / push the args
 296
 297      let
 298         (stack_args, fregs_unused, reg_loads) =
 299            load_arg_regs arg_offs_n_reps int_loads float_loads []
 300
 301         tot_arg_size = bytes_per_word * length stack_args
 302
 303         -- On entry to the called function, %rsp should be aligned
 304         -- on a 16-byte boundary +8 (i.e. the first stack arg after
 305         -- the return address is 16-byte aligned).  In STG land
 306         -- %rsp is kept 16-byte aligned (see StgCRun.c), so we just
 307         -- need to make sure we push a multiple of 16-bytes of args,
 308         -- plus the return address, to get the correct alignment.
 309         (real_size, adjust_rsp)
 310           | tot_arg_size `rem` 16 == 0    = (tot_arg_size, [])
 311           | otherwise                     = (tot_arg_size + 8, subq_lit_rsp 8)
 312
 313         (stack_pushes, stack_words) =
 314                 push_args stack_args [] 0
 315
 316         -- we need to know the number of SSE regs used in the call, see later
 317         n_sse_regs_used = length float_loads - length fregs_unused
 318      in
 319         concat reg_loads
 320      ++ adjust_rsp
 321      ++ concat stack_pushes -- push in reverse order
 322
 323      -- ####### make the call
 324
 325         -- use %r10 to make the call, because we don't have to save it.
 326         --      movq 8*addr_offW(%rbp), %r10
 327      ++ movq_rbpoff_r10 (bytes_per_word * addr_offW)
 328
 329         -- The x86_64 ABI requires us to set %al to the number of SSE
 330         -- registers that contain arguments, if the called routine
 331         -- is a varargs function.  We don't know whether it's a
 332         -- varargs function or not, so we have to assume it is.
 333         --
 334         -- It's not safe to omit this assignment, even if the number
 335         -- of SSE regs in use is zero.  If %al is larger than 8
 336         -- on entry to a varargs function, seg faults ensue.
 337      ++ movq_lit_rax n_sse_regs_used
 338      ++ call_star_r10
 339
 340         -- pop the args from the stack, only in ccall mode
 341         -- (in stdcall the callee does it).
 342      ++ (if   cconv /= StdCallConv
 343          then addq_lit_rsp real_size
 344          else [])
 345
 346      -- ####### place the result in the right place and return
 347
 348      ++ assign_result
 349      ++ popq_rbp
 350      ++ ret
 351
 352   where
 353      bytes_per_word = 8
 354
 355      -- int arg regs: rdi,rsi,rdx,rcx,r8,r9
 356      -- flt arg regs: xmm0..xmm7
 357      int_loads   = [ movq_rbpoff_rdi, movq_rbpoff_rsi, movq_rbpoff_rdx,
 358                      movq_rbpoff_rcx, movq_rbpoff_r8,  movq_rbpoff_r9 ]
 359      float_loads = [ (mov_f32_rbpoff_xmm n, mov_f64_rbpoff_xmm n) | n <- [0..7] ]
 360
 361      load_arg_regs args [] [] code     =  (args, [], code)
 362      load_arg_regs [] iregs fregs code =  ([], fregs, code)
 363      load_arg_regs ((off,rep):args) iregs fregs code
 364         | FloatArg  <- rep, ((mov_f32,_):frest) <- fregs =
 365                 load_arg_regs args iregs frest (mov_f32 (bytes_per_word * off) : code)
 366         | DoubleArg <- rep, ((_,mov_f64):frest) <- fregs =
 367                 load_arg_regs args iregs frest (mov_f64 (bytes_per_word * off) : code)
 368         | (mov_reg:irest) <- iregs =
 369                 load_arg_regs args irest fregs (mov_reg (bytes_per_word * off) : code)
 370         | otherwise =
 371                  push_this_arg
 372         where
 373            push_this_arg = ((off,rep):args',fregs', code')
 374                 where (args',fregs',code') = load_arg_regs args iregs fregs code
 375
 376      push_args [] code pushed_words = (code, pushed_words)
 377      push_args ((off,rep):args) code pushed_words
 378         | FloatArg  <- rep =
 379                 push_args args (push_f32_rbpoff (bytes_per_word * off) : code)
 380                         (pushed_words+1)
 381         | DoubleArg <- rep =
 382                 push_args args (push_f64_rbpoff (bytes_per_word * off) : code)
 383                         (pushed_words+1)
 384         | otherwise =
 385                 push_args args (pushq_rbpoff (bytes_per_word * off) : code)
 386                         (pushed_words+1)
 387
 388
 389      assign_result =
 390         case r_rep of
 391           DoubleArg -> f64
 392           FloatArg  -> f32
 393           VoidArg   -> []
 394           _other    -> i64
 395         where
 396           i64 = movq_rax_rbpoff 0
 397           f32 = mov_f32_xmm0_rbpoff 0
 398           f64 = mov_f64_xmm0_rbpoff 0
 399
 400 -- ######### x86_64 machine code:
 401
 402 --   0:   48 89 fd                mov    %rdi,%rbp
 403 --   3:   48 8b bd 78 56 34 12    mov    0x12345678(%rbp),%rdi
 404 --   a:   48 8b b5 78 56 34 12    mov    0x12345678(%rbp),%rsi
 405 --  11:   48 8b 95 78 56 34 12    mov    0x12345678(%rbp),%rdx
 406 --  18:   48 8b 8d 78 56 34 12    mov    0x12345678(%rbp),%rcx
 407 --  1f:   4c 8b 85 78 56 34 12    mov    0x12345678(%rbp),%r8
 408 --  26:   4c 8b 8d 78 56 34 12    mov    0x12345678(%rbp),%r9
 409 --  2d:   4c 8b 95 78 56 34 12    mov    0x12345678(%rbp),%r10
 410 --  34:   48 c7 c0 78 56 34 12    mov    $0x12345678,%rax
 411 --  3b:   48 89 85 78 56 34 12    mov    %rax,0x12345678(%rbp)
 412 --  42:   f3 0f 10 85 78 56 34 12 movss  0x12345678(%rbp),%xmm0
 413 --  4a:   f2 0f 10 85 78 56 34 12 movsd  0x12345678(%rbp),%xmm0
 414 --  52:   f3 0f 11 85 78 56 34 12 movss  %xmm0,0x12345678(%rbp)
 415 --  5a:   f2 0f 11 85 78 56 34 12 movsd  %xmm0,0x12345678(%rbp)
 416 --  62:   ff b5 78 56 34 12       pushq  0x12345678(%rbp)
 417 --  68:   f3 44 0f 11 04 24       movss  %xmm8,(%rsp)
 418 --  6e:   f2 44 0f 11 04 24       movsd  %xmm8,(%rsp)
 419 --  74:   48 81 ec 78 56 34 12    sub    $0x12345678,%rsp
 420 --  7b:   48 81 c4 78 56 34 12    add    $0x12345678,%rsp
 421 --  82:   41 ff d2                callq  *%r10
 422 --  85:   c3                      retq
 423
 424      movq_rdi_rbp         = [0x48,0x89,0xfd]
 425      movq_rbpoff_rdi  off = [0x48, 0x8b, 0xbd] ++ lit32 off
 426      movq_rbpoff_rsi  off = [0x48, 0x8b, 0xb5] ++ lit32 off
 427      movq_rbpoff_rdx  off = [0x48, 0x8b, 0x95] ++ lit32 off
 428      movq_rbpoff_rcx  off = [0x48, 0x8b, 0x8d] ++ lit32 off
 429      movq_rbpoff_r8   off = [0x4c, 0x8b, 0x85] ++ lit32 off
 430      movq_rbpoff_r9   off = [0x4c, 0x8b, 0x8d] ++ lit32 off
 431      movq_rbpoff_r10  off = [0x4c, 0x8b, 0x95] ++ lit32 off
 432      movq_lit_rax     lit = [0x48, 0xc7, 0xc0] ++ lit32 lit
 433      movq_rax_rbpoff  off = [0x48, 0x89, 0x85] ++ lit32 off
 434      mov_f32_rbpoff_xmm n off = [0xf3, 0x0f, 0x10, 0x85 + n`shiftL`3] ++ lit32 off
 435      mov_f64_rbpoff_xmm n off = [0xf2, 0x0f, 0x10, 0x85 + n`shiftL`3] ++ lit32 off
 436      mov_f32_xmm0_rbpoff  off = [0xf3, 0x0f, 0x11, 0x85] ++ lit32 off
 437      mov_f64_xmm0_rbpoff  off = [0xf2, 0x0f, 0x11, 0x85] ++ lit32 off
 438      pushq_rbpoff     off = [0xff, 0xb5] ++ lit32 off
 439      push_f32_rbpoff  off =
 440         mov_f32_rbpoff_xmm 8 off ++              -- movss off(%rbp), %xmm8
 441         [0xf3, 0x44, 0x0f, 0x11, 0x04, 0x24] ++  -- movss %xmm8, (%rsp)
 442         subq_lit_rsp 8                           -- subq $8, %rsp
 443      push_f64_rbpoff  off =
 444         mov_f64_rbpoff_xmm 8 off ++              -- movsd off(%rbp), %xmm8
 445         [0xf2, 0x44, 0x0f, 0x11, 0x04, 0x24] ++  -- movsd %xmm8, (%rsp)
 446         subq_lit_rsp 8                           -- subq $8, %rsp
 447      subq_lit_rsp     lit = [0x48, 0x81, 0xec] ++ lit32 lit
 448      addq_lit_rsp     lit = [0x48, 0x81, 0xc4] ++ lit32 lit
 449      call_star_r10 = [0x41,0xff,0xd2]
 450      ret = [0xc3]
 451      pushq_rbp = [0x55]
 452      popq_rbp = [0x5d]
 453
 454 #elif sparc_TARGET_ARCH
 455
 456    = let -- At least for sparc V8
 457          bytes_per_word = 4
 458
 459          -- speaks for itself
 460          w32_to_w8s_bigEndian :: Word32 -> [Word8]
 461          w32_to_w8s_bigEndian w
 462             =  [fromIntegral (0xFF .&. (w `shiftR` 24)),
 463                 fromIntegral (0xFF .&. (w `shiftR` 16)),
 464                 fromIntegral (0xFF .&. (w `shiftR` 8)),
 465                 fromIntegral (0xFF .&. w)]
 466
 467          offsets_to_pushW
 468             = concat
 469               [  [a_offW .. a_offW + cgRepSizeW a_rep - 1]
 470
 471                 | (a_offW, a_rep) <- arg_offs_n_reps
 472               ]
 473
 474          total_argWs    = length offsets_to_pushW
 475          argWs_on_stack = if total_argWs > 6 then total_argWs - 6
 476                                              else 0
 477
 478          -- The stack pointer must be kept 8-byte aligned, which means
 479          -- we need to calculate this quantity too
 480          argWs_on_stack_ROUNDED_UP
 481             | odd argWs_on_stack = 1 + argWs_on_stack
 482             | otherwise          = argWs_on_stack
 483
 484          -- some helpers to assemble sparc insns.
 485          -- REGS
 486          iReg, oReg, gReg, fReg :: Int -> Word32
 487          iReg = fromIntegral . (+ 24)
 488          oReg = fromIntegral . (+ 8)
 489          gReg = fromIntegral . (+ 0)
 490          fReg = fromIntegral
 491
 492          sp = oReg 6
 493          i0 = iReg 0
 494          i7 = iReg 7
 495          o0 = oReg 0
 496          o1 = oReg 1
 497          o7 = oReg 7
 498          g0 = gReg 0
 499          g1 = gReg 1
 500          f0 = fReg 0
 501          f1 = fReg 1
 502
 503          -- INSN templates
 504          insn_r_r_i :: Word32 -> Word32 -> Word32 -> Int -> Word32
 505          insn_r_r_i op3 rs1 rd imm13
 506             = (3 `shiftL` 30)
 507               .|. (rs1 `shiftL` 25)
 508               .|. (op3 `shiftL` 19)
 509               .|. (rd `shiftL` 14)
 510               .|. (1 `shiftL` 13)
 511               .|. mkSimm13 imm13
 512
 513          insn_r_i_r :: Word32 -> Word32 -> Int -> Word32 -> Word32
 514          insn_r_i_r op3 rs1 imm13 rd
 515             = (2 `shiftL` 30)
 516               .|. (rd `shiftL` 25)
 517               .|. (op3 `shiftL` 19)
 518               .|. (rs1 `shiftL` 14)
 519               .|. (1 `shiftL` 13)
 520               .|. mkSimm13 imm13
 521
 522          mkSimm13 :: Int -> Word32
 523          mkSimm13 imm13
 524             = let imm13w = (fromIntegral imm13) :: Word32
 525               in  imm13w .&. 0x1FFF
 526
 527          -- REAL (non-synthetic) insns
 528          -- or %rs1, %rs2, %rd
 529          mkOR :: Word32 -> Word32 -> Word32 -> Word32
 530          mkOR rs1 rs2 rd
 531             = (2 `shiftL` 30)
 532               .|. (rd `shiftL` 25)
 533               .|. (op3_OR `shiftL` 19)
 534               .|. (rs1 `shiftL` 14)
 535               .|. (0 `shiftL` 13)
 536               .|. rs2
 537               where op3_OR = 2 :: Word32
 538
 539          -- ld(int)   [%rs + imm13], %rd
 540          mkLD rs1 imm13 rd = insn_r_r_i 0x00{-op3_LD-} rd rs1 imm13
 541
 542          -- st(int)   %rs, [%rd + imm13]
 543          mkST   = insn_r_r_i 0x04 -- op3_ST
 544
 545          -- st(float) %rs, [%rd + imm13]
 546          mkSTF  = insn_r_r_i 0x24 -- op3_STF
 547
 548          -- jmpl     %rs + imm13, %rd
 549          mkJMPL = insn_r_i_r 0x38 -- op3_JMPL
 550
 551          -- save     %rs + imm13, %rd
 552          mkSAVE = insn_r_i_r 0x3C -- op3_SAVE
 553
 554          -- restore  %rs + imm13, %rd
 555          mkRESTORE = insn_r_i_r 0x3D -- op3_RESTORE
 556
 557          -- SYNTHETIC insns
 558          mkNOP             = mkOR g0 g0 g0
 559          mkCALL reg        = mkJMPL reg 0 o7
 560          mkRET             = mkJMPL i7 8 g0
 561          mkRESTORE_TRIVIAL = mkRESTORE g0 0 g0
 562      in
 563      --trace (show (map fst arg_offs_n_reps))
 564      concatMap w32_to_w8s_bigEndian (
 565
 566      {- On entry, %o0 is the arg passed from the interpreter.  After
 567         the initial save insn, it will be in %i0.  Studying the sparc
 568         docs one would have thought that the minimum frame size is 92
 569         bytes, but gcc always uses at least 112, and indeed there are
 570         segfaults a-plenty with 92.  So I use 112 here as well.  I
 571         don't understand why, tho.
 572      -}
 573      [mkSAVE sp (- ({-92-}112 + 4*argWs_on_stack_ROUNDED_UP)) sp]
 574
 575      {- For each arg in args_offs_n_reps, examine the associated
 576         CgRep to determine how many words there are.  This gives a
 577         bunch of offsets on the H stack.  Move the first 6 words into
 578         %o0 .. %o5 and the rest on the stack, starting at [%sp+92].
 579         Use %g1 as a temp.
 580      -}
 581      ++ let doArgW (offW, wordNo)
 582               | wordNo < 6
 583               = [mkLD i0 (bytes_per_word * offW) (oReg wordNo)]
 584               | otherwise
 585               = [mkLD i0 (bytes_per_word * offW) g1,
 586                  mkST g1 sp (92 + bytes_per_word * (wordNo - 6))]
 587         in
 588             concatMap doArgW (zip offsets_to_pushW [0 ..])
 589
 590      {- Get the addr to call into %g1, bearing in mind that there's
 591         an Addr# tag at the indicated location, and do the call:
 592
 593            ld     [4*(1 /*tag*/ +addr_offW) + %i0], %g1
 594            call   %g1
 595      -}
 596      ++ [mkLD i0 (bytes_per_word * addr_offW) g1,
 597          mkCALL g1,
 598          mkNOP]
 599
 600      {- Depending on what the return type is, get the result
 601         from %o0 or %o1:%o0 or %f0 or %f1:%f0.
 602
 603            st          %o0, [%i0 + 4]        -- 32 bit int
 604         or
 605            st          %o0, [%i0 + 4]        -- 64 bit int
 606            st          %o1, [%i0 + 8]        -- or the other way round?
 607         or
 608            st          %f0, [%i0 + 4]        -- 32 bit float
 609         or
 610            st          %f0, [%i0 + 4]        -- 64 bit float
 611            st          %f1, [%i0 + 8]        -- or the other way round?
 612
 613      -}
 614      ++ let i32 = [mkST o0 i0 0]
 615             i64 = [mkST o0 i0 0, mkST o1 i0 4]
 616             f32 = [mkSTF f0 i0 0]
 617             f64 = [mkSTF f0 i0 0, mkSTF f1 i0 4]
 618         in
 619             case r_rep of
 620                NonPtrArg -> i32
 621                DoubleArg -> f64
 622                FloatArg  -> f32
 623                VoidArg   -> []
 624                other     -> moan64 "ByteCodeFFI.mkMarshalCode_wrk(sparc)"
 625                                    (ppr r_rep)
 626
 627      ++ [mkRET,
 628          mkRESTORE_TRIVIAL]  -- this is in the delay slot of the RET
 629      )
 630 #elif powerpc_TARGET_ARCH && darwin_TARGET_OS
 631
 632    = let
 633          bytes_per_word = 4
 634
 635          -- speaks for itself
 636          w32_to_w8s_bigEndian :: Word32 -> [Word8]
 637          w32_to_w8s_bigEndian w
 638             =  [fromIntegral (0xFF .&. (w `shiftR` 24)),
 639                 fromIntegral (0xFF .&. (w `shiftR` 16)),
 640                 fromIntegral (0xFF .&. (w `shiftR` 8)),
 641                 fromIntegral (0xFF .&. w)]
 642
 643          -- addr and result bits offsetsW
 644          a_off = addr_offW * bytes_per_word
 645          result_off  = r_offW * bytes_per_word
 646
 647          linkageArea = 24
 648          parameterArea = sum [ cgRepSizeW a_rep * bytes_per_word
 649                         | (_, a_rep) <- arg_offs_n_reps ]
 650          savedRegisterArea = 4
 651          frameSize = padTo16 (linkageArea + max parameterArea 32 + savedRegisterArea)
 652          padTo16 x = case x `mod` 16 of
 653             0 -> x
 654             y -> x - y + 16
 655
 656          pass_parameters [] _ _ = []
 657          pass_parameters ((a_offW, a_rep):args) nextFPR offsetW =
 658             let
 659                haskellArgOffset = a_offW * bytes_per_word
 660                offsetW' = offsetW + cgRepSizeW a_rep
 661
 662                pass_word w
 663                    | offsetW + w < 8 =
 664                       [0x801f0000    -- lwz rX, src(r31)
 665                         .|. (fromIntegral src .&. 0xFFFF)
 666                         .|. (fromIntegral (offsetW+w+3) `shiftL` 21)]
 667                    | otherwise =
 668                       [0x801f0000    -- lwz r0, src(r31)
 669                         .|. (fromIntegral src .&. 0xFFFF),
 670                        0x90010000    -- stw r0, dst(r1)
 671                         .|. (fromIntegral dst .&. 0xFFFF)]
 672                   where
 673                      src = haskellArgOffset + w*bytes_per_word
 674                      dst = linkageArea + (offsetW+w) * bytes_per_word
 675             in
 676                case a_rep of
 677                   FloatArg | nextFPR < 14 ->
 678                       (0xc01f0000    -- lfs fX, haskellArgOffset(r31)
 679                         .|. (fromIntegral haskellArgOffset .&. 0xFFFF)
 680                         .|. (fromIntegral nextFPR `shiftL` 21))
 681                       : pass_parameters args (nextFPR+1) offsetW'
 682                   DoubleArg | nextFPR < 14 ->
 683                       (0xc81f0000    -- lfd fX, haskellArgOffset(r31)
 684                         .|. (fromIntegral haskellArgOffset .&. 0xFFFF)
 685                         .|. (fromIntegral nextFPR `shiftL` 21))
 686                       : pass_parameters args (nextFPR+1) offsetW'
 687                   _ ->
 688                       concatMap pass_word [0 .. cgRepSizeW a_rep - 1]
 689                       ++ pass_parameters args nextFPR offsetW'
 690
 691          gather_result = case r_rep of
 692             VoidArg -> []
 693             FloatArg ->
 694                [0xd03f0000 .|. (fromIntegral result_off .&. 0xFFFF)]
 695                -- stfs f1, result_off(r31)
 696             DoubleArg ->
 697                [0xd83f0000 .|. (fromIntegral result_off .&. 0xFFFF)]
 698                -- stfd f1, result_off(r31)
 699             _ | cgRepSizeW r_rep == 2 ->
 700                [0x907f0000 .|. (fromIntegral result_off .&. 0xFFFF),
 701                 0x909f0000 .|. (fromIntegral (result_off+4) .&. 0xFFFF)]
 702                -- stw r3, result_off(r31)
 703                -- stw r4, result_off+4(r31)
 704             _ | cgRepSizeW r_rep == 1 ->
 705                [0x907f0000 .|. (fromIntegral result_off .&. 0xFFFF)]
 706                -- stw r3, result_off(r31)
 707      in
 708          concatMap w32_to_w8s_bigEndian $ [
 709             0x7c0802a6,         -- mflr r0
 710             0x93e1fffc,         -- stw r31,-4(r1)
 711             0x90010008,         -- stw r0,8(r1)
 712             0x94210000 .|. (fromIntegral (-frameSize) .&. 0xFFFF),
 713                                 -- stwu r1, -frameSize(r1)
 714             0x7c7f1b78          -- mr r31, r3
 715          ] ++ pass_parameters arg_offs_n_reps 1 0 ++ [
 716             0x819f0000 .|. (fromIntegral a_off .&. 0xFFFF),
 717                                 -- lwz r12, a_off(r31)
 718             0x7d8903a6,         -- mtctr r12
 719             0x4e800421          -- bctrl
 720          ] ++ gather_result ++ [
 721             0x80210000,         -- lwz r1, 0(r1)
 722             0x83e1fffc,         -- lwz r31, -4(r1)
 723             0x80010008,         -- lwz r0, 8(r1)
 724             0x7c0803a6,         -- mtlr r0
 725             0x4e800020          -- blr
 726          ]
 727
 728 #elif powerpc_TARGET_ARCH && linux_TARGET_OS
 729
 730    -- All offsets here are measured in Words (not bytes).  This includes
 731    -- arguments to the load/store machine code generators, alignment numbers
 732    -- and the final 'framesize' among others.
 733
 734    = concatMap w32_to_w8s_bigEndian $ [
 735             0x7c0802a6,                         -- mflr r0
 736             0x93e1fffc,                         -- stw r31,-4(r1)
 737             0x90010008,                         -- stw r0,8(r1)
 738             0x94210000 .|. offset (-framesize), -- stwu r1, -frameSize(r1)
 739             0x7c7f1b78                          -- mr r31, r3
 740             ] ++ pass_parameters ++             -- pass the parameters
 741             loadWord 12 addr_offW ++ [          -- lwz r12, a_off(r31)
 742             0x7d8903a6,                         -- mtctr r12
 743             0x4e800421                          -- bctrl
 744             ] ++ gather_result ++ [             -- save the return value
 745             0x80210000,                         -- lwz r1, 0(r1)
 746             0x83e1fffc,                         -- lwz r31, -4(r1)
 747             0x80010008,                         -- lwz r0, 8(r1)
 748             0x7c0803a6,                         -- mtlr r0
 749             0x4e800020                          -- blr
 750          ]
 751
 752    where
 753      gather_result :: [Word32]
 754      gather_result = case r_rep of
 755        VoidArg   -> []
 756        FloatArg  -> storeFloat  1 r_offW
 757        DoubleArg -> storeDouble 1 r_offW
 758        LongArg   -> storeLong   3 r_offW
 759        _         -> storeWord   3 r_offW
 760
 761      pass_parameters :: [Word32]
 762      pass_parameters = concat params
 763
 764      -- vector aligned (4 word = 16 bytes) with 8 extra words of buffer space
 765      framesize = alignedTo 4 (argsize + 8)
 766
 767      ((_,_,argsize), params) = mapAccumL loadparam (3,1,2) arg_offs_n_reps
 768
 769      -- handle one argument, returning machine code and the updated state
 770      loadparam :: (Int, Int, Int) -> (Int, CgRep) ->
 771                   ((Int, Int, Int), [Word32])
 772
 773      loadparam (gpr, fpr, stack) (ofs, rep) = case rep of
 774        FloatArg | fpr <= 8  -> ( (gpr, fpr + 1, stack),  loadFloat fpr ofs )
 775        FloatArg             -> ( (gpr, fpr, stack + 1),  stackWord stack ofs )
 776
 777        DoubleArg | fpr <= 8 -> ( (gpr, fpr + 1, stack),  loadDouble fpr ofs )
 778        DoubleArg            -> ( (gpr, fpr, astack + 2), stackLong astack ofs )
 779
 780        LongArg | even gpr   -> loadparam (gpr + 1, fpr, stack) (ofs, rep)
 781        LongArg | gpr <= 9   -> ( (gpr + 2, fpr, stack),  loadLong gpr ofs )
 782        LongArg              -> ( (gpr, fpr, astack + 2), stackLong astack ofs )
 783
 784        _ | gpr <= 10        -> ( (gpr + 1, fpr, stack),  loadWord gpr ofs )
 785        _                    -> ( (gpr, fpr, stack + 1),  stackWord stack ofs )
 786       where astack = alignedTo 2 stack
 787
 788      alignedTo :: Int -> Int -> Int
 789      alignedTo alignment x = case x `mod` alignment of
 790                                0 -> x
 791                                y -> x - y + alignment
 792
 793      -- convenience macros to do multiple-instruction data moves
 794      stackWord dst src = loadWord 0 src ++ storeWordC 0 dst
 795      stackLong dst src = stackWord dst src ++ stackWord (dst + 1) (src + 1)
 796      loadLong  dst src = loadWord dst src ++ loadWord (dst + 1) (src + 1)
 797      storeLong dst src = storeWord dst src ++ storeWord (dst + 1) (src + 1)
 798
 799      -- load data from the Haskell stack (relative to r31)
 800      loadFloat   = loadstoreInstr 0xc01f0000 -- lfs fpr, ofs(r31)
 801      loadDouble  = loadstoreInstr 0xc81f0000 -- lfd fpr, ofs(r31)
 802      loadWord    = loadstoreInstr 0x801f0000 -- lwz gpr, ofs(r31)
 803
 804      -- store data to the Haskell stack (relative to r31)
 805      storeFloat  = loadstoreInstr 0xd01f0000 -- stfs fpr, ofs(r31)
 806      storeDouble = loadstoreInstr 0xd81f0000 -- stfd fpr, ofs(r31)
 807      storeWord   = loadstoreInstr 0x901f0000 -- stw gpr, ofs(r31)
 808
 809      -- store data to the C stack (relative to r1)
 810      storeWordC  = loadstoreInstr 0x90010000 -- stw gpr, ofs(r1)
 811
 812      -- machine code building blocks
 813      loadstoreInstr :: Word32 -> Int -> Int -> [Word32]
 814      loadstoreInstr code reg ofs = [ code .|. register reg .|. offset ofs ]
 815
 816      register :: Int -> Word32
 817      register reg = fromIntegral reg `shiftL` 21
 818
 819      offset :: Int -> Word32
 820      offset ofs   = fromIntegral (ofs * 4) .&. 0xFFFF
 821
 822      -- speaks for itself
 823      w32_to_w8s_bigEndian :: Word32 -> [Word8]
 824      w32_to_w8s_bigEndian w =  [fromIntegral (0xFF .&. (w `shiftR` 24)),
 825                                 fromIntegral (0xFF .&. (w `shiftR` 16)),
 826                                 fromIntegral (0xFF .&. (w `shiftR` 8)),
 827                                 fromIntegral (0xFF .&. w)]
 828
 829 #else
 830
 831    = throwDyn (InstallationError "foreign import is not implemented for GHCi on this platform.")
 832
 833 #endif
 834
 835 #if i386_TARGET_ARCH || x86_64_TARGET_ARCH
 836 lit32 :: Int -> [Word8]
 837 lit32 i = let w32 = (fromIntegral i) :: Word32
 838           in  map (fromIntegral . ( .&. 0xFF))
 839                   [w32, w32 `shiftR` 8,
 840                    w32 `shiftR` 16,  w32 `shiftR` 24]
 841 #endif
 842 \end{code}
 843