First incomplete version of closure vectorisation

[ghc-hetmet.git] / compiler / nativeGen / MachCodeGen.hs
diff --git a/compiler/nativeGen/MachCodeGen.hs b/compiler/nativeGen/MachCodeGen.hs

index 17ee624..2c07016 100644 (file)
--- a/compiler/nativeGen/MachCodeGen.hs
+++ b/compiler/nativeGen/MachCodeGen.hs
@@ -29,6 +29,7 @@ import PprCmm         ( pprExpr )
  import Cmm
  import MachOp
  import CLabel
+import ClosureInfo     ( C_SRT(..) )
  
  -- The rest:
  import StaticFlags     ( opt_PIC )
@@ -41,7 +42,6 @@ import FastTypes      ( isFastTrue )
  import Constants       ( wORD_SIZE )
  
  #ifdef DEBUG
-import Outputable      ( assertPanic )
  import Debug.Trace     ( trace )
  #endif
  
@@ -49,6 +49,7 @@ import Control.Monad  ( mapAndUnzipM )
  import Data.Maybe      ( fromJust )
  import Data.Bits
  import Data.Word
+import Data.Int
  
  -- -----------------------------------------------------------------------------
  -- Top-level of the instruction selector
@@ -60,7 +61,7 @@ import Data.Word
  
  type InstrBlock = OrdList Instr
  
-cmmTopCodeGen :: CmmTop -> NatM [NatCmmTop]
+cmmTopCodeGen :: RawCmmTop -> NatM [NatCmmTop]
  cmmTopCodeGen (CmmProc info lab params blocks) = do
    (nat_blocks,statics) <- mapAndUnzipM basicBlockCodeGen blocks
    picBaseMb <- getPicBaseMaybeNat
@@ -118,8 +119,8 @@ stmtToInstrs stmt = case stmt of
        | otherwise       -> assignMem_IntCode kind addr src
         where kind = cmmExprRep src
  
-    CmmCall target result_regs args vols
-       -> genCCall target result_regs args vols
+    CmmCall target result_regs args _
+       -> genCCall target result_regs args
  
      CmmBranch id         -> genBranch id
      CmmCondBranch arg id  -> genCondJump id arg
@@ -187,7 +188,7 @@ assignMem_I64Code addrTree valueTree = do
    return (vcode `appOL` addr_code `snocOL` mov_lo `snocOL` mov_hi)
  
  
-assignReg_I64Code (CmmLocal (LocalReg u_dst pk)) valueTree = do
+assignReg_I64Code (CmmLocal (LocalReg u_dst pk _)) valueTree = do
     ChildCode64 vcode r_src_lo <- iselExpr64 valueTree
     let 
           r_dst_lo = mkVReg u_dst I32
@@ -229,7 +230,7 @@ iselExpr64 (CmmLoad addrTree I64) = do
                          rlo
       )
  
-iselExpr64 (CmmReg (CmmLocal (LocalReg vu I64)))
+iselExpr64 (CmmReg (CmmLocal (LocalReg vu I64 _)))
     = return (ChildCode64 nilOL (mkVReg vu I32))
           
  -- we handle addition, but rather badly
@@ -264,6 +265,17 @@ iselExpr64 (CmmMachOp (MO_Add _) [e1,e2]) = do
     -- in
     return (ChildCode64 code rlo)
  
+iselExpr64 (CmmMachOp (MO_U_Conv _ I64) [expr]) = do
+     fn <- getAnyReg expr
+     r_dst_lo <-  getNewRegNat I32
+     let r_dst_hi = getHiVRegFromLo r_dst_lo
+         code = fn r_dst_lo
+     return (
+             ChildCode64 (code `snocOL` 
+                          MOV I32 (OpImm (ImmInt 0)) (OpReg r_dst_hi))
+                          r_dst_lo
+            )
+
  iselExpr64 expr
     = pprPanic "iselExpr64(i386)" (ppr expr)
  
@@ -357,7 +369,7 @@ assignMem_I64Code addrTree valueTree = do
         -- in
         return (vcode `appOL` addr_code `snocOL` mov_lo `snocOL` mov_hi)
  
-assignReg_I64Code (CmmLocal (LocalReg u_dst pk)) valueTree = do
+assignReg_I64Code (CmmLocal (LocalReg u_dst pk _)) valueTree = do
     ChildCode64 vcode r_src_lo <- iselExpr64 valueTree
     let 
           r_dst_lo = mkVReg u_dst I32
@@ -387,7 +399,7 @@ iselExpr64 (CmmLoad addrTree I64) = do
      return $ ChildCode64 (addr_code `snocOL` mov_lo `snocOL` mov_hi) 
                           rlo
  
-iselExpr64 (CmmReg (CmmLocal (LocalReg vu I64)))
+iselExpr64 (CmmReg (CmmLocal (LocalReg vu I64 _)))
     = return (ChildCode64 nilOL (mkVReg vu I32))
  
  iselExpr64 (CmmLit (CmmInt i _)) = do
@@ -421,6 +433,13 @@ iselExpr64 (CmmMachOp (MO_Add _) [e1,e2]) = do
     -- in
     return (ChildCode64 code rlo)
  
+iselExpr64 (CmmMachOp (MO_U_Conv I32 I64) [expr]) = do
+    (expr_reg,expr_code) <- getSomeReg expr
+    (rlo, rhi) <- getNewRegPairNat I32
+    let mov_hi = LI rhi (ImmInt 0)
+        mov_lo = MR rlo expr_reg
+    return $ ChildCode64 (expr_code `snocOL` mov_lo `snocOL` mov_hi)
+                         rlo
  iselExpr64 expr
     = pprPanic "iselExpr64(powerpc)" (ppr expr)
  
@@ -464,7 +483,7 @@ getSomeReg expr = do
  
  getRegisterReg :: CmmReg -> Reg
  
-getRegisterReg (CmmLocal (LocalReg u pk))
+getRegisterReg (CmmLocal (LocalReg u pk _))
    = mkVReg u pk
  
  getRegisterReg (CmmGlobal mid)
@@ -1147,16 +1166,30 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps
           -- in
           return (Any rep code)
          
-    {- Case2: shift length is complex (non-immediate) -}
+    {- Case2: shift length is complex (non-immediate)
+      * y must go in %ecx.
+      * we cannot do y first *and* put its result in %ecx, because
+        %ecx might be clobbered by x.
+      * if we do y second, then x cannot be 
+        in a clobbered reg.  Also, we cannot clobber x's reg
+        with the instruction itself.
+      * so we can either:
+        - do y first, put its result in a fresh tmp, then copy it to %ecx later
+        - do y second and put its result into %ecx.  x gets placed in a fresh
+          tmp.  This is likely to be better, becuase the reg alloc can
+          eliminate this reg->reg move here (it won't eliminate the other one,
+          because the move is into the fixed %ecx).
+    -}
      shift_code rep instr x y{-amount-} = do
-        (x_reg, x_code) <- getNonClobberedReg x
+        x_code <- getAnyReg x
+       tmp <- getNewRegNat rep
          y_code <- getAnyReg y
         let 
-          code = x_code `appOL`
+          code = x_code tmp `appOL`
                   y_code ecx `snocOL`
-                 instr (OpReg ecx) (OpReg x_reg)
+                 instr (OpReg ecx) (OpReg tmp)
          -- in
-        return (Fixed rep x_reg code)
+        return (Fixed rep tmp code)
  
      --------------------
      add_code :: MachRep -> CmmExpr -> CmmExpr -> NatM Register
@@ -1828,15 +1861,18 @@ getAmode (CmmMachOp (MO_Add rep) [a@(CmmMachOp (MO_Shl _) _),
  getAmode (CmmMachOp (MO_Add rep) [x, CmmMachOp (MO_Shl _) 
                                         [y, CmmLit (CmmInt shift _)]])
    | shift == 0 || shift == 1 || shift == 2 || shift == 3
-  = do (x_reg, x_code) <- getNonClobberedReg x
-       -- x must be in a temp, because it has to stay live over y_code
-       -- we could compre x_reg and y_reg and do something better here...
-       (y_reg, y_code) <- getSomeReg y
-       let
-          code = x_code `appOL` y_code
-           base = case shift of 0 -> 1; 1 -> 2; 2 -> 4; 3 -> 8
-       return (Amode (AddrBaseIndex (EABaseReg x_reg) (EAIndex y_reg base) (ImmInt 0))
-               code)
+  = x86_complex_amode x y shift 0
+
+getAmode (CmmMachOp (MO_Add rep) 
+                [x, CmmMachOp (MO_Add _)
+                        [CmmMachOp (MO_Shl _) [y, CmmLit (CmmInt shift _)],
+                         CmmLit (CmmInt offset _)]])
+  | shift == 0 || shift == 1 || shift == 2 || shift == 3
+  && not (is64BitInteger offset)
+  = x86_complex_amode x y shift offset
+
+getAmode (CmmMachOp (MO_Add rep) [x,y])
+  = x86_complex_amode x y 0 0
  
  getAmode (CmmLit lit) | not (is64BitLit lit)
    = return (Amode (ImmAddr (litToImm lit) 0) nilOL)
@@ -1845,6 +1881,19 @@ getAmode expr = do
    (reg,code) <- getSomeReg expr
    return (Amode (AddrBaseIndex (EABaseReg reg) EAIndexNone (ImmInt 0)) code)
  
+
+x86_complex_amode :: CmmExpr -> CmmExpr -> Integer -> Integer -> NatM Amode
+x86_complex_amode base index shift offset
+  = do (x_reg, x_code) <- getNonClobberedReg base
+       -- x must be in a temp, because it has to stay live over y_code
+       -- we could compre x_reg and y_reg and do something better here...
+       (y_reg, y_code) <- getSomeReg index
+       let
+          code = x_code `appOL` y_code
+           base = case shift of 0 -> 1; 1 -> 2; 2 -> 4; 3 -> 8
+       return (Amode (AddrBaseIndex (EABaseReg x_reg) (EAIndex y_reg base) (ImmInt (fromIntegral offset)))
+               code)
+
  #endif /* i386_TARGET_ARCH || x86_64_TARGET_ARCH */
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@@ -2037,7 +2086,12 @@ is64BitLit x = False
  #endif
  
  is64BitInteger :: Integer -> Bool
-is64BitInteger i = i > 0x7fffffff || i < -0x80000000
+is64BitInteger i = i64 > 0x7fffffff || i64 < -0x80000000
+  where i64 = fromIntegral i :: Int64
+  -- a CmmInt is intended to be truncated to the appropriate 
+  -- number of bits, so here we truncate it to Int64.  This is
+  -- important because e.g. -1 as a CmmInt might be either
+  -- -1 or 18446744073709551615.
  
  -- -----------------------------------------------------------------------------
  --  The 'CondCode' type:  Condition codes passed up the tree.
@@ -2359,6 +2413,25 @@ assignIntCode pk dst src
  #if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  
  -- integer assignment to memory
+
+-- specific case of adding/subtracting an integer to a particular address.
+-- ToDo: catch other cases where we can use an operation directly on a memory 
+-- address.
+assignMem_IntCode pk addr (CmmMachOp op [CmmLoad addr2 _,
+                                                 CmmLit (CmmInt i _)])
+   | addr == addr2, pk /= I64 || not (is64BitInteger i),
+     Just instr <- check op
+   = do Amode amode code_addr <- getAmode addr
+        let code = code_addr `snocOL`
+                   instr pk (OpImm (ImmInt (fromIntegral i))) (OpAddr amode)
+        return code
+   where
+        check (MO_Add _) = Just ADD
+        check (MO_Sub _) = Just SUB
+        check _ = Nothing
+        -- ToDo: more?
+
+-- general case
  assignMem_IntCode pk addr src = do
      Amode addr code_addr <- getAmode addr
      (code_src, op_src)   <- get_op_RI src
@@ -2872,9 +2945,8 @@ genCondJump id bool = do
  
  genCCall
      :: CmmCallTarget           -- function to call
-    -> [(CmmReg,MachHint)]     -- where to put the result
-    -> [(CmmExpr,MachHint)]    -- arguments (of mixed type)
-    -> Maybe [GlobalReg]       -- volatile regs to save
+    -> CmmHintFormals          -- where to put the result
+    -> CmmActuals              -- arguments (of mixed type)
      -> NatM InstrBlock
  
  -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@@ -2953,12 +3025,12 @@ genCCall fn cconv result_regs args
  
  #if i386_TARGET_ARCH
  
-genCCall (CmmPrim MO_WriteBarrier) _ _ _ = return nilOL
+genCCall (CmmPrim MO_WriteBarrier) _ _ = return nilOL
         -- write barrier compiles to no code on x86/x86-64; 
         -- we keep it this long in order to prevent earlier optimisations.
  
  -- we only cope with a single result for foreign calls
-genCCall (CmmPrim op) [(r,_)] args vols = do
+genCCall (CmmPrim op) [(r,_)] args = do
    case op of
         MO_F32_Sqrt -> actuallyInlineFloatOp F32  (GSQRT F32) args
         MO_F64_Sqrt -> actuallyInlineFloatOp F64 (GSQRT F64) args
@@ -2972,14 +3044,14 @@ genCCall (CmmPrim op) [(r,_)] args vols = do
         MO_F32_Tan  -> actuallyInlineFloatOp F32  (GTAN F32) args
         MO_F64_Tan  -> actuallyInlineFloatOp F64 (GTAN F64) args
         
-       other_op    -> outOfLineFloatOp op r args vols
+       other_op    -> outOfLineFloatOp op r args
   where
    actuallyInlineFloatOp rep instr [(x,_)]
         = do res <- trivialUFCode rep instr x
              any <- anyReg res
-            return (any (getRegisterReg r))
+            return (any (getRegisterReg (CmmLocal r)))
  
-genCCall target dest_regs args vols = do
+genCCall target dest_regs args = do
      let
          sizes               = map (arg_size . cmmExprRep . fst) (reverse args)
  #if !darwin_TARGET_OS        
@@ -3042,8 +3114,8 @@ genCCall target dest_regs args vols = do
                 rep -> unitOL (MOV rep (OpReg eax) (OpReg r_dest))
           where 
                 r_dest_hi = getHiVRegFromLo r_dest
-               rep = cmmRegRep dest
-               r_dest = getRegisterReg dest
+               rep = localRegRep dest
+               r_dest = getRegisterReg (CmmLocal dest)
         assign_code many = panic "genCCall.assign_code many"
  
      return (push_code `appOL` 
@@ -3107,23 +3179,23 @@ genCCall target dest_regs args vols = do
  
  #if i386_TARGET_ARCH || x86_64_TARGET_ARCH
  
-outOfLineFloatOp :: CallishMachOp -> CmmReg -> [(CmmExpr,MachHint)]
-  -> Maybe [GlobalReg] -> NatM InstrBlock
-outOfLineFloatOp mop res args vols
+outOfLineFloatOp :: CallishMachOp -> CmmFormal -> CmmActuals
+  -> NatM InstrBlock
+outOfLineFloatOp mop res args
    = do
        targetExpr <- cmmMakeDynamicReference addImportNat CallReference lbl
        let target = CmmForeignCall targetExpr CCallConv
          
-      if cmmRegRep res == F64
+      if localRegRep res == F64
          then
-          stmtToInstrs (CmmCall target [(res,FloatHint)] args vols)  
+          stmtToInstrs (CmmCall target [(res,FloatHint)] args CmmUnsafe)
          else do
            uq <- getUniqueNat
            let 
-            tmp = CmmLocal (LocalReg uq F64)
+            tmp = LocalReg uq F64 KindNonPtr
            -- in
-          code1 <- stmtToInstrs (CmmCall target [(tmp,FloatHint)] args vols)
-          code2 <- stmtToInstrs (CmmAssign res (CmmReg tmp))
+          code1 <- stmtToInstrs (CmmCall target [(tmp,FloatHint)] args CmmUnsafe)
+          code2 <- stmtToInstrs (CmmAssign (CmmLocal res) (CmmReg (CmmLocal tmp)))
            return (code1 `appOL` code2)
    where
         lbl = mkForeignLabel fn Nothing False
@@ -3167,14 +3239,14 @@ outOfLineFloatOp mop res args vols
  
  #if x86_64_TARGET_ARCH
  
-genCCall (CmmPrim MO_WriteBarrier) _ _ _ = return nilOL
+genCCall (CmmPrim MO_WriteBarrier) _ _ = return nilOL
         -- write barrier compiles to no code on x86/x86-64; 
         -- we keep it this long in order to prevent earlier optimisations.
  
-genCCall (CmmPrim op) [(r,_)] args vols = 
-  outOfLineFloatOp op r args vols
+genCCall (CmmPrim op) [(r,_)] args = 
+  outOfLineFloatOp op r args
  
-genCCall target dest_regs args vols = do
+genCCall target dest_regs args = do
  
         -- load up the register arguments
      (stack_args, aregs, fregs, load_args_code)
@@ -3257,8 +3329,8 @@ genCCall target dest_regs args vols = do
                 F64 -> unitOL (MOV rep (OpReg xmm0) (OpReg r_dest))
                 rep -> unitOL (MOV rep (OpReg rax) (OpReg r_dest))
           where 
-               rep = cmmRegRep dest
-               r_dest = getRegisterReg dest
+               rep = localRegRep dest
+               r_dest = getRegisterReg (CmmLocal dest)
         assign_code many = panic "genCCall.assign_code many"
  
      return (load_args_code     `appOL` 
@@ -3306,10 +3378,10 @@ genCCall target dest_regs args vols = do
          (arg_reg, arg_code) <- getSomeReg arg
           delta <- getDeltaNat
           setDeltaNat (delta-arg_size)
-        let code' = code `appOL` toOL [
-                       MOV arg_rep (OpReg arg_reg) (OpAddr  (spRel 0)),
+        let code' = code `appOL` arg_code `appOL` toOL [
                         SUB wordRep (OpImm (ImmInt arg_size)) (OpReg rsp) ,
-                       DELTA (delta-arg_size)]
+                       DELTA (delta-arg_size),
+                       MOV arg_rep (OpReg arg_reg) (OpAddr  (spRel 0))]
          push_args rest code'
  
         | otherwise = do
@@ -3360,7 +3432,7 @@ genCCall target dest_regs args vols = do
     stack only immediately prior to the call proper.  Sigh.
  -}
  
-genCCall target dest_regs argsAndHints vols = do
+genCCall target dest_regs argsAndHints = do
      let
          args = map fst argsAndHints
      argcode_and_vregs <- mapM arg_to_int_vregs args
@@ -3553,10 +3625,10 @@ outOfLineFloatOp mop =
  -}
  
  
-genCCall (CmmPrim MO_WriteBarrier) _ _ _
+genCCall (CmmPrim MO_WriteBarrier) _ _ 
   = return $ unitOL LWSYNC
  
-genCCall target dest_regs argsAndHints vols
+genCCall target dest_regs argsAndHints
    = ASSERT (not $ any (`elem` [I8,I16]) argReps)
          -- we rely on argument promotion in the codeGen
      do
@@ -3717,8 +3789,8 @@ genCCall target dest_regs argsAndHints vols
                      | rep == I64 -> toOL [MR (getHiVRegFromLo r_dest) r3,
                                            MR r_dest r4]
                      | otherwise -> unitOL (MR r_dest r3)
-                    where rep = cmmRegRep dest
-                          r_dest = getRegisterReg dest
+                    where rep = cmmRegRep (CmmLocal dest)
+                          r_dest = getRegisterReg (CmmLocal dest)
                            
          outOfLineFloatOp mop =
              do
@@ -4522,6 +4594,8 @@ remainderCode rep div x y = do
  -- -----------------------------------------------------------------------------
  --  Coercing to/from integer/floating-point...
  
+-- When going to integer, we truncate (round towards 0).
+
  -- @coerce(Int2FP|FP2Int)@ are more complicated integer/float
  -- conversions.  We have to store temporaries in memory to move
  -- between the integer and the floating point register sets.
@@ -4607,7 +4681,7 @@ coerceFP2Int from to x = do
  coerceFP2Int from to x = do
    (x_op, x_code) <- getOperand x  -- ToDo: could be a safe operand
    let
-        opc  = case from of F32 -> CVTSS2SI; F64 -> CVTSD2SI
+        opc  = case from of F32 -> CVTTSS2SIQ; F64 -> CVTTSD2SIQ
          code dst = x_code `snocOL` opc x_op dst
    -- in
    return (Any to code) -- works even if the destination rep is <I32