compiler/nativeGen/X86/CodeGen.hs

   1 {-# OPTIONS -w #-}
   2 -- The above warning supression flag is a temporary kludge.
   3 -- While working on this module you are encouraged to remove it and fix
   4 -- any warnings in the module. See
   5 --     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
   6 -- for details
   7
   8 -----------------------------------------------------------------------------
   9 --
  10 -- Generating machine code (instruction selection)
  11 --
  12 -- (c) The University of Glasgow 1996-2004
  13 --
  14 -----------------------------------------------------------------------------
  15
  16 -- This is a big module, but, if you pay attention to
  17 -- (a) the sectioning, (b) the type signatures, and
  18 -- (c) the #if blah_TARGET_ARCH} things, the
  19 -- structure should not be too overwhelming.
  20
  21 module X86.CodeGen (
  22         cmmTopCodeGen,
  23         InstrBlock
  24 )
  25
  26 where
  27
  28 #include "HsVersions.h"
  29 #include "nativeGen/NCG.h"
  30 #include "../includes/MachDeps.h"
  31
  32 -- NCG stuff:
  33 import X86.Instr
  34 import X86.Cond
  35 import X86.Regs
  36 import X86.RegInfo
  37 import X86.Ppr
  38 import Instruction
  39 import PIC
  40 import NCGMonad
  41 import Size
  42 import Reg
  43 import RegClass
  44 import Platform
  45
  46 -- Our intermediate code:
  47 import BasicTypes
  48 import BlockId
  49 import PprCmm           ( pprExpr )
  50 import OldCmm
  51 import OldPprCmm
  52 import CLabel
  53 import ClosureInfo      ( C_SRT(..) )
  54
  55 -- The rest:
  56 import StaticFlags      ( opt_PIC )
  57 import ForeignCall      ( CCallConv(..) )
  58 import OrdList
  59 import Pretty
  60 import qualified Outputable as O
  61 import Outputable
  62 import Unique
  63 import FastString
  64 import FastBool         ( isFastTrue )
  65 import Constants        ( wORD_SIZE )
  66 import DynFlags
  67
  68 import Debug.Trace      ( trace )
  69
  70 import Control.Monad    ( mapAndUnzipM )
  71 import Data.Maybe       ( fromJust )
  72 import Data.Bits
  73 import Data.Word
  74 import Data.Int
  75
  76 sse2Enabled :: NatM Bool
  77 #if x86_64_TARGET_ARCH
  78 -- SSE2 is fixed on for x86_64.  It would be possible to make it optional,
  79 -- but we'd need to fix at least the foreign call code where the calling
  80 -- convention specifies the use of xmm regs, and possibly other places.
  81 sse2Enabled = return True
  82 #else
  83 sse2Enabled = do
  84   dflags <- getDynFlagsNat
  85   return (dopt Opt_SSE2 dflags)
  86 #endif
  87
  88 if_sse2 :: NatM a -> NatM a -> NatM a
  89 if_sse2 sse2 x87 = do
  90   b <- sse2Enabled
  91   if b then sse2 else x87
  92
  93 cmmTopCodeGen
  94         :: DynFlags
  95         -> RawCmmTop
  96         -> NatM [NatCmmTop Instr]
  97
  98 cmmTopCodeGen dynflags (CmmProc info lab (ListGraph blocks)) = do
  99   (nat_blocks,statics) <- mapAndUnzipM basicBlockCodeGen blocks
 100   picBaseMb <- getPicBaseMaybeNat
 101   let proc = CmmProc info lab (ListGraph $ concat nat_blocks)
 102       tops = proc : concat statics
 103       os   = platformOS $ targetPlatform dynflags
 104
 105   case picBaseMb of
 106       Just picBase -> initializePicBase_x86 ArchX86 os picBase tops
 107       Nothing -> return tops
 108
 109 cmmTopCodeGen _ (CmmData sec dat) = do
 110   return [CmmData sec dat]  -- no translation, we just use CmmStatic
 111
 112
 113 basicBlockCodeGen
 114         :: CmmBasicBlock
 115         -> NatM ( [NatBasicBlock Instr]
 116                 , [NatCmmTop Instr])
 117
 118 basicBlockCodeGen (BasicBlock id stmts) = do
 119   instrs <- stmtsToInstrs stmts
 120   -- code generation may introduce new basic block boundaries, which
 121   -- are indicated by the NEWBLOCK instruction.  We must split up the
 122   -- instruction stream into basic blocks again.  Also, we extract
 123   -- LDATAs here too.
 124   let
 125         (top,other_blocks,statics) = foldrOL mkBlocks ([],[],[]) instrs
 126
 127         mkBlocks (NEWBLOCK id) (instrs,blocks,statics)
 128           = ([], BasicBlock id instrs : blocks, statics)
 129         mkBlocks (LDATA sec dat) (instrs,blocks,statics)
 130           = (instrs, blocks, CmmData sec dat:statics)
 131         mkBlocks instr (instrs,blocks,statics)
 132           = (instr:instrs, blocks, statics)
 133   -- in
 134   return (BasicBlock id top : other_blocks, statics)
 135
 136
 137 stmtsToInstrs :: [CmmStmt] -> NatM InstrBlock
 138 stmtsToInstrs stmts
 139    = do instrss <- mapM stmtToInstrs stmts
 140         return (concatOL instrss)
 141
 142
 143 stmtToInstrs :: CmmStmt -> NatM InstrBlock
 144 stmtToInstrs stmt = case stmt of
 145     CmmNop         -> return nilOL
 146     CmmComment s   -> return (unitOL (COMMENT s))
 147
 148     CmmAssign reg src
 149       | isFloatType ty -> assignReg_FltCode size reg src
 150 #if WORD_SIZE_IN_BITS==32
 151       | isWord64 ty    -> assignReg_I64Code      reg src
 152 #endif
 153       | otherwise        -> assignReg_IntCode size reg src
 154         where ty = cmmRegType reg
 155               size = cmmTypeSize ty
 156
 157     CmmStore addr src
 158       | isFloatType ty -> assignMem_FltCode size addr src
 159 #if WORD_SIZE_IN_BITS==32
 160       | isWord64 ty      -> assignMem_I64Code      addr src
 161 #endif
 162       | otherwise        -> assignMem_IntCode size addr src
 163         where ty = cmmExprType src
 164               size = cmmTypeSize ty
 165
 166     CmmCall target result_regs args _ _
 167        -> genCCall target result_regs args
 168
 169     CmmBranch id          -> genBranch id
 170     CmmCondBranch arg id  -> genCondJump id arg
 171     CmmSwitch arg ids     -> genSwitch arg ids
 172     CmmJump arg params    -> genJump arg
 173     CmmReturn params      ->
 174       panic "stmtToInstrs: return statement should have been cps'd away"
 175
 176
 177 --------------------------------------------------------------------------------
 178 -- | 'InstrBlock's are the insn sequences generated by the insn selectors.
 179 --      They are really trees of insns to facilitate fast appending, where a
 180 --      left-to-right traversal yields the insns in the correct order.
 181 --
 182 type InstrBlock
 183         = OrdList Instr
 184
 185
 186 -- | Condition codes passed up the tree.
 187 --
 188 data CondCode
 189         = CondCode Bool Cond InstrBlock
 190
 191
 192 -- | a.k.a "Register64"
 193 --      Reg is the lower 32-bit temporary which contains the result.
 194 --      Use getHiVRegFromLo to find the other VRegUnique.
 195 --
 196 --      Rules of this simplified insn selection game are therefore that
 197 --      the returned Reg may be modified
 198 --
 199 data ChildCode64
 200    = ChildCode64
 201         InstrBlock
 202         Reg
 203
 204
 205 -- | Register's passed up the tree.  If the stix code forces the register
 206 --      to live in a pre-decided machine register, it comes out as @Fixed@;
 207 --      otherwise, it comes out as @Any@, and the parent can decide which
 208 --      register to put it in.
 209 --
 210 data Register
 211         = Fixed Size Reg InstrBlock
 212         | Any   Size (Reg -> InstrBlock)
 213
 214
 215 swizzleRegisterRep :: Register -> Size -> Register
 216 swizzleRegisterRep (Fixed _ reg code) size = Fixed size reg code
 217 swizzleRegisterRep (Any _ codefn)     size = Any   size codefn
 218
 219
 220 -- | Grab the Reg for a CmmReg
 221 getRegisterReg :: Bool -> CmmReg -> Reg
 222
 223 getRegisterReg use_sse2 (CmmLocal (LocalReg u pk))
 224   = let sz = cmmTypeSize pk in
 225     if isFloatSize sz && not use_sse2
 226        then RegVirtual (mkVirtualReg u FF80)
 227        else RegVirtual (mkVirtualReg u sz)
 228
 229 getRegisterReg _ (CmmGlobal mid)
 230   = case globalRegMaybe mid of
 231         Just reg -> RegReal $ reg
 232         Nothing  -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid)
 233         -- By this stage, the only MagicIds remaining should be the
 234         -- ones which map to a real machine register on this
 235         -- platform.  Hence ...
 236
 237
 238 -- | Memory addressing modes passed up the tree.
 239 data Amode
 240         = Amode AddrMode InstrBlock
 241
 242 {-
 243 Now, given a tree (the argument to an CmmLoad) that references memory,
 244 produce a suitable addressing mode.
 245
 246 A Rule of the Game (tm) for Amodes: use of the addr bit must
 247 immediately follow use of the code part, since the code part puts
 248 values in registers which the addr then refers to.  So you can't put
 249 anything in between, lest it overwrite some of those registers.  If
 250 you need to do some other computation between the code part and use of
 251 the addr bit, first store the effective address from the amode in a
 252 temporary, then do the other computation, and then use the temporary:
 253
 254     code
 255     LEA amode, tmp
 256     ... other computation ...
 257     ... (tmp) ...
 258 -}
 259
 260
 261 -- | Check whether an integer will fit in 32 bits.
 262 --      A CmmInt is intended to be truncated to the appropriate
 263 --      number of bits, so here we truncate it to Int64.  This is
 264 --      important because e.g. -1 as a CmmInt might be either
 265 --      -1 or 18446744073709551615.
 266 --
 267 is32BitInteger :: Integer -> Bool
 268 is32BitInteger i = i64 <= 0x7fffffff && i64 >= -0x80000000
 269   where i64 = fromIntegral i :: Int64
 270
 271
 272 -- | Convert a BlockId to some CmmStatic data
 273 jumpTableEntry :: Maybe BlockId -> CmmStatic
 274 jumpTableEntry Nothing = CmmStaticLit (CmmInt 0 wordWidth)
 275 jumpTableEntry (Just blockid) = CmmStaticLit (CmmLabel blockLabel)
 276     where blockLabel = mkAsmTempLabel (getUnique blockid)
 277
 278
 279 -- -----------------------------------------------------------------------------
 280 -- General things for putting together code sequences
 281
 282 -- Expand CmmRegOff.  ToDo: should we do it this way around, or convert
 283 -- CmmExprs into CmmRegOff?
 284 mangleIndexTree :: CmmExpr -> CmmExpr
 285 mangleIndexTree (CmmRegOff reg off)
 286   = CmmMachOp (MO_Add width) [CmmReg reg, CmmLit (CmmInt (fromIntegral off) width)]
 287   where width = typeWidth (cmmRegType reg)
 288
 289 -- | The dual to getAnyReg: compute an expression into a register, but
 290 --      we don't mind which one it is.
 291 getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock)
 292 getSomeReg expr = do
 293   r <- getRegister expr
 294   case r of
 295     Any rep code -> do
 296         tmp <- getNewRegNat rep
 297         return (tmp, code tmp)
 298     Fixed _ reg code ->
 299         return (reg, code)
 300
 301
 302
 303
 304
 305 assignMem_I64Code :: CmmExpr -> CmmExpr -> NatM InstrBlock
 306 assignMem_I64Code addrTree valueTree = do
 307   Amode addr addr_code <- getAmode addrTree
 308   ChildCode64 vcode rlo <- iselExpr64 valueTree
 309   let
 310         rhi = getHiVRegFromLo rlo
 311
 312         -- Little-endian store
 313         mov_lo = MOV II32 (OpReg rlo) (OpAddr addr)
 314         mov_hi = MOV II32 (OpReg rhi) (OpAddr (fromJust (addrOffset addr 4)))
 315   -- in
 316   return (vcode `appOL` addr_code `snocOL` mov_lo `snocOL` mov_hi)
 317
 318
 319 assignReg_I64Code :: CmmReg  -> CmmExpr -> NatM InstrBlock
 320 assignReg_I64Code (CmmLocal (LocalReg u_dst pk)) valueTree = do
 321    ChildCode64 vcode r_src_lo <- iselExpr64 valueTree
 322    let
 323          r_dst_lo = RegVirtual $ mkVirtualReg u_dst II32
 324          r_dst_hi = getHiVRegFromLo r_dst_lo
 325          r_src_hi = getHiVRegFromLo r_src_lo
 326          mov_lo = MOV II32 (OpReg r_src_lo) (OpReg r_dst_lo)
 327          mov_hi = MOV II32 (OpReg r_src_hi) (OpReg r_dst_hi)
 328    -- in
 329    return (
 330         vcode `snocOL` mov_lo `snocOL` mov_hi
 331      )
 332
 333 assignReg_I64Code lvalue valueTree
 334    = panic "assignReg_I64Code(i386): invalid lvalue"
 335
 336
 337
 338
 339 iselExpr64        :: CmmExpr -> NatM ChildCode64
 340 iselExpr64 (CmmLit (CmmInt i _)) = do
 341   (rlo,rhi) <- getNewRegPairNat II32
 342   let
 343         r = fromIntegral (fromIntegral i :: Word32)
 344         q = fromIntegral ((fromIntegral i `shiftR` 32) :: Word32)
 345         code = toOL [
 346                 MOV II32 (OpImm (ImmInteger r)) (OpReg rlo),
 347                 MOV II32 (OpImm (ImmInteger q)) (OpReg rhi)
 348                 ]
 349   -- in
 350   return (ChildCode64 code rlo)
 351
 352 iselExpr64 (CmmLoad addrTree ty) | isWord64 ty = do
 353    Amode addr addr_code <- getAmode addrTree
 354    (rlo,rhi) <- getNewRegPairNat II32
 355    let
 356         mov_lo = MOV II32 (OpAddr addr) (OpReg rlo)
 357         mov_hi = MOV II32 (OpAddr (fromJust (addrOffset addr 4))) (OpReg rhi)
 358    -- in
 359    return (
 360             ChildCode64 (addr_code `snocOL` mov_lo `snocOL` mov_hi)
 361                         rlo
 362      )
 363
 364 iselExpr64 (CmmReg (CmmLocal (LocalReg vu ty))) | isWord64 ty
 365    = return (ChildCode64 nilOL (RegVirtual $ mkVirtualReg vu II32))
 366
 367 -- we handle addition, but rather badly
 368 iselExpr64 (CmmMachOp (MO_Add _) [e1, CmmLit (CmmInt i _)]) = do
 369    ChildCode64 code1 r1lo <- iselExpr64 e1
 370    (rlo,rhi) <- getNewRegPairNat II32
 371    let
 372         r = fromIntegral (fromIntegral i :: Word32)
 373         q = fromIntegral ((fromIntegral i `shiftR` 32) :: Word32)
 374         r1hi = getHiVRegFromLo r1lo
 375         code =  code1 `appOL`
 376                 toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
 377                        ADD II32 (OpImm (ImmInteger r)) (OpReg rlo),
 378                        MOV II32 (OpReg r1hi) (OpReg rhi),
 379                        ADC II32 (OpImm (ImmInteger q)) (OpReg rhi) ]
 380    -- in
 381    return (ChildCode64 code rlo)
 382
 383 iselExpr64 (CmmMachOp (MO_Add _) [e1,e2]) = do
 384    ChildCode64 code1 r1lo <- iselExpr64 e1
 385    ChildCode64 code2 r2lo <- iselExpr64 e2
 386    (rlo,rhi) <- getNewRegPairNat II32
 387    let
 388         r1hi = getHiVRegFromLo r1lo
 389         r2hi = getHiVRegFromLo r2lo
 390         code =  code1 `appOL`
 391                 code2 `appOL`
 392                 toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
 393                        ADD II32 (OpReg r2lo) (OpReg rlo),
 394                        MOV II32 (OpReg r1hi) (OpReg rhi),
 395                        ADC II32 (OpReg r2hi) (OpReg rhi) ]
 396    -- in
 397    return (ChildCode64 code rlo)
 398
 399 iselExpr64 (CmmMachOp (MO_UU_Conv _ W64) [expr]) = do
 400      fn <- getAnyReg expr
 401      r_dst_lo <-  getNewRegNat II32
 402      let r_dst_hi = getHiVRegFromLo r_dst_lo
 403          code = fn r_dst_lo
 404      return (
 405              ChildCode64 (code `snocOL`
 406                           MOV II32 (OpImm (ImmInt 0)) (OpReg r_dst_hi))
 407                           r_dst_lo
 408             )
 409
 410 iselExpr64 expr
 411    = pprPanic "iselExpr64(i386)" (ppr expr)
 412
 413
 414
 415 --------------------------------------------------------------------------------
 416 getRegister :: CmmExpr -> NatM Register
 417
 418 #if !x86_64_TARGET_ARCH
 419     -- on x86_64, we have %rip for PicBaseReg, but it's not a full-featured
 420     -- register, it can only be used for rip-relative addressing.
 421 getRegister (CmmReg (CmmGlobal PicBaseReg))
 422   = do
 423       reg <- getPicBaseNat archWordSize
 424       return (Fixed archWordSize reg nilOL)
 425 #endif
 426
 427 getRegister (CmmReg reg)
 428   = do use_sse2 <- sse2Enabled
 429        let
 430          sz = cmmTypeSize (cmmRegType reg)
 431          size | not use_sse2 && isFloatSize sz = FF80
 432               | otherwise                      = sz
 433        --
 434        return (Fixed sz (getRegisterReg use_sse2 reg) nilOL)
 435
 436
 437 getRegister tree@(CmmRegOff _ _)
 438   = getRegister (mangleIndexTree tree)
 439
 440
 441 #if WORD_SIZE_IN_BITS==32
 442     -- for 32-bit architectuers, support some 64 -> 32 bit conversions:
 443     -- TO_W_(x), TO_W_(x >> 32)
 444
 445 getRegister (CmmMachOp (MO_UU_Conv W64 W32)
 446              [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]]) = do
 447   ChildCode64 code rlo <- iselExpr64 x
 448   return $ Fixed II32 (getHiVRegFromLo rlo) code
 449
 450 getRegister (CmmMachOp (MO_SS_Conv W64 W32)
 451              [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]]) = do
 452   ChildCode64 code rlo <- iselExpr64 x
 453   return $ Fixed II32 (getHiVRegFromLo rlo) code
 454
 455 getRegister (CmmMachOp (MO_UU_Conv W64 W32) [x]) = do
 456   ChildCode64 code rlo <- iselExpr64 x
 457   return $ Fixed II32 rlo code
 458
 459 getRegister (CmmMachOp (MO_SS_Conv W64 W32) [x]) = do
 460   ChildCode64 code rlo <- iselExpr64 x
 461   return $ Fixed II32 rlo code
 462
 463 #endif
 464
 465
 466 getRegister (CmmLit lit@(CmmFloat f w)) =
 467   if_sse2 float_const_sse2 float_const_x87
 468  where
 469   float_const_sse2
 470     | f == 0.0 = do
 471       let
 472           size = floatSize w
 473           code dst = unitOL  (XOR size (OpReg dst) (OpReg dst))
 474         -- I don't know why there are xorpd, xorps, and pxor instructions.
 475         -- They all appear to do the same thing --SDM
 476       return (Any size code)
 477
 478    | otherwise = do
 479       Amode addr code <- memConstant (widthInBytes w) lit
 480       loadFloatAmode True w addr code
 481
 482   float_const_x87 = case w of
 483     W64
 484       | f == 0.0 ->
 485         let code dst = unitOL (GLDZ dst)
 486         in  return (Any FF80 code)
 487
 488       | f == 1.0 ->
 489         let code dst = unitOL (GLD1 dst)
 490         in  return (Any FF80 code)
 491
 492     _otherwise -> do
 493       Amode addr code <- memConstant (widthInBytes w) lit
 494       loadFloatAmode False w addr code
 495
 496 -- catch simple cases of zero- or sign-extended load
 497 getRegister (CmmMachOp (MO_UU_Conv W8 W32) [CmmLoad addr _]) = do
 498   code <- intLoadCode (MOVZxL II8) addr
 499   return (Any II32 code)
 500
 501 getRegister (CmmMachOp (MO_SS_Conv W8 W32) [CmmLoad addr _]) = do
 502   code <- intLoadCode (MOVSxL II8) addr
 503   return (Any II32 code)
 504
 505 getRegister (CmmMachOp (MO_UU_Conv W16 W32) [CmmLoad addr _]) = do
 506   code <- intLoadCode (MOVZxL II16) addr
 507   return (Any II32 code)
 508
 509 getRegister (CmmMachOp (MO_SS_Conv W16 W32) [CmmLoad addr _]) = do
 510   code <- intLoadCode (MOVSxL II16) addr
 511   return (Any II32 code)
 512
 513
 514 #if x86_64_TARGET_ARCH
 515
 516 -- catch simple cases of zero- or sign-extended load
 517 getRegister (CmmMachOp (MO_UU_Conv W8 W64) [CmmLoad addr _]) = do
 518   code <- intLoadCode (MOVZxL II8) addr
 519   return (Any II64 code)
 520
 521 getRegister (CmmMachOp (MO_SS_Conv W8 W64) [CmmLoad addr _]) = do
 522   code <- intLoadCode (MOVSxL II8) addr
 523   return (Any II64 code)
 524
 525 getRegister (CmmMachOp (MO_UU_Conv W16 W64) [CmmLoad addr _]) = do
 526   code <- intLoadCode (MOVZxL II16) addr
 527   return (Any II64 code)
 528
 529 getRegister (CmmMachOp (MO_SS_Conv W16 W64) [CmmLoad addr _]) = do
 530   code <- intLoadCode (MOVSxL II16) addr
 531   return (Any II64 code)
 532
 533 getRegister (CmmMachOp (MO_UU_Conv W32 W64) [CmmLoad addr _]) = do
 534   code <- intLoadCode (MOV II32) addr -- 32-bit loads zero-extend
 535   return (Any II64 code)
 536
 537 getRegister (CmmMachOp (MO_SS_Conv W32 W64) [CmmLoad addr _]) = do
 538   code <- intLoadCode (MOVSxL II32) addr
 539   return (Any II64 code)
 540
 541 getRegister (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg),
 542                                      CmmLit displacement])
 543     = return $ Any II64 (\dst -> unitOL $
 544         LEA II64 (OpAddr (ripRel (litToImm displacement))) (OpReg dst))
 545
 546 #endif /* x86_64_TARGET_ARCH */
 547
 548
 549
 550
 551
 552 getRegister (CmmMachOp mop [x]) = do -- unary MachOps
 553     sse2 <- sse2Enabled
 554     case mop of
 555       MO_F_Neg w
 556          | sse2      -> sse2NegCode w x
 557          | otherwise -> trivialUFCode FF80 (GNEG FF80) x
 558
 559       MO_S_Neg w -> triv_ucode NEGI (intSize w)
 560       MO_Not w   -> triv_ucode NOT  (intSize w)
 561
 562       -- Nop conversions
 563       MO_UU_Conv W32 W8  -> toI8Reg  W32 x
 564       MO_SS_Conv W32 W8  -> toI8Reg  W32 x
 565       MO_UU_Conv W16 W8  -> toI8Reg  W16 x
 566       MO_SS_Conv W16 W8  -> toI8Reg  W16 x
 567       MO_UU_Conv W32 W16 -> toI16Reg W32 x
 568       MO_SS_Conv W32 W16 -> toI16Reg W32 x
 569
 570 #if x86_64_TARGET_ARCH
 571       MO_UU_Conv W64 W32 -> conversionNop II64 x
 572       MO_SS_Conv W64 W32 -> conversionNop II64 x
 573       MO_UU_Conv W64 W16 -> toI16Reg W64 x
 574       MO_SS_Conv W64 W16 -> toI16Reg W64 x
 575       MO_UU_Conv W64 W8  -> toI8Reg  W64 x
 576       MO_SS_Conv W64 W8  -> toI8Reg  W64 x
 577 #endif
 578
 579       MO_UU_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intSize rep1) x
 580       MO_SS_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intSize rep1) x
 581
 582       -- widenings
 583       MO_UU_Conv W8  W32 -> integerExtend W8  W32 MOVZxL x
 584       MO_UU_Conv W16 W32 -> integerExtend W16 W32 MOVZxL x
 585       MO_UU_Conv W8  W16 -> integerExtend W8  W16 MOVZxL x
 586
 587       MO_SS_Conv W8  W32 -> integerExtend W8  W32 MOVSxL x
 588       MO_SS_Conv W16 W32 -> integerExtend W16 W32 MOVSxL x
 589       MO_SS_Conv W8  W16 -> integerExtend W8  W16 MOVSxL x
 590
 591 #if x86_64_TARGET_ARCH
 592       MO_UU_Conv W8  W64 -> integerExtend W8  W64 MOVZxL x
 593       MO_UU_Conv W16 W64 -> integerExtend W16 W64 MOVZxL x
 594       MO_UU_Conv W32 W64 -> integerExtend W32 W64 MOVZxL x
 595       MO_SS_Conv W8  W64 -> integerExtend W8  W64 MOVSxL x
 596       MO_SS_Conv W16 W64 -> integerExtend W16 W64 MOVSxL x
 597       MO_SS_Conv W32 W64 -> integerExtend W32 W64 MOVSxL x
 598         -- for 32-to-64 bit zero extension, amd64 uses an ordinary movl.
 599         -- However, we don't want the register allocator to throw it
 600         -- away as an unnecessary reg-to-reg move, so we keep it in
 601         -- the form of a movzl and print it as a movl later.
 602 #endif
 603
 604       MO_FF_Conv W32 W64
 605         | sse2      -> coerceFP2FP W64 x
 606         | otherwise -> conversionNop FF80 x
 607
 608       MO_FF_Conv W64 W32 -> coerceFP2FP W32 x
 609
 610       MO_FS_Conv from to -> coerceFP2Int from to x
 611       MO_SF_Conv from to -> coerceInt2FP from to x
 612
 613       other -> pprPanic "getRegister" (pprMachOp mop)
 614    where
 615         triv_ucode :: (Size -> Operand -> Instr) -> Size -> NatM Register
 616         triv_ucode instr size = trivialUCode size (instr size) x
 617
 618         -- signed or unsigned extension.
 619         integerExtend :: Width -> Width
 620                       -> (Size -> Operand -> Operand -> Instr)
 621                       -> CmmExpr -> NatM Register
 622         integerExtend from to instr expr = do
 623             (reg,e_code) <- if from == W8 then getByteReg expr
 624                                           else getSomeReg expr
 625             let
 626                 code dst =
 627                   e_code `snocOL`
 628                   instr (intSize from) (OpReg reg) (OpReg dst)
 629             return (Any (intSize to) code)
 630
 631         toI8Reg :: Width -> CmmExpr -> NatM Register
 632         toI8Reg new_rep expr
 633             = do codefn <- getAnyReg expr
 634                  return (Any (intSize new_rep) codefn)
 635                 -- HACK: use getAnyReg to get a byte-addressable register.
 636                 -- If the source was a Fixed register, this will add the
 637                 -- mov instruction to put it into the desired destination.
 638                 -- We're assuming that the destination won't be a fixed
 639                 -- non-byte-addressable register; it won't be, because all
 640                 -- fixed registers are word-sized.
 641
 642         toI16Reg = toI8Reg -- for now
 643
 644         conversionNop :: Size -> CmmExpr -> NatM Register
 645         conversionNop new_size expr
 646             = do e_code <- getRegister expr
 647                  return (swizzleRegisterRep e_code new_size)
 648
 649
 650 getRegister e@(CmmMachOp mop [x, y]) = do -- dyadic MachOps
 651   sse2 <- sse2Enabled
 652   case mop of
 653       MO_F_Eq w -> condFltReg EQQ x y
 654       MO_F_Ne w -> condFltReg NE x y
 655       MO_F_Gt w -> condFltReg GTT x y
 656       MO_F_Ge w -> condFltReg GE x y
 657       MO_F_Lt w -> condFltReg LTT x y
 658       MO_F_Le w -> condFltReg LE x y
 659
 660       MO_Eq rep   -> condIntReg EQQ x y
 661       MO_Ne rep   -> condIntReg NE x y
 662
 663       MO_S_Gt rep -> condIntReg GTT x y
 664       MO_S_Ge rep -> condIntReg GE x y
 665       MO_S_Lt rep -> condIntReg LTT x y
 666       MO_S_Le rep -> condIntReg LE x y
 667
 668       MO_U_Gt rep -> condIntReg GU  x y
 669       MO_U_Ge rep -> condIntReg GEU x y
 670       MO_U_Lt rep -> condIntReg LU  x y
 671       MO_U_Le rep -> condIntReg LEU x y
 672
 673       MO_F_Add w  | sse2      -> trivialFCode_sse2 w ADD  x y
 674                   | otherwise -> trivialFCode_x87  w GADD x y
 675       MO_F_Sub w  | sse2      -> trivialFCode_sse2 w SUB  x y
 676                   | otherwise -> trivialFCode_x87  w GSUB x y
 677       MO_F_Quot w | sse2      -> trivialFCode_sse2 w FDIV x y
 678                   | otherwise -> trivialFCode_x87  w GDIV x y
 679       MO_F_Mul w  | sse2      -> trivialFCode_sse2 w MUL x y
 680                   | otherwise -> trivialFCode_x87  w GMUL x y
 681
 682       MO_Add rep -> add_code rep x y
 683       MO_Sub rep -> sub_code rep x y
 684
 685       MO_S_Quot rep -> div_code rep True  True  x y
 686       MO_S_Rem  rep -> div_code rep True  False x y
 687       MO_U_Quot rep -> div_code rep False True  x y
 688       MO_U_Rem  rep -> div_code rep False False x y
 689
 690       MO_S_MulMayOflo rep -> imulMayOflo rep x y
 691
 692       MO_Mul rep -> triv_op rep IMUL
 693       MO_And rep -> triv_op rep AND
 694       MO_Or  rep -> triv_op rep OR
 695       MO_Xor rep -> triv_op rep XOR
 696
 697         {- Shift ops on x86s have constraints on their source, it
 698            either has to be Imm, CL or 1
 699             => trivialCode is not restrictive enough (sigh.)
 700         -}
 701       MO_Shl rep   -> shift_code rep SHL x y {-False-}
 702       MO_U_Shr rep -> shift_code rep SHR x y {-False-}
 703       MO_S_Shr rep -> shift_code rep SAR x y {-False-}
 704
 705       other -> pprPanic "getRegister(x86) - binary CmmMachOp (1)" (pprMachOp mop)
 706   where
 707     --------------------
 708     triv_op width instr = trivialCode width op (Just op) x y
 709                         where op   = instr (intSize width)
 710
 711     imulMayOflo :: Width -> CmmExpr -> CmmExpr -> NatM Register
 712     imulMayOflo rep a b = do
 713          (a_reg, a_code) <- getNonClobberedReg a
 714          b_code <- getAnyReg b
 715          let
 716              shift_amt  = case rep of
 717                            W32 -> 31
 718                            W64 -> 63
 719                            _ -> panic "shift_amt"
 720
 721              size = intSize rep
 722              code = a_code `appOL` b_code eax `appOL`
 723                         toOL [
 724                            IMUL2 size (OpReg a_reg),   -- result in %edx:%eax
 725                            SAR size (OpImm (ImmInt shift_amt)) (OpReg eax),
 726                                 -- sign extend lower part
 727                            SUB size (OpReg edx) (OpReg eax)
 728                                 -- compare against upper
 729                            -- eax==0 if high part == sign extended low part
 730                         ]
 731          -- in
 732          return (Fixed size eax code)
 733
 734     --------------------
 735     shift_code :: Width
 736                -> (Size -> Operand -> Operand -> Instr)
 737                -> CmmExpr
 738                -> CmmExpr
 739                -> NatM Register
 740
 741     {- Case1: shift length as immediate -}
 742     shift_code width instr x y@(CmmLit lit) = do
 743           x_code <- getAnyReg x
 744           let
 745                size = intSize width
 746                code dst
 747                   = x_code dst `snocOL`
 748                     instr size (OpImm (litToImm lit)) (OpReg dst)
 749           -- in
 750           return (Any size code)
 751
 752     {- Case2: shift length is complex (non-immediate)
 753       * y must go in %ecx.
 754       * we cannot do y first *and* put its result in %ecx, because
 755         %ecx might be clobbered by x.
 756       * if we do y second, then x cannot be
 757         in a clobbered reg.  Also, we cannot clobber x's reg
 758         with the instruction itself.
 759       * so we can either:
 760         - do y first, put its result in a fresh tmp, then copy it to %ecx later
 761         - do y second and put its result into %ecx.  x gets placed in a fresh
 762           tmp.  This is likely to be better, becuase the reg alloc can
 763           eliminate this reg->reg move here (it won't eliminate the other one,
 764           because the move is into the fixed %ecx).
 765     -}
 766     shift_code width instr x y{-amount-} = do
 767         x_code <- getAnyReg x
 768         let size = intSize width
 769         tmp <- getNewRegNat size
 770         y_code <- getAnyReg y
 771         let
 772            code = x_code tmp `appOL`
 773                   y_code ecx `snocOL`
 774                   instr size (OpReg ecx) (OpReg tmp)
 775         -- in
 776         return (Fixed size tmp code)
 777
 778     --------------------
 779     add_code :: Width -> CmmExpr -> CmmExpr -> NatM Register
 780     add_code rep x (CmmLit (CmmInt y _))
 781         | is32BitInteger y = add_int rep x y
 782     add_code rep x y = trivialCode rep (ADD size) (Just (ADD size)) x y
 783       where size = intSize rep
 784
 785     --------------------
 786     sub_code :: Width -> CmmExpr -> CmmExpr -> NatM Register
 787     sub_code rep x (CmmLit (CmmInt y _))
 788         | is32BitInteger (-y) = add_int rep x (-y)
 789     sub_code rep x y = trivialCode rep (SUB (intSize rep)) Nothing x y
 790
 791     -- our three-operand add instruction:
 792     add_int width x y = do
 793         (x_reg, x_code) <- getSomeReg x
 794         let
 795             size = intSize width
 796             imm = ImmInt (fromInteger y)
 797             code dst
 798                = x_code `snocOL`
 799                  LEA size
 800                         (OpAddr (AddrBaseIndex (EABaseReg x_reg) EAIndexNone imm))
 801                         (OpReg dst)
 802         --
 803         return (Any size code)
 804
 805     ----------------------
 806     div_code width signed quotient x y = do
 807            (y_op, y_code) <- getRegOrMem y -- cannot be clobbered
 808            x_code <- getAnyReg x
 809            let
 810              size = intSize width
 811              widen | signed    = CLTD size
 812                    | otherwise = XOR size (OpReg edx) (OpReg edx)
 813
 814              instr | signed    = IDIV
 815                    | otherwise = DIV
 816
 817              code = y_code `appOL`
 818                     x_code eax `appOL`
 819                     toOL [widen, instr size y_op]
 820
 821              result | quotient  = eax
 822                     | otherwise = edx
 823
 824            -- in
 825            return (Fixed size result code)
 826
 827
 828 getRegister (CmmLoad mem pk)
 829   | isFloatType pk
 830   = do
 831     Amode addr mem_code <- getAmode mem
 832     use_sse2 <- sse2Enabled
 833     loadFloatAmode use_sse2 (typeWidth pk) addr mem_code
 834
 835 #if i386_TARGET_ARCH
 836 getRegister (CmmLoad mem pk)
 837   | not (isWord64 pk)
 838   = do
 839     code <- intLoadCode instr mem
 840     return (Any size code)
 841   where
 842     width = typeWidth pk
 843     size = intSize width
 844     instr = case width of
 845                 W8     -> MOVZxL II8
 846                 _other -> MOV size
 847         -- We always zero-extend 8-bit loads, if we
 848         -- can't think of anything better.  This is because
 849         -- we can't guarantee access to an 8-bit variant of every register
 850         -- (esi and edi don't have 8-bit variants), so to make things
 851         -- simpler we do our 8-bit arithmetic with full 32-bit registers.
 852 #endif
 853
 854 #if x86_64_TARGET_ARCH
 855 -- Simpler memory load code on x86_64
 856 getRegister (CmmLoad mem pk)
 857   = do
 858     code <- intLoadCode (MOV size) mem
 859     return (Any size code)
 860   where size = intSize $ typeWidth pk
 861 #endif
 862
 863 getRegister (CmmLit (CmmInt 0 width))
 864   = let
 865         size = intSize width
 866
 867         -- x86_64: 32-bit xor is one byte shorter, and zero-extends to 64 bits
 868         adj_size = case size of II64 -> II32; _ -> size
 869         size1 = IF_ARCH_i386( size, adj_size )
 870         code dst
 871            = unitOL (XOR size1 (OpReg dst) (OpReg dst))
 872     in
 873         return (Any size code)
 874
 875 #if x86_64_TARGET_ARCH
 876   -- optimisation for loading small literals on x86_64: take advantage
 877   -- of the automatic zero-extension from 32 to 64 bits, because the 32-bit
 878   -- instruction forms are shorter.
 879 getRegister (CmmLit lit)
 880   | isWord64 (cmmLitType lit), not (isBigLit lit)
 881   = let
 882         imm = litToImm lit
 883         code dst = unitOL (MOV II32 (OpImm imm) (OpReg dst))
 884     in
 885         return (Any II64 code)
 886   where
 887    isBigLit (CmmInt i _) = i < 0 || i > 0xffffffff
 888    isBigLit _ = False
 889         -- note1: not the same as (not.is32BitLit), because that checks for
 890         -- signed literals that fit in 32 bits, but we want unsigned
 891         -- literals here.
 892         -- note2: all labels are small, because we're assuming the
 893         -- small memory model (see gcc docs, -mcmodel=small).
 894 #endif
 895
 896 getRegister (CmmLit lit)
 897   = let
 898         size = cmmTypeSize (cmmLitType lit)
 899         imm = litToImm lit
 900         code dst = unitOL (MOV size (OpImm imm) (OpReg dst))
 901     in
 902         return (Any size code)
 903
 904 getRegister other = pprPanic "getRegister(x86)" (ppr other)
 905
 906
 907 intLoadCode :: (Operand -> Operand -> Instr) -> CmmExpr
 908    -> NatM (Reg -> InstrBlock)
 909 intLoadCode instr mem = do
 910   Amode src mem_code <- getAmode mem
 911   return (\dst -> mem_code `snocOL` instr (OpAddr src) (OpReg dst))
 912
 913 -- Compute an expression into *any* register, adding the appropriate
 914 -- move instruction if necessary.
 915 getAnyReg :: CmmExpr -> NatM (Reg -> InstrBlock)
 916 getAnyReg expr = do
 917   r <- getRegister expr
 918   anyReg r
 919
 920 anyReg :: Register -> NatM (Reg -> InstrBlock)
 921 anyReg (Any _ code)          = return code
 922 anyReg (Fixed rep reg fcode) = return (\dst -> fcode `snocOL` reg2reg rep reg dst)
 923
 924 -- A bit like getSomeReg, but we want a reg that can be byte-addressed.
 925 -- Fixed registers might not be byte-addressable, so we make sure we've
 926 -- got a temporary, inserting an extra reg copy if necessary.
 927 getByteReg :: CmmExpr -> NatM (Reg, InstrBlock)
 928 #if x86_64_TARGET_ARCH
 929 getByteReg = getSomeReg -- all regs are byte-addressable on x86_64
 930 #else
 931 getByteReg expr = do
 932   r <- getRegister expr
 933   case r of
 934     Any rep code -> do
 935         tmp <- getNewRegNat rep
 936         return (tmp, code tmp)
 937     Fixed rep reg code
 938         | isVirtualReg reg -> return (reg,code)
 939         | otherwise -> do
 940             tmp <- getNewRegNat rep
 941             return (tmp, code `snocOL` reg2reg rep reg tmp)
 942         -- ToDo: could optimise slightly by checking for byte-addressable
 943         -- real registers, but that will happen very rarely if at all.
 944 #endif
 945
 946 -- Another variant: this time we want the result in a register that cannot
 947 -- be modified by code to evaluate an arbitrary expression.
 948 getNonClobberedReg :: CmmExpr -> NatM (Reg, InstrBlock)
 949 getNonClobberedReg expr = do
 950   r <- getRegister expr
 951   case r of
 952     Any rep code -> do
 953         tmp <- getNewRegNat rep
 954         return (tmp, code tmp)
 955     Fixed rep reg code
 956         -- only free regs can be clobbered
 957         | RegReal (RealRegSingle rr) <- reg
 958         , isFastTrue (freeReg rr)
 959         -> do
 960                 tmp <- getNewRegNat rep
 961                 return (tmp, code `snocOL` reg2reg rep reg tmp)
 962         | otherwise ->
 963                 return (reg, code)
 964
 965 reg2reg :: Size -> Reg -> Reg -> Instr
 966 reg2reg size src dst
 967   | size == FF80 = GMOV src dst
 968   | otherwise    = MOV size (OpReg src) (OpReg dst)
 969
 970
 971 --------------------------------------------------------------------------------
 972 getAmode :: CmmExpr -> NatM Amode
 973 getAmode tree@(CmmRegOff _ _) = getAmode (mangleIndexTree tree)
 974
 975 #if x86_64_TARGET_ARCH
 976
 977 getAmode (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg),
 978                                      CmmLit displacement])
 979     = return $ Amode (ripRel (litToImm displacement)) nilOL
 980
 981 #endif
 982
 983
 984 -- This is all just ridiculous, since it carefully undoes
 985 -- what mangleIndexTree has just done.
 986 getAmode (CmmMachOp (MO_Sub rep) [x, CmmLit lit@(CmmInt i _)])
 987   | is32BitLit lit
 988   -- ASSERT(rep == II32)???
 989   = do (x_reg, x_code) <- getSomeReg x
 990        let off = ImmInt (-(fromInteger i))
 991        return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code)
 992
 993 getAmode (CmmMachOp (MO_Add rep) [x, CmmLit lit])
 994   | is32BitLit lit
 995   -- ASSERT(rep == II32)???
 996   = do (x_reg, x_code) <- getSomeReg x
 997        let off = litToImm lit
 998        return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code)
 999
1000 -- Turn (lit1 << n  + lit2) into  (lit2 + lit1 << n) so it will be
1001 -- recognised by the next rule.
1002 getAmode (CmmMachOp (MO_Add rep) [a@(CmmMachOp (MO_Shl _) _),
1003                                   b@(CmmLit _)])
1004   = getAmode (CmmMachOp (MO_Add rep) [b,a])
1005
1006 getAmode (CmmMachOp (MO_Add rep) [x, CmmMachOp (MO_Shl _)
1007                                         [y, CmmLit (CmmInt shift _)]])
1008   | shift == 0 || shift == 1 || shift == 2 || shift == 3
1009   = x86_complex_amode x y shift 0
1010
1011 getAmode (CmmMachOp (MO_Add rep)
1012                 [x, CmmMachOp (MO_Add _)
1013                         [CmmMachOp (MO_Shl _) [y, CmmLit (CmmInt shift _)],
1014                          CmmLit (CmmInt offset _)]])
1015   | shift == 0 || shift == 1 || shift == 2 || shift == 3
1016   && is32BitInteger offset
1017   = x86_complex_amode x y shift offset
1018
1019 getAmode (CmmMachOp (MO_Add rep) [x,y])
1020   = x86_complex_amode x y 0 0
1021
1022 getAmode (CmmLit lit) | is32BitLit lit
1023   = return (Amode (ImmAddr (litToImm lit) 0) nilOL)
1024
1025 getAmode expr = do
1026   (reg,code) <- getSomeReg expr
1027   return (Amode (AddrBaseIndex (EABaseReg reg) EAIndexNone (ImmInt 0)) code)
1028
1029
1030 x86_complex_amode :: CmmExpr -> CmmExpr -> Integer -> Integer -> NatM Amode
1031 x86_complex_amode base index shift offset
1032   = do (x_reg, x_code) <- getNonClobberedReg base
1033         -- x must be in a temp, because it has to stay live over y_code
1034         -- we could compre x_reg and y_reg and do something better here...
1035        (y_reg, y_code) <- getSomeReg index
1036        let
1037            code = x_code `appOL` y_code
1038            base = case shift of 0 -> 1; 1 -> 2; 2 -> 4; 3 -> 8
1039        return (Amode (AddrBaseIndex (EABaseReg x_reg) (EAIndex y_reg base) (ImmInt (fromIntegral offset)))
1040                code)
1041
1042
1043
1044
1045 -- -----------------------------------------------------------------------------
1046 -- getOperand: sometimes any operand will do.
1047
1048 -- getNonClobberedOperand: the value of the operand will remain valid across
1049 -- the computation of an arbitrary expression, unless the expression
1050 -- is computed directly into a register which the operand refers to
1051 -- (see trivialCode where this function is used for an example).
1052
1053 getNonClobberedOperand :: CmmExpr -> NatM (Operand, InstrBlock)
1054 getNonClobberedOperand (CmmLit lit) = do
1055   use_sse2 <- sse2Enabled
1056   if use_sse2 && isSuitableFloatingPointLit lit
1057     then do
1058       let CmmFloat _ w = lit
1059       Amode addr code <- memConstant (widthInBytes w) lit
1060       return (OpAddr addr, code)
1061      else do
1062
1063   if is32BitLit lit && not (isFloatType (cmmLitType lit))
1064     then return (OpImm (litToImm lit), nilOL)
1065     else getNonClobberedOperand_generic (CmmLit lit)
1066
1067 getNonClobberedOperand (CmmLoad mem pk) = do
1068   use_sse2 <- sse2Enabled
1069   if (not (isFloatType pk) || use_sse2)
1070       && IF_ARCH_i386(not (isWord64 pk), True)
1071     then do
1072       Amode src mem_code <- getAmode mem
1073       (src',save_code) <-
1074         if (amodeCouldBeClobbered src)
1075                 then do
1076                    tmp <- getNewRegNat archWordSize
1077                    return (AddrBaseIndex (EABaseReg tmp) EAIndexNone (ImmInt 0),
1078                            unitOL (LEA II32 (OpAddr src) (OpReg tmp)))
1079                 else
1080                    return (src, nilOL)
1081       return (OpAddr src', save_code `appOL` mem_code)
1082     else do
1083       getNonClobberedOperand_generic (CmmLoad mem pk)
1084
1085 getNonClobberedOperand e = getNonClobberedOperand_generic e
1086
1087 getNonClobberedOperand_generic :: CmmExpr -> NatM (Operand, InstrBlock)
1088 getNonClobberedOperand_generic e = do
1089     (reg, code) <- getNonClobberedReg e
1090     return (OpReg reg, code)
1091
1092 amodeCouldBeClobbered :: AddrMode -> Bool
1093 amodeCouldBeClobbered amode = any regClobbered (addrModeRegs amode)
1094
1095 regClobbered (RegReal (RealRegSingle rr)) = isFastTrue (freeReg rr)
1096 regClobbered _ = False
1097
1098 -- getOperand: the operand is not required to remain valid across the
1099 -- computation of an arbitrary expression.
1100 getOperand :: CmmExpr -> NatM (Operand, InstrBlock)
1101
1102 getOperand (CmmLit lit) = do
1103   use_sse2 <- sse2Enabled
1104   if (use_sse2 && isSuitableFloatingPointLit lit)
1105     then do
1106       let CmmFloat _ w = lit
1107       Amode addr code <- memConstant (widthInBytes w) lit
1108       return (OpAddr addr, code)
1109     else do
1110
1111   if is32BitLit lit && not (isFloatType (cmmLitType lit))
1112     then return (OpImm (litToImm lit), nilOL)
1113     else getOperand_generic (CmmLit lit)
1114
1115 getOperand (CmmLoad mem pk) = do
1116   use_sse2 <- sse2Enabled
1117   if (not (isFloatType pk) || use_sse2) && IF_ARCH_i386(not (isWord64 pk), True)
1118      then do
1119        Amode src mem_code <- getAmode mem
1120        return (OpAddr src, mem_code)
1121      else
1122        getOperand_generic (CmmLoad mem pk)
1123
1124 getOperand e = getOperand_generic e
1125
1126 getOperand_generic e = do
1127     (reg, code) <- getSomeReg e
1128     return (OpReg reg, code)
1129
1130 isOperand :: CmmExpr -> Bool
1131 isOperand (CmmLoad _ _) = True
1132 isOperand (CmmLit lit)  = is32BitLit lit
1133                           || isSuitableFloatingPointLit lit
1134 isOperand _             = False
1135
1136 memConstant :: Int -> CmmLit -> NatM Amode
1137 memConstant align lit = do
1138 #ifdef x86_64_TARGET_ARCH
1139   lbl <- getNewLabelNat
1140   let addr = ripRel (ImmCLbl lbl)
1141       addr_code = nilOL
1142 #else
1143   lbl <- getNewLabelNat
1144   dflags <- getDynFlagsNat
1145   dynRef <- cmmMakeDynamicReference dflags addImportNat DataReference lbl
1146   Amode addr addr_code <- getAmode dynRef
1147 #endif
1148   let code =
1149         LDATA ReadOnlyData
1150                 [CmmAlign align,
1151                  CmmDataLabel lbl,
1152                  CmmStaticLit lit]
1153         `consOL` addr_code
1154   return (Amode addr code)
1155
1156
1157 loadFloatAmode :: Bool -> Width -> AddrMode -> InstrBlock -> NatM Register
1158 loadFloatAmode use_sse2 w addr addr_code = do
1159   let size = floatSize w
1160       code dst = addr_code `snocOL`
1161                  if use_sse2
1162                     then MOV size (OpAddr addr) (OpReg dst)
1163                     else GLD size addr dst
1164   -- in
1165   return (Any (if use_sse2 then size else FF80) code)
1166
1167
1168 -- if we want a floating-point literal as an operand, we can
1169 -- use it directly from memory.  However, if the literal is
1170 -- zero, we're better off generating it into a register using
1171 -- xor.
1172 isSuitableFloatingPointLit (CmmFloat f _) = f /= 0.0
1173 isSuitableFloatingPointLit _ = False
1174
1175 getRegOrMem :: CmmExpr -> NatM (Operand, InstrBlock)
1176 getRegOrMem e@(CmmLoad mem pk) = do
1177   use_sse2 <- sse2Enabled
1178   if (not (isFloatType pk) || use_sse2) && IF_ARCH_i386(not (isWord64 pk), True)
1179      then do
1180        Amode src mem_code <- getAmode mem
1181        return (OpAddr src, mem_code)
1182      else do
1183        (reg, code) <- getNonClobberedReg e
1184        return (OpReg reg, code)
1185 getRegOrMem e = do
1186     (reg, code) <- getNonClobberedReg e
1187     return (OpReg reg, code)
1188
1189 #if x86_64_TARGET_ARCH
1190 is32BitLit (CmmInt i W64) = is32BitInteger i
1191    -- assume that labels are in the range 0-2^31-1: this assumes the
1192    -- small memory model (see gcc docs, -mcmodel=small).
1193 #endif
1194 is32BitLit x = True
1195
1196
1197
1198
1199 -- Set up a condition code for a conditional branch.
1200
1201 getCondCode :: CmmExpr -> NatM CondCode
1202
1203 -- yes, they really do seem to want exactly the same!
1204
1205 getCondCode (CmmMachOp mop [x, y])
1206   =
1207     case mop of
1208       MO_F_Eq W32 -> condFltCode EQQ x y
1209       MO_F_Ne W32 -> condFltCode NE  x y
1210       MO_F_Gt W32 -> condFltCode GTT x y
1211       MO_F_Ge W32 -> condFltCode GE  x y
1212       MO_F_Lt W32 -> condFltCode LTT x y
1213       MO_F_Le W32 -> condFltCode LE  x y
1214
1215       MO_F_Eq W64 -> condFltCode EQQ x y
1216       MO_F_Ne W64 -> condFltCode NE  x y
1217       MO_F_Gt W64 -> condFltCode GTT x y
1218       MO_F_Ge W64 -> condFltCode GE  x y
1219       MO_F_Lt W64 -> condFltCode LTT x y
1220       MO_F_Le W64 -> condFltCode LE  x y
1221
1222       MO_Eq rep -> condIntCode EQQ  x y
1223       MO_Ne rep -> condIntCode NE   x y
1224
1225       MO_S_Gt rep -> condIntCode GTT  x y
1226       MO_S_Ge rep -> condIntCode GE   x y
1227       MO_S_Lt rep -> condIntCode LTT  x y
1228       MO_S_Le rep -> condIntCode LE   x y
1229
1230       MO_U_Gt rep -> condIntCode GU   x y
1231       MO_U_Ge rep -> condIntCode GEU  x y
1232       MO_U_Lt rep -> condIntCode LU   x y
1233       MO_U_Le rep -> condIntCode LEU  x y
1234
1235       other -> pprPanic "getCondCode(x86,x86_64,sparc)" (ppr (CmmMachOp mop [x,y]))
1236
1237 getCondCode other =  pprPanic "getCondCode(2)(x86,sparc)" (ppr other)
1238
1239
1240
1241
1242 -- @cond(Int|Flt)Code@: Turn a boolean expression into a condition, to be
1243 -- passed back up the tree.
1244
1245 condIntCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode
1246
1247 -- memory vs immediate
1248 condIntCode cond (CmmLoad x pk) (CmmLit lit) | is32BitLit lit = do
1249     Amode x_addr x_code <- getAmode x
1250     let
1251         imm  = litToImm lit
1252         code = x_code `snocOL`
1253                   CMP (cmmTypeSize pk) (OpImm imm) (OpAddr x_addr)
1254     --
1255     return (CondCode False cond code)
1256
1257 -- anything vs zero, using a mask
1258 -- TODO: Add some sanity checking!!!!
1259 condIntCode cond (CmmMachOp (MO_And rep) [x,o2]) (CmmLit (CmmInt 0 pk))
1260     | (CmmLit lit@(CmmInt mask pk2)) <- o2, is32BitLit lit
1261     = do
1262       (x_reg, x_code) <- getSomeReg x
1263       let
1264          code = x_code `snocOL`
1265                 TEST (intSize pk) (OpImm (ImmInteger mask)) (OpReg x_reg)
1266       --
1267       return (CondCode False cond code)
1268
1269 -- anything vs zero
1270 condIntCode cond x (CmmLit (CmmInt 0 pk)) = do
1271     (x_reg, x_code) <- getSomeReg x
1272     let
1273         code = x_code `snocOL`
1274                   TEST (intSize pk) (OpReg x_reg) (OpReg x_reg)
1275     --
1276     return (CondCode False cond code)
1277
1278 -- anything vs operand
1279 condIntCode cond x y | isOperand y = do
1280     (x_reg, x_code) <- getNonClobberedReg x
1281     (y_op,  y_code) <- getOperand y
1282     let
1283         code = x_code `appOL` y_code `snocOL`
1284                   CMP (cmmTypeSize (cmmExprType x)) y_op (OpReg x_reg)
1285     -- in
1286     return (CondCode False cond code)
1287
1288 -- anything vs anything
1289 condIntCode cond x y = do
1290   (y_reg, y_code) <- getNonClobberedReg y
1291   (x_op, x_code) <- getRegOrMem x
1292   let
1293         code = y_code `appOL`
1294                x_code `snocOL`
1295                   CMP (cmmTypeSize (cmmExprType x)) (OpReg y_reg) x_op
1296   -- in
1297   return (CondCode False cond code)
1298
1299
1300
1301 --------------------------------------------------------------------------------
1302 condFltCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode
1303
1304 condFltCode cond x y
1305   = if_sse2 condFltCode_sse2 condFltCode_x87
1306   where
1307
1308   condFltCode_x87
1309     = ASSERT(cond `elem` ([EQQ, NE, LE, LTT, GE, GTT])) do
1310     (x_reg, x_code) <- getNonClobberedReg x
1311     (y_reg, y_code) <- getSomeReg y
1312     use_sse2 <- sse2Enabled
1313     let
1314         code = x_code `appOL` y_code `snocOL`
1315                 GCMP cond x_reg y_reg
1316     -- The GCMP insn does the test and sets the zero flag if comparable
1317     -- and true.  Hence we always supply EQQ as the condition to test.
1318     return (CondCode True EQQ code)
1319
1320   -- in the SSE2 comparison ops (ucomiss, ucomisd) the left arg may be
1321   -- an operand, but the right must be a reg.  We can probably do better
1322   -- than this general case...
1323   condFltCode_sse2 = do
1324     (x_reg, x_code) <- getNonClobberedReg x
1325     (y_op, y_code) <- getOperand y
1326     let
1327         code = x_code `appOL`
1328                y_code `snocOL`
1329                   CMP (floatSize $ cmmExprWidth x) y_op (OpReg x_reg)
1330         -- NB(1): we need to use the unsigned comparison operators on the
1331         -- result of this comparison.
1332     -- in
1333     return (CondCode True (condToUnsigned cond) code)
1334
1335 -- -----------------------------------------------------------------------------
1336 -- Generating assignments
1337
1338 -- Assignments are really at the heart of the whole code generation
1339 -- business.  Almost all top-level nodes of any real importance are
1340 -- assignments, which correspond to loads, stores, or register
1341 -- transfers.  If we're really lucky, some of the register transfers
1342 -- will go away, because we can use the destination register to
1343 -- complete the code generation for the right hand side.  This only
1344 -- fails when the right hand side is forced into a fixed register
1345 -- (e.g. the result of a call).
1346
1347 assignMem_IntCode :: Size -> CmmExpr -> CmmExpr -> NatM InstrBlock
1348 assignReg_IntCode :: Size -> CmmReg  -> CmmExpr -> NatM InstrBlock
1349
1350 assignMem_FltCode :: Size -> CmmExpr -> CmmExpr -> NatM InstrBlock
1351 assignReg_FltCode :: Size -> CmmReg  -> CmmExpr -> NatM InstrBlock
1352
1353
1354 -- integer assignment to memory
1355
1356 -- specific case of adding/subtracting an integer to a particular address.
1357 -- ToDo: catch other cases where we can use an operation directly on a memory
1358 -- address.
1359 assignMem_IntCode pk addr (CmmMachOp op [CmmLoad addr2 _,
1360                                                  CmmLit (CmmInt i _)])
1361    | addr == addr2, pk /= II64 || is32BitInteger i,
1362      Just instr <- check op
1363    = do Amode amode code_addr <- getAmode addr
1364         let code = code_addr `snocOL`
1365                    instr pk (OpImm (ImmInt (fromIntegral i))) (OpAddr amode)
1366         return code
1367    where
1368         check (MO_Add _) = Just ADD
1369         check (MO_Sub _) = Just SUB
1370         check _ = Nothing
1371         -- ToDo: more?
1372
1373 -- general case
1374 assignMem_IntCode pk addr src = do
1375     Amode addr code_addr <- getAmode addr
1376     (code_src, op_src)   <- get_op_RI src
1377     let
1378         code = code_src `appOL`
1379                code_addr `snocOL`
1380                   MOV pk op_src (OpAddr addr)
1381         -- NOTE: op_src is stable, so it will still be valid
1382         -- after code_addr.  This may involve the introduction
1383         -- of an extra MOV to a temporary register, but we hope
1384         -- the register allocator will get rid of it.
1385     --
1386     return code
1387   where
1388     get_op_RI :: CmmExpr -> NatM (InstrBlock,Operand)   -- code, operator
1389     get_op_RI (CmmLit lit) | is32BitLit lit
1390       = return (nilOL, OpImm (litToImm lit))
1391     get_op_RI op
1392       = do (reg,code) <- getNonClobberedReg op
1393            return (code, OpReg reg)
1394
1395
1396 -- Assign; dst is a reg, rhs is mem
1397 assignReg_IntCode pk reg (CmmLoad src _) = do
1398   load_code <- intLoadCode (MOV pk) src
1399   return (load_code (getRegisterReg False{-no sse2-} reg))
1400
1401 -- dst is a reg, but src could be anything
1402 assignReg_IntCode pk reg src = do
1403   code <- getAnyReg src
1404   return (code (getRegisterReg False{-no sse2-} reg))
1405
1406
1407 -- Floating point assignment to memory
1408 assignMem_FltCode pk addr src = do
1409   (src_reg, src_code) <- getNonClobberedReg src
1410   Amode addr addr_code <- getAmode addr
1411   use_sse2 <- sse2Enabled
1412   let
1413         code = src_code `appOL`
1414                addr_code `snocOL`
1415                 if use_sse2 then MOV pk (OpReg src_reg) (OpAddr addr)
1416                             else GST pk src_reg addr
1417   return code
1418
1419 -- Floating point assignment to a register/temporary
1420 assignReg_FltCode pk reg src = do
1421   use_sse2 <- sse2Enabled
1422   src_code <- getAnyReg src
1423   return (src_code (getRegisterReg use_sse2 reg))
1424
1425
1426 genJump :: CmmExpr{-the branch target-} -> NatM InstrBlock
1427
1428 genJump (CmmLoad mem pk) = do
1429   Amode target code <- getAmode mem
1430   return (code `snocOL` JMP (OpAddr target))
1431
1432 genJump (CmmLit lit) = do
1433   return (unitOL (JMP (OpImm (litToImm lit))))
1434
1435 genJump expr = do
1436   (reg,code) <- getSomeReg expr
1437   return (code `snocOL` JMP (OpReg reg))
1438
1439
1440 -- -----------------------------------------------------------------------------
1441 --  Unconditional branches
1442
1443 genBranch :: BlockId -> NatM InstrBlock
1444 genBranch = return . toOL . mkJumpInstr
1445
1446
1447
1448 -- -----------------------------------------------------------------------------
1449 --  Conditional jumps
1450
1451 {-
1452 Conditional jumps are always to local labels, so we can use branch
1453 instructions.  We peek at the arguments to decide what kind of
1454 comparison to do.
1455
1456 I386: First, we have to ensure that the condition
1457 codes are set according to the supplied comparison operation.
1458 -}
1459
1460 genCondJump
1461     :: BlockId      -- the branch target
1462     -> CmmExpr      -- the condition on which to branch
1463     -> NatM InstrBlock
1464
1465 genCondJump id bool = do
1466   CondCode is_float cond cond_code <- getCondCode bool
1467   use_sse2 <- sse2Enabled
1468   if not is_float || not use_sse2
1469     then
1470         return (cond_code `snocOL` JXX cond id)
1471     else do
1472         lbl <- getBlockIdNat
1473
1474         -- see comment with condFltReg
1475         let code = case cond of
1476                         NE  -> or_unordered
1477                         GU  -> plain_test
1478                         GEU -> plain_test
1479                         _   -> and_ordered
1480
1481             plain_test = unitOL (
1482                   JXX cond id
1483                 )
1484             or_unordered = toOL [
1485                   JXX cond id,
1486                   JXX PARITY id
1487                 ]
1488             and_ordered = toOL [
1489                   JXX PARITY lbl,
1490                   JXX cond id,
1491                   JXX ALWAYS lbl,
1492                   NEWBLOCK lbl
1493                 ]
1494         return (cond_code `appOL` code)
1495
1496
1497 -- -----------------------------------------------------------------------------
1498 --  Generating C calls
1499
1500 -- Now the biggest nightmare---calls.  Most of the nastiness is buried in
1501 -- @get_arg@, which moves the arguments to the correct registers/stack
1502 -- locations.  Apart from that, the code is easy.
1503 --
1504 -- (If applicable) Do not fill the delay slots here; you will confuse the
1505 -- register allocator.
1506
1507 genCCall
1508     :: CmmCallTarget            -- function to call
1509     -> HintedCmmFormals         -- where to put the result
1510     -> HintedCmmActuals         -- arguments (of mixed type)
1511     -> NatM InstrBlock
1512
1513 -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1514
1515 #if i386_TARGET_ARCH
1516
1517 genCCall (CmmPrim MO_WriteBarrier) _ _ = return nilOL
1518         -- write barrier compiles to no code on x86/x86-64;
1519         -- we keep it this long in order to prevent earlier optimisations.
1520
1521 -- we only cope with a single result for foreign calls
1522 genCCall (CmmPrim op) [CmmHinted r _] args = do
1523   l1 <- getNewLabelNat
1524   l2 <- getNewLabelNat
1525   sse2 <- sse2Enabled
1526   if sse2
1527     then
1528       outOfLineFloatOp op r args
1529     else case op of
1530         MO_F32_Sqrt -> actuallyInlineFloatOp GSQRT FF32 args
1531         MO_F64_Sqrt -> actuallyInlineFloatOp GSQRT FF64 args
1532
1533         MO_F32_Sin  -> actuallyInlineFloatOp (\s -> GSIN s l1 l2) FF32 args
1534         MO_F64_Sin  -> actuallyInlineFloatOp (\s -> GSIN s l1 l2) FF64 args
1535
1536         MO_F32_Cos  -> actuallyInlineFloatOp (\s -> GCOS s l1 l2) FF32 args
1537         MO_F64_Cos  -> actuallyInlineFloatOp (\s -> GCOS s l1 l2) FF64 args
1538
1539         MO_F32_Tan  -> actuallyInlineFloatOp (\s -> GTAN s l1 l2) FF32 args
1540         MO_F64_Tan  -> actuallyInlineFloatOp (\s -> GTAN s l1 l2) FF64 args
1541
1542         other_op    -> outOfLineFloatOp op r args
1543
1544  where
1545   actuallyInlineFloatOp instr size [CmmHinted x _]
1546         = do res <- trivialUFCode size (instr size) x
1547              any <- anyReg res
1548              return (any (getRegisterReg False (CmmLocal r)))
1549
1550 genCCall target dest_regs args = do
1551     let
1552         sizes               = map (arg_size . cmmExprType . hintlessCmm) (reverse args)
1553 #if !darwin_TARGET_OS
1554         tot_arg_size        = sum sizes
1555 #else
1556         raw_arg_size        = sum sizes
1557         tot_arg_size        = roundTo 16 raw_arg_size
1558         arg_pad_size        = tot_arg_size - raw_arg_size
1559     delta0 <- getDeltaNat
1560     setDeltaNat (delta0 - arg_pad_size)
1561 #endif
1562
1563     use_sse2 <- sse2Enabled
1564     push_codes <- mapM (push_arg use_sse2) (reverse args)
1565     delta <- getDeltaNat
1566
1567     -- in
1568     -- deal with static vs dynamic call targets
1569     (callinsns,cconv) <-
1570       case target of
1571         -- CmmPrim -> ...
1572         CmmCallee (CmmLit (CmmLabel lbl)) conv
1573            -> -- ToDo: stdcall arg sizes
1574               return (unitOL (CALL (Left fn_imm) []), conv)
1575            where fn_imm = ImmCLbl lbl
1576         CmmCallee expr conv
1577            -> do { (dyn_r, dyn_c) <- getSomeReg expr
1578                  ; ASSERT( isWord32 (cmmExprType expr) )
1579                    return (dyn_c `snocOL` CALL (Right dyn_r) [], conv) }
1580
1581     let push_code
1582 #if darwin_TARGET_OS
1583             | arg_pad_size /= 0
1584             = toOL [SUB II32 (OpImm (ImmInt arg_pad_size)) (OpReg esp),
1585                     DELTA (delta0 - arg_pad_size)]
1586               `appOL` concatOL push_codes
1587             | otherwise
1588 #endif
1589             = concatOL push_codes
1590
1591           -- Deallocate parameters after call for ccall;
1592           -- but not for stdcall (callee does it)
1593           --
1594           -- We have to pop any stack padding we added
1595           -- on Darwin even if we are doing stdcall, though (#5052)
1596         pop_size | cconv /= StdCallConv = tot_arg_size
1597                  | otherwise
1598 #if darwin_TARGET_OS
1599                  = arg_pad_size
1600 #else
1601                  = 0
1602 #endif
1603
1604         call = callinsns `appOL`
1605                toOL (
1606                   (if pop_size==0 then [] else
1607                    [ADD II32 (OpImm (ImmInt pop_size)) (OpReg esp)])
1608                   ++
1609                   [DELTA (delta + tot_arg_size)]
1610                )
1611     -- in
1612     setDeltaNat (delta + tot_arg_size)
1613
1614     let
1615         -- assign the results, if necessary
1616         assign_code []     = nilOL
1617         assign_code [CmmHinted dest _hint]
1618           | isFloatType ty =
1619              if use_sse2
1620                 then let tmp_amode = AddrBaseIndex (EABaseReg esp)
1621                                                    EAIndexNone
1622                                                    (ImmInt 0)
1623                          sz = floatSize w
1624                      in toOL [ SUB II32 (OpImm (ImmInt b)) (OpReg esp),
1625                                GST sz fake0 tmp_amode,
1626                                MOV sz (OpAddr tmp_amode) (OpReg r_dest),
1627                                ADD II32 (OpImm (ImmInt b)) (OpReg esp)]
1628                 else unitOL (GMOV fake0 r_dest)
1629           | isWord64 ty    = toOL [MOV II32 (OpReg eax) (OpReg r_dest),
1630                                     MOV II32 (OpReg edx) (OpReg r_dest_hi)]
1631           | otherwise      = unitOL (MOV (intSize w) (OpReg eax) (OpReg r_dest))
1632           where
1633                 ty = localRegType dest
1634                 w  = typeWidth ty
1635                 b  = widthInBytes w
1636                 r_dest_hi = getHiVRegFromLo r_dest
1637                 r_dest    = getRegisterReg use_sse2 (CmmLocal dest)
1638         assign_code many = pprPanic "genCCall.assign_code - too many return values:" (ppr many)
1639
1640     return (push_code `appOL`
1641             call `appOL`
1642             assign_code dest_regs)
1643
1644   where
1645     arg_size :: CmmType -> Int  -- Width in bytes
1646     arg_size ty = widthInBytes (typeWidth ty)
1647
1648     roundTo a x | x `mod` a == 0 = x
1649                 | otherwise = x + a - (x `mod` a)
1650
1651
1652     push_arg :: Bool -> HintedCmmActual {-current argument-}
1653                     -> NatM InstrBlock  -- code
1654
1655     push_arg use_sse2 (CmmHinted arg _hint) -- we don't need the hints on x86
1656       | isWord64 arg_ty = do
1657         ChildCode64 code r_lo <- iselExpr64 arg
1658         delta <- getDeltaNat
1659         setDeltaNat (delta - 8)
1660         let
1661             r_hi = getHiVRegFromLo r_lo
1662         -- in
1663         return (       code `appOL`
1664                        toOL [PUSH II32 (OpReg r_hi), DELTA (delta - 4),
1665                              PUSH II32 (OpReg r_lo), DELTA (delta - 8),
1666                              DELTA (delta-8)]
1667             )
1668
1669       | isFloatType arg_ty = do
1670         (reg, code) <- getSomeReg arg
1671         delta <- getDeltaNat
1672         setDeltaNat (delta-size)
1673         return (code `appOL`
1674                         toOL [SUB II32 (OpImm (ImmInt size)) (OpReg esp),
1675                               DELTA (delta-size),
1676                               let addr = AddrBaseIndex (EABaseReg esp)
1677                                                         EAIndexNone
1678                                                         (ImmInt 0)
1679                                   size = floatSize (typeWidth arg_ty)
1680                               in
1681                               if use_sse2
1682                                  then MOV size (OpReg reg) (OpAddr addr)
1683                                  else GST size reg addr
1684                              ]
1685                        )
1686
1687       | otherwise = do
1688         (operand, code) <- getOperand arg
1689         delta <- getDeltaNat
1690         setDeltaNat (delta-size)
1691         return (code `snocOL`
1692                 PUSH II32 operand `snocOL`
1693                 DELTA (delta-size))
1694
1695       where
1696          arg_ty = cmmExprType arg
1697          size = arg_size arg_ty -- Byte size
1698
1699 #elif x86_64_TARGET_ARCH
1700
1701 genCCall (CmmPrim MO_WriteBarrier) _ _ = return nilOL
1702         -- write barrier compiles to no code on x86/x86-64;
1703         -- we keep it this long in order to prevent earlier optimisations.
1704
1705
1706 genCCall (CmmPrim op) [CmmHinted r _] args =
1707   outOfLineFloatOp op r args
1708
1709 genCCall target dest_regs args = do
1710
1711         -- load up the register arguments
1712     (stack_args, aregs, fregs, load_args_code)
1713          <- load_args args allArgRegs allFPArgRegs nilOL
1714
1715     let
1716         fp_regs_used  = reverse (drop (length fregs) (reverse allFPArgRegs))
1717         int_regs_used = reverse (drop (length aregs) (reverse allArgRegs))
1718         arg_regs = [eax] ++ int_regs_used ++ fp_regs_used
1719                 -- for annotating the call instruction with
1720
1721         sse_regs = length fp_regs_used
1722
1723         tot_arg_size = arg_size * length stack_args
1724
1725         -- On entry to the called function, %rsp should be aligned
1726         -- on a 16-byte boundary +8 (i.e. the first stack arg after
1727         -- the return address is 16-byte aligned).  In STG land
1728         -- %rsp is kept 16-byte aligned (see StgCRun.c), so we just
1729         -- need to make sure we push a multiple of 16-bytes of args,
1730         -- plus the return address, to get the correct alignment.
1731         -- Urg, this is hard.  We need to feed the delta back into
1732         -- the arg pushing code.
1733     (real_size, adjust_rsp) <-
1734         if tot_arg_size `rem` 16 == 0
1735             then return (tot_arg_size, nilOL)
1736             else do -- we need to adjust...
1737                 delta <- getDeltaNat
1738                 setDeltaNat (delta-8)
1739                 return (tot_arg_size+8, toOL [
1740                                 SUB II64 (OpImm (ImmInt 8)) (OpReg rsp),
1741                                 DELTA (delta-8)
1742                         ])
1743
1744         -- push the stack args, right to left
1745     push_code <- push_args (reverse stack_args) nilOL
1746     delta <- getDeltaNat
1747
1748     -- deal with static vs dynamic call targets
1749     (callinsns,cconv) <-
1750       case target of
1751         -- CmmPrim -> ...
1752         CmmCallee (CmmLit (CmmLabel lbl)) conv
1753            -> -- ToDo: stdcall arg sizes
1754               return (unitOL (CALL (Left fn_imm) arg_regs), conv)
1755            where fn_imm = ImmCLbl lbl
1756         CmmCallee expr conv
1757            -> do (dyn_r, dyn_c) <- getSomeReg expr
1758                  return (dyn_c `snocOL` CALL (Right dyn_r) arg_regs, conv)
1759
1760     let
1761         -- The x86_64 ABI requires us to set %al to the number of SSE2
1762         -- registers that contain arguments, if the called routine
1763         -- is a varargs function.  We don't know whether it's a
1764         -- varargs function or not, so we have to assume it is.
1765         --
1766         -- It's not safe to omit this assignment, even if the number
1767         -- of SSE2 regs in use is zero.  If %al is larger than 8
1768         -- on entry to a varargs function, seg faults ensue.
1769         assign_eax n = unitOL (MOV II32 (OpImm (ImmInt n)) (OpReg eax))
1770
1771     let call = callinsns `appOL`
1772                toOL (
1773                         -- Deallocate parameters after call for ccall;
1774                         -- but not for stdcall (callee does it)
1775                   (if cconv == StdCallConv || real_size==0 then [] else
1776                    [ADD (intSize wordWidth) (OpImm (ImmInt real_size)) (OpReg esp)])
1777                   ++
1778                   [DELTA (delta + real_size)]
1779                )
1780     -- in
1781     setDeltaNat (delta + real_size)
1782
1783     let
1784         -- assign the results, if necessary
1785         assign_code []     = nilOL
1786         assign_code [CmmHinted dest _hint] =
1787           case typeWidth rep of
1788                 W32 | isFloatType rep -> unitOL (MOV (floatSize W32) (OpReg xmm0) (OpReg r_dest))
1789                 W64 | isFloatType rep -> unitOL (MOV (floatSize W64) (OpReg xmm0) (OpReg r_dest))
1790                 _ -> unitOL (MOV (cmmTypeSize rep) (OpReg rax) (OpReg r_dest))
1791           where
1792                 rep = localRegType dest
1793                 r_dest = getRegisterReg True (CmmLocal dest)
1794         assign_code many = panic "genCCall.assign_code many"
1795
1796     return (load_args_code      `appOL`
1797             adjust_rsp          `appOL`
1798             push_code           `appOL`
1799             assign_eax sse_regs `appOL`
1800             call                `appOL`
1801             assign_code dest_regs)
1802
1803   where
1804     arg_size = 8 -- always, at the mo
1805
1806     load_args :: [CmmHinted CmmExpr]
1807               -> [Reg]                  -- int regs avail for args
1808               -> [Reg]                  -- FP regs avail for args
1809               -> InstrBlock
1810               -> NatM ([CmmHinted CmmExpr],[Reg],[Reg],InstrBlock)
1811     load_args args [] [] code     =  return (args, [], [], code)
1812         -- no more regs to use
1813     load_args [] aregs fregs code =  return ([], aregs, fregs, code)
1814         -- no more args to push
1815     load_args ((CmmHinted arg hint) : rest) aregs fregs code
1816         | isFloatType arg_rep =
1817         case fregs of
1818           [] -> push_this_arg
1819           (r:rs) -> do
1820              arg_code <- getAnyReg arg
1821              load_args rest aregs rs (code `appOL` arg_code r)
1822         | otherwise =
1823         case aregs of
1824           [] -> push_this_arg
1825           (r:rs) -> do
1826              arg_code <- getAnyReg arg
1827              load_args rest rs fregs (code `appOL` arg_code r)
1828         where
1829           arg_rep = cmmExprType arg
1830
1831           push_this_arg = do
1832             (args',ars,frs,code') <- load_args rest aregs fregs code
1833             return ((CmmHinted arg hint):args', ars, frs, code')
1834
1835     push_args [] code = return code
1836     push_args ((CmmHinted arg hint):rest) code
1837        | isFloatType arg_rep = do
1838          (arg_reg, arg_code) <- getSomeReg arg
1839          delta <- getDeltaNat
1840          setDeltaNat (delta-arg_size)
1841          let code' = code `appOL` arg_code `appOL` toOL [
1842                         SUB (intSize wordWidth) (OpImm (ImmInt arg_size)) (OpReg rsp) ,
1843                         DELTA (delta-arg_size),
1844                         MOV (floatSize width) (OpReg arg_reg) (OpAddr  (spRel 0))]
1845          push_args rest code'
1846
1847        | otherwise = do
1848        -- we only ever generate word-sized function arguments.  Promotion
1849        -- has already happened: our Int8# type is kept sign-extended
1850        -- in an Int#, for example.
1851          ASSERT(width == W64) return ()
1852          (arg_op, arg_code) <- getOperand arg
1853          delta <- getDeltaNat
1854          setDeltaNat (delta-arg_size)
1855          let code' = code `appOL` arg_code `appOL` toOL [
1856                                 PUSH II64 arg_op,
1857                                 DELTA (delta-arg_size)]
1858          push_args rest code'
1859         where
1860           arg_rep = cmmExprType arg
1861           width = typeWidth arg_rep
1862
1863 #else
1864 genCCall        = panic "X86.genCCAll: not defined"
1865
1866 #endif /* x86_64_TARGET_ARCH */
1867
1868
1869
1870
1871 outOfLineFloatOp :: CallishMachOp -> CmmFormal -> HintedCmmActuals -> NatM InstrBlock
1872 outOfLineFloatOp mop res args
1873   = do
1874       dflags <- getDynFlagsNat
1875       targetExpr <- cmmMakeDynamicReference dflags addImportNat CallReference lbl
1876       let target = CmmCallee targetExpr CCallConv
1877
1878       stmtToInstrs (CmmCall target [CmmHinted res NoHint] args CmmUnsafe CmmMayReturn)
1879   where
1880         -- Assume we can call these functions directly, and that they're not in a dynamic library.
1881         -- TODO: Why is this ok? Under linux this code will be in libm.so
1882         --       Is is because they're really implemented as a primitive instruction by the assembler??  -- BL 2009/12/31
1883         lbl = mkForeignLabel fn Nothing ForeignLabelInThisPackage IsFunction
1884
1885         fn = case mop of
1886               MO_F32_Sqrt  -> fsLit "sqrtf"
1887               MO_F32_Sin   -> fsLit "sinf"
1888               MO_F32_Cos   -> fsLit "cosf"
1889               MO_F32_Tan   -> fsLit "tanf"
1890               MO_F32_Exp   -> fsLit "expf"
1891               MO_F32_Log   -> fsLit "logf"
1892
1893               MO_F32_Asin  -> fsLit "asinf"
1894               MO_F32_Acos  -> fsLit "acosf"
1895               MO_F32_Atan  -> fsLit "atanf"
1896
1897               MO_F32_Sinh  -> fsLit "sinhf"
1898               MO_F32_Cosh  -> fsLit "coshf"
1899               MO_F32_Tanh  -> fsLit "tanhf"
1900               MO_F32_Pwr   -> fsLit "powf"
1901
1902               MO_F64_Sqrt  -> fsLit "sqrt"
1903               MO_F64_Sin   -> fsLit "sin"
1904               MO_F64_Cos   -> fsLit "cos"
1905               MO_F64_Tan   -> fsLit "tan"
1906               MO_F64_Exp   -> fsLit "exp"
1907               MO_F64_Log   -> fsLit "log"
1908
1909               MO_F64_Asin  -> fsLit "asin"
1910               MO_F64_Acos  -> fsLit "acos"
1911               MO_F64_Atan  -> fsLit "atan"
1912
1913               MO_F64_Sinh  -> fsLit "sinh"
1914               MO_F64_Cosh  -> fsLit "cosh"
1915               MO_F64_Tanh  -> fsLit "tanh"
1916               MO_F64_Pwr   -> fsLit "pow"
1917
1918
1919
1920
1921
1922 -- -----------------------------------------------------------------------------
1923 -- Generating a table-branch
1924
1925 genSwitch :: CmmExpr -> [Maybe BlockId] -> NatM InstrBlock
1926
1927 genSwitch expr ids
1928   | opt_PIC
1929   = do
1930         (reg,e_code) <- getSomeReg expr
1931         lbl <- getNewLabelNat
1932         dflags <- getDynFlagsNat
1933         dynRef <- cmmMakeDynamicReference dflags addImportNat DataReference lbl
1934         (tableReg,t_code) <- getSomeReg $ dynRef
1935         let
1936             jumpTable = map jumpTableEntryRel ids
1937
1938             jumpTableEntryRel Nothing
1939                 = CmmStaticLit (CmmInt 0 wordWidth)
1940             jumpTableEntryRel (Just blockid)
1941                 = CmmStaticLit (CmmLabelDiffOff blockLabel lbl 0)
1942                 where blockLabel = mkAsmTempLabel (getUnique blockid)
1943
1944             op = OpAddr (AddrBaseIndex (EABaseReg tableReg)
1945                                        (EAIndex reg wORD_SIZE) (ImmInt 0))
1946
1947 #if x86_64_TARGET_ARCH
1948 #if darwin_TARGET_OS
1949     -- on Mac OS X/x86_64, put the jump table in the text section
1950     -- to work around a limitation of the linker.
1951     -- ld64 is unable to handle the relocations for
1952     --     .quad L1 - L0
1953     -- if L0 is not preceded by a non-anonymous label in its section.
1954
1955             code = e_code `appOL` t_code `appOL` toOL [
1956                             ADD (intSize wordWidth) op (OpReg tableReg),
1957                             JMP_TBL (OpReg tableReg) [ id | Just id <- ids ],
1958                             LDATA Text (CmmDataLabel lbl : jumpTable)
1959                     ]
1960 #else
1961     -- HACK: On x86_64 binutils<2.17 is only able to generate PC32
1962     -- relocations, hence we only get 32-bit offsets in the jump
1963     -- table. As these offsets are always negative we need to properly
1964     -- sign extend them to 64-bit. This hack should be removed in
1965     -- conjunction with the hack in PprMach.hs/pprDataItem once
1966     -- binutils 2.17 is standard.
1967             code = e_code `appOL` t_code `appOL` toOL [
1968                             LDATA ReadOnlyData (CmmDataLabel lbl : jumpTable),
1969                             MOVSxL II32
1970                                    (OpAddr (AddrBaseIndex (EABaseReg tableReg)
1971                                                           (EAIndex reg wORD_SIZE) (ImmInt 0)))
1972                                    (OpReg reg),
1973                             ADD (intSize wordWidth) (OpReg reg) (OpReg tableReg),
1974                             JMP_TBL (OpReg tableReg) [ id | Just id <- ids ]
1975                    ]
1976 #endif
1977 #else
1978             code = e_code `appOL` t_code `appOL` toOL [
1979                             LDATA ReadOnlyData (CmmDataLabel lbl : jumpTable),
1980                             ADD (intSize wordWidth) op (OpReg tableReg),
1981                             JMP_TBL (OpReg tableReg) [ id | Just id <- ids ]
1982                     ]
1983 #endif
1984         return code
1985   | otherwise
1986   = do
1987         (reg,e_code) <- getSomeReg expr
1988         lbl <- getNewLabelNat
1989         let
1990             jumpTable = map jumpTableEntry ids
1991             op = OpAddr (AddrBaseIndex EABaseNone (EAIndex reg wORD_SIZE) (ImmCLbl lbl))
1992             code = e_code `appOL` toOL [
1993                     LDATA ReadOnlyData (CmmDataLabel lbl : jumpTable),
1994                     JMP_TBL op [ id | Just id <- ids ]
1995                  ]
1996         -- in
1997         return code
1998
1999
2000 -- -----------------------------------------------------------------------------
2001 -- 'condIntReg' and 'condFltReg': condition codes into registers
2002
2003 -- Turn those condition codes into integers now (when they appear on
2004 -- the right hand side of an assignment).
2005 --
2006 -- (If applicable) Do not fill the delay slots here; you will confuse the
2007 -- register allocator.
2008
2009 condIntReg :: Cond -> CmmExpr -> CmmExpr -> NatM Register
2010
2011 condIntReg cond x y = do
2012   CondCode _ cond cond_code <- condIntCode cond x y
2013   tmp <- getNewRegNat II8
2014   let
2015         code dst = cond_code `appOL` toOL [
2016                     SETCC cond (OpReg tmp),
2017                     MOVZxL II8 (OpReg tmp) (OpReg dst)
2018                   ]
2019   -- in
2020   return (Any II32 code)
2021
2022
2023
2024 condFltReg :: Cond -> CmmExpr -> CmmExpr -> NatM Register
2025 condFltReg cond x y = if_sse2 condFltReg_sse2 condFltReg_x87
2026  where
2027   condFltReg_x87 = do
2028     CondCode _ cond cond_code <- condFltCode cond x y
2029     tmp <- getNewRegNat II8
2030     let
2031         code dst = cond_code `appOL` toOL [
2032                     SETCC cond (OpReg tmp),
2033                     MOVZxL II8 (OpReg tmp) (OpReg dst)
2034                   ]
2035     -- in
2036     return (Any II32 code)
2037
2038   condFltReg_sse2 = do
2039     CondCode _ cond cond_code <- condFltCode cond x y
2040     tmp1 <- getNewRegNat archWordSize
2041     tmp2 <- getNewRegNat archWordSize
2042     let
2043         -- We have to worry about unordered operands (eg. comparisons
2044         -- against NaN).  If the operands are unordered, the comparison
2045         -- sets the parity flag, carry flag and zero flag.
2046         -- All comparisons are supposed to return false for unordered
2047         -- operands except for !=, which returns true.
2048         --
2049         -- Optimisation: we don't have to test the parity flag if we
2050         -- know the test has already excluded the unordered case: eg >
2051         -- and >= test for a zero carry flag, which can only occur for
2052         -- ordered operands.
2053         --
2054         -- ToDo: by reversing comparisons we could avoid testing the
2055         -- parity flag in more cases.
2056
2057         code dst =
2058            cond_code `appOL`
2059              (case cond of
2060                 NE  -> or_unordered dst
2061                 GU  -> plain_test   dst
2062                 GEU -> plain_test   dst
2063                 _   -> and_ordered  dst)
2064
2065         plain_test dst = toOL [
2066                     SETCC cond (OpReg tmp1),
2067                     MOVZxL II8 (OpReg tmp1) (OpReg dst)
2068                  ]
2069         or_unordered dst = toOL [
2070                     SETCC cond (OpReg tmp1),
2071                     SETCC PARITY (OpReg tmp2),
2072                     OR II8 (OpReg tmp1) (OpReg tmp2),
2073                     MOVZxL II8 (OpReg tmp2) (OpReg dst)
2074                   ]
2075         and_ordered dst = toOL [
2076                     SETCC cond (OpReg tmp1),
2077                     SETCC NOTPARITY (OpReg tmp2),
2078                     AND II8 (OpReg tmp1) (OpReg tmp2),
2079                     MOVZxL II8 (OpReg tmp2) (OpReg dst)
2080                   ]
2081     -- in
2082     return (Any II32 code)
2083
2084
2085 -- -----------------------------------------------------------------------------
2086 -- 'trivial*Code': deal with trivial instructions
2087
2088 -- Trivial (dyadic: 'trivialCode', floating-point: 'trivialFCode',
2089 -- unary: 'trivialUCode', unary fl-pt:'trivialUFCode') instructions.
2090 -- Only look for constants on the right hand side, because that's
2091 -- where the generic optimizer will have put them.
2092
2093 -- Similarly, for unary instructions, we don't have to worry about
2094 -- matching an StInt as the argument, because genericOpt will already
2095 -- have handled the constant-folding.
2096
2097
2098 {-
2099 The Rules of the Game are:
2100
2101 * You cannot assume anything about the destination register dst;
2102   it may be anything, including a fixed reg.
2103
2104 * You may compute an operand into a fixed reg, but you may not
2105   subsequently change the contents of that fixed reg.  If you
2106   want to do so, first copy the value either to a temporary
2107   or into dst.  You are free to modify dst even if it happens
2108   to be a fixed reg -- that's not your problem.
2109
2110 * You cannot assume that a fixed reg will stay live over an
2111   arbitrary computation.  The same applies to the dst reg.
2112
2113 * Temporary regs obtained from getNewRegNat are distinct from
2114   each other and from all other regs, and stay live over
2115   arbitrary computations.
2116
2117 --------------------
2118
2119 SDM's version of The Rules:
2120
2121 * If getRegister returns Any, that means it can generate correct
2122   code which places the result in any register, period.  Even if that
2123   register happens to be read during the computation.
2124
2125   Corollary #1: this means that if you are generating code for an
2126   operation with two arbitrary operands, you cannot assign the result
2127   of the first operand into the destination register before computing
2128   the second operand.  The second operand might require the old value
2129   of the destination register.
2130
2131   Corollary #2: A function might be able to generate more efficient
2132   code if it knows the destination register is a new temporary (and
2133   therefore not read by any of the sub-computations).
2134
2135 * If getRegister returns Any, then the code it generates may modify only:
2136         (a) fresh temporaries
2137         (b) the destination register
2138         (c) known registers (eg. %ecx is used by shifts)
2139   In particular, it may *not* modify global registers, unless the global
2140   register happens to be the destination register.
2141 -}
2142
2143 trivialCode width instr (Just revinstr) (CmmLit lit_a) b
2144   | is32BitLit lit_a = do
2145   b_code <- getAnyReg b
2146   let
2147        code dst
2148          = b_code dst `snocOL`
2149            revinstr (OpImm (litToImm lit_a)) (OpReg dst)
2150   -- in
2151   return (Any (intSize width) code)
2152
2153 trivialCode width instr maybe_revinstr a b
2154   = genTrivialCode (intSize width) instr a b
2155
2156 -- This is re-used for floating pt instructions too.
2157 genTrivialCode rep instr a b = do
2158   (b_op, b_code) <- getNonClobberedOperand b
2159   a_code <- getAnyReg a
2160   tmp <- getNewRegNat rep
2161   let
2162      -- We want the value of b to stay alive across the computation of a.
2163      -- But, we want to calculate a straight into the destination register,
2164      -- because the instruction only has two operands (dst := dst `op` src).
2165      -- The troublesome case is when the result of b is in the same register
2166      -- as the destination reg.  In this case, we have to save b in a
2167      -- new temporary across the computation of a.
2168      code dst
2169         | dst `regClashesWithOp` b_op =
2170                 b_code `appOL`
2171                 unitOL (MOV rep b_op (OpReg tmp)) `appOL`
2172                 a_code dst `snocOL`
2173                 instr (OpReg tmp) (OpReg dst)
2174         | otherwise =
2175                 b_code `appOL`
2176                 a_code dst `snocOL`
2177                 instr b_op (OpReg dst)
2178   -- in
2179   return (Any rep code)
2180
2181 reg `regClashesWithOp` OpReg reg2   = reg == reg2
2182 reg `regClashesWithOp` OpAddr amode = any (==reg) (addrModeRegs amode)
2183 reg `regClashesWithOp` _            = False
2184
2185 -----------
2186
2187 trivialUCode rep instr x = do
2188   x_code <- getAnyReg x
2189   let
2190      code dst =
2191         x_code dst `snocOL`
2192         instr (OpReg dst)
2193   return (Any rep code)
2194
2195 -----------
2196
2197 trivialFCode_x87 width instr x y = do
2198   (x_reg, x_code) <- getNonClobberedReg x -- these work for float regs too
2199   (y_reg, y_code) <- getSomeReg y
2200   let
2201      size = FF80 -- always, on x87
2202      code dst =
2203         x_code `appOL`
2204         y_code `snocOL`
2205         instr size x_reg y_reg dst
2206   return (Any size code)
2207
2208 trivialFCode_sse2 pk instr x y
2209     = genTrivialCode size (instr size) x y
2210     where size = floatSize pk
2211
2212
2213 trivialUFCode size instr x = do
2214   (x_reg, x_code) <- getSomeReg x
2215   let
2216      code dst =
2217         x_code `snocOL`
2218         instr x_reg dst
2219   -- in
2220   return (Any size code)
2221
2222
2223 --------------------------------------------------------------------------------
2224 coerceInt2FP :: Width -> Width -> CmmExpr -> NatM Register
2225 coerceInt2FP from to x = if_sse2 coerce_sse2 coerce_x87
2226  where
2227    coerce_x87 = do
2228      (x_reg, x_code) <- getSomeReg x
2229      let
2230            opc  = case to of W32 -> GITOF; W64 -> GITOD
2231            code dst = x_code `snocOL` opc x_reg dst
2232         -- ToDo: works for non-II32 reps?
2233      return (Any FF80 code)
2234
2235    coerce_sse2 = do
2236      (x_op, x_code) <- getOperand x  -- ToDo: could be a safe operand
2237      let
2238            opc  = case to of W32 -> CVTSI2SS; W64 -> CVTSI2SD
2239            code dst = x_code `snocOL` opc (intSize from) x_op dst
2240      -- in
2241      return (Any (floatSize to) code)
2242         -- works even if the destination rep is <II32
2243
2244 --------------------------------------------------------------------------------
2245 coerceFP2Int :: Width -> Width -> CmmExpr -> NatM Register
2246 coerceFP2Int from to x = if_sse2 coerceFP2Int_sse2 coerceFP2Int_x87
2247  where
2248    coerceFP2Int_x87 = do
2249      (x_reg, x_code) <- getSomeReg x
2250      let
2251            opc  = case from of W32 -> GFTOI; W64 -> GDTOI
2252            code dst = x_code `snocOL` opc x_reg dst
2253         -- ToDo: works for non-II32 reps?
2254      -- in
2255      return (Any (intSize to) code)
2256
2257    coerceFP2Int_sse2 = do
2258      (x_op, x_code) <- getOperand x  -- ToDo: could be a safe operand
2259      let
2260            opc  = case from of W32 -> CVTTSS2SIQ; W64 -> CVTTSD2SIQ
2261            code dst = x_code `snocOL` opc (intSize to) x_op dst
2262      -- in
2263      return (Any (intSize to) code)
2264          -- works even if the destination rep is <II32
2265
2266
2267 --------------------------------------------------------------------------------
2268 coerceFP2FP :: Width -> CmmExpr -> NatM Register
2269 coerceFP2FP to x = do
2270   use_sse2 <- sse2Enabled
2271   (x_reg, x_code) <- getSomeReg x
2272   let
2273         opc | use_sse2  = case to of W32 -> CVTSD2SS; W64 -> CVTSS2SD
2274             | otherwise = GDTOF
2275         code dst = x_code `snocOL` opc x_reg dst
2276   -- in
2277   return (Any (if use_sse2 then floatSize to else FF80) code)
2278
2279 --------------------------------------------------------------------------------
2280
2281 sse2NegCode :: Width -> CmmExpr -> NatM Register
2282 sse2NegCode w x = do
2283   let sz = floatSize w
2284   x_code <- getAnyReg x
2285   -- This is how gcc does it, so it can't be that bad:
2286   let
2287     const | FF32 <- sz = CmmInt 0x80000000 W32
2288           | otherwise  = CmmInt 0x8000000000000000 W64
2289   Amode amode amode_code <- memConstant (widthInBytes w) const
2290   tmp <- getNewRegNat sz
2291   let
2292     code dst = x_code dst `appOL` amode_code `appOL` toOL [
2293         MOV sz (OpAddr amode) (OpReg tmp),
2294         XOR sz (OpReg tmp) (OpReg dst)
2295         ]
2296   --
2297   return (Any sz code)