From 335b9f366ac440259318777c4c07e4fa42fbbec6 Mon Sep 17 00:00:00 2001 From: Simon Marlow Date: Thu, 4 Feb 2010 10:48:49 +0000 Subject: [PATCH] Implement SSE2 floating-point support in the x86 native code generator (#594) The new flag -msse2 enables code generation for SSE2 on x86. It results in substantially faster floating-point performance; the main reason for doing this was that our x87 code generation is appallingly bad, and since we plan to drop -fvia-C soon, we need a way to generate half-decent floating-point code. The catch is that SSE2 is only available on CPUs that support it (P4+, AMD K8+). We'll have to think hard about whether we should enable it by default for the libraries we ship. In the meantime, at least -msse2 should be an acceptable replacement for "-fvia-C -optc-ffast-math -fexcess-precision". SSE2 also has the advantage of performing all operations at the correct precision, so floating-point results are consistent with other platforms. I also tweaked the x87 code generation a bit while I was here, now it's slighlty less bad than before. --- compiler/main/DynFlags.hs | 4 + compiler/nativeGen/PPC/Ppr.hs | 1 + compiler/nativeGen/PPC/Regs.hs | 29 +- compiler/nativeGen/Reg.hs | 5 + compiler/nativeGen/RegAlloc/Graph/Main.hs | 18 +- compiler/nativeGen/RegAlloc/Graph/TrivColorable.hs | 18 +- compiler/nativeGen/RegClass.hs | 3 + compiler/nativeGen/SPARC/Instr.hs | 3 + compiler/nativeGen/SPARC/Ppr.hs | 1 + compiler/nativeGen/SPARC/Regs.hs | 14 +- compiler/nativeGen/X86/CodeGen.hs | 796 ++++++++++---------- compiler/nativeGen/X86/Instr.hs | 86 +-- compiler/nativeGen/X86/Ppr.hs | 93 +-- compiler/nativeGen/X86/RegInfo.hs | 34 +- compiler/nativeGen/X86/Regs.hs | 243 ++---- docs/users_guide/flags.xml | 8 + docs/users_guide/using.xml | 20 +- 17 files changed, 630 insertions(+), 746 deletions(-) diff --git a/compiler/main/DynFlags.hs b/compiler/main/DynFlags.hs index 4ba19b0..abef731 100644 --- a/compiler/main/DynFlags.hs +++ b/compiler/main/DynFlags.hs @@ -312,6 +312,7 @@ data DynFlag | Opt_EmitExternalCore | Opt_SharedImplib | Opt_BuildingCabalPackage + | Opt_SSE2 -- temporary flags | Opt_RunCPS @@ -1265,6 +1266,9 @@ dynamic_flags = [ , Flag "monly-4-regs" (NoArg (upd (\s -> s{stolen_x86_regs = 4}) )) Supported + , Flag "msse2" (NoArg (setDynFlag Opt_SSE2)) + Supported + ------ Warning opts ------------------------------------------------- , Flag "W" (NoArg (mapM_ setDynFlag minusWOpts)) Supported diff --git a/compiler/nativeGen/PPC/Ppr.hs b/compiler/nativeGen/PPC/Ppr.hs index ec6d941..2d8f044 100644 --- a/compiler/nativeGen/PPC/Ppr.hs +++ b/compiler/nativeGen/PPC/Ppr.hs @@ -171,6 +171,7 @@ pprReg r RegVirtual (VirtualRegHi u) -> text "%vHi_" <> asmSDoc (pprUnique u) RegVirtual (VirtualRegF u) -> text "%vF_" <> asmSDoc (pprUnique u) RegVirtual (VirtualRegD u) -> text "%vD_" <> asmSDoc (pprUnique u) + RegVirtual (VirtualRegSSE u) -> text "%vSSE_" <> asmSDoc (pprUnique u) where #if darwin_TARGET_OS ppr_reg_no :: Int -> Doc diff --git a/compiler/nativeGen/PPC/Regs.hs b/compiler/nativeGen/PPC/Regs.hs index 18f06ed..d649d84 100644 --- a/compiler/nativeGen/PPC/Regs.hs +++ b/compiler/nativeGen/PPC/Regs.hs @@ -87,25 +87,15 @@ virtualRegSqueeze cls vr -> case vr of VirtualRegI{} -> _ILIT(1) VirtualRegHi{} -> _ILIT(1) - VirtualRegD{} -> _ILIT(0) - VirtualRegF{} -> _ILIT(0) - - -- We don't use floats on this arch, but we can't - -- return error because the return type is unboxed... - RcFloat - -> case vr of - VirtualRegI{} -> _ILIT(0) - VirtualRegHi{} -> _ILIT(0) - VirtualRegD{} -> _ILIT(0) - VirtualRegF{} -> _ILIT(0) + _other -> _ILIT(0) RcDouble -> case vr of - VirtualRegI{} -> _ILIT(0) - VirtualRegHi{} -> _ILIT(0) VirtualRegD{} -> _ILIT(1) VirtualRegF{} -> _ILIT(0) + _other -> _ILIT(0) + _other -> _ILIT(0) {-# INLINE realRegSqueeze #-} realRegSqueeze :: RegClass -> RealReg -> FastInt @@ -119,16 +109,6 @@ realRegSqueeze cls rr RealRegPair{} -> _ILIT(0) - -- We don't use floats on this arch, but we can't - -- return error because the return type is unboxed... - RcFloat - -> case rr of - RealRegSingle regNo - | regNo < 32 -> _ILIT(0) - | otherwise -> _ILIT(0) - - RealRegPair{} -> _ILIT(0) - RcDouble -> case rr of RealRegSingle regNo @@ -137,6 +117,8 @@ realRegSqueeze cls rr RealRegPair{} -> _ILIT(0) + _other -> _ILIT(0) + mkVirtualReg :: Unique -> Size -> VirtualReg mkVirtualReg u size | not (isFloatSize size) = VirtualRegI u @@ -152,6 +134,7 @@ regDotColor reg RcInteger -> Outputable.text "blue" RcFloat -> Outputable.text "red" RcDouble -> Outputable.text "green" + RcDoubleSSE -> Outputable.text "yellow" -- immediates ------------------------------------------------------------------ diff --git a/compiler/nativeGen/Reg.hs b/compiler/nativeGen/Reg.hs index 422ea24..27315ba 100644 --- a/compiler/nativeGen/Reg.hs +++ b/compiler/nativeGen/Reg.hs @@ -55,6 +55,7 @@ data VirtualReg | VirtualRegHi {-# UNPACK #-} !Unique -- High part of 2-word register | VirtualRegF {-# UNPACK #-} !Unique | VirtualRegD {-# UNPACK #-} !Unique + | VirtualRegSSE {-# UNPACK #-} !Unique deriving (Eq, Show, Ord) instance Uniquable VirtualReg where @@ -64,6 +65,7 @@ instance Uniquable VirtualReg where VirtualRegHi u -> u VirtualRegF u -> u VirtualRegD u -> u + VirtualRegSSE u -> u instance Outputable VirtualReg where ppr reg @@ -72,6 +74,7 @@ instance Outputable VirtualReg where VirtualRegHi u -> text "%vHi_" <> pprUnique u VirtualRegF u -> text "%vF_" <> pprUnique u VirtualRegD u -> text "%vD_" <> pprUnique u + VirtualRegSSE u -> text "%vSSE_" <> pprUnique u renameVirtualReg :: Unique -> VirtualReg -> VirtualReg @@ -81,6 +84,7 @@ renameVirtualReg u r VirtualRegHi _ -> VirtualRegHi u VirtualRegF _ -> VirtualRegF u VirtualRegD _ -> VirtualRegD u + VirtualRegSSE _ -> VirtualRegSSE u classOfVirtualReg :: VirtualReg -> RegClass @@ -90,6 +94,7 @@ classOfVirtualReg vr VirtualRegHi{} -> RcInteger VirtualRegF{} -> RcFloat VirtualRegD{} -> RcDouble + VirtualRegSSE{} -> RcDoubleSSE -- Determine the upper-half vreg for a 64-bit quantity on a 32-bit platform diff --git a/compiler/nativeGen/RegAlloc/Graph/Main.hs b/compiler/nativeGen/RegAlloc/Graph/Main.hs index 5fa771c..35ec879 100644 --- a/compiler/nativeGen/RegAlloc/Graph/Main.hs +++ b/compiler/nativeGen/RegAlloc/Graph/Main.hs @@ -380,25 +380,13 @@ seqNode node `seq` (seqVirtualRegList (uniqSetToList (Color.nodeCoalesce node))) seqVirtualReg :: VirtualReg -> () -seqVirtualReg reg - = case reg of - VirtualRegI _ -> () - VirtualRegHi _ -> () - VirtualRegF _ -> () - VirtualRegD _ -> () +seqVirtualReg reg = reg `seq` () seqRealReg :: RealReg -> () -seqRealReg reg - = case reg of - RealRegSingle _ -> () - RealRegPair _ _ -> () +seqRealReg reg = reg `seq` () seqRegClass :: RegClass -> () -seqRegClass c - = case c of - RcInteger -> () - RcFloat -> () - RcDouble -> () +seqRegClass c = c `seq` () seqMaybeRealReg :: Maybe RealReg -> () seqMaybeRealReg mr diff --git a/compiler/nativeGen/RegAlloc/Graph/TrivColorable.hs b/compiler/nativeGen/RegAlloc/Graph/TrivColorable.hs index fd0faae..2f10178 100644 --- a/compiler/nativeGen/RegAlloc/Graph/TrivColorable.hs +++ b/compiler/nativeGen/RegAlloc/Graph/TrivColorable.hs @@ -50,24 +50,27 @@ import FastTypes #define ALLOCATABLE_REGS_INTEGER (_ILIT(3)) #define ALLOCATABLE_REGS_DOUBLE (_ILIT(6)) #define ALLOCATABLE_REGS_FLOAT (_ILIT(0)) +#define ALLOCATABLE_REGS_SSE (_ILIT(16)) #elif x86_64_TARGET_ARCH #define ALLOCATABLE_REGS_INTEGER (_ILIT(5)) -#define ALLOCATABLE_REGS_DOUBLE (_ILIT(2)) +#define ALLOCATABLE_REGS_DOUBLE (_ILIT(0)) #define ALLOCATABLE_REGS_FLOAT (_ILIT(0)) - +#define ALLOCATABLE_REGS_SSE (_ILIT(10)) #elif powerpc_TARGET_ARCH #define ALLOCATABLE_REGS_INTEGER (_ILIT(16)) #define ALLOCATABLE_REGS_DOUBLE (_ILIT(26)) #define ALLOCATABLE_REGS_FLOAT (_ILIT(0)) +#define ALLOCATABLE_REGS_SSE (_ILIT(0)) #elif sparc_TARGET_ARCH #define ALLOCATABLE_REGS_INTEGER (_ILIT(14)) #define ALLOCATABLE_REGS_DOUBLE (_ILIT(11)) #define ALLOCATABLE_REGS_FLOAT (_ILIT(22)) +#define ALLOCATABLE_REGS_SSE (_ILIT(0)) #else @@ -139,6 +142,17 @@ trivColorable virtualRegSqueeze realRegSqueeze RcDouble conflicts exclusions = count3 <# ALLOCATABLE_REGS_DOUBLE +trivColorable virtualRegSqueeze realRegSqueeze RcDoubleSSE conflicts exclusions + | count2 <- accSqueeze (_ILIT(0)) ALLOCATABLE_REGS_SSE + (virtualRegSqueeze RcDoubleSSE) + conflicts + + , count3 <- accSqueeze count2 ALLOCATABLE_REGS_SSE + (realRegSqueeze RcDoubleSSE) + exclusions + + = count3 <# ALLOCATABLE_REGS_SSE + -- Specification Code ---------------------------------------------------------- -- diff --git a/compiler/nativeGen/RegClass.hs b/compiler/nativeGen/RegClass.hs index 4bb300f..2a4ac33 100644 --- a/compiler/nativeGen/RegClass.hs +++ b/compiler/nativeGen/RegClass.hs @@ -17,6 +17,7 @@ data RegClass = RcInteger | RcFloat | RcDouble + | RcDoubleSSE -- x86 only: the SSE regs are a separate class deriving Eq @@ -24,8 +25,10 @@ instance Uniquable RegClass where getUnique RcInteger = mkRegClassUnique 0 getUnique RcFloat = mkRegClassUnique 1 getUnique RcDouble = mkRegClassUnique 2 + getUnique RcDoubleSSE = mkRegClassUnique 3 instance Outputable RegClass where ppr RcInteger = Outputable.text "I" ppr RcFloat = Outputable.text "F" ppr RcDouble = Outputable.text "D" + ppr RcDoubleSSE = Outputable.text "S" diff --git a/compiler/nativeGen/SPARC/Instr.hs b/compiler/nativeGen/SPARC/Instr.hs index 87b6abc..00b57f9 100644 --- a/compiler/nativeGen/SPARC/Instr.hs +++ b/compiler/nativeGen/SPARC/Instr.hs @@ -373,6 +373,7 @@ sparc_mkSpillInstr reg _ slot RcInteger -> II32 RcFloat -> FF32 RcDouble -> FF64 + _ -> panic "sparc_mkSpillInstr" in ST sz reg (fpRel (negate off_w)) @@ -391,6 +392,7 @@ sparc_mkLoadInstr reg _ slot RcInteger -> II32 RcFloat -> FF32 RcDouble -> FF64 + _ -> panic "sparc_mkLoadInstr" in LD sz (fpRel (- off_w)) reg @@ -438,6 +440,7 @@ sparc_mkRegRegMoveInstr src dst RcInteger -> ADD False False src (RIReg g0) dst RcDouble -> FMOV FF64 src dst RcFloat -> FMOV FF32 src dst + _ -> panic "sparc_mkRegRegMoveInstr" | otherwise = panic "SPARC.Instr.mkRegRegMoveInstr: classes of src and dest not the same" diff --git a/compiler/nativeGen/SPARC/Ppr.hs b/compiler/nativeGen/SPARC/Ppr.hs index d517a08..cb11d36 100644 --- a/compiler/nativeGen/SPARC/Ppr.hs +++ b/compiler/nativeGen/SPARC/Ppr.hs @@ -156,6 +156,7 @@ pprReg reg VirtualRegHi u -> text "%vHi_" <> asmSDoc (pprUnique u) VirtualRegF u -> text "%vF_" <> asmSDoc (pprUnique u) VirtualRegD u -> text "%vD_" <> asmSDoc (pprUnique u) + VirtualRegSSE u -> text "%vSSE_" <> asmSDoc (pprUnique u) RegReal rr -> case rr of diff --git a/compiler/nativeGen/SPARC/Regs.hs b/compiler/nativeGen/SPARC/Regs.hs index 8ad400f..cd19138 100644 --- a/compiler/nativeGen/SPARC/Regs.hs +++ b/compiler/nativeGen/SPARC/Regs.hs @@ -95,22 +95,21 @@ virtualRegSqueeze cls vr -> case vr of VirtualRegI{} -> _ILIT(1) VirtualRegHi{} -> _ILIT(1) - VirtualRegF{} -> _ILIT(0) - VirtualRegD{} -> _ILIT(0) + _other -> _ILIT(0) RcFloat -> case vr of - VirtualRegI{} -> _ILIT(0) - VirtualRegHi{} -> _ILIT(0) VirtualRegF{} -> _ILIT(1) VirtualRegD{} -> _ILIT(2) + _other -> _ILIT(0) RcDouble -> case vr of - VirtualRegI{} -> _ILIT(0) - VirtualRegHi{} -> _ILIT(0) VirtualRegF{} -> _ILIT(1) VirtualRegD{} -> _ILIT(1) + _other -> _ILIT(0) + + _other -> _ILIT(0) {-# INLINE realRegSqueeze #-} realRegSqueeze :: RegClass -> RealReg -> FastInt @@ -141,6 +140,7 @@ realRegSqueeze cls rr RealRegPair{} -> _ILIT(1) + _other -> _ILIT(0) -- | All the allocatable registers in the machine, -- including register pairs. @@ -283,7 +283,7 @@ regDotColor reg = case classOfRealReg reg of RcInteger -> text "blue" RcFloat -> text "red" - RcDouble -> text "green" + _other -> text "green" diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs index 5941a8c..e9bbc06 100644 --- a/compiler/nativeGen/X86/CodeGen.hs +++ b/compiler/nativeGen/X86/CodeGen.hs @@ -71,6 +71,22 @@ import Data.Bits import Data.Word import Data.Int +sse2Enabled :: NatM Bool +#if x86_64_TARGET_ARCH +-- SSE2 is fixed on for x86_64. It would be possible to make it optional, +-- but we'd need to fix at least the foreign call code where the calling +-- convention specifies the use of xmm regs, and possibly other places. +sse2Enabled = return True +#else +sse2Enabled = do + dflags <- getDynFlagsNat + return (dopt Opt_SSE2 dflags) +#endif + +if_sse2 :: NatM a -> NatM a -> NatM a +if_sse2 sse2 x87 = do + b <- sse2Enabled + if b then sse2 else x87 cmmTopCodeGen :: DynFlags @@ -201,12 +217,15 @@ swizzleRegisterRep (Any _ codefn) size = Any size codefn -- | Grab the Reg for a CmmReg -getRegisterReg :: CmmReg -> Reg +getRegisterReg :: Bool -> CmmReg -> Reg -getRegisterReg (CmmLocal (LocalReg u pk)) - = RegVirtual $ mkVirtualReg u (cmmTypeSize pk) +getRegisterReg use_sse2 (CmmLocal (LocalReg u pk)) + = let sz = cmmTypeSize pk in + if isFloatSize sz && not use_sse2 + then RegVirtual (mkVirtualReg u FF80) + else RegVirtual (mkVirtualReg u sz) -getRegisterReg (CmmGlobal mid) +getRegisterReg _ (CmmGlobal mid) = case get_GlobalReg_reg_or_addr mid of Left reg -> RegReal $ reg _other -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid) @@ -405,8 +424,14 @@ getRegister (CmmReg (CmmGlobal PicBaseReg)) #endif getRegister (CmmReg reg) - = return (Fixed (cmmTypeSize (cmmRegType reg)) - (getRegisterReg reg) nilOL) + = do use_sse2 <- sse2Enabled + let + sz = cmmTypeSize (cmmRegType reg) + size | not use_sse2 && isFloatSize sz = FF80 + | otherwise = sz + -- + return (Fixed sz (getRegisterReg use_sse2 reg) nilOL) + getRegister tree@(CmmRegOff _ _) = getRegister (mangleIndexTree tree) @@ -437,78 +462,35 @@ getRegister (CmmMachOp (MO_SS_Conv W64 W32) [x]) = do #endif - - -#if i386_TARGET_ARCH - -getRegister (CmmLit (CmmFloat f W32)) = do - lbl <- getNewLabelNat - dflags <- getDynFlagsNat - dynRef <- cmmMakeDynamicReference dflags addImportNat DataReference lbl - Amode addr addr_code <- getAmode dynRef - let code dst = - LDATA ReadOnlyData - [CmmDataLabel lbl, - CmmStaticLit (CmmFloat f W32)] - `consOL` (addr_code `snocOL` - GLD FF32 addr dst) - -- in - return (Any FF32 code) - - -getRegister (CmmLit (CmmFloat d W64)) - | d == 0.0 - = let code dst = unitOL (GLDZ dst) - in return (Any FF64 code) - - | d == 1.0 - = let code dst = unitOL (GLD1 dst) - in return (Any FF64 code) - - | otherwise = do - lbl <- getNewLabelNat - dflags <- getDynFlagsNat - dynRef <- cmmMakeDynamicReference dflags addImportNat DataReference lbl - Amode addr addr_code <- getAmode dynRef - let code dst = - LDATA ReadOnlyData - [CmmDataLabel lbl, - CmmStaticLit (CmmFloat d W64)] - `consOL` (addr_code `snocOL` - GLD FF64 addr dst) - -- in - return (Any FF64 code) - -#endif /* i386_TARGET_ARCH */ - - - - -#if x86_64_TARGET_ARCH -getRegister (CmmLit (CmmFloat 0.0 w)) = do - let size = floatSize w - code dst = unitOL (XOR size (OpReg dst) (OpReg dst)) - -- I don't know why there are xorpd, xorps, and pxor instructions. - -- They all appear to do the same thing --SDM - return (Any size code) - -getRegister (CmmLit (CmmFloat f w)) = do - lbl <- getNewLabelNat - let code dst = toOL [ - LDATA ReadOnlyData - [CmmDataLabel lbl, - CmmStaticLit (CmmFloat f w)], - MOV size (OpAddr (ripRel (ImmCLbl lbl))) (OpReg dst) - ] - -- in - return (Any size code) - where size = floatSize w - -#endif /* x86_64_TARGET_ARCH */ - - - - +getRegister (CmmLit lit@(CmmFloat f w)) = + if_sse2 float_const_sse2 float_const_x87 + where + float_const_sse2 + | f == 0.0 = do + let + size = floatSize w + code dst = unitOL (XOR size (OpReg dst) (OpReg dst)) + -- I don't know why there are xorpd, xorps, and pxor instructions. + -- They all appear to do the same thing --SDM + return (Any size code) + + | otherwise = do + Amode addr code <- memConstant (widthInBytes w) lit + loadFloatAmode True w addr code + + float_const_x87 = case w of + W64 + | f == 0.0 -> + let code dst = unitOL (GLDZ dst) + in return (Any FF80 code) + + | f == 1.0 -> + let code dst = unitOL (GLD1 dst) + in return (Any FF80 code) + + _otherwise -> do + Amode addr code <- memConstant (widthInBytes w) lit + loadFloatAmode False w addr code -- catch simple cases of zero- or sign-extended load getRegister (CmmMachOp (MO_UU_Conv W8 W32) [CmmLoad addr _]) = do @@ -560,61 +542,20 @@ getRegister (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg), = return $ Any II64 (\dst -> unitOL $ LEA II64 (OpAddr (ripRel (litToImm displacement))) (OpReg dst)) -getRegister (CmmMachOp (MO_F_Neg W32) [x]) = do - x_code <- getAnyReg x - lbl <- getNewLabelNat - let - code dst = x_code dst `appOL` toOL [ - -- This is how gcc does it, so it can't be that bad: - LDATA ReadOnlyData16 [ - CmmAlign 16, - CmmDataLabel lbl, - CmmStaticLit (CmmInt 0x80000000 W32), - CmmStaticLit (CmmInt 0 W32), - CmmStaticLit (CmmInt 0 W32), - CmmStaticLit (CmmInt 0 W32) - ], - XOR FF32 (OpAddr (ripRel (ImmCLbl lbl))) (OpReg dst) - -- xorps, so we need the 128-bit constant - -- ToDo: rip-relative - ] - -- - return (Any FF32 code) - -getRegister (CmmMachOp (MO_F_Neg W64) [x]) = do - x_code <- getAnyReg x - lbl <- getNewLabelNat - let - -- This is how gcc does it, so it can't be that bad: - code dst = x_code dst `appOL` toOL [ - LDATA ReadOnlyData16 [ - CmmAlign 16, - CmmDataLabel lbl, - CmmStaticLit (CmmInt 0x8000000000000000 W64), - CmmStaticLit (CmmInt 0 W64) - ], - -- gcc puts an unpck here. Wonder if we need it. - XOR FF64 (OpAddr (ripRel (ImmCLbl lbl))) (OpReg dst) - -- xorpd, so we need the 128-bit constant - ] - -- - return (Any FF64 code) - #endif /* x86_64_TARGET_ARCH */ -getRegister (CmmMachOp mop [x]) -- unary MachOps - = case mop of -#if i386_TARGET_ARCH - MO_F_Neg W32 -> trivialUFCode FF32 (GNEG FF32) x - MO_F_Neg W64 -> trivialUFCode FF64 (GNEG FF64) x -#endif +getRegister (CmmMachOp mop [x]) = do -- unary MachOps + sse2 <- sse2Enabled + case mop of + MO_F_Neg w + | sse2 -> sse2NegCode w x + | otherwise -> trivialUFCode FF80 (GNEG FF80) x MO_S_Neg w -> triv_ucode NEGI (intSize w) - MO_F_Neg w -> triv_ucode NEGI (floatSize w) MO_Not w -> triv_ucode NOT (intSize w) -- Nop conversions @@ -659,13 +600,13 @@ getRegister (CmmMachOp mop [x]) -- unary MachOps -- the form of a movzl and print it as a movl later. #endif -#if i386_TARGET_ARCH - MO_FF_Conv W32 W64 -> conversionNop FF64 x - MO_FF_Conv W64 W32 -> conversionNop FF32 x -#else - MO_FF_Conv W32 W64 -> coerceFP2FP W64 x - MO_FF_Conv W64 W32 -> coerceFP2FP W32 x -#endif + MO_FF_Conv W32 W64 + | sse2 -> coerceFP2FP W64 x + | otherwise -> conversionNop FF80 x + + MO_FF_Conv W64 W32 + | sse2 -> coerceFP2FP W32 x + | otherwise -> conversionNop FF80 x MO_FS_Conv from to -> coerceFP2Int from to x MO_SF_Conv from to -> coerceInt2FP from to x @@ -707,8 +648,9 @@ getRegister (CmmMachOp mop [x]) -- unary MachOps return (swizzleRegisterRep e_code new_size) -getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps - = case mop of +getRegister e@(CmmMachOp mop [x, y]) = do -- dyadic MachOps + sse2 <- sse2Enabled + case mop of MO_F_Eq w -> condFltReg EQQ x y MO_F_Ne w -> condFltReg NE x y MO_F_Gt w -> condFltReg GTT x y @@ -729,19 +671,14 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps MO_U_Lt rep -> condIntReg LU x y MO_U_Le rep -> condIntReg LEU x y -#if i386_TARGET_ARCH - MO_F_Add w -> trivialFCode w GADD x y - MO_F_Sub w -> trivialFCode w GSUB x y - MO_F_Quot w -> trivialFCode w GDIV x y - MO_F_Mul w -> trivialFCode w GMUL x y -#endif - -#if x86_64_TARGET_ARCH - MO_F_Add w -> trivialFCode w ADD x y - MO_F_Sub w -> trivialFCode w SUB x y - MO_F_Quot w -> trivialFCode w FDIV x y - MO_F_Mul w -> trivialFCode w MUL x y -#endif + MO_F_Add w | sse2 -> trivialFCode_sse2 w ADD x y + | otherwise -> trivialFCode_x87 w GADD x y + MO_F_Sub w | sse2 -> trivialFCode_sse2 w SUB x y + | otherwise -> trivialFCode_x87 w GSUB x y + MO_F_Quot w | sse2 -> trivialFCode_sse2 w FDIV x y + | otherwise -> trivialFCode_x87 w GDIV x y + MO_F_Mul w | sse2 -> trivialFCode_sse2 w MUL x y + | otherwise -> trivialFCode_x87 w GMUL x y MO_Add rep -> add_code rep x y MO_Sub rep -> sub_code rep x y @@ -892,13 +829,9 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps getRegister (CmmLoad mem pk) | isFloatType pk = do - Amode src mem_code <- getAmode mem - let - size = cmmTypeSize pk - code dst = mem_code `snocOL` - IF_ARCH_i386(GLD size src dst, - MOV size (OpAddr src) (OpReg dst)) - return (Any size code) + Amode addr mem_code <- getAmode mem + use_sse2 <- sse2Enabled + loadFloatAmode use_sse2 (typeWidth pk) addr mem_code #if i386_TARGET_ARCH getRegister (CmmLoad mem pk) @@ -1032,11 +965,8 @@ getNonClobberedReg expr = do reg2reg :: Size -> Reg -> Reg -> Instr reg2reg size src dst -#if i386_TARGET_ARCH - | isFloatSize size = GMOV src dst -#endif - | otherwise = MOV size (OpReg src) (OpReg dst) - + | size == FF80 = GMOV src dst + | otherwise = MOV size (OpReg src) (OpReg dst) -------------------------------------------------------------------------------- @@ -1122,30 +1052,41 @@ x86_complex_amode base index shift offset -- (see trivialCode where this function is used for an example). getNonClobberedOperand :: CmmExpr -> NatM (Operand, InstrBlock) -#if x86_64_TARGET_ARCH -getNonClobberedOperand (CmmLit lit) - | isSuitableFloatingPointLit lit = do - lbl <- getNewLabelNat - let code = unitOL (LDATA ReadOnlyData [CmmDataLabel lbl, - CmmStaticLit lit]) - return (OpAddr (ripRel (ImmCLbl lbl)), code) -#endif -getNonClobberedOperand (CmmLit lit) - | is32BitLit lit && not (isFloatType (cmmLitType lit)) = - return (OpImm (litToImm lit), nilOL) -getNonClobberedOperand (CmmLoad mem pk) - | IF_ARCH_i386(not (isFloatType pk) && not (isWord64 pk), True) = do - Amode src mem_code <- getAmode mem - (src',save_code) <- - if (amodeCouldBeClobbered src) - then do - tmp <- getNewRegNat archWordSize - return (AddrBaseIndex (EABaseReg tmp) EAIndexNone (ImmInt 0), - unitOL (LEA II32 (OpAddr src) (OpReg tmp))) - else - return (src, nilOL) - return (OpAddr src', save_code `appOL` mem_code) -getNonClobberedOperand e = do +getNonClobberedOperand (CmmLit lit) = do + use_sse2 <- sse2Enabled + if use_sse2 && isSuitableFloatingPointLit lit + then do + let CmmFloat _ w = lit + Amode addr code <- memConstant (widthInBytes w) lit + return (OpAddr addr, code) + else do + + if is32BitLit lit && not (isFloatType (cmmLitType lit)) + then return (OpImm (litToImm lit), nilOL) + else getNonClobberedOperand_generic (CmmLit lit) + +getNonClobberedOperand (CmmLoad mem pk) = do + use_sse2 <- sse2Enabled + if (not (isFloatType pk) || use_sse2) + && IF_ARCH_i386(not (isWord64 pk), True) + then do + Amode src mem_code <- getAmode mem + (src',save_code) <- + if (amodeCouldBeClobbered src) + then do + tmp <- getNewRegNat archWordSize + return (AddrBaseIndex (EABaseReg tmp) EAIndexNone (ImmInt 0), + unitOL (LEA II32 (OpAddr src) (OpReg tmp))) + else + return (src, nilOL) + return (OpAddr src', save_code `appOL` mem_code) + else do + getNonClobberedOperand_generic (CmmLoad mem pk) + +getNonClobberedOperand e = getNonClobberedOperand_generic e + +getNonClobberedOperand_generic :: CmmExpr -> NatM (Operand, InstrBlock) +getNonClobberedOperand_generic e = do (reg, code) <- getNonClobberedReg e return (OpReg reg, code) @@ -1158,22 +1099,32 @@ regClobbered _ = False -- getOperand: the operand is not required to remain valid across the -- computation of an arbitrary expression. getOperand :: CmmExpr -> NatM (Operand, InstrBlock) -#if x86_64_TARGET_ARCH -getOperand (CmmLit lit) - | isSuitableFloatingPointLit lit = do - lbl <- getNewLabelNat - let code = unitOL (LDATA ReadOnlyData [CmmDataLabel lbl, - CmmStaticLit lit]) - return (OpAddr (ripRel (ImmCLbl lbl)), code) -#endif -getOperand (CmmLit lit) - | is32BitLit lit && not (isFloatType (cmmLitType lit)) = do - return (OpImm (litToImm lit), nilOL) -getOperand (CmmLoad mem pk) - | IF_ARCH_i386(not (isFloatType pk) && not (isWord64 pk), True) = do - Amode src mem_code <- getAmode mem - return (OpAddr src, mem_code) -getOperand e = do + +getOperand (CmmLit lit) = do + use_sse2 <- sse2Enabled + if (use_sse2 && isSuitableFloatingPointLit lit) + then do + let CmmFloat _ w = lit + Amode addr code <- memConstant (widthInBytes w) lit + return (OpAddr addr, code) + else do + + if is32BitLit lit && not (isFloatType (cmmLitType lit)) + then return (OpImm (litToImm lit), nilOL) + else getOperand_generic (CmmLit lit) + +getOperand (CmmLoad mem pk) = do + use_sse2 <- sse2Enabled + if (not (isFloatType pk) || use_sse2) && IF_ARCH_i386(not (isWord64 pk), True) + then do + Amode src mem_code <- getAmode mem + return (OpAddr src, mem_code) + else + getOperand_generic (CmmLoad mem pk) + +getOperand e = getOperand_generic e + +getOperand_generic e = do (reg, code) <- getSomeReg e return (OpReg reg, code) @@ -1183,6 +1134,38 @@ isOperand (CmmLit lit) = is32BitLit lit || isSuitableFloatingPointLit lit isOperand _ = False +memConstant :: Int -> CmmLit -> NatM Amode +memConstant align lit = do +#ifdef x86_64_TARGET_ARCH + lbl <- getNewLabelNat + let addr = ripRel (ImmCLbl lbl) + addr_code = nilOL +#else + lbl <- getNewLabelNat + dflags <- getDynFlagsNat + dynRef <- cmmMakeDynamicReference dflags addImportNat DataReference lbl + Amode addr addr_code <- getAmode dynRef +#endif + let code = + LDATA ReadOnlyData + [CmmAlign align, + CmmDataLabel lbl, + CmmStaticLit lit] + `consOL` addr_code + return (Amode addr code) + + +loadFloatAmode :: Bool -> Width -> AddrMode -> InstrBlock -> NatM Register +loadFloatAmode use_sse2 w addr addr_code = do + let size = floatSize w + code dst = addr_code `snocOL` + if use_sse2 + then MOV size (OpAddr addr) (OpReg dst) + else GLD size addr dst + -- in + return (Any (if use_sse2 then size else FF80) code) + + -- if we want a floating-point literal as an operand, we can -- use it directly from memory. However, if the literal is -- zero, we're better off generating it into a register using @@ -1191,10 +1174,15 @@ isSuitableFloatingPointLit (CmmFloat f _) = f /= 0.0 isSuitableFloatingPointLit _ = False getRegOrMem :: CmmExpr -> NatM (Operand, InstrBlock) -getRegOrMem (CmmLoad mem pk) - | IF_ARCH_i386(not (isFloatType pk) && not (isWord64 pk), True) = do - Amode src mem_code <- getAmode mem - return (OpAddr src, mem_code) +getRegOrMem e@(CmmLoad mem pk) = do + use_sse2 <- sse2Enabled + if (not (isFloatType pk) || use_sse2) && IF_ARCH_i386(not (isWord64 pk), True) + then do + Amode src mem_code <- getAmode mem + return (OpAddr src, mem_code) + else do + (reg, code) <- getNonClobberedReg e + return (OpReg reg, code) getRegOrMem e = do (reg, code) <- getNonClobberedReg e return (OpReg reg, code) @@ -1314,40 +1302,36 @@ condIntCode cond x y = do -------------------------------------------------------------------------------- condFltCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode -#if i386_TARGET_ARCH condFltCode cond x y - = ASSERT(cond `elem` ([EQQ, NE, LE, LTT, GE, GTT])) do - (x_reg, x_code) <- getNonClobberedReg x - (y_reg, y_code) <- getSomeReg y - let - code = x_code `appOL` y_code `snocOL` - GCMP cond x_reg y_reg - -- The GCMP insn does the test and sets the zero flag if comparable - -- and true. Hence we always supply EQQ as the condition to test. - return (CondCode True EQQ code) - -#elif x86_64_TARGET_ARCH --- in the SSE2 comparison ops (ucomiss, ucomisd) the left arg may be --- an operand, but the right must be a reg. We can probably do better --- than this general case... -condFltCode cond x y = do - (x_reg, x_code) <- getNonClobberedReg x - (y_op, y_code) <- getOperand y - let - code = x_code `appOL` - y_code `snocOL` - CMP (floatSize $ cmmExprWidth x) y_op (OpReg x_reg) - -- NB(1): we need to use the unsigned comparison operators on the - -- result of this comparison. - -- in - return (CondCode True (condToUnsigned cond) code) - -#else -condFltCode = panic "X86.condFltCode: not defined" - -#endif - + = if_sse2 condFltCode_sse2 condFltCode_x87 + where + condFltCode_x87 + = ASSERT(cond `elem` ([EQQ, NE, LE, LTT, GE, GTT])) do + (x_reg, x_code) <- getNonClobberedReg x + (y_reg, y_code) <- getSomeReg y + use_sse2 <- sse2Enabled + let + code = x_code `appOL` y_code `snocOL` + GCMP cond x_reg y_reg + -- The GCMP insn does the test and sets the zero flag if comparable + -- and true. Hence we always supply EQQ as the condition to test. + return (CondCode True EQQ code) + + -- in the SSE2 comparison ops (ucomiss, ucomisd) the left arg may be + -- an operand, but the right must be a reg. We can probably do better + -- than this general case... + condFltCode_sse2 = do + (x_reg, x_code) <- getNonClobberedReg x + (y_op, y_code) <- getOperand y + let + code = x_code `appOL` + y_code `snocOL` + CMP (floatSize $ cmmExprWidth x) y_op (OpReg x_reg) + -- NB(1): we need to use the unsigned comparison operators on the + -- result of this comparison. + -- in + return (CondCode True (condToUnsigned cond) code) -- ----------------------------------------------------------------------------- -- Generating assignments @@ -1413,29 +1397,31 @@ assignMem_IntCode pk addr src = do -- Assign; dst is a reg, rhs is mem assignReg_IntCode pk reg (CmmLoad src _) = do load_code <- intLoadCode (MOV pk) src - return (load_code (getRegisterReg reg)) + return (load_code (getRegisterReg False{-no sse2-} reg)) -- dst is a reg, but src could be anything assignReg_IntCode pk reg src = do code <- getAnyReg src - return (code (getRegisterReg reg)) + return (code (getRegisterReg False{-no sse2-} reg)) -- Floating point assignment to memory assignMem_FltCode pk addr src = do (src_reg, src_code) <- getNonClobberedReg src Amode addr addr_code <- getAmode addr + use_sse2 <- sse2Enabled let code = src_code `appOL` addr_code `snocOL` - IF_ARCH_i386(GST pk src_reg addr, - MOV pk (OpReg src_reg) (OpAddr addr)) + if use_sse2 then MOV pk (OpReg src_reg) (OpAddr addr) + else GST pk src_reg addr return code -- Floating point assignment to a register/temporary assignReg_FltCode pk reg src = do + use_sse2 <- sse2Enabled src_code <- getAnyReg src - return (src_code (getRegisterReg reg)) + return (src_code (getRegisterReg use_sse2 reg)) genJump :: CmmExpr{-the branch target-} -> NatM InstrBlock @@ -1477,15 +1463,10 @@ genCondJump -> CmmExpr -- the condition on which to branch -> NatM InstrBlock -#if i386_TARGET_ARCH -genCondJump id bool = do - CondCode _ cond code <- getCondCode bool - return (code `snocOL` JXX cond id) - -#elif x86_64_TARGET_ARCH genCondJump id bool = do CondCode is_float cond cond_code <- getCondCode bool - if not is_float + use_sse2 <- sse2Enabled + if not is_float || not use_sse2 then return (cond_code `snocOL` JXX cond id) else do @@ -1513,13 +1494,6 @@ genCondJump id bool = do ] return (cond_code `appOL` code) -#else -genCondJump = panic "X86.genCondJump: not defined" - -#endif - - - -- ----------------------------------------------------------------------------- -- Generating C calls @@ -1549,7 +1523,11 @@ genCCall (CmmPrim MO_WriteBarrier) _ _ = return nilOL genCCall (CmmPrim op) [CmmHinted r _] args = do l1 <- getNewLabelNat l2 <- getNewLabelNat - case op of + sse2 <- sse2Enabled + if sse2 + then + outOfLineFloatOp op r args + else case op of MO_F32_Sqrt -> actuallyInlineFloatOp GSQRT FF32 args MO_F64_Sqrt -> actuallyInlineFloatOp GSQRT FF64 args @@ -1563,11 +1541,12 @@ genCCall (CmmPrim op) [CmmHinted r _] args = do MO_F64_Tan -> actuallyInlineFloatOp (\s -> GTAN s l1 l2) FF64 args other_op -> outOfLineFloatOp op r args + where actuallyInlineFloatOp instr size [CmmHinted x _] = do res <- trivialUFCode size (instr size) x any <- anyReg res - return (any (getRegisterReg (CmmLocal r))) + return (any (getRegisterReg False (CmmLocal r))) genCCall target dest_regs args = do let @@ -1582,7 +1561,8 @@ genCCall target dest_regs args = do setDeltaNat (delta0 - arg_pad_size) #endif - push_codes <- mapM push_arg (reverse args) + use_sse2 <- sse2Enabled + push_codes <- mapM (push_arg use_sse2) (reverse args) delta <- getDeltaNat -- in @@ -1624,15 +1604,26 @@ genCCall target dest_regs args = do -- assign the results, if necessary assign_code [] = nilOL assign_code [CmmHinted dest _hint] - | isFloatType ty = unitOL (GMOV fake0 r_dest) + | isFloatType ty = + if use_sse2 + then let tmp_amode = AddrBaseIndex (EABaseReg esp) + EAIndexNone + (ImmInt 0) + sz = floatSize w + in toOL [ SUB II32 (OpImm (ImmInt b)) (OpReg esp), + GST sz fake0 tmp_amode, + MOV sz (OpAddr tmp_amode) (OpReg r_dest), + ADD II32 (OpImm (ImmInt b)) (OpReg esp)] + else unitOL (GMOV fake0 r_dest) | isWord64 ty = toOL [MOV II32 (OpReg eax) (OpReg r_dest), MOV II32 (OpReg edx) (OpReg r_dest_hi)] | otherwise = unitOL (MOV (intSize w) (OpReg eax) (OpReg r_dest)) where ty = localRegType dest w = typeWidth ty + b = widthInBytes w r_dest_hi = getHiVRegFromLo r_dest - r_dest = getRegisterReg (CmmLocal dest) + r_dest = getRegisterReg use_sse2 (CmmLocal dest) assign_code many = pprPanic "genCCall.assign_code - too many return values:" (ppr many) return (push_code `appOL` @@ -1647,10 +1638,10 @@ genCCall target dest_regs args = do | otherwise = x + a - (x `mod` a) - push_arg :: HintedCmmActual {-current argument-} + push_arg :: Bool -> HintedCmmActual {-current argument-} -> NatM InstrBlock -- code - push_arg (CmmHinted arg _hint) -- we don't need the hints on x86 + push_arg use_sse2 (CmmHinted arg _hint) -- we don't need the hints on x86 | isWord64 arg_ty = do ChildCode64 code r_lo <- iselExpr64 arg delta <- getDeltaNat @@ -1673,10 +1664,15 @@ genCCall target dest_regs args = do then return (code `appOL` toOL [SUB II32 (OpImm (ImmInt size)) (OpReg esp), DELTA (delta-size), - GST (floatSize (typeWidth arg_ty)) - reg (AddrBaseIndex (EABaseReg esp) + let addr = AddrBaseIndex (EABaseReg esp) EAIndexNone - (ImmInt 0))] + (ImmInt 0) + size = floatSize (typeWidth arg_ty) + in + if use_sse2 + then MOV size (OpReg reg) (OpAddr addr) + else GST size reg addr + ] ) else return (code `snocOL` PUSH II32 (OpReg reg) `snocOL` @@ -1753,13 +1749,13 @@ genCCall target dest_regs args = do return (dyn_c `snocOL` CALL (Right dyn_r) arg_regs, conv) let - -- The x86_64 ABI requires us to set %al to the number of SSE + -- The x86_64 ABI requires us to set %al to the number of SSE2 -- registers that contain arguments, if the called routine -- is a varargs function. We don't know whether it's a -- varargs function or not, so we have to assume it is. -- -- It's not safe to omit this assignment, even if the number - -- of SSE regs in use is zero. If %al is larger than 8 + -- of SSE2 regs in use is zero. If %al is larger than 8 -- on entry to a varargs function, seg faults ensue. assign_eax n = unitOL (MOV II32 (OpImm (ImmInt n)) (OpReg eax)) @@ -1785,7 +1781,7 @@ genCCall target dest_regs args = do _ -> unitOL (MOV (cmmTypeSize rep) (OpReg rax) (OpReg r_dest)) where rep = localRegType dest - r_dest = getRegisterReg (CmmLocal dest) + r_dest = getRegisterReg True (CmmLocal dest) assign_code many = panic "genCCall.assign_code many" return (load_args_code `appOL` @@ -1870,17 +1866,7 @@ outOfLineFloatOp mop res args targetExpr <- cmmMakeDynamicReference dflags addImportNat CallReference lbl let target = CmmCallee targetExpr CCallConv - if isFloat64 (localRegType res) - then - stmtToInstrs (CmmCall target [CmmHinted res NoHint] args CmmUnsafe CmmMayReturn) - else do - uq <- getUniqueNat - let - tmp = LocalReg uq f64 - -- in - code1 <- stmtToInstrs (CmmCall target [CmmHinted tmp NoHint] args CmmUnsafe CmmMayReturn) - code2 <- stmtToInstrs (CmmAssign (CmmLocal res) (CmmReg (CmmLocal tmp))) - return (code1 `appOL` code2) + stmtToInstrs (CmmCall target [CmmHinted res NoHint] args CmmUnsafe CmmMayReturn) where -- Assume we can call these functions directly, and that they're not in a dynamic library. -- TODO: Why is this ok? Under linux this code will be in libm.so @@ -2027,72 +2013,64 @@ condIntReg cond x y = do condFltReg :: Cond -> CmmExpr -> CmmExpr -> NatM Register - -#if i386_TARGET_ARCH -condFltReg cond x y = do - CondCode _ cond cond_code <- condFltCode cond x y - tmp <- getNewRegNat II8 - let - code dst = cond_code `appOL` toOL [ - SETCC cond (OpReg tmp), - MOVZxL II8 (OpReg tmp) (OpReg dst) - ] - -- in - return (Any II32 code) - -#elif x86_64_TARGET_ARCH -condFltReg cond x y = do - CondCode _ cond cond_code <- condFltCode cond x y - tmp1 <- getNewRegNat archWordSize - tmp2 <- getNewRegNat archWordSize - let - -- We have to worry about unordered operands (eg. comparisons - -- against NaN). If the operands are unordered, the comparison - -- sets the parity flag, carry flag and zero flag. - -- All comparisons are supposed to return false for unordered - -- operands except for !=, which returns true. - -- - -- Optimisation: we don't have to test the parity flag if we - -- know the test has already excluded the unordered case: eg > - -- and >= test for a zero carry flag, which can only occur for - -- ordered operands. - -- - -- ToDo: by reversing comparisons we could avoid testing the - -- parity flag in more cases. - - code dst = - cond_code `appOL` - (case cond of - NE -> or_unordered dst - GU -> plain_test dst - GEU -> plain_test dst - _ -> and_ordered dst) - - plain_test dst = toOL [ - SETCC cond (OpReg tmp1), - MOVZxL II8 (OpReg tmp1) (OpReg dst) - ] - or_unordered dst = toOL [ - SETCC cond (OpReg tmp1), - SETCC PARITY (OpReg tmp2), - OR II8 (OpReg tmp1) (OpReg tmp2), - MOVZxL II8 (OpReg tmp2) (OpReg dst) - ] - and_ordered dst = toOL [ - SETCC cond (OpReg tmp1), - SETCC NOTPARITY (OpReg tmp2), - AND II8 (OpReg tmp1) (OpReg tmp2), - MOVZxL II8 (OpReg tmp2) (OpReg dst) - ] - -- in - return (Any II32 code) - -#else -condFltReg = panic "X86.condFltReg: not defined" - -#endif - - +condFltReg cond x y = if_sse2 condFltReg_sse2 condFltReg_x87 + where + condFltReg_x87 = do + CondCode _ cond cond_code <- condFltCode cond x y + tmp <- getNewRegNat II8 + let + code dst = cond_code `appOL` toOL [ + SETCC cond (OpReg tmp), + MOVZxL II8 (OpReg tmp) (OpReg dst) + ] + -- in + return (Any II32 code) + + condFltReg_sse2 = do + CondCode _ cond cond_code <- condFltCode cond x y + tmp1 <- getNewRegNat archWordSize + tmp2 <- getNewRegNat archWordSize + let + -- We have to worry about unordered operands (eg. comparisons + -- against NaN). If the operands are unordered, the comparison + -- sets the parity flag, carry flag and zero flag. + -- All comparisons are supposed to return false for unordered + -- operands except for !=, which returns true. + -- + -- Optimisation: we don't have to test the parity flag if we + -- know the test has already excluded the unordered case: eg > + -- and >= test for a zero carry flag, which can only occur for + -- ordered operands. + -- + -- ToDo: by reversing comparisons we could avoid testing the + -- parity flag in more cases. + + code dst = + cond_code `appOL` + (case cond of + NE -> or_unordered dst + GU -> plain_test dst + GEU -> plain_test dst + _ -> and_ordered dst) + + plain_test dst = toOL [ + SETCC cond (OpReg tmp1), + MOVZxL II8 (OpReg tmp1) (OpReg dst) + ] + or_unordered dst = toOL [ + SETCC cond (OpReg tmp1), + SETCC PARITY (OpReg tmp2), + OR II8 (OpReg tmp1) (OpReg tmp2), + MOVZxL II8 (OpReg tmp2) (OpReg dst) + ] + and_ordered dst = toOL [ + SETCC cond (OpReg tmp1), + SETCC NOTPARITY (OpReg tmp2), + AND II8 (OpReg tmp1) (OpReg tmp2), + MOVZxL II8 (OpReg tmp2) (OpReg dst) + ] + -- in + return (Any II32 code) -- ----------------------------------------------------------------------------- @@ -2207,26 +2185,21 @@ trivialUCode rep instr x = do ----------- -#if i386_TARGET_ARCH - -trivialFCode width instr x y = do +trivialFCode_x87 width instr x y = do (x_reg, x_code) <- getNonClobberedReg x -- these work for float regs too (y_reg, y_code) <- getSomeReg y let - size = floatSize width + size = FF80 -- always, on x87 code dst = x_code `appOL` y_code `snocOL` instr size x_reg y_reg dst return (Any size code) -#endif +trivialFCode_sse2 pk instr x y + = genTrivialCode size (instr size) x y + where size = floatSize pk -#if x86_64_TARGET_ARCH -trivialFCode pk instr x y - = genTrivialCode size (instr size) x y - where size = floatSize pk -#endif trivialUFCode size instr x = do (x_reg, x_code) <- getSomeReg x @@ -2240,67 +2213,50 @@ trivialUFCode size instr x = do -------------------------------------------------------------------------------- coerceInt2FP :: Width -> Width -> CmmExpr -> NatM Register - -#if i386_TARGET_ARCH -coerceInt2FP from to x = do - (x_reg, x_code) <- getSomeReg x - let - opc = case to of W32 -> GITOF; W64 -> GITOD - code dst = x_code `snocOL` opc x_reg dst - -- ToDo: works for non-II32 reps? - return (Any (floatSize to) code) - -#elif x86_64_TARGET_ARCH -coerceInt2FP from to x = do - (x_op, x_code) <- getOperand x -- ToDo: could be a safe operand - let - opc = case to of W32 -> CVTSI2SS; W64 -> CVTSI2SD - code dst = x_code `snocOL` opc x_op dst - -- in - return (Any (floatSize to) code) -- works even if the destination rep is GITOF; W64 -> GITOD + code dst = x_code `snocOL` opc x_reg dst + -- ToDo: works for non-II32 reps? + return (Any FF80 code) + + coerce_sse2 = do + (x_op, x_code) <- getOperand x -- ToDo: could be a safe operand + let + opc = case to of W32 -> CVTSI2SS; W64 -> CVTSI2SD + code dst = x_code `snocOL` opc (intSize from) x_op dst + -- in + return (Any (floatSize to) code) + -- works even if the destination rep is Width -> CmmExpr -> NatM Register - -#if i386_TARGET_ARCH -coerceFP2Int from to x = do - (x_reg, x_code) <- getSomeReg x - let - opc = case from of W32 -> GFTOI; W64 -> GDTOI - code dst = x_code `snocOL` opc x_reg dst - -- ToDo: works for non-II32 reps? - -- in - return (Any (intSize to) code) - -#elif x86_64_TARGET_ARCH -coerceFP2Int from to x = do - (x_op, x_code) <- getOperand x -- ToDo: could be a safe operand - let - opc = case from of W32 -> CVTTSS2SIQ; W64 -> CVTTSD2SIQ - code dst = x_code `snocOL` opc x_op dst - -- in - return (Any (intSize to) code) -- works even if the destination rep is GFTOI; W64 -> GDTOI + code dst = x_code `snocOL` opc x_reg dst + -- ToDo: works for non-II32 reps? + -- in + return (Any (intSize to) code) + + coerceFP2Int_sse2 = do + (x_op, x_code) <- getOperand x -- ToDo: could be a safe operand + let + opc = case from of W32 -> CVTTSS2SIQ; W64 -> CVTTSD2SIQ + code dst = x_code `snocOL` opc (intSize to) x_op dst + -- in + return (Any (intSize to) code) + -- works even if the destination rep is CmmExpr -> NatM Register - -#if x86_64_TARGET_ARCH coerceFP2FP to x = do (x_reg, x_code) <- getSomeReg x let @@ -2309,10 +2265,22 @@ coerceFP2FP to x = do -- in return (Any (floatSize to) code) -#else -coerceFP2FP = panic "X86.coerceFP2FP: not defined" - -#endif - - +-------------------------------------------------------------------------------- +sse2NegCode :: Width -> CmmExpr -> NatM Register +sse2NegCode w x = do + let sz = floatSize w + x_code <- getAnyReg x + -- This is how gcc does it, so it can't be that bad: + let + const | FF32 <- sz = CmmInt 0x80000000 W32 + | otherwise = CmmInt 0x8000000000000000 W64 + Amode amode amode_code <- memConstant (widthInBytes w) const + tmp <- getNewRegNat sz + let + code dst = x_code dst `appOL` amode_code `appOL` toOL [ + MOV sz (OpAddr amode) (OpReg tmp), + XOR sz (OpReg tmp) (OpReg dst) + ] + -- + return (Any sz code) diff --git a/compiler/nativeGen/X86/Instr.hs b/compiler/nativeGen/X86/Instr.hs index 6dc229b..f856313 100644 --- a/compiler/nativeGen/X86/Instr.hs +++ b/compiler/nativeGen/X86/Instr.hs @@ -253,10 +253,10 @@ data Instr -- use MOV for moving (either movss or movsd (movlpd better?)) | CVTSS2SD Reg Reg -- F32 to F64 | CVTSD2SS Reg Reg -- F64 to F32 - | CVTTSS2SIQ Operand Reg -- F32 to I32/I64 (with truncation) - | CVTTSD2SIQ Operand Reg -- F64 to I32/I64 (with truncation) - | CVTSI2SS Operand Reg -- I32/I64 to F32 - | CVTSI2SD Operand Reg -- I32/I64 to F64 + | CVTTSS2SIQ Size Operand Reg -- F32 to I32/I64 (with truncation) + | CVTTSD2SIQ Size Operand Reg -- F64 to I32/I64 (with truncation) + | CVTSI2SS Size Operand Reg -- I32/I64 to F32 + | CVTSI2SD Size Operand Reg -- I32/I64 to F64 -- use ADD & SUB for arithmetic. In both cases, operands -- are Operand Reg. @@ -353,7 +353,6 @@ x86_regUsageOfInstr instr CLTD _ -> mkRU [eax] [edx] NOP -> mkRU [] [] -#if i386_TARGET_ARCH GMOV src dst -> mkRU [src] [dst] GLD _ src dst -> mkRU (use_EA src) [dst] GST _ src dst -> mkRUR (src : use_EA dst) @@ -379,17 +378,14 @@ x86_regUsageOfInstr instr GSIN _ _ _ src dst -> mkRU [src] [dst] GCOS _ _ _ src dst -> mkRU [src] [dst] GTAN _ _ _ src dst -> mkRU [src] [dst] -#endif -#if x86_64_TARGET_ARCH CVTSS2SD src dst -> mkRU [src] [dst] CVTSD2SS src dst -> mkRU [src] [dst] - CVTTSS2SIQ src dst -> mkRU (use_R src) [dst] - CVTTSD2SIQ src dst -> mkRU (use_R src) [dst] - CVTSI2SS src dst -> mkRU (use_R src) [dst] - CVTSI2SD src dst -> mkRU (use_R src) [dst] + CVTTSS2SIQ _ src dst -> mkRU (use_R src) [dst] + CVTTSD2SIQ _ src dst -> mkRU (use_R src) [dst] + CVTSI2SS _ src dst -> mkRU (use_R src) [dst] + CVTSI2SD _ src dst -> mkRU (use_R src) [dst] FDIV _ src dst -> usageRM src dst -#endif FETCHGOT reg -> mkRU [] [reg] FETCHPC reg -> mkRU [] [reg] @@ -483,7 +479,6 @@ x86_patchRegsOfInstr instr env JMP op -> patch1 JMP op JMP_TBL op ids -> patch1 JMP_TBL op $ ids -#if i386_TARGET_ARCH GMOV src dst -> GMOV (env src) (env dst) GLD sz src dst -> GLD sz (lookupAddr src) (env dst) GST sz src dst -> GST sz (env src) (lookupAddr dst) @@ -509,17 +504,14 @@ x86_patchRegsOfInstr instr env GSIN sz l1 l2 src dst -> GSIN sz l1 l2 (env src) (env dst) GCOS sz l1 l2 src dst -> GCOS sz l1 l2 (env src) (env dst) GTAN sz l1 l2 src dst -> GTAN sz l1 l2 (env src) (env dst) -#endif -#if x86_64_TARGET_ARCH CVTSS2SD src dst -> CVTSS2SD (env src) (env dst) CVTSD2SS src dst -> CVTSD2SS (env src) (env dst) - CVTTSS2SIQ src dst -> CVTTSS2SIQ (patchOp src) (env dst) - CVTTSD2SIQ src dst -> CVTTSD2SIQ (patchOp src) (env dst) - CVTSI2SS src dst -> CVTSI2SS (patchOp src) (env dst) - CVTSI2SD src dst -> CVTSI2SD (patchOp src) (env dst) + CVTTSS2SIQ sz src dst -> CVTTSS2SIQ sz (patchOp src) (env dst) + CVTTSD2SIQ sz src dst -> CVTTSD2SIQ sz (patchOp src) (env dst) + CVTSI2SS sz src dst -> CVTSI2SS sz (patchOp src) (env dst) + CVTSI2SD sz src dst -> CVTSI2SD sz (patchOp src) (env dst) FDIV sz src dst -> FDIV sz (patchOp src) (patchOp dst) -#endif CALL (Left _) _ -> instr CALL (Right reg) p -> CALL (Right (env reg)) p @@ -602,30 +594,16 @@ x86_mkSpillInstr -> Int -- spill slot to use -> Instr -#if i386_TARGET_ARCH -x86_mkSpillInstr reg delta slot - = let off = spillSlotToOffset slot - in - let off_w = (off-delta) `div` 4 - in case targetClassOfReg reg of - RcInteger -> MOV II32 (OpReg reg) (OpAddr (spRel off_w)) - _ -> GST FF80 reg (spRel off_w) {- RcFloat/RcDouble -} - -#elif x86_64_TARGET_ARCH x86_mkSpillInstr reg delta slot = let off = spillSlotToOffset slot in - let off_w = (off-delta) `div` 8 + let off_w = (off-delta) `div` IF_ARCH_i386(4,8) in case targetClassOfReg reg of - RcInteger -> MOV II64 (OpReg reg) (OpAddr (spRel off_w)) - RcDouble -> MOV FF64 (OpReg reg) (OpAddr (spRel off_w)) + RcInteger -> MOV IF_ARCH_i386(II32,II64) + (OpReg reg) (OpAddr (spRel off_w)) + RcDouble -> GST FF80 reg (spRel off_w) {- RcFloat/RcDouble -} + RcDoubleSSE -> MOV FF64 (OpReg reg) (OpAddr (spRel off_w)) _ -> panic "X86.mkSpillInstr: no match" - -- ToDo: will it work to always spill as a double? - -- does that cause a stall if the data was a float? -#else -x86_mkSpillInstr _ _ _ - = panic "X86.RegInfo.mkSpillInstr: not defined for this architecture." -#endif -- | Make a spill reload instruction. @@ -635,26 +613,16 @@ x86_mkLoadInstr -> Int -- spill slot to use -> Instr -#if i386_TARGET_ARCH -x86_mkLoadInstr reg delta slot - = let off = spillSlotToOffset slot - in - let off_w = (off-delta) `div` 4 - in case targetClassOfReg reg of { - RcInteger -> MOV II32 (OpAddr (spRel off_w)) (OpReg reg); - _ -> GLD FF80 (spRel off_w) reg} {- RcFloat/RcDouble -} -#elif x86_64_TARGET_ARCH x86_mkLoadInstr reg delta slot = let off = spillSlotToOffset slot in - let off_w = (off-delta) `div` 8 + let off_w = (off-delta) `div` IF_ARCH_i386(4,8) in case targetClassOfReg reg of - RcInteger -> MOV II64 (OpAddr (spRel off_w)) (OpReg reg) - _ -> MOV FF64 (OpAddr (spRel off_w)) (OpReg reg) -#else -x86_mkLoadInstr _ _ _ - = panic "X86.RegInfo.mkLoadInstr: not defined for this architecture." -#endif + RcInteger -> MOV IF_ARCH_i386(II32,II64) + (OpAddr (spRel off_w)) (OpReg reg) + RcDouble -> GLD FF80 (spRel off_w) reg {- RcFloat/RcDouble -} + RcDoubleSSE -> MOV FF64 (OpAddr (spRel off_w)) (OpReg reg) + _ -> panic "X86.x86_mkLoadInstr" spillSlotSize :: Int spillSlotSize = IF_ARCH_i386(12, 8) @@ -715,14 +683,12 @@ x86_mkRegRegMoveInstr src dst = case targetClassOfReg src of #if i386_TARGET_ARCH RcInteger -> MOV II32 (OpReg src) (OpReg dst) - RcDouble -> GMOV src dst - RcFloat -> panic "X86.RegInfo.mkRegRegMoveInstr: no match" #else RcInteger -> MOV II64 (OpReg src) (OpReg dst) - RcDouble -> MOV FF64 (OpReg src) (OpReg dst) - RcFloat -> panic "X86.RegInfo.mkRegRegMoveInstr: no match" #endif - + RcDouble -> GMOV src dst + RcDoubleSSE -> MOV FF64 (OpReg src) (OpReg dst) + _ -> panic "X86.RegInfo.mkRegRegMoveInstr: no match" -- | Check whether an instruction represents a reg-reg move. -- The register allocator attempts to eliminate reg->reg moves whenever it can, diff --git a/compiler/nativeGen/X86/Ppr.hs b/compiler/nativeGen/X86/Ppr.hs index 89bbb5d..fe94f21 100644 --- a/compiler/nativeGen/X86/Ppr.hs +++ b/compiler/nativeGen/X86/Ppr.hs @@ -181,6 +181,7 @@ pprReg s r RegVirtual (VirtualRegHi u) -> text "%vHi_" <> asmSDoc (pprUnique u) RegVirtual (VirtualRegF u) -> text "%vF_" <> asmSDoc (pprUnique u) RegVirtual (VirtualRegD u) -> text "%vD_" <> asmSDoc (pprUnique u) + RegVirtual (VirtualRegSSE u) -> text "%vSSE_" <> asmSDoc (pprUnique u) where #if i386_TARGET_ARCH ppr_reg_no :: Size -> Int -> Doc @@ -210,10 +211,7 @@ pprReg s r 2 -> sLit "%ecx"; 3 -> sLit "%edx"; 4 -> sLit "%esi"; 5 -> sLit "%edi"; 6 -> sLit "%ebp"; 7 -> sLit "%esp"; - 8 -> sLit "%fake0"; 9 -> sLit "%fake1"; - 10 -> sLit "%fake2"; 11 -> sLit "%fake3"; - 12 -> sLit "%fake4"; 13 -> sLit "%fake5"; - _ -> sLit "very naughty I386 register" + _ -> ppr_reg_float i }) #elif x86_64_TARGET_ARCH ppr_reg_no :: Size -> Int -> Doc @@ -271,20 +269,26 @@ pprReg s r 10 -> sLit "%r10"; 11 -> sLit "%r11"; 12 -> sLit "%r12"; 13 -> sLit "%r13"; 14 -> sLit "%r14"; 15 -> sLit "%r15"; - 16 -> sLit "%xmm0"; 17 -> sLit "%xmm1"; - 18 -> sLit "%xmm2"; 19 -> sLit "%xmm3"; - 20 -> sLit "%xmm4"; 21 -> sLit "%xmm5"; - 22 -> sLit "%xmm6"; 23 -> sLit "%xmm7"; - 24 -> sLit "%xmm8"; 25 -> sLit "%xmm9"; - 26 -> sLit "%xmm10"; 27 -> sLit "%xmm11"; - 28 -> sLit "%xmm12"; 29 -> sLit "%xmm13"; - 30 -> sLit "%xmm14"; 31 -> sLit "%xmm15"; - _ -> sLit "very naughty x86_64 register" + _ -> ppr_reg_float i }) #else ppr_reg_no _ = panic "X86.Ppr.ppr_reg_no: no match" #endif +ppr_reg_float :: Int -> LitString +ppr_reg_float i = case i of + 16 -> sLit "%fake0"; 17 -> sLit "%fake1" + 18 -> sLit "%fake2"; 19 -> sLit "%fake3" + 20 -> sLit "%fake4"; 21 -> sLit "%fake5" + 24 -> sLit "%xmm0"; 25 -> sLit "%xmm1" + 26 -> sLit "%xmm2"; 27 -> sLit "%xmm3" + 28 -> sLit "%xmm4"; 29 -> sLit "%xmm5" + 30 -> sLit "%xmm6"; 31 -> sLit "%xmm7" + 32 -> sLit "%xmm8"; 33 -> sLit "%xmm9" + 34 -> sLit "%xmm10"; 35 -> sLit "%xmm11" + 36 -> sLit "%xmm12"; 37 -> sLit "%xmm13" + 38 -> sLit "%xmm14"; 39 -> sLit "%xmm15" + _ -> sLit "very naughty x86 register" pprSize :: Size -> Doc pprSize x @@ -293,19 +297,19 @@ pprSize x II16 -> sLit "w" II32 -> sLit "l" II64 -> sLit "q" -#if i386_TARGET_ARCH - FF32 -> sLit "s" - FF64 -> sLit "l" - FF80 -> sLit "t" -#elif x86_64_TARGET_ARCH FF32 -> sLit "ss" -- "scalar single-precision float" (SSE2) FF64 -> sLit "sd" -- "scalar double-precision float" (SSE2) - _ -> panic "X86.Ppr.pprSize: no match" -#else - _ -> panic "X86.Ppr.pprSize: no match" -#endif + FF80 -> sLit "t" ) +pprSize_x87 :: Size -> Doc +pprSize_x87 x + = ptext $ case x of + FF32 -> sLit "s" + FF64 -> sLit "l" + FF80 -> sLit "t" + _ -> panic "X86.Ppr.pprSize_x87" + pprCond :: Cond -> Doc pprCond c = ptext (case c of { @@ -636,12 +640,12 @@ pprInstr (MUL size op1 op2) = pprSizeOpOp (sLit "mul") size op1 op2 pprInstr (FDIV size op1 op2) = pprSizeOpOp (sLit "div") size op1 op2 -pprInstr (CVTSS2SD from to) = pprRegReg (sLit "cvtss2sd") from to -pprInstr (CVTSD2SS from to) = pprRegReg (sLit "cvtsd2ss") from to -pprInstr (CVTTSS2SIQ from to) = pprOpReg (sLit "cvttss2siq") from to -pprInstr (CVTTSD2SIQ from to) = pprOpReg (sLit "cvttsd2siq") from to -pprInstr (CVTSI2SS from to) = pprOpReg (sLit "cvtsi2ssq") from to -pprInstr (CVTSI2SD from to) = pprOpReg (sLit "cvtsi2sdq") from to +pprInstr (CVTSS2SD from to) = pprRegReg (sLit "cvtss2sd") from to +pprInstr (CVTSD2SS from to) = pprRegReg (sLit "cvtsd2ss") from to +pprInstr (CVTTSS2SIQ sz from to) = pprSizeOpReg (sLit "cvttss2si") sz from to +pprInstr (CVTTSD2SIQ sz from to) = pprSizeOpReg (sLit "cvttsd2si") sz from to +pprInstr (CVTSI2SS sz from to) = pprSizeOpReg (sLit "cvtsi2ss") sz from to +pprInstr (CVTSI2SD sz from to) = pprSizeOpReg (sLit "cvtsi2sd") sz from to -- FETCHGOT for PIC on ELF platforms pprInstr (FETCHGOT reg) @@ -673,20 +677,24 @@ pprInstr g@(GMOV src dst) | otherwise = pprG g (hcat [gtab, gpush src 0, gsemi, gpop dst 1]) --- GLD sz addr dst ==> FFREE %st(7) ; FLDsz addr ; FSTP (dst+1) +-- GLD sz addr dst ==> FLDsz addr ; FSTP (dst+1) pprInstr g@(GLD sz addr dst) - = pprG g (hcat [gtab, text "ffree %st(7) ; fld", pprSize sz, gsp, + = pprG g (hcat [gtab, text "fld", pprSize_x87 sz, gsp, pprAddr addr, gsemi, gpop dst 1]) --- GST sz src addr ==> FFREE %st(7) ; FLD dst ; FSTPsz addr +-- GST sz src addr ==> FLD dst ; FSTPsz addr pprInstr g@(GST sz src addr) + | src == fake0 && sz /= FF80 -- fstt instruction doesn't exist + = pprG g (hcat [gtab, + text "fst", pprSize_x87 sz, gsp, pprAddr addr]) + | otherwise = pprG g (hcat [gtab, gpush src 0, gsemi, - text "fstp", pprSize sz, gsp, pprAddr addr]) + text "fstp", pprSize_x87 sz, gsp, pprAddr addr]) pprInstr g@(GLDZ dst) - = pprG g (hcat [gtab, text "ffree %st(7) ; fldz ; ", gpop dst 1]) + = pprG g (hcat [gtab, text "fldz ; ", gpop dst 1]) pprInstr g@(GLD1 dst) - = pprG g (hcat [gtab, text "ffree %st(7) ; fld1 ; ", gpop dst 1]) + = pprG g (hcat [gtab, text "fld1 ; ", gpop dst 1]) pprInstr (GFTOI src dst) = pprInstr (GDTOI src dst) @@ -710,7 +718,7 @@ pprInstr (GITOF src dst) pprInstr g@(GITOD src dst) = pprG g (hcat [gtab, text "pushl ", pprReg II32 src, - text " ; ffree %st(7); fildl (%esp) ; ", + text " ; fildl (%esp) ; ", gpop dst 1, text " ; addl $4,%esp"]) {- Gruesome swamp follows. If you're unfortunate enough to have ventured @@ -868,7 +876,7 @@ pprInstr g@(GDIV _ src1 src2 dst) pprInstr GFREE = vcat [ ptext (sLit "\tffree %st(0) ;ffree %st(1) ;ffree %st(2) ;ffree %st(3)"), - ptext (sLit "\tffree %st(4) ;ffree %st(5) ;ffree %st(6) ;ffree %st(7)") + ptext (sLit "\tffree %st(4) ;ffree %st(5)") ] pprInstr _ @@ -927,15 +935,14 @@ gcoerceto _ = panic "X86.Ppr.gcoerceto: no match" gpush :: Reg -> RegNo -> Doc gpush reg offset - = hcat [text "ffree %st(7) ; fld ", greg reg offset] - + = hcat [text "fld ", greg reg offset] gpop :: Reg -> RegNo -> Doc gpop reg offset = hcat [text "fstp ", greg reg offset] greg :: Reg -> RegNo -> Doc -greg reg offset = text "%st(" <> int (gregno reg - 8+offset) <> char ')' +greg reg offset = text "%st(" <> int (gregno reg - 16+offset) <> char ')' gsemi :: Doc gsemi = text " ; " @@ -1072,11 +1079,11 @@ pprRegReg name reg1 reg2 ] -pprOpReg :: LitString -> Operand -> Reg -> Doc -pprOpReg name op1 reg2 +pprSizeOpReg :: LitString -> Size -> Operand -> Reg -> Doc +pprSizeOpReg name size op1 reg2 = hcat [ - pprMnemonic_ name, - pprOperand archWordSize op1, + pprMnemonic name size, + pprOperand size op1, comma, pprReg archWordSize reg2 ] diff --git a/compiler/nativeGen/X86/RegInfo.hs b/compiler/nativeGen/X86/RegInfo.hs index ed420a4..eb8e82c 100644 --- a/compiler/nativeGen/X86/RegInfo.hs +++ b/compiler/nativeGen/X86/RegInfo.hs @@ -23,12 +23,11 @@ import X86.Regs mkVirtualReg :: Unique -> Size -> VirtualReg mkVirtualReg u size - | not (isFloatSize size) = VirtualRegI u - | otherwise = case size of - FF32 -> VirtualRegD u - FF64 -> VirtualRegD u - _ -> panic "mkVirtualReg" + FF32 -> VirtualRegSSE u + FF64 -> VirtualRegSSE u + FF80 -> VirtualRegD u + _other -> VirtualRegI u -- reg colors for x86 @@ -44,15 +43,8 @@ regColors $ [ (eax, "#00ff00") , (ebx, "#0000ff") , (ecx, "#00ffff") - , (edx, "#0080ff") - - , (fake0, "#ff00ff") - , (fake1, "#ff00aa") - , (fake2, "#aa00ff") - , (fake3, "#aa00aa") - , (fake4, "#ff0055") - , (fake5, "#5500ff") ] - + , (edx, "#0080ff") ] + ++ fpRegColors -- reg colors for x86_64 #elif x86_64_TARGET_ARCH @@ -76,9 +68,19 @@ regColors , (r13, "#004080") , (r14, "#004040") , (r15, "#002080") ] - - ++ zip (map regSingle [16..31]) (repeat "red") + ++ fpRegColors #else regDotColor :: Reg -> SDoc regDotColor = panic "not defined" #endif + +fpRegColors :: [(Reg,String)] +fpRegColors = + [ (fake0, "#ff00ff") + , (fake1, "#ff00aa") + , (fake2, "#aa00ff") + , (fake3, "#aa00aa") + , (fake4, "#ff0055") + , (fake5, "#5500ff") ] + + ++ zip (map regSingle [24..39]) (repeat "red") diff --git a/compiler/nativeGen/X86/Regs.hs b/compiler/nativeGen/X86/Regs.hs index 64d835b..bed9dc5 100644 --- a/compiler/nativeGen/X86/Regs.hs +++ b/compiler/nativeGen/X86/Regs.hs @@ -49,11 +49,6 @@ where #include "nativeGen/NCG.h" #include "HsVersions.h" -#if i386_TARGET_ARCH -# define STOLEN_X86_REGS 4 --- HACK: go for the max -#endif - #include "../includes/stg/MachRegs.h" import Reg @@ -88,58 +83,23 @@ virtualRegSqueeze cls vr -> case vr of VirtualRegI{} -> _ILIT(1) VirtualRegHi{} -> _ILIT(1) - VirtualRegD{} -> _ILIT(0) - VirtualRegF{} -> _ILIT(0) - - -- We don't use floats on this arch, but we can't - -- return error because the return type is unboxed... - RcFloat - -> case vr of - VirtualRegI{} -> _ILIT(0) - VirtualRegHi{} -> _ILIT(0) - VirtualRegD{} -> _ILIT(0) - VirtualRegF{} -> _ILIT(0) + _other -> _ILIT(0) RcDouble -> case vr of - VirtualRegI{} -> _ILIT(0) - VirtualRegHi{} -> _ILIT(0) VirtualRegD{} -> _ILIT(1) VirtualRegF{} -> _ILIT(0) + _other -> _ILIT(0) -{-# INLINE realRegSqueeze #-} -realRegSqueeze :: RegClass -> RealReg -> FastInt - -#if defined(i386_TARGET_ARCH) -realRegSqueeze cls rr - = case cls of - RcInteger - -> case rr of - RealRegSingle regNo - | regNo < 8 -> _ILIT(1) -- first fp reg is 8 - | otherwise -> _ILIT(0) - - RealRegPair{} -> _ILIT(0) - - -- We don't use floats on this arch, but we can't - -- return error because the return type is unboxed... - RcFloat - -> case rr of - RealRegSingle regNo - | regNo < 8 -> _ILIT(0) - | otherwise -> _ILIT(0) - - RealRegPair{} -> _ILIT(0) + RcDoubleSSE + -> case vr of + VirtualRegSSE{} -> _ILIT(1) + _other -> _ILIT(0) - RcDouble - -> case rr of - RealRegSingle regNo - | regNo < 8 -> _ILIT(0) - | otherwise -> _ILIT(1) - - RealRegPair{} -> _ILIT(0) + _other -> _ILIT(0) -#elif defined(x86_64_TARGET_ARCH) +{-# INLINE realRegSqueeze #-} +realRegSqueeze :: RegClass -> RealReg -> FastInt realRegSqueeze cls rr = case cls of RcInteger @@ -150,29 +110,20 @@ realRegSqueeze cls rr RealRegPair{} -> _ILIT(0) - -- We don't use floats on this arch, but we can't - -- return error because the return type is unboxed... - RcFloat + RcDouble -> case rr of RealRegSingle regNo - | regNo < 16 -> _ILIT(0) + | regNo >= 16 && regNo < 24 -> _ILIT(1) | otherwise -> _ILIT(0) RealRegPair{} -> _ILIT(0) - RcDouble + RcDoubleSSE -> case rr of - RealRegSingle regNo - | regNo < 16 -> _ILIT(0) - | otherwise -> _ILIT(1) - - RealRegPair{} -> _ILIT(0) - -#else -realRegSqueeze _ _ = _ILIT(0) -#endif - + RealRegSingle regNo | regNo >= 24 -> _ILIT(1) + _otherwise -> _ILIT(0) + _other -> _ILIT(0) -- ----------------------------------------------------------------------------- -- Immediates @@ -275,87 +226,48 @@ spRel _ = panic "X86.Regs.spRel: not defined for this architecture" argRegs :: RegNo -> [Reg] argRegs _ = panic "MachRegs.argRegs(x86): should not be used!" - - - - -- | The complete set of machine registers. allMachRegNos :: [RegNo] - -#if i386_TARGET_ARCH -allMachRegNos = [0..13] - -#elif x86_64_TARGET_ARCH -allMachRegNos = [0..31] - +#if i386_TARGET_ARCH +allMachRegNos = [0..7] ++ floatregs -- not %r8..%r15 #else -allMachRegNos = panic "X86.Regs.callClobberedRegs: not defined for this architecture" - +allMachRegNos = [0..15] ++ floatregs #endif - + where floatregs = fakes ++ xmms; fakes = [16..21]; xmms = [24..39] -- | Take the class of a register. {-# INLINE classOfRealReg #-} classOfRealReg :: RealReg -> RegClass - -#if i386_TARGET_ARCH -- On x86, we might want to have an 8-bit RegClass, which would -- contain just regs 1-4 (the others don't have 8-bit versions). -- However, we can get away without this at the moment because the -- only allocatable integer regs are also 8-bit compatible (1, 3, 4). classOfRealReg reg = case reg of - RealRegSingle i -> if i < 8 then RcInteger else RcDouble - RealRegPair{} -> panic "X86.Regs.classOfRealReg: RegPairs on this arch" + RealRegSingle i + | i < 16 -> RcInteger + | i < 24 -> RcDouble + | otherwise -> RcDoubleSSE -#elif x86_64_TARGET_ARCH --- On x86, we might want to have an 8-bit RegClass, which would --- contain just regs 1-4 (the others don't have 8-bit versions). --- However, we can get away without this at the moment because the --- only allocatable integer regs are also 8-bit compatible (1, 3, 4). -classOfRealReg reg - = case reg of - RealRegSingle i -> if i < 16 then RcInteger else RcDouble RealRegPair{} -> panic "X86.Regs.classOfRealReg: RegPairs on this arch" -#else -classOfRealReg _ = panic "X86.Regs.regClass: not defined for this architecture" - -#endif - - -- | Get the name of the register with this number. showReg :: RegNo -> String - -#if i386_TARGET_ARCH -showReg n - = if n >= 0 && n < 14 - then regNames !! n - else "%unknown_x86_real_reg_" ++ show n - -regNames :: [String] -regNames - = ["%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi", "%ebp", "%esp", - "%fake0", "%fake1", "%fake2", "%fake3", "%fake4", "%fake5", "%fake6"] - -#elif x86_64_TARGET_ARCH showReg n - | n >= 16 = "%xmm" ++ show (n-16) + | n >= 24 = "%xmm" ++ show (n-24) + | n >= 16 = "%fake" ++ show (n-16) | n >= 8 = "%r" ++ show n | otherwise = regNames !! n regNames :: [String] regNames - = ["%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi", "%rbp", "%rsp" ] - -#else -showReg _ = panic "X86.Regs.showReg: not defined for this architecture" - +#if i386_TARGET_ARCH + = ["%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi", "%ebp", "%esp"] +#elif x86_64_TARGET_ARCH + = ["%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi", "%rbp", "%rsp" ] #endif - - -- machine specific ------------------------------------------------------------ @@ -385,12 +297,12 @@ esi = regSingle 4 edi = regSingle 5 ebp = regSingle 6 esp = regSingle 7 -fake0 = regSingle 8 -fake1 = regSingle 9 -fake2 = regSingle 10 -fake3 = regSingle 11 -fake4 = regSingle 12 -fake5 = regSingle 13 +fake0 = regSingle 16 +fake1 = regSingle 17 +fake2 = regSingle 18 +fake3 = regSingle 19 +fake4 = regSingle 20 +fake5 = regSingle 21 @@ -423,25 +335,25 @@ r12 = regSingle 12 r13 = regSingle 13 r14 = regSingle 14 r15 = regSingle 15 -xmm0 = regSingle 16 -xmm1 = regSingle 17 -xmm2 = regSingle 18 -xmm3 = regSingle 19 -xmm4 = regSingle 20 -xmm5 = regSingle 21 -xmm6 = regSingle 22 -xmm7 = regSingle 23 -xmm8 = regSingle 24 -xmm9 = regSingle 25 -xmm10 = regSingle 26 -xmm11 = regSingle 27 -xmm12 = regSingle 28 -xmm13 = regSingle 29 -xmm14 = regSingle 30 -xmm15 = regSingle 31 +xmm0 = regSingle 24 +xmm1 = regSingle 25 +xmm2 = regSingle 26 +xmm3 = regSingle 27 +xmm4 = regSingle 28 +xmm5 = regSingle 29 +xmm6 = regSingle 30 +xmm7 = regSingle 31 +xmm8 = regSingle 32 +xmm9 = regSingle 33 +xmm10 = regSingle 34 +xmm11 = regSingle 35 +xmm12 = regSingle 36 +xmm13 = regSingle 37 +xmm14 = regSingle 38 +xmm15 = regSingle 39 allFPArgRegs :: [Reg] -allFPArgRegs = map regSingle [16 .. 23] +allFPArgRegs = map regSingle [24 .. 31] ripRel :: Displacement -> AddrMode ripRel imm = AddrBaseIndex EABaseRip EAIndexNone imm @@ -460,7 +372,7 @@ esp = rsp -} xmm :: RegNo -> Reg -xmm n = regSingle (16+n) +xmm n = regSingle (24+n) @@ -482,12 +394,6 @@ callClobberedRegs :: [Reg] #define edi 5 #define ebp 6 #define esp 7 -#define fake0 8 -#define fake1 9 -#define fake2 10 -#define fake3 11 -#define fake4 12 -#define fake5 13 #endif #if x86_64_TARGET_ARCH @@ -507,24 +413,31 @@ callClobberedRegs :: [Reg] #define r13 13 #define r14 14 #define r15 15 -#define xmm0 16 -#define xmm1 17 -#define xmm2 18 -#define xmm3 19 -#define xmm4 20 -#define xmm5 21 -#define xmm6 22 -#define xmm7 23 -#define xmm8 24 -#define xmm9 25 -#define xmm10 26 -#define xmm11 27 -#define xmm12 28 -#define xmm13 29 -#define xmm14 30 -#define xmm15 31 #endif +#define fake0 16 +#define fake1 17 +#define fake2 18 +#define fake3 19 +#define fake4 20 +#define fake5 21 + +#define xmm0 24 +#define xmm1 25 +#define xmm2 26 +#define xmm3 27 +#define xmm4 28 +#define xmm5 29 +#define xmm6 30 +#define xmm7 31 +#define xmm8 32 +#define xmm9 33 +#define xmm10 34 +#define xmm11 35 +#define xmm12 36 +#define xmm13 37 +#define xmm14 38 +#define xmm15 39 #if i386_TARGET_ARCH @@ -697,13 +610,13 @@ allArgRegs = panic "X86.Regs.allArgRegs: not defined for this architecture" #if i386_TARGET_ARCH -- caller-saves registers callClobberedRegs - = map regSingle [eax,ecx,edx,fake0,fake1,fake2,fake3,fake4,fake5] + = map regSingle ([eax,ecx,edx] ++ [16..39]) #elif x86_64_TARGET_ARCH -- all xmm regs are caller-saves -- caller-saves registers callClobberedRegs - = map regSingle ([rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11] ++ [16..31]) + = map regSingle ([rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11] ++ [16..39]) #else callClobberedRegs diff --git a/docs/users_guide/flags.xml b/docs/users_guide/flags.xml index 5724850..6be97d7 100644 --- a/docs/users_guide/flags.xml +++ b/docs/users_guide/flags.xml @@ -2064,6 +2064,14 @@ phase n Reverse + + + + (x86 only) Use SSE2 for floating point + dynamic + - + + diff --git a/docs/users_guide/using.xml b/docs/users_guide/using.xml index eb6b0c0..329c31f 100644 --- a/docs/users_guide/using.xml +++ b/docs/users_guide/using.xml @@ -2074,9 +2074,27 @@ f "2" = 2 + : + + + (x86 only, added in GHC 6.14.1) Use the SSE2 registers and + instruction set to implement floating point operations + when using the native code generator. This gives a + substantial performance improvement for floating point, + but the resulting compiled code will only run on + processors that support SSE2 (Intel Pentium 4 and later, + or AMD Athlon 64 and later). + + + SSE2 is unconditionally used on x86-64 platforms. + + + + + : - (iX86 machines)-monly-N-regs + (x86 only)-monly-N-regs option (iX86 only) GHC tries to “steal” four registers from GCC, for performance reasons; it almost always works. However, when GCC is -- 1.7.10.4